Compare commits
	
		
			4 Commits
		
	
	
		
			b1d7ebaaea
			...
			53ea542f36
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 53ea542f36 | |||
| d246c54cd9 | |||
| 2a89d061a0 | |||
| 7add8793a5 | |||
							
								
								
									
										1099
									
								
								src/main/rust/Cargo.lock
									
									
									
										generated
									
									
									
								
							
							
						
						
									
										1099
									
								
								src/main/rust/Cargo.lock
									
									
									
										generated
									
									
									
								
							
										
											
												File diff suppressed because it is too large
												Load Diff
											
										
									
								
							| @@ -6,18 +6,17 @@ edition = "2021" | |||||||
| # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html | ||||||
|  |  | ||||||
| [dependencies] | [dependencies] | ||||||
| bstr = "1.9.1" | bstr = "1.10.0" | ||||||
| fast-float = "0.2.0" | fast-float = "0.2.0" | ||||||
| memchr = "2.7.4" | memchr = "2.7.4" | ||||||
| memmap = "0.7.0" | memmap2 = "0.9.4" | ||||||
| polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]} |  | ||||||
| rayon = "1.10.0" | rayon = "1.10.0" | ||||||
| rustc-hash = "2.0.0" | rustc-hash = "2.0.0" | ||||||
| libc = "0.2.155" | libc = "0.2.158" | ||||||
| smol = "2.0.0" | smol = "2.0.1" | ||||||
|  |  | ||||||
| [dev-dependencies] | [dev-dependencies] | ||||||
| criterion = { version = "0.5", features = ["html_reports"] } | criterion = { version = "0.5.1", features = ["html_reports"] } | ||||||
|  |  | ||||||
| [features] | [features] | ||||||
| json = [] | json = [] | ||||||
| @@ -36,7 +35,7 @@ name = "multi_threaded" | |||||||
| harness = false | harness = false | ||||||
|  |  | ||||||
| [[bench]] | [[bench]] | ||||||
| name = "polars" | name = "multi_threaded_smol" | ||||||
| harness = false | harness = false | ||||||
|  |  | ||||||
| [[bench]] | [[bench]] | ||||||
|   | |||||||
| @@ -1,7 +1,8 @@ | |||||||
| use criterion::{Criterion, criterion_group, criterion_main}; | use criterion::{Criterion, criterion_group, criterion_main}; | ||||||
|  | use onebrc::implementations::libraries::run; | ||||||
|  |  | ||||||
| pub fn criterion_benchmark(c: &mut Criterion) { | pub fn criterion_benchmark(c: &mut Criterion) { | ||||||
|     c.bench_function("polars", |b| {b.iter(|| /*run_polars()*/ ())}); |     c.bench_function("libraries", |b| {b.iter(|| run())}); | ||||||
| } | } | ||||||
|  |  | ||||||
| criterion_group!(benches, criterion_benchmark); | criterion_group!(benches, criterion_benchmark); | ||||||
							
								
								
									
										9
									
								
								src/main/rust/benches/multi_threaded_smol.rs
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										9
									
								
								src/main/rust/benches/multi_threaded_smol.rs
									
									
									
									
									
										Normal file
									
								
							| @@ -0,0 +1,9 @@ | |||||||
|  | use criterion::{Criterion, criterion_group, criterion_main}; | ||||||
|  | use onebrc::implementations::multi_threaded_smol::run; | ||||||
|  |  | ||||||
|  | pub fn criterion_benchmark(c: &mut Criterion) { | ||||||
|  |     c.bench_function("multithreadedsmol", |b| {b.iter(|| run())}); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | criterion_group!(benches, criterion_benchmark); | ||||||
|  | criterion_main!(benches); | ||||||
| @@ -1,4 +0,0 @@ | |||||||
|  |  | ||||||
| fn main() { |  | ||||||
|     // let _ = run_polars(); |  | ||||||
| } |  | ||||||
| @@ -4,7 +4,6 @@ pub mod multi_threaded; | |||||||
| pub mod multi_threaded_smol; | pub mod multi_threaded_smol; | ||||||
| pub mod multi_threaded_structured; | pub mod multi_threaded_structured; | ||||||
| pub mod phcs; | pub mod phcs; | ||||||
| pub mod polars; |  | ||||||
| pub mod reference_impl; | pub mod reference_impl; | ||||||
| pub mod single_thread; | pub mod single_thread; | ||||||
| pub mod smol; | pub mod smol; | ||||||
|   | |||||||
| @@ -1,67 +1,57 @@ | |||||||
| use std::collections::HashMap; |  | ||||||
| use std::io::{BufRead, Seek, SeekFrom}; |  | ||||||
| use std::sync::mpsc; |  | ||||||
| use std::time::Instant; |  | ||||||
| use std::{fs::File, io::BufReader, thread}; |  | ||||||
| use memmap::MmapOptions; |  | ||||||
| use crate::models::station_measurements::StationMeasurements; | use crate::models::station_measurements::StationMeasurements; | ||||||
| use crate::utils::parse; | use crate::utils::parse; | ||||||
| use crate::utils::parse::hashstr; | use memmap2::MmapOptions; | ||||||
|  | use rustc_hash::{FxBuildHasher, FxHashMap as HashMap}; | ||||||
|  | use std::sync::mpsc; | ||||||
|  | use std::time::Instant; | ||||||
|  | use std::{fs::File, thread}; | ||||||
|  |  | ||||||
| const DEFAULT_HASHMAP_LENGTH: usize = 10000; | const DEFAULT_HASHMAP_LENGTH: usize = 10000; | ||||||
|  |  | ||||||
| pub fn run() { | pub fn run() { | ||||||
|     const FILE_PATH: &str = "../../../measurements.txt"; |  | ||||||
|     let now = Instant::now(); |     let now = Instant::now(); | ||||||
|     thread::scope(|s| { |     const FILE_PATH: &str = "../../../measurements.txt"; | ||||||
|         let mut stations: HashMap<usize, (String, StationMeasurements)> = |  | ||||||
|             HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); |  | ||||||
|         let (tx, rx) = mpsc::channel(); |  | ||||||
|         let cores = thread::available_parallelism().unwrap().into(); |  | ||||||
|     let file = File::open(FILE_PATH).expect("File measurements.txt not found"); |     let file = File::open(FILE_PATH).expect("File measurements.txt not found"); | ||||||
|     let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; |     let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; | ||||||
|     let file_length = mmap.len(); |     let file_length = mmap.len(); | ||||||
|  |     let hasher = FxBuildHasher::default(); | ||||||
|  |     let mut stations: HashMap<String, StationMeasurements> = | ||||||
|  |         HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); | ||||||
|  |     let (tx, rx) = mpsc::channel(); | ||||||
|  |     let cores = thread::available_parallelism().unwrap().into(); | ||||||
|     let chunk_length = file_length / cores; |     let chunk_length = file_length / cores; | ||||||
|     let mut bounds = Vec::with_capacity(cores + 1); |     let mut bounds = Vec::with_capacity(cores + 1); | ||||||
|         bounds.push(0); |     let mut start = 0; | ||||||
|         for i in 1..cores { |     for _ in 0..cores { | ||||||
|             let mut reader = BufReader::new(&file); |         let end = (start + chunk_length).min(mmap.len()); | ||||||
|             let mut byte_start = chunk_length * i; |         let next_new_line = match memchr::memchr(b'\n', &mmap[end..]) { | ||||||
|             reader |             Some(v) => v, | ||||||
|                 .seek(SeekFrom::Start(byte_start as u64)) |             None => { | ||||||
|                 .expect("could not seek"); |                 assert_eq!(end, mmap.len()); | ||||||
|             let mut line = Vec::with_capacity(108); |                 0 | ||||||
|             let line_len = reader |  | ||||||
|                 .read_until(b'\n', &mut line) |  | ||||||
|                 .expect("could not read bytes"); |  | ||||||
|             byte_start += line_len; |  | ||||||
|             bounds.push(byte_start); |  | ||||||
|             } |             } | ||||||
|         bounds.push(file_length); |         }; | ||||||
|  |         let end = end + next_new_line; | ||||||
|  |         bounds.push((start, end)); | ||||||
|  |         start = end + 1; | ||||||
|  |     } | ||||||
|  |     thread::scope(|s| { | ||||||
|         for i in 0..cores { |         for i in 0..cores { | ||||||
|             let tx = tx.clone(); |             let tx = tx.clone(); | ||||||
|             let mut currposition = *bounds.get(i).unwrap(); |             let (start, end) = *bounds.get(i).unwrap(); | ||||||
|             let end = *bounds.get(i + 1).unwrap(); |             let mmap_slice = &mmap[start..end]; | ||||||
|             s.spawn(move || { |             s.spawn(move || { | ||||||
|                 let file = File::open(FILE_PATH).expect("File measurements.txt not found"); |                 let mut t_stations: HashMap<String, StationMeasurements> = | ||||||
|                 let mut reader = BufReader::new(&file); |                     HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); | ||||||
|                 reader.seek(SeekFrom::Start(currposition as u64)).unwrap(); |                 for line in mmap_slice.split(|&byte| { byte == b'\n' }) { | ||||||
|                 let mut t_stations: HashMap<usize, (String, StationMeasurements)> = |                     if line.len() == 0 { | ||||||
|                     HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); |  | ||||||
|                 let mut line = Vec::with_capacity(108); |  | ||||||
|                 loop { |  | ||||||
|                     let line_len = reader |  | ||||||
|                         .read_until(b'\n', &mut line) |  | ||||||
|                         .expect("could not read bytes"); |  | ||||||
|                     if line_len == 0 { |  | ||||||
|                         break; |                         break; | ||||||
|                     } |                     } | ||||||
|                     let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); |                     let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); | ||||||
|                     let hash = hashstr(station); |  | ||||||
|                     let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; |                     let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; | ||||||
|                     let temp = parse::temp(temp.split_last().unwrap().1); |                     let temp = parse::temp(temp); | ||||||
|                     let measurements_option = t_stations.get_mut(&hash); |                     let measurements_option = t_stations.get_mut(&station); | ||||||
|                     if let Some((_, measurements)) = measurements_option { |                     if let Some(measurements) = measurements_option { | ||||||
|                         measurements.update(temp); |                         measurements.update(temp); | ||||||
|                     } else { |                     } else { | ||||||
|                         let measurements = StationMeasurements { |                         let measurements = StationMeasurements { | ||||||
| @@ -70,31 +60,26 @@ pub fn run() { | |||||||
|                             count: 1, |                             count: 1, | ||||||
|                             sum: temp, |                             sum: temp, | ||||||
|                         }; |                         }; | ||||||
|                         t_stations.insert(hash, (station, measurements)); |                         t_stations.insert(station, measurements); | ||||||
|                     } |                     } | ||||||
|                     currposition += line_len; |  | ||||||
|                     if currposition >= end { |  | ||||||
|                         break; |  | ||||||
|                     } |  | ||||||
|                     line.clear(); |  | ||||||
|                 } |                 } | ||||||
|                 let _ = tx.send(t_stations); |                 let _ = tx.send(t_stations); | ||||||
|             }); |             }); | ||||||
|         } |         } | ||||||
|         drop(tx); |         drop(tx); | ||||||
|         while let Ok(t_stations) = rx.recv() { |         while let Ok(t_stations) = rx.recv() { | ||||||
|             for (&hash, (station, measurements)) in t_stations.iter() { |             for (station, measurements) in t_stations.iter() { | ||||||
|                 let joined_measurements_options = stations.get_mut(&hash); |                 let joined_measurements_options = stations.get_mut(station); | ||||||
|                 if let Some((_, joined_measurements)) = joined_measurements_options { |                 if let Some(joined_measurements) = joined_measurements_options { | ||||||
|                     joined_measurements.merge(measurements); |                     joined_measurements.merge(measurements); | ||||||
|                 } else { |                 } else { | ||||||
|                     stations.insert(hash, (station.to_owned(), *measurements)); |                     stations.insert(station.to_owned(), *measurements); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         let mut stations: Vec<String> = stations |         let mut stations: Vec<String> = stations | ||||||
|             .iter() |             .iter() | ||||||
|             .map(|(_, (station, measurements))| { |             .map(|(station, measurements)| { | ||||||
|                 let measurements = measurements.to_string(); |                 let measurements = measurements.to_string(); | ||||||
|                 #[cfg(feature = "json")] |                 #[cfg(feature = "json")] | ||||||
|                 { |                 { | ||||||
|   | |||||||
| @@ -1,10 +1,9 @@ | |||||||
| use smol::fs::File; | use smol::fs::File; | ||||||
| use smol::io::{AsyncBufReadExt, AsyncSeekExt, BufReader, SeekFrom}; | use smol::io::{AsyncBufReadExt, AsyncSeekExt, BufReader, SeekFrom}; | ||||||
|  | use rustc_hash::{FxHashMap as HashMap, FxBuildHasher}; | ||||||
|  |  | ||||||
| use crate::models::station_measurements::StationMeasurements; | use crate::models::station_measurements::StationMeasurements; | ||||||
| use crate::utils::parse; | use crate::utils::parse; | ||||||
| use crate::utils::parse::hashstr; |  | ||||||
| use std::collections::HashMap; |  | ||||||
| use std::sync::mpsc; | use std::sync::mpsc; | ||||||
| use std::thread; | use std::thread; | ||||||
| use std::time::Instant; | use std::time::Instant; | ||||||
| @@ -15,8 +14,9 @@ pub fn run() { | |||||||
|     const FILE_PATH: &str = "../../../measurements.txt"; |     const FILE_PATH: &str = "../../../measurements.txt"; | ||||||
|     let now = Instant::now(); |     let now = Instant::now(); | ||||||
|     thread::scope(|s| { |     thread::scope(|s| { | ||||||
|         let mut stations: HashMap<usize, (String, StationMeasurements)> = |         let hasher = FxBuildHasher::default(); | ||||||
|             HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); |         let mut stations: HashMap<String, StationMeasurements> = | ||||||
|  |             HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); | ||||||
|         let (tx, rx) = mpsc::channel(); |         let (tx, rx) = mpsc::channel(); | ||||||
|         let cores = thread::available_parallelism().unwrap().into(); |         let cores = thread::available_parallelism().unwrap().into(); | ||||||
|         let bounds = smol::block_on(async { |         let bounds = smol::block_on(async { | ||||||
| @@ -57,8 +57,8 @@ pub fn run() { | |||||||
|                         .expect("File measurements.txt not found"); |                         .expect("File measurements.txt not found"); | ||||||
|                     let mut reader = BufReader::new(&mut file); |                     let mut reader = BufReader::new(&mut file); | ||||||
|                     reader.seek(SeekFrom::Start(currposition)).await.unwrap(); |                     reader.seek(SeekFrom::Start(currposition)).await.unwrap(); | ||||||
|                     let mut t_stations: HashMap<usize, (String, StationMeasurements)> = |                     let mut t_stations: HashMap<String, StationMeasurements> = | ||||||
|                         HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); |                         HashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); | ||||||
|                     let mut line = Vec::with_capacity(108); |                     let mut line = Vec::with_capacity(108); | ||||||
|                     loop { |                     loop { | ||||||
|                         let line_len = reader |                         let line_len = reader | ||||||
| @@ -69,11 +69,10 @@ pub fn run() { | |||||||
|                             break; |                             break; | ||||||
|                         } |                         } | ||||||
|                         let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); |                         let (station, temp) = line.rsplit_once(|&byte| byte == b';').unwrap(); | ||||||
|                         let hash = hashstr(station); |  | ||||||
|                         let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; |                         let station = unsafe { String::from_utf8_unchecked(Vec::from(station)) }; | ||||||
|                         let temp = parse::temp(temp.split_last().unwrap().1); |                         let temp = parse::temp(temp.split_last().unwrap().1); | ||||||
|                         let measurements_option = t_stations.get_mut(&hash); |                         let measurements_option = t_stations.get_mut(&station); | ||||||
|                         if let Some((_, measurements)) = measurements_option { |                         if let Some(measurements) = measurements_option { | ||||||
|                             measurements.update(temp); |                             measurements.update(temp); | ||||||
|                         } else { |                         } else { | ||||||
|                             let measurements = StationMeasurements { |                             let measurements = StationMeasurements { | ||||||
| @@ -82,7 +81,7 @@ pub fn run() { | |||||||
|                                 count: 1, |                                 count: 1, | ||||||
|                                 sum: temp, |                                 sum: temp, | ||||||
|                             }; |                             }; | ||||||
|                             t_stations.insert(hash, (station, measurements)); |                             t_stations.insert(station, measurements); | ||||||
|                         } |                         } | ||||||
|                         currposition += line_len as u64; |                         currposition += line_len as u64; | ||||||
|                         if currposition >= end { |                         if currposition >= end { | ||||||
| @@ -96,18 +95,18 @@ pub fn run() { | |||||||
|         } |         } | ||||||
|         drop(tx); |         drop(tx); | ||||||
|         while let Ok(t_stations) = rx.recv() { |         while let Ok(t_stations) = rx.recv() { | ||||||
|             for (&hash, (station, measurements)) in t_stations.iter() { |             for (station, measurements) in t_stations.iter() { | ||||||
|                 let joined_measurements_options = stations.get_mut(&hash); |                 let joined_measurements_options = stations.get_mut(station); | ||||||
|                 if let Some((_, joined_measurements)) = joined_measurements_options { |                 if let Some(joined_measurements) = joined_measurements_options { | ||||||
|                     joined_measurements.merge(measurements); |                     joined_measurements.merge(measurements); | ||||||
|                 } else { |                 } else { | ||||||
|                     stations.insert(hash, (station.to_owned(), *measurements)); |                     stations.insert(station.to_owned(), *measurements); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         let mut stations: Vec<String> = stations |         let mut stations: Vec<String> = stations | ||||||
|             .iter() |             .iter() | ||||||
|             .map(|(_, (station, measurements))| { |             .map(|(station, measurements)| { | ||||||
|                 let measurements = measurements.to_string(); |                 let measurements = measurements.to_string(); | ||||||
|                 #[cfg(feature = "json")] |                 #[cfg(feature = "json")] | ||||||
|                 { |                 { | ||||||
|   | |||||||
| @@ -1,12 +1,11 @@ | |||||||
| use std::collections::HashMap; |  | ||||||
| use std::io::{Read, Seek, SeekFrom}; |  | ||||||
| use std::sync::mpsc; | use std::sync::mpsc; | ||||||
| use std::time::Instant; | use std::time::Instant; | ||||||
| use std::{fs::File, io::BufReader, thread}; | use std::{fs::File, thread}; | ||||||
| use std::ffi::CStr; | use std::ffi::CStr; | ||||||
|  | use memmap2::MmapOptions; | ||||||
|  | use rustc_hash::{FxBuildHasher, FxHashMap}; | ||||||
| use crate::models::station_measurements::StationMeasurements; | use crate::models::station_measurements::StationMeasurements; | ||||||
| use crate::utils::parse; | use crate::utils::parse; | ||||||
| use crate::utils::parse::hashstr; |  | ||||||
|  |  | ||||||
| const DEFAULT_HASHMAP_LENGTH: usize = 10000; | const DEFAULT_HASHMAP_LENGTH: usize = 10000; | ||||||
|  |  | ||||||
| @@ -14,55 +13,54 @@ pub fn run() { | |||||||
|     print!("\x1b[J"); |     print!("\x1b[J"); | ||||||
|     const FILE_PATH: &str = "structured_measurements.txt"; |     const FILE_PATH: &str = "structured_measurements.txt"; | ||||||
|     let now = Instant::now(); |     let now = Instant::now(); | ||||||
|     thread::scope(|s| { |     let file = File::open(FILE_PATH).expect("File structured_measurements.txt not found"); | ||||||
|         let mut stations: HashMap<usize, (String, StationMeasurements)> = |     let mmap = unsafe { MmapOptions::new().map(&file).unwrap() }; | ||||||
|             HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); |     let file_length = mmap.len(); | ||||||
|  |     let hasher = FxBuildHasher::default(); | ||||||
|  |     let mut stations: FxHashMap<String, StationMeasurements> = | ||||||
|  |         FxHashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); | ||||||
|     let (tx, rx) = mpsc::channel(); |     let (tx, rx) = mpsc::channel(); | ||||||
|     let cores = thread::available_parallelism().unwrap().into(); |     let cores = thread::available_parallelism().unwrap().into(); | ||||||
|         let file = File::open(FILE_PATH).expect("File structured_measurements.txt not found"); |     let chunk_length = file_length / cores; | ||||||
|         let mut reader = BufReader::new(&file); |  | ||||||
|         let file_length = reader.seek(SeekFrom::End(0)).unwrap(); |  | ||||||
|         let chunk_length = file_length as usize / cores; |  | ||||||
|     let mut bounds = Vec::with_capacity(cores + 1); |     let mut bounds = Vec::with_capacity(cores + 1); | ||||||
|         bounds.push(0); |     let mut start = 0; | ||||||
|  |     for _ in 0..cores { | ||||||
|  |         let end = (start + chunk_length).min(mmap.len()); | ||||||
|  |         let next_new_line = match memchr::memchr(b'\n', &mmap[end..]) { | ||||||
|  |             Some(v) => v, | ||||||
|  |             None => { | ||||||
|  |                 assert_eq!(end, mmap.len()); | ||||||
|  |                 0 | ||||||
|  |             } | ||||||
|  |         }; | ||||||
|  |         let end = end + next_new_line; | ||||||
|  |         bounds.push((start, end)); | ||||||
|  |         start = end + 1; | ||||||
|  |     } | ||||||
|  |     thread::scope(|s| { | ||||||
|         for i in 0..cores { |         for i in 0..cores { | ||||||
|             let tx = tx.clone(); |             let tx = tx.clone(); | ||||||
|             let mut currposition = (i * chunk_length) as u64; |             let (start, end) = *bounds.get(i).unwrap(); | ||||||
|             let end = ((i + 1) * chunk_length) as u64; |             let mmap_slice = &mmap[start..end]; | ||||||
|             s.spawn(move || { |             s.spawn(move || { | ||||||
|                 let file = File::open(FILE_PATH).expect("File measurements.txt not found"); |                 let mut t_stations: FxHashMap<String, StationMeasurements> = | ||||||
|                 let mut reader = BufReader::new(&file); |                     FxHashMap::with_capacity_and_hasher(DEFAULT_HASHMAP_LENGTH, hasher); | ||||||
|                 reader.seek(SeekFrom::Start(currposition)).unwrap(); |                 let lines = mmap_slice.chunks_exact(107); | ||||||
|                 let mut t_stations: HashMap<usize, (String, StationMeasurements)> = |                 for (line_num, line) in lines.enumerate() { | ||||||
|                     HashMap::with_capacity(DEFAULT_HASHMAP_LENGTH); |  | ||||||
|                 let mut line = [0u8; 107]; |  | ||||||
|                 let mut line_num = 0; |  | ||||||
|                 loop { |  | ||||||
|                     if line_num % 100000 == 0 { |                     if line_num % 100000 == 0 { | ||||||
|                         print!("\x1b[{i};0Hlines: {line_num}"); |                         print!("\x1b[{i};0Hlines: {line_num}"); | ||||||
|                     } |                     } | ||||||
|                     line_num += 1; |  | ||||||
|                     let read_res = reader |  | ||||||
|                         .read_exact(&mut line); |  | ||||||
|                     match read_res { |  | ||||||
|                         Ok(_) => (), |  | ||||||
|                         Err(e) => match e.kind() { |  | ||||||
|                             std::io::ErrorKind::UnexpectedEof => break, |  | ||||||
|                             _ => panic!("Could not read") |  | ||||||
|                         }, |  | ||||||
|                     }; |  | ||||||
|                     let (station, temp) = unsafe { line.split_at_unchecked(100) }; |                     let (station, temp) = unsafe { line.split_at_unchecked(100) }; | ||||||
|                     let hash = hashstr(station); |  | ||||||
|                     let station = { |                     let station = { | ||||||
|                         if station[station.len() - 1] == 0u8 { |                         if station[station.len() - 1] == 0u8 { | ||||||
|                             unsafe { std::str::from_utf8_unchecked(CStr::from_bytes_until_nul(station).unwrap().to_bytes()) } |                             unsafe { std::str::from_utf8_unchecked(CStr::from_bytes_until_nul(station).unwrap().to_bytes()) } | ||||||
|                         } else { |                         } else { | ||||||
|                             unsafe { std::str::from_utf8_unchecked(station) } |                             unsafe { std::str::from_utf8_unchecked(station) } | ||||||
|                         } |                         } | ||||||
|                     }; |                     }.to_owned(); | ||||||
|                     let temp = parse::temp_new(&temp[1..6]); |                     let temp = parse::temp_new(&temp[1..6]); | ||||||
|                     let measurements_option = t_stations.get_mut(&hash); |                     let measurements_option = t_stations.get_mut(&station); | ||||||
|                     if let Some((_, measurements)) = measurements_option { |                     if let Some(measurements) = measurements_option { | ||||||
|                         measurements.update(temp); |                         measurements.update(temp); | ||||||
|                     } else { |                     } else { | ||||||
|                         let measurements = StationMeasurements { |                         let measurements = StationMeasurements { | ||||||
| @@ -71,11 +69,7 @@ pub fn run() { | |||||||
|                             count: 1, |                             count: 1, | ||||||
|                             sum: temp, |                             sum: temp, | ||||||
|                         }; |                         }; | ||||||
|                         t_stations.insert(hash, (station.to_string(), measurements)); |                         t_stations.insert(station, measurements); | ||||||
|                     } |  | ||||||
|                     currposition += 107; |  | ||||||
|                     if currposition >= end { |  | ||||||
|                         break; |  | ||||||
|                     } |                     } | ||||||
|                 } |                 } | ||||||
|                 let _ = tx.send(t_stations); |                 let _ = tx.send(t_stations); | ||||||
| @@ -83,18 +77,18 @@ pub fn run() { | |||||||
|         } |         } | ||||||
|         drop(tx); |         drop(tx); | ||||||
|         while let Ok(t_stations) = rx.recv() { |         while let Ok(t_stations) = rx.recv() { | ||||||
|             for (&hash, (station, measurements)) in t_stations.iter() { |             for (station, measurements) in t_stations.iter() { | ||||||
|                 let joined_measurements_options = stations.get_mut(&hash); |                 let joined_measurements_options = stations.get_mut(station); | ||||||
|                 if let Some((_, joined_measurements)) = joined_measurements_options { |                 if let Some(joined_measurements) = joined_measurements_options { | ||||||
|                     joined_measurements.merge(measurements); |                     joined_measurements.merge(measurements); | ||||||
|                 } else { |                 } else { | ||||||
|                     stations.insert(hash, (station.to_owned(), *measurements)); |                     stations.insert(station.to_owned(), *measurements); | ||||||
|                 } |                 } | ||||||
|             } |             } | ||||||
|         } |         } | ||||||
|         let mut stations: Vec<String> = stations |         let mut stations: Vec<String> = stations | ||||||
|             .iter() |             .iter() | ||||||
|             .map(|(_, (station, measurements))| { |             .map(|(station, measurements)| { | ||||||
|                 let measurements = measurements.to_string(); |                 let measurements = measurements.to_string(); | ||||||
|                 #[cfg(feature = "json")] |                 #[cfg(feature = "json")] | ||||||
|                 { |                 { | ||||||
|   | |||||||
| @@ -1,31 +0,0 @@ | |||||||
| use polars::prelude::*; |  | ||||||
| use std::time::Instant; |  | ||||||
| use std::vec; |  | ||||||
|  |  | ||||||
| pub fn run_polars() -> Result<DataFrame, PolarsError> { |  | ||||||
|     let now = Instant::now(); |  | ||||||
|  |  | ||||||
|     let f1: Field = Field::new("station", DataType::String); |  | ||||||
|     let f2: Field = Field::new("measure", DataType::Float64); |  | ||||||
|     let sc: Schema = Schema::from_iter(vec![f1, f2]); |  | ||||||
|  |  | ||||||
|     let q = LazyCsvReader::new("../../../measurements.txt") |  | ||||||
|         .has_header(false) |  | ||||||
|         .with_schema(Some(Arc::new(sc))) |  | ||||||
|         .with_separator(b';') |  | ||||||
|         .finish()? |  | ||||||
|         .group_by(vec![col("station")]) |  | ||||||
|         .agg(vec![ |  | ||||||
|             col("measure").alias("min").min(), |  | ||||||
|             col("measure").alias("mean").mean(), |  | ||||||
|             col("measure").alias("max").max(), |  | ||||||
|         ]) |  | ||||||
|         .sort("station", Default::default()) |  | ||||||
|         .with_streaming(true); |  | ||||||
|  |  | ||||||
|     let df = q.collect()?; |  | ||||||
|  |  | ||||||
|     println!("Time={} μs", now.elapsed().as_micros()); |  | ||||||
|  |  | ||||||
|     Ok(df) |  | ||||||
| } |  | ||||||
| @@ -1,5 +1,5 @@ | |||||||
| use bstr::{BStr, ByteSlice}; | use bstr::{BStr, ByteSlice}; | ||||||
| use memmap::MmapOptions; | use memmap2::MmapOptions; | ||||||
| use rayon::prelude::*; | use rayon::prelude::*; | ||||||
| use rustc_hash::FxHashMap as HashMap; | use rustc_hash::FxHashMap as HashMap; | ||||||
| use std::time::Instant; | use std::time::Instant; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user