The Polars solution I saw on Reddit a few months ago is also super slow. I'm thinking it might be my MacBook.

This commit is contained in:
Fabian Schmidt 2024-04-30 15:35:16 +02:00
parent b6e8b41bb1
commit 0adcf3dec5
3 changed files with 1345 additions and 1 deletions

1310
rust/Cargo.lock generated

File diff suppressed because it is too large Load Diff

View File

@ -7,3 +7,4 @@ edition = "2021"
[dependencies]
hashbrown = "0.14.3"
polars = { version = "0.36.2", features = ["csv", "lazy", "nightly", "streaming"]}

35
rust/src/bin/polars.rs Normal file
View File

@ -0,0 +1,35 @@
use polars::prelude::*;
use std::time::Instant;
use std::vec;
/// Computes per-station minimum, mean and maximum of the measurements file
/// using a lazy Polars query, and prints the elapsed wall-clock time.
///
/// The input `../measurements.txt` is read as a header-less, `;`-separated
/// CSV with two columns: `station` (string) and `measure` (f64). Rows are
/// grouped by `station`, aggregated, and sorted by `station`. The query is
/// collected with the streaming flag enabled.
///
/// The timer starts before the query is built, so the printed duration covers
/// both plan construction and execution.
///
/// # Errors
///
/// Returns the `PolarsError` produced while finishing the CSV reader or while
/// collecting the query.
fn run_polars() -> Result<DataFrame, PolarsError> {
    let started = Instant::now();

    // Explicit schema for the two header-less columns.
    let schema = Schema::from_iter(vec![
        Field::new("station", DataType::String),
        Field::new("measure", DataType::Float64),
    ]);

    let measurements = LazyCsvReader::new("../measurements.txt")
        .has_header(false)
        .with_schema(Some(Arc::new(schema)))
        .with_separator(b';')
        .finish()?;

    // One output column per statistic, each named via `alias`.
    let aggregations = vec![
        col("measure").alias("min").min(),
        col("measure").alias("mean").mean(),
        col("measure").alias("max").max(),
    ];

    let result = measurements
        .group_by(vec![col("station")])
        .agg(aggregations)
        .sort("station", Default::default())
        .with_streaming(true)
        .collect()?;

    println!("Time={} μs", started.elapsed().as_micros());
    Ok(result)
}
/// Entry point: runs the Polars benchmark once.
///
/// The resulting `DataFrame` is intentionally dropped; only the timing line
/// printed by `run_polars` is of interest. A failure is reported on stderr
/// and turned into a non-zero exit status instead of being silently ignored.
fn main() {
    if let Err(err) = run_polars() {
        eprintln!("polars run failed: {err}");
        std::process::exit(1);
    }
}