Generate measurements with random names
Name length goes from 1 to 100.
This commit is contained in:
parent
d8b300b677
commit
0f1f204a0d
19
create_measurements3.sh
Executable file
19
create_measurements3.sh
Executable file
@ -0,0 +1,19 @@
|
|||||||
|
#!/bin/sh
|
||||||
|
#
|
||||||
|
# Copyright 2023 The original authors
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
#
|
||||||
|
|
||||||
|
|
||||||
|
# Launch the generator with the record count given as the first argument.
# ${1:+"$1"} passes the argument as a single quoted word (no word splitting or
# glob expansion) and passes nothing at all when it is missing or empty, so the
# Java program still prints its usage message in that case.
java --class-path target/average-1.0.0-SNAPSHOT.jar dev.morling.onebrc.CreateMeasurements3 ${1:+"$1"}
|
44691
data/weather_stations.csv
Normal file
44691
data/weather_stations.csv
Normal file
File diff suppressed because it is too large
Load Diff
128
src/main/java/dev/morling/onebrc/CreateMeasurements3.java
Normal file
128
src/main/java/dev/morling/onebrc/CreateMeasurements3.java
Normal file
@ -0,0 +1,128 @@
|
|||||||
|
/*
|
||||||
|
* Copyright 2023 The original authors
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
package dev.morling.onebrc;
|
||||||
|
|
||||||
|
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.StringReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.concurrent.ThreadLocalRandom;
|
||||||
|
|
||||||
|
/**
 * Writes {@code measurements.txt} with lines of the form {@code <station name>;<temperature>}.
 *
 * <p>Station names are synthesized: the name column of {@code data/weather_stations.csv} is
 * concatenated into one long string, which is then cut into pieces of random length
 * (1 to {@link #MAX_NAME_LEN} characters). Names are not de-duplicated, so the number of
 * distinct names can be smaller than the number of generated stations.
 */
public class CreateMeasurements3 {

    /** Upper bound for the length (in chars) of a generated station name. */
    public static final int MAX_NAME_LEN = 100;

    /** Maximum number of stations to generate; fewer are generated if the CSV has fewer rows. */
    public static final int KEYSET_SIZE = 10_000;

    private static final String STATIONS_FILE = "data/weather_stations.csv";
    private static final String USAGE = "Usage: create_measurements3.sh <number of records to create>";

    /**
     * Entry point.
     *
     * @param args exactly one element: the number of records to create
     * @throws Exception if the station file cannot be read or the output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        if (args.length != 1) {
            System.out.println(USAGE);
            System.exit(1);
        }
        int size = 0;
        try {
            size = Integer.parseInt(args[0]);
        }
        catch (NumberFormatException e) {
            System.out.println("Invalid value for <number of records to create>");
            System.out.println(USAGE);
            System.exit(1);
        }
        final var weatherStations = generateWeatherStations();
        // Pick from the stations actually generated; this can be fewer than KEYSET_SIZE.
        final var stationCount = weatherStations.size();
        final var start = System.currentTimeMillis();
        final var rnd = ThreadLocalRandom.current();
        try (var out = new BufferedWriter(new FileWriter("measurements.txt", StandardCharsets.UTF_8))) {
            // The counter is a long: an int counter would overflow and loop forever
            // when size == Integer.MAX_VALUE.
            for (long i = 1; i <= size; i++) {
                var station = weatherStations.get(rnd.nextInt(stationCount));
                double temp = rnd.nextGaussian(station.avgTemp(), 7.0);
                out.write(station.name());
                out.write(';');
                // Round to one decimal place.
                out.write(Double.toString(Math.round(temp * 10.0) / 10.0));
                out.newLine();
                if (i % 50_000_000 == 0) {
                    System.out.printf("Wrote %,d measurements in %,d ms%n", i, System.currentTimeMillis() - start);
                }
            }
        }
    }

    /**
     * A generated station.
     *
     * @param name the synthesized station name
     * @param avgTemp the mean used for the Gaussian temperature distribution of this station
     */
    record WeatherStation(String name, float avgTemp) {
    }

    /**
     * Builds up to {@link #KEYSET_SIZE} stations with random names.
     *
     * <p>One station is produced per CSV row, in file order. The row only contributes its
     * second column (parsed as latitude); the name is cut from the concatenation of all
     * name columns.
     *
     * @return the generated stations, never empty
     * @throws IllegalStateException if the CSV is empty, a row has no {@code ';'}, or the
     *         concatenated names run out before all stations are named
     * @throws Exception if the CSV cannot be read
     */
    private static ArrayList<WeatherStation> generateWeatherStations() throws Exception {
        final var rows = readStationRows();
        if (rows.isEmpty()) {
            throw new IllegalStateException("No rows found in " + STATIONS_FILE);
        }

        // Use a public list of city names and concatenate them all into a long string,
        // which we'll use as a "source of city name randomness"
        // Source: https://simplemaps.com/data/world-cities
        var bigName = new StringBuilder(1 << 20);
        for (var row : rows) {
            bigName.append(row, 0, separatorIndex(row));
        }

        final var nameSource = new StringReader(bigName.toString());
        final var buf = new char[MAX_NAME_LEN];
        final var rnd = ThreadLocalRandom.current();
        final double yOffset = 4;
        final double factor = 2500;
        final double xOffset = 0.372;
        final double power = 7;

        final var stationCount = Math.min(KEYSET_SIZE, rows.size());
        final var weatherStations = new ArrayList<WeatherStation>(stationCount);
        var minLen = Integer.MAX_VALUE;
        var maxLen = Integer.MIN_VALUE;
        for (int i = 0; i < stationCount; i++) {
            var row = rows.get(i);
            // Use a 7th-order curve to simulate the name length distribution.
            // It gives us mostly short names, but with large outliers.
            // For nextDouble() in [0, 1) the result is between 1 and 100.
            var nameLen = (int) (yOffset + factor * Math.pow(rnd.nextDouble() - xOffset, power));
            minLen = Integer.min(minLen, nameLen);
            maxLen = Integer.max(maxLen, nameLen);
            var name = nextName(nameSource, buf, nameLen);
            if (name.indexOf(';') != -1) {
                throw new IllegalStateException("Station name contains a semicolon!");
            }
            var lat = Float.parseFloat(row.substring(separatorIndex(row) + 1));
            // Guesstimate mean temperature using cosine of latitude
            var avgTemp = (float) (30 * Math.cos(Math.toRadians(lat))) - 10;
            weatherStations.add(new WeatherStation(name, avgTemp));
        }
        System.out.format("Generated %,d station names with length from %,d to %,d%n",
                weatherStations.size(), minLen, maxLen);
        return weatherStations;
    }

    /** Reads every line of the station CSV, decoding it as UTF-8. */
    private static ArrayList<String> readStationRows() throws Exception {
        final var rows = new ArrayList<String>();
        try (var reader = new BufferedReader(new FileReader(STATIONS_FILE, StandardCharsets.UTF_8))) {
            for (var row = reader.readLine(); row != null; row = reader.readLine()) {
                rows.add(row);
            }
        }
        return rows;
    }

    /** Returns the index of the first {@code ';'} in the row, failing clearly if there is none. */
    private static int separatorIndex(String row) {
        var index = row.indexOf(';');
        if (index == -1) {
            throw new IllegalStateException("Malformed row in " + STATIONS_FILE + " (no ';' separator): " + row);
        }
        return index;
    }

    /**
     * Cuts the next name of exactly {@code nameLen} chars from the name source.
     *
     * <p>The first chunk is trimmed; the chars lost to trimming are replaced by the
     * following non-space chars of the source, so a name never starts or ends with a space.
     */
    private static String nextName(StringReader nameSource, char[] buf, int nameLen) throws Exception {
        var count = nameSource.read(buf, 0, nameLen);
        if (count == -1) {
            throw new IllegalStateException("Name source exhausted");
        }
        // Only the first `count` chars were filled by this read; anything beyond is stale.
        var name = new StringBuilder(nameLen).append(new String(buf, 0, count).trim());
        while (name.length() < nameLen) {
            var next = nameSource.read();
            if (next == -1) {
                throw new IllegalStateException("Name source exhausted");
            }
            var ch = (char) next;
            if (ch != ' ') {
                name.append(ch);
            }
        }
        return name.toString();
    }
}
|
Loading…
Reference in New Issue
Block a user