added python script to build test data (#366)
* added python script to build test data * moved create_measurements.py to src/main/python and updated paths for file io * Updated readme to include blurb about python script to generate measurements
This commit is contained in:
parent
38f8b8d544
commit
e4f0891d2d
@ -237,6 +237,8 @@ Execute the following steps to run the challenge:
|
|||||||
This will take a few minutes.
|
This will take a few minutes.
|
||||||
**Attention:** the generated file has a size of approx. **12 GB**, so make sure to have enough disk space.
|
**Attention:** the generated file has a size of approx. **12 GB**, so make sure to have enough disk space.
|
||||||
|
|
||||||
|
If you're running the challenge with a non-Java language, there's a non-authoritative Python script to generate the measurements file at `src/main/python/create_measurements.py`. The authoritative method for generating the measurements is the Java program `dev.morling.onebrc.CreateMeasurements`.
|
||||||
|
|
||||||
3. Calculate the average measurement values:
|
3. Calculate the average measurement values:
|
||||||
|
|
||||||
```
|
```
|
||||||
|
143
src/main/python/create_measurements.py
Executable file
143
src/main/python/create_measurements.py
Executable file
@ -0,0 +1,143 @@
|
|||||||
|
#!/usr/bin/env python
|
||||||
|
|
||||||
|
# Based on https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CreateMeasurements.java
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import random
|
||||||
|
import time
|
||||||
|
|
||||||
|
|
||||||
|
def check_args(file_args):
    """
    Validate the command-line arguments; print usage and exit if they are invalid.

    Exactly one argument is expected after the program name, and it must parse
    as a positive integer. Underscore digit separators (e.g. ``1_000_000``) are
    accepted because ``int()`` understands them.

    :param file_args: the full argument vector, typically ``sys.argv``
    :return: None when the arguments are valid
    :raises SystemExit: with status 1, after printing usage, when they are not
    """
    try:
        valid = len(file_args) == 2 and int(file_args[1]) > 0
    except (TypeError, ValueError):
        # ValueError: the text is not an integer; TypeError: the argument
        # vector (or its entry) is not something len()/int() can handle.
        valid = False

    if not valid:
        print("Usage: create_measurements.py <positive integer number of records to create>")
        print(" You can use underscore notation for large number of records.")
        print(" For example: 1_000_000_000 for one billion")
        # Non-zero status so calling shells/scripts can detect the usage error.
        sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
def build_weather_station_name_list():
    """
    Read the station names from the example data in the repo and de-duplicate them.

    Only the text before the first ``;`` of each line is kept. Lines that
    contain ``#`` are treated as comments and skipped, as are blank lines.

    The CSV path is relative to the current working directory, so the script
    must be started from a directory three levels below the repo root
    (e.g. ``src/main/python``).

    :return: list of unique station names, in no particular order
    :raises OSError: if the CSV file cannot be opened
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default,
    # so station names containing non-ASCII characters are read consistently.
    with open('../../../data/weather_stations.csv', 'r', encoding='utf-8') as file:
        station_names = {
            line.split(';')[0]
            for line in file.read().splitlines()
            if line.strip() and "#" not in line
        }
    return list(station_names)
|
||||||
|
|
||||||
|
|
||||||
|
def convert_bytes(num):
    """
    Convert a byte count to a human-readable string (e.g., KiB, MiB, GiB).

    The value is divided by 1024 until it drops below 1024 or the largest
    known unit is reached; anything beyond that is still expressed in the
    largest unit, so a string is always returned.

    :param num: size in bytes (int or float)
    :return: the size formatted with one decimal place and a unit suffix
    """
    units = ('bytes', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB')
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    # Larger than every smaller unit: report in the biggest one instead of
    # falling off the end of the function and returning None.
    return "%3.1f %s" % (num, units[-1])
|
||||||
|
|
||||||
|
|
||||||
|
def format_elapsed_time(seconds):
    """
    Render a duration given in seconds as human-readable text.

    Under a minute the value is shown with millisecond precision; under an
    hour as whole minutes and seconds; otherwise as hours, minutes and
    seconds, with the minutes part left out when it is zero.
    """
    if seconds < 60:
        return f"{seconds:.3f} seconds"

    if seconds < 3600:
        whole_minutes, leftover = divmod(seconds, 60)
        return f"{int(whole_minutes)} minutes {int(leftover)} seconds"

    whole_hours, past_the_hour = divmod(seconds, 3600)
    whole_minutes, leftover = divmod(past_the_hour, 60)

    parts = [f"{int(whole_hours)} hours"]
    if whole_minutes != 0:
        parts.append(f"{int(whole_minutes)} minutes")
    parts.append(f"{int(leftover)} seconds")
    return " ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def estimate_file_size(weather_station_names, num_rows_to_create):
    """
    Produce a message with a rough upper estimate of the output file size.

    The per-record figure is derived from the longest and the shortest
    station name plus the length of a sample reading; it is multiplied by the
    row count and rendered through convert_bytes.
    """
    name_lengths = [len(name) for name in weather_station_names]
    longest = max(name_lengths, default=float('-inf'))
    shortest = min(name_lengths, default=float('inf'))

    sample_reading_length = len(",-123.4")
    size_per_row = ((longest + shortest * 2) + sample_reading_length) / 2
    readable_total = convert_bytes(num_rows_to_create * size_per_row)

    return f"Estimated max file size is: {readable_total}.\nTrue size is probably much smaller (around half)."
|
||||||
|
|
||||||
|
|
||||||
|
def build_test_data(weather_station_names, num_rows_to_create):
    """
    Generate the requested number of measurement rows and write them to file.

    Each row has the form ``<station>;<temperature>`` followed by a newline.
    The station is picked at random from a pool of 10,000 names sampled (with
    replacement) from ``weather_station_names``; the temperature is a random
    value between -99.9 and 99.9 rounded to one decimal. A progress bar is
    redrawn on stdout roughly every 1% of the rows, and a summary (file size,
    elapsed time) is printed at the end.

    The output path is relative to the current working directory.

    :param weather_station_names: non-empty sequence of station names
    :param num_rows_to_create: number of rows to write (positive int)
    :return: None
    :raises SystemExit: with status 1 if generating or writing the data fails
    """
    start_time = time.time()
    output_path = "../../../data/measurements.txt"
    coldest_temp = -99.9
    hottest_temp = 99.9
    station_names_10k_max = random.choices(weather_station_names, k=10_000)
    progress_step = max(1, num_rows_to_create // 100)
    print('Building test data...')

    try:
        # Pin encoding and line endings so the file has the same bytes on
        # every platform (UTF-8 names, bare "\n" terminators) instead of
        # depending on the locale and on Windows newline translation.
        with open(output_path, 'w', encoding='utf-8', newline='\n') as file:
            for s in range(num_rows_to_create):
                random_station = random.choice(station_names_10k_max)
                random_temp = round(random.uniform(coldest_temp, hottest_temp), 1)
                file.write(f"{random_station};{random_temp}\n")
                # Update progress bar every 1%
                if s % progress_step == 0 or s == num_rows_to_create - 1:
                    fraction_done = (s + 1) / num_rows_to_create
                    sys.stdout.write('\r')
                    sys.stdout.write("[%-50s] %d%%" % ('=' * int(fraction_done * 50), fraction_done * 100))
                    sys.stdout.flush()
        sys.stdout.write('\n')
    except Exception as e:
        print("Something went wrong. Printing error info and exiting...")
        print(e)
        # Non-zero status so callers can tell the build failed.
        sys.exit(1)

    end_time = time.time()
    elapsed_time = end_time - start_time
    file_size = os.path.getsize(output_path)
    human_file_size = convert_bytes(file_size)

    print("Test data successfully written to 1brc/data/measurements.txt")
    print(f"Actual file size:  {human_file_size}")
    print(f"Elapsed time: {format_elapsed_time(elapsed_time)}")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """
    Entry point: validate the arguments, print the size estimate, build the data.
    """
    check_args(sys.argv)
    row_count = int(sys.argv[1])
    stations = build_weather_station_name_list()
    print(estimate_file_size(stations, row_count))
    build_test_data(stations, row_count)
    print("Test data build complete.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Use sys.exit rather than the bare exit() helper: exit() is injected by
    # the site module for interactive use and is absent when site is disabled.
    # main() returns None, so the process still ends with status 0 on success.
    sys.exit(main())
|
Loading…
Reference in New Issue
Block a user