From e4f0891d2dddff9461945cc83fe36b36c26dba4a Mon Sep 17 00:00:00 2001 From: Eadrom <Eadrom@users.noreply.github.com> Date: Sun, 14 Jan 2024 06:31:46 -0700 Subject: [PATCH] added python script to build test data (#366) * added python script to build test data * moved create_measurements.py to src/main/python and updated paths for file io * Updated readme to include blurb about python script to generate measurements --- README.md | 2 + src/main/python/create_measurements.py | 143 +++++++++++++++++++++++++ 2 files changed, 145 insertions(+) create mode 100755 src/main/python/create_measurements.py diff --git a/README.md b/README.md index 17507da..7c69007 100644 --- a/README.md +++ b/README.md @@ -237,6 +237,8 @@ Execute the following steps to run the challenge: This will take a few minutes. **Attention:** the generated file has a size of approx. **12 GB**, so make sure to have enough diskspace. + If you're running the challenge with a non-Java language, there's a non-authoritative Python script to generate the measurements file at `src/main/python/create_measurements.py`. The authoritative method for generating the measurements is the Java program `dev.morling.onebrc.CreateMeasurements`. + 3. 
#!/usr/bin/env python3
"""Generate a measurements file for the One Billion Row Challenge (1BRC).

This is a non-authoritative Python counterpart of the Java generator.

Usage:
    create_measurements.py <positive integer number of records to create>
"""

# Based on https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CreateMeasurements.java

import os
import random
import sys
import time

# The script lives in src/main/python/, so the repository's data directory is
# three levels up. Resolving it from the script location (instead of the
# current working directory) lets the script run from anywhere.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
DATA_DIR = os.path.normpath(os.path.join(_SCRIPT_DIR, "..", "..", "..", "data"))
WEATHER_STATIONS_PATH = os.path.join(DATA_DIR, "weather_stations.csv")
MEASUREMENTS_PATH = os.path.join(DATA_DIR, "measurements.txt")

# Heuristic per-record overhead used by estimate_file_size(): separator,
# sign, three digits, decimal point and one decimal (7 characters).
_RECORD_OVERHEAD = len(",-123.4")


def check_args(file_args: list) -> None:
    """Validate the command line and exit with a usage message if invalid.

    Args:
        file_args: The argument vector (``sys.argv`` style). It must contain
            exactly two items, the second being a positive integer literal
            (underscore notation such as ``1_000_000`` is accepted).

    Raises:
        SystemExit: With status 1 when the arguments are invalid.
    """
    try:
        valid = len(file_args) == 2 and int(file_args[1]) > 0
    except (TypeError, ValueError):
        valid = False

    if not valid:
        print("Usage:  create_measurements.py <positive integer number of records to create>")
        print("        You can use underscore notation for large number of records.")
        print("        For example:  1_000_000_000 for one billion")
        sys.exit(1)


def build_weather_station_name_list(stations_path=None) -> list:
    """Read the weather station names from the example CSV and de-duplicate.

    Args:
        stations_path: Path of the ``name;value`` CSV file. Defaults to
            ``data/weather_stations.csv`` in the repository.

    Returns:
        The unique station names, in order of first appearance. Comment
        lines (starting with ``#``) and blank lines are skipped.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    if stations_path is None:
        stations_path = WEATHER_STATIONS_PATH

    # A dict keeps insertion order, giving a deterministic de-duplicated list.
    unique_names = {}
    # Station names contain non-ASCII characters; do not rely on the locale.
    with open(stations_path, "r", encoding="utf-8") as file:
        for line in file:
            line = line.rstrip("\r\n")
            if not line.strip() or line.startswith("#"):
                continue
            name = line.split(";")[0]
            if name:
                unique_names[name] = None
    return list(unique_names)


def convert_bytes(num: float) -> str:
    """Convert a byte count to a human-readable string (e.g. KiB, MiB, GiB).

    Values beyond the largest known unit are expressed in that unit rather
    than falling through and returning ``None``.
    """
    units = ("bytes", "KiB", "MiB", "GiB", "TiB", "PiB")
    for unit in units[:-1]:
        if num < 1024.0:
            return "%3.1f %s" % (num, unit)
        num /= 1024.0
    return "%3.1f %s" % (num, units[-1])


def format_elapsed_time(seconds: float) -> str:
    """Format an elapsed time given in seconds in a human-readable form."""
    if seconds < 60:
        return f"{seconds:.3f} seconds"
    if seconds < 3600:
        minutes, seconds = divmod(seconds, 60)
        return f"{int(minutes)} minutes {int(seconds)} seconds"

    hours, remainder = divmod(seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    if minutes == 0:
        return f"{int(hours)} hours {int(seconds)} seconds"
    return f"{int(hours)} hours {int(minutes)} minutes {int(seconds)} seconds"


def estimate_file_size(weather_station_names: list, num_rows_to_create: int) -> str:
    """Return a rough, human-readable estimate of the output file size.

    The per-record size is a heuristic derived from the longest and shortest
    station name lengths (in characters) plus a fixed overhead for the
    separator and temperature. An empty name list yields an estimate of 0.
    """
    name_lengths = [len(station) for station in weather_station_names]
    if name_lengths:
        longest = max(name_lengths)
        shortest = min(name_lengths)
        per_record_size = ((longest + shortest * 2) + _RECORD_OVERHEAD) / 2
    else:
        per_record_size = 0

    human_file_size = convert_bytes(num_rows_to_create * per_record_size)
    return (
        f"Estimated max file size is:  {human_file_size}.\n"
        "True size is probably much smaller (around half)."
    )


def build_test_data(weather_station_names: list, num_rows_to_create: int, output_path=None) -> None:
    """Generate the requested number of measurement rows and write them out.

    Each row has the form ``<station>;<temperature>`` with exactly one
    fractional digit, terminated by a single ``\\n``.

    Args:
        weather_station_names: Candidate station names; must not be empty.
        num_rows_to_create: Number of rows to write.
        output_path: Destination file. Defaults to ``data/measurements.txt``
            in the repository.

    Raises:
        ValueError: If ``weather_station_names`` is empty.
        SystemExit: With status 1 if the file cannot be written.
    """
    if not weather_station_names:
        raise ValueError("weather_station_names must not be empty")
    if output_path is None:
        output_path = MEASUREMENTS_PATH

    start_time = time.perf_counter()
    coldest_temp = -99.9
    hottest_temp = 99.9
    # Draw the pool of stations actually used (10k draws, with replacement).
    station_names_10k_max = random.choices(weather_station_names, k=10_000)
    progress_step = max(1, num_rows_to_create // 100)
    print("Building test data...")

    try:
        # UTF-8 and "\n" are forced so the output is identical on every OS.
        with open(output_path, "w", encoding="utf-8", newline="\n") as file:
            for row in range(num_rows_to_create):
                random_station = random.choice(station_names_10k_max)
                # `or 0.0` normalises negative zero so "-0.0" is never written.
                random_temp = round(random.uniform(coldest_temp, hottest_temp), 1) or 0.0
                file.write(f"{random_station};{random_temp:.1f}\n")
                # Update progress bar every 1%
                if row % progress_step == 0 or row == num_rows_to_create - 1:
                    done = (row + 1) / num_rows_to_create
                    sys.stdout.write("\r")
                    sys.stdout.write("[%-50s] %d%%" % ("=" * int(done * 50), done * 100))
                    sys.stdout.flush()
        sys.stdout.write("\n")
    except OSError as e:
        print("Something went wrong. Printing error info and exiting...")
        print(e)
        sys.exit(1)

    elapsed_time = time.perf_counter() - start_time
    human_file_size = convert_bytes(os.path.getsize(output_path))

    print(f"Test data successfully written to {output_path}")
    print(f"Actual file size:  {human_file_size}")
    print(f"Elapsed time: {format_elapsed_time(elapsed_time)}")


def main() -> None:
    """Validate arguments, print a size estimate and build the test data."""
    check_args(sys.argv)
    num_rows_to_create = int(sys.argv[1])
    try:
        weather_station_names = build_weather_station_name_list()
    except OSError as e:
        print("Could not read the weather station list. Printing error info and exiting...")
        print(e)
        sys.exit(1)
    print(estimate_file_size(weather_station_names, num_rows_to_create))
    build_test_data(weather_station_names, num_rows_to_create)
    print("Test data build complete.")


if __name__ == "__main__":
    main()