From e4f0891d2dddff9461945cc83fe36b36c26dba4a Mon Sep 17 00:00:00 2001
From: Eadrom <Eadrom@users.noreply.github.com>
Date: Sun, 14 Jan 2024 06:31:46 -0700
Subject: [PATCH] added python script to build test data (#366)

* added python script to build test data

* moved create_measurements.py to src/main/python and updated paths for file io

* Updated readme to include blurb about python script to generate measurements
---
 README.md                              |   2 +
 src/main/python/create_measurements.py | 143 +++++++++++++++++++++++++
 2 files changed, 145 insertions(+)
 create mode 100755 src/main/python/create_measurements.py

diff --git a/README.md b/README.md
index 17507da..7c69007 100644
--- a/README.md
+++ b/README.md
@@ -237,6 +237,8 @@ Execute the following steps to run the challenge:
     This will take a few minutes.
     **Attention:** the generated file has a size of approx. **12 GB**, so make sure to have enough diskspace.
 
+    If you're running the challenge with a non-Java language, there's a non-authoritative Python script to generate the measurements file at `src/main/python/create_measurements.py`. The authoritative method for generating the measurements is the Java program `dev.morling.onebrc.CreateMeasurements`.
+
 3. Calculate the average measurement values:
 
     ```
diff --git a/src/main/python/create_measurements.py b/src/main/python/create_measurements.py
new file mode 100755
index 0000000..f48972a
--- /dev/null
+++ b/src/main/python/create_measurements.py
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+# Based on https://github.com/gunnarmorling/1brc/blob/main/src/main/java/dev/morling/onebrc/CreateMeasurements.java
+
+import os
+import sys
+import random
+import time
+
+
+def check_args(file_args):
+    """Print the usage text and exit with status 1 unless file_args is [script, <positive int>]."""
+    try:
+        valid = len(file_args) == 2 and int(file_args[1]) > 0
+    except (TypeError, ValueError):  # argument is not an integer literal, e.g. "abc"
+        valid = False
+    if not valid:
+        print("Usage:  create_measurements.py <positive integer number of records to create>")
+        print("        You can use underscore notation for large number of records.")
+        print("        For example:  1_000_000_000 for one billion")
+        # Non-zero status lets shell callers detect the rejected invocation.
+        sys.exit(1)
+
+
+def build_weather_station_name_list():
+    """
+    Return the unique station names (the text before the first ';' of each line)
+    read from ../../../data/weather_stations.csv, relative to the current directory.
+    """
+    # Explicit encoding so the result does not depend on the platform locale
+    # (assumes the CSV is UTF-8 -- TODO confirm against the data file).
+    with open('../../../data/weather_stations.csv', 'r', encoding='utf-8') as file:
+        station_names = {
+            line.split(';')[0] for line in file.read().splitlines()
+            if line and "#" not in line  # skip blank lines and '#' comment lines
+        }
+    return list(station_names)
+
+
+def convert_bytes(num):
+    """Format a byte count as 'N.N unit', dividing by 1024 per step (bytes up to TiB)."""
+    for unit in ('bytes', 'KiB', 'MiB', 'GiB'):
+        if num < 1024.0:
+            return "%3.1f %s" % (num, unit)
+        num /= 1024.0
+    # Past GiB: report TiB rather than falling off the loop and returning None.
+    return "%3.1f %s" % (num, 'TiB')
+
+
+def format_elapsed_time(seconds):
+    """
+    Describe a duration in seconds as text: three decimals below one minute,
+    whole minutes/seconds below one hour, and hours beyond that (the minutes
+    part is left out when it is zero).
+    """
+    if seconds < 60:
+        return f"{seconds:.3f} seconds"
+    if seconds < 3600:
+        mins, secs = divmod(seconds, 60)
+        return f"{int(mins)} minutes {int(secs)} seconds"
+    hrs, leftover = divmod(seconds, 3600)
+    mins, secs = divmod(leftover, 60)
+    if mins != 0:
+        return f"{int(hrs)} hours {int(mins)} minutes {int(secs)} seconds"
+    return f"{int(hrs)} hours {int(secs)} seconds"
+
+
+def estimate_file_size(weather_station_names, num_rows_to_create):
+    """
+    Return the message reporting the estimated size of the file to be generated.
+
+    Per-record size is (longest name + 2 * shortest name + 7) / 2, with name
+    lengths counted in characters and 7 being the length of ",-123.4"; an
+    empty name collection gives an estimate of 0 bytes.
+    """
+    # Builtin min/max over the lengths replace the manual running extremes,
+    # so the per-record size is computed once instead of on every iteration.
+    name_lengths = [len(station) for station in weather_station_names]
+    per_record_size = 0
+    if name_lengths:
+        max_string = max(name_lengths)
+        min_string = min(name_lengths)
+        per_record_size = ((max_string + min_string * 2) + len(",-123.4")) / 2
+
+    total_file_size = num_rows_to_create * per_record_size
+    human_file_size = convert_bytes(total_file_size)
+    return f"Estimated max file size is:  {human_file_size}.\nTrue size is probably much smaller (around half)."
+
+
+def build_test_data(weather_station_names, num_rows_to_create):
+    """
+    Write num_rows_to_create "station;temperature" lines to ../../../data/measurements.txt
+    (relative to the current directory); prints the error and exits with status 1 on failure.
+    """
+    start_time = time.time()
+    coldest_temp = -99.9
+    hottest_temp = 99.9
+    output_path = "../../../data/measurements.txt"
+    station_names_10k_max = random.choices(weather_station_names, k=10_000)
+    progress_step = max(1, int(num_rows_to_create / 100))
+    print('Building test data...')
+
+    try:
+        # Explicit UTF-8 and '\n' keep the output identical on every platform.
+        with open(output_path, 'w', encoding='utf-8', newline='\n') as file:
+            for s in range(num_rows_to_create):
+                random_station = random.choice(station_names_10k_max)
+                random_temp = round(random.uniform(coldest_temp, hottest_temp), 1)
+                file.write(f"{random_station};{random_temp}\n")
+                # Update progress bar every 1%
+                if s % progress_step == 0 or s == num_rows_to_create - 1:
+                    done = (s + 1) / num_rows_to_create
+                    sys.stdout.write("\r[%-50s] %d%%" % ('=' * int(done * 50), done * 100))
+                    sys.stdout.flush()
+        sys.stdout.write('\n')
+    except Exception as e:
+        print("Something went wrong. Printing error info and exiting...")
+        print(e)
+        sys.exit(1)  # non-zero status so callers can tell the build failed
+
+    elapsed_time = time.time() - start_time
+    human_file_size = convert_bytes(os.path.getsize(output_path))
+    print("Test data successfully written to 1brc/data/measurements.txt")
+    print(f"Actual file size:  {human_file_size}")
+    print(f"Elapsed time: {format_elapsed_time(elapsed_time)}")
+
+
+def main():
+    """
+    Script entry point: validate argv, load the station names, print the size
+    estimate, then generate the measurements file.
+    """
+    check_args(sys.argv)
+    row_count = int(sys.argv[1])
+    stations = build_weather_station_name_list()
+    print(estimate_file_size(stations, row_count))
+    build_test_data(stations, row_count)
+    print("Test data build complete.")
+
+
+# Run only when executed as a script; importing this module must not terminate the importer.
+if __name__ == "__main__":
+    main()