Go implementation by AlexanderYastrebov (#298)

* Go implementation by AlexanderYastrebov

This is a proof-of-concept to demonstrate non-java submission.
It requires Docker with BuildKit plugin to build and export binary.

Updates
* #67
* #253

* Use collision-free id lookup

* Use number of buckets greater than max number of keys
This commit is contained in:
Alexander Yastrebov 2024-01-23 20:49:28 +01:00 committed by GitHub
parent 464ba6209b
commit 4daeb94b04
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
9 changed files with 492 additions and 0 deletions

1
.gitignore vendored
View File

@ -46,3 +46,4 @@ out/
/measurements*.txt /measurements*.txt
/*.out /*.out
out_expected.txt out_expected.txt
/*-timing.json

View File

@ -0,0 +1,20 @@
#!/bin/bash
#
# Copyright 2023 The original authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
INPUT=${1:-"measurements.txt"}
target/AlexanderYastrebov/1brc "$INPUT"

View File

@ -53,3 +53,4 @@ vemana;Subrahmanyam
jincongho;Jin Cong Ho jincongho;Jin Cong Ho
yonatang;Yonatan Graber yonatang;Yonatan Graber
adriacabeza;Adrià Cabeza adriacabeza;Adrià Cabeza
AlexanderYastrebov;Alexander Yastrebov

18
prepare_AlexanderYastrebov.sh Executable file
View File

@ -0,0 +1,18 @@
#!/bin/bash
#
# Copyright 2023 The original authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
DOCKER_BUILDKIT=1 docker build -o target/AlexanderYastrebov src/main/go/AlexanderYastrebov

View File

@ -0,0 +1,22 @@
#
# Copyright 2023 The original authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
FROM golang AS build-stage
COPY . src/
RUN cd src && go build .
FROM scratch AS export-stage
COPY --from=build-stage /go/src/1brc /

View File

@ -0,0 +1,58 @@
# 1brc in go
It uses Docker with BuildKit plugin to build and [export binary](https://docs.docker.com/engine/reference/commandline/build/#output) binary,
see [prepare_AlexanderYastrebov.sh](../../../../prepare_AlexanderYastrebov.sh)
and [calculate_average_AlexanderYastrebov.sh](../../../../calculate_average_AlexanderYastrebov.sh).
Demo:
```sh
$ ./test.sh AlexanderYastrebov
[+] Building 0.2s (9/9) FINISHED
=> [internal] load .dockerignore 0.0s
=> => transferring context: 2B 0.0s
=> [internal] load build definition from Dockerfile 0.0s
=> => transferring dockerfile: 172B 0.0s
=> [internal] load metadata for docker.io/library/golang:latest 0.0s
=> [internal] load build context 0.0s
=> => transferring context: 145B 0.0s
=> [build-stage 1/3] FROM docker.io/library/golang 0.0s
=> CACHED [build-stage 2/3] COPY . src/ 0.0s
=> CACHED [build-stage 3/3] RUN cd src && go build . 0.0s
=> CACHED [export-stage 1/1] COPY --from=build-stage /go/src/1brc / 0.0s
=> exporting to client directory 0.1s
=> => copying files 2.03MB 0.0s
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-10000-unique-keys.txt
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-10.txt
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-1.txt
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-20.txt
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-2.txt
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-3.txt
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-boundaries.txt
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-complex-utf8.txt
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-dot.txt
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-shortest.txt
Validating calculate_average_AlexanderYastrebov.sh -- src/test/resources/samples/measurements-short.txt
# Run once to setup the benchmark
# ./create_measurements.sh 1000000000
# mv measurements.txt measurements_1B.txt
# ln -s measurements_1B.txt measurements.txt
# ./calculate_average_baseline.sh > out_expected.txt
$ wc -l measurements_1B.txt
1000000000 measurements_1B.txt
$ ./evaluate2.sh AlexanderYastrebov royvanrijn
... 0.0s
Benchmark 1: ./calculate_average_AlexanderYastrebov.sh 2>&1
Time (mean ± σ): 16.786 s ± 0.545 s [User: 56.030 s, System: 10.068 s]
Range (min … max): 15.918 s … 17.309 s 5 runs
...
Benchmark 1: ./calculate_average_royvanrijn.sh 2>&1
Time (mean ± σ): 16.731 s ± 0.190 s [User: 56.485 s, System: 10.279 s]
Range (min … max): 16.490 s … 16.951 s 5 runs
Summary
AlexanderYastrebov: trimmed mean 16.901712789513336, raw times 16.69836470718,17.30911065018,16.83413600418,15.91787706218,17.17263765718
royvanrijn: trimmed mean 16.738037123633333, raw times 16.4900939703,16.9513459953,16.5794539913,16.8297746273,16.8048827523
```

View File

@ -0,0 +1,283 @@
package main
import (
"bytes"
"fmt"
"log"
"math"
"os"
"runtime"
"sort"
"sync"
"syscall"
)
type measurement struct {
min, max, sum, count int64
}
func main() {
if len(os.Args) != 2 {
log.Fatalf("Missing measurements filename")
}
measurements := processFile(os.Args[1])
ids := make([]string, 0, len(measurements))
for id := range measurements {
ids = append(ids, id)
}
sort.Strings(ids)
fmt.Print("{")
for i, id := range ids {
if i > 0 {
fmt.Print(", ")
}
m := measurements[id]
fmt.Printf("%s=%.1f/%.1f/%.1f", id, round(float64(m.min)/10.0), round(float64(m.sum)/10.0/float64(m.count)), round(float64(m.max)/10.0))
}
fmt.Println("}")
}
func processFile(filename string) map[string]*measurement {
f, err := os.Open(filename)
if err != nil {
log.Fatalf("Open: %v", err)
}
defer f.Close()
fi, err := f.Stat()
if err != nil {
log.Fatalf("Stat: %v", err)
}
size := fi.Size()
if size <= 0 || size != int64(int(size)) {
log.Fatalf("Invalid file size: %d", size)
}
data, err := syscall.Mmap(int(f.Fd()), 0, int(size), syscall.PROT_READ, syscall.MAP_SHARED)
if err != nil {
log.Fatalf("Mmap: %v", err)
}
defer func() {
if err := syscall.Munmap(data); err != nil {
log.Fatalf("Munmap: %v", err)
}
}()
return process(data)
}
func process(data []byte) map[string]*measurement {
nChunks := runtime.NumCPU()
chunkSize := len(data) / nChunks
if chunkSize == 0 {
chunkSize = len(data)
}
chunks := make([]int, 0, nChunks)
offset := 0
for offset < len(data) {
offset += chunkSize
if offset >= len(data) {
chunks = append(chunks, len(data))
break
}
nlPos := bytes.IndexByte(data[offset:], '\n')
if nlPos == -1 {
chunks = append(chunks, len(data))
break
} else {
offset += nlPos + 1
chunks = append(chunks, offset)
}
}
var wg sync.WaitGroup
wg.Add(len(chunks))
results := make([]map[string]*measurement, len(chunks))
start := 0
for i, chunk := range chunks {
go func(data []byte, i int) {
results[i] = processChunk(data)
wg.Done()
}(data[start:chunk], i)
start = chunk
}
wg.Wait()
measurements := make(map[string]*measurement)
for _, r := range results {
for id, rm := range r {
m := measurements[id]
if m == nil {
measurements[id] = rm
} else {
m.min = min(m.min, rm.min)
m.max = max(m.max, rm.max)
m.sum += rm.sum
m.count += rm.count
}
}
}
return measurements
}
func processChunk(data []byte) map[string]*measurement {
// Use fixed size linear probe lookup table
const (
// use power of 2 for fast modulo calculation,
// should be larger than max number of keys which is 10_000
entriesSize = 1 << 14
// use FNV-1a hash
fnv1aOffset64 = 14695981039346656037
fnv1aPrime64 = 1099511628211
)
type entry struct {
m measurement
hash uint64
vlen int
value [128]byte // use power of 2 > 100 for alignment
}
entries := make([]entry, entriesSize)
entriesCount := 0
// keep short and inlinable
getMeasurement := func(hash uint64, value []byte) *measurement {
i := hash & uint64(entriesSize-1)
entry := &entries[i]
// bytes.Equal could be commented to speedup assuming no hash collisions
for entry.vlen > 0 && !(entry.hash == hash && bytes.Equal(entry.value[:entry.vlen], value)) {
i = (i + 1) & uint64(entriesSize-1)
entry = &entries[i]
}
if entry.vlen == 0 {
entry.hash = hash
entry.vlen = copy(entry.value[:], value)
entriesCount++
}
return &entry.m
}
// assume valid input
for len(data) > 0 {
idHash := uint64(fnv1aOffset64)
semiPos := 0
for i, b := range data {
if b == ';' {
semiPos = i
break
}
// calculate FNV-1a hash
idHash ^= uint64(b)
idHash *= fnv1aPrime64
}
idData := data[:semiPos]
data = data[semiPos+1:]
var temp int64
// parseNumber
{
negative := data[0] == '-'
if negative {
data = data[1:]
}
_ = data[3]
if data[1] == '.' {
// 1.2\n
temp = int64(data[0])*10 + int64(data[2]) - '0'*(10+1)
data = data[4:]
// 12.3\n
} else {
_ = data[4]
temp = int64(data[0])*100 + int64(data[1])*10 + int64(data[3]) - '0'*(100+10+1)
data = data[5:]
}
if negative {
temp = -temp
}
}
m := getMeasurement(idHash, idData)
if m.count == 0 {
m.min = temp
m.max = temp
m.sum = temp
m.count = 1
} else {
m.min = min(m.min, temp)
m.max = max(m.max, temp)
m.sum += temp
m.count++
}
}
result := make(map[string]*measurement, entriesCount)
for i := range entries {
entry := &entries[i]
if entry.m.count > 0 {
result[string(entry.value[:entry.vlen])] = &entry.m
}
}
return result
}
func round(x float64) float64 {
return roundJava(x*10.0) / 10.0
}
// roundJava returns the closest integer to the argument, with ties
// rounding to positive infinity, see java's Math.round
func roundJava(x float64) float64 {
t := math.Trunc(x)
if x < 0.0 && t-x == 0.5 {
//return t
} else if math.Abs(x-t) >= 0.5 {
t += math.Copysign(1, x)
}
if t == 0 { // check -0
return 0.0
}
return t
}
// parseNumber reads decimal number that matches "^-?[0-9]{1,2}[.][0-9]" pattern,
// e.g.: -12.3, -3.4, 5.6, 78.9 and return the value*10, i.e. -123, -34, 56, 789.
func parseNumber(data []byte) int64 {
negative := data[0] == '-'
if negative {
data = data[1:]
}
var result int64
switch len(data) {
// 1.2
case 3:
result = int64(data[0])*10 + int64(data[2]) - '0'*(10+1)
// 12.3
case 4:
result = int64(data[0])*100 + int64(data[1])*10 + int64(data[3]) - '0'*(100+10+1)
}
if negative {
return -result
}
return result
}

View File

@ -0,0 +1,86 @@
package main
import (
"fmt"
"os"
"testing"
)
func TestRoundJava(t *testing.T) {
for _, tc := range []struct {
value float64
expected string
}{
{value: -1.5, expected: "-1.0"},
{value: -1.0, expected: "-1.0"},
{value: -0.7, expected: "-1.0"},
{value: -0.5, expected: "0.0"},
{value: -0.3, expected: "0.0"},
{value: 0.0, expected: "0.0"},
{value: 0.3, expected: "0.0"},
{value: 0.5, expected: "1.0"},
{value: 0.7, expected: "1.0"},
{value: 1.0, expected: "1.0"},
{value: 1.5, expected: "2.0"},
} {
if rounded := roundJava(tc.value); fmt.Sprintf("%.1f", rounded) != tc.expected {
t.Errorf("Wrong rounding of %v, expected: %s, got: %.1f", tc.value, tc.expected, rounded)
}
}
}
func TestParseNumber(t *testing.T) {
for _, tc := range []struct {
value string
expected string
}{
{value: "-99.9", expected: "-999"},
{value: "-12.3", expected: "-123"},
{value: "-1.5", expected: "-15"},
{value: "-1.0", expected: "-10"},
{value: "0.0", expected: "0"},
{value: "0.3", expected: "3"},
{value: "12.3", expected: "123"},
{value: "99.9", expected: "999"},
} {
if number := parseNumber([]byte(tc.value)); fmt.Sprintf("%d", number) != tc.expected {
t.Errorf("Wrong parsing of %v, expected: %s, got: %d", tc.value, tc.expected, number)
}
}
}
var parseNumberSink int64
func BenchmarkParseNumber(b *testing.B) {
data1 := []byte("1.2")
data2 := []byte("-12.3")
for i := 0; i < b.N; i++ {
parseNumberSink = parseNumber(data1) + parseNumber(data2)
}
}
func BenchmarkProcess(b *testing.B) {
// $ ./create_measurements.sh 1000000 && mv measurements.txt measurements-1e6.txt
// Created file with 1,000,000 measurements in 514 ms
const filename = "../../../../measurements-1e6.txt"
data, err := os.ReadFile(filename)
if err != nil {
b.Fatal(err)
}
measurements := process(data)
rows := int64(0)
for _, m := range measurements {
rows += m.count
}
b.ReportAllocs()
b.ResetTimer()
b.ReportMetric(float64(rows), "rows/op")
for i := 0; i < b.N; i++ {
process(data)
}
}

View File

@ -0,0 +1,3 @@
module github.com/AlexanderYastrebov/1brc
go 1.21.5