Faster version of the data generator

This commit is contained in:
Rene Schwietzke
2024-01-03 00:13:11 +01:00
committed by Gunnar Morling
parent 7d485d0e8b
commit 04bd2d69b6
5 changed files with 1435 additions and 2 deletions

View File

@@ -0,0 +1,715 @@
/*
* Copyright 2023 The original authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.rschwietzke;
import java.util.Arrays;
/**
 * <p>A mutable, growable character buffer for all areas where performance and
 * memory efficiency are key. XMLString compatibility remains in place in case
 * one has used that in their own code.
 *
 * <p>This buffer is mutable and when you use it, make sure you work with
 * it responsibly. In many cases, the buffer is reused to avoid fresh
 * memory allocations, hence you have to pay attention to its usage pattern.
 * It is not meant to be a general String replacement.
 *
 * <p>This class avoids many of the standard runtime checks that would result
 * in a runtime or array exception anyway. Why check twice and raise the
 * same exception?
 *
 * <p>None of the methods synchronize; the backing array and the length are
 * plain public fields.
 *
 * @author René Schwietzke
 * @since 3.10.0
 */
public class CheaperCharBuffer implements CharSequence {
    // our data, can grow - public on purpose to allow speed, which is not safe;
    // only the first length_ chars are valid content
    public char[] data_;

    // the current size of the string data (number of valid chars in data_)
    public int length_;

    // by how many chars we grow at least when the backing array is too small
    private final int growBy_;

    // default growth step in chars (64 / 2 = 32)
    public static final int CAPACITY_GROWTH = 64 / 2;

    // what is our start size?
    // a cache line is 64 byte mostly, the overhead is mostly 24 bytes
    // a char is two bytes, let's use one cache line
    public static final int INITIAL_CAPACITY = (64 - 24) / 2;

    // static empty version; DON'T MODIFY IT
    public static final CheaperCharBuffer EMPTY = new CheaperCharBuffer(0);

    // the Unicode replacement character U+FFFD
    private static final char REPLACEMENT_CHARACTER = '\uFFFD';

    /**
     * Constructs an empty buffer with the default capacity
     * ({@link #INITIAL_CAPACITY}) and the default growth ({@link #CAPACITY_GROWTH}).
     */
    public CheaperCharBuffer() {
        this.data_ = new char[INITIAL_CAPACITY];
        this.length_ = 0;
        this.growBy_ = CAPACITY_GROWTH;
    }

    /**
     * Constructs an empty buffer with a desired capacity and the default growth.
     *
     * @param startSize the size of the buffer to start with
     */
    public CheaperCharBuffer(final int startSize) {
        this(startSize, CAPACITY_GROWTH);
    }

    /**
     * Constructs an empty buffer with a desired capacity and growth.
     *
     * @param startSize the size of the buffer to start with
     * @param growBy by how much do we want to grow when needed, values
     *        smaller than 1 are raised to 1
     */
    public CheaperCharBuffer(final int startSize, final int growBy) {
        this.data_ = new char[startSize];
        this.length_ = 0;
        this.growBy_ = Math.max(1, growBy);
    }

    /**
     * Constructs a buffer from another buffer. Copies the data
     * over. The new buffer capacity matches the length of the source.
     *
     * @param src the source buffer to copy from
     */
    public CheaperCharBuffer(final CheaperCharBuffer src) {
        this(src, 0);
    }

    /**
     * Constructs a buffer from another buffer. Copies the data
     * over. You can add more capacity on top of the source length. If
     * you specify 0 (or a negative value), the capacity will match the src length.
     * The growth of the new buffer is always {@link #CAPACITY_GROWTH}.
     *
     * @param src the source buffer to copy from
     * @param addCapacity how much capacity to add to origin length
     */
    public CheaperCharBuffer(final CheaperCharBuffer src, final int addCapacity) {
        this.data_ = Arrays.copyOf(src.data_, src.length_ + Math.max(0, addCapacity));
        this.length_ = src.length();
        this.growBy_ = CAPACITY_GROWTH;
    }

    /**
     * Constructs a buffer from a string. To avoid
     * too much allocation, we just take the char array of the string as is and
     * don't allocate extra space in the first place.
     *
     * @param src the string to copy from
     */
    public CheaperCharBuffer(final String src) {
        this.data_ = src.toCharArray();
        this.length_ = src.length();
        this.growBy_ = CAPACITY_GROWTH;
    }

    /**
     * Constructs a buffer preset with the specified values.
     * There will not be any room to grow, if you need that, construct an
     * empty one and append.
     *
     * <p>There are no range checks performed. Make sure your data is correct.
     *
     * @param ch The character array, must not be null
     * @param offset The offset into the character array.
     * @param length The length of characters from the offset.
     */
    public CheaperCharBuffer(final char[] ch, final int offset, final int length) {
        // just as big as we need it
        this(length);
        // copy directly instead of calling the overridable append() from a constructor
        System.arraycopy(ch, offset, this.data_, 0, length);
        this.length_ = length;
    }

    /**
     * Check capacity and grow if needed automatically. The new size is the
     * larger of "requested + growBy" and "twice the current capacity + 2".
     *
     * @param minimumCapacity how much space do we need at least
     */
    private void ensureCapacity(final int minimumCapacity) {
        if (minimumCapacity > this.data_.length) {
            final int newSize = Math.max(minimumCapacity + this.growBy_, (this.data_.length << 1) + 2);
            this.data_ = Arrays.copyOf(this.data_, newSize);
        }
    }

    /**
     * Returns the current max capacity without growth. Does not
     * indicate how much capacity is already in use. Use {@link #length()}
     * for that.
     *
     * @return the current capacity, not taking any usage into account
     */
    public int capacity() {
        return this.data_.length;
    }

    /**
     * Appends a single character to the buffer.
     *
     * @param c the character to append
     * @return this instance
     */
    public CheaperCharBuffer append(final char c) {
        final int oldLength = this.length_;

        // ensureCapacity is not inlined by the compiler, so put that here for the most
        // called method of all appends. Duplicate code, but for a reason.
        if (oldLength == this.data_.length) {
            final int newSize = Math.max(oldLength + this.growBy_, (this.data_.length << 1) + 2);
            this.data_ = Arrays.copyOf(this.data_, newSize);
        }

        this.data_[oldLength] = c;
        this.length_ = oldLength + 1;
        return this;
    }

    /**
     * Append a string to this buffer without creating an intermediate
     * copy of the string's characters.
     *
     * @param src the string to append, must not be null
     * @return this instance
     */
    public CheaperCharBuffer append(final String src) {
        final int srcLength = src.length();
        final int start = this.length_;
        final int newLength = start + srcLength;

        ensureCapacity(newLength);

        // String.getChars copies straight into our array, no temporary char[] needed
        src.getChars(0, srcLength, this.data_, start);

        // publish the new length only after the copy succeeded
        this.length_ = newLength;
        return this;
    }

    /**
     * Add another buffer to this one. Appending a buffer to itself is
     * supported and doubles the content.
     *
     * @param src the buffer to append, must not be null
     * @return this instance
     */
    public CheaperCharBuffer append(final CheaperCharBuffer src) {
        // capture the source length up front; when src == this, our own
        // length changes below and must not influence the amount copied
        final int srcLength = src.length_;
        final int start = this.length_;
        final int newLength = start + srcLength;

        ensureCapacity(newLength);

        // src.data_ is read after ensureCapacity, so a self-append uses the grown array
        System.arraycopy(src.data_, 0, this.data_, start, srcLength);

        this.length_ = newLength;
        return this;
    }

    /**
     * Add data from a char array to this buffer with the ability to specify
     * a range to copy from.
     *
     * @param src the source char array
     * @param offset the pos to start to copy from
     * @param length the length of the data to copy
     *
     * @return this instance
     */
    public CheaperCharBuffer append(final char[] src, final int offset, final int length) {
        final int start = this.length_;
        final int newLength = start + length;

        ensureCapacity(newLength);
        System.arraycopy(src, offset, this.data_, start, length);

        // set the length last so a failed copy (bad offset/length) does not corrupt it
        this.length_ = newLength;
        return this;
    }

    /**
     * Returns the current length.
     *
     * @return the length of the charbuffer data
     */
    @Override
    public int length() {
        return this.length_;
    }

    /**
     * Tell us how much the capacity grows if needed.
     *
     * @return the value that determines how much we grow the backing
     *         array in case we have to
     */
    public int getGrowBy() {
        return this.growBy_;
    }

    /**
     * Resets the buffer to 0 length. It won't resize it to avoid memory
     * churn.
     *
     * @return this instance for fluid programming
     */
    public CheaperCharBuffer clear() {
        this.length_ = 0;
        return this;
    }

    /**
     * Resets the buffer to 0 length and sets the new data. This
     * is a little cheaper than clear().append(c) depending on
     * the where and the inlining decisions.
     *
     * @param c the char to set
     * @return this instance for fluid programming
     */
    public CheaperCharBuffer clearAndAppend(final char c) {
        this.length_ = 0;

        if (this.data_.length > 0) {
            this.data_[0] = c;
            this.length_ = 1;
        }
        else {
            // the rare case when we don't have any buffer at hand
            append(c);
        }

        return this;
    }

    /**
     * Does this buffer end with this string? If we check for
     * the empty string, we get true. If we would support JDK 11, we could
     * use Arrays.mismatch and be way faster.
     *
     * @param s the string to check the end against
     * @return true if the end matches the buffer, false otherwise
     */
    public boolean endsWith(final String s) {
        final int sLength = s.length();

        // buffer is shorter than the string, cannot be the end
        if (this.length_ < sLength) {
            return false;
        }

        // check the string by each char, avoids a copy of the string
        final int start = this.length_ - sLength;

        // change this to Arrays.mismatch when going JDK 11 or higher
        for (int i = 0; i < sLength; i++) {
            if (this.data_[i + start] != s.charAt(i)) {
                return false;
            }
        }

        return true;
    }

    /**
     * Reduces the buffer to the content between start and end marker when
     * only whitespaces are found before the startMarker as well as after the end marker.
     * If both strings overlap due to identical characters such as "foo" and "oof"
     * and the buffer is " foof ", we don't do anything.
     *
     * <p>If a marker is empty, it behaves like {@link java.lang.String#trim()} on that side.
     *
     * @param startMarker the start string to find, must not be null
     * @param endMarker the end string to find, must not be null
     * @return this instance
     *
     * @deprecated Use the new method {@link #trimToContent(String, String)} instead.
     */
    public CheaperCharBuffer reduceToContent(final String startMarker, final String endMarker) {
        return trimToContent(startMarker, endMarker);
    }

    /**
     * Reduces the buffer to the content between start and end marker when
     * only whitespaces are found before the startMarker as well as after the end marker.
     * If both strings overlap due to identical characters such as "foo" and "oof"
     * and the buffer is " foof ", we don't do anything.
     *
     * <p>If a marker is empty, it behaves like {@link java.lang.String#trim()} on that side.
     *
     * @param startMarker the start string to find, must not be null
     * @param endMarker the end string to find, must not be null
     * @return this instance
     */
    public CheaperCharBuffer trimToContent(final String startMarker, final String endMarker) {
        final int startLength = startMarker.length();
        final int endLength = endMarker.length();

        // if both are longer or same length than content, don't do anything
        final int markerLength = startLength + endLength;
        if (markerLength >= this.length_) {
            return this;
        }

        // run over starting whitespaces, but leave room for the markers
        int sPos = 0;
        for (; sPos < this.length_ - markerLength; sPos++) {
            if (!Character.isWhitespace(this.data_[sPos])) {
                break;
            }
        }

        // run over ending whitespaces; never walk below sPos, everything
        // before it is known to be whitespace and going further could
        // leave the array on the left side (index -1)
        int ePos = this.length_ - 1;
        for (; ePos >= sPos; ePos--) {
            if (!Character.isWhitespace(this.data_[ePos])) {
                break;
            }
        }

        // if we have less content than marker length, give up
        // this also helps when markers overlap such as
        // <!-- and --> and the string is " <!---> "
        if (ePos - sPos + 1 < markerLength) {
            return this;
        }

        // check the start
        for (int i = 0; i < startLength; i++) {
            if (startMarker.charAt(i) != this.data_[i + sPos]) {
                // no start match, stop and don't do anything
                return this;
            }
        }

        // check the end, ePos is where the last non-whitespace char occurred
        final int endStartCheckPos = ePos - endLength + 1;
        for (int i = 0; i < endLength; i++) {
            if (endMarker.charAt(i) != this.data_[endStartCheckPos + i]) {
                // no end match, stop and don't do anything
                return this;
            }
        }

        // shift left and cut length
        final int newLength = ePos - sPos + 1 - markerLength;
        System.arraycopy(this.data_, sPos + startLength, this.data_, 0, newLength);
        this.length_ = newLength;

        return this;
    }

    /**
     * Check if we have only whitespaces. An empty buffer counts as
     * whitespace only.
     *
     * @return true if we have only whitespace, false otherwise
     */
    public boolean isWhitespace() {
        for (int i = 0; i < this.length_; i++) {
            if (!Character.isWhitespace(this.data_[i])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Trims the buffer similar to {@link java.lang.String#trim()}, using
     * {@link Character#isWhitespace(char)} to detect whitespace.
     *
     * @return this instance with removed whitespace at the beginning and the end
     */
    public CheaperCharBuffer trim() {
        // clean the end first, because it is cheap
        return trimTrailing().trimLeading();
    }

    /**
     * Removes all whitespace before the first non-whitespace char.
     * If all are whitespaces, we get an empty buffer.
     *
     * @return this instance
     */
    public CheaperCharBuffer trimLeading() {
        // run over starting whitespace
        int sPos = 0;
        for (; sPos < this.length_; sPos++) {
            if (!Character.isWhitespace(this.data_[sPos])) {
                break;
            }
        }

        if (sPos == 0) {
            // nothing to do
            return this;
        }
        else if (sPos == this.length_) {
            // only whitespace
            this.length_ = 0;
            return this;
        }

        // shift left
        final int newLength = this.length_ - sPos;
        System.arraycopy(this.data_, sPos, this.data_, 0, newLength);
        this.length_ = newLength;

        return this;
    }

    /**
     * Removes all whitespace at the end.
     * If all are whitespace, we get an empty buffer.
     *
     * @return this instance
     *
     * @deprecated Use {@link #trimTrailing()} instead.
     */
    public CheaperCharBuffer trimWhitespaceAtEnd() {
        return trimTrailing();
    }

    /**
     * Removes all whitespace at the end.
     * If all are whitespace, we get an empty buffer.
     *
     * @return this instance
     */
    public CheaperCharBuffer trimTrailing() {
        // run over ending whitespaces, only the length changes, no data is moved
        int ePos = this.length_ - 1;
        for (; ePos >= 0; ePos--) {
            if (!Character.isWhitespace(this.data_[ePos])) {
                break;
            }
        }

        this.length_ = ePos + 1;
        return this;
    }

    /**
     * Shortens the buffer by that many positions. If the count is
     * larger than the length, we get just an empty buffer. If you pass in negative
     * values, we are failing, likely often silently. It is all about performance and
     * not a general all-purpose API.
     *
     * @param count a positive number, no runtime checks, if count is larger than
     *        length, we get length = 0
     * @return this instance
     */
    public CheaperCharBuffer shortenBy(final int count) {
        final int newLength = this.length_ - count;
        this.length_ = newLength < 0 ? 0 : newLength;
        return this;
    }

    /**
     * Get the characters as char array, this will be a copy!
     *
     * @return a copy of the underlying char data, sized to the current length
     */
    public char[] getChars() {
        return Arrays.copyOf(this.data_, this.length_);
    }

    /**
     * Returns a string representation of this buffer. This will be a copy
     * operation. If the buffer is empty, we get a constant empty String back
     * to avoid any overhead.
     *
     * @return a string of the content of this buffer
     */
    @Override
    public String toString() {
        if (this.length_ > 0) {
            return new String(this.data_, 0, this.length_);
        }
        else {
            return "";
        }
    }

    /**
     * Returns the char at the given position. Will complain if
     * we try to read outside the range. We do a range check here
     * because we might not notice when we are within the buffer
     * but outside the current length.
     *
     * @param index the position to read from
     * @return the char at the position
     * @throws IndexOutOfBoundsException
     *         in case one tries to read outside of valid buffer range
     */
    @Override
    public char charAt(final int index) {
        if (index > this.length_ - 1 || index < 0) {
            throw new IndexOutOfBoundsException(
                    "Tried to read outside of the valid buffer data");
        }
        return this.data_[index];
    }

    /**
     * Returns the char at the given position. No checks are
     * performed. It is up to the caller to make sure we
     * read correctly. Reading outside of the array will
     * cause an {@link IndexOutOfBoundsException} but using an
     * incorrect position in the array (such as beyond length)
     * might stay unnoticed! This is a performance method,
     * use at your own risk.
     *
     * @param index the position to read from
     * @return the char at the position
     */
    public char unsafeCharAt(final int index) {
        return this.data_[index];
    }

    /**
     * Returns a content copy of this buffer.
     *
     * @return a copy of this buffer, the capacity might differ
     */
    @Override
    public CheaperCharBuffer clone() {
        return new CheaperCharBuffer(this);
    }

    /**
     * Returns a {@code CharSequence} that is a subsequence of this sequence.
     * The subsequence starts with the {@code char} value at the specified index and
     * ends with the {@code char} value at index {@code end - 1}. The length
     * (in {@code char}s) of the returned sequence is {@code end - start},
     * so if {@code start == end} then an empty sequence is returned.
     * The result is an independent {@link String} copy of the range.
     *
     * @param start the start index, inclusive
     * @param end the end index, exclusive
     *
     * @return the specified subsequence
     *
     * @throws IndexOutOfBoundsException
     *         if {@code start} or {@code end} are negative,
     *         if {@code end} is greater than {@code length()},
     *         or if {@code start} is greater than {@code end}
     */
    @Override
    public CharSequence subSequence(final int start, final int end) {
        if (start < 0) {
            throw new StringIndexOutOfBoundsException(start);
        }
        if (end > this.length_) {
            throw new StringIndexOutOfBoundsException(end);
        }

        final int l = end - start;
        if (l < 0) {
            throw new StringIndexOutOfBoundsException(l);
        }

        return new String(this.data_, start, l);
    }

    /**
     * Two buffers are identical when the length and
     * the content of the backing array (only for the
     * data in view) are identical. Any {@link CharSequence}
     * is accepted on the other side, so a buffer also equals a
     * String with the same content (but not the other way around).
     *
     * @param o the object to compare with
     * @return true if length and array content match, false otherwise
     */
    @Override
    public boolean equals(final Object o) {
        if (o == this) {
            return true;
        }

        if (o instanceof CharSequence) {
            final CharSequence ob = (CharSequence) o;

            if (ob.length() != this.length_) {
                return false;
            }

            // ok, in JDK 11 or up, we could use an
            // Arrays.mismatch, but we cannot do that
            // due to JDK 8 compatibility
            for (int i = 0; i < this.length_; i++) {
                if (ob.charAt(i) != this.data_[i]) {
                    return false;
                }
            }

            // length and content match, be happy
            return true;
        }

        return false;
    }

    /**
     * We don't cache the hashcode because we mutate often. Don't use this in
     * hashmaps as key. But you can use that to look up in a hashmap against
     * a string using the CharSequence interface.
     *
     * @return the hashcode, similar to what a normal string would deliver
     */
    @Override
    public int hashCode() {
        int h = 0;

        for (int i = 0; i < this.length_; i++) {
            // same as 31 * h + c
            h = ((h << 5) - h) + this.data_[i];
        }

        return h;
    }

    /**
     * Append a character to this buffer. The character is an int value, and
     * can either be a single UTF-16 character or a supplementary character
     * represented by two UTF-16 code units.
     *
     * <p>When the value is not a valid code point, the replacement character
     * U+FFFD is appended before the exception is thrown.
     *
     * @param value The character value.
     * @return this instance for fluid programming
     *
     * @throws IllegalArgumentException if the specified
     *         {@code value} is not a valid Unicode code point.
     */
    public CheaperCharBuffer appendCodePoint(final int value) {
        // only values that really fit a char take the fast path; negative
        // values must not be cast silently, they are invalid code points
        if (value >= Character.MIN_VALUE && value <= Character.MAX_VALUE) {
            return this.append((char) value);
        }

        try {
            final char[] chars = Character.toChars(value);
            return this.append(chars, 0, chars.length);
        }
        catch (final IllegalArgumentException e) {
            // when value is not a valid code point
            this.append(REPLACEMENT_CHARACTER);
            throw e;
        }
    }
}

View File

@@ -0,0 +1,76 @@
/*
* Copyright 2023 The original authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.rschwietzke;
/**
 * Ultra-fast pseudo random generator that is not synchronized!
 * It intentionally does not extend {@link java.util.Random}; the original
 * author's note gives avoiding inherited volatile state as the reason.
 * Not my idea, copied in parts from some demo random generator lessons.
 *
 * <p>The state is advanced with three xor-shift steps (21, 35, 4). Such a
 * generator never leaves the state 0, hence a seed of 0 is replaced by a
 * fixed non-zero constant. Not suitable for anything security related.
 *
 * @author rschwietzke
 *
 */
public class FastRandom {
    // used instead of a seed of 0, which would make the generator return 0 forever
    private static final long ZERO_SEED_REPLACEMENT = 0x9E3779B97F4A7C15L;

    // the current generator state, never 0
    private long seed;

    /**
     * Creates a generator seeded with the current time in milliseconds.
     */
    public FastRandom() {
        this(System.currentTimeMillis());
    }

    /**
     * Creates a generator with a fixed seed, so the sequence is reproducible.
     * A seed of 0 is replaced by a non-zero constant because an xor-shift
     * state of 0 stays 0 forever.
     *
     * @param seed the start state of the generator
     */
    public FastRandom(long seed) {
        this.seed = seed == 0L ? ZERO_SEED_REPLACEMENT : seed;
    }

    /**
     * Advances the state and returns the lowest {@code nbits} bits of it.
     *
     * @param nbits how many low bits of the new state to return; this class
     *        only calls it with 31 and 32
     * @return the lowest nbits bits of the new state as int
     */
    protected int next(int nbits) {
        // N.B. Not thread-safe!
        long x = this.seed;
        x ^= (x << 21);
        x ^= (x >>> 35);
        x ^= (x << 4);
        this.seed = x;

        // keep only the requested number of low bits
        x &= ((1L << nbits) - 1);
        return (int) x;
    }

    /**
     * Returns a pseudo random value between 0 (inclusive) and bound (exclusive).
     * Borrowed from the JDK.
     *
     * <p>The bound is not validated for speed reasons; pass positive
     * values only, the result for other values is not meaningful.
     *
     * @param bound the exclusive upper bound, must be positive
     * @return a value in the range 0 to bound - 1
     */
    public int nextInt(int bound) {
        int r = next(31);
        final int m = bound - 1;

        if ((bound & m) == 0) {
            // i.e., bound is a power of 2, scale the 31 bits down
            r = (int) ((bound * (long) r) >> 31);
        }
        else {
            // reject values from the incomplete last block of the 31 bit
            // range (detected by int overflow) to avoid modulo bias
            for (int u = r; u - (r = u % bound) + m < 0; u = next(31)) {
                // empty on purpose, the loop header does all the work
            }
        }

        return r;
    }

    /**
     * Returns the next pseudo random int, all 32 bits are used, so
     * negative values are possible. Borrowed from the JDK.
     *
     * @return the next pseudo random int
     */
    public int nextInt() {
        return next(32);
    }
}