Skip to content

Commit

Permalink
Implement fast_float for String#to_f
Browse files Browse the repository at this point in the history
  • Loading branch information
HertzDevil committed Nov 15, 2024
1 parent caf57c2 commit 6e96919
Show file tree
Hide file tree
Showing 13 changed files with 2,735 additions and 60 deletions.
27 changes: 27 additions & 0 deletions spec/manual/string_to_f32_spec.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
require "spec"

# Exhaustively checks that for all 4294967296 possible `Float32` values,
# `to_s.to_f32` returns the original number. Splits the floats into 4096 bins
# for better progress tracking. Also useful as a sort of benchmark.
#
# This was originally added when `String#to_f` moved from `LibC.strtod` to
# `fast_float`, but is applicable to any other implementation as well.
describe "x.to_s.to_f32 == x" do
(0_u32..0xFFF_u32).each do |i|
it "%03x00000..%03xfffff" % {i, i} do
0x100000.times do |j|
bits = i << 20 | j
float = bits.unsafe_as(Float32)
str = float.to_s
val = str.to_f32?.should_not be_nil

if float.nan?
val.nan?.should be_true
else
val.should eq(float)
Math.copysign(1, val).should eq(Math.copysign(1, float))
end
end
end
end
end
5 changes: 5 additions & 0 deletions spec/std/string/fast_float_spec.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
require "spec"

describe String do
# TODO
end
1 change: 1 addition & 0 deletions spec/std/string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ describe "String" do
it { "1Y2P0IJ32E8E7".to_i64(36).should eq(9223372036854775807) }
end

# more specs are available in `spec/std/string/fast_float_spec.cr`
it "does to_f" do
expect_raises(ArgumentError) { "".to_f }
"".to_f?.should be_nil
Expand Down
82 changes: 82 additions & 0 deletions src/float/fast_float.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
struct Float
# :nodoc:
# Source port of the floating-point part of fast_float for C++:
# https://github.com/fastfloat/fast_float
#
# fast_float implements the C++17 `std::from_chars`, which accepts a subset of
# the C `strtod` / `strtof`'s string format:
#
# - a leading plus sign is disallowed, but both fast_float and this port
# accept it;
# - the exponent may be required or disallowed, depending on the format
# argument (this port always allows both);
# - hexfloats are not enabled by default, and fast_float doesn't implement it;
# (https://github.com/fastfloat/fast_float/issues/124)
# - hexfloats cannot start with `0x` or `0X`.
#
# The following is their license:
#
# Licensed under either of Apache License, Version 2.0 or MIT license or
# BOOST license.
#
# Unless you explicitly state otherwise, any contribution intentionally
# submitted for inclusion in this repository by you, as defined in the
# Apache-2.0 license, shall be triple licensed as above, without any
# additional terms or conditions.
#
# Main differences from the original fast_float:
#
# - Only `UC == UInt8` is implemented and tested, not the other wide chars;
# - No explicit SIMD (the original mainly uses this for wide char strings).
#
# The following compile-time configuration is assumed:
#
# - #define FASTFLOAT_ALLOWS_LEADING_PLUS
# - #define FLT_EVAL_METHOD 0
module FastFloat
# Current revision: https://github.com/fastfloat/fast_float/tree/v6.1.6

def self.to_f64?(str : String, whitespace : Bool, strict : Bool) : Float64?
value = uninitialized Float64
start = str.to_unsafe
finish = start + str.bytesize
options = ParseOptionsT(typeof(str.to_unsafe.value)).new(format: :general)

ret = BinaryFormat_Float64.new.from_chars_advanced(start, finish, pointerof(value), options, whitespace: whitespace)
if ret.ec == Errno::NONE
if trailing_chars_allowed?(ret.ptr, finish, whitespace, strict)
value
end
end
end

def self.to_f32?(str : String, whitespace : Bool, strict : Bool) : Float32?
value = uninitialized Float32
start = str.to_unsafe
finish = start + str.bytesize
options = ParseOptionsT(typeof(str.to_unsafe.value)).new(format: :general)

ret = BinaryFormat_Float32.new.from_chars_advanced(start, finish, pointerof(value), options, whitespace: whitespace)
if ret.ec == Errno::NONE
if trailing_chars_allowed?(ret.ptr, finish, whitespace, strict)
value
end
end
end

private def self.trailing_chars_allowed?(ptr, finish, whitespace, strict)
if strict
if whitespace
while ptr < finish && ptr.value.unsafe_chr.ascii_whitespace?
ptr += 1
end
end
ptr == finish
else
true
end
end
end
end

require "./fast_float/parse_number"
270 changes: 270 additions & 0 deletions src/float/fast_float/ascii_number.cr
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
require "./float_common"

module Float::FastFloat
# Next function can be micro-optimized, but compilers are entirely able to
# optimize it well.
def self.is_integer?(c : UC) : Bool forall UC
!(c > '9'.ord || c < '0'.ord)
end

# Read 8 UC into a u64. Truncates UC if not char.
def self.read8_to_u64(chars : UC*) : UInt64 forall UC
val = uninitialized UInt64
chars.as(UInt8*).copy_to(pointerof(val).as(UInt8*), sizeof(UInt64))
{% if IO::ByteFormat::SystemEndian == IO::ByteFormat::BigEndian %}
val.byte_swap
{% else %}
val
{% end %}
end

# credit @aqrit
def self.parse_eight_digits_unrolled(val : UInt64) : UInt32
mask = 0x000000FF000000FF_u64
mul1 = 0x000F424000000064_u64 # 100 + (1000000ULL << 32)
mul2 = 0x0000271000000001_u64 # 1 + (10000ULL << 32)
val &-= 0x3030303030303030
val = (val &* 10) &+ val.unsafe_shr(8) # val = (val * 2561) >> 8
val = (((val & mask) &* mul1) &+ ((val.unsafe_shr(16) & mask) &* mul2)).unsafe_shr(32)
val.to_u32!
end

# Call this if chars are definitely 8 digits.
def self.parse_eight_digits_unrolled(chars : UC*) : UInt32 forall UC
parse_eight_digits_unrolled(read8_to_u64(chars))
end

# credit @aqrit
def self.is_made_of_eight_digits_fast?(val : UInt64) : Bool
((val &+ 0x4646464646464646_u64) | (val &- 0x3030303030303030_u64)) & 0x8080808080808080_u64 == 0
end

# NOTE(crystal): returns {p, i}
def self.loop_parse_if_eight_digits(p : UInt8*, pend : UInt8*, i : UInt64) : {UInt8*, UInt64}
# optimizes better than parse_if_eight_digits_unrolled() for UC = char.
while pend - p >= 8 && is_made_of_eight_digits_fast?(read8_to_u64(p))
i = i &* 100000000 &+ parse_eight_digits_unrolled(read8_to_u64(p)) # in rare cases, this will overflow, but that's ok
p += 8
end
{p, i}
end

enum ParseError
NoError

# [JSON-only] The minus sign must be followed by an integer.
MissingIntegerAfterSign

# A sign must be followed by an integer or dot.
MissingIntegerOrDotAfterSign

# [JSON-only] The integer part must not have leading zeros.
LeadingZerosInIntegerPart

# [JSON-only] The integer part must have at least one digit.
NoDigitsInIntegerPart

# [JSON-only] If there is a decimal point, there must be digits in the
# fractional part.
NoDigitsInFractionalPart

# The mantissa must have at least one digit.
NoDigitsInMantissa

# Scientific notation requires an exponential part.
MissingExponentialPart
end

struct ParsedNumberStringT(UC)
property exponent : Int64 = 0
property mantissa : UInt64 = 0
property lastmatch : UC* = Pointer(UC).null
property negative : Bool = false
property valid : Bool = false
property too_many_digits : Bool = false
# contains the range of the significant digits
property integer : Slice(UC) = Slice(UC).empty # non-nullable
property fraction : Slice(UC) = Slice(UC).empty # nullable
property error : ParseError = :no_error
end

alias ByteSpan = ::Bytes
alias ParsedNumberString = ParsedNumberStringT(UInt8)

def self.report_parse_error(p : UC*, error : ParseError) : ParsedNumberStringT(UC) forall UC
answer = ParsedNumberStringT(UC).new
answer.valid = false
answer.lastmatch = p
answer.error = error
answer
end

# Assuming that you use no more than 19 digits, this will parse an ASCII
# string.
def self.parse_number_string(p : UC*, pend : UC*, options : ParseOptionsT(UC)) : ParsedNumberStringT(UC) forall UC
fmt = options.format
decimal_point = options.decimal_point

answer = ParsedNumberStringT(UInt8).new
answer.valid = false
answer.too_many_digits = false
answer.negative = p.value === '-'

if p.value === '-' || (!fmt.json_fmt? && p.value === '+')
p += 1
if p == pend
return report_parse_error(p, :missing_integer_or_dot_after_sign)
end
if fmt.json_fmt?
if !is_integer?(p.value) # a sign must be followed by an integer
return report_parse_error(p, :missing_integer_after_sign)
end
else
if !is_integer?(p.value) && p.value != decimal_point # a sign must be followed by an integer or the dot
return report_parse_error(p, :missing_integer_or_dot_after_sign)
end
end
end
start_digits = p

i = 0_u64 # an unsigned int avoids signed overflows (which are bad)

while p != pend && is_integer?(p.value)
# a multiplication by 10 is cheaper than an arbitrary integer multiplication
i = i &* 10 &+ (p.value &- '0'.ord).to_u64! # might overflow, we will handle the overflow later
p += 1
end
end_of_integer_part = p
digit_count = (end_of_integer_part - start_digits).to_i64!
answer.integer = Slice.new(start_digits, digit_count.to_i32)
if fmt.json_fmt?
# at least 1 digit in integer part, without leading zeros
if digit_count == 0
return report_parse_error(p, :no_digits_in_integer_part)
end
if start_digits[0] === '0' && digit_count > 1
return report_parse_error(p, :leading_zeros_in_integer_part)
end
end

exponent = 0_i64
has_decimal_point = p != pend && p.value == decimal_point
if has_decimal_point
p += 1
before = p
# can occur at most twice without overflowing, but let it occur more, since
# for integers with many digits, digit parsing is the primary bottleneck.
p, i = loop_parse_if_eight_digits(p, pend, i)

while p != pend && is_integer?(p.value)
digit = (p.value &- '0'.ord).to_u8!
p += 1
i = i &* 10 &+ digit # in rare cases, this will overflow, but that's ok
end
exponent = before - p
answer.fraction = Slice.new(before, (p - before).to_i32)
digit_count &-= exponent
end
if fmt.json_fmt?
# at least 1 digit in fractional part
if has_decimal_point && exponent == 0
return report_parse_error(p, :no_digits_in_fractional_part)
end
elsif digit_count == 0 # we must have encountered at least one integer!
return report_parse_error(p, :no_digits_in_mantissa)
end
exp_number = 0_i64 # explicit exponential part
if (fmt.scientific? && p != pend && p.value.unsafe_chr.in?('e', 'E')) ||
(fmt.fortran_fmt? && p != pend && p.value.unsafe_chr.in?('+', '-', 'd', 'D'))
location_of_e = p
if p.value.unsafe_chr.in?('e', 'E', 'd', 'D')
p += 1
end
neg_exp = false
if p != pend && p.value === '-'
neg_exp = true
p += 1
elsif p != pend && p.value === '+' # '+' on exponent is allowed by C++17 20.19.3.(7.1)
p += 1
end
if p == pend || !is_integer?(p.value)
if !fmt.fixed?
# The exponential part is invalid for scientific notation, so it must
# be a trailing token for fixed notation. However, fixed notation is
# disabled, so report a scientific notation error.
return report_parse_error(p, :missing_exponential_part)
end
# Otherwise, we will be ignoring the 'e'.
p = location_of_e
else
while p != pend && is_integer?(p.value)
digit = (p.value &- '0'.ord).to_u8!
if exp_number < 0x10000000
exp_number = exp_number &* 10 &+ digit
end
p += 1
end
if neg_exp
exp_number = 0_i64 &- exp_number
end
exponent &+= exp_number
end
else
# If it scientific and not fixed, we have to bail out.
if fmt.scientific? && !fmt.fixed?
return report_parse_error(p, :missing_exponential_part)
end
end
answer.lastmatch = p
answer.valid = true

# If we frequently had to deal with long strings of digits,
# we could extend our code by using a 128-bit integer instead
# of a 64-bit integer. However, this is uncommon.
#
# We can deal with up to 19 digits.
if digit_count > 19 # this is uncommon
# It is possible that the integer had an overflow.
# We have to handle the case where we have 0.0000somenumber.
# We need to be mindful of the case where we only have zeroes...
# E.g., 0.000000000...000.
start = start_digits
while start != pend && (start.value === '0' || start.value == decimal_point)
if start.value === '0'
digit_count &-= 1
end
start += 1
end

if digit_count > 19
answer.too_many_digits = true
# Let us start again, this time, avoiding overflows.
# We don't need to check if is_integer, since we use the
# pre-tokenized spans from above.
i = 0_u64
p = answer.integer.to_unsafe
int_end = p + answer.integer.size
minimal_nineteen_digit_integer = 1000000000000000000_u64
while i < minimal_nineteen_digit_integer && p != int_end
i = i &* 10 &+ (p.value - '0'.ord).to_u64!
p += 1
end
if i >= minimal_nineteen_digit_integer # We have a big integers
exponent = (end_of_integer_part - p) &+ exp_number
else # We have a value with a fractional component.
p = answer.fraction.to_unsafe
frac_end = p + answer.fraction.size
while i < minimal_nineteen_digit_integer && p != frac_end
i = i &* 10 &+ (p.value - '0'.ord).to_u64!
p += 1
end
exponent = (answer.fraction.to_unsafe - p) &+ exp_number
end
# We have now corrected both exponent and i, to a truncated value
end
end
answer.exponent = exponent
answer.mantissa = i
answer
end
end
Loading

0 comments on commit 6e96919

Please sign in to comment.