Skip to content

Commit

Permalink
Improve accuracy of conversions from floating-point numbers
Browse files Browse the repository at this point in the history
This fixes a problem with the input range checking.
  • Loading branch information
kimikage committed Oct 29, 2019
1 parent 2a04bc5 commit a41e65c
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 14 deletions.
64 changes: 50 additions & 14 deletions src/normed.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,27 +41,63 @@ rawone(v) = reinterpret(one(v))
function Normed{T,f}(x::Normed{T2}) where {T <: Unsigned,T2 <: Unsigned,f}
U = Normed{T,f}
y = round((rawone(U)/rawone(x))*reinterpret(x))
(0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
0 <= y <= typemax(T) || throw_converterror(U, x)
reinterpret(U, _unsafe_trunc(T, y))
end
N0f16(x::N0f8) = reinterpret(N0f16, convert(UInt16, 0x0101*reinterpret(x)))

(::Type{U})(x::Real) where {U <: Normed} = _convert(U, rawtype(U), x)
(::Type{U})(x::Real) where {U <: Normed} = _convert(U, x)

function _convert(::Type{U}, ::Type{T}, x) where {U <: Normed,T}
y = round(widen1(rawone(U))*x)
(0 <= y) & (y <= typemax(T)) || throw_converterror(U, x)
U(_unsafe_trunc(T, y), 0)
function _convert(::Type{U}, x) where {T, f, U <: Normed{T,f}}
if T == UInt128 # for UInt128, we can't widen
# the following range is not exact.
0 <= x <= ((typemax(T)-rawone(U))/rawone(U)+1) || throw_converterror(U, x)
y = round(rawone(U)*x)
else
y = round(widen1(rawone(U))*x)
0 <= y <= typemax(T) || throw_converterror(U, x)
end
reinterpret(U, _unsafe_trunc(T, y))
end
# Prevent overflow (https://discourse.julialang.org/t/saving-greater-than-8-bit-images/6057)
_convert(::Type{U}, ::Type{T}, x::Float16) where {U <: Normed,T} =
_convert(U, T, Float32(x))
_convert(::Type{U}, ::Type{UInt128}, x::Float16) where {U <: Normed} =
_convert(U, UInt128, Float32(x))
function _convert(::Type{U}, ::Type{UInt128}, x) where {U <: Normed}
y = round(rawone(U)*x) # for UInt128, we can't widen
(0 <= y) & (y <= typemax(UInt128)) & (x <= Float64(typemax(U))) || throw_converterror(U, x)
U(_unsafe_trunc(UInt128, y), 0)
function _convert(::Type{U}, x::Float16) where {T, f, U <: Normed{T,f}}
if Float16(typemax(T)/rawone(U)) > Float32(typemax(T)/rawone(U))
x == Float16(typemax(T)/rawone(U)) && return typemax(U)
end
return _convert(U, Float32(x))
end
function _convert(::Type{U}, x::Tf) where {T, f, U <: Normed{T,f}, Tf <: Union{Float32, Float64}}
if T == UInt128 && f == 53
0 <= x <= Tf(3.777893186295717e22) || throw_converterror(U, x)
else
0 <= x <= Tf((typemax(T)-rawone(U))/rawone(U)+1) || throw_converterror(U, x)
end

significand_bits = Tf == Float64 ? 52 : 23
if f <= (significand_bits + 1) && sizeof(T) * 8 < significand_bits
return reinterpret(U, unsafe_trunc(T, round(rawone(U) * x)))
end
# cf. the implementation of `frexp`
Tw = f < sizeof(T) * 8 ? T : widen1(T)
bits = sizeof(Tw) * 8 - 1
xu = reinterpret(Tf == Float64 ? UInt64 : UInt32, x)
k = Int(xu >> significand_bits)
k == 0 && return zero(U) # neglect subnormal numbers
significand = xu | (one(xu) << significand_bits)
yh = unsafe_trunc(Tw, significand) << (bits - significand_bits)
exponent_bias = Tf == Float64 ? 1023 : 127
ex = exponent_bias - k + bits - f
if ex <= 0
yi = yh - (yh >> f)
ex == 0 && return reinterpret(U, unsafe_trunc(T, yi))
if ex == -1
(yi & (one(Tw) << bits)) != 0 && return typemax(U)
return reinterpret(U, unsafe_trunc(T, yi + yi))
end
return typemax(U)
end
yi = bits >= f ? yh - (yh >> f) + (one(Tw)<<(ex - 1)) : yh
return reinterpret(U, unsafe_trunc(T, yi >> ex))
end

rem(x::T, ::Type{T}) where {T <: Normed} = x
Expand Down
30 changes: 30 additions & 0 deletions test/normed.jl
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,36 @@ end
@test convert(Normed{UInt16,7}, Normed{UInt8,7}(0.504)) === Normed{UInt16,7}(0.504)
end

@testset "conversion from float" begin
# issue 102
for T in (UInt8, UInt16, UInt32, UInt64, UInt128)
for Tf in (Float16, Float32, Float64)
@testset "Normed{$T,$f}(::$Tf)" for f = 1:sizeof(T)*8
U = Normed{T,f}
r = FixedPointNumbers.rawone(U)

@test reinterpret(U(zero(Tf))) == 0x0

# TODO: fix issue #129
# input_typemax = Tf(typemax(U))
input_typemax = Tf(BigFloat(typemax(T)) / r)
if isinf(input_typemax)
@test reinterpret(U(floatmax(Tf))) >= round(T, floatmax(Tf))
else
@test reinterpret(U(input_typemax)) >= (typemax(T)>>1) # overflow check
end

input_upper = Tf(BigFloat(typemax(T)) / r, RoundDown)
@test reinterpret(U(input_upper)) == T(min(round(BigFloat(input_upper) * r), typemax(T)))

input_exp2 = Tf(exp2(sizeof(T) * 8 - f))
isinf(input_exp2) && continue
@test reinterpret(U(input_exp2)) == T(input_exp2) * r
end
end
end
end

@testset "modulus" begin
@test N0f8(0.2) % N0f8 === N0f8(0.2)
@test N2f14(1.2) % N0f16 === N0f16(0.20002)
Expand Down

0 comments on commit a41e65c

Please sign in to comment.