Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Conversion F8_4 <-> F32 corrected for subnormals #5

Merged
merged 1 commit into from
Feb 12, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion src/Float8s.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ module Float8s
UInt8,Int8,Int16,Int32,Int64,
(+), (-), (*), (/), (\), (^),
sin,cos,tan,asin,acos,atan,sinh,cosh,tanh,asinh,acosh,
atanh,exp,exp2,exp10,log,log2,log10,sqrt,lgamma,log1p
atanh,exp,exp2,exp10,log,log2,log10,sqrt,log1p,
atan,hypot

export Float8, Float8_4, NaN8, Inf8, NaN8_4, Inf8_4
Expand Down
37 changes: 27 additions & 10 deletions src/float8.jl
Original file line number Diff line number Diff line change
Expand Up @@ -91,34 +91,51 @@ function create_base_shifttable(::Type{T}) where {T<:AbstractFloat8}
shifttable = Vector{UInt8}(undef, 512)

if T == Float8
# elements derive from
# [1] 2^-6 = Float8(0x01) the smallest representable number (subnormal)
# [2] 2^-2 = Float8(0x10) the first non-subnormal number
# [3] 2^4 = 16 is the smallest power of two that is larger than floatmax(Float8)

e_limits = [-6,-2,4]

# shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
# to the first significand bit
# e_shift_subnorm is 17 for Float8
e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8)-1)+e_limits[2]-1
elseif T == Float8_4
e_limits = []

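# same scheme as for Float8 above, elements derive from
# [1] 2^-9 the smallest representable number (subnormal)
# [2] 2^-6 the first non-subnormal number
# [3] 2^8 is the smallest power of two that is larger than floatmax(Float8_4)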
e_limits = [-9,-6,8]

# shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
# to the first significand bit
# e_shift_subnorm is 14 for Float8_4
e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8_4)-1)+e_limits[2]-1
end

for i = 0:255 # all possible exponents for Float32
e = i - 127 # subtract Float32 bias
if e < -6 # Very small numbers map to +- zero
for i = 0:255 # all possible exponents for Float32
e = i - 127 # subtract Float32 bias
if e < e_limits[1] # Very small numbers map to +- zero
basetable[i|0x000+1] = zero(T)
basetable[i|0x100+1] = -zero(T)
shifttable[i|0x000+1] = n_significant_bits(T)+1
shifttable[i|0x100+1] = n_significant_bits(T)+1
elseif e < -2 # Small numbers map to denorms
elseif e < e_limits[2] # Small numbers map to denorms
basetable[i|0x000+1] = zero(T)
basetable[i|0x100+1] = -zero(T)
shifttable[i|0x000+1] = -e+17
shifttable[i|0x100+1] = -e+17
elseif e < 4 # Normal numbers just lose precision
shifttable[i|0x000+1] = -e+e_shift_subnorm
shifttable[i|0x100+1] = -e+e_shift_subnorm
elseif e < e_limits[3] # Normal numbers just lose precision
basetable[i|0x000+1] = ((e+bias(T)) << n_significant_bits(T))
basetable[i|0x100+1] = ((e+bias(T)) << n_significant_bits(T)) | sign_mask(T)
shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T)
elseif e < 128 # Large numbers map to Infinity
elseif e < 128 # Large numbers map to Infinity
basetable[i|0x000+1] = inf8(T)
basetable[i|0x100+1] = -inf8(T)
shifttable[i|0x000+1] = n_significant_bits(T)+1
shifttable[i|0x100+1] = n_significant_bits(T)+1
else # Infinity and NaN's stay Infinity and NaN's
else # Infinity and NaN's stay Infinity and NaN's
basetable[i|0x000+1] = inf8(T)
basetable[i|0x100+1] = -inf8(T)
shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
Expand Down
2 changes: 1 addition & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ using Test
end
end

@testset "Conversion Float8 <-> Float32" begin
@testset "Conversion Float8_4 <-> Float32" begin

for i in 0x00:0xff
if ~isnan(Float8_4(i))
Expand Down