JuliaMath · milankl · Feb 12, 2020 · Feb 12, 2020
diff --git a/src/Float8s.jl b/src/Float8s.jl
@@ -7,7 +7,7 @@ module Float8s
                 UInt8,Int8,Int16,Int32,Int64,
                 (+), (-), (*), (/), (\), (^),
                 sin,cos,tan,asin,acos,atan,sinh,cosh,tanh,asinh,acosh,
-                atanh,exp,exp2,exp10,log,log2,log10,sqrt,lgamma,log1p
+                atanh,exp,exp2,exp10,log,log2,log10,sqrt,log1p,
                 atan,hypot
 
     export Float8, Float8_4, NaN8, Inf8, NaN8_4, Inf8_4

diff --git a/src/float8.jl b/src/float8.jl
@@ -91,34 +91,51 @@ function create_base_shifttable(::Type{T}) where {T<:AbstractFloat8}
     shifttable = Vector{UInt8}(undef, 512)
 
     if T == Float8
+        # e_limits holds the base-2 exponents that separate the conversion regimes below:
+        # [1]   2^-6 = Float8(0x01), the smallest positive representable number (subnormal)
+        # [2]   2^-2 = Float8(0x10), the smallest normal (non-subnormal) number
+        # [3]   2^4 = 16, the smallest power of two that is larger than floatmax(Float8)
+
         e_limits = [-6,-2,4]
+
+        # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
+        # to the first significand bit
+        # e_shift_subnorm is 17 for Float8
+        e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8)-1)+e_limits[2]-1
     elseif T == Float8_4
-        e_limits = []
+
+        # same three exponent limits as in the Float8 branch above, here for Float8_4
+        e_limits = [-9,-6,8]
+
+        # shift a 0x1 in the exponent bits created by "significand_mask(Float32) + 0x1"
+        # to the first significand bit
+        # e_shift_subnorm is 14 for Float8_4
+        e_shift_subnorm = n_significant_bits(Float32)-(n_significant_bits(Float8_4)-1)+e_limits[2]-1
     end
 
-    for i = 0:255                   # all possible exponents for Float32
-        e = i - 127                 # subtract Float32 bias
-        if e < -6                   # Very small numbers map to +- zero
+    for i = 0:255                               # all possible exponents for Float32
+        e = i - 127                             # subtract Float32 bias
+        if e < e_limits[1]                      # Very small numbers map to +- zero
             basetable[i|0x000+1] = zero(T)
             basetable[i|0x100+1] = -zero(T)
             shifttable[i|0x000+1] = n_significant_bits(T)+1
             shifttable[i|0x100+1] = n_significant_bits(T)+1
-        elseif e < -2               # Small numbers map to denorms
+        elseif e < e_limits[2]                  # Small numbers map to denorms
             basetable[i|0x000+1] = zero(T)
             basetable[i|0x100+1] = -zero(T)
-            shifttable[i|0x000+1] = -e+17
-            shifttable[i|0x100+1] = -e+17
-        elseif e < 4                # Normal numbers just lose precision
+            shifttable[i|0x000+1] = -e+e_shift_subnorm
+            shifttable[i|0x100+1] = -e+e_shift_subnorm
+        elseif e < e_limits[3]                  # Normal numbers just lose precision
             basetable[i|0x000+1] = ((e+bias(T)) << n_significant_bits(T))
             basetable[i|0x100+1] = ((e+bias(T)) << n_significant_bits(T)) | sign_mask(T)
             shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)
             shifttable[i|0x100+1] = n_significant_bits(Float32)-n_significant_bits(T)
-        elseif e < 128              # Large numbers map to Infinity
+        elseif e < 128                          # Large numbers map to Infinity
             basetable[i|0x000+1] = inf8(T)
             basetable[i|0x100+1] = -inf8(T)
             shifttable[i|0x000+1] = n_significant_bits(T)+1
             shifttable[i|0x100+1] = n_significant_bits(T)+1
-        else                        # Infinity and NaN's stay Infinity and NaN's
+        else                                    # Infinity and NaN's stay Infinity and NaN's
             basetable[i|0x000+1] = inf8(T)
             basetable[i|0x100+1] = -inf8(T)
             shifttable[i|0x000+1] = n_significant_bits(Float32)-n_significant_bits(T)

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -10,7 +10,7 @@ using Test
     end
 end
 
-@testset "Conversion Float8 <-> Float32" begin
+@testset "Conversion Float8_4 <-> Float32" begin
 
     for i in 0x00:0xff
         if ~isnan(Float8_4(i))