StatProfilerHTML.jl report
Generated on Thu, 21 Dec 2023 13:06:16
File source code
Line Exclusive Inclusive Code
1 # This file is a part of Julia. License is MIT: https://julialang.org/license
2
3 const IEEEFloat = Union{Float16, Float32, Float64}
4
5 ## floating point traits ##
6
7 """
8 Inf16
9
10 Positive infinity of type [`Float16`](@ref).
11 """
12 const Inf16 = bitcast(Float16, 0x7c00)
13 """
14 NaN16
15
16 A not-a-number value of type [`Float16`](@ref).
17 """
18 const NaN16 = bitcast(Float16, 0x7e00)
19 """
20 Inf32
21
22 Positive infinity of type [`Float32`](@ref).
23 """
24 const Inf32 = bitcast(Float32, 0x7f800000)
25 """
26 NaN32
27
28 A not-a-number value of type [`Float32`](@ref).
29 """
30 const NaN32 = bitcast(Float32, 0x7fc00000)
31 const Inf64 = bitcast(Float64, 0x7ff0000000000000)
32 const NaN64 = bitcast(Float64, 0x7ff8000000000000)
33
34 const Inf = Inf64
35 """
36 Inf, Inf64
37
38 Positive infinity of type [`Float64`](@ref).
39
40 See also: [`isfinite`](@ref), [`typemax`](@ref), [`NaN`](@ref), [`Inf32`](@ref).
41
42 # Examples
43 ```jldoctest
44 julia> π/0
45 Inf
46
47 julia> +1.0 / -0.0
48 -Inf
49
50 julia> ℯ^-Inf
51 0.0
52 ```
53 """
54 Inf, Inf64
55
56 const NaN = NaN64
57 """
58 NaN, NaN64
59
60 A not-a-number value of type [`Float64`](@ref).
61
62 See also: [`isnan`](@ref), [`missing`](@ref), [`NaN32`](@ref), [`Inf`](@ref).
63
64 # Examples
65 ```jldoctest
66 julia> 0/0
67 NaN
68
69 julia> Inf - Inf
70 NaN
71
72 julia> NaN == NaN, isequal(NaN, NaN), NaN === NaN
73 (false, true, true)
74 ```
75 """
76 NaN, NaN64
77
78 # bit patterns
79 reinterpret(::Type{Unsigned}, x::Float64) = reinterpret(UInt64, x)
80 reinterpret(::Type{Unsigned}, x::Float32) = reinterpret(UInt32, x)
81 reinterpret(::Type{Unsigned}, x::Float16) = reinterpret(UInt16, x)
82 reinterpret(::Type{Signed}, x::Float64) = reinterpret(Int64, x)
83 reinterpret(::Type{Signed}, x::Float32) = reinterpret(Int32, x)
84 reinterpret(::Type{Signed}, x::Float16) = reinterpret(Int16, x)
85
86 sign_mask(::Type{Float64}) = 0x8000_0000_0000_0000
87 exponent_mask(::Type{Float64}) = 0x7ff0_0000_0000_0000
88 exponent_one(::Type{Float64}) = 0x3ff0_0000_0000_0000
89 exponent_half(::Type{Float64}) = 0x3fe0_0000_0000_0000
90 significand_mask(::Type{Float64}) = 0x000f_ffff_ffff_ffff
91
92 sign_mask(::Type{Float32}) = 0x8000_0000
93 exponent_mask(::Type{Float32}) = 0x7f80_0000
94 exponent_one(::Type{Float32}) = 0x3f80_0000
95 exponent_half(::Type{Float32}) = 0x3f00_0000
96 significand_mask(::Type{Float32}) = 0x007f_ffff
97
98 sign_mask(::Type{Float16}) = 0x8000
99 exponent_mask(::Type{Float16}) = 0x7c00
100 exponent_one(::Type{Float16}) = 0x3c00
101 exponent_half(::Type{Float16}) = 0x3800
102 significand_mask(::Type{Float16}) = 0x03ff
103
104 mantissa(x::T) where {T} = reinterpret(Unsigned, x) & significand_mask(T)
105
106 for T in (Float16, Float32, Float64)
107 @eval significand_bits(::Type{$T}) = $(trailing_ones(significand_mask(T)))
108 @eval exponent_bits(::Type{$T}) = $(sizeof(T)*8 - significand_bits(T) - 1)
109 @eval exponent_bias(::Type{$T}) = $(Int(exponent_one(T) >> significand_bits(T)))
110 # maximum float exponent
111 @eval exponent_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)) - exponent_bias(T) - 1)
112 # maximum float exponent without bias
113 @eval exponent_raw_max(::Type{$T}) = $(Int(exponent_mask(T) >> significand_bits(T)))
114 end
115
116 """
117 exponent_max(T)
118
119 Maximum [`exponent`](@ref) value for a floating point number of type `T`.
120
121 # Examples
122 ```jldoctest
123 julia> Base.exponent_max(Float64)
124 1023
125 ```
126
127 Note, `exponent_max(T) + 1` is a possible value of the exponent field
128 with bias, which might be used as sentinel value for `Inf` or `NaN`.
129 """
130 function exponent_max end
131
132 """
133 exponent_raw_max(T)
134
135 Maximum value of the [`exponent`](@ref) field for a floating point number of type `T` without bias,
136 i.e. the maximum integer value representable by [`exponent_bits(T)`](@ref) bits.
137 """
138 function exponent_raw_max end
139
140 """
141 uabs(x::Integer)
142
143 Return the absolute value of `x`, possibly returning a different type should the
144 operation be susceptible to overflow. This typically arises when `x` is a two's complement
145 signed integer, so that `abs(typemin(x)) == typemin(x) < 0`, in which case the result of
146 `uabs(x)` will be an unsigned integer of the same size.
147 """
148 uabs(x::Integer) = abs(x)
149 uabs(x::BitSigned) = unsigned(abs(x))
150
151 ## conversions to floating-point ##
152
153 # TODO: deprecate in 2.0
154 Float16(x::Integer) = convert(Float16, convert(Float32, x)::Float32)
155
156 for t1 in (Float16, Float32, Float64)
157 for st in (Int8, Int16, Int32, Int64)
158 @eval begin
159 (::Type{$t1})(x::($st)) = sitofp($t1, x)
160 promote_rule(::Type{$t1}, ::Type{$st}) = $t1
161 end
162 end
163 for ut in (Bool, UInt8, UInt16, UInt32, UInt64)
164 @eval begin
165 (::Type{$t1})(x::($ut)) = uitofp($t1, x)
166 promote_rule(::Type{$t1}, ::Type{$ut}) = $t1
167 end
168 end
169 end
170
171 Bool(x::Real) = x==0 ? false : x==1 ? true : throw(InexactError(:Bool, Bool, x))
172
173 promote_rule(::Type{Float64}, ::Type{UInt128}) = Float64
174 promote_rule(::Type{Float64}, ::Type{Int128}) = Float64
175 promote_rule(::Type{Float32}, ::Type{UInt128}) = Float32
176 promote_rule(::Type{Float32}, ::Type{Int128}) = Float32
177 promote_rule(::Type{Float16}, ::Type{UInt128}) = Float16
178 promote_rule(::Type{Float16}, ::Type{Int128}) = Float16
179
180 function Float64(x::UInt128)
181 if x < UInt128(1) << 104 # Can fit it in two 52 bits mantissas
182 low_exp = 0x1p52
183 high_exp = 0x1p104
184 low_bits = (x % UInt64) & Base.significand_mask(Float64)
185 low_value = reinterpret(Float64, reinterpret(UInt64, low_exp) | low_bits) - low_exp
186 high_bits = ((x >> 52) % UInt64)
187 high_value = reinterpret(Float64, reinterpret(UInt64, high_exp) | high_bits) - high_exp
188 low_value + high_value
189 else # Large enough that low bits only affect rounding, pack low bits
190 low_exp = 0x1p76
191 high_exp = 0x1p128
192 low_bits = ((x >> 12) % UInt64) >> 12 | (x % UInt64) & 0xFFFFFF
193 low_value = reinterpret(Float64, reinterpret(UInt64, low_exp) | low_bits) - low_exp
194 high_bits = ((x >> 76) % UInt64)
195 high_value = reinterpret(Float64, reinterpret(UInt64, high_exp) | high_bits) - high_exp
196 low_value + high_value
197 end
198 end
199
200 function Float64(x::Int128)
201 sign_bit = ((x >> 127) % UInt64) << 63
202 ux = uabs(x)
203 if ux < UInt128(1) << 104 # Can fit it in two 52 bits mantissas
204 low_exp = 0x1p52
205 high_exp = 0x1p104
206 low_bits = (ux % UInt64) & Base.significand_mask(Float64)
207 low_value = reinterpret(Float64, reinterpret(UInt64, low_exp) | low_bits) - low_exp
208 high_bits = ((ux >> 52) % UInt64)
209 high_value = reinterpret(Float64, reinterpret(UInt64, high_exp) | high_bits) - high_exp
210 reinterpret(Float64, sign_bit | reinterpret(UInt64, low_value + high_value))
211 else # Large enough that low bits only affect rounding, pack low bits
212 low_exp = 0x1p76
213 high_exp = 0x1p128
214 low_bits = ((ux >> 12) % UInt64) >> 12 | (ux % UInt64) & 0xFFFFFF
215 low_value = reinterpret(Float64, reinterpret(UInt64, low_exp) | low_bits) - low_exp
216 high_bits = ((ux >> 76) % UInt64)
217 high_value = reinterpret(Float64, reinterpret(UInt64, high_exp) | high_bits) - high_exp
218 reinterpret(Float64, sign_bit | reinterpret(UInt64, low_value + high_value))
219 end
220 end
221
222 function Float32(x::UInt128)
223 x == 0 && return 0f0
224 n = top_set_bit(x) # ndigits0z(x,2)
225 if n <= 24
226 y = ((x % UInt32) << (24-n)) & 0x007f_ffff
227 else
228 y = ((x >> (n-25)) % UInt32) & 0x00ff_ffff # keep 1 extra bit
229 y = (y+one(UInt32))>>1 # round, ties up (extra leading bit in case of next exponent)
230 y &= ~UInt32(trailing_zeros(x) == (n-25)) # fix last bit to round to even
231 end
232 d = ((n+126) % UInt32) << 23
233 reinterpret(Float32, d + y)
234 end
235
236 function Float32(x::Int128)
237 x == 0 && return 0f0
238 s = ((x >>> 96) % UInt32) & 0x8000_0000 # sign bit
239 x = abs(x) % UInt128
240 n = top_set_bit(x) # ndigits0z(x,2)
241 if n <= 24
242 y = ((x % UInt32) << (24-n)) & 0x007f_ffff
243 else
244 y = ((x >> (n-25)) % UInt32) & 0x00ff_ffff # keep 1 extra bit
245 y = (y+one(UInt32))>>1 # round, ties up (extra leading bit in case of next exponent)
246 y &= ~UInt32(trailing_zeros(x) == (n-25)) # fix last bit to round to even
247 end
248 d = ((n+126) % UInt32) << 23
249 reinterpret(Float32, s | d + y)
250 end
251
252 # TODO: optimize
253 Float16(x::UInt128) = convert(Float16, Float64(x))
254 Float16(x::Int128) = convert(Float16, Float64(x))
255
256 Float16(x::Float32) = fptrunc(Float16, x)
257 Float16(x::Float64) = fptrunc(Float16, x)
258 Float32(x::Float64) = fptrunc(Float32, x)
259
260 Float32(x::Float16) = fpext(Float32, x)
261 Float64(x::Float32) = fpext(Float64, x)
262 Float64(x::Float16) = fpext(Float64, x)
263
264 AbstractFloat(x::Bool) = Float64(x)
265 AbstractFloat(x::Int8) = Float64(x)
266 AbstractFloat(x::Int16) = Float64(x)
267 AbstractFloat(x::Int32) = Float64(x)
268 AbstractFloat(x::Int64) = Float64(x) # LOSSY
269 AbstractFloat(x::Int128) = Float64(x) # LOSSY
270 AbstractFloat(x::UInt8) = Float64(x)
271 AbstractFloat(x::UInt16) = Float64(x)
272 AbstractFloat(x::UInt32) = Float64(x)
273 AbstractFloat(x::UInt64) = Float64(x) # LOSSY
274 AbstractFloat(x::UInt128) = Float64(x) # LOSSY
275
276 Bool(x::Float16) = x==0 ? false : x==1 ? true : throw(InexactError(:Bool, Bool, x))
277
278 """
279 float(x)
280
281 Convert a number or array to a floating point data type.
282
283 See also: [`complex`](@ref), [`oftype`](@ref), [`convert`](@ref).
284
285 # Examples
286 ```jldoctest
287 julia> float(1:1000)
288 1.0:1.0:1000.0
289
290 julia> float(typemax(Int32))
291 2.147483647e9
292 ```
293 """
294 float(x) = AbstractFloat(x)
295
296 """
297 float(T::Type)
298
299 Return an appropriate type to represent a value of type `T` as a floating point value.
300 Equivalent to `typeof(float(zero(T)))`.
301
302 # Examples
303 ```jldoctest
304 julia> float(Complex{Int})
305 ComplexF64 (alias for Complex{Float64})
306
307 julia> float(Int)
308 Float64
309 ```
310 """
311 float(::Type{T}) where {T<:Number} = typeof(float(zero(T)))
312 float(::Type{T}) where {T<:AbstractFloat} = T
313 float(::Type{Union{}}, slurp...) = Union{}(0.0)
314
315 """
316 unsafe_trunc(T, x)
317
318 Return the nearest integral value of type `T` whose absolute value is
319 less than or equal to the absolute value of `x`. If the value is not representable by `T`,
320 an arbitrary value will be returned.
321 See also [`trunc`](@ref).
322
323 # Examples
324 ```jldoctest
325 julia> unsafe_trunc(Int, -2.2)
326 -2
327
328 julia> unsafe_trunc(Int, NaN)
329 -9223372036854775808
330 ```
331 """
332 function unsafe_trunc end
333
334 for Ti in (Int8, Int16, Int32, Int64)
335 @eval begin
336 unsafe_trunc(::Type{$Ti}, x::IEEEFloat) = fptosi($Ti, x)
337 end
338 end
339 for Ti in (UInt8, UInt16, UInt32, UInt64)
340 @eval begin
341 unsafe_trunc(::Type{$Ti}, x::IEEEFloat) = fptoui($Ti, x)
342 end
343 end
344
345 function unsafe_trunc(::Type{UInt128}, x::Float64)
346 xu = reinterpret(UInt64,x)
347 k = Int(xu >> 52) & 0x07ff - 1075
348 xu = (xu & 0x000f_ffff_ffff_ffff) | 0x0010_0000_0000_0000
349 if k <= 0
350 UInt128(xu >> -k)
351 else
352 UInt128(xu) << k
353 end
354 end
355 function unsafe_trunc(::Type{Int128}, x::Float64)
356 copysign(unsafe_trunc(UInt128,x) % Int128, x)
357 end
358
359 function unsafe_trunc(::Type{UInt128}, x::Float32)
360 xu = reinterpret(UInt32,x)
361 k = Int(xu >> 23) & 0x00ff - 150
362 xu = (xu & 0x007f_ffff) | 0x0080_0000
363 if k <= 0
364 UInt128(xu >> -k)
365 else
366 UInt128(xu) << k
367 end
368 end
369 function unsafe_trunc(::Type{Int128}, x::Float32)
370 copysign(unsafe_trunc(UInt128,x) % Int128, x)
371 end
372
373 unsafe_trunc(::Type{UInt128}, x::Float16) = unsafe_trunc(UInt128, Float32(x))
374 unsafe_trunc(::Type{Int128}, x::Float16) = unsafe_trunc(Int128, Float32(x))
375
376 # matches convert methods
377 # also determines floor, ceil, round
378 trunc(::Type{Signed}, x::IEEEFloat) = trunc(Int,x)
379 trunc(::Type{Unsigned}, x::IEEEFloat) = trunc(UInt,x)
380 trunc(::Type{Integer}, x::IEEEFloat) = trunc(Int,x)
381
382 # fallbacks
383 floor(::Type{T}, x::AbstractFloat) where {T<:Integer} = trunc(T,round(x, RoundDown))
384 ceil(::Type{T}, x::AbstractFloat) where {T<:Integer} = trunc(T,round(x, RoundUp))
385 round(::Type{T}, x::AbstractFloat) where {T<:Integer} = trunc(T,round(x, RoundNearest))
386
387 # Bool
388 trunc(::Type{Bool}, x::AbstractFloat) = (-1 < x < 2) ? 1 <= x : throw(InexactError(:trunc, Bool, x))
389 floor(::Type{Bool}, x::AbstractFloat) = (0 <= x < 2) ? 1 <= x : throw(InexactError(:floor, Bool, x))
390 ceil(::Type{Bool}, x::AbstractFloat) = (-1 < x <= 1) ? 0 < x : throw(InexactError(:ceil, Bool, x))
391 round(::Type{Bool}, x::AbstractFloat) = (-0.5 <= x < 1.5) ? 0.5 < x : throw(InexactError(:round, Bool, x))
392
393 round(x::IEEEFloat, r::RoundingMode{:ToZero}) = trunc_llvm(x)
394 round(x::IEEEFloat, r::RoundingMode{:Down}) = floor_llvm(x)
395 round(x::IEEEFloat, r::RoundingMode{:Up}) = ceil_llvm(x)
396 round(x::IEEEFloat, r::RoundingMode{:Nearest}) = rint_llvm(x)
397
398 ## floating point promotions ##
399 promote_rule(::Type{Float32}, ::Type{Float16}) = Float32
400 promote_rule(::Type{Float64}, ::Type{Float16}) = Float64
401 promote_rule(::Type{Float64}, ::Type{Float32}) = Float64
402
403 widen(::Type{Float16}) = Float32
404 widen(::Type{Float32}) = Float64
405
406 ## floating point arithmetic ##
407 -(x::IEEEFloat) = neg_float(x)
408
409 5 (2 %) 5 (2 %)
5 (2 %) samples spent in +
3 (60 %) (ex.), 3 (60 %) (incl.) when called from macro expansion line 157
1 (20 %) (ex.), 1 (20 %) (incl.) when called from + line 473
1 (20 %) (ex.), 1 (20 %) (incl.) when called from brusselator_f line 17
+(x::T, y::T) where {T<:IEEEFloat} = add_float(x, y)
410 4 (1 %) 4 (1 %)
4 (1 %) samples spent in -
4 (100 %) (ex.), 4 (100 %) (incl.) when called from macro expansion line 157
-(x::T, y::T) where {T<:IEEEFloat} = sub_float(x, y)
411 15 (6 %) 15 (6 %)
15 (6 %) samples spent in *
15 (100 %) (ex.), 15 (100 %) (incl.) when called from macro expansion line 157
*(x::T, y::T) where {T<:IEEEFloat} = mul_float(x, y)
412 /(x::T, y::T) where {T<:IEEEFloat} = div_float(x, y)
413
414 muladd(x::T, y::T, z::T) where {T<:IEEEFloat} = muladd_float(x, y, z)
415
416 # TODO: faster floating point div?
417 # TODO: faster floating point fld?
418 # TODO: faster floating point mod?
419
420 function unbiased_exponent(x::T) where {T<:IEEEFloat}
421 return (reinterpret(Unsigned, x) & exponent_mask(T)) >> significand_bits(T)
422 end
423
424 function explicit_mantissa_noinfnan(x::T) where {T<:IEEEFloat}
425 m = mantissa(x)
426 issubnormal(x) || (m |= significand_mask(T) + uinttype(T)(1))
427 return m
428 end
429
430 function _to_float(number::U, ep) where {U<:Unsigned}
431 F = floattype(U)
432 S = signed(U)
433 epint = unsafe_trunc(S,ep)
434 lz::signed(U) = unsafe_trunc(S, Core.Intrinsics.ctlz_int(number) - U(exponent_bits(F)))
435 number <<= lz
436 epint -= lz
437 bits = U(0)
438 if epint >= 0
439 bits = number & significand_mask(F)
440 bits |= ((epint + S(1)) << significand_bits(F)) & exponent_mask(F)
441 else
442 bits = (number >> -epint) & significand_mask(F)
443 end
444 return reinterpret(F, bits)
445 end
446
447 @assume_effects :terminates_locally :nothrow function rem_internal(x::T, y::T) where {T<:IEEEFloat}
448 xuint = reinterpret(Unsigned, x)
449 yuint = reinterpret(Unsigned, y)
450 if xuint <= yuint
451 if xuint < yuint
452 return x
453 end
454 return zero(T)
455 end
456
457 e_x = unbiased_exponent(x)
458 e_y = unbiased_exponent(y)
459 # Most common case where |y| is "very normal" and |x/y| < 2^EXPONENT_WIDTH
460 if e_y > (significand_bits(T)) && (e_x - e_y) <= (exponent_bits(T))
461 m_x = explicit_mantissa_noinfnan(x)
462 m_y = explicit_mantissa_noinfnan(y)
463 d = urem_int((m_x << (e_x - e_y)), m_y)
464 iszero(d) && return zero(T)
465 return _to_float(d, e_y - uinttype(T)(1))
466 end
467 # Both are subnormals
468 if e_x == 0 && e_y == 0
469 return reinterpret(T, urem_int(xuint, yuint) & significand_mask(T))
470 end
471
472 m_x = explicit_mantissa_noinfnan(x)
473 e_x -= uinttype(T)(1)
474 m_y = explicit_mantissa_noinfnan(y)
475 lz_m_y = uinttype(T)(exponent_bits(T))
476 if e_y > 0
477 e_y -= uinttype(T)(1)
478 else
479 m_y = mantissa(y)
480 lz_m_y = Core.Intrinsics.ctlz_int(m_y)
481 end
482
483 tz_m_y = Core.Intrinsics.cttz_int(m_y)
484 sides_zeroes_cnt = lz_m_y + tz_m_y
485
486 # n>0
487 exp_diff = e_x - e_y
488 # Shift hy right until the end or n = 0
489 right_shift = min(exp_diff, tz_m_y)
490 m_y >>= right_shift
491 exp_diff -= right_shift
492 e_y += right_shift
493 # Shift hx left until the end or n = 0
494 left_shift = min(exp_diff, uinttype(T)(exponent_bits(T)))
495 m_x <<= left_shift
496 exp_diff -= left_shift
497
498 m_x = urem_int(m_x, m_y)
499 iszero(m_x) && return zero(T)
500 iszero(exp_diff) && return _to_float(m_x, e_y)
501
502 while exp_diff > sides_zeroes_cnt
503 exp_diff -= sides_zeroes_cnt
504 m_x <<= sides_zeroes_cnt
505 m_x = urem_int(m_x, m_y)
506 end
507 m_x <<= exp_diff
508 m_x = urem_int(m_x, m_y)
509 return _to_float(m_x, e_y)
510 end
511
512 function rem(x::T, y::T) where {T<:IEEEFloat}
513 if isfinite(x) && !iszero(x) && isfinite(y) && !iszero(y)
514 return copysign(rem_internal(abs(x), abs(y)), x)
515 elseif isinf(x) || isnan(y) || iszero(y) # y can still be Inf
516 return T(NaN)
517 else
518 return x
519 end
520 end
521
522 function mod(x::T, y::T) where {T<:AbstractFloat}
523 r = rem(x,y)
524 if r == 0
525 copysign(r,y)
526 elseif (r > 0) ⊻ (y > 0)
527 r+y
528 else
529 r
530 end
531 end
532
533 ## floating point comparisons ##
534 ==(x::T, y::T) where {T<:IEEEFloat} = eq_float(x, y)
535 !=(x::T, y::T) where {T<:IEEEFloat} = ne_float(x, y)
536 <( x::T, y::T) where {T<:IEEEFloat} = lt_float(x, y)
537 <=(x::T, y::T) where {T<:IEEEFloat} = le_float(x, y)
538
539 isequal(x::T, y::T) where {T<:IEEEFloat} = fpiseq(x, y)
540
541 # interpret as sign-magnitude integer
542 @inline function _fpint(x)
543 IntT = inttype(typeof(x))
544 ix = reinterpret(IntT, x)
545 return ifelse(ix < zero(IntT), ix ⊻ typemax(IntT), ix)
546 end
547
548 @inline function isless(a::T, b::T) where T<:IEEEFloat
549 (isnan(a) || isnan(b)) && return !isnan(a)
550
551 return _fpint(a) < _fpint(b)
552 end
553
554 # Exact Float (Tf) vs Integer (Ti) comparisons
555 # Assumes:
556 # - typemax(Ti) == 2^n-1
557 # - typemax(Ti) can't be exactly represented by Tf:
558 # => Tf(typemax(Ti)) == 2^n or Inf
559 # - typemin(Ti) can be exactly represented by Tf
560 #
561 # 1. convert y::Ti to float fy::Tf
562 # 2. perform Tf comparison x vs fy
563 # 3. if x == fy, check if (1) resulted in rounding:
564 # a. convert fy back to Ti and compare with original y
565 # b. unsafe_convert undefined behaviour if fy == Tf(typemax(Ti))
566 # (but consequently x == fy > y)
567 for Ti in (Int64,UInt64,Int128,UInt128)
568 for Tf in (Float32,Float64)
569 @eval begin
570 function ==(x::$Tf, y::$Ti)
571 fy = ($Tf)(y)
572 (x == fy) & (fy != $(Tf(typemax(Ti)))) & (y == unsafe_trunc($Ti,fy))
573 end
574 ==(y::$Ti, x::$Tf) = x==y
575
576 function <(x::$Ti, y::$Tf)
577 fx = ($Tf)(x)
578 (fx < y) | ((fx == y) & ((fx == $(Tf(typemax(Ti)))) | (x < unsafe_trunc($Ti,fx)) ))
579 end
580 function <=(x::$Ti, y::$Tf)
581 fx = ($Tf)(x)
582 (fx < y) | ((fx == y) & ((fx == $(Tf(typemax(Ti)))) | (x <= unsafe_trunc($Ti,fx)) ))
583 end
584
585 function <(x::$Tf, y::$Ti)
586 fy = ($Tf)(y)
587 (x < fy) | ((x == fy) & (fy < $(Tf(typemax(Ti)))) & (unsafe_trunc($Ti,fy) < y))
588 end
589 function <=(x::$Tf, y::$Ti)
590 fy = ($Tf)(y)
591 (x < fy) | ((x == fy) & (fy < $(Tf(typemax(Ti)))) & (unsafe_trunc($Ti,fy) <= y))
592 end
593 end
594 end
595 end
596 for op in (:(==), :<, :<=)
597 @eval begin
598 ($op)(x::Float16, y::Union{Int128,UInt128,Int64,UInt64}) = ($op)(Float64(x), Float64(y))
599 ($op)(x::Union{Int128,UInt128,Int64,UInt64}, y::Float16) = ($op)(Float64(x), Float64(y))
600
601 ($op)(x::Union{Float16,Float32}, y::Union{Int32,UInt32}) = ($op)(Float64(x), Float64(y))
602 ($op)(x::Union{Int32,UInt32}, y::Union{Float16,Float32}) = ($op)(Float64(x), Float64(y))
603
604 ($op)(x::Float16, y::Union{Int16,UInt16}) = ($op)(Float32(x), Float32(y))
605 ($op)(x::Union{Int16,UInt16}, y::Float16) = ($op)(Float32(x), Float32(y))
606 end
607 end
608
609
610 abs(x::IEEEFloat) = abs_float(x)
611
612 """
613 isnan(f) -> Bool
614
615 Test whether a number value is a NaN, an indeterminate value which is neither an infinity
616 nor a finite number ("not a number").
617
618 See also: [`iszero`](@ref), [`isone`](@ref), [`isinf`](@ref), [`ismissing`](@ref).
619 """
620 isnan(x::AbstractFloat) = (x != x)::Bool
621 isnan(x::Number) = false
622
623 isfinite(x::AbstractFloat) = !isnan(x - x)
624 isfinite(x::Real) = decompose(x)[3] != 0
625 isfinite(x::Integer) = true
626
627 """
628 isinf(f) -> Bool
629
630 Test whether a number is infinite.
631
632 See also: [`Inf`](@ref), [`iszero`](@ref), [`isfinite`](@ref), [`isnan`](@ref).
633 """
634 isinf(x::Real) = !isnan(x) & !isfinite(x)
635 isinf(x::IEEEFloat) = abs(x) === oftype(x, Inf)
636
637 const hx_NaN = hash_uint64(reinterpret(UInt64, NaN))
638 function hash(x::Float64, h::UInt)
639 # see comments on trunc and hash(Real, UInt)
640 if typemin(Int64) <= x < typemax(Int64)
641 xi = fptosi(Int64, x)
642 if isequal(xi, x)
643 return hash(xi, h)
644 end
645 elseif typemin(UInt64) <= x < typemax(UInt64)
646 xu = fptoui(UInt64, x)
647 if isequal(xu, x)
648 return hash(xu, h)
649 end
650 elseif isnan(x)
651 return hx_NaN ⊻ h # NaN does not have a stable bit pattern
652 end
653 return hash_uint64(bitcast(UInt64, x)) - 3h
654 end
655
656 hash(x::Float32, h::UInt) = hash(Float64(x), h)
657
658 function hash(x::Float16, h::UInt)
659 # see comments on trunc and hash(Real, UInt)
660 if isfinite(x) # all finite Float16 fit in Int64
661 xi = fptosi(Int64, x)
662 if isequal(xi, x)
663 return hash(xi, h)
664 end
665 elseif isnan(x)
666 return hx_NaN ⊻ h # NaN does not have a stable bit pattern
667 end
668 return hash_uint64(bitcast(UInt64, Float64(x))) - 3h
669 end
670
671 ## generic hashing for rational values ##
672 function hash(x::Real, h::UInt)
673 # decompose x as num*2^pow/den
674 num, pow, den = decompose(x)
675
676 # handle special values
677 num == 0 && den == 0 && return hash(NaN, h)
678 num == 0 && return hash(ifelse(den > 0, 0.0, -0.0), h)
679 den == 0 && return hash(ifelse(num > 0, Inf, -Inf), h)
680
681 # normalize decomposition
682 if den < 0
683 num = -num
684 den = -den
685 end
686 num_z = trailing_zeros(num)
687 num >>= num_z
688 den_z = trailing_zeros(den)
689 den >>= den_z
690 pow += num_z - den_z
691 # If the real can be represented as an Int64, UInt64, or Float64, hash as those types.
692 # To be an Integer the denominator must be 1 and the power must be non-negative.
693 if den == 1
694 # left = ceil(log2(num*2^pow))
695 left = top_set_bit(abs(num)) + pow
696 # 2^-1074 is the minimum Float64 so if the power is smaller, not a Float64
697 if -1074 <= pow
698 if 0 <= pow # if pow is non-negative, it is an integer
699 left <= 63 && return hash(Int64(num) << Int(pow), h)
700 left <= 64 && !signbit(num) && return hash(UInt64(num) << Int(pow), h)
701 end # typemin(Int64) handled by Float64 case
702 # 2^1024 is the maximum Float64 so if the power is greater, not a Float64
703 # Float64s only have 53 mantisa bits (including implicit bit)
704 left <= 1024 && left - pow <= 53 && return hash(ldexp(Float64(num), pow), h)
705 end
706 else
707 h = hash_integer(den, h)
708 end
709 # handle generic rational values
710 h = hash_integer(pow, h)
711 h = hash_integer(num, h)
712 return h
713 end
714
715 #=
716 `decompose(x)`: non-canonical decomposition of rational values as `num*2^pow/den`.
717
718 The decompose function is the point where rational-valued numeric types that support
719 hashing hook into the hashing protocol. `decompose(x)` should return three integer
720 values `num, pow, den`, such that the value of `x` is mathematically equal to
721
722 num*2^pow/den
723
724 The decomposition need not be canonical in the sense that it just needs to be *some*
725 way to express `x` in this form, not any particular way – with the restriction that
726 `num` and `den` may not share any odd common factors. They may, however, have powers
727 of two in common – the generic hashing code will normalize those as necessary.
728
729 Special values:
730
731 - `x` is zero: `num` should be zero and `den` should have the same sign as `x`
732 - `x` is infinite: `den` should be zero and `num` should have the same sign as `x`
733 - `x` is not a number: `num` and `den` should both be zero
734 =#
735
736 decompose(x::Integer) = x, 0, 1
737
738 function decompose(x::Float16)::NTuple{3,Int}
739 isnan(x) && return 0, 0, 0
740 isinf(x) && return ifelse(x < 0, -1, 1), 0, 0
741 n = reinterpret(UInt16, x)
742 s = (n & 0x03ff) % Int16
743 e = ((n & 0x7c00) >> 10) % Int
744 s |= Int16(e != 0) << 10
745 d = ifelse(signbit(x), -1, 1)
746 s, e - 25 + (e == 0), d
747 end
748
749 function decompose(x::Float32)::NTuple{3,Int}
750 isnan(x) && return 0, 0, 0
751 isinf(x) && return ifelse(x < 0, -1, 1), 0, 0
752 n = reinterpret(UInt32, x)
753 s = (n & 0x007fffff) % Int32
754 e = ((n & 0x7f800000) >> 23) % Int
755 s |= Int32(e != 0) << 23
756 d = ifelse(signbit(x), -1, 1)
757 s, e - 150 + (e == 0), d
758 end
759
760 function decompose(x::Float64)::Tuple{Int64, Int, Int}
761 isnan(x) && return 0, 0, 0
762 isinf(x) && return ifelse(x < 0, -1, 1), 0, 0
763 n = reinterpret(UInt64, x)
764 s = (n & 0x000fffffffffffff) % Int64
765 e = ((n & 0x7ff0000000000000) >> 52) % Int
766 s |= Int64(e != 0) << 52
767 d = ifelse(signbit(x), -1, 1)
768 s, e - 1075 + (e == 0), d
769 end
770
771
772 """
773 precision(num::AbstractFloat; base::Integer=2)
774 precision(T::Type; base::Integer=2)
775
776 Get the precision of a floating point number, as defined by the effective number of bits in
777 the significand, or the precision of a floating-point type `T` (its current default, if
778 `T` is a variable-precision type like [`BigFloat`](@ref)).
779
780 If `base` is specified, then it returns the maximum corresponding
781 number of significand digits in that base.
782
783 !!! compat "Julia 1.8"
784 The `base` keyword requires at least Julia 1.8.
785 """
786 function precision end
787
788 _precision(::Type{Float16}) = 11
789 _precision(::Type{Float32}) = 24
790 _precision(::Type{Float64}) = 53
791 function _precision(x, base::Integer=2)
792 base > 1 || throw(DomainError(base, "`base` cannot be less than 2."))
793 p = _precision(x)
794 return base == 2 ? Int(p) : floor(Int, p / log2(base))
795 end
796 precision(::Type{T}; base::Integer=2) where {T<:AbstractFloat} = _precision(T, base)
797 precision(::T; base::Integer=2) where {T<:AbstractFloat} = precision(T; base)
798
799
800 """
801 nextfloat(x::AbstractFloat, n::Integer)
802
803 The result of `n` iterative applications of `nextfloat` to `x` if `n >= 0`, or `-n`
804 applications of [`prevfloat`](@ref) if `n < 0`.
805 """
806 function nextfloat(f::IEEEFloat, d::Integer)
807 F = typeof(f)
808 fumax = reinterpret(Unsigned, F(Inf))
809 U = typeof(fumax)
810
811 isnan(f) && return f
812 fi = reinterpret(Signed, f)
813 fneg = fi < 0
814 fu = unsigned(fi & typemax(fi))
815
816 dneg = d < 0
817 da = uabs(d)
818 if da > typemax(U)
819 fneg = dneg
820 fu = fumax
821 else
822 du = da % U
823 if fneg ⊻ dneg
824 if du > fu
825 fu = min(fumax, du - fu)
826 fneg = !fneg
827 else
828 fu = fu - du
829 end
830 else
831 if fumax - fu < du
832 fu = fumax
833 else
834 fu = fu + du
835 end
836 end
837 end
838 if fneg
839 fu |= sign_mask(F)
840 end
841 reinterpret(F, fu)
842 end
843
844 """
845 nextfloat(x::AbstractFloat)
846
847 Return the smallest floating point number `y` of the same type as `x` such `x < y`. If no
848 such `y` exists (e.g. if `x` is `Inf` or `NaN`), then return `x`.
849
850 See also: [`prevfloat`](@ref), [`eps`](@ref), [`issubnormal`](@ref).
851 """
852 nextfloat(x::AbstractFloat) = nextfloat(x,1)
853
854 """
855 prevfloat(x::AbstractFloat, n::Integer)
856
857 The result of `n` iterative applications of `prevfloat` to `x` if `n >= 0`, or `-n`
858 applications of [`nextfloat`](@ref) if `n < 0`.
859 """
860 prevfloat(x::AbstractFloat, d::Integer) = nextfloat(x, -d)
861
862 """
863 prevfloat(x::AbstractFloat)
864
865 Return the largest floating point number `y` of the same type as `x` such `y < x`. If no
866 such `y` exists (e.g. if `x` is `-Inf` or `NaN`), then return `x`.
867 """
868 prevfloat(x::AbstractFloat) = nextfloat(x,-1)
869
870 for Ti in (Int8, Int16, Int32, Int64, Int128, UInt8, UInt16, UInt32, UInt64, UInt128)
871 for Tf in (Float16, Float32, Float64)
872 if Ti <: Unsigned || sizeof(Ti) < sizeof(Tf)
873 # Here `Tf(typemin(Ti))-1` is exact, so we can compare the lower-bound
874 # directly. `Tf(typemax(Ti))+1` is either always exactly representable, or
875 # rounded to `Inf` (e.g. when `Ti==UInt128 && Tf==Float32`).
876 @eval begin
877 function trunc(::Type{$Ti},x::$Tf)
878 if $(Tf(typemin(Ti))-one(Tf)) < x < $(Tf(typemax(Ti))+one(Tf))
879 return unsafe_trunc($Ti,x)
880 else
881 throw(InexactError(:trunc, $Ti, x))
882 end
883 end
884 function (::Type{$Ti})(x::$Tf)
885 if ($(Tf(typemin(Ti))) <= x <= $(Tf(typemax(Ti)))) && isinteger(x)
886 return unsafe_trunc($Ti,x)
887 else
888 throw(InexactError($(Expr(:quote,Ti.name.name)), $Ti, x))
889 end
890 end
891 end
892 else
893 # Here `eps(Tf(typemin(Ti))) > 1`, so the only value which can be truncated to
894 # `Tf(typemin(Ti)` is itself. Similarly, `Tf(typemax(Ti))` is inexact and will
895 # be rounded up. This assumes that `Tf(typemin(Ti)) > -Inf`, which is true for
896 # these types, but not for `Float16` or larger integer types.
897 @eval begin
898 function trunc(::Type{$Ti},x::$Tf)
899 if $(Tf(typemin(Ti))) <= x < $(Tf(typemax(Ti)))
900 return unsafe_trunc($Ti,x)
901 else
902 throw(InexactError(:trunc, $Ti, x))
903 end
904 end
905 function (::Type{$Ti})(x::$Tf)
906 if ($(Tf(typemin(Ti))) <= x < $(Tf(typemax(Ti)))) && isinteger(x)
907 return unsafe_trunc($Ti,x)
908 else
909 throw(InexactError($(Expr(:quote,Ti.name.name)), $Ti, x))
910 end
911 end
912 end
913 end
914 end
915 end
916
917 """
918 issubnormal(f) -> Bool
919
920 Test whether a floating point number is subnormal.
921
922 An IEEE floating point number is [subnormal](https://en.wikipedia.org/wiki/Subnormal_number)
923 when its exponent bits are zero and its significand is not zero.
924
925 # Examples
926 ```jldoctest
927 julia> floatmin(Float32)
928 1.1754944f-38
929
930 julia> issubnormal(1.0f-37)
931 false
932
933 julia> issubnormal(1.0f-38)
934 true
935 ```
936 """
937 function issubnormal(x::T) where {T<:IEEEFloat}
938 y = reinterpret(Unsigned, x)
939 (y & exponent_mask(T) == 0) & (y & significand_mask(T) != 0)
940 end
941
942 ispow2(x::AbstractFloat) = !iszero(x) && frexp(x)[1] == 0.5
943 iseven(x::AbstractFloat) = isinteger(x) && (abs(x) > maxintfloat(x) || iseven(Integer(x)))
944 isodd(x::AbstractFloat) = isinteger(x) && abs(x) ≤ maxintfloat(x) && isodd(Integer(x))
945
946 @eval begin
947 typemin(::Type{Float16}) = $(bitcast(Float16, 0xfc00))
948 typemax(::Type{Float16}) = $(Inf16)
949 typemin(::Type{Float32}) = $(-Inf32)
950 typemax(::Type{Float32}) = $(Inf32)
951 typemin(::Type{Float64}) = $(-Inf64)
952 typemax(::Type{Float64}) = $(Inf64)
953 typemin(x::T) where {T<:Real} = typemin(T)
954 typemax(x::T) where {T<:Real} = typemax(T)
955
956 floatmin(::Type{Float16}) = $(bitcast(Float16, 0x0400))
957 floatmin(::Type{Float32}) = $(bitcast(Float32, 0x00800000))
958 floatmin(::Type{Float64}) = $(bitcast(Float64, 0x0010000000000000))
959 floatmax(::Type{Float16}) = $(bitcast(Float16, 0x7bff))
960 floatmax(::Type{Float32}) = $(bitcast(Float32, 0x7f7fffff))
961 floatmax(::Type{Float64}) = $(bitcast(Float64, 0x7fefffffffffffff))
962
963 eps(x::AbstractFloat) = isfinite(x) ? abs(x) >= floatmin(x) ? ldexp(eps(typeof(x)), exponent(x)) : nextfloat(zero(x)) : oftype(x, NaN)
964 eps(::Type{Float16}) = $(bitcast(Float16, 0x1400))
965 eps(::Type{Float32}) = $(bitcast(Float32, 0x34000000))
966 eps(::Type{Float64}) = $(bitcast(Float64, 0x3cb0000000000000))
967 eps() = eps(Float64)
968 end
969
970 """
971 floatmin(T = Float64)
972
973 Return the smallest positive normal number representable by the floating-point
974 type `T`.
975
976 # Examples
977 ```jldoctest
978 julia> floatmin(Float16)
979 Float16(6.104e-5)
980
981 julia> floatmin(Float32)
982 1.1754944f-38
983
984 julia> floatmin()
985 2.2250738585072014e-308
986 ```
987 """
988 floatmin(x::T) where {T<:AbstractFloat} = floatmin(T)
989
990 """
991 floatmax(T = Float64)
992
993 Return the largest finite number representable by the floating-point type `T`.
994
995 See also: [`typemax`](@ref), [`floatmin`](@ref), [`eps`](@ref).
996
997 # Examples
998 ```jldoctest
999 julia> floatmax(Float16)
1000 Float16(6.55e4)
1001
1002 julia> floatmax(Float32)
1003 3.4028235f38
1004
1005 julia> floatmax()
1006 1.7976931348623157e308
1007
1008 julia> typemax(Float64)
1009 Inf
1010 ```
1011 """
1012 floatmax(x::T) where {T<:AbstractFloat} = floatmax(T)
1013
1014 floatmin() = floatmin(Float64)
1015 floatmax() = floatmax(Float64)
1016
1017 """
1018 eps(::Type{T}) where T<:AbstractFloat
1019 eps()
1020
1021 Return the *machine epsilon* of the floating point type `T` (`T = Float64` by
1022 default). This is defined as the gap between 1 and the next largest value representable by
1023 `typeof(one(T))`, and is equivalent to `eps(one(T))`. (Since `eps(T)` is a
1024 bound on the *relative error* of `T`, it is a "dimensionless" quantity like [`one`](@ref).)
1025
1026 # Examples
1027 ```jldoctest
1028 julia> eps()
1029 2.220446049250313e-16
1030
1031 julia> eps(Float32)
1032 1.1920929f-7
1033
1034 julia> 1.0 + eps()
1035 1.0000000000000002
1036
1037 julia> 1.0 + eps()/2
1038 1.0
1039 ```
1040 """
1041 eps(::Type{<:AbstractFloat})
1042
1043 """
1044 eps(x::AbstractFloat)
1045
1046 Return the *unit in last place* (ulp) of `x`. This is the distance between consecutive
1047 representable floating point values at `x`. In most cases, if the distance on either side
1048 of `x` is different, then the larger of the two is taken, that is
1049
1050 eps(x) == max(x-prevfloat(x), nextfloat(x)-x)
1051
1052 The exceptions to this rule are the smallest and largest finite values
1053 (e.g. `nextfloat(-Inf)` and `prevfloat(Inf)` for [`Float64`](@ref)), which round to the
1054 smaller of the values.
1055
1056 The rationale for this behavior is that `eps` bounds the floating point rounding
1057 error. Under the default `RoundNearest` rounding mode, if ``y`` is a real number and ``x``
1058 is the nearest floating point number to ``y``, then
1059
1060 ```math
1061 |y-x| \\leq \\operatorname{eps}(x)/2.
1062 ```
1063
1064 See also: [`nextfloat`](@ref), [`issubnormal`](@ref), [`floatmax`](@ref).
1065
1066 # Examples
1067 ```jldoctest
1068 julia> eps(1.0)
1069 2.220446049250313e-16
1070
1071 julia> eps(prevfloat(2.0))
1072 2.220446049250313e-16
1073
1074 julia> eps(2.0)
1075 4.440892098500626e-16
1076
1077 julia> x = prevfloat(Inf) # largest finite Float64
1078 1.7976931348623157e308
1079
1080 julia> x + eps(x)/2 # rounds up
1081 Inf
1082
1083 julia> x + prevfloat(eps(x)/2) # rounds down
1084 1.7976931348623157e308
1085 ```
1086 """
1087 eps(::AbstractFloat)
1088
1089
1090 ## byte order swaps for arbitrary-endianness serialization/deserialization ##
1091 bswap(x::IEEEFloat) = bswap_int(x)
1092
1093 # integer size of float
1094 uinttype(::Type{Float64}) = UInt64
1095 uinttype(::Type{Float32}) = UInt32
1096 uinttype(::Type{Float16}) = UInt16
1097 inttype(::Type{Float64}) = Int64
1098 inttype(::Type{Float32}) = Int32
1099 inttype(::Type{Float16}) = Int16
1100 # float size of integer
1101 floattype(::Type{UInt64}) = Float64
1102 floattype(::Type{UInt32}) = Float32
1103 floattype(::Type{UInt16}) = Float16
1104 floattype(::Type{Int64}) = Float64
1105 floattype(::Type{Int32}) = Float32
1106 floattype(::Type{Int16}) = Float16
1107
1108
1109 ## Array operations on floating point numbers ##
1110
1111 float(A::AbstractArray{<:AbstractFloat}) = A
1112
1113 function float(A::AbstractArray{T}) where T
1114 if !isconcretetype(T)
1115 error("`float` not defined on abstractly-typed arrays; please convert to a more specific type")
1116 end
1117 convert(AbstractArray{typeof(float(zero(T)))}, A)
1118 end
1119
1120 float(r::StepRange) = float(r.start):float(r.step):float(last(r))
1121 float(r::UnitRange) = float(r.start):float(last(r))
1122 float(r::StepRangeLen{T}) where {T} =
1123 StepRangeLen{typeof(float(T(r.ref)))}(float(r.ref), float(r.step), length(r), r.offset)
1124 function float(r::LinRange)
1125 LinRange(float(r.start), float(r.stop), length(r))
1126 end