core/num/dec2flt/
float.rs

1//! Helper trait for generic float types.
2
3use core::f64;
4
5use crate::fmt::{Debug, LowerExp};
6use crate::num::FpCategory;
7use crate::ops::{self, Add, Div, Mul, Neg};
8
/// Conversion to `T` with the semantics of an `as` cast, which may lose information.
///
/// This exists so that generic code can perform the kind of cast that `as` provides for
/// concrete primitive types.
pub trait CastInto<T>: Copy
where
    T: Copy,
{
    /// Converts `self` into `T`; implementations are expected to behave like `self as T`.
    fn cast(self) -> T;
}
13
14/// Collection of traits that allow us to be generic over integer size.
15pub trait Integer:
16    Sized
17    + Clone
18    + Copy
19    + Debug
20    + ops::Shr<u32, Output = Self>
21    + ops::Shl<u32, Output = Self>
22    + ops::BitAnd<Output = Self>
23    + ops::BitOr<Output = Self>
24    + PartialEq
25    + CastInto<i16>
26{
27    const ZERO: Self;
28    const ONE: Self;
29}
30
31macro_rules! int {
32    ($($ty:ty),+) => {
33        $(
34            impl CastInto<i16> for $ty {
35                fn cast(self) -> i16 {
36                    self as i16
37                }
38            }
39
40            impl Integer for $ty {
41                const ZERO: Self = 0;
42                const ONE: Self = 1;
43            }
44        )+
45    }
46}
47
48int!(u32, u64);
49
50/// A helper trait to avoid duplicating basically all the conversion code for IEEE floats.
51///
52/// See the parent module's doc comment for why this is necessary.
53///
54/// Should **never ever** be implemented for other types or be used outside the `dec2flt` module.
55#[doc(hidden)]
56pub trait RawFloat:
57    Sized
58    + Div<Output = Self>
59    + Neg<Output = Self>
60    + Mul<Output = Self>
61    + Add<Output = Self>
62    + LowerExp
63    + PartialEq
64    + PartialOrd
65    + Default
66    + Clone
67    + Copy
68    + Debug
69{
70    /// The unsigned integer with the same size as the float
71    type Int: Integer + Into<u64>;
72
73    /* general constants */
74
75    const INFINITY: Self;
76    const NEG_INFINITY: Self;
77    const NAN: Self;
78    const NEG_NAN: Self;
79
80    /// Bit width of the float
81    const BITS: u32;
82
83    /// The number of bits in the significand, *including* the hidden bit.
84    const SIG_TOTAL_BITS: u32;
85
86    const EXP_MASK: Self::Int;
87    const SIG_MASK: Self::Int;
88
89    /// The number of bits in the significand, *excluding* the hidden bit.
90    const SIG_BITS: u32 = Self::SIG_TOTAL_BITS - 1;
91
92    /// Number of bits in the exponent.
93    const EXP_BITS: u32 = Self::BITS - Self::SIG_BITS - 1;
94
95    /// The saturated (maximum bitpattern) value of the exponent, i.e. the infinite
96    /// representation.
97    ///
98    /// This shifted fully right, use `EXP_MASK` for the shifted value.
99    const EXP_SAT: u32 = (1 << Self::EXP_BITS) - 1;
100
101    /// Signed version of `EXP_SAT` since we convert a lot.
102    const INFINITE_POWER: i32 = Self::EXP_SAT as i32;
103
104    /// The exponent bias value. This is also the maximum value of the exponent.
105    const EXP_BIAS: u32 = Self::EXP_SAT >> 1;
106
107    /// Minimum exponent value of normal values.
108    const EXP_MIN: i32 = -(Self::EXP_BIAS as i32 - 1);
109
110    /// Round-to-even only happens for negative values of q
111    /// when q ≥ −4 in the 64-bit case and when q ≥ −17 in
112    /// the 32-bitcase.
113    ///
114    /// When q ≥ 0,we have that 5^q ≤ 2m+1. In the 64-bit case,we
115    /// have 5^q ≤ 2m+1 ≤ 2^54 or q ≤ 23. In the 32-bit case,we have
116    /// 5^q ≤ 2m+1 ≤ 2^25 or q ≤ 10.
117    ///
118    /// When q < 0, we have w ≥ (2m+1)×5^−q. We must have that w < 2^64
119    /// so (2m+1)×5^−q < 2^64. We have that 2m+1 > 2^53 (64-bit case)
120    /// or 2m+1 > 2^24 (32-bit case). Hence,we must have 2^53×5^−q < 2^64
121    /// (64-bit) and 2^24×5^−q < 2^64 (32-bit). Hence we have 5^−q < 2^11
122    /// or q ≥ −4 (64-bit case) and 5^−q < 2^40 or q ≥ −17 (32-bitcase).
123    ///
124    /// Thus we have that we only need to round ties to even when
125    /// we have that q ∈ [−4,23](in the 64-bit case) or q∈[−17,10]
126    /// (in the 32-bit case). In both cases,the power of five(5^|q|)
127    /// fits in a 64-bit word.
128    const MIN_EXPONENT_ROUND_TO_EVEN: i32;
129    const MAX_EXPONENT_ROUND_TO_EVEN: i32;
130
131    /* limits related to Fast pathing */
132
133    /// Largest decimal exponent for a non-infinite value.
134    ///
135    /// This is the max exponent in binary converted to the max exponent in decimal. Allows fast
136    /// pathing anything larger than `10^LARGEST_POWER_OF_TEN`, which will round to infinity.
137    const LARGEST_POWER_OF_TEN: i32 = {
138        let largest_pow2 = Self::EXP_BIAS + 1;
139        pow2_to_pow10(largest_pow2 as i64) as i32
140    };
141
142    /// Smallest decimal exponent for a non-zero value. This allows for fast pathing anything
143    /// smaller than `10^SMALLEST_POWER_OF_TEN`, which will round to zero.
144    ///
145    /// The smallest power of ten is represented by `⌊log10(2^-n / (2^64 - 1))⌋`, where `n` is
146    /// the smallest power of two. The `2^64 - 1)` denomenator comes from the number of values
147    /// that are representable by the intermediate storage format. I don't actually know _why_
148    /// the storage format is relevant here.
149    ///
150    /// The values may be calculated using the formula. Unfortunately we cannot calculate them at
151    /// compile time since intermediates exceed the range of an `f64`.
152    const SMALLEST_POWER_OF_TEN: i32;
153
154    /// Maximum exponent for a fast path case, or `⌊(SIG_BITS+1)/log2(5)⌋`
155    // assuming FLT_EVAL_METHOD = 0
156    const MAX_EXPONENT_FAST_PATH: i64 = {
157        let log2_5 = f64::consts::LOG2_10 - 1.0;
158        (Self::SIG_TOTAL_BITS as f64 / log2_5) as i64
159    };
160
161    /// Minimum exponent for a fast path case, or `-⌊(SIG_BITS+1)/log2(5)⌋`
162    const MIN_EXPONENT_FAST_PATH: i64 = -Self::MAX_EXPONENT_FAST_PATH;
163
164    /// Maximum exponent that can be represented for a disguised-fast path case.
165    /// This is `MAX_EXPONENT_FAST_PATH + ⌊(SIG_BITS+1)/log2(10)⌋`
166    const MAX_EXPONENT_DISGUISED_FAST_PATH: i64 =
167        Self::MAX_EXPONENT_FAST_PATH + (Self::SIG_TOTAL_BITS as f64 / f64::consts::LOG2_10) as i64;
168
169    /// Maximum mantissa for the fast-path (`1 << 53` for f64).
170    const MAX_MANTISSA_FAST_PATH: u64 = 1 << Self::SIG_TOTAL_BITS;
171
172    /// Converts integer into float through an as cast.
173    /// This is only called in the fast-path algorithm, and therefore
174    /// will not lose precision, since the value will always have
175    /// only if the value is <= Self::MAX_MANTISSA_FAST_PATH.
176    fn from_u64(v: u64) -> Self;
177
178    /// Performs a raw transmutation from an integer.
179    fn from_u64_bits(v: u64) -> Self;
180
181    /// Gets a small power-of-ten for fast-path multiplication.
182    fn pow10_fast_path(exponent: usize) -> Self;
183
184    /// Returns the category that this number falls into.
185    fn classify(self) -> FpCategory;
186
187    /// Transmute to the integer representation
188    fn to_bits(self) -> Self::Int;
189
190    /// Returns the mantissa, exponent and sign as integers.
191    ///
192    /// That is, this returns `(m, p, s)` such that `s * m * 2^p` represents the original float.
193    /// For 0, the exponent will be `-(EXP_BIAS + SIG_BITS`, which is the
194    /// minimum subnormal power.
195    fn integer_decode(self) -> (u64, i16, i8) {
196        let bits = self.to_bits();
197        let sign: i8 = if bits >> (Self::BITS - 1) == Self::Int::ZERO { 1 } else { -1 };
198        let mut exponent: i16 = ((bits & Self::EXP_MASK) >> Self::SIG_BITS).cast();
199        let mantissa = if exponent == 0 {
200            (bits & Self::SIG_MASK) << 1
201        } else {
202            (bits & Self::SIG_MASK) | (Self::Int::ONE << Self::SIG_BITS)
203        };
204        // Exponent bias + mantissa shift
205        exponent -= (Self::EXP_BIAS + Self::SIG_BITS) as i16;
206        (mantissa.into(), exponent, sign)
207    }
208}
209
/// Solve for `b` in `10^b = 2^a`, i.e. compute `a / log2(10)`.
///
/// The quotient is converted back with an `as` cast, so any fractional part is discarded
/// (truncation toward zero).
const fn pow2_to_pow10(a: i64) -> i64 {
    ((a as f64) / f64::consts::LOG2_10) as i64
}
215
216impl RawFloat for f32 {
217    type Int = u32;
218
219    const INFINITY: Self = f32::INFINITY;
220    const NEG_INFINITY: Self = f32::NEG_INFINITY;
221    const NAN: Self = f32::NAN;
222    const NEG_NAN: Self = -f32::NAN;
223
224    const BITS: u32 = 32;
225    const SIG_TOTAL_BITS: u32 = Self::MANTISSA_DIGITS;
226    const EXP_MASK: Self::Int = Self::EXP_MASK;
227    const SIG_MASK: Self::Int = Self::MAN_MASK;
228
229    const MIN_EXPONENT_ROUND_TO_EVEN: i32 = -17;
230    const MAX_EXPONENT_ROUND_TO_EVEN: i32 = 10;
231    const SMALLEST_POWER_OF_TEN: i32 = -65;
232
233    #[inline]
234    fn from_u64(v: u64) -> Self {
235        debug_assert!(v <= Self::MAX_MANTISSA_FAST_PATH);
236        v as _
237    }
238
239    #[inline]
240    fn from_u64_bits(v: u64) -> Self {
241        f32::from_bits((v & 0xFFFFFFFF) as u32)
242    }
243
244    fn pow10_fast_path(exponent: usize) -> Self {
245        #[allow(clippy::use_self)]
246        const TABLE: [f32; 16] =
247            [1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 0., 0., 0., 0., 0.];
248        TABLE[exponent & 15]
249    }
250
251    fn to_bits(self) -> Self::Int {
252        self.to_bits()
253    }
254
255    fn classify(self) -> FpCategory {
256        self.classify()
257    }
258}
259
260impl RawFloat for f64 {
261    type Int = u64;
262
263    const INFINITY: Self = Self::INFINITY;
264    const NEG_INFINITY: Self = Self::NEG_INFINITY;
265    const NAN: Self = Self::NAN;
266    const NEG_NAN: Self = -Self::NAN;
267
268    const BITS: u32 = 64;
269    const SIG_TOTAL_BITS: u32 = Self::MANTISSA_DIGITS;
270    const EXP_MASK: Self::Int = Self::EXP_MASK;
271    const SIG_MASK: Self::Int = Self::MAN_MASK;
272
273    const MIN_EXPONENT_ROUND_TO_EVEN: i32 = -4;
274    const MAX_EXPONENT_ROUND_TO_EVEN: i32 = 23;
275    const SMALLEST_POWER_OF_TEN: i32 = -342;
276
277    #[inline]
278    fn from_u64(v: u64) -> Self {
279        debug_assert!(v <= Self::MAX_MANTISSA_FAST_PATH);
280        v as _
281    }
282
283    #[inline]
284    fn from_u64_bits(v: u64) -> Self {
285        f64::from_bits(v)
286    }
287
288    fn pow10_fast_path(exponent: usize) -> Self {
289        const TABLE: [f64; 32] = [
290            1e0, 1e1, 1e2, 1e3, 1e4, 1e5, 1e6, 1e7, 1e8, 1e9, 1e10, 1e11, 1e12, 1e13, 1e14, 1e15,
291            1e16, 1e17, 1e18, 1e19, 1e20, 1e21, 1e22, 0., 0., 0., 0., 0., 0., 0., 0., 0.,
292        ];
293        TABLE[exponent & 31]
294    }
295
296    fn to_bits(self) -> Self::Int {
297        self.to_bits()
298    }
299
300    fn classify(self) -> FpCategory {
301        self.classify()
302    }
303}