Float16.h (11440B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- 2 * vim: set ts=8 sts=2 et sw=2 tw=80: 3 * This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef vm_Float16_h 8 #define vm_Float16_h 9 10 #include "mozilla/FloatingPoint.h" 11 #include "mozilla/MathAlgorithms.h" 12 13 #include <cstdint> 14 #include <cstring> 15 #include <limits> 16 #include <type_traits> 17 18 namespace js { 19 20 namespace half { 21 // This is extracted from Version 2.2.0 of the half library by Christian Rau. 22 // See https://sourceforge.net/projects/half/. 23 // The original copyright and MIT license are reproduced below: 24 25 // half - IEEE 754-based half-precision floating-point library. 26 // 27 // Copyright (c) 2012-2021 Christian Rau <rauy@users.sourceforge.net> 28 // 29 // Permission is hereby granted, free of charge, to any person obtaining a copy 30 // of this software and associated documentation files (the "Software"), to deal 31 // in the Software without restriction, including without limitation the rights 32 // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 33 // copies of the Software, and to permit persons to whom the Software is 34 // furnished to do so, subject to the following conditions: 35 // 36 // The above copyright notice and this permission notice shall be included in 37 // all copies or substantial portions of the Software. 38 // 39 // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 40 // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 41 // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
// IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

/// Maps a floating-point type to the unsigned integer type used to hold its
/// raw bit pattern. The primary template is the fallback for types without a
/// dedicated specialization.
template <typename T>
struct bits {
  using type = unsigned char;
};

/// cv-qualified types reuse the mapping of the unqualified type.
template <typename T>
struct bits<const T> : bits<T> {};
template <typename T>
struct bits<volatile T> : bits<T> {};
template <typename T>
struct bits<const volatile T> : bits<T> {};

/// `float` is stored in an unsigned integer of (at least) 32 bits width.
template <>
struct bits<float> {
  using type = std::uint_least32_t;
};

/// `double` is stored in an unsigned integer of (at least) 64 bits width.
template <>
struct bits<double> {
  using type = std::uint_least64_t;
};

/// Fastest unsigned integer of (at least) 32 bits width.
using uint32 = std::uint_fast32_t;

/// Result of a half-precision overflow.
/// \param sign half-precision value with sign bit only
/// \return \a sign combined with the bit pattern 0x7C00 (exponent field all
///         ones, significand zero)
constexpr unsigned int overflow(unsigned int sign = 0) { return 0x7C00 | sign; }

/// Result of a half-precision underflow.
/// \param sign half-precision value with sign bit only
/// \return \a sign unchanged, i.e. a zero carrying the given sign
constexpr unsigned int underflow(unsigned int sign = 0) { return sign; }

/// Round a truncated half-precision bit pattern.
///
/// With \a g and \a s each being 0 or 1, the pattern is incremented exactly
/// when the guard bit is set and either the sticky bit or the lowest bit of
/// \a value is set.
/// \param value finite half-precision number to round
/// \param g guard bit (most significant discarded bit)
/// \param s sticky bit (or of all but the most significant discarded bits)
/// \return rounded half-precision value
constexpr unsigned int rounded(unsigned int value, int g, int s) {
  const unsigned int increment = g & (s | value);
  return value + increment;
}

/// Convert IEEE single-precision to half-precision.
94 /// \param value single-precision value to convert 95 /// \return rounded half-precision value 96 inline unsigned int float2half_impl(float value) { 97 bits<float>::type fbits; 98 std::memcpy(&fbits, &value, sizeof(float)); 99 unsigned int sign = (fbits >> 16) & 0x8000; 100 fbits &= 0x7FFFFFFF; 101 if (fbits >= 0x7F800000) 102 return sign | 0x7C00 | 103 ((fbits > 0x7F800000) ? (0x200 | ((fbits >> 13) & 0x3FF)) : 0); 104 if (fbits >= 0x47800000) return overflow(sign); 105 if (fbits >= 0x38800000) 106 return rounded( 107 sign | (((fbits >> 23) - 112) << 10) | ((fbits >> 13) & 0x3FF), 108 (fbits >> 12) & 1, (fbits & 0xFFF) != 0); 109 if (fbits >= 0x33000000) { 110 int i = 125 - (fbits >> 23); 111 fbits = (fbits & 0x7FFFFF) | 0x800000; 112 return rounded(sign | (fbits >> (i + 1)), (fbits >> i) & 1, 113 (fbits & ((static_cast<uint32>(1) << i) - 1)) != 0); 114 } 115 if (fbits != 0) return underflow(sign); 116 return sign; 117 } 118 119 /// Convert IEEE double-precision to half-precision. 120 /// \param value double-precision value to convert 121 /// \return rounded half-precision value 122 inline unsigned int float2half_impl(double value) { 123 bits<double>::type dbits; 124 std::memcpy(&dbits, &value, sizeof(double)); 125 uint32 hi = dbits >> 32, lo = dbits & 0xFFFFFFFF; 126 unsigned int sign = (hi >> 16) & 0x8000; 127 hi &= 0x7FFFFFFF; 128 if (hi >= 0x7FF00000) 129 return sign | 0x7C00 | 130 ((dbits & 0xFFFFFFFFFFFFF) ? 
(0x200 | ((hi >> 10) & 0x3FF)) : 0); 131 if (hi >= 0x40F00000) return overflow(sign); 132 if (hi >= 0x3F100000) 133 return rounded(sign | (((hi >> 20) - 1008) << 10) | ((hi >> 10) & 0x3FF), 134 (hi >> 9) & 1, ((hi & 0x1FF) | lo) != 0); 135 if (hi >= 0x3E600000) { 136 int i = 1018 - (hi >> 20); 137 hi = (hi & 0xFFFFF) | 0x100000; 138 return rounded(sign | (hi >> (i + 1)), (hi >> i) & 1, 139 ((hi & ((static_cast<uint32>(1) << i) - 1)) | lo) != 0); 140 } 141 if ((hi | lo) != 0) return underflow(sign); 142 return sign; 143 } 144 145 template <typename T> 146 inline T half2float_impl(unsigned int value); 147 148 /// Convert half-precision to IEEE double-precision. 149 /// \param value half-precision value to convert 150 /// \return double-precision value 151 template <> 152 inline double half2float_impl(unsigned int value) { 153 uint32 hi = static_cast<uint32>(value & 0x8000) << 16; 154 unsigned int abs = value & 0x7FFF; 155 if (abs) { 156 hi |= 0x3F000000 << static_cast<unsigned>(abs >= 0x7C00); 157 158 // Mozilla change: Replace the loop with CountLeadingZeroes32. 159 // for (; abs < 0x400; abs <<= 1, hi -= 0x100000); 160 if (abs < 0x400) { 161 // NOTE: CountLeadingZeroes32(0x400) is 21. 162 uint32 shift = mozilla::CountLeadingZeroes32(uint32_t(abs)) - 21; 163 abs <<= shift; 164 hi -= shift * 0x100000; 165 } 166 167 hi += static_cast<uint32>(abs) << 10; 168 } 169 bits<double>::type dbits = static_cast<bits<double>::type>(hi) << 32; 170 double out; 171 std::memcpy(&out, &dbits, sizeof(double)); 172 return out; 173 } 174 175 /// Convert half-precision to IEEE single-precision. 
176 /// \param value half-precision value to convert 177 /// \return single-precision value 178 template <> 179 inline float half2float_impl(unsigned int value) { 180 bits<float>::type fbits = static_cast<bits<float>::type>(value & 0x8000) 181 << 16; 182 unsigned int abs = value & 0x7FFF; 183 if (abs) { 184 fbits |= 0x38000000 << static_cast<unsigned>(abs >= 0x7C00); 185 186 // Mozilla change: Replace the loop with CountLeadingZeroes32. 187 // for (; abs < 0x400; abs <<= 1, fbits -= 0x800000); 188 if (abs < 0x400) { 189 // NOTE: CountLeadingZeroes32(0x400) is 21. 190 uint32 shift = mozilla::CountLeadingZeroes32(uint32_t(abs)) - 21; 191 abs <<= shift; 192 fbits -= shift * 0x800000; 193 } 194 195 fbits += static_cast<bits<float>::type>(abs) << 13; 196 } 197 198 float out; 199 std::memcpy(&out, &fbits, sizeof(float)); 200 return out; 201 } 202 } // namespace half 203 204 class float16 final { 205 uint16_t val; 206 207 public: 208 constexpr float16() = default; 209 constexpr float16(const float16&) = default; 210 211 explicit float16(float x) : val(half::float2half_impl(x)) {} 212 explicit float16(double x) : val(half::float2half_impl(x)) {} 213 214 explicit float16(std::int8_t x) : float16(float(x)) {} 215 explicit float16(std::int16_t x) : float16(float(x)) {} 216 explicit float16(std::int32_t x) : float16(float(x)) {} 217 explicit float16(std::int64_t x) : float16(double(x)) {} 218 219 explicit float16(std::uint8_t x) : float16(float(x)) {} 220 explicit float16(std::uint16_t x) : float16(float(x)) {} 221 explicit float16(std::uint32_t x) : float16(float(x)) {} 222 explicit float16(std::uint64_t x) : float16(double(x)) {} 223 224 explicit float16(bool x) : float16(float(x)) {} 225 226 constexpr float16& operator=(const float16&) = default; 227 228 float16& operator=(float x) { 229 *this = float16{x}; 230 return *this; 231 } 232 233 float16& operator=(double x) { 234 *this = float16{x}; 235 return *this; 236 } 237 238 explicit operator float() const { return 
half::half2float_impl<float>(val); } 239 explicit operator double() const { 240 return half::half2float_impl<double>(val); 241 } 242 243 bool operator==(float16 x) const { 244 uint16_t abs = val & 0x7FFF; 245 246 // ±0 is equal to ±0. 247 if (abs == 0) { 248 return (x.val & 0x7FFF) == 0; 249 } 250 251 // If neither +0 nor NaN, then both bit representations must be equal. 252 if (abs <= 0x7C00) { 253 return val == x.val; 254 } 255 256 // NaN isn't equal to any value. 257 return false; 258 } 259 260 bool operator!=(float16 x) const { return !(*this == x); } 261 262 uint16_t toRawBits() const { return val; } 263 264 static constexpr float16 fromRawBits(uint16_t bits) { 265 float16 f16{}; 266 f16.val = bits; 267 return f16; 268 } 269 }; 270 271 static_assert(sizeof(float16) == 2, "float16 has no extra padding"); 272 273 static_assert( 274 std::is_trivial_v<float16>, 275 "float16 must be trivial to be eligible for memcpy/memset optimizations"); 276 277 } // namespace js 278 279 template <> 280 class std::numeric_limits<js::float16> { 281 public: 282 static constexpr bool is_specialized = true; 283 static constexpr bool is_signed = true; 284 static constexpr bool is_integer = false; 285 static constexpr bool is_exact = false; 286 static constexpr bool has_infinity = true; 287 static constexpr bool has_quiet_NaN = true; 288 static constexpr bool has_signaling_NaN = true; 289 static constexpr std::float_denorm_style has_denorm = std::denorm_present; 290 static constexpr bool has_denorm_loss = false; 291 static constexpr std::float_round_style round_style = std::round_to_nearest; 292 static constexpr bool is_iec559 = true; 293 static constexpr bool is_bounded = true; 294 static constexpr bool is_modulo = false; 295 static constexpr int digits = 11; 296 static constexpr int digits10 = 3; 297 static constexpr int max_digits10 = 5; 298 static constexpr int radix = 2; 299 static constexpr int min_exponent = -13; 300 static constexpr int min_exponent10 = -4; 301 static constexpr 
int max_exponent = 16; 302 static constexpr int max_exponent10 = 4; 303 static constexpr bool traps = false; 304 static constexpr bool tinyness_before = false; 305 306 static constexpr auto min() noexcept { 307 return js::float16::fromRawBits(0x400); 308 } 309 static constexpr auto lowest() noexcept { 310 return js::float16::fromRawBits(0xFBFF); 311 } 312 static constexpr auto max() noexcept { 313 return js::float16::fromRawBits(0x7BFF); 314 } 315 static constexpr auto epsilon() noexcept { 316 return js::float16::fromRawBits(0x1400); 317 } 318 static constexpr auto round_error() noexcept { 319 return js::float16::fromRawBits(0x3800); 320 } 321 static constexpr auto infinity() noexcept { 322 return js::float16::fromRawBits(0x7C00); 323 } 324 static constexpr auto quiet_NaN() noexcept { 325 return js::float16::fromRawBits(0x7E00); 326 } 327 static constexpr auto signaling_NaN() noexcept { 328 return js::float16::fromRawBits(0x7D00); 329 } 330 static constexpr auto denorm_min() noexcept { 331 return js::float16::fromRawBits(0x0001); 332 } 333 }; 334 335 template <> 336 struct mozilla::FloatingPointTrait<js::float16> { 337 protected: 338 using Bits = uint16_t; 339 340 static constexpr unsigned kExponentWidth = 5; 341 static constexpr unsigned kSignificandWidth = 10; 342 }; 343 344 #endif // vm_Float16_h