tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

math-inl.h (59380B)


      1 // Copyright 2020 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 // Include guard (still compiled once per target)
     17 #if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \
     18    defined(HWY_TARGET_TOGGLE)  // NOLINT
     19 #ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
     20 #undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
     21 #else
     22 #define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
     23 #endif
     24 
     25 #include <stddef.h>
     26 #include <stdint.h>
     27 
     28 #include "hwy/highway.h"
     29 
     30 HWY_BEFORE_NAMESPACE();
     31 namespace hwy {
     32 namespace HWY_NAMESPACE {
     33 
     34 /**
     35 * Highway SIMD version of std::acos(x).
     36 *
     37 * Valid Lane Types: float32, float64
     38 *        Max Error: ULP = 2
     39 *      Valid Range: [-1, +1]
     40 * @return arc cosine of 'x'
     41 */
     42 template <class D, class V>
     43 HWY_INLINE V Acos(D d, V x);
     44 template <class D, class V>
     45 HWY_NOINLINE V CallAcos(const D d, VecArg<V> x) {
     46  return Acos(d, x);
     47 }
     48 
     49 /**
     50 * Highway SIMD version of std::acosh(x).
     51 *
     52 * Valid Lane Types: float32, float64
     53 *        Max Error: ULP = 3
     54 *      Valid Range: float32[1, +FLT_MAX], float64[1, +DBL_MAX]
     55 * @return hyperbolic arc cosine of 'x'
     56 */
     57 template <class D, class V>
     58 HWY_INLINE V Acosh(D d, V x);
     59 template <class D, class V>
     60 HWY_NOINLINE V CallAcosh(const D d, VecArg<V> x) {
     61  return Acosh(d, x);
     62 }
     63 
     64 /**
     65 * Highway SIMD version of std::asin(x).
     66 *
     67 * Valid Lane Types: float32, float64
     68 *        Max Error: ULP = 2
     69 *      Valid Range: [-1, +1]
     70 * @return arc sine of 'x'
     71 */
     72 template <class D, class V>
     73 HWY_INLINE V Asin(D d, V x);
     74 template <class D, class V>
     75 HWY_NOINLINE V CallAsin(const D d, VecArg<V> x) {
     76  return Asin(d, x);
     77 }
     78 
     79 /**
     80 * Highway SIMD version of std::asinh(x).
     81 *
     82 * Valid Lane Types: float32, float64
     83 *        Max Error: ULP = 3
     84 *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
     85 * @return hyperbolic arc sine of 'x'
     86 */
     87 template <class D, class V>
     88 HWY_INLINE V Asinh(D d, V x);
     89 template <class D, class V>
     90 HWY_NOINLINE V CallAsinh(const D d, VecArg<V> x) {
     91  return Asinh(d, x);
     92 }
     93 
     94 /**
     95 * Highway SIMD version of std::atan(x).
     96 *
     97 * Valid Lane Types: float32, float64
     98 *        Max Error: ULP = 3
     99 *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
    100 * @return arc tangent of 'x'
    101 */
    102 template <class D, class V>
    103 HWY_INLINE V Atan(D d, V x);
    104 template <class D, class V>
    105 HWY_NOINLINE V CallAtan(const D d, VecArg<V> x) {
    106  return Atan(d, x);
    107 }
    108 
    109 /**
    110 * Highway SIMD version of std::atanh(x).
    111 *
    112 * Valid Lane Types: float32, float64
    113 *        Max Error: ULP = 3
    114 *      Valid Range: (-1, +1)
    115 * @return hyperbolic arc tangent of 'x'
    116 */
    117 template <class D, class V>
    118 HWY_INLINE V Atanh(D d, V x);
    119 template <class D, class V>
    120 HWY_NOINLINE V CallAtanh(const D d, VecArg<V> x) {
    121  return Atanh(d, x);
    122 }
    123 
    124 // Atan2 was added later and some users may be implementing it themselves, so
    125 // notify them that this version of Highway defines it already.
    126 #ifndef HWY_HAVE_ATAN2
    127 #define HWY_HAVE_ATAN2 1
    128 #endif
    129 
    130 /**
    131 * Highway SIMD version of std::atan2(x).
    132 *
    133 * Valid Lane Types: float32, float64
    134 * Correctly handles negative zero, infinities, and NaN.
    135 * @return atan2 of 'y', 'x'
    136 */
    137 template <class D, class V = VFromD<D>, class M = MFromD<D>,
    138          typename T = TFromD<D>>
    139 HWY_INLINE V Atan2(const D d, V y, V x) {
    140  const V kHalf = Set(d, static_cast<T>(+0.5));
    141  const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
    142  const V kPi2 = Mul(kPi, kHalf);
    143 
    144  const V k0 = Zero(d);
    145  const M y_0 = Eq(y, k0);
    146  const M x_0 = Eq(x, k0);
    147  const M x_neg = Lt(x, k0);
    148  const M y_inf = IsInf(y);
    149  const M x_inf = IsInf(x);
    150  const M nan = Or(IsNaN(y), IsNaN(x));
    151 
    152  const V if_xneg_pi = IfThenElseZero(x_neg, kPi);
    153  // x= +inf: pi/4; -inf: 3*pi/4; else: pi/2
    154  const V if_yinf = Mul(kHalf, IfThenElse(x_inf, Add(kPi2, if_xneg_pi), kPi));
    155 
    156  V t = Atan(d, Div(y, x));
    157  // Disambiguate between quadrants 1/3 and 2/4 by adding (Q2: Pi; Q3: -Pi).
    158  t = Add(t, CopySignToAbs(if_xneg_pi, y));
    159  // Special cases for 0 and infinity:
    160  t = IfThenElse(x_inf, if_xneg_pi, t);
    161  t = IfThenElse(x_0, kPi2, t);
    162  t = IfThenElse(y_inf, if_yinf, t);
    163  t = IfThenElse(y_0, if_xneg_pi, t);
    164  // Any input NaN => NaN, otherwise fix sign.
    165  return IfThenElse(nan, NaN(d), CopySign(t, y));
    166 }
    167 template <class D, class V>
    168 HWY_NOINLINE V CallAtan2(const D d, VecArg<V> y, VecArg<V> x) {
    169  return Atan2(d, y, x);
    170 }
    171 
    172 /**
    173 * Highway SIMD version of std::cos(x).
    174 *
    175 * Valid Lane Types: float32, float64
    176 *        Max Error: ULP = 3
    177 *      Valid Range: [-39000, +39000]
    178 * @return cosine of 'x'
    179 */
    180 template <class D, class V>
    181 HWY_INLINE V Cos(D d, V x);
    182 template <class D, class V>
    183 HWY_NOINLINE V CallCos(const D d, VecArg<V> x) {
    184  return Cos(d, x);
    185 }
    186 
    187 /**
    188 * Highway SIMD version of std::exp(x).
    189 *
    190 * Valid Lane Types: float32, float64
    191 *        Max Error: ULP = 1
    192 *      Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706]
    193 * @return e^x
    194 */
    195 template <class D, class V>
    196 HWY_INLINE V Exp(D d, V x);
    197 template <class D, class V>
    198 HWY_NOINLINE V CallExp(const D d, VecArg<V> x) {
    199  return Exp(d, x);
    200 }
    201 
    202 /**
    203 * Highway SIMD version of std::exp2(x).
    204 *
    205 * Valid Lane Types: float32, float64
    206 *        Max Error: ULP = 2
    207 *      Valid Range: float32[-FLT_MAX, +128], float64[-DBL_MAX, +1024]
    208 * @return 2^x
    209 */
    210 template <class D, class V>
    211 HWY_INLINE V Exp2(D d, V x);
    212 template <class D, class V>
    213 HWY_NOINLINE V CallExp2(const D d, VecArg<V> x) {
    214  return Exp2(d, x);
    215 }
    216 
    217 /**
    218 * Highway SIMD version of std::expm1(x).
    219 *
    220 * Valid Lane Types: float32, float64
    221 *        Max Error: ULP = 4
    222 *      Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706]
    223 * @return e^x - 1
    224 */
    225 template <class D, class V>
    226 HWY_INLINE V Expm1(D d, V x);
    227 template <class D, class V>
    228 HWY_NOINLINE V CallExpm1(const D d, VecArg<V> x) {
    229  return Expm1(d, x);
    230 }
    231 
    232 /**
    233 * Highway SIMD version of std::log(x).
    234 *
    235 * Valid Lane Types: float32, float64
    236 *        Max Error: ULP = 4
    237 *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
    238 * @return natural logarithm of 'x'
    239 */
    240 template <class D, class V>
    241 HWY_INLINE V Log(D d, V x);
    242 template <class D, class V>
    243 HWY_NOINLINE V CallLog(const D d, VecArg<V> x) {
    244  return Log(d, x);
    245 }
    246 
    247 /**
    248 * Highway SIMD version of std::log10(x).
    249 *
    250 * Valid Lane Types: float32, float64
    251 *        Max Error: ULP = 2
    252 *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
    253 * @return base 10 logarithm of 'x'
    254 */
    255 template <class D, class V>
    256 HWY_INLINE V Log10(D d, V x);
    257 template <class D, class V>
    258 HWY_NOINLINE V CallLog10(const D d, VecArg<V> x) {
    259  return Log10(d, x);
    260 }
    261 
    262 /**
    263 * Highway SIMD version of std::log1p(x).
    264 *
    265 * Valid Lane Types: float32, float64
    266 *        Max Error: ULP = 2
    267 *      Valid Range: float32[0, +FLT_MAX], float64[0, +DBL_MAX]
    268 * @return log(1 + x)
    269 */
    270 template <class D, class V>
    271 HWY_INLINE V Log1p(D d, V x);
    272 template <class D, class V>
    273 HWY_NOINLINE V CallLog1p(const D d, VecArg<V> x) {
    274  return Log1p(d, x);
    275 }
    276 
    277 /**
    278 * Highway SIMD version of std::log2(x).
    279 *
    280 * Valid Lane Types: float32, float64
    281 *        Max Error: ULP = 2
    282 *      Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX]
    283 * @return base 2 logarithm of 'x'
    284 */
    285 template <class D, class V>
    286 HWY_INLINE V Log2(D d, V x);
    287 template <class D, class V>
    288 HWY_NOINLINE V CallLog2(const D d, VecArg<V> x) {
    289  return Log2(d, x);
    290 }
    291 
    292 /**
    293 * Highway SIMD version of std::sin(x).
    294 *
    295 * Valid Lane Types: float32, float64
    296 *        Max Error: ULP = 3
    297 *      Valid Range: [-39000, +39000]
    298 * @return sine of 'x'
    299 */
    300 template <class D, class V>
    301 HWY_INLINE V Sin(D d, V x);
    302 template <class D, class V>
    303 HWY_NOINLINE V CallSin(const D d, VecArg<V> x) {
    304  return Sin(d, x);
    305 }
    306 
    307 /**
    308 * Highway SIMD version of std::sinh(x).
    309 *
    310 * Valid Lane Types: float32, float64
    311 *        Max Error: ULP = 4
    312 *      Valid Range: float32[-88.7228, +88.7228], float64[-709, +709]
    313 * @return hyperbolic sine of 'x'
    314 */
    315 template <class D, class V>
    316 HWY_INLINE V Sinh(D d, V x);
    317 template <class D, class V>
    318 HWY_NOINLINE V CallSinh(const D d, VecArg<V> x) {
    319  return Sinh(d, x);
    320 }
    321 
    322 /**
    323 * Highway SIMD version of std::tanh(x).
    324 *
    325 * Valid Lane Types: float32, float64
    326 *        Max Error: ULP = 4
    327 *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
    328 * @return hyperbolic tangent of 'x'
    329 */
    330 template <class D, class V>
    331 HWY_INLINE V Tanh(D d, V x);
    332 template <class D, class V>
    333 HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) {
    334  return Tanh(d, x);
    335 }
    336 
    337 /**
    338 * Highway SIMD version of SinCos.
    339 * Compute the sine and cosine at the same time
    340 * The performance should be around the same as calling Sin.
    341 *
    342 * Valid Lane Types: float32, float64
    343 *        Max Error: ULP = 1
    344 *      Valid Range: [-39000, +39000]
    345 */
    346 template <class D, class V>
    347 HWY_INLINE void SinCos(D d, V x, V& s, V& c);
    348 template <class D, class V>
    349 HWY_NOINLINE void CallSinCos(const D d, VecArg<V> x, V& s, V& c) {
    350  SinCos(d, x, s, c);
    351 }
    352 
    353 /**
    354 * Highway SIMD version of Hypot
    355 *
    356 * Valid Lane Types: float32, float64
    357 *        Max Error: ULP = 4
    358 *      Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX]
    359 * @return hypotenuse of a and b
    360 */
    361 template <class D, class V>
    362 HWY_INLINE V Hypot(D d, V a, V b);
    363 template <class D, class V>
    364 HWY_NOINLINE V CallHypot(const D d, VecArg<V> a, VecArg<V> b) {
    365  return Hypot(d, a, b);
    366 }
    367 
    368 ////////////////////////////////////////////////////////////////////////////////
    369 // Implementation
    370 ////////////////////////////////////////////////////////////////////////////////
    371 namespace impl {
    372 
    373 // Estrin's Scheme is a faster method for evaluating large polynomials on
    374 // super scalar architectures. It works by factoring the Horner's Method
    375 // polynomial into power of two sub-trees that can be evaluated in parallel.
    376 // Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme
    377 template <class T>
    378 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) {
    379  return MulAdd(c1, x, c0);
    380 }
    381 template <class T>
    382 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) {
    383  T x2 = Mul(x, x);
    384  return MulAdd(x2, c2, MulAdd(c1, x, c0));
    385 }
    386 template <class T>
    387 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) {
    388  T x2 = Mul(x, x);
    389  return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0));
    390 }
    391 template <class T>
    392 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) {
    393  T x2 = Mul(x, x);
    394  T x4 = Mul(x2, x2);
    395  return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
    396 }
    397 template <class T>
    398 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) {
    399  T x2 = Mul(x, x);
    400  T x4 = Mul(x2, x2);
    401  return MulAdd(x4, MulAdd(c5, x, c4),
    402                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
    403 }
    404 template <class T>
    405 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    406                                     T c6) {
    407  T x2 = Mul(x, x);
    408  T x4 = Mul(x2, x2);
    409  return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)),
    410                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
    411 }
    412 template <class T>
    413 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    414                                     T c6, T c7) {
    415  T x2 = Mul(x, x);
    416  T x4 = Mul(x2, x2);
    417  return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    418                MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
    419 }
    420 template <class T>
    421 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    422                                     T c6, T c7, T c8) {
    423  T x2 = Mul(x, x);
    424  T x4 = Mul(x2, x2);
    425  T x8 = Mul(x4, x4);
    426  return MulAdd(x8, c8,
    427                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    428                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
    429 }
    430 template <class T>
    431 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    432                                     T c6, T c7, T c8, T c9) {
    433  T x2 = Mul(x, x);
    434  T x4 = Mul(x2, x2);
    435  T x8 = Mul(x4, x4);
    436  return MulAdd(x8, MulAdd(c9, x, c8),
    437                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    438                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
    439 }
    440 template <class T>
    441 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    442                                     T c6, T c7, T c8, T c9, T c10) {
    443  T x2 = Mul(x, x);
    444  T x4 = Mul(x2, x2);
    445  T x8 = Mul(x4, x4);
    446  return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)),
    447                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    448                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
    449 }
    450 template <class T>
    451 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    452                                     T c6, T c7, T c8, T c9, T c10, T c11) {
    453  T x2 = Mul(x, x);
    454  T x4 = Mul(x2, x2);
    455  T x8 = Mul(x4, x4);
    456  return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)),
    457                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    458                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
    459 }
    460 template <class T>
    461 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    462                                     T c6, T c7, T c8, T c9, T c10, T c11,
    463                                     T c12) {
    464  T x2 = Mul(x, x);
    465  T x4 = Mul(x2, x2);
    466  T x8 = Mul(x4, x4);
    467  return MulAdd(
    468      x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
    469      MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    470             MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
    471 }
    472 template <class T>
    473 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    474                                     T c6, T c7, T c8, T c9, T c10, T c11,
    475                                     T c12, T c13) {
    476  T x2 = Mul(x, x);
    477  T x4 = Mul(x2, x2);
    478  T x8 = Mul(x4, x4);
    479  return MulAdd(x8,
    480                MulAdd(x4, MulAdd(c13, x, c12),
    481                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
    482                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    483                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
    484 }
    485 template <class T>
    486 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    487                                     T c6, T c7, T c8, T c9, T c10, T c11,
    488                                     T c12, T c13, T c14) {
    489  T x2 = Mul(x, x);
    490  T x4 = Mul(x2, x2);
    491  T x8 = Mul(x4, x4);
    492  return MulAdd(x8,
    493                MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)),
    494                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
    495                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    496                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
    497 }
    498 template <class T>
    499 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    500                                     T c6, T c7, T c8, T c9, T c10, T c11,
    501                                     T c12, T c13, T c14, T c15) {
    502  T x2 = Mul(x, x);
    503  T x4 = Mul(x2, x2);
    504  T x8 = Mul(x4, x4);
    505  return MulAdd(x8,
    506                MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
    507                       MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
    508                MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    509                       MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
    510 }
    511 template <class T>
    512 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    513                                     T c6, T c7, T c8, T c9, T c10, T c11,
    514                                     T c12, T c13, T c14, T c15, T c16) {
    515  T x2 = Mul(x, x);
    516  T x4 = Mul(x2, x2);
    517  T x8 = Mul(x4, x4);
    518  T x16 = Mul(x8, x8);
    519  return MulAdd(
    520      x16, c16,
    521      MulAdd(x8,
    522             MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
    523                    MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
    524             MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    525                    MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
    526 }
    527 template <class T>
    528 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    529                                     T c6, T c7, T c8, T c9, T c10, T c11,
    530                                     T c12, T c13, T c14, T c15, T c16, T c17) {
    531  T x2 = Mul(x, x);
    532  T x4 = Mul(x2, x2);
    533  T x8 = Mul(x4, x4);
    534  T x16 = Mul(x8, x8);
    535  return MulAdd(
    536      x16, MulAdd(c17, x, c16),
    537      MulAdd(x8,
    538             MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
    539                    MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
    540             MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    541                    MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
    542 }
    543 template <class T>
    544 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
    545                                     T c6, T c7, T c8, T c9, T c10, T c11,
    546                                     T c12, T c13, T c14, T c15, T c16, T c17,
    547                                     T c18) {
    548  T x2 = Mul(x, x);
    549  T x4 = Mul(x2, x2);
    550  T x8 = Mul(x4, x4);
    551  T x16 = Mul(x8, x8);
    552  return MulAdd(
    553      x16, MulAdd(x2, c18, MulAdd(c17, x, c16)),
    554      MulAdd(x8,
    555             MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
    556                    MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
    557             MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
    558                    MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
    559 }
    560 
    561 template <class FloatOrDouble>
    562 struct AsinImpl {};
    563 template <class FloatOrDouble>
    564 struct AtanImpl {};
    565 template <class FloatOrDouble>
    566 struct CosSinImpl {};
    567 template <class FloatOrDouble>
    568 struct ExpImpl {};
    569 template <class FloatOrDouble>
    570 struct LogImpl {};
    571 template <class FloatOrDouble>
    572 struct SinCosImpl {};
    573 
    574 template <>
    575 struct AsinImpl<float> {
    576  // Polynomial approximation for asin(x) over the range [0, 0.5).
    577  template <class D, class V>
    578  HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
    579    const auto k0 = Set(d, +0.1666677296f);
    580    const auto k1 = Set(d, +0.07495029271f);
    581    const auto k2 = Set(d, +0.04547423869f);
    582    const auto k3 = Set(d, +0.02424046025f);
    583    const auto k4 = Set(d, +0.04197454825f);
    584 
    585    return Estrin(x2, k0, k1, k2, k3, k4);
    586  }
    587 };
    588 
    589 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
    590 
    591 template <>
    592 struct AsinImpl<double> {
    593  // Polynomial approximation for asin(x) over the range [0, 0.5).
    594  template <class D, class V>
    595  HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
    596    const auto k0 = Set(d, +0.1666666666666497543);
    597    const auto k1 = Set(d, +0.07500000000378581611);
    598    const auto k2 = Set(d, +0.04464285681377102438);
    599    const auto k3 = Set(d, +0.03038195928038132237);
    600    const auto k4 = Set(d, +0.02237176181932048341);
    601    const auto k5 = Set(d, +0.01735956991223614604);
    602    const auto k6 = Set(d, +0.01388715184501609218);
    603    const auto k7 = Set(d, +0.01215360525577377331);
    604    const auto k8 = Set(d, +0.006606077476277170610);
    605    const auto k9 = Set(d, +0.01929045477267910674);
    606    const auto k10 = Set(d, -0.01581918243329996643);
    607    const auto k11 = Set(d, +0.03161587650653934628);
    608 
    609    return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11);
    610  }
    611 };
    612 
    613 #endif
    614 
    615 template <>
    616 struct AtanImpl<float> {
    617  // Polynomial approximation for atan(x) over the range [0, 1.0).
    618  template <class D, class V>
    619  HWY_INLINE V AtanPoly(D d, V x) {
    620    const auto k0 = Set(d, -0.333331018686294555664062f);
    621    const auto k1 = Set(d, +0.199926957488059997558594f);
    622    const auto k2 = Set(d, -0.142027363181114196777344f);
    623    const auto k3 = Set(d, +0.106347933411598205566406f);
    624    const auto k4 = Set(d, -0.0748900920152664184570312f);
    625    const auto k5 = Set(d, +0.0425049886107444763183594f);
    626    const auto k6 = Set(d, -0.0159569028764963150024414f);
    627    const auto k7 = Set(d, +0.00282363896258175373077393f);
    628 
    629    const auto y = Mul(x, x);
    630    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x);
    631  }
    632 };
    633 
    634 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
    635 
    636 template <>
    637 struct AtanImpl<double> {
    638  // Polynomial approximation for atan(x) over the range [0, 1.0).
    639  template <class D, class V>
    640  HWY_INLINE V AtanPoly(D d, V x) {
    641    const auto k0 = Set(d, -0.333333333333311110369124);
    642    const auto k1 = Set(d, +0.199999999996591265594148);
    643    const auto k2 = Set(d, -0.14285714266771329383765);
    644    const auto k3 = Set(d, +0.111111105648261418443745);
    645    const auto k4 = Set(d, -0.090908995008245008229153);
    646    const auto k5 = Set(d, +0.0769219538311769618355029);
    647    const auto k6 = Set(d, -0.0666573579361080525984562);
    648    const auto k7 = Set(d, +0.0587666392926673580854313);
    649    const auto k8 = Set(d, -0.0523674852303482457616113);
    650    const auto k9 = Set(d, +0.0466667150077840625632675);
    651    const auto k10 = Set(d, -0.0407629191276836500001934);
    652    const auto k11 = Set(d, +0.0337852580001353069993897);
    653    const auto k12 = Set(d, -0.0254517624932312641616861);
    654    const auto k13 = Set(d, +0.016599329773529201970117);
    655    const auto k14 = Set(d, -0.00889896195887655491740809);
    656    const auto k15 = Set(d, +0.00370026744188713119232403);
    657    const auto k16 = Set(d, -0.00110611831486672482563471);
    658    const auto k17 = Set(d, +0.000209850076645816976906797);
    659    const auto k18 = Set(d, -1.88796008463073496563746e-5);
    660 
    661    const auto y = Mul(x, x);
    662    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11,
    663                         k12, k13, k14, k15, k16, k17, k18),
    664                  Mul(y, x), x);
    665  }
    666 };
    667 
    668 #endif
    669 
    670 template <>
    671 struct CosSinImpl<float> {
    672  // Rounds float toward zero and returns as int32_t.
    673  template <class D, class V>
    674  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
    675    return ConvertTo(Rebind<int32_t, D>(), x);
    676  }
    677 
    678  template <class D, class V>
    679  HWY_INLINE V Poly(D d, V x) {
    680    const auto k0 = Set(d, -1.66666597127914428710938e-1f);
    681    const auto k1 = Set(d, +8.33307858556509017944336e-3f);
    682    const auto k2 = Set(d, -1.981069071916863322258e-4f);
    683    const auto k3 = Set(d, +2.6083159809786593541503e-6f);
    684 
    685    const auto y = Mul(x, x);
    686    return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x);
    687  }
    688 
    689  template <class D, class V, class VI32>
    690  HWY_INLINE V CosReduce(D d, V x, VI32 q) {
    691    // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2
    692    const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f);
    693    const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f);
    694    const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f);
    695    const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f);
    696 
    697    // Extended precision modular arithmetic.
    698    const V qf = ConvertTo(d, q);
    699    x = MulAdd(qf, kHalfPiPart0f, x);
    700    x = MulAdd(qf, kHalfPiPart1f, x);
    701    x = MulAdd(qf, kHalfPiPart2f, x);
    702    x = MulAdd(qf, kHalfPiPart3f, x);
    703    return x;
    704  }
    705 
    706  template <class D, class V, class VI32>
    707  HWY_INLINE V SinReduce(D d, V x, VI32 q) {
    708    // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi
    709    const V kPiPart0f = Set(d, -3.140625f);
    710    const V kPiPart1f = Set(d, -0.0009670257568359375f);
    711    const V kPiPart2f = Set(d, -6.2771141529083251953e-7f);
    712    const V kPiPart3f = Set(d, -1.2154201256553420762e-10f);
    713 
    714    // Extended precision modular arithmetic.
    715    const V qf = ConvertTo(d, q);
    716    x = MulAdd(qf, kPiPart0f, x);
    717    x = MulAdd(qf, kPiPart1f, x);
    718    x = MulAdd(qf, kPiPart2f, x);
    719    x = MulAdd(qf, kPiPart3f, x);
    720    return x;
    721  }
    722 
    723  // (q & 2) == 0 ? -0.0 : +0.0
    724  template <class D, class VI32>
    725  HWY_INLINE Vec<Rebind<float, D>> CosSignFromQuadrant(D d, VI32 q) {
    726    const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
    727    return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo)));
    728  }
    729 
    730  // ((q & 1) ? -0.0 : +0.0)
    731  template <class D, class VI32>
    732  HWY_INLINE Vec<Rebind<float, D>> SinSignFromQuadrant(D d, VI32 q) {
    733    const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
    734    return BitCast(d, ShiftLeft<31>(And(q, kOne)));
    735  }
    736 };
    737 
    738 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
    739 
    740 template <>
    741 struct CosSinImpl<double> {
    742  // Rounds double toward zero and returns as int32_t.
    743  template <class D, class V>
    744  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
    745    return DemoteTo(Rebind<int32_t, D>(), x);
    746  }
    747 
    748  template <class D, class V>
    749  HWY_INLINE V Poly(D d, V x) {
    750    const auto k0 = Set(d, -0.166666666666666657414808);
    751    const auto k1 = Set(d, +0.00833333333333332974823815);
    752    const auto k2 = Set(d, -0.000198412698412696162806809);
    753    const auto k3 = Set(d, +2.75573192239198747630416e-6);
    754    const auto k4 = Set(d, -2.50521083763502045810755e-8);
    755    const auto k5 = Set(d, +1.60590430605664501629054e-10);
    756    const auto k6 = Set(d, -7.64712219118158833288484e-13);
    757    const auto k7 = Set(d, +2.81009972710863200091251e-15);
    758    const auto k8 = Set(d, -7.97255955009037868891952e-18);
    759 
    760    const auto y = Mul(x, x);
    761    return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x);
    762  }
    763 
    764  template <class D, class V, class VI32>
    765  HWY_INLINE V CosReduce(D d, V x, VI32 q) {
    766    // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2
    767    const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914);
    768    const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8);
    769    const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16);
    770    const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24);
    771 
    772    // Extended precision modular arithmetic.
    773    const V qf = PromoteTo(d, q);
    774    x = MulAdd(qf, kHalfPiPart0d, x);
    775    x = MulAdd(qf, kHalfPiPart1d, x);
    776    x = MulAdd(qf, kHalfPiPart2d, x);
    777    x = MulAdd(qf, kHalfPiPart3d, x);
    778    return x;
    779  }
    780 
    781  template <class D, class V, class VI32>
    782  HWY_INLINE V SinReduce(D d, V x, VI32 q) {
    783    // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi
    784    const V kPiPart0d = Set(d, -3.1415926218032836914);
    785    const V kPiPart1d = Set(d, -3.1786509424591713469e-8);
    786    const V kPiPart2d = Set(d, -1.2246467864107188502e-16);
    787    const V kPiPart3d = Set(d, -1.2736634327021899816e-24);
    788 
    789    // Extended precision modular arithmetic.
    790    const V qf = PromoteTo(d, q);
    791    x = MulAdd(qf, kPiPart0d, x);
    792    x = MulAdd(qf, kPiPart1d, x);
    793    x = MulAdd(qf, kPiPart2d, x);
    794    x = MulAdd(qf, kPiPart3d, x);
    795    return x;
    796  }
    797 
    798  // (q & 2) == 0 ? -0.0 : +0.0
    799  template <class D, class VI32>
    800  HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) {
    801    const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
    802    return BitCast(
    803        d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo))));
    804  }
    805 
    806  // ((q & 1) ? -0.0 : +0.0)
    807  template <class D, class VI32>
    808  HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) {
    809    const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
    810    return BitCast(
    811        d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne))));
    812  }
    813 };
    814 
    815 #endif
    816 
    817 template <>
    818 struct ExpImpl<float> {
    819  // Rounds float toward zero and returns as int32_t.
    820  template <class D, class V>
    821  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
    822    return ConvertTo(Rebind<int32_t, D>(), x);
    823  }
    824 
    825  // Rounds float to nearest int32_t
    826  template <class D, class V>
    827  HWY_INLINE Vec<Rebind<int32_t, D>> ToNearestInt32(D /*unused*/, V x) {
    828    return NearestInt(x);
    829  }
    830 
    831  template <class D, class V>
    832  HWY_INLINE V ExpPoly(D d, V x) {
    833    const auto k0 = Set(d, +0.5f);
    834    const auto k1 = Set(d, +0.166666671633720397949219f);
    835    const auto k2 = Set(d, +0.0416664853692054748535156f);
    836    const auto k3 = Set(d, +0.00833336077630519866943359f);
    837    const auto k4 = Set(d, +0.00139304355252534151077271f);
    838    const auto k5 = Set(d, +0.000198527617612853646278381f);
    839 
    840    return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x);
    841  }
    842 
    843  // Computes 2^x, where x is an integer.
    844  template <class D, class VI32>
    845  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
    846    const Rebind<int32_t, D> di32;
    847    const VI32 kOffset = Set(di32, 0x7F);
    848    return BitCast(d, ShiftLeft<23>(Add(x, kOffset)));
    849  }
    850 
    851  // Sets the exponent of 'x' to 2^e.
    852  template <class D, class V, class VI32>
    853  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
    854    const VI32 y = ShiftRight<1>(e);
    855    return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
    856  }
    857 
    858  template <class D, class V, class VI32>
    859  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
    860    // kLn2Part0f + kLn2Part1f ~= -ln(2)
    861    const V kLn2Part0f = Set(d, -0.693145751953125f);
    862    const V kLn2Part1f = Set(d, -1.428606765330187045e-6f);
    863 
    864    // Extended precision modular arithmetic.
    865    const V qf = ConvertTo(d, q);
    866    x = MulAdd(qf, kLn2Part0f, x);
    867    x = MulAdd(qf, kLn2Part1f, x);
    868    return x;
    869  }
    870 
    871  template <class D, class V, class VI32>
    872  HWY_INLINE V Exp2Reduce(D d, V x, VI32 q) {
    873    const V x_frac = Sub(x, ConvertTo(d, q));
    874    return MulAdd(x_frac, Set(d, 0.193147182464599609375f),
    875                  Mul(x_frac, Set(d, 0.5f)));
    876  }
    877 };
    878 
    879 template <>
    880 struct LogImpl<float> {
    881  template <class D, class V>
    882  HWY_INLINE Vec<Rebind<int32_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
    883    const Rebind<int32_t, D> di32;
    884    const Rebind<uint32_t, D> du32;
    885    const auto kBias = Set(di32, 0x7F);
    886    return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias);
    887  }
    888 
    889  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
    890  template <class D, class V>
    891  HWY_INLINE V LogPoly(D d, V x) {
    892    const V k0 = Set(d, 0.66666662693f);
    893    const V k1 = Set(d, 0.40000972152f);
    894    const V k2 = Set(d, 0.28498786688f);
    895    const V k3 = Set(d, 0.24279078841f);
    896 
    897    const V x2 = Mul(x, x);
    898    const V x4 = Mul(x2, x2);
    899    return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4));
    900  }
    901 };
    902 
    903 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
    904 template <>
    905 struct ExpImpl<double> {
    906  // Rounds double toward zero and returns as int32_t.
    907  template <class D, class V>
    908  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
    909    return DemoteTo(Rebind<int32_t, D>(), x);
    910  }
    911 
    912  // Rounds double to nearest int32_t
    913  template <class D, class V>
    914  HWY_INLINE Vec<Rebind<int32_t, D>> ToNearestInt32(D /*unused*/, V x) {
    915    return DemoteToNearestInt(Rebind<int32_t, D>(), x);
    916  }
    917 
    918  template <class D, class V>
    919  HWY_INLINE V ExpPoly(D d, V x) {
    920    const auto k0 = Set(d, +0.5);
    921    const auto k1 = Set(d, +0.166666666666666851703837);
    922    const auto k2 = Set(d, +0.0416666666666665047591422);
    923    const auto k3 = Set(d, +0.00833333333331652721664984);
    924    const auto k4 = Set(d, +0.00138888888889774492207962);
    925    const auto k5 = Set(d, +0.000198412698960509205564975);
    926    const auto k6 = Set(d, +2.4801587159235472998791e-5);
    927    const auto k7 = Set(d, +2.75572362911928827629423e-6);
    928    const auto k8 = Set(d, +2.75573911234900471893338e-7);
    929    const auto k9 = Set(d, +2.51112930892876518610661e-8);
    930    const auto k10 = Set(d, +2.08860621107283687536341e-9);
    931 
    932    return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10),
    933                  Mul(x, x), x);
    934  }
    935 
    936  // Computes 2^x, where x is an integer.
    937  template <class D, class VI32>
    938  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
    939    const Rebind<int32_t, D> di32;
    940    const Rebind<int64_t, D> di64;
    941    const VI32 kOffset = Set(di32, 0x3FF);
    942    return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset))));
    943  }
    944 
    945  // Sets the exponent of 'x' to 2^e.
    946  template <class D, class V, class VI32>
    947  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
    948    const VI32 y = ShiftRight<1>(e);
    949    return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
    950  }
    951 
    952  template <class D, class V, class VI32>
    953  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
    954    // kLn2Part0d + kLn2Part1d ~= -ln(2)
    955    const V kLn2Part0d = Set(d, -0.6931471805596629565116018);
    956    const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12);
    957 
    958    // Extended precision modular arithmetic.
    959    const V qf = PromoteTo(d, q);
    960    x = MulAdd(qf, kLn2Part0d, x);
    961    x = MulAdd(qf, kLn2Part1d, x);
    962    return x;
    963  }
    964 
    965  template <class D, class V, class VI32>
    966  HWY_INLINE V Exp2Reduce(D d, V x, VI32 q) {
    967    const V x_frac = Sub(x, PromoteTo(d, q));
    968    return MulAdd(x_frac, Set(d, 0.1931471805599453139823396),
    969                  Mul(x_frac, Set(d, 0.5)));
    970  }
    971 };
    972 
    973 template <>
    974 struct LogImpl<double> {
    975  template <class D, class V>
    976  HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
    977    const Rebind<int64_t, D> di64;
    978    const Rebind<uint64_t, D> du64;
    979    return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
    980               Set(di64, 0x3FF));
    981  }
    982 
    983  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
    984  template <class D, class V>
    985  HWY_INLINE V LogPoly(D d, V x) {
    986    const V k0 = Set(d, 0.6666666666666735130);
    987    const V k1 = Set(d, 0.3999999999940941908);
    988    const V k2 = Set(d, 0.2857142874366239149);
    989    const V k3 = Set(d, 0.2222219843214978396);
    990    const V k4 = Set(d, 0.1818357216161805012);
    991    const V k5 = Set(d, 0.1531383769920937332);
    992    const V k6 = Set(d, 0.1479819860511658591);
    993 
    994    const V x2 = Mul(x, x);
    995    const V x4 = Mul(x2, x2);
    996    return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
    997                  (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
    998  }
    999 };
   1000 
   1001 #endif
   1002 
   1003 template <class D, class V, bool kAllowSubnormals = true>
   1004 HWY_INLINE V Log(const D d, V x) {
   1005  // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
   1006  using T = TFromD<D>;
   1007  impl::LogImpl<T> impl;
   1008 
   1009  constexpr bool kIsF32 = (sizeof(T) == 4);
   1010 
   1011  // Float Constants
   1012  const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f)
   1013                                 : static_cast<T>(0.693147180369123816490));
   1014  const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f)
   1015                                 : static_cast<T>(1.90821492927058770002e-10));
   1016  const V kOne = Set(d, static_cast<T>(+1.0));
   1017  const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f)
   1018                                     : static_cast<T>(2.2250738585072014e-308));
   1019  const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f)
   1020                                 : static_cast<T>(1.8014398509481984e+16));
   1021 
   1022  // Integer Constants
   1023  using TI = MakeSigned<T>;
   1024  const Rebind<TI, D> di;
   1025  using VI = decltype(Zero(di));
   1026  const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L)
   1027                                       : static_cast<TI>(0xFFFFFFFFLL));
   1028  const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L)
   1029                                   : static_cast<TI>(0x3FE6A09E00000000LL));
   1030  const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L)
   1031                                     : static_cast<TI>(0x3FF0000000000000LL));
   1032  const VI kExpScale =
   1033      Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54));
   1034  const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL)
   1035                                     : static_cast<TI>(0xFFFFF00000000LL));
   1036 
   1037  // Scale up 'x' so that it is no longer denormalized.
   1038  VI exp_bits;
   1039  V exp;
   1040  if (kAllowSubnormals == true) {
   1041    const auto is_denormal = Lt(x, kMinNormal);
   1042    x = IfThenElse(is_denormal, Mul(x, kScale), x);
   1043 
   1044    // Compute the new exponent.
   1045    exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
   1046    const VI exp_scale =
   1047        BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
   1048    exp = ConvertTo(
   1049        d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
   1050  } else {
   1051    // Compute the new exponent.
   1052    exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
   1053    exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
   1054  }
   1055 
   1056  // Renormalize.
   1057  const V y = Or(And(x, BitCast(d, kLowerBits)),
   1058                 BitCast(d, Add(And(exp_bits, kManMask), kMagic)));
   1059 
   1060  // Approximate and reconstruct.
   1061  const V ym1 = Sub(y, kOne);
   1062  const V z = Div(ym1, Add(y, kOne));
   1063 
   1064  return MulSub(
   1065      exp, kLn2Hi,
   1066      Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
   1067 }
   1068 
   1069 // SinCos
   1070 // Based on "sse_mathfun.h", by Julien Pommier
   1071 // http://gruntthepeon.free.fr/ssemath/
   1072 
   1073 // Third degree poly
   1074 template <class D, class V>
   1075 HWY_INLINE void SinCos3(D d, TFromD<D> dp1, TFromD<D> dp2, TFromD<D> dp3, V x,
   1076                        V& s, V& c) {
   1077  using T = TFromD<D>;
   1078  using TI = MakeSigned<T>;
   1079  using DI = Rebind<TI, D>;
   1080  const DI di;
   1081  using VI = decltype(Zero(di));
   1082  using M = Mask<D>;
   1083 
   1084  static constexpr size_t bits = sizeof(TI) * 8;
   1085  const VI sign_mask = SignBit(di);
   1086  const VI ci_0 = Zero(di);
   1087  const VI ci_1 = Set(di, 1);
   1088  const VI ci_2 = Set(di, 2);
   1089  const VI ci_4 = Set(di, 4);
   1090  const V cos_p0 = Set(d, ConvertScalarTo<T>(2.443315711809948E-005));
   1091  const V cos_p1 = Set(d, ConvertScalarTo<T>(-1.388731625493765E-003));
   1092  const V cos_p2 = Set(d, ConvertScalarTo<T>(4.166664568298827E-002));
   1093  const V sin_p0 = Set(d, ConvertScalarTo<T>(-1.9515295891E-4));
   1094  const V sin_p1 = Set(d, ConvertScalarTo<T>(8.3321608736E-3));
   1095  const V sin_p2 = Set(d, ConvertScalarTo<T>(-1.6666654611E-1));
   1096  const V FOPI = Set(d, ConvertScalarTo<T>(1.27323954473516));  // 4 / M_PI
   1097  const V DP1 = Set(d, dp1);
   1098  const V DP2 = Set(d, dp2);
   1099  const V DP3 = Set(d, dp3);
   1100 
   1101  V xmm1, xmm2, sign_bit_sin, y;
   1102  VI imm0, imm2, imm4;
   1103 
   1104  sign_bit_sin = x;
   1105  x = Abs(x);
   1106 
   1107  /* extract the sign bit (upper one) */
   1108  sign_bit_sin = And(sign_bit_sin, BitCast(d, sign_mask));
   1109 
   1110  /* scale by 4/Pi */
   1111  y = Mul(x, FOPI);
   1112 
   1113  /* store the integer part of y in imm2 */
   1114  imm2 = ConvertTo(di, y);
   1115 
   1116  /* j=(j+1) & (~1) (see the cephes sources) */
   1117  imm2 = Add(imm2, ci_1);
   1118  imm2 = AndNot(ci_1, imm2);
   1119 
   1120  y = ConvertTo(d, imm2);
   1121  imm4 = imm2;
   1122 
   1123  /* get the swap sign flag for the sine */
   1124  imm0 = And(imm2, ci_4);
   1125  imm0 = ShiftLeft<bits - 3>(imm0);
   1126 
   1127  V swap_sign_bit_sin = BitCast(d, imm0);
   1128 
   1129  /* get the polynomial selection mask for the sine*/
   1130  imm2 = And(imm2, ci_2);
   1131  M poly_mask = RebindMask(d, Eq(imm2, ci_0));
   1132 
   1133  /* The magic pass: "Extended precision modular arithmetic"
   1134  x = ((x - y * DP1) - y * DP2) - y * DP3; */
   1135  x = MulAdd(y, DP1, x);
   1136  x = MulAdd(y, DP2, x);
   1137  x = MulAdd(y, DP3, x);
   1138 
   1139  imm4 = Sub(imm4, ci_2);
   1140  imm4 = AndNot(imm4, ci_4);
   1141  imm4 = ShiftLeft<bits - 3>(imm4);
   1142 
   1143  V sign_bit_cos = BitCast(d, imm4);
   1144 
   1145  sign_bit_sin = Xor(sign_bit_sin, swap_sign_bit_sin);
   1146 
   1147  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
   1148  V z = Mul(x, x);
   1149 
   1150  y = MulAdd(cos_p0, z, cos_p1);
   1151  y = MulAdd(y, z, cos_p2);
   1152  y = Mul(y, z);
   1153  y = Mul(y, z);
   1154  y = NegMulAdd(z, Set(d, 0.5f), y);
   1155  y = Add(y, Set(d, 1));
   1156 
   1157  /* Evaluate the second polynomial  (Pi/4 <= x <= 0) */
   1158  V y2 = MulAdd(sin_p0, z, sin_p1);
   1159  y2 = MulAdd(y2, z, sin_p2);
   1160  y2 = Mul(y2, z);
   1161  y2 = MulAdd(y2, x, x);
   1162 
   1163  /* select the correct result from the two polynomials */
   1164  xmm1 = IfThenElse(poly_mask, y2, y);
   1165  xmm2 = IfThenElse(poly_mask, y, y2);
   1166 
   1167  /* update the sign */
   1168  s = Xor(xmm1, sign_bit_sin);
   1169  c = Xor(xmm2, sign_bit_cos);
   1170 }
   1171 
   1172 // Sixth degree poly
   1173 template <class D, class V>
   1174 HWY_INLINE void SinCos6(D d, TFromD<D> dp1, TFromD<D> dp2, TFromD<D> dp3, V x,
   1175                        V& s, V& c) {
   1176  using T = TFromD<D>;
   1177  using TI = MakeSigned<T>;
   1178  using DI = Rebind<TI, D>;
   1179  const DI di;
   1180  using VI = decltype(Zero(di));
   1181  using M = Mask<D>;
   1182 
   1183  static constexpr size_t bits = sizeof(TI) * 8;
   1184  const VI sign_mask = SignBit(di);
   1185  const VI ci_0 = Zero(di);
   1186  const VI ci_1 = Set(di, 1);
   1187  const VI ci_2 = Set(di, 2);
   1188  const VI ci_4 = Set(di, 4);
   1189  const V cos_p0 = Set(d, ConvertScalarTo<T>(-1.13585365213876817300E-11));
   1190  const V cos_p1 = Set(d, ConvertScalarTo<T>(2.08757008419747316778E-9));
   1191  const V cos_p2 = Set(d, ConvertScalarTo<T>(-2.75573141792967388112E-7));
   1192  const V cos_p3 = Set(d, ConvertScalarTo<T>(2.48015872888517045348E-5));
   1193  const V cos_p4 = Set(d, ConvertScalarTo<T>(-1.38888888888730564116E-3));
   1194  const V cos_p5 = Set(d, ConvertScalarTo<T>(4.16666666666665929218E-2));
   1195  const V sin_p0 = Set(d, ConvertScalarTo<T>(1.58962301576546568060E-10));
   1196  const V sin_p1 = Set(d, ConvertScalarTo<T>(-2.50507477628578072866E-8));
   1197  const V sin_p2 = Set(d, ConvertScalarTo<T>(2.75573136213857245213E-6));
   1198  const V sin_p3 = Set(d, ConvertScalarTo<T>(-1.98412698295895385996E-4));
   1199  const V sin_p4 = Set(d, ConvertScalarTo<T>(8.33333333332211858878E-3));
   1200  const V sin_p5 = Set(d, ConvertScalarTo<T>(-1.66666666666666307295E-1));
   1201  const V FOPI =  // 4 / M_PI
   1202      Set(d, ConvertScalarTo<T>(1.2732395447351626861510701069801148));
   1203  const V DP1 = Set(d, dp1);
   1204  const V DP2 = Set(d, dp2);
   1205  const V DP3 = Set(d, dp3);
   1206 
   1207  V xmm1, xmm2, sign_bit_sin, y;
   1208  VI imm0, imm2, imm4;
   1209 
   1210  sign_bit_sin = x;
   1211  x = Abs(x);
   1212 
   1213  /* extract the sign bit (upper one) */
   1214  sign_bit_sin = And(sign_bit_sin, BitCast(d, sign_mask));
   1215 
   1216  /* scale by 4/Pi */
   1217  y = Mul(x, FOPI);
   1218 
   1219  /* store the integer part of y in imm2 */
   1220  imm2 = ConvertTo(di, y);
   1221 
   1222  /* j=(j+1) & (~1) (see the cephes sources) */
   1223  imm2 = Add(imm2, ci_1);
   1224  imm2 = AndNot(ci_1, imm2);
   1225 
   1226  y = ConvertTo(d, imm2);
   1227  imm4 = imm2;
   1228 
   1229  /* get the swap sign flag for the sine */
   1230  imm0 = And(imm2, ci_4);
   1231  imm0 = ShiftLeft<bits - 3>(imm0);
   1232 
   1233  V swap_sign_bit_sin = BitCast(d, imm0);
   1234 
   1235  /* get the polynomial selection mask for the sine*/
   1236  imm2 = And(imm2, ci_2);
   1237  M poly_mask = RebindMask(d, Eq(imm2, ci_0));
   1238 
   1239  /* The magic pass: "Extended precision modular arithmetic"
   1240    x = ((x - y * DP1) - y * DP2) - y * DP3; */
   1241  x = MulAdd(y, DP1, x);
   1242  x = MulAdd(y, DP2, x);
   1243  x = MulAdd(y, DP3, x);
   1244 
   1245  imm4 = Sub(imm4, ci_2);
   1246  imm4 = AndNot(imm4, ci_4);
   1247  imm4 = ShiftLeft<bits - 3>(imm4);
   1248 
   1249  V sign_bit_cos = BitCast(d, imm4);
   1250  sign_bit_sin = Xor(sign_bit_sin, swap_sign_bit_sin);
   1251 
   1252  /* Evaluate the first polynomial  (0 <= x <= Pi/4) */
   1253  V z = Mul(x, x);
   1254 
   1255  y = MulAdd(cos_p0, z, cos_p1);
   1256  y = MulAdd(y, z, cos_p2);
   1257  y = MulAdd(y, z, cos_p3);
   1258  y = MulAdd(y, z, cos_p4);
   1259  y = MulAdd(y, z, cos_p5);
   1260  y = Mul(y, z);
   1261  y = Mul(y, z);
   1262  y = NegMulAdd(z, Set(d, 0.5f), y);
   1263  y = Add(y, Set(d, 1.0f));
   1264 
   1265  /* Evaluate the second polynomial  (Pi/4 <= x <= 0) */
   1266  V y2 = MulAdd(sin_p0, z, sin_p1);
   1267  y2 = MulAdd(y2, z, sin_p2);
   1268  y2 = MulAdd(y2, z, sin_p3);
   1269  y2 = MulAdd(y2, z, sin_p4);
   1270  y2 = MulAdd(y2, z, sin_p5);
   1271  y2 = Mul(y2, z);
   1272  y2 = MulAdd(y2, x, x);
   1273 
   1274  /* select the correct result from the two polynomials */
   1275  xmm1 = IfThenElse(poly_mask, y2, y);
   1276  xmm2 = IfThenElse(poly_mask, y, y2);
   1277 
   1278  /* update the sign */
   1279  s = Xor(xmm1, sign_bit_sin);
   1280  c = Xor(xmm2, sign_bit_cos);
   1281 }
   1282 
   1283 template <>
   1284 struct SinCosImpl<float> {
   1285  template <class D, class V>
   1286  HWY_INLINE void SinCos(D d, V x, V& s, V& c) {
   1287    SinCos3(d, -0.78515625f, -2.4187564849853515625e-4f,
   1288            -3.77489497744594108e-8f, x, s, c);
   1289  }
   1290 };
   1291 
   1292 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64
   1293 template <>
   1294 struct SinCosImpl<double> {
   1295  template <class D, class V>
   1296  HWY_INLINE void SinCos(D d, V x, V& s, V& c) {
   1297    SinCos6(d, -7.85398125648498535156E-1, -3.77489470793079817668E-8,
   1298            -2.69515142907905952645E-15, x, s, c);
   1299  }
   1300 };
   1301 #endif
   1302 
   1303 }  // namespace impl
   1304 
   1305 template <class D, class V>
   1306 HWY_INLINE V Acos(const D d, V x) {
   1307  using T = TFromD<D>;
   1308 
   1309  const V kZero = Zero(d);
   1310  const V kHalf = Set(d, static_cast<T>(+0.5));
   1311  const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264));
   1312  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
   1313 
   1314  const V sign_x = And(SignBit(d), x);
   1315  const V abs_x = Xor(x, sign_x);
   1316  const auto mask = Lt(abs_x, kHalf);
   1317  const V yy =
   1318      IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
   1319  const V y = IfThenElse(mask, abs_x, Sqrt(yy));
   1320 
   1321  impl::AsinImpl<T> impl;
   1322  const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));
   1323 
   1324  const V t_plus_y = Add(t, y);
   1325  const V z =
   1326      IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
   1327                 Add(t_plus_y, t_plus_y));
   1328  return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
   1329 }
   1330 
   1331 template <class D, class V>
   1332 HWY_INLINE V Acosh(const D d, V x) {
   1333  using T = TFromD<D>;
   1334 
   1335  const V kLarge = Set(d, static_cast<T>(268435456.0));
   1336  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
   1337  const V kOne = Set(d, static_cast<T>(+1.0));
   1338  const V kTwo = Set(d, static_cast<T>(+2.0));
   1339 
   1340  const auto is_x_large = Gt(x, kLarge);
   1341  const auto is_x_gt_2 = Gt(x, kTwo);
   1342 
   1343  const V x_minus_1 = Sub(x, kOne);
   1344  const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
   1345  const V y1 =
   1346      Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
   1347  const V y2 =
   1348      IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
   1349  const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
   1350 
   1351  const auto is_pole = Eq(y2, kOne);
   1352  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
   1353  return Add(IfThenElse(is_x_gt_2, z,
   1354                        IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
   1355             IfThenElseZero(is_x_large, kLog2));
   1356 }
   1357 
   1358 template <class D, class V>
   1359 HWY_INLINE V Asin(const D d, V x) {
   1360  using T = TFromD<D>;
   1361 
   1362  const V kHalf = Set(d, static_cast<T>(+0.5));
   1363  const V kTwo = Set(d, static_cast<T>(+2.0));
   1364  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
   1365 
   1366  const V sign_x = And(SignBit(d), x);
   1367  const V abs_x = Xor(x, sign_x);
   1368  const auto mask = Lt(abs_x, kHalf);
   1369  const V yy =
   1370      IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
   1371  const V y = IfThenElse(mask, abs_x, Sqrt(yy));
   1372 
   1373  impl::AsinImpl<T> impl;
   1374  const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
   1375  const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
   1376  return Or(IfThenElse(mask, z0, z1), sign_x);
   1377 }
   1378 
   1379 template <class D, class V>
   1380 HWY_INLINE V Asinh(const D d, V x) {
   1381  using T = TFromD<D>;
   1382 
   1383  const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0));
   1384  const V kLarge = Set(d, static_cast<T>(268435456.0));
   1385  const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227));
   1386  const V kOne = Set(d, static_cast<T>(+1.0));
   1387  const V kTwo = Set(d, static_cast<T>(+2.0));
   1388 
   1389  const V sign_x = And(SignBit(d), x);  // Extract the sign bit
   1390  const V abs_x = Xor(x, sign_x);
   1391 
   1392  const auto is_x_large = Gt(abs_x, kLarge);
   1393  const auto is_x_lt_2 = Lt(abs_x, kTwo);
   1394 
   1395  const V x2 = Mul(x, x);
   1396  const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));
   1397 
   1398  const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
   1399  const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
   1400  const V y2 =
   1401      IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
   1402  const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
   1403 
   1404  const auto is_pole = Eq(y2, kOne);
   1405  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
   1406  const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
   1407  const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
   1408  return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
   1409            sign_x);
   1410 }
   1411 
   1412 template <class D, class V>
   1413 HWY_INLINE V Atan(const D d, V x) {
   1414  using T = TFromD<D>;
   1415 
   1416  const V kOne = Set(d, static_cast<T>(+1.0));
   1417  const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169));
   1418 
   1419  const V sign = And(SignBit(d), x);
   1420  const V abs_x = Xor(x, sign);
   1421  const auto mask = Gt(abs_x, kOne);
   1422 
   1423  impl::AtanImpl<T> impl;
   1424  const auto divisor = IfThenElse(mask, abs_x, kOne);
   1425  const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
   1426  return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
   1427 }
   1428 
   1429 template <class D, class V>
   1430 HWY_INLINE V Atanh(const D d, V x) {
   1431  using T = TFromD<D>;
   1432 
   1433  const V kHalf = Set(d, static_cast<T>(+0.5));
   1434  const V kOne = Set(d, static_cast<T>(+1.0));
   1435 
   1436  const V sign = And(SignBit(d), x);  // Extract the sign bit
   1437  const V abs_x = Xor(x, sign);
   1438  return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
   1439             Xor(kHalf, sign));
   1440 }
   1441 
   1442 template <class D, class V>
   1443 HWY_INLINE V Cos(const D d, V x) {
   1444  using T = TFromD<D>;
   1445  impl::CosSinImpl<T> impl;
   1446 
   1447  // Float Constants
   1448  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
   1449 
   1450  // Integer Constants
   1451  const Rebind<int32_t, D> di32;
   1452  using VI32 = decltype(Zero(di32));
   1453  const VI32 kOne = Set(di32, 1);
   1454 
   1455  const V y = Abs(x);  // cos(x) == cos(|x|)
   1456 
   1457  // Compute the quadrant, q = int(|x| / pi) * 2 + 1
   1458  const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);
   1459 
   1460  // Reduce range, apply sign, and approximate.
   1461  return impl.Poly(
   1462      d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q)));
   1463 }
   1464 
   1465 template <class D, class V>
   1466 HWY_INLINE V Exp(const D d, V x) {
   1467  using T = TFromD<D>;
   1468 
   1469  const V kHalf = Set(d, static_cast<T>(+0.5));
   1470  const V kLowerBound =
   1471      Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
   1472  const V kNegZero = Set(d, static_cast<T>(-0.0));
   1473  const V kOne = Set(d, static_cast<T>(+1.0));
   1474  const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
   1475 
   1476  impl::ExpImpl<T> impl;
   1477 
   1478  // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
   1479  const auto q =
   1480      impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
   1481 
   1482  // Reduce, approximate, and then reconstruct.
   1483  const V y = impl.LoadExpShortRange(
   1484      d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
   1485  return IfThenElseZero(Ge(x, kLowerBound), y);
   1486 }
   1487 
   1488 template <class D, class V>
   1489 HWY_INLINE V Exp2(const D d, V x) {
   1490  using T = TFromD<D>;
   1491 
   1492  const V kLowerBound =
   1493      Set(d, static_cast<T>((sizeof(T) == 4 ? -150.0 : -1075.0)));
   1494  const V kOne = Set(d, static_cast<T>(+1.0));
   1495 
   1496  impl::ExpImpl<T> impl;
   1497 
   1498  // q = static_cast<int32_t>(std::lrint(x))
   1499  const auto q = impl.ToNearestInt32(d, x);
   1500 
   1501  // Reduce, approximate, and then reconstruct.
   1502  const V y = impl.LoadExpShortRange(
   1503      d, Add(impl.ExpPoly(d, impl.Exp2Reduce(d, x, q)), kOne), q);
   1504  return IfThenElseZero(Ge(x, kLowerBound), y);
   1505 }
   1506 
   1507 template <class D, class V>
   1508 HWY_INLINE V Expm1(const D d, V x) {
   1509  using T = TFromD<D>;
   1510 
   1511  const V kHalf = Set(d, static_cast<T>(+0.5));
   1512  const V kLowerBound =
   1513      Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0)));
   1514  const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616));
   1515  const V kNegOne = Set(d, static_cast<T>(-1.0));
   1516  const V kNegZero = Set(d, static_cast<T>(-0.0));
   1517  const V kOne = Set(d, static_cast<T>(+1.0));
   1518  const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681));
   1519 
   1520  impl::ExpImpl<T> impl;
   1521 
   1522  // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
   1523  const auto q =
   1524      impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
   1525 
   1526  // Reduce, approximate, and then reconstruct.
   1527  const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
   1528  const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
   1529                         Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
   1530  return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
   1531 }
   1532 
   1533 template <class D, class V>
   1534 HWY_INLINE V Log(const D d, V x) {
   1535  return impl::Log<D, V, /*kAllowSubnormals=*/true>(d, x);
   1536 }
   1537 
   1538 template <class D, class V>
   1539 HWY_INLINE V Log10(const D d, V x) {
   1540  using T = TFromD<D>;
   1541  return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511)));
   1542 }
   1543 
   1544 template <class D, class V>
   1545 HWY_INLINE V Log1p(const D d, V x) {
   1546  using T = TFromD<D>;
   1547  const V kOne = Set(d, static_cast<T>(+1.0));
   1548 
   1549  const V y = Add(x, kOne);
   1550  const auto is_pole = Eq(y, kOne);
   1551  const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
   1552  const auto non_pole =
   1553      Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor));
   1554  return IfThenElse(is_pole, x, non_pole);
   1555 }
   1556 
   1557 template <class D, class V>
   1558 HWY_INLINE V Log2(const D d, V x) {
   1559  using T = TFromD<D>;
   1560  return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992)));
   1561 }
   1562 
   1563 template <class D, class V>
   1564 HWY_INLINE V Sin(const D d, V x) {
   1565  using T = TFromD<D>;
   1566  impl::CosSinImpl<T> impl;
   1567 
   1568  // Float Constants
   1569  const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153));
   1570  const V kHalf = Set(d, static_cast<T>(0.5));
   1571 
   1572  // Integer Constants
   1573  const Rebind<int32_t, D> di32;
   1574  using VI32 = decltype(Zero(di32));
   1575 
   1576  const V abs_x = Abs(x);
   1577  const V sign_x = Xor(abs_x, x);
   1578 
   1579  // Compute the quadrant, q = int((|x| / pi) + 0.5)
   1580  const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf));
   1581 
   1582  // Reduce range, apply sign, and approximate.
   1583  return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q),
   1584                          Xor(impl.SinSignFromQuadrant(d, q), sign_x)));
   1585 }
   1586 
   1587 template <class D, class V>
   1588 HWY_INLINE V Sinh(const D d, V x) {
   1589  using T = TFromD<D>;
   1590  const V kHalf = Set(d, static_cast<T>(+0.5));
   1591  const V kOne = Set(d, static_cast<T>(+1.0));
   1592  const V kTwo = Set(d, static_cast<T>(+2.0));
   1593 
   1594  const V sign = And(SignBit(d), x);  // Extract the sign bit
   1595  const V abs_x = Xor(x, sign);
   1596  const V y = Expm1(d, abs_x);
   1597  const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
   1598  return Xor(z, sign);  // Reapply the sign bit
   1599 }
   1600 
   1601 template <class D, class V>
   1602 HWY_INLINE V Tanh(const D d, V x) {
   1603  using T = TFromD<D>;
   1604  const V kLimit = Set(d, static_cast<T>(18.714973875));
   1605  const V kOne = Set(d, static_cast<T>(+1.0));
   1606  const V kTwo = Set(d, static_cast<T>(+2.0));
   1607 
   1608  const V sign = And(SignBit(d), x);  // Extract the sign bit
   1609  const V abs_x = Xor(x, sign);
   1610  const V y = Expm1(d, Mul(abs_x, kTwo));
   1611  const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
   1612  return Xor(z, sign);  // Reapply the sign bit
   1613 }
   1614 
   1615 template <class D, class V>
   1616 HWY_INLINE void SinCos(const D d, V x, V& s, V& c) {
   1617  using T = TFromD<D>;
   1618  impl::SinCosImpl<T> impl;
   1619  impl.SinCos(d, x, s, c);
   1620 }
   1621 
   1622 template <class D, class V>
   1623 HWY_INLINE V Hypot(const D d, V a, V b) {
   1624  using T = TFromD<D>;
   1625  using TI = MakeSigned<T>;
   1626  const RebindToUnsigned<decltype(d)> du;
   1627  const RebindToSigned<decltype(d)> di;
   1628  using VI = VFromD<decltype(di)>;
   1629 
   1630  constexpr int kMaxBiasedExp = static_cast<int>(MaxExponentField<T>());
   1631  static_assert(kMaxBiasedExp > 0, "kMaxBiasedExp > 0 must be true");
   1632 
   1633  constexpr int kNumOfMantBits = MantissaBits<T>();
   1634  static_assert(kNumOfMantBits > 0, "kNumOfMantBits > 0 must be true");
   1635 
   1636  constexpr int kExpBias = kMaxBiasedExp / 2;
   1637 
   1638  static_assert(
   1639      static_cast<unsigned>(kExpBias) + static_cast<unsigned>(kNumOfMantBits) <
   1640          static_cast<unsigned>(kMaxBiasedExp),
   1641      "kExpBias + kNumOfMantBits < kMaxBiasedExp must be true");
   1642 
   1643  // kMinValToSquareBiasedExp is the smallest biased exponent such that
   1644  // pow(pow(2, kMinValToSquareBiasedExp - kExpBias) * x, 2) is either a normal
   1645  // floating-point value or infinity if x is a non-zero, non-NaN value
   1646  constexpr int kMinValToSquareBiasedExp = (kExpBias / 2) + kNumOfMantBits;
   1647  static_assert(kMinValToSquareBiasedExp < kExpBias,
   1648                "kMinValToSquareBiasedExp < kExpBias must be true");
   1649 
   1650  // kMaxValToSquareBiasedExp is the largest biased exponent such that
   1651  // pow(pow(2, kMaxValToSquareBiasedExp - kExpBias) * x, 2) * 2 is guaranteed
   1652  // to be a finite value if x is a finite value
   1653  constexpr int kMaxValToSquareBiasedExp = kExpBias + ((kExpBias / 2) - 1);
   1654  static_assert(kMaxValToSquareBiasedExp > kExpBias,
   1655                "kMaxValToSquareBiasedExp > kExpBias must be true");
   1656  static_assert(kMaxValToSquareBiasedExp < kMaxBiasedExp,
   1657                "kMaxValToSquareBiasedExp < kMaxBiasedExp must be true");
   1658 
   1659 #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 || \
   1660    HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
   1661  using TExpSatSub = MakeUnsigned<T>;
   1662  using TExpMinMax = TI;
   1663 #else
   1664  using TExpSatSub = uint16_t;
   1665  using TExpMinMax = int16_t;
   1666 #endif
   1667 
   1668  const Repartition<TExpSatSub, decltype(d)> d_exp_sat_sub;
   1669  const Repartition<TExpMinMax, decltype(d)> d_exp_min_max;
   1670 
   1671  const V abs_a = Abs(a);
   1672  const V abs_b = Abs(b);
   1673 
   1674  const MFromD<D> either_inf = Or(IsInf(a), IsInf(b));
   1675 
   1676  const VI zero = Zero(di);
   1677 
   1678  // exp_a[i] is the biased exponent of abs_a[i]
   1679  const VI exp_a = BitCast(di, ShiftRight<kNumOfMantBits>(BitCast(du, abs_a)));
   1680 
   1681  // exp_b[i] is the biased exponent of abs_b[i]
   1682  const VI exp_b = BitCast(di, ShiftRight<kNumOfMantBits>(BitCast(du, abs_b)));
   1683 
   1684  // max_exp[i] is equal to HWY_MAX(exp_a[i], exp_b[i])
   1685 
   1686  // If abs_a[i] and abs_b[i] are both NaN values, max_exp[i] will be equal to
   1687  // the biased exponent of the larger value. Otherwise, if either abs_a[i] or
   1688  // abs_b[i] is NaN, max_exp[i] will be equal to kMaxBiasedExp.
   1689  const VI max_exp = BitCast(
   1690      di, Max(BitCast(d_exp_min_max, exp_a), BitCast(d_exp_min_max, exp_b)));
   1691 
   1692  // If either abs_a[i] or abs_b[i] is zero, min_exp[i] is equal to max_exp[i].
   1693  // Otherwise, if abs_a[i] and abs_b[i] are both nonzero, min_exp[i] is equal
   1694  // to HWY_MIN(exp_a[i], exp_b[i]).
   1695  const VI min_exp = IfThenElse(
   1696      Or(Eq(BitCast(di, abs_a), zero), Eq(BitCast(di, abs_b), zero)), max_exp,
   1697      BitCast(di, Min(BitCast(d_exp_min_max, exp_a),
   1698                      BitCast(d_exp_min_max, exp_b))));
   1699 
   1700  // scl_pow2[i] is the power of 2 to scale abs_a[i] and abs_b[i] by
   1701 
   1702  // abs_a[i] and abs_b[i] should be scaled by a factor that is greater than
   1703  // zero but less than or equal to
   1704  // pow(2, kMaxValToSquareBiasedExp - max_exp[i]) to ensure that that the
   1705  // multiplications or addition operations do not overflow if
   1706  // std::hypot(abs_a[i], abs_b[i]) is finite
   1707 
   1708  // If either abs_a[i] or abs_b[i] is a a positive value that is less than
   1709  // pow(2, kMinValToSquareBiasedExp - kExpBias), then scaling up abs_a[i] and
   1710  // abs_b[i] by pow(2, kMinValToSquareBiasedExp - min_exp[i]) will ensure that
   1711  // the multiplications and additions result in normal floating point values,
   1712  // infinities, or NaNs.
   1713 
   1714  // If HWY_MAX(kMinValToSquareBiasedExp - min_exp[i], 0) is greater than
   1715  // kMaxValToSquareBiasedExp - max_exp[i], scale abs_a[i] and abs_b[i] up by
   1716  // pow(2, kMaxValToSquareBiasedExp - max_exp[i]) to ensure that the
   1717  // multiplication and addition operations result in a finite result if
   1718  // std::hypot(abs_a[i], abs_b[i]) is finite.
   1719 
   1720  const VI scl_pow2 = BitCast(
   1721      di,
   1722      Min(BitCast(d_exp_min_max,
   1723                  SaturatedSub(BitCast(d_exp_sat_sub,
   1724                                       Set(di, static_cast<TI>(
   1725                                                   kMinValToSquareBiasedExp))),
   1726                               BitCast(d_exp_sat_sub, min_exp))),
   1727          BitCast(d_exp_min_max,
   1728                  Sub(Set(di, static_cast<TI>(kMaxValToSquareBiasedExp)),
   1729                      max_exp))));
   1730 
   1731  const VI exp_bias = Set(di, static_cast<TI>(kExpBias));
   1732 
   1733  const V ab_scl_factor =
   1734      BitCast(d, ShiftLeft<kNumOfMantBits>(Add(exp_bias, scl_pow2)));
   1735  const V hypot_scl_factor =
   1736      BitCast(d, ShiftLeft<kNumOfMantBits>(Sub(exp_bias, scl_pow2)));
   1737 
   1738  const V scl_a = Mul(abs_a, ab_scl_factor);
   1739  const V scl_b = Mul(abs_b, ab_scl_factor);
   1740 
   1741  const V scl_hypot = Sqrt(MulAdd(scl_a, scl_a, Mul(scl_b, scl_b)));
   1742  // std::hypot returns inf if one input is +/- inf, even if the other is NaN.
   1743  return IfThenElse(either_inf, Inf(d), Mul(scl_hypot, hypot_scl_factor));
   1744 }
   1745 
   1746 // NOLINTNEXTLINE(google-readability-namespace-comments)
   1747 }  // namespace HWY_NAMESPACE
   1748 }  // namespace hwy
   1749 HWY_AFTER_NAMESPACE();
   1750 
   1751 #endif  // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_