math-inl.h (59380B)
1 // Copyright 2020 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 // Include guard (still compiled once per target) 17 #if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \ 18 defined(HWY_TARGET_TOGGLE) // NOLINT 19 #ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_ 20 #undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_ 21 #else 22 #define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_ 23 #endif 24 25 #include <stddef.h> 26 #include <stdint.h> 27 28 #include "hwy/highway.h" 29 30 HWY_BEFORE_NAMESPACE(); 31 namespace hwy { 32 namespace HWY_NAMESPACE { 33 34 /** 35 * Highway SIMD version of std::acos(x). 36 * 37 * Valid Lane Types: float32, float64 38 * Max Error: ULP = 2 39 * Valid Range: [-1, +1] 40 * @return arc cosine of 'x' 41 */ 42 template <class D, class V> 43 HWY_INLINE V Acos(D d, V x); 44 template <class D, class V> 45 HWY_NOINLINE V CallAcos(const D d, VecArg<V> x) { 46 return Acos(d, x); 47 } 48 49 /** 50 * Highway SIMD version of std::acosh(x). 51 * 52 * Valid Lane Types: float32, float64 53 * Max Error: ULP = 3 54 * Valid Range: float32[1, +FLT_MAX], float64[1, +DBL_MAX] 55 * @return hyperbolic arc cosine of 'x' 56 */ 57 template <class D, class V> 58 HWY_INLINE V Acosh(D d, V x); 59 template <class D, class V> 60 HWY_NOINLINE V CallAcosh(const D d, VecArg<V> x) { 61 return Acosh(d, x); 62 } 63 64 /** 65 * Highway SIMD version of std::asin(x). 66 * 67 * Valid Lane Types: float32, float64 68 * Max Error: ULP = 2 69 * Valid Range: [-1, +1] 70 * @return arc sine of 'x' 71 */ 72 template <class D, class V> 73 HWY_INLINE V Asin(D d, V x); 74 template <class D, class V> 75 HWY_NOINLINE V CallAsin(const D d, VecArg<V> x) { 76 return Asin(d, x); 77 } 78 79 /** 80 * Highway SIMD version of std::asinh(x). 81 * 82 * Valid Lane Types: float32, float64 83 * Max Error: ULP = 3 84 * Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX] 85 * @return hyperbolic arc sine of 'x' 86 */ 87 template <class D, class V> 88 HWY_INLINE V Asinh(D d, V x); 89 template <class D, class V> 90 HWY_NOINLINE V CallAsinh(const D d, VecArg<V> x) { 91 return Asinh(d, x); 92 } 93 94 /** 95 * Highway SIMD version of std::atan(x). 96 * 97 * Valid Lane Types: float32, float64 98 * Max Error: ULP = 3 99 * Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX] 100 * @return arc tangent of 'x' 101 */ 102 template <class D, class V> 103 HWY_INLINE V Atan(D d, V x); 104 template <class D, class V> 105 HWY_NOINLINE V CallAtan(const D d, VecArg<V> x) { 106 return Atan(d, x); 107 } 108 109 /** 110 * Highway SIMD version of std::atanh(x). 111 * 112 * Valid Lane Types: float32, float64 113 * Max Error: ULP = 3 114 * Valid Range: (-1, +1) 115 * @return hyperbolic arc tangent of 'x' 116 */ 117 template <class D, class V> 118 HWY_INLINE V Atanh(D d, V x); 119 template <class D, class V> 120 HWY_NOINLINE V CallAtanh(const D d, VecArg<V> x) { 121 return Atanh(d, x); 122 } 123 124 // Atan2 was added later and some users may be implementing it themselves, so 125 // notify them that this version of Highway defines it already. 126 #ifndef HWY_HAVE_ATAN2 127 #define HWY_HAVE_ATAN2 1 128 #endif 129 130 /** 131 * Highway SIMD version of std::atan2(x). 132 * 133 * Valid Lane Types: float32, float64 134 * Correctly handles negative zero, infinities, and NaN. 135 * @return atan2 of 'y', 'x' 136 */ 137 template <class D, class V = VFromD<D>, class M = MFromD<D>, 138 typename T = TFromD<D>> 139 HWY_INLINE V Atan2(const D d, V y, V x) { 140 const V kHalf = Set(d, static_cast<T>(+0.5)); 141 const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264)); 142 const V kPi2 = Mul(kPi, kHalf); 143 144 const V k0 = Zero(d); 145 const M y_0 = Eq(y, k0); 146 const M x_0 = Eq(x, k0); 147 const M x_neg = Lt(x, k0); 148 const M y_inf = IsInf(y); 149 const M x_inf = IsInf(x); 150 const M nan = Or(IsNaN(y), IsNaN(x)); 151 152 const V if_xneg_pi = IfThenElseZero(x_neg, kPi); 153 // x= +inf: pi/4; -inf: 3*pi/4; else: pi/2 154 const V if_yinf = Mul(kHalf, IfThenElse(x_inf, Add(kPi2, if_xneg_pi), kPi)); 155 156 V t = Atan(d, Div(y, x)); 157 // Disambiguate between quadrants 1/3 and 2/4 by adding (Q2: Pi; Q3: -Pi). 158 t = Add(t, CopySignToAbs(if_xneg_pi, y)); 159 // Special cases for 0 and infinity: 160 t = IfThenElse(x_inf, if_xneg_pi, t); 161 t = IfThenElse(x_0, kPi2, t); 162 t = IfThenElse(y_inf, if_yinf, t); 163 t = IfThenElse(y_0, if_xneg_pi, t); 164 // Any input NaN => NaN, otherwise fix sign. 165 return IfThenElse(nan, NaN(d), CopySign(t, y)); 166 } 167 template <class D, class V> 168 HWY_NOINLINE V CallAtan2(const D d, VecArg<V> y, VecArg<V> x) { 169 return Atan2(d, y, x); 170 } 171 172 /** 173 * Highway SIMD version of std::cos(x). 174 * 175 * Valid Lane Types: float32, float64 176 * Max Error: ULP = 3 177 * Valid Range: [-39000, +39000] 178 * @return cosine of 'x' 179 */ 180 template <class D, class V> 181 HWY_INLINE V Cos(D d, V x); 182 template <class D, class V> 183 HWY_NOINLINE V CallCos(const D d, VecArg<V> x) { 184 return Cos(d, x); 185 } 186 187 /** 188 * Highway SIMD version of std::exp(x). 189 * 190 * Valid Lane Types: float32, float64 191 * Max Error: ULP = 1 192 * Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706] 193 * @return e^x 194 */ 195 template <class D, class V> 196 HWY_INLINE V Exp(D d, V x); 197 template <class D, class V> 198 HWY_NOINLINE V CallExp(const D d, VecArg<V> x) { 199 return Exp(d, x); 200 } 201 202 /** 203 * Highway SIMD version of std::exp2(x). 204 * 205 * Valid Lane Types: float32, float64 206 * Max Error: ULP = 2 207 * Valid Range: float32[-FLT_MAX, +128], float64[-DBL_MAX, +1024] 208 * @return 2^x 209 */ 210 template <class D, class V> 211 HWY_INLINE V Exp2(D d, V x); 212 template <class D, class V> 213 HWY_NOINLINE V CallExp2(const D d, VecArg<V> x) { 214 return Exp2(d, x); 215 } 216 217 /** 218 * Highway SIMD version of std::expm1(x). 219 * 220 * Valid Lane Types: float32, float64 221 * Max Error: ULP = 4 222 * Valid Range: float32[-FLT_MAX, +104], float64[-DBL_MAX, +706] 223 * @return e^x - 1 224 */ 225 template <class D, class V> 226 HWY_INLINE V Expm1(D d, V x); 227 template <class D, class V> 228 HWY_NOINLINE V CallExpm1(const D d, VecArg<V> x) { 229 return Expm1(d, x); 230 } 231 232 /** 233 * Highway SIMD version of std::log(x). 234 * 235 * Valid Lane Types: float32, float64 236 * Max Error: ULP = 4 237 * Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX] 238 * @return natural logarithm of 'x' 239 */ 240 template <class D, class V> 241 HWY_INLINE V Log(D d, V x); 242 template <class D, class V> 243 HWY_NOINLINE V CallLog(const D d, VecArg<V> x) { 244 return Log(d, x); 245 } 246 247 /** 248 * Highway SIMD version of std::log10(x). 249 * 250 * Valid Lane Types: float32, float64 251 * Max Error: ULP = 2 252 * Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX] 253 * @return base 10 logarithm of 'x' 254 */ 255 template <class D, class V> 256 HWY_INLINE V Log10(D d, V x); 257 template <class D, class V> 258 HWY_NOINLINE V CallLog10(const D d, VecArg<V> x) { 259 return Log10(d, x); 260 } 261 262 /** 263 * Highway SIMD version of std::log1p(x). 264 * 265 * Valid Lane Types: float32, float64 266 * Max Error: ULP = 2 267 * Valid Range: float32[0, +FLT_MAX], float64[0, +DBL_MAX] 268 * @return log(1 + x) 269 */ 270 template <class D, class V> 271 HWY_INLINE V Log1p(D d, V x); 272 template <class D, class V> 273 HWY_NOINLINE V CallLog1p(const D d, VecArg<V> x) { 274 return Log1p(d, x); 275 } 276 277 /** 278 * Highway SIMD version of std::log2(x). 279 * 280 * Valid Lane Types: float32, float64 281 * Max Error: ULP = 2 282 * Valid Range: float32(0, +FLT_MAX], float64(0, +DBL_MAX] 283 * @return base 2 logarithm of 'x' 284 */ 285 template <class D, class V> 286 HWY_INLINE V Log2(D d, V x); 287 template <class D, class V> 288 HWY_NOINLINE V CallLog2(const D d, VecArg<V> x) { 289 return Log2(d, x); 290 } 291 292 /** 293 * Highway SIMD version of std::sin(x). 294 * 295 * Valid Lane Types: float32, float64 296 * Max Error: ULP = 3 297 * Valid Range: [-39000, +39000] 298 * @return sine of 'x' 299 */ 300 template <class D, class V> 301 HWY_INLINE V Sin(D d, V x); 302 template <class D, class V> 303 HWY_NOINLINE V CallSin(const D d, VecArg<V> x) { 304 return Sin(d, x); 305 } 306 307 /** 308 * Highway SIMD version of std::sinh(x). 309 * 310 * Valid Lane Types: float32, float64 311 * Max Error: ULP = 4 312 * Valid Range: float32[-88.7228, +88.7228], float64[-709, +709] 313 * @return hyperbolic sine of 'x' 314 */ 315 template <class D, class V> 316 HWY_INLINE V Sinh(D d, V x); 317 template <class D, class V> 318 HWY_NOINLINE V CallSinh(const D d, VecArg<V> x) { 319 return Sinh(d, x); 320 } 321 322 /** 323 * Highway SIMD version of std::tanh(x). 324 * 325 * Valid Lane Types: float32, float64 326 * Max Error: ULP = 4 327 * Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX] 328 * @return hyperbolic tangent of 'x' 329 */ 330 template <class D, class V> 331 HWY_INLINE V Tanh(D d, V x); 332 template <class D, class V> 333 HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) { 334 return Tanh(d, x); 335 } 336 337 /** 338 * Highway SIMD version of SinCos. 339 * Compute the sine and cosine at the same time 340 * The performance should be around the same as calling Sin. 341 * 342 * Valid Lane Types: float32, float64 343 * Max Error: ULP = 1 344 * Valid Range: [-39000, +39000] 345 */ 346 template <class D, class V> 347 HWY_INLINE void SinCos(D d, V x, V& s, V& c); 348 template <class D, class V> 349 HWY_NOINLINE void CallSinCos(const D d, VecArg<V> x, V& s, V& c) { 350 SinCos(d, x, s, c); 351 } 352 353 /** 354 * Highway SIMD version of Hypot 355 * 356 * Valid Lane Types: float32, float64 357 * Max Error: ULP = 4 358 * Valid Range: float32[-FLT_MAX, +FLT_MAX], float64[-DBL_MAX, +DBL_MAX] 359 * @return hypotenuse of a and b 360 */ 361 template <class D, class V> 362 HWY_INLINE V Hypot(D d, V a, V b); 363 template <class D, class V> 364 HWY_NOINLINE V CallHypot(const D d, VecArg<V> a, VecArg<V> b) { 365 return Hypot(d, a, b); 366 } 367 368 //////////////////////////////////////////////////////////////////////////////// 369 // Implementation 370 //////////////////////////////////////////////////////////////////////////////// 371 namespace impl { 372 373 // Estrin's Scheme is a faster method for evaluating large polynomials on 374 // super scalar architectures. It works by factoring the Horner's Method 375 // polynomial into power of two sub-trees that can be evaluated in parallel. 376 // Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme 377 template <class T> 378 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) { 379 return MulAdd(c1, x, c0); 380 } 381 template <class T> 382 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) { 383 T x2 = Mul(x, x); 384 return MulAdd(x2, c2, MulAdd(c1, x, c0)); 385 } 386 template <class T> 387 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) { 388 T x2 = Mul(x, x); 389 return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)); 390 } 391 template <class T> 392 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) { 393 T x2 = Mul(x, x); 394 T x4 = Mul(x2, x2); 395 return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); 396 } 397 template <class T> 398 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) { 399 T x2 = Mul(x, x); 400 T x4 = Mul(x2, x2); 401 return MulAdd(x4, MulAdd(c5, x, c4), 402 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); 403 } 404 template <class T> 405 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 406 T c6) { 407 T x2 = Mul(x, x); 408 T x4 = Mul(x2, x2); 409 return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)), 410 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); 411 } 412 template <class T> 413 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 414 T c6, T c7) { 415 T x2 = Mul(x, x); 416 T x4 = Mul(x2, x2); 417 return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 418 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))); 419 } 420 template <class T> 421 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 422 T c6, T c7, T c8) { 423 T x2 = Mul(x, x); 424 T x4 = Mul(x2, x2); 425 T x8 = Mul(x4, x4); 426 return MulAdd(x8, c8, 427 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 428 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); 429 } 430 template <class T> 431 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 432 T c6, T c7, T c8, T c9) { 433 T x2 = Mul(x, x); 434 T x4 = Mul(x2, x2); 435 T x8 = Mul(x4, x4); 436 return MulAdd(x8, MulAdd(c9, x, c8), 437 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 438 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); 439 } 440 template <class T> 441 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 442 T c6, T c7, T c8, T c9, T c10) { 443 T x2 = Mul(x, x); 444 T x4 = Mul(x2, x2); 445 T x8 = Mul(x4, x4); 446 return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)), 447 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 448 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); 449 } 450 template <class T> 451 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 452 T c6, T c7, T c8, T c9, T c10, T c11) { 453 T x2 = Mul(x, x); 454 T x4 = Mul(x2, x2); 455 T x8 = Mul(x4, x4); 456 return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)), 457 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 458 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); 459 } 460 template <class T> 461 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 462 T c6, T c7, T c8, T c9, T c10, T c11, 463 T c12) { 464 T x2 = Mul(x, x); 465 T x4 = Mul(x2, x2); 466 T x8 = Mul(x4, x4); 467 return MulAdd( 468 x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), 469 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 470 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); 471 } 472 template <class T> 473 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 474 T c6, T c7, T c8, T c9, T c10, T c11, 475 T c12, T c13) { 476 T x2 = Mul(x, x); 477 T x4 = Mul(x2, x2); 478 T x8 = Mul(x4, x4); 479 return MulAdd(x8, 480 MulAdd(x4, MulAdd(c13, x, c12), 481 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), 482 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 483 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); 484 } 485 template <class T> 486 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 487 T c6, T c7, T c8, T c9, T c10, T c11, 488 T c12, T c13, T c14) { 489 T x2 = Mul(x, x); 490 T x4 = Mul(x2, x2); 491 T x8 = Mul(x4, x4); 492 return MulAdd(x8, 493 MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)), 494 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), 495 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 496 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); 497 } 498 template <class T> 499 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 500 T c6, T c7, T c8, T c9, T c10, T c11, 501 T c12, T c13, T c14, T c15) { 502 T x2 = Mul(x, x); 503 T x4 = Mul(x2, x2); 504 T x8 = Mul(x4, x4); 505 return MulAdd(x8, 506 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), 507 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), 508 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 509 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))); 510 } 511 template <class T> 512 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 513 T c6, T c7, T c8, T c9, T c10, T c11, 514 T c12, T c13, T c14, T c15, T c16) { 515 T x2 = Mul(x, x); 516 T x4 = Mul(x2, x2); 517 T x8 = Mul(x4, x4); 518 T x16 = Mul(x8, x8); 519 return MulAdd( 520 x16, c16, 521 MulAdd(x8, 522 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), 523 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), 524 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 525 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))))); 526 } 527 template <class T> 528 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 529 T c6, T c7, T c8, T c9, T c10, T c11, 530 T c12, T c13, T c14, T c15, T c16, T c17) { 531 T x2 = Mul(x, x); 532 T x4 = Mul(x2, x2); 533 T x8 = Mul(x4, x4); 534 T x16 = Mul(x8, x8); 535 return MulAdd( 536 x16, MulAdd(c17, x, c16), 537 MulAdd(x8, 538 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), 539 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), 540 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 541 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))))); 542 } 543 template <class T> 544 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5, 545 T c6, T c7, T c8, T c9, T c10, T c11, 546 T c12, T c13, T c14, T c15, T c16, T c17, 547 T c18) { 548 T x2 = Mul(x, x); 549 T x4 = Mul(x2, x2); 550 T x8 = Mul(x4, x4); 551 T x16 = Mul(x8, x8); 552 return MulAdd( 553 x16, MulAdd(x2, c18, MulAdd(c17, x, c16)), 554 MulAdd(x8, 555 MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)), 556 MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))), 557 MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)), 558 MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))))); 559 } 560 561 template <class FloatOrDouble> 562 struct AsinImpl {}; 563 template <class FloatOrDouble> 564 struct AtanImpl {}; 565 template <class FloatOrDouble> 566 struct CosSinImpl {}; 567 template <class FloatOrDouble> 568 struct ExpImpl {}; 569 template <class FloatOrDouble> 570 struct LogImpl {}; 571 template <class FloatOrDouble> 572 struct SinCosImpl {}; 573 574 template <> 575 struct AsinImpl<float> { 576 // Polynomial approximation for asin(x) over the range [0, 0.5). 577 template <class D, class V> 578 HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) { 579 const auto k0 = Set(d, +0.1666677296f); 580 const auto k1 = Set(d, +0.07495029271f); 581 const auto k2 = Set(d, +0.04547423869f); 582 const auto k3 = Set(d, +0.02424046025f); 583 const auto k4 = Set(d, +0.04197454825f); 584 585 return Estrin(x2, k0, k1, k2, k3, k4); 586 } 587 }; 588 589 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 590 591 template <> 592 struct AsinImpl<double> { 593 // Polynomial approximation for asin(x) over the range [0, 0.5). 594 template <class D, class V> 595 HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) { 596 const auto k0 = Set(d, +0.1666666666666497543); 597 const auto k1 = Set(d, +0.07500000000378581611); 598 const auto k2 = Set(d, +0.04464285681377102438); 599 const auto k3 = Set(d, +0.03038195928038132237); 600 const auto k4 = Set(d, +0.02237176181932048341); 601 const auto k5 = Set(d, +0.01735956991223614604); 602 const auto k6 = Set(d, +0.01388715184501609218); 603 const auto k7 = Set(d, +0.01215360525577377331); 604 const auto k8 = Set(d, +0.006606077476277170610); 605 const auto k9 = Set(d, +0.01929045477267910674); 606 const auto k10 = Set(d, -0.01581918243329996643); 607 const auto k11 = Set(d, +0.03161587650653934628); 608 609 return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11); 610 } 611 }; 612 613 #endif 614 615 template <> 616 struct AtanImpl<float> { 617 // Polynomial approximation for atan(x) over the range [0, 1.0). 618 template <class D, class V> 619 HWY_INLINE V AtanPoly(D d, V x) { 620 const auto k0 = Set(d, -0.333331018686294555664062f); 621 const auto k1 = Set(d, +0.199926957488059997558594f); 622 const auto k2 = Set(d, -0.142027363181114196777344f); 623 const auto k3 = Set(d, +0.106347933411598205566406f); 624 const auto k4 = Set(d, -0.0748900920152664184570312f); 625 const auto k5 = Set(d, +0.0425049886107444763183594f); 626 const auto k6 = Set(d, -0.0159569028764963150024414f); 627 const auto k7 = Set(d, +0.00282363896258175373077393f); 628 629 const auto y = Mul(x, x); 630 return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x); 631 } 632 }; 633 634 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 635 636 template <> 637 struct AtanImpl<double> { 638 // Polynomial approximation for atan(x) over the range [0, 1.0). 639 template <class D, class V> 640 HWY_INLINE V AtanPoly(D d, V x) { 641 const auto k0 = Set(d, -0.333333333333311110369124); 642 const auto k1 = Set(d, +0.199999999996591265594148); 643 const auto k2 = Set(d, -0.14285714266771329383765); 644 const auto k3 = Set(d, +0.111111105648261418443745); 645 const auto k4 = Set(d, -0.090908995008245008229153); 646 const auto k5 = Set(d, +0.0769219538311769618355029); 647 const auto k6 = Set(d, -0.0666573579361080525984562); 648 const auto k7 = Set(d, +0.0587666392926673580854313); 649 const auto k8 = Set(d, -0.0523674852303482457616113); 650 const auto k9 = Set(d, +0.0466667150077840625632675); 651 const auto k10 = Set(d, -0.0407629191276836500001934); 652 const auto k11 = Set(d, +0.0337852580001353069993897); 653 const auto k12 = Set(d, -0.0254517624932312641616861); 654 const auto k13 = Set(d, +0.016599329773529201970117); 655 const auto k14 = Set(d, -0.00889896195887655491740809); 656 const auto k15 = Set(d, +0.00370026744188713119232403); 657 const auto k16 = Set(d, -0.00110611831486672482563471); 658 const auto k17 = Set(d, +0.000209850076645816976906797); 659 const auto k18 = Set(d, -1.88796008463073496563746e-5); 660 661 const auto y = Mul(x, x); 662 return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11, 663 k12, k13, k14, k15, k16, k17, k18), 664 Mul(y, x), x); 665 } 666 }; 667 668 #endif 669 670 template <> 671 struct CosSinImpl<float> { 672 // Rounds float toward zero and returns as int32_t. 673 template <class D, class V> 674 HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { 675 return ConvertTo(Rebind<int32_t, D>(), x); 676 } 677 678 template <class D, class V> 679 HWY_INLINE V Poly(D d, V x) { 680 const auto k0 = Set(d, -1.66666597127914428710938e-1f); 681 const auto k1 = Set(d, +8.33307858556509017944336e-3f); 682 const auto k2 = Set(d, -1.981069071916863322258e-4f); 683 const auto k3 = Set(d, +2.6083159809786593541503e-6f); 684 685 const auto y = Mul(x, x); 686 return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x); 687 } 688 689 template <class D, class V, class VI32> 690 HWY_INLINE V CosReduce(D d, V x, VI32 q) { 691 // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2 692 const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f); 693 const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f); 694 const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f); 695 const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f); 696 697 // Extended precision modular arithmetic. 698 const V qf = ConvertTo(d, q); 699 x = MulAdd(qf, kHalfPiPart0f, x); 700 x = MulAdd(qf, kHalfPiPart1f, x); 701 x = MulAdd(qf, kHalfPiPart2f, x); 702 x = MulAdd(qf, kHalfPiPart3f, x); 703 return x; 704 } 705 706 template <class D, class V, class VI32> 707 HWY_INLINE V SinReduce(D d, V x, VI32 q) { 708 // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi 709 const V kPiPart0f = Set(d, -3.140625f); 710 const V kPiPart1f = Set(d, -0.0009670257568359375f); 711 const V kPiPart2f = Set(d, -6.2771141529083251953e-7f); 712 const V kPiPart3f = Set(d, -1.2154201256553420762e-10f); 713 714 // Extended precision modular arithmetic. 715 const V qf = ConvertTo(d, q); 716 x = MulAdd(qf, kPiPart0f, x); 717 x = MulAdd(qf, kPiPart1f, x); 718 x = MulAdd(qf, kPiPart2f, x); 719 x = MulAdd(qf, kPiPart3f, x); 720 return x; 721 } 722 723 // (q & 2) == 0 ? -0.0 : +0.0 724 template <class D, class VI32> 725 HWY_INLINE Vec<Rebind<float, D>> CosSignFromQuadrant(D d, VI32 q) { 726 const VI32 kTwo = Set(Rebind<int32_t, D>(), 2); 727 return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo))); 728 } 729 730 // ((q & 1) ? -0.0 : +0.0) 731 template <class D, class VI32> 732 HWY_INLINE Vec<Rebind<float, D>> SinSignFromQuadrant(D d, VI32 q) { 733 const VI32 kOne = Set(Rebind<int32_t, D>(), 1); 734 return BitCast(d, ShiftLeft<31>(And(q, kOne))); 735 } 736 }; 737 738 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 739 740 template <> 741 struct CosSinImpl<double> { 742 // Rounds double toward zero and returns as int32_t. 743 template <class D, class V> 744 HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { 745 return DemoteTo(Rebind<int32_t, D>(), x); 746 } 747 748 template <class D, class V> 749 HWY_INLINE V Poly(D d, V x) { 750 const auto k0 = Set(d, -0.166666666666666657414808); 751 const auto k1 = Set(d, +0.00833333333333332974823815); 752 const auto k2 = Set(d, -0.000198412698412696162806809); 753 const auto k3 = Set(d, +2.75573192239198747630416e-6); 754 const auto k4 = Set(d, -2.50521083763502045810755e-8); 755 const auto k5 = Set(d, +1.60590430605664501629054e-10); 756 const auto k6 = Set(d, -7.64712219118158833288484e-13); 757 const auto k7 = Set(d, +2.81009972710863200091251e-15); 758 const auto k8 = Set(d, -7.97255955009037868891952e-18); 759 760 const auto y = Mul(x, x); 761 return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x); 762 } 763 764 template <class D, class V, class VI32> 765 HWY_INLINE V CosReduce(D d, V x, VI32 q) { 766 // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2 767 const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914); 768 const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8); 769 const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16); 770 const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24); 771 772 // Extended precision modular arithmetic. 773 const V qf = PromoteTo(d, q); 774 x = MulAdd(qf, kHalfPiPart0d, x); 775 x = MulAdd(qf, kHalfPiPart1d, x); 776 x = MulAdd(qf, kHalfPiPart2d, x); 777 x = MulAdd(qf, kHalfPiPart3d, x); 778 return x; 779 } 780 781 template <class D, class V, class VI32> 782 HWY_INLINE V SinReduce(D d, V x, VI32 q) { 783 // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi 784 const V kPiPart0d = Set(d, -3.1415926218032836914); 785 const V kPiPart1d = Set(d, -3.1786509424591713469e-8); 786 const V kPiPart2d = Set(d, -1.2246467864107188502e-16); 787 const V kPiPart3d = Set(d, -1.2736634327021899816e-24); 788 789 // Extended precision modular arithmetic. 790 const V qf = PromoteTo(d, q); 791 x = MulAdd(qf, kPiPart0d, x); 792 x = MulAdd(qf, kPiPart1d, x); 793 x = MulAdd(qf, kPiPart2d, x); 794 x = MulAdd(qf, kPiPart3d, x); 795 return x; 796 } 797 798 // (q & 2) == 0 ? -0.0 : +0.0 799 template <class D, class VI32> 800 HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) { 801 const VI32 kTwo = Set(Rebind<int32_t, D>(), 2); 802 return BitCast( 803 d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo)))); 804 } 805 806 // ((q & 1) ? -0.0 : +0.0) 807 template <class D, class VI32> 808 HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) { 809 const VI32 kOne = Set(Rebind<int32_t, D>(), 1); 810 return BitCast( 811 d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne)))); 812 } 813 }; 814 815 #endif 816 817 template <> 818 struct ExpImpl<float> { 819 // Rounds float toward zero and returns as int32_t. 820 template <class D, class V> 821 HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { 822 return ConvertTo(Rebind<int32_t, D>(), x); 823 } 824 825 // Rounds float to nearest int32_t 826 template <class D, class V> 827 HWY_INLINE Vec<Rebind<int32_t, D>> ToNearestInt32(D /*unused*/, V x) { 828 return NearestInt(x); 829 } 830 831 template <class D, class V> 832 HWY_INLINE V ExpPoly(D d, V x) { 833 const auto k0 = Set(d, +0.5f); 834 const auto k1 = Set(d, +0.166666671633720397949219f); 835 const auto k2 = Set(d, +0.0416664853692054748535156f); 836 const auto k3 = Set(d, +0.00833336077630519866943359f); 837 const auto k4 = Set(d, +0.00139304355252534151077271f); 838 const auto k5 = Set(d, +0.000198527617612853646278381f); 839 840 return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x); 841 } 842 843 // Computes 2^x, where x is an integer. 844 template <class D, class VI32> 845 HWY_INLINE Vec<D> Pow2I(D d, VI32 x) { 846 const Rebind<int32_t, D> di32; 847 const VI32 kOffset = Set(di32, 0x7F); 848 return BitCast(d, ShiftLeft<23>(Add(x, kOffset))); 849 } 850 851 // Sets the exponent of 'x' to 2^e. 852 template <class D, class V, class VI32> 853 HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) { 854 const VI32 y = ShiftRight<1>(e); 855 return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y))); 856 } 857 858 template <class D, class V, class VI32> 859 HWY_INLINE V ExpReduce(D d, V x, VI32 q) { 860 // kLn2Part0f + kLn2Part1f ~= -ln(2) 861 const V kLn2Part0f = Set(d, -0.693145751953125f); 862 const V kLn2Part1f = Set(d, -1.428606765330187045e-6f); 863 864 // Extended precision modular arithmetic. 865 const V qf = ConvertTo(d, q); 866 x = MulAdd(qf, kLn2Part0f, x); 867 x = MulAdd(qf, kLn2Part1f, x); 868 return x; 869 } 870 871 template <class D, class V, class VI32> 872 HWY_INLINE V Exp2Reduce(D d, V x, VI32 q) { 873 const V x_frac = Sub(x, ConvertTo(d, q)); 874 return MulAdd(x_frac, Set(d, 0.193147182464599609375f), 875 Mul(x_frac, Set(d, 0.5f))); 876 } 877 }; 878 879 template <> 880 struct LogImpl<float> { 881 template <class D, class V> 882 HWY_INLINE Vec<Rebind<int32_t, D>> Log2p1NoSubnormal(D /*d*/, V x) { 883 const Rebind<int32_t, D> di32; 884 const Rebind<uint32_t, D> du32; 885 const auto kBias = Set(di32, 0x7F); 886 return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias); 887 } 888 889 // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)]. 890 template <class D, class V> 891 HWY_INLINE V LogPoly(D d, V x) { 892 const V k0 = Set(d, 0.66666662693f); 893 const V k1 = Set(d, 0.40000972152f); 894 const V k2 = Set(d, 0.28498786688f); 895 const V k3 = Set(d, 0.24279078841f); 896 897 const V x2 = Mul(x, x); 898 const V x4 = Mul(x2, x2); 899 return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4)); 900 } 901 }; 902 903 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 904 template <> 905 struct ExpImpl<double> { 906 // Rounds double toward zero and returns as int32_t. 907 template <class D, class V> 908 HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) { 909 return DemoteTo(Rebind<int32_t, D>(), x); 910 } 911 912 // Rounds double to nearest int32_t 913 template <class D, class V> 914 HWY_INLINE Vec<Rebind<int32_t, D>> ToNearestInt32(D /*unused*/, V x) { 915 return DemoteToNearestInt(Rebind<int32_t, D>(), x); 916 } 917 918 template <class D, class V> 919 HWY_INLINE V ExpPoly(D d, V x) { 920 const auto k0 = Set(d, +0.5); 921 const auto k1 = Set(d, +0.166666666666666851703837); 922 const auto k2 = Set(d, +0.0416666666666665047591422); 923 const auto k3 = Set(d, +0.00833333333331652721664984); 924 const auto k4 = Set(d, +0.00138888888889774492207962); 925 const auto k5 = Set(d, +0.000198412698960509205564975); 926 const auto k6 = Set(d, +2.4801587159235472998791e-5); 927 const auto k7 = Set(d, +2.75572362911928827629423e-6); 928 const auto k8 = Set(d, +2.75573911234900471893338e-7); 929 const auto k9 = Set(d, +2.51112930892876518610661e-8); 930 const auto k10 = Set(d, +2.08860621107283687536341e-9); 931 932 return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10), 933 Mul(x, x), x); 934 } 935 936 // Computes 2^x, where x is an integer. 937 template <class D, class VI32> 938 HWY_INLINE Vec<D> Pow2I(D d, VI32 x) { 939 const Rebind<int32_t, D> di32; 940 const Rebind<int64_t, D> di64; 941 const VI32 kOffset = Set(di32, 0x3FF); 942 return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset)))); 943 } 944 945 // Sets the exponent of 'x' to 2^e. 946 template <class D, class V, class VI32> 947 HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) { 948 const VI32 y = ShiftRight<1>(e); 949 return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y))); 950 } 951 952 template <class D, class V, class VI32> 953 HWY_INLINE V ExpReduce(D d, V x, VI32 q) { 954 // kLn2Part0d + kLn2Part1d ~= -ln(2) 955 const V kLn2Part0d = Set(d, -0.6931471805596629565116018); 956 const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12); 957 958 // Extended precision modular arithmetic. 959 const V qf = PromoteTo(d, q); 960 x = MulAdd(qf, kLn2Part0d, x); 961 x = MulAdd(qf, kLn2Part1d, x); 962 return x; 963 } 964 965 template <class D, class V, class VI32> 966 HWY_INLINE V Exp2Reduce(D d, V x, VI32 q) { 967 const V x_frac = Sub(x, PromoteTo(d, q)); 968 return MulAdd(x_frac, Set(d, 0.1931471805599453139823396), 969 Mul(x_frac, Set(d, 0.5))); 970 } 971 }; 972 973 template <> 974 struct LogImpl<double> { 975 template <class D, class V> 976 HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) { 977 const Rebind<int64_t, D> di64; 978 const Rebind<uint64_t, D> du64; 979 return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))), 980 Set(di64, 0x3FF)); 981 } 982 983 // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)]. 984 template <class D, class V> 985 HWY_INLINE V LogPoly(D d, V x) { 986 const V k0 = Set(d, 0.6666666666666735130); 987 const V k1 = Set(d, 0.3999999999940941908); 988 const V k2 = Set(d, 0.2857142874366239149); 989 const V k3 = Set(d, 0.2222219843214978396); 990 const V k4 = Set(d, 0.1818357216161805012); 991 const V k5 = Set(d, 0.1531383769920937332); 992 const V k6 = Set(d, 0.1479819860511658591); 993 994 const V x2 = Mul(x, x); 995 const V x4 = Mul(x2, x2); 996 return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2, 997 (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4))); 998 } 999 }; 1000 1001 #endif 1002 1003 template <class D, class V, bool kAllowSubnormals = true> 1004 HWY_INLINE V Log(const D d, V x) { 1005 // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info. 1006 using T = TFromD<D>; 1007 impl::LogImpl<T> impl; 1008 1009 constexpr bool kIsF32 = (sizeof(T) == 4); 1010 1011 // Float Constants 1012 const V kLn2Hi = Set(d, kIsF32 ? static_cast<T>(0.69313812256f) 1013 : static_cast<T>(0.693147180369123816490)); 1014 const V kLn2Lo = Set(d, kIsF32 ? static_cast<T>(9.0580006145e-6f) 1015 : static_cast<T>(1.90821492927058770002e-10)); 1016 const V kOne = Set(d, static_cast<T>(+1.0)); 1017 const V kMinNormal = Set(d, kIsF32 ? static_cast<T>(1.175494351e-38f) 1018 : static_cast<T>(2.2250738585072014e-308)); 1019 const V kScale = Set(d, kIsF32 ? static_cast<T>(3.355443200e+7f) 1020 : static_cast<T>(1.8014398509481984e+16)); 1021 1022 // Integer Constants 1023 using TI = MakeSigned<T>; 1024 const Rebind<TI, D> di; 1025 using VI = decltype(Zero(di)); 1026 const VI kLowerBits = Set(di, kIsF32 ? static_cast<TI>(0x00000000L) 1027 : static_cast<TI>(0xFFFFFFFFLL)); 1028 const VI kMagic = Set(di, kIsF32 ? static_cast<TI>(0x3F3504F3L) 1029 : static_cast<TI>(0x3FE6A09E00000000LL)); 1030 const VI kExpMask = Set(di, kIsF32 ? static_cast<TI>(0x3F800000L) 1031 : static_cast<TI>(0x3FF0000000000000LL)); 1032 const VI kExpScale = 1033 Set(di, kIsF32 ? static_cast<TI>(-25) : static_cast<TI>(-54)); 1034 const VI kManMask = Set(di, kIsF32 ? static_cast<TI>(0x7FFFFFL) 1035 : static_cast<TI>(0xFFFFF00000000LL)); 1036 1037 // Scale up 'x' so that it is no longer denormalized. 1038 VI exp_bits; 1039 V exp; 1040 if (kAllowSubnormals == true) { 1041 const auto is_denormal = Lt(x, kMinNormal); 1042 x = IfThenElse(is_denormal, Mul(x, kScale), x); 1043 1044 // Compute the new exponent. 1045 exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic)); 1046 const VI exp_scale = 1047 BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale))); 1048 exp = ConvertTo( 1049 d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)))); 1050 } else { 1051 // Compute the new exponent. 1052 exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic)); 1053 exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))); 1054 } 1055 1056 // Renormalize. 1057 const V y = Or(And(x, BitCast(d, kLowerBits)), 1058 BitCast(d, Add(And(exp_bits, kManMask), kMagic))); 1059 1060 // Approximate and reconstruct. 1061 const V ym1 = Sub(y, kOne); 1062 const V z = Div(ym1, Add(y, kOne)); 1063 1064 return MulSub( 1065 exp, kLn2Hi, 1066 Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1)); 1067 } 1068 1069 // SinCos 1070 // Based on "sse_mathfun.h", by Julien Pommier 1071 // http://gruntthepeon.free.fr/ssemath/ 1072 1073 // Third degree poly 1074 template <class D, class V> 1075 HWY_INLINE void SinCos3(D d, TFromD<D> dp1, TFromD<D> dp2, TFromD<D> dp3, V x, 1076 V& s, V& c) { 1077 using T = TFromD<D>; 1078 using TI = MakeSigned<T>; 1079 using DI = Rebind<TI, D>; 1080 const DI di; 1081 using VI = decltype(Zero(di)); 1082 using M = Mask<D>; 1083 1084 static constexpr size_t bits = sizeof(TI) * 8; 1085 const VI sign_mask = SignBit(di); 1086 const VI ci_0 = Zero(di); 1087 const VI ci_1 = Set(di, 1); 1088 const VI ci_2 = Set(di, 2); 1089 const VI ci_4 = Set(di, 4); 1090 const V cos_p0 = Set(d, ConvertScalarTo<T>(2.443315711809948E-005)); 1091 const V cos_p1 = Set(d, ConvertScalarTo<T>(-1.388731625493765E-003)); 1092 const V cos_p2 = Set(d, ConvertScalarTo<T>(4.166664568298827E-002)); 1093 const V sin_p0 = Set(d, ConvertScalarTo<T>(-1.9515295891E-4)); 1094 const V sin_p1 = Set(d, ConvertScalarTo<T>(8.3321608736E-3)); 1095 const V sin_p2 = Set(d, ConvertScalarTo<T>(-1.6666654611E-1)); 1096 const V FOPI = Set(d, ConvertScalarTo<T>(1.27323954473516)); // 4 / M_PI 1097 const V DP1 = Set(d, dp1); 1098 const V DP2 = Set(d, dp2); 1099 const V DP3 = Set(d, dp3); 1100 1101 V xmm1, xmm2, sign_bit_sin, y; 1102 VI imm0, imm2, imm4; 1103 1104 sign_bit_sin = x; 1105 x = Abs(x); 1106 1107 /* extract the sign bit (upper one) */ 1108 sign_bit_sin = And(sign_bit_sin, BitCast(d, sign_mask)); 1109 1110 /* scale by 4/Pi */ 1111 y = Mul(x, FOPI); 1112 1113 /* store the integer part of y in imm2 */ 1114 imm2 = ConvertTo(di, y); 1115 1116 /* j=(j+1) & (~1) (see the cephes sources) */ 1117 imm2 = Add(imm2, ci_1); 1118 imm2 = AndNot(ci_1, imm2); 1119 1120 y = ConvertTo(d, imm2); 1121 imm4 = imm2; 1122 1123 /* get the swap sign flag for the sine */ 1124 imm0 = And(imm2, ci_4); 1125 imm0 = ShiftLeft<bits - 3>(imm0); 1126 1127 V swap_sign_bit_sin = BitCast(d, imm0); 1128 1129 /* get the polynomial selection mask for the sine*/ 1130 imm2 = And(imm2, ci_2); 1131 M poly_mask = RebindMask(d, Eq(imm2, ci_0)); 1132 1133 /* The magic pass: "Extended precision modular arithmetic" 1134 x = ((x - y * DP1) - y * DP2) - y * DP3; */ 1135 x = MulAdd(y, DP1, x); 1136 x = MulAdd(y, DP2, x); 1137 x = MulAdd(y, DP3, x); 1138 1139 imm4 = Sub(imm4, ci_2); 1140 imm4 = AndNot(imm4, ci_4); 1141 imm4 = ShiftLeft<bits - 3>(imm4); 1142 1143 V sign_bit_cos = BitCast(d, imm4); 1144 1145 sign_bit_sin = Xor(sign_bit_sin, swap_sign_bit_sin); 1146 1147 /* Evaluate the first polynomial (0 <= x <= Pi/4) */ 1148 V z = Mul(x, x); 1149 1150 y = MulAdd(cos_p0, z, cos_p1); 1151 y = MulAdd(y, z, cos_p2); 1152 y = Mul(y, z); 1153 y = Mul(y, z); 1154 y = NegMulAdd(z, Set(d, 0.5f), y); 1155 y = Add(y, Set(d, 1)); 1156 1157 /* Evaluate the second polynomial (Pi/4 <= x <= 0) */ 1158 V y2 = MulAdd(sin_p0, z, sin_p1); 1159 y2 = MulAdd(y2, z, sin_p2); 1160 y2 = Mul(y2, z); 1161 y2 = MulAdd(y2, x, x); 1162 1163 /* select the correct result from the two polynomials */ 1164 xmm1 = IfThenElse(poly_mask, y2, y); 1165 xmm2 = IfThenElse(poly_mask, y, y2); 1166 1167 /* update the sign */ 1168 s = Xor(xmm1, sign_bit_sin); 1169 c = Xor(xmm2, sign_bit_cos); 1170 } 1171 1172 // Sixth degree poly 1173 template <class D, class V> 1174 HWY_INLINE void SinCos6(D d, TFromD<D> dp1, TFromD<D> dp2, TFromD<D> dp3, V x, 1175 V& s, V& c) { 1176 using T = TFromD<D>; 1177 using TI = MakeSigned<T>; 1178 using DI = Rebind<TI, D>; 1179 const DI di; 1180 using VI = decltype(Zero(di)); 1181 using M = Mask<D>; 1182 1183 static constexpr size_t bits = sizeof(TI) * 8; 1184 const VI sign_mask = SignBit(di); 1185 const VI ci_0 = Zero(di); 1186 const VI ci_1 = Set(di, 1); 1187 const VI ci_2 = Set(di, 2); 1188 const VI ci_4 = Set(di, 4); 1189 const V cos_p0 = Set(d, ConvertScalarTo<T>(-1.13585365213876817300E-11)); 1190 const V cos_p1 = Set(d, ConvertScalarTo<T>(2.08757008419747316778E-9)); 1191 const V cos_p2 = Set(d, ConvertScalarTo<T>(-2.75573141792967388112E-7)); 1192 const V cos_p3 = Set(d, ConvertScalarTo<T>(2.48015872888517045348E-5)); 1193 const V cos_p4 = Set(d, ConvertScalarTo<T>(-1.38888888888730564116E-3)); 1194 const V cos_p5 = Set(d, ConvertScalarTo<T>(4.16666666666665929218E-2)); 1195 const V sin_p0 = Set(d, ConvertScalarTo<T>(1.58962301576546568060E-10)); 1196 const V sin_p1 = Set(d, ConvertScalarTo<T>(-2.50507477628578072866E-8)); 1197 const V sin_p2 = Set(d, ConvertScalarTo<T>(2.75573136213857245213E-6)); 1198 const V sin_p3 = Set(d, ConvertScalarTo<T>(-1.98412698295895385996E-4)); 1199 const V sin_p4 = Set(d, ConvertScalarTo<T>(8.33333333332211858878E-3)); 1200 const V sin_p5 = Set(d, ConvertScalarTo<T>(-1.66666666666666307295E-1)); 1201 const V FOPI = // 4 / M_PI 1202 Set(d, ConvertScalarTo<T>(1.2732395447351626861510701069801148)); 1203 const V DP1 = Set(d, dp1); 1204 const V DP2 = Set(d, dp2); 1205 const V DP3 = Set(d, dp3); 1206 1207 V xmm1, xmm2, sign_bit_sin, y; 1208 VI imm0, imm2, imm4; 1209 1210 sign_bit_sin = x; 1211 x = Abs(x); 1212 1213 /* extract the sign bit (upper one) */ 1214 sign_bit_sin = And(sign_bit_sin, BitCast(d, sign_mask)); 1215 1216 /* scale by 4/Pi */ 1217 y = Mul(x, FOPI); 1218 1219 /* store the integer part of y in imm2 */ 1220 imm2 = ConvertTo(di, y); 1221 1222 /* j=(j+1) & (~1) (see the cephes sources) */ 1223 imm2 = Add(imm2, ci_1); 1224 imm2 = AndNot(ci_1, imm2); 1225 1226 y = ConvertTo(d, imm2); 1227 imm4 = imm2; 1228 1229 /* get the swap sign flag for the sine */ 1230 imm0 = And(imm2, ci_4); 1231 imm0 = ShiftLeft<bits - 3>(imm0); 1232 1233 V swap_sign_bit_sin = BitCast(d, imm0); 1234 1235 /* get the polynomial selection mask for the sine*/ 1236 imm2 = And(imm2, ci_2); 1237 M poly_mask = RebindMask(d, Eq(imm2, ci_0)); 1238 1239 /* The magic pass: "Extended precision modular arithmetic" 1240 x = ((x - y * DP1) - y * DP2) - y * DP3; */ 1241 x = MulAdd(y, DP1, x); 1242 x = MulAdd(y, DP2, x); 1243 x = MulAdd(y, DP3, x); 1244 1245 imm4 = Sub(imm4, ci_2); 1246 imm4 = AndNot(imm4, ci_4); 1247 imm4 = ShiftLeft<bits - 3>(imm4); 1248 1249 V sign_bit_cos = BitCast(d, imm4); 1250 sign_bit_sin = Xor(sign_bit_sin, swap_sign_bit_sin); 1251 1252 /* Evaluate the first polynomial (0 <= x <= Pi/4) */ 1253 V z = Mul(x, x); 1254 1255 y = MulAdd(cos_p0, z, cos_p1); 1256 y = MulAdd(y, z, cos_p2); 1257 y = MulAdd(y, z, cos_p3); 1258 y = MulAdd(y, z, cos_p4); 1259 y = MulAdd(y, z, cos_p5); 1260 y = Mul(y, z); 1261 y = Mul(y, z); 1262 y = NegMulAdd(z, Set(d, 0.5f), y); 1263 y = Add(y, Set(d, 1.0f)); 1264 1265 /* Evaluate the second polynomial (Pi/4 <= x <= 0) */ 1266 V y2 = MulAdd(sin_p0, z, sin_p1); 1267 y2 = MulAdd(y2, z, sin_p2); 1268 y2 = MulAdd(y2, z, sin_p3); 1269 y2 = MulAdd(y2, z, sin_p4); 1270 y2 = MulAdd(y2, z, sin_p5); 1271 y2 = Mul(y2, z); 1272 y2 = MulAdd(y2, x, x); 1273 1274 /* select the correct result from the two polynomials */ 1275 xmm1 = IfThenElse(poly_mask, y2, y); 1276 xmm2 = IfThenElse(poly_mask, y, y2); 1277 1278 /* update the sign */ 1279 s = Xor(xmm1, sign_bit_sin); 1280 c = Xor(xmm2, sign_bit_cos); 1281 } 1282 1283 template <> 1284 struct SinCosImpl<float> { 1285 template <class D, class V> 1286 HWY_INLINE void SinCos(D d, V x, V& s, V& c) { 1287 SinCos3(d, -0.78515625f, -2.4187564849853515625e-4f, 1288 -3.77489497744594108e-8f, x, s, c); 1289 } 1290 }; 1291 1292 #if HWY_HAVE_FLOAT64 && HWY_HAVE_INTEGER64 1293 template <> 1294 struct SinCosImpl<double> { 1295 template <class D, class V> 1296 HWY_INLINE void SinCos(D d, V x, V& s, V& c) { 1297 SinCos6(d, -7.85398125648498535156E-1, -3.77489470793079817668E-8, 1298 -2.69515142907905952645E-15, x, s, c); 1299 } 1300 }; 1301 #endif 1302 1303 } // namespace impl 1304 1305 template <class D, class V> 1306 HWY_INLINE V Acos(const D d, V x) { 1307 using T = TFromD<D>; 1308 1309 const V kZero = Zero(d); 1310 const V kHalf = Set(d, static_cast<T>(+0.5)); 1311 const V kPi = Set(d, static_cast<T>(+3.14159265358979323846264)); 1312 const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169)); 1313 1314 const V sign_x = And(SignBit(d), x); 1315 const V abs_x = Xor(x, sign_x); 1316 const auto mask = Lt(abs_x, kHalf); 1317 const V yy = 1318 IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf)); 1319 const V y = IfThenElse(mask, abs_x, Sqrt(yy)); 1320 1321 impl::AsinImpl<T> impl; 1322 const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy)); 1323 1324 const V t_plus_y = Add(t, y); 1325 const V z = 1326 IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))), 1327 Add(t_plus_y, t_plus_y)); 1328 return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z)); 1329 } 1330 1331 template <class D, class V> 1332 HWY_INLINE V Acosh(const D d, V x) { 1333 using T = TFromD<D>; 1334 1335 const V kLarge = Set(d, static_cast<T>(268435456.0)); 1336 const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227)); 1337 const V kOne = Set(d, static_cast<T>(+1.0)); 1338 const V kTwo = Set(d, static_cast<T>(+2.0)); 1339 1340 const auto is_x_large = Gt(x, kLarge); 1341 const auto is_x_gt_2 = Gt(x, kTwo); 1342 1343 const V x_minus_1 = Sub(x, kOne); 1344 const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x))); 1345 const V y1 = 1346 Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1); 1347 const V y2 = 1348 IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne)); 1349 const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2); 1350 1351 const auto is_pole = Eq(y2, kOne); 1352 const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne); 1353 return Add(IfThenElse(is_x_gt_2, z, 1354 IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))), 1355 IfThenElseZero(is_x_large, kLog2)); 1356 } 1357 1358 template <class D, class V> 1359 HWY_INLINE V Asin(const D d, V x) { 1360 using T = TFromD<D>; 1361 1362 const V kHalf = Set(d, static_cast<T>(+0.5)); 1363 const V kTwo = Set(d, static_cast<T>(+2.0)); 1364 const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169)); 1365 1366 const V sign_x = And(SignBit(d), x); 1367 const V abs_x = Xor(x, sign_x); 1368 const auto mask = Lt(abs_x, kHalf); 1369 const V yy = 1370 IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf)); 1371 const V y = IfThenElse(mask, abs_x, Sqrt(yy)); 1372 1373 impl::AsinImpl<T> impl; 1374 const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y); 1375 const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo); 1376 return Or(IfThenElse(mask, z0, z1), sign_x); 1377 } 1378 1379 template <class D, class V> 1380 HWY_INLINE V Asinh(const D d, V x) { 1381 using T = TFromD<D>; 1382 1383 const V kSmall = Set(d, static_cast<T>(1.0 / 268435456.0)); 1384 const V kLarge = Set(d, static_cast<T>(268435456.0)); 1385 const V kLog2 = Set(d, static_cast<T>(0.693147180559945286227)); 1386 const V kOne = Set(d, static_cast<T>(+1.0)); 1387 const V kTwo = Set(d, static_cast<T>(+2.0)); 1388 1389 const V sign_x = And(SignBit(d), x); // Extract the sign bit 1390 const V abs_x = Xor(x, sign_x); 1391 1392 const auto is_x_large = Gt(abs_x, kLarge); 1393 const auto is_x_lt_2 = Lt(abs_x, kTwo); 1394 1395 const V x2 = Mul(x, x); 1396 const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne)); 1397 1398 const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x))); 1399 const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x); 1400 const V y2 = 1401 IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0)); 1402 const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2); 1403 1404 const auto is_pole = Eq(y2, kOne); 1405 const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne); 1406 const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor)); 1407 const V y = IfThenElse(Lt(abs_x, kSmall), x, large); 1408 return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)), 1409 sign_x); 1410 } 1411 1412 template <class D, class V> 1413 HWY_INLINE V Atan(const D d, V x) { 1414 using T = TFromD<D>; 1415 1416 const V kOne = Set(d, static_cast<T>(+1.0)); 1417 const V kPiOverTwo = Set(d, static_cast<T>(+1.57079632679489661923132169)); 1418 1419 const V sign = And(SignBit(d), x); 1420 const V abs_x = Xor(x, sign); 1421 const auto mask = Gt(abs_x, kOne); 1422 1423 impl::AtanImpl<T> impl; 1424 const auto divisor = IfThenElse(mask, abs_x, kOne); 1425 const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x)); 1426 return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign); 1427 } 1428 1429 template <class D, class V> 1430 HWY_INLINE V Atanh(const D d, V x) { 1431 using T = TFromD<D>; 1432 1433 const V kHalf = Set(d, static_cast<T>(+0.5)); 1434 const V kOne = Set(d, static_cast<T>(+1.0)); 1435 1436 const V sign = And(SignBit(d), x); // Extract the sign bit 1437 const V abs_x = Xor(x, sign); 1438 return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))), 1439 Xor(kHalf, sign)); 1440 } 1441 1442 template <class D, class V> 1443 HWY_INLINE V Cos(const D d, V x) { 1444 using T = TFromD<D>; 1445 impl::CosSinImpl<T> impl; 1446 1447 // Float Constants 1448 const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153)); 1449 1450 // Integer Constants 1451 const Rebind<int32_t, D> di32; 1452 using VI32 = decltype(Zero(di32)); 1453 const VI32 kOne = Set(di32, 1); 1454 1455 const V y = Abs(x); // cos(x) == cos(|x|) 1456 1457 // Compute the quadrant, q = int(|x| / pi) * 2 + 1 1458 const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne); 1459 1460 // Reduce range, apply sign, and approximate. 1461 return impl.Poly( 1462 d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q))); 1463 } 1464 1465 template <class D, class V> 1466 HWY_INLINE V Exp(const D d, V x) { 1467 using T = TFromD<D>; 1468 1469 const V kHalf = Set(d, static_cast<T>(+0.5)); 1470 const V kLowerBound = 1471 Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0))); 1472 const V kNegZero = Set(d, static_cast<T>(-0.0)); 1473 const V kOne = Set(d, static_cast<T>(+1.0)); 1474 const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681)); 1475 1476 impl::ExpImpl<T> impl; 1477 1478 // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5)) 1479 const auto q = 1480 impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero)))); 1481 1482 // Reduce, approximate, and then reconstruct. 1483 const V y = impl.LoadExpShortRange( 1484 d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q); 1485 return IfThenElseZero(Ge(x, kLowerBound), y); 1486 } 1487 1488 template <class D, class V> 1489 HWY_INLINE V Exp2(const D d, V x) { 1490 using T = TFromD<D>; 1491 1492 const V kLowerBound = 1493 Set(d, static_cast<T>((sizeof(T) == 4 ? -150.0 : -1075.0))); 1494 const V kOne = Set(d, static_cast<T>(+1.0)); 1495 1496 impl::ExpImpl<T> impl; 1497 1498 // q = static_cast<int32_t>(std::lrint(x)) 1499 const auto q = impl.ToNearestInt32(d, x); 1500 1501 // Reduce, approximate, and then reconstruct. 1502 const V y = impl.LoadExpShortRange( 1503 d, Add(impl.ExpPoly(d, impl.Exp2Reduce(d, x, q)), kOne), q); 1504 return IfThenElseZero(Ge(x, kLowerBound), y); 1505 } 1506 1507 template <class D, class V> 1508 HWY_INLINE V Expm1(const D d, V x) { 1509 using T = TFromD<D>; 1510 1511 const V kHalf = Set(d, static_cast<T>(+0.5)); 1512 const V kLowerBound = 1513 Set(d, static_cast<T>((sizeof(T) == 4 ? -104.0 : -1000.0))); 1514 const V kLn2Over2 = Set(d, static_cast<T>(+0.346573590279972654708616)); 1515 const V kNegOne = Set(d, static_cast<T>(-1.0)); 1516 const V kNegZero = Set(d, static_cast<T>(-0.0)); 1517 const V kOne = Set(d, static_cast<T>(+1.0)); 1518 const V kOneOverLog2 = Set(d, static_cast<T>(+1.442695040888963407359924681)); 1519 1520 impl::ExpImpl<T> impl; 1521 1522 // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5)) 1523 const auto q = 1524 impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero)))); 1525 1526 // Reduce, approximate, and then reconstruct. 1527 const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q)); 1528 const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y, 1529 Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne)); 1530 return IfThenElse(Lt(x, kLowerBound), kNegOne, z); 1531 } 1532 1533 template <class D, class V> 1534 HWY_INLINE V Log(const D d, V x) { 1535 return impl::Log<D, V, /*kAllowSubnormals=*/true>(d, x); 1536 } 1537 1538 template <class D, class V> 1539 HWY_INLINE V Log10(const D d, V x) { 1540 using T = TFromD<D>; 1541 return Mul(Log(d, x), Set(d, static_cast<T>(0.4342944819032518276511))); 1542 } 1543 1544 template <class D, class V> 1545 HWY_INLINE V Log1p(const D d, V x) { 1546 using T = TFromD<D>; 1547 const V kOne = Set(d, static_cast<T>(+1.0)); 1548 1549 const V y = Add(x, kOne); 1550 const auto is_pole = Eq(y, kOne); 1551 const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne); 1552 const auto non_pole = 1553 Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor)); 1554 return IfThenElse(is_pole, x, non_pole); 1555 } 1556 1557 template <class D, class V> 1558 HWY_INLINE V Log2(const D d, V x) { 1559 using T = TFromD<D>; 1560 return Mul(Log(d, x), Set(d, static_cast<T>(1.44269504088896340735992))); 1561 } 1562 1563 template <class D, class V> 1564 HWY_INLINE V Sin(const D d, V x) { 1565 using T = TFromD<D>; 1566 impl::CosSinImpl<T> impl; 1567 1568 // Float Constants 1569 const V kOneOverPi = Set(d, static_cast<T>(0.31830988618379067153)); 1570 const V kHalf = Set(d, static_cast<T>(0.5)); 1571 1572 // Integer Constants 1573 const Rebind<int32_t, D> di32; 1574 using VI32 = decltype(Zero(di32)); 1575 1576 const V abs_x = Abs(x); 1577 const V sign_x = Xor(abs_x, x); 1578 1579 // Compute the quadrant, q = int((|x| / pi) + 0.5) 1580 const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf)); 1581 1582 // Reduce range, apply sign, and approximate. 1583 return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q), 1584 Xor(impl.SinSignFromQuadrant(d, q), sign_x))); 1585 } 1586 1587 template <class D, class V> 1588 HWY_INLINE V Sinh(const D d, V x) { 1589 using T = TFromD<D>; 1590 const V kHalf = Set(d, static_cast<T>(+0.5)); 1591 const V kOne = Set(d, static_cast<T>(+1.0)); 1592 const V kTwo = Set(d, static_cast<T>(+2.0)); 1593 1594 const V sign = And(SignBit(d), x); // Extract the sign bit 1595 const V abs_x = Xor(x, sign); 1596 const V y = Expm1(d, abs_x); 1597 const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf)); 1598 return Xor(z, sign); // Reapply the sign bit 1599 } 1600 1601 template <class D, class V> 1602 HWY_INLINE V Tanh(const D d, V x) { 1603 using T = TFromD<D>; 1604 const V kLimit = Set(d, static_cast<T>(18.714973875)); 1605 const V kOne = Set(d, static_cast<T>(+1.0)); 1606 const V kTwo = Set(d, static_cast<T>(+2.0)); 1607 1608 const V sign = And(SignBit(d), x); // Extract the sign bit 1609 const V abs_x = Xor(x, sign); 1610 const V y = Expm1(d, Mul(abs_x, kTwo)); 1611 const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo))); 1612 return Xor(z, sign); // Reapply the sign bit 1613 } 1614 1615 template <class D, class V> 1616 HWY_INLINE void SinCos(const D d, V x, V& s, V& c) { 1617 using T = TFromD<D>; 1618 impl::SinCosImpl<T> impl; 1619 impl.SinCos(d, x, s, c); 1620 } 1621 1622 template <class D, class V> 1623 HWY_INLINE V Hypot(const D d, V a, V b) { 1624 using T = TFromD<D>; 1625 using TI = MakeSigned<T>; 1626 const RebindToUnsigned<decltype(d)> du; 1627 const RebindToSigned<decltype(d)> di; 1628 using VI = VFromD<decltype(di)>; 1629 1630 constexpr int kMaxBiasedExp = static_cast<int>(MaxExponentField<T>()); 1631 static_assert(kMaxBiasedExp > 0, "kMaxBiasedExp > 0 must be true"); 1632 1633 constexpr int kNumOfMantBits = MantissaBits<T>(); 1634 static_assert(kNumOfMantBits > 0, "kNumOfMantBits > 0 must be true"); 1635 1636 constexpr int kExpBias = kMaxBiasedExp / 2; 1637 1638 static_assert( 1639 static_cast<unsigned>(kExpBias) + static_cast<unsigned>(kNumOfMantBits) < 1640 static_cast<unsigned>(kMaxBiasedExp), 1641 "kExpBias + kNumOfMantBits < kMaxBiasedExp must be true"); 1642 1643 // kMinValToSquareBiasedExp is the smallest biased exponent such that 1644 // pow(pow(2, kMinValToSquareBiasedExp - kExpBias) * x, 2) is either a normal 1645 // floating-point value or infinity if x is a non-zero, non-NaN value 1646 constexpr int kMinValToSquareBiasedExp = (kExpBias / 2) + kNumOfMantBits; 1647 static_assert(kMinValToSquareBiasedExp < kExpBias, 1648 "kMinValToSquareBiasedExp < kExpBias must be true"); 1649 1650 // kMaxValToSquareBiasedExp is the largest biased exponent such that 1651 // pow(pow(2, kMaxValToSquareBiasedExp - kExpBias) * x, 2) * 2 is guaranteed 1652 // to be a finite value if x is a finite value 1653 constexpr int kMaxValToSquareBiasedExp = kExpBias + ((kExpBias / 2) - 1); 1654 static_assert(kMaxValToSquareBiasedExp > kExpBias, 1655 "kMaxValToSquareBiasedExp > kExpBias must be true"); 1656 static_assert(kMaxValToSquareBiasedExp < kMaxBiasedExp, 1657 "kMaxValToSquareBiasedExp < kMaxBiasedExp must be true"); 1658 1659 #if HWY_TARGET == HWY_SCALAR || HWY_TARGET == HWY_EMU128 || \ 1660 HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15 1661 using TExpSatSub = MakeUnsigned<T>; 1662 using TExpMinMax = TI; 1663 #else 1664 using TExpSatSub = uint16_t; 1665 using TExpMinMax = int16_t; 1666 #endif 1667 1668 const Repartition<TExpSatSub, decltype(d)> d_exp_sat_sub; 1669 const Repartition<TExpMinMax, decltype(d)> d_exp_min_max; 1670 1671 const V abs_a = Abs(a); 1672 const V abs_b = Abs(b); 1673 1674 const MFromD<D> either_inf = Or(IsInf(a), IsInf(b)); 1675 1676 const VI zero = Zero(di); 1677 1678 // exp_a[i] is the biased exponent of abs_a[i] 1679 const VI exp_a = BitCast(di, ShiftRight<kNumOfMantBits>(BitCast(du, abs_a))); 1680 1681 // exp_b[i] is the biased exponent of abs_b[i] 1682 const VI exp_b = BitCast(di, ShiftRight<kNumOfMantBits>(BitCast(du, abs_b))); 1683 1684 // max_exp[i] is equal to HWY_MAX(exp_a[i], exp_b[i]) 1685 1686 // If abs_a[i] and abs_b[i] are both NaN values, max_exp[i] will be equal to 1687 // the biased exponent of the larger value. Otherwise, if either abs_a[i] or 1688 // abs_b[i] is NaN, max_exp[i] will be equal to kMaxBiasedExp. 1689 const VI max_exp = BitCast( 1690 di, Max(BitCast(d_exp_min_max, exp_a), BitCast(d_exp_min_max, exp_b))); 1691 1692 // If either abs_a[i] or abs_b[i] is zero, min_exp[i] is equal to max_exp[i]. 1693 // Otherwise, if abs_a[i] and abs_b[i] are both nonzero, min_exp[i] is equal 1694 // to HWY_MIN(exp_a[i], exp_b[i]). 1695 const VI min_exp = IfThenElse( 1696 Or(Eq(BitCast(di, abs_a), zero), Eq(BitCast(di, abs_b), zero)), max_exp, 1697 BitCast(di, Min(BitCast(d_exp_min_max, exp_a), 1698 BitCast(d_exp_min_max, exp_b)))); 1699 1700 // scl_pow2[i] is the power of 2 to scale abs_a[i] and abs_b[i] by 1701 1702 // abs_a[i] and abs_b[i] should be scaled by a factor that is greater than 1703 // zero but less than or equal to 1704 // pow(2, kMaxValToSquareBiasedExp - max_exp[i]) to ensure that that the 1705 // multiplications or addition operations do not overflow if 1706 // std::hypot(abs_a[i], abs_b[i]) is finite 1707 1708 // If either abs_a[i] or abs_b[i] is a a positive value that is less than 1709 // pow(2, kMinValToSquareBiasedExp - kExpBias), then scaling up abs_a[i] and 1710 // abs_b[i] by pow(2, kMinValToSquareBiasedExp - min_exp[i]) will ensure that 1711 // the multiplications and additions result in normal floating point values, 1712 // infinities, or NaNs. 1713 1714 // If HWY_MAX(kMinValToSquareBiasedExp - min_exp[i], 0) is greater than 1715 // kMaxValToSquareBiasedExp - max_exp[i], scale abs_a[i] and abs_b[i] up by 1716 // pow(2, kMaxValToSquareBiasedExp - max_exp[i]) to ensure that the 1717 // multiplication and addition operations result in a finite result if 1718 // std::hypot(abs_a[i], abs_b[i]) is finite. 1719 1720 const VI scl_pow2 = BitCast( 1721 di, 1722 Min(BitCast(d_exp_min_max, 1723 SaturatedSub(BitCast(d_exp_sat_sub, 1724 Set(di, static_cast<TI>( 1725 kMinValToSquareBiasedExp))), 1726 BitCast(d_exp_sat_sub, min_exp))), 1727 BitCast(d_exp_min_max, 1728 Sub(Set(di, static_cast<TI>(kMaxValToSquareBiasedExp)), 1729 max_exp)))); 1730 1731 const VI exp_bias = Set(di, static_cast<TI>(kExpBias)); 1732 1733 const V ab_scl_factor = 1734 BitCast(d, ShiftLeft<kNumOfMantBits>(Add(exp_bias, scl_pow2))); 1735 const V hypot_scl_factor = 1736 BitCast(d, ShiftLeft<kNumOfMantBits>(Sub(exp_bias, scl_pow2))); 1737 1738 const V scl_a = Mul(abs_a, ab_scl_factor); 1739 const V scl_b = Mul(abs_b, ab_scl_factor); 1740 1741 const V scl_hypot = Sqrt(MulAdd(scl_a, scl_a, Mul(scl_b, scl_b))); 1742 // std::hypot returns inf if one input is +/- inf, even if the other is NaN. 1743 return IfThenElse(either_inf, Inf(d), Mul(scl_hypot, hypot_scl_factor)); 1744 } 1745 1746 // NOLINTNEXTLINE(google-readability-namespace-comments) 1747 } // namespace HWY_NAMESPACE 1748 } // namespace hwy 1749 HWY_AFTER_NAMESPACE(); 1750 1751 #endif // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_