tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

set_macros-inl.h (24982B)


      1 // Copyright 2020 Google LLC
      2 // Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
      3 // SPDX-License-Identifier: Apache-2.0
      4 // SPDX-License-Identifier: BSD-3-Clause
      5 //
      6 // Licensed under the Apache License, Version 2.0 (the "License");
      7 // you may not use this file except in compliance with the License.
      8 // You may obtain a copy of the License at
      9 //
     10 //      http://www.apache.org/licenses/LICENSE-2.0
     11 //
     12 // Unless required by applicable law or agreed to in writing, software
     13 // distributed under the License is distributed on an "AS IS" BASIS,
     14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15 // See the License for the specific language governing permissions and
     16 // limitations under the License.
     17 
     18 // Sets macros based on HWY_TARGET.
     19 
     20 // This include guard is toggled by foreach_target, so avoid the usual _H_
     21 // suffix to prevent copybara from renaming it.
        //
        // The conditional below only flips HWY_SET_MACROS_PER_TARGET (defined <->
        // undefined) when its defined-ness matches that of HWY_TARGET_TOGGLE. It is
        // closed again immediately, so everything after it in this file is processed
        // on every inclusion.
     22 #if defined(HWY_SET_MACROS_PER_TARGET) == defined(HWY_TARGET_TOGGLE)
     23 #ifdef HWY_SET_MACROS_PER_TARGET
     24 #undef HWY_SET_MACROS_PER_TARGET
     25 #else
     26 #define HWY_SET_MACROS_PER_TARGET
     27 #endif
     28 
     29 #endif  // HWY_SET_MACROS_PER_TARGET
     30 
     31 #include "hwy/detect_compiler_arch.h"  // IWYU: export
     32 #include "hwy/detect_targets.h"        // IWYU: export
     33 
        // Clear the per-target macros left over from a previous inclusion; they are
        // defined again further below for the current HWY_TARGET.
     34 #undef HWY_NAMESPACE
     35 #undef HWY_ALIGN
     36 #undef HWY_MAX_BYTES
     37 #undef HWY_LANES
     38 
     39 #undef HWY_HAVE_SCALABLE
     40 #undef HWY_HAVE_TUPLE
     41 #undef HWY_HAVE_INTEGER64
     42 #undef HWY_HAVE_FLOAT16
     43 #undef HWY_HAVE_FLOAT64
     44 #undef HWY_MEM_OPS_MIGHT_FAULT
     45 #undef HWY_NATIVE_FMA
     46 #undef HWY_NATIVE_DOT_BF16
     47 #undef HWY_NATIVE_MASK
     48 #undef HWY_CAP_GE256
     49 #undef HWY_CAP_GE512
     50 
        // 0/1 flags describing the family of the current HWY_TARGET. SVE, NEON and PPC
        // are detected by AND-ing HWY_TARGET with the corresponding HWY_ALL_* mask;
        // AVX10_2 is an exact comparison.
     51 #undef HWY_TARGET_IS_SVE
     52 #if HWY_TARGET & HWY_ALL_SVE
     53 #define HWY_TARGET_IS_SVE 1
     54 #else
     55 #define HWY_TARGET_IS_SVE 0
     56 #endif
     57 
     58 #undef HWY_TARGET_IS_NEON
     59 #if HWY_TARGET & HWY_ALL_NEON
     60 #define HWY_TARGET_IS_NEON 1
     61 #else
     62 #define HWY_TARGET_IS_NEON 0
     63 #endif
     64 
     65 #undef HWY_TARGET_IS_PPC
     66 #if HWY_TARGET & HWY_ALL_PPC
     67 #define HWY_TARGET_IS_PPC 1
     68 #else
     69 #define HWY_TARGET_IS_PPC 0
     70 #endif
     71 
     72 #undef HWY_TARGET_IS_AVX10_2
     73 #if HWY_TARGET == HWY_AVX10_2
     74 #define HWY_TARGET_IS_AVX10_2 1
     75 #else
     76 #define HWY_TARGET_IS_AVX10_2 0
     77 #endif
     78 
     79 // HWY_HAVE_TUPLE is 1 on all targets, except on RVV when compiling with
        // GCC < 14 or Clang < 17, where it is 0.
     80 #if HWY_TARGET == HWY_RVV &&                                        \
     81    ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400) || \
     82     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1700))
     83 #define HWY_HAVE_TUPLE 0
     84 #else
     85 #define HWY_HAVE_TUPLE 1
     86 #endif
     87 
     88 // For internal use (clamping/validating N for Simd<>): 1 for the scalar target,
        // 65536 for every other target.
     89 #undef HWY_MAX_N
     90 #if HWY_TARGET == HWY_SCALAR
     91 #define HWY_MAX_N 1
     92 #else
     93 #define HWY_MAX_N 65536
     94 #endif
     95 
     96 // For internal use (clamping kPow2 for Simd<>)
     97 #undef HWY_MAX_POW2
     98 // For HWY_TARGET == HWY_RVV, LMUL <= 8. Even on other targets, we want to
     99 // support say Rebind<uint64_t, Simd<uint8_t, 1, 0>> d; whose kPow2 is also 3.
    100 // However, those other targets do not actually support multiple vectors, and
    101 // thus Lanes(d) must not exceed Lanes(ScalableTag<T>()).
        // Hence the same value (3, i.e. a factor of 8) is used for every target.
    102 #define HWY_MAX_POW2 3
    103 
    104 // User-visible. Loose lower bound that guarantees HWY_MAX_BYTES >>
    105 // (-HWY_MIN_POW2) <= 1. Useful for terminating compile-time recursions.
        // Example: on RVV, HWY_MAX_BYTES is 65536 and 65536 >> 16 == 1.
    106 #undef HWY_MIN_POW2
    107 #if HWY_TARGET == HWY_RVV
    108 #define HWY_MIN_POW2 -16
    109 #else
    110 // Tighter bound for other targets, whose vectors are smaller, to potentially
    111 // save compile time.
    112 #define HWY_MIN_POW2 -8
    113 #endif  // HWY_TARGET == HWY_RVV
    114 
    115 #undef HWY_TARGET_STR
    116 
        // Optional x86 feature groups. Each macro expands to an empty string literal
        // when the corresponding HWY_DISABLE_* macro is defined, so it can always be
        // appended to the target strings below.
    117 #if defined(HWY_DISABLE_PCLMUL_AES)
    118 #define HWY_TARGET_STR_PCLMUL_AES ""
    119 #else
    120 #define HWY_TARGET_STR_PCLMUL_AES ",pclmul,aes"
    121 #endif
    122 
    123 #if defined(HWY_DISABLE_BMI2_FMA)
    124 #define HWY_TARGET_STR_BMI2_FMA ""
    125 #else
    126 #define HWY_TARGET_STR_BMI2_FMA ",bmi,bmi2,fma"
    127 #endif
    128 
    129 #if defined(HWY_DISABLE_F16C)
    130 #define HWY_TARGET_STR_F16C ""
    131 #else
    132 #define HWY_TARGET_STR_F16C ",f16c"
    133 #endif
    134 
    135 #define HWY_TARGET_STR_SSE2 "sse2"
    136 
    137 #define HWY_TARGET_STR_SSSE3 "sse2,ssse3"
    138 
        // Adjacent string literals are concatenated, so each target string below is the
        // previous target's comma-separated feature list plus the newly added features.
    139 #define HWY_TARGET_STR_SSE4 \
    140  HWY_TARGET_STR_SSSE3 ",sse4.1,sse4.2" HWY_TARGET_STR_PCLMUL_AES
    141 // Include previous targets, which are the half-vectors of the next target.
    142 #define HWY_TARGET_STR_AVX2 \
    143  HWY_TARGET_STR_SSE4 ",avx,avx2" HWY_TARGET_STR_BMI2_FMA HWY_TARGET_STR_F16C
    144 
    145 #ifndef HWY_HAVE_EVEX512  // allow override
    146 // evex512 has been removed from clang 22, see
    147 // https://github.com/llvm/llvm-project/pull/157034
        // The default is therefore 1 only for GCC [14, 16) and Clang [18, 22).
    148 #if (1400 <= HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1600) || \
    149    (1800 <= HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 2200)
    150 #define HWY_HAVE_EVEX512 1
    151 #else
    152 #define HWY_HAVE_EVEX512 0
    153 #endif
    154 #endif
    155 
        // Suffix appended to the AVX3 target string. Without evex512 it expands to
        // nothing, which is still valid in the string concatenation below.
    156 #if (HWY_HAVE_EVEX512 == 1)
    157 #define HWY_TARGET_STR_AVX3_VL512 ",evex512"
    158 #else
    159 #define HWY_TARGET_STR_AVX3_VL512
    160 #endif
    161 
        // AVX3: the AVX2 features plus avx512f/cd/vl/dq/bw (and evex512 if enabled).
    162 #define HWY_TARGET_STR_AVX3 \
    163  HWY_TARGET_STR_AVX2       \
    164  ",avx512f,avx512cd,avx512vl,avx512dq,avx512bw" HWY_TARGET_STR_AVX3_VL512
    165 
        // AVX3_DL: AVX3 plus vpclmulqdq, avx512vbmi, avx512vbmi2, vaes, avx512vnni,
        // avx512bitalg, avx512vpopcntdq and gfni.
    166 #define HWY_TARGET_STR_AVX3_DL                                       \
    167  HWY_TARGET_STR_AVX3                                                \
    168  ",vpclmulqdq,avx512vbmi,avx512vbmi2,vaes,avx512vnni,avx512bitalg," \
    169  "avx512vpopcntdq,gfni"
    170 
    171 // Opt-out for compilers that do not properly support avx512bf16.
    172 #ifndef HWY_AVX3_ENABLE_AVX512BF16  // allow override
    173 // Default is to disable if HWY_AVX3_DISABLE_AVX512BF16 is defined, when using
        // clang-cl, or for old compilers (GCC < 10, Clang < 9).
    174 // clang-cl 21.1.4 reportedly works; feel free to define this to 1 there.
    175 #if defined(HWY_AVX3_DISABLE_AVX512BF16) ||                         \
    176    (HWY_COMPILER_CLANGCL ||                                        \
    177     (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1000) || \
    178     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 900))
    179 #define HWY_AVX3_ENABLE_AVX512BF16 0
    180 #else
    181 #define HWY_AVX3_ENABLE_AVX512BF16 1
    182 #endif
    183 #endif  // HWY_AVX3_ENABLE_AVX512BF16
    184 
        // AVX3_ZEN4 differs from AVX3_DL only by avx512bf16, and only if that is enabled.
    185 #if HWY_AVX3_ENABLE_AVX512BF16
    186 #define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL ",avx512bf16"
    187 #else
    188 #define HWY_TARGET_STR_AVX3_ZEN4 HWY_TARGET_STR_AVX3_DL
    189 #endif
    190 
        // AVX3_SPR: AVX3_ZEN4 plus avx512fp16, which is only requested for GCC >= 12 or
        // Clang >= 14.
    191 #if HWY_COMPILER_GCC_ACTUAL >= 1200 || HWY_COMPILER_CLANG >= 1400
    192 #define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4 ",avx512fp16"
    193 #else
    194 #define HWY_TARGET_STR_AVX3_SPR HWY_TARGET_STR_AVX3_ZEN4
    195 #endif
    196 
    197 // Support for avx10.2-512 was removed between clang 22 and 23 without a
    198 // feature test macro.
        // Note: the default HWY_HAVE_EVEX512 is 0 for Clang >= 22, so the first branch
        // is only taken if the user overrides HWY_HAVE_EVEX512 to a nonzero value.
    199 #if HWY_COMPILER_CLANG >= 2200 && HWY_HAVE_EVEX512
    200 #define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2-512"
    201 // Recent compilers drop the -512 suffix because 512 bits are always available.
    202 #elif HWY_COMPILER_GCC_ACTUAL >= 1500 || HWY_COMPILER_CLANG >= 2200
    203 #define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR ",avx10.2"
    204 #else
        // Older compilers: same feature list as AVX3_SPR.
    205 #define HWY_TARGET_STR_AVX10_2 HWY_TARGET_STR_AVX3_SPR
    206 #endif
    207 
        // ",crypto" is omitted from the PPC target strings if HWY_DISABLE_PPC8_CRYPTO
        // is defined.
    208 #if defined(HWY_DISABLE_PPC8_CRYPTO)
    209 #define HWY_TARGET_STR_PPC8_CRYPTO ""
    210 #else
    211 #define HWY_TARGET_STR_PPC8_CRYPTO ",crypto"
    212 #endif
    213 
        // Each PPC target string extends the previous one.
    214 #define HWY_TARGET_STR_PPC8 \
    215  "altivec,vsx,power8-vector" HWY_TARGET_STR_PPC8_CRYPTO
    216 #define HWY_TARGET_STR_PPC9 HWY_TARGET_STR_PPC8 ",power9-vector"
    217 
    218 #if HWY_COMPILER_CLANG
    219 #define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",power10-vector"
    220 #else
    221 // See #1707 and https://gcc.gnu.org/bugzilla/show_bug.cgi?id=102059#c35.
    222 // When the baseline is PPC 8 or 9, inlining functions such as PreventElision
    223 // into PPC10 code fails because PPC10 defaults to no-htm and is thus worse than
    224 // the baseline, which has htm. We cannot have pragma target on functions
    225 // outside HWY_NAMESPACE such as those in base.h. It would be possible for users
    226 // to set -mno-htm globally, but we can also work around this at the library
    227 // level by claiming that PPC10 still has HTM, thus avoiding the mismatch. This
    228 // seems to be safe because HTM uses builtins rather than modifying codegen, see
    229 // https://gcc.gnu.org/legacy-ml/gcc-patches/2013-07/msg00167.html.
    230 #define HWY_TARGET_STR_PPC10 HWY_TARGET_STR_PPC9 ",cpu=power10,htm"
    231 #endif
    232 
        // Z14/Z15 use an arch= string rather than a feature list.
    233 #define HWY_TARGET_STR_Z14 "arch=z14"
    234 #define HWY_TARGET_STR_Z15 "arch=z15"
    235 
    236 // Not protected by the include guard (which only toggles a macro at the top of
    237 // this file), so the macros below, including HWY_TARGET_STR, are redefined on
        // each include, governed by the current HWY_TARGET.
    238 
    239 //-----------------------------------------------------------------------------
    240 // SSE2: 16-byte vectors, no native FMA.
    241 #if HWY_TARGET == HWY_SSE2
    242 
    243 #define HWY_NAMESPACE N_SSE2
    244 #define HWY_ALIGN alignas(16)
    245 #define HWY_MAX_BYTES 16
    246 #define HWY_LANES(T) (16 / sizeof(T))
    247 
    248 #define HWY_HAVE_SCALABLE 0
    249 #define HWY_HAVE_INTEGER64 1
    250 #define HWY_HAVE_FLOAT16 0
    251 #define HWY_HAVE_FLOAT64 1
    252 #define HWY_MEM_OPS_MIGHT_FAULT 1
    253 #define HWY_NATIVE_FMA 0
    254 #define HWY_NATIVE_DOT_BF16 0
    255 #define HWY_NATIVE_MASK 0  // a few actually are
    256 #define HWY_CAP_GE256 0
    257 #define HWY_CAP_GE512 0
    258 
    259 #define HWY_TARGET_STR HWY_TARGET_STR_SSE2
    260 //-----------------------------------------------------------------------------
    261 // SSSE3: same settings as SSE2 apart from the namespace and target string.
    262 #elif HWY_TARGET == HWY_SSSE3
    263 
    264 #define HWY_NAMESPACE N_SSSE3
    265 #define HWY_ALIGN alignas(16)
    266 #define HWY_MAX_BYTES 16
    267 #define HWY_LANES(T) (16 / sizeof(T))
    268 
    269 #define HWY_HAVE_SCALABLE 0
    270 #define HWY_HAVE_INTEGER64 1
    271 #define HWY_HAVE_FLOAT16 0
    272 #define HWY_HAVE_FLOAT64 1
    273 #define HWY_MEM_OPS_MIGHT_FAULT 1
    274 #define HWY_NATIVE_FMA 0
    275 #define HWY_NATIVE_DOT_BF16 0
    276 #define HWY_NATIVE_MASK 0  // a few actually are
    277 #define HWY_CAP_GE256 0
    278 #define HWY_CAP_GE512 0
    279 
    280 #define HWY_TARGET_STR HWY_TARGET_STR_SSSE3
    281 
    282 //-----------------------------------------------------------------------------
    283 // SSE4: still 16-byte vectors without native FMA; only the namespace and target
        // string differ from SSE2/SSSE3.
    284 #elif HWY_TARGET == HWY_SSE4
    285 
    286 #define HWY_NAMESPACE N_SSE4
    287 #define HWY_ALIGN alignas(16)
    288 #define HWY_MAX_BYTES 16
    289 #define HWY_LANES(T) (16 / sizeof(T))
    290 
    291 #define HWY_HAVE_SCALABLE 0
    292 #define HWY_HAVE_INTEGER64 1
    293 #define HWY_HAVE_FLOAT16 0
    294 #define HWY_HAVE_FLOAT64 1
    295 #define HWY_MEM_OPS_MIGHT_FAULT 1
    296 #define HWY_NATIVE_FMA 0
    297 #define HWY_NATIVE_DOT_BF16 0
    298 #define HWY_NATIVE_MASK 0  // a few actually are
    299 #define HWY_CAP_GE256 0
    300 #define HWY_CAP_GE512 0
    301 
    302 #define HWY_TARGET_STR HWY_TARGET_STR_SSE4
    303 
    304 //-----------------------------------------------------------------------------
    305 // AVX2: 32-byte vectors (HWY_CAP_GE256 is 1).
    306 #elif HWY_TARGET == HWY_AVX2
    307 
    308 #define HWY_NAMESPACE N_AVX2
    309 #define HWY_ALIGN alignas(32)
    310 #define HWY_MAX_BYTES 32
    311 #define HWY_LANES(T) (32 / sizeof(T))
    312 
    313 #define HWY_HAVE_SCALABLE 0
    314 #define HWY_HAVE_INTEGER64 1
    315 #define HWY_HAVE_FLOAT16 0
    316 #define HWY_HAVE_FLOAT64 1
    317 #define HWY_MEM_OPS_MIGHT_FAULT 1
    318 
        // Must agree with HWY_TARGET_STR_BMI2_FMA, which omits "fma" from the target
        // string when HWY_DISABLE_BMI2_FMA is defined.
    319 #ifdef HWY_DISABLE_BMI2_FMA
    320 #define HWY_NATIVE_FMA 0
    321 #else
    322 #define HWY_NATIVE_FMA 1
    323 #endif
    324 #define HWY_NATIVE_DOT_BF16 0
    325 #define HWY_NATIVE_MASK 0  // a few actually are
    326 
    327 #define HWY_CAP_GE256 1
    328 #define HWY_CAP_GE512 0
    329 
    330 #define HWY_TARGET_STR HWY_TARGET_STR_AVX2
    331 
    332 //-----------------------------------------------------------------------------
    333 // AVX3[_DL/ZEN4/SPR]/AVX10
        // Settings shared by every target that compares <= HWY_AVX3: 64-byte vectors,
        // native FMA and native masks.
    334 #elif HWY_TARGET <= HWY_AVX3
    335 
    336 #define HWY_ALIGN alignas(64)
    337 #define HWY_MAX_BYTES 64
    338 #define HWY_LANES(T) (64 / sizeof(T))
    339 
    340 #define HWY_HAVE_SCALABLE 0
    341 #define HWY_HAVE_INTEGER64 1
        // f16 vectors: only for targets <= HWY_AVX3_SPR, with GCC or Clang >= 22, and
        // only if a scalar f16 type is available (HWY_HAVE_SCALAR_F16_TYPE).
    342 #if HWY_TARGET <= HWY_AVX3_SPR &&                              \
    343    (HWY_COMPILER_GCC_ACTUAL || HWY_COMPILER_CLANG >= 2200) && \
    344    HWY_HAVE_SCALAR_F16_TYPE
    345 #define HWY_HAVE_FLOAT16 1
    346 #else
    347 #define HWY_HAVE_FLOAT16 0
    348 #endif
    349 #define HWY_HAVE_FLOAT64 1
    350 #define HWY_MEM_OPS_MIGHT_FAULT 0
    351 #define HWY_NATIVE_FMA 1
        // bf16 dot product: only for targets <= HWY_AVX3_ZEN4 and if avx512bf16 has not
        // been disabled.
    352 #if (HWY_TARGET <= HWY_AVX3_ZEN4) && HWY_AVX3_ENABLE_AVX512BF16
    353 #define HWY_NATIVE_DOT_BF16 1
    354 #else
    355 #define HWY_NATIVE_DOT_BF16 0
    356 #endif
    357 #define HWY_NATIVE_MASK 1
    358 #define HWY_CAP_GE256 1
    359 
        // Always true here because HWY_MAX_BYTES was just defined as 64.
    360 #if HWY_MAX_BYTES >= 64
    361 #define HWY_CAP_GE512 1
    362 #else
    363 #define HWY_CAP_GE512 0
    364 #endif
    365 
        // Select the namespace and target string of the specific AVX3/AVX10 variant.
    366 #if HWY_TARGET == HWY_AVX3
    367 
    368 #define HWY_NAMESPACE N_AVX3
    369 #define HWY_TARGET_STR HWY_TARGET_STR_AVX3
    370 
    371 #elif HWY_TARGET == HWY_AVX3_DL
    372 
    373 #define HWY_NAMESPACE N_AVX3_DL
    374 #define HWY_TARGET_STR HWY_TARGET_STR_AVX3_DL
    375 
    376 #elif HWY_TARGET == HWY_AVX3_ZEN4
    377 
    378 #define HWY_NAMESPACE N_AVX3_ZEN4
    379 #define HWY_TARGET_STR HWY_TARGET_STR_AVX3_ZEN4
    380 
    381 #elif HWY_TARGET == HWY_AVX3_SPR
    382 
    383 #define HWY_NAMESPACE N_AVX3_SPR
    384 #define HWY_TARGET_STR HWY_TARGET_STR_AVX3_SPR
    385 
    386 #elif HWY_TARGET == HWY_AVX10_2
    387 
    388 #define HWY_NAMESPACE N_AVX10_2
    389 #define HWY_TARGET_STR HWY_TARGET_STR_AVX10_2
    390 
    391 #else
        // HWY_TARGET <= HWY_AVX3 but none of the variants handled above.
    392 #error "Logic error"
    393 #endif  // HWY_TARGET
    394 
    395 //-----------------------------------------------------------------------------
    396 // PPC8, PPC9, PPC10: 16-byte vectors with native FMA. The three variants differ
        // only in namespace and target string.
    397 #elif HWY_TARGET_IS_PPC
    398 
    399 #define HWY_ALIGN alignas(16)
    400 #define HWY_MAX_BYTES 16
    401 #define HWY_LANES(T) (16 / sizeof(T))
    402 
    403 #define HWY_HAVE_SCALABLE 0
    404 #define HWY_HAVE_INTEGER64 1
    405 #define HWY_HAVE_FLOAT16 0
    406 #define HWY_HAVE_FLOAT64 1
    407 #define HWY_MEM_OPS_MIGHT_FAULT 1
    408 #define HWY_NATIVE_FMA 1
    409 #define HWY_NATIVE_DOT_BF16 0
    410 #define HWY_NATIVE_MASK 0
    411 #define HWY_CAP_GE256 0
    412 #define HWY_CAP_GE512 0
    413 
    414 #if HWY_TARGET == HWY_PPC8
    415 
    416 #define HWY_NAMESPACE N_PPC8
    417 #define HWY_TARGET_STR HWY_TARGET_STR_PPC8
    418 
    419 #elif HWY_TARGET == HWY_PPC9
    420 
    421 #define HWY_NAMESPACE N_PPC9
    422 #define HWY_TARGET_STR HWY_TARGET_STR_PPC9
    423 
    424 #elif HWY_TARGET == HWY_PPC10
    425 
    426 #define HWY_NAMESPACE N_PPC10
    427 #define HWY_TARGET_STR HWY_TARGET_STR_PPC10
    428 
    429 #else
        // HWY_TARGET_IS_PPC is set but the target is none of PPC8/PPC9/PPC10.
    430 #error "Logic error"
    431 #endif  // HWY_TARGET
    432 
    433 //-----------------------------------------------------------------------------
    434 // Z14, Z15: 16-byte vectors with native FMA. The two variants differ only in
        // namespace and target string.
    435 #elif HWY_TARGET == HWY_Z14 || HWY_TARGET == HWY_Z15
    436 
    437 #define HWY_ALIGN alignas(16)
    438 #define HWY_MAX_BYTES 16
    439 #define HWY_LANES(T) (16 / sizeof(T))
    440 
    441 #define HWY_HAVE_SCALABLE 0
    442 #define HWY_HAVE_INTEGER64 1
    443 #define HWY_HAVE_FLOAT16 0
    444 #define HWY_HAVE_FLOAT64 1
    445 #define HWY_MEM_OPS_MIGHT_FAULT 1
    446 #define HWY_NATIVE_FMA 1
    447 #define HWY_NATIVE_DOT_BF16 0
    448 #define HWY_NATIVE_MASK 0
    449 #define HWY_CAP_GE256 0
    450 #define HWY_CAP_GE512 0
    451 
    452 #if HWY_TARGET == HWY_Z14
    453 
    454 #define HWY_NAMESPACE N_Z14
    455 #define HWY_TARGET_STR HWY_TARGET_STR_Z14
    456 
    457 #elif HWY_TARGET == HWY_Z15
    458 
    459 #define HWY_NAMESPACE N_Z15
    460 #define HWY_TARGET_STR HWY_TARGET_STR_Z15
    461 
    462 #else
        // Unreachable given the enclosing #elif condition.
    463 #error "Logic error"
    464 #endif  // HWY_TARGET == HWY_Z15
    465 
    466 //-----------------------------------------------------------------------------
    467 // NEON
    468 #elif HWY_TARGET_IS_NEON
    469 
        // HWY_NEON_HAVE_BFLOAT16 requires a scalar bf16 type and either the NEON_BF16
        // target (unless compiling with Clang < 18) or the compiler predefining
        // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC.
    470 // Clang 17 crashes with bf16, see github.com/llvm/llvm-project/issues/64179.
    471 #undef HWY_NEON_HAVE_BFLOAT16
    472 #if HWY_HAVE_SCALAR_BF16_TYPE &&                              \
    473    ((HWY_TARGET == HWY_NEON_BF16 &&                          \
    474      (!HWY_COMPILER_CLANG || HWY_COMPILER_CLANG >= 1800)) || \
    475     defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC))
    476 #define HWY_NEON_HAVE_BFLOAT16 1
    477 #else
    478 #define HWY_NEON_HAVE_BFLOAT16 0
    479 #endif
    480 
    481 // HWY_NEON_HAVE_F32_TO_BF16C is defined if NEON vcvt_bf16_f32 and
    482 // vbfdot_f32 are available, even if the __bf16 type is disabled due to
    483 // GCC/Clang bugs.
        // It is 1 if HWY_NEON_HAVE_BFLOAT16 is set, if the target is NEON_BF16, or if
        // __ARM_FEATURE_BF16_VECTOR_ARITHMETIC is predefined with GCC >= 10 or
        // Clang >= 11.
    484 #undef HWY_NEON_HAVE_F32_TO_BF16C
    485 #if HWY_NEON_HAVE_BFLOAT16 || HWY_TARGET == HWY_NEON_BF16 || \
    486    (defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) &&        \
    487     (HWY_COMPILER_GCC_ACTUAL >= 1000 || HWY_COMPILER_CLANG >= 1100))
    488 #define HWY_NEON_HAVE_F32_TO_BF16C 1
    489 #else
    490 #define HWY_NEON_HAVE_F32_TO_BF16C 0
    491 #endif
    492 
        // All NEON targets use 16-byte vectors.
    493 #define HWY_ALIGN alignas(16)
    494 #define HWY_MAX_BYTES 16
    495 #define HWY_LANES(T) (16 / sizeof(T))
    496 
    497 #define HWY_HAVE_SCALABLE 0
    498 #define HWY_HAVE_INTEGER64 1
        // f16 vectors if the compiler predefines __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        // or the target is NEON_BF16.
    499 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) || HWY_TARGET == HWY_NEON_BF16
    500 #define HWY_HAVE_FLOAT16 1
    501 #else
    502 #define HWY_HAVE_FLOAT16 0
    503 #endif
    504 
        // f64 vectors only when HWY_ARCH_ARM_A64 is set.
    505 #if HWY_ARCH_ARM_A64
    506 #define HWY_HAVE_FLOAT64 1
    507 #else
    508 #define HWY_HAVE_FLOAT64 0
    509 #endif
    510 
    511 #define HWY_MEM_OPS_MIGHT_FAULT 1
    512 
    513 #if defined(__ARM_FEATURE_FMA) || defined(__ARM_VFPV4__) || HWY_ARCH_ARM_A64
    514 #define HWY_NATIVE_FMA 1
    515 #else
    516 #define HWY_NATIVE_FMA 0
    517 #endif
    518 
        // The native bf16 dot product follows HWY_NEON_HAVE_F32_TO_BF16C.
    519 #if HWY_NEON_HAVE_F32_TO_BF16C
    520 #define HWY_NATIVE_DOT_BF16 1
    521 #else
    522 #define HWY_NATIVE_DOT_BF16 0
    523 #endif
    524 
    525 #define HWY_NATIVE_MASK 0
    526 
    527 #define HWY_CAP_GE256 0
    528 #define HWY_CAP_GE512 0
    529 
    530 #if HWY_TARGET == HWY_NEON_WITHOUT_AES
    531 #define HWY_NAMESPACE N_NEON_WITHOUT_AES
    532 #elif HWY_TARGET == HWY_NEON
    533 #define HWY_NAMESPACE N_NEON
    534 #elif HWY_TARGET == HWY_NEON_BF16
    535 #define HWY_NAMESPACE N_NEON_BF16
    536 #else
    537 #error "Logic error, missing case"
    538 #endif  // HWY_TARGET
    539 
    540 // Can use pragmas instead of -march compiler flag
    541 #if HWY_HAVE_RUNTIME_DISPATCH
    542 #if HWY_ARCH_ARM_V7
    543 
    544 // __attribute__((target("+neon-vfpv4"))) was introduced in gcc >= 8.
    545 #if HWY_COMPILER_GCC_ACTUAL >= 800
    546 #define HWY_TARGET_STR "+neon-vfpv4"
    547 #else   // GCC < 8 (or not GCC)
    548 // Do not define HWY_TARGET_STR (no pragma).
    549 #endif  // HWY_COMPILER_GCC_ACTUAL
    550 
    551 #else  // !HWY_ARCH_ARM_V7
    552 
    553 #if (HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1300) || \
    554    (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1300)
    555 // GCC 12 or earlier and Clang 12 or earlier require +crypto be added to the
    556 // target string to enable AArch64 AES intrinsics
    557 #define HWY_TARGET_STR_NEON "+crypto"
    558 #else
    559 #define HWY_TARGET_STR_NEON "+aes"
    560 #endif
    561 
    562 // Clang >= 16 requires +fullfp16 instead of fp16, but Apple Clang 15 = 1600
    563 // fails to parse unless the string starts with armv8, whereas 1700 refuses it.
    564 #if HWY_COMPILER_CLANG >= 1700
    565 #define HWY_TARGET_STR_FP16 "+fullfp16"
    566 #elif HWY_COMPILER_CLANG >= 1600 && defined(__apple_build_version__)
    567 #define HWY_TARGET_STR_FP16 "armv8.4-a+fullfp16"
    568 #else
    569 #define HWY_TARGET_STR_FP16 "+fp16"
    570 #endif
    571 
    572 #define HWY_TARGET_STR_I8MM "+i8mm"
    573 
    574 #if HWY_TARGET == HWY_NEON_WITHOUT_AES
    575 #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
    576 // Prevents inadvertent use of SVE by GCC 13.4 and earlier, see #2689.
    577 #define HWY_TARGET_STR "+nosve"
    578 #else
    579 // Do not define HWY_TARGET_STR (no pragma).
    580 #endif  // HWY_COMPILER_GCC_ACTUAL
    581 #elif HWY_TARGET == HWY_NEON
    582 #define HWY_TARGET_STR HWY_TARGET_STR_NEON
    583 #elif HWY_TARGET == HWY_NEON_BF16
        // NEON_BF16 enables fp16, i8mm, bf16 and dotprod in addition to the AES/crypto
        // feature used by HWY_NEON. FP16 comes first because one of its variants
        // starts with an architecture name.
    584 #define HWY_TARGET_STR \
    585  HWY_TARGET_STR_FP16 HWY_TARGET_STR_I8MM "+bf16+dotprod" HWY_TARGET_STR_NEON
    586 #else
    587 #error "Logic error, missing case"
    588 #endif  // HWY_TARGET
    589 
    590 #endif  // !HWY_ARCH_ARM_V7
    591 #else   // !HWY_HAVE_RUNTIME_DISPATCH
    592 // HWY_TARGET_STR remains undefined
    593 #endif
    594 
    595 //-----------------------------------------------------------------------------
    596 // SVE[2]
    597 #elif HWY_TARGET_IS_SVE
    598 
    599 // SVE only requires lane alignment, not natural alignment of the entire vector.
    600 #define HWY_ALIGN alignas(8)
    601 
    602 // Value ensures MaxLanes() is the tightest possible upper bound to reduce
    603 // overallocation.
        // HWY_MAX_BYTES is only defined further below (per SVE variant); that is fine
        // because a macro body is expanded at the point of use, not of definition.
    604 #define HWY_LANES(T) ((HWY_MAX_BYTES) / sizeof(T))
    605 
    606 #define HWY_HAVE_INTEGER64 1
    607 #define HWY_HAVE_FLOAT16 1
    608 #define HWY_HAVE_FLOAT64 1
    609 #define HWY_MEM_OPS_MIGHT_FAULT 0
    610 #define HWY_NATIVE_FMA 1
    611 #if HWY_SVE_HAVE_BF16_FEATURE
    612 #define HWY_NATIVE_DOT_BF16 1
    613 #else
    614 #define HWY_NATIVE_DOT_BF16 0
    615 #endif
    616 #define HWY_NATIVE_MASK 1
    617 #define HWY_CAP_GE256 0
    618 #define HWY_CAP_GE512 0
    619 
        // SVE2 and the fallback (N_SVE) are scalable with up to 256 bytes per vector;
        // SVE_256 and SVE2_128 have fixed sizes of 32 and 16 bytes.
    620 #if HWY_TARGET == HWY_SVE2
    621 #define HWY_NAMESPACE N_SVE2
    622 #define HWY_MAX_BYTES 256
    623 #define HWY_HAVE_SCALABLE 1
    624 #elif HWY_TARGET == HWY_SVE_256
    625 #define HWY_NAMESPACE N_SVE_256
    626 #define HWY_MAX_BYTES 32
    627 #define HWY_HAVE_SCALABLE 0
    628 #elif HWY_TARGET == HWY_SVE2_128
    629 #define HWY_NAMESPACE N_SVE2_128
    630 #define HWY_MAX_BYTES 16
    631 #define HWY_HAVE_SCALABLE 0
    632 #else
    633 #define HWY_NAMESPACE N_SVE
    634 #define HWY_MAX_BYTES 256
    635 #define HWY_HAVE_SCALABLE 1
    636 #endif
    637 
        // Only the SVE2 target strings below append HWY_TARGET_STR_I8MM; the other SVE
        // targets use plain "+sve".
    638 #define HWY_TARGET_STR_I8MM "+i8mm"
    639 
    640 // Can use pragmas instead of -march compiler flag
    641 #if HWY_HAVE_RUNTIME_DISPATCH
    642 #if HWY_TARGET == HWY_SVE2 || HWY_TARGET == HWY_SVE2_128
    643 // Static dispatch with -march=armv8-a+sve2+aes, or no baseline, hence dynamic
    644 // dispatch, which checks for AES support at runtime.
    645 #if defined(__ARM_FEATURE_SVE2_AES) || (HWY_BASELINE_SVE2 == 0)
    646 #define HWY_TARGET_STR "+sve2+sve2-aes,+sve" HWY_TARGET_STR_I8MM
    647 #else  // SVE2 without AES
    648 #define HWY_TARGET_STR "+sve2,+sve" HWY_TARGET_STR_I8MM
    649 #endif
    650 #else  // not SVE2 target
    651 #define HWY_TARGET_STR "+sve"
    652 #endif
    653 #else  // !HWY_HAVE_RUNTIME_DISPATCH
    654 // HWY_TARGET_STR remains undefined
    655 #endif
    656 
    657 //-----------------------------------------------------------------------------
    658 // WASM: 16-byte vectors, no native FMA.
    659 #elif HWY_TARGET == HWY_WASM
    660 
    661 #define HWY_ALIGN alignas(16)
    662 #define HWY_MAX_BYTES 16
    663 #define HWY_LANES(T) (16 / sizeof(T))
    664 
    665 #define HWY_HAVE_SCALABLE 0
    666 #define HWY_HAVE_INTEGER64 1
    667 #define HWY_HAVE_FLOAT16 0
    668 #define HWY_HAVE_FLOAT64 1
    669 #define HWY_MEM_OPS_MIGHT_FAULT 1
    670 #define HWY_NATIVE_FMA 0
    671 #define HWY_NATIVE_DOT_BF16 0
    672 #define HWY_NATIVE_MASK 0
    673 #define HWY_CAP_GE256 0
    674 #define HWY_CAP_GE512 0
    675 
    676 #define HWY_NAMESPACE N_WASM
    677 
    678 #define HWY_TARGET_STR "simd128"
    679 
    680 //-----------------------------------------------------------------------------
    681 // WASM_EMU256: 32-byte vectors (HWY_CAP_GE256 is 1) using the same "simd128"
        // target string as WASM.
    682 #elif HWY_TARGET == HWY_WASM_EMU256
    683 
    684 #define HWY_ALIGN alignas(32)
    685 #define HWY_MAX_BYTES 32
    686 #define HWY_LANES(T) (32 / sizeof(T))
    687 
    688 #define HWY_HAVE_SCALABLE 0
    689 #define HWY_HAVE_INTEGER64 1
    690 #define HWY_HAVE_FLOAT16 0
    691 #define HWY_HAVE_FLOAT64 1
    692 #define HWY_MEM_OPS_MIGHT_FAULT 1
    693 #define HWY_NATIVE_FMA 0
    694 #define HWY_NATIVE_DOT_BF16 0
    695 #define HWY_NATIVE_MASK 0
    696 #define HWY_CAP_GE256 1
    697 #define HWY_CAP_GE512 0
    698 
    699 #define HWY_NAMESPACE N_WASM_EMU256
    700 
    701 #define HWY_TARGET_STR "simd128"
    702 
    703 //-----------------------------------------------------------------------------
    704 // RVV
    705 #elif HWY_TARGET == HWY_RVV
    706 
    707 // RVV only requires lane alignment, not natural alignment of the entire vector,
    708 // and the compiler already aligns builtin types, so nothing to do here.
    709 #define HWY_ALIGN
    710 
    711 // The spec requires VLEN <= 2^16 bits, so the limit is 2^16 bytes (LMUL=8).
    712 #define HWY_MAX_BYTES 65536
    713 
    714 // = HWY_MAX_BYTES divided by max LMUL=8 because MaxLanes includes the actual
    715 // LMUL. This is the tightest possible upper bound.
    716 #define HWY_LANES(T) (8192 / sizeof(T))
    717 
    718 #define HWY_HAVE_SCALABLE 1
    719 #define HWY_HAVE_INTEGER64 1
    720 #define HWY_HAVE_FLOAT64 1
    721 #define HWY_MEM_OPS_MIGHT_FAULT 0
    722 #define HWY_NATIVE_FMA 1
    723 #define HWY_NATIVE_DOT_BF16 0
    724 #define HWY_NATIVE_MASK 1
    725 #define HWY_CAP_GE256 0
    726 #define HWY_CAP_GE512 0
    727 
        // f16 vectors follow HWY_RVV_HAVE_F16_VEC, which is not defined in this file.
    728 #if HWY_RVV_HAVE_F16_VEC
    729 #define HWY_HAVE_FLOAT16 1
    730 #else
    731 #define HWY_HAVE_FLOAT16 0
    732 #endif
    733 
    734 #define HWY_NAMESPACE N_RVV
    735 
        // A target attribute string is only provided for Clang >= 19.
    736 #if HWY_COMPILER_CLANG >= 1900
    737 // https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc#181-zvl-minimum-vector-length-standard-extensions
    738 #define HWY_TARGET_STR "arch=+v"
    739 #else
    740 // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
    741 #endif
    742 
    743 //-----------------------------------------------------------------------------
    744 // LSX/LASX: 16-byte (LSX) or 32-byte (LASX) vectors with native FMA.
    745 #elif HWY_TARGET == HWY_LSX || HWY_TARGET == HWY_LASX
    746 
        // A target string is only needed if the compiler has not already predefined
        // __loongarch_sx (for LSX) or __loongarch_asx (for LASX).
    747 #if HWY_TARGET == HWY_LSX
    748 #define HWY_ALIGN alignas(16)
    749 #define HWY_MAX_BYTES 16
    750 #ifndef __loongarch_sx
    751 #define HWY_TARGET_STR "lsx"
    752 #endif
    753 #else
    754 #define HWY_ALIGN alignas(32)
    755 #define HWY_MAX_BYTES 32
    756 #ifndef __loongarch_asx
    757 #define HWY_TARGET_STR "lsx,lasx"
    758 #endif
    759 #endif
    760 
    761 #define HWY_LANES(T) (HWY_MAX_BYTES / sizeof(T))
    762 
    763 #define HWY_HAVE_SCALABLE 0
    764 #define HWY_HAVE_INTEGER64 1
    765 #define HWY_HAVE_FLOAT16 0
    766 #define HWY_HAVE_FLOAT64 1
    767 #define HWY_MEM_OPS_MIGHT_FAULT 1
    768 #define HWY_NATIVE_FMA 1
    769 #define HWY_NATIVE_DOT_BF16 0
    770 #define HWY_NATIVE_MASK 0
    771 
    772 #if HWY_TARGET == HWY_LSX
    773 #define HWY_CAP_GE256 0
    774 #else
    775 #define HWY_CAP_GE256 1
    776 #endif
    777 
    778 #define HWY_CAP_GE512 0
    779 
    780 #if HWY_TARGET == HWY_LSX
    781 #define HWY_NAMESPACE N_LSX
    782 #else
    783 #define HWY_NAMESPACE N_LASX
    784 #endif
    785 
    786 // If __loongarch_sx/__loongarch_asx is predefined, HWY_TARGET_STR remains
        // undefined so HWY_ATTR is a no-op; otherwise it was defined above.
    787 
    788 //-----------------------------------------------------------------------------
    789 // EMU128: 16-byte vectors, no native FMA and no target string.
    790 #elif HWY_TARGET == HWY_EMU128
    791 
    792 #define HWY_ALIGN alignas(16)
    793 #define HWY_MAX_BYTES 16
    794 #define HWY_LANES(T) (16 / sizeof(T))
    795 
    796 #define HWY_HAVE_SCALABLE 0
    797 #define HWY_HAVE_INTEGER64 1
    798 #define HWY_HAVE_FLOAT16 0
    799 #define HWY_HAVE_FLOAT64 1
    800 #define HWY_MEM_OPS_MIGHT_FAULT 1
    801 #define HWY_NATIVE_FMA 0
    802 #define HWY_NATIVE_DOT_BF16 0
    803 #define HWY_NATIVE_MASK 0
    804 #define HWY_CAP_GE256 0
    805 #define HWY_CAP_GE512 0
    806 
    807 #define HWY_NAMESPACE N_EMU128
    808 
    809 // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
    810 
    811 //-----------------------------------------------------------------------------
    812 // SCALAR: a single lane per vector (HWY_LANES is 1), no alignment requirement
        // and no target string.
    813 #elif HWY_TARGET == HWY_SCALAR
    814 
    815 #define HWY_ALIGN
    816 #define HWY_MAX_BYTES 8
    817 #define HWY_LANES(T) 1
    818 
    819 #define HWY_HAVE_SCALABLE 0
    820 #define HWY_HAVE_INTEGER64 1
    821 #define HWY_HAVE_FLOAT16 0
    822 #define HWY_HAVE_FLOAT64 1
    823 #define HWY_MEM_OPS_MIGHT_FAULT 0
    824 #define HWY_NATIVE_FMA 0
    825 #define HWY_NATIVE_DOT_BF16 0
    826 #define HWY_NATIVE_MASK 0
    827 #define HWY_CAP_GE256 0
    828 #define HWY_CAP_GE512 0
    829 
    830 #define HWY_NAMESPACE N_SCALAR
    831 
    832 // HWY_TARGET_STR remains undefined so HWY_ATTR is a no-op.
    833 
    834 #else
        // Unknown HWY_TARGET: only a diagnostic message is emitted here, and none of
        // the per-target macros of the sections above are defined.
    835 #pragma message("HWY_TARGET does not match any known target")
    836 #endif  // HWY_TARGET
    837 
    838 //-----------------------------------------------------------------------------
    839 
    840 // Sanity check: if we have f16 vector support, then base.h should also be
    841 // using a built-in type for f16 scalars.
        // HWY_HAVE_SCALAR_F16_TYPE is not defined in this file; an undefined macro
        // evaluates to 0 inside #if.
    842 #if HWY_HAVE_FLOAT16 && !HWY_HAVE_SCALAR_F16_TYPE
    843 #error "Logic error: f16 vectors but no scalars"
    844 #endif
    845 
    846 // Override HWY_MEM_OPS_MIGHT_FAULT to 1 in asan/msan builds, which will still
        // fault, regardless of the value chosen by the target section above.
    847 #if HWY_IS_ASAN || HWY_IS_MSAN
    848 #undef HWY_MEM_OPS_MIGHT_FAULT
    849 #define HWY_MEM_OPS_MIGHT_FAULT 1
    850 #endif
    851 
    852 // Clang <9 requires this be invoked at file scope, before any namespace.
        // If HWY_TARGET_STR is defined, it is passed to HWY_PUSH_ATTRIBUTES (defined
        // elsewhere). The trailing static_assert has no effect other than making the
        // invocation require a terminating semicolon.
    853 #undef HWY_BEFORE_NAMESPACE
    854 #if defined(HWY_TARGET_STR)
    855 #define HWY_BEFORE_NAMESPACE()        \
    856  HWY_PUSH_ATTRIBUTES(HWY_TARGET_STR) \
    857  static_assert(true, "For requiring trailing semicolon")
    858 #else
    859 // avoids compiler warning if no HWY_TARGET_STR
    860 #define HWY_BEFORE_NAMESPACE() \
    861  static_assert(true, "For requiring trailing semicolon")
    862 #endif
    863 
    864 // Clang <9 requires any namespaces be closed before this macro.
        // Counterpart of HWY_BEFORE_NAMESPACE: expands to HWY_POP_ATTRIBUTES (defined
        // elsewhere) only if HWY_TARGET_STR is defined.
    865 #undef HWY_AFTER_NAMESPACE
    866 #if defined(HWY_TARGET_STR)
    867 #define HWY_AFTER_NAMESPACE() \
    868  HWY_POP_ATTRIBUTES          \
    869  static_assert(true, "For requiring trailing semicolon")
    870 #else
    871 // avoids compiler warning if no HWY_TARGET_STR
    872 #define HWY_AFTER_NAMESPACE() \
    873  static_assert(true, "For requiring trailing semicolon")
    874 #endif
    875 
        // Function attribute that applies HWY_TARGET_STR via __attribute__((target)).
        // Expands to nothing if HWY_TARGET_STR is undefined or HWY_HAS_ATTRIBUTE(target)
        // is false.
    876 #undef HWY_ATTR
    877 #if defined(HWY_TARGET_STR) && HWY_HAS_ATTRIBUTE(target)
    878 #define HWY_ATTR __attribute__((target(HWY_TARGET_STR)))
    879 #else
    880 #define HWY_ATTR
    881 #endif
    881 #endif