tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

arm_neon-inl.h (393118B)


      1 // Copyright 2019 Google LLC
      2 // Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
      3 // SPDX-License-Identifier: Apache-2.0
      4 // SPDX-License-Identifier: BSD-3-Clause
      5 //
      6 // Licensed under the Apache License, Version 2.0 (the "License");
      7 // you may not use this file except in compliance with the License.
      8 // You may obtain a copy of the License at
      9 //
     10 //      http://www.apache.org/licenses/LICENSE-2.0
     11 //
     12 // Unless required by applicable law or agreed to in writing, software
     13 // distributed under the License is distributed on an "AS IS" BASIS,
     14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     15 // See the License for the specific language governing permissions and
     16 // limitations under the License.
     17 
     18 // 128-bit Arm NEON vectors and operations.
     19 // External include guard in highway.h - see comment there.
     20 
     21 // Arm NEON intrinsics are documented at:
     22 // https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]
     23 
     24 #include "hwy/base.h"
     25 #include "hwy/ops/shared-inl.h"
     26 
     27 HWY_DIAGNOSTICS(push)
     28 HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
     29 #include <arm_neon.h>  // NOLINT(build/include_order)
     30 HWY_DIAGNOSTICS(pop)
     31 
     32 HWY_BEFORE_NAMESPACE();
     33 namespace hwy {
     34 namespace HWY_NAMESPACE {
     35 
     36 namespace detail {  // for code folding and Raw128
     37 
     38 // Macros used to define single and double function calls for multiple types
     39 // for full and half vectors. These macros are undefined at the end of the file.
     40 
     41 // HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
        // All three arities are currently empty: no template prefix is required,
        // but the hooks keep HWY_NEON_DEF_FUNCTION uniform across args = 1/2/3.
     42 #define HWY_NEON_BUILD_TPL_1
     43 #define HWY_NEON_BUILD_TPL_2
     44 #define HWY_NEON_BUILD_TPL_3
     45 
     46 // HWY_NEON_BUILD_RET_* is return type; type arg is without _t suffix so we can
     47 // extend it to int32x4x2_t packs.
     48 #define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
     49 #define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
     50 #define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>
     51 
     52 // HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
     53 #define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
     54 #define HWY_NEON_BUILD_PARAM_2(type, size) \
     55  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
     56 #define HWY_NEON_BUILD_PARAM_3(type, size)                        \
     57  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
     58      const Vec128<type##_t, size> c
     59 
     60 // HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
     61 // function.
        // The names a/b/c must match those declared by HWY_NEON_BUILD_PARAM_*.
     62 #define HWY_NEON_BUILD_ARG_1 a.raw
     63 #define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
     64 #define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw
     65 
     66 // We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
     67 // the __VA_ARGS__ have been expanded. This allows "func" to be a macro on
     68 // itself like with some of the library "functions" such as vshlq_u8. For
     69 // example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS) where MY_PARAMS is defined as
     70 // "a, b" (without the quotes) will end up expanding "vshlq_u8(a, b)" if needed.
     71 // Directly writing vshlq_u8(MY_PARAMS) would fail since vshlq_u8() macro
     72 // expects two arguments.
     73 #define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)
     74 
     75 // Main macro definition that defines a single function for the given type and
     76 // size of vector, using the underlying (prefix##infix##suffix) function and
     77 // the template, return type, parameters and arguments defined by the "args"
     78 // parameters passed here (see HWY_NEON_BUILD_* macros defined before).
        // E.g. prefix=vaddq, infix=_, suffix=u8 concatenate to call vaddq_u8, and
        // the result is wrapped back into the Vec128 return type.
     79 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
     80  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
     81  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
     82      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
     83    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
     84        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
     85  }
     86 
     87 // The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
     88 // called "name" using the set of neon functions starting with the given
     89 // "prefix" for all the variants of certain types, as specified next to each
     90 // macro. For example, the prefix "vsub" can be used to define the operator-
     91 // using args=2.
        //
        // The second argument to HWY_NEON_DEF_FUNCTION below is the lane count:
        // the full-vector variant appends "q" to the prefix (128-bit intrinsic);
        // all smaller lane counts share the 64-bit (non-q) intrinsic.
     92 
     93 // uint8_t
     94 #define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)      \
     95  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
     96  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)     \
     97  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)     \
     98  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)     \
     99  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)
    100 
    101 // int8_t
    102 #define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)      \
    103  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
    104  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)     \
    105  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)     \
    106  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)     \
    107  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)
    108 
    109 // uint16_t
    110 #define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)      \
    111  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
    112  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)    \
    113  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)    \
    114  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)
    115 
    116 // int16_t
    117 #define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)      \
    118  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
    119  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)    \
    120  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)    \
    121  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)
    122 
    123 // uint32_t
    124 #define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)      \
    125  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
    126  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)    \
    127  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)
    128 
    129 // int32_t
    130 #define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)      \
    131  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
    132  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)    \
    133  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)
    134 
    135 // uint64_t
    136 #define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
    137  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
    138  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
    139 
    140 // int64_t
    141 #define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)      \
    142  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
    143  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)
    144 
    145 // bfloat16_t
        // Only emitted when native bf16 vectors exist; otherwise expands to
        // nothing and generic fallbacks are selected via HWY_NEON_IF_EMULATED_D.
    146 #if HWY_NEON_HAVE_BFLOAT16
    147 #define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)       \
    148  HWY_NEON_DEF_FUNCTION(bfloat16, 8, name, prefix##q, infix, bf16, args) \
    149  HWY_NEON_DEF_FUNCTION(bfloat16, 4, name, prefix, infix, bf16, args)    \
    150  HWY_NEON_DEF_FUNCTION(bfloat16, 2, name, prefix, infix, bf16, args)    \
    151  HWY_NEON_DEF_FUNCTION(bfloat16, 1, name, prefix, infix, bf16, args)
    152 #else
    153 #define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
    154 #endif
    155 
    156 // Used for conversion instructions if HWY_NEON_HAVE_F16C.
    157 #define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \
    158                                                     args)                \
    159  HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args)    \
    160  HWY_NEON_DEF_FUNCTION(float16, 4, name, prefix, infix, f16, args)       \
    161  HWY_NEON_DEF_FUNCTION(float16, 2, name, prefix, infix, f16, args)       \
    162  HWY_NEON_DEF_FUNCTION(float16, 1, name, prefix, infix, f16, args)
    163 
    164 // float16_t
    165 #if HWY_HAVE_FLOAT16
    166 #define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args) \
    167  HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, args)
    168 #else
    169 #define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)
    170 #endif
    171 
    172 // Enable generic functions for whichever of (f16, bf16) are not supported.
        // These SFINAE helpers select either the NEON implementation or a generic
        // emulation path, depending on which special-float types are native.
    173 #if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
    174 #define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
    175 #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
    176 #define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_SPECIAL_FLOAT_D(D)
    177 #elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
    178 #define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
    179 #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_F16_D(D)
    180 #define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_F16_D(D)
    181 #elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
    182 #define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
    183 #define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
    184 #define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
    185 #elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
    186 // NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
    187 // hwy::EnableIf<false>* = nullptr to avoid compiler errors since
    188 // !hwy::IsSame<D, D>() is always false and as !hwy::IsSame<D, D>() will cause
    189 // SFINAE to occur instead of a hard error due to a dependency on the D template
    190 // argument
    191 #define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
    192 #define HWY_GENERIC_IF_EMULATED_D(D) \
    193  hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
    194 #define HWY_NEON_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
    195 #else
    196 #error "Logic error, handled all four cases"
    197 #endif
    198 
    199 // float
    200 #define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)      \
    201  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
    202  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)    \
    203  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)
    204 
    205 // double
    206 #if HWY_HAVE_FLOAT64
    207 #define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)      \
    208  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
    209  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
    210 #else
    211 #define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
    212 #endif
    213 
    214 // Helper macros to define for more than one type.
    215 // uint8_t, uint16_t and uint32_t
    216 #define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
    217  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
    218  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
    219  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)
    220 
    221 // int8_t, int16_t and int32_t
    222 #define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
    223  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
    224  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
    225  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)
    226 
    227 // uint8_t, uint16_t, uint32_t and uint64_t
    228 #define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)  \
    229  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
    230  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)
    231 
    232 // int8_t, int16_t, int32_t and int64_t
    233 #define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)  \
    234  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
    235  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)
    236 
    237 // All int*_t and uint*_t up to 64
    238 #define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
    239  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
    240  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)
    241 
    242 #define HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
    243  HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)          \
    244  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)
    245 
    246 #define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
    247  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)      \
    248  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
    249 
    250 // All previous types.
    251 #define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
    252  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
    253  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)
    254 
    255 #define HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args) \
    256  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)     \
    257  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)
    258 
    259 #define HWY_NEON_DEF_FUNCTION_UIF_8_16_32(name, prefix, infix, args) \
    260  HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args)        \
    261  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)
    262 
    263 #define HWY_NEON_DEF_FUNCTION_UIF_64(name, prefix, infix, args) \
    264  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
    265  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)       \
    266  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
    267 
    268 // For vzip1/2
    269 #define HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)   \
    270  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
    271  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)
    272 #define HWY_NEON_DEF_FUNCTION_FULL_UIF_64(name, prefix, infix, args) \
    273  HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)        \
    274  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args)
    275 
    276 // For eor3q, which is only defined for full vectors.
    277 #define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args)      \
    278  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)  \
    279  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
    280  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
    281  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)   \
    282  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args)  \
    283  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args)  \
    284  HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)
    285 // Emulation of some intrinsics on armv7.
        // AArch64 provides vuzp1/vuzp2/vzip1/vzip2, each returning one half of the
        // de-interleave/interleave result; Armv7 only has vuzp/vzip, which return
        // both halves in an x2 struct. Map the AArch64 names onto .val[0] (first
        // half) / .val[1] (second half) of the v7 result. Each macro expands its
        // x and y arguments exactly once, so there is no multiple-evaluation
        // hazard.
    286 #if HWY_ARCH_ARM_V7
    287 #define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
    288 #define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
    289 #define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
    290 #define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
    291 #define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
    292 #define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
    293 #define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
    294 #define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
    295 #define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
    296 #define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
    297 #define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
    298 #define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
    299 #define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
    300 #define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
    301 #define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
    302 #define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
    303 #define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
    304 #define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
    305 #define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
    306 #define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
    307 #define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
    308 #define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
    309 #define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
    310 #define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
    311 #define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
    312 #define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
    313 #define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
    314 #define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
    315 #define vzip1_s8(x, y) vzip_s8(x, y).val[0]
    316 #define vzip1_u8(x, y) vzip_u8(x, y).val[0]
    317 #define vzip1_s16(x, y) vzip_s16(x, y).val[0]
    318 #define vzip1_u16(x, y) vzip_u16(x, y).val[0]
    319 #define vzip1_f32(x, y) vzip_f32(x, y).val[0]
    320 #define vzip1_u32(x, y) vzip_u32(x, y).val[0]
    321 #define vzip1_s32(x, y) vzip_s32(x, y).val[0]
    322 #define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
    323 #define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
    324 #define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
    325 #define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
    326 #define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
    327 #define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
    328 #define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
    329 #define vzip2_s8(x, y) vzip_s8(x, y).val[1]
    330 #define vzip2_u8(x, y) vzip_u8(x, y).val[1]
    331 #define vzip2_s16(x, y) vzip_s16(x, y).val[1]
    332 #define vzip2_u16(x, y) vzip_u16(x, y).val[1]
    333 #define vzip2_s32(x, y) vzip_s32(x, y).val[1]
    334 #define vzip2_u32(x, y) vzip_u32(x, y).val[1]
    335 #define vzip2_f32(x, y) vzip_f32(x, y).val[1]
    336 #define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
    337 #define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
    338 #define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
    339 #define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
    340 #define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
    341 #define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
    342 #define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
    343 #endif
    344 
    345 // Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2
    346 // overloads for all vector types, even those (bfloat16_t) where the
    347 // underlying vector is the same as others (uint16_t).
        //
        // Pattern for each lane type below: one explicit specialization for the
        // full 128-bit vector (e.g. uint8x16x2_t) and one templated partial
        // specialization matching all smaller N, which uses the 64-bit pack
        // (e.g. uint8x8x2_t). The exact-N specialization is preferred when both
        // match.
    348 template <typename T, size_t N>
    349 struct Tuple2;
    350 template <typename T, size_t N>
    351 struct Tuple3;
    352 template <typename T, size_t N>
    353 struct Tuple4;
    354 
        // ------------------------------ Tuple2 (pairs of vectors)
    355 template <>
    356 struct Tuple2<uint8_t, 16> {
    357  uint8x16x2_t raw;
    358 };
    359 template <size_t N>
    360 struct Tuple2<uint8_t, N> {
    361  uint8x8x2_t raw;
    362 };
    363 template <>
    364 struct Tuple2<int8_t, 16> {
    365  int8x16x2_t raw;
    366 };
    367 template <size_t N>
    368 struct Tuple2<int8_t, N> {
    369  int8x8x2_t raw;
    370 };
    371 template <>
    372 struct Tuple2<uint16_t, 8> {
    373  uint16x8x2_t raw;
    374 };
    375 template <size_t N>
    376 struct Tuple2<uint16_t, N> {
    377  uint16x4x2_t raw;
    378 };
    379 template <>
    380 struct Tuple2<int16_t, 8> {
    381  int16x8x2_t raw;
    382 };
    383 template <size_t N>
    384 struct Tuple2<int16_t, N> {
    385  int16x4x2_t raw;
    386 };
    387 template <>
    388 struct Tuple2<uint32_t, 4> {
    389  uint32x4x2_t raw;
    390 };
    391 template <size_t N>
    392 struct Tuple2<uint32_t, N> {
    393  uint32x2x2_t raw;
    394 };
    395 template <>
    396 struct Tuple2<int32_t, 4> {
    397  int32x4x2_t raw;
    398 };
    399 template <size_t N>
    400 struct Tuple2<int32_t, N> {
    401  int32x2x2_t raw;
    402 };
    403 template <>
    404 struct Tuple2<uint64_t, 2> {
    405  uint64x2x2_t raw;
    406 };
    407 template <size_t N>
    408 struct Tuple2<uint64_t, N> {
    409  uint64x1x2_t raw;
    410 };
    411 template <>
    412 struct Tuple2<int64_t, 2> {
    413  int64x2x2_t raw;
    414 };
    415 template <size_t N>
    416 struct Tuple2<int64_t, N> {
    417  int64x1x2_t raw;
    418 };
    419 
    420 template <>
    421 struct Tuple2<float32_t, 4> {
    422  float32x4x2_t raw;
    423 };
    424 template <size_t N>
    425 struct Tuple2<float32_t, N> {
    426  float32x2x2_t raw;
    427 };
    428 #if HWY_HAVE_FLOAT64
    429 template <>
    430 struct Tuple2<float64_t, 2> {
    431  float64x2x2_t raw;
    432 };
    433 template <size_t N>
    434 struct Tuple2<float64_t, N> {
    435  float64x1x2_t raw;
    436 };
    437 #endif  // HWY_HAVE_FLOAT64
    438 
        // ------------------------------ Tuple3 (triples of vectors)
    439 template <>
    440 struct Tuple3<uint8_t, 16> {
    441  uint8x16x3_t raw;
    442 };
    443 template <size_t N>
    444 struct Tuple3<uint8_t, N> {
    445  uint8x8x3_t raw;
    446 };
    447 template <>
    448 struct Tuple3<int8_t, 16> {
    449  int8x16x3_t raw;
    450 };
    451 template <size_t N>
    452 struct Tuple3<int8_t, N> {
    453  int8x8x3_t raw;
    454 };
    455 template <>
    456 struct Tuple3<uint16_t, 8> {
    457  uint16x8x3_t raw;
    458 };
    459 template <size_t N>
    460 struct Tuple3<uint16_t, N> {
    461  uint16x4x3_t raw;
    462 };
    463 template <>
    464 struct Tuple3<int16_t, 8> {
    465  int16x8x3_t raw;
    466 };
    467 template <size_t N>
    468 struct Tuple3<int16_t, N> {
    469  int16x4x3_t raw;
    470 };
    471 template <>
    472 struct Tuple3<uint32_t, 4> {
    473  uint32x4x3_t raw;
    474 };
    475 template <size_t N>
    476 struct Tuple3<uint32_t, N> {
    477  uint32x2x3_t raw;
    478 };
    479 template <>
    480 struct Tuple3<int32_t, 4> {
    481  int32x4x3_t raw;
    482 };
    483 template <size_t N>
    484 struct Tuple3<int32_t, N> {
    485  int32x2x3_t raw;
    486 };
    487 template <>
    488 struct Tuple3<uint64_t, 2> {
    489  uint64x2x3_t raw;
    490 };
    491 template <size_t N>
    492 struct Tuple3<uint64_t, N> {
    493  uint64x1x3_t raw;
    494 };
    495 template <>
    496 struct Tuple3<int64_t, 2> {
    497  int64x2x3_t raw;
    498 };
    499 template <size_t N>
    500 struct Tuple3<int64_t, N> {
    501  int64x1x3_t raw;
    502 };
    503 
    504 template <>
    505 struct Tuple3<float32_t, 4> {
    506  float32x4x3_t raw;
    507 };
    508 template <size_t N>
    509 struct Tuple3<float32_t, N> {
    510  float32x2x3_t raw;
    511 };
    512 #if HWY_HAVE_FLOAT64
    513 template <>
    514 struct Tuple3<float64_t, 2> {
    515  float64x2x3_t raw;
    516 };
    517 template <size_t N>
    518 struct Tuple3<float64_t, N> {
    519  float64x1x3_t raw;
    520 };
    521 #endif  // HWY_HAVE_FLOAT64
    522 
        // ------------------------------ Tuple4 (quadruples of vectors)
    523 template <>
    524 struct Tuple4<uint8_t, 16> {
    525  uint8x16x4_t raw;
    526 };
    527 template <size_t N>
    528 struct Tuple4<uint8_t, N> {
    529  uint8x8x4_t raw;
    530 };
    531 template <>
    532 struct Tuple4<int8_t, 16> {
    533  int8x16x4_t raw;
    534 };
    535 template <size_t N>
    536 struct Tuple4<int8_t, N> {
    537  int8x8x4_t raw;
    538 };
    539 template <>
    540 struct Tuple4<uint16_t, 8> {
    541  uint16x8x4_t raw;
    542 };
    543 template <size_t N>
    544 struct Tuple4<uint16_t, N> {
    545  uint16x4x4_t raw;
    546 };
    547 template <>
    548 struct Tuple4<int16_t, 8> {
    549  int16x8x4_t raw;
    550 };
    551 template <size_t N>
    552 struct Tuple4<int16_t, N> {
    553  int16x4x4_t raw;
    554 };
    555 template <>
    556 struct Tuple4<uint32_t, 4> {
    557  uint32x4x4_t raw;
    558 };
    559 template <size_t N>
    560 struct Tuple4<uint32_t, N> {
    561  uint32x2x4_t raw;
    562 };
    563 template <>
    564 struct Tuple4<int32_t, 4> {
    565  int32x4x4_t raw;
    566 };
    567 template <size_t N>
    568 struct Tuple4<int32_t, N> {
    569  int32x2x4_t raw;
    570 };
    571 template <>
    572 struct Tuple4<uint64_t, 2> {
    573  uint64x2x4_t raw;
    574 };
    575 template <size_t N>
    576 struct Tuple4<uint64_t, N> {
    577  uint64x1x4_t raw;
    578 };
    579 template <>
    580 struct Tuple4<int64_t, 2> {
    581  int64x2x4_t raw;
    582 };
    583 template <size_t N>
    584 struct Tuple4<int64_t, N> {
    585  int64x1x4_t raw;
    586 };
    587 
    588 template <>
    589 struct Tuple4<float32_t, 4> {
    590  float32x4x4_t raw;
    591 };
    592 template <size_t N>
    593 struct Tuple4<float32_t, N> {
    594  float32x2x4_t raw;
    595 };
    596 #if HWY_HAVE_FLOAT64
    597 template <>
    598 struct Tuple4<float64_t, 2> {
    599  float64x2x4_t raw;
    600 };
    601 template <size_t N>
    602 struct Tuple4<float64_t, N> {
    603  float64x1x4_t raw;
    604 };
    605 #endif  // HWY_HAVE_FLOAT64
    606 
    607 template <typename T, size_t N>
        // Raw128<T, N>::type maps a (lane type, lane count) pair to the native
        // NEON register type. The exact-N specialization selects the 128-bit
        // q-register type; the templated fallback covers all smaller N with the
        // 64-bit d-register type (partial vectors still occupy a full d-reg).
    608 struct Raw128;
    609 
    610 template <>
    611 struct Raw128<uint8_t, 16> {
    612  using type = uint8x16_t;
    613 };
    614 template <size_t N>
    615 struct Raw128<uint8_t, N> {
    616  using type = uint8x8_t;
    617 };
    618 
    619 template <>
    620 struct Raw128<uint16_t, 8> {
    621  using type = uint16x8_t;
    622 };
    623 template <size_t N>
    624 struct Raw128<uint16_t, N> {
    625  using type = uint16x4_t;
    626 };
    627 
    628 template <>
    629 struct Raw128<uint32_t, 4> {
    630  using type = uint32x4_t;
    631 };
    632 template <size_t N>
    633 struct Raw128<uint32_t, N> {
    634  using type = uint32x2_t;
    635 };
    636 
        // 64-bit lanes only have N == 2 (full) or N == 1, so both are explicit.
    637 template <>
    638 struct Raw128<uint64_t, 2> {
    639  using type = uint64x2_t;
    640 };
    641 template <>
    642 struct Raw128<uint64_t, 1> {
    643  using type = uint64x1_t;
    644 };
    645 
    646 template <>
    647 struct Raw128<int8_t, 16> {
    648  using type = int8x16_t;
    649 };
    650 template <size_t N>
    651 struct Raw128<int8_t, N> {
    652  using type = int8x8_t;
    653 };
    654 
    655 template <>
    656 struct Raw128<int16_t, 8> {
    657  using type = int16x8_t;
    658 };
    659 template <size_t N>
    660 struct Raw128<int16_t, N> {
    661  using type = int16x4_t;
    662 };
    663 
    664 template <>
    665 struct Raw128<int32_t, 4> {
    666  using type = int32x4_t;
    667 };
    668 template <size_t N>
    669 struct Raw128<int32_t, N> {
    670  using type = int32x2_t;
    671 };
    672 
    673 template <>
    674 struct Raw128<int64_t, 2> {
    675  using type = int64x2_t;
    676 };
    677 template <>
    678 struct Raw128<int64_t, 1> {
    679  using type = int64x1_t;
    680 };
    681 
    682 template <>
    683 struct Raw128<float, 4> {
    684  using type = float32x4_t;
    685 };
    686 template <size_t N>
    687 struct Raw128<float, N> {
    688  using type = float32x2_t;
    689 };
    690 
    691 #if HWY_HAVE_FLOAT64
    692 template <>
    693 struct Raw128<double, 2> {
    694  using type = float64x2_t;
    695 };
    696 template <>
    697 struct Raw128<double, 1> {
    698  using type = float64x1_t;
    699 };
    700 #endif  // HWY_HAVE_FLOAT64
    701 
    702 #if HWY_NEON_HAVE_F16C
        // Native float16 vectors are available: define Tuple/Raw128 in terms of
        // the float16x8/float16x4 register types.
    703 
    704 template <>
    705 struct Tuple2<float16_t, 8> {
    706  float16x8x2_t raw;
    707 };
    708 template <size_t N>
    709 struct Tuple2<float16_t, N> {
    710  float16x4x2_t raw;
    711 };
    712 
    713 template <>
    714 struct Tuple3<float16_t, 8> {
    715  float16x8x3_t raw;
    716 };
    717 template <size_t N>
    718 struct Tuple3<float16_t, N> {
    719  float16x4x3_t raw;
    720 };
    721 
    722 template <>
    723 struct Tuple4<float16_t, 8> {
    724  float16x8x4_t raw;
    725 };
    726 template <size_t N>
    727 struct Tuple4<float16_t, N> {
    728  float16x4x4_t raw;
    729 };
    730 
    731 template <>
    732 struct Raw128<float16_t, 8> {
    733  using type = float16x8_t;
    734 };
    735 template <size_t N>
    736 struct Raw128<float16_t, N> {
    737  using type = float16x4_t;
    738 };
    739 
    740 #else  // !HWY_NEON_HAVE_F16C
    741 
        // No native f16 registers: float16 lanes reuse the uint16 register types
        // (same 16-bit width) by inheriting the uint16_t specializations.
    742 template <size_t N>
    743 struct Tuple2<float16_t, N> : public Tuple2<uint16_t, N> {};
    744 template <size_t N>
    745 struct Tuple3<float16_t, N> : public Tuple3<uint16_t, N> {};
    746 template <size_t N>
    747 struct Tuple4<float16_t, N> : public Tuple4<uint16_t, N> {};
    748 template <size_t N>
    749 struct Raw128<float16_t, N> : public Raw128<uint16_t, N> {};
    750 
    751 #endif  // HWY_NEON_HAVE_F16C
    752 
    753 #if HWY_NEON_HAVE_BFLOAT16
        // Native bfloat16 vectors: same pattern as float16 above.
    754 
    755 template <>
    756 struct Tuple2<bfloat16_t, 8> {
    757  bfloat16x8x2_t raw;
    758 };
    759 template <size_t N>
    760 struct Tuple2<bfloat16_t, N> {
    761  bfloat16x4x2_t raw;
    762 };
    763 
    764 template <>
    765 struct Tuple3<bfloat16_t, 8> {
    766  bfloat16x8x3_t raw;
    767 };
    768 template <size_t N>
    769 struct Tuple3<bfloat16_t, N> {
    770  bfloat16x4x3_t raw;
    771 };
    772 
    773 template <>
    774 struct Tuple4<bfloat16_t, 8> {
    775  bfloat16x8x4_t raw;
    776 };
    777 template <size_t N>
    778 struct Tuple4<bfloat16_t, N> {
    779  bfloat16x4x4_t raw;
    780 };
    781 
    782 template <>
    783 struct Raw128<bfloat16_t, 8> {
    784  using type = bfloat16x8_t;
    785 };
    786 template <size_t N>
    787 struct Raw128<bfloat16_t, N> {
    788  using type = bfloat16x4_t;
    789 };
    790 
    791 #else  // !HWY_NEON_HAVE_BFLOAT16
    792 
        // No native bf16 registers: bfloat16 lanes reuse the uint16 types.
    793 template <size_t N>
    794 struct Tuple2<bfloat16_t, N> : public Tuple2<uint16_t, N> {};
    795 template <size_t N>
    796 struct Tuple3<bfloat16_t, N> : public Tuple3<uint16_t, N> {};
    797 template <size_t N>
    798 struct Tuple4<bfloat16_t, N> : public Tuple4<uint16_t, N> {};
    799 template <size_t N>
    800 struct Raw128<bfloat16_t, N> : public Raw128<uint16_t, N> {};
    801 
    802 #endif  // HWY_NEON_HAVE_BFLOAT16
    803 
    804 }  // namespace detail
    805 
    806 template <typename T, size_t N = 16 / sizeof(T)>
        // Vector of N lanes of T, wrapping the native NEON register type chosen
        // by detail::Raw128<T, N>. N defaults to a full 128-bit vector.
    807 class Vec128 {
    808 public:
    809  using Raw = typename detail::Raw128<T, N>::type;
    810  using PrivateT = T;                     // only for DFromV
    811  static constexpr size_t kPrivateN = N;  // only for DFromV
    812 
        // Default constructor leaves raw uninitialized (matches native vectors).
    813  HWY_INLINE Vec128() {}
    814  Vec128(const Vec128&) = default;
    815  Vec128& operator=(const Vec128&) = default;
    816  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}
    817 
    818  // Compound assignment. Only usable if there is a corresponding non-member
    819  // binary operator overload. For example, only f32 and f64 support division.
    820  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    821    return *this = (*this * other);
    822  }
    823  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    824    return *this = (*this / other);
    825  }
    826  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    827    return *this = (*this + other);
    828  }
    829  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    830    return *this = (*this - other);
    831  }
    832  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    833    return *this = (*this % other);
    834  }
    835  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    836    return *this = (*this & other);
    837  }
    838  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    839    return *this = (*this | other);
    840  }
    841  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    842    return *this = (*this ^ other);
    843  }
    844 
        // Native register; public because the op definitions access .raw directly.
    845  Raw raw;
    846 };
    847 
// Aliases for vectors whose total size is 8, 4 or 2 bytes, respectively.
template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;
    856 
// Mask type: each lane is either all-ones (FF..FF) or all-zero. Stored as the
// unsigned integer vector of the same lane size as T.
template <typename T, size_t N = 16 / sizeof(T)>
class Mask128 {
 public:
  // Arm C Language Extensions return and expect unsigned type.
  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM

  // Default ctor leaves raw uninitialized.
  HWY_INLINE Mask128() {}
  Mask128(const Mask128&) = default;
  Mask128& operator=(const Mask128&) = default;
  HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}

  Raw raw;
};

// Alias for masks covering 8 bytes of lanes.
template <typename T>
using Mask64 = Mask128<T, 8 / sizeof(T)>;
    877 
// Tag type (Simd<>) corresponding to a vector type V.
template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

// Tag type (Simd<>) corresponding to a mask type M.
template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

// Lane type of a vector type V.
template <class V>
using TFromV = typename V::PrivateT;

// TODO(janwas): ForDemoteVectors, in convert_test and demote_test, appear to
// instantiate this with D = double x 4. The cause is unknown. Previously,
// defining this in terms of Set rejected that via SFINAE because only
// V_SIZE = 16 and V_SIZE <= 8 overloads were defined. As a workaround,
// truncate the lane count to 128 bits.
template <class D>
using VFromD =
    Vec128<TFromD<D>, HWY_MIN(16 / sizeof(TFromD<D>), MaxLanes(D()))>;
    895 
    896 // ------------------------------ BitCast
    897 
    898 namespace detail {
    899 
// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
// vreinterpret*_u8_*() set of functions.
#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
  Vec128<uint8_t, size * sizeof(type##_t)>
#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw

// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
  return v;
}

// Macro-generated overloads for the remaining lane types; each expands to a
// call of the matching vreinterpret{q}_u8_* intrinsic.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
                                 HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_BFLOAT_16(BitCastToByte, vreinterpret, _u8_,
                                HWY_CAST_TO_U8)

HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)

#if !HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_F16C
HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
                                             HWY_CAST_TO_U8)
#else
// Emulated f16: lanes are stored as u16 bits, so reuse the u16 overload.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
}
#endif  // HWY_NEON_HAVE_F16C
#endif  // !HWY_HAVE_FLOAT16

#if !HWY_NEON_HAVE_BFLOAT16
// Emulated bf16: lanes are stored as u16 bits, so reuse the u16 overload.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
}
#endif  // !HWY_NEON_HAVE_BFLOAT16

#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8
    947 
// Converts a byte vector back to the lane type requested by tag D. The u8
// case is the identity; all others are a single vreinterpret{q}.
template <class D, HWY_IF_U8_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, VFromD<D> v) {
  return v;
}

// 64-bit or less:

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<RebindToUnsigned<D>> v) {
  return VFromD<D>(vreinterpret_s8_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_u16_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_s16_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_u32_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_s32_u8(v.raw));
}

// 64-bit lanes only exist in full 64-bit (or larger) vectors.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
HWY_INLINE Vec64<uint64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
}

// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
  return VFromD<D>(vreinterpret_f16_u8(v.raw));
#else
  // Emulated f16: the raw type is the same as u16, so reuse the u16 path.
  const RebindToUnsigned<D> du;
  return VFromD<D>(BitCastFromByte(du, v).raw);
#endif
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
#if HWY_NEON_HAVE_BFLOAT16
  return VFromD<D>(vreinterpret_bf16_u8(v.raw));
#else
  // Emulated bf16: the raw type is the same as u16, so reuse the u16 path.
  const RebindToUnsigned<D> du;
  return VFromD<D>(BitCastFromByte(du, v).raw);
#endif
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_f32_u8(v.raw));
}

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)>
HWY_INLINE Vec64<double> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<double>(vreinterpret_f64_u8(v.raw));
}
#endif  // HWY_HAVE_FLOAT64
   1023 
// 128-bit full: same conversions as above but with the q-suffixed
// (quadword) intrinsics.

template <class D, HWY_IF_I8_D(D)>
HWY_INLINE Vec128<int8_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_INLINE Vec128<uint16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_INLINE Vec128<int16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_INLINE Vec128<uint32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_INLINE Vec128<int32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE Vec128<uint64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
}

template <class D, HWY_IF_F32_D(D)>
HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
}

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
template <class D, HWY_IF_F16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
  return VFromD<D>(vreinterpretq_f16_u8(v.raw));
#else
  // Emulated f16: the raw type is the same as u16, so reuse the u16 path.
  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
#endif
}

template <class D, HWY_IF_BF16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
#if HWY_NEON_HAVE_BFLOAT16
  return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
#else
  // Emulated bf16: the raw type is the same as u16, so reuse the u16 path.
  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
#endif
}
   1085 
   1086 }  // namespace detail
   1087 
   1088 template <class D, class FromT>
   1089 HWY_API VFromD<D> BitCast(D d,
   1090                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
   1091  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
   1092 }
   1093 
// ------------------------------ ResizeBitCast

// BitCast between vectors of possibly different total size: excess bytes are
// dropped, missing bytes are zero.

// <= 8 byte vector to <= 8 byte vector
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToByte(v).raw});
}

// 16-byte vector to 16-byte vector: same as BitCast
template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  return BitCast(d, v);
}

// 16-byte vector to <= 8-byte vector: truncate to the lower half first.
template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const DFromV<decltype(v)> d_from;
  const Half<decltype(d_from)> dh_from;
  return ResizeBitCast(d, LowerHalf(dh_from, v));
}

// <= 8-byte vector to 16-byte vector: zero-extend the upper half.
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Full64<TFromV<FromV>> d_full64_from;
  const Full128<TFromV<FromV>> d_full128_from;
  return BitCast(d, Combine(d_full128_from, Zero(d_full64_from),
                            ResizeBitCast(d_full64_from, v)));
}
   1129 
   1130 // ------------------------------ Set
   1131 
namespace detail {
// We want to route any combination of N/kPow2 to the intrinsics depending on
// whether the requested size is <= 64 bits or 128. HWY_NEON_BUILD_TPL is
// unconditional and currently does not accept inputs (such as whether the
// vector is 64 or 128-bit). Thus we are not able to use HWY_IF_V_SIZE_D for
// SFINAE. We instead define a private NativeSet which receives a Simd<> whose
// kPow2 has already been folded into its N.
#define HWY_NEON_BUILD_TPL_HWY_SET
#define HWY_NEON_BUILD_RET_HWY_SET(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_SET(type, size) \
  Simd<type##_t, size, 0> /* tag */, type##_t t
#define HWY_NEON_BUILD_ARG_HWY_SET t

// Broadcasts t to all lanes via vdup{q}_n_*.
HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET)
#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C && HWY_HAVE_SCALAR_F16_TYPE
HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET)
#endif
HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)

#if !HWY_NEON_HAVE_F16C || !HWY_HAVE_SCALAR_F16_TYPE
// Emulated f16: broadcast the 16-bit pattern and bit-cast back.
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> NativeSet(D d, TFromD<D> t) {
  const uint16_t tu = BitCastScalar<uint16_t>(t);
  return BitCast(d, Set(RebindToUnsigned<D>(), tu));
}
#endif

#if !HWY_NEON_HAVE_BFLOAT16
// Emulated bf16: broadcast the 16-bit pattern and bit-cast back.
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> NativeSet(D d, TFromD<D> t) {
  const uint16_t tu = BitCastScalar<uint16_t>(t);
  return BitCast(d, Set(RebindToUnsigned<D>(), tu));
}
#endif

#undef HWY_NEON_BUILD_TPL_HWY_SET
#undef HWY_NEON_BUILD_RET_HWY_SET
#undef HWY_NEON_BUILD_PARAM_HWY_SET
#undef HWY_NEON_BUILD_ARG_HWY_SET

}  // namespace detail
   1173 
// Full vector.
// Do not use a typename T = TFromD<D> argument because T will be deduced from
// the actual argument type, which can differ from TFromD<D>.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T>
HWY_INLINE VFromD<D> Set(D /* tag */, T t) {
  return detail::NativeSet(Full128<TFromD<D>>(), static_cast<TFromD<D>>(t));
}

// Partial vector: create 64-bit and return wrapper.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T>
HWY_API VFromD<D> Set(D /* tag */, T t) {
  const Full64<TFromD<D>> dfull;
  return VFromD<D>(detail::NativeSet(dfull, static_cast<TFromD<D>>(t)).raw);
}
   1188 
   1189 template <class D>
   1190 HWY_API VFromD<D> Zero(D d) {
   1191  // Default ctor also works for bfloat16_t and float16_t.
   1192  return Set(d, TFromD<D>{});
   1193 }
   1194 
// Silence warnings about the deliberately-uninitialized vector below.
HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif

// Returns a vector with unspecified lane values (cheaper than Zero when the
// contents will be overwritten anyway).
template <class D>
HWY_API VFromD<D> Undefined(D /*tag*/) {
#if HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
  // Tells the compiler the value is intentionally indeterminate, which avoids
  // uninitialized-read diagnostics/sanitizer reports.
  return VFromD<D>{__builtin_nondeterministic_value(Zero(D()).raw)};
#else
  VFromD<D> v;
  return v;
#endif
}

HWY_DIAGNOSTICS(pop)
   1212 
#if !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
namespace detail {

#pragma pack(push, 1)

// 8-byte, 8-aligned array of T lanes; used on compilers without GCC vector
// extensions to build a u64 bit pattern for Dup128VecFromValues via
// BitCastScalar.
template <class T>
struct alignas(8) Vec64ValsWrapper {
  static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
  static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
  T vals[8 / sizeof(T)];
};

#pragma pack(pop)

}  // namespace detail
#endif  // !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
   1229 
// Builds a vector from per-lane values; for partial (<= 8-byte) u8/i8
// vectors only the first 8 values are used.
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> /*t8*/, TFromD<D> /*t9*/,
                                      TFromD<D> /*t10*/, TFromD<D> /*t11*/,
                                      TFromD<D> /*t12*/, TFromD<D> /*t13*/,
                                      TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  // GCC/Clang: initialize a native vector-extension type directly.
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccI8RawVectType raw = {
      static_cast<int8_t>(t0), static_cast<int8_t>(t1), static_cast<int8_t>(t2),
      static_cast<int8_t>(t3), static_cast<int8_t>(t4), static_cast<int8_t>(t5),
      static_cast<int8_t>(t6), static_cast<int8_t>(t7)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  // Other compilers: pack the lanes into a u64 bit pattern and broadcast.
  return ResizeBitCast(
      d, Set(Full64<uint64_t>(),
             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
                 {t0, t1, t2, t3, t4, t5, t6, t7}})));
#endif
}
   1253 
// Partial (<= 8-byte) u16/i16 vectors: only the first 4 values are used.
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3,
                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccI16RawVectType raw = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(
      d, Set(Full64<uint64_t>(),
             BitCastScalar<uint64_t>(
                 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}})));
#endif
}

// Partial (<= 8-byte) u32/i32 vectors: only the first 2 values are used.
template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccI32RawVectType raw = {static_cast<int32_t>(t0),
                                 static_cast<int32_t>(t1)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(d,
                       Set(Full64<uint64_t>(),
                           BitCastScalar<uint64_t>(
                               detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
#endif
}
   1290 
// Partial (<= 8-byte) f32 vectors: only the first 2 values are used.
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccF32RawVectType raw = {t0, t1};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(d,
                       Set(Full64<uint64_t>(),
                           BitCastScalar<uint64_t>(
                               detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
#endif
}

// 8-byte vector of one 8-byte lane: only t0 is used.
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
  return Set(d, t0);
}
   1311 
// Full 16-byte u8/i8 vector from 16 lane values.
template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI8RawVectType raw = {
      static_cast<int8_t>(t0),  static_cast<int8_t>(t1),
      static_cast<int8_t>(t2),  static_cast<int8_t>(t3),
      static_cast<int8_t>(t4),  static_cast<int8_t>(t5),
      static_cast<int8_t>(t6),  static_cast<int8_t>(t7),
      static_cast<int8_t>(t8),  static_cast<int8_t>(t9),
      static_cast<int8_t>(t10), static_cast<int8_t>(t11),
      static_cast<int8_t>(t12), static_cast<int8_t>(t13),
      static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  // Build the two 64-bit halves separately and combine; the half overload
  // ignores its last 8 arguments.
  const Half<decltype(d)> dh;
  return Combine(d,
                 Dup128VecFromValues(dh, t8, t9, t10, t11, t12, t13, t14, t15,
                                     t8, t9, t10, t11, t12, t13, t14, t15),
                 Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t0, t1,
                                     t2, t3, t4, t5, t6, t7));
#endif
}
   1342 
// Full 16-byte u16/i16 vector from 8 lane values.
template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI16RawVectType raw = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
      static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  // Build halves and combine; the half overload ignores its last 4 arguments.
  const Half<decltype(d)> dh;
  return Combine(d, Dup128VecFromValues(dh, t4, t5, t6, t7, t4, t5, t6, t7),
                 Dup128VecFromValues(dh, t0, t1, t2, t3, t0, t1, t2, t3));
#endif
}

// Full 16-byte u32/i32 vector from 4 lane values.
template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI32RawVectType raw = {
      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
      static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
                 Dup128VecFromValues(dh, t0, t1, t0, t1));
#endif
}

// Full 16-byte f32 vector from 4 lane values.
template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccF32RawVectType raw = {t0, t1, t2, t3};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
                 Dup128VecFromValues(dh, t0, t1, t0, t1));
#endif
}
   1395 
// Full 16-byte u64/i64 vector from 2 lane values.
template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI64RawVectType raw = {static_cast<int64_t>(t0),
                                 static_cast<int64_t>(t1)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Set(dh, t1), Set(dh, t0));
#endif
}

#if HWY_HAVE_FLOAT64
// Full 16-byte f64 vector from 2 lane values.
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccF64RawVectType raw = {t0, t1};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Set(dh, t1), Set(dh, t0));
#endif
}
#endif
   1424 
// Generic for all vector lengths. bf16 lanes are transferred via their i16
// bit pattern since there is no native initializer for bf16.
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}
   1439 
#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C && \
    HWY_HAVE_SCALAR_F16_TYPE
// Native f16: initialize a GCC __fp16 vector directly.
// Partial (<= 8-byte) vectors: only the first 4 values are used.
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3,
                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccF16RawVectType raw = {
      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
      static_cast<__fp16>(t3)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
}
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccF16RawVectType raw = {
      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
      static_cast<__fp16>(t3), static_cast<__fp16>(t4), static_cast<__fp16>(t5),
      static_cast<__fp16>(t6), static_cast<__fp16>(t7)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
}
#else
// Generic for all vector lengths if MSVC or !HWY_NEON_HAVE_F16C:
// transfer f16 lanes via their i16 bit pattern.
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}
#endif  // (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
   1483 
namespace detail {

// Iota0(d) returns a vector with lane i equal to i (0, 1, 2, ...).

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(
      d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, TFromD<D>{3}, TFromD<D>{4},
      TFromD<D>{5}, TFromD<D>{6}, TFromD<D>{7}, TFromD<D>{8}, TFromD<D>{9},
      TFromD<D>{10}, TFromD<D>{11}, TFromD<D>{12}, TFromD<D>{13}, TFromD<D>{14},
      TFromD<D>{15});
}

template <class D, HWY_IF_UI16_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
                             TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5},
                             TFromD<D>{6}, TFromD<D>{7});
}

template <class D, HWY_IF_F16_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  // The constants are the IEEE binary16 bit patterns of 0.0 .. 7.0; set them
  // as u16 to avoid requiring native f16 arithmetic/conversions.
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Dup128VecFromValues(du, uint16_t{0}, uint16_t{0x3C00},
                                        uint16_t{0x4000}, uint16_t{0x4200},
                                        uint16_t{0x4400}, uint16_t{0x4500},
                                        uint16_t{0x4600}, uint16_t{0x4700}));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
                             TFromD<D>{3});
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1});
}

#if HWY_COMPILER_MSVC
// MSVC workaround: zero out lanes beyond the vector's logical size for <= 4
// byte vectors, so unused upper lanes hold deterministic values.
template <class V, HWY_IF_V_SIZE_LE_V(V, 4)>
static HWY_INLINE V MaskOutIota(V v) {
  constexpr size_t kVecSizeInBytes = HWY_MAX_LANES_V(V) * sizeof(TFromV<V>);
  constexpr uint64_t kU64MaskOutMask =
      hwy::LimitsMax<hwy::UnsignedFromSize<kVecSizeInBytes>>();

  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  const auto mask_out_mask =
      BitCast(d, VU8(vreinterpret_u8_u64(vdup_n_u64(kU64MaskOutMask))));
  return v & mask_out_mask;
}
template <class V, HWY_IF_V_SIZE_GT_V(V, 4)>
static HWY_INLINE V MaskOutIota(V v) {
  return v;
}
#endif

}  // namespace detail
   1543 
// Returns a vector with lane i equal to `first + i` (converted to the lane
// type of D). `first` may be any arithmetic type; it is static_cast.
template <class D, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  const auto result_iota =
      detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
#if HWY_COMPILER_MSVC
  // See detail::MaskOutIota: clears bytes past the logical vector size.
  return detail::MaskOutIota(result_iota);
#else
  return result_iota;
#endif
}
   1554 
// ------------------------------ Combine

// Concatenates two 64-bit half vectors into one 128-bit vector, with `lo` in
// the lower half and `hi` in the upper half. Note the NEON vcombine intrinsic
// takes its arguments as (low, high), i.e. reversed relative to Highway's
// (hi, lo) parameter order.

// Full result
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> Combine(D /* tag */, Vec64<uint8_t> hi,
                                Vec64<uint8_t> lo) {
  return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> Combine(D /* tag */, Vec64<uint16_t> hi,
                                 Vec64<uint16_t> lo) {
  return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> Combine(D /* tag */, Vec64<uint32_t> hi,
                                 Vec64<uint32_t> lo) {
  return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> Combine(D /* tag */, Vec64<uint64_t> hi,
                                 Vec64<uint64_t> lo) {
  return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
}

template <class D, HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> Combine(D /* tag */, Vec64<int8_t> hi,
                               Vec64<int8_t> lo) {
  return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> Combine(D /* tag */, Vec64<int16_t> hi,
                                Vec64<int16_t> lo) {
  return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> Combine(D /* tag */, Vec64<int32_t> hi,
                                Vec64<int32_t> lo) {
  return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> Combine(D /* tag */, Vec64<int64_t> hi,
                                Vec64<int64_t> lo) {
  return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
}

#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> Combine(D, Vec64<float16_t> hi, Vec64<float16_t> lo) {
  return Vec128<float16_t>(vcombine_f16(lo.raw, hi.raw));
}
#endif  // HWY_HAVE_FLOAT16

#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> Combine(D, Vec64<bfloat16_t> hi, Vec64<bfloat16_t> lo) {
  return VFromD<D>(vcombine_bf16(lo.raw, hi.raw));
}
#endif  // HWY_NEON_HAVE_BFLOAT16

// Emulated special-float types (f16/bf16 without native support): combine via
// the same-sized unsigned representation, then BitCast back.
template <class D, class DH = Half<D>, HWY_NEON_IF_EMULATED_D(D)>
HWY_API VFromD<D> Combine(D d, VFromD<DH> hi, VFromD<DH> lo) {
  const RebindToUnsigned<D> du;
  const Half<decltype(du)> duh;
  return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
}

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> Combine(D /* tag */, Vec64<float> hi, Vec64<float> lo) {
  return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> Combine(D /* tag */, Vec64<double> hi,
                               Vec64<double> lo) {
  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
}
#endif  // HWY_HAVE_FLOAT64
   1632 
   1633 // ------------------------------ GetLane
   1634 
namespace detail {
// Building blocks for macro-generated GetLane<kLane>(v): expands to
// vget(q)_lane_<suffix>(v.raw, kLane), which returns the scalar in lane
// kLane. kLane must be a compile-time constant (template parameter).
#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane

HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)
HWY_NEON_DEF_FUNCTION_BFLOAT_16(GetLane, vget, _lane_, HWY_GET)

// Emulated special-float types: extract the lane from the same-sized unsigned
// vector and bit-cast the scalar back to the lane type.
template <size_t kLane, class V, HWY_NEON_IF_EMULATED_D(DFromV<V>)>
static HWY_INLINE HWY_MAYBE_UNUSED TFromV<V> GetLane(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCastScalar<TFromV<V>>(GetLane<kLane>(BitCast(du, v)));
}

#undef HWY_NEON_BUILD_TPL_HWY_GET
#undef HWY_NEON_BUILD_RET_HWY_GET
#undef HWY_NEON_BUILD_PARAM_HWY_GET
#undef HWY_NEON_BUILD_ARG_HWY_GET

}  // namespace detail
   1657 
// Returns the first lane (lane 0) of v as a scalar.
template <class V>
HWY_API TFromV<V> GetLane(const V v) {
  return detail::GetLane<0>(v);
}
   1662 
// ------------------------------ ExtractLane

// Returns lane i of v as a scalar; i need not be a compile-time constant.
// When the compiler can prove i is constant, a single-instruction lane
// extract is used; otherwise the vector is spilled to an aligned buffer.
// Requires one overload per vector length because GetLane<3> is a compile
// error if v is a uint32x2_t.

// 1 lane: the only valid index is 0.
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) {
  HWY_DASSERT(i == 0);
  (void)i;
  return detail::GetLane<0>(v);
}

// 2 lanes.
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
    }
  }
#endif
  // Fallback for a runtime-variable index: store, then load the scalar.
  alignas(16) T lanes[2];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

// 4 lanes.
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
    }
  }
#endif
  alignas(16) T lanes[4];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

// 8 lanes.
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
      case 4:
        return detail::GetLane<4>(v);
      case 5:
        return detail::GetLane<5>(v);
      case 6:
        return detail::GetLane<6>(v);
      case 7:
        return detail::GetLane<7>(v);
    }
  }
#endif
  alignas(16) T lanes[8];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}

// 16 lanes.
template <typename T>
HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::GetLane<0>(v);
      case 1:
        return detail::GetLane<1>(v);
      case 2:
        return detail::GetLane<2>(v);
      case 3:
        return detail::GetLane<3>(v);
      case 4:
        return detail::GetLane<4>(v);
      case 5:
        return detail::GetLane<5>(v);
      case 6:
        return detail::GetLane<6>(v);
      case 7:
        return detail::GetLane<7>(v);
      case 8:
        return detail::GetLane<8>(v);
      case 9:
        return detail::GetLane<9>(v);
      case 10:
        return detail::GetLane<10>(v);
      case 11:
        return detail::GetLane<11>(v);
      case 12:
        return detail::GetLane<12>(v);
      case 13:
        return detail::GetLane<13>(v);
      case 14:
        return detail::GetLane<14>(v);
      case 15:
        return detail::GetLane<15>(v);
    }
  }
#endif
  alignas(16) T lanes[16];
  Store(v, DFromV<decltype(v)>(), lanes);
  return lanes[i];
}
   1785 
// ------------------------------ InsertLane

namespace detail {
// Building blocks for macro-generated InsertLane<kLane>(v, t): expands to
// vset(q)_lane_<suffix>(t, v.raw, kLane), which returns v with lane kLane
// replaced by t. kLane must be a compile-time constant (template parameter).
#define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane>
#define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \
  Vec128<type##_t, size> v, type##_t t
#define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane

HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT)
HWY_NEON_DEF_FUNCTION_BFLOAT_16(InsertLane, vset, _lane_, HWY_INSERT)

#undef HWY_NEON_BUILD_TPL_HWY_INSERT
#undef HWY_NEON_BUILD_RET_HWY_INSERT
#undef HWY_NEON_BUILD_PARAM_HWY_INSERT
#undef HWY_NEON_BUILD_ARG_HWY_INSERT

// Emulated 16-bit special-float types: insert via the uint16_t representation
// of both the vector and the scalar.
template <size_t kLane, class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)>
HWY_API V InsertLane(const V v, TFromD<D> t) {
  const D d;
  const RebindToUnsigned<D> du;
  const uint16_t tu = BitCastScalar<uint16_t>(t);
  return BitCast(d, InsertLane<kLane>(BitCast(du, v), tu));
}

}  // namespace detail
   1812 
// Returns v with lane i replaced by t; i need not be a compile-time constant.
// When the compiler can prove i is constant, a single-instruction lane insert
// is used; otherwise the vector takes a round trip through an aligned buffer.
// Requires one overload per vector length because InsertLane<3> may be a
// compile error.

// 1 lane: replacing the only lane is equivalent to Set.
template <typename T>
HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) {
  HWY_DASSERT(i == 0);
  (void)i;
  return Set(DFromV<decltype(v)>(), t);
}

// 2 lanes.
template <typename T>
HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
    }
  }
#endif
  // Fallback for a runtime-variable index: store, patch, reload.
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[2];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// 4 lanes.
template <typename T>
HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[4];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// 8 lanes.
template <typename T>
HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[8];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}

// 16 lanes.
template <typename T>
HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(i)) {
    switch (i) {
      case 0:
        return detail::InsertLane<0>(v, t);
      case 1:
        return detail::InsertLane<1>(v, t);
      case 2:
        return detail::InsertLane<2>(v, t);
      case 3:
        return detail::InsertLane<3>(v, t);
      case 4:
        return detail::InsertLane<4>(v, t);
      case 5:
        return detail::InsertLane<5>(v, t);
      case 6:
        return detail::InsertLane<6>(v, t);
      case 7:
        return detail::InsertLane<7>(v, t);
      case 8:
        return detail::InsertLane<8>(v, t);
      case 9:
        return detail::InsertLane<9>(v, t);
      case 10:
        return detail::InsertLane<10>(v, t);
      case 11:
        return detail::InsertLane<11>(v, t);
      case 12:
        return detail::InsertLane<12>(v, t);
      case 13:
        return detail::InsertLane<13>(v, t);
      case 14:
        return detail::InsertLane<14>(v, t);
      case 15:
        return detail::InsertLane<15>(v, t);
    }
  }
#endif
  const DFromV<decltype(v)> d;
  alignas(16) T lanes[16];
  Store(v, d, lanes);
  lanes[i] = t;
  return Load(d, lanes);
}
   1942 
// ================================================== ARITHMETIC

// ------------------------------ Addition
HWY_NEON_DEF_FUNCTION_UINTS(operator+, vadd, _, 2)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator+, vadd, _, 2)

// Signed integer additions are carried out in the unsigned domain (identical
// bit pattern in two's complement), where wraparound is well-defined.
template <size_t N>
HWY_API Vec128<int8_t, N> operator+(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) + BitCast(du, b));
}

template <size_t N>
HWY_API Vec128<int16_t, N> operator+(Vec128<int16_t, N> a,
                                     Vec128<int16_t, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) + BitCast(du, b));
}

template <size_t N>
HWY_API Vec128<int32_t, N> operator+(Vec128<int32_t, N> a,
                                     Vec128<int32_t, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) + BitCast(du, b));
}

template <size_t N>
HWY_API Vec128<int64_t, N> operator+(Vec128<int64_t, N> a,
                                     Vec128<int64_t, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) + BitCast(du, b));
}
   1979 
// ------------------------------ Subtraction
HWY_NEON_DEF_FUNCTION_UINTS(operator-, vsub, _, 2)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator-, vsub, _, 2)

// As with operator+, signed subtraction is performed in the unsigned domain
// so that wraparound is well-defined.
template <size_t N>
HWY_API Vec128<int8_t, N> operator-(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) - BitCast(du, b));
}

template <size_t N>
HWY_API Vec128<int16_t, N> operator-(Vec128<int16_t, N> a,
                                     Vec128<int16_t, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) - BitCast(du, b));
}

template <size_t N>
HWY_API Vec128<int32_t, N> operator-(Vec128<int32_t, N> a,
                                     Vec128<int32_t, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) - BitCast(du, b));
}

template <size_t N>
HWY_API Vec128<int64_t, N> operator-(Vec128<int64_t, N> a,
                                     Vec128<int64_t, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) - BitCast(du, b));
}
   2014 
// ------------------------------ SumsOf8

// Sums each group of 8 consecutive byte lanes into one 64-bit lane, via three
// pairwise widening adds (8->16, 16->32, 32->64 bits).
HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) {
  return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw))));
}
HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) {
  return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw))));
}
HWY_API Vec128<int64_t> SumsOf8(const Vec128<int8_t> v) {
  return Vec128<int64_t>(vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(v.raw))));
}
HWY_API Vec64<int64_t> SumsOf8(const Vec64<int8_t> v) {
  return Vec64<int64_t>(vpaddl_s32(vpaddl_s16(vpaddl_s8(v.raw))));
}
   2029 
// ------------------------------ SumsOf2
namespace detail {

// Pairwise widening add (vpaddl): sums each pair of adjacent lanes into one
// lane of twice the width. Dispatched on signedness tag, lane-size tag and
// vector size (<= 8 bytes uses the 64-bit intrinsic, 16 bytes the "q" form).

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s8(v.raw));
}

template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s8(v.raw));
}

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u8(v.raw));
}

template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u8(v.raw));
}

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s16(v.raw));
}

template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s16(v.raw));
}

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u16(v.raw));
}

template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u16(v.raw));
}

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s32(v.raw));
}

template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s32(v.raw));
}

template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u32(v.raw));
}

template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2(
    hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) {
  return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u32(v.raw));
}

}  // namespace detail
   2106 
// ------------------------------ SaturatedAdd

// NEON provides saturated add/sub for all integer lane sizes including 32 and
// 64 bits; toggle the HWY_NATIVE_* feature macros to advertise this so the
// generic fallback implementations are not emitted.
#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
#undef HWY_NATIVE_I32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
#undef HWY_NATIVE_U32_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U32_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
#undef HWY_NATIVE_I64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_I64_SATURATED_ADDSUB
#endif

#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
#undef HWY_NATIVE_U64_SATURATED_ADDSUB
#else
#define HWY_NATIVE_U64_SATURATED_ADDSUB
#endif

// Returns a + b clamped to the destination range.
HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedAdd, vqadd, _, 2)

// ------------------------------ SaturatedSub

// Returns a - b clamped to the destination range.
HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedSub, vqsub, _, 2)

// ------------------------------ Average

// Returns (a + b + 1) / 2

#ifdef HWY_NATIVE_AVERAGE_ROUND_UI32
#undef HWY_NATIVE_AVERAGE_ROUND_UI32
#else
#define HWY_NATIVE_AVERAGE_ROUND_UI32
#endif

// vrhadd: rounding halving add, i.e. (a + b + 1) >> 1 without overflow.
HWY_NEON_DEF_FUNCTION_UI_8_16_32(AverageRound, vrhadd, _, 2)
   2152 
// ------------------------------ Neg

HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1)  // i64 implemented below

#if !HWY_HAVE_FLOAT16
// Without native f16 support, negate by XOR-ing the sign bit of the
// underlying u16 representation.
template <size_t N>
HWY_API Vec128<float16_t, N> Neg(const Vec128<float16_t, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
}
#endif  // !HWY_HAVE_FLOAT16

// There is no vneg for bf16, but we can cast to f16 (emulated or native).
template <size_t N>
HWY_API Vec128<bfloat16_t, N> Neg(const Vec128<bfloat16_t, N> v) {
  const DFromV<decltype(v)> d;
  const Rebind<float16_t, decltype(d)> df16;
  return BitCast(d, Neg(BitCast(df16, v)));
}

// i64 negate: vneg_s64 is A64-only; Armv7 subtracts from zero instead.
HWY_API Vec64<int64_t> Neg(const Vec64<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec64<int64_t>(vneg_s64(v.raw));
#else
  return Zero(DFromV<decltype(v)>()) - v;
#endif
}

HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec128<int64_t>(vnegq_s64(v.raw));
#else
  return Zero(DFromV<decltype(v)>()) - v;
#endif
}
   2191 
// ------------------------------ SaturatedNeg
// Saturating negation (vqneg): clamps instead of wrapping, so the most
// negative value maps to the most positive.
#ifdef HWY_NATIVE_SATURATED_NEG_8_16_32
#undef HWY_NATIVE_SATURATED_NEG_8_16_32
#else
#define HWY_NATIVE_SATURATED_NEG_8_16_32
#endif

HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedNeg, vqneg, _, 1)

#if HWY_ARCH_ARM_A64
#ifdef HWY_NATIVE_SATURATED_NEG_64
#undef HWY_NATIVE_SATURATED_NEG_64
#else
#define HWY_NATIVE_SATURATED_NEG_64
#endif

// The 64-bit forms (vqneg(q)_s64) are only available on A64.
HWY_API Vec64<int64_t> SaturatedNeg(const Vec64<int64_t> v) {
  return Vec64<int64_t>(vqneg_s64(v.raw));
}

HWY_API Vec128<int64_t> SaturatedNeg(const Vec128<int64_t> v) {
  return Vec128<int64_t>(vqnegq_s64(v.raw));
}
#endif
   2216 
// ------------------------------ ShiftLeft

#ifdef HWY_NATIVE_ROUNDING_SHR
#undef HWY_NATIVE_ROUNDING_SHR
#else
#define HWY_NATIVE_ROUNDING_SHR
#endif

// Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported).
// The NEON immediate-shift intrinsics require a count of at least 1, so
// kBits == 0 simply returns v; HWY_MAX(1, kBits) keeps the immediate of the
// (then unreachable) intrinsic call within its valid range so that the
// instantiation still compiles.
#pragma push_macro("HWY_NEON_DEF_FUNCTION")
#undef HWY_NEON_DEF_FUNCTION
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args)   \
  template <int kBits>                                                         \
  HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) {        \
    return kBits == 0 ? v                                                      \
                      : Vec128<type##_t, size>(HWY_NEON_EVAL(                  \
                            prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \
  }

HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored)

HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored)
HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored)
// vrshr rounds the result (adds 1 << (kBits - 1) before shifting).
HWY_NEON_DEF_FUNCTION_UINTS(RoundingShiftRight, vrshr, _n_, ignored)
HWY_NEON_DEF_FUNCTION_INTS(RoundingShiftRight, vrshr, _n_, ignored)

#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
   2244 
// ------------------------------ RotateRight (ShiftRight, Or)
// Rotates each lane right by kBits, implemented as (v >> kBits) | (v <<
// (bits - kBits)). The right shift is done in the unsigned domain so it is a
// logical (not arithmetic) shift.
template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)>
HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;

  constexpr size_t kSizeInBits = sizeof(T) * 8;
  static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
  if (kBits == 0) return v;

  // HWY_MIN clamps the left-shift count: when kBits == 0 (already returned
  // above), kSizeInBits - kBits would be an invalid full-width shift, but the
  // expression must still compile for that instantiation.
  return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
            ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
}
   2258 
   2259 // NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a
   2260 // mechanism for checking for extensions to Armv8.
   2261 
// ------------------------------ Shl

// Per-lane variable left shift. NEON's vshl takes a *signed* shift-count
// vector, so unsigned count vectors are reinterpreted as signed bits.
HWY_API Vec128<uint8_t> operator<<(Vec128<uint8_t> v, Vec128<uint8_t> bits) {
  return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
HWY_API Vec128<uint8_t, N> operator<<(Vec128<uint8_t, N> v,
                                      Vec128<uint8_t, N> bits) {
  return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw)));
}

HWY_API Vec128<uint16_t> operator<<(Vec128<uint16_t> v, Vec128<uint16_t> bits) {
  return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<uint16_t, N> operator<<(Vec128<uint16_t, N> v,
                                       Vec128<uint16_t, N> bits) {
  return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw)));
}

HWY_API Vec128<uint32_t> operator<<(Vec128<uint32_t> v, Vec128<uint32_t> bits) {
  return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint32_t, N> operator<<(Vec128<uint32_t, N> v,
                                       Vec128<uint32_t, N> bits) {
  return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw)));
}

HWY_API Vec128<uint64_t> operator<<(Vec128<uint64_t> v, Vec128<uint64_t> bits) {
  return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw)));
}
HWY_API Vec64<uint64_t> operator<<(Vec64<uint64_t> v, Vec64<uint64_t> bits) {
  return Vec64<uint64_t>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw)));
}

// Signed counts already match vshl's expected type; no reinterpret needed.
HWY_API Vec128<int8_t> operator<<(Vec128<int8_t> v, Vec128<int8_t> bits) {
  return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
HWY_API Vec128<int8_t, N> operator<<(Vec128<int8_t, N> v,
                                     Vec128<int8_t, N> bits) {
  return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw));
}

HWY_API Vec128<int16_t> operator<<(Vec128<int16_t> v, Vec128<int16_t> bits) {
  return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> operator<<(Vec128<int16_t, N> v,
                                      Vec128<int16_t, N> bits) {
  return Vec128<int16_t, N>(vshl_s16(v.raw, bits.raw));
}

HWY_API Vec128<int32_t> operator<<(Vec128<int32_t> v, Vec128<int32_t> bits) {
  return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
HWY_API Vec128<int32_t, N> operator<<(Vec128<int32_t, N> v,
                                      Vec128<int32_t, N> bits) {
  return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw));
}

HWY_API Vec128<int64_t> operator<<(Vec128<int64_t> v, Vec128<int64_t> bits) {
  return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw));
}
HWY_API Vec64<int64_t> operator<<(Vec64<int64_t> v, Vec64<int64_t> bits) {
  return Vec64<int64_t>(vshl_s64(v.raw, bits.raw));
}
   2331 
// ------------------------------ Shr (Neg)

// NEON has no variable right-shift intrinsic; vshl shifts *right* when the
// signed per-lane count is negative. Hence each overload negates the counts.
// For unsigned lanes, the counts are first BitCast to signed so that Neg is
// defined; the value vector itself keeps its type, so vshl_u* performs a
// logical (zero-filling) shift.
HWY_API Vec128<uint8_t> operator>>(Vec128<uint8_t> v, Vec128<uint8_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
HWY_API Vec128<uint8_t, N> operator>>(Vec128<uint8_t, N> v,
                                      Vec128<uint8_t, N> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits));
}

HWY_API Vec128<uint16_t> operator>>(Vec128<uint16_t> v, Vec128<uint16_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<uint16_t, N> operator>>(Vec128<uint16_t, N> v,
                                       Vec128<uint16_t, N> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits));
}

HWY_API Vec128<uint32_t> operator>>(Vec128<uint32_t> v, Vec128<uint32_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint32_t, N> operator>>(Vec128<uint32_t, N> v,
                                       Vec128<uint32_t, N> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits));
}

HWY_API Vec128<uint64_t> operator>>(Vec128<uint64_t> v, Vec128<uint64_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits));
}
HWY_API Vec64<uint64_t> operator>>(Vec64<uint64_t> v, Vec64<uint64_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec64<uint64_t>(vshl_u64(v.raw, neg_bits));
}

// Signed lanes: vshl_s* with a negative count performs an arithmetic
// (sign-extending) right shift, so only Neg of the counts is needed.
HWY_API Vec128<int8_t> operator>>(Vec128<int8_t> v, Vec128<int8_t> bits) {
  return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
HWY_API Vec128<int8_t, N> operator>>(Vec128<int8_t, N> v,
                                     Vec128<int8_t, N> bits) {
  return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int16_t> operator>>(Vec128<int16_t> v, Vec128<int16_t> bits) {
  return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> operator>>(Vec128<int16_t, N> v,
                                      Vec128<int16_t, N> bits) {
  return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int32_t> operator>>(Vec128<int32_t> v, Vec128<int32_t> bits) {
  return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
HWY_API Vec128<int32_t, N> operator>>(Vec128<int32_t, N> v,
                                      Vec128<int32_t, N> bits) {
  return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int64_t> operator>>(Vec128<int64_t> v, Vec128<int64_t> bits) {
  return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw));
}
HWY_API Vec64<int64_t> operator>>(Vec64<int64_t> v, Vec64<int64_t> bits) {
  return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw));
}
   2417 
// ------------------------------ RoundingShr (Neg)

// Rounding right-shift: vrshl with a negative count shifts right and rounds
// to nearest (adds 2^(shift-1) before shifting). Same negation scheme as
// operator>> above: unsigned counts are BitCast to signed before Neg.
HWY_API Vec128<uint8_t> RoundingShr(Vec128<uint8_t> v, Vec128<uint8_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint8_t>(vrshlq_u8(v.raw, neg_bits));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
HWY_API Vec128<uint8_t, N> RoundingShr(Vec128<uint8_t, N> v,
                                       Vec128<uint8_t, N> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint8_t, N>(vrshl_u8(v.raw, neg_bits));
}

HWY_API Vec128<uint16_t> RoundingShr(Vec128<uint16_t> v,
                                     Vec128<uint16_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint16_t>(vrshlq_u16(v.raw, neg_bits));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<uint16_t, N> RoundingShr(Vec128<uint16_t, N> v,
                                        Vec128<uint16_t, N> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint16_t, N>(vrshl_u16(v.raw, neg_bits));
}

HWY_API Vec128<uint32_t> RoundingShr(Vec128<uint32_t> v,
                                     Vec128<uint32_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint32_t>(vrshlq_u32(v.raw, neg_bits));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint32_t, N> RoundingShr(Vec128<uint32_t, N> v,
                                        Vec128<uint32_t, N> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint32_t, N>(vrshl_u32(v.raw, neg_bits));
}

HWY_API Vec128<uint64_t> RoundingShr(Vec128<uint64_t> v,
                                     Vec128<uint64_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec128<uint64_t>(vrshlq_u64(v.raw, neg_bits));
}
HWY_API Vec64<uint64_t> RoundingShr(Vec64<uint64_t> v, Vec64<uint64_t> bits) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw;
  return Vec64<uint64_t>(vrshl_u64(v.raw, neg_bits));
}

// Signed lanes: only Neg of the counts is required.
HWY_API Vec128<int8_t> RoundingShr(Vec128<int8_t> v, Vec128<int8_t> bits) {
  return Vec128<int8_t>(vrshlq_s8(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
HWY_API Vec128<int8_t, N> RoundingShr(Vec128<int8_t, N> v,
                                      Vec128<int8_t, N> bits) {
  return Vec128<int8_t, N>(vrshl_s8(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int16_t> RoundingShr(Vec128<int16_t> v, Vec128<int16_t> bits) {
  return Vec128<int16_t>(vrshlq_s16(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> RoundingShr(Vec128<int16_t, N> v,
                                       Vec128<int16_t, N> bits) {
  return Vec128<int16_t, N>(vrshl_s16(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int32_t> RoundingShr(Vec128<int32_t> v, Vec128<int32_t> bits) {
  return Vec128<int32_t>(vrshlq_s32(v.raw, Neg(bits).raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
HWY_API Vec128<int32_t, N> RoundingShr(Vec128<int32_t, N> v,
                                       Vec128<int32_t, N> bits) {
  return Vec128<int32_t, N>(vrshl_s32(v.raw, Neg(bits).raw));
}

HWY_API Vec128<int64_t> RoundingShr(Vec128<int64_t> v, Vec128<int64_t> bits) {
  return Vec128<int64_t>(vrshlq_s64(v.raw, Neg(bits).raw));
}
HWY_API Vec64<int64_t> RoundingShr(Vec64<int64_t> v, Vec64<int64_t> bits) {
  return Vec64<int64_t>(vrshl_s64(v.raw, Neg(bits).raw));
}
   2506 
   2507 // ------------------------------ ShiftLeftSame (Shl)
   2508 
   2509 template <typename T, size_t N>
   2510 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) {
   2511  return v << Set(DFromV<decltype(v)>(), static_cast<T>(bits));
   2512 }
   2513 template <typename T, size_t N>
   2514 HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) {
   2515  return v >> Set(DFromV<decltype(v)>(), static_cast<T>(bits));
   2516 }
   2517 
   2518 // ------------------------------ RoundingShiftRightSame (RoundingShr)
   2519 
   2520 template <typename T, size_t N>
   2521 HWY_API Vec128<T, N> RoundingShiftRightSame(const Vec128<T, N> v, int bits) {
   2522  return RoundingShr(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits)));
   2523 }
   2524 
// ------------------------------ Int/float multiplication

// Per-target flag to prevent generic_ops-inl.h from defining 8-bit operator*.
#ifdef HWY_NATIVE_MUL_8
#undef HWY_NATIVE_MUL_8
#else
#define HWY_NATIVE_MUL_8
#endif

// All except ui64 (NEON has no 64-bit vmul); unsigned and float lanes map
// directly onto the vmul intrinsic family.
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator*, vmul, _, 2)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2)
   2537 
   2538 template <size_t N>
   2539 HWY_API Vec128<int8_t, N> operator*(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
   2540  const DFromV<decltype(a)> d;
   2541  const RebindToUnsigned<decltype(d)> du;
   2542  return BitCast(d, BitCast(du, a) * BitCast(du, b));
   2543 }
   2544 
   2545 template <size_t N>
   2546 HWY_API Vec128<int16_t, N> operator*(Vec128<int16_t, N> a,
   2547                                     Vec128<int16_t, N> b) {
   2548  const DFromV<decltype(a)> d;
   2549  const RebindToUnsigned<decltype(d)> du;
   2550  return BitCast(d, BitCast(du, a) * BitCast(du, b));
   2551 }
   2552 
   2553 template <size_t N>
   2554 HWY_API Vec128<int32_t, N> operator*(Vec128<int32_t, N> a,
   2555                                     Vec128<int32_t, N> b) {
   2556  const DFromV<decltype(a)> d;
   2557  const RebindToUnsigned<decltype(d)> du;
   2558  return BitCast(d, BitCast(du, a) * BitCast(du, b));
   2559 }
   2560 
// ------------------------------ Integer multiplication

// Returns the upper sizeof(T)*8 bits of a * b in each lane.
// Strategy: widening multiply (vmull) of the low and high input halves, then
// keep only the upper byte of each 16-bit product via vuzp2q (odd-lane
// unzip). A64 has vmull_high_*; Armv7 extracts the high half first.
HWY_API Vec128<int8_t> MulHigh(Vec128<int8_t> a, Vec128<int8_t> b) {
  int16x8_t rlo = vmull_s8(vget_low_s8(a.raw), vget_low_s8(b.raw));
#if HWY_ARCH_ARM_A64
  int16x8_t rhi = vmull_high_s8(a.raw, b.raw);
#else
  int16x8_t rhi = vmull_s8(vget_high_s8(a.raw), vget_high_s8(b.raw));
#endif
  return Vec128<int8_t>(
      vuzp2q_s8(vreinterpretq_s8_s16(rlo), vreinterpretq_s8_s16(rhi)));
}
HWY_API Vec128<uint8_t> MulHigh(Vec128<uint8_t> a, Vec128<uint8_t> b) {
  uint16x8_t rlo = vmull_u8(vget_low_u8(a.raw), vget_low_u8(b.raw));
#if HWY_ARCH_ARM_A64
  uint16x8_t rhi = vmull_high_u8(a.raw, b.raw);
#else
  uint16x8_t rhi = vmull_u8(vget_high_u8(a.raw), vget_high_u8(b.raw));
#endif
  return Vec128<uint8_t>(
      vuzp2q_u8(vreinterpretq_u8_u16(rlo), vreinterpretq_u8_u16(rhi)));
}

// Partial vectors: one widening multiply covers all lanes; unzip the upper
// bytes (duplicating hi_lo as both unzip inputs) and take the lower half.
template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)>
HWY_API Vec128<int8_t, N> MulHigh(Vec128<int8_t, N> a, Vec128<int8_t, N> b) {
  int8x16_t hi_lo = vreinterpretq_s8_s16(vmull_s8(a.raw, b.raw));
  return Vec128<int8_t, N>(vget_low_s8(vuzp2q_s8(hi_lo, hi_lo)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)>
HWY_API Vec128<uint8_t, N> MulHigh(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) {
  uint8x16_t hi_lo = vreinterpretq_u8_u16(vmull_u8(a.raw, b.raw));
  return Vec128<uint8_t, N>(vget_low_u8(vuzp2q_u8(hi_lo, hi_lo)));
}
   2595 
// 16-bit MulHigh: widening multiply to 32 bits, then keep the upper 16 bits
// of each product via vuzp2q (odd-lane unzip).
HWY_API Vec128<int16_t> MulHigh(Vec128<int16_t> a, Vec128<int16_t> b) {
  int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw));
#if HWY_ARCH_ARM_A64
  int32x4_t rhi = vmull_high_s16(a.raw, b.raw);
#else
  int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw));
#endif
  return Vec128<int16_t>(
      vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi)));
}
HWY_API Vec128<uint16_t> MulHigh(Vec128<uint16_t> a, Vec128<uint16_t> b) {
  uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw));
#if HWY_ARCH_ARM_A64
  uint32x4_t rhi = vmull_high_u16(a.raw, b.raw);
#else
  uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw));
#endif
  return Vec128<uint16_t>(
      vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi)));
}

// Partial vectors: single widening multiply, then unzip + lower half.
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) {
  int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw));
  return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)>
HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a,
                                    Vec128<uint16_t, N> b) {
  uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw));
  return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo)));
}
   2628 
// 32-bit MulHigh: widening multiply to 64 bits, then keep the upper 32 bits
// of each product via vuzp2q (odd-lane unzip).
HWY_API Vec128<int32_t> MulHigh(Vec128<int32_t> a, Vec128<int32_t> b) {
  int64x2_t rlo = vmull_s32(vget_low_s32(a.raw), vget_low_s32(b.raw));
#if HWY_ARCH_ARM_A64
  int64x2_t rhi = vmull_high_s32(a.raw, b.raw);
#else
  int64x2_t rhi = vmull_s32(vget_high_s32(a.raw), vget_high_s32(b.raw));
#endif
  return Vec128<int32_t>(
      vuzp2q_s32(vreinterpretq_s32_s64(rlo), vreinterpretq_s32_s64(rhi)));
}
HWY_API Vec128<uint32_t> MulHigh(Vec128<uint32_t> a, Vec128<uint32_t> b) {
  uint64x2_t rlo = vmull_u32(vget_low_u32(a.raw), vget_low_u32(b.raw));
#if HWY_ARCH_ARM_A64
  uint64x2_t rhi = vmull_high_u32(a.raw, b.raw);
#else
  uint64x2_t rhi = vmull_u32(vget_high_u32(a.raw), vget_high_u32(b.raw));
#endif
  return Vec128<uint32_t>(
      vuzp2q_u32(vreinterpretq_u32_u64(rlo), vreinterpretq_u32_u64(rhi)));
}

// Partial vectors: single widening multiply, then unzip + lower half.
template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
HWY_API Vec128<int32_t, N> MulHigh(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
  int32x4_t hi_lo = vreinterpretq_s32_s64(vmull_s32(a.raw, b.raw));
  return Vec128<int32_t, N>(vget_low_s32(vuzp2q_s32(hi_lo, hi_lo)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint32_t, N> MulHigh(Vec128<uint32_t, N> a,
                                    Vec128<uint32_t, N> b) {
  uint32x4_t hi_lo = vreinterpretq_u32_u64(vmull_u32(a.raw, b.raw));
  return Vec128<uint32_t, N>(vget_low_u32(vuzp2q_u32(hi_lo, hi_lo)));
}
   2661 
// 64-bit MulHigh: NEON has no 64x64->128 vector multiply, so compute the
// high half of each lane's product with the scalar Mul128 helper and
// reassemble the vector.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
  T hi_0;
  T hi_1;

  Mul128(GetLane(a), GetLane(b), &hi_0);
  Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi_1);

  return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
}

// Single-lane (64-bit vector) variant.
template <class T, HWY_IF_UI64(T)>
HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
  T hi;
  Mul128(GetLane(a), GetLane(b), &hi);
  return Set(Full64<T>(), hi);
}
   2679 
// Q1.15 fixed-point multiply: vqrdmulh is the saturating, rounding,
// doubling multiply-high, i.e. (a * b * 2 + rounding) >> 16.
HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
  return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
                                           Vec128<int16_t, N> b) {
  return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw));
}
   2688 
   2689 // ------------------------------ Floating-point division
   2690 
   2691 // Emulate missing intrinsic
   2692 #if HWY_HAVE_FLOAT64 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
   2693 HWY_INLINE float64x1_t vrecpe_f64(float64x1_t raw) {
   2694  const CappedTag<double, 1> d;
   2695  const Twice<decltype(d)> dt;
   2696  using VT = VFromD<decltype(dt)>;
   2697  return LowerHalf(d, VT(vrecpeq_f64(Combine(dt, v, v).raw))).raw;
   2698 }
   2699 #endif
   2700 
// Approximate reciprocal estimate (vrecpe) for all supported float types.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ApproximateReciprocal, vrecpe, _, 1)

#if HWY_HAVE_FLOAT64
// Per-target flag: an f64 approximate-reciprocal is available natively.
#ifdef HWY_NATIVE_F64_APPROX_RECIP
#undef HWY_NATIVE_F64_APPROX_RECIP
#else
#define HWY_NATIVE_F64_APPROX_RECIP
#endif

// AArch64 provides a true vector divide instruction.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
#else   // !HWY_HAVE_FLOAT64
namespace detail {
// One Newton-Raphson refinement step for the reciprocal estimate (vrecps
// computes 2 - x*b; multiplying the estimate by it doubles the precision).
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ReciprocalNewtonRaphsonStep, vrecps, _, 2)
}  // namespace detail

// Without HWY_HAVE_FLOAT64 (Armv7) there is no vdiv; emulate a / b as
// a * (1/b) with the reciprocal refined by three Newton-Raphson iterations.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
  auto x = ApproximateReciprocal(b);
  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
  x *= detail::ReciprocalNewtonRaphsonStep(x, b);
  return a * x;
}
#endif  // HWY_HAVE_FLOAT64
   2726 
// ------------------------------ Absolute value of difference.

// vabd computes |a - b| directly for floats and 8/16/32-bit integers.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(AbsDiff, vabd, _, 2)
HWY_NEON_DEF_FUNCTION_UI_8_16_32(AbsDiff, vabd, _, 2)  // no UI64

// Per-target flag: integer AbsDiff is native (generic fallback not needed).
#ifdef HWY_NATIVE_INTEGER_ABS_DIFF
#undef HWY_NATIVE_INTEGER_ABS_DIFF
#else
#define HWY_NATIVE_INTEGER_ABS_DIFF
#endif
   2737 
// ------------------------------ Integer multiply-add

// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd.
#ifdef HWY_NATIVE_INT_FMA
#undef HWY_NATIVE_INT_FMA
#else
#define HWY_NATIVE_INT_FMA
#endif

// Wrappers for changing argument order to what intrinsics expect:
// vmla/vmls take (accumulator, mul, x), whereas the public MulAdd takes
// (mul, x, add).
namespace detail {
// All except ui64
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(MulAdd, vmla, _, 3)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(MulAdd, vmla, _, 3)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(NegMulAdd, vmls, _, 3)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(NegMulAdd, vmls, _, 3)
}  // namespace detail
   2755 
// Returns mul * x + add; forwards to the vmla wrapper with the
// accumulator-first argument order it expects.
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return detail::MulAdd(add, mul, x);
}

// Returns add - mul * x (vmls).
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return detail::NegMulAdd(add, mul, x);
}

// 64-bit integer: no vmla/vmls, so compose from Mul and Add/Sub.
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return Add(Mul(mul, x), add);
}

template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return Sub(add, Mul(mul, x));
}
   2780 
// ------------------------------ Floating-point multiply-add variants

namespace detail {

#if HWY_NATIVE_FMA
// Wrappers for changing argument order to what intrinsics expect:
// vfma/vfms take (accumulator, mul, x). These are fused (single rounding).
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(MulAdd, vfma, _, 3)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(NegMulAdd, vfms, _, 3)
#else
// Emulate. Matches intrinsics arg order. Note: two roundings, not fused.
template <size_t N>
HWY_API Vec128<float, N> MulAdd(Vec128<float, N> add, Vec128<float, N> mul,
                                Vec128<float, N> x) {
  return mul * x + add;
}

template <size_t N>
HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> add, Vec128<float, N> mul,
                                   Vec128<float, N> x) {
  return add - mul * x;
}

#endif  // HWY_NATIVE_FMA
}  // namespace detail

// Returns mul * x + add.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> add) {
  return detail::MulAdd(add, mul, x);
}

// Returns add - mul * x.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> add) {
  return detail::NegMulAdd(add, mul, x);
}

// Returns mul * x - sub, implemented by negating the addend.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x,
                            Vec128<T, N> sub) {
  return MulAdd(mul, x, Neg(sub));
}

// Returns -(mul * x) - sub, implemented by negating the MulAdd result.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x,
                               Vec128<T, N> sub) {
  return Neg(MulAdd(mul, x, sub));
}
   2829 
// ------------------------------ Floating-point square root (IfThenZeroElse)

// Emulate the vrsqrte_f64 intrinsic, which is missing from GCC < 4.9:
// widen the single f64 lane to a 128-bit vector, call vrsqrteq_f64,
// then take the lower half.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 490
HWY_INLINE float64x1_t vrsqrte_f64(float64x1_t raw) {
  const CappedTag<double, 1> d;
  const Twice<decltype(d)> dt;
  using VT = VFromD<decltype(dt)>;
  const VFromD<decltype(d)> v(raw);
  return LowerHalf(d, VT(vrsqrteq_f64(Combine(dt, v, v).raw))).raw;
}
#endif
   2842 
// Approximate reciprocal square root estimate (vrsqrte).
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ApproximateReciprocalSqrt, vrsqrte, _, 1)

#if HWY_HAVE_FLOAT64
// Per-target flag: f64 approximate rsqrt is available natively.
#ifdef HWY_NATIVE_F64_APPROX_RSQRT
#undef HWY_NATIVE_F64_APPROX_RSQRT
#else
#define HWY_NATIVE_F64_APPROX_RSQRT
#endif

// Full precision square root (AArch64 vsqrt instruction).
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1)
#else   // !HWY_HAVE_FLOAT64
namespace detail {
// One Newton-Raphson refinement step for the rsqrt estimate (vrsqrts).
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ReciprocalSqrtStep, vrsqrts, _, 2)
}  // namespace detail

// Armv7 fallback: sqrt(v) = v * rsqrt(v), with the estimate refined by
// three Newton-Raphson iterations.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Sqrt(const Vec128<T, N> v) {
  auto recip = ApproximateReciprocalSqrt(v);

  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
  recip *= detail::ReciprocalSqrtStep(v * recip, recip);
  recip *= detail::ReciprocalSqrtStep(v * recip, recip);

  const auto root = v * recip;
  // rsqrt(0) is +inf, so 0 * inf would yield NaN; force Sqrt(0) == 0.
  return IfThenZeroElse(v == Zero(Simd<T, N, 0>()), root);
}
#endif  // HWY_HAVE_FLOAT64
   2872 
// ================================================== LOGICAL

// ------------------------------ Not

// There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION:
// reinterpret any lane type as bytes, invert with vmvn, and cast back.
template <typename T>
HWY_API Vec128<T> Not(const Vec128<T> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw)));
}
// Partial vectors use the 64-bit vmvn form.
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N> Not(const Vec128<T, N> v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = decltype(Zero(d8));
  return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw)));
}
   2891 
// ------------------------------ And
HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2)

// Floats have no bitwise intrinsics; uses the u32/64 And defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) & BitCast(du, b));
}

// ------------------------------ AndNot

namespace detail {
// reversed_andnot returns a & ~b (the argument order of NEON's vbic).
HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2)
}  // namespace detail

// Returns ~not_mask & mask; swaps the arguments to match vbic's order.
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
                            const Vec128<T, N> mask) {
  return detail::reversed_andnot(mask, not_mask);
}

// Float version: round-trip through the unsigned representation.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask,
                            const Vec128<T, N> mask) {
  const DFromV<decltype(mask)> d;
  const RebindToUnsigned<decltype(d)> du;
  VFromD<decltype(du)> ret =
      detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask));
  return BitCast(d, ret);
}

// ------------------------------ Or

HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2)

// Float version: uses the u32/64 Or defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) | BitCast(du, b));
}

// ------------------------------ Xor

HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2)

// Float version: uses the u32/64 Xor defined above.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, BitCast(du, a) ^ BitCast(du, b));
}
   2951 
// ------------------------------ Xor3
// With the SHA3 extension, full 128-bit vectors get the single-instruction
// three-way XOR (veor3).
#if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3)
HWY_NEON_DEF_FUNCTION_FULL_UI(Xor3, veor3, _, 3)

// Half vectors are not natively supported. Two Xor are likely more efficient
// than Combine to 128-bit.
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8), HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}

// Float version: round-trip through the unsigned representation.
template <typename T, size_t N, HWY_IF_FLOAT(T)>
HWY_API Vec128<T, N> Xor3(const Vec128<T, N> x1, const Vec128<T, N> x2,
                          const Vec128<T, N> x3) {
  const DFromV<decltype(x1)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3)));
}

#else
// No SHA3: compose from two XORs for all types and sizes.
template <typename T, size_t N>
HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) {
  return Xor(x1, Xor(x2, x3));
}
#endif
   2977 
   2978 // ------------------------------ Or3
   2979 template <typename T, size_t N>
   2980 HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) {
   2981  return Or(o1, Or(o2, o3));
   2982 }
   2983 
   2984 // ------------------------------ OrAnd
   2985 template <typename T, size_t N>
   2986 HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) {
   2987  return Or(o, And(a1, a2));
   2988 }
   2989 
// ------------------------------ Operator overloads (internal-only if float)

// Thin operator forms of And/Or/Xor so generic code can use &, |, ^.
template <typename T, size_t N>
HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) {
  return And(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Or(a, b);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) {
  return Xor(a, b);
}
   3006 
   3007 // ------------------------------ I64/U64 AbsDiff
   3008 
   3009 template <size_t N>
   3010 HWY_API Vec128<int64_t, N> AbsDiff(const Vec128<int64_t, N> a,
   3011                                   const Vec128<int64_t, N> b) {
   3012  return Max(a, b) - Min(a, b);
   3013 }
   3014 
   3015 template <size_t N>
   3016 HWY_API Vec128<uint64_t, N> AbsDiff(const Vec128<uint64_t, N> a,
   3017                                    const Vec128<uint64_t, N> b) {
   3018  return Or(SaturatedSub(a, b), SaturatedSub(b, a));
   3019 }
   3020 
// ------------------------------ PopulationCount

// Advertise a native popcount implementation (toggle idiom: the generic
// fallback checks this macro).
#ifdef HWY_NATIVE_POPCNT
#undef HWY_NATIVE_POPCNT
#else
#define HWY_NATIVE_POPCNT
#endif

namespace detail {

// Per-lane bit count. NEON only provides vcnt for 8-bit lanes; wider lane
// sizes are built from it via pairwise widening adds (vpaddl). Each overload
// is tag-dispatched on sizeof(T); full-vector (128-bit) and partial-vector
// (<= 64-bit) variants use the q/non-q intrinsic forms respectively.

// 1-byte lanes: direct vcnt.
template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  return Vec128<T>(vcntq_u8(BitCast(d8, v).raw));
}
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */,
                                        Vec128<T, N> v) {
  const Simd<uint8_t, N, 0> d8;
  return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw));
}

// NEON lacks popcount for lane sizes > 1, so take pairwise sums of the bytes.
// 2-byte lanes: one level of pairwise widening.
template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u8(bytes));
}
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */,
                                        Vec128<T, N> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d8;
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u8(bytes));
}

// 4-byte lanes: two levels of pairwise widening (u8 -> u16 -> u32).
template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes)));
}
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */,
                                        Vec128<T, N> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d8;
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes)));
}

// 8-byte lanes: three levels of pairwise widening (u8 -> u16 -> u32 -> u64).
template <typename T>
HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec128<T> v) {
  const Full128<uint8_t> d8;
  const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw);
  return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes))));
}
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */,
                                        Vec128<T, N> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d8;
  const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw);
  return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes))));
}

}  // namespace detail
   3087 
// Public entry point: per-lane popcount for non-float types; dispatches to
// the detail:: overload matching the lane size.
template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)>
HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) {
  return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v);
}
   3092 
// ================================================== SIGN

// ------------------------------ Abs
// Generates Abs overloads via vabs for i8/i16/i32 and all float types.
// i64 is implemented after BroadcastSignBit.
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Abs, vabs, _, 1)
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1)

// ------------------------------ SaturatedAbs
// Advertise a native saturated-abs (toggle idiom, see PopulationCount above).
#ifdef HWY_NATIVE_SATURATED_ABS
#undef HWY_NATIVE_SATURATED_ABS
#else
#define HWY_NATIVE_SATURATED_ABS
#endif

// vqabs saturates INT_MIN to INT_MAX instead of wrapping.
HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedAbs, vqabs, _, 1)
   3109 // ------------------------------ CopySignToAbs
   3110 template <typename T, size_t N>
   3111 HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) {
   3112  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
   3113  const DFromV<decltype(abs)> d;
   3114  return OrAnd(abs, SignBit(d), sign);
   3115 }
   3116 
// ------------------------------ BroadcastSignBit

// Replicates the sign bit into every bit of the lane (arithmetic shift right
// by lane width - 1): all-ones for negative lanes, zero otherwise.
template <typename T, size_t N, HWY_IF_SIGNED(T)>
HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) {
  return ShiftRight<sizeof(T) * 8 - 1>(v);
}
   3123 
// ================================================== MASK

// ------------------------------ To/from vector

// Mask and Vec have the same representation (true = FF..FF).
// Reinterprets a vector's bits as a mask; lanes must be all-ones or all-zero.
template <typename T, size_t N>
HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) {
  const Simd<MakeUnsigned<T>, N, 0> du;
  return Mask128<T, N>(BitCast(du, v).raw);
}

// Mask type corresponding to a descriptor D.
template <class D>
using MFromD = decltype(MaskFromVec(VFromD<D>()));

// Reinterprets a mask's bits as a vector of d's lane type.
template <class D>
HWY_API VFromD<D> VecFromMask(D d, const MFromD<D> m) {
  // Raw type of masks is unsigned.
  const RebindToUnsigned<D> du;
  return BitCast(d, VFromD<decltype(du)>(m.raw));
}

// ------------------------------ RebindMask (MaskFromVec)

// Reinterprets a mask for another lane type of equal size (no-op on NEON
// because masks share the vector representation).
template <typename TFrom, size_t NFrom, class DTo>
HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) {
  static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size");
  return MFromD<DTo>(m.raw);
}
   3152 
// ------------------------------ IfThenElse

// Workaround for incorrect codegen.
#if HWY_ARCH_ARM_V7

// Select via XOR-blend: no ^ ((yes ^ no) & mask) avoids VBSL, which
// miscompiles on some armv7 toolchains (see comment above).
template <class V, class D = DFromV<V>>
HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) {
  const RebindToUnsigned<D> du;
  using VU = VFromD<decltype(du)>;
  const VU no_u = BitCast(du, no);
  const VU diff_u = BitCast(du, yes) ^ no_u;
  const VU mask_u = BitCast(du, VecFromMask(D(), mask));
  return BitCast(D(), no_u ^ (diff_u & mask_u));
}

#else  // normal VBSL instruction

// Signature pieces consumed by HWY_NEON_DEF_FUNCTION_* to stamp out
// IfThenElse(mask, yes, no) -> vbsl for every lane type.
#define HWY_NEON_BUILD_TPL_HWY_IF
#define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_IF(type, size)                         \
  const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \
      const Vec128<type##_t, size> no
#define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw

HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF)

#endif  // HWY_ARCH_ARM_V7

// Types without a native vbsl overload (bf16 always; f16 too when not
// natively supported) are emulated through the unsigned representation.
#if HWY_HAVE_FLOAT16
#define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_BF16(TFromV<V>)
#else
#define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_SPECIAL_FLOAT_V(V)
#endif

template <class V, HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V)>
HWY_API V IfThenElse(MFromD<DFromV<V>> mask, V yes, V no) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(
      d, IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no)));
}

#undef HWY_NEON_IF_EMULATED_IF_THEN_ELSE
#undef HWY_NEON_BUILD_TPL_HWY_IF
#undef HWY_NEON_BUILD_RET_HWY_IF
#undef HWY_NEON_BUILD_PARAM_HWY_IF
#undef HWY_NEON_BUILD_ARG_HWY_IF
   3200 
// mask ? yes : 0
// Implemented as AND with the all-ones/all-zero mask vector.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  return yes & VecFromMask(DFromV<decltype(yes)>(), mask);
}
// Special floats (f16/bf16 without native support): route through unsigned.
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) {
  const DFromV<decltype(yes)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, IfThenElseZero(RebindMask(du, mask), BitCast(du, yes)));
}

// mask ? 0 : no
// Implemented as AND-NOT with the mask vector.
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}
// Special floats: route through unsigned as above.
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  const DFromV<decltype(no)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
}
   3224 
   3225 template <typename T, size_t N>
   3226 HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
   3227                                        Vec128<T, N> no) {
   3228  static_assert(IsSigned<T>(), "Only works for signed/float");
   3229  const DFromV<decltype(no)> d;
   3230  const RebindToSigned<decltype(d)> di;
   3231 
   3232  Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
   3233  return IfThenElse(m, yes, no);
   3234 }
   3235 
// Select using a vector whose lanes are all-ones/all-zero as the condition.
template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}
   3241 
// ------------------------------ BitwiseIfThenElse

// Advertise native bitwise-select (toggle idiom).
#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

// Bit-granular select: result bit = mask ? yes : no. Unlike IfThenElse, the
// mask need not be lane-uniform; NEON's vbsl is inherently bitwise.
template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}

// ------------------------------ CopySign (BitwiseIfThenElse)
// Returns magn with sign's sign bit: select only the sign bit from `sign`.
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}

// ------------------------------ Mask logical
// Masks share the vector representation, so logical ops are implemented by
// converting to vectors, applying the vector op, and converting back.

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(DFromM<decltype(m)>(), m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

// True only where neither a nor b is true: !a & !b.
template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
   3299 
// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

// ------------------------------ Shuffle2301 (for i64 compares)

// Swap 32-bit halves in 64-bits (vrev64 reverses 32-bit elements within each
// 64-bit group). Used below to combine 32-bit equality results into 64-bit.
HWY_API Vec64<uint32_t> Shuffle2301(const Vec64<uint32_t> v) {
  return Vec64<uint32_t>(vrev64_u32(v.raw));
}
HWY_API Vec64<int32_t> Shuffle2301(const Vec64<int32_t> v) {
  return Vec64<int32_t>(vrev64_s32(v.raw));
}
HWY_API Vec64<float> Shuffle2301(const Vec64<float> v) {
  return Vec64<float>(vrev64_f32(v.raw));
}
HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>(vrev64q_u32(v.raw));
}
HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
  return Vec128<int32_t>(vrev64q_s32(v.raw));
}
HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
  return Vec128<float>(vrev64q_f32(v.raw));
}

// Signature pieces consumed by HWY_NEON_DEF_FUNCTION_* to stamp out binary
// comparison operators returning Mask128.
#define HWY_NEON_BUILD_TPL_HWY_COMPARE
#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw

// ------------------------------ Equality
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
#else
// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
#endif

// ------------------------------ Strict inequality (signed, float)
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE)
#else
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
#endif
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)

// ------------------------------ Weak inequality (float)
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<=, vcle, _, HWY_COMPARE)
#else
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<=, vcle, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<=, vcle, _, HWY_COMPARE)
#endif
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)

#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
#undef HWY_NEON_BUILD_RET_HWY_COMPARE
#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
#undef HWY_NEON_BUILD_ARG_HWY_COMPARE
   3364 
// ------------------------------ Armv7 i64 compare (Shuffle2301, Eq)

#if HWY_ARCH_ARM_V7

// 64-bit equality via two 32-bit compares: both halves must match, so AND
// the 32-bit result with its half-swapped copy.
template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  const Simd<int32_t, N * 2, 0> d32;
  const Simd<int64_t, N, 0> d64;
  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
  return MaskFromVec(BitCast(d64, cmp64));
}

template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  const Simd<uint32_t, N * 2, 0> d32;
  const Simd<uint64_t, N, 0> d64;
  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
  return MaskFromVec(BitCast(d64, cmp64));
}

// Signed 64-bit less-than: the sign of the saturating difference a - b is
// negative iff a < b (saturation prevents overflow from flipping the sign).
HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a,
                                   const Vec128<int64_t> b) {
  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
  return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
}
HWY_API Mask128<int64_t, 1> operator<(const Vec64<int64_t> a,
                                      const Vec64<int64_t> b) {
  const int64x1_t sub = vqsub_s64(a.raw, b.raw);
  return MaskFromVec(BroadcastSignBit(Vec64<int64_t>(sub)));
}

// Unsigned 64-bit less-than without a native compare: computes the borrow
// (MSB) of a - b via the identity (~a & b) | (~(a ^ b) & (a - b)).
template <size_t N>
HWY_API Mask128<uint64_t, N> operator<(const Vec128<uint64_t, N> a,
                                       const Vec128<uint64_t, N> b) {
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b);
  return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb))));
}

// a <= b is !(b < a) for integers.
template <size_t N>
HWY_API Mask128<int64_t, N> operator<=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Not(b < a);
}

template <size_t N>
HWY_API Mask128<uint64_t, N> operator<=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Not(b < a);
}

#endif
   3422 
// ------------------------------ operator!= (operator==)

// Customize HWY_NEON_DEF_FUNCTION to call 2 functions.
#pragma push_macro("HWY_NEON_DEF_FUNCTION")
#undef HWY_NEON_DEF_FUNCTION
// This cannot have _any_ template argument (in x86_128 we can at least have N
// as an argument), otherwise it is not more specialized than rewritten
// operator== in C++20, leading to compile errors.
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a,             \
                                       Vec128<type##_t, size> b) {           \
    return Not(a == b);                                                      \
  }

HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored)

#pragma pop_macro("HWY_NEON_DEF_FUNCTION")
   3440 
// ------------------------------ Reversed comparisons

// a > b and a >= b are defined by swapping the operands of < and <=.
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
  return operator<(b, a);
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  return operator<=(b, a);
}
   3451 
// ------------------------------ FirstN (Iota, Lt)

// Mask with the first `num` lanes true: compare a 0,1,2,... ramp against num.
// NOTE(review): num is cast to the signed lane type; very large num could
// wrap for narrow lanes, presumably callers pass num <= lane count.
template <class D>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
  using TI = TFromD<decltype(di)>;
  return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num)));
}
   3460 
// ------------------------------ TestBit (Eq)

// Signature pieces for TestBit(v, bit) -> Mask128, stamped out via vtst
// (true where v & bit is nonzero; `bit` must have exactly one bit set).
#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
  Vec128<type##_t, size> v, Vec128<type##_t, size> bit
#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw

#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
#else
// No 64-bit versions on armv7
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)

// armv7 64-bit fallback: mask then compare against the bit itself.
template <size_t N>
HWY_API Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v,
                                     Vec128<uint64_t, N> bit) {
  return (v & bit) == bit;
}
template <size_t N>
HWY_API Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
                                    Vec128<int64_t, N> bit) {
  return (v & bit) == bit;
}

#endif
#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT
   3492 
// ------------------------------ Abs i64 (IfNegativeThenElse, Neg)
// A64 has native i64 abs; armv7 selects between v and -v on the sign bit.
HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec128<int64_t>(vabsq_s64(v.raw));
#else
  return IfNegativeThenElse(v, Neg(v), v);
#endif
}
HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec64<int64_t>(vabs_s64(v.raw));
#else
  return IfNegativeThenElse(v, Neg(v), v);
#endif
}

// Saturating abs: INT64_MIN maps to INT64_MAX instead of wrapping. The armv7
// fallback uses SaturatedSub(0, v) for the negation to get the same clamping.
HWY_API Vec128<int64_t> SaturatedAbs(const Vec128<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec128<int64_t>(vqabsq_s64(v.raw));
#else
  const auto zero = Zero(DFromV<decltype(v)>());
  return IfNegativeThenElse(v, SaturatedSub(zero, v), v);
#endif
}
HWY_API Vec64<int64_t> SaturatedAbs(const Vec64<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec64<int64_t>(vqabs_s64(v.raw));
#else
  const auto zero = Zero(DFromV<decltype(v)>());
  return IfNegativeThenElse(v, SaturatedSub(zero, v), v);
#endif
}
   3525 
// ------------------------------ Min (IfThenElse, BroadcastSignBit)

// Unsigned
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)

// u64 has no vmin; A64 selects on the compare, armv7 uses the identity
// a - SaturatedSub(a, b) == a - max(a - b, 0) == min(a, b).
template <size_t N>
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, b, a);
#else
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  return BitCast(du, BitCast(di, a) - BitCast(di, SaturatedSub(a, b)));
#endif
}

// Signed
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2)

// i64: armv7 uses the sign of the saturating difference to pick the smaller.
template <size_t N>
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, b, a);
#else
  const Vec128<int64_t, N> sign = SaturatedSub(a, b);
  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
#endif
}

// Float: IEEE minimumNumber on v8
#if HWY_ARCH_ARM_A64

HWY_NEON_DEF_FUNCTION_FLOAT_16_32(Min, vminnm, _, 2)

// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define
// in terms of the 128-bit intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
namespace detail {

// Widens to 128 bits, takes the Min there, and returns the lower half.
template <class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V F64Vec64Min(V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return LowerHalf(d, Min(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b)));
}

}  // namespace detail
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700

HWY_API Vec64<double> Min(Vec64<double> a, Vec64<double> b) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return detail::F64Vec64Min(a, b);
#else
  return Vec64<double>(vminnm_f64(a.raw, b.raw));
#endif
}

HWY_API Vec128<double> Min(Vec128<double> a, Vec128<double> b) {
  return Vec128<double>(vminnmq_f64(a.raw, b.raw));
}

#else
// Armv7: NaN if any is NaN.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
#endif  // HWY_ARCH_ARM_A64
   3591 
// ------------------------------ Max (IfThenElse, BroadcastSignBit)

// Unsigned (no u64)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2)

// u64 has no vmax; armv7 uses b + SaturatedSub(a, b) == b + max(a - b, 0)
// == max(a, b).
template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, a, b);
#else
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  return BitCast(du, BitCast(di, b) + BitCast(di, SaturatedSub(a, b)));
#endif
}

// Signed (no i64)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2)

// i64: armv7 uses the sign of the saturating difference to pick the larger.
template <size_t N>
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, a, b);
#else
  const Vec128<int64_t, N> sign = SaturatedSub(a, b);
  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
#endif
}

// Float: IEEE maximumNumber on v8
#if HWY_ARCH_ARM_A64

HWY_NEON_DEF_FUNCTION_FLOAT_16_32(Max, vmaxnm, _, 2)

// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define
// in terms of the 128-bit intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
namespace detail {

// Widens to 128 bits, takes the Max there, and returns the lower half.
template <class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V F64Vec64Max(V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return LowerHalf(d, Max(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b)));
}

}  // namespace detail
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700

HWY_API Vec64<double> Max(Vec64<double> a, Vec64<double> b) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return detail::F64Vec64Max(a, b);
#else
  return Vec64<double>(vmaxnm_f64(a.raw, b.raw));
#endif
}

HWY_API Vec128<double> Max(Vec128<double> a, Vec128<double> b) {
  return Vec128<double>(vmaxnmq_f64(a.raw, b.raw));
}

#else
// Armv7: NaN if any is NaN.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
#endif  // HWY_ARCH_ARM_A64
   3657 
// ------------------------------ MinNumber and MaxNumber

// armv7 vmin/vmax propagate NaN, so provide NaN-ignoring variants here
// (a NaN operand is replaced by the other operand before the min/max).
#if !HWY_ARCH_ARM_A64

#ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#else
#define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#endif

template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MinNumber(V a, V b) {
  return Min(IfThenElse(IsNaN(a), b, a), IfThenElse(IsNaN(b), a, b));
}

template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MaxNumber(V a, V b) {
  return Max(IfThenElse(IsNaN(a), b, a), IfThenElse(IsNaN(b), a, b));
}

#endif
   3679 
// ================================================== MEMORY

// ------------------------------ Load 128

// Full 128-bit unaligned loads: one vld1q_* overload per lane type, selected
// by the D tag. NEON vld1q has no alignment requirement.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> LoadU(D /* tag */,
                              const uint8_t* HWY_RESTRICT unaligned) {
  return Vec128<uint8_t>(vld1q_u8(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> LoadU(D /* tag */,
                               const uint16_t* HWY_RESTRICT unaligned) {
  return Vec128<uint16_t>(vld1q_u16(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> LoadU(D /* tag */,
                               const uint32_t* HWY_RESTRICT unaligned) {
  return Vec128<uint32_t>(vld1q_u32(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> LoadU(D /* tag */,
                               const uint64_t* HWY_RESTRICT unaligned) {
  return Vec128<uint64_t>(vld1q_u64(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> LoadU(D /* tag */,
                             const int8_t* HWY_RESTRICT unaligned) {
  return Vec128<int8_t>(vld1q_s8(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> LoadU(D /* tag */,
                              const int16_t* HWY_RESTRICT unaligned) {
  return Vec128<int16_t>(vld1q_s16(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> LoadU(D /* tag */,
                              const int32_t* HWY_RESTRICT unaligned) {
  return Vec128<int32_t>(vld1q_s32(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> LoadU(D /* tag */,
                              const int64_t* HWY_RESTRICT unaligned) {
  return Vec128<int64_t>(vld1q_s64(unaligned));
}
// f16/bf16 loads exist only when the compiler supports those lane types;
// NativeLanePointer adapts Highway's lane type to the intrinsic's.
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> LoadU(D /* tag */,
                                const float16_t* HWY_RESTRICT unaligned) {
  return Vec128<float16_t>(vld1q_f16(detail::NativeLanePointer(unaligned)));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
HWY_API Vec128<bfloat16_t> LoadU(D /* tag */,
                                 const bfloat16_t* HWY_RESTRICT unaligned) {
  return Vec128<bfloat16_t>(vld1q_bf16(detail::NativeLanePointer(unaligned)));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) {
  return Vec128<float>(vld1q_f32(unaligned));
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API Vec128<double> LoadU(D /* tag */,
                             const double* HWY_RESTRICT unaligned) {
  return Vec128<double>(vld1q_f64(unaligned));
}
#endif  // HWY_HAVE_FLOAT64
   3749 
// ------------------------------ Load 64

// Half-vector (64-bit) unaligned loads via the non-q vld1_* intrinsics.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) {
  return Vec64<uint8_t>(vld1_u8(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) {
  return Vec64<uint16_t>(vld1_u16(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> LoadU(D /* tag */, const uint32_t* HWY_RESTRICT p) {
  return Vec64<uint32_t>(vld1_u32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
HWY_API Vec64<uint64_t> LoadU(D /* tag */, const uint64_t* HWY_RESTRICT p) {
  return Vec64<uint64_t>(vld1_u64(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {
  return Vec64<int8_t>(vld1_s8(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
  return Vec64<int16_t>(vld1_s16(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> LoadU(D /* tag */, const int32_t* HWY_RESTRICT p) {
  return Vec64<int32_t>(vld1_s32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API Vec64<int64_t> LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) {
  return Vec64<int64_t>(vld1_s64(p));
}
// f16/bf16 variants are compiler-feature gated, as for the 128-bit loads.
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API Vec64<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
  return Vec64<float16_t>(vld1_f16(detail::NativeLanePointer(p)));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
HWY_API Vec64<bfloat16_t> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
  return Vec64<bfloat16_t>(vld1_bf16(detail::NativeLanePointer(p)));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
  return Vec64<float>(vld1_f32(p));
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
HWY_API Vec64<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) {
  return Vec64<double>(vld1_f64(p));
}
#endif  // HWY_HAVE_FLOAT64
   3806 
// ------------------------------ Load 32

// Actual 32-bit broadcast load - used to implement the other lane types
// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> LoadU(D /*tag*/, const uint32_t* HWY_RESTRICT p) {
  return Vec32<uint32_t>(vld1_dup_u32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> LoadU(D /*tag*/, const int32_t* HWY_RESTRICT p) {
  return Vec32<int32_t>(vld1_dup_s32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API Vec32<float> LoadU(D /*tag*/, const float* HWY_RESTRICT p) {
  return Vec32<float>(vld1_dup_f32(p));
}

// {u,i}{8,16}
// Narrow lane types: copy the 4 bytes into a u32 (avoids aliasing/alignment
// issues), broadcast-load that, and bitcast back to the requested lanes.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
          HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf;
  CopyBytes<4>(p, &buf);
  return BitCast(d, LoadU(d32, &buf));
}

// f16/bf16 use the same 4-byte copy + bitcast approach.
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf;
  CopyBytes<4>(p, &buf);
  return BitCast(d, LoadU(d32, &buf));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf;
  CopyBytes<4>(p, &buf);
  return BitCast(d, LoadU(d32, &buf));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
// ------------------------------ Load 16

// Actual 16-bit broadcast load - used to implement the other lane types
// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_U16_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_u16(p));
}
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I16_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_s16(p));
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_F16_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_f16(detail::NativeLanePointer(p)));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_bf16(detail::NativeLanePointer(p)));
}
#endif  // HWY_NEON_HAVE_BFLOAT16

// 8-bit x2: copy two bytes into a u16 scalar, load via the u16 overload
// above, then bitcast to the requested 8-bit lane type.
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint16_t, decltype(d)> d16;
  uint16_t buf;
  CopyBytes<2>(p, &buf);
  return BitCast(d, LoadU(d16, &buf));
}
   3886 
// ------------------------------ Load 8
// Single-lane 8-bit loads via broadcast; only lane 0 is meaningful.
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_U8_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_u8(p));
}
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I8_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_s8(p));
}
   3896 
// ------------------------------ Load misc

// Lane types the target emulates (e.g. f16/bf16 without native support):
// load the bit pattern through the corresponding unsigned type.
// U16LanePointer presumably reinterprets p as uint16_t* -- see its definition.
template <class D, HWY_NEON_IF_EMULATED_D(D)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
}

// On Arm, Load is the same as LoadU.
template <class D>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
  return LoadU(d, p);
}

// Loads all lanes, then zeroes the lanes whose mask bit is false. Note: this
// reads all lanes regardless of the mask, so the full vector must be readable.
template <class D>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
                             const TFromD<D>* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

// As MaskedLoad, but false lanes take their value from v instead of zero.
template <class D>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const TFromD<D>* HWY_RESTRICT aligned) {
  return IfThenElse(m, Load(d, aligned), v);
}

// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  return LoadU(d, p);
}
   3928 
// ------------------------------ Store 128

// Full-vector unaligned stores: each overload wraps the matching vst1q_*
// intrinsic for the lane type selected by the HWY_IF_*_D tag.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API void StoreU(Vec128<uint8_t> v, D /* tag */,
                    uint8_t* HWY_RESTRICT unaligned) {
  vst1q_u8(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API void StoreU(Vec128<uint16_t> v, D /* tag */,
                    uint16_t* HWY_RESTRICT unaligned) {
  vst1q_u16(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API void StoreU(Vec128<uint32_t> v, D /* tag */,
                    uint32_t* HWY_RESTRICT unaligned) {
  vst1q_u32(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API void StoreU(Vec128<uint64_t> v, D /* tag */,
                    uint64_t* HWY_RESTRICT unaligned) {
  vst1q_u64(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API void StoreU(Vec128<int8_t> v, D /* tag */,
                    int8_t* HWY_RESTRICT unaligned) {
  vst1q_s8(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API void StoreU(Vec128<int16_t> v, D /* tag */,
                    int16_t* HWY_RESTRICT unaligned) {
  vst1q_s16(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API void StoreU(Vec128<int32_t> v, D /* tag */,
                    int32_t* HWY_RESTRICT unaligned) {
  vst1q_s32(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API void StoreU(Vec128<int64_t> v, D /* tag */,
                    int64_t* HWY_RESTRICT unaligned) {
  vst1q_s64(unaligned, v.raw);
}
#if HWY_HAVE_FLOAT16
// NativeLanePointer adapts hwy's float16_t* to the intrinsic's lane type.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API void StoreU(Vec128<float16_t> v, D /* tag */,
                    float16_t* HWY_RESTRICT unaligned) {
  vst1q_f16(detail::NativeLanePointer(unaligned), v.raw);
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
HWY_API void StoreU(Vec128<bfloat16_t> v, D /* tag */,
                    bfloat16_t* HWY_RESTRICT unaligned) {
  vst1q_bf16(detail::NativeLanePointer(unaligned), v.raw);
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API void StoreU(Vec128<float> v, D /* tag */,
                    float* HWY_RESTRICT unaligned) {
  vst1q_f32(unaligned, v.raw);
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API void StoreU(Vec128<double> v, D /* tag */,
                    double* HWY_RESTRICT unaligned) {
  vst1q_f64(unaligned, v.raw);
}
#endif  // HWY_HAVE_FLOAT64
   3997 
// ------------------------------ Store 64

// Half-vector unaligned stores: each overload wraps the matching non-q
// vst1_* intrinsic.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API void StoreU(Vec64<uint8_t> v, D /* tag */, uint8_t* HWY_RESTRICT p) {
  vst1_u8(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API void StoreU(Vec64<uint16_t> v, D /* tag */, uint16_t* HWY_RESTRICT p) {
  vst1_u16(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API void StoreU(Vec64<uint32_t> v, D /* tag */, uint32_t* HWY_RESTRICT p) {
  vst1_u32(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
HWY_API void StoreU(Vec64<uint64_t> v, D /* tag */, uint64_t* HWY_RESTRICT p) {
  vst1_u64(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
HWY_API void StoreU(Vec64<int8_t> v, D /* tag */, int8_t* HWY_RESTRICT p) {
  vst1_s8(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API void StoreU(Vec64<int16_t> v, D /* tag */, int16_t* HWY_RESTRICT p) {
  vst1_s16(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API void StoreU(Vec64<int32_t> v, D /* tag */, int32_t* HWY_RESTRICT p) {
  vst1_s32(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API void StoreU(Vec64<int64_t> v, D /* tag */, int64_t* HWY_RESTRICT p) {
  vst1_s64(p, v.raw);
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API void StoreU(Vec64<float16_t> v, D /* tag */,
                    float16_t* HWY_RESTRICT p) {
  vst1_f16(detail::NativeLanePointer(p), v.raw);
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
HWY_API void StoreU(Vec64<bfloat16_t> v, D /* tag */,
                    bfloat16_t* HWY_RESTRICT p) {
  vst1_bf16(detail::NativeLanePointer(p), v.raw);
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API void StoreU(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
  vst1_f32(p, v.raw);
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
HWY_API void StoreU(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) {
  vst1_f64(p, v.raw);
}
#endif  // HWY_HAVE_FLOAT64
   4056 
   4057 // ------------------------------ Store 32
   4058 
   4059 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
   4060 HWY_API void StoreU(Vec32<uint32_t> v, D, uint32_t* HWY_RESTRICT p) {
   4061  vst1_lane_u32(p, v.raw, 0);
   4062 }
   4063 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
   4064 HWY_API void StoreU(Vec32<int32_t> v, D, int32_t* HWY_RESTRICT p) {
   4065  vst1_lane_s32(p, v.raw, 0);
   4066 }
   4067 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
   4068 HWY_API void StoreU(Vec32<float> v, D, float* HWY_RESTRICT p) {
   4069  vst1_lane_f32(p, v.raw, 0);
   4070 }
   4071 
   4072 // {u,i}{8,16}
   4073 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
   4074          HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
   4075 HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   4076  Repartition<uint32_t, decltype(d)> d32;
   4077  uint32_t buf = GetLane(BitCast(d32, v));
   4078  CopyBytes<4>(&buf, p);
   4079 }
   4080 
   4081 #if HWY_HAVE_FLOAT16
   4082 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
   4083 HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   4084  Repartition<uint32_t, decltype(d)> d32;
   4085  uint32_t buf = GetLane(BitCast(d32, v));
   4086  CopyBytes<4>(&buf, p);
   4087 }
   4088 #endif
   4089 #if HWY_NEON_HAVE_BFLOAT16
   4090 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
   4091 HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
   4092  Repartition<uint32_t, decltype(d)> d32;
   4093  uint32_t buf = GetLane(BitCast(d32, v));
   4094  CopyBytes<4>(&buf, p);
   4095 }
   4096 #endif  // HWY_NEON_HAVE_BFLOAT16
   4097 
// ------------------------------ Store 16

// 16-bit single-lane stores: write lane 0 only via vst1_lane_*.
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U16_D(D)>
HWY_API void StoreU(Vec16<uint16_t> v, D, uint16_t* HWY_RESTRICT p) {
  vst1_lane_u16(p, v.raw, 0);
}
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I16_D(D)>
HWY_API void StoreU(Vec16<int16_t> v, D, int16_t* HWY_RESTRICT p) {
  vst1_lane_s16(p, v.raw, 0);
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_F16_D(D)>
HWY_API void StoreU(Vec16<float16_t> v, D, float16_t* HWY_RESTRICT p) {
  vst1_lane_f16(detail::NativeLanePointer(p), v.raw, 0);
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_BF16_D(D)>
HWY_API void StoreU(Vec16<bfloat16_t> v, D, bfloat16_t* HWY_RESTRICT p) {
  vst1_lane_bf16(detail::NativeLanePointer(p), v.raw, 0);
}
#endif  // HWY_NEON_HAVE_BFLOAT16

// 8-bit x2: bitcast to u16, extract lane 0, copy the 2 bytes out.
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint16_t, decltype(d)> d16;
  const uint16_t buf = GetLane(BitCast(d16, v));
  CopyBytes<2>(&buf, p);
}
   4127 
// ------------------------------ Store 8

// Single-lane 8-bit stores: write lane 0 only.
template <class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_U8_D(D)>
HWY_API void StoreU(Vec128<uint8_t, 1> v, D, uint8_t* HWY_RESTRICT p) {
  vst1_lane_u8(p, v.raw, 0);
}
template <class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_I8_D(D)>
HWY_API void StoreU(Vec128<int8_t, 1> v, D, int8_t* HWY_RESTRICT p) {
  vst1_lane_s8(p, v.raw, 0);
}
   4138 
// ------------------------------ Store misc

// Emulated lane types (e.g. f16/bf16 without native support): store the bit
// pattern through the corresponding unsigned type.
template <class D, HWY_NEON_IF_EMULATED_D(D)>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  return StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
}

HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif

// On Arm, Store is the same as StoreU.
template <class D>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  StoreU(v, d, aligned);
}

HWY_DIAGNOSTICS(pop)

// Writes only the lanes whose mask bit is true; false lanes keep their
// previous memory contents. Implemented as a read-modify-write of the full
// vector, so all of [p, p + Lanes(d)) must be readable and writable.
template <class D>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  // Treat as unsigned so that we correctly support float16.
  const RebindToUnsigned<decltype(d)> du;
  const auto blended =
      IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
  StoreU(BitCast(d, blended), d, p);
}

// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

template <class D>
HWY_API void Stream(const VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
#if HWY_ARCH_ARM_A64
  // Hint that the line will be written and not reused (streaming behavior);
  // the store itself is a normal Store below.
#if HWY_COMPILER_GCC
  __builtin_prefetch(aligned, 1, 0);
#elif HWY_COMPILER_MSVC
  __prefetch2(aligned, 0x11);
#endif
#endif
  Store(v, d, aligned);
}
   4185 
// ================================================== CONVERT

// ------------------------------ ConvertTo

// Integer -> floating-point conversions (exact per-lane vcvt*).

#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16

// TODO(janwas): use macro generator instead of handwritten
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> ConvertTo(D /* tag */, Vec128<int16_t> v) {
  return Vec128<float16_t>(vcvtq_f16_s16(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vcvt_f16_s16(v.raw));
}

template <class D, HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> ConvertTo(D /* tag */, Vec128<uint16_t> v) {
  return Vec128<float16_t>(vcvtq_f16_u16(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vcvt_f16_u16(v.raw));
}

#endif  // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> ConvertTo(D /* tag */, Vec128<int32_t> v) {
  return Vec128<float>(vcvtq_f32_s32(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToSigned<D>> v) {
  return VFromD<D>(vcvt_f32_s32(v.raw));
}

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> ConvertTo(D /* tag */, Vec128<uint32_t> v) {
  return Vec128<float>(vcvtq_f32_u32(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToUnsigned<D>> v) {
  return VFromD<D>(vcvt_f32_u32(v.raw));
}

#if HWY_HAVE_FLOAT64

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<int64_t> v) {
  return Vec128<double>(vcvtq_f64_s64(v.raw));
}
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<int64_t> v) {
// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  // Scalar fallback: single lane, so GetLane + Set is equivalent.
  return Set(Full64<double>(), static_cast<double>(GetLane(v)));
#else
  return Vec64<double>(vcvt_f64_s64(v.raw));
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<uint64_t> v) {
  return Vec128<double>(vcvtq_f64_u64(v.raw));
}
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) {
  // GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return Set(Full64<double>(), static_cast<double>(GetLane(v)));
#else
  return Vec64<double>(vcvt_f64_u64(v.raw));
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
}

#endif  // HWY_HAVE_FLOAT64
   4262 
namespace detail {
// Float -> integer conversions. All variants truncate (round toward zero).
// The inline-assembly paths exist because older Clang (and Armv7) lower the
// vcvt* intrinsics in a way that is UB for out-of-range inputs, whereas the
// hardware instructions saturate.

// Truncates (rounds toward zero).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_INLINE Vec128<int32_t> ConvertFToI(D /* tag */, Vec128<float> v) {
#if HWY_COMPILER_CLANG && \
    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
  // outside of the range of an int32_t.

  int32x4_t raw_result;
  __asm__(
#if HWY_ARCH_ARM_A64
      "fcvtzs %0.4s, %1.4s"
#else
      "vcvt.s32.f32 %0, %1"
#endif
      : "=w"(raw_result)
      : "w"(v.raw));
  return Vec128<int32_t>(raw_result);
#else
  return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<RebindToFloat<D>> v) {
#if HWY_COMPILER_CLANG && \
    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
  // outside of the range of an int32_t.

  int32x2_t raw_result;
  __asm__(
#if HWY_ARCH_ARM_A64
      "fcvtzs %0.2s, %1.2s"
#else
      "vcvt.s32.f32 %0, %1"
#endif
      : "=w"(raw_result)
      : "w"(v.raw));
  return VFromD<D>(raw_result);
#else
  return VFromD<D>(vcvt_s32_f32(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_INLINE Vec128<uint32_t> ConvertFToU(D /* tag */, Vec128<float> v) {
#if HWY_COMPILER_CLANG && \
    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
  // outside of the range of an uint32_t.

  uint32x4_t raw_result;
  __asm__(
#if HWY_ARCH_ARM_A64
      "fcvtzu %0.4s, %1.4s"
#else
      "vcvt.u32.f32 %0, %1"
#endif
      : "=w"(raw_result)
      : "w"(v.raw));
  return Vec128<uint32_t>(raw_result);
#else
  return Vec128<uint32_t>(vcvtq_u32_f32(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<RebindToFloat<D>> v) {
#if HWY_COMPILER_CLANG && \
    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
  // outside of the range of an uint32_t.

  uint32x2_t raw_result;
  __asm__(
#if HWY_ARCH_ARM_A64
      "fcvtzu %0.2s, %1.2s"
#else
      "vcvt.u32.f32 %0, %1"
#endif
      : "=w"(raw_result)
      : "w"(v.raw));
  return VFromD<D>(raw_result);
#else
  return VFromD<D>(vcvt_u32_f32(v.raw));
#endif
}

#if HWY_HAVE_FLOAT64

// Truncates (rounds toward zero).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_INLINE Vec128<int64_t> ConvertFToI(D /* tag */, Vec128<double> v) {
#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
  int64x2_t raw_result;
  __asm__("fcvtzs %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
  return Vec128<int64_t>(raw_result);
#else
  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
HWY_INLINE Vec64<int64_t> ConvertFToI(D /* tag */, Vec64<double> v) {
#if HWY_ARCH_ARM_A64 &&                                            \
    ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
  // If compiling for AArch64 NEON with GCC 6 or earlier, use inline assembly to
  // work around the missing vcvt_s64_f64 intrinsic.
  int64x1_t raw_result;
  __asm__("fcvtzs %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
  return Vec64<int64_t>(raw_result);
#else
  return Vec64<int64_t>(vcvt_s64_f64(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_INLINE Vec128<uint64_t> ConvertFToU(D /* tag */, Vec128<double> v) {
#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an uint64_t.
  uint64x2_t raw_result;
  __asm__("fcvtzu %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
  return Vec128<uint64_t>(raw_result);
#else
  return Vec128<uint64_t>(vcvtq_u64_f64(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
HWY_INLINE Vec64<uint64_t> ConvertFToU(D /* tag */, Vec64<double> v) {
#if HWY_ARCH_ARM_A64 &&                                            \
    ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an uint64_t.

  // Inline assembly is also used if compiling for AArch64 NEON with GCC 6 or
  // earlier to work around the issue of the missing vcvt_u64_f64 intrinsic.
  uint64x1_t raw_result;
  __asm__("fcvtzu %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
  return Vec64<uint64_t>(raw_result);
#else
  return Vec64<uint64_t>(vcvt_u64_f64(v.raw));
#endif
}

#endif  // HWY_HAVE_FLOAT64

#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16

// Truncates (rounds toward zero).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_INLINE Vec128<int16_t> ConvertFToI(D /* tag */, Vec128<float16_t> v) {
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
  int16x8_t raw_result;
  __asm__("fcvtzs %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
  return Vec128<int16_t>(raw_result);
#else
  return Vec128<int16_t>(vcvtq_s16_f16(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
  int16x4_t raw_result;
  __asm__("fcvtzs %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
  return VFromD<D>(raw_result);
#else
  return VFromD<D>(vcvt_s16_f16(v.raw));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_INLINE Vec128<uint16_t> ConvertFToU(D /* tag */, Vec128<float16_t> v) {
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an uint16_t.
  uint16x8_t raw_result;
  __asm__("fcvtzu %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
  return Vec128<uint16_t>(raw_result);
#else
  return Vec128<uint16_t>(vcvtq_u16_f16(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an uint16_t.
  uint16x4_t raw_result;
  __asm__("fcvtzu %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
  return VFromD<D>(raw_result);
#else
  return VFromD<D>(vcvt_u16_f16(v.raw));
#endif
}

#endif  // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
}  // namespace detail
   4472 
// Public float -> signed-integer ConvertTo: dispatches to the truncating
// detail::ConvertFToI overloads. The HWY_IF_T_SIZE_ONE_OF_D bitmask enables
// 32-bit lanes always, 16-bit only with native f16, 64-bit only with f64.
template <class D, HWY_IF_SIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(
              D, (1 << 4) |
                     ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
                     (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
  return detail::ConvertFToI(di, v);
}

// Public float -> unsigned-integer ConvertTo; same gating as above.
template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(
              D, (1 << 4) |
                     ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
                     (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
HWY_API VFromD<D> ConvertTo(D du, VFromD<RebindToFloat<D>> v) {
  return detail::ConvertFToU(du, v);
}
   4490 
// ------------------------------ PromoteTo (ConvertTo)

// Unsigned: zero-extend to full vector. vmovl_* widens each lane to twice the
// width; two chained vmovl widen by 4x (u8 -> u32).
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> PromoteTo(D /* tag */, Vec64<uint8_t> v) {
  return Vec128<uint16_t>(vmovl_u8(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> PromoteTo(D /* tag */, Vec32<uint8_t> v) {
  uint16x8_t a = vmovl_u8(v.raw);
  return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> PromoteTo(D /* tag */, Vec64<uint16_t> v) {
  return Vec128<uint32_t>(vmovl_u16(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> PromoteTo(D /* tag */, Vec64<uint32_t> v) {
  return Vec128<uint64_t>(vmovl_u32(v.raw));
}
// Unsigned source, signed destination: zero-extension also yields the correct
// signed value, so bitcast the widened unsigned result.
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> PromoteTo(D d, Vec64<uint8_t> v) {
  return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D d, Vec32<uint8_t> v) {
  uint16x8_t a = vmovl_u8(v.raw);
  return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D d, Vec64<uint16_t> v) {
  return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> PromoteTo(D d, Vec64<uint32_t> v) {
  return BitCast(d, Vec128<uint64_t>(vmovl_u32(v.raw)));
}

// Unsigned: zero-extend to half vector. vmovl produces a full 128-bit vector;
// vget_low_* keeps only the lower half we need.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
  return VFromD<D>(vget_low_u16(vmovl_u8(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
  return VFromD<D>(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(v.raw)))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vget_low_u32(vmovl_u16(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  return VFromD<D>(vget_low_u64(vmovl_u32(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
  using VU16 = VFromD<RebindToUnsigned<D>>;
  return BitCast(d, VU16(vget_low_u16(vmovl_u8(v.raw))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
  const uint32x4_t u32 = vmovl_u16(vget_low_u16(vmovl_u8(v.raw)));
  return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(u32)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(vmovl_u16(v.raw))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint32_t, D>> v) {
  using DU = RebindToUnsigned<D>;
  return BitCast(d, VFromD<DU>(vget_low_u64(vmovl_u32(v.raw))));
}
   4565 
// U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to
// TFromD<D>. There is no single-step NEON widening from 8/16 to 64 bits, so
// chain two PromoteTo calls through a u32 intermediate.
template <class D, class V, HWY_IF_UI64_D(D),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> PromoteTo(D d, V v) {
  const Rebind<uint32_t, decltype(d)> du32;
  return PromoteTo(d, PromoteTo(du32, v));
}
   4575 
// Signed: replicate sign bit to full vector.
// I8 -> I16: one sign-extending widening move.
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> PromoteTo(D /* tag */, Vec64<int8_t> v) {
  return Vec128<int16_t>(vmovl_s8(v.raw));
}
// I8 -> I32: sign-extend twice (i8->i16->i32); only the low half of the
// intermediate i16 vector holds the 4 input lanes.
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D /* tag */, Vec32<int8_t> v) {
  int16x8_t a = vmovl_s8(v.raw);
  return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
}
// I16 -> I32.
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D /* tag */, Vec64<int16_t> v) {
  return Vec128<int32_t>(vmovl_s16(v.raw));
}
// I32 -> I64.
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> PromoteTo(D /* tag */, Vec64<int32_t> v) {
  return Vec128<int64_t>(vmovl_s32(v.raw));
}
   4594 
// Signed: replicate sign bit to half vector.
// As above, vmovl_* always returns a 128-bit result; keep only the low half
// for these <=64-bit destinations.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
  return VFromD<D>(vget_low_s16(vmovl_s8(v.raw)));
}
// I8 -> I32 (two sign-extending steps).
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
  return VFromD<D>(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(v.raw)))));
}
// I16 -> I32.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vget_low_s32(vmovl_s16(v.raw)));
}
// I32 -> I64 (single lane).
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>(vget_low_s64(vmovl_s32(v.raw)));
}
   4612 
// I8/I16 to I64: First, promote to I32, and then promote to I64. No
// single-step NEON sign-extension from 8/16 to 64 bits exists.
template <class D, class V, HWY_IF_I64_D(D),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> PromoteTo(D d, V v) {
  const Rebind<int32_t, decltype(d)> di32;
  return PromoteTo(d, PromoteTo(di32, v));
}
   4621 
   4622 #if HWY_NEON_HAVE_F16C
   4623 
   4624 // Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
   4625 #ifdef HWY_NATIVE_F16C
   4626 #undef HWY_NATIVE_F16C
   4627 #else
   4628 #define HWY_NATIVE_F16C
   4629 #endif
   4630 
// F16 -> F32: hardware conversion (requires F16C support, see guard above).
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> PromoteTo(D /* tag */, Vec64<float16_t> v) {
  return Vec128<float>(vcvt_f32_f16(v.raw));
}
// F16 -> F32 for <=2 lanes: convert all four, keep the low half.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
  return VFromD<D>(vget_low_f32(vcvt_f32_f16(v.raw)));
}
   4639 
   4640 #endif  // HWY_NEON_HAVE_F16C
   4641 
   4642 #if HWY_HAVE_FLOAT64
   4643 
// F32 -> F64: hardware conversion of the two lanes.
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<float> v) {
  return Vec128<double>(vcvt_f64_f32(v.raw));
}

// F32 -> F64, single lane: convert both, keep the low result.
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> PromoteTo(D /* tag */, Vec32<float> v) {
  return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
}

// I32 -> F64: sign-extend to i64, then convert. Exact: every i32 is
// representable in f64.
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<int32_t> v) {
  const int64x2_t i64 = vmovl_s32(v.raw);
  return Vec128<double>(vcvtq_f64_s64(i64));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> PromoteTo(D d, Vec32<int32_t> v) {
  return ConvertTo(d, Vec64<int64_t>(vget_low_s64(vmovl_s32(v.raw))));
}

// U32 -> F64: zero-extend to u64, then convert.
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<uint32_t> v) {
  const uint64x2_t u64 = vmovl_u32(v.raw);
  return Vec128<double>(vcvtq_f64_u64(u64));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> PromoteTo(D d, Vec32<uint32_t> v) {
  return ConvertTo(d, Vec64<uint64_t>(vget_low_u64(vmovl_u32(v.raw))));
}

// F32 -> I64/U64: widen to f64 first, then convert to integer.
template <class D, HWY_IF_UI64_D(D)>
HWY_API VFromD<D> PromoteTo(D d64, VFromD<Rebind<float, D>> v) {
  const RebindToFloat<decltype(d64)> df64;
  return ConvertTo(d64, PromoteTo(df64, v));
}
   4681 
   4682 #else  // !HWY_HAVE_FLOAT64
   4683 
// F32 -> I64 without native f64 (Armv7): scale the input down so the f32->i32
// conversion cannot overflow, then shift the integer result back up.
template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
  const Rebind<int32_t, decltype(di64)> di32;
  const RebindToFloat<decltype(di32)> df32;
  const RebindToUnsigned<decltype(di32)> du32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;

  // Per-lane exponent adjustment: biased exponent minus 157, saturated at 0
  // and clamped to 32. NOTE(review): 157 appears to be the IEEE-754 bias
  // (127) + 30, i.e. the largest exponent that still fits in i32 after
  // conversion — inferred from the constants; confirm against the u64
  // variant below which uses 158.
  const auto exponent_adj = BitCast(
      du32,
      Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
                       BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
          BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
  // Subtracting from the exponent field divides the value by 2^exponent_adj.
  const auto adj_v =
      BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));

  const auto f32_to_i32_result = ConvertTo(di32, adj_v);
  // Lanes whose i32 conversion saturated to INT32_MAX get all low bits set in
  // the final result via this OR mask.
  const auto lo64_or_mask = PromoteTo(
      di64,
      BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result,
                                         Set(di32, LimitsMax<int32_t>())))));

  // Undo the scaling with a left shift in the 64-bit domain.
  return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result))
                << PromoteTo(di64, exponent_adj),
            lo64_or_mask);
}
   4709 
// F32 -> U64 without native f64 (Armv7): same scale-down / convert / shift-up
// scheme as the I64 overload above, but with an unsigned i32 range, hence the
// exponent threshold of 158 (= 157 + 1 extra bit of unsigned range).
template <class D, HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D du64, VFromD<Rebind<float, D>> v) {
  const Rebind<uint32_t, decltype(du64)> du32;
  const RebindToFloat<decltype(du32)> df32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;

  // Biased exponent minus 158, saturated at 0, clamped to 32.
  const auto exponent_adj = BitCast(
      du32,
      Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
                       BitCast(du32_as_du8, Set(du32, uint32_t{158}))),
          BitCast(du32_as_du8, Set(du32, uint32_t{32}))));

  // Divide by 2^exponent_adj by editing the exponent field directly.
  const auto adj_v =
      BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));
  const auto f32_to_u32_result = ConvertTo(du32, adj_v);
  // Saturated lanes (== UINT32_MAX) have their low result bits forced to 1.
  const auto lo32_or_mask = PromoteTo(
      du64,
      VecFromMask(du32, f32_to_u32_result == Set(du32, LimitsMax<uint32_t>())));

  return Or(PromoteTo(du64, f32_to_u32_result) << PromoteTo(du64, exponent_adj),
            lo32_or_mask);
}
   4732 
   4733 #ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   4734 #undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   4735 #else
   4736 #define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
   4737 #endif
   4738 
// F32 -> I64/U64 for inputs known to be in range (no saturation handling):
// scale down via the exponent field, convert, then shift back up.
template <class D, HWY_IF_UI64_D(D)>
HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
  const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
  const RebindToFloat<decltype(d32)> df32;
  const RebindToUnsigned<decltype(d32)> du32;
  const Repartition<uint8_t, decltype(d32)> du32_as_du8;

  // 0xFFFFFF9D == -99 mod 2^32; as a u8 saturating subtrahend this encodes
  // the 157 (signed) / 158 (unsigned) exponent threshold used by PromoteTo
  // above. NOTE(review): inferred from the PromoteTo constants — confirm.
  constexpr uint32_t kExpAdjDecr =
      0xFFFFFF9Du + static_cast<uint32_t>(!IsSigned<TFromD<D>>());

  const auto exponent_adj = BitCast(
      du32, SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
                         BitCast(du32_as_du8, Set(du32, kExpAdjDecr))));
  // Divide by 2^exponent_adj, convert in 32-bit, then widen and shift back.
  const auto adj_v =
      BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));

  return PromoteTo(d64, ConvertTo(d32, adj_v)) << PromoteTo(d64, exponent_adj);
}
   4757 
   4758 #endif  // HWY_HAVE_FLOAT64
   4759 
   4760 // ------------------------------ PromoteEvenTo/PromoteOddTo
   4761 #include "hwy/ops/inside-inl.h"
   4762 
   4763 // ------------------------------ PromoteUpperTo
   4764 
   4765 #if HWY_ARCH_ARM_A64
   4766 
   4767 // Per-target flag to prevent generic_ops-inl.h from defining PromoteUpperTo.
   4768 #ifdef HWY_NATIVE_PROMOTE_UPPER_TO
   4769 #undef HWY_NATIVE_PROMOTE_UPPER_TO
   4770 #else
   4771 #define HWY_NATIVE_PROMOTE_UPPER_TO
   4772 #endif
   4773 
// Unsigned: zero-extend to full vector.
// The *_high intrinsics widen the UPPER half of the 128-bit input, avoiding a
// separate UpperHalf extraction.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> PromoteUpperTo(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint16_t>(vmovl_high_u8(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> PromoteUpperTo(D /* tag */, Vec128<uint16_t> v) {
  return Vec128<uint32_t>(vmovl_high_u16(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> PromoteUpperTo(D /* tag */, Vec128<uint32_t> v) {
  return Vec128<uint64_t>(vmovl_high_u32(v.raw));
}
// Unsigned source, signed destination: zero-extension is also correct for
// the signed result; widen unsigned and BitCast.
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> PromoteUpperTo(D d, Vec128<uint8_t> v) {
  return BitCast(d, Vec128<uint16_t>(vmovl_high_u8(v.raw)));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteUpperTo(D d, Vec128<uint16_t> v) {
  return BitCast(d, Vec128<uint32_t>(vmovl_high_u16(v.raw)));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> PromoteUpperTo(D d, Vec128<uint32_t> v) {
  return BitCast(d, Vec128<uint64_t>(vmovl_high_u32(v.raw)));
}
   4799 
// Signed: replicate sign bit to full vector.
// Sign-extending widen of the upper half via the *_high intrinsics.
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> PromoteUpperTo(D /* tag */, Vec128<int8_t> v) {
  return Vec128<int16_t>(vmovl_high_s8(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteUpperTo(D /* tag */, Vec128<int16_t> v) {
  return Vec128<int32_t>(vmovl_high_s16(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) {
  return Vec128<int64_t>(vmovl_high_s32(v.raw));
}
   4813 
   4814 #if HWY_NEON_HAVE_F16C
   4815 
// F16 upper half -> F32: hardware conversion of the high 4 lanes.
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> PromoteUpperTo(D /* tag */, Vec128<float16_t> v) {
  return Vec128<float>(vcvt_high_f32_f16(v.raw));
}
   4820 
   4821 #endif  // HWY_NEON_HAVE_F16C
   4822 
// BF16 upper half -> F32: bf16 is the upper 16 bits of an f32, so widening
// the u16 bits and shifting left by 16 reconstructs the float exactly.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<bfloat16_t, D>> v) {
  const Repartition<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v))));
}
   4829 
   4830 #if HWY_HAVE_FLOAT64
   4831 
// F32 upper half -> F64: hardware conversion of the high 2 lanes.
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteUpperTo(D /* tag */, Vec128<float> v) {
  return Vec128<double>(vcvt_high_f64_f32(v.raw));
}

// I32 upper half -> F64: sign-extend to i64, then convert (exact).
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) {
  const int64x2_t i64 = vmovl_high_s32(v.raw);
  return Vec128<double>(vcvtq_f64_s64(i64));
}

// U32 upper half -> F64: zero-extend to u64, then convert (exact).
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteUpperTo(D /* tag */, Vec128<uint32_t> v) {
  const uint64x2_t u64 = vmovl_high_u32(v.raw);
  return Vec128<double>(vcvtq_f64_u64(u64));
}
   4848 
   4849 #endif  // HWY_HAVE_FLOAT64
   4850 
   4851 template <class D, HWY_IF_UI64_D(D)>
   4852 HWY_API VFromD<D> PromoteUpperTo(D d64, Vec128<float> v) {
   4853 #if HWY_HAVE_FLOAT64
   4854  const RebindToFloat<decltype(d64)> df64;
   4855  return ConvertTo(d64, PromoteUpperTo(df64, v));
   4856 #else
   4857  const Rebind<float, decltype(d)> dh;
   4858  return PromoteTo(d, UpperHalf(dh, v));
   4859 #endif
   4860 }
   4861 
// Generic version for <=64 bit input/output (_high is only for full vectors).
// Extract the upper half, then reuse the ordinary PromoteTo overloads.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteTo(d, UpperHalf(dh, v));
}
   4868 
   4869 #endif  // HWY_ARCH_ARM_A64
   4870 
   4871 // ------------------------------ DemoteTo (ConvertTo)
   4872 
// From full vector to half or quarter. All of these use the saturating
// narrowing intrinsics: vqmovn_* (same-signedness) or vqmovun_* (signed
// source to unsigned destination).
// I32 -> U16.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  return Vec64<uint16_t>(vqmovun_s32(v.raw));
}
// I32 -> I16.
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  return Vec64<int16_t>(vqmovn_s32(v.raw));
}
// I32 -> U8: two narrowing steps; the intermediate is duplicated into both
// halves because vqmovn_u16 requires a 128-bit input.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  const uint16x4_t a = vqmovun_s32(v.raw);
  return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
}
// I16 -> U8.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec128<int16_t> v) {
  return Vec64<uint8_t>(vqmovun_s16(v.raw));
}
// I32 -> I8 (two steps).
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec32<int8_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  const int16x4_t a = vqmovn_s32(v.raw);
  return Vec32<int8_t>(vqmovn_s16(vcombine_s16(a, a)));
}
// I16 -> I8.
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> DemoteTo(D /* tag */, Vec128<int16_t> v) {
  return Vec64<int8_t>(vqmovn_s16(v.raw));
}
// U32 -> U16.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec128<uint32_t> v) {
  return Vec64<uint16_t>(vqmovn_u32(v.raw));
}
// U32 -> U8 (two steps).
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec128<uint32_t> v) {
  const uint16x4_t a = vqmovn_u32(v.raw);
  return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
}
// U16 -> U8.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec128<uint16_t> v) {
  return Vec64<uint8_t>(vqmovn_u16(v.raw));
}
   4913 
// From half vector to partial half. The vqmov(u)n_* intrinsics require a
// 128-bit input, so the 64-bit input is duplicated into both halves first;
// the extra lanes are discarded by the narrower destination type.
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
}
// I32 -> U8 (two narrowing steps).
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
  return VFromD<D>(vqmovn_u16(vcombine_u16(a, a)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
}
// I32 -> I8 (two narrowing steps).
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
  return VFromD<D>(vqmovn_s16(vcombine_s16(a, a)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  return VFromD<D>(vqmovn_u32(vcombine_u32(v.raw, v.raw)));
}
// U32 -> U8 (two narrowing steps).
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const uint16x4_t a = vqmovn_u32(vcombine_u32(v.raw, v.raw));
  return VFromD<D>(vqmovn_u16(vcombine_u16(a, a)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vqmovn_u16(vcombine_u16(v.raw, v.raw)));
}
   4954 
// 64-bit -> 32-bit, saturating.
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<int64_t> v) {
  return Vec64<int32_t>(vqmovn_s64(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<int64_t> v) {
  return Vec64<uint32_t>(vqmovun_s64(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<uint64_t> v) {
  return Vec64<uint32_t>(vqmovn_u64(v.raw));
}
// 64-bit -> 8/16-bit: narrow to 32-bit first, then reuse the 32-bit demotes.
// Saturation composes correctly because each step saturates.
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_SIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D d, Vec128<int64_t> v) {
  const Rebind<int32_t, D> di32;
  return DemoteTo(d, DemoteTo(di32, v));
}
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_UNSIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D d, Vec128<int64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_UNSIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D d, Vec128<uint64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}
   4985 
// Single-lane 64-bit -> 32-bit: duplicate into a 128-bit vector because the
// vqmov(u)n_* intrinsics require a full-width input.
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<int64_t> v) {
  return Vec32<int32_t>(vqmovn_s64(vcombine_s64(v.raw, v.raw)));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<int64_t> v) {
  return Vec32<uint32_t>(vqmovun_s64(vcombine_s64(v.raw, v.raw)));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<uint64_t> v) {
  return Vec32<uint32_t>(vqmovn_u64(vcombine_u64(v.raw, v.raw)));
}
// Single-lane 64-bit -> 8/16-bit: go through 32-bit; each step saturates.
template <class D, HWY_IF_SIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> DemoteTo(D d, Vec64<int64_t> v) {
  const Rebind<int32_t, D> di32;
  return DemoteTo(d, DemoteTo(di32, v));
}
template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> DemoteTo(D d, Vec64<int64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> DemoteTo(D d, Vec64<uint64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}
   5016 
   5017 #if HWY_NEON_HAVE_F16C
   5018 
// We already toggled HWY_NATIVE_F16C above.

// F32 -> F16: hardware conversion of all four lanes.
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec64<float16_t> DemoteTo(D /* tag */, Vec128<float> v) {
  return Vec64<float16_t>{vcvt_f16_f32(v.raw)};
}
// F32 -> F16 for <=2 lanes: duplicate into a 128-bit input, convert, and let
// the narrower destination discard the extra lanes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  return VFromD<D>(vcvt_f16_f32(vcombine_f32(v.raw, v.raw)));
}
   5029 
   5030 #endif  // HWY_NEON_HAVE_F16C
   5031 
   5032 #if HWY_NEON_HAVE_F32_TO_BF16C
   5033 #ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
   5034 #undef HWY_NATIVE_DEMOTE_F32_TO_BF16
   5035 #else
   5036 #define HWY_NATIVE_DEMOTE_F32_TO_BF16
   5037 #endif
   5038 
namespace detail {
#if HWY_NEON_HAVE_BFLOAT16
// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
// bfloat16x4_t or bfloat16x8_t, so the raw value can be returned unchanged.
static HWY_INLINE bfloat16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
  return raw;
}
#else
// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
// detail::Vec128<bfloat16_t, N>::type is uint16x4_t or uint16x8_t vector to
// work around compiler bugs that are there with GCC 13 or earlier or Clang 16
// or earlier on AArch64.

// The bfloat16x4_t vector returned by vcvt_bf16_f32 needs to be bitcasted to
// an uint16x4_t vector if HWY_NEON_HAVE_F32_TO_BF16C &&
// !HWY_NEON_HAVE_BFLOAT16 is true.
static HWY_INLINE uint16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
  return vreinterpret_u16_bf16(raw);
}
#endif
}  // namespace detail
   5060 
// F32 -> BF16: hardware conversion; the raw-type adapter above papers over
// the bfloat16x4_t vs. uint16x4_t representation difference.
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
  return VFromD<D>(detail::BitCastFromRawNeonBF16(vcvt_bf16_f32(v.raw)));
}
// <=2 lanes: duplicate into a 128-bit input before converting.
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
  return VFromD<D>(detail::BitCastFromRawNeonBF16(
      vcvt_bf16_f32(vcombine_f32(v.raw, v.raw))));
}
   5070 #endif  // HWY_NEON_HAVE_F32_TO_BF16C
   5071 
   5072 #if HWY_HAVE_FLOAT64
   5073 
// F64 -> F32: hardware conversion.
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec64<float> DemoteTo(D /* tag */, Vec128<double> v) {
  return Vec64<float>(vcvt_f32_f64(v.raw));
}
// Single lane: duplicate to satisfy the 128-bit input requirement.
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
  return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
}

// F64 -> I32/U32: convert to the same-width 64-bit integer first, then
// demote (saturating) to 32 bits.
template <class D, HWY_IF_UI32_D(D)>
HWY_API VFromD<D> DemoteTo(D d32, VFromD<Rebind<double, D>> v) {
  const Rebind<MakeWide<TFromD<D>>, D> d64;
  return DemoteTo(d32, ConvertTo(d64, v));
}
   5088 
   5089 #endif  // HWY_HAVE_FLOAT64
   5090 
// I64 -> F32 with correct rounding. The 64-bit integer cannot be converted
// exactly, so the value is split into pieces that convert exactly and the
// rounding of the recombination is adjusted so the single-rounding result
// matches a direct i64 -> f32 conversion.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) {
  const Rebind<int64_t, decltype(df32)> di64;
  const RebindToUnsigned<decltype(di64)> du64;

#if HWY_ARCH_ARM_A64
  const RebindToFloat<decltype(du64)> df64;

  // 2^64 + 2^63 as f64: XOR-ing its bit pattern with the shifted-in sign
  // bits and subtracting recovers the (signed) high 52 bits exactly.
  const auto k2p64_63 = Set(df64, 27670116110564327424.0);
  const auto f64_hi52 =
      Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63;
  const auto f64_lo12 =
      ConvertTo(df64, And(BitCast(du64, v), Set(du64, uint64_t{0x00000FFF})));

  // Compensated (TwoSum-style) addition: f64_carry is the exact rounding
  // error of f64_sum.
  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;

  // If the sum was inexact, nudge the f64 bits and force the LSB so the
  // subsequent f64 -> f32 demotion rounds as if from the exact value
  // (round-to-odd style sticky bit).
  const auto f64_sum_is_inexact =
      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
  const auto f64_bits_decrement =
      And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
          f64_sum_is_inexact);

  const auto adj_f64_val = BitCast(
      df64,
      Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));

  return DemoteTo(df32, adj_f64_val);
#else
  const RebindToUnsigned<decltype(df32)> du32;
  // Armv7 (no f64): split the 64-bit value into 23+23+18 bits, each of which
  // converts to f32 exactly, then recombine with compensated summation.
  const auto hi23 = TruncateTo(du32, ShiftRight<41>(BitCast(du64, v)));
  const auto mid23 = And(TruncateTo(du32, ShiftRight<18>(BitCast(du64, v))),
                         Set(du32, uint32_t{0x007FFFFFu}));
  const auto lo18 =
      And(TruncateTo(du32, BitCast(du64, v)), Set(du32, uint32_t{0x0003FFFFu}));

  // 2^41 and 2^64 + 2^63 as f32: bit-level tricks to place the 23-bit pieces
  // at the right binary weight without rounding.
  const auto k2p41_f32 = Set(df32, 2199023255552.0f);
  const auto k2p64_63_f32 = Set(df32, 27670116110564327424.0f);

  const auto hi23_f32 =
      BitCast(df32, Xor(hi23, BitCast(du32, k2p64_63_f32))) - k2p64_63_f32;
  const auto mid23_f32 =
      BitCast(df32, Or(mid23, BitCast(du32, k2p41_f32))) - k2p41_f32;
  const auto lo18_f32 = ConvertTo(df32, lo18);

  // Two compensated additions; c_* capture the exact rounding errors.
  const auto s_hi46 = hi23_f32 + mid23_f32;
  const auto c_hi46 = (hi23_f32 - s_hi46) + mid23_f32;

  auto s_lo = c_hi46 + lo18_f32;
  const auto c_lo = (c_hi46 - s_lo) + lo18_f32;

  // Apply a sticky-bit adjustment to s_lo so the final addition rounds as if
  // computed from the exact value.
  const auto s_lo_inexact_mask =
      VecFromMask(du32, RebindMask(du32, c_lo != Zero(df32)));
  const auto s_lo_mag_adj = ShiftRight<31>(
      And(s_lo_inexact_mask, Xor(BitCast(du32, s_lo), BitCast(du32, c_lo))));

  s_lo = BitCast(df32, BitCast(du32, s_lo) - s_lo_mag_adj);
  s_lo =
      BitCast(df32, Or(BitCast(du32, s_lo), ShiftRight<31>(s_lo_inexact_mask)));
  return s_hi46 + s_lo;
#endif
}
   5153 
// U64 -> F32 with correct rounding; unsigned analogue of the I64 overload
// above (OR instead of XOR for the high bits, and 2^64 as the magic bias).
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
#if HWY_ARCH_ARM_A64
  const Rebind<uint64_t, decltype(df32)> du64;
  const RebindToFloat<decltype(du64)> df64;

  // 2^64 as f64: OR-ing its bit pattern with the high 52 bits and
  // subtracting recovers those bits exactly.
  const auto k2p64 = Set(df64, 18446744073709551616.0);
  const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
  const auto f64_lo12 =
      ConvertTo(df64, And(v, Set(du64, uint64_t{0x00000FFF})));

  // Compensated addition: f64_carry is the exact rounding error of f64_sum.
  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  const auto f64_sum_is_inexact =
      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));

  // Sticky-bit adjustment so the f64 -> f32 demote rounds as if from the
  // exact value.
  const auto adj_f64_val = BitCast(
      df64,
      Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)),
         f64_sum_is_inexact));

  return DemoteTo(df32, adj_f64_val);
#else
  const RebindToUnsigned<decltype(df32)> du32;

  // Armv7 (no f64): split into 23+23+18 bits (each exactly representable in
  // f32), recombine with compensated summation; see the I64 overload.
  const auto hi23 = TruncateTo(du32, ShiftRight<41>(v));
  const auto mid23 = And(TruncateTo(du32, ShiftRight<18>(v)),
                         Set(du32, uint32_t{0x007FFFFFu}));
  const auto lo18 = And(TruncateTo(du32, v), Set(du32, uint32_t{0x0003FFFFu}));

  const auto k2p41_f32 = Set(df32, 2199023255552.0f);
  const auto k2p64_f32 = Set(df32, 18446744073709551616.0f);

  const auto hi23_f32 =
      BitCast(df32, Or(hi23, BitCast(du32, k2p64_f32))) - k2p64_f32;
  const auto mid23_f32 =
      BitCast(df32, Or(mid23, BitCast(du32, k2p41_f32))) - k2p41_f32;
  const auto lo18_f32 = ConvertTo(df32, lo18);

  const auto s_hi46 = hi23_f32 + mid23_f32;
  const auto c_hi46 = (hi23_f32 - s_hi46) + mid23_f32;

  auto s_lo = c_hi46 + lo18_f32;
  const auto c_lo = (c_hi46 - s_lo) + lo18_f32;

  // Sticky-bit adjustment to s_lo for correct final rounding.
  const auto s_lo_inexact_mask =
      VecFromMask(du32, RebindMask(du32, c_lo != Zero(df32)));
  const auto s_lo_mag_adj = ShiftRight<31>(
      And(s_lo_inexact_mask, Xor(BitCast(du32, s_lo), BitCast(du32, c_lo))));

  s_lo = BitCast(df32, BitCast(du32, s_lo) - s_lo_mag_adj);
  s_lo =
      BitCast(df32, Or(BitCast(du32, s_lo), ShiftRight<31>(s_lo_inexact_mask)));
  return s_hi46 + s_lo;
#endif
}
   5210 
// Extracts the low byte of each u32 lane into a packed u8 vector. Two unzip
// (deinterleave) passes move byte 0 of each 32-bit lane to the front.
// NOTE(review): assumes the u32 lanes fit in 8 bits; high bytes are dropped,
// not saturated.
HWY_API Vec32<uint8_t> U8FromU32(Vec128<uint32_t> v) {
  const uint8x16_t org_v = detail::BitCastToByte(v).raw;
  const uint8x16_t w = vuzp1q_u8(org_v, org_v);
  return Vec32<uint8_t>(vget_low_u8(vuzp1q_u8(w, w)));
}
// Partial-vector variant using the 64-bit unzip intrinsics.
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
  const uint8x8_t org_v = detail::BitCastToByte(v).raw;
  const uint8x8_t w = vuzp1_u8(org_v, org_v);
  return Vec128<uint8_t, N>(vuzp1_u8(w, w));
}
   5222 
   5223 // ------------------------------ Round (IfThenElse, mask, logical)
   5224 
   5225 #if HWY_ARCH_ARM_A64
// A64 has direct rounding intrinsics; expand them for all float types.
// Toward nearest integer (ties to even), via vrndn.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)

// Toward zero, aka truncate, via vrnd.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)

// Toward +infinity, aka ceiling, via vrndp.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)

// Toward -infinity, aka floor, via vrndm.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
   5237 #else
   5238 
   5239 // ------------------------------ Trunc
   5240 
   5241 // Armv7 only supports truncation to integer. We can either convert back to
   5242 // float (3 floating-point and 2 logic operations) or manipulate the binary32
   5243 // representation, clearing the lowest 23-exp mantissa bits. This requires 9
   5244 // integer operations and 3 constants, which is likely more expensive.
   5245 
namespace detail {

// The original value is already the desired result if NaN or the magnitude is
// large (i.e. the value is already an integer). Returns true for lanes whose
// magnitude is below MantissaEnd (the first float with no fractional bits);
// NaN compares false and thus keeps the original value.
template <size_t N>
HWY_INLINE Mask128<float, N> UseInt(const Vec128<float, N> v) {
  return Abs(v) < Set(Simd<float, N, 0>(), MantissaEnd<float>());
}

}  // namespace detail
   5256 
   5257 template <size_t N>
   5258 HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
   5259  const DFromV<decltype(v)> df;
   5260  const RebindToSigned<decltype(df)> di;
   5261 
   5262  const auto integer = ConvertTo(di, v);  // round toward 0
   5263  const auto int_f = ConvertTo(df, integer);
   5264 
   5265  return IfThenElse(detail::UseInt(v), int_f, v);
   5266 }
   5267 
   5268 template <size_t N>
   5269 HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
   5270  const DFromV<decltype(v)> df;
   5271 
   5272  // Armv7 also lacks a native NearestInt, but we can instead rely on rounding
   5273  // (we assume the current mode is nearest-even) after addition with a large
   5274  // value such that no mantissa bits remain. We may need a compiler flag for
   5275  // precise floating-point to prevent this from being "optimized" out.
   5276  const auto max = Set(df, MantissaEnd<float>());
   5277  const auto large = CopySignToAbs(max, v);
   5278  const auto added = large + v;
   5279  const auto rounded = added - large;
   5280 
   5281  // Keep original if NaN or the magnitude is large (already an int).
   5282  return IfThenElse(Abs(v) < max, rounded, v);
   5283 }
   5284 
   5285 template <size_t N>
   5286 HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
   5287  const DFromV<decltype(v)> df;
   5288  const RebindToSigned<decltype(df)> di;
   5289 
   5290  const auto integer = ConvertTo(di, v);  // round toward 0
   5291  const auto int_f = ConvertTo(df, integer);
   5292 
   5293  // Truncating a positive non-integer ends up smaller; if so, add 1.
   5294  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));
   5295 
   5296  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
   5297 }
   5298 
   5299 template <size_t N>
   5300 HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
   5301  const DFromV<decltype(v)> df;
   5302  const RebindToSigned<decltype(df)> di;
   5303 
   5304  const auto integer = ConvertTo(di, v);  // round toward 0
   5305  const auto int_f = ConvertTo(df, integer);
   5306 
   5307  // Truncating a negative non-integer ends up larger; if so, subtract 1.
   5308  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));
   5309 
   5310  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
   5311 }
   5312 
   5313 #endif
   5314 
   5315 // ------------------------------ CeilInt/FloorInt
   5316 #if HWY_ARCH_ARM_A64
   5317 
   5318 #ifdef HWY_NATIVE_CEIL_FLOOR_INT
   5319 #undef HWY_NATIVE_CEIL_FLOOR_INT
   5320 #else
   5321 #define HWY_NATIVE_CEIL_FLOOR_INT
   5322 #endif
   5323 
#if HWY_HAVE_FLOAT16
// f16 -> i16, rounding toward +infinity (FCVTPS), full 128-bit vector.
HWY_API Vec128<int16_t> CeilInt(const Vec128<float16_t> v) {
  return Vec128<int16_t>(vcvtpq_s16_f16(v.raw));
}

// f16 -> i16 ceiling, partial (<= 64-bit) vectors.
template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
HWY_API Vec128<int16_t, N> CeilInt(const Vec128<float16_t, N> v) {
  return Vec128<int16_t, N>(vcvtp_s16_f16(v.raw));
}

// f16 -> i16, rounding toward -infinity (FCVTMS), full 128-bit vector.
HWY_API Vec128<int16_t> FloorInt(const Vec128<float16_t> v) {
  return Vec128<int16_t>(vcvtmq_s16_f16(v.raw));
}

// f16 -> i16 floor, partial vectors.
template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
HWY_API Vec128<int16_t, N> FloorInt(const Vec128<float16_t, N> v) {
  return Vec128<int16_t, N>(vcvtm_s16_f16(v.raw));
}
#endif  // HWY_HAVE_FLOAT16
   5343 
// f32 -> i32 with rounding toward +infinity, full vector.
HWY_API Vec128<int32_t> CeilInt(const Vec128<float> v) {
  return Vec128<int32_t>(vcvtpq_s32_f32(v.raw));
}

// f32 -> i32 ceiling, partial vectors.
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<int32_t, N> CeilInt(const Vec128<float, N> v) {
  return Vec128<int32_t, N>(vcvtp_s32_f32(v.raw));
}

// f64 -> i64 ceiling, full vector.
HWY_API Vec128<int64_t> CeilInt(const Vec128<double> v) {
  return Vec128<int64_t>(vcvtpq_s64_f64(v.raw));
}

// f64 -> i64 ceiling, single-lane vector.
template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
HWY_API Vec128<int64_t, N> CeilInt(const Vec128<double, N> v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
  // Workaround for missing vcvtp_s64_f64 intrinsic: widen the single lane
  // to a full vector, convert there, then take the lower half.
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const Twice<decltype(d)> dt;
  return LowerHalf(di, CeilInt(Combine(dt, v, v)));
#else
  return Vec128<int64_t, N>(vcvtp_s64_f64(v.raw));
#endif
}
   5369 
// f32 -> i32 with rounding toward -infinity, full vector.
HWY_API Vec128<int32_t> FloorInt(const Vec128<float> v) {
  return Vec128<int32_t>(vcvtmq_s32_f32(v.raw));
}

// f32 -> i32 floor, partial vectors.
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<int32_t, N> FloorInt(const Vec128<float, N> v) {
  return Vec128<int32_t, N>(vcvtm_s32_f32(v.raw));
}

// f64 -> i64 floor, full vector.
HWY_API Vec128<int64_t> FloorInt(const Vec128<double> v) {
  return Vec128<int64_t>(vcvtmq_s64_f64(v.raw));
}

// f64 -> i64 floor, single-lane vector.
template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
HWY_API Vec128<int64_t, N> FloorInt(const Vec128<double, N> v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
  // Workaround for missing vcvtm_s64_f64 intrinsic: widen to a full vector,
  // convert there, then take the lower half.
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const Twice<decltype(d)> dt;
  return LowerHalf(di, FloorInt(Combine(dt, v, v)));
#else
  return Vec128<int64_t, N>(vcvtm_s64_f64(v.raw));
#endif
}
   5395 
   5396 #endif  // HWY_ARCH_ARM_A64
   5397 
   5398 // ------------------------------ NearestInt (Round)
   5399 
#if HWY_HAVE_FLOAT16
// f16 -> i16, round to nearest with ties to even (FCVTNS).
HWY_API Vec128<int16_t> NearestInt(const Vec128<float16_t> v) {
  return Vec128<int16_t>(vcvtnq_s16_f16(v.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
HWY_API Vec128<int16_t, N> NearestInt(const Vec128<float16_t, N> v) {
  return Vec128<int16_t, N>(vcvtn_s16_f16(v.raw));
}
#endif

#if HWY_ARCH_ARM_A64

// f32 -> i32, round to nearest (ties to even).
HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
  return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
}

// f64 -> i64, round to nearest (ties to even), full vector.
HWY_API Vec128<int64_t> NearestInt(const Vec128<double> v) {
  return Vec128<int64_t>(vcvtnq_s64_f64(v.raw));
}

// f64 -> i64 nearest, single-lane vector.
template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
HWY_API Vec128<int64_t, N> NearestInt(const Vec128<double, N> v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
  // Workaround for missing vcvtn_s64_f64 intrinsic: widen to a full vector,
  // convert there, then take the lower half.
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const Twice<decltype(d)> dt;
  return LowerHalf(di, NearestInt(Combine(dt, v, v)));
#else
  return Vec128<int64_t, N>(vcvtn_s64_f64(v.raw));
#endif
}

// f64 -> i32: round to nearest in f64, then narrow to i32.
template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
                                        VFromD<Rebind<double, DI32>> v) {
  return DemoteTo(di32, NearestInt(v));
}

#else

// Armv7: no FCVTNS; reuse the add/subtract-based Round, then truncate.
template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  return ConvertTo(di, Round(v));
}

#endif
   5452 
   5453 // ------------------------------ Floating-point classification
   5454 
#if !HWY_COMPILER_CLANG || HWY_COMPILER_CLANG > 1801 || HWY_ARCH_ARM_V7
// NaN is the only value that does not compare equal to itself.
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  return v != v;
}
#else
// Clang up to 18.1 generates less efficient code than the expected FCMEQ, see
// https://github.com/numpy/numpy/issues/27313 and
// https://github.com/numpy/numpy/pull/22954/files and
// https://github.com/llvm/llvm-project/issues/59855
// Each overload below forces a single FCMEQ (self-equality) via inline asm
// and negates the result: lanes that are NOT equal to themselves are NaN.
// The asm suffixes (.8h/.4h/.4s/.2s/.2d/scalar d) select the register shape
// matching the vector size; behavior is otherwise identical to v != v.

#if HWY_HAVE_FLOAT16
// f16, full 128-bit vector (8 halfword lanes).
template <typename T, size_t N, HWY_IF_F16(T), HWY_IF_V_SIZE(T, N, 16)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %0.8h, %1.8h, %1.8h" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}
// f16, <= 64-bit vector (4 halfword lanes).
template <typename T, size_t N, HWY_IF_F16(T), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %0.4h, %1.4h, %1.4h" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}
#endif  // HWY_HAVE_FLOAT16

// f32, full 128-bit vector.
template <typename T, size_t N, HWY_IF_F32(T), HWY_IF_V_SIZE(T, N, 16)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %0.4s, %1.4s, %1.4s" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}
// f32, <= 64-bit vector.
template <typename T, size_t N, HWY_IF_F32(T), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %0.2s, %1.2s, %1.2s" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}

#if HWY_HAVE_FLOAT64
// f64, full 128-bit vector.
template <typename T, size_t N, HWY_IF_F64(T), HWY_IF_V_SIZE(T, N, 16)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %0.2d, %1.2d, %1.2d" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}
// f64, single-lane vector (scalar d register form).
template <typename T, size_t N, HWY_IF_F64(T), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %d0, %d1, %d1" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}
#endif  // HWY_HAVE_FLOAT64

#endif  // HWY_COMPILER_CLANG
   5510 
   5511 // ================================================== SWIZZLE
   5512 
   5513 // ------------------------------ LowerHalf
   5514 
   5515 // <= 64 bit: just return different type
// <= 64 bit: the lower half of a partial vector is the same register
// contents, just reinterpreted as a vector with half the lanes.
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  return Vec128<T, N / 2>(v.raw);
}
   5520 
// Full 128-bit vectors: extract the low 64-bit D register per lane type.
// vget_low_* is free (register renaming), hence one overload per type.
HWY_API Vec64<uint8_t> LowerHalf(Vec128<uint8_t> v) {
  return Vec64<uint8_t>(vget_low_u8(v.raw));
}
HWY_API Vec64<uint16_t> LowerHalf(Vec128<uint16_t> v) {
  return Vec64<uint16_t>(vget_low_u16(v.raw));
}
HWY_API Vec64<uint32_t> LowerHalf(Vec128<uint32_t> v) {
  return Vec64<uint32_t>(vget_low_u32(v.raw));
}
HWY_API Vec64<uint64_t> LowerHalf(Vec128<uint64_t> v) {
  return Vec64<uint64_t>(vget_low_u64(v.raw));
}
HWY_API Vec64<int8_t> LowerHalf(Vec128<int8_t> v) {
  return Vec64<int8_t>(vget_low_s8(v.raw));
}
HWY_API Vec64<int16_t> LowerHalf(Vec128<int16_t> v) {
  return Vec64<int16_t>(vget_low_s16(v.raw));
}
HWY_API Vec64<int32_t> LowerHalf(Vec128<int32_t> v) {
  return Vec64<int32_t>(vget_low_s32(v.raw));
}
HWY_API Vec64<int64_t> LowerHalf(Vec128<int64_t> v) {
  return Vec64<int64_t>(vget_low_s64(v.raw));
}
HWY_API Vec64<float> LowerHalf(Vec128<float> v) {
  return Vec64<float>(vget_low_f32(v.raw));
}
#if HWY_HAVE_FLOAT16
HWY_API Vec64<float16_t> LowerHalf(Vec128<float16_t> v) {
  return Vec64<float16_t>(vget_low_f16(v.raw));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
HWY_API Vec64<bfloat16_t> LowerHalf(Vec128<bfloat16_t> v) {
  return Vec64<bfloat16_t>(vget_low_bf16(v.raw));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
#if HWY_HAVE_FLOAT64
HWY_API Vec64<double> LowerHalf(Vec128<double> v) {
  return Vec64<double>(vget_low_f64(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

// Lane types without a native NEON register type (emulated f16/bf16):
// reinterpret as u16, take the half there, and cast back.
template <class V, HWY_NEON_IF_EMULATED_D(DFromV<V>), HWY_IF_V_SIZE_V(V, 16)>
HWY_API VFromD<Half<DFromV<V>>> LowerHalf(V v) {
  const Full128<uint16_t> du;
  const Half<DFromV<V>> dh;
  return BitCast(dh, LowerHalf(BitCast(du, v)));
}

// Tag-dispatched form: DH describes the returned half-vector.
template <class DH>
HWY_API VFromD<DH> LowerHalf(DH /* tag */, VFromD<Twice<DH>> v) {
  return LowerHalf(v);
}
   5575 
   5576 // ------------------------------ CombineShiftRightBytes
   5577 
   5578 // 128-bit
// 128-bit: concatenates hi:lo (as bytes) and returns the 16 bytes starting
// kBytes into lo - a single EXT instruction.
template <int kBytes, class D, typename T = TFromD<D>>
HWY_API Vec128<T> CombineShiftRightBytes(D d, Vec128<T> hi, Vec128<T> lo) {
  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
  const Repartition<uint8_t, decltype(d)> d8;
  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, Vec128<uint8_t>(v8));
}

// 64-bit: same, on D registers via the 64-bit EXT form.
template <int kBytes, class D, typename T = TFromD<D>>
HWY_API Vec64<T> CombineShiftRightBytes(D d, Vec64<T> hi, Vec64<T> lo) {
  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
  const Repartition<uint8_t, decltype(d)> d8;
  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, VFromD<decltype(d8)>(v8));
}
   5595 
   5596 // <= 32-bit defined after ShiftLeftBytes.
   5597 
   5598 // ------------------------------ Shift vector by constant #bytes
   5599 
namespace detail {

// Partially specialize because kBytes = 0 and >= size are compile errors;
// callers replace the latter with 0xFF for easier specialization.
// General case: implement the byte shift via CombineShiftRightBytes (EXT)
// against a zero vector.
template <int kBytes>
struct ShiftLeftBytesT {
  // Full 128-bit vector.
  template <class T>
  HWY_INLINE Vec128<T> operator()(const Vec128<T> v) {
    const Full128<T> d;
    return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
  }

  // Partial (<= 64-bit) vector.
  template <class T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    // Expand to 64-bit so we only use the native EXT instruction.
    const Full64<T> d64;
    const auto zero64 = Zero(d64);
    const decltype(zero64) v64(v.raw);
    return Vec128<T, N>(
        CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
  }
};
// Shift by 0 bytes: identity.
template <>
struct ShiftLeftBytesT<0> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return v;
  }
};
// Shift by >= vector size (encoded as 0xFF by the caller): all zero.
template <>
struct ShiftLeftBytesT<0xFF> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return Xor(v, v);
  }
};

// Right-shift counterpart; same 0 / 0xFF specialization scheme.
template <int kBytes>
struct ShiftRightBytesT {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(Vec128<T, N> v) {
    const DFromV<decltype(v)> d;
    // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
    if (d.MaxBytes() < 8) {
      constexpr size_t kReg = d.MaxBytes() == 16 ? 16 : 8;
      const Simd<T, kReg / sizeof(T), 0> dreg;
      v = Vec128<T, N>(
          IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw);
    }
    return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
  }
};
// Shift by 0 bytes: identity.
template <>
struct ShiftRightBytesT<0> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return v;
  }
};
// Shift by >= vector size: all zero.
template <>
struct ShiftRightBytesT<0xFF> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return Xor(v, v);
  }
};

}  // namespace detail
   5670 
// Shifts vector bytes left by kBytes, shifting in zeros. Out-of-range
// shift amounts are mapped to the all-zero specialization (0xFF).
template <int kBytes, class D>
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
  return detail::ShiftLeftBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(v);
}
   5675 
   5676 template <int kBytes, typename T, size_t N>
   5677 HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
   5678  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
   5679 }
   5680 
// Shifts whole lanes left by kLanes, implemented as a byte shift on the
// u8 repartition of the vector.
template <int kLanes, class D>
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v)));
}
   5686 
   5687 template <int kLanes, typename T, size_t N>
   5688 HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
   5689  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
   5690 }
   5691 
// Shifts vector bytes right by kBytes, shifting in zeros.
// Example: 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  return detail::ShiftRightBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(
      v);
}

// Shifts whole lanes right by kLanes via a byte shift on the u8 view.
template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(
      d, ShiftRightBytes<kLanes * sizeof(TFromD<D>)>(d8, BitCast(d8, v)));
}
   5705 
   5706 // Calls ShiftLeftBytes
// <= 32-bit vectors: no EXT form this small, so widen both operands to a
// full 64-bit register, position lo's bytes at the top, do one 64-bit EXT,
// and shrink the result back down. Calls ShiftLeftBytes.
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition<uint8_t, decltype(d)> d8;
  const Full64<uint8_t> d_full8;
  const Repartition<TFromD<D>, decltype(d_full8)> d_full;
  using V64 = VFromD<decltype(d_full8)>;
  const V64 hi64(BitCast(d8, hi).raw);
  // Move into most-significant bytes
  const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
  // Shift amount is adjusted for the padding introduced above.
  const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
  // After casting to full 64-bit vector of correct type, shrink to 32-bit
  return VFromD<D>(BitCast(d_full, r).raw);
}
   5722 
   5723 // ------------------------------ UpperHalf (ShiftRightBytes)
   5724 
   5725 // Full input
// Full 128-bit input: extract the high 64-bit D register per lane type
// (vget_high_* is a single register move).
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> UpperHalf(D /* tag */, Vec128<uint8_t> v) {
  return Vec64<uint8_t>(vget_high_u8(v.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> UpperHalf(D /* tag */, Vec128<uint16_t> v) {
  return Vec64<uint16_t>(vget_high_u16(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> UpperHalf(D /* tag */, Vec128<uint32_t> v) {
  return Vec64<uint32_t>(vget_high_u32(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec64<uint64_t> UpperHalf(D /* tag */, Vec128<uint64_t> v) {
  return Vec64<uint64_t>(vget_high_u64(v.raw));
}
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> UpperHalf(D /* tag */, Vec128<int8_t> v) {
  return Vec64<int8_t>(vget_high_s8(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> UpperHalf(D /* tag */, Vec128<int16_t> v) {
  return Vec64<int16_t>(vget_high_s16(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> UpperHalf(D /* tag */, Vec128<int32_t> v) {
  return Vec64<int32_t>(vget_high_s32(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec64<int64_t> UpperHalf(D /* tag */, Vec128<int64_t> v) {
  return Vec64<int64_t>(vget_high_s64(v.raw));
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec64<float16_t> UpperHalf(D /* tag */, Vec128<float16_t> v) {
  return Vec64<float16_t>(vget_high_f16(v.raw));
}
#endif
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_BF16_D(D)>
HWY_API Vec64<bfloat16_t> UpperHalf(D /* tag */, Vec128<bfloat16_t> v) {
  return Vec64<bfloat16_t>(vget_high_bf16(v.raw));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
  return Vec64<float>(vget_high_f32(v.raw));
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
  return Vec64<double>(vget_high_f64(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

// Emulated lane types (no native register type): go via u16 and cast back.
template <class D, HWY_NEON_IF_EMULATED_D(D), HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> UpperHalf(D dh, VFromD<Twice<D>> v) {
  const RebindToUnsigned<Twice<decltype(dh)>> du;
  const Half<decltype(du)> duh;
  return BitCast(dh, UpperHalf(duh, BitCast(du, v)));
}

// Partial (<= 32-bit result): shift the upper bytes down, then reinterpret
// the register as the narrower half-vector.
template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
HWY_API VFromD<DH> UpperHalf(DH dh, VFromD<Twice<DH>> v) {
  const Twice<DH> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> upper =
      ShiftRightBytes<dh.MaxBytes()>(du, BitCast(du, v));
  return VFromD<DH>(BitCast(d, upper).raw);
}
   5797 
   5798 // ------------------------------ Broadcast/splat any lane
   5799 
// Single-lane vector: broadcasting lane 0 is the identity.
template <int kLane, typename T>
HWY_API Vec128<T, 1> Broadcast(Vec128<T, 1> v) {
  return v;
}
   5804 
#if HWY_ARCH_ARM_A64
// A64: vdupq_laneq_* duplicates one lane of a full Q register in a single
// instruction; partial vectors use the D-register vdup_lane_* form.
// Every overload static_asserts that kLane is within the vector.

// Unsigned
template <int kLane>
HWY_API Vec128<uint8_t> Broadcast(Vec128<uint8_t> v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128<uint8_t>(vdupq_laneq_u8(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint8_t, N> Broadcast(Vec128<uint8_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint8_t, N>(vdup_lane_u8(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
}

// Signed
template <int kLane>
HWY_API Vec128<int8_t> Broadcast(Vec128<int8_t> v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128<int8_t>(vdupq_laneq_s8(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int8_t, N> Broadcast(Vec128<int8_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int8_t, N>(vdup_lane_s8(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
}

// Float
#if HWY_HAVE_FLOAT16
template <int kLane>
HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<float16_t>(vdupq_laneq_f16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
}
#endif  // HWY_HAVE_FLOAT16

#if HWY_NEON_HAVE_BFLOAT16
template <int kLane>
HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<bfloat16_t>(vdupq_laneq_bf16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
}
#endif  // HWY_NEON_HAVE_BFLOAT16

template <int kLane>
HWY_API Vec128<float> Broadcast(Vec128<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<double> Broadcast(Vec128<double> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
}
#else  // !HWY_ARCH_ARM_A64
// No vdupq_laneq_* on armv7: extract the lane to a scalar with
// vgetq_lane_* and splat it with vdupq_n_*. Partial vectors still have a
// D-register vdup_lane_* available.

// Unsigned
template <int kLane>
HWY_API Vec128<uint8_t> Broadcast(Vec128<uint8_t> v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128<uint8_t>(vdupq_n_u8(vgetq_lane_u8(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint8_t, N> Broadcast(Vec128<uint8_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint8_t, N>(vdup_lane_u8(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
}

// Signed
template <int kLane>
HWY_API Vec128<int8_t> Broadcast(Vec128<int8_t> v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128<int8_t>(vdupq_n_s8(vgetq_lane_s8(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int8_t, N> Broadcast(Vec128<int8_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int8_t, N>(vdup_lane_s8(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
}

// Float
#if HWY_HAVE_FLOAT16
template <int kLane>
HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<float16_t>(vdupq_n_f16(vgetq_lane_f16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <int kLane>
HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<bfloat16_t>(vdupq_n_bf16(vgetq_lane_bf16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <int kLane>
HWY_API Vec128<float> Broadcast(Vec128<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
}
   6047 template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8),
   6048          HWY_IF_LANES_GT(N, 1)>
   6049 HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
   6050  static_assert(0 <= kLane && kLane < N, "Invalid lane");
   6051  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
   6052 }
   6053 
   6054 #endif  // HWY_ARCH_ARM_A64
   6055 
   6056 template <int kLane, typename V, HWY_NEON_IF_EMULATED_D(DFromV<V>),
   6057          HWY_IF_LANES_GT_D(DFromV<V>, 1)>
   6058 HWY_API V Broadcast(V v) {
   6059  const DFromV<V> d;
   6060  const RebindToUnsigned<decltype(d)> du;
   6061  return BitCast(d, Broadcast<kLane>(BitCast(du, v)));
   6062 }
   6063 
// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
struct Indices128 {
  // Raw NEON register. Holds per-byte shuffle indices suitable for
  // TableLookupBytes (see IndicesFromVec, which converts lane indices).
  typename detail::Raw128<T, N>::type raw;
};
   6071 
namespace detail {

// Helpers for IndicesFromVec: constant vectors used to expand per-lane
// indices into per-byte indices for TableLookupBytes.

// IndicesFromVecBroadcastLaneBytes returns, for each output byte, the byte
// offset of the start of its lane; TableLookupBytes with these indices
// replicates each lane's index into all bytes of that lane.

// 1-byte lanes: lane index == byte index, so the identity permutation.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Iota(d8, 0);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  return Load(d8, kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  return Load(d8, kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
  return Load(d8, kBroadcastLaneBytes);
}

// IndicesFromVecByteOffsets returns each byte's offset within its lane
// (0..sizeof(T)-1, repeating); added to the scaled lane index to form the
// final byte index.

// 1-byte lanes: offset within the lane is always 0.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Zero(d8);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
  return Load(d8, kByteOffsets);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  return Load(d8, kByteOffsets);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
  return Load(d8, kByteOffsets);
}

}  // namespace detail
   6139 
// Converts a vector of lane indices into Indices128 for TableLookupLanes.
// Indices may be in [0, 2 * MaxLanes(d)) so the result is also usable by
// TwoTablesLookupLanes (hence the "* 2" in the debug-build range check).

// 1-byte lanes: lane indices already are byte indices; store them directly.
template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  (void)d;
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, vec).raw};
}

// Wider lanes: expand each lane index into per-byte indices by replicating
// the index across the lane, scaling by sizeof(T), and adding the byte's
// offset within its lane.
template <class D, typename TI,
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;

  // Broadcast each lane index to all bytes of T and shift to bytes
  const V8 lane_indices = TableLookupBytes(
      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
  // Multiply by sizeof(T) via shift: lane index -> byte index of lane start.
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, sum).raw};
}

// Loads MaxLanes(d) indices from unaligned memory and converts them.
template <class D, typename TI>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> SetTableIndices(D d,
                                                             const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}
   6187 
   6188 template <typename T, size_t N>
   6189 HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
   6190  const DFromV<decltype(v)> d;
   6191  const RebindToSigned<decltype(d)> di;
   6192  return BitCast(
   6193      d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128<T, N>{idx.raw})));
   6194 }
   6195 
// TwoTablesLookupLanes: lane i of the result is (a:b)[idx[i]], where idx may
// index into the concatenation of a (indices 0..N-1) and b (N..2N-1).

// <= 4-byte vectors: widen to the 2N-lane concatenation and reuse
// TableLookupLanes, keeping only the lower half.
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                          Indices128<T, N> idx) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
// TableLookupLanes currently requires table and index vectors to be the same
// size, though a half-length index vector would be sufficient here.
#if HWY_IS_MSAN
  // Duplicate idx into both halves so MSAN does not flag the (unused)
  // uninitialized upper half.
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
#else
  // We only keep LowerHalf of the result, which is valid in idx.
  const Indices128<T, N * 2> idx2{idx.raw};
#endif
  return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2));
}

// 64-bit vectors: on A64, one vqtbl1 over the 16-byte concatenation; on
// Armv7, vtbl2 with a tuple of two 8-byte tables.
template <typename T>
HWY_API Vec64<T> TwoTablesLookupLanes(Vec64<T> a, Vec64<T> b,
                                      Indices128<T, 8 / sizeof(T)> idx) {
  const DFromV<decltype(a)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const auto a_u8 = BitCast(du8, a);
  const auto b_u8 = BitCast(du8, b);
  const auto idx_u8 = BitCast(du8, Vec64<T>{idx.raw});

#if HWY_ARCH_ARM_A64
  const Twice<decltype(du8)> dt_u8;
  return BitCast(
      d, Vec64<uint8_t>{vqtbl1_u8(Combine(dt_u8, b_u8, a_u8).raw, idx_u8.raw)});
#else
  detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}};
  return BitCast(d, Vec64<uint8_t>{vtbl2_u8(tup.raw, idx_u8.raw)});
#endif
}

// 128-bit vectors: on A64, vqtbl2q over a tuple of the two 16-byte tables;
// on Armv7, split into four 8-byte tables and look up each half of the
// indices with vtbl4.
template <typename T>
HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b,
                                       Indices128<T, 16 / sizeof(T)> idx) {
  const DFromV<decltype(a)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const auto a_u8 = BitCast(du8, a);
  const auto b_u8 = BitCast(du8, b);
  const auto idx_u8 = BitCast(du8, Vec128<T>{idx.raw});

#if HWY_ARCH_ARM_A64
  detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}};
  return BitCast(d, Vec128<uint8_t>{vqtbl2q_u8(tup.raw, idx_u8.raw)});
#else
  const Half<decltype(d)> dh;
  const Repartition<uint8_t, decltype(dh)> dh_u8;
  const auto a_lo_u8 = LowerHalf(dh_u8, a_u8);
  const auto a_hi_u8 = UpperHalf(dh_u8, a_u8);
  const auto b_lo_u8 = LowerHalf(dh_u8, b_u8);
  const auto b_hi_u8 = UpperHalf(dh_u8, b_u8);
  const auto idx_lo_u8 = LowerHalf(dh_u8, idx_u8);
  const auto idx_hi_u8 = UpperHalf(dh_u8, idx_u8);

  detail::Tuple4<uint8_t, dh_u8.MaxLanes()> tup = {
      {{a_lo_u8.raw, a_hi_u8.raw, b_lo_u8.raw, b_hi_u8.raw}}};
  const auto lo_result =
      BitCast(dh, Vec64<uint8_t>{vtbl4_u8(tup.raw, idx_lo_u8.raw)});
  const auto hi_result =
      BitCast(dh, Vec64<uint8_t>{vtbl4_u8(tup.raw, idx_hi_u8.raw)});
  return Combine(d, hi_result, lo_result);
#endif
}
   6263 
// ------------------------------ Reverse2 (CombineShiftRightBytes)

// Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

// Reverse2 swaps adjacent pairs of lanes. For k-bit lanes this is
// vrev(2k): reverse the lanes within each 2k-bit group. Operates on the
// unsigned representation so one intrinsic covers all same-size lane types.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev16_u8(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint8_t>(vrev16q_u8(BitCast(du, v).raw)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev32_u16(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint16_t>(vrev32q_u16(BitCast(du, v).raw)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev64_u32(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw)));
}

// 64-bit lanes: there is no 128-bit vrev, so swap the two halves instead.
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  return CombineShiftRightBytes<8>(d, v, v);
}
   6310 
// ------------------------------ Reverse4 (Reverse2)

// Reverse4 reverses each group of 4 lanes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev32_u8(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Reverse4(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint8_t>(vrev32q_u8(BitCast(du, v).raw)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev64_u16(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> Reverse4(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint16_t>(vrev64q_u16(BitCast(du, v).raw)));
}

// 32-bit lanes: no 128-bit vrev, so compose two Reverse2: swap adjacent
// lanes, then swap adjacent 64-bit pairs.
template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> duw;
  return BitCast(d, Reverse2(duw, BitCast(duw, Reverse2(d, v))));
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D>) {
  HWY_ASSERT(0);  // don't have 4 u64 lanes in a 128-bit vector
}
   6345 
// ------------------------------ Reverse8 (Reverse2, Reverse4)

// Reverse8 reverses each group of 8 lanes; only 8/16-bit lanes have enough
// lanes within 128 bits.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev64_u8(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Reverse8(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint8_t>(vrev64q_u8(BitCast(du, v).raw)));
}

// 16-bit lanes: reverse within each 4-lane group, then swap the two 64-bit
// halves of each 8-lane group.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  const Repartition<uint64_t, decltype(d)> du64;
  return BitCast(d, Reverse2(du64, BitCast(du64, Reverse4(d, v))));
}

template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))>
HWY_API VFromD<D> Reverse8(D, VFromD<D>) {
  HWY_ASSERT(0);  // don't have 8 lanes if larger than 16-bit
}
   6369 
// ------------------------------ Reverse (Reverse2, Reverse4, Reverse8)

// Reverse reverses all lanes of the vector; dispatch is on lane count.

// Single lane: nothing to do.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)>
HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) {
  return v;
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)>
HWY_API Vec128<T, 2> Reverse(D d, Vec128<T, 2> v) {
  return Reverse2(d, v);
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 4)>
HWY_API Vec128<T, 4> Reverse(D d, Vec128<T, 4> v) {
  return Reverse4(d, v);
}

template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 8)>
HWY_API Vec128<T, 8> Reverse(D d, Vec128<T, 8> v) {
  return Reverse8(d, v);
}

// 16 lanes (8-bit, full vector): reverse each 8-lane half, then swap the
// two 64-bit halves.
template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 16)>
HWY_API Vec128<T> Reverse(D d, Vec128<T> v) {
  const Repartition<uint64_t, decltype(d)> du64;
  return BitCast(d, Reverse2(du64, BitCast(du64, Reverse8(d, v))));
}
   6397 
// ------------------------------ ReverseBits

#if HWY_ARCH_ARM_A64

// Advertise a native 8-bit bit-reverse so generic_ops-inl.h skips its
// fallback implementation.
#ifdef HWY_NATIVE_REVERSE_BITS_UI8
#undef HWY_NATIVE_REVERSE_BITS_UI8
#else
#define HWY_NATIVE_REVERSE_BITS_UI8
#endif

// vrbit (AArch64-only) reverses the bit order within each byte lane.
HWY_NEON_DEF_FUNCTION_INT_8(ReverseBits, vrbit, _, 1)
HWY_NEON_DEF_FUNCTION_UINT_8(ReverseBits, vrbit, _, 1)

#endif  // HWY_ARCH_ARM_A64
   6412 
// ------------------------------ Other shuffles (TableLookupBytes)

// Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant).
// Shuffle0321 rotates one lane to the right (the previous least-significant
// lane is now most-significant). These could also be implemented via
// CombineShiftRightBytes but the shuffle_abcd notation is more convenient.

// Swap 64-bit halves
template <typename T>
HWY_API Vec128<T> Shuffle1032(Vec128<T> v) {
  return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v);
}
// Same operation viewed as 64-bit lanes (swap the two lanes).
template <typename T>
HWY_API Vec128<T> Shuffle01(Vec128<T> v) {
  return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v);
}

// Rotate right 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle0321(Vec128<T> v) {
  return CombineShiftRightBytes<4>(DFromV<decltype(v)>(), v, v);
}

// Rotate left 32 bits
template <typename T>
HWY_API Vec128<T> Shuffle2103(Vec128<T> v) {
  return CombineShiftRightBytes<12>(DFromV<decltype(v)>(), v, v);
}

// Reverse
template <typename T>
HWY_API Vec128<T> Shuffle0123(Vec128<T> v) {
  return Reverse4(DFromV<decltype(v)>(), v);
}
   6447 
// ------------------------------ InterleaveLower

// Interleaves lanes from halves of the 128-bit blocks of "a" (which provides
// the least-significant lane) and "b". To concatenate two half-width integers
// into one, use ZipLower/Upper instead (also works with scalar).
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveLower, vzip1, _, 2)
#if HWY_ARCH_ARM_A64
// N=1 makes no sense (in that case, there would be no upper/lower).
HWY_NEON_DEF_FUNCTION_FULL_UIF_64(InterleaveLower, vzip1, _, 2)
#else
// Emulated version for Armv7.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> InterleaveLower(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  // Shuffle01 moves a[0] into the upper half; the shift-right-by-8-bytes of
  // (b : swapped a) then yields (lower = a[0], upper = b[0]).
  return CombineShiftRightBytes<8>(d, b, Shuffle01(a));
}
#endif

#if !HWY_HAVE_FLOAT16
// Without native f16 vectors, interleave the same-sized unsigned lanes and
// bitcast back.
template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 4)>
HWY_API Vec128<float16_t, N> InterleaveLower(Vec128<float16_t, N> a,
                                             Vec128<float16_t, N> b) {
  const DFromV<decltype(a)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b)));
}
#endif  // !HWY_HAVE_FLOAT16

// < 64 bit parts: run the 64-bit version; only the lower part of its result
// is meaningful, and that is all the narrower return type keeps.
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) {
  return Vec128<T, N>(InterleaveLower(Vec64<T>(a.raw), Vec64<T>(b.raw)).raw);
}

// Additional overload for the optional Simd<> tag.
template <class D>
HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return InterleaveLower(a, b);
}
   6487 
// ------------------------------ InterleaveUpper (UpperHalf)

// All functions inside detail lack the required D parameter.
namespace detail {
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveUpper, vzip2, _, 2)

#if HWY_ARCH_ARM_A64
// N=1 makes no sense (in that case, there would be no upper/lower).
HWY_NEON_DEF_FUNCTION_FULL_UIF_64(InterleaveUpper, vzip2, _, 2)
#else
// Emulated version for Armv7.
template <typename T, HWY_IF_T_SIZE(T, 8)>
HWY_API Vec128<T> InterleaveUpper(Vec128<T> a, Vec128<T> b) {
  const DFromV<decltype(a)> d;
  // Shift (swapped b : a) right by 8 bytes: (lower = a[1], upper = b[1]).
  return CombineShiftRightBytes<8>(d, Shuffle01(b), a);
}
#endif
}  // namespace detail

// Full register
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) {
  return detail::InterleaveUpper(a, b);
}

// Partial: move each input's upper half into the lower half, then the
// problem reduces to InterleaveLower.
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) {
  const Half<decltype(d)> d2;
  const VFromD<D> a2(UpperHalf(d2, a).raw);
  const VFromD<D> b2(UpperHalf(d2, b).raw);
  return InterleaveLower(d, a2, b2);
}
   6521 
   6522 // ------------------------------ ZipLower/ZipUpper (InterleaveLower)
   6523 
   6524 // Same as Interleave*, except that the return lanes are double-width integers;
   6525 // this is necessary because the single-lane scalar cannot return two values.
   6526 template <class V, class DW = RepartitionToWide<DFromV<V>>>
   6527 HWY_API VFromD<DW> ZipLower(V a, V b) {
   6528  return BitCast(DW(), InterleaveLower(a, b));
   6529 }
   6530 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
   6531 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
   6532  return BitCast(dw, InterleaveLower(D(), a, b));
   6533 }
   6534 
   6535 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>>
   6536 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
   6537  return BitCast(dw, InterleaveUpper(D(), a, b));
   6538 }
   6539 
// ------------------------------ Per4LaneBlockShuffle
namespace detail {

#if HWY_COMPILER_GCC || HWY_COMPILER_CLANG

#ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#else
#define HWY_NATIVE_PER4LANEBLKSHUF_DUP32
#endif

// Builds a vector of four u32 constants (x0 is the least-significant lane)
// via GCC/Clang vector extensions. <= 8-byte vectors only hold x1:x0.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t /*x3*/,
                                                const uint32_t /*x2*/,
                                                const uint32_t x1,
                                                const uint32_t x0) {
  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8)));
  const GccU32RawVectType raw = {x0, x1};
  return ResizeBitCast(d, Vec64<uint32_t>(reinterpret_cast<uint32x2_t>(raw)));
}

template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3,
                                                const uint32_t x2,
                                                const uint32_t x1,
                                                const uint32_t x0) {
  typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16)));
  const GccU32RawVectType raw = {x0, x1, x2, x3};
  return ResizeBitCast(d, Vec128<uint32_t>(reinterpret_cast<uint32x4_t>(raw)));
}
#endif  // HWY_COMPILER_GCC || HWY_COMPILER_CLANG

// The SizeTag encodes four 2-bit lane indices (idx3..idx0).
// 0x88 = (2,0,2,0): each 4-lane block becomes (b0, b2, b0, b2), reading
// lanes from least- to most-significant. Implemented by gathering even
// lanes and repeating each pair via a widened self-interleave.
template <size_t kLaneSize, size_t kVectSize, class V,
          HWY_IF_LANES_GT_D(DFromV<V>, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/,
                                  hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                  V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWide<decltype(du)> dw;

  const auto evens = BitCast(dw, ConcatEven(d, v, v));
  return BitCast(d, InterleaveLower(dw, evens, evens));
}

// 0xDD = (3,1,3,1): each 4-lane block becomes (b1, b3, b1, b3); as above
// but gathering the odd lanes.
template <size_t kLaneSize, size_t kVectSize, class V,
          HWY_IF_LANES_GT_D(DFromV<V>, 4)>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/,
                                  hwy::SizeTag<kLaneSize> /*lane_size_tag*/,
                                  hwy::SizeTag<kVectSize> /*vect_size_tag*/,
                                  V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const RepartitionToWide<decltype(du)> dw;

  const auto odds = BitCast(dw, ConcatOdd(d, v, v));
  return BitCast(d, InterleaveLower(dw, odds, odds));
}

// 0xFA = (3,3,2,2): for 16-bit lanes in a 64-bit vector this is exactly a
// self-InterleaveUpper.
template <class V>
HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/,
                                  hwy::SizeTag<2> /*lane_size_tag*/,
                                  hwy::SizeTag<8> /*vect_size_tag*/, V v) {
  const DFromV<decltype(v)> d;
  return InterleaveUpper(d, v, v);
}

}  // namespace detail
   6609 
// ------------------------------ SlideUpLanes

namespace detail {

// Generic (non-constant amt) slide-up implementations.

// <= 8-byte vectors: treat the whole vector as a single unsigned integer.
// Lane order is little-endian, so shifting left by amt lanes' worth of bits
// moves lanes towards higher indices and zero-fills the bottom.
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE V SlideUpLanes(V v, size_t amt) {
  const DFromV<decltype(v)> d;
  using TU = UnsignedFromSize<d.MaxBytes()>;
  const Repartition<TU, decltype(d)> du;
  return BitCast(d, BitCast(du, v) << Set(
                        du, static_cast<TU>(amt * sizeof(TFromV<V>) * 8)));
}

// 16-byte vectors: byte shuffle with indices offset downwards by amt lanes.
// The first amt*sizeof(T) indices wrap around to >= 0x80; bytes whose index
// has the MSB set are zeroed by TableLookupBytesOr0.
template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE V SlideUpLanes(V v, size_t amt) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  const auto idx =
      Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>)));
  return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx));
}

}  // namespace detail
   6633 
// SlideUpLanes(d, v, amt): shifts lanes towards higher indices by amt,
// filling vacated lanes with zero. When amt is a compile-time constant
// under GCC/Clang, dispatch to ShiftLeftLanes for better codegen;
// otherwise fall back to the generic detail:: implementation.

// Single lane: sliding is a no-op (amt can only be 0).
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
  return v;
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftLeftLanes<1>(d, v);
    }
  }
#else
  (void)d;  // only used by the constant-dispatch path above
#endif

  return detail::SlideUpLanes(v, amt);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftLeftLanes<1>(d, v);
      case 2:
        return ShiftLeftLanes<2>(d, v);
      case 3:
        return ShiftLeftLanes<3>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideUpLanes(v, amt);
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftLeftLanes<1>(d, v);
      case 2:
        return ShiftLeftLanes<2>(d, v);
      case 3:
        return ShiftLeftLanes<3>(d, v);
      case 4:
        return ShiftLeftLanes<4>(d, v);
      case 5:
        return ShiftLeftLanes<5>(d, v);
      case 6:
        return ShiftLeftLanes<6>(d, v);
      case 7:
        return ShiftLeftLanes<7>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideUpLanes(v, amt);
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftLeftLanes<1>(d, v);
      case 2:
        return ShiftLeftLanes<2>(d, v);
      case 3:
        return ShiftLeftLanes<3>(d, v);
      case 4:
        return ShiftLeftLanes<4>(d, v);
      case 5:
        return ShiftLeftLanes<5>(d, v);
      case 6:
        return ShiftLeftLanes<6>(d, v);
      case 7:
        return ShiftLeftLanes<7>(d, v);
      case 8:
        return ShiftLeftLanes<8>(d, v);
      case 9:
        return ShiftLeftLanes<9>(d, v);
      case 10:
        return ShiftLeftLanes<10>(d, v);
      case 11:
        return ShiftLeftLanes<11>(d, v);
      case 12:
        return ShiftLeftLanes<12>(d, v);
      case 13:
        return ShiftLeftLanes<13>(d, v);
      case 14:
        return ShiftLeftLanes<14>(d, v);
      case 15:
        return ShiftLeftLanes<15>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideUpLanes(v, amt);
}
   6754 
// ------------------------------ SlideDownLanes

namespace detail {

// Generic (non-constant amt) slide-down implementations.

// <= 8-byte vectors: shift the whole vector (as one unsigned integer) right.
// The count wraps to TU's complement, whose sign bit is set when viewed as
// signed. NOTE(review): this relies on this file's vector Shl following
// NEON VSHL semantics (negative per-lane count shifts right) — confirm
// against the operator<< definition earlier in the file.
template <class V, HWY_IF_V_SIZE_LE_V(V, 8)>
HWY_INLINE V SlideDownLanes(V v, size_t amt) {
  const DFromV<decltype(v)> d;
  using TU = UnsignedFromSize<d.MaxBytes()>;
  const Repartition<TU, decltype(d)> du;
  return BitCast(d,
                 BitCast(du, v) << Set(
                     du, static_cast<TU>(TU{0} - amt * sizeof(TFromV<V>) * 8)));
}

// 16-byte vectors: byte shuffle with indices starting at amt lanes. Any
// index beyond 15 is forced negative (all-ones OR sets the MSB), so
// TableLookupBytesOr0 zero-fills the vacated top lanes.
template <class V, HWY_IF_V_SIZE_V(V, 16)>
HWY_INLINE V SlideDownLanes(V v, size_t amt) {
  const DFromV<decltype(v)> d;
  const Repartition<int8_t, decltype(d)> di8;
  auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>)));
  idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15})));
  return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx));
}

}  // namespace detail
   6779 
// Public SlideDownLanes: 1-lane vectors have nothing to slide.
template <class D, HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) {
  return v;
}

// 2-lane vectors. When `amt` is a compile-time constant (detectable via
// __builtin_constant_p on GCC/Clang), dispatch to the ShiftRightLanes<kLanes>
// template so a single immediate instruction is emitted; otherwise fall back
// to the generic detail::SlideDownLanes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftRightLanes<1>(d, v);
    }
  }
#else
  (void)d;  // d is only needed by the constant-dispatch path
#endif

  return detail::SlideDownLanes(v, amt);
}

// 4-lane vectors; same constant-dispatch pattern as above.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftRightLanes<1>(d, v);
      case 2:
        return ShiftRightLanes<2>(d, v);
      case 3:
        return ShiftRightLanes<3>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideDownLanes(v, amt);
}

// 8-lane vectors; same constant-dispatch pattern.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftRightLanes<1>(d, v);
      case 2:
        return ShiftRightLanes<2>(d, v);
      case 3:
        return ShiftRightLanes<3>(d, v);
      case 4:
        return ShiftRightLanes<4>(d, v);
      case 5:
        return ShiftRightLanes<5>(d, v);
      case 6:
        return ShiftRightLanes<6>(d, v);
      case 7:
        return ShiftRightLanes<7>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideDownLanes(v, amt);
}

// 16-lane (full 128-bit) vectors; same constant-dispatch pattern.
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)>
HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
#if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC  // includes clang
  if (__builtin_constant_p(amt)) {
    switch (amt) {
      case 0:
        return v;
      case 1:
        return ShiftRightLanes<1>(d, v);
      case 2:
        return ShiftRightLanes<2>(d, v);
      case 3:
        return ShiftRightLanes<3>(d, v);
      case 4:
        return ShiftRightLanes<4>(d, v);
      case 5:
        return ShiftRightLanes<5>(d, v);
      case 6:
        return ShiftRightLanes<6>(d, v);
      case 7:
        return ShiftRightLanes<7>(d, v);
      case 8:
        return ShiftRightLanes<8>(d, v);
      case 9:
        return ShiftRightLanes<9>(d, v);
      case 10:
        return ShiftRightLanes<10>(d, v);
      case 11:
        return ShiftRightLanes<11>(d, v);
      case 12:
        return ShiftRightLanes<12>(d, v);
      case 13:
        return ShiftRightLanes<13>(d, v);
      case 14:
        return ShiftRightLanes<14>(d, v);
      case 15:
        return ShiftRightLanes<15>(d, v);
    }
  }
#else
  (void)d;
#endif

  return detail::SlideDownLanes(v, amt);
}
   6900 
   6901 // ------------------------------- WidenHighMulAdd
   6902 
   6903 #ifdef HWY_NATIVE_WIDEN_HIGH_MUL_ADD
   6904 #undef HWY_NATIVE_WIDEN_HIGH_MUL_ADD
   6905 #else
   6906 #define HWY_NATIVE_WIDEN_HIGH_MUL_ADD
   6907 #endif
   6908 
   6909 namespace detail {
   6910 
// WidenHighMulAdd: widening multiply of the *upper half* of the narrow lanes
// of mul/x, accumulated into `add` (add + widen(mul_hi) * widen(x_hi)).

// u64 result, > 2 narrow (u32) lanes, i.e. full 128-bit inputs.
template<class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_GT_D(DN, 2)>
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
#if HWY_ARCH_ARM_A64
  // Single instruction on AArch64.
  return Vec128<uint64_t>(vmlal_high_u32(add.raw, mul.raw, x.raw));
#else
  // Armv7 lacks vmlal_high; extract the upper halves explicitly.
  const Full64<uint32_t> dh;
  return Vec128<uint64_t>(
      vmlal_u32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
#endif
}

// u64 result, exactly 2 narrow lanes: both widened products fit in one vmull
// result; the "high" product is its lane 1.
template<class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_LE_D(DN, 2)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                 VFromD<DN> x, VFromD<D> add) {
  Vec128<uint64_t> mulResult = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw));
  return UpperHalf(d, mulResult) + add;
}

// i64 result, full 128-bit inputs; see u64 variant above.
template<class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_GT_D(DN, 2)>
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
#if HWY_ARCH_ARM_A64
  return Vec128<int64_t>(vmlal_high_s32(add.raw, mul.raw, x.raw));
#else
  const Full64<int32_t> dh;
  return Vec128<int64_t>(
      vmlal_s32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
#endif
}

// i64 result, exactly 2 narrow lanes; see u64 variant above.
template<class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_LE_D(DN, 2)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                 VFromD<DN> x, VFromD<D> add) {
  Vec128<int64_t> mulResult = Vec128<int64_t>(vmull_s32(mul.raw, x.raw));
  return UpperHalf(d, mulResult) + add;
}
   6952 
// i32 result, > 4 narrow (i16) lanes, i.e. full 128-bit inputs.
template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_GT_D(DN, 4)>
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                                 VFromD<DN> x, VFromD<D> add) {
#if HWY_ARCH_ARM_A64
  // Single widening multiply-add of the upper i16 halves.
  return Vec128<int32_t>(vmlal_high_s16(add.raw, mul.raw, x.raw));
#else
  // Armv7 lacks vmlal_high; extract the upper halves explicitly.
  const Full64<int16_t> dh;
  return Vec128<int32_t>(
      vmlal_s16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
#endif
}

// i32 result, exactly 4 narrow lanes (full 64-bit input): all four products
// fit in one vmull result; the upper two are those of the high input lanes.
template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_D(DN, 4)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                 VFromD<DN> x, VFromD<D> add) {
  Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
  Vec64<int32_t> hi = UpperHalf(d, widen);
  return hi + add;
}
   6974 
   6975 template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
   6976         HWY_IF_LANES_D(DN, 2)>
   6977 HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
   6978                                  VFromD<DN> x, VFromD<D> add) {
   6979  Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
   6980  Vec32<int32_t> hi = UpperHalf(d, Vec64<int32_t>(vget_high_s32(widen.raw)));
   6981  return hi + add;
   6982 }
   6983 
// u32 result, > 4 narrow (u16) lanes, i.e. full 128-bit inputs.
template<class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_GT_D(DN, 4)>
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
#if HWY_ARCH_ARM_A64
  // Single widening multiply-add of the upper u16 halves.
  return Vec128<uint32_t>(vmlal_high_u16(add.raw, mul.raw, x.raw));
#else
  // Armv7 lacks vmlal_high; extract the upper halves explicitly.
  const Full64<uint16_t> dh;
  return Vec128<uint32_t>(
      vmlal_u16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
#endif
}

// u32 result, exactly 4 narrow lanes (full 64-bit input): all four products
// fit in one vmull result; the upper two are those of the high input lanes.
template<class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_D(DN, 4)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
  VFromD<D> hi = UpperHalf(d, widen);
  return hi + add;
}
   7005 
   7006 template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
   7007         class DN = RepartitionToNarrow<D>>
   7008 HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
   7009                                  VFromD<DN> x, VFromD<D> add) {
   7010  Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
   7011  VFromD<D> hi = UpperHalf(d, Vec64<uint32_t>(vget_high_u32(widen.raw)));
   7012  return hi + add;
   7013 }
   7014 
// u16 result, > 8 narrow (u8) lanes, i.e. full 128-bit inputs.
template<class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_GT_D(DN, 8)>
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
#if HWY_ARCH_ARM_A64
  // Single widening multiply-add of the upper u8 halves.
  return Vec128<uint16_t>(vmlal_high_u8(add.raw, mul.raw, x.raw));
#else
  // Armv7 lacks vmlal_high; extract the upper halves explicitly.
  const Full64<uint8_t> dh;
  return Vec128<uint16_t>(
      vmlal_u8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
#endif
}
   7027 
   7028 template<class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
   7029         HWY_IF_LANES_D(DN, 8)>
   7030 HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
   7031                                  VFromD<DN> x, VFromD<D> add) {
   7032  Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
   7033  VFromD<D> hi = UpperHalf(d, widen);
   7034  return hi + add;
   7035 }
   7036 
   7037 template<class D, HWY_IF_U16(TFromD<D>), class DN = RepartitionToNarrow<D>,
   7038         HWY_IF_LANES_LE_D(DN, 4)>
   7039 HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
   7040                                  VFromD<DN> x, VFromD<D> add) {
   7041  Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
   7042  const Twice<decltype(d)> d16F;
   7043  VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_high_u16(widen.raw)));
   7044  return hi + add;
   7045 }
   7046 
// i16 result, > 8 narrow (i8) lanes, i.e. full 128-bit inputs.
template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_GT_D(DN, 8)>
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                                 VFromD<DN> x, VFromD<D> add) {
#if HWY_ARCH_ARM_A64
  // Single widening multiply-add of the upper s8 halves.
  return Vec128<int16_t>(vmlal_high_s8(add.raw, mul.raw, x.raw));
#else
  // Armv7 lacks vmlal_high; extract the upper halves explicitly.
  const Full64<int8_t> dh;
  return Vec128<int16_t>(
      vmlal_s8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
#endif
}

// i16 result, exactly 8 narrow lanes (full 64-bit input): all eight products
// fit in one vmull result; the upper four are those of the high input lanes.
template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
        HWY_IF_LANES_D(DN, 8)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                 VFromD<DN> x, VFromD<D> add) {
  Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
  VFromD<D> hi = UpperHalf(d, widen);
  return hi + add;
}
   7068 
   7069 template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
   7070         HWY_IF_LANES_LE_D(DN, 4)>
   7071 HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
   7072                                  VFromD<DN> x, VFromD<D> add) {
   7073  Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
   7074  const Twice<decltype(d)> d16F;
   7075  VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_high_s16(widen.raw)));
   7076  return hi + add;
   7077 }
   7078 
   7079 #if 0
   7080 #if HWY_HAVE_FLOAT16
   7081 template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 4),
   7082         class DN = RepartitionToNarrow<D>>
   7083 HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
   7084                                  VFromD<DN> x, VFromD<D> add) {
   7085  return VFromD<D>(vfmlalq_high_f16(add.raw, mul.raw, x.raw));
   7086 }
   7087 
   7088 template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 2),
   7089         class DN = RepartitionToNarrow<D>>
   7090 HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
   7091                                  VFromD<DN> x, VFromD<D> add) {
   7092  return Vec64<float32_t>(vfmlal_high_f16(add.raw, mul.raw, x.raw));
   7093 }
   7094 
   7095 template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
   7096         class DN = RepartitionToNarrow<D>>
   7097 HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
   7098                                  VFromD<DN> x, VFromD<D> add) {
   7099  return MulAdd(add, PromoteUpperTo(d, mul), PromoteUpperTo(d, x));
   7100 }
   7101 #endif
   7102 #endif
   7103 
   7104 }  // namespace detail
   7105 
   7106 // ------------------------------- WidenMulAdd
   7107 
   7108 #ifdef HWY_NATIVE_WIDEN_MUL_ADD
   7109 #undef HWY_NATIVE_WIDEN_MUL_ADD
   7110 #else
   7111 #define HWY_NATIVE_WIDEN_MUL_ADD
   7112 #endif
   7113 
   7114 namespace detail {
   7115 
// WidenMulAdd: add + widen(mul) * widen(x) over all narrow lanes.
// u16 result, > 4 lanes (full 64-bit u8 inputs): single vmlal instruction.
template<class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 4),
        class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                             VFromD<DN> x, VFromD<D> add) {
  return Vec128<uint16_t>(vmlal_u8(add.raw, mul.raw, x.raw));
}
   7122 
   7123 template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, 4),
   7124          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
   7125 HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
   7126                              VFromD<D> add) {
   7127  return MulAdd(add, PromoteTo(d, mul), PromoteTo(d, x));
   7128 }
   7129 
// i16 result, > 4 lanes (full 64-bit i8 inputs): single vmlal instruction.
template<class D, HWY_IF_I16_D(D), HWY_IF_LANES_GT_D(D, 4),
        class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                             VFromD<DN> x, VFromD<D> add) {
  return VFromD<D>(vmlal_s8(add.raw, mul.raw, x.raw));
}
   7136 
   7137 template <class D, HWY_IF_I16_D(D), HWY_IF_LANES_LE_D(D, 4),
   7138          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
   7139 HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
   7140                              VFromD<D> add) {
   7141  return MulAdd(add, PromoteTo(d, mul), PromoteTo(d, x));
   7142 }
   7143 
// i32 result, > 2 narrow (i16) lanes (full 64-bit inputs): single vmlal.
template<class D, HWY_IF_I32_D(D),
        class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
        HWY_IF_LANES_GT_D(DN, 2)>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                             VFromD<DN> x, VFromD<D> add) {
  return Vec128<int32_t>(vmlal_s16(add.raw, mul.raw, x.raw));
}
   7151 
   7152 template<class D, HWY_IF_I32_D(D),
   7153         class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
   7154         HWY_IF_LANES_D(DN, 2)>
   7155 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
   7156                              VFromD<DN> x, VFromD<D> add) {
   7157  Vec128<int32_t> mulRs = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
   7158  const VFromD<D> mul10 = LowerHalf(mulRs);
   7159  return add + mul10;
   7160 }
   7161 
// i32 result, 1 lane: only product lane 0 of the vmull result is valid; keep
// it via two LowerHalf steps (128 -> 64 -> 32 bits).
template<class D, HWY_IF_I32_D(D),
        class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
        HWY_IF_LANES_D(D, 1)>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                              VFromD<DN> x, VFromD<D> add) {
  Vec64<int32_t> mulRs = LowerHalf(Vec128<int32_t>(vmull_s16(mul.raw, x.raw)));
  const Vec32<int32_t> mul10(LowerHalf(mulRs));
  return add + mul10;
}
   7171 
// u32 result, > 2 lanes (full 64-bit u16 inputs): single vmlal.
template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_GT_D(D, 2),
        class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                             VFromD<DN> x, VFromD<D> add) {
  return Vec128<uint32_t>(vmlal_u16(add.raw, mul.raw, x.raw));
}
   7178 
   7179 template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 2),
   7180         class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
   7181 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
   7182                              VFromD<DN> x, VFromD<D> add) {
   7183  Vec128<uint32_t> mulRs = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
   7184  const Vec64<uint32_t> mul10(LowerHalf(mulRs));
   7185  return add + mul10;
   7186 }
   7187 
// u32 result, 1 lane: only product lane 0 of the vmull result is valid.
template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
        class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                             VFromD<DN> x, VFromD<D> add) {
  Vec64<uint32_t> mulRs =
      LowerHalf(Vec128<uint32_t>(vmull_u16(mul.raw, x.raw)));
  const Vec32<uint32_t> mul10(LowerHalf(mulRs));
  return add + mul10;
}

// i64 result, 2 narrow (i32) lanes: single vmlal.
template<class D, HWY_IF_I64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
        HWY_IF_LANES_D(DN, 2)>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                              VFromD<DN> x, VFromD<D> add) {
  return VFromD<D>(vmlal_s32(add.raw, mul.raw, x.raw));
}

// i64 result, 1 lane: keep only the valid lane 0 product.
template<class D, HWY_IF_I64_D(D), HWY_IF_LANES_D(D, 1),
        class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                             VFromD<DN> x, VFromD<D> add) {
  Vec128<int64_t> mulRs = Vec128<int64_t>(vmull_s32(mul.raw, x.raw));
  const VFromD<D> mul10(LowerHalf(mulRs));
  return add + mul10;
}

// u64 result, 2 narrow (u32) lanes: single vmlal.
template<class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
        HWY_IF_LANES_D(DN, 2)>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                             VFromD<DN> x, VFromD<D> add) {
  return VFromD<D>(vmlal_u32(add.raw, mul.raw, x.raw));
}

// u64 result, 1 lane: keep only the valid lane 0 product.
template<class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
        HWY_IF_LANES_D(DN, 1)>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                             VFromD<DN> x, VFromD<D> add) {
  Vec128<uint64_t> mulRs = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw));
  const VFromD<D> mul10(LowerHalf(mulRs));
  return add + mul10;
}
   7229 
   7230 #if 0
   7231 #if HWY_HAVE_FLOAT16
   7232 template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
   7233         HWY_IF_LANES_D(D, 4)>
   7234 HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
   7235                                  VFromD<DN> x, VFromD<D> add) {
   7236  return VFromD<D>(vfmlalq_low_f16(add.raw, mul.raw, x.raw));
   7237 }
   7238 
   7239 template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
   7240         HWY_IF_LANES_D(DN, 4)>
   7241 HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
   7242                                  VFromD<DN> x, VFromD<D> add) {
   7243  return Vec64<float32_t>(vfmlal_low_f16(add.raw, mul.raw, x.raw));
   7244 }
   7245 
   7246 template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
   7247         class DN = RepartitionToNarrow<D>>
   7248 HWY_API VFromD<D> WidenLowMulAdd(D d, VFromD<DN> mul,
   7249                                 VFromD<DN> x, VFromD<D> add) {
   7250  return MulAdd(add, PromoteLowerTo(d, mul), PromoteLowerTo(d, x));
   7251 }
   7252 #endif
   7253 #endif
   7254 
   7255 }  // namespace detail
   7256 
   7257 // ------------------------------ WidenMulAccumulate
   7258 
   7259 #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
   7260 #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
   7261 #else
   7262 #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
   7263 #endif
   7264 
// Returns low + widen(mul_lo) * widen(x_lo) and stores
// high + widen(mul_hi) * widen(x_hi) into `high`.
template<class D, HWY_IF_INTEGER(TFromD<D>), class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
                                    VFromD<D> low, VFromD<D>& high) {
  high = detail::WidenHighMulAdd(d, mul, x, high);
  return detail::WidenMulAdd(d, LowerHalf(mul), LowerHalf(x), low);
}
   7271 
   7272 #if 0
   7273 #ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
   7274 #undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
   7275 #else
   7276 #define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
   7277 #endif
   7278 
   7279 #if HWY_HAVE_FLOAT16
   7280 
   7281 template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
   7282 HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
   7283                                     VFromD<D> low, VFromD<D>& high) {
   7284  high = detail::WidenHighMulAdd(d, mul, x, high);
   7285  return detail::WidenLowMulAdd(d, mul, x, low);
   7286 }
   7287 
   7288 #endif
   7289 #endif
   7290 
   7291 // ------------------------------ SatWidenMulAccumFixedPoint
   7292 
   7293 #ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
   7294 #undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
   7295 #else
   7296 #define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
   7297 #endif
   7298 
// Fixed-point saturating widening multiply-accumulate:
// sum + saturate(2 * a * b), per lane, via the vqdmlal instruction.
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
                                               VFromD<Rebind<int16_t, DI32>> a,
                                               VFromD<Rebind<int16_t, DI32>> b,
                                               VFromD<DI32> sum) {
  return VFromD<DI32>(vqdmlal_s16(sum.raw, a.raw, b.raw));
}

// Partial vectors: widen to full 128-bit vectors, reuse the full-width
// overload, then narrow back; lanes beyond the valid count are ignored.
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
                                               VFromD<Rebind<int16_t, DI32>> a,
                                               VFromD<Rebind<int16_t, DI32>> b,
                                               VFromD<DI32> sum) {
  const Full128<TFromD<DI32>> di32_full;
  const Rebind<int16_t, decltype(di32_full)> di16_full64;
  return ResizeBitCast(
      di32, SatWidenMulAccumFixedPoint(di32_full, ResizeBitCast(di16_full64, a),
                                       ResizeBitCast(di16_full64, b),
                                       ResizeBitCast(di32_full, sum)));
}
   7319 
   7320 // ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
   7321 
   7322 #if HWY_NEON_HAVE_F32_TO_BF16C
   7323 
   7324 #ifdef HWY_NATIVE_MUL_EVEN_BF16
   7325 #undef HWY_NATIVE_MUL_EVEN_BF16
   7326 #else
   7327 #define HWY_NATIVE_MUL_EVEN_BF16
   7328 #endif
   7329 
   7330 #ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   7331 #undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   7332 #else
   7333 #define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
   7334 #endif
   7335 
namespace detail {
#if HWY_NEON_HAVE_BFLOAT16
// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
// bfloat16x4_t or bfloat16x8_t, so no conversion is required.
static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(bfloat16x4_t raw) {
  return raw;
}
static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(bfloat16x8_t raw) {
  return raw;
}
#else
// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
// detail::Vec128<bfloat16_t, N>::type is uint16x4_t or uint16x8_t vector to
// work around compiler bugs that are there with GCC 13 or earlier or Clang 16
// or earlier on AArch64.

// The uint16x4_t or uint16x8_t vector needs to be bitcasted to a bfloat16x4_t
// or a bfloat16x8_t vector for the vbfdot_f32 and vbfdotq_f32 intrinsics if
// HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true
static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(uint16x4_t raw) {
  return vreinterpret_bf16_u16(raw);
}
static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(uint16x8_t raw) {
  return vreinterpretq_bf16_u16(raw);
}
#endif
}  // namespace detail
   7363 
// c + widen(even bf16 lanes of a) * widen(even bf16 lanes of b); vbfmlalbq
// operates on the "bottom" (even-indexed) lanes.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API Vec128<float> MulEvenAdd(D /*d32*/, Vec128<bfloat16_t> a,
                                Vec128<bfloat16_t> b, const Vec128<float> c) {
  return Vec128<float>(vbfmlalbq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
                                   detail::BitCastToRawNeonBF16(b.raw)));
}

// As above but for the odd-indexed ("top", vbfmlaltq) lanes.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API Vec128<float> MulOddAdd(D /*d32*/, Vec128<bfloat16_t> a,
                                Vec128<bfloat16_t> b, const Vec128<float> c) {
  return Vec128<float>(vbfmlaltq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
                                   detail::BitCastToRawNeonBF16(b.raw)));
}

// bf16 dot product of adjacent lane pairs accumulated into sum0; sum1 is
// untouched because vbfdot already sums both elements of each pair.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API Vec128<float> ReorderWidenMulAccumulate(D /*d32*/, Vec128<bfloat16_t> a,
                                               Vec128<bfloat16_t> b,
                                               const Vec128<float> sum0,
                                               Vec128<float>& /*sum1*/) {
  return Vec128<float>(vbfdotq_f32(sum0.raw,
                                   detail::BitCastToRawNeonBF16(a.raw),
                                   detail::BitCastToRawNeonBF16(b.raw)));
}

// There is no non-q version of these instructions: widen partial vectors to
// 128 bits, run the q form, and narrow the result back.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> MulEvenAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a,
                            VFromD<Repartition<bfloat16_t, D>> b,
                            const VFromD<D> c) {
  const Full128<float> d32f;
  const Full128<bfloat16_t> d16f;
  return ResizeBitCast(
      d32, MulEvenAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b),
                      ResizeBitCast(d32f, c)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> MulOddAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a,
                           VFromD<Repartition<bfloat16_t, D>> b,
                           const VFromD<D> c) {
  const Full128<float> d32f;
  const Full128<bfloat16_t> d16f;
  return ResizeBitCast(
      d32, MulOddAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b),
                     ResizeBitCast(d32f, c)));
}

// Partial-vector bf16 dot product; the non-q vbfdot handles 64-bit vectors.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderWidenMulAccumulate(
    D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a,
    VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> sum0,
    VFromD<D>& /*sum1*/) {
  return VFromD<D>(vbfdot_f32(sum0.raw, detail::BitCastToRawNeonBF16(a.raw),
                              detail::BitCastToRawNeonBF16(b.raw)));
}
   7419 
   7420 #endif  // HWY_NEON_HAVE_F32_TO_BF16C
   7421 
// Integer ReorderWidenMulAccumulate: accumulates widened products of the
// upper input halves into sum1 and returns sum0 plus the widened products of
// the lower halves.
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(D /*d32*/, Vec128<int16_t> a,
                                                 Vec128<int16_t> b,
                                                 const Vec128<int32_t> sum0,
                                                 Vec128<int32_t>& sum1) {
#if HWY_ARCH_ARM_A64
  sum1 = Vec128<int32_t>(vmlal_high_s16(sum1.raw, a.raw, b.raw));
#else
  // Armv7 lacks vmlal_high; extract the upper halves explicitly.
  const Full64<int16_t> dh;
  sum1 = Vec128<int32_t>(
      vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
  return Vec128<int32_t>(
      vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> ReorderWidenMulAccumulate(D d32, Vec64<int16_t> a,
                                                Vec64<int16_t> b,
                                                const Vec64<int32_t> sum0,
                                                Vec64<int32_t>& sum1) {
  // vmlal writes into the upper half, which the caller cannot use, so
  // split into two halves.
  const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
  const Vec64<int32_t> mul_32 = UpperHalf(d32, mul_3210);
  sum1 += mul_32;
  return sum0 + LowerHalf(mul_3210);
}

// 2-lane inputs: only the low two vmull products are valid; lane 1 goes to
// sum1, lane 0 to the returned sum.
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> ReorderWidenMulAccumulate(D d32, Vec32<int16_t> a,
                                                Vec32<int16_t> b,
                                                const Vec32<int32_t> sum0,
                                                Vec32<int32_t>& sum1) {
  const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
  const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
  const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
  const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
  sum1 += mul1;
  return sum0 + mul0;
}

// Unsigned variants; structure mirrors the signed ones above.
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> ReorderWidenMulAccumulate(D /*d32*/,
                                                  Vec128<uint16_t> a,
                                                  Vec128<uint16_t> b,
                                                  const Vec128<uint32_t> sum0,
                                                  Vec128<uint32_t>& sum1) {
#if HWY_ARCH_ARM_A64
  sum1 = Vec128<uint32_t>(vmlal_high_u16(sum1.raw, a.raw, b.raw));
#else
  const Full64<uint16_t> dh;
  sum1 = Vec128<uint32_t>(
      vmlal_u16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
  return Vec128<uint32_t>(
      vmlal_u16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw));
}

template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> ReorderWidenMulAccumulate(D d32, Vec64<uint16_t> a,
                                                 Vec64<uint16_t> b,
                                                 const Vec64<uint32_t> sum0,
                                                 Vec64<uint32_t>& sum1) {
  // vmlal writes into the upper half, which the caller cannot use, so
  // split into two halves.
  const Vec128<uint32_t> mul_3210(vmull_u16(a.raw, b.raw));
  const Vec64<uint32_t> mul_32 = UpperHalf(d32, mul_3210);
  sum1 += mul_32;
  return sum0 + LowerHalf(mul_3210);
}

template <class D, HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> ReorderWidenMulAccumulate(D du32, Vec32<uint16_t> a,
                                                 Vec32<uint16_t> b,
                                                 const Vec32<uint32_t> sum0,
                                                 Vec32<uint32_t>& sum1) {
  const Vec128<uint32_t> mul_xx10(vmull_u16(a.raw, b.raw));
  const Vec64<uint32_t> mul_10(LowerHalf(mul_xx10));
  const Vec32<uint32_t> mul0 = LowerHalf(du32, mul_10);
  const Vec32<uint32_t> mul1 = UpperHalf(du32, mul_10);
  sum1 += mul1;
  return sum0 + mul0;
}
   7506 
   7507 // ------------------------------ Combine partial (InterleaveLower)
   7508 // < 64bit input, <= 64 bit result
// Concatenates two partial (< 64-bit) vectors: `lo` provides the lower half
// of the result, `hi` the upper half.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) {
  // First double N (only lower halves will be used). Reusing .raw is valid
  // because partial vectors are stored in full 64-bit NEON registers.
  const VFromD<D> hi2(hi.raw);
  const VFromD<D> lo2(lo.raw);
  // Repartition to two unsigned lanes (each the size of the valid input).
  const Simd<UnsignedFromSize<d.MaxBytes() / 2>, 2, 0> du;
  return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2)));
}
   7518 
   7519 // ------------------------------ RearrangeToOddPlusEven (Combine)
   7520 
namespace detail {
// Armv7 only provides 64-bit (half-vector) pairwise operations.
// Defines Pairwise##name(a, b): adjacent-lane reduction of a then b,
// concatenated into one Vec64 (a's results in the lower half).
#define HWY_NEON_DEF_PAIRWISE_OP(T, name, prefix, suffix)      \
  HWY_INLINE Vec64<T> Pairwise##name(Vec64<T> a, Vec64<T> b) { \
    return Vec64<T>(prefix##_##suffix(a.raw, b.raw));          \
  }

// Note that Armv7 also lacks [u]int64 instructions, which are handled by
// generic_ops-inl.h SumOfLanes etc., hence no 64-bit overloads here.
#define HWY_NEON_DEF_PAIRWISE_OPS(name, prefix)         \
  HWY_NEON_DEF_PAIRWISE_OP(uint32_t, name, prefix, u32) \
  HWY_NEON_DEF_PAIRWISE_OP(uint16_t, name, prefix, u16) \
  HWY_NEON_DEF_PAIRWISE_OP(uint8_t, name, prefix, u8)   \
  HWY_NEON_DEF_PAIRWISE_OP(int32_t, name, prefix, s32)  \
  HWY_NEON_DEF_PAIRWISE_OP(int16_t, name, prefix, s16)  \
  HWY_NEON_DEF_PAIRWISE_OP(int8_t, name, prefix, s8)    \
  HWY_NEON_DEF_PAIRWISE_OP(float32_t, name, prefix, f32)

// PairwiseSum/Min/Max for u8..u32, s8..s32 and f32 via vpadd/vpmin/vpmax.
HWY_NEON_DEF_PAIRWISE_OPS(Sum, vpadd)
HWY_NEON_DEF_PAIRWISE_OPS(Min, vpmin)
HWY_NEON_DEF_PAIRWISE_OPS(Max, vpmax)
#undef HWY_NEON_DEF_PAIRWISE_OPS
#undef HWY_NEON_DEF_PAIRWISE_OP
}  // namespace detail
   7545 
// Combines the two accumulators produced by ReorderWidenMulAccumulate into a
// single vector of odd+even sums.
template <size_t N>
HWY_API Vec128<float, N> RearrangeToOddPlusEven(Vec128<float, N> sum0,
                                                Vec128<float, N> sum1) {
#if HWY_NEON_HAVE_BFLOAT16
  (void)sum1;  // unused by bf16 ReorderWidenMulAccumulate
  return sum0;
#else
  return Add(sum0, sum1);
#endif
}
   7556 
HWY_API Vec128<int32_t> RearrangeToOddPlusEven(Vec128<int32_t> sum0,
                                               Vec128<int32_t> sum1) {
// vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
#if HWY_ARCH_ARM_A64  // pairwise sum is available and what we want
  return Vec128<int32_t>(vpaddq_s32(sum0.raw, sum1.raw));
#else
  // Armv7 lacks vpaddq; emulate the 128-bit pairwise sum from two 64-bit ones.
  const Full128<int32_t> d;
  const Half<decltype(d)> d64;
  const Vec64<int32_t> hi =
      detail::PairwiseSum(LowerHalf(d64, sum1), UpperHalf(d64, sum1));
  const Vec64<int32_t> lo(
      detail::PairwiseSum(LowerHalf(d64, sum0), UpperHalf(d64, sum0)));
  return Combine(d, hi, lo);
#endif
}
   7572 
HWY_API Vec64<int32_t> RearrangeToOddPlusEven(Vec64<int32_t> sum0,
                                              Vec64<int32_t> sum1) {
  // vmlal_s16 multiplied the lower half into sum0 and upper into sum1.
  // A single 64-bit pairwise add interleaves both into one result.
  return detail::PairwiseSum(sum0, sum1);
}
   7578 
HWY_API Vec32<int32_t> RearrangeToOddPlusEven(Vec32<int32_t> sum0,
                                              Vec32<int32_t> sum1) {
  // Only one widened sum per register, so add them for sum of odd and even.
  return sum0 + sum1;
}
   7584 
HWY_API Vec128<uint32_t> RearrangeToOddPlusEven(Vec128<uint32_t> sum0,
                                                Vec128<uint32_t> sum1) {
// vmlal_u16 multiplied the lower half into sum0 and upper into sum1.
#if HWY_ARCH_ARM_A64  // pairwise sum is available and what we want
  return Vec128<uint32_t>(vpaddq_u32(sum0.raw, sum1.raw));
#else
  // Armv7 lacks vpaddq; emulate the 128-bit pairwise sum from two 64-bit ones.
  const Full128<uint32_t> d;
  const Half<decltype(d)> d64;
  const Vec64<uint32_t> hi =
      detail::PairwiseSum(LowerHalf(d64, sum1), UpperHalf(d64, sum1));
  const Vec64<uint32_t> lo =
      detail::PairwiseSum(LowerHalf(d64, sum0), UpperHalf(d64, sum0));
  return Combine(d, hi, lo);
#endif
}
   7600 
HWY_API Vec64<uint32_t> RearrangeToOddPlusEven(Vec64<uint32_t> sum0,
                                               Vec64<uint32_t> sum1) {
  // vmlal_u16 multiplied the lower half into sum0 and upper into sum1.
  // A single 64-bit pairwise add interleaves both into one result.
  return detail::PairwiseSum(sum0, sum1);
}
   7606 
HWY_API Vec32<uint32_t> RearrangeToOddPlusEven(Vec32<uint32_t> sum0,
                                               Vec32<uint32_t> sum1) {
  // Only one widened sum per register, so add them for sum of odd and even.
  return sum0 + sum1;
}
   7612 
   7613 // ------------------------------ SumOfMulQuadAccumulate
   7614 
   7615 #if HWY_TARGET == HWY_NEON_BF16
   7616 
   7617 #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
   7618 #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
   7619 #else
   7620 #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE
   7621 #endif
   7622 
// i8 x i8 dot product: for each i32 lane, sum += a[4i]*b[4i] + ... +
// a[4i+3]*b[4i+3], via the SDOT instruction.
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/,
                                            VFromD<Repartition<int8_t, DI32>> a,
                                            VFromD<Repartition<int8_t, DI32>> b,
                                            VFromD<DI32> sum) {
  return VFromD<DI32>(vdot_s32(sum.raw, a.raw, b.raw));
}

// Full-vector variant of the above.
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/,
                                            VFromD<Repartition<int8_t, DI32>> a,
                                            VFromD<Repartition<int8_t, DI32>> b,
                                            VFromD<DI32> sum) {
  return VFromD<DI32>(vdotq_s32(sum.raw, a.raw, b.raw));
}
   7638 
   7639 #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
   7640 #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
   7641 #else
   7642 #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE
   7643 #endif
   7644 
// u8 x u8 dot product accumulating groups of four into u32, via UDOT.
template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 8)>
HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
    DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
  return VFromD<DU32>(vdot_u32(sum.raw, a.raw, b.raw));
}

// Full-vector variant of the above.
template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_D(DU32, 16)>
HWY_API VFromD<DU32> SumOfMulQuadAccumulate(
    DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a,
    VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) {
  return VFromD<DU32>(vdotq_u32(sum.raw, a.raw, b.raw));
}
   7658 
   7659 #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
   7660 #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
   7661 #else
   7662 #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE
   7663 #endif
   7664 
// Mixed-sign dot product (unsigned a times signed b) into i32, via USDOT.
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
    DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
  return VFromD<DI32>(vusdot_s32(sum.raw, a_u.raw, b_i.raw));
}

// Full-vector variant of the above.
template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
HWY_API VFromD<DI32> SumOfMulQuadAccumulate(
    DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u,
    VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) {
  return VFromD<DI32>(vusdotq_s32(sum.raw, a_u.raw, b_i.raw));
}
   7678 
   7679 #endif  // HWY_TARGET == HWY_NEON_BF16
   7680 
   7681 // ------------------------------ WidenMulPairwiseAdd
   7682 
   7683 #if HWY_NEON_HAVE_F32_TO_BF16C
   7684 
// bf16 pairwise widening multiply-add via the BFDOT instruction: each f32
// lane receives a[2i]*b[2i] + a[2i+1]*b[2i+1], starting from zero.
template <class DF, HWY_IF_V_SIZE_D(DF, 16)>
HWY_API Vec128<float> WidenMulPairwiseAdd(DF df, Vec128<bfloat16_t> a,
                                          Vec128<bfloat16_t> b) {
  return Vec128<float>(vbfdotq_f32(Zero(df).raw,
                                   detail::BitCastToRawNeonBF16(a.raw),
                                   detail::BitCastToRawNeonBF16(b.raw)));
}

// Partial-vector variant of the above.
template <class DF, HWY_IF_V_SIZE_LE_D(DF, 8)>
HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df,
                                       VFromD<Repartition<bfloat16_t, DF>> a,
                                       VFromD<Repartition<bfloat16_t, DF>> b) {
  return VFromD<DF>(vbfdot_f32(Zero(df).raw,
                               detail::BitCastToRawNeonBF16(a.raw),
                               detail::BitCastToRawNeonBF16(b.raw)));
}
   7701 
   7702 #else
   7703 template <class DF, HWY_IF_F32_D(DF)>
   7704 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df,
   7705                                       VFromD<Repartition<bfloat16_t, DF>> a,
   7706                                       VFromD<Repartition<bfloat16_t, DF>> b) {
   7707  return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b),
   7708                Mul(PromoteOddTo(df, a), PromoteOddTo(df, b)));
   7709 }
   7710 #endif  // HWY_NEON_HAVE_F32_TO_BF16C
   7711 
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> WidenMulPairwiseAdd(D /*d32*/, Vec128<int16_t> a,
                                            Vec128<int16_t> b) {
  Vec128<int32_t> sum1;
#if HWY_ARCH_ARM_A64
  // Widening multiply of the upper i16 halves.
  sum1 = Vec128<int32_t>(vmull_high_s16(a.raw, b.raw));
#else
  // Armv7 lacks vmull_high_s16; extract the upper halves explicitly.
  const Full64<int16_t> dh;
  sum1 = Vec128<int32_t>(vmull_s16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
  Vec128<int32_t> sum0 =
      Vec128<int32_t>(vmull_s16(LowerHalf(a).raw, LowerHalf(b).raw));
  // Pairwise-add so each output lane is the sum of one even/odd product pair.
  return RearrangeToOddPlusEven(sum0, sum1);
}
   7726 
   7727 template <class D, HWY_IF_I32_D(D)>
   7728 HWY_API Vec64<int32_t> WidenMulPairwiseAdd(D d32, Vec64<int16_t> a,
   7729                                           Vec64<int16_t> b) {
   7730  // vmlal writes into the upper half, which the caller cannot use, so
   7731  // split into two halves.
   7732  const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw));
   7733  const Vec64<int32_t> mul0 = LowerHalf(mul_3210);
   7734  const Vec64<int32_t> mul1 = UpperHalf(d32, mul_3210);
   7735  return RearrangeToOddPlusEven(mul0, mul1);
   7736 }
   7737 
   7738 template <class D, HWY_IF_I32_D(D)>
   7739 HWY_API Vec32<int32_t> WidenMulPairwiseAdd(D d32, Vec32<int16_t> a,
   7740                                           Vec32<int16_t> b) {
   7741  const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw));
   7742  const Vec64<int32_t> mul_10(LowerHalf(mul_xx10));
   7743  const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10);
   7744  const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10);
   7745  return RearrangeToOddPlusEven(mul0, mul1);
   7746 }
   7747 
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> WidenMulPairwiseAdd(D /*d32*/, Vec128<uint16_t> a,
                                             Vec128<uint16_t> b) {
  Vec128<uint32_t> sum1;
#if HWY_ARCH_ARM_A64
  // Widening multiply of the upper u16 halves.
  sum1 = Vec128<uint32_t>(vmull_high_u16(a.raw, b.raw));
#else
  // Armv7 lacks vmull_high_u16; extract the upper halves explicitly.
  const Full64<uint16_t> dh;
  sum1 =
      Vec128<uint32_t>(vmull_u16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw));
#endif
  Vec128<uint32_t> sum0 =
      Vec128<uint32_t>(vmull_u16(LowerHalf(a).raw, LowerHalf(b).raw));
  // Pairwise-add so each output lane is the sum of one even/odd product pair.
  return RearrangeToOddPlusEven(sum0, sum1);
}
   7763 
   7764 template <class D, HWY_IF_U32_D(D)>
   7765 HWY_API Vec64<uint32_t> WidenMulPairwiseAdd(D d32, Vec64<uint16_t> a,
   7766                                            Vec64<uint16_t> b) {
   7767  // vmlal writes into the upper half, which the caller cannot use, so
   7768  // split into two halves.
   7769  const Vec128<uint32_t> mul_3210(vmull_u16(a.raw, b.raw));
   7770  const Vec64<uint32_t> mul0 = LowerHalf(mul_3210);
   7771  const Vec64<uint32_t> mul1 = UpperHalf(d32, mul_3210);
   7772  return RearrangeToOddPlusEven(mul0, mul1);
   7773 }
   7774 
   7775 template <class D, HWY_IF_U32_D(D)>
   7776 HWY_API Vec32<uint32_t> WidenMulPairwiseAdd(D d32, Vec32<uint16_t> a,
   7777                                            Vec32<uint16_t> b) {
   7778  const Vec128<uint32_t> mul_xx10(vmull_u16(a.raw, b.raw));
   7779  const Vec64<uint32_t> mul_10(LowerHalf(mul_xx10));
   7780  const Vec32<uint32_t> mul0 = LowerHalf(d32, mul_10);
   7781  const Vec32<uint32_t> mul1 = UpperHalf(d32, mul_10);
   7782  return RearrangeToOddPlusEven(mul0, mul1);
   7783 }
   7784 
   7785 // ------------------------------ ZeroExtendVector (Combine)
   7786 
   7787 template <class D>
   7788 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) {
   7789  return Combine(d, Zero(Half<decltype(d)>()), lo);
   7790 }
   7791 
   7792 // ------------------------------ ConcatLowerLower
   7793 
   7794 // 64 or 128-bit input: just interleave
// 64 or 128-bit input: just interleave
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  // Treat half-width input as a single lane and interleave them.
  const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
  return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi)));
}
   7801 
namespace detail {
#if HWY_ARCH_ARM_A64
// A64: TRN1/TRN2 directly provide the even/odd interleave.
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveEven, vtrn1, _, 2)
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveOdd, vtrn2, _, 2)
#else

// vtrn returns a struct with even and odd result.
#define HWY_NEON_BUILD_TPL_HWY_TRN
#define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t
// Pass raw args so we can accept uint16x2 args, for which there is no
// corresponding uint16x2x2 return type.
#define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \
  Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b
#define HWY_NEON_BUILD_ARG_HWY_TRN a, b

// Cannot use UINT8 etc. type macros because the x2_t tuples are only defined
// for full and half vectors.
HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN)
HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN)

#undef HWY_NEON_BUILD_TPL_HWY_TRN
#undef HWY_NEON_BUILD_RET_HWY_TRN
#undef HWY_NEON_BUILD_PARAM_HWY_TRN
#undef HWY_NEON_BUILD_ARG_HWY_TRN

#endif  // HWY_ARCH_ARM_A64
}  // namespace detail
   7841 
   7842 // <= 32-bit input/output
// <= 32-bit input/output
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) {
  // Treat half-width input as two lanes and take every second one.
  const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
#if HWY_ARCH_ARM_A64
  return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi)));
#else
  // Armv7: vtrn returns both even and odd interleaves; val[0] is the even one.
  using VU = VFromD<decltype(du)>;
  return BitCast(
      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
                .val[0]));
#endif
}
   7856 
   7857 // ------------------------------ ConcatUpperUpper
   7858 
   7859 // 64 or 128-bit input: just interleave
// 64 or 128-bit input: just interleave
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  // Treat half-width input as a single lane and interleave them.
  const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
  return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi)));
}
   7866 
   7867 // <= 32-bit input/output
// <= 32-bit input/output
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  // Treat half-width input as two lanes and take every second one.
  const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du;
#if HWY_ARCH_ARM_A64
  return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi)));
#else
  // Armv7: vtrn returns both even and odd interleaves; val[1] is the odd one.
  using VU = VFromD<decltype(du)>;
  return BitCast(
      d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw)
                .val[1]));
#endif
}
   7881 
   7882 // ------------------------------ ConcatLowerUpper (ShiftLeftBytes)
   7883 
   7884 // 64 or 128-bit input: extract from concatenated
// 64 or 128-bit input: extract from concatenated
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  // Shifting the hi:lo pair right by half a vector leaves lo's upper half in
  // the lower half of the result and hi's lower half above it.
  return CombineShiftRightBytes<d.MaxBytes() / 2>(d, hi, lo);
}
   7889 
   7890 // <= 32-bit input/output
// <= 32-bit input/output
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  // Work in full 64-bit byte vectors because CombineShiftRightBytes requires
  // at least a full register.
  const Repartition<uint8_t, decltype(d)> d8;
  const Full64<uint8_t> d8x8;
  const Full64<TFromD<D>> d64;
  using V8x8 = VFromD<decltype(d8x8)>;
  const V8x8 hi8x8(BitCast(d8, hi).raw);
  // Move into most-significant bytes
  const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw));
  const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8);
  // Back to original lane type, then shrink N.
  return VFromD<D>(BitCast(d64, r).raw);
}
   7905 
   7906 // ------------------------------ ConcatUpperLower
   7907 
   7908 // Works for all N.
   7909 template <class D>
   7910 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) {
   7911  return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi);
   7912 }
   7913 
   7914 // ------------------------------ ConcatOdd (InterleaveUpper)
   7915 
namespace detail {
// There is no vuzpq_u64.
// UZP1/UZP2 gather the even/odd lanes of the a:b concatenation.
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatEven, vuzp1, _, 2)
HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatOdd, vuzp2, _, 2)

#if !HWY_HAVE_FLOAT16
// Without native f16 support, route f16 through the same-sized unsigned type.
template <size_t N>
HWY_INLINE Vec128<float16_t, N> ConcatEven(Vec128<float16_t, N> hi,
                                           Vec128<float16_t, N> lo) {
  const DFromV<decltype(hi)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, ConcatEven(BitCast(du, hi), BitCast(du, lo)));
}
template <size_t N>
HWY_INLINE Vec128<float16_t, N> ConcatOdd(Vec128<float16_t, N> hi,
                                          Vec128<float16_t, N> lo) {
  const DFromV<decltype(hi)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, ConcatOdd(BitCast(du, hi), BitCast(du, lo)));
}
#endif  // !HWY_HAVE_FLOAT16
}  // namespace detail
   7938 
   7939 // Full/half vector
// Full/half vector
template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
  // detail::ConcatOdd takes (lo, hi) order.
  return detail::ConcatOdd(lo, hi);
}
   7944 
   7945 // 8-bit x4
   7946 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
   7947 HWY_API Vec32<T> ConcatOdd(D d, Vec32<T> hi, Vec32<T> lo) {
   7948  const Twice<decltype(d)> d2;
   7949  const Repartition<uint16_t, decltype(d2)> dw2;
   7950  const VFromD<decltype(d2)> hi2(hi.raw);
   7951  const VFromD<decltype(d2)> lo2(lo.raw);
   7952  const VFromD<decltype(dw2)> Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2));
   7953  // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
   7954  // vcopy_lane_u16, but that's A64-only.
   7955  return Vec32<T>(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw);
   7956 }
   7957 
   7958 // Any type x2
   7959 template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>>
   7960 HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
   7961  return InterleaveUpper(d, lo, hi);
   7962 }
   7963 
   7964 // ------------------------------ ConcatEven (InterleaveLower)
   7965 
   7966 // Full/half vector
   7967 template <class D, HWY_IF_V_SIZE_GT_D(D, 4)>
   7968 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) {
   7969  return detail::ConcatEven(lo, hi);
   7970 }
   7971 
   7972 // 8-bit x4
   7973 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
   7974 HWY_API Vec32<T> ConcatEven(D d, Vec32<T> hi, Vec32<T> lo) {
   7975  const Twice<decltype(d)> d2;
   7976  const Repartition<uint16_t, decltype(d2)> dw2;
   7977  const VFromD<decltype(d2)> hi2(hi.raw);
   7978  const VFromD<decltype(d2)> lo2(lo.raw);
   7979  const VFromD<decltype(dw2)> Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2));
   7980  // Compact into two pairs of u8, skipping the invalid x lanes. Could also use
   7981  // vcopy_lane_u16, but that's A64-only.
   7982  return Vec32<T>(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw);
   7983 }
   7984 
   7985 // Any type x2
   7986 template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>>
   7987 HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) {
   7988  return InterleaveLower(d, lo, hi);
   7989 }
   7990 
   7991 // ------------------------------ DupEven (InterleaveLower)
   7992 
// Broadcasts each even lane into the odd lane above it.
template <typename T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
#if HWY_ARCH_ARM_A64
  return detail::InterleaveEven(v, v);
#else
  // Armv7: vtrn of v with itself; val[0] holds the even-lane interleave.
  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]);
#endif
}
   8002 
   8003 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
   8004 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) {
   8005  return InterleaveLower(DFromV<decltype(v)>(), v, v);
   8006 }
   8007 
   8008 // ------------------------------ DupOdd (InterleaveUpper)
   8009 
// Broadcasts each odd lane into the even lane below it.
template <typename T, size_t N,
          HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
#if HWY_ARCH_ARM_A64
  return detail::InterleaveOdd(v, v);
#else
  // Armv7: vtrn of v with itself; val[1] holds the odd-lane interleave.
  return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]);
#endif
}
   8019 
   8020 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
   8021 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) {
   8022  return InterleaveUpper(DFromV<decltype(v)>(), v, v);
   8023 }
   8024 
   8025 // ------------------------------ OddEven (IfThenElse)
   8026 
// Returns a vector with b's even lanes and a's odd lanes.
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) {
  const DFromV<decltype(a)> d;
  const Repartition<uint8_t, decltype(d)> d8;
  // Byte mask: 0xFF for bytes belonging to even lanes (select b), 0 for odd.
  alignas(16) static constexpr uint8_t kBytes[16] = {
      ((0 / sizeof(T)) & 1) ? 0 : 0xFF,  ((1 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((2 / sizeof(T)) & 1) ? 0 : 0xFF,  ((3 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((4 / sizeof(T)) & 1) ? 0 : 0xFF,  ((5 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((6 / sizeof(T)) & 1) ? 0 : 0xFF,  ((7 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((8 / sizeof(T)) & 1) ? 0 : 0xFF,  ((9 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF,
      ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 0 : 0xFF,
  };
  const auto vec = BitCast(d, Load(d8, kBytes));
  return IfThenElse(MaskFromVec(vec), b, a);
}
   8044 
   8045 // ------------------------------ InterleaveEven
// ------------------------------ InterleaveEven
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
#if HWY_ARCH_ARM_A64
  return detail::InterleaveEven(a, b);
#else
  // Armv7: vtrn returns both interleaves; val[0] is the even one.
  return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[0]);
#endif
}
   8054 
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
  // 64-bit lanes: the even lanes are the lower lanes.
  return InterleaveLower(a, b);
}
   8059 
   8060 // ------------------------------ InterleaveOdd
// ------------------------------ InterleaveOdd
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
#if HWY_ARCH_ARM_A64
  return detail::InterleaveOdd(a, b);
#else
  // Armv7: vtrn returns both interleaves; val[1] is the odd one.
  return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[1]);
#endif
}
   8069 
template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) {
  // 64-bit lanes: the odd lanes are the upper lanes.
  return InterleaveUpper(d, a, b);
}
   8074 
   8075 // ------------------------------ OddEvenBlocks
// ------------------------------ OddEvenBlocks
// At most one 128-bit block per vector, so block 0 ("even") is the result.
template <typename T, size_t N>
HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) {
  return even;
}
   8080 
   8081 // ------------------------------ SwapAdjacentBlocks
// ------------------------------ SwapAdjacentBlocks
// At most one 128-bit block per vector, so there is nothing to swap.
template <typename T, size_t N>
HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) {
  return v;
}
   8086 
   8087 // ------------------------------ InterleaveEvenBlocks
// ------------------------------ InterleaveEvenBlocks
// At most one 128-bit block per vector, so the result is just `a`.
template <class D, class V = VFromD<D>>
HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) {
  return a;
}
   8092 // ------------------------------ InterleaveOddBlocks
   8093 template <class D, class V = VFromD<D>>
   8094 HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) {
   8095  return a;
   8096 }
   8097 
   8098 // ------------------------------ ReverseBlocks
   8099 // Single block: no change
// Single block: no change
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) {
  return v;
}
   8104 
   8105 // ------------------------------ ReorderDemote2To (OddEven)
   8106 
   8107 #if HWY_NEON_HAVE_F32_TO_BF16C
   8108 template <class D, HWY_IF_BF16_D(D)>
   8109 HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
   8110                                   VFromD<Repartition<float, D>> b) {
   8111  const Half<decltype(dbf16)> dh_bf16;
   8112  return Combine(dbf16, DemoteTo(dh_bf16, b), DemoteTo(dh_bf16, a));
   8113 }
   8114 #endif  // HWY_NEON_HAVE_F32_TO_BF16C
   8115 
// Saturating demote of two i64 vectors into one i32 vector (a in the lower
// half, b in the upper).
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> ReorderDemote2To(D d32, Vec128<int64_t> a,
                                         Vec128<int64_t> b) {
  const Vec64<int32_t> a32(vqmovn_s64(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d32;
  // Narrows b directly into the upper half of the result.
  return Vec128<int32_t>(vqmovn_high_s64(a32.raw, b.raw));
#else
  const Vec64<int32_t> b32(vqmovn_s64(b.raw));
  return Combine(d32, b32, a32);
#endif
}
   8128 
   8129 template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
   8130 HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<int64_t, D>> a,
   8131                                   VFromD<Repartition<int64_t, D>> b) {
   8132  const Rebind<int64_t, decltype(d32)> dt;
   8133  return DemoteTo(d32, Combine(dt, b, a));
   8134 }
   8135 
// Saturating demote of two i64 vectors into one u32 vector (negative inputs
// clamp to zero); a in the lower half, b in the upper.
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> ReorderDemote2To(D d32, Vec128<int64_t> a,
                                          Vec128<int64_t> b) {
  const Vec64<uint32_t> a32(vqmovun_s64(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d32;
  // Narrows b directly into the upper half of the result.
  return Vec128<uint32_t>(vqmovun_high_s64(a32.raw, b.raw));
#else
  const Vec64<uint32_t> b32(vqmovun_s64(b.raw));
  return Combine(d32, b32, a32);
#endif
}
   8148 
   8149 template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
   8150 HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<int64_t, D>> a,
   8151                                   VFromD<Repartition<int64_t, D>> b) {
   8152  const Rebind<int64_t, decltype(d32)> dt;
   8153  return DemoteTo(d32, Combine(dt, b, a));
   8154 }
   8155 
// Saturating demote of two u64 vectors into one u32 vector; a in the lower
// half, b in the upper.
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> ReorderDemote2To(D d32, Vec128<uint64_t> a,
                                          Vec128<uint64_t> b) {
  const Vec64<uint32_t> a32(vqmovn_u64(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d32;
  // Narrows b directly into the upper half of the result.
  return Vec128<uint32_t>(vqmovn_high_u64(a32.raw, b.raw));
#else
  const Vec64<uint32_t> b32(vqmovn_u64(b.raw));
  return Combine(d32, b32, a32);
#endif
}
   8168 
   8169 template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
   8170 HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<uint64_t, D>> a,
   8171                                   VFromD<Repartition<uint64_t, D>> b) {
   8172  const Rebind<uint64_t, decltype(d32)> dt;
   8173  return DemoteTo(d32, Combine(dt, b, a));
   8174 }
   8175 
// Saturating demote of two i32 vectors into one i16 vector; a in the lower
// half, b in the upper.
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> ReorderDemote2To(D d16, Vec128<int32_t> a,
                                         Vec128<int32_t> b) {
  const Vec64<int16_t> a16(vqmovn_s32(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d16;
  // Narrows b directly into the upper half of the result.
  return Vec128<int16_t>(vqmovn_high_s32(a16.raw, b.raw));
#else
  const Vec64<int16_t> b16(vqmovn_s32(b.raw));
  return Combine(d16, b16, a16);
#endif
}
   8188 
   8189 template <class D, HWY_IF_I16_D(D)>
   8190 HWY_API Vec64<int16_t> ReorderDemote2To(D /*d16*/, Vec64<int32_t> a,
   8191                                        Vec64<int32_t> b) {
   8192  const Full128<int32_t> d32;
   8193  const Vec128<int32_t> ab = Combine(d32, b, a);
   8194  return Vec64<int16_t>(vqmovn_s32(ab.raw));
   8195 }
   8196 
   8197 template <class D, HWY_IF_I16_D(D)>
   8198 HWY_API Vec32<int16_t> ReorderDemote2To(D /*d16*/, Vec32<int32_t> a,
   8199                                        Vec32<int32_t> b) {
   8200  const Full128<int32_t> d32;
   8201  const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
   8202  return Vec32<int16_t>(vqmovn_s32(Combine(d32, ab, ab).raw));
   8203 }
   8204 
   8205 template <class D, HWY_IF_U16_D(D)>
   8206 HWY_API Vec128<uint16_t> ReorderDemote2To(D d16, Vec128<int32_t> a,
   8207                                          Vec128<int32_t> b) {
   8208  const Vec64<uint16_t> a16(vqmovun_s32(a.raw));
   8209 #if HWY_ARCH_ARM_A64
   8210  (void)d16;
   8211  return Vec128<uint16_t>(vqmovun_high_s32(a16.raw, b.raw));
   8212 #else
   8213  const Vec64<uint16_t> b16(vqmovun_s32(b.raw));
   8214  return Combine(d16, b16, a16);
   8215 #endif
   8216 }
   8217 
   8218 template <class D, HWY_IF_U16_D(D)>
   8219 HWY_API Vec64<uint16_t> ReorderDemote2To(D /*d16*/, Vec64<int32_t> a,
   8220                                         Vec64<int32_t> b) {
   8221  const Full128<int32_t> d32;
   8222  const Vec128<int32_t> ab = Combine(d32, b, a);
   8223  return Vec64<uint16_t>(vqmovun_s32(ab.raw));
   8224 }
   8225 
   8226 template <class D, HWY_IF_U16_D(D)>
   8227 HWY_API Vec32<uint16_t> ReorderDemote2To(D /*d16*/, Vec32<int32_t> a,
   8228                                         Vec32<int32_t> b) {
   8229  const Full128<int32_t> d32;
   8230  const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw));
   8231  return Vec32<uint16_t>(vqmovun_s32(Combine(d32, ab, ab).raw));
   8232 }
   8233 
// Full vectors: demote two u32 vectors to one u16 vector with unsigned
// saturation; a occupies the lower half, b the upper half.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> ReorderDemote2To(D d16, Vec128<uint32_t> a,
                                          Vec128<uint32_t> b) {
  const Vec64<uint16_t> a16(vqmovn_u32(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d16;
  // Narrow b directly into the upper half of the result.
  return Vec128<uint16_t>(vqmovn_high_u32(a16.raw, b.raw));
#else
  // Armv7: narrow separately, then combine.
  const Vec64<uint16_t> b16(vqmovn_u32(b.raw));
  return Combine(d16, b16, a16);
#endif
}

// 64-bit vectors: combine into a full vector, then narrow once.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> ReorderDemote2To(D /*d16*/, Vec64<uint32_t> a,
                                         Vec64<uint32_t> b) {
  const Full128<uint32_t> d32;
  const Vec128<uint32_t> ab = Combine(d32, b, a);
  return Vec64<uint16_t>(vqmovn_u32(ab.raw));
}

// 32-bit vectors (one u32 lane each): interleave a0,b0 first so both survive
// the single saturating narrow.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec32<uint16_t> ReorderDemote2To(D /*d16*/, Vec32<uint32_t> a,
                                         Vec32<uint32_t> b) {
  const Full128<uint32_t> d32;
  const Vec64<uint32_t> ab(vzip1_u32(a.raw, b.raw));
  return Vec32<uint16_t>(vqmovn_u32(Combine(d32, ab, ab).raw));
}
   8262 
// Full vectors: demote two i16 vectors to one i8 vector with signed
// saturation; a occupies the lower half, b the upper half.
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> ReorderDemote2To(D d8, Vec128<int16_t> a,
                                        Vec128<int16_t> b) {
  const Vec64<int8_t> a8(vqmovn_s16(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d8;
  // Narrow b directly into the upper half of the result.
  return Vec128<int8_t>(vqmovn_high_s16(a8.raw, b.raw));
#else
  // Armv7: narrow separately, then combine.
  const Vec64<int8_t> b8(vqmovn_s16(b.raw));
  return Combine(d8, b8, a8);
#endif
}

// Partial vectors (<= 8 bytes): concatenate the inputs, then demote once.
template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<int16_t, D>> a,
                                   VFromD<Repartition<int16_t, D>> b) {
  const Rebind<int16_t, decltype(d8)> dt;
  return DemoteTo(d8, Combine(dt, b, a));
}

// Full vectors: demote two i16 vectors to one u8 vector; vqmovun saturates
// negative inputs to zero (signed-to-unsigned saturation).
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> ReorderDemote2To(D d8, Vec128<int16_t> a,
                                         Vec128<int16_t> b) {
  const Vec64<uint8_t> a8(vqmovun_s16(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d8;
  return Vec128<uint8_t>(vqmovun_high_s16(a8.raw, b.raw));
#else
  const Vec64<uint8_t> b8(vqmovun_s16(b.raw));
  return Combine(d8, b8, a8);
#endif
}

// Partial vectors: concatenate, then demote once.
template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<int16_t, D>> a,
                                   VFromD<Repartition<int16_t, D>> b) {
  const Rebind<int16_t, decltype(d8)> dt;
  return DemoteTo(d8, Combine(dt, b, a));
}

// Full vectors: demote two u16 vectors to one u8 vector with unsigned
// saturation.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> ReorderDemote2To(D d8, Vec128<uint16_t> a,
                                         Vec128<uint16_t> b) {
  const Vec64<uint8_t> a8(vqmovn_u16(a.raw));
#if HWY_ARCH_ARM_A64
  (void)d8;
  return Vec128<uint8_t>(vqmovn_high_u16(a8.raw, b.raw));
#else
  const Vec64<uint8_t> b8(vqmovn_u16(b.raw));
  return Combine(d8, b8, a8);
#endif
}

// Partial vectors: concatenate, then demote once.
template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<uint16_t, D>> a,
                                   VFromD<Repartition<uint16_t, D>> b) {
  const Rebind<uint16_t, decltype(d8)> dt;
  return DemoteTo(d8, Combine(dt, b, a));
}
   8322 
// On NEON, ReorderDemote2To already keeps lanes in order (a -> lower half,
// b -> upper half), so OrderedDemote2To is a direct forward.
template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)>
HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) {
  return ReorderDemote2To(d, a, b);
}

#if HWY_NEON_HAVE_F32_TO_BF16C
// Same for f32 -> bf16 when native conversion instructions are available.
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a,
                                   VFromD<Repartition<float, D>> b) {
  return ReorderDemote2To(dbf16, a, b);
}
#endif  // HWY_NEON_HAVE_F32_TO_BF16C
   8337 
   8338 // ================================================== CRYPTO
   8339 
// (aarch64 or Armv7) and (__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH).
// Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*.
   8342 #if HWY_TARGET != HWY_NEON_WITHOUT_AES
   8343 
   8344 #ifdef HWY_NATIVE_AES
   8345 #undef HWY_NATIVE_AES
   8346 #else
   8347 #define HWY_NATIVE_AES
   8348 #endif
   8349 
// One AES encryption round in AES-NI ordering: SubBytes+ShiftRows+MixColumns
// on `state`, then AddRoundKey with `round_key`.
HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state,
                                 Vec128<uint8_t> round_key) {
  // NOTE: it is important that AESE and AESMC be consecutive instructions so
  // they can be fused. AESE includes AddRoundKey, which is a different ordering
  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
  // round key (the compiler will hopefully optimize this for multiple rounds).
  return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^
         round_key;
}

// Final encryption round: as AESRound but without MixColumns.
HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state,
                                     Vec128<uint8_t> round_key) {
  // XOR by 0 for the same AESE-ordering reason as in AESRound.
  return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
}

// InvMixColumns only (e.g. for converting round keys for the
// equivalent-inverse cipher).
HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) {
  return Vec128<uint8_t>{vaesimcq_u8(state.raw)};
}

// One AES decryption round in AES-NI ordering, then AddRoundKey.
HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state,
                                    Vec128<uint8_t> round_key) {
  // NOTE: it is important that AESD and AESIMC be consecutive instructions so
  // they can be fused. AESD includes AddRoundKey, which is a different ordering
  // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual
  // round key (the compiler will hopefully optimize this for multiple rounds).
  return Vec128<uint8_t>(vaesimcq_u8(vaesdq_u8(state.raw, vdupq_n_u8(0)))) ^
         round_key;
}

// Final decryption round: as AESRoundInv but without InvMixColumns.
HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state,
                                        Vec128<uint8_t> round_key) {
  return Vec128<uint8_t>(vaesdq_u8(state.raw, vdupq_n_u8(0))) ^ round_key;
}

// Carry-less (polynomial) multiply of the low 64-bit lanes; the full 128-bit
// product fills the result vector.
HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b)));
}

// Carry-less multiply of the upper 64-bit lanes.
HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) {
  return Vec128<uint64_t>(
      (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw));
}
   8392 
   8393 #endif  // HWY_TARGET != HWY_NEON_WITHOUT_AES
   8394 
   8395 // ================================================== MISC
   8396 
// Promote bf16 to f32: bf16 is the upper 16 bits of the corresponding f32
// representation, so widen each 16-bit lane to 32 bits and shift left by 16.
template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) {
  const Rebind<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
}
   8403 
   8404 // ------------------------------ Truncations
   8405 
// Single-lane case: on little-endian NEON, truncation keeps the low bytes of
// the lane, so bitcasting to the narrow type and taking lane 0 suffices.
template <class DTo, typename TTo = TFromD<DTo>, typename TFrom,
          HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED(TTo),
          hwy::EnableIf<(sizeof(TTo) < sizeof(TFrom))>* = nullptr>
HWY_API Vec128<TTo, 1> TruncateTo(DTo /* tag */, Vec128<TFrom, 1> v) {
  const Repartition<TTo, DFromV<decltype(v)>> d;
  return Vec128<TTo, 1>{BitCast(d, v).raw};
}

// u64 -> u8: each ConcatEven pass keeps the even narrow lanes, halving the
// byte stride; three passes select byte 0 of each u64 into the low lanes.
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec16<uint8_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  const auto v4 = detail::ConcatEven(v3, v3);
  return LowerHalf(LowerHalf(LowerHalf(v4)));
}

// u64 -> u16: two ConcatEven passes select u16 lane 0 of each u64.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec32<uint16_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Repartition<uint16_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  return LowerHalf(LowerHalf(v3));
}

// u64 -> u32: one pass keeps the low u32 of each u64.
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Repartition<uint32_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

// u32 -> u8 (two or more lanes): two passes for the 4x narrowing.
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  return LowerHalf(LowerHalf(v3));
}

// u32 -> u16: one pass keeps the low u16 of each u32.
template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const Repartition<uint16_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

// u16 -> u8: one pass keeps the low byte of each u16.
template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}
   8465 
   8466 // ------------------------------ MulEven (ConcatEven)
   8467 
// Multiplies even lanes (0, 2 ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
// Implementation: ConcatEven packs the even lanes into the lower half, whose
// vget_low then feeds a widening multiply that fills the full result vector.
HWY_API Vec128<int16_t> MulEven(Vec128<int8_t> a, Vec128<int8_t> b) {
  const DFromV<decltype(a)> d;
  int8x16_t a_packed = ConcatEven(d, a, a).raw;
  int8x16_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<int16_t>(
      vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed)));
}
HWY_API Vec128<uint16_t> MulEven(Vec128<uint8_t> a, Vec128<uint8_t> b) {
  const DFromV<decltype(a)> d;
  uint8x16_t a_packed = ConcatEven(d, a, a).raw;
  uint8x16_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<uint16_t>(
      vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed)));
}
HWY_API Vec128<int32_t> MulEven(Vec128<int16_t> a, Vec128<int16_t> b) {
  const DFromV<decltype(a)> d;
  int16x8_t a_packed = ConcatEven(d, a, a).raw;
  int16x8_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<int32_t>(
      vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed)));
}
HWY_API Vec128<uint32_t> MulEven(Vec128<uint16_t> a, Vec128<uint16_t> b) {
  const DFromV<decltype(a)> d;
  uint16x8_t a_packed = ConcatEven(d, a, a).raw;
  uint16x8_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<uint32_t>(
      vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed)));
}
HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) {
  const DFromV<decltype(a)> d;
  int32x4_t a_packed = ConcatEven(d, a, a).raw;
  int32x4_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<int64_t>(
      vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
}
HWY_API Vec128<uint64_t> MulEven(Vec128<uint32_t> a, Vec128<uint32_t> b) {
  const DFromV<decltype(a)> d;
  uint32x4_t a_packed = ConcatEven(d, a, a).raw;
  uint32x4_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<uint64_t>(
      vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
}

// Partial-vector overloads: same pack-then-widening-multiply approach on
// 64-bit inputs, keeping the low half of the widened product.
template <size_t N>
HWY_API Vec128<int16_t, (N + 1) / 2> MulEven(Vec128<int8_t, N> a,
                                             Vec128<int8_t, N> b) {
  const DFromV<decltype(a)> d;
  int8x8_t a_packed = ConcatEven(d, a, a).raw;
  int8x8_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<int16_t, (N + 1) / 2>(
      vget_low_s16(vmull_s8(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<uint16_t, (N + 1) / 2> MulEven(Vec128<uint8_t, N> a,
                                              Vec128<uint8_t, N> b) {
  const DFromV<decltype(a)> d;
  uint8x8_t a_packed = ConcatEven(d, a, a).raw;
  uint8x8_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<uint16_t, (N + 1) / 2>(
      vget_low_u16(vmull_u8(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<int32_t, (N + 1) / 2> MulEven(Vec128<int16_t, N> a,
                                             Vec128<int16_t, N> b) {
  const DFromV<decltype(a)> d;
  int16x4_t a_packed = ConcatEven(d, a, a).raw;
  int16x4_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<int32_t, (N + 1) / 2>(
      vget_low_s32(vmull_s16(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<uint32_t, (N + 1) / 2> MulEven(Vec128<uint16_t, N> a,
                                              Vec128<uint16_t, N> b) {
  const DFromV<decltype(a)> d;
  uint16x4_t a_packed = ConcatEven(d, a, a).raw;
  uint16x4_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<uint32_t, (N + 1) / 2>(
      vget_low_u32(vmull_u16(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(Vec128<int32_t, N> a,
                                             Vec128<int32_t, N> b) {
  const DFromV<decltype(a)> d;
  int32x2_t a_packed = ConcatEven(d, a, a).raw;
  int32x2_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<int64_t, (N + 1) / 2>(
      vget_low_s64(vmull_s32(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
                                              Vec128<uint32_t, N> b) {
  const DFromV<decltype(a)> d;
  uint32x2_t a_packed = ConcatEven(d, a, a).raw;
  uint32x2_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<uint64_t, (N + 1) / 2>(
      vget_low_u64(vmull_u32(a_packed, b_packed)));
}

// 64-bit lanes: full 128-bit product of lane 0 via scalar Mul128; the low
// half goes to lane 0, the high half to lane 1.
template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
  T hi;
  T lo = Mul128(GetLane(a), GetLane(b), &hi);
  return Dup128VecFromValues(Full128<T>(), lo, hi);
}
   8574 
// Multiplies odd lanes (1, 3 ..) and places the double-wide result into
// even and the upper half into its odd neighbor lane.
// Implementation mirrors MulEven, using ConcatOdd to pack the odd lanes.
HWY_API Vec128<int16_t> MulOdd(Vec128<int8_t> a, Vec128<int8_t> b) {
  const DFromV<decltype(a)> d;
  int8x16_t a_packed = ConcatOdd(d, a, a).raw;
  int8x16_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<int16_t>(
      vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed)));
}
HWY_API Vec128<uint16_t> MulOdd(Vec128<uint8_t> a, Vec128<uint8_t> b) {
  const DFromV<decltype(a)> d;
  uint8x16_t a_packed = ConcatOdd(d, a, a).raw;
  uint8x16_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<uint16_t>(
      vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed)));
}
HWY_API Vec128<int32_t> MulOdd(Vec128<int16_t> a, Vec128<int16_t> b) {
  const DFromV<decltype(a)> d;
  int16x8_t a_packed = ConcatOdd(d, a, a).raw;
  int16x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<int32_t>(
      vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed)));
}
HWY_API Vec128<uint32_t> MulOdd(Vec128<uint16_t> a, Vec128<uint16_t> b) {
  const DFromV<decltype(a)> d;
  uint16x8_t a_packed = ConcatOdd(d, a, a).raw;
  uint16x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<uint32_t>(
      vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed)));
}
HWY_API Vec128<int64_t> MulOdd(Vec128<int32_t> a, Vec128<int32_t> b) {
  const DFromV<decltype(a)> d;
  int32x4_t a_packed = ConcatOdd(d, a, a).raw;
  int32x4_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<int64_t>(
      vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
}
HWY_API Vec128<uint64_t> MulOdd(Vec128<uint32_t> a, Vec128<uint32_t> b) {
  const DFromV<decltype(a)> d;
  uint32x4_t a_packed = ConcatOdd(d, a, a).raw;
  uint32x4_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<uint64_t>(
      vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
}

// Partial-vector overloads: pack odd lanes, widening-multiply, keep low half.
template <size_t N>
HWY_API Vec128<int16_t, (N + 1) / 2> MulOdd(Vec128<int8_t, N> a,
                                            Vec128<int8_t, N> b) {
  const DFromV<decltype(a)> d;
  int8x8_t a_packed = ConcatOdd(d, a, a).raw;
  int8x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<int16_t, (N + 1) / 2>(
      vget_low_s16(vmull_s8(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<uint16_t, (N + 1) / 2> MulOdd(Vec128<uint8_t, N> a,
                                             Vec128<uint8_t, N> b) {
  const DFromV<decltype(a)> d;
  uint8x8_t a_packed = ConcatOdd(d, a, a).raw;
  uint8x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<uint16_t, (N + 1) / 2>(
      vget_low_u16(vmull_u8(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<int32_t, (N + 1) / 2> MulOdd(Vec128<int16_t, N> a,
                                            Vec128<int16_t, N> b) {
  const DFromV<decltype(a)> d;
  int16x4_t a_packed = ConcatOdd(d, a, a).raw;
  int16x4_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<int32_t, (N + 1) / 2>(
      vget_low_s32(vmull_s16(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<uint32_t, (N + 1) / 2> MulOdd(Vec128<uint16_t, N> a,
                                             Vec128<uint16_t, N> b) {
  const DFromV<decltype(a)> d;
  uint16x4_t a_packed = ConcatOdd(d, a, a).raw;
  uint16x4_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<uint32_t, (N + 1) / 2>(
      vget_low_u32(vmull_u16(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<int64_t, (N + 1) / 2> MulOdd(Vec128<int32_t, N> a,
                                            Vec128<int32_t, N> b) {
  const DFromV<decltype(a)> d;
  int32x2_t a_packed = ConcatOdd(d, a, a).raw;
  int32x2_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<int64_t, (N + 1) / 2>(
      vget_low_s64(vmull_s32(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a,
                                             Vec128<uint32_t, N> b) {
  const DFromV<decltype(a)> d;
  uint32x2_t a_packed = ConcatOdd(d, a, a).raw;
  uint32x2_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<uint64_t, (N + 1) / 2>(
      vget_low_u64(vmull_u32(a_packed, b_packed)));
}

// 64-bit lanes: full 128-bit product of lane 1 via scalar Mul128.
template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) {
  T hi;
  T lo = Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi);
  return Dup128VecFromValues(Full128<T>(), lo, hi);
}
   8681 
   8682 // ------------------------------ TableLookupBytes (Combine, LowerHalf)
   8683 
// Both full: returns bytes[from[i]] for each byte index (Arm zeroes lanes
// whose index is >= 16, see TableLookupBytesOr0 below).
template <typename T, typename TI>
HWY_API Vec128<TI> TableLookupBytes(Vec128<T> bytes, Vec128<TI> from) {
  const DFromV<decltype(from)> d;
  const Repartition<uint8_t, decltype(d)> d8;
#if HWY_ARCH_ARM_A64
  // Single 16-byte table lookup.
  return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw,
                                               BitCast(d8, from).raw)));
#else
  // Armv7 vtbl operates on 8-byte halves: split the table into a two-element
  // register list and look up each half of the index vector separately.
  uint8x16_t table0 = BitCast(d8, bytes).raw;
  uint8x8x2_t table;
  table.val[0] = vget_low_u8(table0);
  table.val[1] = vget_high_u8(table0);
  uint8x16_t idx = BitCast(d8, from).raw;
  uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx));
  uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx));
  return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi)));
#endif
}
   8703 
// Partial index vector: widen the indices to 128 bits (duplicating them),
// perform a full lookup, then return the lower half.
template <typename T, typename TI, size_t NI, HWY_IF_V_SIZE_LE(TI, NI, 8)>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T> bytes, Vec128<TI, NI> from) {
  const Full128<TI> d_full;
  const Vec64<TI> from64(from.raw);
  const auto idx_full = Combine(d_full, from64, from64);
  const auto out_full = TableLookupBytes(bytes, idx_full);
  return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw);
}
   8713 
   8714 // Partial table vector
   8715 template <typename T, size_t N, typename TI, HWY_IF_V_SIZE_LE(T, N, 8)>
   8716 HWY_API Vec128<TI> TableLookupBytes(Vec128<T, N> bytes, Vec128<TI> from) {
   8717  const Full128<T> d_full;
   8718  return TableLookupBytes(Combine(d_full, bytes, bytes), from);
   8719 }
   8720 
// Partial both: table and indices each fit in 8 bytes, so one vtbl1 suffices.
template <typename T, size_t N, typename TI, size_t NI,
          HWY_IF_V_SIZE_LE(T, N, 8), HWY_IF_V_SIZE_LE(TI, NI, 8)>
HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes,
                                        Vec128<TI, NI> from) {
  const DFromV<decltype(bytes)> d;
  const Simd<TI, NI, 0> d_idx;
  const Repartition<uint8_t, decltype(d_idx)> d_idx8;
  // uint8x8
  const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes);
  const auto from8 = BitCast(d_idx8, from);
  const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw));
  return BitCast(d_idx, v8);
}

// For all vector widths; Arm anyway zeroes if >= 0x10.
template <class V, class VI>
HWY_API VI TableLookupBytesOr0(V bytes, VI from) {
  return TableLookupBytes(bytes, from);
}
   8741 
   8742 // ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes)
   8743 
   8744 #if HWY_TARGET != HWY_NEON_WITHOUT_AES
// Emulates x86 AESKEYGENASSIST using AESE (via AESLastRound) plus a shuffle.
template <uint8_t kRcon>
HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) {
  // Round constant XORed into one byte of each result word after SubBytes.
  alignas(16) static constexpr uint8_t kRconXorMask[16] = {
      0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0};
  // NOTE(review): this shuffle appears to both rotate the words (RotWord) and
  // undo the ShiftRows performed inside AESLastRound — verify if modifying.
  alignas(16) static constexpr uint8_t kRotWordShuffle[16] = {
      0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12};
  const DFromV<decltype(v)> d;
  const Repartition<uint32_t, decltype(d)> du32;
  // Replicate the odd u32 words (w1, w3), which the x86 instruction uses.
  const auto w13 = BitCast(d, DupOdd(BitCast(du32, v)));
  const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask));
  return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle));
}
   8757 #endif  // HWY_TARGET != HWY_NEON_WITHOUT_AES
   8758 
   8759 // ------------------------------ Scatter in generic_ops-inl.h
   8760 // ------------------------------ Gather in generic_ops-inl.h
   8761 
   8762 // ------------------------------ Reductions
   8763 
   8764 // On Armv8 we define ReduceSum and generic_ops defines SumOfLanes via Set.
   8765 #if HWY_ARCH_ARM_A64
   8766 
   8767 #ifdef HWY_NATIVE_REDUCE_SCALAR
   8768 #undef HWY_NATIVE_REDUCE_SCALAR
   8769 #else
   8770 #define HWY_NATIVE_REDUCE_SCALAR
   8771 #endif
   8772 
// TODO(janwas): use normal HWY_NEON_DEF, then FULL type list.
// Defines `name(d, v)`: scalar reduction of v's `size` lanes via one NEON
// across-lanes instruction (prefix##infix##suffix, e.g. vaddvq_u32).
#define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \
  template <class D, HWY_IF_LANES_D(D, size)>                           \
  HWY_API type##_t name(D /* tag */, Vec128<type##_t, size> v) {        \
    return HWY_NEON_EVAL(prefix##infix##suffix, v.raw);                 \
  }

// Excludes u64/s64 (missing minv/maxv) and f16 (missing addv).
#define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix)       \
  HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8)       \
  HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8)   \
  HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16)     \
  HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16)  \
  HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32)     \
  HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32)  \
  HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8)        \
  HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8)    \
  HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16)      \
  HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16)   \
  HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32)      \
  HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32)   \
  HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32)    \
  HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32) \
  HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64)

// Different interface than HWY_NEON_DEF_FUNCTION_FULL_UI_64.
#define HWY_NEON_DEF_REDUCTION_UI64(name, prefix)            \
  HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64) \
  HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64)

// f16 min/max reductions exist only when native f16 arithmetic is available.
#if HWY_HAVE_FLOAT16
#define HWY_NEON_DEF_REDUCTION_F16(name, prefix)           \
  HWY_NEON_DEF_REDUCTION(float16, 4, name, prefix, _, f16) \
  HWY_NEON_DEF_REDUCTION(float16, 8, name, prefix##q, _, f16)
#else
#define HWY_NEON_DEF_REDUCTION_F16(name, prefix)
#endif

// Instantiate ReduceMin/ReduceMax for core (and, if available, f16) types.
HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMin, vminv)
HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMax, vmaxv)
HWY_NEON_DEF_REDUCTION_F16(ReduceMin, vminv)
HWY_NEON_DEF_REDUCTION_F16(ReduceMax, vmaxv)

// ReduceSum additionally covers u64/s64 (vaddv exists for 64-bit lanes).
HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceSum, vaddv)
HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv)
   8818 
// Emulate missing UI64 and partial N=2.
// N=2 sum for 8/16-bit lanes: no across-lanes instruction at this shape.
template <class D, HWY_IF_LANES_D(D, 2),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v10) {
  return GetLane(v10) + ExtractLane(v10, 1);
}

// N=2 min for 8/16/64-bit integer lanes (u64/s64 lack vminv/vmaxv).
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
HWY_API TFromD<D> ReduceMin(D /* tag */, VFromD<D> v10) {
  return HWY_MIN(GetLane(v10), ExtractLane(v10, 1));
}

// N=2 max, same type coverage as ReduceMin above.
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))>
HWY_API TFromD<D> ReduceMax(D /* tag */, VFromD<D> v10) {
  return HWY_MAX(GetLane(v10), ExtractLane(v10, 1));
}
   8837 
#if HWY_HAVE_FLOAT16
// N=2 f16 min: swap the two lanes and fold (vminv requires >= 4 lanes).
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
HWY_API float16_t ReduceMin(D d, VFromD<D> v10) {
  return GetLane(Min(v10, Reverse2(d, v10)));
}

// N=2 f16 max, same approach.
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)>
HWY_API float16_t ReduceMax(D d, VFromD<D> v10) {
  return GetLane(Max(v10, Reverse2(d, v10)));
}

// f16 sum via pairwise adds (there is no vaddv for f16).
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 8)>
HWY_API float16_t ReduceSum(D /* tag */, VFromD<D> v) {
  const float16x4_t x2 = vpadd_f16(v.raw, v.raw);
  return GetLane(VFromD<D>(vpadd_f16(x2, x2)));
}
// 128-bit: one pairwise add halves the problem, then reuse the 64-bit path.
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API float16_t ReduceSum(D d, VFromD<D> v) {
  const Half<decltype(d)> dh;
  return ReduceSum(dh, LowerHalf(dh, VFromD<D>(vpaddq_f16(v.raw, v.raw))));
}
#endif  // HWY_HAVE_FLOAT16
   8860 
   8861 #undef HWY_NEON_DEF_REDUCTION_CORE_TYPES
   8862 #undef HWY_NEON_DEF_REDUCTION_F16
   8863 #undef HWY_NEON_DEF_REDUCTION_UI64
   8864 #undef HWY_NEON_DEF_REDUCTION
   8865 
   8866 // ------------------------------ SumOfLanes
   8867 
// On Armv8, *OfLanes is simply the scalar Reduce* broadcast to all lanes.
template <class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
  return Set(d, ReduceSum(d, v));
}
template <class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
  return Set(d, ReduceMin(d, v));
}
template <class D, HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
  return Set(d, ReduceMax(d, v));
}
   8880 
   8881 // On Armv7 we define SumOfLanes and generic_ops defines ReduceSum via GetLane.
   8882 #else  // !HWY_ARCH_ARM_A64
   8883 
// Armv7 lacks N=2 (except 32-bit) and 8-bit x4, so enable them in generic_ops.
// These predicates select the shapes that generic_ops-inl.h must handle;
// keep the two definitions in sync.
#undef HWY_IF_SUM_OF_LANES_D
#define HWY_IF_SUM_OF_LANES_D(D)                                        \
  hwy::EnableIf<(sizeof(TFromD<D>) != 4 && HWY_MAX_LANES_D(D) == 2) ||  \
                (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
      nullptr
#undef HWY_IF_MINMAX_OF_LANES_D
#define HWY_IF_MINMAX_OF_LANES_D(D)                                     \
  hwy::EnableIf<(sizeof(TFromD<D>) != 4 && HWY_MAX_LANES_D(D) == 2) ||  \
                (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \
      nullptr
   8895 
   8896 // For arm7, we implement reductions using a series of pairwise operations. This
   8897 // produces the full vector result, so we express Reduce* in terms of *OfLanes.
   8898 
// Defines {Sum,Min,Max}OfLanes for Armv7 in terms of detail::Pairwise{name}.
// log2(N) pairwise steps leave the full reduction result in every lane.
#define HWY_NEON_DEF_PAIRWISE_REDUCTION(name)                               \
  /* generic_ops-inl.h handles 64-bit types. */                             \
  template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_T_SIZE_D(D, 8)>      \
  HWY_API VFromD<D> name##OfLanes(D d, VFromD<D> v) {                       \
    HWY_LANES_CONSTEXPR size_t N = Lanes(d);                                \
    VFromD<D> tmp = detail::Pairwise##name(v, v);                           \
    if ((N / 2) > 1) tmp = detail::Pairwise##name(tmp, tmp);                \
    if ((N / 4) > 1) tmp = detail::Pairwise##name(tmp, tmp);                \
    return tmp;                                                             \
  }                                                                         \
  /* Armv7 lacks q (full-vector) instructions, so first reduce 128-bit v */ \
  /* into a half-vector, then reduce that. */                               \
  template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 8)>     \
  HWY_API VFromD<D> name##OfLanes(D d, VFromD<D> v) {                       \
    const Half<D> dh;                                                       \
    VFromD<decltype(dh)> upper = UpperHalf(dh, v);                          \
    VFromD<decltype(dh)> lower = LowerHalf(dh, v);                          \
    VFromD<decltype(dh)> half = detail::Pairwise##name(upper, lower);       \
    half = name##OfLanes(dh, half);                                         \
    return Combine(d, half, half);                                          \
  }

// Instantiate for the three supported reductions, then retire the macro.
HWY_NEON_DEF_PAIRWISE_REDUCTION(Sum)
HWY_NEON_DEF_PAIRWISE_REDUCTION(Min)
HWY_NEON_DEF_PAIRWISE_REDUCTION(Max)
#undef HWY_NEON_DEF_PAIRWISE_REDUCTION
   8925 
   8926 // GetLane(SumsOf4(v)) is more efficient on ArmV7 NEON than the default
   8927 // N=4 I8/U8 ReduceSum implementation in generic_ops-inl.h
   8928 #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
   8929 #undef HWY_NATIVE_REDUCE_SUM_4_UI8
   8930 #else
   8931 #define HWY_NATIVE_REDUCE_SUM_4_UI8
   8932 #endif
   8933 
// N=4 U8/I8 sum: SumsOf4 leaves the 4-lane total in lane 0, which (per the
// note above) beats the generic implementation on Armv7.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)>
HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) {
  return static_cast<TFromD<D>>(GetLane(SumsOf4(v)));
}
   8938 
   8939 #endif  // HWY_ARCH_ARM_A64
   8940 
   8941 // ------------------------------ LoadMaskBits (TestBit)
   8942 
   8943 namespace detail {
   8944 
   8945 // Helper function to set 64 bits and potentially return a smaller vector. The
   8946 // overload is required to call the q vs non-q intrinsics. Note that 8-bit
   8947 // LoadMaskBits only requires 16 bits, but 64 avoids casting.
// Partial vectors: broadcast mask_bits into a 64-bit lane, then reinterpret
// the raw register as lanes of TFromD<D>.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_INLINE VFromD<D> Set64(D /* tag */, uint64_t mask_bits) {
  const auto v64 = Vec64<uint64_t>(vdup_n_u64(mask_bits));
  return VFromD<D>(BitCast(Full64<TFromD<D>>(), v64).raw);
}
   8953 template <typename T>
   8954 HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) {
   8955  return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits)));
   8956 }
   8957 
// 1-byte lanes: expand up to 16 mask bits so that byte lane i is all-ones iff
// bit i of mask_bits is set.
template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  // Easier than Set(), which would require an >8-bit type, which would not
  // compile for T=uint8_t, N=1.
  const auto vmask_bits = Set64(du, mask_bits);

  // Replicate bytes 8x such that each byte contains the bit that governs it.
  alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                                    1, 1, 1, 1, 1, 1, 1, 1};
  const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8));

  // Select bit (i % 8) within the replicated byte for lane i.
  alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128,
                                                   1, 2, 4, 8, 16, 32, 64, 128};
  return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit)));
}
   8974 
// 2-byte lanes (up to 8): broadcast mask_bits and test one bit per lane.
template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
  const RebindToUnsigned<decltype(d)> du;
  alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128};
  const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits));
  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
}
   8982 
   8983 template <class D, HWY_IF_T_SIZE_D(D, 4)>
   8984 HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
   8985  const RebindToUnsigned<decltype(d)> du;
   8986  alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8};
   8987  const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits));
   8988  return RebindMask(d, TestBit(vmask_bits, Load(du, kBit)));
   8989 }
   8990 
   8991 template <class D, HWY_IF_T_SIZE_D(D, 8)>
   8992 HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) {
   8993  const RebindToUnsigned<decltype(d)> du;
   8994  alignas(16) static constexpr uint64_t kBit[8] = {1, 2};
   8995  return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit)));
   8996 }
   8997 
   8998 }  // namespace detail
   8999 
   9000 // `p` points to at least 8 readable bytes, not all of which need be valid.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) {
  // Gather ceil(MaxLanes/8) bytes into a uint64_t, then expand bit i into
  // lane i of the returned mask via the size-specific helpers above.
  uint64_t mask_bits = 0;
  CopyBytes<(d.MaxLanes() + 7) / 8>(bits, &mask_bits);
  return detail::LoadMaskBits(d, mask_bits);
}
   9007 
   9008 // ------------------------------ Dup128MaskFromMaskBits
   9009 
   9010 template <class D>
   9011 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
   9012  constexpr size_t kN = MaxLanes(d);
   9013  if (kN < 8) mask_bits &= (1u << kN) - 1;
   9014  return detail::LoadMaskBits(d, mask_bits);
   9015 }
   9016 
   9017 // ------------------------------ Mask
   9018 
   9019 namespace detail {
   9020 
   9021 // Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than
   9022 // BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse.
// Full vector: a narrowing right-shift by 4 packs each mask byte into one
// nibble, so the 16 mask bytes become 16 nibbles of a uint64_t.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) {
  const Full128<uint16_t> du16;
  const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask));
  const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4));
  return GetLane(BitCast(Full64<uint64_t>(), nib));
}
   9030 
// 64-bit vector: widen to 128 bits and reuse the full-vector path.
template <class D, HWY_IF_V_SIZE_D(D, 8)>
HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) {
  // There is no vshrn_n_u16 for uint16x4, so zero-extend.
  const Twice<decltype(d)> d2;
  const VFromD<decltype(d2)> v128 = ZeroExtendVector(d2, VecFromMask(d, mask));
  // No need to mask, upper half is zero thanks to ZeroExtendVector.
  return NibblesFromMask(d2, MaskFromVec(v128));
}
   9039 
// <= 4-byte vector: treat the raw register as a 64-bit mask, then discard
// nibbles that do not correspond to actual bytes of the partial vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) {
  const Mask64<TFromD<D>> mask64(mask.raw);
  const uint64_t nib = NibblesFromMask(Full64<TFromD<D>>(), mask64);
  // Clear nibbles from upper half of 64-bits
  return nib & ((1ull << (d.MaxBytes() * 4)) - 1);
}
   9047 
   9048 // Returns the lowest N for the BitsFromMask result.
   9049 template <class D>
   9050 constexpr uint64_t OnlyActive(D d, uint64_t bits) {
   9051  return (d.MaxBytes() >= 8) ? bits : (bits & ((1ull << d.MaxLanes()) - 1));
   9052 }
   9053 
   9054 }  // namespace detail
   9055 
// 1-byte lanes, full vector: isolate one power-of-two bit per lane, then sum
// the lanes horizontally; the result occupies 16 bits.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  alignas(16) static constexpr uint8_t kSliceLanes[16] = {
      1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80,
  };
  const RebindToUnsigned<D> du;
  const Vec128<uint8_t> values =
      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);

#if HWY_ARCH_ARM_A64
  // Can't vaddv - we need two separate bytes (16 bits).
  const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw));
  const uint8x8_t x4 = vpadd_u8(x2, x2);
  const uint8x8_t x8 = vpadd_u8(x4, x4);
  return vget_lane_u64(vreinterpret_u64_u8(x8), 0) & 0xFFFF;
#else
  // Don't have vpaddq, so keep doubling lane size.
  const uint16x8_t x2 = vpaddlq_u8(values.raw);
  const uint32x4_t x4 = vpaddlq_u16(x2);
  const uint64x2_t x8 = vpaddlq_u32(x4);
  return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0);
#endif
}
   9079 
// 1-byte lanes, partial vector: same bit-slice-and-sum approach; OnlyActive
// clears bits produced by undefined upper lanes.
template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
  alignas(8) static constexpr uint8_t kSliceLanes[8] = {1,    2,    4,    8,
                                                        0x10, 0x20, 0x40, 0x80};
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const VU slice(Load(Full64<uint8_t>(), kSliceLanes).raw);
  const VU values = BitCast(du, VecFromMask(d, mask)) & slice;

#if HWY_ARCH_ARM_A64
  return detail::OnlyActive(d, vaddv_u8(values.raw));
#else
  // Armv7: widening pairwise adds down to a single 64-bit total.
  const uint16x4_t x2 = vpaddl_u8(values.raw);
  const uint32x2_t x4 = vpaddl_u16(x2);
  const uint64x1_t x8 = vpaddl_u32(x4);
  return detail::OnlyActive(d, vget_lane_u64(x8, 0));
#endif
}
   9100 
// 2-byte lanes, full vector: one bit per lane, summed horizontally (8 bits).
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  alignas(16) static constexpr uint16_t kSliceLanes[8] = {
      1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80};
  const RebindToUnsigned<D> du;
  const Vec128<uint16_t> values =
      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
#if HWY_ARCH_ARM_A64
  return detail::OnlyActive(d, vaddvq_u16(values.raw));
#else
  // Armv7: widening pairwise adds, then combine the two 64-bit halves.
  const uint32x4_t x2 = vpaddlq_u16(values.raw);
  const uint64x2_t x4 = vpaddlq_u32(x2);
  return detail::OnlyActive(d, vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1));
#endif
}
   9116 
// 2-byte lanes, partial vector.
template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
  alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8};
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const VU slice(Load(Full64<uint16_t>(), kSliceLanes).raw);
  const VU values = BitCast(du, VecFromMask(d, mask)) & slice;
#if HWY_ARCH_ARM_A64
  return detail::OnlyActive(d, vaddv_u16(values.raw));
#else
  const uint32x2_t x2 = vpaddl_u16(values.raw);
  const uint64x1_t x4 = vpaddl_u32(x2);
  return detail::OnlyActive(d, vget_lane_u64(x4, 0));
#endif
}
   9134 
// 4-byte lanes, full vector: one bit per lane, summed horizontally (4 bits).
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8};
  const RebindToUnsigned<D> du;
  const Vec128<uint32_t> values =
      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
#if HWY_ARCH_ARM_A64
  return detail::OnlyActive(d, vaddvq_u32(values.raw));
#else
  const uint64x2_t x2 = vpaddlq_u32(values.raw);
  return detail::OnlyActive(d, vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1));
#endif
}
   9148 
// 4-byte lanes, partial vector.
template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  // Upper lanes of partial loads are undefined. OnlyActive will fix this if
  // we load all kSliceLanes so the upper lanes do not pollute the valid bits.
  alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2};
  const RebindToUnsigned<decltype(d)> du;
  using VU = VFromD<decltype(du)>;
  const VU slice(Load(Full64<uint32_t>(), kSliceLanes).raw);
  const VU values = BitCast(du, VecFromMask(d, mask)) & slice;
#if HWY_ARCH_ARM_A64
  return detail::OnlyActive(d, vaddv_u32(values.raw));
#else
  const uint64x1_t x2 = vpaddl_u32(values.raw);
  return detail::OnlyActive(d, vget_lane_u64(x2, 0));
#endif
}
   9165 
// 8-byte lanes, full vector: two lanes contribute bits 0 and 1.
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 16)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2};
  const RebindToUnsigned<decltype(d)> du;
  const Vec128<uint64_t> values =
      BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes);
#if HWY_ARCH_ARM_A64
  return detail::OnlyActive(d, vaddvq_u64(values.raw));
#else
  return detail::OnlyActive(
      d, vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1));
#endif
}
   9179 
// Single 8-byte lane: the result is just lane 0's mask, reduced to its LSB.
template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) {
  const RebindToUnsigned<decltype(d)> du;
  const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, mask)) & Set(du, 1);
  return vget_lane_u64(values.raw, 0);
}
   9186 
   9187 namespace detail {
   9188 
   9189 // Returns number of lanes whose mask is set.
   9190 //
   9191 // Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op
   9192 // ("vsubv"). ANDing with 1 would work but requires a constant. Negating also
   9193 // changes each lane to 1 (if mask set) or 0.
   9194 // NOTE: PopCount also operates on vectors, so we still have to do horizontal
   9195 // sums separately. We specialize CountTrue for full vectors (negating instead
   9196 // of PopCount because it avoids an extra shift), and use PopCount of
   9197 // NibblesFromMask for partial vectors.
   9198 
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) {
  // Negating turns each 0xFF (-1) mask lane into +1; the lane sum is the
  // number of true lanes.
  const Full128<int8_t> di;
  const int8x16_t ones =
      vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s8(ones));
#else
  // Armv7 lacks vaddv; widening pairwise adds down to a 64-bit total.
  const int16x8_t x2 = vpaddlq_s8(ones);
  const int32x4_t x4 = vpaddlq_s16(x2);
  const int64x2_t x8 = vpaddlq_s32(x4);
  return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1));
#endif
}
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, Mask128<T> mask) {
  // Negate all-ones mask lanes to +1, then sum the lanes.
  const Full128<int16_t> di;
  const int16x8_t ones =
      vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s16(ones));
#else
  const int32x4_t x2 = vpaddlq_s16(ones);
  const int64x2_t x4 = vpaddlq_s32(x2);
  return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1));
#endif
}
   9228 
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, Mask128<T> mask) {
  // Negate all-ones mask lanes to +1, then sum the lanes.
  const Full128<int32_t> di;
  const int32x4_t ones =
      vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);

#if HWY_ARCH_ARM_A64
  return static_cast<size_t>(vaddvq_s32(ones));
#else
  const int64x2_t x2 = vpaddlq_s32(ones);
  return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1));
#endif
}
   9242 
template <typename T>
HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, Mask128<T> mask) {
#if HWY_ARCH_ARM_A64
  // Negate all-ones mask lanes to +1, then sum the two lanes.
  const Full128<int64_t> di;
  const int64x2_t ones =
      vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw);
  return static_cast<size_t>(vaddvq_s64(ones));
#else
  // Armv7 lacks vnegq_s64; shifting the mask right by 63 also yields 1 per
  // true lane.
  const Full128<uint64_t> du;
  const auto mask_u = VecFromMask(du, RebindMask(du, mask));
  const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63);
  return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1));
#endif
}
   9257 
   9258 }  // namespace detail
   9259 
   9260 // Full
   9261 template <class D, typename T = TFromD<D>>
   9262 HWY_API size_t CountTrue(D /* tag */, Mask128<T> mask) {
   9263  return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask);
   9264 }
   9265 
   9266 // Partial
   9267 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   9268 HWY_API size_t CountTrue(D d, MFromD<D> mask) {
   9269  constexpr int kDiv = 4 * sizeof(TFromD<D>);
   9270  return PopCount(detail::NibblesFromMask(d, mask)) / kDiv;
   9271 }
   9272 
   9273 template <class D>
   9274 HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) {
   9275  const uint64_t nib = detail::NibblesFromMask(d, mask);
   9276  constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
   9277  return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv;
   9278 }
   9279 
   9280 template <class D>
   9281 HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) {
   9282  const uint64_t nib = detail::NibblesFromMask(d, mask);
   9283  if (nib == 0) return -1;
   9284  constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
   9285  return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv);
   9286 }
   9287 
   9288 template <class D>
   9289 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) {
   9290  const uint64_t nib = detail::NibblesFromMask(d, mask);
   9291  constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
   9292  return (63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) / kDiv;
   9293 }
   9294 
   9295 template <class D>
   9296 HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) {
   9297  const uint64_t nib = detail::NibblesFromMask(d, mask);
   9298  if (nib == 0) return -1;
   9299  constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
   9300  return static_cast<intptr_t>((63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) /
   9301                               kDiv);
   9302 }
   9303 
   9304 // `p` points to at least 8 writable bytes.
template <class D>
HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
  // Serialize one bit per lane into `bits` and return the number of bytes
  // written (ceil(MaxLanes/8)).
  const uint64_t mask_bits = BitsFromMask(d, mask);
  // kNumBytes is a constant expression; it is also used as a template arg.
  const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
  CopyBytes<kNumBytes>(&mask_bits, bits);
  return kNumBytes;
}
   9312 
   9313 template <class D>
   9314 HWY_API bool AllFalse(D d, MFromD<D> m) {
   9315  return detail::NibblesFromMask(d, m) == 0;
   9316 }
   9317 
   9318 // Full
   9319 template <class D, typename T = TFromD<D>>
   9320 HWY_API bool AllTrue(D d, Mask128<T> m) {
   9321  return detail::NibblesFromMask(d, m) == ~0ull;
   9322 }
   9323 // Partial
   9324 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
   9325 HWY_API bool AllTrue(D d, MFromD<D> m) {
   9326  return detail::NibblesFromMask(d, m) == (1ull << (d.MaxBytes() * 4)) - 1;
   9327 }
   9328 
   9329 // ------------------------------ Compress
   9330 
// Trait: nonzero iff this target's Compress keeps all lanes (a partition).
// 1-byte lanes use a different implementation, hence value == 0 for them.
template <typename T>
struct CompressIsPartition {
  enum { value = (sizeof(T) != 1) };
};
   9335 
   9336 namespace detail {
   9337 
   9338 // Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_INLINE Vec128<uint8_t> Load8Bytes(D /*tag*/, const uint8_t* bytes) {
  // vld1q_dup_u64 broadcasts the same 64-bit load into both halves.
  return Vec128<uint8_t>(vreinterpretq_u8_u64(
      vld1q_dup_u64(HWY_RCAST_ALIGNED(const uint64_t*, bytes))));
}
   9344 
   9345 // Load 8 bytes and return half-reg with N <= 8 bytes.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_INLINE VFromD<D> Load8Bytes(D d, const uint8_t* bytes) {
  // Half-width (or smaller) vectors: a plain Load reads exactly N <= 8 bytes.
  return Load(d, bytes);
}
   9350 
// Returns byte-shuffle indices that move the 16-bit lanes selected by
// mask_bits (one bit per lane, up to 8 lanes) to the front of the vector.
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
                                    uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);
  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  const Simd<uint16_t, N, 0> du;

  // NEON does not provide an equivalent of AVX2 permutevar, so we need byte
  // indices for VTBL (one vector's worth for each of 256 combinations of
  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
  // store lane indices and convert to byte indices (2*lane + 0..1), with the
  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
  // is likely more costly than the higher cache footprint from storing bytes.
  alignas(16) static constexpr uint8_t table[256 * 8] = {
      // PrintCompress16x8Tables
      0,  2,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      2,  0,  4,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      4,  0,  2,  6,  8,  10, 12, 14, /**/ 0, 4,  2,  6,  8,  10, 12, 14,  //
      2,  4,  0,  6,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      6,  0,  2,  4,  8,  10, 12, 14, /**/ 0, 6,  2,  4,  8,  10, 12, 14,  //
      2,  6,  0,  4,  8,  10, 12, 14, /**/ 0, 2,  6,  4,  8,  10, 12, 14,  //
      4,  6,  0,  2,  8,  10, 12, 14, /**/ 0, 4,  6,  2,  8,  10, 12, 14,  //
      2,  4,  6,  0,  8,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      8,  0,  2,  4,  6,  10, 12, 14, /**/ 0, 8,  2,  4,  6,  10, 12, 14,  //
      2,  8,  0,  4,  6,  10, 12, 14, /**/ 0, 2,  8,  4,  6,  10, 12, 14,  //
      4,  8,  0,  2,  6,  10, 12, 14, /**/ 0, 4,  8,  2,  6,  10, 12, 14,  //
      2,  4,  8,  0,  6,  10, 12, 14, /**/ 0, 2,  4,  8,  6,  10, 12, 14,  //
      6,  8,  0,  2,  4,  10, 12, 14, /**/ 0, 6,  8,  2,  4,  10, 12, 14,  //
      2,  6,  8,  0,  4,  10, 12, 14, /**/ 0, 2,  6,  8,  4,  10, 12, 14,  //
      4,  6,  8,  0,  2,  10, 12, 14, /**/ 0, 4,  6,  8,  2,  10, 12, 14,  //
      2,  4,  6,  8,  0,  10, 12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      10, 0,  2,  4,  6,  8,  12, 14, /**/ 0, 10, 2,  4,  6,  8,  12, 14,  //
      2,  10, 0,  4,  6,  8,  12, 14, /**/ 0, 2,  10, 4,  6,  8,  12, 14,  //
      4,  10, 0,  2,  6,  8,  12, 14, /**/ 0, 4,  10, 2,  6,  8,  12, 14,  //
      2,  4,  10, 0,  6,  8,  12, 14, /**/ 0, 2,  4,  10, 6,  8,  12, 14,  //
      6,  10, 0,  2,  4,  8,  12, 14, /**/ 0, 6,  10, 2,  4,  8,  12, 14,  //
      2,  6,  10, 0,  4,  8,  12, 14, /**/ 0, 2,  6,  10, 4,  8,  12, 14,  //
      4,  6,  10, 0,  2,  8,  12, 14, /**/ 0, 4,  6,  10, 2,  8,  12, 14,  //
      2,  4,  6,  10, 0,  8,  12, 14, /**/ 0, 2,  4,  6,  10, 8,  12, 14,  //
      8,  10, 0,  2,  4,  6,  12, 14, /**/ 0, 8,  10, 2,  4,  6,  12, 14,  //
      2,  8,  10, 0,  4,  6,  12, 14, /**/ 0, 2,  8,  10, 4,  6,  12, 14,  //
      4,  8,  10, 0,  2,  6,  12, 14, /**/ 0, 4,  8,  10, 2,  6,  12, 14,  //
      2,  4,  8,  10, 0,  6,  12, 14, /**/ 0, 2,  4,  8,  10, 6,  12, 14,  //
      6,  8,  10, 0,  2,  4,  12, 14, /**/ 0, 6,  8,  10, 2,  4,  12, 14,  //
      2,  6,  8,  10, 0,  4,  12, 14, /**/ 0, 2,  6,  8,  10, 4,  12, 14,  //
      4,  6,  8,  10, 0,  2,  12, 14, /**/ 0, 4,  6,  8,  10, 2,  12, 14,  //
      2,  4,  6,  8,  10, 0,  12, 14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      12, 0,  2,  4,  6,  8,  10, 14, /**/ 0, 12, 2,  4,  6,  8,  10, 14,  //
      2,  12, 0,  4,  6,  8,  10, 14, /**/ 0, 2,  12, 4,  6,  8,  10, 14,  //
      4,  12, 0,  2,  6,  8,  10, 14, /**/ 0, 4,  12, 2,  6,  8,  10, 14,  //
      2,  4,  12, 0,  6,  8,  10, 14, /**/ 0, 2,  4,  12, 6,  8,  10, 14,  //
      6,  12, 0,  2,  4,  8,  10, 14, /**/ 0, 6,  12, 2,  4,  8,  10, 14,  //
      2,  6,  12, 0,  4,  8,  10, 14, /**/ 0, 2,  6,  12, 4,  8,  10, 14,  //
      4,  6,  12, 0,  2,  8,  10, 14, /**/ 0, 4,  6,  12, 2,  8,  10, 14,  //
      2,  4,  6,  12, 0,  8,  10, 14, /**/ 0, 2,  4,  6,  12, 8,  10, 14,  //
      8,  12, 0,  2,  4,  6,  10, 14, /**/ 0, 8,  12, 2,  4,  6,  10, 14,  //
      2,  8,  12, 0,  4,  6,  10, 14, /**/ 0, 2,  8,  12, 4,  6,  10, 14,  //
      4,  8,  12, 0,  2,  6,  10, 14, /**/ 0, 4,  8,  12, 2,  6,  10, 14,  //
      2,  4,  8,  12, 0,  6,  10, 14, /**/ 0, 2,  4,  8,  12, 6,  10, 14,  //
      6,  8,  12, 0,  2,  4,  10, 14, /**/ 0, 6,  8,  12, 2,  4,  10, 14,  //
      2,  6,  8,  12, 0,  4,  10, 14, /**/ 0, 2,  6,  8,  12, 4,  10, 14,  //
      4,  6,  8,  12, 0,  2,  10, 14, /**/ 0, 4,  6,  8,  12, 2,  10, 14,  //
      2,  4,  6,  8,  12, 0,  10, 14, /**/ 0, 2,  4,  6,  8,  12, 10, 14,  //
      10, 12, 0,  2,  4,  6,  8,  14, /**/ 0, 10, 12, 2,  4,  6,  8,  14,  //
      2,  10, 12, 0,  4,  6,  8,  14, /**/ 0, 2,  10, 12, 4,  6,  8,  14,  //
      4,  10, 12, 0,  2,  6,  8,  14, /**/ 0, 4,  10, 12, 2,  6,  8,  14,  //
      2,  4,  10, 12, 0,  6,  8,  14, /**/ 0, 2,  4,  10, 12, 6,  8,  14,  //
      6,  10, 12, 0,  2,  4,  8,  14, /**/ 0, 6,  10, 12, 2,  4,  8,  14,  //
      2,  6,  10, 12, 0,  4,  8,  14, /**/ 0, 2,  6,  10, 12, 4,  8,  14,  //
      4,  6,  10, 12, 0,  2,  8,  14, /**/ 0, 4,  6,  10, 12, 2,  8,  14,  //
      2,  4,  6,  10, 12, 0,  8,  14, /**/ 0, 2,  4,  6,  10, 12, 8,  14,  //
      8,  10, 12, 0,  2,  4,  6,  14, /**/ 0, 8,  10, 12, 2,  4,  6,  14,  //
      2,  8,  10, 12, 0,  4,  6,  14, /**/ 0, 2,  8,  10, 12, 4,  6,  14,  //
      4,  8,  10, 12, 0,  2,  6,  14, /**/ 0, 4,  8,  10, 12, 2,  6,  14,  //
      2,  4,  8,  10, 12, 0,  6,  14, /**/ 0, 2,  4,  8,  10, 12, 6,  14,  //
      6,  8,  10, 12, 0,  2,  4,  14, /**/ 0, 6,  8,  10, 12, 2,  4,  14,  //
      2,  6,  8,  10, 12, 0,  4,  14, /**/ 0, 2,  6,  8,  10, 12, 4,  14,  //
      4,  6,  8,  10, 12, 0,  2,  14, /**/ 0, 4,  6,  8,  10, 12, 2,  14,  //
      2,  4,  6,  8,  10, 12, 0,  14, /**/ 0, 2,  4,  6,  8,  10, 12, 14,  //
      14, 0,  2,  4,  6,  8,  10, 12, /**/ 0, 14, 2,  4,  6,  8,  10, 12,  //
      2,  14, 0,  4,  6,  8,  10, 12, /**/ 0, 2,  14, 4,  6,  8,  10, 12,  //
      4,  14, 0,  2,  6,  8,  10, 12, /**/ 0, 4,  14, 2,  6,  8,  10, 12,  //
      2,  4,  14, 0,  6,  8,  10, 12, /**/ 0, 2,  4,  14, 6,  8,  10, 12,  //
      6,  14, 0,  2,  4,  8,  10, 12, /**/ 0, 6,  14, 2,  4,  8,  10, 12,  //
      2,  6,  14, 0,  4,  8,  10, 12, /**/ 0, 2,  6,  14, 4,  8,  10, 12,  //
      4,  6,  14, 0,  2,  8,  10, 12, /**/ 0, 4,  6,  14, 2,  8,  10, 12,  //
      2,  4,  6,  14, 0,  8,  10, 12, /**/ 0, 2,  4,  6,  14, 8,  10, 12,  //
      8,  14, 0,  2,  4,  6,  10, 12, /**/ 0, 8,  14, 2,  4,  6,  10, 12,  //
      2,  8,  14, 0,  4,  6,  10, 12, /**/ 0, 2,  8,  14, 4,  6,  10, 12,  //
      4,  8,  14, 0,  2,  6,  10, 12, /**/ 0, 4,  8,  14, 2,  6,  10, 12,  //
      2,  4,  8,  14, 0,  6,  10, 12, /**/ 0, 2,  4,  8,  14, 6,  10, 12,  //
      6,  8,  14, 0,  2,  4,  10, 12, /**/ 0, 6,  8,  14, 2,  4,  10, 12,  //
      2,  6,  8,  14, 0,  4,  10, 12, /**/ 0, 2,  6,  8,  14, 4,  10, 12,  //
      4,  6,  8,  14, 0,  2,  10, 12, /**/ 0, 4,  6,  8,  14, 2,  10, 12,  //
      2,  4,  6,  8,  14, 0,  10, 12, /**/ 0, 2,  4,  6,  8,  14, 10, 12,  //
      10, 14, 0,  2,  4,  6,  8,  12, /**/ 0, 10, 14, 2,  4,  6,  8,  12,  //
      2,  10, 14, 0,  4,  6,  8,  12, /**/ 0, 2,  10, 14, 4,  6,  8,  12,  //
      4,  10, 14, 0,  2,  6,  8,  12, /**/ 0, 4,  10, 14, 2,  6,  8,  12,  //
      2,  4,  10, 14, 0,  6,  8,  12, /**/ 0, 2,  4,  10, 14, 6,  8,  12,  //
      6,  10, 14, 0,  2,  4,  8,  12, /**/ 0, 6,  10, 14, 2,  4,  8,  12,  //
      2,  6,  10, 14, 0,  4,  8,  12, /**/ 0, 2,  6,  10, 14, 4,  8,  12,  //
      4,  6,  10, 14, 0,  2,  8,  12, /**/ 0, 4,  6,  10, 14, 2,  8,  12,  //
      2,  4,  6,  10, 14, 0,  8,  12, /**/ 0, 2,  4,  6,  10, 14, 8,  12,  //
      8,  10, 14, 0,  2,  4,  6,  12, /**/ 0, 8,  10, 14, 2,  4,  6,  12,  //
      2,  8,  10, 14, 0,  4,  6,  12, /**/ 0, 2,  8,  10, 14, 4,  6,  12,  //
      4,  8,  10, 14, 0,  2,  6,  12, /**/ 0, 4,  8,  10, 14, 2,  6,  12,  //
      2,  4,  8,  10, 14, 0,  6,  12, /**/ 0, 2,  4,  8,  10, 14, 6,  12,  //
      6,  8,  10, 14, 0,  2,  4,  12, /**/ 0, 6,  8,  10, 14, 2,  4,  12,  //
      2,  6,  8,  10, 14, 0,  4,  12, /**/ 0, 2,  6,  8,  10, 14, 4,  12,  //
      4,  6,  8,  10, 14, 0,  2,  12, /**/ 0, 4,  6,  8,  10, 14, 2,  12,  //
      2,  4,  6,  8,  10, 14, 0,  12, /**/ 0, 2,  4,  6,  8,  10, 14, 12,  //
      12, 14, 0,  2,  4,  6,  8,  10, /**/ 0, 12, 14, 2,  4,  6,  8,  10,  //
      2,  12, 14, 0,  4,  6,  8,  10, /**/ 0, 2,  12, 14, 4,  6,  8,  10,  //
      4,  12, 14, 0,  2,  6,  8,  10, /**/ 0, 4,  12, 14, 2,  6,  8,  10,  //
      2,  4,  12, 14, 0,  6,  8,  10, /**/ 0, 2,  4,  12, 14, 6,  8,  10,  //
      6,  12, 14, 0,  2,  4,  8,  10, /**/ 0, 6,  12, 14, 2,  4,  8,  10,  //
      2,  6,  12, 14, 0,  4,  8,  10, /**/ 0, 2,  6,  12, 14, 4,  8,  10,  //
      4,  6,  12, 14, 0,  2,  8,  10, /**/ 0, 4,  6,  12, 14, 2,  8,  10,  //
      2,  4,  6,  12, 14, 0,  8,  10, /**/ 0, 2,  4,  6,  12, 14, 8,  10,  //
      8,  12, 14, 0,  2,  4,  6,  10, /**/ 0, 8,  12, 14, 2,  4,  6,  10,  //
      2,  8,  12, 14, 0,  4,  6,  10, /**/ 0, 2,  8,  12, 14, 4,  6,  10,  //
      4,  8,  12, 14, 0,  2,  6,  10, /**/ 0, 4,  8,  12, 14, 2,  6,  10,  //
      2,  4,  8,  12, 14, 0,  6,  10, /**/ 0, 2,  4,  8,  12, 14, 6,  10,  //
      6,  8,  12, 14, 0,  2,  4,  10, /**/ 0, 6,  8,  12, 14, 2,  4,  10,  //
      2,  6,  8,  12, 14, 0,  4,  10, /**/ 0, 2,  6,  8,  12, 14, 4,  10,  //
      4,  6,  8,  12, 14, 0,  2,  10, /**/ 0, 4,  6,  8,  12, 14, 2,  10,  //
      2,  4,  6,  8,  12, 14, 0,  10, /**/ 0, 2,  4,  6,  8,  12, 14, 10,  //
      10, 12, 14, 0,  2,  4,  6,  8,  /**/ 0, 10, 12, 14, 2,  4,  6,  8,   //
      2,  10, 12, 14, 0,  4,  6,  8,  /**/ 0, 2,  10, 12, 14, 4,  6,  8,   //
      4,  10, 12, 14, 0,  2,  6,  8,  /**/ 0, 4,  10, 12, 14, 2,  6,  8,   //
      2,  4,  10, 12, 14, 0,  6,  8,  /**/ 0, 2,  4,  10, 12, 14, 6,  8,   //
      6,  10, 12, 14, 0,  2,  4,  8,  /**/ 0, 6,  10, 12, 14, 2,  4,  8,   //
      2,  6,  10, 12, 14, 0,  4,  8,  /**/ 0, 2,  6,  10, 12, 14, 4,  8,   //
      4,  6,  10, 12, 14, 0,  2,  8,  /**/ 0, 4,  6,  10, 12, 14, 2,  8,   //
      2,  4,  6,  10, 12, 14, 0,  8,  /**/ 0, 2,  4,  6,  10, 12, 14, 8,   //
      8,  10, 12, 14, 0,  2,  4,  6,  /**/ 0, 8,  10, 12, 14, 2,  4,  6,   //
      2,  8,  10, 12, 14, 0,  4,  6,  /**/ 0, 2,  8,  10, 12, 14, 4,  6,   //
      4,  8,  10, 12, 14, 0,  2,  6,  /**/ 0, 4,  8,  10, 12, 14, 2,  6,   //
      2,  4,  8,  10, 12, 14, 0,  6,  /**/ 0, 2,  4,  8,  10, 12, 14, 6,   //
      6,  8,  10, 12, 14, 0,  2,  4,  /**/ 0, 6,  8,  10, 12, 14, 2,  4,   //
      2,  6,  8,  10, 12, 14, 0,  4,  /**/ 0, 2,  6,  8,  10, 12, 14, 4,   //
      4,  6,  8,  10, 12, 14, 0,  2,  /**/ 0, 4,  6,  8,  10, 12, 14, 2,   //
      2,  4,  6,  8,  10, 12, 14, 0,  /**/ 0, 2,  4,  6,  8,  10, 12, 14};

  // Duplicate each (already doubled) lane index into both bytes of a u16,
  // then add 0x0100 so the pair becomes (2*lane, 2*lane + 1).
  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
  return BitCast(d, pairs + Set(du, 0x0100));
}
   9502 
// Returns byte-shuffle indices that move the 16-bit lanes whose mask bit is
// CLEAR to the front (the complement of IdxFromBits, used by CompressNot).
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<2> /*tag*/,
                                       uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 256);  // one bit for each of (up to) 8 lanes
  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  const Simd<uint16_t, N, 0> du;

  // NEON does not provide an equivalent of AVX2 permutevar, so we need byte
  // indices for VTBL (one vector's worth for each of 256 combinations of
  // 8 mask bits). Loading them directly would require 4 KiB. We can instead
  // store lane indices and convert to byte indices (2*lane + 0..1), with the
  // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
  // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
  // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
  // is likely more costly than the higher cache footprint from storing bytes.
  alignas(16) static constexpr uint8_t table[256 * 8] = {
      // PrintCompressNot16x8Tables
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 14, 0,   //
      0, 4,  6,  8,  10, 12, 14, 2,  /**/ 4,  6,  8,  10, 12, 14, 0,  2,   //
      0, 2,  6,  8,  10, 12, 14, 4,  /**/ 2,  6,  8,  10, 12, 14, 0,  4,   //
      0, 6,  8,  10, 12, 14, 2,  4,  /**/ 6,  8,  10, 12, 14, 0,  2,  4,   //
      0, 2,  4,  8,  10, 12, 14, 6,  /**/ 2,  4,  8,  10, 12, 14, 0,  6,   //
      0, 4,  8,  10, 12, 14, 2,  6,  /**/ 4,  8,  10, 12, 14, 0,  2,  6,   //
      0, 2,  8,  10, 12, 14, 4,  6,  /**/ 2,  8,  10, 12, 14, 0,  4,  6,   //
      0, 8,  10, 12, 14, 2,  4,  6,  /**/ 8,  10, 12, 14, 0,  2,  4,  6,   //
      0, 2,  4,  6,  10, 12, 14, 8,  /**/ 2,  4,  6,  10, 12, 14, 0,  8,   //
      0, 4,  6,  10, 12, 14, 2,  8,  /**/ 4,  6,  10, 12, 14, 0,  2,  8,   //
      0, 2,  6,  10, 12, 14, 4,  8,  /**/ 2,  6,  10, 12, 14, 0,  4,  8,   //
      0, 6,  10, 12, 14, 2,  4,  8,  /**/ 6,  10, 12, 14, 0,  2,  4,  8,   //
      0, 2,  4,  10, 12, 14, 6,  8,  /**/ 2,  4,  10, 12, 14, 0,  6,  8,   //
      0, 4,  10, 12, 14, 2,  6,  8,  /**/ 4,  10, 12, 14, 0,  2,  6,  8,   //
      0, 2,  10, 12, 14, 4,  6,  8,  /**/ 2,  10, 12, 14, 0,  4,  6,  8,   //
      0, 10, 12, 14, 2,  4,  6,  8,  /**/ 10, 12, 14, 0,  2,  4,  6,  8,   //
      0, 2,  4,  6,  8,  12, 14, 10, /**/ 2,  4,  6,  8,  12, 14, 0,  10,  //
      0, 4,  6,  8,  12, 14, 2,  10, /**/ 4,  6,  8,  12, 14, 0,  2,  10,  //
      0, 2,  6,  8,  12, 14, 4,  10, /**/ 2,  6,  8,  12, 14, 0,  4,  10,  //
      0, 6,  8,  12, 14, 2,  4,  10, /**/ 6,  8,  12, 14, 0,  2,  4,  10,  //
      0, 2,  4,  8,  12, 14, 6,  10, /**/ 2,  4,  8,  12, 14, 0,  6,  10,  //
      0, 4,  8,  12, 14, 2,  6,  10, /**/ 4,  8,  12, 14, 0,  2,  6,  10,  //
      0, 2,  8,  12, 14, 4,  6,  10, /**/ 2,  8,  12, 14, 0,  4,  6,  10,  //
      0, 8,  12, 14, 2,  4,  6,  10, /**/ 8,  12, 14, 0,  2,  4,  6,  10,  //
      0, 2,  4,  6,  12, 14, 8,  10, /**/ 2,  4,  6,  12, 14, 0,  8,  10,  //
      0, 4,  6,  12, 14, 2,  8,  10, /**/ 4,  6,  12, 14, 0,  2,  8,  10,  //
      0, 2,  6,  12, 14, 4,  8,  10, /**/ 2,  6,  12, 14, 0,  4,  8,  10,  //
      0, 6,  12, 14, 2,  4,  8,  10, /**/ 6,  12, 14, 0,  2,  4,  8,  10,  //
      0, 2,  4,  12, 14, 6,  8,  10, /**/ 2,  4,  12, 14, 0,  6,  8,  10,  //
      0, 4,  12, 14, 2,  6,  8,  10, /**/ 4,  12, 14, 0,  2,  6,  8,  10,  //
      0, 2,  12, 14, 4,  6,  8,  10, /**/ 2,  12, 14, 0,  4,  6,  8,  10,  //
      0, 12, 14, 2,  4,  6,  8,  10, /**/ 12, 14, 0,  2,  4,  6,  8,  10,  //
      0, 2,  4,  6,  8,  10, 14, 12, /**/ 2,  4,  6,  8,  10, 14, 0,  12,  //
      0, 4,  6,  8,  10, 14, 2,  12, /**/ 4,  6,  8,  10, 14, 0,  2,  12,  //
      0, 2,  6,  8,  10, 14, 4,  12, /**/ 2,  6,  8,  10, 14, 0,  4,  12,  //
      0, 6,  8,  10, 14, 2,  4,  12, /**/ 6,  8,  10, 14, 0,  2,  4,  12,  //
      0, 2,  4,  8,  10, 14, 6,  12, /**/ 2,  4,  8,  10, 14, 0,  6,  12,  //
      0, 4,  8,  10, 14, 2,  6,  12, /**/ 4,  8,  10, 14, 0,  2,  6,  12,  //
      0, 2,  8,  10, 14, 4,  6,  12, /**/ 2,  8,  10, 14, 0,  4,  6,  12,  //
      0, 8,  10, 14, 2,  4,  6,  12, /**/ 8,  10, 14, 0,  2,  4,  6,  12,  //
      0, 2,  4,  6,  10, 14, 8,  12, /**/ 2,  4,  6,  10, 14, 0,  8,  12,  //
      0, 4,  6,  10, 14, 2,  8,  12, /**/ 4,  6,  10, 14, 0,  2,  8,  12,  //
      0, 2,  6,  10, 14, 4,  8,  12, /**/ 2,  6,  10, 14, 0,  4,  8,  12,  //
      0, 6,  10, 14, 2,  4,  8,  12, /**/ 6,  10, 14, 0,  2,  4,  8,  12,  //
      0, 2,  4,  10, 14, 6,  8,  12, /**/ 2,  4,  10, 14, 0,  6,  8,  12,  //
      0, 4,  10, 14, 2,  6,  8,  12, /**/ 4,  10, 14, 0,  2,  6,  8,  12,  //
      0, 2,  10, 14, 4,  6,  8,  12, /**/ 2,  10, 14, 0,  4,  6,  8,  12,  //
      0, 10, 14, 2,  4,  6,  8,  12, /**/ 10, 14, 0,  2,  4,  6,  8,  12,  //
      0, 2,  4,  6,  8,  14, 10, 12, /**/ 2,  4,  6,  8,  14, 0,  10, 12,  //
      0, 4,  6,  8,  14, 2,  10, 12, /**/ 4,  6,  8,  14, 0,  2,  10, 12,  //
      0, 2,  6,  8,  14, 4,  10, 12, /**/ 2,  6,  8,  14, 0,  4,  10, 12,  //
      0, 6,  8,  14, 2,  4,  10, 12, /**/ 6,  8,  14, 0,  2,  4,  10, 12,  //
      0, 2,  4,  8,  14, 6,  10, 12, /**/ 2,  4,  8,  14, 0,  6,  10, 12,  //
      0, 4,  8,  14, 2,  6,  10, 12, /**/ 4,  8,  14, 0,  2,  6,  10, 12,  //
      0, 2,  8,  14, 4,  6,  10, 12, /**/ 2,  8,  14, 0,  4,  6,  10, 12,  //
      0, 8,  14, 2,  4,  6,  10, 12, /**/ 8,  14, 0,  2,  4,  6,  10, 12,  //
      0, 2,  4,  6,  14, 8,  10, 12, /**/ 2,  4,  6,  14, 0,  8,  10, 12,  //
      0, 4,  6,  14, 2,  8,  10, 12, /**/ 4,  6,  14, 0,  2,  8,  10, 12,  //
      0, 2,  6,  14, 4,  8,  10, 12, /**/ 2,  6,  14, 0,  4,  8,  10, 12,  //
      0, 6,  14, 2,  4,  8,  10, 12, /**/ 6,  14, 0,  2,  4,  8,  10, 12,  //
      0, 2,  4,  14, 6,  8,  10, 12, /**/ 2,  4,  14, 0,  6,  8,  10, 12,  //
      0, 4,  14, 2,  6,  8,  10, 12, /**/ 4,  14, 0,  2,  6,  8,  10, 12,  //
      0, 2,  14, 4,  6,  8,  10, 12, /**/ 2,  14, 0,  4,  6,  8,  10, 12,  //
      0, 14, 2,  4,  6,  8,  10, 12, /**/ 14, 0,  2,  4,  6,  8,  10, 12,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 12, 0,  14,  //
      0, 4,  6,  8,  10, 12, 2,  14, /**/ 4,  6,  8,  10, 12, 0,  2,  14,  //
      0, 2,  6,  8,  10, 12, 4,  14, /**/ 2,  6,  8,  10, 12, 0,  4,  14,  //
      0, 6,  8,  10, 12, 2,  4,  14, /**/ 6,  8,  10, 12, 0,  2,  4,  14,  //
      0, 2,  4,  8,  10, 12, 6,  14, /**/ 2,  4,  8,  10, 12, 0,  6,  14,  //
      0, 4,  8,  10, 12, 2,  6,  14, /**/ 4,  8,  10, 12, 0,  2,  6,  14,  //
      0, 2,  8,  10, 12, 4,  6,  14, /**/ 2,  8,  10, 12, 0,  4,  6,  14,  //
      0, 8,  10, 12, 2,  4,  6,  14, /**/ 8,  10, 12, 0,  2,  4,  6,  14,  //
      0, 2,  4,  6,  10, 12, 8,  14, /**/ 2,  4,  6,  10, 12, 0,  8,  14,  //
      0, 4,  6,  10, 12, 2,  8,  14, /**/ 4,  6,  10, 12, 0,  2,  8,  14,  //
      0, 2,  6,  10, 12, 4,  8,  14, /**/ 2,  6,  10, 12, 0,  4,  8,  14,  //
      0, 6,  10, 12, 2,  4,  8,  14, /**/ 6,  10, 12, 0,  2,  4,  8,  14,  //
      0, 2,  4,  10, 12, 6,  8,  14, /**/ 2,  4,  10, 12, 0,  6,  8,  14,  //
      0, 4,  10, 12, 2,  6,  8,  14, /**/ 4,  10, 12, 0,  2,  6,  8,  14,  //
      0, 2,  10, 12, 4,  6,  8,  14, /**/ 2,  10, 12, 0,  4,  6,  8,  14,  //
      0, 10, 12, 2,  4,  6,  8,  14, /**/ 10, 12, 0,  2,  4,  6,  8,  14,  //
      0, 2,  4,  6,  8,  12, 10, 14, /**/ 2,  4,  6,  8,  12, 0,  10, 14,  //
      0, 4,  6,  8,  12, 2,  10, 14, /**/ 4,  6,  8,  12, 0,  2,  10, 14,  //
      0, 2,  6,  8,  12, 4,  10, 14, /**/ 2,  6,  8,  12, 0,  4,  10, 14,  //
      0, 6,  8,  12, 2,  4,  10, 14, /**/ 6,  8,  12, 0,  2,  4,  10, 14,  //
      0, 2,  4,  8,  12, 6,  10, 14, /**/ 2,  4,  8,  12, 0,  6,  10, 14,  //
      0, 4,  8,  12, 2,  6,  10, 14, /**/ 4,  8,  12, 0,  2,  6,  10, 14,  //
      0, 2,  8,  12, 4,  6,  10, 14, /**/ 2,  8,  12, 0,  4,  6,  10, 14,  //
      0, 8,  12, 2,  4,  6,  10, 14, /**/ 8,  12, 0,  2,  4,  6,  10, 14,  //
      0, 2,  4,  6,  12, 8,  10, 14, /**/ 2,  4,  6,  12, 0,  8,  10, 14,  //
      0, 4,  6,  12, 2,  8,  10, 14, /**/ 4,  6,  12, 0,  2,  8,  10, 14,  //
      0, 2,  6,  12, 4,  8,  10, 14, /**/ 2,  6,  12, 0,  4,  8,  10, 14,  //
      0, 6,  12, 2,  4,  8,  10, 14, /**/ 6,  12, 0,  2,  4,  8,  10, 14,  //
      0, 2,  4,  12, 6,  8,  10, 14, /**/ 2,  4,  12, 0,  6,  8,  10, 14,  //
      0, 4,  12, 2,  6,  8,  10, 14, /**/ 4,  12, 0,  2,  6,  8,  10, 14,  //
      0, 2,  12, 4,  6,  8,  10, 14, /**/ 2,  12, 0,  4,  6,  8,  10, 14,  //
      0, 12, 2,  4,  6,  8,  10, 14, /**/ 12, 0,  2,  4,  6,  8,  10, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  10, 0,  12, 14,  //
      0, 4,  6,  8,  10, 2,  12, 14, /**/ 4,  6,  8,  10, 0,  2,  12, 14,  //
      0, 2,  6,  8,  10, 4,  12, 14, /**/ 2,  6,  8,  10, 0,  4,  12, 14,  //
      0, 6,  8,  10, 2,  4,  12, 14, /**/ 6,  8,  10, 0,  2,  4,  12, 14,  //
      0, 2,  4,  8,  10, 6,  12, 14, /**/ 2,  4,  8,  10, 0,  6,  12, 14,  //
      0, 4,  8,  10, 2,  6,  12, 14, /**/ 4,  8,  10, 0,  2,  6,  12, 14,  //
      0, 2,  8,  10, 4,  6,  12, 14, /**/ 2,  8,  10, 0,  4,  6,  12, 14,  //
      0, 8,  10, 2,  4,  6,  12, 14, /**/ 8,  10, 0,  2,  4,  6,  12, 14,  //
      0, 2,  4,  6,  10, 8,  12, 14, /**/ 2,  4,  6,  10, 0,  8,  12, 14,  //
      0, 4,  6,  10, 2,  8,  12, 14, /**/ 4,  6,  10, 0,  2,  8,  12, 14,  //
      0, 2,  6,  10, 4,  8,  12, 14, /**/ 2,  6,  10, 0,  4,  8,  12, 14,  //
      0, 6,  10, 2,  4,  8,  12, 14, /**/ 6,  10, 0,  2,  4,  8,  12, 14,  //
      0, 2,  4,  10, 6,  8,  12, 14, /**/ 2,  4,  10, 0,  6,  8,  12, 14,  //
      0, 4,  10, 2,  6,  8,  12, 14, /**/ 4,  10, 0,  2,  6,  8,  12, 14,  //
      0, 2,  10, 4,  6,  8,  12, 14, /**/ 2,  10, 0,  4,  6,  8,  12, 14,  //
      0, 10, 2,  4,  6,  8,  12, 14, /**/ 10, 0,  2,  4,  6,  8,  12, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  8,  0,  10, 12, 14,  //
      0, 4,  6,  8,  2,  10, 12, 14, /**/ 4,  6,  8,  0,  2,  10, 12, 14,  //
      0, 2,  6,  8,  4,  10, 12, 14, /**/ 2,  6,  8,  0,  4,  10, 12, 14,  //
      0, 6,  8,  2,  4,  10, 12, 14, /**/ 6,  8,  0,  2,  4,  10, 12, 14,  //
      0, 2,  4,  8,  6,  10, 12, 14, /**/ 2,  4,  8,  0,  6,  10, 12, 14,  //
      0, 4,  8,  2,  6,  10, 12, 14, /**/ 4,  8,  0,  2,  6,  10, 12, 14,  //
      0, 2,  8,  4,  6,  10, 12, 14, /**/ 2,  8,  0,  4,  6,  10, 12, 14,  //
      0, 8,  2,  4,  6,  10, 12, 14, /**/ 8,  0,  2,  4,  6,  10, 12, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  6,  0,  8,  10, 12, 14,  //
      0, 4,  6,  2,  8,  10, 12, 14, /**/ 4,  6,  0,  2,  8,  10, 12, 14,  //
      0, 2,  6,  4,  8,  10, 12, 14, /**/ 2,  6,  0,  4,  8,  10, 12, 14,  //
      0, 6,  2,  4,  8,  10, 12, 14, /**/ 6,  0,  2,  4,  8,  10, 12, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  4,  0,  6,  8,  10, 12, 14,  //
      0, 4,  2,  6,  8,  10, 12, 14, /**/ 4,  0,  2,  6,  8,  10, 12, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 2,  0,  4,  6,  8,  10, 12, 14,  //
      0, 2,  4,  6,  8,  10, 12, 14, /**/ 0,  2,  4,  6,  8,  10, 12, 14};

  // Each table entry is 2*lane; ZipLower duplicates it into both bytes of a
  // 16-bit pair and adding 0x0100 turns the pair into (2*lane, 2*lane + 1).
  const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8);
  const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx);
  return BitCast(d, pairs + Set(du, 0x0100));
}
   9654 
   9655 template <typename T, size_t N>
   9656 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/,
   9657                                    uint64_t mask_bits) {
   9658  HWY_DASSERT(mask_bits < 16);
   9659 
   9660  // There are only 4 lanes, so we can afford to load the index vector directly.
   9661  alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
   9662      // PrintCompress32x4Tables
   9663      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
   9664      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
   9665      4,  5,  6,  7,  0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15,  //
   9666      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
   9667      8,  9,  10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15,  //
   9668      0,  1,  2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15,  //
   9669      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15,  //
   9670      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,  //
   9671      12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,  //
   9672      0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11,  //
   9673      4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  8,  9,  10, 11,  //
   9674      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,  //
   9675      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,   //
   9676      0,  1,  2,  3,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,   //
   9677      4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,   //
   9678      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15};
   9679  const Simd<T, N, 0> d;
   9680  const Repartition<uint8_t, decltype(d)> d8;
   9681  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
   9682 }
   9683 
// Returns byte-shuffle indices that move the 32-bit lanes whose mask bit is
// CLEAR to the front (the complement of IdxFromBits, used by CompressNot).
template <typename T, size_t N>
HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/,
                                      uint64_t mask_bits) {
  HWY_DASSERT(mask_bits < 16);  // one bit for each of (up to) 4 lanes

  // There are only 4 lanes, so we can afford to load the index vector directly.
  // (16 entries of 16 bytes each; entry i is the shuffle for mask i, flowed
  // across source lines rather than one row per entry.)
  alignas(16) static constexpr uint8_t u8_indices[16 * 16] = {
      // PrintCompressNot32x4Tables
      0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,
      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  0,  1,  2,  3,
      8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  8,  9,  10, 11, 12, 13,
      14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
      12, 13, 14, 15, 8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 0,  1,
      2,  3,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 4,  5,  6,  7,
      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
      10, 11, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
      4,  5,  6,  7,  8,  9,  10, 11, 0,  1,  2,  3,  12, 13, 14, 15, 0,  1,
      2,  3,  8,  9,  10, 11, 4,  5,  6,  7,  12, 13, 14, 15, 8,  9,  10, 11,
      0,  1,  2,  3,  4,  5,  6,  7,  12, 13, 14, 15, 0,  1,  2,  3,  4,  5,
      6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 4,  5,  6,  7,  0,  1,  2,  3,
      8,  9,  10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
      10, 11, 12, 13, 14, 15, 0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11,
      12, 13, 14, 15};
  const Simd<T, N, 0> d;
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
}
   9711 
   9712 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64
   9713 
   9714 template <typename T, size_t N>
   9715 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/,
   9716                                    uint64_t mask_bits) {
   9717  HWY_DASSERT(mask_bits < 4);
   9718 
   9719  // There are only 2 lanes, so we can afford to load the index vector directly.
   9720  alignas(16) static constexpr uint8_t u8_indices[64] = {
   9721      // PrintCompress64x2Tables
   9722      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
   9723      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
   9724      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
   9725      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
   9726 
   9727  const Simd<T, N, 0> d;
   9728  const Repartition<uint8_t, decltype(d)> d8;
   9729  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
   9730 }
   9731 
   9732 template <typename T, size_t N>
   9733 HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/,
   9734                                       uint64_t mask_bits) {
   9735  HWY_DASSERT(mask_bits < 4);
   9736 
   9737  // There are only 2 lanes, so we can afford to load the index vector directly.
   9738  alignas(16) static constexpr uint8_t u8_indices[4 * 16] = {
   9739      // PrintCompressNot64x2Tables
   9740      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
   9741      8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2,  3,  4,  5,  6,  7,
   9742      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15,
   9743      0, 1, 2,  3,  4,  5,  6,  7,  8, 9, 10, 11, 12, 13, 14, 15};
   9744 
   9745  const Simd<T, N, 0> d;
   9746  const Repartition<uint8_t, decltype(d)> d8;
   9747  return BitCast(d, Load(d8, u8_indices + 16 * mask_bits));
   9748 }
   9749 
   9750 #endif
   9751 
   9752 // Helper function called by both Compress and CompressStore - avoids a
   9753 // redundant BitsFromMask in the latter.
   9754 template <typename T, size_t N>
   9755 HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, uint64_t mask_bits) {
   9756  const auto idx =
   9757      detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
   9758  using D = DFromV<decltype(v)>;
   9759  const RebindToSigned<D> di;
   9760  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
   9761 }
   9762 
   9763 template <typename T, size_t N>
   9764 HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, uint64_t mask_bits) {
   9765  const auto idx =
   9766      detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits);
   9767  using D = DFromV<decltype(v)>;
   9768  const RebindToSigned<D> di;
   9769  return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx)));
   9770 }
   9771 
   9772 }  // namespace detail
   9773 
   9774 // Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  // With one lane there is nothing to move; the result is v either way.
  return v;
}
   9779 
   9780 // Two lanes: conditional swap
   9781 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)>
   9782 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
   9783  // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep.
   9784  const DFromV<decltype(v)> d;
   9785  const Vec128<T, N> m = VecFromMask(d, mask);
   9786  const Vec128<T, N> maskL = DupEven(m);
   9787  const Vec128<T, N> maskH = DupOdd(m);
   9788  const Vec128<T, N> swap = AndNot(maskL, maskH);
   9789  return IfVecThenElse(swap, Shuffle01(v), v);
   9790 }
   9791 
   9792 // General case, 2 or 4 byte lanes
   9793 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
   9794 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) {
   9795  const DFromV<decltype(v)> d;
   9796  return detail::Compress(v, BitsFromMask(d, mask));
   9797 }
   9798 
   9799 // Single lane: no-op
template <typename T>
HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) {
  // With one lane there is nothing to move; the result is v either way.
  return v;
}
   9804 
   9805 // Two lanes: conditional swap
   9806 template <typename T, HWY_IF_T_SIZE(T, 8)>
   9807 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) {
   9808  // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep.
   9809  const DFromV<decltype(v)> d;
   9810  const Vec128<T> m = VecFromMask(d, mask);
   9811  const Vec128<T> maskL = DupEven(m);
   9812  const Vec128<T> maskH = DupOdd(m);
   9813  const Vec128<T> swap = AndNot(maskH, maskL);
   9814  return IfVecThenElse(swap, Shuffle01(v), v);
   9815 }
   9816 
   9817 // General case, 2 or 4 byte lanes
   9818 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))>
   9819 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) {
   9820  const DFromV<decltype(v)> d;
   9821  // For partial vectors, we cannot pull the Not() into the table because
   9822  // BitsFromMask clears the upper bits.
   9823  if (N < 16 / sizeof(T)) {
   9824    return detail::Compress(v, BitsFromMask(d, Not(mask)));
   9825  }
   9826  return detail::CompressNot(v, BitsFromMask(d, mask));
   9827 }
   9828 
   9829 // ------------------------------ CompressBlocksNot
HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v,
                                          Mask128<uint64_t> /* m */) {
  // A 128-bit vector holds exactly one block, so there is nothing to move.
  return v;
}
   9834 
   9835 // ------------------------------ CompressBits
   9836 
   9837 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)>
   9838 HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v,
   9839                                     const uint8_t* HWY_RESTRICT bits) {
   9840  uint64_t mask_bits = 0;
   9841  constexpr size_t kNumBytes = (N + 7) / 8;
   9842  CopyBytes<kNumBytes>(bits, &mask_bits);
   9843  if (N < 8) {
   9844    mask_bits &= (1ull << N) - 1;
   9845  }
   9846 
   9847  return detail::Compress(v, mask_bits);
   9848 }
   9849 
   9850 // ------------------------------ CompressStore
   9851 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
   9852 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d,
   9853                             TFromD<D>* HWY_RESTRICT unaligned) {
   9854  const uint64_t mask_bits = BitsFromMask(d, mask);
   9855  StoreU(detail::Compress(v, mask_bits), d, unaligned);
   9856  return PopCount(mask_bits);
   9857 }
   9858 
   9859 // ------------------------------ CompressBlendedStore
   9860 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
   9861 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d,
   9862                                    TFromD<D>* HWY_RESTRICT unaligned) {
   9863  const RebindToUnsigned<decltype(d)> du;  // so we can support fp16/bf16
   9864  const uint64_t mask_bits = BitsFromMask(d, m);
   9865  const size_t count = PopCount(mask_bits);
   9866  const MFromD<D> store_mask = RebindMask(d, FirstN(du, count));
   9867  const VFromD<decltype(du)> compressed =
   9868      detail::Compress(BitCast(du, v), mask_bits);
   9869  BlendedStore(BitCast(d, compressed), store_mask, d, unaligned);
   9870  return count;
   9871 }
   9872 
   9873 // ------------------------------ CompressBitsStore
   9874 
   9875 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
   9876 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
   9877                                 D d, TFromD<D>* HWY_RESTRICT unaligned) {
   9878  uint64_t mask_bits = 0;
   9879  constexpr size_t kNumBytes = (d.MaxLanes() + 7) / 8;
   9880  CopyBytes<kNumBytes>(bits, &mask_bits);
   9881  if (d.MaxLanes() < 8) {
   9882    mask_bits &= (1ull << d.MaxLanes()) - 1;
   9883  }
   9884 
   9885  StoreU(detail::Compress(v, mask_bits), d, unaligned);
   9886  return PopCount(mask_bits);
   9887 }
   9888 
   9889 // ------------------------------ LoadInterleaved2
   9890 
   9891 // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
   9892 #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
   9893 #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
   9894 #else
   9895 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED
   9896 #endif
   9897 
namespace detail {

// Glue for HWY_NEON_DEF_FUNCTION: no template<> prefix, and the source
// pointer is the sole argument forwarded to the vld2/vld3/vld4 intrinsic.
#define HWY_NEON_BUILD_TPL_HWY_LOAD_INT
#define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from

#if HWY_ARCH_ARM_A64
#define HWY_IF_LOAD_INT(D) \
  HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D)
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
#else
// Exclude 64x2 and f64x1, which are only supported on aarch64; also exclude any
// emulated types.
#define HWY_IF_LOAD_INT(D)                                                 \
  HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),                 \
      hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
          nullptr
#define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)    \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)    \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
#endif  // HWY_ARCH_ARM_A64

// Must return raw tuple because Tuple2 lack a ctor, and we cannot use
// brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return
// void.
#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
  decltype(Tuple2<type##_t, size>().raw)
// Tuple tag arg allows overloading (cannot just overload on return type)
#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
  const NativeLaneType<type##_t>*from, Tuple2<type##_t, size>
HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT)
#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT

// Same pattern for three interleaved vectors (vld3), Tuple3 return.
#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
  decltype(Tuple3<type##_t, size>().raw)
#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
  const NativeLaneType<type##_t>*from, Tuple3<type##_t, size>
HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT)
#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT

// Same pattern for four interleaved vectors (vld4), Tuple4 return.
#define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
  decltype(Tuple4<type##_t, size>().raw)
#define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
  const NativeLaneType<type##_t>*from, Tuple4<type##_t, size>
HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
#undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
#undef HWY_NEON_BUILD_RET_HWY_LOAD_INT

// HWY_IF_LOAD_INT is intentionally kept defined; the public wrappers below
// use it to constrain their overloads.
#undef HWY_NEON_DEF_FUNCTION_LOAD_INT
#undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
#undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT

}  // namespace detail
   9958 
   9959 template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
   9960 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
   9961                              VFromD<D>& v0, VFromD<D>& v1) {
   9962  auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(unaligned),
   9963                                      detail::Tuple2<T, d.MaxLanes()>());
   9964  v0 = VFromD<D>(raw.val[0]);
   9965  v1 = VFromD<D>(raw.val[1]);
   9966 }
   9967 
   9968 // <= 32 bits: avoid loading more than N bytes by copying to buffer
   9969 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
   9970          typename T = TFromD<D>>
   9971 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
   9972                              VFromD<D>& v0, VFromD<D>& v1) {
   9973  // The smallest vector registers are 64-bits and we want space for two.
   9974  alignas(16) T buf[2 * 8 / sizeof(T)] = {};
   9975  CopyBytes<d.MaxBytes() * 2>(unaligned, buf);
   9976  auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(buf),
   9977                                      detail::Tuple2<T, d.MaxLanes()>());
   9978  v0 = VFromD<D>(raw.val[0]);
   9979  v1 = VFromD<D>(raw.val[1]);
   9980 }
   9981 
   9982 #if HWY_ARCH_ARM_V7
   9983 // 64x2: split into two 64x1
   9984 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
   9985          HWY_NEON_IF_NOT_EMULATED_D(D)>
   9986 HWY_API void LoadInterleaved2(D d, T* HWY_RESTRICT unaligned, Vec128<T>& v0,
   9987                              Vec128<T>& v1) {
   9988  const Half<decltype(d)> dh;
   9989  VFromD<decltype(dh)> v00, v10, v01, v11;
   9990  LoadInterleaved2(dh, detail::NativeLanePointer(unaligned), v00, v10);
   9991  LoadInterleaved2(dh, detail::NativeLanePointer(unaligned + 2), v01, v11);
   9992  v0 = Combine(d, v01, v00);
   9993  v1 = Combine(d, v11, v10);
   9994 }
   9995 #endif  // HWY_ARCH_ARM_V7
   9996 
   9997 // ------------------------------ LoadInterleaved3
   9998 
   9999 template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
  10000 HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
  10001                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  10002  auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(unaligned),
  10003                                      detail::Tuple3<T, d.MaxLanes()>());
  10004  v0 = VFromD<D>(raw.val[0]);
  10005  v1 = VFromD<D>(raw.val[1]);
  10006  v2 = VFromD<D>(raw.val[2]);
  10007 }
  10008 
  10009 // <= 32 bits: avoid writing more than N bytes by copying to buffer
  10010 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
  10011          typename T = TFromD<D>>
  10012 HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
  10013                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
  10014  // The smallest vector registers are 64-bits and we want space for three.
  10015  alignas(16) T buf[3 * 8 / sizeof(T)] = {};
  10016  CopyBytes<d.MaxBytes() * 3>(unaligned, buf);
  10017  auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(buf),
  10018                                      detail::Tuple3<T, d.MaxLanes()>());
  10019  v0 = VFromD<D>(raw.val[0]);
  10020  v1 = VFromD<D>(raw.val[1]);
  10021  v2 = VFromD<D>(raw.val[2]);
  10022 }
  10023 
  10024 #if HWY_ARCH_ARM_V7
  10025 // 64x2: split into two 64x1
  10026 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
  10027          HWY_NEON_IF_NOT_EMULATED_D(D)>
  10028 HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
  10029                              Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
  10030  const Half<decltype(d)> dh;
  10031  VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
  10032  LoadInterleaved3(dh, detail::NativeLanePointer(unaligned), v00, v10, v20);
  10033  LoadInterleaved3(dh, detail::NativeLanePointer(unaligned + 3), v01, v11, v21);
  10034  v0 = Combine(d, v01, v00);
  10035  v1 = Combine(d, v11, v10);
  10036  v2 = Combine(d, v21, v20);
  10037 }
  10038 #endif  // HWY_ARCH_ARM_V7
  10039 
  10040 // ------------------------------ LoadInterleaved4
  10041 
  10042 template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
  10043 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
  10044                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
  10045                              VFromD<D>& v3) {
  10046  auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(unaligned),
  10047                                      detail::Tuple4<T, d.MaxLanes()>());
  10048  v0 = VFromD<D>(raw.val[0]);
  10049  v1 = VFromD<D>(raw.val[1]);
  10050  v2 = VFromD<D>(raw.val[2]);
  10051  v3 = VFromD<D>(raw.val[3]);
  10052 }
  10053 
  10054 // <= 32 bits: avoid writing more than N bytes by copying to buffer
  10055 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
  10056          typename T = TFromD<D>>
  10057 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
  10058                              VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
  10059                              VFromD<D>& v3) {
  10060  alignas(16) T buf[4 * 8 / sizeof(T)] = {};
  10061  CopyBytes<d.MaxBytes() * 4>(unaligned, buf);
  10062  auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(buf),
  10063                                      detail::Tuple4<T, d.MaxLanes()>());
  10064  v0 = VFromD<D>(raw.val[0]);
  10065  v1 = VFromD<D>(raw.val[1]);
  10066  v2 = VFromD<D>(raw.val[2]);
  10067  v3 = VFromD<D>(raw.val[3]);
  10068 }
  10069 
  10070 #if HWY_ARCH_ARM_V7
  10071 // 64x2: split into two 64x1
  10072 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
  10073          HWY_NEON_IF_NOT_EMULATED_D(D)>
  10074 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
  10075                              Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
  10076                              Vec128<T>& v3) {
  10077  const Half<decltype(d)> dh;
  10078  VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
  10079  LoadInterleaved4(dh, detail::NativeLanePointer(unaligned), v00, v10, v20,
  10080                   v30);
  10081  LoadInterleaved4(dh, detail::NativeLanePointer(unaligned + 4), v01, v11, v21,
  10082                   v31);
  10083  v0 = Combine(d, v01, v00);
  10084  v1 = Combine(d, v11, v10);
  10085  v2 = Combine(d, v21, v20);
  10086  v3 = Combine(d, v31, v30);
  10087 }
  10088 #endif  // HWY_ARCH_ARM_V7
  10089 
  10090 #undef HWY_IF_LOAD_INT
  10091 
  10092 // ------------------------------ StoreInterleaved2
  10093 
namespace detail {
// Macro plumbing for StoreInterleaved[234]: generates wrappers around the
// vst2/vst3/vst4 NEON intrinsics for every supported lane type. The BUILD_*
// macros feed the generic HWY_NEON_DEF_FUNCTION machinery defined earlier in
// this header.
#define HWY_NEON_BUILD_TPL_HWY_STORE_INT
#define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
#define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw

#if HWY_ARCH_ARM_A64
// aarch64 supports interleaved stores for all types (including bf16).
#define HWY_IF_STORE_INT(D) \
  HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D)
#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
#else
// Exclude 64x2 and f64x1, which are only supported on aarch64; also exclude any
// emulated types.
#define HWY_IF_STORE_INT(D)                                                \
  HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),                 \
      hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
          nullptr
#define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)     \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)    \
  HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)     \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)  \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
#endif  // HWY_ARCH_ARM_A64

// The PARAM macro is redefined per arity so each TupleN overload is emitted,
// then undefined again so the next arity can redefine it.
#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
  Tuple2<type##_t, size> tup, NativeLaneType<type##_t>*to
HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT)
#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT

#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
  Tuple3<type##_t, size> tup, NativeLaneType<type##_t>*to
HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT)
#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT

#define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \
  Tuple4<type##_t, size> tup, NativeLaneType<type##_t>*to
HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT)
#undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT

// Clean up the generator macros; HWY_IF_STORE_INT remains in use by the
// public StoreInterleaved[234] wrappers below and is undefined after them.
#undef HWY_NEON_DEF_FUNCTION_STORE_INT
#undef HWY_NEON_BUILD_TPL_HWY_STORE_INT
#undef HWY_NEON_BUILD_RET_HWY_STORE_INT
#undef HWY_NEON_BUILD_ARG_HWY_STORE_INT
}  // namespace detail
  10141 
  10142 template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
  10143 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
  10144                               T* HWY_RESTRICT unaligned) {
  10145  detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
  10146  detail::StoreInterleaved2(tup, detail::NativeLanePointer(unaligned));
  10147 }
  10148 
  10149 // <= 32 bits: avoid writing more than N bytes by copying to buffer
  10150 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
  10151          typename T = TFromD<D>>
  10152 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
  10153                               T* HWY_RESTRICT unaligned) {
  10154  alignas(16) T buf[2 * 8 / sizeof(T)];
  10155  detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}};
  10156  detail::StoreInterleaved2(tup, detail::NativeLanePointer(buf));
  10157  CopyBytes<d.MaxBytes() * 2>(buf, unaligned);
  10158 }
  10159 
  10160 #if HWY_ARCH_ARM_V7
  10161 // 64x2: split into two 64x1
  10162 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
  10163          HWY_NEON_IF_NOT_EMULATED_D(D)>
  10164 HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d,
  10165                               T* HWY_RESTRICT unaligned) {
  10166  const Half<decltype(d)> dh;
  10167  StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh,
  10168                    detail::NativeLanePointer(unaligned));
  10169  StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh,
  10170                    detail::NativeLanePointer(unaligned + 2));
  10171 }
  10172 #endif  // HWY_ARCH_ARM_V7
  10173 
  10174 // ------------------------------ StoreInterleaved3
  10175 
  10176 template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
  10177 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
  10178                               T* HWY_RESTRICT unaligned) {
  10179  detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
  10180  detail::StoreInterleaved3(tup, detail::NativeLanePointer(unaligned));
  10181 }
  10182 
  10183 // <= 32 bits: avoid writing more than N bytes by copying to buffer
  10184 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
  10185          typename T = TFromD<D>>
  10186 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
  10187                               T* HWY_RESTRICT unaligned) {
  10188  alignas(16) T buf[3 * 8 / sizeof(T)];
  10189  detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
  10190  detail::StoreInterleaved3(tup, detail::NativeLanePointer(buf));
  10191  CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
  10192 }
  10193 
  10194 #if HWY_ARCH_ARM_V7
  10195 // 64x2: split into two 64x1
  10196 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
  10197          HWY_NEON_IF_NOT_EMULATED_D(D)>
  10198 HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d,
  10199                               T* HWY_RESTRICT unaligned) {
  10200  const Half<decltype(d)> dh;
  10201  StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
  10202                    detail::NativeLanePointer(unaligned));
  10203  StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
  10204                    detail::NativeLanePointer(unaligned + 3));
  10205 }
  10206 #endif  // HWY_ARCH_ARM_V7
  10207 
  10208 // ------------------------------ StoreInterleaved4
  10209 
  10210 template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
  10211 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
  10212                               VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
  10213  detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
  10214  detail::StoreInterleaved4(tup, detail::NativeLanePointer(unaligned));
  10215 }
  10216 
  10217 // <= 32 bits: avoid writing more than N bytes by copying to buffer
  10218 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
  10219          typename T = TFromD<D>>
  10220 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
  10221                               VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
  10222  alignas(16) T buf[4 * 8 / sizeof(T)];
  10223  detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
  10224  detail::StoreInterleaved4(tup, detail::NativeLanePointer(buf));
  10225  CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
  10226 }
  10227 
  10228 #if HWY_ARCH_ARM_V7
  10229 // 64x2: split into two 64x1
  10230 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
  10231          HWY_NEON_IF_NOT_EMULATED_D(D)>
  10232 HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
  10233                               Vec128<T> v3, D d, T* HWY_RESTRICT unaligned) {
  10234  const Half<decltype(d)> dh;
  10235  StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
  10236                    LowerHalf(dh, v3), dh,
  10237                    detail::NativeLanePointer(unaligned));
  10238  StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
  10239                    UpperHalf(dh, v3), dh,
  10240                    detail::NativeLanePointer(unaligned + 4));
  10241 }
  10242 #endif  // HWY_ARCH_ARM_V7
  10243 
  10244 #undef HWY_IF_STORE_INT
  10245 
  10246 // Fall back on generic Load/StoreInterleaved[234] for any emulated types.
  10247 // Requires HWY_GENERIC_IF_EMULATED_D mirrors HWY_NEON_IF_EMULATED_D.
  10248 
  10249 // ------------------------------ Additional mask logical operations
// SetAtOrAfterFirst: returns a mask that is true at the first true lane of
// `mask` and at every lane after it; all-false input yields all-false output.

// 1 lane: the answer is the mask itself.
template <class T>
HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) {
  return mask;
}
// 2 lanes: OR the mask with a copy of lane 0 broadcast into lane 1
// (InterleaveLower(vmask, vmask) places lane 0 into both lanes).
template <class T>
HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const auto vmask = VecFromMask(d, mask);
  return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask)));
}
// > 2 lanes, <= 64-bit vector: view the whole vector as a single i64.
// Since mask lanes are all-ones or all-zero, Or(x, -x) sets every bit at or
// above the lowest set bit, which fills all lanes at/after the first true one.
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const auto vmask = VecFromMask(d, mask);
  const auto neg_vmask =
      ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask)));
  return MaskFromVec(Or(vmask, neg_vmask));
}
// 128-bit vector: apply the Or(x, -x) trick independently per 64-bit half,
// then propagate from the lower half into the upper half via the sign bit.
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) {
  const Full128<T> d;
  const Repartition<int64_t, decltype(d)> di64;

  auto vmask = BitCast(di64, VecFromMask(d, mask));
  vmask = Or(vmask, Neg(vmask));

  // Copy the sign bit of the first int64_t lane to the second int64_t lane
  const auto vmask2 = BroadcastSignBit(InterleaveLower(Zero(di64), vmask));
  return MaskFromVec(BitCast(d, Or(vmask, vmask2)));
}
  10280 
// SetBeforeFirst: true for all lanes strictly before the first true lane of
// `mask` (the complement of SetAtOrAfterFirst).
template <class T, size_t N>
HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) {
  return Not(SetAtOrAfterFirst(mask));
}
  10285 
// SetOnlyFirst: true only at the first true lane of `mask`; all-false input
// yields all-false output.

// 1 lane: the answer is the mask itself.
template <class T>
HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) {
  return mask;
}
// 2 lanes: keep lane 1 only if lane 0 is false. InterleaveLower(zero, vmask)
// moves lane 0 of vmask into lane 1; comparing against zero yields a mask
// whose lane 1 is false exactly when lane 0 of `mask` was set.
template <class T>
HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) {
  const FixedTag<T, 2> d;
  const RebindToSigned<decltype(d)> di;

  const auto vmask = BitCast(di, VecFromMask(d, mask));
  const auto zero = Zero(di);
  const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero);
  return MaskFromVec(BitCast(d, And(vmask, vmask2)));
}
// > 2 lanes, <= 64-bit vector: And(x, -x) over the whole vector (as one i64)
// isolates the lowest set bit, i.e. bit 0 of the first true lane; the
// per-lane Neg then expands that single bit back into an all-ones lane.
template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  const RebindToSigned<decltype(d)> di;

  const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask));
  const auto only_first_vmask =
      BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask)))));
  return MaskFromVec(only_first_vmask);
}
// 128-bit vector: apply And(x, -x) per 64-bit half, then use vmask2 (false in
// the upper half when the lower half is nonzero) to clear the upper half's
// result if the first true lane lies in the lower half.
template <class T, HWY_IF_NOT_T_SIZE(T, 8)>
HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) {
  const Full128<T> d;
  const RebindToSigned<decltype(d)> di;
  const Repartition<int64_t, decltype(d)> di64;

  const auto zero = Zero(di64);
  const auto vmask = BitCast(di64, VecFromMask(d, mask));
  const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero);
  const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask))));
  return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2))));
}
  10322 
// SetAtOrBeforeFirst: true at the first true lane of `mask` and at every lane
// before it; all-false input yields all-true output.

// 1 lane: always all-true.
template <class T>
HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) {
  const FixedTag<T, 1> d;
  const RebindToSigned<decltype(d)> di;
  using TI = MakeSigned<T>;

  return RebindMask(d, MaskFromVec(Set(di, TI(-1))));
}
// Shifting the mask up by one lane turns "at or before first" into "before
// first" of the shifted mask.
template <class T, size_t N, HWY_IF_LANES_GT(N, 1)>
HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) {
  const Simd<T, N, 0> d;
  return SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask))));
}
  10336 
  10337 // ------------------------------ Lt128
  10338 
// Returns a mask in which both u64 lanes of a 128-bit block are set iff the
// unsigned 128-bit value in `a` (upper u64 lane = high bits) is less than the
// corresponding 128-bit value in `b`.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
  // Truth table of Eq and Lt for Hi and Lo u64.
  // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
  // =H =L cH cL  | out = cH | (=H & cL)
  //  0  0  0  0  |  0
  //  0  0  0  1  |  0
  //  0  0  1  0  |  1
  //  0  0  1  1  |  1
  //  0  1  0  0  |  0
  //  0  1  0  1  |  0
  //  0  1  1  0  |  1
  //  1  0  0  0  |  0
  //  1  0  0  1  |  1
  //  1  1  0  0  |  0
  const MFromD<D> eqHL = Eq(a, b);
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  // We need to bring cL to the upper lane/bit corresponding to cH. Comparing
  // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the
  // comparison result leftwards requires only 4. IfThenElse compiles to the
  // same code as OrAnd().
  const VFromD<D> ltLx = DupEven(ltHL);
  const VFromD<D> outHx = IfThenElse(eqHL, ltLx, ltHL);
  return MaskFromVec(DupOdd(outHx));
}
  10365 
// As Lt128, but only the upper u64 lane of each 128-bit block is compared;
// the comparison result is broadcast to both lanes of the block.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) {
  const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
  return MaskFromVec(InterleaveUpper(d, ltHL, ltHL));
}
  10371 
  10372 // ------------------------------ Eq128
  10373 
// 128-bit equality: both u64 lanes of a block must match; the result fills
// both lanes of the block (Reverse2 swaps the lane pair so each lane sees the
// other's comparison).
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  return MaskFromVec(And(Reverse2(d, eqHL), eqHL));
}

// As Eq128, but only the upper u64 lane of each block is compared.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) {
  const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
  return MaskFromVec(InterleaveUpper(d, eqHL, eqHL));
}
  10386 
  10387 // ------------------------------ Ne128
  10388 
// 128-bit inequality: true if either u64 lane of a block differs; the result
// fills both lanes of the block.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) {
  static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64");
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  return MaskFromVec(Or(Reverse2(d, neHL), neHL));
}

// As Ne128, but only the upper u64 lane of each block is compared.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_INLINE MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) {
  const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
  return MaskFromVec(InterleaveUpper(d, neHL, neHL));
}
  10401 
  10402 // ------------------------------ Min128, Max128 (Lt128)
  10403 
// Without a native OddEven, it seems infeasible to go faster than Lt128.

// 128-bit minimum: selects a where a < b (as unsigned 128-bit), else b.
template <class D>
HWY_INLINE VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128(d, a, b), a, b);
}

// 128-bit maximum: selects a where b < a, else b.
template <class D>
HWY_INLINE VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128(d, b, a), a, b);
}

// As Min128, but the comparison uses only the upper u64 lane of each block.
template <class D>
HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, a, b), a, b);
}

// As Max128, but the comparison uses only the upper u64 lane of each block.
template <class D>
HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
  return IfThenElse(Lt128Upper(d, b, a), a, b);
}
  10424 
  10425 // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex
  10426 
// Advertise that this target provides a native LeadingZeroCount.
#ifdef HWY_NATIVE_LEADING_ZERO_COUNT
#undef HWY_NATIVE_LEADING_ZERO_COUNT
#else
#define HWY_NATIVE_LEADING_ZERO_COUNT
#endif

// 8/16/32-bit lanes map directly to the vclz intrinsics.
HWY_NEON_DEF_FUNCTION_INT_8_16_32(LeadingZeroCount, vclz, _, 1)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(LeadingZeroCount, vclz, _, 1)

// 64-bit lanes: NEON has no 64-bit vclz, so combine two 32-bit counts.
template <class V, HWY_IF_UI64_D(DFromV<V>)>
HWY_API V LeadingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  const Repartition<uint32_t, decltype(d)> du32;

  // Set(du, 32) has the per-u64 bit pattern {lo=32, hi=0}; after this add the
  // low u32 half holds clz32(lo) + 32 and the high u32 half holds clz32(hi).
  const auto v_k32 = BitCast(du32, Set(du, 32));
  const auto v_u32_lzcnt = LeadingZeroCount(BitCast(du32, v)) + v_k32;
  // Keep only the low half's count (upper u32 becomes zero).
  const auto v_u32_lo_lzcnt =
      And(v_u32_lzcnt, BitCast(du32, Set(du, 0xFFFFFFFFu)));
  // Move the high half's count down into the low u32 position.
  const auto v_u32_hi_lzcnt =
      BitCast(du32, ShiftRight<32>(BitCast(du, v_u32_lzcnt)));

  // If the high u32 of v is zero (its count equals 32), the 64-bit count is
  // 32 + clz32(lo); otherwise it is clz32(hi).
  return BitCast(
      d, IfThenElse(v_u32_hi_lzcnt == v_k32, v_u32_lo_lzcnt, v_u32_hi_lzcnt));
}
  10452 
// Index of the highest set bit per lane, computed as (bit width - 1) - clz.
// For a zero lane, clz equals the bit width, so the result wraps to the
// all-ones bit pattern (-1 when viewed as signed).
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
HWY_API V HighestSetBitIndex(V v) {
  const DFromV<decltype(v)> d;
  using T = TFromD<decltype(d)>;
  return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v));
}
  10459 
// TrailingZeroCount for 8-bit lanes: reverse the bits of each byte, then the
// leading-zero count of the reversed value is the trailing-zero count.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, 1)>
HWY_API V TrailingZeroCount(V v) {
  return LeadingZeroCount(ReverseBits(v));
}

// TrailingZeroCount for 16/32/64-bit lanes: reverse bits within each byte,
// then reverse the byte order of each lane, which together reverse the whole
// lane; the leading-zero count of that is the trailing-zero count.
template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API V TrailingZeroCount(V v) {
  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  return LeadingZeroCount(
      ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v)))));
}
  10473 
namespace detail {  // for code folding
// End-of-header cleanup: remove macros defined earlier in this header so they
// do not leak into code that includes it.
#if HWY_ARCH_ARM_V7
// The aarch64-only vuzp1/vuzp2/vzip1/vzip2 intrinsic names — presumably
// defined as macros earlier in this header for Armv7 builds (outside this
// excerpt) — are removed here.
#undef vuzp1_s8
#undef vuzp1_u8
#undef vuzp1_s16
#undef vuzp1_u16
#undef vuzp1_s32
#undef vuzp1_u32
#undef vuzp1_f32
#undef vuzp1q_s8
#undef vuzp1q_u8
#undef vuzp1q_s16
#undef vuzp1q_u16
#undef vuzp1q_s32
#undef vuzp1q_u32
#undef vuzp1q_f32
#undef vuzp2_s8
#undef vuzp2_u8
#undef vuzp2_s16
#undef vuzp2_u16
#undef vuzp2_s32
#undef vuzp2_u32
#undef vuzp2_f32
#undef vuzp2q_s8
#undef vuzp2q_u8
#undef vuzp2q_s16
#undef vuzp2q_u16
#undef vuzp2q_s32
#undef vuzp2q_u32
#undef vuzp2q_f32
#undef vzip1_s8
#undef vzip1_u8
#undef vzip1_s16
#undef vzip1_u16
#undef vzip1_s32
#undef vzip1_u32
#undef vzip1_f32
#undef vzip1q_s8
#undef vzip1q_u8
#undef vzip1q_s16
#undef vzip1q_u16
#undef vzip1q_s32
#undef vzip1q_u32
#undef vzip1q_f32
#undef vzip2_s8
#undef vzip2_u8
#undef vzip2_s16
#undef vzip2_u16
#undef vzip2_s32
#undef vzip2_u32
#undef vzip2_f32
#undef vzip2q_s8
#undef vzip2q_u8
#undef vzip2q_s16
#undef vzip2q_u16
#undef vzip2q_s32
#undef vzip2q_u32
#undef vzip2q_f32
#endif

// Remove the HWY_NEON_* code-generation helper macros.
#undef HWY_NEON_BUILD_ARG_1
#undef HWY_NEON_BUILD_ARG_2
#undef HWY_NEON_BUILD_ARG_3
#undef HWY_NEON_BUILD_PARAM_1
#undef HWY_NEON_BUILD_PARAM_2
#undef HWY_NEON_BUILD_PARAM_3
#undef HWY_NEON_BUILD_RET_1
#undef HWY_NEON_BUILD_RET_2
#undef HWY_NEON_BUILD_RET_3
#undef HWY_NEON_BUILD_TPL_1
#undef HWY_NEON_BUILD_TPL_2
#undef HWY_NEON_BUILD_TPL_3
#undef HWY_NEON_DEF_FUNCTION
#undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS
#undef HWY_NEON_DEF_FUNCTION_ALL_TYPES
#undef HWY_NEON_DEF_FUNCTION_BFLOAT_16
#undef HWY_NEON_DEF_FUNCTION_FLOAT_16
#undef HWY_NEON_DEF_FUNCTION_FLOAT_16_32
#undef HWY_NEON_DEF_FUNCTION_FLOAT_32
#undef HWY_NEON_DEF_FUNCTION_FLOAT_64
#undef HWY_NEON_DEF_FUNCTION_FULL_UI
#undef HWY_NEON_DEF_FUNCTION_FULL_UI_64
#undef HWY_NEON_DEF_FUNCTION_FULL_UIF_64
#undef HWY_NEON_DEF_FUNCTION_INT_16
#undef HWY_NEON_DEF_FUNCTION_INT_32
#undef HWY_NEON_DEF_FUNCTION_INT_64
#undef HWY_NEON_DEF_FUNCTION_INT_8
#undef HWY_NEON_DEF_FUNCTION_INT_8_16_32
#undef HWY_NEON_DEF_FUNCTION_INTS
#undef HWY_NEON_DEF_FUNCTION_INTS_UINTS
#undef HWY_NEON_DEF_FUNCTION_UI_8_16_32
#undef HWY_NEON_DEF_FUNCTION_UIF_64
#undef HWY_NEON_DEF_FUNCTION_UIF_8_16_32
#undef HWY_NEON_DEF_FUNCTION_UINT_16
#undef HWY_NEON_DEF_FUNCTION_UINT_32
#undef HWY_NEON_DEF_FUNCTION_UINT_64
#undef HWY_NEON_DEF_FUNCTION_UINT_8
#undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32
#undef HWY_NEON_DEF_FUNCTION_UINTS
#undef HWY_NEON_EVAL
#undef HWY_NEON_IF_EMULATED_D
#undef HWY_NEON_IF_NOT_EMULATED_D
}  // namespace detail
  10577 
  10578 // NOLINTNEXTLINE(google-readability-namespace-comments)
  10579 }  // namespace HWY_NAMESPACE
  10580 }  // namespace hwy
  10581 HWY_AFTER_NAMESPACE();