// Copyright 2019 Google LLC
// Copyright 2024 Arm Limited and/or its affiliates <open-source-office@arm.com>
// SPDX-License-Identifier: Apache-2.0
// SPDX-License-Identifier: BSD-3-Clause
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// 128-bit Arm NEON vectors and operations.
// External include guard in highway.h - see comment there.

// Arm NEON intrinsics are documented at:
// https://developer.arm.com/architectures/instruction-sets/intrinsics/#f:@navigationhierarchiessimdisa=[Neon]

#include "hwy/base.h"
#include "hwy/ops/shared-inl.h"

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wuninitialized")
#include <arm_neon.h>  // NOLINT(build/include_order)
HWY_DIAGNOSTICS(pop)

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

namespace detail {  // for code folding and Raw128

// Macros used to define single and double function calls for multiple types
// for full and half vectors. These macros are undefined at the end of the
// file.

// HWY_NEON_BUILD_TPL_* is the template<...> prefix to the function.
#define HWY_NEON_BUILD_TPL_1
#define HWY_NEON_BUILD_TPL_2
#define HWY_NEON_BUILD_TPL_3

// HWY_NEON_BUILD_RET_* is the return type; the type argument is passed without
// its _t suffix so we can extend it to int32x4x2_t packs.
#define HWY_NEON_BUILD_RET_1(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_2(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_RET_3(type, size) Vec128<type##_t, size>

// HWY_NEON_BUILD_PARAM_* is the list of parameters the function receives.
#define HWY_NEON_BUILD_PARAM_1(type, size) const Vec128<type##_t, size> a
#define HWY_NEON_BUILD_PARAM_2(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_PARAM_3(type, size)                        \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b, \
      const Vec128<type##_t, size> c

// HWY_NEON_BUILD_ARG_* is the list of arguments passed to the underlying
// function.
#define HWY_NEON_BUILD_ARG_1 a.raw
#define HWY_NEON_BUILD_ARG_2 a.raw, b.raw
#define HWY_NEON_BUILD_ARG_3 a.raw, b.raw, c.raw

// We use HWY_NEON_EVAL(func, ...) to delay the evaluation of func until after
// the __VA_ARGS__ have been expanded. This allows "func" to itself be a macro,
// as is the case for some of the library "functions" such as vshlq_u8. For
// example, HWY_NEON_EVAL(vshlq_u8, MY_PARAMS), where MY_PARAMS is defined as
// "a, b" (without the quotes), ends up expanding to "vshlq_u8(a, b)" if
// needed. Directly writing vshlq_u8(MY_PARAMS) would fail because the
// vshlq_u8() macro expects two arguments.
#define HWY_NEON_EVAL(func, ...) func(__VA_ARGS__)

// Main macro definition that defines a single function for the given type and
// size of vector, using the underlying (prefix##infix##suffix) function and
// the template, return type, parameters and arguments defined by the "args"
// parameters passed here (see the HWY_NEON_BUILD_* macros defined above).
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_CONCAT(HWY_NEON_BUILD_TPL_, args)                                      \
  HWY_API HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)                  \
      name(HWY_CONCAT(HWY_NEON_BUILD_PARAM_, args)(type, size)) {            \
    return HWY_CONCAT(HWY_NEON_BUILD_RET_, args)(type, size)(                \
        HWY_NEON_EVAL(prefix##infix##suffix, HWY_NEON_BUILD_ARG_##args));    \
  }

// The HWY_NEON_DEF_FUNCTION_* macros define all the variants of a function
// called "name" using the set of NEON functions starting with the given
// "prefix", for all the variants of certain types, as specified next to each
// macro. For example, the prefix "vsub" can be used to define operator- with
// args=2.
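
// For illustration, the hypothetical instantiation
//   HWY_NEON_DEF_FUNCTION(uint8, 16, operator-, vsubq, _, u8, 2)
// expands, via the *_2 helper macros above, to roughly:
//
//   HWY_API Vec128<uint8_t, 16> operator-(const Vec128<uint8_t, 16> a,
//                                         const Vec128<uint8_t, 16> b) {
//     return Vec128<uint8_t, 16>(vsubq_u8(a.raw, b.raw));
//   }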

// uint8_t
#define HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args) \
  HWY_NEON_DEF_FUNCTION(uint8, 8, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 4, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 2, name, prefix, infix, u8, args)     \
  HWY_NEON_DEF_FUNCTION(uint8, 1, name, prefix, infix, u8, args)

// int8_t
#define HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args) \
  HWY_NEON_DEF_FUNCTION(int8, 8, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 4, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 2, name, prefix, infix, s8, args)     \
  HWY_NEON_DEF_FUNCTION(int8, 1, name, prefix, infix, s8, args)

// uint16_t
#define HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint16, 4, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 2, name, prefix, infix, u16, args)    \
  HWY_NEON_DEF_FUNCTION(uint16, 1, name, prefix, infix, u16, args)

// int16_t
#define HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args) \
  HWY_NEON_DEF_FUNCTION(int16, 4, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 2, name, prefix, infix, s16, args)    \
  HWY_NEON_DEF_FUNCTION(int16, 1, name, prefix, infix, s16, args)

// uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 2, name, prefix, infix, u32, args)    \
  HWY_NEON_DEF_FUNCTION(uint32, 1, name, prefix, infix, u32, args)

// int32_t
#define HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args) \
  HWY_NEON_DEF_FUNCTION(int32, 2, name, prefix, infix, s32, args)    \
  HWY_NEON_DEF_FUNCTION(int32, 1, name, prefix, infix, s32, args)

// uint64_t
#define HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)

// int64_t
#define HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args) \
  HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args)

// bfloat16_t
#if HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION(bfloat16, 8, name, prefix##q, infix, bf16, args) \
  HWY_NEON_DEF_FUNCTION(bfloat16, 4, name, prefix, infix, bf16, args)    \
  HWY_NEON_DEF_FUNCTION(bfloat16, 2, name, prefix, infix, bf16, args)    \
  HWY_NEON_DEF_FUNCTION(bfloat16, 1, name, prefix, infix, bf16, args)
#else
#define HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args)
#endif

// Used for conversion instructions if HWY_NEON_HAVE_F16C.
#define HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, \
                                                     args)                \
  HWY_NEON_DEF_FUNCTION(float16, 8, name, prefix##q, infix, f16, args)    \
  HWY_NEON_DEF_FUNCTION(float16, 4, name, prefix, infix, f16, args)       \
  HWY_NEON_DEF_FUNCTION(float16, 2, name, prefix, infix, f16, args)       \
  HWY_NEON_DEF_FUNCTION(float16, 1, name, prefix, infix, f16, args)

// float16_t
#if HWY_HAVE_FLOAT16
#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(name, prefix, infix, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)
#endif

// Enable generic functions for whichever of (f16, bf16) are not supported.
#if !HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
#define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_SPECIAL_FLOAT_D(D)
#elif !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_F16_D(D)
#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_F16_D(D)
#define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_F16_D(D)
#elif HWY_HAVE_FLOAT16 && !HWY_NEON_HAVE_BFLOAT16
#define HWY_NEON_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#define HWY_GENERIC_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
#define HWY_NEON_IF_NOT_EMULATED_D(D) HWY_IF_NOT_BF16_D(D)
#elif HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_BFLOAT16
// NOTE: hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr is used instead of
// hwy::EnableIf<false>* = nullptr to avoid compiler errors: because
// !hwy::IsSame<D, D>() depends on the template argument D, it triggers SFINAE
// instead of a hard error, even though it always evaluates to false.
#define HWY_NEON_IF_EMULATED_D(D) hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
#define HWY_GENERIC_IF_EMULATED_D(D) \
  hwy::EnableIf<!hwy::IsSame<D, D>()>* = nullptr
#define HWY_NEON_IF_NOT_EMULATED_D(D) hwy::EnableIf<true>* = nullptr
#else
#error "Logic error, handled all four cases"
#endif
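
// For example, on a target where HWY_HAVE_FLOAT16 is 1 but
// HWY_NEON_HAVE_BFLOAT16 is 0, HWY_NEON_IF_EMULATED_D(D) matches only bf16
// tags, so only the bf16 overloads fall back to the emulated implementations
// below, which bit-cast to u16 vectors.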

// float
#define HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float32, 4, name, prefix##q, infix, f32, args) \
  HWY_NEON_DEF_FUNCTION(float32, 2, name, prefix, infix, f32, args)    \
  HWY_NEON_DEF_FUNCTION(float32, 1, name, prefix, infix, f32, args)

// double
#if HWY_HAVE_FLOAT64
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args) \
  HWY_NEON_DEF_FUNCTION(float64, 1, name, prefix, infix, f64, args)
#else
#define HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)
#endif

// Helper macros to define for more than one type.
// uint8_t, uint16_t and uint32_t
#define HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_UINT_32(name, prefix, infix, args)

// int8_t, int16_t and int32_t
#define HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_8(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_INT_16(name, prefix, infix, args)            \
  HWY_NEON_DEF_FUNCTION_INT_32(name, prefix, infix, args)

// uint8_t, uint16_t, uint32_t and uint64_t
#define HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)

// int8_t, int16_t, int32_t and int64_t
#define HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)  \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)

// All int*_t and uint*_t up to 64
#define HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS(name, prefix, infix, args)             \
  HWY_NEON_DEF_FUNCTION_UINTS(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16(name, prefix, infix, args)          \
  HWY_NEON_DEF_FUNCTION_FLOAT_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

// All previous types.
#define HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_INTS_UINTS(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_ALL_FLOATS(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args)     \
  HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UIF_8_16_32(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UI_8_16_32(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args)

#define HWY_NEON_DEF_FUNCTION_UIF_64(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_UINT_64(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION_INT_64(name, prefix, infix, args)       \
  HWY_NEON_DEF_FUNCTION_FLOAT_64(name, prefix, infix, args)

// For vzip1/2
#define HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)   \
  HWY_NEON_DEF_FUNCTION(uint64, 2, name, prefix##q, infix, u64, args) \
  HWY_NEON_DEF_FUNCTION(int64, 2, name, prefix##q, infix, s64, args)
#define HWY_NEON_DEF_FUNCTION_FULL_UIF_64(name, prefix, infix, args) \
  HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)        \
  HWY_NEON_DEF_FUNCTION(float64, 2, name, prefix##q, infix, f64, args)

// For eor3q, which is only defined for full vectors.
#define HWY_NEON_DEF_FUNCTION_FULL_UI(name, prefix, infix, args)      \
  HWY_NEON_DEF_FUNCTION(uint8, 16, name, prefix##q, infix, u8, args)  \
  HWY_NEON_DEF_FUNCTION(uint16, 8, name, prefix##q, infix, u16, args) \
  HWY_NEON_DEF_FUNCTION(uint32, 4, name, prefix##q, infix, u32, args) \
  HWY_NEON_DEF_FUNCTION(int8, 16, name, prefix##q, infix, s8, args)   \
  HWY_NEON_DEF_FUNCTION(int16, 8, name, prefix##q, infix, s16, args)  \
  HWY_NEON_DEF_FUNCTION(int32, 4, name, prefix##q, infix, s32, args)  \
  HWY_NEON_DEF_FUNCTION_FULL_UI_64(name, prefix, infix, args)

// Emulation of some intrinsics on armv7.
#if HWY_ARCH_ARM_V7
#define vuzp1_s8(x, y) vuzp_s8(x, y).val[0]
#define vuzp1_u8(x, y) vuzp_u8(x, y).val[0]
#define vuzp1_s16(x, y) vuzp_s16(x, y).val[0]
#define vuzp1_u16(x, y) vuzp_u16(x, y).val[0]
#define vuzp1_s32(x, y) vuzp_s32(x, y).val[0]
#define vuzp1_u32(x, y) vuzp_u32(x, y).val[0]
#define vuzp1_f32(x, y) vuzp_f32(x, y).val[0]
#define vuzp1q_s8(x, y) vuzpq_s8(x, y).val[0]
#define vuzp1q_u8(x, y) vuzpq_u8(x, y).val[0]
#define vuzp1q_s16(x, y) vuzpq_s16(x, y).val[0]
#define vuzp1q_u16(x, y) vuzpq_u16(x, y).val[0]
#define vuzp1q_s32(x, y) vuzpq_s32(x, y).val[0]
#define vuzp1q_u32(x, y) vuzpq_u32(x, y).val[0]
#define vuzp1q_f32(x, y) vuzpq_f32(x, y).val[0]
#define vuzp2_s8(x, y) vuzp_s8(x, y).val[1]
#define vuzp2_u8(x, y) vuzp_u8(x, y).val[1]
#define vuzp2_s16(x, y) vuzp_s16(x, y).val[1]
#define vuzp2_u16(x, y) vuzp_u16(x, y).val[1]
#define vuzp2_s32(x, y) vuzp_s32(x, y).val[1]
#define vuzp2_u32(x, y) vuzp_u32(x, y).val[1]
#define vuzp2_f32(x, y) vuzp_f32(x, y).val[1]
#define vuzp2q_s8(x, y) vuzpq_s8(x, y).val[1]
#define vuzp2q_u8(x, y) vuzpq_u8(x, y).val[1]
#define vuzp2q_s16(x, y) vuzpq_s16(x, y).val[1]
#define vuzp2q_u16(x, y) vuzpq_u16(x, y).val[1]
#define vuzp2q_s32(x, y) vuzpq_s32(x, y).val[1]
#define vuzp2q_u32(x, y) vuzpq_u32(x, y).val[1]
#define vuzp2q_f32(x, y) vuzpq_f32(x, y).val[1]
#define vzip1_s8(x, y) vzip_s8(x, y).val[0]
#define vzip1_u8(x, y) vzip_u8(x, y).val[0]
#define vzip1_s16(x, y) vzip_s16(x, y).val[0]
#define vzip1_u16(x, y) vzip_u16(x, y).val[0]
#define vzip1_f32(x, y) vzip_f32(x, y).val[0]
#define vzip1_u32(x, y) vzip_u32(x, y).val[0]
#define vzip1_s32(x, y) vzip_s32(x, y).val[0]
#define vzip1q_s8(x, y) vzipq_s8(x, y).val[0]
#define vzip1q_u8(x, y) vzipq_u8(x, y).val[0]
#define vzip1q_s16(x, y) vzipq_s16(x, y).val[0]
#define vzip1q_u16(x, y) vzipq_u16(x, y).val[0]
#define vzip1q_s32(x, y) vzipq_s32(x, y).val[0]
#define vzip1q_u32(x, y) vzipq_u32(x, y).val[0]
#define vzip1q_f32(x, y) vzipq_f32(x, y).val[0]
#define vzip2_s8(x, y) vzip_s8(x, y).val[1]
#define vzip2_u8(x, y) vzip_u8(x, y).val[1]
#define vzip2_s16(x, y) vzip_s16(x, y).val[1]
#define vzip2_u16(x, y) vzip_u16(x, y).val[1]
#define vzip2_s32(x, y) vzip_s32(x, y).val[1]
#define vzip2_u32(x, y) vzip_u32(x, y).val[1]
#define vzip2_f32(x, y) vzip_f32(x, y).val[1]
#define vzip2q_s8(x, y) vzipq_s8(x, y).val[1]
#define vzip2q_u8(x, y) vzipq_u8(x, y).val[1]
#define vzip2q_s16(x, y) vzipq_s16(x, y).val[1]
#define vzip2q_u16(x, y) vzipq_u16(x, y).val[1]
#define vzip2q_s32(x, y) vzipq_s32(x, y).val[1]
#define vzip2q_u32(x, y) vzipq_u32(x, y).val[1]
#define vzip2q_f32(x, y) vzipq_f32(x, y).val[1]
#endif

// Wrappers over uint8x16x2_t etc. so we can define StoreInterleaved2
// overloads for all vector types, even those (bfloat16_t) where the
// underlying vector is the same as others (uint16_t).
template <typename T, size_t N>
struct Tuple2;
template <typename T, size_t N>
struct Tuple3;
template <typename T, size_t N>
struct Tuple4;

template <>
struct Tuple2<uint8_t, 16> {
  uint8x16x2_t raw;
};
template <size_t N>
struct Tuple2<uint8_t, N> {
  uint8x8x2_t raw;
};
template <>
struct Tuple2<int8_t, 16> {
  int8x16x2_t raw;
};
template <size_t N>
struct Tuple2<int8_t, N> {
  int8x8x2_t raw;
};
template <>
struct Tuple2<uint16_t, 8> {
  uint16x8x2_t raw;
};
template <size_t N>
struct Tuple2<uint16_t, N> {
  uint16x4x2_t raw;
};
template <>
struct Tuple2<int16_t, 8> {
  int16x8x2_t raw;
};
template <size_t N>
struct Tuple2<int16_t, N> {
  int16x4x2_t raw;
};
template <>
struct Tuple2<uint32_t, 4> {
  uint32x4x2_t raw;
};
template <size_t N>
struct Tuple2<uint32_t, N> {
  uint32x2x2_t raw;
};
template <>
struct Tuple2<int32_t, 4> {
  int32x4x2_t raw;
};
template <size_t N>
struct Tuple2<int32_t, N> {
  int32x2x2_t raw;
};
template <>
struct Tuple2<uint64_t, 2> {
  uint64x2x2_t raw;
};
template <size_t N>
struct Tuple2<uint64_t, N> {
  uint64x1x2_t raw;
};
template <>
struct Tuple2<int64_t, 2> {
  int64x2x2_t raw;
};
template <size_t N>
struct Tuple2<int64_t, N> {
  int64x1x2_t raw;
};

template <>
struct Tuple2<float32_t, 4> {
  float32x4x2_t raw;
};
template <size_t N>
struct Tuple2<float32_t, N> {
  float32x2x2_t raw;
};
#if HWY_HAVE_FLOAT64
template <>
struct Tuple2<float64_t, 2> {
  float64x2x2_t raw;
};
template <size_t N>
struct Tuple2<float64_t, N> {
  float64x1x2_t raw;
};
#endif  // HWY_HAVE_FLOAT64

template <>
struct Tuple3<uint8_t, 16> {
  uint8x16x3_t raw;
};
template <size_t N>
struct Tuple3<uint8_t, N> {
  uint8x8x3_t raw;
};
template <>
struct Tuple3<int8_t, 16> {
  int8x16x3_t raw;
};
template <size_t N>
struct Tuple3<int8_t, N> {
  int8x8x3_t raw;
};
template <>
struct Tuple3<uint16_t, 8> {
  uint16x8x3_t raw;
};
template <size_t N>
struct Tuple3<uint16_t, N> {
  uint16x4x3_t raw;
};
template <>
struct Tuple3<int16_t, 8> {
  int16x8x3_t raw;
};
template <size_t N>
struct Tuple3<int16_t, N> {
  int16x4x3_t raw;
};
template <>
struct Tuple3<uint32_t, 4> {
  uint32x4x3_t raw;
};
template <size_t N>
struct Tuple3<uint32_t, N> {
  uint32x2x3_t raw;
};
template <>
struct Tuple3<int32_t, 4> {
  int32x4x3_t raw;
};
template <size_t N>
struct Tuple3<int32_t, N> {
  int32x2x3_t raw;
};
template <>
struct Tuple3<uint64_t, 2> {
  uint64x2x3_t raw;
};
template <size_t N>
struct Tuple3<uint64_t, N> {
  uint64x1x3_t raw;
};
template <>
struct Tuple3<int64_t, 2> {
  int64x2x3_t raw;
};
template <size_t N>
struct Tuple3<int64_t, N> {
  int64x1x3_t raw;
};

template <>
struct Tuple3<float32_t, 4> {
  float32x4x3_t raw;
};
template <size_t N>
struct Tuple3<float32_t, N> {
  float32x2x3_t raw;
};
#if HWY_HAVE_FLOAT64
template <>
struct Tuple3<float64_t, 2> {
  float64x2x3_t raw;
};
template <size_t N>
struct Tuple3<float64_t, N> {
  float64x1x3_t raw;
};
#endif  // HWY_HAVE_FLOAT64

template <>
struct Tuple4<uint8_t, 16> {
  uint8x16x4_t raw;
};
template <size_t N>
struct Tuple4<uint8_t, N> {
  uint8x8x4_t raw;
};
template <>
struct Tuple4<int8_t, 16> {
  int8x16x4_t raw;
};
template <size_t N>
struct Tuple4<int8_t, N> {
  int8x8x4_t raw;
};
template <>
struct Tuple4<uint16_t, 8> {
  uint16x8x4_t raw;
};
template <size_t N>
struct Tuple4<uint16_t, N> {
  uint16x4x4_t raw;
};
template <>
struct Tuple4<int16_t, 8> {
  int16x8x4_t raw;
};
template <size_t N>
struct Tuple4<int16_t, N> {
  int16x4x4_t raw;
};
template <>
struct Tuple4<uint32_t, 4> {
  uint32x4x4_t raw;
};
template <size_t N>
struct Tuple4<uint32_t, N> {
  uint32x2x4_t raw;
};
template <>
struct Tuple4<int32_t, 4> {
  int32x4x4_t raw;
};
template <size_t N>
struct Tuple4<int32_t, N> {
  int32x2x4_t raw;
};
template <>
struct Tuple4<uint64_t, 2> {
  uint64x2x4_t raw;
};
template <size_t N>
struct Tuple4<uint64_t, N> {
  uint64x1x4_t raw;
};
template <>
struct Tuple4<int64_t, 2> {
  int64x2x4_t raw;
};
template <size_t N>
struct Tuple4<int64_t, N> {
  int64x1x4_t raw;
};

template <>
struct Tuple4<float32_t, 4> {
  float32x4x4_t raw;
};
template <size_t N>
struct Tuple4<float32_t, N> {
  float32x2x4_t raw;
};
#if HWY_HAVE_FLOAT64
template <>
struct Tuple4<float64_t, 2> {
  float64x2x4_t raw;
};
template <size_t N>
struct Tuple4<float64_t, N> {
  float64x1x4_t raw;
};
#endif  // HWY_HAVE_FLOAT64

template <typename T, size_t N>
struct Raw128;

template <>
struct Raw128<uint8_t, 16> {
  using type = uint8x16_t;
};
template <size_t N>
struct Raw128<uint8_t, N> {
  using type = uint8x8_t;
};

template <>
struct Raw128<uint16_t, 8> {
  using type = uint16x8_t;
};
template <size_t N>
struct Raw128<uint16_t, N> {
  using type = uint16x4_t;
};

template <>
struct Raw128<uint32_t, 4> {
  using type = uint32x4_t;
};
template <size_t N>
struct Raw128<uint32_t, N> {
  using type = uint32x2_t;
};

template <>
struct Raw128<uint64_t, 2> {
  using type = uint64x2_t;
};
template <>
struct Raw128<uint64_t, 1> {
  using type = uint64x1_t;
};

template <>
struct Raw128<int8_t, 16> {
  using type = int8x16_t;
};
template <size_t N>
struct Raw128<int8_t, N> {
  using type = int8x8_t;
};

template <>
struct Raw128<int16_t, 8> {
  using type = int16x8_t;
};
template <size_t N>
struct Raw128<int16_t, N> {
  using type = int16x4_t;
};

template <>
struct Raw128<int32_t, 4> {
  using type = int32x4_t;
};
template <size_t N>
struct Raw128<int32_t, N> {
  using type = int32x2_t;
};

template <>
struct Raw128<int64_t, 2> {
  using type = int64x2_t;
};
template <>
struct Raw128<int64_t, 1> {
  using type = int64x1_t;
};

template <>
struct Raw128<float, 4> {
  using type = float32x4_t;
};
template <size_t N>
struct Raw128<float, N> {
  using type = float32x2_t;
};

#if HWY_HAVE_FLOAT64
template <>
struct Raw128<double, 2> {
  using type = float64x2_t;
};
template <>
struct Raw128<double, 1> {
  using type = float64x1_t;
};
#endif  // HWY_HAVE_FLOAT64

#if HWY_NEON_HAVE_F16C

template <>
struct Tuple2<float16_t, 8> {
  float16x8x2_t raw;
};
template <size_t N>
struct Tuple2<float16_t, N> {
  float16x4x2_t raw;
};

template <>
struct Tuple3<float16_t, 8> {
  float16x8x3_t raw;
};
template <size_t N>
struct Tuple3<float16_t, N> {
  float16x4x3_t raw;
};

template <>
struct Tuple4<float16_t, 8> {
  float16x8x4_t raw;
};
template <size_t N>
struct Tuple4<float16_t, N> {
  float16x4x4_t raw;
};

template <>
struct Raw128<float16_t, 8> {
  using type = float16x8_t;
};
template <size_t N>
struct Raw128<float16_t, N> {
  using type = float16x4_t;
};

#else  // !HWY_NEON_HAVE_F16C

template <size_t N>
struct Tuple2<float16_t, N> : public Tuple2<uint16_t, N> {};
template <size_t N>
struct Tuple3<float16_t, N> : public Tuple3<uint16_t, N> {};
template <size_t N>
struct Tuple4<float16_t, N> : public Tuple4<uint16_t, N> {};
template <size_t N>
struct Raw128<float16_t, N> : public Raw128<uint16_t, N> {};

#endif  // HWY_NEON_HAVE_F16C

#if HWY_NEON_HAVE_BFLOAT16

template <>
struct Tuple2<bfloat16_t, 8> {
  bfloat16x8x2_t raw;
};
template <size_t N>
struct Tuple2<bfloat16_t, N> {
  bfloat16x4x2_t raw;
};

template <>
struct Tuple3<bfloat16_t, 8> {
  bfloat16x8x3_t raw;
};
template <size_t N>
struct Tuple3<bfloat16_t, N> {
  bfloat16x4x3_t raw;
};

template <>
struct Tuple4<bfloat16_t, 8> {
  bfloat16x8x4_t raw;
};
template <size_t N>
struct Tuple4<bfloat16_t, N> {
  bfloat16x4x4_t raw;
};

template <>
struct Raw128<bfloat16_t, 8> {
  using type = bfloat16x8_t;
};
template <size_t N>
struct Raw128<bfloat16_t, N> {
  using type = bfloat16x4_t;
};

#else  // !HWY_NEON_HAVE_BFLOAT16

template <size_t N>
struct Tuple2<bfloat16_t, N> : public Tuple2<uint16_t, N> {};
template <size_t N>
struct Tuple3<bfloat16_t, N> : public Tuple3<uint16_t, N> {};
template <size_t N>
struct Tuple4<bfloat16_t, N> : public Tuple4<uint16_t, N> {};
template <size_t N>
struct Raw128<bfloat16_t, N> : public Raw128<uint16_t, N> {};

#endif  // HWY_NEON_HAVE_BFLOAT16

}  // namespace detail

template <typename T, size_t N = 16 / sizeof(T)>
class Vec128 {
 public:
  using Raw = typename detail::Raw128<T, N>::type;
  using PrivateT = T;                     // only for DFromV
  static constexpr size_t kPrivateN = N;  // only for DFromV

  HWY_INLINE Vec128() {}
  Vec128(const Vec128&) = default;
  Vec128& operator=(const Vec128&) = default;
  HWY_INLINE explicit Vec128(const Raw raw) : raw(raw) {}

  // Compound assignment. Only usable if there is a corresponding non-member
  // binary operator overload. For example, only f32 and f64 support division.
  HWY_INLINE Vec128& operator*=(const Vec128 other) {
    return *this = (*this * other);
  }
  HWY_INLINE Vec128& operator/=(const Vec128 other) {
    return *this = (*this / other);
  }
  HWY_INLINE Vec128& operator+=(const Vec128 other) {
    return *this = (*this + other);
  }
  HWY_INLINE Vec128& operator-=(const Vec128 other) {
    return *this = (*this - other);
  }
  HWY_INLINE Vec128& operator%=(const Vec128 other) {
    return *this = (*this % other);
  }
  HWY_INLINE Vec128& operator&=(const Vec128 other) {
    return *this = (*this & other);
  }
  HWY_INLINE Vec128& operator|=(const Vec128 other) {
    return *this = (*this | other);
  }
  HWY_INLINE Vec128& operator^=(const Vec128 other) {
    return *this = (*this ^ other);
  }

  Raw raw;
};

template <typename T>
using Vec64 = Vec128<T, 8 / sizeof(T)>;

template <typename T>
using Vec32 = Vec128<T, 4 / sizeof(T)>;

template <typename T>
using Vec16 = Vec128<T, 2 / sizeof(T)>;

// FF..FF or 0.
template <typename T, size_t N = 16 / sizeof(T)>
class Mask128 {
 public:
  // Arm C Language Extensions return and expect unsigned type.
  using Raw = typename detail::Raw128<MakeUnsigned<T>, N>::type;

  using PrivateT = T;                     // only for DFromM
  static constexpr size_t kPrivateN = N;  // only for DFromM

  HWY_INLINE Mask128() {}
  Mask128(const Mask128&) = default;
  Mask128& operator=(const Mask128&) = default;
  HWY_INLINE explicit Mask128(const Raw raw) : raw(raw) {}

  Raw raw;
};

template <typename T>
using Mask64 = Mask128<T, 8 / sizeof(T)>;

template <class V>
using DFromV = Simd<typename V::PrivateT, V::kPrivateN, 0>;

template <class M>
using DFromM = Simd<typename M::PrivateT, M::kPrivateN, 0>;

template <class V>
using TFromV = typename V::PrivateT;

// TODO(janwas): ForDemoteVectors, in convert_test and demote_test, appears to
// instantiate this with D = double x 4. The cause is unknown. Previously,
// defining this in terms of Set rejected that via SFINAE because only
// V_SIZE = 16 and V_SIZE <= 8 overloads were defined. As a workaround,
// truncate the lane count to 128 bits.
template <class D>
using VFromD =
    Vec128<TFromD<D>, HWY_MIN(16 / sizeof(TFromD<D>), MaxLanes(D()))>;

// ------------------------------ BitCast

namespace detail {

// Converts from Vec128<T, N> to Vec128<uint8_t, N * sizeof(T)> using the
// vreinterpret*_u8_*() set of functions.
#define HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#define HWY_NEON_BUILD_RET_HWY_CAST_TO_U8(type, size) \
  Vec128<uint8_t, size * sizeof(type##_t)>
#define HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8 v.raw

// Special case of u8 to u8 since vreinterpret*_u8_u8 is obviously not defined.
template <size_t N>
HWY_INLINE Vec128<uint8_t, N> BitCastToByte(Vec128<uint8_t, N> v) {
  return v;
}

HWY_NEON_DEF_FUNCTION_ALL_FLOATS(BitCastToByte, vreinterpret, _u8_,
                                 HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_BFLOAT_16(BitCastToByte, vreinterpret, _u8_,
                                HWY_CAST_TO_U8)

HWY_NEON_DEF_FUNCTION_INTS(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_16(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_32(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)
HWY_NEON_DEF_FUNCTION_UINT_64(BitCastToByte, vreinterpret, _u8_, HWY_CAST_TO_U8)

#if !HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_F16C
HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(BitCastToByte, vreinterpret, _u8_,
                                             HWY_CAST_TO_U8)
#else
template <size_t N>
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<float16_t, N> v) {
  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
}
#endif  // HWY_NEON_HAVE_F16C
#endif  // !HWY_HAVE_FLOAT16

#if !HWY_NEON_HAVE_BFLOAT16
template <size_t N>
HWY_INLINE Vec128<uint8_t, N * 2> BitCastToByte(Vec128<bfloat16_t, N> v) {
  return BitCastToByte(Vec128<uint16_t, N>(v.raw));
}
#endif  // !HWY_NEON_HAVE_BFLOAT16

#undef HWY_NEON_BUILD_TPL_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_RET_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_PARAM_HWY_CAST_TO_U8
#undef HWY_NEON_BUILD_ARG_HWY_CAST_TO_U8

template <class D, HWY_IF_U8_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */, VFromD<D> v) {
  return v;
}

// 64-bit or less:

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I8_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<RebindToUnsigned<D>> v) {
  return VFromD<D>(vreinterpret_s8_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_u16_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_s16_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_u32_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_s32_u8(v.raw));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
HWY_INLINE Vec64<uint64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<uint64_t>(vreinterpret_u64_u8(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
HWY_INLINE Vec64<int64_t> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<int64_t>(vreinterpret_s64_u8(v.raw));
}

// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
  return VFromD<D>(vreinterpret_f16_u8(v.raw));
#else
  const RebindToUnsigned<D> du;
  return VFromD<D>(BitCastFromByte(du, v).raw);
#endif
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_BF16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, VFromD<Repartition<uint8_t, D>> v) {
#if HWY_NEON_HAVE_BFLOAT16
  return VFromD<D>(vreinterpret_bf16_u8(v.raw));
#else
  const RebindToUnsigned<D> du;
  return VFromD<D>(BitCastFromByte(du, v).raw);
#endif
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D /* tag */,
                                     VFromD<Repartition<uint8_t, D>> v) {
  return VFromD<D>(vreinterpret_f32_u8(v.raw));
}

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F64_D(D)>
HWY_INLINE Vec64<double> BitCastFromByte(D /* tag */, Vec64<uint8_t> v) {
  return Vec64<double>(vreinterpret_f64_u8(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

// 128-bit full:

template <class D, HWY_IF_I8_D(D)>
HWY_INLINE Vec128<int8_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int8_t>(vreinterpretq_s8_u8(v.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_INLINE Vec128<uint16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint16_t>(vreinterpretq_u16_u8(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_INLINE Vec128<int16_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int16_t>(vreinterpretq_s16_u8(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_INLINE Vec128<uint32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint32_t>(vreinterpretq_u32_u8(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_INLINE Vec128<int32_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int32_t>(vreinterpretq_s32_u8(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_INLINE Vec128<uint64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint64_t>(vreinterpretq_u64_u8(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_INLINE Vec128<int64_t> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<int64_t>(vreinterpretq_s64_u8(v.raw));
}

template <class D, HWY_IF_F32_D(D)>
HWY_INLINE Vec128<float> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<float>(vreinterpretq_f32_u8(v.raw));
}

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_INLINE Vec128<double> BitCastFromByte(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<double>(vreinterpretq_f64_u8(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

// Cannot use HWY_NEON_IF_EMULATED_D due to the extra HWY_NEON_HAVE_F16C.
template <class D, HWY_IF_F16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
#if HWY_HAVE_FLOAT16 || HWY_NEON_HAVE_F16C
  return VFromD<D>(vreinterpretq_f16_u8(v.raw));
#else
  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
#endif
}

template <class D, HWY_IF_BF16_D(D)>
HWY_INLINE VFromD<D> BitCastFromByte(D, Vec128<uint8_t> v) {
#if HWY_NEON_HAVE_BFLOAT16
  return VFromD<D>(vreinterpretq_bf16_u8(v.raw));
#else
  return VFromD<D>(BitCastFromByte(RebindToUnsigned<D>(), v).raw);
#endif
}

}  // namespace detail

template <class D, class FromT>
HWY_API VFromD<D> BitCast(D d,
                          Vec128<FromT, Repartition<FromT, D>().MaxLanes()> v) {
  return detail::BitCastFromByte(d, detail::BitCastToByte(v));
}
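
// For illustration: BitCast reinterprets a vector's bits as another lane type
// of the same total vector size, e.g.
//   const Full128<float> df;
//   const RebindToUnsigned<decltype(df)> du;  // u32
//   BitCast(du, Set(df, 1.0f));  // four lanes, each 0x3F800000u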

// ------------------------------ ResizeBitCast

// <= 8-byte vector to <= 8-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Repartition<uint8_t, decltype(d)> du8;
  return BitCast(d, VFromD<decltype(du8)>{detail::BitCastToByte(v).raw});
}

// 16-byte vector to 16-byte vector: same as BitCast
template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  return BitCast(d, v);
}

// 16-byte vector to <= 8-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_V(FromV, 16),
          HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const DFromV<decltype(v)> d_from;
  const Half<decltype(d_from)> dh_from;
  return ResizeBitCast(d, LowerHalf(dh_from, v));
}

// <= 8-byte vector to 16-byte vector
template <class D, class FromV, HWY_IF_V_SIZE_LE_V(FromV, 8),
          HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> ResizeBitCast(D d, FromV v) {
  const Full64<TFromV<FromV>> d_full64_from;
  const Full128<TFromV<FromV>> d_full128_from;
  return BitCast(d, Combine(d_full128_from, Zero(d_full64_from),
                            ResizeBitCast(d_full64_from, v)));
}
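
// For illustration: unlike BitCast, ResizeBitCast also accepts differing
// vector sizes. Shrinking keeps the lower source bytes; growing from <= 8 to
// 16 bytes zero-fills the upper half (via Combine with Zero above), e.g.
//   const Full64<uint32_t> d2;
//   ResizeBitCast(d2, Iota(Full128<uint32_t>(), 0));  // lanes {0, 1}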

// ------------------------------ Set

namespace detail {
// We want to route any combination of N/kPow2 to the intrinsics depending on
// whether the requested size is <= 64 bits or 128. HWY_NEON_BUILD_TPL is
// unconditional and currently does not accept inputs (such as whether the
// vector is 64 or 128-bit). Thus we are not able to use HWY_IF_V_SIZE_D for
// SFINAE. We instead define a private NativeSet which receives a Simd<> whose
// kPow2 has already been folded into its N.
#define HWY_NEON_BUILD_TPL_HWY_SET
#define HWY_NEON_BUILD_RET_HWY_SET(type, size) Vec128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_SET(type, size) \
  Simd<type##_t, size, 0> /* tag */, type##_t t
#define HWY_NEON_BUILD_ARG_HWY_SET t

HWY_NEON_DEF_FUNCTION_ALL_TYPES(NativeSet, vdup, _n_, HWY_SET)
#if !HWY_HAVE_FLOAT16 && HWY_NEON_HAVE_F16C && HWY_HAVE_SCALAR_F16_TYPE
HWY_NEON_DEF_FUNCTION_FLOAT_16_UNCONDITIONAL(NativeSet, vdup, _n_, HWY_SET)
#endif
HWY_NEON_DEF_FUNCTION_BFLOAT_16(NativeSet, vdup, _n_, HWY_SET)

#if !HWY_NEON_HAVE_F16C || !HWY_HAVE_SCALAR_F16_TYPE
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> NativeSet(D d, TFromD<D> t) {
  const uint16_t tu = BitCastScalar<uint16_t>(t);
  return BitCast(d, Set(RebindToUnsigned<D>(), tu));
}
#endif

#if !HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> NativeSet(D d, TFromD<D> t) {
  const uint16_t tu = BitCastScalar<uint16_t>(t);
  return BitCast(d, Set(RebindToUnsigned<D>(), tu));
}
#endif

#undef HWY_NEON_BUILD_TPL_HWY_SET
#undef HWY_NEON_BUILD_RET_HWY_SET
#undef HWY_NEON_BUILD_PARAM_HWY_SET
#undef HWY_NEON_BUILD_ARG_HWY_SET

}  // namespace detail

// Full vector.
// Do not use a typename T = TFromD<D> argument because T will be deduced from
// the actual argument type, which can differ from TFromD<D>.
template <class D, HWY_IF_V_SIZE_D(D, 16), typename T>
HWY_INLINE VFromD<D> Set(D /* tag */, T t) {
  return detail::NativeSet(Full128<TFromD<D>>(), static_cast<TFromD<D>>(t));
}

// Partial vector: create 64-bit and return wrapper.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), typename T>
HWY_API VFromD<D> Set(D /* tag */, T t) {
  const Full64<TFromD<D>> dfull;
  return VFromD<D>(detail::NativeSet(dfull, static_cast<TFromD<D>>(t)).raw);
}

template <class D>
HWY_API VFromD<D> Zero(D d) {
  // Default ctor also works for bfloat16_t and float16_t.
  return Set(d, TFromD<D>{});
}
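
// For illustration: Set broadcasts a scalar to all lanes; full vectors use
// vdupq_n_* and partial vectors wrap the 64-bit vdup_n_* result, e.g.
//   Set(Full128<float>(), 3.0f);  // four lanes of 3.0f via vdupq_n_f32
//   Zero(Full64<uint16_t>());     // four zero lanes via vdup_n_u16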

HWY_DIAGNOSTICS(push)
HWY_DIAGNOSTICS_OFF(disable : 4700, ignored "-Wuninitialized")
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif

template <class D>
HWY_API VFromD<D> Undefined(D /*tag*/) {
#if HWY_HAS_BUILTIN(__builtin_nondeterministic_value)
  return VFromD<D>{__builtin_nondeterministic_value(Zero(D()).raw)};
#else
  VFromD<D> v;
  return v;
#endif
}

HWY_DIAGNOSTICS(pop)

#if !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL
namespace detail {

#pragma pack(push, 1)

template <class T>
struct alignas(8) Vec64ValsWrapper {
  static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
  static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
  T vals[8 / sizeof(T)];
};

#pragma pack(pop)

}  // namespace detail
#endif  // !HWY_COMPILER_GCC && !HWY_COMPILER_CLANGCL

template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> /*t8*/, TFromD<D> /*t9*/,
                                      TFromD<D> /*t10*/, TFromD<D> /*t11*/,
                                      TFromD<D> /*t12*/, TFromD<D> /*t13*/,
                                      TFromD<D> /*t14*/, TFromD<D> /*t15*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccI8RawVectType raw = {
      static_cast<int8_t>(t0), static_cast<int8_t>(t1), static_cast<int8_t>(t2),
      static_cast<int8_t>(t3), static_cast<int8_t>(t4), static_cast<int8_t>(t5),
      static_cast<int8_t>(t6), static_cast<int8_t>(t7)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(
      d, Set(Full64<uint64_t>(),
             BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
                 {t0, t1, t2, t3, t4, t5, t6, t7}})));
#endif
}

template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3,
                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccI16RawVectType raw = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(
      d, Set(Full64<uint64_t>(),
             BitCastScalar<uint64_t>(
                 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}})));
#endif
}

template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccI32RawVectType raw = {static_cast<int32_t>(t0),
                                 static_cast<int32_t>(t1)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(
      d, Set(Full64<uint64_t>(),
             BitCastScalar<uint64_t>(
                 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
#endif
}

template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> /*t2*/, TFromD<D> /*t3*/) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef float GccF32RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccF32RawVectType raw = {t0, t1};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  return ResizeBitCast(
      d, Set(Full64<uint64_t>(),
             BitCastScalar<uint64_t>(
                 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}})));
#endif
}

template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
  return Set(d, t0);
}

template <class D, HWY_IF_UI8_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
                                      TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
                                      TFromD<D> t11, TFromD<D> t12,
                                      TFromD<D> t13, TFromD<D> t14,
                                      TFromD<D> t15) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int8_t GccI8RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI8RawVectType raw = {
      static_cast<int8_t>(t0),  static_cast<int8_t>(t1),
      static_cast<int8_t>(t2),  static_cast<int8_t>(t3),
      static_cast<int8_t>(t4),  static_cast<int8_t>(t5),
      static_cast<int8_t>(t6),  static_cast<int8_t>(t7),
      static_cast<int8_t>(t8),  static_cast<int8_t>(t9),
      static_cast<int8_t>(t10), static_cast<int8_t>(t11),
      static_cast<int8_t>(t12), static_cast<int8_t>(t13),
      static_cast<int8_t>(t14), static_cast<int8_t>(t15)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d,
                 Dup128VecFromValues(dh, t8, t9, t10, t11, t12, t13, t14, t15,
                                     t8, t9, t10, t11, t12, t13, t14, t15),
                 Dup128VecFromValues(dh, t0, t1, t2, t3, t4, t5, t6, t7, t0, t1,
                                     t2, t3, t4, t5, t6, t7));
#endif
}

template <class D, HWY_IF_UI16_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int16_t GccI16RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI16RawVectType raw = {
      static_cast<int16_t>(t0), static_cast<int16_t>(t1),
      static_cast<int16_t>(t2), static_cast<int16_t>(t3),
      static_cast<int16_t>(t4), static_cast<int16_t>(t5),
      static_cast<int16_t>(t6), static_cast<int16_t>(t7)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Dup128VecFromValues(dh, t4, t5, t6, t7, t4, t5, t6, t7),
                 Dup128VecFromValues(dh, t0, t1, t2, t3, t0, t1, t2, t3));
#endif
}

template <class D, HWY_IF_UI32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int32_t GccI32RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI32RawVectType raw = {
      static_cast<int32_t>(t0), static_cast<int32_t>(t1),
      static_cast<int32_t>(t2), static_cast<int32_t>(t3)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
                 Dup128VecFromValues(dh, t0, t1, t0, t1));
#endif
}

template <class D, HWY_IF_F32_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef float GccF32RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccF32RawVectType raw = {t0, t1, t2, t3};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Dup128VecFromValues(dh, t2, t3, t2, t3),
                 Dup128VecFromValues(dh, t0, t1, t0, t1));
#endif
}

template <class D, HWY_IF_UI64_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef int64_t GccI64RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccI64RawVectType raw = {static_cast<int64_t>(t0),
                                 static_cast<int64_t>(t1)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Set(dh, t1), Set(dh, t0));
#endif
}

#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
#if HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL
  typedef double GccF64RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccF64RawVectType raw = {t0, t1};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
#else
  const Half<decltype(d)> dh;
  return Combine(d, Set(dh, t1), Set(dh, t0));
#endif
}
#endif

// Generic for all vector lengths
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}

#if (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C && \
    HWY_HAVE_SCALAR_F16_TYPE
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_LE_D(D, 8)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3,
                                      TFromD<D> /*t4*/, TFromD<D> /*t5*/,
                                      TFromD<D> /*t6*/, TFromD<D> /*t7*/) {
  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(8)));
  (void)d;
  const GccF16RawVectType raw = {
      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
      static_cast<__fp16>(t3)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
}
template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  typedef __fp16 GccF16RawVectType __attribute__((__vector_size__(16)));
  (void)d;
  const GccF16RawVectType raw = {
      static_cast<__fp16>(t0), static_cast<__fp16>(t1), static_cast<__fp16>(t2),
      static_cast<__fp16>(t3), static_cast<__fp16>(t4), static_cast<__fp16>(t5),
      static_cast<__fp16>(t6), static_cast<__fp16>(t7)};
  return VFromD<D>(reinterpret_cast<typename VFromD<D>::Raw>(raw));
}
#else
// Generic for all vector lengths if MSVC or !HWY_NEON_HAVE_F16C
template <class D, HWY_IF_F16_D(D)>
HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
                                      TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
                                      TFromD<D> t5, TFromD<D> t6,
                                      TFromD<D> t7) {
  const RebindToSigned<decltype(d)> di;
  return BitCast(d,
                 Dup128VecFromValues(
                     di, BitCastScalar<int16_t>(t0), BitCastScalar<int16_t>(t1),
                     BitCastScalar<int16_t>(t2), BitCastScalar<int16_t>(t3),
                     BitCastScalar<int16_t>(t4), BitCastScalar<int16_t>(t5),
                     BitCastScalar<int16_t>(t6), BitCastScalar<int16_t>(t7)));
}
#endif  // (HWY_COMPILER_GCC || HWY_COMPILER_CLANGCL) && HWY_NEON_HAVE_F16C
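
// For illustration: Dup128VecFromValues initializes lanes from the given
// per-lane values; for partial vectors, only the lanes that fit are used, e.g.
//   const Full128<uint32_t> d;
//   const Vec128<uint32_t> v = Dup128VecFromValues(d, 0u, 1u, 2u, 3u);
// matches a load of {0, 1, 2, 3}. detail::Iota0 below is built on this.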

namespace detail {

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(
      d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2}, TFromD<D>{3}, TFromD<D>{4},
      TFromD<D>{5}, TFromD<D>{6}, TFromD<D>{7}, TFromD<D>{8}, TFromD<D>{9},
      TFromD<D>{10}, TFromD<D>{11}, TFromD<D>{12}, TFromD<D>{13}, TFromD<D>{14},
      TFromD<D>{15});
}

template <class D, HWY_IF_UI16_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
                             TFromD<D>{3}, TFromD<D>{4}, TFromD<D>{5},
                             TFromD<D>{6}, TFromD<D>{7});
}

template <class D, HWY_IF_F16_D(D)>
HWY_INLINE VFromD<D> Iota0(D d) {
  const RebindToUnsigned<decltype(d)> du;
  // Bit patterns of the binary16 values 0.0, 1.0, ..., 7.0.
  return BitCast(d, Dup128VecFromValues(du, uint16_t{0}, uint16_t{0x3C00},
                                        uint16_t{0x4000}, uint16_t{0x4200},
                                        uint16_t{0x4400}, uint16_t{0x4500},
                                        uint16_t{0x4600}, uint16_t{0x4700}));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1}, TFromD<D>{2},
                             TFromD<D>{3});
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<D> Iota0(D d) {
  return Dup128VecFromValues(d, TFromD<D>{0}, TFromD<D>{1});
}

#if HWY_COMPILER_MSVC
template <class V, HWY_IF_V_SIZE_LE_V(V, 4)>
static HWY_INLINE V MaskOutIota(V v) {
  constexpr size_t kVecSizeInBytes = HWY_MAX_LANES_V(V) * sizeof(TFromV<V>);
  constexpr uint64_t kU64MaskOutMask =
      hwy::LimitsMax<hwy::UnsignedFromSize<kVecSizeInBytes>>();

  const DFromV<decltype(v)> d;
  const Repartition<uint8_t, decltype(d)> du8;
  using VU8 = VFromD<decltype(du8)>;
  const auto mask_out_mask =
      BitCast(d, VU8(vreinterpret_u8_u64(vdup_n_u64(kU64MaskOutMask))));
  return v & mask_out_mask;
}
template <class V, HWY_IF_V_SIZE_GT_V(V, 4)>
static HWY_INLINE V MaskOutIota(V v) {
  return v;
}
#endif

}  // namespace detail

template <class D, typename T2>
HWY_API VFromD<D> Iota(D d, const T2 first) {
  const auto result_iota =
      detail::Iota0(d) + Set(d, static_cast<TFromD<D>>(first));
#if HWY_COMPILER_MSVC
  return detail::MaskOutIota(result_iota);
#else
  return result_iota;
#endif
}
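
// For illustration: Iota(d, first) yields {first, first + 1, ...} per lane,
// e.g. Iota(Full128<int32_t>(), 5) is {5, 6, 7, 8}. The MSVC-only MaskOutIota
// above clears the bytes beyond the requested vector size (<= 4 bytes), where
// the raw 64-bit register would otherwise hold leftover iota values.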

// ------------------------------ Combine

// Full result
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> Combine(D /* tag */, Vec64<uint8_t> hi,
                                Vec64<uint8_t> lo) {
  return Vec128<uint8_t>(vcombine_u8(lo.raw, hi.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> Combine(D /* tag */, Vec64<uint16_t> hi,
                                 Vec64<uint16_t> lo) {
  return Vec128<uint16_t>(vcombine_u16(lo.raw, hi.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> Combine(D /* tag */, Vec64<uint32_t> hi,
                                 Vec64<uint32_t> lo) {
  return Vec128<uint32_t>(vcombine_u32(lo.raw, hi.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> Combine(D /* tag */, Vec64<uint64_t> hi,
                                 Vec64<uint64_t> lo) {
  return Vec128<uint64_t>(vcombine_u64(lo.raw, hi.raw));
}

template <class D, HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> Combine(D /* tag */, Vec64<int8_t> hi,
                               Vec64<int8_t> lo) {
  return Vec128<int8_t>(vcombine_s8(lo.raw, hi.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> Combine(D /* tag */, Vec64<int16_t> hi,
                                Vec64<int16_t> lo) {
  return Vec128<int16_t>(vcombine_s16(lo.raw, hi.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> Combine(D /* tag */, Vec64<int32_t> hi,
                                Vec64<int32_t> lo) {
  return Vec128<int32_t>(vcombine_s32(lo.raw, hi.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> Combine(D /* tag */, Vec64<int64_t> hi,
                                Vec64<int64_t> lo) {
  return Vec128<int64_t>(vcombine_s64(lo.raw, hi.raw));
}

#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> Combine(D, Vec64<float16_t> hi, Vec64<float16_t> lo) {
  return Vec128<float16_t>(vcombine_f16(lo.raw, hi.raw));
}
#endif  // HWY_HAVE_FLOAT16

#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_BF16_D(D)>
HWY_API VFromD<D> Combine(D, Vec64<bfloat16_t> hi, Vec64<bfloat16_t> lo) {
  return VFromD<D>(vcombine_bf16(lo.raw, hi.raw));
}
#endif  // HWY_NEON_HAVE_BFLOAT16

template <class D, class DH = Half<D>, HWY_NEON_IF_EMULATED_D(D)>
HWY_API VFromD<D> Combine(D d, VFromD<DH> hi, VFromD<DH> lo) {
  const RebindToUnsigned<D> du;
  const Half<decltype(du)> duh;
  return BitCast(d, Combine(du, BitCast(duh, hi), BitCast(duh, lo)));
}

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> Combine(D /* tag */, Vec64<float> hi, Vec64<float> lo) {
  return Vec128<float>(vcombine_f32(lo.raw, hi.raw));
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> Combine(D /* tag */, Vec64<double> hi,
                               Vec64<double> lo) {
  return Vec128<double>(vcombine_f64(lo.raw, hi.raw));
}
#endif  // HWY_HAVE_FLOAT64

// ------------------------------ GetLane

namespace detail {
#define HWY_NEON_BUILD_TPL_HWY_GET template <size_t kLane>
#define HWY_NEON_BUILD_RET_HWY_GET(type, size) type##_t
#define HWY_NEON_BUILD_PARAM_HWY_GET(type, size) Vec128<type##_t, size> v
#define HWY_NEON_BUILD_ARG_HWY_GET v.raw, kLane

HWY_NEON_DEF_FUNCTION_ALL_TYPES(GetLane, vget, _lane_, HWY_GET)
HWY_NEON_DEF_FUNCTION_BFLOAT_16(GetLane, vget, _lane_, HWY_GET)

template <size_t kLane, class V, HWY_NEON_IF_EMULATED_D(DFromV<V>)>
static HWY_INLINE HWY_MAYBE_UNUSED TFromV<V> GetLane(V v) {
  const DFromV<decltype(v)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCastScalar<TFromV<V>>(GetLane<kLane>(BitCast(du, v)));
}

#undef HWY_NEON_BUILD_TPL_HWY_GET
#undef HWY_NEON_BUILD_RET_HWY_GET
#undef HWY_NEON_BUILD_PARAM_HWY_GET
#undef HWY_NEON_BUILD_ARG_HWY_GET

}  // namespace detail

template <class V>
HWY_API TFromV<V> GetLane(const V v) {
  return detail::GetLane<0>(v);
}
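
// For illustration: GetLane returns lane 0 as a scalar, e.g.
//   GetLane(Iota(Full128<int32_t>(), 5));  // 5
// ExtractLane below generalizes this to a runtime index i: when i is a
// compile-time constant, a switch selects the corresponding GetLane<kLane>;
// otherwise the vector is stored to a stack buffer and indexed.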
template <size_t kLane, class V, HWY_NEON_IF_EMULATED_D(DFromV<V>)> 1645 static HWY_INLINE HWY_MAYBE_UNUSED TFromV<V> GetLane(V v) { 1646 const DFromV<decltype(v)> d; 1647 const RebindToUnsigned<decltype(d)> du; 1648 return BitCastScalar<TFromV<V>>(GetLane<kLane>(BitCast(du, v))); 1649 } 1650 1651 #undef HWY_NEON_BUILD_TPL_HWY_GET 1652 #undef HWY_NEON_BUILD_RET_HWY_GET 1653 #undef HWY_NEON_BUILD_PARAM_HWY_GET 1654 #undef HWY_NEON_BUILD_ARG_HWY_GET 1655 1656 } // namespace detail 1657 1658 template <class V> 1659 HWY_API TFromV<V> GetLane(const V v) { 1660 return detail::GetLane<0>(v); 1661 } 1662 1663 // ------------------------------ ExtractLane 1664 1665 // Requires one overload per vector length because GetLane<3> is a compile error 1666 // if v is a uint32x2_t. 1667 template <typename T> 1668 HWY_API T ExtractLane(const Vec128<T, 1> v, size_t i) { 1669 HWY_DASSERT(i == 0); 1670 (void)i; 1671 return detail::GetLane<0>(v); 1672 } 1673 1674 template <typename T> 1675 HWY_API T ExtractLane(const Vec128<T, 2> v, size_t i) { 1676 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 1677 if (__builtin_constant_p(i)) { 1678 switch (i) { 1679 case 0: 1680 return detail::GetLane<0>(v); 1681 case 1: 1682 return detail::GetLane<1>(v); 1683 } 1684 } 1685 #endif 1686 alignas(16) T lanes[2]; 1687 Store(v, DFromV<decltype(v)>(), lanes); 1688 return lanes[i]; 1689 } 1690 1691 template <typename T> 1692 HWY_API T ExtractLane(const Vec128<T, 4> v, size_t i) { 1693 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 1694 if (__builtin_constant_p(i)) { 1695 switch (i) { 1696 case 0: 1697 return detail::GetLane<0>(v); 1698 case 1: 1699 return detail::GetLane<1>(v); 1700 case 2: 1701 return detail::GetLane<2>(v); 1702 case 3: 1703 return detail::GetLane<3>(v); 1704 } 1705 } 1706 #endif 1707 alignas(16) T lanes[4]; 1708 Store(v, DFromV<decltype(v)>(), lanes); 1709 return lanes[i]; 1710 } 1711 1712 template <typename T> 1713 HWY_API T ExtractLane(const Vec128<T, 8> v, size_t i) { 1714 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 1715 if (__builtin_constant_p(i)) { 1716 switch (i) { 1717 case 0: 1718 return detail::GetLane<0>(v); 1719 case 1: 1720 return detail::GetLane<1>(v); 1721 case 2: 1722 return detail::GetLane<2>(v); 1723 case 3: 1724 return detail::GetLane<3>(v); 1725 case 4: 1726 return detail::GetLane<4>(v); 1727 case 5: 1728 return detail::GetLane<5>(v); 1729 case 6: 1730 return detail::GetLane<6>(v); 1731 case 7: 1732 return detail::GetLane<7>(v); 1733 } 1734 } 1735 #endif 1736 alignas(16) T lanes[8]; 1737 Store(v, DFromV<decltype(v)>(), lanes); 1738 return lanes[i]; 1739 } 1740 1741 template <typename T> 1742 HWY_API T ExtractLane(const Vec128<T, 16> v, size_t i) { 1743 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 1744 if (__builtin_constant_p(i)) { 1745 switch (i) { 1746 case 0: 1747 return detail::GetLane<0>(v); 1748 case 1: 1749 return detail::GetLane<1>(v); 1750 case 2: 1751 return detail::GetLane<2>(v); 1752 case 3: 1753 return detail::GetLane<3>(v); 1754 case 4: 1755 return detail::GetLane<4>(v); 1756 case 5: 1757 return detail::GetLane<5>(v); 1758 case 6: 1759 return detail::GetLane<6>(v); 1760 case 7: 1761 return detail::GetLane<7>(v); 1762 case 8: 1763 return detail::GetLane<8>(v); 1764 case 9: 1765 return detail::GetLane<9>(v); 1766 case 10: 1767 return detail::GetLane<10>(v); 1768 case 11: 1769 return detail::GetLane<11>(v); 1770 case 12: 1771 return detail::GetLane<12>(v); 1772 case 13: 1773 return detail::GetLane<13>(v); 1774 case 
14: 1775 return detail::GetLane<14>(v); 1776 case 15: 1777 return detail::GetLane<15>(v); 1778 } 1779 } 1780 #endif 1781 alignas(16) T lanes[16]; 1782 Store(v, DFromV<decltype(v)>(), lanes); 1783 return lanes[i]; 1784 } 1785 1786 // ------------------------------ InsertLane 1787 1788 namespace detail { 1789 #define HWY_NEON_BUILD_TPL_HWY_INSERT template <size_t kLane> 1790 #define HWY_NEON_BUILD_RET_HWY_INSERT(type, size) Vec128<type##_t, size> 1791 #define HWY_NEON_BUILD_PARAM_HWY_INSERT(type, size) \ 1792 Vec128<type##_t, size> v, type##_t t 1793 #define HWY_NEON_BUILD_ARG_HWY_INSERT t, v.raw, kLane 1794 1795 HWY_NEON_DEF_FUNCTION_ALL_TYPES(InsertLane, vset, _lane_, HWY_INSERT) 1796 HWY_NEON_DEF_FUNCTION_BFLOAT_16(InsertLane, vset, _lane_, HWY_INSERT) 1797 1798 #undef HWY_NEON_BUILD_TPL_HWY_INSERT 1799 #undef HWY_NEON_BUILD_RET_HWY_INSERT 1800 #undef HWY_NEON_BUILD_PARAM_HWY_INSERT 1801 #undef HWY_NEON_BUILD_ARG_HWY_INSERT 1802 1803 template <size_t kLane, class V, class D = DFromV<V>, HWY_NEON_IF_EMULATED_D(D)> 1804 HWY_API V InsertLane(const V v, TFromD<D> t) { 1805 const D d; 1806 const RebindToUnsigned<D> du; 1807 const uint16_t tu = BitCastScalar<uint16_t>(t); 1808 return BitCast(d, InsertLane<kLane>(BitCast(du, v), tu)); 1809 } 1810 1811 } // namespace detail 1812 1813 // Requires one overload per vector length because InsertLane<3> may be a 1814 // compile error. 1815 1816 template <typename T> 1817 HWY_API Vec128<T, 1> InsertLane(const Vec128<T, 1> v, size_t i, T t) { 1818 HWY_DASSERT(i == 0); 1819 (void)i; 1820 return Set(DFromV<decltype(v)>(), t); 1821 } 1822 1823 template <typename T> 1824 HWY_API Vec128<T, 2> InsertLane(const Vec128<T, 2> v, size_t i, T t) { 1825 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 1826 if (__builtin_constant_p(i)) { 1827 switch (i) { 1828 case 0: 1829 return detail::InsertLane<0>(v, t); 1830 case 1: 1831 return detail::InsertLane<1>(v, t); 1832 } 1833 } 1834 #endif 1835 const DFromV<decltype(v)> d; 1836 alignas(16) T lanes[2]; 1837 Store(v, d, lanes); 1838 lanes[i] = t; 1839 return Load(d, lanes); 1840 } 1841 1842 template <typename T> 1843 HWY_API Vec128<T, 4> InsertLane(const Vec128<T, 4> v, size_t i, T t) { 1844 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 1845 if (__builtin_constant_p(i)) { 1846 switch (i) { 1847 case 0: 1848 return detail::InsertLane<0>(v, t); 1849 case 1: 1850 return detail::InsertLane<1>(v, t); 1851 case 2: 1852 return detail::InsertLane<2>(v, t); 1853 case 3: 1854 return detail::InsertLane<3>(v, t); 1855 } 1856 } 1857 #endif 1858 const DFromV<decltype(v)> d; 1859 alignas(16) T lanes[4]; 1860 Store(v, d, lanes); 1861 lanes[i] = t; 1862 return Load(d, lanes); 1863 } 1864 1865 template <typename T> 1866 HWY_API Vec128<T, 8> InsertLane(const Vec128<T, 8> v, size_t i, T t) { 1867 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 1868 if (__builtin_constant_p(i)) { 1869 switch (i) { 1870 case 0: 1871 return detail::InsertLane<0>(v, t); 1872 case 1: 1873 return detail::InsertLane<1>(v, t); 1874 case 2: 1875 return detail::InsertLane<2>(v, t); 1876 case 3: 1877 return detail::InsertLane<3>(v, t); 1878 case 4: 1879 return detail::InsertLane<4>(v, t); 1880 case 5: 1881 return detail::InsertLane<5>(v, t); 1882 case 6: 1883 return detail::InsertLane<6>(v, t); 1884 case 7: 1885 return detail::InsertLane<7>(v, t); 1886 } 1887 } 1888 #endif 1889 const DFromV<decltype(v)> d; 1890 alignas(16) T lanes[8]; 1891 Store(v, d, lanes); 1892 lanes[i] = t; 1893 return Load(d, lanes); 1894 } 1895 1896 
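// As with ExtractLane, a call with a literal index such as InsertLane(v, 3, t)
// is detected via __builtin_constant_p and collapses to a single vset(q)_lane
// instruction instead of the store/modify/reload fallback below the switch.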
template <typename T> 1897 HWY_API Vec128<T, 16> InsertLane(const Vec128<T, 16> v, size_t i, T t) { 1898 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 1899 if (__builtin_constant_p(i)) { 1900 switch (i) { 1901 case 0: 1902 return detail::InsertLane<0>(v, t); 1903 case 1: 1904 return detail::InsertLane<1>(v, t); 1905 case 2: 1906 return detail::InsertLane<2>(v, t); 1907 case 3: 1908 return detail::InsertLane<3>(v, t); 1909 case 4: 1910 return detail::InsertLane<4>(v, t); 1911 case 5: 1912 return detail::InsertLane<5>(v, t); 1913 case 6: 1914 return detail::InsertLane<6>(v, t); 1915 case 7: 1916 return detail::InsertLane<7>(v, t); 1917 case 8: 1918 return detail::InsertLane<8>(v, t); 1919 case 9: 1920 return detail::InsertLane<9>(v, t); 1921 case 10: 1922 return detail::InsertLane<10>(v, t); 1923 case 11: 1924 return detail::InsertLane<11>(v, t); 1925 case 12: 1926 return detail::InsertLane<12>(v, t); 1927 case 13: 1928 return detail::InsertLane<13>(v, t); 1929 case 14: 1930 return detail::InsertLane<14>(v, t); 1931 case 15: 1932 return detail::InsertLane<15>(v, t); 1933 } 1934 } 1935 #endif 1936 const DFromV<decltype(v)> d; 1937 alignas(16) T lanes[16]; 1938 Store(v, d, lanes); 1939 lanes[i] = t; 1940 return Load(d, lanes); 1941 } 1942 1943 // ================================================== ARITHMETIC 1944 1945 // ------------------------------ Addition 1946 HWY_NEON_DEF_FUNCTION_UINTS(operator+, vadd, _, 2) 1947 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator+, vadd, _, 2) 1948 1949 template <size_t N> 1950 HWY_API Vec128<int8_t, N> operator+(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { 1951 const DFromV<decltype(a)> d; 1952 const RebindToUnsigned<decltype(d)> du; 1953 return BitCast(d, BitCast(du, a) + BitCast(du, b)); 1954 } 1955 1956 template <size_t N> 1957 HWY_API Vec128<int16_t, N> operator+(Vec128<int16_t, N> a, 1958 Vec128<int16_t, N> b) { 1959 const DFromV<decltype(a)> d; 1960 const RebindToUnsigned<decltype(d)> du; 1961 return BitCast(d, BitCast(du, a) + BitCast(du, b)); 1962 } 1963 1964 template <size_t N> 1965 HWY_API Vec128<int32_t, N> operator+(Vec128<int32_t, N> a, 1966 Vec128<int32_t, N> b) { 1967 const DFromV<decltype(a)> d; 1968 const RebindToUnsigned<decltype(d)> du; 1969 return BitCast(d, BitCast(du, a) + BitCast(du, b)); 1970 } 1971 1972 template <size_t N> 1973 HWY_API Vec128<int64_t, N> operator+(Vec128<int64_t, N> a, 1974 Vec128<int64_t, N> b) { 1975 const DFromV<decltype(a)> d; 1976 const RebindToUnsigned<decltype(d)> du; 1977 return BitCast(d, BitCast(du, a) + BitCast(du, b)); 1978 } 1979 1980 // ------------------------------ Subtraction 1981 HWY_NEON_DEF_FUNCTION_UINTS(operator-, vsub, _, 2) 1982 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator-, vsub, _, 2) 1983 1984 template <size_t N> 1985 HWY_API Vec128<int8_t, N> operator-(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { 1986 const DFromV<decltype(a)> d; 1987 const RebindToUnsigned<decltype(d)> du; 1988 return BitCast(d, BitCast(du, a) - BitCast(du, b)); 1989 } 1990 1991 template <size_t N> 1992 HWY_API Vec128<int16_t, N> operator-(Vec128<int16_t, N> a, 1993 Vec128<int16_t, N> b) { 1994 const DFromV<decltype(a)> d; 1995 const RebindToUnsigned<decltype(d)> du; 1996 return BitCast(d, BitCast(du, a) - BitCast(du, b)); 1997 } 1998 1999 template <size_t N> 2000 HWY_API Vec128<int32_t, N> operator-(Vec128<int32_t, N> a, 2001 Vec128<int32_t, N> b) { 2002 const DFromV<decltype(a)> d; 2003 const RebindToUnsigned<decltype(d)> du; 2004 return BitCast(d, BitCast(du, a) - BitCast(du, b)); 2005 } 2006 2007 template 
<size_t N> 2008 HWY_API Vec128<int64_t, N> operator-(Vec128<int64_t, N> a, 2009 Vec128<int64_t, N> b) { 2010 const DFromV<decltype(a)> d; 2011 const RebindToUnsigned<decltype(d)> du; 2012 return BitCast(d, BitCast(du, a) - BitCast(du, b)); 2013 } 2014 2015 // ------------------------------ SumsOf8 2016 2017 HWY_API Vec128<uint64_t> SumsOf8(const Vec128<uint8_t> v) { 2018 return Vec128<uint64_t>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(v.raw)))); 2019 } 2020 HWY_API Vec64<uint64_t> SumsOf8(const Vec64<uint8_t> v) { 2021 return Vec64<uint64_t>(vpaddl_u32(vpaddl_u16(vpaddl_u8(v.raw)))); 2022 } 2023 HWY_API Vec128<int64_t> SumsOf8(const Vec128<int8_t> v) { 2024 return Vec128<int64_t>(vpaddlq_s32(vpaddlq_s16(vpaddlq_s8(v.raw)))); 2025 } 2026 HWY_API Vec64<int64_t> SumsOf8(const Vec64<int8_t> v) { 2027 return Vec64<int64_t>(vpaddl_s32(vpaddl_s16(vpaddl_s8(v.raw)))); 2028 } 2029 2030 // ------------------------------ SumsOf2 2031 namespace detail { 2032 2033 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 2034 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2035 hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) { 2036 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s8(v.raw)); 2037 } 2038 2039 template <class V, HWY_IF_V_SIZE_V(V, 16)> 2040 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2041 hwy::SignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) { 2042 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s8(v.raw)); 2043 } 2044 2045 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 2046 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2047 hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) { 2048 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u8(v.raw)); 2049 } 2050 2051 template <class V, HWY_IF_V_SIZE_V(V, 16)> 2052 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2053 hwy::UnsignedTag, hwy::SizeTag<1> /*lane_size_tag*/, V v) { 2054 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u8(v.raw)); 2055 } 2056 2057 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 2058 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2059 hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) { 2060 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s16(v.raw)); 2061 } 2062 2063 template <class V, HWY_IF_V_SIZE_V(V, 16)> 2064 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2065 hwy::SignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) { 2066 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s16(v.raw)); 2067 } 2068 2069 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 2070 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2071 hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) { 2072 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u16(v.raw)); 2073 } 2074 2075 template <class V, HWY_IF_V_SIZE_V(V, 16)> 2076 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2077 hwy::UnsignedTag, hwy::SizeTag<2> /*lane_size_tag*/, V v) { 2078 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u16(v.raw)); 2079 } 2080 2081 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 2082 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2083 hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) { 2084 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_s32(v.raw)); 2085 } 2086 2087 template <class V, HWY_IF_V_SIZE_V(V, 16)> 2088 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2089 hwy::SignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) { 2090 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_s32(v.raw)); 2091 } 2092 2093 template <class V, 
HWY_IF_V_SIZE_LE_V(V, 8)> 2094 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2095 hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) { 2096 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddl_u32(v.raw)); 2097 } 2098 2099 template <class V, HWY_IF_V_SIZE_V(V, 16)> 2100 HWY_INLINE VFromD<RepartitionToWide<DFromV<V>>> SumsOf2( 2101 hwy::UnsignedTag, hwy::SizeTag<4> /*lane_size_tag*/, V v) { 2102 return VFromD<RepartitionToWide<DFromV<V>>>(vpaddlq_u32(v.raw)); 2103 } 2104 2105 } // namespace detail 2106 2107 // ------------------------------ SaturatedAdd 2108 2109 #ifdef HWY_NATIVE_I32_SATURATED_ADDSUB 2110 #undef HWY_NATIVE_I32_SATURATED_ADDSUB 2111 #else 2112 #define HWY_NATIVE_I32_SATURATED_ADDSUB 2113 #endif 2114 2115 #ifdef HWY_NATIVE_U32_SATURATED_ADDSUB 2116 #undef HWY_NATIVE_U32_SATURATED_ADDSUB 2117 #else 2118 #define HWY_NATIVE_U32_SATURATED_ADDSUB 2119 #endif 2120 2121 #ifdef HWY_NATIVE_I64_SATURATED_ADDSUB 2122 #undef HWY_NATIVE_I64_SATURATED_ADDSUB 2123 #else 2124 #define HWY_NATIVE_I64_SATURATED_ADDSUB 2125 #endif 2126 2127 #ifdef HWY_NATIVE_U64_SATURATED_ADDSUB 2128 #undef HWY_NATIVE_U64_SATURATED_ADDSUB 2129 #else 2130 #define HWY_NATIVE_U64_SATURATED_ADDSUB 2131 #endif 2132 2133 // Returns a + b clamped to the destination range. 2134 HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedAdd, vqadd, _, 2) 2135 2136 // ------------------------------ SaturatedSub 2137 2138 // Returns a - b clamped to the destination range. 2139 HWY_NEON_DEF_FUNCTION_INTS_UINTS(SaturatedSub, vqsub, _, 2) 2140 2141 // ------------------------------ Average 2142 2143 // Returns (a + b + 1) / 2 2144 2145 #ifdef HWY_NATIVE_AVERAGE_ROUND_UI32 2146 #undef HWY_NATIVE_AVERAGE_ROUND_UI32 2147 #else 2148 #define HWY_NATIVE_AVERAGE_ROUND_UI32 2149 #endif 2150 2151 HWY_NEON_DEF_FUNCTION_UI_8_16_32(AverageRound, vrhadd, _, 2) 2152 2153 // ------------------------------ Neg 2154 2155 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Neg, vneg, _, 1) 2156 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Neg, vneg, _, 1) // i64 implemented below 2157 2158 #if !HWY_HAVE_FLOAT16 2159 template <size_t N> 2160 HWY_API Vec128<float16_t, N> Neg(const Vec128<float16_t, N> v) { 2161 const DFromV<decltype(v)> d; 2162 const RebindToUnsigned<decltype(d)> du; 2163 using TU = TFromD<decltype(du)>; 2164 return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>()))); 2165 } 2166 #endif // !HWY_HAVE_FLOAT16 2167 2168 // There is no vneg for bf16, but we can cast to f16 (emulated or native). 
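// Negation only flips the sign bit, and f16 and bf16 both keep it in the MSB
// of the 16-bit pattern, so negating the f16 view also negates the bf16
// value; e.g. bf16 1.0f (0x3F80) becomes 0xBF80, i.e. -1.0f.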
2169 template <size_t N> 2170 HWY_API Vec128<bfloat16_t, N> Neg(const Vec128<bfloat16_t, N> v) { 2171 const DFromV<decltype(v)> d; 2172 const Rebind<float16_t, decltype(d)> df16; 2173 return BitCast(d, Neg(BitCast(df16, v))); 2174 } 2175 2176 HWY_API Vec64<int64_t> Neg(const Vec64<int64_t> v) { 2177 #if HWY_ARCH_ARM_A64 2178 return Vec64<int64_t>(vneg_s64(v.raw)); 2179 #else 2180 return Zero(DFromV<decltype(v)>()) - v; 2181 #endif 2182 } 2183 2184 HWY_API Vec128<int64_t> Neg(const Vec128<int64_t> v) { 2185 #if HWY_ARCH_ARM_A64 2186 return Vec128<int64_t>(vnegq_s64(v.raw)); 2187 #else 2188 return Zero(DFromV<decltype(v)>()) - v; 2189 #endif 2190 } 2191 2192 // ------------------------------ SaturatedNeg 2193 #ifdef HWY_NATIVE_SATURATED_NEG_8_16_32 2194 #undef HWY_NATIVE_SATURATED_NEG_8_16_32 2195 #else 2196 #define HWY_NATIVE_SATURATED_NEG_8_16_32 2197 #endif 2198 2199 HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedNeg, vqneg, _, 1) 2200 2201 #if HWY_ARCH_ARM_A64 2202 #ifdef HWY_NATIVE_SATURATED_NEG_64 2203 #undef HWY_NATIVE_SATURATED_NEG_64 2204 #else 2205 #define HWY_NATIVE_SATURATED_NEG_64 2206 #endif 2207 2208 HWY_API Vec64<int64_t> SaturatedNeg(const Vec64<int64_t> v) { 2209 return Vec64<int64_t>(vqneg_s64(v.raw)); 2210 } 2211 2212 HWY_API Vec128<int64_t> SaturatedNeg(const Vec128<int64_t> v) { 2213 return Vec128<int64_t>(vqnegq_s64(v.raw)); 2214 } 2215 #endif 2216 2217 // ------------------------------ ShiftLeft 2218 2219 #ifdef HWY_NATIVE_ROUNDING_SHR 2220 #undef HWY_NATIVE_ROUNDING_SHR 2221 #else 2222 #define HWY_NATIVE_ROUNDING_SHR 2223 #endif 2224 2225 // Customize HWY_NEON_DEF_FUNCTION to special-case count=0 (not supported). 2226 #pragma push_macro("HWY_NEON_DEF_FUNCTION") 2227 #undef HWY_NEON_DEF_FUNCTION 2228 #define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \ 2229 template <int kBits> \ 2230 HWY_API Vec128<type##_t, size> name(const Vec128<type##_t, size> v) { \ 2231 return kBits == 0 ? v \ 2232 : Vec128<type##_t, size>(HWY_NEON_EVAL( \ 2233 prefix##infix##suffix, v.raw, HWY_MAX(1, kBits))); \ 2234 } 2235 2236 HWY_NEON_DEF_FUNCTION_INTS_UINTS(ShiftLeft, vshl, _n_, ignored) 2237 2238 HWY_NEON_DEF_FUNCTION_UINTS(ShiftRight, vshr, _n_, ignored) 2239 HWY_NEON_DEF_FUNCTION_INTS(ShiftRight, vshr, _n_, ignored) 2240 HWY_NEON_DEF_FUNCTION_UINTS(RoundingShiftRight, vrshr, _n_, ignored) 2241 HWY_NEON_DEF_FUNCTION_INTS(RoundingShiftRight, vrshr, _n_, ignored) 2242 2243 #pragma pop_macro("HWY_NEON_DEF_FUNCTION") 2244 2245 // ------------------------------ RotateRight (ShiftRight, Or) 2246 template <int kBits, typename T, size_t N, HWY_IF_NOT_FLOAT_NOR_SPECIAL(T)> 2247 HWY_API Vec128<T, N> RotateRight(const Vec128<T, N> v) { 2248 const DFromV<decltype(v)> d; 2249 const RebindToUnsigned<decltype(d)> du; 2250 2251 constexpr size_t kSizeInBits = sizeof(T) * 8; 2252 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count"); 2253 if (kBits == 0) return v; 2254 2255 return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))), 2256 ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v)); 2257 } 2258 2259 // NOTE: vxarq_u64 can be applied to uint64_t, but we do not yet have a 2260 // mechanism for checking for extensions to Armv8. 
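// For reference, RotateRight<8> on uint32_t lanes turns 0x11223344 into
// 0x44112233: the low byte wraps around to the top.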
2261 2262 // ------------------------------ Shl 2263 2264 HWY_API Vec128<uint8_t> operator<<(Vec128<uint8_t> v, Vec128<uint8_t> bits) { 2265 return Vec128<uint8_t>(vshlq_u8(v.raw, vreinterpretq_s8_u8(bits.raw))); 2266 } 2267 template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)> 2268 HWY_API Vec128<uint8_t, N> operator<<(Vec128<uint8_t, N> v, 2269 Vec128<uint8_t, N> bits) { 2270 return Vec128<uint8_t, N>(vshl_u8(v.raw, vreinterpret_s8_u8(bits.raw))); 2271 } 2272 2273 HWY_API Vec128<uint16_t> operator<<(Vec128<uint16_t> v, Vec128<uint16_t> bits) { 2274 return Vec128<uint16_t>(vshlq_u16(v.raw, vreinterpretq_s16_u16(bits.raw))); 2275 } 2276 template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> 2277 HWY_API Vec128<uint16_t, N> operator<<(Vec128<uint16_t, N> v, 2278 Vec128<uint16_t, N> bits) { 2279 return Vec128<uint16_t, N>(vshl_u16(v.raw, vreinterpret_s16_u16(bits.raw))); 2280 } 2281 2282 HWY_API Vec128<uint32_t> operator<<(Vec128<uint32_t> v, Vec128<uint32_t> bits) { 2283 return Vec128<uint32_t>(vshlq_u32(v.raw, vreinterpretq_s32_u32(bits.raw))); 2284 } 2285 template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)> 2286 HWY_API Vec128<uint32_t, N> operator<<(Vec128<uint32_t, N> v, 2287 Vec128<uint32_t, N> bits) { 2288 return Vec128<uint32_t, N>(vshl_u32(v.raw, vreinterpret_s32_u32(bits.raw))); 2289 } 2290 2291 HWY_API Vec128<uint64_t> operator<<(Vec128<uint64_t> v, Vec128<uint64_t> bits) { 2292 return Vec128<uint64_t>(vshlq_u64(v.raw, vreinterpretq_s64_u64(bits.raw))); 2293 } 2294 HWY_API Vec64<uint64_t> operator<<(Vec64<uint64_t> v, Vec64<uint64_t> bits) { 2295 return Vec64<uint64_t>(vshl_u64(v.raw, vreinterpret_s64_u64(bits.raw))); 2296 } 2297 2298 HWY_API Vec128<int8_t> operator<<(Vec128<int8_t> v, Vec128<int8_t> bits) { 2299 return Vec128<int8_t>(vshlq_s8(v.raw, bits.raw)); 2300 } 2301 template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)> 2302 HWY_API Vec128<int8_t, N> operator<<(Vec128<int8_t, N> v, 2303 Vec128<int8_t, N> bits) { 2304 return Vec128<int8_t, N>(vshl_s8(v.raw, bits.raw)); 2305 } 2306 2307 HWY_API Vec128<int16_t> operator<<(Vec128<int16_t> v, Vec128<int16_t> bits) { 2308 return Vec128<int16_t>(vshlq_s16(v.raw, bits.raw)); 2309 } 2310 template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> 2311 HWY_API Vec128<int16_t, N> operator<<(Vec128<int16_t, N> v, 2312 Vec128<int16_t, N> bits) { 2313 return Vec128<int16_t, N>(vshl_s16(v.raw, bits.raw)); 2314 } 2315 2316 HWY_API Vec128<int32_t> operator<<(Vec128<int32_t> v, Vec128<int32_t> bits) { 2317 return Vec128<int32_t>(vshlq_s32(v.raw, bits.raw)); 2318 } 2319 template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)> 2320 HWY_API Vec128<int32_t, N> operator<<(Vec128<int32_t, N> v, 2321 Vec128<int32_t, N> bits) { 2322 return Vec128<int32_t, N>(vshl_s32(v.raw, bits.raw)); 2323 } 2324 2325 HWY_API Vec128<int64_t> operator<<(Vec128<int64_t> v, Vec128<int64_t> bits) { 2326 return Vec128<int64_t>(vshlq_s64(v.raw, bits.raw)); 2327 } 2328 HWY_API Vec64<int64_t> operator<<(Vec64<int64_t> v, Vec64<int64_t> bits) { 2329 return Vec64<int64_t>(vshl_s64(v.raw, bits.raw)); 2330 } 2331 2332 // ------------------------------ Shr (Neg) 2333 2334 HWY_API Vec128<uint8_t> operator>>(Vec128<uint8_t> v, Vec128<uint8_t> bits) { 2335 const RebindToSigned<DFromV<decltype(v)>> di; 2336 const int8x16_t neg_bits = Neg(BitCast(di, bits)).raw; 2337 return Vec128<uint8_t>(vshlq_u8(v.raw, neg_bits)); 2338 } 2339 template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)> 2340 HWY_API Vec128<uint8_t, N> operator>>(Vec128<uint8_t, N> v, 2341 Vec128<uint8_t, N> bits) { 2342 const 
RebindToSigned<DFromV<decltype(v)>> di; 2343 const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw; 2344 return Vec128<uint8_t, N>(vshl_u8(v.raw, neg_bits)); 2345 } 2346 2347 HWY_API Vec128<uint16_t> operator>>(Vec128<uint16_t> v, Vec128<uint16_t> bits) { 2348 const RebindToSigned<DFromV<decltype(v)>> di; 2349 const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw; 2350 return Vec128<uint16_t>(vshlq_u16(v.raw, neg_bits)); 2351 } 2352 template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> 2353 HWY_API Vec128<uint16_t, N> operator>>(Vec128<uint16_t, N> v, 2354 Vec128<uint16_t, N> bits) { 2355 const RebindToSigned<DFromV<decltype(v)>> di; 2356 const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw; 2357 return Vec128<uint16_t, N>(vshl_u16(v.raw, neg_bits)); 2358 } 2359 2360 HWY_API Vec128<uint32_t> operator>>(Vec128<uint32_t> v, Vec128<uint32_t> bits) { 2361 const RebindToSigned<DFromV<decltype(v)>> di; 2362 const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw; 2363 return Vec128<uint32_t>(vshlq_u32(v.raw, neg_bits)); 2364 } 2365 template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)> 2366 HWY_API Vec128<uint32_t, N> operator>>(Vec128<uint32_t, N> v, 2367 Vec128<uint32_t, N> bits) { 2368 const RebindToSigned<DFromV<decltype(v)>> di; 2369 const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw; 2370 return Vec128<uint32_t, N>(vshl_u32(v.raw, neg_bits)); 2371 } 2372 2373 HWY_API Vec128<uint64_t> operator>>(Vec128<uint64_t> v, Vec128<uint64_t> bits) { 2374 const RebindToSigned<DFromV<decltype(v)>> di; 2375 const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw; 2376 return Vec128<uint64_t>(vshlq_u64(v.raw, neg_bits)); 2377 } 2378 HWY_API Vec64<uint64_t> operator>>(Vec64<uint64_t> v, Vec64<uint64_t> bits) { 2379 const RebindToSigned<DFromV<decltype(v)>> di; 2380 const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw; 2381 return Vec64<uint64_t>(vshl_u64(v.raw, neg_bits)); 2382 } 2383 2384 HWY_API Vec128<int8_t> operator>>(Vec128<int8_t> v, Vec128<int8_t> bits) { 2385 return Vec128<int8_t>(vshlq_s8(v.raw, Neg(bits).raw)); 2386 } 2387 template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)> 2388 HWY_API Vec128<int8_t, N> operator>>(Vec128<int8_t, N> v, 2389 Vec128<int8_t, N> bits) { 2390 return Vec128<int8_t, N>(vshl_s8(v.raw, Neg(bits).raw)); 2391 } 2392 2393 HWY_API Vec128<int16_t> operator>>(Vec128<int16_t> v, Vec128<int16_t> bits) { 2394 return Vec128<int16_t>(vshlq_s16(v.raw, Neg(bits).raw)); 2395 } 2396 template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> 2397 HWY_API Vec128<int16_t, N> operator>>(Vec128<int16_t, N> v, 2398 Vec128<int16_t, N> bits) { 2399 return Vec128<int16_t, N>(vshl_s16(v.raw, Neg(bits).raw)); 2400 } 2401 2402 HWY_API Vec128<int32_t> operator>>(Vec128<int32_t> v, Vec128<int32_t> bits) { 2403 return Vec128<int32_t>(vshlq_s32(v.raw, Neg(bits).raw)); 2404 } 2405 template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)> 2406 HWY_API Vec128<int32_t, N> operator>>(Vec128<int32_t, N> v, 2407 Vec128<int32_t, N> bits) { 2408 return Vec128<int32_t, N>(vshl_s32(v.raw, Neg(bits).raw)); 2409 } 2410 2411 HWY_API Vec128<int64_t> operator>>(Vec128<int64_t> v, Vec128<int64_t> bits) { 2412 return Vec128<int64_t>(vshlq_s64(v.raw, Neg(bits).raw)); 2413 } 2414 HWY_API Vec64<int64_t> operator>>(Vec64<int64_t> v, Vec64<int64_t> bits) { 2415 return Vec64<int64_t>(vshl_s64(v.raw, Neg(bits).raw)); 2416 } 2417 2418 // ------------------------------ RoundingShr (Neg) 2419 2420 HWY_API Vec128<uint8_t> RoundingShr(Vec128<uint8_t> v, Vec128<uint8_t> bits) { 2421 const RebindToSigned<DFromV<decltype(v)>> di; 2422 const 
int8x16_t neg_bits = Neg(BitCast(di, bits)).raw; 2423 return Vec128<uint8_t>(vrshlq_u8(v.raw, neg_bits)); 2424 } 2425 template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)> 2426 HWY_API Vec128<uint8_t, N> RoundingShr(Vec128<uint8_t, N> v, 2427 Vec128<uint8_t, N> bits) { 2428 const RebindToSigned<DFromV<decltype(v)>> di; 2429 const int8x8_t neg_bits = Neg(BitCast(di, bits)).raw; 2430 return Vec128<uint8_t, N>(vrshl_u8(v.raw, neg_bits)); 2431 } 2432 2433 HWY_API Vec128<uint16_t> RoundingShr(Vec128<uint16_t> v, 2434 Vec128<uint16_t> bits) { 2435 const RebindToSigned<DFromV<decltype(v)>> di; 2436 const int16x8_t neg_bits = Neg(BitCast(di, bits)).raw; 2437 return Vec128<uint16_t>(vrshlq_u16(v.raw, neg_bits)); 2438 } 2439 template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> 2440 HWY_API Vec128<uint16_t, N> RoundingShr(Vec128<uint16_t, N> v, 2441 Vec128<uint16_t, N> bits) { 2442 const RebindToSigned<DFromV<decltype(v)>> di; 2443 const int16x4_t neg_bits = Neg(BitCast(di, bits)).raw; 2444 return Vec128<uint16_t, N>(vrshl_u16(v.raw, neg_bits)); 2445 } 2446 2447 HWY_API Vec128<uint32_t> RoundingShr(Vec128<uint32_t> v, 2448 Vec128<uint32_t> bits) { 2449 const RebindToSigned<DFromV<decltype(v)>> di; 2450 const int32x4_t neg_bits = Neg(BitCast(di, bits)).raw; 2451 return Vec128<uint32_t>(vrshlq_u32(v.raw, neg_bits)); 2452 } 2453 template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)> 2454 HWY_API Vec128<uint32_t, N> RoundingShr(Vec128<uint32_t, N> v, 2455 Vec128<uint32_t, N> bits) { 2456 const RebindToSigned<DFromV<decltype(v)>> di; 2457 const int32x2_t neg_bits = Neg(BitCast(di, bits)).raw; 2458 return Vec128<uint32_t, N>(vrshl_u32(v.raw, neg_bits)); 2459 } 2460 2461 HWY_API Vec128<uint64_t> RoundingShr(Vec128<uint64_t> v, 2462 Vec128<uint64_t> bits) { 2463 const RebindToSigned<DFromV<decltype(v)>> di; 2464 const int64x2_t neg_bits = Neg(BitCast(di, bits)).raw; 2465 return Vec128<uint64_t>(vrshlq_u64(v.raw, neg_bits)); 2466 } 2467 HWY_API Vec64<uint64_t> RoundingShr(Vec64<uint64_t> v, Vec64<uint64_t> bits) { 2468 const RebindToSigned<DFromV<decltype(v)>> di; 2469 const int64x1_t neg_bits = Neg(BitCast(di, bits)).raw; 2470 return Vec64<uint64_t>(vrshl_u64(v.raw, neg_bits)); 2471 } 2472 2473 HWY_API Vec128<int8_t> RoundingShr(Vec128<int8_t> v, Vec128<int8_t> bits) { 2474 return Vec128<int8_t>(vrshlq_s8(v.raw, Neg(bits).raw)); 2475 } 2476 template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)> 2477 HWY_API Vec128<int8_t, N> RoundingShr(Vec128<int8_t, N> v, 2478 Vec128<int8_t, N> bits) { 2479 return Vec128<int8_t, N>(vrshl_s8(v.raw, Neg(bits).raw)); 2480 } 2481 2482 HWY_API Vec128<int16_t> RoundingShr(Vec128<int16_t> v, Vec128<int16_t> bits) { 2483 return Vec128<int16_t>(vrshlq_s16(v.raw, Neg(bits).raw)); 2484 } 2485 template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> 2486 HWY_API Vec128<int16_t, N> RoundingShr(Vec128<int16_t, N> v, 2487 Vec128<int16_t, N> bits) { 2488 return Vec128<int16_t, N>(vrshl_s16(v.raw, Neg(bits).raw)); 2489 } 2490 2491 HWY_API Vec128<int32_t> RoundingShr(Vec128<int32_t> v, Vec128<int32_t> bits) { 2492 return Vec128<int32_t>(vrshlq_s32(v.raw, Neg(bits).raw)); 2493 } 2494 template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)> 2495 HWY_API Vec128<int32_t, N> RoundingShr(Vec128<int32_t, N> v, 2496 Vec128<int32_t, N> bits) { 2497 return Vec128<int32_t, N>(vrshl_s32(v.raw, Neg(bits).raw)); 2498 } 2499 2500 HWY_API Vec128<int64_t> RoundingShr(Vec128<int64_t> v, Vec128<int64_t> bits) { 2501 return Vec128<int64_t>(vrshlq_s64(v.raw, Neg(bits).raw)); 2502 } 2503 HWY_API Vec64<int64_t> 
RoundingShr(Vec64<int64_t> v, Vec64<int64_t> bits) { 2504 return Vec64<int64_t>(vrshl_s64(v.raw, Neg(bits).raw)); 2505 } 2506 2507 // ------------------------------ ShiftLeftSame (Shl) 2508 2509 template <typename T, size_t N> 2510 HWY_API Vec128<T, N> ShiftLeftSame(const Vec128<T, N> v, int bits) { 2511 return v << Set(DFromV<decltype(v)>(), static_cast<T>(bits)); 2512 } 2513 template <typename T, size_t N> 2514 HWY_API Vec128<T, N> ShiftRightSame(const Vec128<T, N> v, int bits) { 2515 return v >> Set(DFromV<decltype(v)>(), static_cast<T>(bits)); 2516 } 2517 2518 // ------------------------------ RoundingShiftRightSame (RoundingShr) 2519 2520 template <typename T, size_t N> 2521 HWY_API Vec128<T, N> RoundingShiftRightSame(const Vec128<T, N> v, int bits) { 2522 return RoundingShr(v, Set(DFromV<decltype(v)>(), static_cast<T>(bits))); 2523 } 2524 2525 // ------------------------------ Int/float multiplication 2526 2527 // Per-target flag to prevent generic_ops-inl.h from defining 8-bit operator*. 2528 #ifdef HWY_NATIVE_MUL_8 2529 #undef HWY_NATIVE_MUL_8 2530 #else 2531 #define HWY_NATIVE_MUL_8 2532 #endif 2533 2534 // All except ui64 2535 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator*, vmul, _, 2) 2536 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator*, vmul, _, 2) 2537 2538 template <size_t N> 2539 HWY_API Vec128<int8_t, N> operator*(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { 2540 const DFromV<decltype(a)> d; 2541 const RebindToUnsigned<decltype(d)> du; 2542 return BitCast(d, BitCast(du, a) * BitCast(du, b)); 2543 } 2544 2545 template <size_t N> 2546 HWY_API Vec128<int16_t, N> operator*(Vec128<int16_t, N> a, 2547 Vec128<int16_t, N> b) { 2548 const DFromV<decltype(a)> d; 2549 const RebindToUnsigned<decltype(d)> du; 2550 return BitCast(d, BitCast(du, a) * BitCast(du, b)); 2551 } 2552 2553 template <size_t N> 2554 HWY_API Vec128<int32_t, N> operator*(Vec128<int32_t, N> a, 2555 Vec128<int32_t, N> b) { 2556 const DFromV<decltype(a)> d; 2557 const RebindToUnsigned<decltype(d)> du; 2558 return BitCast(d, BitCast(du, a) * BitCast(du, b)); 2559 } 2560 2561 // ------------------------------ Integer multiplication 2562 2563 // Returns the upper sizeof(T)*8 bits of a * b in each lane. 
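// Worked example for uint16_t lanes: MulHigh(Set(d, uint16_t{0x1234}),
// Set(d, uint16_t{0x5678})) returns 0x0626 in every lane, because
// 0x1234 * 0x5678 == 0x06260060 and only the upper 16 bits are kept.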
2564 HWY_API Vec128<int8_t> MulHigh(Vec128<int8_t> a, Vec128<int8_t> b) { 2565 int16x8_t rlo = vmull_s8(vget_low_s8(a.raw), vget_low_s8(b.raw)); 2566 #if HWY_ARCH_ARM_A64 2567 int16x8_t rhi = vmull_high_s8(a.raw, b.raw); 2568 #else 2569 int16x8_t rhi = vmull_s8(vget_high_s8(a.raw), vget_high_s8(b.raw)); 2570 #endif 2571 return Vec128<int8_t>( 2572 vuzp2q_s8(vreinterpretq_s8_s16(rlo), vreinterpretq_s8_s16(rhi))); 2573 } 2574 HWY_API Vec128<uint8_t> MulHigh(Vec128<uint8_t> a, Vec128<uint8_t> b) { 2575 uint16x8_t rlo = vmull_u8(vget_low_u8(a.raw), vget_low_u8(b.raw)); 2576 #if HWY_ARCH_ARM_A64 2577 uint16x8_t rhi = vmull_high_u8(a.raw, b.raw); 2578 #else 2579 uint16x8_t rhi = vmull_u8(vget_high_u8(a.raw), vget_high_u8(b.raw)); 2580 #endif 2581 return Vec128<uint8_t>( 2582 vuzp2q_u8(vreinterpretq_u8_u16(rlo), vreinterpretq_u8_u16(rhi))); 2583 } 2584 2585 template <size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8)> 2586 HWY_API Vec128<int8_t, N> MulHigh(Vec128<int8_t, N> a, Vec128<int8_t, N> b) { 2587 int8x16_t hi_lo = vreinterpretq_s8_s16(vmull_s8(a.raw, b.raw)); 2588 return Vec128<int8_t, N>(vget_low_s8(vuzp2q_s8(hi_lo, hi_lo))); 2589 } 2590 template <size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8)> 2591 HWY_API Vec128<uint8_t, N> MulHigh(Vec128<uint8_t, N> a, Vec128<uint8_t, N> b) { 2592 uint8x16_t hi_lo = vreinterpretq_u8_u16(vmull_u8(a.raw, b.raw)); 2593 return Vec128<uint8_t, N>(vget_low_u8(vuzp2q_u8(hi_lo, hi_lo))); 2594 } 2595 2596 HWY_API Vec128<int16_t> MulHigh(Vec128<int16_t> a, Vec128<int16_t> b) { 2597 int32x4_t rlo = vmull_s16(vget_low_s16(a.raw), vget_low_s16(b.raw)); 2598 #if HWY_ARCH_ARM_A64 2599 int32x4_t rhi = vmull_high_s16(a.raw, b.raw); 2600 #else 2601 int32x4_t rhi = vmull_s16(vget_high_s16(a.raw), vget_high_s16(b.raw)); 2602 #endif 2603 return Vec128<int16_t>( 2604 vuzp2q_s16(vreinterpretq_s16_s32(rlo), vreinterpretq_s16_s32(rhi))); 2605 } 2606 HWY_API Vec128<uint16_t> MulHigh(Vec128<uint16_t> a, Vec128<uint16_t> b) { 2607 uint32x4_t rlo = vmull_u16(vget_low_u16(a.raw), vget_low_u16(b.raw)); 2608 #if HWY_ARCH_ARM_A64 2609 uint32x4_t rhi = vmull_high_u16(a.raw, b.raw); 2610 #else 2611 uint32x4_t rhi = vmull_u16(vget_high_u16(a.raw), vget_high_u16(b.raw)); 2612 #endif 2613 return Vec128<uint16_t>( 2614 vuzp2q_u16(vreinterpretq_u16_u32(rlo), vreinterpretq_u16_u32(rhi))); 2615 } 2616 2617 template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)> 2618 HWY_API Vec128<int16_t, N> MulHigh(Vec128<int16_t, N> a, Vec128<int16_t, N> b) { 2619 int16x8_t hi_lo = vreinterpretq_s16_s32(vmull_s16(a.raw, b.raw)); 2620 return Vec128<int16_t, N>(vget_low_s16(vuzp2q_s16(hi_lo, hi_lo))); 2621 } 2622 template <size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8)> 2623 HWY_API Vec128<uint16_t, N> MulHigh(Vec128<uint16_t, N> a, 2624 Vec128<uint16_t, N> b) { 2625 uint16x8_t hi_lo = vreinterpretq_u16_u32(vmull_u16(a.raw, b.raw)); 2626 return Vec128<uint16_t, N>(vget_low_u16(vuzp2q_u16(hi_lo, hi_lo))); 2627 } 2628 2629 HWY_API Vec128<int32_t> MulHigh(Vec128<int32_t> a, Vec128<int32_t> b) { 2630 int64x2_t rlo = vmull_s32(vget_low_s32(a.raw), vget_low_s32(b.raw)); 2631 #if HWY_ARCH_ARM_A64 2632 int64x2_t rhi = vmull_high_s32(a.raw, b.raw); 2633 #else 2634 int64x2_t rhi = vmull_s32(vget_high_s32(a.raw), vget_high_s32(b.raw)); 2635 #endif 2636 return Vec128<int32_t>( 2637 vuzp2q_s32(vreinterpretq_s32_s64(rlo), vreinterpretq_s32_s64(rhi))); 2638 } 2639 HWY_API Vec128<uint32_t> MulHigh(Vec128<uint32_t> a, Vec128<uint32_t> b) { 2640 uint64x2_t rlo = vmull_u32(vget_low_u32(a.raw), vget_low_u32(b.raw)); 2641 #if HWY_ARCH_ARM_A64 2642 
uint64x2_t rhi = vmull_high_u32(a.raw, b.raw);
2643 #else
2644 uint64x2_t rhi = vmull_u32(vget_high_u32(a.raw), vget_high_u32(b.raw));
2645 #endif
2646 return Vec128<uint32_t>(
2647 vuzp2q_u32(vreinterpretq_u32_u64(rlo), vreinterpretq_u32_u64(rhi)));
2648 }
2649
2650 template <size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8)>
2651 HWY_API Vec128<int32_t, N> MulHigh(Vec128<int32_t, N> a, Vec128<int32_t, N> b) {
2652 int32x4_t hi_lo = vreinterpretq_s32_s64(vmull_s32(a.raw, b.raw));
2653 return Vec128<int32_t, N>(vget_low_s32(vuzp2q_s32(hi_lo, hi_lo)));
2654 }
2655 template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
2656 HWY_API Vec128<uint32_t, N> MulHigh(Vec128<uint32_t, N> a,
2657 Vec128<uint32_t, N> b) {
2658 uint32x4_t hi_lo = vreinterpretq_u32_u64(vmull_u32(a.raw, b.raw));
2659 return Vec128<uint32_t, N>(vget_low_u32(vuzp2q_u32(hi_lo, hi_lo)));
2660 }
2661
2662 template <class T, HWY_IF_UI64(T)>
2663 HWY_API Vec128<T> MulHigh(Vec128<T> a, Vec128<T> b) {
2664 T hi_0;
2665 T hi_1;
2666
2667 Mul128(GetLane(a), GetLane(b), &hi_0);
2668 Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi_1);
2669
2670 return Dup128VecFromValues(Full128<T>(), hi_0, hi_1);
2671 }
2672
2673 template <class T, HWY_IF_UI64(T)>
2674 HWY_API Vec64<T> MulHigh(Vec64<T> a, Vec64<T> b) {
2675 T hi;
2676 Mul128(GetLane(a), GetLane(b), &hi);
2677 return Set(Full64<T>(), hi);
2678 }
2679
2680 HWY_API Vec128<int16_t> MulFixedPoint15(Vec128<int16_t> a, Vec128<int16_t> b) {
2681 return Vec128<int16_t>(vqrdmulhq_s16(a.raw, b.raw));
2682 }
2683 template <size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8)>
2684 HWY_API Vec128<int16_t, N> MulFixedPoint15(Vec128<int16_t, N> a,
2685 Vec128<int16_t, N> b) {
2686 return Vec128<int16_t, N>(vqrdmulh_s16(a.raw, b.raw));
2687 }
2688
2689 // ------------------------------ Floating-point division
2690
2691 // Emulate missing intrinsic
2692 #if HWY_HAVE_FLOAT64 && HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
2693 HWY_INLINE float64x1_t vrecpe_f64(float64x1_t raw) {
2694 const CappedTag<double, 1> d;
2695 const Twice<decltype(d)> dt;
2696 using VT = VFromD<decltype(dt)>;
const VFromD<decltype(d)> v(raw);
2697 return LowerHalf(d, VT(vrecpeq_f64(Combine(dt, v, v).raw))).raw;
2698 }
2699 #endif
2700
2701 // Approximate reciprocal
2702 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ApproximateReciprocal, vrecpe, _, 1)
2703
2704 #if HWY_HAVE_FLOAT64
2705 #ifdef HWY_NATIVE_F64_APPROX_RECIP
2706 #undef HWY_NATIVE_F64_APPROX_RECIP
2707 #else
2708 #define HWY_NATIVE_F64_APPROX_RECIP
2709 #endif
2710
2711 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator/, vdiv, _, 2)
2712 #else // !HWY_HAVE_FLOAT64
2713 namespace detail {
2714 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ReciprocalNewtonRaphsonStep, vrecps, _, 2)
2715 } // namespace detail
2716
2717 template <typename T, size_t N, HWY_IF_FLOAT(T)>
2718 HWY_API Vec128<T, N> operator/(Vec128<T, N> a, Vec128<T, N> b) {
2719 auto x = ApproximateReciprocal(b);
2720 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
2721 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
2722 x *= detail::ReciprocalNewtonRaphsonStep(x, b);
2723 return a * x;
2724 }
2725 #endif // HWY_HAVE_FLOAT64
2726
2727 // ------------------------------ Absolute value of difference.
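// AbsDiff(a, b) computes |a - b| per lane; for unsigned types this equals
// Max(a, b) - Min(a, b) rather than the wrapped a - b, e.g.
// AbsDiff(2, 5) == 3.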
2728 2729 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(AbsDiff, vabd, _, 2) 2730 HWY_NEON_DEF_FUNCTION_UI_8_16_32(AbsDiff, vabd, _, 2) // no UI64 2731 2732 #ifdef HWY_NATIVE_INTEGER_ABS_DIFF 2733 #undef HWY_NATIVE_INTEGER_ABS_DIFF 2734 #else 2735 #define HWY_NATIVE_INTEGER_ABS_DIFF 2736 #endif 2737 2738 // ------------------------------ Integer multiply-add 2739 2740 // Per-target flag to prevent generic_ops-inl.h from defining int MulAdd. 2741 #ifdef HWY_NATIVE_INT_FMA 2742 #undef HWY_NATIVE_INT_FMA 2743 #else 2744 #define HWY_NATIVE_INT_FMA 2745 #endif 2746 2747 // Wrappers for changing argument order to what intrinsics expect. 2748 namespace detail { 2749 // All except ui64 2750 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(MulAdd, vmla, _, 3) 2751 HWY_NEON_DEF_FUNCTION_INT_8_16_32(MulAdd, vmla, _, 3) 2752 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(NegMulAdd, vmls, _, 3) 2753 HWY_NEON_DEF_FUNCTION_INT_8_16_32(NegMulAdd, vmls, _, 3) 2754 } // namespace detail 2755 2756 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)> 2757 HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x, 2758 Vec128<T, N> add) { 2759 return detail::MulAdd(add, mul, x); 2760 } 2761 2762 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_NOT_T_SIZE(T, 8)> 2763 HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x, 2764 Vec128<T, N> add) { 2765 return detail::NegMulAdd(add, mul, x); 2766 } 2767 2768 // 64-bit integer 2769 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)> 2770 HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x, 2771 Vec128<T, N> add) { 2772 return Add(Mul(mul, x), add); 2773 } 2774 2775 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T), HWY_IF_T_SIZE(T, 8)> 2776 HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x, 2777 Vec128<T, N> add) { 2778 return Sub(add, Mul(mul, x)); 2779 } 2780 2781 // ------------------------------ Floating-point multiply-add variants 2782 2783 namespace detail { 2784 2785 #if HWY_NATIVE_FMA 2786 // Wrappers for changing argument order to what intrinsics expect. 2787 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(MulAdd, vfma, _, 3) 2788 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(NegMulAdd, vfms, _, 3) 2789 #else 2790 // Emulate. Matches intrinsics arg order. 
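// i.e. detail::MulAdd(add, mul, x) computes mul * x + add, the same argument
// order as vfmaq_f32(add, mul, x); the public MulAdd(mul, x, add) below
// reorders its arguments to match.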
2791 template <size_t N> 2792 HWY_API Vec128<float, N> MulAdd(Vec128<float, N> add, Vec128<float, N> mul, 2793 Vec128<float, N> x) { 2794 return mul * x + add; 2795 } 2796 2797 template <size_t N> 2798 HWY_API Vec128<float, N> NegMulAdd(Vec128<float, N> add, Vec128<float, N> mul, 2799 Vec128<float, N> x) { 2800 return add - mul * x; 2801 } 2802 2803 #endif // HWY_NATIVE_FMA 2804 } // namespace detail 2805 2806 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2807 HWY_API Vec128<T, N> MulAdd(Vec128<T, N> mul, Vec128<T, N> x, 2808 Vec128<T, N> add) { 2809 return detail::MulAdd(add, mul, x); 2810 } 2811 2812 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2813 HWY_API Vec128<T, N> NegMulAdd(Vec128<T, N> mul, Vec128<T, N> x, 2814 Vec128<T, N> add) { 2815 return detail::NegMulAdd(add, mul, x); 2816 } 2817 2818 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2819 HWY_API Vec128<T, N> MulSub(Vec128<T, N> mul, Vec128<T, N> x, 2820 Vec128<T, N> sub) { 2821 return MulAdd(mul, x, Neg(sub)); 2822 } 2823 2824 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2825 HWY_API Vec128<T, N> NegMulSub(Vec128<T, N> mul, Vec128<T, N> x, 2826 Vec128<T, N> sub) { 2827 return Neg(MulAdd(mul, x, sub)); 2828 } 2829 2830 // ------------------------------ Floating-point square root (IfThenZeroElse) 2831 2832 // Emulate missing intrinsic 2833 #if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 490 2834 HWY_INLINE float64x1_t vrsqrte_f64(float64x1_t raw) { 2835 const CappedTag<double, 1> d; 2836 const Twice<decltype(d)> dt; 2837 using VT = VFromD<decltype(dt)>; 2838 const VFromD<decltype(d)> v(raw); 2839 return LowerHalf(d, VT(vrsqrteq_f64(Combine(dt, v, v).raw))).raw; 2840 } 2841 #endif 2842 2843 // Approximate reciprocal square root 2844 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ApproximateReciprocalSqrt, vrsqrte, _, 1) 2845 2846 #if HWY_HAVE_FLOAT64 2847 #ifdef HWY_NATIVE_F64_APPROX_RSQRT 2848 #undef HWY_NATIVE_F64_APPROX_RSQRT 2849 #else 2850 #define HWY_NATIVE_F64_APPROX_RSQRT 2851 #endif 2852 2853 // Full precision square root 2854 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Sqrt, vsqrt, _, 1) 2855 #else // !HWY_HAVE_FLOAT64 2856 namespace detail { 2857 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(ReciprocalSqrtStep, vrsqrts, _, 2) 2858 } // namespace detail 2859 2860 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2861 HWY_API Vec128<T, N> Sqrt(const Vec128<T, N> v) { 2862 auto recip = ApproximateReciprocalSqrt(v); 2863 2864 recip *= detail::ReciprocalSqrtStep(v * recip, recip); 2865 recip *= detail::ReciprocalSqrtStep(v * recip, recip); 2866 recip *= detail::ReciprocalSqrtStep(v * recip, recip); 2867 2868 const auto root = v * recip; 2869 return IfThenZeroElse(v == Zero(Simd<T, N, 0>()), root); 2870 } 2871 #endif // HWY_HAVE_FLOAT64 2872 2873 // ================================================== LOGICAL 2874 2875 // ------------------------------ Not 2876 2877 // There is no 64-bit vmvn, so cast instead of using HWY_NEON_DEF_FUNCTION. 
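// Complementing the u8 view is correct for every lane type because bitwise
// NOT is lane-agnostic; e.g. Not of a Vec128<uint64_t> is a single vmvnq_u8
// applied to the same 128 bits.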
2878 template <typename T> 2879 HWY_API Vec128<T> Not(const Vec128<T> v) { 2880 const DFromV<decltype(v)> d; 2881 const Repartition<uint8_t, decltype(d)> d8; 2882 return BitCast(d, Vec128<uint8_t>(vmvnq_u8(BitCast(d8, v).raw))); 2883 } 2884 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> 2885 HWY_API Vec128<T, N> Not(const Vec128<T, N> v) { 2886 const DFromV<decltype(v)> d; 2887 const Repartition<uint8_t, decltype(d)> d8; 2888 using V8 = decltype(Zero(d8)); 2889 return BitCast(d, V8(vmvn_u8(BitCast(d8, v).raw))); 2890 } 2891 2892 // ------------------------------ And 2893 HWY_NEON_DEF_FUNCTION_INTS_UINTS(And, vand, _, 2) 2894 2895 // Uses the u32/64 defined above. 2896 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2897 HWY_API Vec128<T, N> And(const Vec128<T, N> a, const Vec128<T, N> b) { 2898 const DFromV<decltype(a)> d; 2899 const RebindToUnsigned<decltype(d)> du; 2900 return BitCast(d, BitCast(du, a) & BitCast(du, b)); 2901 } 2902 2903 // ------------------------------ AndNot 2904 2905 namespace detail { 2906 // reversed_andnot returns a & ~b. 2907 HWY_NEON_DEF_FUNCTION_INTS_UINTS(reversed_andnot, vbic, _, 2) 2908 } // namespace detail 2909 2910 // Returns ~not_mask & mask. 2911 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)> 2912 HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask, 2913 const Vec128<T, N> mask) { 2914 return detail::reversed_andnot(mask, not_mask); 2915 } 2916 2917 // Uses the u32/64 defined above. 2918 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2919 HWY_API Vec128<T, N> AndNot(const Vec128<T, N> not_mask, 2920 const Vec128<T, N> mask) { 2921 const DFromV<decltype(mask)> d; 2922 const RebindToUnsigned<decltype(d)> du; 2923 VFromD<decltype(du)> ret = 2924 detail::reversed_andnot(BitCast(du, mask), BitCast(du, not_mask)); 2925 return BitCast(d, ret); 2926 } 2927 2928 // ------------------------------ Or 2929 2930 HWY_NEON_DEF_FUNCTION_INTS_UINTS(Or, vorr, _, 2) 2931 2932 // Uses the u32/64 defined above. 2933 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2934 HWY_API Vec128<T, N> Or(const Vec128<T, N> a, const Vec128<T, N> b) { 2935 const DFromV<decltype(a)> d; 2936 const RebindToUnsigned<decltype(d)> du; 2937 return BitCast(d, BitCast(du, a) | BitCast(du, b)); 2938 } 2939 2940 // ------------------------------ Xor 2941 2942 HWY_NEON_DEF_FUNCTION_INTS_UINTS(Xor, veor, _, 2) 2943 2944 // Uses the u32/64 defined above. 2945 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2946 HWY_API Vec128<T, N> Xor(const Vec128<T, N> a, const Vec128<T, N> b) { 2947 const DFromV<decltype(a)> d; 2948 const RebindToUnsigned<decltype(d)> du; 2949 return BitCast(d, BitCast(du, a) ^ BitCast(du, b)); 2950 } 2951 2952 // ------------------------------ Xor3 2953 #if HWY_ARCH_ARM_A64 && defined(__ARM_FEATURE_SHA3) 2954 HWY_NEON_DEF_FUNCTION_FULL_UI(Xor3, veor3, _, 3) 2955 2956 // Half vectors are not natively supported. Two Xor are likely more efficient 2957 // than Combine to 128-bit. 
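// (The SHA3 EOR3 instruction, and thus veor3q_*, exists only for 128-bit
// vectors; there are no 64-bit veor3_* intrinsics.)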
2958 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8), HWY_IF_NOT_FLOAT(T)> 2959 HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) { 2960 return Xor(x1, Xor(x2, x3)); 2961 } 2962 2963 template <typename T, size_t N, HWY_IF_FLOAT(T)> 2964 HWY_API Vec128<T, N> Xor3(const Vec128<T, N> x1, const Vec128<T, N> x2, 2965 const Vec128<T, N> x3) { 2966 const DFromV<decltype(x1)> d; 2967 const RebindToUnsigned<decltype(d)> du; 2968 return BitCast(d, Xor3(BitCast(du, x1), BitCast(du, x2), BitCast(du, x3))); 2969 } 2970 2971 #else 2972 template <typename T, size_t N> 2973 HWY_API Vec128<T, N> Xor3(Vec128<T, N> x1, Vec128<T, N> x2, Vec128<T, N> x3) { 2974 return Xor(x1, Xor(x2, x3)); 2975 } 2976 #endif 2977 2978 // ------------------------------ Or3 2979 template <typename T, size_t N> 2980 HWY_API Vec128<T, N> Or3(Vec128<T, N> o1, Vec128<T, N> o2, Vec128<T, N> o3) { 2981 return Or(o1, Or(o2, o3)); 2982 } 2983 2984 // ------------------------------ OrAnd 2985 template <typename T, size_t N> 2986 HWY_API Vec128<T, N> OrAnd(Vec128<T, N> o, Vec128<T, N> a1, Vec128<T, N> a2) { 2987 return Or(o, And(a1, a2)); 2988 } 2989 2990 // ------------------------------ Operator overloads (internal-only if float) 2991 2992 template <typename T, size_t N> 2993 HWY_API Vec128<T, N> operator&(const Vec128<T, N> a, const Vec128<T, N> b) { 2994 return And(a, b); 2995 } 2996 2997 template <typename T, size_t N> 2998 HWY_API Vec128<T, N> operator|(const Vec128<T, N> a, const Vec128<T, N> b) { 2999 return Or(a, b); 3000 } 3001 3002 template <typename T, size_t N> 3003 HWY_API Vec128<T, N> operator^(const Vec128<T, N> a, const Vec128<T, N> b) { 3004 return Xor(a, b); 3005 } 3006 3007 // ------------------------------ I64/U64 AbsDiff 3008 3009 template <size_t N> 3010 HWY_API Vec128<int64_t, N> AbsDiff(const Vec128<int64_t, N> a, 3011 const Vec128<int64_t, N> b) { 3012 return Max(a, b) - Min(a, b); 3013 } 3014 3015 template <size_t N> 3016 HWY_API Vec128<uint64_t, N> AbsDiff(const Vec128<uint64_t, N> a, 3017 const Vec128<uint64_t, N> b) { 3018 return Or(SaturatedSub(a, b), SaturatedSub(b, a)); 3019 } 3020 3021 // ------------------------------ PopulationCount 3022 3023 #ifdef HWY_NATIVE_POPCNT 3024 #undef HWY_NATIVE_POPCNT 3025 #else 3026 #define HWY_NATIVE_POPCNT 3027 #endif 3028 3029 namespace detail { 3030 3031 template <typename T> 3032 HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<1> /* tag */, Vec128<T> v) { 3033 const Full128<uint8_t> d8; 3034 return Vec128<T>(vcntq_u8(BitCast(d8, v).raw)); 3035 } 3036 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> 3037 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<1> /* tag */, 3038 Vec128<T, N> v) { 3039 const Simd<uint8_t, N, 0> d8; 3040 return Vec128<T, N>(vcnt_u8(BitCast(d8, v).raw)); 3041 } 3042 3043 // NEON lacks popcount for lane sizes > 1, so take pairwise sums of the bytes. 
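// Worked example for a uint16_t lane holding 0x0F0F: vcnt yields per-byte
// counts {4, 4}, and vpaddl_u8 sums the adjacent pair into the u16 lane,
// giving 8.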
3044 template <typename T> 3045 HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<2> /* tag */, Vec128<T> v) { 3046 const Full128<uint8_t> d8; 3047 const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); 3048 return Vec128<T>(vpaddlq_u8(bytes)); 3049 } 3050 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> 3051 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<2> /* tag */, 3052 Vec128<T, N> v) { 3053 const Repartition<uint8_t, DFromV<decltype(v)>> d8; 3054 const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); 3055 return Vec128<T, N>(vpaddl_u8(bytes)); 3056 } 3057 3058 template <typename T> 3059 HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<4> /* tag */, Vec128<T> v) { 3060 const Full128<uint8_t> d8; 3061 const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); 3062 return Vec128<T>(vpaddlq_u16(vpaddlq_u8(bytes))); 3063 } 3064 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> 3065 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<4> /* tag */, 3066 Vec128<T, N> v) { 3067 const Repartition<uint8_t, DFromV<decltype(v)>> d8; 3068 const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); 3069 return Vec128<T, N>(vpaddl_u16(vpaddl_u8(bytes))); 3070 } 3071 3072 template <typename T> 3073 HWY_INLINE Vec128<T> PopulationCount(hwy::SizeTag<8> /* tag */, Vec128<T> v) { 3074 const Full128<uint8_t> d8; 3075 const uint8x16_t bytes = vcntq_u8(BitCast(d8, v).raw); 3076 return Vec128<T>(vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(bytes)))); 3077 } 3078 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)> 3079 HWY_INLINE Vec128<T, N> PopulationCount(hwy::SizeTag<8> /* tag */, 3080 Vec128<T, N> v) { 3081 const Repartition<uint8_t, DFromV<decltype(v)>> d8; 3082 const uint8x8_t bytes = vcnt_u8(BitCast(d8, v).raw); 3083 return Vec128<T, N>(vpaddl_u32(vpaddl_u16(vpaddl_u8(bytes)))); 3084 } 3085 3086 } // namespace detail 3087 3088 template <typename T, size_t N, HWY_IF_NOT_FLOAT(T)> 3089 HWY_API Vec128<T, N> PopulationCount(Vec128<T, N> v) { 3090 return detail::PopulationCount(hwy::SizeTag<sizeof(T)>(), v); 3091 } 3092 3093 // ================================================== SIGN 3094 3095 // ------------------------------ Abs 3096 // i64 is implemented after BroadcastSignBit. 3097 HWY_NEON_DEF_FUNCTION_INT_8_16_32(Abs, vabs, _, 1) 3098 HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Abs, vabs, _, 1) 3099 3100 // ------------------------------ SaturatedAbs 3101 #ifdef HWY_NATIVE_SATURATED_ABS 3102 #undef HWY_NATIVE_SATURATED_ABS 3103 #else 3104 #define HWY_NATIVE_SATURATED_ABS 3105 #endif 3106 3107 HWY_NEON_DEF_FUNCTION_INT_8_16_32(SaturatedAbs, vqabs, _, 1) 3108 3109 // ------------------------------ CopySignToAbs 3110 template <typename T, size_t N> 3111 HWY_API Vec128<T, N> CopySignToAbs(Vec128<T, N> abs, Vec128<T, N> sign) { 3112 static_assert(IsFloat<T>(), "Only makes sense for floating-point"); 3113 const DFromV<decltype(abs)> d; 3114 return OrAnd(abs, SignBit(d), sign); 3115 } 3116 3117 // ------------------------------ BroadcastSignBit 3118 3119 template <typename T, size_t N, HWY_IF_SIGNED(T)> 3120 HWY_API Vec128<T, N> BroadcastSignBit(const Vec128<T, N> v) { 3121 return ShiftRight<sizeof(T) * 8 - 1>(v); 3122 } 3123 3124 // ================================================== MASK 3125 3126 // ------------------------------ To/from vector 3127 3128 // Mask and Vec have the same representation (true = FF..FF). 
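// MaskFromVec and VecFromMask are therefore plain bit casts; e.g.
// VecFromMask(d, Eq(a, b)) materializes FF..FF in lanes where a == b and
// zero elsewhere, without any extra instructions.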
3129 template <typename T, size_t N> 3130 HWY_API Mask128<T, N> MaskFromVec(const Vec128<T, N> v) { 3131 const Simd<MakeUnsigned<T>, N, 0> du; 3132 return Mask128<T, N>(BitCast(du, v).raw); 3133 } 3134 3135 template <class D> 3136 using MFromD = decltype(MaskFromVec(VFromD<D>())); 3137 3138 template <class D> 3139 HWY_API VFromD<D> VecFromMask(D d, const MFromD<D> m) { 3140 // Raw type of masks is unsigned. 3141 const RebindToUnsigned<D> du; 3142 return BitCast(d, VFromD<decltype(du)>(m.raw)); 3143 } 3144 3145 // ------------------------------ RebindMask (MaskFromVec) 3146 3147 template <typename TFrom, size_t NFrom, class DTo> 3148 HWY_API MFromD<DTo> RebindMask(DTo /* tag */, Mask128<TFrom, NFrom> m) { 3149 static_assert(sizeof(TFrom) == sizeof(TFromD<DTo>), "Must have same size"); 3150 return MFromD<DTo>(m.raw); 3151 } 3152 3153 // ------------------------------ IfThenElse 3154 3155 // Workaround for incorrect codegen. 3156 #if HWY_ARCH_ARM_V7 3157 3158 template <class V, class D = DFromV<V>> 3159 HWY_API V IfThenElse(MFromD<D> mask, V yes, V no) { 3160 const RebindToUnsigned<D> du; 3161 using VU = VFromD<decltype(du)>; 3162 const VU no_u = BitCast(du, no); 3163 const VU diff_u = BitCast(du, yes) ^ no_u; 3164 const VU mask_u = BitCast(du, VecFromMask(D(), mask)); 3165 return BitCast(D(), no_u ^ (diff_u & mask_u)); 3166 } 3167 3168 #else // normal VBSL instruction 3169 3170 #define HWY_NEON_BUILD_TPL_HWY_IF 3171 #define HWY_NEON_BUILD_RET_HWY_IF(type, size) Vec128<type##_t, size> 3172 #define HWY_NEON_BUILD_PARAM_HWY_IF(type, size) \ 3173 const Mask128<type##_t, size> mask, const Vec128<type##_t, size> yes, \ 3174 const Vec128<type##_t, size> no 3175 #define HWY_NEON_BUILD_ARG_HWY_IF mask.raw, yes.raw, no.raw 3176 3177 HWY_NEON_DEF_FUNCTION_ALL_TYPES(IfThenElse, vbsl, _, HWY_IF) 3178 3179 #endif // HWY_ARCH_ARM_V7 3180 3181 #if HWY_HAVE_FLOAT16 3182 #define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_BF16(TFromV<V>) 3183 #else 3184 #define HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V) HWY_IF_SPECIAL_FLOAT_V(V) 3185 #endif 3186 3187 template <class V, HWY_NEON_IF_EMULATED_IF_THEN_ELSE(V)> 3188 HWY_API V IfThenElse(MFromD<DFromV<V>> mask, V yes, V no) { 3189 const DFromV<decltype(yes)> d; 3190 const RebindToUnsigned<decltype(d)> du; 3191 return BitCast( 3192 d, IfThenElse(RebindMask(du, mask), BitCast(du, yes), BitCast(du, no))); 3193 } 3194 3195 #undef HWY_NEON_IF_EMULATED_IF_THEN_ELSE 3196 #undef HWY_NEON_BUILD_TPL_HWY_IF 3197 #undef HWY_NEON_BUILD_RET_HWY_IF 3198 #undef HWY_NEON_BUILD_PARAM_HWY_IF 3199 #undef HWY_NEON_BUILD_ARG_HWY_IF 3200 3201 // mask ? yes : 0 3202 template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)> 3203 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { 3204 return yes & VecFromMask(DFromV<decltype(yes)>(), mask); 3205 } 3206 template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)> 3207 HWY_API Vec128<T, N> IfThenElseZero(Mask128<T, N> mask, Vec128<T, N> yes) { 3208 const DFromV<decltype(yes)> d; 3209 const RebindToUnsigned<decltype(d)> du; 3210 return BitCast(d, IfThenElseZero(RebindMask(du, mask), BitCast(du, yes))); 3211 } 3212 3213 // mask ? 
// mask ? 0 : no
template <typename T, size_t N, HWY_IF_NOT_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  return AndNot(VecFromMask(DFromV<decltype(no)>(), mask), no);
}
template <typename T, size_t N, HWY_IF_SPECIAL_FLOAT(T)>
HWY_API Vec128<T, N> IfThenZeroElse(Mask128<T, N> mask, Vec128<T, N> no) {
  const DFromV<decltype(no)> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, IfThenZeroElse(RebindMask(du, mask), BitCast(du, no)));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfNegativeThenElse(Vec128<T, N> v, Vec128<T, N> yes,
                                        Vec128<T, N> no) {
  static_assert(IsSigned<T>(), "Only works for signed/float");
  const DFromV<decltype(no)> d;
  const RebindToSigned<decltype(d)> di;

  Mask128<T, N> m = MaskFromVec(BitCast(d, BroadcastSignBit(BitCast(di, v))));
  return IfThenElse(m, yes, no);
}

template <typename T, size_t N>
HWY_API Vec128<T, N> IfVecThenElse(Vec128<T, N> mask, Vec128<T, N> yes,
                                   Vec128<T, N> no) {
  return IfThenElse(MaskFromVec(mask), yes, no);
}

// ------------------------------ BitwiseIfThenElse

#ifdef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#undef HWY_NATIVE_BITWISE_IF_THEN_ELSE
#else
#define HWY_NATIVE_BITWISE_IF_THEN_ELSE
#endif

template <class V>
HWY_API V BitwiseIfThenElse(V mask, V yes, V no) {
  return IfVecThenElse(mask, yes, no);
}

// ------------------------------ CopySign (BitwiseIfThenElse)
template <typename T, size_t N>
HWY_API Vec128<T, N> CopySign(Vec128<T, N> magn, Vec128<T, N> sign) {
  static_assert(IsFloat<T>(), "Only makes sense for floating-point");
  const DFromV<decltype(magn)> d;
  return BitwiseIfThenElse(SignBit(d), sign, magn);
}

// ------------------------------ Mask logical

template <typename T, size_t N>
HWY_API Mask128<T, N> Not(const Mask128<T, N> m) {
  return MaskFromVec(Not(VecFromMask(DFromM<decltype(m)>(), m)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> And(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(And(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> AndNot(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Or(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Or(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> Xor(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(Xor(VecFromMask(d, a), VecFromMask(d, b)));
}

template <typename T, size_t N>
HWY_API Mask128<T, N> ExclusiveNeither(const Mask128<T, N> a, Mask128<T, N> b) {
  const DFromM<decltype(a)> d;
  return MaskFromVec(AndNot(VecFromMask(d, a), Not(VecFromMask(d, b))));
}
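
// Mask logic composes like the corresponding bitwise ops; a sketch
// (values illustrative only):
//   const Full128<uint32_t> d;
//   const auto lo = Lt(Iota(d, 0), Set(d, 3u));  // {T,T,T,F}
//   const auto hi = Lt(Set(d, 1u), Iota(d, 0));  // {F,F,T,T}
//   const auto mid = And(lo, hi);                // {F,F,T,F}
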
// ================================================== COMPARE

// Comparisons fill a lane with 1-bits if the condition is true, else 0.

// ------------------------------ Shuffle2301 (for i64 compares)

// Swap 32-bit halves in 64-bits
HWY_API Vec64<uint32_t> Shuffle2301(const Vec64<uint32_t> v) {
  return Vec64<uint32_t>(vrev64_u32(v.raw));
}
HWY_API Vec64<int32_t> Shuffle2301(const Vec64<int32_t> v) {
  return Vec64<int32_t>(vrev64_s32(v.raw));
}
HWY_API Vec64<float> Shuffle2301(const Vec64<float> v) {
  return Vec64<float>(vrev64_f32(v.raw));
}
HWY_API Vec128<uint32_t> Shuffle2301(const Vec128<uint32_t> v) {
  return Vec128<uint32_t>(vrev64q_u32(v.raw));
}
HWY_API Vec128<int32_t> Shuffle2301(const Vec128<int32_t> v) {
  return Vec128<int32_t>(vrev64q_s32(v.raw));
}
HWY_API Vec128<float> Shuffle2301(const Vec128<float> v) {
  return Vec128<float>(vrev64q_f32(v.raw));
}

#define HWY_NEON_BUILD_TPL_HWY_COMPARE
#define HWY_NEON_BUILD_RET_HWY_COMPARE(type, size) Mask128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_COMPARE(type, size) \
  const Vec128<type##_t, size> a, const Vec128<type##_t, size> b
#define HWY_NEON_BUILD_ARG_HWY_COMPARE a.raw, b.raw

// ------------------------------ Equality
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator==, vceq, _, HWY_COMPARE)
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator==, vceq, _, HWY_COMPARE)
#else
// No 64-bit comparisons on armv7: emulate them below, after Shuffle2301.
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator==, vceq, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator==, vceq, _, HWY_COMPARE)
#endif

// ------------------------------ Strict inequality (signed, float)
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<, vclt, _, HWY_COMPARE)
#else
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<, vclt, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<, vclt, _, HWY_COMPARE)
#endif
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<, vclt, _, HWY_COMPARE)

// ------------------------------ Weak inequality (float)
#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(operator<=, vcle, _, HWY_COMPARE)
#else
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(operator<=, vcle, _, HWY_COMPARE)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(operator<=, vcle, _, HWY_COMPARE)
#endif
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(operator<=, vcle, _, HWY_COMPARE)

#undef HWY_NEON_BUILD_TPL_HWY_COMPARE
#undef HWY_NEON_BUILD_RET_HWY_COMPARE
#undef HWY_NEON_BUILD_PARAM_HWY_COMPARE
#undef HWY_NEON_BUILD_ARG_HWY_COMPARE

// ------------------------------ Armv7 i64 compare (Shuffle2301, Eq)

#if HWY_ARCH_ARM_V7

template <size_t N>
HWY_API Mask128<int64_t, N> operator==(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  const Simd<int32_t, N * 2, 0> d32;
  const Simd<int64_t, N, 0> d64;
  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
  return MaskFromVec(BitCast(d64, cmp64));
}

template <size_t N>
HWY_API Mask128<uint64_t, N> operator==(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  const Simd<uint32_t, N * 2, 0> d32;
  const Simd<uint64_t, N, 0> d64;
  const auto cmp32 = VecFromMask(d32, Eq(BitCast(d32, a), BitCast(d32, b)));
  const auto cmp64 = cmp32 & Shuffle2301(cmp32);
  return MaskFromVec(BitCast(d64, cmp64));
}

HWY_API Mask128<int64_t> operator<(const Vec128<int64_t> a,
                                   const Vec128<int64_t> b) {
  const int64x2_t sub = vqsubq_s64(a.raw, b.raw);
  return MaskFromVec(BroadcastSignBit(Vec128<int64_t>(sub)));
}
HWY_API Mask128<int64_t, 1> operator<(const Vec64<int64_t> a,
                                      const Vec64<int64_t> b) {
  const int64x1_t sub = vqsub_s64(a.raw, b.raw);
  return MaskFromVec(BroadcastSignBit(Vec64<int64_t>(sub)));
}

template <size_t N>
HWY_API Mask128<uint64_t, N> operator<(const Vec128<uint64_t, N> a,
                                       const Vec128<uint64_t, N> b) {
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  const Vec128<uint64_t, N> msb = AndNot(a, b) | AndNot(a ^ b, a - b);
  return MaskFromVec(BitCast(du, BroadcastSignBit(BitCast(di, msb))));
}

template <size_t N>
HWY_API Mask128<int64_t, N> operator<=(const Vec128<int64_t, N> a,
                                       const Vec128<int64_t, N> b) {
  return Not(b < a);
}

template <size_t N>
HWY_API Mask128<uint64_t, N> operator<=(const Vec128<uint64_t, N> a,
                                        const Vec128<uint64_t, N> b) {
  return Not(b < a);
}

#endif

// ------------------------------ operator!= (operator==)

// Customize HWY_NEON_DEF_FUNCTION to call 2 functions.
#pragma push_macro("HWY_NEON_DEF_FUNCTION")
#undef HWY_NEON_DEF_FUNCTION
// This cannot have _any_ template argument (in x86_128 we can at least have N
// as an argument), otherwise it is not more specialized than rewritten
// operator== in C++20, leading to compile errors.
#define HWY_NEON_DEF_FUNCTION(type, size, name, prefix, infix, suffix, args) \
  HWY_API Mask128<type##_t, size> name(Vec128<type##_t, size> a,             \
                                       Vec128<type##_t, size> b) {           \
    return Not(a == b);                                                      \
  }

HWY_NEON_DEF_FUNCTION_ALL_TYPES(operator!=, ignored, ignored, ignored)

#pragma pop_macro("HWY_NEON_DEF_FUNCTION")

// ------------------------------ Reversed comparisons

template <typename T, size_t N>
HWY_API Mask128<T, N> operator>(Vec128<T, N> a, Vec128<T, N> b) {
  return operator<(b, a);
}
template <typename T, size_t N>
HWY_API Mask128<T, N> operator>=(Vec128<T, N> a, Vec128<T, N> b) {
  return operator<=(b, a);
}
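
// operator> and operator>= simply swap the arguments of < and <=, so all
// comparisons yield all-ones/all-zero lane masks; a sketch (illustrative):
//   const Full128<int32_t> d;
//   const auto a = Iota(d, 0);                         // {0,1,2,3}
//   const auto m = a > Set(d, 1);                      // {F,F,T,T}
//   const auto clamped = IfThenElse(m, Set(d, 1), a);  // {0,1,1,1}
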
// ------------------------------ FirstN (Iota, Lt)

template <class D>
HWY_API MFromD<D> FirstN(D d, size_t num) {
  const RebindToSigned<decltype(d)> di;  // Signed comparisons are cheaper.
  using TI = TFromD<decltype(di)>;
  return RebindMask(d, detail::Iota0(di) < Set(di, static_cast<TI>(num)));
}

// ------------------------------ TestBit (Eq)

#define HWY_NEON_BUILD_TPL_HWY_TESTBIT
#define HWY_NEON_BUILD_RET_HWY_TESTBIT(type, size) Mask128<type##_t, size>
#define HWY_NEON_BUILD_PARAM_HWY_TESTBIT(type, size) \
  Vec128<type##_t, size> v, Vec128<type##_t, size> bit
#define HWY_NEON_BUILD_ARG_HWY_TESTBIT v.raw, bit.raw

#if HWY_ARCH_ARM_A64
HWY_NEON_DEF_FUNCTION_INTS_UINTS(TestBit, vtst, _, HWY_TESTBIT)
#else
// No 64-bit versions on armv7
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(TestBit, vtst, _, HWY_TESTBIT)

template <size_t N>
HWY_API Mask128<uint64_t, N> TestBit(Vec128<uint64_t, N> v,
                                     Vec128<uint64_t, N> bit) {
  return (v & bit) == bit;
}
template <size_t N>
HWY_API Mask128<int64_t, N> TestBit(Vec128<int64_t, N> v,
                                    Vec128<int64_t, N> bit) {
  return (v & bit) == bit;
}

#endif
#undef HWY_NEON_BUILD_TPL_HWY_TESTBIT
#undef HWY_NEON_BUILD_RET_HWY_TESTBIT
#undef HWY_NEON_BUILD_PARAM_HWY_TESTBIT
#undef HWY_NEON_BUILD_ARG_HWY_TESTBIT

// ------------------------------ Abs i64 (IfNegativeThenElse, Neg)
HWY_API Vec128<int64_t> Abs(const Vec128<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec128<int64_t>(vabsq_s64(v.raw));
#else
  return IfNegativeThenElse(v, Neg(v), v);
#endif
}
HWY_API Vec64<int64_t> Abs(const Vec64<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec64<int64_t>(vabs_s64(v.raw));
#else
  return IfNegativeThenElse(v, Neg(v), v);
#endif
}

HWY_API Vec128<int64_t> SaturatedAbs(const Vec128<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec128<int64_t>(vqabsq_s64(v.raw));
#else
  const auto zero = Zero(DFromV<decltype(v)>());
  return IfNegativeThenElse(v, SaturatedSub(zero, v), v);
#endif
}
HWY_API Vec64<int64_t> SaturatedAbs(const Vec64<int64_t> v) {
#if HWY_ARCH_ARM_A64
  return Vec64<int64_t>(vqabs_s64(v.raw));
#else
  const auto zero = Zero(DFromV<decltype(v)>());
  return IfNegativeThenElse(v, SaturatedSub(zero, v), v);
#endif
}

// ------------------------------ Min (IfThenElse, BroadcastSignBit)

// Unsigned
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Min, vmin, _, 2)

template <size_t N>
HWY_API Vec128<uint64_t, N> Min(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, b, a);
#else
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  return BitCast(du, BitCast(di, a) - BitCast(di, SaturatedSub(a, b)));
#endif
}

// Signed
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Min, vmin, _, 2)

template <size_t N>
HWY_API Vec128<int64_t, N> Min(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, b, a);
#else
  const Vec128<int64_t, N> sign = SaturatedSub(a, b);
  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), a, b);
#endif
}
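
// The Armv7 branches above avoid 64-bit compares: for u64,
// SaturatedSub(a, b) is a - b when a > b and 0 otherwise, so
// a - SaturatedSub(a, b) equals Min(a, b) (e.g. a=7, b=3: 7 - (7-3) = 3).
// For i64, the sign of the saturated difference selects the smaller input.
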
// Float: IEEE minimumNumber on v8
#if HWY_ARCH_ARM_A64

HWY_NEON_DEF_FUNCTION_FLOAT_16_32(Min, vminnm, _, 2)

// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define
// in terms of the 128-bit intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
namespace detail {

template <class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V F64Vec64Min(V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return LowerHalf(d, Min(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b)));
}

}  // namespace detail
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700

HWY_API Vec64<double> Min(Vec64<double> a, Vec64<double> b) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return detail::F64Vec64Min(a, b);
#else
  return Vec64<double>(vminnm_f64(a.raw, b.raw));
#endif
}

HWY_API Vec128<double> Min(Vec128<double> a, Vec128<double> b) {
  return Vec128<double>(vminnmq_f64(a.raw, b.raw));
}

#else
// Armv7: NaN if any is NaN.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Min, vmin, _, 2)
#endif  // HWY_ARCH_ARM_A64

// ------------------------------ Max (IfThenElse, BroadcastSignBit)

// Unsigned (no u64)
HWY_NEON_DEF_FUNCTION_UINT_8_16_32(Max, vmax, _, 2)

template <size_t N>
HWY_API Vec128<uint64_t, N> Max(Vec128<uint64_t, N> a, Vec128<uint64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, a, b);
#else
  const DFromV<decltype(a)> du;
  const RebindToSigned<decltype(du)> di;
  return BitCast(du, BitCast(di, b) + BitCast(di, SaturatedSub(a, b)));
#endif
}

// Signed (no i64)
HWY_NEON_DEF_FUNCTION_INT_8_16_32(Max, vmax, _, 2)

template <size_t N>
HWY_API Vec128<int64_t, N> Max(Vec128<int64_t, N> a, Vec128<int64_t, N> b) {
#if HWY_ARCH_ARM_A64
  return IfThenElse(b < a, a, b);
#else
  const Vec128<int64_t, N> sign = SaturatedSub(a, b);
  return IfThenElse(MaskFromVec(BroadcastSignBit(sign)), b, a);
#endif
}

// Float: IEEE maximumNumber on v8
#if HWY_ARCH_ARM_A64

HWY_NEON_DEF_FUNCTION_FLOAT_16_32(Max, vmaxnm, _, 2)

// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic, so define
// in terms of the 128-bit intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
namespace detail {

template <class V, HWY_IF_V_SIZE_V(V, 8), HWY_IF_T_SIZE_V(V, 8)>
HWY_INLINE V F64Vec64Max(V a, V b) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  return LowerHalf(d, Max(ZeroExtendVector(dt, a), ZeroExtendVector(dt, b)));
}

}  // namespace detail
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700

HWY_API Vec64<double> Max(Vec64<double> a, Vec64<double> b) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return detail::F64Vec64Max(a, b);
#else
  return Vec64<double>(vmaxnm_f64(a.raw, b.raw));
#endif
}

HWY_API Vec128<double> Max(Vec128<double> a, Vec128<double> b) {
  return Vec128<double>(vmaxnmq_f64(a.raw, b.raw));
}

#else
// Armv7: NaN if any is NaN.
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Max, vmax, _, 2)
#endif  // HWY_ARCH_ARM_A64
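
// Note on NaN behavior (per the comments above): on Armv8, vminnm/vmaxnm
// implement IEEE minimumNumber/maximumNumber, so a quiet NaN input is
// replaced by the other operand; on Armv7, vmin/vmax return NaN if either
// input is NaN. Illustrative sketch:
//   const Full128<float> d;
//   const auto r = Min(NaN(d), Set(d, 1.0f));  // 1.0f on v8, NaN on v7
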
// ------------------------------ MinNumber and MaxNumber

#if !HWY_ARCH_ARM_A64

#ifdef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#undef HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#else
#define HWY_NATIVE_FLOAT_MIN_MAX_NUMBER
#endif

template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MinNumber(V a, V b) {
  return Min(IfThenElse(IsNaN(a), b, a), IfThenElse(IsNaN(b), a, b));
}

template <class V, HWY_IF_FLOAT_OR_SPECIAL_V(V)>
HWY_API V MaxNumber(V a, V b) {
  return Max(IfThenElse(IsNaN(a), b, a), IfThenElse(IsNaN(b), a, b));
}

#endif

// ================================================== MEMORY

// ------------------------------ Load 128

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API Vec128<uint8_t> LoadU(D /* tag */,
                              const uint8_t* HWY_RESTRICT unaligned) {
  return Vec128<uint8_t>(vld1q_u8(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> LoadU(D /* tag */,
                               const uint16_t* HWY_RESTRICT unaligned) {
  return Vec128<uint16_t>(vld1q_u16(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> LoadU(D /* tag */,
                               const uint32_t* HWY_RESTRICT unaligned) {
  return Vec128<uint32_t>(vld1q_u32(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> LoadU(D /* tag */,
                               const uint64_t* HWY_RESTRICT unaligned) {
  return Vec128<uint64_t>(vld1q_u64(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API Vec128<int8_t> LoadU(D /* tag */,
                             const int8_t* HWY_RESTRICT unaligned) {
  return Vec128<int8_t>(vld1q_s8(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> LoadU(D /* tag */,
                              const int16_t* HWY_RESTRICT unaligned) {
  return Vec128<int16_t>(vld1q_s16(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> LoadU(D /* tag */,
                              const int32_t* HWY_RESTRICT unaligned) {
  return Vec128<int32_t>(vld1q_s32(unaligned));
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> LoadU(D /* tag */,
                              const int64_t* HWY_RESTRICT unaligned) {
  return Vec128<int64_t>(vld1q_s64(unaligned));
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> LoadU(D /* tag */,
                                const float16_t* HWY_RESTRICT unaligned) {
  return Vec128<float16_t>(vld1q_f16(detail::NativeLanePointer(unaligned)));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
HWY_API Vec128<bfloat16_t> LoadU(D /* tag */,
                                 const bfloat16_t* HWY_RESTRICT unaligned) {
  return Vec128<bfloat16_t>(vld1q_bf16(detail::NativeLanePointer(unaligned)));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API Vec128<float> LoadU(D /* tag */, const float* HWY_RESTRICT unaligned) {
  return Vec128<float>(vld1q_f32(unaligned));
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API Vec128<double> LoadU(D /* tag */,
                             const double* HWY_RESTRICT unaligned) {
  return Vec128<double>(vld1q_f64(unaligned));
}
#endif  // HWY_HAVE_FLOAT64

// ------------------------------ Load 64

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) {
  return Vec64<uint8_t>(vld1_u8(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) {
  return Vec64<uint16_t>(vld1_u16(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> LoadU(D /* tag */, const uint32_t* HWY_RESTRICT p) {
  return Vec64<uint32_t>(vld1_u32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
HWY_API Vec64<uint64_t> LoadU(D /* tag */, const uint64_t* HWY_RESTRICT p) {
  return Vec64<uint64_t>(vld1_u64(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {
  return Vec64<int8_t>(vld1_s8(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
  return Vec64<int16_t>(vld1_s16(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> LoadU(D /* tag */, const int32_t* HWY_RESTRICT p) {
  return Vec64<int32_t>(vld1_s32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API Vec64<int64_t> LoadU(D /* tag */, const int64_t* HWY_RESTRICT p) {
  return Vec64<int64_t>(vld1_s64(p));
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API Vec64<float16_t> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
  return Vec64<float16_t>(vld1_f16(detail::NativeLanePointer(p)));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
HWY_API Vec64<bfloat16_t> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
  return Vec64<bfloat16_t>(vld1_bf16(detail::NativeLanePointer(p)));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API Vec64<float> LoadU(D /* tag */, const float* HWY_RESTRICT p) {
  return Vec64<float>(vld1_f32(p));
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
HWY_API Vec64<double> LoadU(D /* tag */, const double* HWY_RESTRICT p) {
  return Vec64<double>(vld1_f64(p));
}
#endif  // HWY_HAVE_FLOAT64
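
// Usage sketch (illustrative): LoadU/StoreU have no alignment requirement on
// this target, and Load below simply forwards to LoadU.
//   const Full128<uint8_t> d;
//   uint8_t in[16] = {0};  // any buffer holding Lanes(d) elements
//   const auto v = LoadU(d, in);
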
// ------------------------------ Load 32

// Actual 32-bit broadcast load - used to implement the other lane types
// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> LoadU(D /*tag*/, const uint32_t* HWY_RESTRICT p) {
  return Vec32<uint32_t>(vld1_dup_u32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> LoadU(D /*tag*/, const int32_t* HWY_RESTRICT p) {
  return Vec32<int32_t>(vld1_dup_s32(p));
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API Vec32<float> LoadU(D /*tag*/, const float* HWY_RESTRICT p) {
  return Vec32<float>(vld1_dup_f32(p));
}

// {u,i}{8,16}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
          HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf;
  CopyBytes<4>(p, &buf);
  return BitCast(d, LoadU(d32, &buf));
}

#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf;
  CopyBytes<4>(p, &buf);
  return BitCast(d, LoadU(d32, &buf));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf;
  CopyBytes<4>(p, &buf);
  return BitCast(d, LoadU(d32, &buf));
}
#endif  // HWY_NEON_HAVE_BFLOAT16

// ------------------------------ Load 16

// Actual 16-bit broadcast load - used to implement the other lane types
// because reinterpret_cast of the pointer leads to incorrect codegen on GCC.
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_U16_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const uint16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_u16(p));
}
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I16_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const int16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_s16(p));
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_F16_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const float16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_f16(detail::NativeLanePointer(p)));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_BF16_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const bfloat16_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_bf16(detail::NativeLanePointer(p)));
}
#endif  // HWY_NEON_HAVE_BFLOAT16

// 8-bit x2
template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint16_t, decltype(d)> d16;
  uint16_t buf;
  CopyBytes<2>(p, &buf);
  return BitCast(d, LoadU(d16, &buf));
}

// ------------------------------ Load 8
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_U8_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const uint8_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_u8(p));
}
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_I8_D(D)>
HWY_API VFromD<D> LoadU(D /* tag */, const int8_t* HWY_RESTRICT p) {
  return VFromD<D>(vld1_dup_s8(p));
}

// ------------------------------ Load misc

template <class D, HWY_NEON_IF_EMULATED_D(D)>
HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, LoadU(du, detail::U16LanePointer(p)));
}

// On Arm, Load is the same as LoadU.
template <class D>
HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
  return LoadU(d, p);
}

template <class D>
HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
                             const TFromD<D>* HWY_RESTRICT aligned) {
  return IfThenElseZero(m, Load(d, aligned));
}

template <class D>
HWY_API VFromD<D> MaskedLoadOr(VFromD<D> v, MFromD<D> m, D d,
                               const TFromD<D>* HWY_RESTRICT aligned) {
  return IfThenElse(m, Load(d, aligned), v);
}
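
// Remainder-handling sketch (illustrative). Note that on this target
// MaskedLoad is a full Load followed by IfThenElseZero, so the pointer must
// be valid for all Lanes(d) elements, not just the masked ones:
//   const Full128<float> d;
//   const auto m = FirstN(d, remaining);   // remaining < Lanes(d)
//   const auto v = MaskedLoad(m, d, ptr);  // masked-off lanes are zero
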
// 128-bit SIMD => nothing to duplicate, same as an unaligned load.
template <class D, HWY_IF_V_SIZE_LE_D(D, 16)>
HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* HWY_RESTRICT p) {
  return LoadU(d, p);
}

// ------------------------------ Store 128

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U8_D(D)>
HWY_API void StoreU(Vec128<uint8_t> v, D /* tag */,
                    uint8_t* HWY_RESTRICT unaligned) {
  vst1q_u8(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_API void StoreU(Vec128<uint16_t> v, D /* tag */,
                    uint16_t* HWY_RESTRICT unaligned) {
  vst1q_u16(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_API void StoreU(Vec128<uint32_t> v, D /* tag */,
                    uint32_t* HWY_RESTRICT unaligned) {
  vst1q_u32(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_API void StoreU(Vec128<uint64_t> v, D /* tag */,
                    uint64_t* HWY_RESTRICT unaligned) {
  vst1q_u64(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I8_D(D)>
HWY_API void StoreU(Vec128<int8_t> v, D /* tag */,
                    int8_t* HWY_RESTRICT unaligned) {
  vst1q_s8(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_API void StoreU(Vec128<int16_t> v, D /* tag */,
                    int16_t* HWY_RESTRICT unaligned) {
  vst1q_s16(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_API void StoreU(Vec128<int32_t> v, D /* tag */,
                    int32_t* HWY_RESTRICT unaligned) {
  vst1q_s32(unaligned, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_API void StoreU(Vec128<int64_t> v, D /* tag */,
                    int64_t* HWY_RESTRICT unaligned) {
  vst1q_s64(unaligned, v.raw);
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F16_D(D)>
HWY_API void StoreU(Vec128<float16_t> v, D /* tag */,
                    float16_t* HWY_RESTRICT unaligned) {
  vst1q_f16(detail::NativeLanePointer(unaligned), v.raw);
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_BF16_D(D)>
HWY_API void StoreU(Vec128<bfloat16_t> v, D /* tag */,
                    bfloat16_t* HWY_RESTRICT unaligned) {
  vst1q_bf16(detail::NativeLanePointer(unaligned), v.raw);
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API void StoreU(Vec128<float> v, D /* tag */,
                    float* HWY_RESTRICT unaligned) {
  vst1q_f32(unaligned, v.raw);
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F64_D(D)>
HWY_API void StoreU(Vec128<double> v, D /* tag */,
                    double* HWY_RESTRICT unaligned) {
  vst1q_f64(unaligned, v.raw);
}
#endif  // HWY_HAVE_FLOAT64

// ------------------------------ Store 64

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U8_D(D)>
HWY_API void StoreU(Vec64<uint8_t> v, D /* tag */, uint8_t* HWY_RESTRICT p) {
  vst1_u8(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API void StoreU(Vec64<uint16_t> v, D /* tag */, uint16_t* HWY_RESTRICT p) {
  vst1_u16(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API void StoreU(Vec64<uint32_t> v, D /* tag */, uint32_t* HWY_RESTRICT p) {
  vst1_u32(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
HWY_API void StoreU(Vec64<uint64_t> v, D /* tag */, uint64_t* HWY_RESTRICT p) {
  vst1_u64(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I8_D(D)>
HWY_API void StoreU(Vec64<int8_t> v, D /* tag */, int8_t* HWY_RESTRICT p) {
  vst1_s8(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API void StoreU(Vec64<int16_t> v, D /* tag */, int16_t* HWY_RESTRICT p) {
  vst1_s16(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API void StoreU(Vec64<int32_t> v, D /* tag */, int32_t* HWY_RESTRICT p) {
  vst1_s32(p, v.raw);
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API void StoreU(Vec64<int64_t> v, D /* tag */, int64_t* HWY_RESTRICT p) {
  vst1_s64(p, v.raw);
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API void StoreU(Vec64<float16_t> v, D /* tag */,
                    float16_t* HWY_RESTRICT p) {
  vst1_f16(detail::NativeLanePointer(p), v.raw);
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
HWY_API void StoreU(Vec64<bfloat16_t> v, D /* tag */,
                    bfloat16_t* HWY_RESTRICT p) {
  vst1_bf16(detail::NativeLanePointer(p), v.raw);
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API void StoreU(Vec64<float> v, D /* tag */, float* HWY_RESTRICT p) {
  vst1_f32(p, v.raw);
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_F64_D(D)>
HWY_API void StoreU(Vec64<double> v, D /* tag */, double* HWY_RESTRICT p) {
  vst1_f64(p, v.raw);
}
#endif  // HWY_HAVE_FLOAT64

// ------------------------------ Store 32

template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_U32_D(D)>
HWY_API void StoreU(Vec32<uint32_t> v, D, uint32_t* HWY_RESTRICT p) {
  vst1_lane_u32(p, v.raw, 0);
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_I32_D(D)>
HWY_API void StoreU(Vec32<int32_t> v, D, int32_t* HWY_RESTRICT p) {
  vst1_lane_s32(p, v.raw, 0);
}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F32_D(D)>
HWY_API void StoreU(Vec32<float> v, D, float* HWY_RESTRICT p) {
  vst1_lane_f32(p, v.raw, 0);
}

// {u,i}{8,16}
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_T_SIZE_LE_D(D, 2),
          HWY_IF_NOT_SPECIAL_FLOAT_D(D)>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
}

#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_F16_D(D)>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
}
#endif
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_BF16_D(D)>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  Repartition<uint32_t, decltype(d)> d32;
  uint32_t buf = GetLane(BitCast(d32, v));
  CopyBytes<4>(&buf, p);
}
#endif  // HWY_NEON_HAVE_BFLOAT16

// ------------------------------ Store 16

template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_U16_D(D)>
HWY_API void StoreU(Vec16<uint16_t> v, D, uint16_t* HWY_RESTRICT p) {
  vst1_lane_u16(p, v.raw, 0);
}
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_I16_D(D)>
HWY_API void StoreU(Vec16<int16_t> v, D, int16_t* HWY_RESTRICT p) {
  vst1_lane_s16(p, v.raw, 0);
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_F16_D(D)>
HWY_API void StoreU(Vec16<float16_t> v, D, float16_t* HWY_RESTRICT p) {
  vst1_lane_f16(detail::NativeLanePointer(p), v.raw, 0);
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_BF16_D(D)>
HWY_API void StoreU(Vec16<bfloat16_t> v, D, bfloat16_t* HWY_RESTRICT p) {
  vst1_lane_bf16(detail::NativeLanePointer(p), v.raw, 0);
}
#endif  // HWY_NEON_HAVE_BFLOAT16

template <class D, HWY_IF_V_SIZE_D(D, 2), HWY_IF_T_SIZE_D(D, 1)>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  const Repartition<uint16_t, decltype(d)> d16;
  const uint16_t buf = GetLane(BitCast(d16, v));
  CopyBytes<2>(&buf, p);
}

// ------------------------------ Store 8

template <class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_U8_D(D)>
HWY_API void StoreU(Vec128<uint8_t, 1> v, D, uint8_t* HWY_RESTRICT p) {
  vst1_lane_u8(p, v.raw, 0);
}
template <class D, HWY_IF_V_SIZE_D(D, 1), HWY_IF_I8_D(D)>
HWY_API void StoreU(Vec128<int8_t, 1> v, D, int8_t* HWY_RESTRICT p) {
  vst1_lane_s8(p, v.raw, 0);
}

// ------------------------------ Store misc

template <class D, HWY_NEON_IF_EMULATED_D(D)>
HWY_API void StoreU(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
  const RebindToUnsigned<decltype(d)> du;
  return StoreU(BitCast(du, v), du, detail::U16LanePointer(p));
}

HWY_DIAGNOSTICS(push)
#if HWY_COMPILER_GCC_ACTUAL
HWY_DIAGNOSTICS_OFF(disable : 4701, ignored "-Wmaybe-uninitialized")
#endif

// On Arm, Store is the same as StoreU.
template <class D>
HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
  StoreU(v, d, aligned);
}

HWY_DIAGNOSTICS(pop)

template <class D>
HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
                          TFromD<D>* HWY_RESTRICT p) {
  // Treat as unsigned so that we correctly support float16.
  const RebindToUnsigned<decltype(d)> du;
  const auto blended =
      IfThenElse(RebindMask(du, m), BitCast(du, v), BitCast(du, LoadU(d, p)));
  StoreU(BitCast(d, blended), d, p);
}
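
// BlendedStore above is a read-modify-write (LoadU + IfThenElse + StoreU):
// lanes outside the mask are rewritten with their previous contents, so the
// destination must be readable and the operation is not thread-safe.
// Sketch (illustrative):
//   const Full128<int32_t> d;
//   BlendedStore(v, FirstN(d, 3), d, out);  // updates lanes 0..2 of out
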
// ------------------------------ Non-temporal stores

// Same as aligned stores on non-x86.

template <class D>
HWY_API void Stream(const VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT aligned) {
#if HWY_ARCH_ARM_A64
#if HWY_COMPILER_GCC
  __builtin_prefetch(aligned, 1, 0);
#elif HWY_COMPILER_MSVC
  __prefetch2(aligned, 0x11);
#endif
#endif
  Store(v, d, aligned);
}

// ================================================== CONVERT

// ------------------------------ ConvertTo

#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16

// TODO(janwas): use macro generator instead of handwritten
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> ConvertTo(D /* tag */, Vec128<int16_t> v) {
  return Vec128<float16_t>(vcvtq_f16_s16(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vcvt_f16_s16(v.raw));
}

template <class D, HWY_IF_F16_D(D)>
HWY_API Vec128<float16_t> ConvertTo(D /* tag */, Vec128<uint16_t> v) {
  return Vec128<float16_t>(vcvtq_f16_u16(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F16_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vcvt_f16_u16(v.raw));
}

#endif  // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> ConvertTo(D /* tag */, Vec128<int32_t> v) {
  return Vec128<float>(vcvtq_f32_s32(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToSigned<D>> v) {
  return VFromD<D>(vcvt_f32_s32(v.raw));
}

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> ConvertTo(D /* tag */, Vec128<uint32_t> v) {
  return Vec128<float>(vcvtq_f32_u32(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API VFromD<D> ConvertTo(D /* tag */, VFromD<RebindToUnsigned<D>> v) {
  return VFromD<D>(vcvt_f32_u32(v.raw));
}

#if HWY_HAVE_FLOAT64

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<int64_t> v) {
  return Vec128<double>(vcvtq_f64_s64(v.raw));
}
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<int64_t> v) {
// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return Set(Full64<double>(), static_cast<double>(GetLane(v)));
#else
  return Vec64<double>(vcvt_f64_s64(v.raw));
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> ConvertTo(D /* tag */, Vec128<uint64_t> v) {
  return Vec128<double>(vcvtq_f64_u64(v.raw));
}
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> ConvertTo(D /* tag */, Vec64<uint64_t> v) {
// GCC 6.5 and earlier are missing the 64-bit (non-q) intrinsic.
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
  return Set(Full64<double>(), static_cast<double>(GetLane(v)));
#else
  return Vec64<double>(vcvt_f64_u64(v.raw));
#endif  // HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700
}

#endif  // HWY_HAVE_FLOAT64
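
// Usage sketch (illustrative): integer to floating-point conversion, exact
// for values representable in the target type.
//   const Full128<float> df;
//   const RebindToSigned<decltype(df)> di;
//   const auto f = ConvertTo(df, Set(di, 3));  // 3.0f in each lane
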
namespace detail {
// Truncates (rounds toward zero).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I32_D(D)>
HWY_INLINE Vec128<int32_t> ConvertFToI(D /* tag */, Vec128<float> v) {
#if HWY_COMPILER_CLANG && \
    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
  // outside of the range of an int32_t.

  int32x4_t raw_result;
  __asm__(
#if HWY_ARCH_ARM_A64
      "fcvtzs %0.4s, %1.4s"
#else
      "vcvt.s32.f32 %0, %1"
#endif
      : "=w"(raw_result)
      : "w"(v.raw));
  return Vec128<int32_t>(raw_result);
#else
  return Vec128<int32_t>(vcvtq_s32_f32(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<RebindToFloat<D>> v) {
#if HWY_COMPILER_CLANG && \
    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
  // outside of the range of an int32_t.

  int32x2_t raw_result;
  __asm__(
#if HWY_ARCH_ARM_A64
      "fcvtzs %0.2s, %1.2s"
#else
      "vcvt.s32.f32 %0, %1"
#endif
      : "=w"(raw_result)
      : "w"(v.raw));
  return VFromD<D>(raw_result);
#else
  return VFromD<D>(vcvt_s32_f32(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U32_D(D)>
HWY_INLINE Vec128<uint32_t> ConvertFToU(D /* tag */, Vec128<float> v) {
#if HWY_COMPILER_CLANG && \
    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
  // outside of the range of a uint32_t.

  uint32x4_t raw_result;
  __asm__(
#if HWY_ARCH_ARM_A64
      "fcvtzu %0.4s, %1.4s"
#else
      "vcvt.u32.f32 %0, %1"
#endif
      : "=w"(raw_result)
      : "w"(v.raw));
  return Vec128<uint32_t>(raw_result);
#else
  return Vec128<uint32_t>(vcvtq_u32_f32(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<RebindToFloat<D>> v) {
#if HWY_COMPILER_CLANG && \
    ((HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200) || HWY_ARCH_ARM_V7)
  // If compiling for AArch64 NEON with Clang 11 or earlier or if compiling for
  // Armv7 NEON, use inline assembly to avoid undefined behavior if v[i] is
  // outside of the range of a uint32_t.

  uint32x2_t raw_result;
  __asm__(
#if HWY_ARCH_ARM_A64
      "fcvtzu %0.2s, %1.2s"
#else
      "vcvt.u32.f32 %0, %1"
#endif
      : "=w"(raw_result)
      : "w"(v.raw));
  return VFromD<D>(raw_result);
#else
  return VFromD<D>(vcvt_u32_f32(v.raw));
#endif
}

#if HWY_HAVE_FLOAT64

// Truncates (rounds toward zero).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I64_D(D)>
HWY_INLINE Vec128<int64_t> ConvertFToI(D /* tag */, Vec128<double> v) {
#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
  int64x2_t raw_result;
  __asm__("fcvtzs %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
  return Vec128<int64_t>(raw_result);
#else
  return Vec128<int64_t>(vcvtq_s64_f64(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_I64_D(D)>
HWY_INLINE Vec64<int64_t> ConvertFToI(D /* tag */, Vec64<double> v) {
#if HWY_ARCH_ARM_A64 && \
    ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an int64_t.
  // If compiling for AArch64 NEON with GCC 6 or earlier, use inline assembly to
  // work around the missing vcvt_s64_f64 intrinsic.
  int64x1_t raw_result;
  __asm__("fcvtzs %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
  return Vec64<int64_t>(raw_result);
#else
  return Vec64<int64_t>(vcvt_s64_f64(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U64_D(D)>
HWY_INLINE Vec128<uint64_t> ConvertFToU(D /* tag */, Vec128<double> v) {
#if HWY_COMPILER_CLANG && HWY_ARCH_ARM_A64 && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of a uint64_t.
  uint64x2_t raw_result;
  __asm__("fcvtzu %0.2d, %1.2d" : "=w"(raw_result) : "w"(v.raw));
  return Vec128<uint64_t>(raw_result);
#else
  return Vec128<uint64_t>(vcvtq_u64_f64(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_U64_D(D)>
HWY_INLINE Vec64<uint64_t> ConvertFToU(D /* tag */, Vec64<double> v) {
#if HWY_ARCH_ARM_A64 && \
    ((HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 700) || \
     (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200))
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of a uint64_t.

  // Inline assembly is also used if compiling for AArch64 NEON with GCC 6 or
  // earlier to work around the issue of the missing vcvt_u64_f64 intrinsic.
  uint64x1_t raw_result;
  __asm__("fcvtzu %d0, %d1" : "=w"(raw_result) : "w"(v.raw));
  return Vec64<uint64_t>(raw_result);
#else
  return Vec64<uint64_t>(vcvt_u64_f64(v.raw));
#endif
}

#endif  // HWY_HAVE_FLOAT64

#if HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16

// Truncates (rounds toward zero).
template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_I16_D(D)>
HWY_INLINE Vec128<int16_t> ConvertFToI(D /* tag */, Vec128<float16_t> v) {
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
  int16x8_t raw_result;
  __asm__("fcvtzs %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
  return Vec128<int16_t>(raw_result);
#else
  return Vec128<int16_t>(vcvtq_s16_f16(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_INLINE VFromD<D> ConvertFToI(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of an int16_t.
  int16x4_t raw_result;
  __asm__("fcvtzs %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
  return VFromD<D>(raw_result);
#else
  return VFromD<D>(vcvt_s16_f16(v.raw));
#endif
}

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_U16_D(D)>
HWY_INLINE Vec128<uint16_t> ConvertFToU(D /* tag */, Vec128<float16_t> v) {
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of a uint16_t.
  uint16x8_t raw_result;
  __asm__("fcvtzu %0.8h, %1.8h" : "=w"(raw_result) : "w"(v.raw));
  return Vec128<uint16_t>(raw_result);
#else
  return Vec128<uint16_t>(vcvtq_u16_f16(v.raw));
#endif
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
HWY_INLINE VFromD<D> ConvertFToU(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
#if HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1200
  // If compiling for AArch64 NEON with Clang 11 or earlier, use inline assembly
  // to avoid undefined behavior if v[i] is outside of the range of a uint16_t.
  uint16x4_t raw_result;
  __asm__("fcvtzu %0.4h, %1.4h" : "=w"(raw_result) : "w"(v.raw));
  return VFromD<D>(raw_result);
#else
  return VFromD<D>(vcvt_u16_f16(v.raw));
#endif
}

#endif  // HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16
}  // namespace detail

template <class D, HWY_IF_SIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(
              D, (1 << 4) |
                     ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
                     (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
HWY_API VFromD<D> ConvertTo(D di, VFromD<RebindToFloat<D>> v) {
  return detail::ConvertFToI(di, v);
}

template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(
              D, (1 << 4) |
                     ((HWY_ARCH_ARM_A64 && HWY_HAVE_FLOAT16) ? (1 << 2) : 0) |
                     (HWY_HAVE_FLOAT64 ? (1 << 8) : 0))>
HWY_API VFromD<D> ConvertTo(D du, VFromD<RebindToFloat<D>> v) {
  return detail::ConvertFToU(du, v);
}
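
// Float-to-integer ConvertTo truncates toward zero; on AArch64 the fcvtz*
// instructions also saturate out-of-range inputs. Sketch (illustrative):
//   const Full128<float> df;
//   const RebindToSigned<decltype(df)> di;
//   const auto i = ConvertTo(di, Set(df, -1.7f));  // -1 in each lane
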
// ------------------------------ PromoteTo (ConvertTo)

// Unsigned: zero-extend to full vector.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> PromoteTo(D /* tag */, Vec64<uint8_t> v) {
  return Vec128<uint16_t>(vmovl_u8(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> PromoteTo(D /* tag */, Vec32<uint8_t> v) {
  uint16x8_t a = vmovl_u8(v.raw);
  return Vec128<uint32_t>(vmovl_u16(vget_low_u16(a)));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> PromoteTo(D /* tag */, Vec64<uint16_t> v) {
  return Vec128<uint32_t>(vmovl_u16(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> PromoteTo(D /* tag */, Vec64<uint32_t> v) {
  return Vec128<uint64_t>(vmovl_u32(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> PromoteTo(D d, Vec64<uint8_t> v) {
  return BitCast(d, Vec128<uint16_t>(vmovl_u8(v.raw)));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D d, Vec32<uint8_t> v) {
  uint16x8_t a = vmovl_u8(v.raw);
  return BitCast(d, Vec128<uint32_t>(vmovl_u16(vget_low_u16(a))));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D d, Vec64<uint16_t> v) {
  return BitCast(d, Vec128<uint32_t>(vmovl_u16(v.raw)));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> PromoteTo(D d, Vec64<uint32_t> v) {
  return BitCast(d, Vec128<uint64_t>(vmovl_u32(v.raw)));
}

// Unsigned: zero-extend to half vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U16_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
  return VFromD<D>(vget_low_u16(vmovl_u8(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
  return VFromD<D>(vget_low_u32(vmovl_u16(vget_low_u16(vmovl_u8(v.raw)))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vget_low_u32(vmovl_u16(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_U64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  return VFromD<D>(vget_low_u64(vmovl_u32(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint8_t, D>> v) {
  using VU16 = VFromD<RebindToUnsigned<D>>;
  return BitCast(d, VU16(vget_low_u16(vmovl_u8(v.raw))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint8_t, D>> v) {
  const uint32x4_t u32 = vmovl_u16(vget_low_u16(vmovl_u8(v.raw)));
  return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(u32)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vget_low_s32(vreinterpretq_s32_u32(vmovl_u16(v.raw))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D d, VFromD<Rebind<uint32_t, D>> v) {
  using DU = RebindToUnsigned<D>;
  return BitCast(d, VFromD<DU>(vget_low_u64(vmovl_u32(v.raw))));
}
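
// Usage sketch (illustrative): the destination tag must have the same lane
// count as the source vector, e.g. promoting 8 x u8 to 8 x u16:
//   const Full128<uint16_t> d16;
//   const Full64<uint8_t> d8;  // 8 lanes, matching d16
//   const auto wide = PromoteTo(d16, Set(d8, uint8_t{255}));  // 255 per lane
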
// U8/U16 to U64/I64: First, zero-extend to U32, and then zero-extend to
// TFromD<D>
template <class D, class V, HWY_IF_UI64_D(D),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_UNSIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> PromoteTo(D d, V v) {
  const Rebind<uint32_t, decltype(d)> du32;
  return PromoteTo(d, PromoteTo(du32, v));
}

// Signed: replicate sign bit to full vector.
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> PromoteTo(D /* tag */, Vec64<int8_t> v) {
  return Vec128<int16_t>(vmovl_s8(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D /* tag */, Vec32<int8_t> v) {
  int16x8_t a = vmovl_s8(v.raw);
  return Vec128<int32_t>(vmovl_s16(vget_low_s16(a)));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteTo(D /* tag */, Vec64<int16_t> v) {
  return Vec128<int32_t>(vmovl_s16(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> PromoteTo(D /* tag */, Vec64<int32_t> v) {
  return Vec128<int64_t>(vmovl_s32(v.raw));
}

// Signed: replicate sign bit to half vector.
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I16_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
  return VFromD<D>(vget_low_s16(vmovl_s8(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int8_t, D>> v) {
  return VFromD<D>(vget_low_s32(vmovl_s16(vget_low_s16(vmovl_s8(v.raw)))));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vget_low_s32(vmovl_s16(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>(vget_low_s64(vmovl_s32(v.raw)));
}

// I8/I16 to I64: First, promote to I32, and then promote to I64
template <class D, class V, HWY_IF_I64_D(D),
          HWY_IF_LANES_D(D, HWY_MAX_LANES_V(V)), HWY_IF_SIGNED_V(V),
          HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> PromoteTo(D d, V v) {
  const Rebind<int32_t, decltype(d)> di32;
  return PromoteTo(d, PromoteTo(di32, v));
}

#if HWY_NEON_HAVE_F16C

// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
#ifdef HWY_NATIVE_F16C
#undef HWY_NATIVE_F16C
#else
#define HWY_NATIVE_F16C
#endif

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> PromoteTo(D /* tag */, Vec64<float16_t> v) {
  return Vec128<float>(vcvt_f32_f16(v.raw));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteTo(D /* tag */, VFromD<Rebind<float16_t, D>> v) {
  return VFromD<D>(vget_low_f32(vcvt_f32_f16(v.raw)));
}

#endif  // HWY_NEON_HAVE_F16C

#if HWY_HAVE_FLOAT64

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<float> v) {
  return Vec128<double>(vcvt_f64_f32(v.raw));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> PromoteTo(D /* tag */, Vec32<float> v) {
  return Vec64<double>(vget_low_f64(vcvt_f64_f32(v.raw)));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<int32_t> v) {
  const int64x2_t i64 = vmovl_s32(v.raw);
  return Vec128<double>(vcvtq_f64_s64(i64));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> PromoteTo(D d, Vec32<int32_t> v) {
  return ConvertTo(d, Vec64<int64_t>(vget_low_s64(vmovl_s32(v.raw))));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteTo(D /* tag */, Vec64<uint32_t> v) {
  const uint64x2_t u64 = vmovl_u32(v.raw);
  return Vec128<double>(vcvtq_f64_u64(u64));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> PromoteTo(D d, Vec32<uint32_t> v) {
  return ConvertTo(d, Vec64<uint64_t>(vget_low_u64(vmovl_u32(v.raw))));
}

template <class D, HWY_IF_UI64_D(D)>
HWY_API VFromD<D> PromoteTo(D d64, VFromD<Rebind<float, D>> v) {
  const RebindToFloat<decltype(d64)> df64;
  return ConvertTo(d64, PromoteTo(df64, v));
}

#else  // !HWY_HAVE_FLOAT64

template <class D, HWY_IF_I64_D(D)>
HWY_API VFromD<D> PromoteTo(D di64, VFromD<Rebind<float, D>> v) {
  const Rebind<int32_t, decltype(di64)> di32;
  const RebindToFloat<decltype(di32)> df32;
  const RebindToUnsigned<decltype(di32)> du32;
  const Repartition<uint8_t, decltype(du32)> du32_as_du8;

  const auto exponent_adj = BitCast(
      du32,
      Min(SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
                       BitCast(du32_as_du8, Set(du32, uint32_t{157}))),
          BitCast(du32_as_du8, Set(du32, uint32_t{32}))));
  const auto adj_v =
      BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));

  const auto f32_to_i32_result = ConvertTo(di32, adj_v);
  const auto lo64_or_mask = PromoteTo(
      di64,
      BitCast(du32, VecFromMask(di32, Eq(f32_to_i32_result,
                                         Set(di32, LimitsMax<int32_t>())))));

  return Or(PromoteTo(di64, BitCast(di32, f32_to_i32_result))
                << PromoteTo(di64, exponent_adj),
            lo64_or_mask);
}
  const auto f32_to_u32_result = ConvertTo(du32, adj_v);
  const auto lo32_or_mask = PromoteTo(
      du64,
      VecFromMask(du32, f32_to_u32_result == Set(du32, LimitsMax<uint32_t>())));

  return Or(PromoteTo(du64, f32_to_u32_result) << PromoteTo(du64, exponent_adj),
            lo32_or_mask);
}

#ifdef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#undef HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#else
#define HWY_NATIVE_F32_TO_UI64_PROMOTE_IN_RANGE_TO
#endif

template <class D, HWY_IF_UI64_D(D)>
HWY_API VFromD<D> PromoteInRangeTo(D d64, VFromD<Rebind<float, D>> v) {
  const Rebind<MakeNarrow<TFromD<D>>, decltype(d64)> d32;
  const RebindToFloat<decltype(d32)> df32;
  const RebindToUnsigned<decltype(d32)> du32;
  const Repartition<uint8_t, decltype(d32)> du32_as_du8;

  constexpr uint32_t kExpAdjDecr =
      0xFFFFFF9Du + static_cast<uint32_t>(!IsSigned<TFromD<D>>());

  const auto exponent_adj = BitCast(
      du32, SaturatedSub(BitCast(du32_as_du8, ShiftRight<23>(BitCast(du32, v))),
                         BitCast(du32_as_du8, Set(du32, kExpAdjDecr))));
  const auto adj_v =
      BitCast(df32, BitCast(du32, v) - ShiftLeft<23>(exponent_adj));

  return PromoteTo(d64, ConvertTo(d32, adj_v)) << PromoteTo(d64, exponent_adj);
}

#endif  // HWY_HAVE_FLOAT64
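
// How the exponent adjustment above works (informal sketch): without native
// f64, ConvertTo can only target 32-bit integers, so a float whose biased
// exponent e (bits 23..30) indicates a magnitude >= 2^31 (signed) or 2^32
// (unsigned) would overflow the 32-bit conversion. The code computes a
// per-lane exponent_adj = clamp(e - 157, 0, 32) (157 = 127 + 30 for the
// signed case; 158 for unsigned), subtracts it from the exponent field
// (dividing the value by 2^exponent_adj), converts, then shifts the widened
// result left by exponent_adj to restore the magnitude. A scalar model of
// the unsigned in-range case (hypothetical helper, exposition only):
//   uint64_t PromoteF32ToU64(float f) {  // assumes f >= 0 and f < 2^64
//     uint32_t bits;
//     memcpy(&bits, &f, sizeof(bits));  // <cstring>; safe type-pun
//     const uint32_t e = bits >> 23;    // biased exponent (sign bit clear)
//     const uint32_t adj = (e <= 158) ? 0 : HWY_MIN(e - 158, 32u);
//     bits -= adj << 23;                // scale down by 2^adj
//     float scaled;
//     memcpy(&scaled, &bits, sizeof(scaled));
//     return static_cast<uint64_t>(scaled) << adj;
//   }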

// ------------------------------ PromoteEvenTo/PromoteOddTo
#include "hwy/ops/inside-inl.h"

// ------------------------------ PromoteUpperTo

#if HWY_ARCH_ARM_A64

// Per-target flag to prevent generic_ops-inl.h from defining PromoteUpperTo.
#ifdef HWY_NATIVE_PROMOTE_UPPER_TO
#undef HWY_NATIVE_PROMOTE_UPPER_TO
#else
#define HWY_NATIVE_PROMOTE_UPPER_TO
#endif

// Unsigned: zero-extend to full vector.
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec128<uint16_t> PromoteUpperTo(D /* tag */, Vec128<uint8_t> v) {
  return Vec128<uint16_t>(vmovl_high_u8(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec128<uint32_t> PromoteUpperTo(D /* tag */, Vec128<uint16_t> v) {
  return Vec128<uint32_t>(vmovl_high_u16(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec128<uint64_t> PromoteUpperTo(D /* tag */, Vec128<uint32_t> v) {
  return Vec128<uint64_t>(vmovl_high_u32(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> PromoteUpperTo(D d, Vec128<uint8_t> v) {
  return BitCast(d, Vec128<uint16_t>(vmovl_high_u8(v.raw)));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteUpperTo(D d, Vec128<uint16_t> v) {
  return BitCast(d, Vec128<uint32_t>(vmovl_high_u16(v.raw)));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> PromoteUpperTo(D d, Vec128<uint32_t> v) {
  return BitCast(d, Vec128<uint64_t>(vmovl_high_u32(v.raw)));
}

// Signed: replicate sign bit to full vector.
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec128<int16_t> PromoteUpperTo(D /* tag */, Vec128<int8_t> v) {
  return Vec128<int16_t>(vmovl_high_s8(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec128<int32_t> PromoteUpperTo(D /* tag */, Vec128<int16_t> v) {
  return Vec128<int32_t>(vmovl_high_s16(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec128<int64_t> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) {
  return Vec128<int64_t>(vmovl_high_s32(v.raw));
}

#if HWY_NEON_HAVE_F16C

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec128<float> PromoteUpperTo(D /* tag */, Vec128<float16_t> v) {
  return Vec128<float>(vcvt_high_f32_f16(v.raw));
}

#endif  // HWY_NEON_HAVE_F16C

template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_F32_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D df32, VFromD<Repartition<bfloat16_t, D>> v) {
  const Repartition<uint16_t, decltype(df32)> du16;
  const RebindToSigned<decltype(df32)> di32;
  return BitCast(df32, ShiftLeft<16>(PromoteUpperTo(di32, BitCast(du16, v))));
}

#if HWY_HAVE_FLOAT64

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteUpperTo(D /* tag */, Vec128<float> v) {
  return Vec128<double>(vcvt_high_f64_f32(v.raw));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteUpperTo(D /* tag */, Vec128<int32_t> v) {
  const int64x2_t i64 = vmovl_high_s32(v.raw);
  return Vec128<double>(vcvtq_f64_s64(i64));
}

template <class D, HWY_IF_F64_D(D)>
HWY_API Vec128<double> PromoteUpperTo(D /* tag */, Vec128<uint32_t> v) {
  const uint64x2_t u64 = vmovl_high_u32(v.raw);
  return Vec128<double>(vcvtq_f64_u64(u64));
}

#endif  // HWY_HAVE_FLOAT64

template <class D, HWY_IF_UI64_D(D)>
HWY_API VFromD<D> PromoteUpperTo(D d64, Vec128<float> v) {
#if HWY_HAVE_FLOAT64
  const RebindToFloat<decltype(d64)> df64;
  return ConvertTo(d64, PromoteUpperTo(df64, v));
#else
  const Rebind<float, decltype(d64)> dh;
  return PromoteTo(d64, UpperHalf(dh, v));
#endif
}

// Generic version for <=64 bit input/output (_high is only for full vectors).
template <class D, HWY_IF_V_SIZE_LE_D(D, 8), class V>
HWY_API VFromD<D> PromoteUpperTo(D d, V v) {
  const Rebind<TFromV<V>, decltype(d)> dh;
  return PromoteTo(d, UpperHalf(dh, v));
}

#endif  // HWY_ARCH_ARM_A64
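
// Example (illustrative): PromoteUpperTo widens the upper half of its input,
// so for full vectors it is a cheaper equivalent of
// PromoteTo(d16, UpperHalf(dh, v)):
//   const hn::Full128<uint8_t> d8;
//   const hn::Full128<uint16_t> d16;
//   const auto v = hn::Iota(d8, 0);                 // 0,1,...,15
//   const auto upper = hn::PromoteUpperTo(d16, v);  // 8,9,...,15 as uint16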

// ------------------------------ DemoteTo (ConvertTo)

// From full vector to half or quarter
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  return Vec64<uint16_t>(vqmovun_s32(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  return Vec64<int16_t>(vqmovn_s32(v.raw));
}
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  const uint16x4_t a = vqmovun_s32(v.raw);
  return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
}
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec128<int16_t> v) {
  return Vec64<uint8_t>(vqmovun_s16(v.raw));
}
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec32<int8_t> DemoteTo(D /* tag */, Vec128<int32_t> v) {
  const int16x4_t a = vqmovn_s32(v.raw);
  return Vec32<int8_t>(vqmovn_s16(vcombine_s16(a, a)));
}
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> DemoteTo(D /* tag */, Vec128<int16_t> v) {
  return Vec64<int8_t>(vqmovn_s16(v.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> DemoteTo(D /* tag */, Vec128<uint32_t> v) {
  return Vec64<uint16_t>(vqmovn_u32(v.raw));
}
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec32<uint8_t> DemoteTo(D /* tag */, Vec128<uint32_t> v) {
  const uint16x4_t a = vqmovn_u32(v.raw);
  return Vec32<uint8_t>(vqmovn_u16(vcombine_u16(a, a)));
}
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> DemoteTo(D /* tag */, Vec128<uint16_t> v) {
  return Vec64<uint8_t>(vqmovn_u16(v.raw));
}

// From half vector to partial half
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>(vqmovun_s32(vcombine_s32(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  return VFromD<D>(vqmovn_s32(vcombine_s32(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  const uint16x4_t a = vqmovun_s32(vcombine_s32(v.raw, v.raw));
  return VFromD<D>(vqmovn_u16(vcombine_u16(a, a)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vqmovun_s16(vcombine_s16(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_I8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int32_t, D>> v) {
  const int16x4_t a = vqmovn_s32(vcombine_s32(v.raw, v.raw));
  return VFromD<D>(vqmovn_s16(vcombine_s16(a, a)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_I8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<int16_t, D>> v) {
  return VFromD<D>(vqmovn_s16(vcombine_s16(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  return VFromD<D>(vqmovn_u32(vcombine_u32(v.raw, v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 2), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const uint16x4_t a = vqmovn_u32(vcombine_u32(v.raw, v.raw));
  return VFromD<D>(vqmovn_u16(vcombine_u16(a, a)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_U8_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  return VFromD<D>(vqmovn_u16(vcombine_u16(v.raw, v.raw)));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> DemoteTo(D /* tag */, Vec128<int64_t> v) {
  return Vec64<int32_t>(vqmovn_s64(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<int64_t> v) {
  return Vec64<uint32_t>(vqmovun_s64(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> DemoteTo(D /* tag */, Vec128<uint64_t> v) {
  return Vec64<uint32_t>(vqmovn_u64(v.raw));
}
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_SIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D d, Vec128<int64_t> v) {
  const Rebind<int32_t, D> di32;
  return DemoteTo(d, DemoteTo(di32, v));
}
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_UNSIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D d, Vec128<int64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}
template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2)),
          HWY_IF_UNSIGNED_D(D)>
HWY_API VFromD<D> DemoteTo(D d, Vec128<uint64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}

template <class D, HWY_IF_I32_D(D)>
HWY_API Vec32<int32_t> DemoteTo(D /* tag */, Vec64<int64_t> v) {
  return Vec32<int32_t>(vqmovn_s64(vcombine_s64(v.raw, v.raw)));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<int64_t> v) {
  return Vec32<uint32_t>(vqmovun_s64(vcombine_s64(v.raw, v.raw)));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec32<uint32_t> DemoteTo(D /* tag */, Vec64<uint64_t> v) {
  return Vec32<uint32_t>(vqmovn_u64(vcombine_u64(v.raw, v.raw)));
}
template <class D, HWY_IF_SIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> DemoteTo(D d, Vec64<int64_t> v) {
  const Rebind<int32_t, D> di32;
  return DemoteTo(d, DemoteTo(di32, v));
}
template <class D, HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> DemoteTo(D d, Vec64<int64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}
template <class D, HWY_IF_LANES_D(D, 1), HWY_IF_UNSIGNED_D(D),
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))>
HWY_API VFromD<D> DemoteTo(D d, Vec64<uint64_t> v) {
  const Rebind<uint32_t, D> du32;
  return DemoteTo(d, DemoteTo(du32, v));
}
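
// Example (illustrative): these DemoteTo overloads saturate rather than
// truncate, per the vqmovn/vqmovun semantics:
//   const hn::Full128<int32_t> d32;
//   const hn::Rebind<uint16_t, decltype(d32)> du16;
//   alignas(16) const int32_t in[4] = {70000, -5, 123, 65535};
//   // DemoteTo(du16, Load(d32, in)) -> {65535, 0, 123, 65535}: lanes clamp
//   // to the target type's limits; negative inputs to unsigned clamp to 0.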

#if HWY_NEON_HAVE_F16C

// We already toggled HWY_NATIVE_F16C above.

template <class D, HWY_IF_F16_D(D)>
HWY_API Vec64<float16_t> DemoteTo(D /* tag */, Vec128<float> v) {
  return Vec64<float16_t>{vcvt_f16_f32(v.raw)};
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_F16_D(D)>
HWY_API VFromD<D> DemoteTo(D /* tag */, VFromD<Rebind<float, D>> v) {
  return VFromD<D>(vcvt_f16_f32(vcombine_f32(v.raw, v.raw)));
}

#endif  // HWY_NEON_HAVE_F16C

#if HWY_NEON_HAVE_F32_TO_BF16C
#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
#else
#define HWY_NATIVE_DEMOTE_F32_TO_BF16
#endif

namespace detail {
#if HWY_NEON_HAVE_BFLOAT16
// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Vec128<bfloat16_t, N>::type is
// bfloat16x4_t or bfloat16x8_t.
static HWY_INLINE bfloat16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
  return raw;
}
#else
// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16,
// detail::Vec128<bfloat16_t, N>::type is a uint16x4_t or uint16x8_t vector,
// which works around compiler bugs present in GCC 13 or earlier and in
// Clang 16 or earlier on AArch64.

// In that case, the bfloat16x4_t vector returned by vcvt_bf16_f32 must be
// bitcast to a uint16x4_t vector.
static HWY_INLINE uint16x4_t BitCastFromRawNeonBF16(bfloat16x4_t raw) {
  return vreinterpret_u16_bf16(raw);
}
#endif
}  // namespace detail

template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
  return VFromD<D>(detail::BitCastFromRawNeonBF16(vcvt_bf16_f32(v.raw)));
}
template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_IF_BF16_D(D)>
HWY_API VFromD<D> DemoteTo(D /*dbf16*/, VFromD<Rebind<float, D>> v) {
  return VFromD<D>(detail::BitCastFromRawNeonBF16(
      vcvt_bf16_f32(vcombine_f32(v.raw, v.raw))));
}
#endif  // HWY_NEON_HAVE_F32_TO_BF16C

#if HWY_HAVE_FLOAT64

template <class D, HWY_IF_F32_D(D)>
HWY_API Vec64<float> DemoteTo(D /* tag */, Vec128<double> v) {
  return Vec64<float>(vcvt_f32_f64(v.raw));
}
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec32<float> DemoteTo(D /* tag */, Vec64<double> v) {
  return Vec32<float>(vcvt_f32_f64(vcombine_f64(v.raw, v.raw)));
}

template <class D, HWY_IF_UI32_D(D)>
HWY_API VFromD<D> DemoteTo(D d32, VFromD<Rebind<double, D>> v) {
  const Rebind<MakeWide<TFromD<D>>, D> d64;
  return DemoteTo(d32, ConvertTo(d64, v));
}

#endif  // HWY_HAVE_FLOAT64

template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<int64_t, D>> v) {
  const Rebind<int64_t, decltype(df32)> di64;
  const RebindToUnsigned<decltype(di64)> du64;

#if HWY_ARCH_ARM_A64
  const RebindToFloat<decltype(du64)> df64;

  const auto k2p64_63 = Set(df64, 27670116110564327424.0);
  const auto f64_hi52 =
      Xor(BitCast(df64, ShiftRight<12>(BitCast(du64, v))), k2p64_63) - k2p64_63;
  const auto f64_lo12 =
      ConvertTo(df64, And(BitCast(du64, v), Set(du64, uint64_t{0x00000FFF})));

  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;

  const auto f64_sum_is_inexact =
      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));
  const auto f64_bits_decrement =
      And(ShiftRight<63>(BitCast(du64, Xor(f64_sum, f64_carry))),
          f64_sum_is_inexact);

  const auto adj_f64_val = BitCast(
      df64,
      Or(BitCast(du64, f64_sum) - f64_bits_decrement, f64_sum_is_inexact));

  return DemoteTo(df32, adj_f64_val);
#else
  const RebindToUnsigned<decltype(df32)> du32;
  const auto hi23 = TruncateTo(du32, ShiftRight<41>(BitCast(du64, v)));
  const auto mid23 = And(TruncateTo(du32, ShiftRight<18>(BitCast(du64, v))),
                         Set(du32, uint32_t{0x007FFFFFu}));
  const auto lo18 =
      And(TruncateTo(du32, BitCast(du64, v)), Set(du32, uint32_t{0x0003FFFFu}));

  const auto k2p41_f32 = Set(df32, 2199023255552.0f);
  const auto k2p64_63_f32 = Set(df32, 27670116110564327424.0f);

  const auto hi23_f32 =
      BitCast(df32, Xor(hi23, BitCast(du32, k2p64_63_f32))) - k2p64_63_f32;
  const auto mid23_f32 =
      BitCast(df32, Or(mid23, BitCast(du32, k2p41_f32))) - k2p41_f32;
  const auto lo18_f32 = ConvertTo(df32, lo18);

  const auto s_hi46 = hi23_f32 + mid23_f32;
  const auto c_hi46 = (hi23_f32 - s_hi46) + mid23_f32;

  auto s_lo = c_hi46 + lo18_f32;
  const auto c_lo = (c_hi46 - s_lo) + lo18_f32;

  const auto s_lo_inexact_mask =
      VecFromMask(du32, RebindMask(du32, c_lo != Zero(df32)));
  const auto s_lo_mag_adj = ShiftRight<31>(
      And(s_lo_inexact_mask, Xor(BitCast(du32, s_lo), BitCast(du32, c_lo))));

  s_lo = BitCast(df32, BitCast(du32, s_lo) - s_lo_mag_adj);
  s_lo =
      BitCast(df32, Or(BitCast(du32, s_lo), ShiftRight<31>(s_lo_inexact_mask)));
  return s_hi46 + s_lo;
#endif
}
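
// Rationale for the adjustment above (sketch): converting int64 to float in
// two steps (int64 -> f64, then f64 -> f32) can round twice and end up one
// ulp off. The code forms an exact sum/carry pair (f64_sum + f64_carry equals
// hi52 + lo12 exactly); whenever the carry is nonzero, it nudges f64_sum back
// toward zero if it had rounded away, and sets the lowest mantissa bit
// ("round to odd") so that sticky information survives the final f64 -> f32
// rounding.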

template <class D, HWY_IF_F32_D(D)>
HWY_API VFromD<D> DemoteTo(D df32, VFromD<Rebind<uint64_t, D>> v) {
#if HWY_ARCH_ARM_A64
  const Rebind<uint64_t, decltype(df32)> du64;
  const RebindToFloat<decltype(du64)> df64;

  const auto k2p64 = Set(df64, 18446744073709551616.0);
  const auto f64_hi52 = Or(BitCast(df64, ShiftRight<12>(v)), k2p64) - k2p64;
  const auto f64_lo12 =
      ConvertTo(df64, And(v, Set(du64, uint64_t{0x00000FFF})));

  const auto f64_sum = f64_hi52 + f64_lo12;
  const auto f64_carry = (f64_hi52 - f64_sum) + f64_lo12;
  const auto f64_sum_is_inexact =
      ShiftRight<63>(BitCast(du64, VecFromMask(df64, f64_carry != Zero(df64))));

  const auto adj_f64_val = BitCast(
      df64,
      Or(BitCast(du64, f64_sum) - ShiftRight<63>(BitCast(du64, f64_carry)),
         f64_sum_is_inexact));

  return DemoteTo(df32, adj_f64_val);
#else
  const RebindToUnsigned<decltype(df32)> du32;

  const auto hi23 = TruncateTo(du32, ShiftRight<41>(v));
  const auto mid23 = And(TruncateTo(du32, ShiftRight<18>(v)),
                         Set(du32, uint32_t{0x007FFFFFu}));
  const auto lo18 = And(TruncateTo(du32, v), Set(du32, uint32_t{0x0003FFFFu}));

  const auto k2p41_f32 = Set(df32, 2199023255552.0f);
  const auto k2p64_f32 = Set(df32, 18446744073709551616.0f);

  const auto hi23_f32 =
      BitCast(df32, Or(hi23, BitCast(du32, k2p64_f32))) - k2p64_f32;
  const auto mid23_f32 =
      BitCast(df32, Or(mid23, BitCast(du32, k2p41_f32))) - k2p41_f32;
  const auto lo18_f32 = ConvertTo(df32, lo18);

  const auto s_hi46 = hi23_f32 + mid23_f32;
  const auto c_hi46 = (hi23_f32 - s_hi46) + mid23_f32;

  auto s_lo = c_hi46 + lo18_f32;
  const auto c_lo = (c_hi46 - s_lo) + lo18_f32;

  const auto s_lo_inexact_mask =
      VecFromMask(du32, RebindMask(du32, c_lo != Zero(df32)));
  const auto s_lo_mag_adj = ShiftRight<31>(
      And(s_lo_inexact_mask, Xor(BitCast(du32, s_lo), BitCast(du32, c_lo))));

  s_lo = BitCast(df32, BitCast(du32, s_lo) - s_lo_mag_adj);
  s_lo =
      BitCast(df32, Or(BitCast(du32, s_lo), ShiftRight<31>(s_lo_inexact_mask)));
  return s_hi46 + s_lo;
#endif
}

HWY_API Vec32<uint8_t> U8FromU32(Vec128<uint32_t> v) {
  const uint8x16_t org_v = detail::BitCastToByte(v).raw;
  const uint8x16_t w = vuzp1q_u8(org_v, org_v);
  return Vec32<uint8_t>(vget_low_u8(vuzp1q_u8(w, w)));
}
template <size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8)>
HWY_API Vec128<uint8_t, N> U8FromU32(Vec128<uint32_t, N> v) {
  const uint8x8_t org_v = detail::BitCastToByte(v).raw;
  const uint8x8_t w = vuzp1_u8(org_v, org_v);
  return Vec128<uint8_t, N>(vuzp1_u8(w, w));
}

// ------------------------------ Round (IfThenElse, mask, logical)

#if HWY_ARCH_ARM_A64
// Toward nearest integer
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Round, vrndn, _, 1)

// Toward zero, aka truncate
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Trunc, vrnd, _, 1)

// Toward +infinity, aka ceiling
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Ceil, vrndp, _, 1)

// Toward -infinity, aka floor
HWY_NEON_DEF_FUNCTION_ALL_FLOATS(Floor, vrndm, _, 1)
#else

// ------------------------------ Trunc

// Armv7 only supports truncation to integer. We can either convert back to
// float (3 floating-point and 2 logic operations) or manipulate the binary32
// representation, clearing the lowest 23-exp mantissa bits. This requires 9
// integer operations and 3 constants, which is likely more expensive.

namespace detail {

// The original value is already the desired result if NaN or the magnitude is
// large (i.e. the value is already an integer).
template <size_t N>
HWY_INLINE Mask128<float, N> UseInt(const Vec128<float, N> v) {
  return Abs(v) < Set(Simd<float, N, 0>(), MantissaEnd<float>());
}

}  // namespace detail

template <size_t N>
HWY_API Vec128<float, N> Trunc(const Vec128<float, N> v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  return IfThenElse(detail::UseInt(v), int_f, v);
}

template <size_t N>
HWY_API Vec128<float, N> Round(const Vec128<float, N> v) {
  const DFromV<decltype(v)> df;

  // Armv7 also lacks a native NearestInt, but we can instead rely on rounding
  // (we assume the current mode is nearest-even) after addition with a large
  // value such that no mantissa bits remain. We may need a compiler flag for
  // precise floating-point to prevent this from being "optimized" out.
  const auto max = Set(df, MantissaEnd<float>());
  const auto large = CopySignToAbs(max, v);
  const auto added = large + v;
  const auto rounded = added - large;

  // Keep original if NaN or the magnitude is large (already an int).
  return IfThenElse(Abs(v) < max, rounded, v);
}

template <size_t N>
HWY_API Vec128<float, N> Ceil(const Vec128<float, N> v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a positive non-integer ends up smaller; if so, add 1.
  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f < v)));

  return IfThenElse(detail::UseInt(v), int_f - neg1, v);
}

template <size_t N>
HWY_API Vec128<float, N> Floor(const Vec128<float, N> v) {
  const DFromV<decltype(v)> df;
  const RebindToSigned<decltype(df)> di;

  const auto integer = ConvertTo(di, v);  // round toward 0
  const auto int_f = ConvertTo(df, integer);

  // Truncating a negative non-integer ends up larger; if so, subtract 1.
  const auto neg1 = ConvertTo(df, VecFromMask(di, RebindMask(di, int_f > v)));

  return IfThenElse(detail::UseInt(v), int_f + neg1, v);
}

#endif  // HWY_ARCH_ARM_A64
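
// Scalar model of the trick used in Round above (illustrative only; assumes
// the default round-to-nearest-even mode and std::fabs/std::copysign from
// <cmath>):
//   float RoundF32(float v) {
//     const float bound = 8388608.0f;  // 2^23 == MantissaEnd<float>()
//     if (!(std::fabs(v) < bound)) return v;  // large magnitude or NaN
//     const float large = std::copysign(bound, v);
//     // Adding +-2^23 leaves no fractional mantissa bits, so the FPU rounds
//     // to nearest-even; subtracting it back restores the magnitude.
//     return (v + large) - large;
//   }
//
// Note on the neg1 idiom in Ceil/Floor above: VecFromMask yields all-ones
// integer lanes (-1) where the comparison holds, and ConvertTo turns those
// into -1.0f. Thus int_f - neg1 adds exactly 1 in lanes where truncation fell
// below v (Ceil), and int_f + neg1 subtracts 1 where it rose above v (Floor).
// E.g. Floor(-1.5f): truncation gives -1.0f, int_f > v holds, neg1 == -1.0f,
// and the result is -2.0f.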

// ------------------------------ CeilInt/FloorInt
#if HWY_ARCH_ARM_A64

#ifdef HWY_NATIVE_CEIL_FLOOR_INT
#undef HWY_NATIVE_CEIL_FLOOR_INT
#else
#define HWY_NATIVE_CEIL_FLOOR_INT
#endif

#if HWY_HAVE_FLOAT16
HWY_API Vec128<int16_t> CeilInt(const Vec128<float16_t> v) {
  return Vec128<int16_t>(vcvtpq_s16_f16(v.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
HWY_API Vec128<int16_t, N> CeilInt(const Vec128<float16_t, N> v) {
  return Vec128<int16_t, N>(vcvtp_s16_f16(v.raw));
}

HWY_API Vec128<int16_t> FloorInt(const Vec128<float16_t> v) {
  return Vec128<int16_t>(vcvtmq_s16_f16(v.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
HWY_API Vec128<int16_t, N> FloorInt(const Vec128<float16_t, N> v) {
  return Vec128<int16_t, N>(vcvtm_s16_f16(v.raw));
}
#endif  // HWY_HAVE_FLOAT16

HWY_API Vec128<int32_t> CeilInt(const Vec128<float> v) {
  return Vec128<int32_t>(vcvtpq_s32_f32(v.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<int32_t, N> CeilInt(const Vec128<float, N> v) {
  return Vec128<int32_t, N>(vcvtp_s32_f32(v.raw));
}

HWY_API Vec128<int64_t> CeilInt(const Vec128<double> v) {
  return Vec128<int64_t>(vcvtpq_s64_f64(v.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
HWY_API Vec128<int64_t, N> CeilInt(const Vec128<double, N> v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
  // Workaround for missing vcvtp_s64_f64 intrinsic
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const Twice<decltype(d)> dt;
  return LowerHalf(di, CeilInt(Combine(dt, v, v)));
#else
  return Vec128<int64_t, N>(vcvtp_s64_f64(v.raw));
#endif
}

HWY_API Vec128<int32_t> FloorInt(const Vec128<float> v) {
  return Vec128<int32_t>(vcvtmq_s32_f32(v.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<int32_t, N> FloorInt(const Vec128<float, N> v) {
  return Vec128<int32_t, N>(vcvtm_s32_f32(v.raw));
}

HWY_API Vec128<int64_t> FloorInt(const Vec128<double> v) {
  return Vec128<int64_t>(vcvtmq_s64_f64(v.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
HWY_API Vec128<int64_t, N> FloorInt(const Vec128<double, N> v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
  // Workaround for missing vcvtm_s64_f64 intrinsic
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const Twice<decltype(d)> dt;
  return LowerHalf(di, FloorInt(Combine(dt, v, v)));
#else
  return Vec128<int64_t, N>(vcvtm_s64_f64(v.raw));
#endif
}

#endif  // HWY_ARCH_ARM_A64

// ------------------------------ NearestInt (Round)

#if HWY_HAVE_FLOAT16
HWY_API Vec128<int16_t> NearestInt(const Vec128<float16_t> v) {
  return Vec128<int16_t>(vcvtnq_s16_f16(v.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8)>
HWY_API Vec128<int16_t, N> NearestInt(const Vec128<float16_t, N> v) {
  return Vec128<int16_t, N>(vcvtn_s16_f16(v.raw));
}
#endif  // HWY_HAVE_FLOAT16

#if HWY_ARCH_ARM_A64

HWY_API Vec128<int32_t> NearestInt(const Vec128<float> v) {
  return Vec128<int32_t>(vcvtnq_s32_f32(v.raw));
}
template <size_t N, HWY_IF_V_SIZE_LE(float, N, 8)>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  return Vec128<int32_t, N>(vcvtn_s32_f32(v.raw));
}

HWY_API Vec128<int64_t> NearestInt(const Vec128<double> v) {
  return Vec128<int64_t>(vcvtnq_s64_f64(v.raw));
}

template <size_t N, HWY_IF_V_SIZE_LE(double, N, 8)>
HWY_API Vec128<int64_t, N> NearestInt(const Vec128<double, N> v) {
#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 610
  // Workaround for missing vcvtn_s64_f64 intrinsic
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  const Twice<decltype(d)> dt;
  return LowerHalf(di, NearestInt(Combine(dt, v, v)));
#else
  return Vec128<int64_t, N>(vcvtn_s64_f64(v.raw));
#endif
}

template <class DI32, HWY_IF_I32_D(DI32)>
HWY_API VFromD<DI32> DemoteToNearestInt(DI32 di32,
                                        VFromD<Rebind<double, DI32>> v) {
  return DemoteTo(di32, NearestInt(v));
}

#else

template <size_t N>
HWY_API Vec128<int32_t, N> NearestInt(const Vec128<float, N> v) {
  const RebindToSigned<DFromV<decltype(v)>> di;
  return ConvertTo(di, Round(v));
}

#endif  // HWY_ARCH_ARM_A64
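
// Behavior note: NearestInt rounds to nearest with ties to even. On A64 this
// is the FCVTN* semantics; the Armv7 fallback inherits it from Round, which
// assumes the default nearest-even FP mode. Illustrative values:
//   NearestInt applied to {0.5f, 1.5f, 2.5f, -0.5f} yields {0, 2, 2, 0}.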

// ------------------------------ Floating-point classification

#if !HWY_COMPILER_CLANG || HWY_COMPILER_CLANG > 1801 || HWY_ARCH_ARM_V7
template <typename T, size_t N>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  return v != v;
}
#else
// Clang up to 18.1 generates less efficient code than the expected FCMEQ, see
// https://github.com/numpy/numpy/issues/27313 and
// https://github.com/numpy/numpy/pull/22954/files and
// https://github.com/llvm/llvm-project/issues/59855

#if HWY_HAVE_FLOAT16
template <typename T, size_t N, HWY_IF_F16(T), HWY_IF_V_SIZE(T, N, 16)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %0.8h, %1.8h, %1.8h" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}
template <typename T, size_t N, HWY_IF_F16(T), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %0.4h, %1.4h, %1.4h" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}
#endif  // HWY_HAVE_FLOAT16

template <typename T, size_t N, HWY_IF_F32(T), HWY_IF_V_SIZE(T, N, 16)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %0.4s, %1.4s, %1.4s" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}
template <typename T, size_t N, HWY_IF_F32(T), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %0.2s, %1.2s, %1.2s" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}

#if HWY_HAVE_FLOAT64
template <typename T, size_t N, HWY_IF_F64(T), HWY_IF_V_SIZE(T, N, 16)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %0.2d, %1.2d, %1.2d" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}
template <typename T, size_t N, HWY_IF_F64(T), HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Mask128<T, N> IsNaN(const Vec128<T, N> v) {
  typename Mask128<T, N>::Raw ret;
  __asm__ volatile("fcmeq %d0, %d1, %d1" : "=w"(ret) : "w"(v.raw));
  return Not(Mask128<T, N>(ret));
}
#endif  // HWY_HAVE_FLOAT64

#endif  // HWY_COMPILER_CLANG

// ================================================== SWIZZLE

// ------------------------------ LowerHalf

// <= 64 bit: just return different type
template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
HWY_API Vec128<T, N / 2> LowerHalf(Vec128<T, N> v) {
  return Vec128<T, N / 2>(v.raw);
}

HWY_API Vec64<uint8_t> LowerHalf(Vec128<uint8_t> v) {
  return Vec64<uint8_t>(vget_low_u8(v.raw));
}
HWY_API Vec64<uint16_t> LowerHalf(Vec128<uint16_t> v) {
  return Vec64<uint16_t>(vget_low_u16(v.raw));
}
HWY_API Vec64<uint32_t> LowerHalf(Vec128<uint32_t> v) {
  return Vec64<uint32_t>(vget_low_u32(v.raw));
}
HWY_API Vec64<uint64_t> LowerHalf(Vec128<uint64_t> v) {
  return Vec64<uint64_t>(vget_low_u64(v.raw));
}
HWY_API Vec64<int8_t> LowerHalf(Vec128<int8_t> v) {
  return Vec64<int8_t>(vget_low_s8(v.raw));
}
HWY_API Vec64<int16_t> LowerHalf(Vec128<int16_t> v) {
  return Vec64<int16_t>(vget_low_s16(v.raw));
}
HWY_API Vec64<int32_t> LowerHalf(Vec128<int32_t> v) {
  return Vec64<int32_t>(vget_low_s32(v.raw));
}
HWY_API Vec64<int64_t> LowerHalf(Vec128<int64_t> v) {
  return Vec64<int64_t>(vget_low_s64(v.raw));
}
HWY_API Vec64<float> LowerHalf(Vec128<float> v) {
  return Vec64<float>(vget_low_f32(v.raw));
}
#if HWY_HAVE_FLOAT16
HWY_API Vec64<float16_t> LowerHalf(Vec128<float16_t> v) {
  return Vec64<float16_t>(vget_low_f16(v.raw));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
HWY_API Vec64<bfloat16_t> LowerHalf(Vec128<bfloat16_t> v) {
  return Vec64<bfloat16_t>(vget_low_bf16(v.raw));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
#if HWY_HAVE_FLOAT64
HWY_API Vec64<double> LowerHalf(Vec128<double> v) {
  return Vec64<double>(vget_low_f64(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

template <class V, HWY_NEON_IF_EMULATED_D(DFromV<V>), HWY_IF_V_SIZE_V(V, 16)>
HWY_API VFromD<Half<DFromV<V>>> LowerHalf(V v) {
  const Full128<uint16_t> du;
  const Half<DFromV<V>> dh;
  return BitCast(dh, LowerHalf(BitCast(du, v)));
}

template <class DH>
HWY_API VFromD<DH> LowerHalf(DH /* tag */, VFromD<Twice<DH>> v) {
  return LowerHalf(v);
}

// ------------------------------ CombineShiftRightBytes

// 128-bit
template <int kBytes, class D, typename T = TFromD<D>>
HWY_API Vec128<T> CombineShiftRightBytes(D d, Vec128<T> hi, Vec128<T> lo) {
  static_assert(0 < kBytes && kBytes < 16, "kBytes must be in [1, 15]");
  const Repartition<uint8_t, decltype(d)> d8;
  uint8x16_t v8 = vextq_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, Vec128<uint8_t>(v8));
}

// 64-bit
template <int kBytes, class D, typename T = TFromD<D>>
HWY_API Vec64<T> CombineShiftRightBytes(D d, Vec64<T> hi, Vec64<T> lo) {
  static_assert(0 < kBytes && kBytes < 8, "kBytes must be in [1, 7]");
  const Repartition<uint8_t, decltype(d)> d8;
  uint8x8_t v8 = vext_u8(BitCast(d8, lo).raw, BitCast(d8, hi).raw, kBytes);
  return BitCast(d, VFromD<decltype(d8)>(v8));
}

// <= 32-bit defined after ShiftLeftBytes.
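
// Example (illustrative): byte i of CombineShiftRightBytes<k>(d, hi, lo) is
// byte (i + k) of the concatenation hi:lo, matching the EXT instruction:
//   const hn::Full128<uint8_t> d8;
//   const auto lo = hn::Iota(d8, 0);   // 0..15
//   const auto hi = hn::Iota(d8, 16);  // 16..31
//   // CombineShiftRightBytes<4>(d8, hi, lo) -> {4, 5, ..., 15, 16, 17, 18, 19}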

// ------------------------------ Shift vector by constant #bytes

namespace detail {

// Partially specialize because kBytes = 0 and >= size are compile errors;
// callers replace the latter with 0xFF for easier specialization.
template <int kBytes>
struct ShiftLeftBytesT {
  // Full
  template <class T>
  HWY_INLINE Vec128<T> operator()(const Vec128<T> v) {
    const Full128<T> d;
    return CombineShiftRightBytes<16 - kBytes>(d, v, Zero(d));
  }

  // Partial
  template <class T, size_t N, HWY_IF_V_SIZE_LE(T, N, 8)>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    // Expand to 64-bit so we only use the native EXT instruction.
    const Full64<T> d64;
    const auto zero64 = Zero(d64);
    const decltype(zero64) v64(v.raw);
    return Vec128<T, N>(
        CombineShiftRightBytes<8 - kBytes>(d64, v64, zero64).raw);
  }
};
template <>
struct ShiftLeftBytesT<0> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return v;
  }
};
template <>
struct ShiftLeftBytesT<0xFF> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return Xor(v, v);
  }
};

template <int kBytes>
struct ShiftRightBytesT {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(Vec128<T, N> v) {
    const DFromV<decltype(v)> d;
    // For < 64-bit vectors, zero undefined lanes so we shift in zeros.
    if (d.MaxBytes() < 8) {
      constexpr size_t kReg = d.MaxBytes() == 16 ? 16 : 8;
      const Simd<T, kReg / sizeof(T), 0> dreg;
      v = Vec128<T, N>(
          IfThenElseZero(FirstN(dreg, N), VFromD<decltype(dreg)>(v.raw)).raw);
    }
    return CombineShiftRightBytes<kBytes>(d, Zero(d), v);
  }
};
template <>
struct ShiftRightBytesT<0> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return v;
  }
};
template <>
struct ShiftRightBytesT<0xFF> {
  template <class T, size_t N>
  HWY_INLINE Vec128<T, N> operator()(const Vec128<T, N> v) {
    return Xor(v, v);
  }
};

}  // namespace detail

template <int kBytes, class D>
HWY_API VFromD<D> ShiftLeftBytes(D d, VFromD<D> v) {
  return detail::ShiftLeftBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(v);
}

template <int kBytes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftBytes(Vec128<T, N> v) {
  return ShiftLeftBytes<kBytes>(DFromV<decltype(v)>(), v);
}

template <int kLanes, class D>
HWY_API VFromD<D> ShiftLeftLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(d, ShiftLeftBytes<kLanes * sizeof(TFromD<D>)>(BitCast(d8, v)));
}

template <int kLanes, typename T, size_t N>
HWY_API Vec128<T, N> ShiftLeftLanes(Vec128<T, N> v) {
  return ShiftLeftLanes<kLanes>(DFromV<decltype(v)>(), v);
}

// 0x01..0F, kBytes = 1 => 0x0001..0E
template <int kBytes, class D>
HWY_API VFromD<D> ShiftRightBytes(D d, VFromD<D> v) {
  return detail::ShiftRightBytesT<(kBytes >= d.MaxBytes() ? 0xFF : kBytes)>()(
      v);
}

template <int kLanes, class D>
HWY_API VFromD<D> ShiftRightLanes(D d, VFromD<D> v) {
  const Repartition<uint8_t, decltype(d)> d8;
  return BitCast(
      d, ShiftRightBytes<kLanes * sizeof(TFromD<D>)>(d8, BitCast(d8, v)));
}

// Calls ShiftLeftBytes
template <int kBytes, class D, HWY_IF_V_SIZE_LE_D(D, 4)>
HWY_API VFromD<D> CombineShiftRightBytes(D d, VFromD<D> hi, VFromD<D> lo) {
  constexpr size_t kSize = d.MaxBytes();
  static_assert(0 < kBytes && kBytes < kSize, "kBytes invalid");
  const Repartition<uint8_t, decltype(d)> d8;
  const Full64<uint8_t> d_full8;
  const Repartition<TFromD<D>, decltype(d_full8)> d_full;
  using V64 = VFromD<decltype(d_full8)>;
  const V64 hi64(BitCast(d8, hi).raw);
  // Move into most-significant bytes
  const V64 lo64 = ShiftLeftBytes<8 - kSize>(V64(BitCast(d8, lo).raw));
  const V64 r = CombineShiftRightBytes<8 - kSize + kBytes>(d_full8, hi64, lo64);
  // After casting to full 64-bit vector of correct type, shrink to 32-bit
  return VFromD<D>(BitCast(d_full, r).raw);
}

// ------------------------------ UpperHalf (ShiftRightBytes)

// Full input
template <class D, HWY_IF_U8_D(D)>
HWY_API Vec64<uint8_t> UpperHalf(D /* tag */, Vec128<uint8_t> v) {
  return Vec64<uint8_t>(vget_high_u8(v.raw));
}
template <class D, HWY_IF_U16_D(D)>
HWY_API Vec64<uint16_t> UpperHalf(D /* tag */, Vec128<uint16_t> v) {
  return Vec64<uint16_t>(vget_high_u16(v.raw));
}
template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> UpperHalf(D /* tag */, Vec128<uint32_t> v) {
  return Vec64<uint32_t>(vget_high_u32(v.raw));
}
template <class D, HWY_IF_U64_D(D)>
HWY_API Vec64<uint64_t> UpperHalf(D /* tag */, Vec128<uint64_t> v) {
  return Vec64<uint64_t>(vget_high_u64(v.raw));
}
template <class D, HWY_IF_I8_D(D)>
HWY_API Vec64<int8_t> UpperHalf(D /* tag */, Vec128<int8_t> v) {
  return Vec64<int8_t>(vget_high_s8(v.raw));
}
template <class D, HWY_IF_I16_D(D)>
HWY_API Vec64<int16_t> UpperHalf(D /* tag */, Vec128<int16_t> v) {
  return Vec64<int16_t>(vget_high_s16(v.raw));
}
template <class D, HWY_IF_I32_D(D)>
HWY_API Vec64<int32_t> UpperHalf(D /* tag */, Vec128<int32_t> v) {
  return Vec64<int32_t>(vget_high_s32(v.raw));
}
template <class D, HWY_IF_I64_D(D)>
HWY_API Vec64<int64_t> UpperHalf(D /* tag */, Vec128<int64_t> v) {
  return Vec64<int64_t>(vget_high_s64(v.raw));
}
#if HWY_HAVE_FLOAT16
template <class D, HWY_IF_F16_D(D)>
HWY_API Vec64<float16_t> UpperHalf(D /* tag */, Vec128<float16_t> v) {
  return Vec64<float16_t>(vget_high_f16(v.raw));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_BF16_D(D)>
HWY_API Vec64<bfloat16_t> UpperHalf(D /* tag */, Vec128<bfloat16_t> v) {
  return Vec64<bfloat16_t>(vget_high_bf16(v.raw));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <class D, HWY_IF_F32_D(D)>
HWY_API Vec64<float> UpperHalf(D /* tag */, Vec128<float> v) {
  return Vec64<float>(vget_high_f32(v.raw));
}
#if HWY_HAVE_FLOAT64
template <class D, HWY_IF_F64_D(D)>
HWY_API Vec64<double> UpperHalf(D /* tag */, Vec128<double> v) {
  return Vec64<double>(vget_high_f64(v.raw));
}
#endif  // HWY_HAVE_FLOAT64

template <class D, HWY_NEON_IF_EMULATED_D(D), HWY_IF_V_SIZE_GT_D(D, 4)>
HWY_API VFromD<D> UpperHalf(D dh, VFromD<Twice<D>> v) {
  const RebindToUnsigned<Twice<decltype(dh)>> du;
  const Half<decltype(du)> duh;
  return BitCast(dh, UpperHalf(duh, BitCast(du, v)));
}

// Partial
template <class DH, HWY_IF_V_SIZE_LE_D(DH, 4)>
HWY_API VFromD<DH> UpperHalf(DH dh, VFromD<Twice<DH>> v) {
  const Twice<DH> d;
  const RebindToUnsigned<decltype(d)> du;
  const VFromD<decltype(du)> upper =
      ShiftRightBytes<dh.MaxBytes()>(du, BitCast(du, v));
  return VFromD<DH>(BitCast(d, upper).raw);
}

// ------------------------------ Broadcast/splat any lane

template <int kLane, typename T>
HWY_API Vec128<T, 1> Broadcast(Vec128<T, 1> v) {
  return v;
}

#if HWY_ARCH_ARM_A64
// Unsigned
template <int kLane>
HWY_API Vec128<uint8_t> Broadcast(Vec128<uint8_t> v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128<uint8_t>(vdupq_laneq_u8(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint8_t, N> Broadcast(Vec128<uint8_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint8_t, N>(vdup_lane_u8(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<uint16_t>(vdupq_laneq_u16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<uint32_t>(vdupq_laneq_u32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<uint64_t>(vdupq_laneq_u64(v.raw, kLane));
}

// Signed
template <int kLane>
HWY_API Vec128<int8_t> Broadcast(Vec128<int8_t> v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128<int8_t>(vdupq_laneq_s8(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int8_t, N> Broadcast(Vec128<int8_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int8_t, N>(vdup_lane_s8(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<int16_t>(vdupq_laneq_s16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<int32_t>(vdupq_laneq_s32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<int64_t>(vdupq_laneq_s64(v.raw, kLane));
}

// Float
#if HWY_HAVE_FLOAT16
template <int kLane>
HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<float16_t>(vdupq_laneq_f16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
}
#endif  // HWY_HAVE_FLOAT16

#if HWY_NEON_HAVE_BFLOAT16
template <int kLane>
HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<bfloat16_t>(vdupq_laneq_bf16(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
}
#endif  // HWY_NEON_HAVE_BFLOAT16

template <int kLane>
HWY_API Vec128<float> Broadcast(Vec128<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<float>(vdupq_laneq_f32(v.raw, kLane));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<double> Broadcast(Vec128<double> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<double>(vdupq_laneq_f64(v.raw, kLane));
}

#else  // !HWY_ARCH_ARM_A64
// No vdupq_laneq_* on Armv7: use vgetq_lane_* + vdupq_n_*.

// Unsigned
template <int kLane>
HWY_API Vec128<uint8_t> Broadcast(Vec128<uint8_t> v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128<uint8_t>(vdupq_n_u8(vgetq_lane_u8(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint8_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint8_t, N> Broadcast(Vec128<uint8_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint8_t, N>(vdup_lane_u8(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint16_t> Broadcast(Vec128<uint16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<uint16_t>(vdupq_n_u16(vgetq_lane_u16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint16_t, N> Broadcast(Vec128<uint16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint16_t, N>(vdup_lane_u16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint32_t> Broadcast(Vec128<uint32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<uint32_t>(vdupq_n_u32(vgetq_lane_u32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(uint32_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<uint32_t, N> Broadcast(Vec128<uint32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<uint32_t, N>(vdup_lane_u32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<uint64_t> Broadcast(Vec128<uint64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<uint64_t>(vdupq_n_u64(vgetq_lane_u64(v.raw, kLane)));
}

// Signed
template <int kLane>
HWY_API Vec128<int8_t> Broadcast(Vec128<int8_t> v) {
  static_assert(0 <= kLane && kLane < 16, "Invalid lane");
  return Vec128<int8_t>(vdupq_n_s8(vgetq_lane_s8(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int8_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int8_t, N> Broadcast(Vec128<int8_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int8_t, N>(vdup_lane_s8(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int16_t> Broadcast(Vec128<int16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<int16_t>(vdupq_n_s16(vgetq_lane_s16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int16_t, N> Broadcast(Vec128<int16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int16_t, N>(vdup_lane_s16(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int32_t> Broadcast(Vec128<int32_t> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<int32_t>(vdupq_n_s32(vgetq_lane_s32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(int32_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<int32_t, N> Broadcast(Vec128<int32_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<int32_t, N>(vdup_lane_s32(v.raw, kLane));
}
template <int kLane>
HWY_API Vec128<int64_t> Broadcast(Vec128<int64_t> v) {
  static_assert(0 <= kLane && kLane < 2, "Invalid lane");
  return Vec128<int64_t>(vdupq_n_s64(vgetq_lane_s64(v.raw, kLane)));
}

// Float
#if HWY_HAVE_FLOAT16
template <int kLane>
HWY_API Vec128<float16_t> Broadcast(Vec128<float16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<float16_t>(vdupq_n_f16(vgetq_lane_f16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<float16_t, N> Broadcast(Vec128<float16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float16_t, N>(vdup_lane_f16(v.raw, kLane));
}
#endif  // HWY_HAVE_FLOAT16
#if HWY_NEON_HAVE_BFLOAT16
template <int kLane>
HWY_API Vec128<bfloat16_t> Broadcast(Vec128<bfloat16_t> v) {
  static_assert(0 <= kLane && kLane < 8, "Invalid lane");
  return Vec128<bfloat16_t>(vdupq_n_bf16(vgetq_lane_bf16(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(bfloat16_t, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<bfloat16_t, N> Broadcast(Vec128<bfloat16_t, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<bfloat16_t, N>(vdup_lane_bf16(v.raw, kLane));
}
#endif  // HWY_NEON_HAVE_BFLOAT16
template <int kLane>
HWY_API Vec128<float> Broadcast(Vec128<float> v) {
  static_assert(0 <= kLane && kLane < 4, "Invalid lane");
  return Vec128<float>(vdupq_n_f32(vgetq_lane_f32(v.raw, kLane)));
}
template <int kLane, size_t N, HWY_IF_V_SIZE_LE(float, N, 8),
          HWY_IF_LANES_GT(N, 1)>
HWY_API Vec128<float, N> Broadcast(Vec128<float, N> v) {
  static_assert(0 <= kLane && kLane < N, "Invalid lane");
  return Vec128<float, N>(vdup_lane_f32(v.raw, kLane));
}

#endif  // HWY_ARCH_ARM_A64

template <int kLane, typename V, HWY_NEON_IF_EMULATED_D(DFromV<V>),
          HWY_IF_LANES_GT_D(DFromV<V>, 1)>
HWY_API V Broadcast(V v) {
  const DFromV<V> d;
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Broadcast<kLane>(BitCast(du, v)));
}

// ------------------------------ TableLookupLanes

// Returned by SetTableIndices for use by TableLookupLanes.
template <typename T, size_t N>
struct Indices128 {
  typename detail::Raw128<T, N>::type raw;
};

namespace detail {

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Iota(d8, 0);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14};
  return Load(d8, kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 4, 4, 4, 4, 8, 8, 8, 8, 12, 12, 12, 12};
  return Load(d8, kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecBroadcastLaneBytes(
    D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kBroadcastLaneBytes[16] = {
      0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, 8, 8, 8, 8};
  return Load(d8, kBroadcastLaneBytes);
}

template <class D, HWY_IF_T_SIZE_D(D, 1)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  return Zero(d8);
}

template <class D, HWY_IF_T_SIZE_D(D, 2)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1};
  return Load(d8, kByteOffsets);
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3};
  return Load(d8, kByteOffsets);
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_INLINE VFromD<Repartition<uint8_t, D>> IndicesFromVecByteOffsets(D d) {
  const Repartition<uint8_t, decltype(d)> d8;
  alignas(16) static constexpr uint8_t kByteOffsets[16] = {
      0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7};
  return Load(d8, kByteOffsets);
}

}  // namespace detail
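
// Worked example of the byte-index construction below (illustrative): for
// 16-bit lanes, lane index j must become the byte pair {2j, 2j+1}.
// TableLookupBytes with kBroadcastLaneBytes copies the low byte of each lane
// index into both byte positions of its lane, ShiftLeft<1> doubles it, and
// adding kByteOffsets {0,1, 0,1, ...} yields the final byte indices.
// E.g. lane index 3 -> {3,3} -> {6,6} -> {6,7}, the two bytes of lane 3.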

template <class D, typename TI, HWY_IF_T_SIZE_D(D, 1)>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  (void)d;
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, vec).raw};
}

template <class D, typename TI,
          HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> IndicesFromVec(
    D d, Vec128<TI, MaxLanes(D())> vec) {
  using T = TFromD<D>;
  static_assert(sizeof(T) == sizeof(TI), "Index size must match lane");
#if HWY_IS_DEBUG_BUILD
  const RebindToUnsigned<decltype(d)> du;
  using TU = TFromD<decltype(du)>;
  HWY_DASSERT(AllTrue(
      du, Lt(BitCast(du, vec), Set(du, static_cast<TU>(MaxLanes(d) * 2)))));
#endif

  const Repartition<uint8_t, decltype(d)> d8;
  using V8 = VFromD<decltype(d8)>;

  // Broadcast each lane index to all bytes of T and shift to bytes
  const V8 lane_indices = TableLookupBytes(
      BitCast(d8, vec), detail::IndicesFromVecBroadcastLaneBytes(d));
  constexpr int kIndexShiftAmt = static_cast<int>(FloorLog2(sizeof(T)));
  const V8 byte_indices = ShiftLeft<kIndexShiftAmt>(lane_indices);
  const V8 sum = Add(byte_indices, detail::IndicesFromVecByteOffsets(d));
  return Indices128<TFromD<D>, MaxLanes(D())>{BitCast(d, sum).raw};
}

template <class D, typename TI>
HWY_API Indices128<TFromD<D>, MaxLanes(D())> SetTableIndices(D d,
                                                             const TI* idx) {
  const Rebind<TI, decltype(d)> di;
  return IndicesFromVec(d, LoadU(di, idx));
}

template <typename T, size_t N>
HWY_API Vec128<T, N> TableLookupLanes(Vec128<T, N> v, Indices128<T, N> idx) {
  const DFromV<decltype(v)> d;
  const RebindToSigned<decltype(d)> di;
  return BitCast(
      d, TableLookupBytes(BitCast(di, v), BitCast(di, Vec128<T, N>{idx.raw})));
}

template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)>
HWY_API Vec128<T, N> TwoTablesLookupLanes(Vec128<T, N> a, Vec128<T, N> b,
                                          Indices128<T, N> idx) {
  const DFromV<decltype(a)> d;
  const Twice<decltype(d)> dt;
  // TableLookupLanes currently requires table and index vectors to be the
  // same size, though a half-length index vector would be sufficient here.
#if HWY_IS_MSAN
  const Vec128<T, N> idx_vec{idx.raw};
  const Indices128<T, N * 2> idx2{Combine(dt, idx_vec, idx_vec).raw};
#else
  // We only keep the LowerHalf of the result, whose lanes depend only on the
  // valid (initialized) portion of idx.
6208 const Indices128<T, N * 2> idx2{idx.raw}; 6209 #endif 6210 return LowerHalf(d, TableLookupLanes(Combine(dt, b, a), idx2)); 6211 } 6212 6213 template <typename T> 6214 HWY_API Vec64<T> TwoTablesLookupLanes(Vec64<T> a, Vec64<T> b, 6215 Indices128<T, 8 / sizeof(T)> idx) { 6216 const DFromV<decltype(a)> d; 6217 const Repartition<uint8_t, decltype(d)> du8; 6218 const auto a_u8 = BitCast(du8, a); 6219 const auto b_u8 = BitCast(du8, b); 6220 const auto idx_u8 = BitCast(du8, Vec64<T>{idx.raw}); 6221 6222 #if HWY_ARCH_ARM_A64 6223 const Twice<decltype(du8)> dt_u8; 6224 return BitCast( 6225 d, Vec64<uint8_t>{vqtbl1_u8(Combine(dt_u8, b_u8, a_u8).raw, idx_u8.raw)}); 6226 #else 6227 detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}}; 6228 return BitCast(d, Vec64<uint8_t>{vtbl2_u8(tup.raw, idx_u8.raw)}); 6229 #endif 6230 } 6231 6232 template <typename T> 6233 HWY_API Vec128<T> TwoTablesLookupLanes(Vec128<T> a, Vec128<T> b, 6234 Indices128<T, 16 / sizeof(T)> idx) { 6235 const DFromV<decltype(a)> d; 6236 const Repartition<uint8_t, decltype(d)> du8; 6237 const auto a_u8 = BitCast(du8, a); 6238 const auto b_u8 = BitCast(du8, b); 6239 const auto idx_u8 = BitCast(du8, Vec128<T>{idx.raw}); 6240 6241 #if HWY_ARCH_ARM_A64 6242 detail::Tuple2<uint8_t, du8.MaxLanes()> tup = {{{a_u8.raw, b_u8.raw}}}; 6243 return BitCast(d, Vec128<uint8_t>{vqtbl2q_u8(tup.raw, idx_u8.raw)}); 6244 #else 6245 const Half<decltype(d)> dh; 6246 const Repartition<uint8_t, decltype(dh)> dh_u8; 6247 const auto a_lo_u8 = LowerHalf(dh_u8, a_u8); 6248 const auto a_hi_u8 = UpperHalf(dh_u8, a_u8); 6249 const auto b_lo_u8 = LowerHalf(dh_u8, b_u8); 6250 const auto b_hi_u8 = UpperHalf(dh_u8, b_u8); 6251 const auto idx_lo_u8 = LowerHalf(dh_u8, idx_u8); 6252 const auto idx_hi_u8 = UpperHalf(dh_u8, idx_u8); 6253 6254 detail::Tuple4<uint8_t, dh_u8.MaxLanes()> tup = { 6255 {{a_lo_u8.raw, a_hi_u8.raw, b_lo_u8.raw, b_hi_u8.raw}}}; 6256 const auto lo_result = 6257 BitCast(dh, Vec64<uint8_t>{vtbl4_u8(tup.raw, idx_lo_u8.raw)}); 6258 const auto hi_result = 6259 BitCast(dh, Vec64<uint8_t>{vtbl4_u8(tup.raw, idx_hi_u8.raw)}); 6260 return Combine(d, hi_result, lo_result); 6261 #endif 6262 } 6263 6264 // ------------------------------ Reverse2 (CombineShiftRightBytes) 6265 6266 // Per-target flag to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8. 
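// NEON provides native byte reversal (vrev16/32/64), so the 8-bit
// Reverse2/4/8 are defined here rather than in generic_ops-inl.h.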
#ifdef HWY_NATIVE_REVERSE2_8
#undef HWY_NATIVE_REVERSE2_8
#else
#define HWY_NATIVE_REVERSE2_8
#endif

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev16_u8(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint8_t>(vrev16q_u8(BitCast(du, v).raw)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev32_u16(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint16_t>(vrev32q_u16(BitCast(du, v).raw)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev64_u32(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 4)>
HWY_API Vec128<T> Reverse2(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint32_t>(vrev64q_u32(BitCast(du, v).raw)));
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse2(D d, VFromD<D> v) {
  return CombineShiftRightBytes<8>(d, v, v);
}

// ------------------------------ Reverse4 (Reverse2)

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev32_u8(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)>
HWY_API Vec128<T> Reverse4(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint8_t>(vrev32q_u8(BitCast(du, v).raw)));
}

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 2)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, VFromD<decltype(du)>(vrev64_u16(BitCast(du, v).raw)));
}
template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 2)>
HWY_API Vec128<T> Reverse4(D d, Vec128<T> v) {
  const RebindToUnsigned<decltype(d)> du;
  return BitCast(d, Vec128<uint16_t>(vrev64q_u16(BitCast(du, v).raw)));
}

template <class D, HWY_IF_T_SIZE_D(D, 4)>
HWY_API VFromD<D> Reverse4(D d, VFromD<D> v) {
  const RepartitionToWide<RebindToUnsigned<decltype(d)>> duw;
  return BitCast(d, Reverse2(duw, BitCast(duw, Reverse2(d, v))));
}

template <class D, HWY_IF_T_SIZE_D(D, 8)>
HWY_API VFromD<D> Reverse4(D /* tag */, VFromD<D>) {
  HWY_ASSERT(0);  // don't have 4 u64 lanes
}

// ------------------------------ Reverse8 (Reverse2, Reverse4)

template <class D, HWY_IF_V_SIZE_LE_D(D, 8), HWY_IF_T_SIZE_D(D, 1)>
HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) {
  const RebindToUnsigned<decltype(d)> du;
  return
BitCast(d, VFromD<decltype(du)>(vrev64_u8(BitCast(du, v).raw))); 6352 } 6353 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> 6354 HWY_API Vec128<T> Reverse8(D d, Vec128<T> v) { 6355 const RebindToUnsigned<decltype(d)> du; 6356 return BitCast(d, Vec128<uint8_t>(vrev64q_u8(BitCast(du, v).raw))); 6357 } 6358 6359 template <class D, HWY_IF_T_SIZE_D(D, 2)> 6360 HWY_API VFromD<D> Reverse8(D d, VFromD<D> v) { 6361 const Repartition<uint64_t, decltype(d)> du64; 6362 return BitCast(d, Reverse2(du64, BitCast(du64, Reverse4(d, v)))); 6363 } 6364 6365 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 4) | (1 << 8))> 6366 HWY_API VFromD<D> Reverse8(D, VFromD<D>) { 6367 HWY_ASSERT(0); // don't have 8 lanes if larger than 16-bit 6368 } 6369 6370 // ------------------------------ Reverse (Reverse2, Reverse4, Reverse8) 6371 6372 template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 1)> 6373 HWY_API Vec128<T, 1> Reverse(D /* tag */, Vec128<T, 1> v) { 6374 return v; 6375 } 6376 6377 template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 2)> 6378 HWY_API Vec128<T, 2> Reverse(D d, Vec128<T, 2> v) { 6379 return Reverse2(d, v); 6380 } 6381 6382 template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 4)> 6383 HWY_API Vec128<T, 4> Reverse(D d, Vec128<T, 4> v) { 6384 return Reverse4(d, v); 6385 } 6386 6387 template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 8)> 6388 HWY_API Vec128<T, 8> Reverse(D d, Vec128<T, 8> v) { 6389 return Reverse8(d, v); 6390 } 6391 6392 template <class D, typename T = TFromD<D>, HWY_IF_LANES_D(D, 16)> 6393 HWY_API Vec128<T> Reverse(D d, Vec128<T> v) { 6394 const Repartition<uint64_t, decltype(d)> du64; 6395 return BitCast(d, Reverse2(du64, BitCast(du64, Reverse8(d, v)))); 6396 } 6397 6398 // ------------------------------ ReverseBits 6399 6400 #if HWY_ARCH_ARM_A64 6401 6402 #ifdef HWY_NATIVE_REVERSE_BITS_UI8 6403 #undef HWY_NATIVE_REVERSE_BITS_UI8 6404 #else 6405 #define HWY_NATIVE_REVERSE_BITS_UI8 6406 #endif 6407 6408 HWY_NEON_DEF_FUNCTION_INT_8(ReverseBits, vrbit, _, 1) 6409 HWY_NEON_DEF_FUNCTION_UINT_8(ReverseBits, vrbit, _, 1) 6410 6411 #endif // HWY_ARCH_ARM_A64 6412 6413 // ------------------------------ Other shuffles (TableLookupBytes) 6414 6415 // Notation: let Vec128<int32_t> have lanes 3,2,1,0 (0 is least-significant). 6416 // Shuffle0321 rotates one lane to the right (the previous least-significant 6417 // lane is now most-significant). These could also be implemented via 6418 // CombineShiftRightBytes but the shuffle_abcd notation is more convenient. 
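// For example, if v has lanes 3,2,1,0 then Shuffle0321(v) has lanes 0,3,2,1
// and Shuffle2103(v) has lanes 2,1,0,3 (read from most- to least-significant).
// Illustrative sketch only (the variable names are ours); kept disabled via
// #if 0 like other inactive code in this file.
#if 0
const Full128<uint32_t> d;
const Vec128<uint32_t> v = Iota(d, 0);          // lanes 3,2,1,0 = 3,2,1,0
const Vec128<uint32_t> ror32 = Shuffle0321(v);  // lanes 0,3,2,1
const Vec128<uint32_t> rol32 = Shuffle2103(v);  // lanes 2,1,0,3
#endif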
6419 6420 // Swap 64-bit halves 6421 template <typename T> 6422 HWY_API Vec128<T> Shuffle1032(Vec128<T> v) { 6423 return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v); 6424 } 6425 template <typename T> 6426 HWY_API Vec128<T> Shuffle01(Vec128<T> v) { 6427 return CombineShiftRightBytes<8>(DFromV<decltype(v)>(), v, v); 6428 } 6429 6430 // Rotate right 32 bits 6431 template <typename T> 6432 HWY_API Vec128<T> Shuffle0321(Vec128<T> v) { 6433 return CombineShiftRightBytes<4>(DFromV<decltype(v)>(), v, v); 6434 } 6435 6436 // Rotate left 32 bits 6437 template <typename T> 6438 HWY_API Vec128<T> Shuffle2103(Vec128<T> v) { 6439 return CombineShiftRightBytes<12>(DFromV<decltype(v)>(), v, v); 6440 } 6441 6442 // Reverse 6443 template <typename T> 6444 HWY_API Vec128<T> Shuffle0123(Vec128<T> v) { 6445 return Reverse4(DFromV<decltype(v)>(), v); 6446 } 6447 6448 // ------------------------------ InterleaveLower 6449 6450 // Interleaves lanes from halves of the 128-bit blocks of "a" (which provides 6451 // the least-significant lane) and "b". To concatenate two half-width integers 6452 // into one, use ZipLower/Upper instead (also works with scalar). 6453 HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveLower, vzip1, _, 2) 6454 #if HWY_ARCH_ARM_A64 6455 // N=1 makes no sense (in that case, there would be no upper/lower). 6456 HWY_NEON_DEF_FUNCTION_FULL_UIF_64(InterleaveLower, vzip1, _, 2) 6457 #else 6458 // Emulated version for Armv7. 6459 template <typename T, HWY_IF_T_SIZE(T, 8)> 6460 HWY_API Vec128<T> InterleaveLower(Vec128<T> a, Vec128<T> b) { 6461 const DFromV<decltype(a)> d; 6462 return CombineShiftRightBytes<8>(d, b, Shuffle01(a)); 6463 } 6464 #endif 6465 6466 #if !HWY_HAVE_FLOAT16 6467 template <size_t N, HWY_IF_V_SIZE_GT(float16_t, N, 4)> 6468 HWY_API Vec128<float16_t, N> InterleaveLower(Vec128<float16_t, N> a, 6469 Vec128<float16_t, N> b) { 6470 const DFromV<decltype(a)> d; 6471 const RebindToUnsigned<decltype(d)> du; 6472 return BitCast(d, InterleaveLower(BitCast(du, a), BitCast(du, b))); 6473 } 6474 #endif // !HWY_HAVE_FLOAT16 6475 6476 // < 64 bit parts 6477 template <typename T, size_t N, HWY_IF_V_SIZE_LE(T, N, 4)> 6478 HWY_API Vec128<T, N> InterleaveLower(Vec128<T, N> a, Vec128<T, N> b) { 6479 return Vec128<T, N>(InterleaveLower(Vec64<T>(a.raw), Vec64<T>(b.raw)).raw); 6480 } 6481 6482 // Additional overload for the optional Simd<> tag. 6483 template <class D> 6484 HWY_API VFromD<D> InterleaveLower(D /* tag */, VFromD<D> a, VFromD<D> b) { 6485 return InterleaveLower(a, b); 6486 } 6487 6488 // ------------------------------ InterleaveUpper (UpperHalf) 6489 6490 // All functions inside detail lack the required D parameter. 6491 namespace detail { 6492 HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveUpper, vzip2, _, 2) 6493 6494 #if HWY_ARCH_ARM_A64 6495 // N=1 makes no sense (in that case, there would be no upper/lower). 6496 HWY_NEON_DEF_FUNCTION_FULL_UIF_64(InterleaveUpper, vzip2, _, 2) 6497 #else 6498 // Emulated version for Armv7. 
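// vzip2 is A64-only; extracting the middle 16 bytes of Shuffle01(b):a
// yields the upper lanes {a1, b1}.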
6499 template <typename T, HWY_IF_T_SIZE(T, 8)> 6500 HWY_API Vec128<T> InterleaveUpper(Vec128<T> a, Vec128<T> b) { 6501 const DFromV<decltype(a)> d; 6502 return CombineShiftRightBytes<8>(d, Shuffle01(b), a); 6503 } 6504 #endif 6505 } // namespace detail 6506 6507 // Full register 6508 template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> 6509 HWY_API VFromD<D> InterleaveUpper(D /* tag */, VFromD<D> a, VFromD<D> b) { 6510 return detail::InterleaveUpper(a, b); 6511 } 6512 6513 // Partial 6514 template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> 6515 HWY_API VFromD<D> InterleaveUpper(D d, VFromD<D> a, VFromD<D> b) { 6516 const Half<decltype(d)> d2; 6517 const VFromD<D> a2(UpperHalf(d2, a).raw); 6518 const VFromD<D> b2(UpperHalf(d2, b).raw); 6519 return InterleaveLower(d, a2, b2); 6520 } 6521 6522 // ------------------------------ ZipLower/ZipUpper (InterleaveLower) 6523 6524 // Same as Interleave*, except that the return lanes are double-width integers; 6525 // this is necessary because the single-lane scalar cannot return two values. 6526 template <class V, class DW = RepartitionToWide<DFromV<V>>> 6527 HWY_API VFromD<DW> ZipLower(V a, V b) { 6528 return BitCast(DW(), InterleaveLower(a, b)); 6529 } 6530 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> 6531 HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) { 6532 return BitCast(dw, InterleaveLower(D(), a, b)); 6533 } 6534 6535 template <class V, class D = DFromV<V>, class DW = RepartitionToWide<D>> 6536 HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) { 6537 return BitCast(dw, InterleaveUpper(D(), a, b)); 6538 } 6539 6540 // ------------------------------ Per4LaneBlockShuffle 6541 namespace detail { 6542 6543 #if HWY_COMPILER_GCC || HWY_COMPILER_CLANG 6544 6545 #ifdef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 6546 #undef HWY_NATIVE_PER4LANEBLKSHUF_DUP32 6547 #else 6548 #define HWY_NATIVE_PER4LANEBLKSHUF_DUP32 6549 #endif 6550 6551 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 6552 HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t /*x3*/, 6553 const uint32_t /*x2*/, 6554 const uint32_t x1, 6555 const uint32_t x0) { 6556 typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(8))); 6557 const GccU32RawVectType raw = {x0, x1}; 6558 return ResizeBitCast(d, Vec64<uint32_t>(reinterpret_cast<uint32x2_t>(raw))); 6559 } 6560 6561 template <class D, HWY_IF_V_SIZE_D(D, 16)> 6562 HWY_INLINE VFromD<D> Per4LaneBlkShufDupSet4xU32(D d, const uint32_t x3, 6563 const uint32_t x2, 6564 const uint32_t x1, 6565 const uint32_t x0) { 6566 typedef uint32_t GccU32RawVectType __attribute__((__vector_size__(16))); 6567 const GccU32RawVectType raw = {x0, x1, x2, x3}; 6568 return ResizeBitCast(d, Vec128<uint32_t>(reinterpret_cast<uint32x4_t>(raw))); 6569 } 6570 #endif // HWY_COMPILER_GCC || HWY_COMPILER_CLANG 6571 6572 template <size_t kLaneSize, size_t kVectSize, class V, 6573 HWY_IF_LANES_GT_D(DFromV<V>, 4)> 6574 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0x88> /*idx_3210_tag*/, 6575 hwy::SizeTag<kLaneSize> /*lane_size_tag*/, 6576 hwy::SizeTag<kVectSize> /*vect_size_tag*/, 6577 V v) { 6578 const DFromV<decltype(v)> d; 6579 const RebindToUnsigned<decltype(d)> du; 6580 const RepartitionToWide<decltype(du)> dw; 6581 6582 const auto evens = BitCast(dw, ConcatEven(d, v, v)); 6583 return BitCast(d, InterleaveLower(dw, evens, evens)); 6584 } 6585 6586 template <size_t kLaneSize, size_t kVectSize, class V, 6587 HWY_IF_LANES_GT_D(DFromV<V>, 4)> 6588 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xDD> /*idx_3210_tag*/, 6589 hwy::SizeTag<kLaneSize> 
/*lane_size_tag*/, 6590 hwy::SizeTag<kVectSize> /*vect_size_tag*/, 6591 V v) { 6592 const DFromV<decltype(v)> d; 6593 const RebindToUnsigned<decltype(d)> du; 6594 const RepartitionToWide<decltype(du)> dw; 6595 6596 const auto odds = BitCast(dw, ConcatOdd(d, v, v)); 6597 return BitCast(d, InterleaveLower(dw, odds, odds)); 6598 } 6599 6600 template <class V> 6601 HWY_INLINE V Per4LaneBlockShuffle(hwy::SizeTag<0xFA> /*idx_3210_tag*/, 6602 hwy::SizeTag<2> /*lane_size_tag*/, 6603 hwy::SizeTag<8> /*vect_size_tag*/, V v) { 6604 const DFromV<decltype(v)> d; 6605 return InterleaveUpper(d, v, v); 6606 } 6607 6608 } // namespace detail 6609 6610 // ------------------------------ SlideUpLanes 6611 6612 namespace detail { 6613 6614 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 6615 HWY_INLINE V SlideUpLanes(V v, size_t amt) { 6616 const DFromV<decltype(v)> d; 6617 using TU = UnsignedFromSize<d.MaxBytes()>; 6618 const Repartition<TU, decltype(d)> du; 6619 return BitCast(d, BitCast(du, v) << Set( 6620 du, static_cast<TU>(amt * sizeof(TFromV<V>) * 8))); 6621 } 6622 6623 template <class V, HWY_IF_V_SIZE_V(V, 16)> 6624 HWY_INLINE V SlideUpLanes(V v, size_t amt) { 6625 const DFromV<decltype(v)> d; 6626 const Repartition<uint8_t, decltype(d)> du8; 6627 const auto idx = 6628 Iota(du8, static_cast<uint8_t>(size_t{0} - amt * sizeof(TFromV<V>))); 6629 return BitCast(d, TableLookupBytesOr0(BitCast(du8, v), idx)); 6630 } 6631 6632 } // namespace detail 6633 6634 template <class D, HWY_IF_LANES_D(D, 1)> 6635 HWY_API VFromD<D> SlideUpLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { 6636 return v; 6637 } 6638 6639 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)> 6640 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 6641 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6642 if (__builtin_constant_p(amt)) { 6643 switch (amt) { 6644 case 0: 6645 return v; 6646 case 1: 6647 return ShiftLeftLanes<1>(d, v); 6648 } 6649 } 6650 #else 6651 (void)d; 6652 #endif 6653 6654 return detail::SlideUpLanes(v, amt); 6655 } 6656 6657 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)> 6658 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 6659 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6660 if (__builtin_constant_p(amt)) { 6661 switch (amt) { 6662 case 0: 6663 return v; 6664 case 1: 6665 return ShiftLeftLanes<1>(d, v); 6666 case 2: 6667 return ShiftLeftLanes<2>(d, v); 6668 case 3: 6669 return ShiftLeftLanes<3>(d, v); 6670 } 6671 } 6672 #else 6673 (void)d; 6674 #endif 6675 6676 return detail::SlideUpLanes(v, amt); 6677 } 6678 6679 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)> 6680 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 6681 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6682 if (__builtin_constant_p(amt)) { 6683 switch (amt) { 6684 case 0: 6685 return v; 6686 case 1: 6687 return ShiftLeftLanes<1>(d, v); 6688 case 2: 6689 return ShiftLeftLanes<2>(d, v); 6690 case 3: 6691 return ShiftLeftLanes<3>(d, v); 6692 case 4: 6693 return ShiftLeftLanes<4>(d, v); 6694 case 5: 6695 return ShiftLeftLanes<5>(d, v); 6696 case 6: 6697 return ShiftLeftLanes<6>(d, v); 6698 case 7: 6699 return ShiftLeftLanes<7>(d, v); 6700 } 6701 } 6702 #else 6703 (void)d; 6704 #endif 6705 6706 return detail::SlideUpLanes(v, amt); 6707 } 6708 6709 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)> 6710 HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) { 6711 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC 
// includes clang 6712 if (__builtin_constant_p(amt)) { 6713 switch (amt) { 6714 case 0: 6715 return v; 6716 case 1: 6717 return ShiftLeftLanes<1>(d, v); 6718 case 2: 6719 return ShiftLeftLanes<2>(d, v); 6720 case 3: 6721 return ShiftLeftLanes<3>(d, v); 6722 case 4: 6723 return ShiftLeftLanes<4>(d, v); 6724 case 5: 6725 return ShiftLeftLanes<5>(d, v); 6726 case 6: 6727 return ShiftLeftLanes<6>(d, v); 6728 case 7: 6729 return ShiftLeftLanes<7>(d, v); 6730 case 8: 6731 return ShiftLeftLanes<8>(d, v); 6732 case 9: 6733 return ShiftLeftLanes<9>(d, v); 6734 case 10: 6735 return ShiftLeftLanes<10>(d, v); 6736 case 11: 6737 return ShiftLeftLanes<11>(d, v); 6738 case 12: 6739 return ShiftLeftLanes<12>(d, v); 6740 case 13: 6741 return ShiftLeftLanes<13>(d, v); 6742 case 14: 6743 return ShiftLeftLanes<14>(d, v); 6744 case 15: 6745 return ShiftLeftLanes<15>(d, v); 6746 } 6747 } 6748 #else 6749 (void)d; 6750 #endif 6751 6752 return detail::SlideUpLanes(v, amt); 6753 } 6754 6755 // ------------------------------ SlideDownLanes 6756 6757 namespace detail { 6758 6759 template <class V, HWY_IF_V_SIZE_LE_V(V, 8)> 6760 HWY_INLINE V SlideDownLanes(V v, size_t amt) { 6761 const DFromV<decltype(v)> d; 6762 using TU = UnsignedFromSize<d.MaxBytes()>; 6763 const Repartition<TU, decltype(d)> du; 6764 return BitCast(d, 6765 BitCast(du, v) << Set( 6766 du, static_cast<TU>(TU{0} - amt * sizeof(TFromV<V>) * 8))); 6767 } 6768 6769 template <class V, HWY_IF_V_SIZE_V(V, 16)> 6770 HWY_INLINE V SlideDownLanes(V v, size_t amt) { 6771 const DFromV<decltype(v)> d; 6772 const Repartition<int8_t, decltype(d)> di8; 6773 auto idx = Iota(di8, static_cast<int8_t>(amt * sizeof(TFromV<V>))); 6774 idx = Or(idx, VecFromMask(di8, idx > Set(di8, int8_t{15}))); 6775 return BitCast(d, TableLookupBytesOr0(BitCast(di8, v), idx)); 6776 } 6777 6778 } // namespace detail 6779 6780 template <class D, HWY_IF_LANES_D(D, 1)> 6781 HWY_API VFromD<D> SlideDownLanes(D /*d*/, VFromD<D> v, size_t /*amt*/) { 6782 return v; 6783 } 6784 6785 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 2)> 6786 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 6787 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6788 if (__builtin_constant_p(amt)) { 6789 switch (amt) { 6790 case 0: 6791 return v; 6792 case 1: 6793 return ShiftRightLanes<1>(d, v); 6794 } 6795 } 6796 #else 6797 (void)d; 6798 #endif 6799 6800 return detail::SlideDownLanes(v, amt); 6801 } 6802 6803 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 4)> 6804 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 6805 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6806 if (__builtin_constant_p(amt)) { 6807 switch (amt) { 6808 case 0: 6809 return v; 6810 case 1: 6811 return ShiftRightLanes<1>(d, v); 6812 case 2: 6813 return ShiftRightLanes<2>(d, v); 6814 case 3: 6815 return ShiftRightLanes<3>(d, v); 6816 } 6817 } 6818 #else 6819 (void)d; 6820 #endif 6821 6822 return detail::SlideDownLanes(v, amt); 6823 } 6824 6825 template <class D, HWY_IF_V_SIZE_LE_D(D, 16), HWY_IF_LANES_D(D, 8)> 6826 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 6827 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6828 if (__builtin_constant_p(amt)) { 6829 switch (amt) { 6830 case 0: 6831 return v; 6832 case 1: 6833 return ShiftRightLanes<1>(d, v); 6834 case 2: 6835 return ShiftRightLanes<2>(d, v); 6836 case 3: 6837 return ShiftRightLanes<3>(d, v); 6838 case 4: 6839 return ShiftRightLanes<4>(d, v); 6840 case 5: 6841 return 
ShiftRightLanes<5>(d, v); 6842 case 6: 6843 return ShiftRightLanes<6>(d, v); 6844 case 7: 6845 return ShiftRightLanes<7>(d, v); 6846 } 6847 } 6848 #else 6849 (void)d; 6850 #endif 6851 6852 return detail::SlideDownLanes(v, amt); 6853 } 6854 6855 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_LANES_D(D, 16)> 6856 HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) { 6857 #if !HWY_IS_DEBUG_BUILD && HWY_COMPILER_GCC // includes clang 6858 if (__builtin_constant_p(amt)) { 6859 switch (amt) { 6860 case 0: 6861 return v; 6862 case 1: 6863 return ShiftRightLanes<1>(d, v); 6864 case 2: 6865 return ShiftRightLanes<2>(d, v); 6866 case 3: 6867 return ShiftRightLanes<3>(d, v); 6868 case 4: 6869 return ShiftRightLanes<4>(d, v); 6870 case 5: 6871 return ShiftRightLanes<5>(d, v); 6872 case 6: 6873 return ShiftRightLanes<6>(d, v); 6874 case 7: 6875 return ShiftRightLanes<7>(d, v); 6876 case 8: 6877 return ShiftRightLanes<8>(d, v); 6878 case 9: 6879 return ShiftRightLanes<9>(d, v); 6880 case 10: 6881 return ShiftRightLanes<10>(d, v); 6882 case 11: 6883 return ShiftRightLanes<11>(d, v); 6884 case 12: 6885 return ShiftRightLanes<12>(d, v); 6886 case 13: 6887 return ShiftRightLanes<13>(d, v); 6888 case 14: 6889 return ShiftRightLanes<14>(d, v); 6890 case 15: 6891 return ShiftRightLanes<15>(d, v); 6892 } 6893 } 6894 #else 6895 (void)d; 6896 #endif 6897 6898 return detail::SlideDownLanes(v, amt); 6899 } 6900 6901 // ------------------------------- WidenHighMulAdd 6902 6903 #ifdef HWY_NATIVE_WIDEN_HIGH_MUL_ADD 6904 #undef HWY_NATIVE_WIDEN_HIGH_MUL_ADD 6905 #else 6906 #define HWY_NATIVE_WIDEN_HIGH_MUL_ADD 6907 #endif 6908 6909 namespace detail { 6910 6911 template<class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>, 6912 HWY_IF_LANES_GT_D(DN, 2)> 6913 HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul, 6914 VFromD<DN> x, VFromD<D> add) { 6915 #if HWY_ARCH_ARM_A64 6916 return Vec128<uint64_t>(vmlal_high_u32(add.raw, mul.raw, x.raw)); 6917 #else 6918 const Full64<uint32_t> dh; 6919 return Vec128<uint64_t>( 6920 vmlal_u32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw)); 6921 #endif 6922 } 6923 6924 template<class D, HWY_IF_U64_D(D), class DN = RepartitionToNarrow<D>, 6925 HWY_IF_LANES_LE_D(DN, 2)> 6926 HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul, 6927 VFromD<DN> x, VFromD<D> add) { 6928 Vec128<uint64_t> mulResult = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw)); 6929 return UpperHalf(d, mulResult) + add; 6930 } 6931 6932 template<class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>, 6933 HWY_IF_LANES_GT_D(DN, 2)> 6934 HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul, 6935 VFromD<DN> x, VFromD<D> add) { 6936 #if HWY_ARCH_ARM_A64 6937 return Vec128<int64_t>(vmlal_high_s32(add.raw, mul.raw, x.raw)); 6938 #else 6939 const Full64<int32_t> dh; 6940 return Vec128<int64_t>( 6941 vmlal_s32(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw)); 6942 #endif 6943 } 6944 6945 template<class D, HWY_IF_I64_D(D), class DN = RepartitionToNarrow<D>, 6946 HWY_IF_LANES_LE_D(DN, 2)> 6947 HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul, 6948 VFromD<DN> x, VFromD<D> add) { 6949 Vec128<int64_t> mulResult = Vec128<int64_t>(vmull_s32(mul.raw, x.raw)); 6950 return UpperHalf(d, mulResult) + add; 6951 } 6952 6953 template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>, 6954 HWY_IF_LANES_GT_D(DN, 4)> 6955 HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul, 6956 VFromD<DN> x, VFromD<D> add) { 6957 #if HWY_ARCH_ARM_A64 6958 return 
Vec128<int32_t>(vmlal_high_s16(add.raw, mul.raw, x.raw));
#else
  const Full64<int16_t> dh;
  return Vec128<int32_t>(
      vmlal_s16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
#endif
}

template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_D(DN, 4)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
  Vec64<int32_t> hi = UpperHalf(d, widen);
  return hi + add;
}

template<class D, HWY_IF_I32_D(D), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_D(DN, 2)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  Vec128<int32_t> widen = Vec128<int32_t>(vmull_s16(mul.raw, x.raw));
  // Only the lower two products are valid; take the upper of those.
  Vec32<int32_t> hi = UpperHalf(d, Vec64<int32_t>(vget_low_s32(widen.raw)));
  return hi + add;
}

template<class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_GT_D(DN, 4)>
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
#if HWY_ARCH_ARM_A64
  return Vec128<uint32_t>(vmlal_high_u16(add.raw, mul.raw, x.raw));
#else
  const Full64<uint16_t> dh;
  return Vec128<uint32_t>(
      vmlal_u16(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
#endif
}

template<class D, HWY_IF_U32_D(D), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_D(DN, 4)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
  VFromD<D> hi = UpperHalf(d, widen);
  return hi + add;
}

template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1),
         class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  Vec128<uint32_t> widen = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw));
  // Only the lower two products are valid; take the upper of those.
  VFromD<D> hi = UpperHalf(d, Vec64<uint32_t>(vget_low_u32(widen.raw)));
  return hi + add;
}

template<class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_GT_D(DN, 8)>
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
#if HWY_ARCH_ARM_A64
  return Vec128<uint16_t>(vmlal_high_u8(add.raw, mul.raw, x.raw));
#else
  const Full64<uint8_t> dh;
  return Vec128<uint16_t>(
      vmlal_u8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
#endif
}

template<class D, HWY_IF_U16_D(D), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_D(DN, 8)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
  VFromD<D> hi = UpperHalf(d, widen);
  return hi + add;
}

template<class D, HWY_IF_U16(TFromD<D>), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_LE_D(DN, 4)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  Vec128<uint16_t> widen = Vec128<uint16_t>(vmull_u8(mul.raw, x.raw));
  const Twice<decltype(d)> d16F;
  // Only the lower DN lanes of widen hold valid products.
  VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_low_u16(widen.raw)));
  return hi + add;
}

template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_GT_D(DN, 8)>
HWY_API
VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                          VFromD<DN> x, VFromD<D> add) {
#if HWY_ARCH_ARM_A64
  return Vec128<int16_t>(vmlal_high_s8(add.raw, mul.raw, x.raw));
#else
  const Full64<int8_t> dh;
  return Vec128<int16_t>(
      vmlal_s8(add.raw, UpperHalf(dh, mul).raw, UpperHalf(dh, x).raw));
#endif
}

template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_D(DN, 8)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
  VFromD<D> hi = UpperHalf(d, widen);
  return hi + add;
}

template<class D, HWY_IF_I16_D(D), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_LE_D(DN, 4)>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  Vec128<int16_t> widen = Vec128<int16_t>(vmull_s8(mul.raw, x.raw));
  const Twice<decltype(d)> d16F;
  // Only the lower DN lanes of widen hold valid products.
  VFromD<D> hi = UpperHalf(d, VFromD<decltype(d16F)>(vget_low_s16(widen.raw)));
  return hi + add;
}

#if 0
#if HWY_HAVE_FLOAT16
template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 4),
         class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  return VFromD<D>(vfmlalq_high_f16(add.raw, mul.raw, x.raw));
}

template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 2),
         class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenHighMulAdd(D /* tag */, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  return Vec64<float32_t>(vfmlal_high_f16(add.raw, mul.raw, x.raw));
}

template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
         class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenHighMulAdd(D d, VFromD<DN> mul,
                                  VFromD<DN> x, VFromD<D> add) {
  return MulAdd(PromoteUpperTo(d, mul), PromoteUpperTo(d, x), add);
}
#endif
#endif

} // namespace detail

// ------------------------------- WidenMulAdd

#ifdef HWY_NATIVE_WIDEN_MUL_ADD
#undef HWY_NATIVE_WIDEN_MUL_ADD
#else
#define HWY_NATIVE_WIDEN_MUL_ADD
#endif

namespace detail {

template<class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 4),
         class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                              VFromD<DN> x, VFromD<D> add) {
  return Vec128<uint16_t>(vmlal_u8(add.raw, mul.raw, x.raw));
}

template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_LE_D(D, 4),
          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
                              VFromD<D> add) {
  // MulAdd(a, b, c) computes a * b + c, i.e. mul * x + add here.
  return MulAdd(PromoteTo(d, mul), PromoteTo(d, x), add);
}

template<class D, HWY_IF_I16_D(D), HWY_IF_LANES_GT_D(D, 4),
         class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul,
                              VFromD<DN> x, VFromD<D> add) {
  return VFromD<D>(vmlal_s8(add.raw, mul.raw, x.raw));
}

template <class D, HWY_IF_I16_D(D), HWY_IF_LANES_LE_D(D, 4),
          class DN = Rebind<MakeNarrow<TFromD<D>>, D>>
HWY_API VFromD<D> WidenMulAdd(D d, VFromD<DN> mul, VFromD<DN> x,
                              VFromD<D> add) {
  return MulAdd(PromoteTo(d, mul), PromoteTo(d, x), add);
}

template<class D, HWY_IF_I32_D(D),
         class DN = Rebind<MakeNarrow<TFromD<D>>, D>,
         HWY_IF_LANES_GT_D(DN,
2)> 7147 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul, 7148 VFromD<DN> x, VFromD<D> add) { 7149 return Vec128<int32_t>(vmlal_s16(add.raw, mul.raw, x.raw)); 7150 } 7151 7152 template<class D, HWY_IF_I32_D(D), 7153 class DN = Rebind<MakeNarrow<TFromD<D>>, D>, 7154 HWY_IF_LANES_D(DN, 2)> 7155 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul, 7156 VFromD<DN> x, VFromD<D> add) { 7157 Vec128<int32_t> mulRs = Vec128<int32_t>(vmull_s16(mul.raw, x.raw)); 7158 const VFromD<D> mul10 = LowerHalf(mulRs); 7159 return add + mul10; 7160 } 7161 7162 template<class D, HWY_IF_I32_D(D), 7163 class DN = Rebind<MakeNarrow<TFromD<D>>, D>, 7164 HWY_IF_LANES_D(D, 1)> 7165 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul, 7166 VFromD<DN> x, VFromD<D> add) { 7167 Vec64<int32_t> mulRs = LowerHalf(Vec128<int32_t>(vmull_s16(mul.raw, x.raw))); 7168 const Vec32<int32_t> mul10(LowerHalf(mulRs)); 7169 return add + mul10; 7170 } 7171 7172 template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_GT_D(D, 2), 7173 class DN = Rebind<MakeNarrow<TFromD<D>>, D>> 7174 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul, 7175 VFromD<DN> x, VFromD<D> add) { 7176 return Vec128<uint32_t>(vmlal_u16(add.raw, mul.raw, x.raw)); 7177 } 7178 7179 template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 2), 7180 class DN = Rebind<MakeNarrow<TFromD<D>>, D>> 7181 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul, 7182 VFromD<DN> x, VFromD<D> add) { 7183 Vec128<uint32_t> mulRs = Vec128<uint32_t>(vmull_u16(mul.raw, x.raw)); 7184 const Vec64<uint32_t> mul10(LowerHalf(mulRs)); 7185 return add + mul10; 7186 } 7187 7188 template<class D, HWY_IF_U32_D(D), HWY_IF_LANES_D(D, 1), 7189 class DN = Rebind<MakeNarrow<TFromD<D>>, D>> 7190 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul, 7191 VFromD<DN> x, VFromD<D> add) { 7192 Vec64<uint32_t> mulRs = 7193 LowerHalf(Vec128<uint32_t>(vmull_u16(mul.raw, x.raw))); 7194 const Vec32<uint32_t> mul10(LowerHalf(mulRs)); 7195 return add + mul10; 7196 } 7197 7198 template<class D, HWY_IF_I64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>, 7199 HWY_IF_LANES_D(DN, 2)> 7200 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul, 7201 VFromD<DN> x, VFromD<D> add) { 7202 return VFromD<D>(vmlal_s32(add.raw, mul.raw, x.raw)); 7203 } 7204 7205 template<class D, HWY_IF_I64_D(D), HWY_IF_LANES_D(D, 1), 7206 class DN = Rebind<MakeNarrow<TFromD<D>>, D>> 7207 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul, 7208 VFromD<DN> x, VFromD<D> add) { 7209 Vec128<int64_t> mulRs = Vec128<int64_t>(vmull_s32(mul.raw, x.raw)); 7210 const VFromD<D> mul10(LowerHalf(mulRs)); 7211 return add + mul10; 7212 } 7213 7214 template<class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>, 7215 HWY_IF_LANES_D(DN, 2)> 7216 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul, 7217 VFromD<DN> x, VFromD<D> add) { 7218 return VFromD<D>(vmlal_u32(add.raw, mul.raw, x.raw)); 7219 } 7220 7221 template<class D, HWY_IF_U64_D(D), class DN = Rebind<MakeNarrow<TFromD<D>>, D>, 7222 HWY_IF_LANES_D(DN, 1)> 7223 HWY_API VFromD<D> WidenMulAdd(D /* tag */, VFromD<DN> mul, 7224 VFromD<DN> x, VFromD<D> add) { 7225 Vec128<uint64_t> mulRs = Vec128<uint64_t>(vmull_u32(mul.raw, x.raw)); 7226 const VFromD<D> mul10(LowerHalf(mulRs)); 7227 return add + mul10; 7228 } 7229 7230 #if 0 7231 #if HWY_HAVE_FLOAT16 7232 template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>, 7233 HWY_IF_LANES_D(D, 4)> 7234 HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul, 7235 VFromD<DN> x, VFromD<D> add) { 
  return VFromD<D>(vfmlalq_low_f16(add.raw, mul.raw, x.raw));
}

template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>,
         HWY_IF_LANES_D(DN, 4)>
HWY_API VFromD<D> WidenLowMulAdd(D /* tag */, VFromD<DN> mul,
                                 VFromD<DN> x, VFromD<D> add) {
  return Vec64<float32_t>(vfmlal_low_f16(add.raw, mul.raw, x.raw));
}

template<class D, HWY_IF_F32_D(D), HWY_IF_LANES_D(D, 1),
         class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenLowMulAdd(D d, VFromD<DN> mul,
                                 VFromD<DN> x, VFromD<D> add) {
  return MulAdd(PromoteLowerTo(d, mul), PromoteLowerTo(d, x), add);
}
#endif
#endif

} // namespace detail

// ------------------------------ WidenMulAccumulate

#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE
#else
#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE
#endif

template<class D, HWY_IF_INTEGER(TFromD<D>), class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
                                     VFromD<D> low, VFromD<D>& high) {
  high = detail::WidenHighMulAdd(d, mul, x, high);
  return detail::WidenMulAdd(d, LowerHalf(mul), LowerHalf(x), low);
}

#if 0
#ifdef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
#undef HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
#else
#define HWY_NATIVE_WIDEN_MUL_ACCUMULATE_F16
#endif

#if HWY_HAVE_FLOAT16

template<class D, HWY_IF_F32_D(D), class DN = RepartitionToNarrow<D>>
HWY_API VFromD<D> WidenMulAccumulate(D d, VFromD<DN> mul, VFromD<DN> x,
                                     VFromD<D> low, VFromD<D>& high) {
  high = detail::WidenHighMulAdd(d, mul, x, high);
  return detail::WidenLowMulAdd(d, mul, x, low);
}

#endif
#endif

// ------------------------------ SatWidenMulAccumFixedPoint

#ifdef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#undef HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#else
#define HWY_NATIVE_I16_SATWIDENMULACCUMFIXEDPOINT
#endif

template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)>
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 /*di32*/,
                                                VFromD<Rebind<int16_t, DI32>> a,
                                                VFromD<Rebind<int16_t, DI32>> b,
                                                VFromD<DI32> sum) {
  return VFromD<DI32>(vqdmlal_s16(sum.raw, a.raw, b.raw));
}

template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)>
HWY_API VFromD<DI32> SatWidenMulAccumFixedPoint(DI32 di32,
                                                VFromD<Rebind<int16_t, DI32>> a,
                                                VFromD<Rebind<int16_t, DI32>> b,
                                                VFromD<DI32> sum) {
  const Full128<TFromD<DI32>> di32_full;
  const Rebind<int16_t, decltype(di32_full)> di16_full64;
  return ResizeBitCast(
      di32, SatWidenMulAccumFixedPoint(di32_full, ResizeBitCast(di16_full64, a),
                                       ResizeBitCast(di16_full64, b),
                                       ResizeBitCast(di32_full, sum)));
}

// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)

#if HWY_NEON_HAVE_F32_TO_BF16C

#ifdef HWY_NATIVE_MUL_EVEN_BF16
#undef HWY_NATIVE_MUL_EVEN_BF16
#else
#define HWY_NATIVE_MUL_EVEN_BF16
#endif

#ifdef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
#undef HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
#else
#define HWY_NATIVE_REORDER_WIDEN_MUL_ACC_BF16
#endif

namespace detail {
#if HWY_NEON_HAVE_BFLOAT16
// If HWY_NEON_HAVE_BFLOAT16 is true, detail::Raw128<bfloat16_t, N>::type is
// bfloat16x4_t or bfloat16x8_t.
static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(bfloat16x4_t raw) {
  return raw;
}
static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(bfloat16x8_t raw) {
  return raw;
}
#else
// If HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true,
// detail::Raw128<bfloat16_t, N>::type is a uint16x4_t or uint16x8_t vector,
// to work around compiler bugs present in GCC 13 or earlier and Clang 16 or
// earlier on AArch64.

// The uint16x4_t or uint16x8_t vector needs to be bitcast to a bfloat16x4_t
// or a bfloat16x8_t vector for the vbfdot_f32 and vbfdotq_f32 intrinsics if
// HWY_NEON_HAVE_F32_TO_BF16C && !HWY_NEON_HAVE_BFLOAT16 is true.
static HWY_INLINE bfloat16x4_t BitCastToRawNeonBF16(uint16x4_t raw) {
  return vreinterpret_bf16_u16(raw);
}
static HWY_INLINE bfloat16x8_t BitCastToRawNeonBF16(uint16x8_t raw) {
  return vreinterpretq_bf16_u16(raw);
}
#endif
} // namespace detail

template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API Vec128<float> MulEvenAdd(D /*d32*/, Vec128<bfloat16_t> a,
                                 Vec128<bfloat16_t> b, const Vec128<float> c) {
  return Vec128<float>(vbfmlalbq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
                                     detail::BitCastToRawNeonBF16(b.raw)));
}

template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API Vec128<float> MulOddAdd(D /*d32*/, Vec128<bfloat16_t> a,
                                Vec128<bfloat16_t> b, const Vec128<float> c) {
  return Vec128<float>(vbfmlaltq_f32(c.raw, detail::BitCastToRawNeonBF16(a.raw),
                                     detail::BitCastToRawNeonBF16(b.raw)));
}

template <class D, HWY_IF_V_SIZE_D(D, 16)>
HWY_API Vec128<float> ReorderWidenMulAccumulate(D /*d32*/, Vec128<bfloat16_t> a,
                                                Vec128<bfloat16_t> b,
                                                const Vec128<float> sum0,
                                                Vec128<float>& /*sum1*/) {
  return Vec128<float>(vbfdotq_f32(sum0.raw,
                                   detail::BitCastToRawNeonBF16(a.raw),
                                   detail::BitCastToRawNeonBF16(b.raw)));
}

// There is no non-q version of these instructions.
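// Hence the partial-vector MulEvenAdd/MulOddAdd below promote their inputs
// to full vectors, compute there, and shrink the result back.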
7389 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 7390 HWY_API VFromD<D> MulEvenAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a, 7391 VFromD<Repartition<bfloat16_t, D>> b, 7392 const VFromD<D> c) { 7393 const Full128<float> d32f; 7394 const Full128<bfloat16_t> d16f; 7395 return ResizeBitCast( 7396 d32, MulEvenAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b), 7397 ResizeBitCast(d32f, c))); 7398 } 7399 7400 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 7401 HWY_API VFromD<D> MulOddAdd(D d32, VFromD<Repartition<bfloat16_t, D>> a, 7402 VFromD<Repartition<bfloat16_t, D>> b, 7403 const VFromD<D> c) { 7404 const Full128<float> d32f; 7405 const Full128<bfloat16_t> d16f; 7406 return ResizeBitCast( 7407 d32, MulOddAdd(d32f, ResizeBitCast(d16f, a), ResizeBitCast(d16f, b), 7408 ResizeBitCast(d32f, c))); 7409 } 7410 7411 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 7412 HWY_API VFromD<D> ReorderWidenMulAccumulate( 7413 D /*d32*/, VFromD<Repartition<bfloat16_t, D>> a, 7414 VFromD<Repartition<bfloat16_t, D>> b, const VFromD<D> sum0, 7415 VFromD<D>& /*sum1*/) { 7416 return VFromD<D>(vbfdot_f32(sum0.raw, detail::BitCastToRawNeonBF16(a.raw), 7417 detail::BitCastToRawNeonBF16(b.raw))); 7418 } 7419 7420 #endif // HWY_NEON_HAVE_F32_TO_BF16C 7421 7422 template <class D, HWY_IF_I32_D(D)> 7423 HWY_API Vec128<int32_t> ReorderWidenMulAccumulate(D /*d32*/, Vec128<int16_t> a, 7424 Vec128<int16_t> b, 7425 const Vec128<int32_t> sum0, 7426 Vec128<int32_t>& sum1) { 7427 #if HWY_ARCH_ARM_A64 7428 sum1 = Vec128<int32_t>(vmlal_high_s16(sum1.raw, a.raw, b.raw)); 7429 #else 7430 const Full64<int16_t> dh; 7431 sum1 = Vec128<int32_t>( 7432 vmlal_s16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); 7433 #endif 7434 return Vec128<int32_t>( 7435 vmlal_s16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw)); 7436 } 7437 7438 template <class D, HWY_IF_I32_D(D)> 7439 HWY_API Vec64<int32_t> ReorderWidenMulAccumulate(D d32, Vec64<int16_t> a, 7440 Vec64<int16_t> b, 7441 const Vec64<int32_t> sum0, 7442 Vec64<int32_t>& sum1) { 7443 // vmlal writes into the upper half, which the caller cannot use, so 7444 // split into two halves. 
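// mul_3210 holds all four widened products; the lower two go to sum0 and
// the upper two to sum1.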
7445 const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw)); 7446 const Vec64<int32_t> mul_32 = UpperHalf(d32, mul_3210); 7447 sum1 += mul_32; 7448 return sum0 + LowerHalf(mul_3210); 7449 } 7450 7451 template <class D, HWY_IF_I32_D(D)> 7452 HWY_API Vec32<int32_t> ReorderWidenMulAccumulate(D d32, Vec32<int16_t> a, 7453 Vec32<int16_t> b, 7454 const Vec32<int32_t> sum0, 7455 Vec32<int32_t>& sum1) { 7456 const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw)); 7457 const Vec64<int32_t> mul_10(LowerHalf(mul_xx10)); 7458 const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10); 7459 const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10); 7460 sum1 += mul1; 7461 return sum0 + mul0; 7462 } 7463 7464 template <class D, HWY_IF_U32_D(D)> 7465 HWY_API Vec128<uint32_t> ReorderWidenMulAccumulate(D /*d32*/, 7466 Vec128<uint16_t> a, 7467 Vec128<uint16_t> b, 7468 const Vec128<uint32_t> sum0, 7469 Vec128<uint32_t>& sum1) { 7470 #if HWY_ARCH_ARM_A64 7471 sum1 = Vec128<uint32_t>(vmlal_high_u16(sum1.raw, a.raw, b.raw)); 7472 #else 7473 const Full64<uint16_t> dh; 7474 sum1 = Vec128<uint32_t>( 7475 vmlal_u16(sum1.raw, UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); 7476 #endif 7477 return Vec128<uint32_t>( 7478 vmlal_u16(sum0.raw, LowerHalf(a).raw, LowerHalf(b).raw)); 7479 } 7480 7481 template <class D, HWY_IF_U32_D(D)> 7482 HWY_API Vec64<uint32_t> ReorderWidenMulAccumulate(D d32, Vec64<uint16_t> a, 7483 Vec64<uint16_t> b, 7484 const Vec64<uint32_t> sum0, 7485 Vec64<uint32_t>& sum1) { 7486 // vmlal writes into the upper half, which the caller cannot use, so 7487 // split into two halves. 7488 const Vec128<uint32_t> mul_3210(vmull_u16(a.raw, b.raw)); 7489 const Vec64<uint32_t> mul_32 = UpperHalf(d32, mul_3210); 7490 sum1 += mul_32; 7491 return sum0 + LowerHalf(mul_3210); 7492 } 7493 7494 template <class D, HWY_IF_U32_D(D)> 7495 HWY_API Vec32<uint32_t> ReorderWidenMulAccumulate(D du32, Vec32<uint16_t> a, 7496 Vec32<uint16_t> b, 7497 const Vec32<uint32_t> sum0, 7498 Vec32<uint32_t>& sum1) { 7499 const Vec128<uint32_t> mul_xx10(vmull_u16(a.raw, b.raw)); 7500 const Vec64<uint32_t> mul_10(LowerHalf(mul_xx10)); 7501 const Vec32<uint32_t> mul0 = LowerHalf(du32, mul_10); 7502 const Vec32<uint32_t> mul1 = UpperHalf(du32, mul_10); 7503 sum1 += mul1; 7504 return sum0 + mul0; 7505 } 7506 7507 // ------------------------------ Combine partial (InterleaveLower) 7508 // < 64bit input, <= 64 bit result 7509 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 7510 HWY_API VFromD<D> Combine(D d, VFromD<Half<D>> hi, VFromD<Half<D>> lo) { 7511 // First double N (only lower halves will be used). 7512 const VFromD<D> hi2(hi.raw); 7513 const VFromD<D> lo2(lo.raw); 7514 // Repartition to two unsigned lanes (each the size of the valid input). 7515 const Simd<UnsignedFromSize<d.MaxBytes() / 2>, 2, 0> du; 7516 return BitCast(d, InterleaveLower(BitCast(du, lo2), BitCast(du, hi2))); 7517 } 7518 7519 // ------------------------------ RearrangeToOddPlusEven (Combine) 7520 7521 namespace detail { 7522 // Armv7 only provides 64-bit (half-vector) pairwise operations. 7523 #define HWY_NEON_DEF_PAIRWISE_OP(T, name, prefix, suffix) \ 7524 HWY_INLINE Vec64<T> Pairwise##name(Vec64<T> a, Vec64<T> b) { \ 7525 return Vec64<T>(prefix##_##suffix(a.raw, b.raw)); \ 7526 } 7527 7528 // Note that Armv7 also lacks [u]int64 instructions, which are handled by 7529 // generic_ops-inl.h SumOfLanes etc., hence no 64-bit overloads here. 
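// For example, HWY_NEON_DEF_PAIRWISE_OPS(Sum, vpadd) below defines
// PairwiseSum overloads wrapping vpadd_u32, vpadd_u16, ..., vpadd_f32.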
7530 #define HWY_NEON_DEF_PAIRWISE_OPS(name, prefix) \ 7531 HWY_NEON_DEF_PAIRWISE_OP(uint32_t, name, prefix, u32) \ 7532 HWY_NEON_DEF_PAIRWISE_OP(uint16_t, name, prefix, u16) \ 7533 HWY_NEON_DEF_PAIRWISE_OP(uint8_t, name, prefix, u8) \ 7534 HWY_NEON_DEF_PAIRWISE_OP(int32_t, name, prefix, s32) \ 7535 HWY_NEON_DEF_PAIRWISE_OP(int16_t, name, prefix, s16) \ 7536 HWY_NEON_DEF_PAIRWISE_OP(int8_t, name, prefix, s8) \ 7537 HWY_NEON_DEF_PAIRWISE_OP(float32_t, name, prefix, f32) 7538 7539 HWY_NEON_DEF_PAIRWISE_OPS(Sum, vpadd) 7540 HWY_NEON_DEF_PAIRWISE_OPS(Min, vpmin) 7541 HWY_NEON_DEF_PAIRWISE_OPS(Max, vpmax) 7542 #undef HWY_NEON_DEF_PAIRWISE_OPS 7543 #undef HWY_NEON_DEF_PAIRWISE_OP 7544 } // namespace detail 7545 7546 template <size_t N> 7547 HWY_API Vec128<float, N> RearrangeToOddPlusEven(Vec128<float, N> sum0, 7548 Vec128<float, N> sum1) { 7549 #if HWY_NEON_HAVE_BFLOAT16 7550 (void)sum1; // unused by bf16 ReorderWidenMulAccumulate 7551 return sum0; 7552 #else 7553 return Add(sum0, sum1); 7554 #endif 7555 } 7556 7557 HWY_API Vec128<int32_t> RearrangeToOddPlusEven(Vec128<int32_t> sum0, 7558 Vec128<int32_t> sum1) { 7559 // vmlal_s16 multiplied the lower half into sum0 and upper into sum1. 7560 #if HWY_ARCH_ARM_A64 // pairwise sum is available and what we want 7561 return Vec128<int32_t>(vpaddq_s32(sum0.raw, sum1.raw)); 7562 #else 7563 const Full128<int32_t> d; 7564 const Half<decltype(d)> d64; 7565 const Vec64<int32_t> hi = 7566 detail::PairwiseSum(LowerHalf(d64, sum1), UpperHalf(d64, sum1)); 7567 const Vec64<int32_t> lo( 7568 detail::PairwiseSum(LowerHalf(d64, sum0), UpperHalf(d64, sum0))); 7569 return Combine(d, hi, lo); 7570 #endif 7571 } 7572 7573 HWY_API Vec64<int32_t> RearrangeToOddPlusEven(Vec64<int32_t> sum0, 7574 Vec64<int32_t> sum1) { 7575 // vmlal_s16 multiplied the lower half into sum0 and upper into sum1. 7576 return detail::PairwiseSum(sum0, sum1); 7577 } 7578 7579 HWY_API Vec32<int32_t> RearrangeToOddPlusEven(Vec32<int32_t> sum0, 7580 Vec32<int32_t> sum1) { 7581 // Only one widened sum per register, so add them for sum of odd and even. 7582 return sum0 + sum1; 7583 } 7584 7585 HWY_API Vec128<uint32_t> RearrangeToOddPlusEven(Vec128<uint32_t> sum0, 7586 Vec128<uint32_t> sum1) { 7587 // vmlal_s16 multiplied the lower half into sum0 and upper into sum1. 7588 #if HWY_ARCH_ARM_A64 // pairwise sum is available and what we want 7589 return Vec128<uint32_t>(vpaddq_u32(sum0.raw, sum1.raw)); 7590 #else 7591 const Full128<uint32_t> d; 7592 const Half<decltype(d)> d64; 7593 const Vec64<uint32_t> hi = 7594 detail::PairwiseSum(LowerHalf(d64, sum1), UpperHalf(d64, sum1)); 7595 const Vec64<uint32_t> lo = 7596 detail::PairwiseSum(LowerHalf(d64, sum0), UpperHalf(d64, sum0)); 7597 return Combine(d, hi, lo); 7598 #endif 7599 } 7600 7601 HWY_API Vec64<uint32_t> RearrangeToOddPlusEven(Vec64<uint32_t> sum0, 7602 Vec64<uint32_t> sum1) { 7603 // vmlal_u16 multiplied the lower half into sum0 and upper into sum1. 7604 return detail::PairwiseSum(sum0, sum1); 7605 } 7606 7607 HWY_API Vec32<uint32_t> RearrangeToOddPlusEven(Vec32<uint32_t> sum0, 7608 Vec32<uint32_t> sum1) { 7609 // Only one widened sum per register, so add them for sum of odd and even. 
7610 return sum0 + sum1; 7611 } 7612 7613 // ------------------------------ SumOfMulQuadAccumulate 7614 7615 #if HWY_TARGET == HWY_NEON_BF16 7616 7617 #ifdef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE 7618 #undef HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE 7619 #else 7620 #define HWY_NATIVE_I8_I8_SUMOFMULQUADACCUMULATE 7621 #endif 7622 7623 template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)> 7624 HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/, 7625 VFromD<Repartition<int8_t, DI32>> a, 7626 VFromD<Repartition<int8_t, DI32>> b, 7627 VFromD<DI32> sum) { 7628 return VFromD<DI32>(vdot_s32(sum.raw, a.raw, b.raw)); 7629 } 7630 7631 template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)> 7632 HWY_API VFromD<DI32> SumOfMulQuadAccumulate(DI32 /*di32*/, 7633 VFromD<Repartition<int8_t, DI32>> a, 7634 VFromD<Repartition<int8_t, DI32>> b, 7635 VFromD<DI32> sum) { 7636 return VFromD<DI32>(vdotq_s32(sum.raw, a.raw, b.raw)); 7637 } 7638 7639 #ifdef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE 7640 #undef HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE 7641 #else 7642 #define HWY_NATIVE_U8_U8_SUMOFMULQUADACCUMULATE 7643 #endif 7644 7645 template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_LE_D(DU32, 8)> 7646 HWY_API VFromD<DU32> SumOfMulQuadAccumulate( 7647 DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a, 7648 VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { 7649 return VFromD<DU32>(vdot_u32(sum.raw, a.raw, b.raw)); 7650 } 7651 7652 template <class DU32, HWY_IF_U32_D(DU32), HWY_IF_V_SIZE_D(DU32, 16)> 7653 HWY_API VFromD<DU32> SumOfMulQuadAccumulate( 7654 DU32 /*du32*/, VFromD<Repartition<uint8_t, DU32>> a, 7655 VFromD<Repartition<uint8_t, DU32>> b, VFromD<DU32> sum) { 7656 return VFromD<DU32>(vdotq_u32(sum.raw, a.raw, b.raw)); 7657 } 7658 7659 #ifdef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE 7660 #undef HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE 7661 #else 7662 #define HWY_NATIVE_U8_I8_SUMOFMULQUADACCUMULATE 7663 #endif 7664 7665 template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_LE_D(DI32, 8)> 7666 HWY_API VFromD<DI32> SumOfMulQuadAccumulate( 7667 DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u, 7668 VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) { 7669 return VFromD<DI32>(vusdot_s32(sum.raw, a_u.raw, b_i.raw)); 7670 } 7671 7672 template <class DI32, HWY_IF_I32_D(DI32), HWY_IF_V_SIZE_D(DI32, 16)> 7673 HWY_API VFromD<DI32> SumOfMulQuadAccumulate( 7674 DI32 /*di32*/, VFromD<Repartition<uint8_t, DI32>> a_u, 7675 VFromD<Repartition<int8_t, DI32>> b_i, VFromD<DI32> sum) { 7676 return VFromD<DI32>(vusdotq_s32(sum.raw, a_u.raw, b_i.raw)); 7677 } 7678 7679 #endif // HWY_TARGET == HWY_NEON_BF16 7680 7681 // ------------------------------ WidenMulPairwiseAdd 7682 7683 #if HWY_NEON_HAVE_F32_TO_BF16C 7684 7685 template <class DF, HWY_IF_V_SIZE_D(DF, 16)> 7686 HWY_API Vec128<float> WidenMulPairwiseAdd(DF df, Vec128<bfloat16_t> a, 7687 Vec128<bfloat16_t> b) { 7688 return Vec128<float>(vbfdotq_f32(Zero(df).raw, 7689 detail::BitCastToRawNeonBF16(a.raw), 7690 detail::BitCastToRawNeonBF16(b.raw))); 7691 } 7692 7693 template <class DF, HWY_IF_V_SIZE_LE_D(DF, 8)> 7694 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, 7695 VFromD<Repartition<bfloat16_t, DF>> a, 7696 VFromD<Repartition<bfloat16_t, DF>> b) { 7697 return VFromD<DF>(vbfdot_f32(Zero(df).raw, 7698 detail::BitCastToRawNeonBF16(a.raw), 7699 detail::BitCastToRawNeonBF16(b.raw))); 7700 } 7701 7702 #else 7703 template <class DF, HWY_IF_F32_D(DF)> 7704 HWY_API VFromD<DF> WidenMulPairwiseAdd(DF df, 7705 
VFromD<Repartition<bfloat16_t, DF>> a, 7706 VFromD<Repartition<bfloat16_t, DF>> b) { 7707 return MulAdd(PromoteEvenTo(df, a), PromoteEvenTo(df, b), 7708 Mul(PromoteOddTo(df, a), PromoteOddTo(df, b))); 7709 } 7710 #endif // HWY_NEON_HAVE_F32_TO_BF16C 7711 7712 template <class D, HWY_IF_I32_D(D)> 7713 HWY_API Vec128<int32_t> WidenMulPairwiseAdd(D /*d32*/, Vec128<int16_t> a, 7714 Vec128<int16_t> b) { 7715 Vec128<int32_t> sum1; 7716 #if HWY_ARCH_ARM_A64 7717 sum1 = Vec128<int32_t>(vmull_high_s16(a.raw, b.raw)); 7718 #else 7719 const Full64<int16_t> dh; 7720 sum1 = Vec128<int32_t>(vmull_s16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); 7721 #endif 7722 Vec128<int32_t> sum0 = 7723 Vec128<int32_t>(vmull_s16(LowerHalf(a).raw, LowerHalf(b).raw)); 7724 return RearrangeToOddPlusEven(sum0, sum1); 7725 } 7726 7727 template <class D, HWY_IF_I32_D(D)> 7728 HWY_API Vec64<int32_t> WidenMulPairwiseAdd(D d32, Vec64<int16_t> a, 7729 Vec64<int16_t> b) { 7730 // vmlal writes into the upper half, which the caller cannot use, so 7731 // split into two halves. 7732 const Vec128<int32_t> mul_3210(vmull_s16(a.raw, b.raw)); 7733 const Vec64<int32_t> mul0 = LowerHalf(mul_3210); 7734 const Vec64<int32_t> mul1 = UpperHalf(d32, mul_3210); 7735 return RearrangeToOddPlusEven(mul0, mul1); 7736 } 7737 7738 template <class D, HWY_IF_I32_D(D)> 7739 HWY_API Vec32<int32_t> WidenMulPairwiseAdd(D d32, Vec32<int16_t> a, 7740 Vec32<int16_t> b) { 7741 const Vec128<int32_t> mul_xx10(vmull_s16(a.raw, b.raw)); 7742 const Vec64<int32_t> mul_10(LowerHalf(mul_xx10)); 7743 const Vec32<int32_t> mul0 = LowerHalf(d32, mul_10); 7744 const Vec32<int32_t> mul1 = UpperHalf(d32, mul_10); 7745 return RearrangeToOddPlusEven(mul0, mul1); 7746 } 7747 7748 template <class D, HWY_IF_U32_D(D)> 7749 HWY_API Vec128<uint32_t> WidenMulPairwiseAdd(D /*d32*/, Vec128<uint16_t> a, 7750 Vec128<uint16_t> b) { 7751 Vec128<uint32_t> sum1; 7752 #if HWY_ARCH_ARM_A64 7753 sum1 = Vec128<uint32_t>(vmull_high_u16(a.raw, b.raw)); 7754 #else 7755 const Full64<uint16_t> dh; 7756 sum1 = 7757 Vec128<uint32_t>(vmull_u16(UpperHalf(dh, a).raw, UpperHalf(dh, b).raw)); 7758 #endif 7759 Vec128<uint32_t> sum0 = 7760 Vec128<uint32_t>(vmull_u16(LowerHalf(a).raw, LowerHalf(b).raw)); 7761 return RearrangeToOddPlusEven(sum0, sum1); 7762 } 7763 7764 template <class D, HWY_IF_U32_D(D)> 7765 HWY_API Vec64<uint32_t> WidenMulPairwiseAdd(D d32, Vec64<uint16_t> a, 7766 Vec64<uint16_t> b) { 7767 // vmlal writes into the upper half, which the caller cannot use, so 7768 // split into two halves. 
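// RearrangeToOddPlusEven then pairwise-sums mul0 and mul1 into
// {p1+p0, p3+p2}.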
7769 const Vec128<uint32_t> mul_3210(vmull_u16(a.raw, b.raw)); 7770 const Vec64<uint32_t> mul0 = LowerHalf(mul_3210); 7771 const Vec64<uint32_t> mul1 = UpperHalf(d32, mul_3210); 7772 return RearrangeToOddPlusEven(mul0, mul1); 7773 } 7774 7775 template <class D, HWY_IF_U32_D(D)> 7776 HWY_API Vec32<uint32_t> WidenMulPairwiseAdd(D d32, Vec32<uint16_t> a, 7777 Vec32<uint16_t> b) { 7778 const Vec128<uint32_t> mul_xx10(vmull_u16(a.raw, b.raw)); 7779 const Vec64<uint32_t> mul_10(LowerHalf(mul_xx10)); 7780 const Vec32<uint32_t> mul0 = LowerHalf(d32, mul_10); 7781 const Vec32<uint32_t> mul1 = UpperHalf(d32, mul_10); 7782 return RearrangeToOddPlusEven(mul0, mul1); 7783 } 7784 7785 // ------------------------------ ZeroExtendVector (Combine) 7786 7787 template <class D> 7788 HWY_API VFromD<D> ZeroExtendVector(D d, VFromD<Half<D>> lo) { 7789 return Combine(d, Zero(Half<decltype(d)>()), lo); 7790 } 7791 7792 // ------------------------------ ConcatLowerLower 7793 7794 // 64 or 128-bit input: just interleave 7795 template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> 7796 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { 7797 // Treat half-width input as a single lane and interleave them. 7798 const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du; 7799 return BitCast(d, InterleaveLower(BitCast(du, lo), BitCast(du, hi))); 7800 } 7801 7802 namespace detail { 7803 #if HWY_ARCH_ARM_A64 7804 HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveEven, vtrn1, _, 2) 7805 HWY_NEON_DEF_FUNCTION_UIF_8_16_32(InterleaveOdd, vtrn2, _, 2) 7806 #else 7807 7808 // vtrn returns a struct with even and odd result. 7809 #define HWY_NEON_BUILD_TPL_HWY_TRN 7810 #define HWY_NEON_BUILD_RET_HWY_TRN(type, size) type##x##size##x2_t 7811 // Pass raw args so we can accept uint16x2 args, for which there is no 7812 // corresponding uint16x2x2 return type. 7813 #define HWY_NEON_BUILD_PARAM_HWY_TRN(TYPE, size) \ 7814 Raw128<TYPE##_t, size>::type a, Raw128<TYPE##_t, size>::type b 7815 #define HWY_NEON_BUILD_ARG_HWY_TRN a, b 7816 7817 // Cannot use UINT8 etc. type macros because the x2_t tuples are only defined 7818 // for full and half vectors. 
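// InterleaveEvenOdd returns both results: .val[0] holds the even lanes and
// .val[1] the odd lanes.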
7819 HWY_NEON_DEF_FUNCTION(uint8, 16, InterleaveEvenOdd, vtrnq, _, u8, HWY_TRN) 7820 HWY_NEON_DEF_FUNCTION(uint8, 8, InterleaveEvenOdd, vtrn, _, u8, HWY_TRN) 7821 HWY_NEON_DEF_FUNCTION(uint16, 8, InterleaveEvenOdd, vtrnq, _, u16, HWY_TRN) 7822 HWY_NEON_DEF_FUNCTION(uint16, 4, InterleaveEvenOdd, vtrn, _, u16, HWY_TRN) 7823 HWY_NEON_DEF_FUNCTION(uint32, 4, InterleaveEvenOdd, vtrnq, _, u32, HWY_TRN) 7824 HWY_NEON_DEF_FUNCTION(uint32, 2, InterleaveEvenOdd, vtrn, _, u32, HWY_TRN) 7825 HWY_NEON_DEF_FUNCTION(int8, 16, InterleaveEvenOdd, vtrnq, _, s8, HWY_TRN) 7826 HWY_NEON_DEF_FUNCTION(int8, 8, InterleaveEvenOdd, vtrn, _, s8, HWY_TRN) 7827 HWY_NEON_DEF_FUNCTION(int16, 8, InterleaveEvenOdd, vtrnq, _, s16, HWY_TRN) 7828 HWY_NEON_DEF_FUNCTION(int16, 4, InterleaveEvenOdd, vtrn, _, s16, HWY_TRN) 7829 HWY_NEON_DEF_FUNCTION(int32, 4, InterleaveEvenOdd, vtrnq, _, s32, HWY_TRN) 7830 HWY_NEON_DEF_FUNCTION(int32, 2, InterleaveEvenOdd, vtrn, _, s32, HWY_TRN) 7831 HWY_NEON_DEF_FUNCTION(float32, 4, InterleaveEvenOdd, vtrnq, _, f32, HWY_TRN) 7832 HWY_NEON_DEF_FUNCTION(float32, 2, InterleaveEvenOdd, vtrn, _, f32, HWY_TRN) 7833 7834 #undef HWY_NEON_BUILD_TPL_HWY_TRN 7835 #undef HWY_NEON_BUILD_RET_HWY_TRN 7836 #undef HWY_NEON_BUILD_PARAM_HWY_TRN 7837 #undef HWY_NEON_BUILD_ARG_HWY_TRN 7838 7839 #endif // HWY_ARCH_ARM_A64 7840 } // namespace detail 7841 7842 // <= 32-bit input/output 7843 template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> 7844 HWY_API VFromD<D> ConcatLowerLower(D d, VFromD<D> hi, VFromD<D> lo) { 7845 // Treat half-width input as two lanes and take every second one. 7846 const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du; 7847 #if HWY_ARCH_ARM_A64 7848 return BitCast(d, detail::InterleaveEven(BitCast(du, lo), BitCast(du, hi))); 7849 #else 7850 using VU = VFromD<decltype(du)>; 7851 return BitCast( 7852 d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw) 7853 .val[0])); 7854 #endif 7855 } 7856 7857 // ------------------------------ ConcatUpperUpper 7858 7859 // 64 or 128-bit input: just interleave 7860 template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> 7861 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { 7862 // Treat half-width input as a single lane and interleave them. 7863 const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du; 7864 return BitCast(d, InterleaveUpper(du, BitCast(du, lo), BitCast(du, hi))); 7865 } 7866 7867 // <= 32-bit input/output 7868 template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> 7869 HWY_API VFromD<D> ConcatUpperUpper(D d, VFromD<D> hi, VFromD<D> lo) { 7870 // Treat half-width input as two lanes and take every second one. 
7871 const Repartition<UnsignedFromSize<d.MaxBytes() / 2>, decltype(d)> du; 7872 #if HWY_ARCH_ARM_A64 7873 return BitCast(d, detail::InterleaveOdd(BitCast(du, lo), BitCast(du, hi))); 7874 #else 7875 using VU = VFromD<decltype(du)>; 7876 return BitCast( 7877 d, VU(detail::InterleaveEvenOdd(BitCast(du, lo).raw, BitCast(du, hi).raw) 7878 .val[1])); 7879 #endif 7880 } 7881 7882 // ------------------------------ ConcatLowerUpper (ShiftLeftBytes) 7883 7884 // 64 or 128-bit input: extract from concatenated 7885 template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> 7886 HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) { 7887 return CombineShiftRightBytes<d.MaxBytes() / 2>(d, hi, lo); 7888 } 7889 7890 // <= 32-bit input/output 7891 template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> 7892 HWY_API VFromD<D> ConcatLowerUpper(D d, VFromD<D> hi, VFromD<D> lo) { 7893 constexpr size_t kSize = d.MaxBytes(); 7894 const Repartition<uint8_t, decltype(d)> d8; 7895 const Full64<uint8_t> d8x8; 7896 const Full64<TFromD<D>> d64; 7897 using V8x8 = VFromD<decltype(d8x8)>; 7898 const V8x8 hi8x8(BitCast(d8, hi).raw); 7899 // Move into most-significant bytes 7900 const V8x8 lo8x8 = ShiftLeftBytes<8 - kSize>(V8x8(BitCast(d8, lo).raw)); 7901 const V8x8 r = CombineShiftRightBytes<8 - kSize / 2>(d8x8, hi8x8, lo8x8); 7902 // Back to original lane type, then shrink N. 7903 return VFromD<D>(BitCast(d64, r).raw); 7904 } 7905 7906 // ------------------------------ ConcatUpperLower 7907 7908 // Works for all N. 7909 template <class D> 7910 HWY_API VFromD<D> ConcatUpperLower(D d, VFromD<D> hi, VFromD<D> lo) { 7911 return IfThenElse(FirstN(d, Lanes(d) / 2), lo, hi); 7912 } 7913 7914 // ------------------------------ ConcatOdd (InterleaveUpper) 7915 7916 namespace detail { 7917 // There is no vuzpq_u64. 7918 HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatEven, vuzp1, _, 2) 7919 HWY_NEON_DEF_FUNCTION_UIF_8_16_32(ConcatOdd, vuzp2, _, 2) 7920 7921 #if !HWY_HAVE_FLOAT16 7922 template <size_t N> 7923 HWY_INLINE Vec128<float16_t, N> ConcatEven(Vec128<float16_t, N> hi, 7924 Vec128<float16_t, N> lo) { 7925 const DFromV<decltype(hi)> d; 7926 const RebindToUnsigned<decltype(d)> du; 7927 return BitCast(d, ConcatEven(BitCast(du, hi), BitCast(du, lo))); 7928 } 7929 template <size_t N> 7930 HWY_INLINE Vec128<float16_t, N> ConcatOdd(Vec128<float16_t, N> hi, 7931 Vec128<float16_t, N> lo) { 7932 const DFromV<decltype(hi)> d; 7933 const RebindToUnsigned<decltype(d)> du; 7934 return BitCast(d, ConcatOdd(BitCast(du, hi), BitCast(du, lo))); 7935 } 7936 #endif // !HWY_HAVE_FLOAT16 7937 } // namespace detail 7938 7939 // Full/half vector 7940 template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> 7941 HWY_API VFromD<D> ConcatOdd(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 7942 return detail::ConcatOdd(lo, hi); 7943 } 7944 7945 // 8-bit x4 7946 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> 7947 HWY_API Vec32<T> ConcatOdd(D d, Vec32<T> hi, Vec32<T> lo) { 7948 const Twice<decltype(d)> d2; 7949 const Repartition<uint16_t, decltype(d2)> dw2; 7950 const VFromD<decltype(d2)> hi2(hi.raw); 7951 const VFromD<decltype(d2)> lo2(lo.raw); 7952 const VFromD<decltype(dw2)> Hx1Lx1 = BitCast(dw2, ConcatOdd(d2, hi2, lo2)); 7953 // Compact into two pairs of u8, skipping the invalid x lanes. Could also use 7954 // vcopy_lane_u16, but that's A64-only. 
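// E.g. in lane 3..0 notation, hi = [h3 h2 h1 h0] and lo = [l3 l2 l1 l0]
// produce [h3 h1 l3 l1].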
7955 return Vec32<T>(BitCast(d2, ConcatEven(dw2, Hx1Lx1, Hx1Lx1)).raw); 7956 } 7957 7958 // Any type x2 7959 template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>> 7960 HWY_API Vec128<T, 2> ConcatOdd(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { 7961 return InterleaveUpper(d, lo, hi); 7962 } 7963 7964 // ------------------------------ ConcatEven (InterleaveLower) 7965 7966 // Full/half vector 7967 template <class D, HWY_IF_V_SIZE_GT_D(D, 4)> 7968 HWY_API VFromD<D> ConcatEven(D /* tag */, VFromD<D> hi, VFromD<D> lo) { 7969 return detail::ConcatEven(lo, hi); 7970 } 7971 7972 // 8-bit x4 7973 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 1)> 7974 HWY_API Vec32<T> ConcatEven(D d, Vec32<T> hi, Vec32<T> lo) { 7975 const Twice<decltype(d)> d2; 7976 const Repartition<uint16_t, decltype(d2)> dw2; 7977 const VFromD<decltype(d2)> hi2(hi.raw); 7978 const VFromD<decltype(d2)> lo2(lo.raw); 7979 const VFromD<decltype(dw2)> Hx0Lx0 = BitCast(dw2, ConcatEven(d2, hi2, lo2)); 7980 // Compact into two pairs of u8, skipping the invalid x lanes. Could also use 7981 // vcopy_lane_u16, but that's A64-only. 7982 return Vec32<T>(BitCast(d2, ConcatEven(dw2, Hx0Lx0, Hx0Lx0)).raw); 7983 } 7984 7985 // Any type x2 7986 template <class D, HWY_IF_LANES_D(D, 2), typename T = TFromD<D>> 7987 HWY_API Vec128<T, 2> ConcatEven(D d, Vec128<T, 2> hi, Vec128<T, 2> lo) { 7988 return InterleaveLower(d, lo, hi); 7989 } 7990 7991 // ------------------------------ DupEven (InterleaveLower) 7992 7993 template <typename T, size_t N, 7994 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))> 7995 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { 7996 #if HWY_ARCH_ARM_A64 7997 return detail::InterleaveEven(v, v); 7998 #else 7999 return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[0]); 8000 #endif 8001 } 8002 8003 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 8004 HWY_API Vec128<T, N> DupEven(Vec128<T, N> v) { 8005 return InterleaveLower(DFromV<decltype(v)>(), v, v); 8006 } 8007 8008 // ------------------------------ DupOdd (InterleaveUpper) 8009 8010 template <typename T, size_t N, 8011 HWY_IF_T_SIZE_ONE_OF(T, (1 << 1) | (1 << 2) | (1 << 4))> 8012 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { 8013 #if HWY_ARCH_ARM_A64 8014 return detail::InterleaveOdd(v, v); 8015 #else 8016 return Vec128<T, N>(detail::InterleaveEvenOdd(v.raw, v.raw).val[1]); 8017 #endif 8018 } 8019 8020 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 8021 HWY_API Vec128<T, N> DupOdd(Vec128<T, N> v) { 8022 return InterleaveUpper(DFromV<decltype(v)>(), v, v); 8023 } 8024 8025 // ------------------------------ OddEven (IfThenElse) 8026 8027 template <typename T, size_t N> 8028 HWY_API Vec128<T, N> OddEven(const Vec128<T, N> a, const Vec128<T, N> b) { 8029 const DFromV<decltype(a)> d; 8030 const Repartition<uint8_t, decltype(d)> d8; 8031 alignas(16) static constexpr uint8_t kBytes[16] = { 8032 ((0 / sizeof(T)) & 1) ? 0 : 0xFF, ((1 / sizeof(T)) & 1) ? 0 : 0xFF, 8033 ((2 / sizeof(T)) & 1) ? 0 : 0xFF, ((3 / sizeof(T)) & 1) ? 0 : 0xFF, 8034 ((4 / sizeof(T)) & 1) ? 0 : 0xFF, ((5 / sizeof(T)) & 1) ? 0 : 0xFF, 8035 ((6 / sizeof(T)) & 1) ? 0 : 0xFF, ((7 / sizeof(T)) & 1) ? 0 : 0xFF, 8036 ((8 / sizeof(T)) & 1) ? 0 : 0xFF, ((9 / sizeof(T)) & 1) ? 0 : 0xFF, 8037 ((10 / sizeof(T)) & 1) ? 0 : 0xFF, ((11 / sizeof(T)) & 1) ? 0 : 0xFF, 8038 ((12 / sizeof(T)) & 1) ? 0 : 0xFF, ((13 / sizeof(T)) & 1) ? 0 : 0xFF, 8039 ((14 / sizeof(T)) & 1) ? 0 : 0xFF, ((15 / sizeof(T)) & 1) ? 
0 : 0xFF, 8040 }; 8041 const auto vec = BitCast(d, Load(d8, kBytes)); 8042 return IfThenElse(MaskFromVec(vec), b, a); 8043 } 8044 8045 // ------------------------------ InterleaveEven 8046 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))> 8047 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 8048 #if HWY_ARCH_ARM_A64 8049 return detail::InterleaveEven(a, b); 8050 #else 8051 return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[0]); 8052 #endif 8053 } 8054 8055 template <class D, HWY_IF_T_SIZE_D(D, 8)> 8056 HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) { 8057 return InterleaveLower(a, b); 8058 } 8059 8060 // ------------------------------ InterleaveOdd 8061 template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))> 8062 HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) { 8063 #if HWY_ARCH_ARM_A64 8064 return detail::InterleaveOdd(a, b); 8065 #else 8066 return VFromD<D>(detail::InterleaveEvenOdd(a.raw, b.raw).val[1]); 8067 #endif 8068 } 8069 8070 template <class D, HWY_IF_T_SIZE_D(D, 8)> 8071 HWY_API VFromD<D> InterleaveOdd(D d, VFromD<D> a, VFromD<D> b) { 8072 return InterleaveUpper(d, a, b); 8073 } 8074 8075 // ------------------------------ OddEvenBlocks 8076 template <typename T, size_t N> 8077 HWY_API Vec128<T, N> OddEvenBlocks(Vec128<T, N> /* odd */, Vec128<T, N> even) { 8078 return even; 8079 } 8080 8081 // ------------------------------ SwapAdjacentBlocks 8082 template <typename T, size_t N> 8083 HWY_API Vec128<T, N> SwapAdjacentBlocks(Vec128<T, N> v) { 8084 return v; 8085 } 8086 8087 // ------------------------------ InterleaveEvenBlocks 8088 template <class D, class V = VFromD<D>> 8089 HWY_API V InterleaveEvenBlocks(D, V a, V /*b*/) { 8090 return a; 8091 } 8092 // ------------------------------ InterleaveOddBlocks 8093 template <class D, class V = VFromD<D>> 8094 HWY_API V InterleaveOddBlocks(D, V a, V /*b*/) { 8095 return a; 8096 } 8097 8098 // ------------------------------ ReverseBlocks 8099 // Single block: no change 8100 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 8101 HWY_API VFromD<D> ReverseBlocks(D /* tag */, VFromD<D> v) { 8102 return v; 8103 } 8104 8105 // ------------------------------ ReorderDemote2To (OddEven) 8106 8107 #if HWY_NEON_HAVE_F32_TO_BF16C 8108 template <class D, HWY_IF_BF16_D(D)> 8109 HWY_API VFromD<D> ReorderDemote2To(D dbf16, VFromD<Repartition<float, D>> a, 8110 VFromD<Repartition<float, D>> b) { 8111 const Half<decltype(dbf16)> dh_bf16; 8112 return Combine(dbf16, DemoteTo(dh_bf16, b), DemoteTo(dh_bf16, a)); 8113 } 8114 #endif // HWY_NEON_HAVE_F32_TO_BF16C 8115 8116 template <class D, HWY_IF_I32_D(D)> 8117 HWY_API Vec128<int32_t> ReorderDemote2To(D d32, Vec128<int64_t> a, 8118 Vec128<int64_t> b) { 8119 const Vec64<int32_t> a32(vqmovn_s64(a.raw)); 8120 #if HWY_ARCH_ARM_A64 8121 (void)d32; 8122 return Vec128<int32_t>(vqmovn_high_s64(a32.raw, b.raw)); 8123 #else 8124 const Vec64<int32_t> b32(vqmovn_s64(b.raw)); 8125 return Combine(d32, b32, a32); 8126 #endif 8127 } 8128 8129 template <class D, HWY_IF_I32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> 8130 HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<int64_t, D>> a, 8131 VFromD<Repartition<int64_t, D>> b) { 8132 const Rebind<int64_t, decltype(d32)> dt; 8133 return DemoteTo(d32, Combine(dt, b, a)); 8134 } 8135 8136 template <class D, HWY_IF_U32_D(D)> 8137 HWY_API Vec128<uint32_t> ReorderDemote2To(D d32, Vec128<int64_t> a, 8138 Vec128<int64_t> b) { 8139 const Vec64<uint32_t> 
a32(vqmovun_s64(a.raw)); 8140 #if HWY_ARCH_ARM_A64 8141 (void)d32; 8142 return Vec128<uint32_t>(vqmovun_high_s64(a32.raw, b.raw)); 8143 #else 8144 const Vec64<uint32_t> b32(vqmovun_s64(b.raw)); 8145 return Combine(d32, b32, a32); 8146 #endif 8147 } 8148 8149 template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> 8150 HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<int64_t, D>> a, 8151 VFromD<Repartition<int64_t, D>> b) { 8152 const Rebind<int64_t, decltype(d32)> dt; 8153 return DemoteTo(d32, Combine(dt, b, a)); 8154 } 8155 8156 template <class D, HWY_IF_U32_D(D)> 8157 HWY_API Vec128<uint32_t> ReorderDemote2To(D d32, Vec128<uint64_t> a, 8158 Vec128<uint64_t> b) { 8159 const Vec64<uint32_t> a32(vqmovn_u64(a.raw)); 8160 #if HWY_ARCH_ARM_A64 8161 (void)d32; 8162 return Vec128<uint32_t>(vqmovn_high_u64(a32.raw, b.raw)); 8163 #else 8164 const Vec64<uint32_t> b32(vqmovn_u64(b.raw)); 8165 return Combine(d32, b32, a32); 8166 #endif 8167 } 8168 8169 template <class D, HWY_IF_U32_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> 8170 HWY_API VFromD<D> ReorderDemote2To(D d32, VFromD<Repartition<uint64_t, D>> a, 8171 VFromD<Repartition<uint64_t, D>> b) { 8172 const Rebind<uint64_t, decltype(d32)> dt; 8173 return DemoteTo(d32, Combine(dt, b, a)); 8174 } 8175 8176 template <class D, HWY_IF_I16_D(D)> 8177 HWY_API Vec128<int16_t> ReorderDemote2To(D d16, Vec128<int32_t> a, 8178 Vec128<int32_t> b) { 8179 const Vec64<int16_t> a16(vqmovn_s32(a.raw)); 8180 #if HWY_ARCH_ARM_A64 8181 (void)d16; 8182 return Vec128<int16_t>(vqmovn_high_s32(a16.raw, b.raw)); 8183 #else 8184 const Vec64<int16_t> b16(vqmovn_s32(b.raw)); 8185 return Combine(d16, b16, a16); 8186 #endif 8187 } 8188 8189 template <class D, HWY_IF_I16_D(D)> 8190 HWY_API Vec64<int16_t> ReorderDemote2To(D /*d16*/, Vec64<int32_t> a, 8191 Vec64<int32_t> b) { 8192 const Full128<int32_t> d32; 8193 const Vec128<int32_t> ab = Combine(d32, b, a); 8194 return Vec64<int16_t>(vqmovn_s32(ab.raw)); 8195 } 8196 8197 template <class D, HWY_IF_I16_D(D)> 8198 HWY_API Vec32<int16_t> ReorderDemote2To(D /*d16*/, Vec32<int32_t> a, 8199 Vec32<int32_t> b) { 8200 const Full128<int32_t> d32; 8201 const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw)); 8202 return Vec32<int16_t>(vqmovn_s32(Combine(d32, ab, ab).raw)); 8203 } 8204 8205 template <class D, HWY_IF_U16_D(D)> 8206 HWY_API Vec128<uint16_t> ReorderDemote2To(D d16, Vec128<int32_t> a, 8207 Vec128<int32_t> b) { 8208 const Vec64<uint16_t> a16(vqmovun_s32(a.raw)); 8209 #if HWY_ARCH_ARM_A64 8210 (void)d16; 8211 return Vec128<uint16_t>(vqmovun_high_s32(a16.raw, b.raw)); 8212 #else 8213 const Vec64<uint16_t> b16(vqmovun_s32(b.raw)); 8214 return Combine(d16, b16, a16); 8215 #endif 8216 } 8217 8218 template <class D, HWY_IF_U16_D(D)> 8219 HWY_API Vec64<uint16_t> ReorderDemote2To(D /*d16*/, Vec64<int32_t> a, 8220 Vec64<int32_t> b) { 8221 const Full128<int32_t> d32; 8222 const Vec128<int32_t> ab = Combine(d32, b, a); 8223 return Vec64<uint16_t>(vqmovun_s32(ab.raw)); 8224 } 8225 8226 template <class D, HWY_IF_U16_D(D)> 8227 HWY_API Vec32<uint16_t> ReorderDemote2To(D /*d16*/, Vec32<int32_t> a, 8228 Vec32<int32_t> b) { 8229 const Full128<int32_t> d32; 8230 const Vec64<int32_t> ab(vzip1_s32(a.raw, b.raw)); 8231 return Vec32<uint16_t>(vqmovun_s32(Combine(d32, ab, ab).raw)); 8232 } 8233 8234 template <class D, HWY_IF_U16_D(D)> 8235 HWY_API Vec128<uint16_t> ReorderDemote2To(D d16, Vec128<uint32_t> a, 8236 Vec128<uint32_t> b) { 8237 const Vec64<uint16_t> a16(vqmovn_u32(a.raw)); 8238 #if HWY_ARCH_ARM_A64 8239 (void)d16; 8240 return 
Vec128<uint16_t>(vqmovn_high_u32(a16.raw, b.raw)); 8241 #else 8242 const Vec64<uint16_t> b16(vqmovn_u32(b.raw)); 8243 return Combine(d16, b16, a16); 8244 #endif 8245 } 8246 8247 template <class D, HWY_IF_U16_D(D)> 8248 HWY_API Vec64<uint16_t> ReorderDemote2To(D /*d16*/, Vec64<uint32_t> a, 8249 Vec64<uint32_t> b) { 8250 const Full128<uint32_t> d32; 8251 const Vec128<uint32_t> ab = Combine(d32, b, a); 8252 return Vec64<uint16_t>(vqmovn_u32(ab.raw)); 8253 } 8254 8255 template <class D, HWY_IF_U16_D(D)> 8256 HWY_API Vec32<uint16_t> ReorderDemote2To(D /*d16*/, Vec32<uint32_t> a, 8257 Vec32<uint32_t> b) { 8258 const Full128<uint32_t> d32; 8259 const Vec64<uint32_t> ab(vzip1_u32(a.raw, b.raw)); 8260 return Vec32<uint16_t>(vqmovn_u32(Combine(d32, ab, ab).raw)); 8261 } 8262 8263 template <class D, HWY_IF_I8_D(D)> 8264 HWY_API Vec128<int8_t> ReorderDemote2To(D d8, Vec128<int16_t> a, 8265 Vec128<int16_t> b) { 8266 const Vec64<int8_t> a8(vqmovn_s16(a.raw)); 8267 #if HWY_ARCH_ARM_A64 8268 (void)d8; 8269 return Vec128<int8_t>(vqmovn_high_s16(a8.raw, b.raw)); 8270 #else 8271 const Vec64<int8_t> b8(vqmovn_s16(b.raw)); 8272 return Combine(d8, b8, a8); 8273 #endif 8274 } 8275 8276 template <class D, HWY_IF_I8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> 8277 HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<int16_t, D>> a, 8278 VFromD<Repartition<int16_t, D>> b) { 8279 const Rebind<int16_t, decltype(d8)> dt; 8280 return DemoteTo(d8, Combine(dt, b, a)); 8281 } 8282 8283 template <class D, HWY_IF_U8_D(D)> 8284 HWY_API Vec128<uint8_t> ReorderDemote2To(D d8, Vec128<int16_t> a, 8285 Vec128<int16_t> b) { 8286 const Vec64<uint8_t> a8(vqmovun_s16(a.raw)); 8287 #if HWY_ARCH_ARM_A64 8288 (void)d8; 8289 return Vec128<uint8_t>(vqmovun_high_s16(a8.raw, b.raw)); 8290 #else 8291 const Vec64<uint8_t> b8(vqmovun_s16(b.raw)); 8292 return Combine(d8, b8, a8); 8293 #endif 8294 } 8295 8296 template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> 8297 HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<int16_t, D>> a, 8298 VFromD<Repartition<int16_t, D>> b) { 8299 const Rebind<int16_t, decltype(d8)> dt; 8300 return DemoteTo(d8, Combine(dt, b, a)); 8301 } 8302 8303 template <class D, HWY_IF_U8_D(D)> 8304 HWY_API Vec128<uint8_t> ReorderDemote2To(D d8, Vec128<uint16_t> a, 8305 Vec128<uint16_t> b) { 8306 const Vec64<uint8_t> a8(vqmovn_u16(a.raw)); 8307 #if HWY_ARCH_ARM_A64 8308 (void)d8; 8309 return Vec128<uint8_t>(vqmovn_high_u16(a8.raw, b.raw)); 8310 #else 8311 const Vec64<uint8_t> b8(vqmovn_u16(b.raw)); 8312 return Combine(d8, b8, a8); 8313 #endif 8314 } 8315 8316 template <class D, HWY_IF_U8_D(D), HWY_IF_V_SIZE_LE_D(D, 8)> 8317 HWY_API VFromD<D> ReorderDemote2To(D d8, VFromD<Repartition<uint16_t, D>> a, 8318 VFromD<Repartition<uint16_t, D>> b) { 8319 const Rebind<uint16_t, decltype(d8)> dt; 8320 return DemoteTo(d8, Combine(dt, b, a)); 8321 } 8322 8323 template <class D, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), 8324 HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<D>), 8325 HWY_IF_LANES_D(D, HWY_MAX_LANES_D(DFromV<V>) * 2)> 8326 HWY_API VFromD<D> OrderedDemote2To(D d, V a, V b) { 8327 return ReorderDemote2To(d, a, b); 8328 } 8329 8330 #if HWY_NEON_HAVE_F32_TO_BF16C 8331 template <class D, HWY_IF_BF16_D(D)> 8332 HWY_API VFromD<D> OrderedDemote2To(D dbf16, VFromD<Repartition<float, D>> a, 8333 VFromD<Repartition<float, D>> b) { 8334 return ReorderDemote2To(dbf16, a, b); 8335 } 8336 #endif // HWY_NEON_HAVE_F32_TO_BF16C 8337 8338 // ================================================== CRYPTO 8339 8340 // (aarch64 or Arm7) and 
(__ARM_FEATURE_AES or HWY_HAVE_RUNTIME_DISPATCH). 8341 // Otherwise, rely on generic_ops-inl.h to emulate AESRound / CLMul*. 8342 #if HWY_TARGET != HWY_NEON_WITHOUT_AES 8343 8344 #ifdef HWY_NATIVE_AES 8345 #undef HWY_NATIVE_AES 8346 #else 8347 #define HWY_NATIVE_AES 8348 #endif 8349 8350 HWY_API Vec128<uint8_t> AESRound(Vec128<uint8_t> state, 8351 Vec128<uint8_t> round_key) { 8352 // NOTE: it is important that AESE and AESMC be consecutive instructions so 8353 // they can be fused. AESE includes AddRoundKey, which is a different ordering 8354 // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual 8355 // round key (the compiler will hopefully optimize this for multiple rounds). 8356 return Vec128<uint8_t>(vaesmcq_u8(vaeseq_u8(state.raw, vdupq_n_u8(0)))) ^ 8357 round_key; 8358 } 8359 8360 HWY_API Vec128<uint8_t> AESLastRound(Vec128<uint8_t> state, 8361 Vec128<uint8_t> round_key) { 8362 return Vec128<uint8_t>(vaeseq_u8(state.raw, vdupq_n_u8(0))) ^ round_key; 8363 } 8364 8365 HWY_API Vec128<uint8_t> AESInvMixColumns(Vec128<uint8_t> state) { 8366 return Vec128<uint8_t>{vaesimcq_u8(state.raw)}; 8367 } 8368 8369 HWY_API Vec128<uint8_t> AESRoundInv(Vec128<uint8_t> state, 8370 Vec128<uint8_t> round_key) { 8371 // NOTE: it is important that AESD and AESIMC be consecutive instructions so 8372 // they can be fused. AESD includes AddRoundKey, which is a different ordering 8373 // than the AES-NI semantics we adopted, so XOR by 0 and later with the actual 8374 // round key (the compiler will hopefully optimize this for multiple rounds). 8375 return Vec128<uint8_t>(vaesimcq_u8(vaesdq_u8(state.raw, vdupq_n_u8(0)))) ^ 8376 round_key; 8377 } 8378 8379 HWY_API Vec128<uint8_t> AESLastRoundInv(Vec128<uint8_t> state, 8380 Vec128<uint8_t> round_key) { 8381 return Vec128<uint8_t>(vaesdq_u8(state.raw, vdupq_n_u8(0))) ^ round_key; 8382 } 8383 8384 HWY_API Vec128<uint64_t> CLMulLower(Vec128<uint64_t> a, Vec128<uint64_t> b) { 8385 return Vec128<uint64_t>((uint64x2_t)vmull_p64(GetLane(a), GetLane(b))); 8386 } 8387 8388 HWY_API Vec128<uint64_t> CLMulUpper(Vec128<uint64_t> a, Vec128<uint64_t> b) { 8389 return Vec128<uint64_t>( 8390 (uint64x2_t)vmull_high_p64((poly64x2_t)a.raw, (poly64x2_t)b.raw)); 8391 } 8392 8393 #endif // HWY_TARGET != HWY_NEON_WITHOUT_AES 8394 8395 // ================================================== MISC 8396 8397 template <class D, HWY_IF_F32_D(D)> 8398 HWY_API VFromD<D> PromoteTo(D df32, VFromD<Rebind<bfloat16_t, D>> v) { 8399 const Rebind<uint16_t, decltype(df32)> du16; 8400 const RebindToSigned<decltype(df32)> di32; 8401 return BitCast(df32, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v)))); 8402 } 8403 8404 // ------------------------------ Truncations 8405 8406 template <class DTo, typename TTo = TFromD<DTo>, typename TFrom, 8407 HWY_IF_UNSIGNED(TFrom), HWY_IF_UNSIGNED(TTo), 8408 hwy::EnableIf<(sizeof(TTo) < sizeof(TFrom))>* = nullptr> 8409 HWY_API Vec128<TTo, 1> TruncateTo(DTo /* tag */, Vec128<TFrom, 1> v) { 8410 const Repartition<TTo, DFromV<decltype(v)>> d; 8411 return Vec128<TTo, 1>{BitCast(d, v).raw}; 8412 } 8413 8414 template <class D, HWY_IF_U8_D(D)> 8415 HWY_API Vec16<uint8_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) { 8416 const Repartition<uint8_t, DFromV<decltype(v)>> d; 8417 const auto v1 = BitCast(d, v); 8418 const auto v2 = detail::ConcatEven(v1, v1); 8419 const auto v3 = detail::ConcatEven(v2, v2); 8420 const auto v4 = detail::ConcatEven(v3, v3); 8421 return LowerHalf(LowerHalf(LowerHalf(v4))); 8422 } 8423 8424 template <class D, HWY_IF_U16_D(D)> 
HWY_API Vec32<uint16_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Repartition<uint16_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  return LowerHalf(LowerHalf(v3));
}

template <class D, HWY_IF_U32_D(D)>
HWY_API Vec64<uint32_t> TruncateTo(D /* tag */, Vec128<uint64_t> v) {
  const Repartition<uint32_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  const auto v3 = detail::ConcatEven(v2, v2);
  return LowerHalf(LowerHalf(v3));
}

template <class D, HWY_IF_U16_D(D), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint32_t, D>> v) {
  const Repartition<uint16_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

template <class D, HWY_IF_U8_D(D), HWY_IF_LANES_GT_D(D, 1)>
HWY_API VFromD<D> TruncateTo(D /* tag */, VFromD<Rebind<uint16_t, D>> v) {
  const Repartition<uint8_t, DFromV<decltype(v)>> d;
  const auto v1 = BitCast(d, v);
  const auto v2 = detail::ConcatEven(v1, v1);
  return LowerHalf(v2);
}

// ------------------------------ MulEven (ConcatEven)

// Multiplies even lanes (0, 2, ..) and widens: the lower half of each
// double-wide result is placed in the even lane and the upper half in its
// odd neighbor lane.
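// E.g. for int16 inputs a = b = [8 7 6 5 4 3 2 1] (lane 7..0), MulEven
// returns the int32 vector [49 25 9 1] (lane 3..0): int32 lane i holds
// a[2i] * b[2i].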
8470 HWY_API Vec128<int16_t> MulEven(Vec128<int8_t> a, Vec128<int8_t> b) { 8471 const DFromV<decltype(a)> d; 8472 int8x16_t a_packed = ConcatEven(d, a, a).raw; 8473 int8x16_t b_packed = ConcatEven(d, b, b).raw; 8474 return Vec128<int16_t>( 8475 vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed))); 8476 } 8477 HWY_API Vec128<uint16_t> MulEven(Vec128<uint8_t> a, Vec128<uint8_t> b) { 8478 const DFromV<decltype(a)> d; 8479 uint8x16_t a_packed = ConcatEven(d, a, a).raw; 8480 uint8x16_t b_packed = ConcatEven(d, b, b).raw; 8481 return Vec128<uint16_t>( 8482 vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed))); 8483 } 8484 HWY_API Vec128<int32_t> MulEven(Vec128<int16_t> a, Vec128<int16_t> b) { 8485 const DFromV<decltype(a)> d; 8486 int16x8_t a_packed = ConcatEven(d, a, a).raw; 8487 int16x8_t b_packed = ConcatEven(d, b, b).raw; 8488 return Vec128<int32_t>( 8489 vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed))); 8490 } 8491 HWY_API Vec128<uint32_t> MulEven(Vec128<uint16_t> a, Vec128<uint16_t> b) { 8492 const DFromV<decltype(a)> d; 8493 uint16x8_t a_packed = ConcatEven(d, a, a).raw; 8494 uint16x8_t b_packed = ConcatEven(d, b, b).raw; 8495 return Vec128<uint32_t>( 8496 vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed))); 8497 } 8498 HWY_API Vec128<int64_t> MulEven(Vec128<int32_t> a, Vec128<int32_t> b) { 8499 const DFromV<decltype(a)> d; 8500 int32x4_t a_packed = ConcatEven(d, a, a).raw; 8501 int32x4_t b_packed = ConcatEven(d, b, b).raw; 8502 return Vec128<int64_t>( 8503 vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed))); 8504 } 8505 HWY_API Vec128<uint64_t> MulEven(Vec128<uint32_t> a, Vec128<uint32_t> b) { 8506 const DFromV<decltype(a)> d; 8507 uint32x4_t a_packed = ConcatEven(d, a, a).raw; 8508 uint32x4_t b_packed = ConcatEven(d, b, b).raw; 8509 return Vec128<uint64_t>( 8510 vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed))); 8511 } 8512 8513 template <size_t N> 8514 HWY_API Vec128<int16_t, (N + 1) / 2> MulEven(Vec128<int8_t, N> a, 8515 Vec128<int8_t, N> b) { 8516 const DFromV<decltype(a)> d; 8517 int8x8_t a_packed = ConcatEven(d, a, a).raw; 8518 int8x8_t b_packed = ConcatEven(d, b, b).raw; 8519 return Vec128<int16_t, (N + 1) / 2>( 8520 vget_low_s16(vmull_s8(a_packed, b_packed))); 8521 } 8522 template <size_t N> 8523 HWY_API Vec128<uint16_t, (N + 1) / 2> MulEven(Vec128<uint8_t, N> a, 8524 Vec128<uint8_t, N> b) { 8525 const DFromV<decltype(a)> d; 8526 uint8x8_t a_packed = ConcatEven(d, a, a).raw; 8527 uint8x8_t b_packed = ConcatEven(d, b, b).raw; 8528 return Vec128<uint16_t, (N + 1) / 2>( 8529 vget_low_u16(vmull_u8(a_packed, b_packed))); 8530 } 8531 template <size_t N> 8532 HWY_API Vec128<int32_t, (N + 1) / 2> MulEven(Vec128<int16_t, N> a, 8533 Vec128<int16_t, N> b) { 8534 const DFromV<decltype(a)> d; 8535 int16x4_t a_packed = ConcatEven(d, a, a).raw; 8536 int16x4_t b_packed = ConcatEven(d, b, b).raw; 8537 return Vec128<int32_t, (N + 1) / 2>( 8538 vget_low_s32(vmull_s16(a_packed, b_packed))); 8539 } 8540 template <size_t N> 8541 HWY_API Vec128<uint32_t, (N + 1) / 2> MulEven(Vec128<uint16_t, N> a, 8542 Vec128<uint16_t, N> b) { 8543 const DFromV<decltype(a)> d; 8544 uint16x4_t a_packed = ConcatEven(d, a, a).raw; 8545 uint16x4_t b_packed = ConcatEven(d, b, b).raw; 8546 return Vec128<uint32_t, (N + 1) / 2>( 8547 vget_low_u32(vmull_u16(a_packed, b_packed))); 8548 } 8549 template <size_t N> 8550 HWY_API Vec128<int64_t, (N + 1) / 2> MulEven(Vec128<int32_t, N> a, 8551 Vec128<int32_t, N> b) { 8552 const DFromV<decltype(a)> d; 8553 int32x2_t a_packed = ConcatEven(d, a, 
a).raw;
  int32x2_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<int64_t, (N + 1) / 2>(
      vget_low_s64(vmull_s32(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<uint64_t, (N + 1) / 2> MulEven(Vec128<uint32_t, N> a,
                                              Vec128<uint32_t, N> b) {
  const DFromV<decltype(a)> d;
  uint32x2_t a_packed = ConcatEven(d, a, a).raw;
  uint32x2_t b_packed = ConcatEven(d, b, b).raw;
  return Vec128<uint64_t, (N + 1) / 2>(
      vget_low_u64(vmull_u32(a_packed, b_packed)));
}

template <class T, HWY_IF_UI64(T)>
HWY_INLINE Vec128<T> MulEven(Vec128<T> a, Vec128<T> b) {
  T hi;
  T lo = Mul128(GetLane(a), GetLane(b), &hi);
  return Dup128VecFromValues(Full128<T>(), lo, hi);
}

// Multiplies odd lanes (1, 3, ..) and widens: the lower half of each
// double-wide result is placed in the even lane and the upper half in its
// odd neighbor lane.
HWY_API Vec128<int16_t> MulOdd(Vec128<int8_t> a, Vec128<int8_t> b) {
  const DFromV<decltype(a)> d;
  int8x16_t a_packed = ConcatOdd(d, a, a).raw;
  int8x16_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<int16_t>(
      vmull_s8(vget_low_s8(a_packed), vget_low_s8(b_packed)));
}
HWY_API Vec128<uint16_t> MulOdd(Vec128<uint8_t> a, Vec128<uint8_t> b) {
  const DFromV<decltype(a)> d;
  uint8x16_t a_packed = ConcatOdd(d, a, a).raw;
  uint8x16_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<uint16_t>(
      vmull_u8(vget_low_u8(a_packed), vget_low_u8(b_packed)));
}
HWY_API Vec128<int32_t> MulOdd(Vec128<int16_t> a, Vec128<int16_t> b) {
  const DFromV<decltype(a)> d;
  int16x8_t a_packed = ConcatOdd(d, a, a).raw;
  int16x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<int32_t>(
      vmull_s16(vget_low_s16(a_packed), vget_low_s16(b_packed)));
}
HWY_API Vec128<uint32_t> MulOdd(Vec128<uint16_t> a, Vec128<uint16_t> b) {
  const DFromV<decltype(a)> d;
  uint16x8_t a_packed = ConcatOdd(d, a, a).raw;
  uint16x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<uint32_t>(
      vmull_u16(vget_low_u16(a_packed), vget_low_u16(b_packed)));
}
HWY_API Vec128<int64_t> MulOdd(Vec128<int32_t> a, Vec128<int32_t> b) {
  const DFromV<decltype(a)> d;
  int32x4_t a_packed = ConcatOdd(d, a, a).raw;
  int32x4_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<int64_t>(
      vmull_s32(vget_low_s32(a_packed), vget_low_s32(b_packed)));
}
HWY_API Vec128<uint64_t> MulOdd(Vec128<uint32_t> a, Vec128<uint32_t> b) {
  const DFromV<decltype(a)> d;
  uint32x4_t a_packed = ConcatOdd(d, a, a).raw;
  uint32x4_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<uint64_t>(
      vmull_u32(vget_low_u32(a_packed), vget_low_u32(b_packed)));
}

template <size_t N>
HWY_API Vec128<int16_t, (N + 1) / 2> MulOdd(Vec128<int8_t, N> a,
                                            Vec128<int8_t, N> b) {
  const DFromV<decltype(a)> d;
  int8x8_t a_packed = ConcatOdd(d, a, a).raw;
  int8x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<int16_t, (N + 1) / 2>(
      vget_low_s16(vmull_s8(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<uint16_t, (N + 1) / 2> MulOdd(Vec128<uint8_t, N> a,
                                             Vec128<uint8_t, N> b) {
  const DFromV<decltype(a)> d;
  uint8x8_t a_packed = ConcatOdd(d, a, a).raw;
  uint8x8_t b_packed = ConcatOdd(d, b, b).raw;
  return Vec128<uint16_t, (N + 1) / 2>(
      vget_low_u16(vmull_u8(a_packed, b_packed)));
}
template <size_t N>
HWY_API Vec128<int32_t, (N + 1)
/ 2> MulOdd(Vec128<int16_t, N> a, 8640 Vec128<int16_t, N> b) { 8641 const DFromV<decltype(a)> d; 8642 int16x4_t a_packed = ConcatOdd(d, a, a).raw; 8643 int16x4_t b_packed = ConcatOdd(d, b, b).raw; 8644 return Vec128<int32_t, (N + 1) / 2>( 8645 vget_low_s32(vmull_s16(a_packed, b_packed))); 8646 } 8647 template <size_t N> 8648 HWY_API Vec128<uint32_t, (N + 1) / 2> MulOdd(Vec128<uint16_t, N> a, 8649 Vec128<uint16_t, N> b) { 8650 const DFromV<decltype(a)> d; 8651 uint16x4_t a_packed = ConcatOdd(d, a, a).raw; 8652 uint16x4_t b_packed = ConcatOdd(d, b, b).raw; 8653 return Vec128<uint32_t, (N + 1) / 2>( 8654 vget_low_u32(vmull_u16(a_packed, b_packed))); 8655 } 8656 template <size_t N> 8657 HWY_API Vec128<int64_t, (N + 1) / 2> MulOdd(Vec128<int32_t, N> a, 8658 Vec128<int32_t, N> b) { 8659 const DFromV<decltype(a)> d; 8660 int32x2_t a_packed = ConcatOdd(d, a, a).raw; 8661 int32x2_t b_packed = ConcatOdd(d, b, b).raw; 8662 return Vec128<int64_t, (N + 1) / 2>( 8663 vget_low_s64(vmull_s32(a_packed, b_packed))); 8664 } 8665 template <size_t N> 8666 HWY_API Vec128<uint64_t, (N + 1) / 2> MulOdd(Vec128<uint32_t, N> a, 8667 Vec128<uint32_t, N> b) { 8668 const DFromV<decltype(a)> d; 8669 uint32x2_t a_packed = ConcatOdd(d, a, a).raw; 8670 uint32x2_t b_packed = ConcatOdd(d, b, b).raw; 8671 return Vec128<uint64_t, (N + 1) / 2>( 8672 vget_low_u64(vmull_u32(a_packed, b_packed))); 8673 } 8674 8675 template <class T, HWY_IF_UI64(T)> 8676 HWY_INLINE Vec128<T> MulOdd(Vec128<T> a, Vec128<T> b) { 8677 T hi; 8678 T lo = Mul128(detail::GetLane<1>(a), detail::GetLane<1>(b), &hi); 8679 return Dup128VecFromValues(Full128<T>(), lo, hi); 8680 } 8681 8682 // ------------------------------ TableLookupBytes (Combine, LowerHalf) 8683 8684 // Both full 8685 template <typename T, typename TI> 8686 HWY_API Vec128<TI> TableLookupBytes(Vec128<T> bytes, Vec128<TI> from) { 8687 const DFromV<decltype(from)> d; 8688 const Repartition<uint8_t, decltype(d)> d8; 8689 #if HWY_ARCH_ARM_A64 8690 return BitCast(d, Vec128<uint8_t>(vqtbl1q_u8(BitCast(d8, bytes).raw, 8691 BitCast(d8, from).raw))); 8692 #else 8693 uint8x16_t table0 = BitCast(d8, bytes).raw; 8694 uint8x8x2_t table; 8695 table.val[0] = vget_low_u8(table0); 8696 table.val[1] = vget_high_u8(table0); 8697 uint8x16_t idx = BitCast(d8, from).raw; 8698 uint8x8_t low = vtbl2_u8(table, vget_low_u8(idx)); 8699 uint8x8_t hi = vtbl2_u8(table, vget_high_u8(idx)); 8700 return BitCast(d, Vec128<uint8_t>(vcombine_u8(low, hi))); 8701 #endif 8702 } 8703 8704 // Partial index vector 8705 template <typename T, typename TI, size_t NI, HWY_IF_V_SIZE_LE(TI, NI, 8)> 8706 HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T> bytes, Vec128<TI, NI> from) { 8707 const Full128<TI> d_full; 8708 const Vec64<TI> from64(from.raw); 8709 const auto idx_full = Combine(d_full, from64, from64); 8710 const auto out_full = TableLookupBytes(bytes, idx_full); 8711 return Vec128<TI, NI>(LowerHalf(Half<decltype(d_full)>(), out_full).raw); 8712 } 8713 8714 // Partial table vector 8715 template <typename T, size_t N, typename TI, HWY_IF_V_SIZE_LE(T, N, 8)> 8716 HWY_API Vec128<TI> TableLookupBytes(Vec128<T, N> bytes, Vec128<TI> from) { 8717 const Full128<T> d_full; 8718 return TableLookupBytes(Combine(d_full, bytes, bytes), from); 8719 } 8720 8721 // Partial both 8722 template <typename T, size_t N, typename TI, size_t NI, 8723 HWY_IF_V_SIZE_LE(T, N, 8), HWY_IF_V_SIZE_LE(TI, NI, 8)> 8724 HWY_API Vec128<TI, NI> TableLookupBytes(Vec128<T, N> bytes, 8725 Vec128<TI, NI> from) { 8726 const DFromV<decltype(bytes)> d; 8727 const Simd<TI, 
NI, 0> d_idx; 8728 const Repartition<uint8_t, decltype(d_idx)> d_idx8; 8729 // uint8x8 8730 const auto bytes8 = BitCast(Repartition<uint8_t, decltype(d)>(), bytes); 8731 const auto from8 = BitCast(d_idx8, from); 8732 const VFromD<decltype(d_idx8)> v8(vtbl1_u8(bytes8.raw, from8.raw)); 8733 return BitCast(d_idx, v8); 8734 } 8735 8736 // For all vector widths; Arm anyway zeroes if >= 0x10. 8737 template <class V, class VI> 8738 HWY_API VI TableLookupBytesOr0(V bytes, VI from) { 8739 return TableLookupBytes(bytes, from); 8740 } 8741 8742 // ---------------------------- AESKeyGenAssist (AESLastRound, TableLookupBytes) 8743 8744 #if HWY_TARGET != HWY_NEON_WITHOUT_AES 8745 template <uint8_t kRcon> 8746 HWY_API Vec128<uint8_t> AESKeyGenAssist(Vec128<uint8_t> v) { 8747 alignas(16) static constexpr uint8_t kRconXorMask[16] = { 8748 0, kRcon, 0, 0, 0, 0, 0, 0, 0, kRcon, 0, 0, 0, 0, 0, 0}; 8749 alignas(16) static constexpr uint8_t kRotWordShuffle[16] = { 8750 0, 13, 10, 7, 1, 14, 11, 4, 8, 5, 2, 15, 9, 6, 3, 12}; 8751 const DFromV<decltype(v)> d; 8752 const Repartition<uint32_t, decltype(d)> du32; 8753 const auto w13 = BitCast(d, DupOdd(BitCast(du32, v))); 8754 const auto sub_word_result = AESLastRound(w13, Load(d, kRconXorMask)); 8755 return TableLookupBytes(sub_word_result, Load(d, kRotWordShuffle)); 8756 } 8757 #endif // HWY_TARGET != HWY_NEON_WITHOUT_AES 8758 8759 // ------------------------------ Scatter in generic_ops-inl.h 8760 // ------------------------------ Gather in generic_ops-inl.h 8761 8762 // ------------------------------ Reductions 8763 8764 // On Armv8 we define ReduceSum and generic_ops defines SumOfLanes via Set. 8765 #if HWY_ARCH_ARM_A64 8766 8767 #ifdef HWY_NATIVE_REDUCE_SCALAR 8768 #undef HWY_NATIVE_REDUCE_SCALAR 8769 #else 8770 #define HWY_NATIVE_REDUCE_SCALAR 8771 #endif 8772 8773 // TODO(janwas): use normal HWY_NEON_DEF, then FULL type list. 8774 #define HWY_NEON_DEF_REDUCTION(type, size, name, prefix, infix, suffix) \ 8775 template <class D, HWY_IF_LANES_D(D, size)> \ 8776 HWY_API type##_t name(D /* tag */, Vec128<type##_t, size> v) { \ 8777 return HWY_NEON_EVAL(prefix##infix##suffix, v.raw); \ 8778 } 8779 8780 // Excludes u64/s64 (missing minv/maxv) and f16 (missing addv). 8781 #define HWY_NEON_DEF_REDUCTION_CORE_TYPES(name, prefix) \ 8782 HWY_NEON_DEF_REDUCTION(uint8, 8, name, prefix, _, u8) \ 8783 HWY_NEON_DEF_REDUCTION(uint8, 16, name, prefix##q, _, u8) \ 8784 HWY_NEON_DEF_REDUCTION(uint16, 4, name, prefix, _, u16) \ 8785 HWY_NEON_DEF_REDUCTION(uint16, 8, name, prefix##q, _, u16) \ 8786 HWY_NEON_DEF_REDUCTION(uint32, 2, name, prefix, _, u32) \ 8787 HWY_NEON_DEF_REDUCTION(uint32, 4, name, prefix##q, _, u32) \ 8788 HWY_NEON_DEF_REDUCTION(int8, 8, name, prefix, _, s8) \ 8789 HWY_NEON_DEF_REDUCTION(int8, 16, name, prefix##q, _, s8) \ 8790 HWY_NEON_DEF_REDUCTION(int16, 4, name, prefix, _, s16) \ 8791 HWY_NEON_DEF_REDUCTION(int16, 8, name, prefix##q, _, s16) \ 8792 HWY_NEON_DEF_REDUCTION(int32, 2, name, prefix, _, s32) \ 8793 HWY_NEON_DEF_REDUCTION(int32, 4, name, prefix##q, _, s32) \ 8794 HWY_NEON_DEF_REDUCTION(float32, 2, name, prefix, _, f32) \ 8795 HWY_NEON_DEF_REDUCTION(float32, 4, name, prefix##q, _, f32) \ 8796 HWY_NEON_DEF_REDUCTION(float64, 2, name, prefix##q, _, f64) 8797 8798 // Different interface than HWY_NEON_DEF_FUNCTION_FULL_UI_64. 
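// For reference, HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv) below emits
// HWY_NEON_DEF_REDUCTION(uint64, 2, ReduceSum, vaddvq, _, u64), which per
// the macro above expands to (sketch):
//   template <class D, HWY_IF_LANES_D(D, 2)>
//   HWY_API uint64_t ReduceSum(D /* tag */, Vec128<uint64_t, 2> v) {
//     return vaddvq_u64(v.raw);
//   }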
8799 #define HWY_NEON_DEF_REDUCTION_UI64(name, prefix) \ 8800 HWY_NEON_DEF_REDUCTION(uint64, 2, name, prefix##q, _, u64) \ 8801 HWY_NEON_DEF_REDUCTION(int64, 2, name, prefix##q, _, s64) 8802 8803 #if HWY_HAVE_FLOAT16 8804 #define HWY_NEON_DEF_REDUCTION_F16(name, prefix) \ 8805 HWY_NEON_DEF_REDUCTION(float16, 4, name, prefix, _, f16) \ 8806 HWY_NEON_DEF_REDUCTION(float16, 8, name, prefix##q, _, f16) 8807 #else 8808 #define HWY_NEON_DEF_REDUCTION_F16(name, prefix) 8809 #endif 8810 8811 HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMin, vminv) 8812 HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceMax, vmaxv) 8813 HWY_NEON_DEF_REDUCTION_F16(ReduceMin, vminv) 8814 HWY_NEON_DEF_REDUCTION_F16(ReduceMax, vmaxv) 8815 8816 HWY_NEON_DEF_REDUCTION_CORE_TYPES(ReduceSum, vaddv) 8817 HWY_NEON_DEF_REDUCTION_UI64(ReduceSum, vaddv) 8818 8819 // Emulate missing UI64 and partial N=2. 8820 template <class D, HWY_IF_LANES_D(D, 2), 8821 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2))> 8822 HWY_API TFromD<D> ReduceSum(D /* tag */, VFromD<D> v10) { 8823 return GetLane(v10) + ExtractLane(v10, 1); 8824 } 8825 8826 template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D), 8827 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))> 8828 HWY_API TFromD<D> ReduceMin(D /* tag */, VFromD<D> v10) { 8829 return HWY_MIN(GetLane(v10), ExtractLane(v10, 1)); 8830 } 8831 8832 template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_NOT_FLOAT_D(D), 8833 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 8))> 8834 HWY_API TFromD<D> ReduceMax(D /* tag */, VFromD<D> v10) { 8835 return HWY_MAX(GetLane(v10), ExtractLane(v10, 1)); 8836 } 8837 8838 #if HWY_HAVE_FLOAT16 8839 template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)> 8840 HWY_API float16_t ReduceMin(D d, VFromD<D> v10) { 8841 return GetLane(Min(v10, Reverse2(d, v10))); 8842 } 8843 8844 template <class D, HWY_IF_LANES_D(D, 2), HWY_IF_F16_D(D)> 8845 HWY_API float16_t ReduceMax(D d, VFromD<D> v10) { 8846 return GetLane(Max(v10, Reverse2(d, v10))); 8847 } 8848 8849 template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 8)> 8850 HWY_API float16_t ReduceSum(D /* tag */, VFromD<D> v) { 8851 const float16x4_t x2 = vpadd_f16(v.raw, v.raw); 8852 return GetLane(VFromD<D>(vpadd_f16(x2, x2))); 8853 } 8854 template <class D, HWY_IF_F16_D(D), HWY_IF_V_SIZE_D(D, 16)> 8855 HWY_API float16_t ReduceSum(D d, VFromD<D> v) { 8856 const Half<decltype(d)> dh; 8857 return ReduceSum(dh, LowerHalf(dh, VFromD<D>(vpaddq_f16(v.raw, v.raw)))); 8858 } 8859 #endif // HWY_HAVE_FLOAT16 8860 8861 #undef HWY_NEON_DEF_REDUCTION_CORE_TYPES 8862 #undef HWY_NEON_DEF_REDUCTION_F16 8863 #undef HWY_NEON_DEF_REDUCTION_UI64 8864 #undef HWY_NEON_DEF_REDUCTION 8865 8866 // ------------------------------ SumOfLanes 8867 8868 template <class D, HWY_IF_LANES_GT_D(D, 1)> 8869 HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) { 8870 return Set(d, ReduceSum(d, v)); 8871 } 8872 template <class D, HWY_IF_LANES_GT_D(D, 1)> 8873 HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) { 8874 return Set(d, ReduceMin(d, v)); 8875 } 8876 template <class D, HWY_IF_LANES_GT_D(D, 1)> 8877 HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) { 8878 return Set(d, ReduceMax(d, v)); 8879 } 8880 8881 // On Armv7 we define SumOfLanes and generic_ops defines ReduceSum via GetLane. 8882 #else // !HWY_ARCH_ARM_A64 8883 8884 // Armv7 lacks N=2 (except 32-bit) and 8-bit x4, so enable them in generic_ops. 
8885 #undef HWY_IF_SUM_OF_LANES_D 8886 #define HWY_IF_SUM_OF_LANES_D(D) \ 8887 hwy::EnableIf<(sizeof(TFromD<D>) != 4 && HWY_MAX_LANES_D(D) == 2) || \ 8888 (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \ 8889 nullptr 8890 #undef HWY_IF_MINMAX_OF_LANES_D 8891 #define HWY_IF_MINMAX_OF_LANES_D(D) \ 8892 hwy::EnableIf<(sizeof(TFromD<D>) != 4 && HWY_MAX_LANES_D(D) == 2) || \ 8893 (sizeof(TFromD<D>) == 1 && HWY_MAX_LANES_D(D) == 4)>* = \ 8894 nullptr 8895 8896 // For arm7, we implement reductions using a series of pairwise operations. This 8897 // produces the full vector result, so we express Reduce* in terms of *OfLanes. 8898 8899 #define HWY_NEON_DEF_PAIRWISE_REDUCTION(name) \ 8900 /* generic_ops-inl.h handles 64-bit types. */ \ 8901 template <class D, HWY_IF_V_SIZE_D(D, 8), HWY_IF_NOT_T_SIZE_D(D, 8)> \ 8902 HWY_API VFromD<D> name##OfLanes(D d, VFromD<D> v) { \ 8903 HWY_LANES_CONSTEXPR size_t N = Lanes(d); \ 8904 VFromD<D> tmp = detail::Pairwise##name(v, v); \ 8905 if ((N / 2) > 1) tmp = detail::Pairwise##name(tmp, tmp); \ 8906 if ((N / 4) > 1) tmp = detail::Pairwise##name(tmp, tmp); \ 8907 return tmp; \ 8908 } \ 8909 /* Armv7 lacks q (full-vector) instructions, so first reduce 128-bit v */ \ 8910 /* into a half-vector, then reduce that. */ \ 8911 template <class D, HWY_IF_V_SIZE_D(D, 16), HWY_IF_NOT_T_SIZE_D(D, 8)> \ 8912 HWY_API VFromD<D> name##OfLanes(D d, VFromD<D> v) { \ 8913 const Half<D> dh; \ 8914 VFromD<decltype(dh)> upper = UpperHalf(dh, v); \ 8915 VFromD<decltype(dh)> lower = LowerHalf(dh, v); \ 8916 VFromD<decltype(dh)> half = detail::Pairwise##name(upper, lower); \ 8917 half = name##OfLanes(dh, half); \ 8918 return Combine(d, half, half); \ 8919 } 8920 8921 HWY_NEON_DEF_PAIRWISE_REDUCTION(Sum) 8922 HWY_NEON_DEF_PAIRWISE_REDUCTION(Min) 8923 HWY_NEON_DEF_PAIRWISE_REDUCTION(Max) 8924 #undef HWY_NEON_DEF_PAIRWISE_REDUCTION 8925 8926 // GetLane(SumsOf4(v)) is more efficient on ArmV7 NEON than the default 8927 // N=4 I8/U8 ReduceSum implementation in generic_ops-inl.h 8928 #ifdef HWY_NATIVE_REDUCE_SUM_4_UI8 8929 #undef HWY_NATIVE_REDUCE_SUM_4_UI8 8930 #else 8931 #define HWY_NATIVE_REDUCE_SUM_4_UI8 8932 #endif 8933 8934 template <class D, HWY_IF_V_SIZE_D(D, 4), HWY_IF_UI8_D(D)> 8935 HWY_API TFromD<D> ReduceSum(D /*d*/, VFromD<D> v) { 8936 return static_cast<TFromD<D>>(GetLane(SumsOf4(v))); 8937 } 8938 8939 #endif // HWY_ARCH_ARM_A64 8940 8941 // ------------------------------ LoadMaskBits (TestBit) 8942 8943 namespace detail { 8944 8945 // Helper function to set 64 bits and potentially return a smaller vector. The 8946 // overload is required to call the q vs non-q intrinsics. Note that 8-bit 8947 // LoadMaskBits only requires 16 bits, but 64 avoids casting. 8948 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 8949 HWY_INLINE VFromD<D> Set64(D /* tag */, uint64_t mask_bits) { 8950 const auto v64 = Vec64<uint64_t>(vdup_n_u64(mask_bits)); 8951 return VFromD<D>(BitCast(Full64<TFromD<D>>(), v64).raw); 8952 } 8953 template <typename T> 8954 HWY_INLINE Vec128<T> Set64(Full128<T> d, uint64_t mask_bits) { 8955 return BitCast(d, Vec128<uint64_t>(vdupq_n_u64(mask_bits))); 8956 } 8957 8958 template <class D, HWY_IF_T_SIZE_D(D, 1)> 8959 HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) { 8960 const RebindToUnsigned<decltype(d)> du; 8961 // Easier than Set(), which would require an >8-bit type, which would not 8962 // compile for T=uint8_t, N=1. 8963 const auto vmask_bits = Set64(du, mask_bits); 8964 8965 // Replicate bytes 8x such that each byte contains the bit that governs it. 
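// E.g. mask_bits = 0x01A5: byte 0 (0xA5) governs lanes 0..7 and byte 1
// (0x01) governs lanes 8..15, so lanes 0, 2, 5, 7 and 8 end up true.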
8966 alignas(16) static constexpr uint8_t kRep8[16] = {0, 0, 0, 0, 0, 0, 0, 0, 8967 1, 1, 1, 1, 1, 1, 1, 1}; 8968 const auto rep8 = TableLookupBytes(vmask_bits, Load(du, kRep8)); 8969 8970 alignas(16) static constexpr uint8_t kBit[16] = {1, 2, 4, 8, 16, 32, 64, 128, 8971 1, 2, 4, 8, 16, 32, 64, 128}; 8972 return RebindMask(d, TestBit(rep8, LoadDup128(du, kBit))); 8973 } 8974 8975 template <class D, HWY_IF_T_SIZE_D(D, 2)> 8976 HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) { 8977 const RebindToUnsigned<decltype(d)> du; 8978 alignas(16) static constexpr uint16_t kBit[8] = {1, 2, 4, 8, 16, 32, 64, 128}; 8979 const auto vmask_bits = Set(du, static_cast<uint16_t>(mask_bits)); 8980 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); 8981 } 8982 8983 template <class D, HWY_IF_T_SIZE_D(D, 4)> 8984 HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) { 8985 const RebindToUnsigned<decltype(d)> du; 8986 alignas(16) static constexpr uint32_t kBit[8] = {1, 2, 4, 8}; 8987 const auto vmask_bits = Set(du, static_cast<uint32_t>(mask_bits)); 8988 return RebindMask(d, TestBit(vmask_bits, Load(du, kBit))); 8989 } 8990 8991 template <class D, HWY_IF_T_SIZE_D(D, 8)> 8992 HWY_INLINE MFromD<D> LoadMaskBits(D d, uint64_t mask_bits) { 8993 const RebindToUnsigned<decltype(d)> du; 8994 alignas(16) static constexpr uint64_t kBit[8] = {1, 2}; 8995 return RebindMask(d, TestBit(Set(du, mask_bits), Load(du, kBit))); 8996 } 8997 8998 } // namespace detail 8999 9000 // `p` points to at least 8 readable bytes, not all of which need be valid. 9001 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 9002 HWY_API MFromD<D> LoadMaskBits(D d, const uint8_t* HWY_RESTRICT bits) { 9003 uint64_t mask_bits = 0; 9004 CopyBytes<(d.MaxLanes() + 7) / 8>(bits, &mask_bits); 9005 return detail::LoadMaskBits(d, mask_bits); 9006 } 9007 9008 // ------------------------------ Dup128MaskFromMaskBits 9009 9010 template <class D> 9011 HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) { 9012 constexpr size_t kN = MaxLanes(d); 9013 if (kN < 8) mask_bits &= (1u << kN) - 1; 9014 return detail::LoadMaskBits(d, mask_bits); 9015 } 9016 9017 // ------------------------------ Mask 9018 9019 namespace detail { 9020 9021 // Returns mask[i]? 0xF : 0 in each nibble. This is more efficient than 9022 // BitsFromMask for use in (partial) CountTrue, FindFirstTrue and AllFalse. 9023 template <class D, HWY_IF_V_SIZE_D(D, 16)> 9024 HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) { 9025 const Full128<uint16_t> du16; 9026 const Vec128<uint16_t> vu16 = BitCast(du16, VecFromMask(d, mask)); 9027 const Vec64<uint8_t> nib(vshrn_n_u16(vu16.raw, 4)); 9028 return GetLane(BitCast(Full64<uint64_t>(), nib)); 9029 } 9030 9031 template <class D, HWY_IF_V_SIZE_D(D, 8)> 9032 HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) { 9033 // There is no vshrn_n_u16 for uint16x4, so zero-extend. 9034 const Twice<decltype(d)> d2; 9035 const VFromD<decltype(d2)> v128 = ZeroExtendVector(d2, VecFromMask(d, mask)); 9036 // No need to mask, upper half is zero thanks to ZeroExtendVector. 9037 return NibblesFromMask(d2, MaskFromVec(v128)); 9038 } 9039 9040 template <class D, HWY_IF_V_SIZE_LE_D(D, 4)> 9041 HWY_INLINE uint64_t NibblesFromMask(D d, MFromD<D> mask) { 9042 const Mask64<TFromD<D>> mask64(mask.raw); 9043 const uint64_t nib = NibblesFromMask(Full64<TFromD<D>>(), mask64); 9044 // Clear nibbles from upper half of 64-bits 9045 return nib & ((1ull << (d.MaxBytes() * 4)) - 1); 9046 } 9047 9048 // Returns the lowest N for the BitsFromMask result. 
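// E.g. for a 4 x u8 vector, OnlyActive(d, 0xFFull) == 0x0Full; for vectors
// of at least 8 bytes the bits are already valid and pass through unchanged.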
9049 template <class D> 9050 constexpr uint64_t OnlyActive(D d, uint64_t bits) { 9051 return (d.MaxBytes() >= 8) ? bits : (bits & ((1ull << d.MaxLanes()) - 1)); 9052 } 9053 9054 } // namespace detail 9055 9056 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_D(D, 16)> 9057 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 9058 alignas(16) static constexpr uint8_t kSliceLanes[16] = { 9059 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80, 9060 }; 9061 const RebindToUnsigned<D> du; 9062 const Vec128<uint8_t> values = 9063 BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); 9064 9065 #if HWY_ARCH_ARM_A64 9066 // Can't vaddv - we need two separate bytes (16 bits). 9067 const uint8x8_t x2 = vget_low_u8(vpaddq_u8(values.raw, values.raw)); 9068 const uint8x8_t x4 = vpadd_u8(x2, x2); 9069 const uint8x8_t x8 = vpadd_u8(x4, x4); 9070 return vget_lane_u64(vreinterpret_u64_u8(x8), 0) & 0xFFFF; 9071 #else 9072 // Don't have vpaddq, so keep doubling lane size. 9073 const uint16x8_t x2 = vpaddlq_u8(values.raw); 9074 const uint32x4_t x4 = vpaddlq_u16(x2); 9075 const uint64x2_t x8 = vpaddlq_u32(x4); 9076 return (vgetq_lane_u64(x8, 1) << 8) | vgetq_lane_u64(x8, 0); 9077 #endif 9078 } 9079 9080 template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_V_SIZE_LE_D(D, 8)> 9081 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 9082 // Upper lanes of partial loads are undefined. OnlyActive will fix this if 9083 // we load all kSliceLanes so the upper lanes do not pollute the valid bits. 9084 alignas(8) static constexpr uint8_t kSliceLanes[8] = {1, 2, 4, 8, 9085 0x10, 0x20, 0x40, 0x80}; 9086 const RebindToUnsigned<decltype(d)> du; 9087 using VU = VFromD<decltype(du)>; 9088 const VU slice(Load(Full64<uint8_t>(), kSliceLanes).raw); 9089 const VU values = BitCast(du, VecFromMask(d, mask)) & slice; 9090 9091 #if HWY_ARCH_ARM_A64 9092 return detail::OnlyActive(d, vaddv_u8(values.raw)); 9093 #else 9094 const uint16x4_t x2 = vpaddl_u8(values.raw); 9095 const uint32x2_t x4 = vpaddl_u16(x2); 9096 const uint64x1_t x8 = vpaddl_u32(x4); 9097 return detail::OnlyActive(d, vget_lane_u64(x8, 0)); 9098 #endif 9099 } 9100 9101 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_D(D, 16)> 9102 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 9103 alignas(16) static constexpr uint16_t kSliceLanes[8] = { 9104 1, 2, 4, 8, 0x10, 0x20, 0x40, 0x80}; 9105 const RebindToUnsigned<D> du; 9106 const Vec128<uint16_t> values = 9107 BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); 9108 #if HWY_ARCH_ARM_A64 9109 return detail::OnlyActive(d, vaddvq_u16(values.raw)); 9110 #else 9111 const uint32x4_t x2 = vpaddlq_u16(values.raw); 9112 const uint64x2_t x4 = vpaddlq_u32(x2); 9113 return detail::OnlyActive(d, vgetq_lane_u64(x4, 0) + vgetq_lane_u64(x4, 1)); 9114 #endif 9115 } 9116 9117 template <class D, HWY_IF_T_SIZE_D(D, 2), HWY_IF_V_SIZE_LE_D(D, 8)> 9118 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 9119 // Upper lanes of partial loads are undefined. OnlyActive will fix this if 9120 // we load all kSliceLanes so the upper lanes do not pollute the valid bits. 
9121 alignas(8) static constexpr uint16_t kSliceLanes[4] = {1, 2, 4, 8}; 9122 const RebindToUnsigned<decltype(d)> du; 9123 using VU = VFromD<decltype(du)>; 9124 const VU slice(Load(Full64<uint16_t>(), kSliceLanes).raw); 9125 const VU values = BitCast(du, VecFromMask(d, mask)) & slice; 9126 #if HWY_ARCH_ARM_A64 9127 return detail::OnlyActive(d, vaddv_u16(values.raw)); 9128 #else 9129 const uint32x2_t x2 = vpaddl_u16(values.raw); 9130 const uint64x1_t x4 = vpaddl_u32(x2); 9131 return detail::OnlyActive(d, vget_lane_u64(x4, 0)); 9132 #endif 9133 } 9134 9135 template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_D(D, 16)> 9136 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 9137 alignas(16) static constexpr uint32_t kSliceLanes[4] = {1, 2, 4, 8}; 9138 const RebindToUnsigned<D> du; 9139 const Vec128<uint32_t> values = 9140 BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); 9141 #if HWY_ARCH_ARM_A64 9142 return detail::OnlyActive(d, vaddvq_u32(values.raw)); 9143 #else 9144 const uint64x2_t x2 = vpaddlq_u32(values.raw); 9145 return detail::OnlyActive(d, vgetq_lane_u64(x2, 0) + vgetq_lane_u64(x2, 1)); 9146 #endif 9147 } 9148 9149 template <class D, HWY_IF_T_SIZE_D(D, 4), HWY_IF_V_SIZE_LE_D(D, 8)> 9150 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 9151 // Upper lanes of partial loads are undefined. OnlyActive will fix this if 9152 // we load all kSliceLanes so the upper lanes do not pollute the valid bits. 9153 alignas(8) static constexpr uint32_t kSliceLanes[2] = {1, 2}; 9154 const RebindToUnsigned<decltype(d)> du; 9155 using VU = VFromD<decltype(du)>; 9156 const VU slice(Load(Full64<uint32_t>(), kSliceLanes).raw); 9157 const VU values = BitCast(du, VecFromMask(d, mask)) & slice; 9158 #if HWY_ARCH_ARM_A64 9159 return detail::OnlyActive(d, vaddv_u32(values.raw)); 9160 #else 9161 const uint64x1_t x2 = vpaddl_u32(values.raw); 9162 return detail::OnlyActive(d, vget_lane_u64(x2, 0)); 9163 #endif 9164 } 9165 9166 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_D(D, 16)> 9167 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 9168 alignas(16) static constexpr uint64_t kSliceLanes[2] = {1, 2}; 9169 const RebindToUnsigned<decltype(d)> du; 9170 const Vec128<uint64_t> values = 9171 BitCast(du, VecFromMask(d, mask)) & Load(du, kSliceLanes); 9172 #if HWY_ARCH_ARM_A64 9173 return detail::OnlyActive(d, vaddvq_u64(values.raw)); 9174 #else 9175 return detail::OnlyActive( 9176 d, vgetq_lane_u64(values.raw, 0) + vgetq_lane_u64(values.raw, 1)); 9177 #endif 9178 } 9179 9180 template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_V_SIZE_LE_D(D, 8)> 9181 HWY_API uint64_t BitsFromMask(D d, MFromD<D> mask) { 9182 const RebindToUnsigned<decltype(d)> du; 9183 const Vec64<uint64_t> values = BitCast(du, VecFromMask(d, mask)) & Set(du, 1); 9184 return vget_lane_u64(values.raw, 0); 9185 } 9186 9187 namespace detail { 9188 9189 // Returns number of lanes whose mask is set. 9190 // 9191 // Masks are either FF..FF or 0. Unfortunately there is no reduce-sub op 9192 // ("vsubv"). ANDing with 1 would work but requires a constant. Negating also 9193 // changes each lane to 1 (if mask set) or 0. 9194 // NOTE: PopCount also operates on vectors, so we still have to do horizontal 9195 // sums separately. We specialize CountTrue for full vectors (negating instead 9196 // of PopCount because it avoids an extra shift), and use PopCount of 9197 // NibblesFromMask for partial vectors. 
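// E.g. for an int8 mask whose lanes are [0xFF, 0x00, 0xFF, 0xFF, 0x00, ...]:
// vneg maps 0xFF (= -1) to 1 and leaves 0 unchanged, so the horizontal sum
// of the negated vector equals the number of set lanes.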
9198 9199 template <typename T> 9200 HWY_INLINE size_t CountTrue(hwy::SizeTag<1> /*tag*/, Mask128<T> mask) { 9201 const Full128<int8_t> di; 9202 const int8x16_t ones = 9203 vnegq_s8(BitCast(di, VecFromMask(Full128<T>(), mask)).raw); 9204 9205 #if HWY_ARCH_ARM_A64 9206 return static_cast<size_t>(vaddvq_s8(ones)); 9207 #else 9208 const int16x8_t x2 = vpaddlq_s8(ones); 9209 const int32x4_t x4 = vpaddlq_s16(x2); 9210 const int64x2_t x8 = vpaddlq_s32(x4); 9211 return static_cast<size_t>(vgetq_lane_s64(x8, 0) + vgetq_lane_s64(x8, 1)); 9212 #endif 9213 } 9214 template <typename T> 9215 HWY_INLINE size_t CountTrue(hwy::SizeTag<2> /*tag*/, Mask128<T> mask) { 9216 const Full128<int16_t> di; 9217 const int16x8_t ones = 9218 vnegq_s16(BitCast(di, VecFromMask(Full128<T>(), mask)).raw); 9219 9220 #if HWY_ARCH_ARM_A64 9221 return static_cast<size_t>(vaddvq_s16(ones)); 9222 #else 9223 const int32x4_t x2 = vpaddlq_s16(ones); 9224 const int64x2_t x4 = vpaddlq_s32(x2); 9225 return static_cast<size_t>(vgetq_lane_s64(x4, 0) + vgetq_lane_s64(x4, 1)); 9226 #endif 9227 } 9228 9229 template <typename T> 9230 HWY_INLINE size_t CountTrue(hwy::SizeTag<4> /*tag*/, Mask128<T> mask) { 9231 const Full128<int32_t> di; 9232 const int32x4_t ones = 9233 vnegq_s32(BitCast(di, VecFromMask(Full128<T>(), mask)).raw); 9234 9235 #if HWY_ARCH_ARM_A64 9236 return static_cast<size_t>(vaddvq_s32(ones)); 9237 #else 9238 const int64x2_t x2 = vpaddlq_s32(ones); 9239 return static_cast<size_t>(vgetq_lane_s64(x2, 0) + vgetq_lane_s64(x2, 1)); 9240 #endif 9241 } 9242 9243 template <typename T> 9244 HWY_INLINE size_t CountTrue(hwy::SizeTag<8> /*tag*/, Mask128<T> mask) { 9245 #if HWY_ARCH_ARM_A64 9246 const Full128<int64_t> di; 9247 const int64x2_t ones = 9248 vnegq_s64(BitCast(di, VecFromMask(Full128<T>(), mask)).raw); 9249 return static_cast<size_t>(vaddvq_s64(ones)); 9250 #else 9251 const Full128<uint64_t> du; 9252 const auto mask_u = VecFromMask(du, RebindMask(du, mask)); 9253 const uint64x2_t ones = vshrq_n_u64(mask_u.raw, 63); 9254 return static_cast<size_t>(vgetq_lane_u64(ones, 0) + vgetq_lane_u64(ones, 1)); 9255 #endif 9256 } 9257 9258 } // namespace detail 9259 9260 // Full 9261 template <class D, typename T = TFromD<D>> 9262 HWY_API size_t CountTrue(D /* tag */, Mask128<T> mask) { 9263 return detail::CountTrue(hwy::SizeTag<sizeof(T)>(), mask); 9264 } 9265 9266 // Partial 9267 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)> 9268 HWY_API size_t CountTrue(D d, MFromD<D> mask) { 9269 constexpr int kDiv = 4 * sizeof(TFromD<D>); 9270 return PopCount(detail::NibblesFromMask(d, mask)) / kDiv; 9271 } 9272 9273 template <class D> 9274 HWY_API size_t FindKnownFirstTrue(D d, MFromD<D> mask) { 9275 const uint64_t nib = detail::NibblesFromMask(d, mask); 9276 constexpr size_t kDiv = 4 * sizeof(TFromD<D>); 9277 return Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv; 9278 } 9279 9280 template <class D> 9281 HWY_API intptr_t FindFirstTrue(D d, MFromD<D> mask) { 9282 const uint64_t nib = detail::NibblesFromMask(d, mask); 9283 if (nib == 0) return -1; 9284 constexpr size_t kDiv = 4 * sizeof(TFromD<D>); 9285 return static_cast<intptr_t>(Num0BitsBelowLS1Bit_Nonzero64(nib) / kDiv); 9286 } 9287 9288 template <class D> 9289 HWY_API size_t FindKnownLastTrue(D d, MFromD<D> mask) { 9290 const uint64_t nib = detail::NibblesFromMask(d, mask); 9291 constexpr size_t kDiv = 4 * sizeof(TFromD<D>); 9292 return (63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) / kDiv; 9293 } 9294 9295 template <class D> 9296 HWY_API intptr_t FindLastTrue(D d, MFromD<D> mask) { 9297 const uint64_t nib = 
detail::NibblesFromMask(d, mask);
9298 if (nib == 0) return -1;
9299 constexpr size_t kDiv = 4 * sizeof(TFromD<D>);
9300 return static_cast<intptr_t>((63 - Num0BitsAboveMS1Bit_Nonzero64(nib)) /
9301 kDiv);
9302 }
9303
9304 // `bits` points to at least 8 writable bytes.
9305 template <class D>
9306 HWY_API size_t StoreMaskBits(D d, MFromD<D> mask, uint8_t* bits) {
9307 const uint64_t mask_bits = BitsFromMask(d, mask);
9308 const size_t kNumBytes = (d.MaxLanes() + 7) / 8;
9309 CopyBytes<kNumBytes>(&mask_bits, bits);
9310 return kNumBytes;
9311 }
9312
9313 template <class D>
9314 HWY_API bool AllFalse(D d, MFromD<D> m) {
9315 return detail::NibblesFromMask(d, m) == 0;
9316 }
9317
9318 // Full
9319 template <class D, typename T = TFromD<D>>
9320 HWY_API bool AllTrue(D d, Mask128<T> m) {
9321 return detail::NibblesFromMask(d, m) == ~0ull;
9322 }
9323 // Partial
9324 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
9325 HWY_API bool AllTrue(D d, MFromD<D> m) {
9326 return detail::NibblesFromMask(d, m) == (1ull << (d.MaxBytes() * 4)) - 1;
9327 }
9328
9329 // ------------------------------ Compress
9330
9331 template <typename T>
9332 struct CompressIsPartition {
9333 enum { value = (sizeof(T) != 1) };
9334 };
9335
9336 namespace detail {
9337
9338 // Load 8 bytes, replicate into upper half so ZipLower can use the lower half.
9339 template <class D, HWY_IF_V_SIZE_D(D, 16)>
9340 HWY_INLINE Vec128<uint8_t> Load8Bytes(D /*tag*/, const uint8_t* bytes) {
9341 return Vec128<uint8_t>(vreinterpretq_u8_u64(
9342 vld1q_dup_u64(HWY_RCAST_ALIGNED(const uint64_t*, bytes))));
9343 }
9344
9345 // Load 8 bytes and return half-reg with N <= 8 bytes.
9346 template <class D, HWY_IF_V_SIZE_LE_D(D, 8)>
9347 HWY_INLINE VFromD<D> Load8Bytes(D d, const uint8_t* bytes) {
9348 return Load(d, bytes);
9349 }
9350
9351 template <typename T, size_t N>
9352 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<2> /*tag*/,
9353 uint64_t mask_bits) {
9354 HWY_DASSERT(mask_bits < 256);
9355 const Simd<T, N, 0> d;
9356 const Repartition<uint8_t, decltype(d)> d8;
9357 const Simd<uint16_t, N, 0> du;
9358
9359 // NEON does not provide an equivalent of AVX2 permutevar, so we need byte
9360 // indices for VTBL (one vector's worth for each of 256 combinations of
9361 // 8 mask bits). Loading them directly would require 4 KiB. We can instead
9362 // store lane indices and convert to byte indices (2*lane + 0..1), with the
9363 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane
9364 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts.
9365 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles
9366 // is likely more costly than the higher cache footprint from storing bytes.
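// Worked example of the doubling trick: for mask_bits = 2 (only lane 1 true),
// the table entry at offset 2 * 8 below is {2, 0, 4, 6, 8, 10, 12, 14}, i.e.
// doubled indices for the lane order {1, 0, 2, 3, 4, 5, 6, 7}. ZipLower then
// duplicates each byte into a u16 pair (lane 0 becomes 0x0202), and adding
// Set(du, 0x0100) increments the upper byte, so lane 0 holds bytes {2, 3},
// exactly the two byte offsets of input lane 1, ready for TableLookupBytes.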
9367 alignas(16) static constexpr uint8_t table[256 * 8] = { 9368 // PrintCompress16x8Tables 9369 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 9370 2, 0, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 9371 4, 0, 2, 6, 8, 10, 12, 14, /**/ 0, 4, 2, 6, 8, 10, 12, 14, // 9372 2, 4, 0, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 9373 6, 0, 2, 4, 8, 10, 12, 14, /**/ 0, 6, 2, 4, 8, 10, 12, 14, // 9374 2, 6, 0, 4, 8, 10, 12, 14, /**/ 0, 2, 6, 4, 8, 10, 12, 14, // 9375 4, 6, 0, 2, 8, 10, 12, 14, /**/ 0, 4, 6, 2, 8, 10, 12, 14, // 9376 2, 4, 6, 0, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 9377 8, 0, 2, 4, 6, 10, 12, 14, /**/ 0, 8, 2, 4, 6, 10, 12, 14, // 9378 2, 8, 0, 4, 6, 10, 12, 14, /**/ 0, 2, 8, 4, 6, 10, 12, 14, // 9379 4, 8, 0, 2, 6, 10, 12, 14, /**/ 0, 4, 8, 2, 6, 10, 12, 14, // 9380 2, 4, 8, 0, 6, 10, 12, 14, /**/ 0, 2, 4, 8, 6, 10, 12, 14, // 9381 6, 8, 0, 2, 4, 10, 12, 14, /**/ 0, 6, 8, 2, 4, 10, 12, 14, // 9382 2, 6, 8, 0, 4, 10, 12, 14, /**/ 0, 2, 6, 8, 4, 10, 12, 14, // 9383 4, 6, 8, 0, 2, 10, 12, 14, /**/ 0, 4, 6, 8, 2, 10, 12, 14, // 9384 2, 4, 6, 8, 0, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 9385 10, 0, 2, 4, 6, 8, 12, 14, /**/ 0, 10, 2, 4, 6, 8, 12, 14, // 9386 2, 10, 0, 4, 6, 8, 12, 14, /**/ 0, 2, 10, 4, 6, 8, 12, 14, // 9387 4, 10, 0, 2, 6, 8, 12, 14, /**/ 0, 4, 10, 2, 6, 8, 12, 14, // 9388 2, 4, 10, 0, 6, 8, 12, 14, /**/ 0, 2, 4, 10, 6, 8, 12, 14, // 9389 6, 10, 0, 2, 4, 8, 12, 14, /**/ 0, 6, 10, 2, 4, 8, 12, 14, // 9390 2, 6, 10, 0, 4, 8, 12, 14, /**/ 0, 2, 6, 10, 4, 8, 12, 14, // 9391 4, 6, 10, 0, 2, 8, 12, 14, /**/ 0, 4, 6, 10, 2, 8, 12, 14, // 9392 2, 4, 6, 10, 0, 8, 12, 14, /**/ 0, 2, 4, 6, 10, 8, 12, 14, // 9393 8, 10, 0, 2, 4, 6, 12, 14, /**/ 0, 8, 10, 2, 4, 6, 12, 14, // 9394 2, 8, 10, 0, 4, 6, 12, 14, /**/ 0, 2, 8, 10, 4, 6, 12, 14, // 9395 4, 8, 10, 0, 2, 6, 12, 14, /**/ 0, 4, 8, 10, 2, 6, 12, 14, // 9396 2, 4, 8, 10, 0, 6, 12, 14, /**/ 0, 2, 4, 8, 10, 6, 12, 14, // 9397 6, 8, 10, 0, 2, 4, 12, 14, /**/ 0, 6, 8, 10, 2, 4, 12, 14, // 9398 2, 6, 8, 10, 0, 4, 12, 14, /**/ 0, 2, 6, 8, 10, 4, 12, 14, // 9399 4, 6, 8, 10, 0, 2, 12, 14, /**/ 0, 4, 6, 8, 10, 2, 12, 14, // 9400 2, 4, 6, 8, 10, 0, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 9401 12, 0, 2, 4, 6, 8, 10, 14, /**/ 0, 12, 2, 4, 6, 8, 10, 14, // 9402 2, 12, 0, 4, 6, 8, 10, 14, /**/ 0, 2, 12, 4, 6, 8, 10, 14, // 9403 4, 12, 0, 2, 6, 8, 10, 14, /**/ 0, 4, 12, 2, 6, 8, 10, 14, // 9404 2, 4, 12, 0, 6, 8, 10, 14, /**/ 0, 2, 4, 12, 6, 8, 10, 14, // 9405 6, 12, 0, 2, 4, 8, 10, 14, /**/ 0, 6, 12, 2, 4, 8, 10, 14, // 9406 2, 6, 12, 0, 4, 8, 10, 14, /**/ 0, 2, 6, 12, 4, 8, 10, 14, // 9407 4, 6, 12, 0, 2, 8, 10, 14, /**/ 0, 4, 6, 12, 2, 8, 10, 14, // 9408 2, 4, 6, 12, 0, 8, 10, 14, /**/ 0, 2, 4, 6, 12, 8, 10, 14, // 9409 8, 12, 0, 2, 4, 6, 10, 14, /**/ 0, 8, 12, 2, 4, 6, 10, 14, // 9410 2, 8, 12, 0, 4, 6, 10, 14, /**/ 0, 2, 8, 12, 4, 6, 10, 14, // 9411 4, 8, 12, 0, 2, 6, 10, 14, /**/ 0, 4, 8, 12, 2, 6, 10, 14, // 9412 2, 4, 8, 12, 0, 6, 10, 14, /**/ 0, 2, 4, 8, 12, 6, 10, 14, // 9413 6, 8, 12, 0, 2, 4, 10, 14, /**/ 0, 6, 8, 12, 2, 4, 10, 14, // 9414 2, 6, 8, 12, 0, 4, 10, 14, /**/ 0, 2, 6, 8, 12, 4, 10, 14, // 9415 4, 6, 8, 12, 0, 2, 10, 14, /**/ 0, 4, 6, 8, 12, 2, 10, 14, // 9416 2, 4, 6, 8, 12, 0, 10, 14, /**/ 0, 2, 4, 6, 8, 12, 10, 14, // 9417 10, 12, 0, 2, 4, 6, 8, 14, /**/ 0, 10, 12, 2, 4, 6, 8, 14, // 9418 2, 10, 12, 0, 4, 6, 8, 14, /**/ 0, 2, 10, 12, 4, 6, 8, 14, // 9419 4, 10, 12, 0, 2, 6, 8, 14, /**/ 0, 4, 10, 12, 2, 6, 8, 14, // 9420 2, 4, 10, 12, 0, 6, 8, 14, /**/ 0, 2, 
4, 10, 12, 6, 8, 14, // 9421 6, 10, 12, 0, 2, 4, 8, 14, /**/ 0, 6, 10, 12, 2, 4, 8, 14, // 9422 2, 6, 10, 12, 0, 4, 8, 14, /**/ 0, 2, 6, 10, 12, 4, 8, 14, // 9423 4, 6, 10, 12, 0, 2, 8, 14, /**/ 0, 4, 6, 10, 12, 2, 8, 14, // 9424 2, 4, 6, 10, 12, 0, 8, 14, /**/ 0, 2, 4, 6, 10, 12, 8, 14, // 9425 8, 10, 12, 0, 2, 4, 6, 14, /**/ 0, 8, 10, 12, 2, 4, 6, 14, // 9426 2, 8, 10, 12, 0, 4, 6, 14, /**/ 0, 2, 8, 10, 12, 4, 6, 14, // 9427 4, 8, 10, 12, 0, 2, 6, 14, /**/ 0, 4, 8, 10, 12, 2, 6, 14, // 9428 2, 4, 8, 10, 12, 0, 6, 14, /**/ 0, 2, 4, 8, 10, 12, 6, 14, // 9429 6, 8, 10, 12, 0, 2, 4, 14, /**/ 0, 6, 8, 10, 12, 2, 4, 14, // 9430 2, 6, 8, 10, 12, 0, 4, 14, /**/ 0, 2, 6, 8, 10, 12, 4, 14, // 9431 4, 6, 8, 10, 12, 0, 2, 14, /**/ 0, 4, 6, 8, 10, 12, 2, 14, // 9432 2, 4, 6, 8, 10, 12, 0, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14, // 9433 14, 0, 2, 4, 6, 8, 10, 12, /**/ 0, 14, 2, 4, 6, 8, 10, 12, // 9434 2, 14, 0, 4, 6, 8, 10, 12, /**/ 0, 2, 14, 4, 6, 8, 10, 12, // 9435 4, 14, 0, 2, 6, 8, 10, 12, /**/ 0, 4, 14, 2, 6, 8, 10, 12, // 9436 2, 4, 14, 0, 6, 8, 10, 12, /**/ 0, 2, 4, 14, 6, 8, 10, 12, // 9437 6, 14, 0, 2, 4, 8, 10, 12, /**/ 0, 6, 14, 2, 4, 8, 10, 12, // 9438 2, 6, 14, 0, 4, 8, 10, 12, /**/ 0, 2, 6, 14, 4, 8, 10, 12, // 9439 4, 6, 14, 0, 2, 8, 10, 12, /**/ 0, 4, 6, 14, 2, 8, 10, 12, // 9440 2, 4, 6, 14, 0, 8, 10, 12, /**/ 0, 2, 4, 6, 14, 8, 10, 12, // 9441 8, 14, 0, 2, 4, 6, 10, 12, /**/ 0, 8, 14, 2, 4, 6, 10, 12, // 9442 2, 8, 14, 0, 4, 6, 10, 12, /**/ 0, 2, 8, 14, 4, 6, 10, 12, // 9443 4, 8, 14, 0, 2, 6, 10, 12, /**/ 0, 4, 8, 14, 2, 6, 10, 12, // 9444 2, 4, 8, 14, 0, 6, 10, 12, /**/ 0, 2, 4, 8, 14, 6, 10, 12, // 9445 6, 8, 14, 0, 2, 4, 10, 12, /**/ 0, 6, 8, 14, 2, 4, 10, 12, // 9446 2, 6, 8, 14, 0, 4, 10, 12, /**/ 0, 2, 6, 8, 14, 4, 10, 12, // 9447 4, 6, 8, 14, 0, 2, 10, 12, /**/ 0, 4, 6, 8, 14, 2, 10, 12, // 9448 2, 4, 6, 8, 14, 0, 10, 12, /**/ 0, 2, 4, 6, 8, 14, 10, 12, // 9449 10, 14, 0, 2, 4, 6, 8, 12, /**/ 0, 10, 14, 2, 4, 6, 8, 12, // 9450 2, 10, 14, 0, 4, 6, 8, 12, /**/ 0, 2, 10, 14, 4, 6, 8, 12, // 9451 4, 10, 14, 0, 2, 6, 8, 12, /**/ 0, 4, 10, 14, 2, 6, 8, 12, // 9452 2, 4, 10, 14, 0, 6, 8, 12, /**/ 0, 2, 4, 10, 14, 6, 8, 12, // 9453 6, 10, 14, 0, 2, 4, 8, 12, /**/ 0, 6, 10, 14, 2, 4, 8, 12, // 9454 2, 6, 10, 14, 0, 4, 8, 12, /**/ 0, 2, 6, 10, 14, 4, 8, 12, // 9455 4, 6, 10, 14, 0, 2, 8, 12, /**/ 0, 4, 6, 10, 14, 2, 8, 12, // 9456 2, 4, 6, 10, 14, 0, 8, 12, /**/ 0, 2, 4, 6, 10, 14, 8, 12, // 9457 8, 10, 14, 0, 2, 4, 6, 12, /**/ 0, 8, 10, 14, 2, 4, 6, 12, // 9458 2, 8, 10, 14, 0, 4, 6, 12, /**/ 0, 2, 8, 10, 14, 4, 6, 12, // 9459 4, 8, 10, 14, 0, 2, 6, 12, /**/ 0, 4, 8, 10, 14, 2, 6, 12, // 9460 2, 4, 8, 10, 14, 0, 6, 12, /**/ 0, 2, 4, 8, 10, 14, 6, 12, // 9461 6, 8, 10, 14, 0, 2, 4, 12, /**/ 0, 6, 8, 10, 14, 2, 4, 12, // 9462 2, 6, 8, 10, 14, 0, 4, 12, /**/ 0, 2, 6, 8, 10, 14, 4, 12, // 9463 4, 6, 8, 10, 14, 0, 2, 12, /**/ 0, 4, 6, 8, 10, 14, 2, 12, // 9464 2, 4, 6, 8, 10, 14, 0, 12, /**/ 0, 2, 4, 6, 8, 10, 14, 12, // 9465 12, 14, 0, 2, 4, 6, 8, 10, /**/ 0, 12, 14, 2, 4, 6, 8, 10, // 9466 2, 12, 14, 0, 4, 6, 8, 10, /**/ 0, 2, 12, 14, 4, 6, 8, 10, // 9467 4, 12, 14, 0, 2, 6, 8, 10, /**/ 0, 4, 12, 14, 2, 6, 8, 10, // 9468 2, 4, 12, 14, 0, 6, 8, 10, /**/ 0, 2, 4, 12, 14, 6, 8, 10, // 9469 6, 12, 14, 0, 2, 4, 8, 10, /**/ 0, 6, 12, 14, 2, 4, 8, 10, // 9470 2, 6, 12, 14, 0, 4, 8, 10, /**/ 0, 2, 6, 12, 14, 4, 8, 10, // 9471 4, 6, 12, 14, 0, 2, 8, 10, /**/ 0, 4, 6, 12, 14, 2, 8, 10, // 9472 2, 4, 6, 12, 14, 0, 8, 10, /**/ 0, 2, 4, 6, 12, 14, 8, 10, // 9473 8, 12, 14, 0, 2, 4, 6, 10, /**/ 0, 8, 12, 
14, 2, 4, 6, 10, // 9474 2, 8, 12, 14, 0, 4, 6, 10, /**/ 0, 2, 8, 12, 14, 4, 6, 10, // 9475 4, 8, 12, 14, 0, 2, 6, 10, /**/ 0, 4, 8, 12, 14, 2, 6, 10, // 9476 2, 4, 8, 12, 14, 0, 6, 10, /**/ 0, 2, 4, 8, 12, 14, 6, 10, // 9477 6, 8, 12, 14, 0, 2, 4, 10, /**/ 0, 6, 8, 12, 14, 2, 4, 10, // 9478 2, 6, 8, 12, 14, 0, 4, 10, /**/ 0, 2, 6, 8, 12, 14, 4, 10, // 9479 4, 6, 8, 12, 14, 0, 2, 10, /**/ 0, 4, 6, 8, 12, 14, 2, 10, // 9480 2, 4, 6, 8, 12, 14, 0, 10, /**/ 0, 2, 4, 6, 8, 12, 14, 10, // 9481 10, 12, 14, 0, 2, 4, 6, 8, /**/ 0, 10, 12, 14, 2, 4, 6, 8, // 9482 2, 10, 12, 14, 0, 4, 6, 8, /**/ 0, 2, 10, 12, 14, 4, 6, 8, // 9483 4, 10, 12, 14, 0, 2, 6, 8, /**/ 0, 4, 10, 12, 14, 2, 6, 8, // 9484 2, 4, 10, 12, 14, 0, 6, 8, /**/ 0, 2, 4, 10, 12, 14, 6, 8, // 9485 6, 10, 12, 14, 0, 2, 4, 8, /**/ 0, 6, 10, 12, 14, 2, 4, 8, // 9486 2, 6, 10, 12, 14, 0, 4, 8, /**/ 0, 2, 6, 10, 12, 14, 4, 8, // 9487 4, 6, 10, 12, 14, 0, 2, 8, /**/ 0, 4, 6, 10, 12, 14, 2, 8, // 9488 2, 4, 6, 10, 12, 14, 0, 8, /**/ 0, 2, 4, 6, 10, 12, 14, 8, // 9489 8, 10, 12, 14, 0, 2, 4, 6, /**/ 0, 8, 10, 12, 14, 2, 4, 6, // 9490 2, 8, 10, 12, 14, 0, 4, 6, /**/ 0, 2, 8, 10, 12, 14, 4, 6, // 9491 4, 8, 10, 12, 14, 0, 2, 6, /**/ 0, 4, 8, 10, 12, 14, 2, 6, // 9492 2, 4, 8, 10, 12, 14, 0, 6, /**/ 0, 2, 4, 8, 10, 12, 14, 6, // 9493 6, 8, 10, 12, 14, 0, 2, 4, /**/ 0, 6, 8, 10, 12, 14, 2, 4, // 9494 2, 6, 8, 10, 12, 14, 0, 4, /**/ 0, 2, 6, 8, 10, 12, 14, 4, // 9495 4, 6, 8, 10, 12, 14, 0, 2, /**/ 0, 4, 6, 8, 10, 12, 14, 2, // 9496 2, 4, 6, 8, 10, 12, 14, 0, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; 9497 9498 const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8); 9499 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx); 9500 return BitCast(d, pairs + Set(du, 0x0100)); 9501 } 9502 9503 template <typename T, size_t N> 9504 HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<2> /*tag*/, 9505 uint64_t mask_bits) { 9506 HWY_DASSERT(mask_bits < 256); 9507 const Simd<T, N, 0> d; 9508 const Repartition<uint8_t, decltype(d)> d8; 9509 const Simd<uint16_t, N, 0> du; 9510 9511 // NEON does not provide an equivalent of AVX2 permutevar, so we need byte 9512 // indices for VTBL (one vector's worth for each of 256 combinations of 9513 // 8 mask bits). Loading them directly would require 4 KiB. We can instead 9514 // store lane indices and convert to byte indices (2*lane + 0..1), with the 9515 // doubling baked into the table. AVX2 Compress32 stores eight 4-bit lane 9516 // indices (total 1 KiB), broadcasts them into each 32-bit lane and shifts. 9517 // Here, 16-bit lanes are too narrow to hold all bits, and unpacking nibbles 9518 // is likely more costly than the higher cache footprint from storing bytes. 
9519 alignas(16) static constexpr uint8_t table[256 * 8] = { 9520 // PrintCompressNot16x8Tables 9521 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 14, 0, // 9522 0, 4, 6, 8, 10, 12, 14, 2, /**/ 4, 6, 8, 10, 12, 14, 0, 2, // 9523 0, 2, 6, 8, 10, 12, 14, 4, /**/ 2, 6, 8, 10, 12, 14, 0, 4, // 9524 0, 6, 8, 10, 12, 14, 2, 4, /**/ 6, 8, 10, 12, 14, 0, 2, 4, // 9525 0, 2, 4, 8, 10, 12, 14, 6, /**/ 2, 4, 8, 10, 12, 14, 0, 6, // 9526 0, 4, 8, 10, 12, 14, 2, 6, /**/ 4, 8, 10, 12, 14, 0, 2, 6, // 9527 0, 2, 8, 10, 12, 14, 4, 6, /**/ 2, 8, 10, 12, 14, 0, 4, 6, // 9528 0, 8, 10, 12, 14, 2, 4, 6, /**/ 8, 10, 12, 14, 0, 2, 4, 6, // 9529 0, 2, 4, 6, 10, 12, 14, 8, /**/ 2, 4, 6, 10, 12, 14, 0, 8, // 9530 0, 4, 6, 10, 12, 14, 2, 8, /**/ 4, 6, 10, 12, 14, 0, 2, 8, // 9531 0, 2, 6, 10, 12, 14, 4, 8, /**/ 2, 6, 10, 12, 14, 0, 4, 8, // 9532 0, 6, 10, 12, 14, 2, 4, 8, /**/ 6, 10, 12, 14, 0, 2, 4, 8, // 9533 0, 2, 4, 10, 12, 14, 6, 8, /**/ 2, 4, 10, 12, 14, 0, 6, 8, // 9534 0, 4, 10, 12, 14, 2, 6, 8, /**/ 4, 10, 12, 14, 0, 2, 6, 8, // 9535 0, 2, 10, 12, 14, 4, 6, 8, /**/ 2, 10, 12, 14, 0, 4, 6, 8, // 9536 0, 10, 12, 14, 2, 4, 6, 8, /**/ 10, 12, 14, 0, 2, 4, 6, 8, // 9537 0, 2, 4, 6, 8, 12, 14, 10, /**/ 2, 4, 6, 8, 12, 14, 0, 10, // 9538 0, 4, 6, 8, 12, 14, 2, 10, /**/ 4, 6, 8, 12, 14, 0, 2, 10, // 9539 0, 2, 6, 8, 12, 14, 4, 10, /**/ 2, 6, 8, 12, 14, 0, 4, 10, // 9540 0, 6, 8, 12, 14, 2, 4, 10, /**/ 6, 8, 12, 14, 0, 2, 4, 10, // 9541 0, 2, 4, 8, 12, 14, 6, 10, /**/ 2, 4, 8, 12, 14, 0, 6, 10, // 9542 0, 4, 8, 12, 14, 2, 6, 10, /**/ 4, 8, 12, 14, 0, 2, 6, 10, // 9543 0, 2, 8, 12, 14, 4, 6, 10, /**/ 2, 8, 12, 14, 0, 4, 6, 10, // 9544 0, 8, 12, 14, 2, 4, 6, 10, /**/ 8, 12, 14, 0, 2, 4, 6, 10, // 9545 0, 2, 4, 6, 12, 14, 8, 10, /**/ 2, 4, 6, 12, 14, 0, 8, 10, // 9546 0, 4, 6, 12, 14, 2, 8, 10, /**/ 4, 6, 12, 14, 0, 2, 8, 10, // 9547 0, 2, 6, 12, 14, 4, 8, 10, /**/ 2, 6, 12, 14, 0, 4, 8, 10, // 9548 0, 6, 12, 14, 2, 4, 8, 10, /**/ 6, 12, 14, 0, 2, 4, 8, 10, // 9549 0, 2, 4, 12, 14, 6, 8, 10, /**/ 2, 4, 12, 14, 0, 6, 8, 10, // 9550 0, 4, 12, 14, 2, 6, 8, 10, /**/ 4, 12, 14, 0, 2, 6, 8, 10, // 9551 0, 2, 12, 14, 4, 6, 8, 10, /**/ 2, 12, 14, 0, 4, 6, 8, 10, // 9552 0, 12, 14, 2, 4, 6, 8, 10, /**/ 12, 14, 0, 2, 4, 6, 8, 10, // 9553 0, 2, 4, 6, 8, 10, 14, 12, /**/ 2, 4, 6, 8, 10, 14, 0, 12, // 9554 0, 4, 6, 8, 10, 14, 2, 12, /**/ 4, 6, 8, 10, 14, 0, 2, 12, // 9555 0, 2, 6, 8, 10, 14, 4, 12, /**/ 2, 6, 8, 10, 14, 0, 4, 12, // 9556 0, 6, 8, 10, 14, 2, 4, 12, /**/ 6, 8, 10, 14, 0, 2, 4, 12, // 9557 0, 2, 4, 8, 10, 14, 6, 12, /**/ 2, 4, 8, 10, 14, 0, 6, 12, // 9558 0, 4, 8, 10, 14, 2, 6, 12, /**/ 4, 8, 10, 14, 0, 2, 6, 12, // 9559 0, 2, 8, 10, 14, 4, 6, 12, /**/ 2, 8, 10, 14, 0, 4, 6, 12, // 9560 0, 8, 10, 14, 2, 4, 6, 12, /**/ 8, 10, 14, 0, 2, 4, 6, 12, // 9561 0, 2, 4, 6, 10, 14, 8, 12, /**/ 2, 4, 6, 10, 14, 0, 8, 12, // 9562 0, 4, 6, 10, 14, 2, 8, 12, /**/ 4, 6, 10, 14, 0, 2, 8, 12, // 9563 0, 2, 6, 10, 14, 4, 8, 12, /**/ 2, 6, 10, 14, 0, 4, 8, 12, // 9564 0, 6, 10, 14, 2, 4, 8, 12, /**/ 6, 10, 14, 0, 2, 4, 8, 12, // 9565 0, 2, 4, 10, 14, 6, 8, 12, /**/ 2, 4, 10, 14, 0, 6, 8, 12, // 9566 0, 4, 10, 14, 2, 6, 8, 12, /**/ 4, 10, 14, 0, 2, 6, 8, 12, // 9567 0, 2, 10, 14, 4, 6, 8, 12, /**/ 2, 10, 14, 0, 4, 6, 8, 12, // 9568 0, 10, 14, 2, 4, 6, 8, 12, /**/ 10, 14, 0, 2, 4, 6, 8, 12, // 9569 0, 2, 4, 6, 8, 14, 10, 12, /**/ 2, 4, 6, 8, 14, 0, 10, 12, // 9570 0, 4, 6, 8, 14, 2, 10, 12, /**/ 4, 6, 8, 14, 0, 2, 10, 12, // 9571 0, 2, 6, 8, 14, 4, 10, 12, /**/ 2, 6, 8, 14, 0, 4, 10, 12, // 9572 0, 6, 8, 14, 2, 4, 10, 12, /**/ 6, 
8, 14, 0, 2, 4, 10, 12, // 9573 0, 2, 4, 8, 14, 6, 10, 12, /**/ 2, 4, 8, 14, 0, 6, 10, 12, // 9574 0, 4, 8, 14, 2, 6, 10, 12, /**/ 4, 8, 14, 0, 2, 6, 10, 12, // 9575 0, 2, 8, 14, 4, 6, 10, 12, /**/ 2, 8, 14, 0, 4, 6, 10, 12, // 9576 0, 8, 14, 2, 4, 6, 10, 12, /**/ 8, 14, 0, 2, 4, 6, 10, 12, // 9577 0, 2, 4, 6, 14, 8, 10, 12, /**/ 2, 4, 6, 14, 0, 8, 10, 12, // 9578 0, 4, 6, 14, 2, 8, 10, 12, /**/ 4, 6, 14, 0, 2, 8, 10, 12, // 9579 0, 2, 6, 14, 4, 8, 10, 12, /**/ 2, 6, 14, 0, 4, 8, 10, 12, // 9580 0, 6, 14, 2, 4, 8, 10, 12, /**/ 6, 14, 0, 2, 4, 8, 10, 12, // 9581 0, 2, 4, 14, 6, 8, 10, 12, /**/ 2, 4, 14, 0, 6, 8, 10, 12, // 9582 0, 4, 14, 2, 6, 8, 10, 12, /**/ 4, 14, 0, 2, 6, 8, 10, 12, // 9583 0, 2, 14, 4, 6, 8, 10, 12, /**/ 2, 14, 0, 4, 6, 8, 10, 12, // 9584 0, 14, 2, 4, 6, 8, 10, 12, /**/ 14, 0, 2, 4, 6, 8, 10, 12, // 9585 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 12, 0, 14, // 9586 0, 4, 6, 8, 10, 12, 2, 14, /**/ 4, 6, 8, 10, 12, 0, 2, 14, // 9587 0, 2, 6, 8, 10, 12, 4, 14, /**/ 2, 6, 8, 10, 12, 0, 4, 14, // 9588 0, 6, 8, 10, 12, 2, 4, 14, /**/ 6, 8, 10, 12, 0, 2, 4, 14, // 9589 0, 2, 4, 8, 10, 12, 6, 14, /**/ 2, 4, 8, 10, 12, 0, 6, 14, // 9590 0, 4, 8, 10, 12, 2, 6, 14, /**/ 4, 8, 10, 12, 0, 2, 6, 14, // 9591 0, 2, 8, 10, 12, 4, 6, 14, /**/ 2, 8, 10, 12, 0, 4, 6, 14, // 9592 0, 8, 10, 12, 2, 4, 6, 14, /**/ 8, 10, 12, 0, 2, 4, 6, 14, // 9593 0, 2, 4, 6, 10, 12, 8, 14, /**/ 2, 4, 6, 10, 12, 0, 8, 14, // 9594 0, 4, 6, 10, 12, 2, 8, 14, /**/ 4, 6, 10, 12, 0, 2, 8, 14, // 9595 0, 2, 6, 10, 12, 4, 8, 14, /**/ 2, 6, 10, 12, 0, 4, 8, 14, // 9596 0, 6, 10, 12, 2, 4, 8, 14, /**/ 6, 10, 12, 0, 2, 4, 8, 14, // 9597 0, 2, 4, 10, 12, 6, 8, 14, /**/ 2, 4, 10, 12, 0, 6, 8, 14, // 9598 0, 4, 10, 12, 2, 6, 8, 14, /**/ 4, 10, 12, 0, 2, 6, 8, 14, // 9599 0, 2, 10, 12, 4, 6, 8, 14, /**/ 2, 10, 12, 0, 4, 6, 8, 14, // 9600 0, 10, 12, 2, 4, 6, 8, 14, /**/ 10, 12, 0, 2, 4, 6, 8, 14, // 9601 0, 2, 4, 6, 8, 12, 10, 14, /**/ 2, 4, 6, 8, 12, 0, 10, 14, // 9602 0, 4, 6, 8, 12, 2, 10, 14, /**/ 4, 6, 8, 12, 0, 2, 10, 14, // 9603 0, 2, 6, 8, 12, 4, 10, 14, /**/ 2, 6, 8, 12, 0, 4, 10, 14, // 9604 0, 6, 8, 12, 2, 4, 10, 14, /**/ 6, 8, 12, 0, 2, 4, 10, 14, // 9605 0, 2, 4, 8, 12, 6, 10, 14, /**/ 2, 4, 8, 12, 0, 6, 10, 14, // 9606 0, 4, 8, 12, 2, 6, 10, 14, /**/ 4, 8, 12, 0, 2, 6, 10, 14, // 9607 0, 2, 8, 12, 4, 6, 10, 14, /**/ 2, 8, 12, 0, 4, 6, 10, 14, // 9608 0, 8, 12, 2, 4, 6, 10, 14, /**/ 8, 12, 0, 2, 4, 6, 10, 14, // 9609 0, 2, 4, 6, 12, 8, 10, 14, /**/ 2, 4, 6, 12, 0, 8, 10, 14, // 9610 0, 4, 6, 12, 2, 8, 10, 14, /**/ 4, 6, 12, 0, 2, 8, 10, 14, // 9611 0, 2, 6, 12, 4, 8, 10, 14, /**/ 2, 6, 12, 0, 4, 8, 10, 14, // 9612 0, 6, 12, 2, 4, 8, 10, 14, /**/ 6, 12, 0, 2, 4, 8, 10, 14, // 9613 0, 2, 4, 12, 6, 8, 10, 14, /**/ 2, 4, 12, 0, 6, 8, 10, 14, // 9614 0, 4, 12, 2, 6, 8, 10, 14, /**/ 4, 12, 0, 2, 6, 8, 10, 14, // 9615 0, 2, 12, 4, 6, 8, 10, 14, /**/ 2, 12, 0, 4, 6, 8, 10, 14, // 9616 0, 12, 2, 4, 6, 8, 10, 14, /**/ 12, 0, 2, 4, 6, 8, 10, 14, // 9617 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 10, 0, 12, 14, // 9618 0, 4, 6, 8, 10, 2, 12, 14, /**/ 4, 6, 8, 10, 0, 2, 12, 14, // 9619 0, 2, 6, 8, 10, 4, 12, 14, /**/ 2, 6, 8, 10, 0, 4, 12, 14, // 9620 0, 6, 8, 10, 2, 4, 12, 14, /**/ 6, 8, 10, 0, 2, 4, 12, 14, // 9621 0, 2, 4, 8, 10, 6, 12, 14, /**/ 2, 4, 8, 10, 0, 6, 12, 14, // 9622 0, 4, 8, 10, 2, 6, 12, 14, /**/ 4, 8, 10, 0, 2, 6, 12, 14, // 9623 0, 2, 8, 10, 4, 6, 12, 14, /**/ 2, 8, 10, 0, 4, 6, 12, 14, // 9624 0, 8, 10, 2, 4, 6, 12, 14, /**/ 8, 10, 0, 2, 4, 6, 12, 14, // 9625 0, 2, 4, 6, 10, 8, 12, 14, /**/ 2, 4, 
6, 10, 0, 8, 12, 14, // 9626 0, 4, 6, 10, 2, 8, 12, 14, /**/ 4, 6, 10, 0, 2, 8, 12, 14, // 9627 0, 2, 6, 10, 4, 8, 12, 14, /**/ 2, 6, 10, 0, 4, 8, 12, 14, // 9628 0, 6, 10, 2, 4, 8, 12, 14, /**/ 6, 10, 0, 2, 4, 8, 12, 14, // 9629 0, 2, 4, 10, 6, 8, 12, 14, /**/ 2, 4, 10, 0, 6, 8, 12, 14, // 9630 0, 4, 10, 2, 6, 8, 12, 14, /**/ 4, 10, 0, 2, 6, 8, 12, 14, // 9631 0, 2, 10, 4, 6, 8, 12, 14, /**/ 2, 10, 0, 4, 6, 8, 12, 14, // 9632 0, 10, 2, 4, 6, 8, 12, 14, /**/ 10, 0, 2, 4, 6, 8, 12, 14, // 9633 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 8, 0, 10, 12, 14, // 9634 0, 4, 6, 8, 2, 10, 12, 14, /**/ 4, 6, 8, 0, 2, 10, 12, 14, // 9635 0, 2, 6, 8, 4, 10, 12, 14, /**/ 2, 6, 8, 0, 4, 10, 12, 14, // 9636 0, 6, 8, 2, 4, 10, 12, 14, /**/ 6, 8, 0, 2, 4, 10, 12, 14, // 9637 0, 2, 4, 8, 6, 10, 12, 14, /**/ 2, 4, 8, 0, 6, 10, 12, 14, // 9638 0, 4, 8, 2, 6, 10, 12, 14, /**/ 4, 8, 0, 2, 6, 10, 12, 14, // 9639 0, 2, 8, 4, 6, 10, 12, 14, /**/ 2, 8, 0, 4, 6, 10, 12, 14, // 9640 0, 8, 2, 4, 6, 10, 12, 14, /**/ 8, 0, 2, 4, 6, 10, 12, 14, // 9641 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 6, 0, 8, 10, 12, 14, // 9642 0, 4, 6, 2, 8, 10, 12, 14, /**/ 4, 6, 0, 2, 8, 10, 12, 14, // 9643 0, 2, 6, 4, 8, 10, 12, 14, /**/ 2, 6, 0, 4, 8, 10, 12, 14, // 9644 0, 6, 2, 4, 8, 10, 12, 14, /**/ 6, 0, 2, 4, 8, 10, 12, 14, // 9645 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 4, 0, 6, 8, 10, 12, 14, // 9646 0, 4, 2, 6, 8, 10, 12, 14, /**/ 4, 0, 2, 6, 8, 10, 12, 14, // 9647 0, 2, 4, 6, 8, 10, 12, 14, /**/ 2, 0, 4, 6, 8, 10, 12, 14, // 9648 0, 2, 4, 6, 8, 10, 12, 14, /**/ 0, 2, 4, 6, 8, 10, 12, 14}; 9649 9650 const Vec128<uint8_t, 2 * N> byte_idx = Load8Bytes(d8, table + mask_bits * 8); 9651 const Vec128<uint16_t, N> pairs = ZipLower(byte_idx, byte_idx); 9652 return BitCast(d, pairs + Set(du, 0x0100)); 9653 } 9654 9655 template <typename T, size_t N> 9656 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<4> /*tag*/, 9657 uint64_t mask_bits) { 9658 HWY_DASSERT(mask_bits < 16); 9659 9660 // There are only 4 lanes, so we can afford to load the index vector directly. 9661 alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { 9662 // PrintCompress32x4Tables 9663 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 9664 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 9665 4, 5, 6, 7, 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, // 9666 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 9667 8, 9, 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, // 9668 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, // 9669 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, // 9670 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, // 9671 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, // 9672 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, // 9673 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 8, 9, 10, 11, // 9674 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, // 9675 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, // 9676 0, 1, 2, 3, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, // 9677 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, // 9678 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 9679 const Simd<T, N, 0> d; 9680 const Repartition<uint8_t, decltype(d)> d8; 9681 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 9682 } 9683 9684 template <typename T, size_t N> 9685 HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<4> /*tag*/, 9686 uint64_t mask_bits) { 9687 HWY_DASSERT(mask_bits < 16); 9688 9689 // There are only 4 lanes, so we can afford to load the index vector directly. 
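// For example, mask_bits = 1 means lane 0 is true and must be compressed to
// the end: entry 1 below is {4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1,
// 2, 3}, i.e. byte indices for the lane order {1, 2, 3, 0}.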
9690 alignas(16) static constexpr uint8_t u8_indices[16 * 16] = { 9691 // PrintCompressNot32x4Tables 9692 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 9693 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 0, 1, 2, 3, 9694 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 9695 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 9696 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 9697 2, 3, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 4, 5, 6, 7, 9698 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9699 10, 11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 9700 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, 12, 13, 14, 15, 0, 1, 9701 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15, 8, 9, 10, 11, 9702 0, 1, 2, 3, 4, 5, 6, 7, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 9703 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 4, 5, 6, 7, 0, 1, 2, 3, 9704 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 9705 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 9706 12, 13, 14, 15}; 9707 const Simd<T, N, 0> d; 9708 const Repartition<uint8_t, decltype(d)> d8; 9709 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 9710 } 9711 9712 #if HWY_HAVE_INTEGER64 || HWY_HAVE_FLOAT64 9713 9714 template <typename T, size_t N> 9715 HWY_INLINE Vec128<T, N> IdxFromBits(hwy::SizeTag<8> /*tag*/, 9716 uint64_t mask_bits) { 9717 HWY_DASSERT(mask_bits < 4); 9718 9719 // There are only 2 lanes, so we can afford to load the index vector directly. 9720 alignas(16) static constexpr uint8_t u8_indices[64] = { 9721 // PrintCompress64x2Tables 9722 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 9723 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 9724 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 9725 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 9726 9727 const Simd<T, N, 0> d; 9728 const Repartition<uint8_t, decltype(d)> d8; 9729 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 9730 } 9731 9732 template <typename T, size_t N> 9733 HWY_INLINE Vec128<T, N> IdxFromNotBits(hwy::SizeTag<8> /*tag*/, 9734 uint64_t mask_bits) { 9735 HWY_DASSERT(mask_bits < 4); 9736 9737 // There are only 2 lanes, so we can afford to load the index vector directly. 9738 alignas(16) static constexpr uint8_t u8_indices[4 * 16] = { 9739 // PrintCompressNot64x2Tables 9740 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 9741 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7, 9742 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 9743 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; 9744 9745 const Simd<T, N, 0> d; 9746 const Repartition<uint8_t, decltype(d)> d8; 9747 return BitCast(d, Load(d8, u8_indices + 16 * mask_bits)); 9748 } 9749 9750 #endif 9751 9752 // Helper function called by both Compress and CompressStore - avoids a 9753 // redundant BitsFromMask in the latter. 
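// In other words, the public Compress(v, mask) below is equivalent to
// detail::Compress(v, BitsFromMask(d, mask)); CompressStore computes
// mask_bits once and reuses it for both the permutation and PopCount.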
9754 template <typename T, size_t N> 9755 HWY_INLINE Vec128<T, N> Compress(Vec128<T, N> v, uint64_t mask_bits) { 9756 const auto idx = 9757 detail::IdxFromBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits); 9758 using D = DFromV<decltype(v)>; 9759 const RebindToSigned<D> di; 9760 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); 9761 } 9762 9763 template <typename T, size_t N> 9764 HWY_INLINE Vec128<T, N> CompressNot(Vec128<T, N> v, uint64_t mask_bits) { 9765 const auto idx = 9766 detail::IdxFromNotBits<T, N>(hwy::SizeTag<sizeof(T)>(), mask_bits); 9767 using D = DFromV<decltype(v)>; 9768 const RebindToSigned<D> di; 9769 return BitCast(D(), TableLookupBytes(BitCast(di, v), BitCast(di, idx))); 9770 } 9771 9772 } // namespace detail 9773 9774 // Single lane: no-op 9775 template <typename T> 9776 HWY_API Vec128<T, 1> Compress(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { 9777 return v; 9778 } 9779 9780 // Two lanes: conditional swap 9781 template <typename T, size_t N, HWY_IF_T_SIZE(T, 8)> 9782 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { 9783 // If mask[1] = 1 and mask[0] = 0, then swap both halves, else keep. 9784 const DFromV<decltype(v)> d; 9785 const Vec128<T, N> m = VecFromMask(d, mask); 9786 const Vec128<T, N> maskL = DupEven(m); 9787 const Vec128<T, N> maskH = DupOdd(m); 9788 const Vec128<T, N> swap = AndNot(maskL, maskH); 9789 return IfVecThenElse(swap, Shuffle01(v), v); 9790 } 9791 9792 // General case, 2 or 4 byte lanes 9793 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> 9794 HWY_API Vec128<T, N> Compress(Vec128<T, N> v, Mask128<T, N> mask) { 9795 const DFromV<decltype(v)> d; 9796 return detail::Compress(v, BitsFromMask(d, mask)); 9797 } 9798 9799 // Single lane: no-op 9800 template <typename T> 9801 HWY_API Vec128<T, 1> CompressNot(Vec128<T, 1> v, Mask128<T, 1> /*m*/) { 9802 return v; 9803 } 9804 9805 // Two lanes: conditional swap 9806 template <typename T, HWY_IF_T_SIZE(T, 8)> 9807 HWY_API Vec128<T> CompressNot(Vec128<T> v, Mask128<T> mask) { 9808 // If mask[1] = 0 and mask[0] = 1, then swap both halves, else keep. 9809 const DFromV<decltype(v)> d; 9810 const Vec128<T> m = VecFromMask(d, mask); 9811 const Vec128<T> maskL = DupEven(m); 9812 const Vec128<T> maskH = DupOdd(m); 9813 const Vec128<T> swap = AndNot(maskH, maskL); 9814 return IfVecThenElse(swap, Shuffle01(v), v); 9815 } 9816 9817 // General case, 2 or 4 byte lanes 9818 template <typename T, size_t N, HWY_IF_T_SIZE_ONE_OF(T, (1 << 2) | (1 << 4))> 9819 HWY_API Vec128<T, N> CompressNot(Vec128<T, N> v, Mask128<T, N> mask) { 9820 const DFromV<decltype(v)> d; 9821 // For partial vectors, we cannot pull the Not() into the table because 9822 // BitsFromMask clears the upper bits. 
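// For example, with N=4 lanes, bits 4..7 of BitsFromMask are zero, so the
// NotBits table would place those nonexistent "false" lanes between the real
// false lanes and the true lanes, breaking the partition guarantee.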
9823 if (N < 16 / sizeof(T)) { 9824 return detail::Compress(v, BitsFromMask(d, Not(mask))); 9825 } 9826 return detail::CompressNot(v, BitsFromMask(d, mask)); 9827 } 9828 9829 // ------------------------------ CompressBlocksNot 9830 HWY_API Vec128<uint64_t> CompressBlocksNot(Vec128<uint64_t> v, 9831 Mask128<uint64_t> /* m */) { 9832 return v; 9833 } 9834 9835 // ------------------------------ CompressBits 9836 9837 template <typename T, size_t N, HWY_IF_NOT_T_SIZE(T, 1)> 9838 HWY_INLINE Vec128<T, N> CompressBits(Vec128<T, N> v, 9839 const uint8_t* HWY_RESTRICT bits) { 9840 uint64_t mask_bits = 0; 9841 constexpr size_t kNumBytes = (N + 7) / 8; 9842 CopyBytes<kNumBytes>(bits, &mask_bits); 9843 if (N < 8) { 9844 mask_bits &= (1ull << N) - 1; 9845 } 9846 9847 return detail::Compress(v, mask_bits); 9848 } 9849 9850 // ------------------------------ CompressStore 9851 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> 9852 HWY_API size_t CompressStore(VFromD<D> v, MFromD<D> mask, D d, 9853 TFromD<D>* HWY_RESTRICT unaligned) { 9854 const uint64_t mask_bits = BitsFromMask(d, mask); 9855 StoreU(detail::Compress(v, mask_bits), d, unaligned); 9856 return PopCount(mask_bits); 9857 } 9858 9859 // ------------------------------ CompressBlendedStore 9860 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> 9861 HWY_API size_t CompressBlendedStore(VFromD<D> v, MFromD<D> m, D d, 9862 TFromD<D>* HWY_RESTRICT unaligned) { 9863 const RebindToUnsigned<decltype(d)> du; // so we can support fp16/bf16 9864 const uint64_t mask_bits = BitsFromMask(d, m); 9865 const size_t count = PopCount(mask_bits); 9866 const MFromD<D> store_mask = RebindMask(d, FirstN(du, count)); 9867 const VFromD<decltype(du)> compressed = 9868 detail::Compress(BitCast(du, v), mask_bits); 9869 BlendedStore(BitCast(d, compressed), store_mask, d, unaligned); 9870 return count; 9871 } 9872 9873 // ------------------------------ CompressBitsStore 9874 9875 template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)> 9876 HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits, 9877 D d, TFromD<D>* HWY_RESTRICT unaligned) { 9878 uint64_t mask_bits = 0; 9879 constexpr size_t kNumBytes = (d.MaxLanes() + 7) / 8; 9880 CopyBytes<kNumBytes>(bits, &mask_bits); 9881 if (d.MaxLanes() < 8) { 9882 mask_bits &= (1ull << d.MaxLanes()) - 1; 9883 } 9884 9885 StoreU(detail::Compress(v, mask_bits), d, unaligned); 9886 return PopCount(mask_bits); 9887 } 9888 9889 // ------------------------------ LoadInterleaved2 9890 9891 // Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2. 9892 #ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED 9893 #undef HWY_NATIVE_LOAD_STORE_INTERLEAVED 9894 #else 9895 #define HWY_NATIVE_LOAD_STORE_INTERLEAVED 9896 #endif 9897 9898 namespace detail { 9899 9900 #define HWY_NEON_BUILD_TPL_HWY_LOAD_INT 9901 #define HWY_NEON_BUILD_ARG_HWY_LOAD_INT from 9902 9903 #if HWY_ARCH_ARM_A64 9904 #define HWY_IF_LOAD_INT(D) \ 9905 HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D) 9906 #define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \ 9907 HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \ 9908 HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) 9909 #else 9910 // Exclude 64x2 and f64x1, which are only supported on aarch64; also exclude any 9911 // emulated types. 
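// Hence on Armv7, wrappers are generated for all 8/16/32-bit lane types and
// for one-lane 64-bit integer vectors (via vld2_s64/vld2_u64 in the macro
// list below), but not for two-lane 64-bit vectors nor any f64.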
9912 #define HWY_IF_LOAD_INT(D) \
9913 HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D), \
9914 hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \
9915 nullptr
9916 #define HWY_NEON_DEF_FUNCTION_LOAD_INT(name, prefix, infix, args) \
9917 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \
9918 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \
9919 HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \
9920 HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \
9921 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \
9922 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args)
9923 #endif // HWY_ARCH_ARM_A64
9924
9925 // Must return raw tuple because Tuple2 lacks a ctor, and we cannot use
9926 // brace-initialization in HWY_NEON_DEF_FUNCTION because some functions return
9927 // void.
9928 #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
9929 decltype(Tuple2<type##_t, size>().raw)
9930 // Tuple tag arg allows overloading (cannot just overload on return type)
9931 #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
9932 const NativeLaneType<type##_t>*from, Tuple2<type##_t, size>
9933 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved2, vld2, _, HWY_LOAD_INT)
9934 #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
9935 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
9936
9937 #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
9938 decltype(Tuple3<type##_t, size>().raw)
9939 #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
9940 const NativeLaneType<type##_t>*from, Tuple3<type##_t, size>
9941 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved3, vld3, _, HWY_LOAD_INT)
9942 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
9943 #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
9944
9945 #define HWY_NEON_BUILD_RET_HWY_LOAD_INT(type, size) \
9946 decltype(Tuple4<type##_t, size>().raw)
9947 #define HWY_NEON_BUILD_PARAM_HWY_LOAD_INT(type, size) \
9948 const NativeLaneType<type##_t>*from, Tuple4<type##_t, size>
9949 HWY_NEON_DEF_FUNCTION_LOAD_INT(LoadInterleaved4, vld4, _, HWY_LOAD_INT)
9950 #undef HWY_NEON_BUILD_PARAM_HWY_LOAD_INT
9951 #undef HWY_NEON_BUILD_RET_HWY_LOAD_INT
9952
9953 #undef HWY_NEON_DEF_FUNCTION_LOAD_INT
9954 #undef HWY_NEON_BUILD_TPL_HWY_LOAD_INT
9955 #undef HWY_NEON_BUILD_ARG_HWY_LOAD_INT
9956
9957 } // namespace detail
9958
9959 template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
9960 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
9961 VFromD<D>& v0, VFromD<D>& v1) {
9962 auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(unaligned),
9963 detail::Tuple2<T, d.MaxLanes()>());
9964 v0 = VFromD<D>(raw.val[0]);
9965 v1 = VFromD<D>(raw.val[1]);
9966 }
9967
9968 // <= 32 bits: avoid loading more than N bytes by copying to buffer
9969 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
9970 typename T = TFromD<D>>
9971 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
9972 VFromD<D>& v0, VFromD<D>& v1) {
9973 // The smallest vector registers are 64 bits and we want space for two.
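// For example, with 4 uint8_t lanes, only d.MaxBytes() * 2 = 8 bytes are
// copied into the zero-initialized buffer, so the vld2 below reads 16 bytes
// from buf rather than past the caller's N bytes.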
9974 alignas(16) T buf[2 * 8 / sizeof(T)] = {};
9975 CopyBytes<d.MaxBytes() * 2>(unaligned, buf);
9976 auto raw = detail::LoadInterleaved2(detail::NativeLanePointer(buf),
9977 detail::Tuple2<T, d.MaxLanes()>());
9978 v0 = VFromD<D>(raw.val[0]);
9979 v1 = VFromD<D>(raw.val[1]);
9980 }
9981
9982 #if HWY_ARCH_ARM_V7
9983 // 64x2: split into two 64x1
9984 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
9985 HWY_NEON_IF_NOT_EMULATED_D(D)>
9986 HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
9987 Vec128<T>& v0, Vec128<T>& v1) {
9988 const Half<decltype(d)> dh;
9989 VFromD<decltype(dh)> v00, v10, v01, v11;
9990 LoadInterleaved2(dh, detail::NativeLanePointer(unaligned), v00, v10);
9991 LoadInterleaved2(dh, detail::NativeLanePointer(unaligned + 2), v01, v11);
9992 v0 = Combine(d, v01, v00);
9993 v1 = Combine(d, v11, v10);
9994 }
9995 #endif // HWY_ARCH_ARM_V7
9996
9997 // ------------------------------ LoadInterleaved3
9998
9999 template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
10000 HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
10001 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
10002 auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(unaligned),
10003 detail::Tuple3<T, d.MaxLanes()>());
10004 v0 = VFromD<D>(raw.val[0]);
10005 v1 = VFromD<D>(raw.val[1]);
10006 v2 = VFromD<D>(raw.val[2]);
10007 }
10008
10009 // <= 32 bits: avoid loading more than N bytes by copying to buffer
10010 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
10011 typename T = TFromD<D>>
10012 HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
10013 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
10014 // The smallest vector registers are 64 bits and we want space for three.
10015 alignas(16) T buf[3 * 8 / sizeof(T)] = {};
10016 CopyBytes<d.MaxBytes() * 3>(unaligned, buf);
10017 auto raw = detail::LoadInterleaved3(detail::NativeLanePointer(buf),
10018 detail::Tuple3<T, d.MaxLanes()>());
10019 v0 = VFromD<D>(raw.val[0]);
10020 v1 = VFromD<D>(raw.val[1]);
10021 v2 = VFromD<D>(raw.val[2]);
10022 }
10023
10024 #if HWY_ARCH_ARM_V7
10025 // 64x2: split into two 64x1
10026 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
10027 HWY_NEON_IF_NOT_EMULATED_D(D)>
10028 HWY_API void LoadInterleaved3(D d, const T* HWY_RESTRICT unaligned,
10029 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2) {
10030 const Half<decltype(d)> dh;
10031 VFromD<decltype(dh)> v00, v10, v20, v01, v11, v21;
10032 LoadInterleaved3(dh, detail::NativeLanePointer(unaligned), v00, v10, v20);
10033 LoadInterleaved3(dh, detail::NativeLanePointer(unaligned + 3), v01, v11, v21);
10034 v0 = Combine(d, v01, v00);
10035 v1 = Combine(d, v11, v10);
10036 v2 = Combine(d, v21, v20);
10037 }
10038 #endif // HWY_ARCH_ARM_V7
10039
10040 // ------------------------------ LoadInterleaved4
10041
10042 template <class D, HWY_IF_LOAD_INT(D), typename T = TFromD<D>>
10043 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
10044 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
10045 VFromD<D>& v3) {
10046 auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(unaligned),
10047 detail::Tuple4<T, d.MaxLanes()>());
10048 v0 = VFromD<D>(raw.val[0]);
10049 v1 = VFromD<D>(raw.val[1]);
10050 v2 = VFromD<D>(raw.val[2]);
10051 v3 = VFromD<D>(raw.val[3]);
10052 }
10053
10054 // <= 32 bits: avoid loading more than N bytes by copying to buffer
10055 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
10056 typename T = TFromD<D>>
10057 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
10058 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
10059 VFromD<D>& v3) {
10060 alignas(16) T buf[4 * 8 / sizeof(T)] = {};
10061 CopyBytes<d.MaxBytes() * 4>(unaligned, buf);
10062 auto raw = detail::LoadInterleaved4(detail::NativeLanePointer(buf),
10063 detail::Tuple4<T, d.MaxLanes()>());
10064 v0 = VFromD<D>(raw.val[0]);
10065 v1 = VFromD<D>(raw.val[1]);
10066 v2 = VFromD<D>(raw.val[2]);
10067 v3 = VFromD<D>(raw.val[3]);
10068 }
10069
10070 #if HWY_ARCH_ARM_V7
10071 // 64x2: split into two 64x1
10072 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
10073 HWY_NEON_IF_NOT_EMULATED_D(D)>
10074 HWY_API void LoadInterleaved4(D d, const T* HWY_RESTRICT unaligned,
10075 Vec128<T>& v0, Vec128<T>& v1, Vec128<T>& v2,
10076 Vec128<T>& v3) {
10077 const Half<decltype(d)> dh;
10078 VFromD<decltype(dh)> v00, v10, v20, v30, v01, v11, v21, v31;
10079 LoadInterleaved4(dh, detail::NativeLanePointer(unaligned), v00, v10, v20,
10080 v30);
10081 LoadInterleaved4(dh, detail::NativeLanePointer(unaligned + 4), v01, v11, v21,
10082 v31);
10083 v0 = Combine(d, v01, v00);
10084 v1 = Combine(d, v11, v10);
10085 v2 = Combine(d, v21, v20);
10086 v3 = Combine(d, v31, v30);
10087 }
10088 #endif // HWY_ARCH_ARM_V7
10089
10090 #undef HWY_IF_LOAD_INT
10091
10092 // ------------------------------ StoreInterleaved2
10093
10094 namespace detail {
10095 #define HWY_NEON_BUILD_TPL_HWY_STORE_INT
10096 #define HWY_NEON_BUILD_RET_HWY_STORE_INT(type, size) void
10097 #define HWY_NEON_BUILD_ARG_HWY_STORE_INT to, tup.raw
10098
10099 #if HWY_ARCH_ARM_A64
10100 #define HWY_IF_STORE_INT(D) \
10101 HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D)
10102 #define HWY_NEON_DEF_FUNCTION_STORE_INT(name,
prefix, infix, args) \ 10103 HWY_NEON_DEF_FUNCTION_ALL_TYPES(name, prefix, infix, args) \ 10104 HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) 10105 #else 10106 // Exclude 64x2 and f64x1, which are only supported on aarch64; also exclude any 10107 // emulated types. 10108 #define HWY_IF_STORE_INT(D) \ 10109 HWY_IF_V_SIZE_GT_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D), \ 10110 hwy::EnableIf<(HWY_MAX_LANES_D(D) == 1 || sizeof(TFromD<D>) < 8)>* = \ 10111 nullptr 10112 #define HWY_NEON_DEF_FUNCTION_STORE_INT(name, prefix, infix, args) \ 10113 HWY_NEON_DEF_FUNCTION_INT_8_16_32(name, prefix, infix, args) \ 10114 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(name, prefix, infix, args) \ 10115 HWY_NEON_DEF_FUNCTION_BFLOAT_16(name, prefix, infix, args) \ 10116 HWY_NEON_DEF_FUNCTION_FLOAT_16_32(name, prefix, infix, args) \ 10117 HWY_NEON_DEF_FUNCTION(int64, 1, name, prefix, infix, s64, args) \ 10118 HWY_NEON_DEF_FUNCTION(uint64, 1, name, prefix, infix, u64, args) 10119 #endif // HWY_ARCH_ARM_A64 10120 10121 #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ 10122 Tuple2<type##_t, size> tup, NativeLaneType<type##_t>*to 10123 HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved2, vst2, _, HWY_STORE_INT) 10124 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT 10125 10126 #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ 10127 Tuple3<type##_t, size> tup, NativeLaneType<type##_t>*to 10128 HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved3, vst3, _, HWY_STORE_INT) 10129 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT 10130 10131 #define HWY_NEON_BUILD_PARAM_HWY_STORE_INT(type, size) \ 10132 Tuple4<type##_t, size> tup, NativeLaneType<type##_t>*to 10133 HWY_NEON_DEF_FUNCTION_STORE_INT(StoreInterleaved4, vst4, _, HWY_STORE_INT) 10134 #undef HWY_NEON_BUILD_PARAM_HWY_STORE_INT 10135 10136 #undef HWY_NEON_DEF_FUNCTION_STORE_INT 10137 #undef HWY_NEON_BUILD_TPL_HWY_STORE_INT 10138 #undef HWY_NEON_BUILD_RET_HWY_STORE_INT 10139 #undef HWY_NEON_BUILD_ARG_HWY_STORE_INT 10140 } // namespace detail 10141 10142 template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>> 10143 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, 10144 T* HWY_RESTRICT unaligned) { 10145 detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}}; 10146 detail::StoreInterleaved2(tup, detail::NativeLanePointer(unaligned)); 10147 } 10148 10149 // <= 32 bits: avoid writing more than N bytes by copying to buffer 10150 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D), 10151 typename T = TFromD<D>> 10152 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d, 10153 T* HWY_RESTRICT unaligned) { 10154 alignas(16) T buf[2 * 8 / sizeof(T)]; 10155 detail::Tuple2<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw}}}; 10156 detail::StoreInterleaved2(tup, detail::NativeLanePointer(buf)); 10157 CopyBytes<d.MaxBytes() * 2>(buf, unaligned); 10158 } 10159 10160 #if HWY_ARCH_ARM_V7 10161 // 64x2: split into two 64x1 10162 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8), 10163 HWY_NEON_IF_NOT_EMULATED_D(D)> 10164 HWY_API void StoreInterleaved2(Vec128<T> v0, Vec128<T> v1, D d, 10165 T* HWY_RESTRICT unaligned) { 10166 const Half<decltype(d)> dh; 10167 StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), dh, 10168 detail::NativeLanePointer(unaligned)); 10169 StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), dh, 10170 detail::NativeLanePointer(unaligned + 2)); 10171 } 10172 #endif // HWY_ARCH_ARM_V7 10173 10174 // ------------------------------ StoreInterleaved3 10175 10176 template <class D, 
HWY_IF_STORE_INT(D), typename T = TFromD<D>>
10177 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
10178 T* HWY_RESTRICT unaligned) {
10179 detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
10180 detail::StoreInterleaved3(tup, detail::NativeLanePointer(unaligned));
10181 }
10182
10183 // <= 32 bits: avoid writing more than N bytes by copying to buffer
10184 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
10185 typename T = TFromD<D>>
10186 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
10187 T* HWY_RESTRICT unaligned) {
10188 alignas(16) T buf[3 * 8 / sizeof(T)];
10189 detail::Tuple3<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw}}};
10190 detail::StoreInterleaved3(tup, detail::NativeLanePointer(buf));
10191 CopyBytes<d.MaxBytes() * 3>(buf, unaligned);
10192 }
10193
10194 #if HWY_ARCH_ARM_V7
10195 // 64x2: split into two 64x1
10196 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
10197 HWY_NEON_IF_NOT_EMULATED_D(D)>
10198 HWY_API void StoreInterleaved3(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2, D d,
10199 T* HWY_RESTRICT unaligned) {
10200 const Half<decltype(d)> dh;
10201 StoreInterleaved3(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2), dh,
10202 detail::NativeLanePointer(unaligned));
10203 StoreInterleaved3(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2), dh,
10204 detail::NativeLanePointer(unaligned + 3));
10205 }
10206 #endif // HWY_ARCH_ARM_V7
10207
10208 // ------------------------------ StoreInterleaved4
10209
10210 template <class D, HWY_IF_STORE_INT(D), typename T = TFromD<D>>
10211 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
10212 VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
10213 detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
10214 detail::StoreInterleaved4(tup, detail::NativeLanePointer(unaligned));
10215 }
10216
10217 // <= 32 bits: avoid writing more than N bytes by copying to buffer
10218 template <class D, HWY_IF_V_SIZE_LE_D(D, 4), HWY_NEON_IF_NOT_EMULATED_D(D),
10219 typename T = TFromD<D>>
10220 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
10221 VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
10222 alignas(16) T buf[4 * 8 / sizeof(T)];
10223 detail::Tuple4<T, d.MaxLanes()> tup = {{{v0.raw, v1.raw, v2.raw, v3.raw}}};
10224 detail::StoreInterleaved4(tup, detail::NativeLanePointer(buf));
10225 CopyBytes<d.MaxBytes() * 4>(buf, unaligned);
10226 }
10227
10228 #if HWY_ARCH_ARM_V7
10229 // 64x2: split into two 64x1
10230 template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE(T, 8),
10231 HWY_NEON_IF_NOT_EMULATED_D(D)>
10232 HWY_API void StoreInterleaved4(Vec128<T> v0, Vec128<T> v1, Vec128<T> v2,
10233 Vec128<T> v3, D d, T* HWY_RESTRICT unaligned) {
10234 const Half<decltype(d)> dh;
10235 StoreInterleaved4(LowerHalf(dh, v0), LowerHalf(dh, v1), LowerHalf(dh, v2),
10236 LowerHalf(dh, v3), dh,
10237 detail::NativeLanePointer(unaligned));
10238 StoreInterleaved4(UpperHalf(dh, v0), UpperHalf(dh, v1), UpperHalf(dh, v2),
10239 UpperHalf(dh, v3), dh,
10240 detail::NativeLanePointer(unaligned + 4));
10241 }
10242 #endif // HWY_ARCH_ARM_V7
10243
10244 #undef HWY_IF_STORE_INT
10245
10246 // Fall back on generic Load/StoreInterleaved[234] for any emulated types.
10247 // Requires that HWY_GENERIC_IF_EMULATED_D mirror HWY_NEON_IF_EMULATED_D.
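// Example usage (an illustrative sketch; `rgb` names a caller-provided
// buffer of at least 48 interleaved bytes and is not part of this API):
//
//   const Full128<uint8_t> d;            // 16 pixels per iteration
//   Vec128<uint8_t> r, g, b;
//   LoadInterleaved3(d, rgb, r, g, b);   // planar r/g/b from packed RGB
//   StoreInterleaved3(r, g, b, d, rgb);  // and back to packed RGB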
10248 10249 // ------------------------------ Additional mask logical operations 10250 template <class T> 10251 HWY_API Mask128<T, 1> SetAtOrAfterFirst(Mask128<T, 1> mask) { 10252 return mask; 10253 } 10254 template <class T> 10255 HWY_API Mask128<T, 2> SetAtOrAfterFirst(Mask128<T, 2> mask) { 10256 const FixedTag<T, 2> d; 10257 const auto vmask = VecFromMask(d, mask); 10258 return MaskFromVec(Or(vmask, InterleaveLower(vmask, vmask))); 10259 } 10260 template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)> 10261 HWY_API Mask128<T, N> SetAtOrAfterFirst(Mask128<T, N> mask) { 10262 const Simd<T, N, 0> d; 10263 const auto vmask = VecFromMask(d, mask); 10264 const auto neg_vmask = 10265 ResizeBitCast(d, Neg(ResizeBitCast(Full64<int64_t>(), vmask))); 10266 return MaskFromVec(Or(vmask, neg_vmask)); 10267 } 10268 template <class T, HWY_IF_NOT_T_SIZE(T, 8)> 10269 HWY_API Mask128<T> SetAtOrAfterFirst(Mask128<T> mask) { 10270 const Full128<T> d; 10271 const Repartition<int64_t, decltype(d)> di64; 10272 10273 auto vmask = BitCast(di64, VecFromMask(d, mask)); 10274 vmask = Or(vmask, Neg(vmask)); 10275 10276 // Copy the sign bit of the first int64_t lane to the second int64_t lane 10277 const auto vmask2 = BroadcastSignBit(InterleaveLower(Zero(di64), vmask)); 10278 return MaskFromVec(BitCast(d, Or(vmask, vmask2))); 10279 } 10280 10281 template <class T, size_t N> 10282 HWY_API Mask128<T, N> SetBeforeFirst(Mask128<T, N> mask) { 10283 return Not(SetAtOrAfterFirst(mask)); 10284 } 10285 10286 template <class T> 10287 HWY_API Mask128<T, 1> SetOnlyFirst(Mask128<T, 1> mask) { 10288 return mask; 10289 } 10290 template <class T> 10291 HWY_API Mask128<T, 2> SetOnlyFirst(Mask128<T, 2> mask) { 10292 const FixedTag<T, 2> d; 10293 const RebindToSigned<decltype(d)> di; 10294 10295 const auto vmask = BitCast(di, VecFromMask(d, mask)); 10296 const auto zero = Zero(di); 10297 const auto vmask2 = VecFromMask(di, InterleaveLower(zero, vmask) == zero); 10298 return MaskFromVec(BitCast(d, And(vmask, vmask2))); 10299 } 10300 template <class T, size_t N, HWY_IF_LANES_GT(N, 2), HWY_IF_V_SIZE_LE(T, N, 8)> 10301 HWY_API Mask128<T, N> SetOnlyFirst(Mask128<T, N> mask) { 10302 const Simd<T, N, 0> d; 10303 const RebindToSigned<decltype(d)> di; 10304 10305 const auto vmask = ResizeBitCast(Full64<int64_t>(), VecFromMask(d, mask)); 10306 const auto only_first_vmask = 10307 BitCast(d, Neg(ResizeBitCast(di, And(vmask, Neg(vmask))))); 10308 return MaskFromVec(only_first_vmask); 10309 } 10310 template <class T, HWY_IF_NOT_T_SIZE(T, 8)> 10311 HWY_API Mask128<T> SetOnlyFirst(Mask128<T> mask) { 10312 const Full128<T> d; 10313 const RebindToSigned<decltype(d)> di; 10314 const Repartition<int64_t, decltype(d)> di64; 10315 10316 const auto zero = Zero(di64); 10317 const auto vmask = BitCast(di64, VecFromMask(d, mask)); 10318 const auto vmask2 = VecFromMask(di64, InterleaveLower(zero, vmask) == zero); 10319 const auto only_first_vmask = Neg(BitCast(di, And(vmask, Neg(vmask)))); 10320 return MaskFromVec(BitCast(d, And(only_first_vmask, BitCast(di, vmask2)))); 10321 } 10322 10323 template <class T> 10324 HWY_API Mask128<T, 1> SetAtOrBeforeFirst(Mask128<T, 1> /*mask*/) { 10325 const FixedTag<T, 1> d; 10326 const RebindToSigned<decltype(d)> di; 10327 using TI = MakeSigned<T>; 10328 10329 return RebindMask(d, MaskFromVec(Set(di, TI(-1)))); 10330 } 10331 template <class T, size_t N, HWY_IF_LANES_GT(N, 1)> 10332 HWY_API Mask128<T, N> SetAtOrBeforeFirst(Mask128<T, N> mask) { 10333 const Simd<T, N, 0> d; 10334 return 
SetBeforeFirst(MaskFromVec(ShiftLeftLanes<1>(VecFromMask(d, mask)))); 10335 } 10336 10337 // ------------------------------ Lt128 10338 10339 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 10340 HWY_INLINE MFromD<D> Lt128(D d, VFromD<D> a, VFromD<D> b) { 10341 static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64"); 10342 // Truth table of Eq and Lt for Hi and Lo u64. 10343 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true) 10344 // =H =L cH cL | out = cH | (=H & cL) 10345 // 0 0 0 0 | 0 10346 // 0 0 0 1 | 0 10347 // 0 0 1 0 | 1 10348 // 0 0 1 1 | 1 10349 // 0 1 0 0 | 0 10350 // 0 1 0 1 | 0 10351 // 0 1 1 0 | 1 10352 // 1 0 0 0 | 0 10353 // 1 0 0 1 | 1 10354 // 1 1 0 0 | 0 10355 const MFromD<D> eqHL = Eq(a, b); 10356 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); 10357 // We need to bring cL to the upper lane/bit corresponding to cH. Comparing 10358 // the result of InterleaveUpper/Lower requires 9 ops, whereas shifting the 10359 // comparison result leftwards requires only 4. IfThenElse compiles to the 10360 // same code as OrAnd(). 10361 const VFromD<D> ltLx = DupEven(ltHL); 10362 const VFromD<D> outHx = IfThenElse(eqHL, ltLx, ltHL); 10363 return MaskFromVec(DupOdd(outHx)); 10364 } 10365 10366 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 10367 HWY_INLINE MFromD<D> Lt128Upper(D d, VFromD<D> a, VFromD<D> b) { 10368 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b)); 10369 return MaskFromVec(InterleaveUpper(d, ltHL, ltHL)); 10370 } 10371 10372 // ------------------------------ Eq128 10373 10374 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 10375 HWY_INLINE MFromD<D> Eq128(D d, VFromD<D> a, VFromD<D> b) { 10376 static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64"); 10377 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); 10378 return MaskFromVec(And(Reverse2(d, eqHL), eqHL)); 10379 } 10380 10381 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 10382 HWY_INLINE MFromD<D> Eq128Upper(D d, VFromD<D> a, VFromD<D> b) { 10383 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b)); 10384 return MaskFromVec(InterleaveUpper(d, eqHL, eqHL)); 10385 } 10386 10387 // ------------------------------ Ne128 10388 10389 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 10390 HWY_INLINE MFromD<D> Ne128(D d, VFromD<D> a, VFromD<D> b) { 10391 static_assert(IsSame<TFromD<D>, uint64_t>(), "T must be u64"); 10392 const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); 10393 return MaskFromVec(Or(Reverse2(d, neHL), neHL)); 10394 } 10395 10396 template <class D, HWY_IF_V_SIZE_LE_D(D, 16)> 10397 HWY_INLINE MFromD<D> Ne128Upper(D d, VFromD<D> a, VFromD<D> b) { 10398 const VFromD<D> neHL = VecFromMask(d, Ne(a, b)); 10399 return MaskFromVec(InterleaveUpper(d, neHL, neHL)); 10400 } 10401 10402 // ------------------------------ Min128, Max128 (Lt128) 10403 10404 // Without a native OddEven, it seems infeasible to go faster than Lt128. 
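// Each pair of u64 lanes forms one 128-bit value, with the upper half in the
// odd lane. For example, a = {5, 1} and b = {3, 2} (listed lo, hi): the high
// halves 1 < 2 decide the comparison, so Lt128(d, a, b) is all-true and
// Min128(d, a, b) returns a.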
10405 template <class D> 10406 HWY_INLINE VFromD<D> Min128(D d, VFromD<D> a, VFromD<D> b) { 10407 return IfThenElse(Lt128(d, a, b), a, b); 10408 } 10409 10410 template <class D> 10411 HWY_INLINE VFromD<D> Max128(D d, VFromD<D> a, VFromD<D> b) { 10412 return IfThenElse(Lt128(d, b, a), a, b); 10413 } 10414 10415 template <class D> 10416 HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) { 10417 return IfThenElse(Lt128Upper(d, a, b), a, b); 10418 } 10419 10420 template <class D> 10421 HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) { 10422 return IfThenElse(Lt128Upper(d, b, a), a, b); 10423 } 10424 10425 // -------------------- LeadingZeroCount, TrailingZeroCount, HighestSetBitIndex 10426 10427 #ifdef HWY_NATIVE_LEADING_ZERO_COUNT 10428 #undef HWY_NATIVE_LEADING_ZERO_COUNT 10429 #else 10430 #define HWY_NATIVE_LEADING_ZERO_COUNT 10431 #endif 10432 10433 HWY_NEON_DEF_FUNCTION_INT_8_16_32(LeadingZeroCount, vclz, _, 1) 10434 HWY_NEON_DEF_FUNCTION_UINT_8_16_32(LeadingZeroCount, vclz, _, 1) 10435 10436 template <class V, HWY_IF_UI64_D(DFromV<V>)> 10437 HWY_API V LeadingZeroCount(V v) { 10438 const DFromV<decltype(v)> d; 10439 const RebindToUnsigned<decltype(d)> du; 10440 const Repartition<uint32_t, decltype(d)> du32; 10441 10442 const auto v_k32 = BitCast(du32, Set(du, 32)); 10443 const auto v_u32_lzcnt = LeadingZeroCount(BitCast(du32, v)) + v_k32; 10444 const auto v_u32_lo_lzcnt = 10445 And(v_u32_lzcnt, BitCast(du32, Set(du, 0xFFFFFFFFu))); 10446 const auto v_u32_hi_lzcnt = 10447 BitCast(du32, ShiftRight<32>(BitCast(du, v_u32_lzcnt))); 10448 10449 return BitCast( 10450 d, IfThenElse(v_u32_hi_lzcnt == v_k32, v_u32_lo_lzcnt, v_u32_hi_lzcnt)); 10451 } 10452 10453 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)> 10454 HWY_API V HighestSetBitIndex(V v) { 10455 const DFromV<decltype(v)> d; 10456 using T = TFromD<decltype(d)>; 10457 return BitCast(d, Set(d, T{sizeof(T) * 8 - 1}) - LeadingZeroCount(v)); 10458 } 10459 10460 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), HWY_IF_T_SIZE_V(V, 1)> 10461 HWY_API V TrailingZeroCount(V v) { 10462 return LeadingZeroCount(ReverseBits(v)); 10463 } 10464 10465 template <class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V), 10466 HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 2) | (1 << 4) | (1 << 8))> 10467 HWY_API V TrailingZeroCount(V v) { 10468 const DFromV<decltype(v)> d; 10469 const Repartition<uint8_t, decltype(d)> du8; 10470 return LeadingZeroCount( 10471 ReverseLaneBytes(BitCast(d, ReverseBits(BitCast(du8, v))))); 10472 } 10473 10474 namespace detail { // for code folding 10475 #if HWY_ARCH_ARM_V7 10476 #undef vuzp1_s8 10477 #undef vuzp1_u8 10478 #undef vuzp1_s16 10479 #undef vuzp1_u16 10480 #undef vuzp1_s32 10481 #undef vuzp1_u32 10482 #undef vuzp1_f32 10483 #undef vuzp1q_s8 10484 #undef vuzp1q_u8 10485 #undef vuzp1q_s16 10486 #undef vuzp1q_u16 10487 #undef vuzp1q_s32 10488 #undef vuzp1q_u32 10489 #undef vuzp1q_f32 10490 #undef vuzp2_s8 10491 #undef vuzp2_u8 10492 #undef vuzp2_s16 10493 #undef vuzp2_u16 10494 #undef vuzp2_s32 10495 #undef vuzp2_u32 10496 #undef vuzp2_f32 10497 #undef vuzp2q_s8 10498 #undef vuzp2q_u8 10499 #undef vuzp2q_s16 10500 #undef vuzp2q_u16 10501 #undef vuzp2q_s32 10502 #undef vuzp2q_u32 10503 #undef vuzp2q_f32 10504 #undef vzip1_s8 10505 #undef vzip1_u8 10506 #undef vzip1_s16 10507 #undef vzip1_u16 10508 #undef vzip1_s32 10509 #undef vzip1_u32 10510 #undef vzip1_f32 10511 #undef vzip1q_s8 10512 #undef vzip1q_u8 10513 #undef vzip1q_s16 10514 #undef vzip1q_u16 10515 #undef vzip1q_s32 10516 #undef vzip1q_u32 10517 #undef 
vzip1q_f32 10518 #undef vzip2_s8 10519 #undef vzip2_u8 10520 #undef vzip2_s16 10521 #undef vzip2_u16 10522 #undef vzip2_s32 10523 #undef vzip2_u32 10524 #undef vzip2_f32 10525 #undef vzip2q_s8 10526 #undef vzip2q_u8 10527 #undef vzip2q_s16 10528 #undef vzip2q_u16 10529 #undef vzip2q_s32 10530 #undef vzip2q_u32 10531 #undef vzip2q_f32 10532 #endif 10533 10534 #undef HWY_NEON_BUILD_ARG_1 10535 #undef HWY_NEON_BUILD_ARG_2 10536 #undef HWY_NEON_BUILD_ARG_3 10537 #undef HWY_NEON_BUILD_PARAM_1 10538 #undef HWY_NEON_BUILD_PARAM_2 10539 #undef HWY_NEON_BUILD_PARAM_3 10540 #undef HWY_NEON_BUILD_RET_1 10541 #undef HWY_NEON_BUILD_RET_2 10542 #undef HWY_NEON_BUILD_RET_3 10543 #undef HWY_NEON_BUILD_TPL_1 10544 #undef HWY_NEON_BUILD_TPL_2 10545 #undef HWY_NEON_BUILD_TPL_3 10546 #undef HWY_NEON_DEF_FUNCTION 10547 #undef HWY_NEON_DEF_FUNCTION_ALL_FLOATS 10548 #undef HWY_NEON_DEF_FUNCTION_ALL_TYPES 10549 #undef HWY_NEON_DEF_FUNCTION_BFLOAT_16 10550 #undef HWY_NEON_DEF_FUNCTION_FLOAT_16 10551 #undef HWY_NEON_DEF_FUNCTION_FLOAT_16_32 10552 #undef HWY_NEON_DEF_FUNCTION_FLOAT_32 10553 #undef HWY_NEON_DEF_FUNCTION_FLOAT_64 10554 #undef HWY_NEON_DEF_FUNCTION_FULL_UI 10555 #undef HWY_NEON_DEF_FUNCTION_FULL_UI_64 10556 #undef HWY_NEON_DEF_FUNCTION_FULL_UIF_64 10557 #undef HWY_NEON_DEF_FUNCTION_INT_16 10558 #undef HWY_NEON_DEF_FUNCTION_INT_32 10559 #undef HWY_NEON_DEF_FUNCTION_INT_64 10560 #undef HWY_NEON_DEF_FUNCTION_INT_8 10561 #undef HWY_NEON_DEF_FUNCTION_INT_8_16_32 10562 #undef HWY_NEON_DEF_FUNCTION_INTS 10563 #undef HWY_NEON_DEF_FUNCTION_INTS_UINTS 10564 #undef HWY_NEON_DEF_FUNCTION_UI_8_16_32 10565 #undef HWY_NEON_DEF_FUNCTION_UIF_64 10566 #undef HWY_NEON_DEF_FUNCTION_UIF_8_16_32 10567 #undef HWY_NEON_DEF_FUNCTION_UINT_16 10568 #undef HWY_NEON_DEF_FUNCTION_UINT_32 10569 #undef HWY_NEON_DEF_FUNCTION_UINT_64 10570 #undef HWY_NEON_DEF_FUNCTION_UINT_8 10571 #undef HWY_NEON_DEF_FUNCTION_UINT_8_16_32 10572 #undef HWY_NEON_DEF_FUNCTION_UINTS 10573 #undef HWY_NEON_EVAL 10574 #undef HWY_NEON_IF_EMULATED_D 10575 #undef HWY_NEON_IF_NOT_EMULATED_D 10576 } // namespace detail 10577 10578 // NOLINTNEXTLINE(google-readability-namespace-comments) 10579 } // namespace HWY_NAMESPACE 10580 } // namespace hwy 10581 HWY_AFTER_NAMESPACE();