bit_pack-inl.h (114998B)
1 // Copyright 2022 Google LLC 2 // SPDX-License-Identifier: Apache-2.0 3 // 4 // Licensed under the Apache License, Version 2.0 (the "License"); 5 // you may not use this file except in compliance with the License. 6 // You may obtain a copy of the License at 7 // 8 // http://www.apache.org/licenses/LICENSE-2.0 9 // 10 // Unless required by applicable law or agreed to in writing, software 11 // distributed under the License is distributed on an "AS IS" BASIS, 12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 // See the License for the specific language governing permissions and 14 // limitations under the License. 15 16 #include <stddef.h> 17 #include <stdint.h> 18 19 #include "hwy/base.h" 20 21 // Per-target include guard 22 // clang-format off 23 #if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == defined(HWY_TARGET_TOGGLE) // NOLINT 24 // clang-format on 25 #ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ 26 #undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ 27 #else 28 #define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_ 29 #endif 30 31 #include "hwy/highway.h" 32 33 HWY_BEFORE_NAMESPACE(); 34 namespace hwy { 35 namespace HWY_NAMESPACE { 36 37 // The entry points are class templates specialized below for each number of 38 // bits. Each provides Pack and Unpack member functions which load (Pack) or 39 // store (Unpack) B raw vectors, and store (Pack) or load (Unpack) a number of 40 // packed vectors equal to kBits. B denotes the bits per lane: 8 for Pack8, 16 41 // for Pack16, 32 for Pack32 which is also the upper bound for kBits. 
42 template <size_t kBits> // <= 8 43 struct Pack8 {}; 44 template <size_t kBits> // <= 16 45 struct Pack16 {}; 46 47 template <> 48 struct Pack8<1> { 49 template <class D8> 50 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, 51 uint8_t* HWY_RESTRICT packed_out) const { 52 const RepartitionToWide<decltype(d8)> d16; 53 using VU16 = Vec<decltype(d16)>; 54 const size_t N8 = Lanes(d8); 55 // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). 56 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); 57 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); 58 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); 59 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); 60 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); 61 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); 62 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); 63 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); 64 65 const VU16 packed = 66 Xor3(Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)), 67 Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)), 68 Xor3(ShiftLeft<2>(raw2), ShiftLeft<1>(raw1), raw0)); 69 StoreU(BitCast(d8, packed), d8, packed_out); 70 } 71 72 template <class D8> 73 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, 74 uint8_t* HWY_RESTRICT raw) const { 75 const RepartitionToWide<decltype(d8)> d16; 76 using VU16 = Vec<decltype(d16)>; 77 const size_t N8 = Lanes(d8); 78 const VU16 mask = Set(d16, 0x0101u); // LSB in each byte 79 80 const VU16 packed = BitCast(d16, LoadU(d8, packed_in)); 81 82 const VU16 raw0 = And(packed, mask); 83 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8); 84 85 const VU16 raw1 = And(ShiftRight<1>(packed), mask); 86 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8); 87 88 const VU16 raw2 = And(ShiftRight<2>(packed), mask); 89 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); 90 91 const VU16 raw3 = And(ShiftRight<3>(packed), mask); 92 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); 93 94 const VU16 
raw4 = And(ShiftRight<4>(packed), mask); 95 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); 96 97 const VU16 raw5 = And(ShiftRight<5>(packed), mask); 98 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); 99 100 const VU16 raw6 = And(ShiftRight<6>(packed), mask); 101 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); 102 103 const VU16 raw7 = And(ShiftRight<7>(packed), mask); 104 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); 105 } 106 }; // Pack8<1> 107 108 template <> 109 struct Pack8<2> { 110 template <class D8> 111 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, 112 uint8_t* HWY_RESTRICT packed_out) const { 113 const RepartitionToWide<decltype(d8)> d16; 114 using VU16 = Vec<decltype(d16)>; 115 const size_t N8 = Lanes(d8); 116 // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). 117 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); 118 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); 119 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); 120 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); 121 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); 122 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); 123 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); 124 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); 125 126 const VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<4>(raw4), 127 Or(ShiftLeft<2>(raw2), raw0)); 128 const VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<4>(raw5), 129 Or(ShiftLeft<2>(raw3), raw1)); 130 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); 131 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); 132 } 133 134 template <class D8> 135 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, 136 uint8_t* HWY_RESTRICT raw) const { 137 const RepartitionToWide<decltype(d8)> d16; 138 using VU16 = Vec<decltype(d16)>; 139 const size_t N8 = Lanes(d8); 140 const VU16 mask = Set(d16, 0x0303u); // Lowest 2 bits per byte 141 142 const VU16 packed0 = BitCast(d16, 
// 3-bit lanes: eight raw vectors pack into three vectors. raw0..2 and
// raw4..6 occupy fixed 3-bit fields; raw3 and raw7 are first combined into a
// 6-bit value (packed3) whose bit pairs are scattered into the top two bits
// of the three packed vectors.
template <>
struct Pack8<3> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    // The upper two bits of these three will be filled with packed3 (6 bits).
    VU16 packed0 = Or(ShiftLeft<3>(raw4), raw0);
    VU16 packed1 = Or(ShiftLeft<3>(raw5), raw1);
    VU16 packed2 = Or(ShiftLeft<3>(raw6), raw2);
    const VU16 packed3 = Or(ShiftLeft<3>(raw7), raw3);

    // Scatter packed3's bits [5:4] into packed0, [3:2] into packed1 and
    // [1:0] into packed2, each landing in the hi2 (0xC0) position.
    // OrAnd(a, b, c) == Or(a, And(b, c)).
    const VU16 hi2 = Set(d16, 0xC0C0u);
    packed0 = OrAnd(packed0, ShiftLeft<2>(packed3), hi2);
    packed1 = OrAnd(packed1, ShiftLeft<4>(packed3), hi2);
    packed2 = OrAnd(packed2, ShiftLeft<6>(packed3), hi2);
    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x0707u);  // Lowest 3 bits per byte

    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));

    // raw0..2 sit in the low 3 bits of packed0..2.
    const VU16 raw0 = And(packed0, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);

    const VU16 raw1 = And(packed1, mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);

    const VU16 raw2 = And(packed2, mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);

    // raw4..6 sit in bits [5:3] of packed0..2.
    const VU16 raw4 = And(ShiftRight<3>(packed0), mask);
    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);

    const VU16 raw5 = And(ShiftRight<3>(packed1), mask);
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);

    const VU16 raw6 = And(ShiftRight<3>(packed2), mask);
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);

    // raw73 is the concatenation of the upper two bits in packed0..2,
    // reversing the scatter performed by Pack.
    const VU16 hi2 = Set(d16, 0xC0C0u);
    const VU16 raw73 = Xor3(ShiftRight<6>(And(packed2, hi2)),  //
                            ShiftRight<4>(And(packed1, hi2)),
                            ShiftRight<2>(And(packed0, hi2)));

    // Low 3 bits of raw73 are raw3, the next 3 bits are raw7.
    const VU16 raw3 = And(mask, raw73);
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);

    const VU16 raw7 = And(mask, ShiftRight<3>(raw73));
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};  // Pack8<3>
234 const VU16 hi2 = Set(d16, 0xC0C0u); 235 const VU16 raw73 = Xor3(ShiftRight<6>(And(packed2, hi2)), // 236 ShiftRight<4>(And(packed1, hi2)), 237 ShiftRight<2>(And(packed0, hi2))); 238 239 const VU16 raw3 = And(mask, raw73); 240 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); 241 242 const VU16 raw7 = And(mask, ShiftRight<3>(raw73)); 243 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); 244 } 245 }; // Pack8<3> 246 247 template <> 248 struct Pack8<4> { 249 template <class D8> 250 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, 251 uint8_t* HWY_RESTRICT packed_out) const { 252 const RepartitionToWide<decltype(d8)> d16; 253 using VU16 = Vec<decltype(d16)>; 254 const size_t N8 = Lanes(d8); 255 // 16-bit shifts avoid masking (bits will not cross 8-bit lanes). 256 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8)); 257 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8)); 258 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8)); 259 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8)); 260 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8)); 261 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8)); 262 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8)); 263 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8)); 264 265 const VU16 packed0 = Or(ShiftLeft<4>(raw2), raw0); 266 const VU16 packed1 = Or(ShiftLeft<4>(raw3), raw1); 267 const VU16 packed2 = Or(ShiftLeft<4>(raw6), raw4); 268 const VU16 packed3 = Or(ShiftLeft<4>(raw7), raw5); 269 270 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8); 271 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8); 272 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8); 273 StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8); 274 } 275 276 template <class D8> 277 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, 278 uint8_t* HWY_RESTRICT raw) const { 279 const RepartitionToWide<decltype(d8)> d16; 280 using VU16 = Vec<decltype(d16)>; 281 const size_t N8 = Lanes(d8); 282 const 
// 5-bit lanes: eight raw vectors pack into five vectors. raw0..3 occupy the
// low 5 bits of packed0..3; raw4..7 are split: their top 3 bits go into the
// upper bits of packed0..3 and their low 2 bits are concatenated into
// packed4.
template <>
struct Pack8<5> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    // Fill upper three bits with upper bits from raw4..7.
    // OrAnd(a, b, c) == Or(a, And(b, c)): raw4..7 bits [4:2] land in [7:5].
    const VU16 hi3 = Set(d16, 0xE0E0u);
    const VU16 packed0 = OrAnd(raw0, ShiftLeft<3>(raw4), hi3);
    const VU16 packed1 = OrAnd(raw1, ShiftLeft<3>(raw5), hi3);
    const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw6), hi3);
    const VU16 packed3 = OrAnd(raw3, ShiftLeft<3>(raw7), hi3);

    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
    StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);

    // Combine lower two bits of raw4..7 into packed4.
    const VU16 lo2 = Set(d16, 0x0303u);
    const VU16 packed4 = Or(And(raw4, lo2), Xor3(ShiftLeft<2>(And(raw5, lo2)),
                                                 ShiftLeft<4>(And(raw6, lo2)),
                                                 ShiftLeft<6>(And(raw7, lo2))));
    StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);

    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));

    const VU16 mask = Set(d16, 0x1F1Fu);  // Lowest 5 bits per byte

    // raw0..3 are stored directly in the low 5 bits.
    const VU16 raw0 = And(packed0, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);

    const VU16 raw1 = And(packed1, mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);

    const VU16 raw2 = And(packed2, mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);

    const VU16 raw3 = And(packed3, mask);
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);

    // The upper bits are the top 3 bits shifted right by three, i.e. bits
    // [4:2] of raw4..7.
    const VU16 top4 = ShiftRight<3>(AndNot(mask, packed0));
    const VU16 top5 = ShiftRight<3>(AndNot(mask, packed1));
    const VU16 top6 = ShiftRight<3>(AndNot(mask, packed2));
    const VU16 top7 = ShiftRight<3>(AndNot(mask, packed3));

    // Insert the lower 2 bits, which were concatenated into a byte.
    const VU16 lo2 = Set(d16, 0x0303u);
    const VU16 raw4 = OrAnd(top4, lo2, packed4);
    const VU16 raw5 = OrAnd(top5, lo2, ShiftRight<2>(packed4));
    const VU16 raw6 = OrAnd(top6, lo2, ShiftRight<4>(packed4));
    const VU16 raw7 = OrAnd(top7, lo2, ShiftRight<6>(packed4));

    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};  // Pack8<5>
// 6-bit lanes: eight raw vectors pack into six vectors, as two independent
// triplets: (raw0..2 + raw3) -> packed0..2 and (raw4..6 + raw7) -> packed3..5.
// Within a triplet, raw3/raw7 is scattered two bits at a time into the upper
// two bits of each packed vector.
template <>
struct Pack8<6> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    const VU16 hi2 = Set(d16, 0xC0C0u);
    // Each triplet of these stores raw3/raw7 (6 bits) in the upper 2 bits.
    // OrAnd(a, b, c) == Or(a, And(b, c)).
    const VU16 packed0 = OrAnd(raw0, ShiftLeft<2>(raw3), hi2);
    const VU16 packed1 = OrAnd(raw1, ShiftLeft<4>(raw3), hi2);
    const VU16 packed2 = OrAnd(raw2, ShiftLeft<6>(raw3), hi2);
    const VU16 packed3 = OrAnd(raw4, ShiftLeft<2>(raw7), hi2);
    const VU16 packed4 = OrAnd(raw5, ShiftLeft<4>(raw7), hi2);
    const VU16 packed5 = OrAnd(raw6, ShiftLeft<6>(raw7), hi2);

    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
    StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
    StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
    StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x3F3Fu);  // Lowest 6 bits per byte

    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
    const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));

    // raw0..2 and raw4..6 occupy the low 6 bits of their packed vector.
    const VU16 raw0 = And(packed0, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);

    const VU16 raw1 = And(packed1, mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);

    const VU16 raw2 = And(packed2, mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);

    const VU16 raw4 = And(packed3, mask);
    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);

    const VU16 raw5 = And(packed4, mask);
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);

    const VU16 raw6 = And(packed5, mask);
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);

    // raw3/7 are the concatenation of the upper two bits in packed0..2.
    const VU16 raw3 = Xor3(ShiftRight<6>(AndNot(mask, packed2)),
                           ShiftRight<4>(AndNot(mask, packed1)),
                           ShiftRight<2>(AndNot(mask, packed0)));
    const VU16 raw7 = Xor3(ShiftRight<6>(AndNot(mask, packed5)),
                           ShiftRight<4>(AndNot(mask, packed4)),
                           ShiftRight<2>(AndNot(mask, packed3)));
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};  // Pack8<6>
// 7-bit lanes: eight raw vectors pack into seven vectors. raw0..6 occupy the
// low 7 bits of packed0..6; the seven bits of raw7 are scattered one per
// MSB of packed0..6.
template <>
struct Pack8<7> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    // Inserted into top bit of packed0..6.
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    // OrAnd(a, b, c) == Or(a, And(b, c)); bit (6 - i) of raw7 becomes the
    // MSB of packed i. Add(x, x) is a shift left by 1.
    const VU16 hi1 = Set(d16, 0x8080u);
    const VU16 packed0 = OrAnd(raw0, Add(raw7, raw7), hi1);
    const VU16 packed1 = OrAnd(raw1, ShiftLeft<2>(raw7), hi1);
    const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw7), hi1);
    const VU16 packed3 = OrAnd(raw3, ShiftLeft<4>(raw7), hi1);
    const VU16 packed4 = OrAnd(raw4, ShiftLeft<5>(raw7), hi1);
    const VU16 packed5 = OrAnd(raw5, ShiftLeft<6>(raw7), hi1);
    const VU16 packed6 = OrAnd(raw6, ShiftLeft<7>(raw7), hi1);

    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
    StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
    StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
    StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
    StoreU(BitCast(d8, packed6), d8, packed_out + 6 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);

    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
    const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
    const VU16 packed6 = BitCast(d16, LoadU(d8, packed_in + 6 * N8));

    const VU16 mask = Set(d16, 0x7F7Fu);  // Lowest 7 bits per byte

    const VU16 raw0 = And(packed0, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);

    const VU16 raw1 = And(packed1, mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);

    const VU16 raw2 = And(packed2, mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);

    const VU16 raw3 = And(packed3, mask);
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);

    const VU16 raw4 = And(packed4, mask);
    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);

    const VU16 raw5 = And(packed5, mask);
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);

    const VU16 raw6 = And(packed6, mask);
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);

    // Gather raw7 back from the MSBs: packed i's MSB is raw7 bit (6 - i).
    const VU16 p0 = Xor3(ShiftRight<7>(AndNot(mask, packed6)),
                         ShiftRight<6>(AndNot(mask, packed5)),
                         ShiftRight<5>(AndNot(mask, packed4)));
    const VU16 p1 = Xor3(ShiftRight<4>(AndNot(mask, packed3)),
                         ShiftRight<3>(AndNot(mask, packed2)),
                         ShiftRight<2>(AndNot(mask, packed1)));
    const VU16 raw7 = Xor3(ShiftRight<1>(AndNot(mask, packed0)), p0, p1);
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};  // Pack8<7>
raw2 = And(packed2, mask); 538 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8); 539 540 const VU16 raw3 = And(packed3, mask); 541 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8); 542 543 const VU16 raw4 = And(packed4, mask); 544 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8); 545 546 const VU16 raw5 = And(packed5, mask); 547 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8); 548 549 const VU16 raw6 = And(packed6, mask); 550 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8); 551 552 const VU16 p0 = Xor3(ShiftRight<7>(AndNot(mask, packed6)), 553 ShiftRight<6>(AndNot(mask, packed5)), 554 ShiftRight<5>(AndNot(mask, packed4))); 555 const VU16 p1 = Xor3(ShiftRight<4>(AndNot(mask, packed3)), 556 ShiftRight<3>(AndNot(mask, packed2)), 557 ShiftRight<2>(AndNot(mask, packed1))); 558 const VU16 raw7 = Xor3(ShiftRight<1>(AndNot(mask, packed0)), p0, p1); 559 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8); 560 } 561 }; // Pack8<7> 562 563 template <> 564 struct Pack8<8> { 565 template <class D8> 566 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw, 567 uint8_t* HWY_RESTRICT packed_out) const { 568 using VU8 = Vec<decltype(d8)>; 569 const size_t N8 = Lanes(d8); 570 const VU8 raw0 = LoadU(d8, raw + 0 * N8); 571 const VU8 raw1 = LoadU(d8, raw + 1 * N8); 572 const VU8 raw2 = LoadU(d8, raw + 2 * N8); 573 const VU8 raw3 = LoadU(d8, raw + 3 * N8); 574 const VU8 raw4 = LoadU(d8, raw + 4 * N8); 575 const VU8 raw5 = LoadU(d8, raw + 5 * N8); 576 const VU8 raw6 = LoadU(d8, raw + 6 * N8); 577 const VU8 raw7 = LoadU(d8, raw + 7 * N8); 578 579 StoreU(raw0, d8, packed_out + 0 * N8); 580 StoreU(raw1, d8, packed_out + 1 * N8); 581 StoreU(raw2, d8, packed_out + 2 * N8); 582 StoreU(raw3, d8, packed_out + 3 * N8); 583 StoreU(raw4, d8, packed_out + 4 * N8); 584 StoreU(raw5, d8, packed_out + 5 * N8); 585 StoreU(raw6, d8, packed_out + 6 * N8); 586 StoreU(raw7, d8, packed_out + 7 * N8); 587 } 588 589 template <class D8> 590 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in, 591 uint8_t* HWY_RESTRICT 
raw) const { 592 using VU8 = Vec<decltype(d8)>; 593 const size_t N8 = Lanes(d8); 594 const VU8 raw0 = LoadU(d8, packed_in + 0 * N8); 595 const VU8 raw1 = LoadU(d8, packed_in + 1 * N8); 596 const VU8 raw2 = LoadU(d8, packed_in + 2 * N8); 597 const VU8 raw3 = LoadU(d8, packed_in + 3 * N8); 598 const VU8 raw4 = LoadU(d8, packed_in + 4 * N8); 599 const VU8 raw5 = LoadU(d8, packed_in + 5 * N8); 600 const VU8 raw6 = LoadU(d8, packed_in + 6 * N8); 601 const VU8 raw7 = LoadU(d8, packed_in + 7 * N8); 602 603 StoreU(raw0, d8, raw + 0 * N8); 604 StoreU(raw1, d8, raw + 1 * N8); 605 StoreU(raw2, d8, raw + 2 * N8); 606 StoreU(raw3, d8, raw + 3 * N8); 607 StoreU(raw4, d8, raw + 4 * N8); 608 StoreU(raw5, d8, raw + 5 * N8); 609 StoreU(raw6, d8, raw + 6 * N8); 610 StoreU(raw7, d8, raw + 7 * N8); 611 } 612 }; // Pack8<8> 613 614 template <> 615 struct Pack16<1> { 616 template <class D> 617 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, 618 uint16_t* HWY_RESTRICT packed_out) const { 619 using VU16 = Vec<decltype(d)>; 620 const size_t N = Lanes(d); 621 const VU16 raw0 = LoadU(d, raw + 0 * N); 622 const VU16 raw1 = LoadU(d, raw + 1 * N); 623 const VU16 raw2 = LoadU(d, raw + 2 * N); 624 const VU16 raw3 = LoadU(d, raw + 3 * N); 625 const VU16 raw4 = LoadU(d, raw + 4 * N); 626 const VU16 raw5 = LoadU(d, raw + 5 * N); 627 const VU16 raw6 = LoadU(d, raw + 6 * N); 628 const VU16 raw7 = LoadU(d, raw + 7 * N); 629 const VU16 raw8 = LoadU(d, raw + 8 * N); 630 const VU16 raw9 = LoadU(d, raw + 9 * N); 631 const VU16 rawA = LoadU(d, raw + 0xA * N); 632 const VU16 rawB = LoadU(d, raw + 0xB * N); 633 const VU16 rawC = LoadU(d, raw + 0xC * N); 634 const VU16 rawD = LoadU(d, raw + 0xD * N); 635 const VU16 rawE = LoadU(d, raw + 0xE * N); 636 const VU16 rawF = LoadU(d, raw + 0xF * N); 637 638 const VU16 p0 = Xor3(ShiftLeft<2>(raw2), Add(raw1, raw1), raw0); 639 const VU16 p1 = 640 Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)); 641 const VU16 p2 = 642 
Xor3(ShiftLeft<8>(raw8), ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)); 643 const VU16 p3 = 644 Xor3(ShiftLeft<0xB>(rawB), ShiftLeft<0xA>(rawA), ShiftLeft<9>(raw9)); 645 const VU16 p4 = 646 Xor3(ShiftLeft<0xE>(rawE), ShiftLeft<0xD>(rawD), ShiftLeft<0xC>(rawC)); 647 const VU16 packed = 648 Or(Xor3(ShiftLeft<0xF>(rawF), p0, p1), Xor3(p2, p3, p4)); 649 StoreU(packed, d, packed_out); 650 } 651 652 template <class D> 653 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, 654 uint16_t* HWY_RESTRICT raw) const { 655 using VU16 = Vec<decltype(d)>; 656 const size_t N = Lanes(d); 657 const VU16 mask = Set(d, 1u); // Lowest bit 658 659 const VU16 packed = LoadU(d, packed_in); 660 661 const VU16 raw0 = And(packed, mask); 662 StoreU(raw0, d, raw + 0 * N); 663 664 const VU16 raw1 = And(ShiftRight<1>(packed), mask); 665 StoreU(raw1, d, raw + 1 * N); 666 667 const VU16 raw2 = And(ShiftRight<2>(packed), mask); 668 StoreU(raw2, d, raw + 2 * N); 669 670 const VU16 raw3 = And(ShiftRight<3>(packed), mask); 671 StoreU(raw3, d, raw + 3 * N); 672 673 const VU16 raw4 = And(ShiftRight<4>(packed), mask); 674 StoreU(raw4, d, raw + 4 * N); 675 676 const VU16 raw5 = And(ShiftRight<5>(packed), mask); 677 StoreU(raw5, d, raw + 5 * N); 678 679 const VU16 raw6 = And(ShiftRight<6>(packed), mask); 680 StoreU(raw6, d, raw + 6 * N); 681 682 const VU16 raw7 = And(ShiftRight<7>(packed), mask); 683 StoreU(raw7, d, raw + 7 * N); 684 685 const VU16 raw8 = And(ShiftRight<8>(packed), mask); 686 StoreU(raw8, d, raw + 8 * N); 687 688 const VU16 raw9 = And(ShiftRight<9>(packed), mask); 689 StoreU(raw9, d, raw + 9 * N); 690 691 const VU16 rawA = And(ShiftRight<0xA>(packed), mask); 692 StoreU(rawA, d, raw + 0xA * N); 693 694 const VU16 rawB = And(ShiftRight<0xB>(packed), mask); 695 StoreU(rawB, d, raw + 0xB * N); 696 697 const VU16 rawC = And(ShiftRight<0xC>(packed), mask); 698 StoreU(rawC, d, raw + 0xC * N); 699 700 const VU16 rawD = And(ShiftRight<0xD>(packed), mask); 701 StoreU(rawD, d, raw + 0xD * 
N); 702 703 const VU16 rawE = And(ShiftRight<0xE>(packed), mask); 704 StoreU(rawE, d, raw + 0xE * N); 705 706 const VU16 rawF = ShiftRight<0xF>(packed); 707 StoreU(rawF, d, raw + 0xF * N); 708 } 709 }; // Pack16<1> 710 711 template <> 712 struct Pack16<2> { 713 template <class D> 714 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, 715 uint16_t* HWY_RESTRICT packed_out) const { 716 using VU16 = Vec<decltype(d)>; 717 const size_t N = Lanes(d); 718 const VU16 raw0 = LoadU(d, raw + 0 * N); 719 const VU16 raw1 = LoadU(d, raw + 1 * N); 720 const VU16 raw2 = LoadU(d, raw + 2 * N); 721 const VU16 raw3 = LoadU(d, raw + 3 * N); 722 const VU16 raw4 = LoadU(d, raw + 4 * N); 723 const VU16 raw5 = LoadU(d, raw + 5 * N); 724 const VU16 raw6 = LoadU(d, raw + 6 * N); 725 const VU16 raw7 = LoadU(d, raw + 7 * N); 726 const VU16 raw8 = LoadU(d, raw + 8 * N); 727 const VU16 raw9 = LoadU(d, raw + 9 * N); 728 const VU16 rawA = LoadU(d, raw + 0xA * N); 729 const VU16 rawB = LoadU(d, raw + 0xB * N); 730 const VU16 rawC = LoadU(d, raw + 0xC * N); 731 const VU16 rawD = LoadU(d, raw + 0xD * N); 732 const VU16 rawE = LoadU(d, raw + 0xE * N); 733 const VU16 rawF = LoadU(d, raw + 0xF * N); 734 735 VU16 packed0 = Xor3(ShiftLeft<4>(raw4), ShiftLeft<2>(raw2), raw0); 736 VU16 packed1 = Xor3(ShiftLeft<4>(raw5), ShiftLeft<2>(raw3), raw1); 737 packed0 = Xor3(packed0, ShiftLeft<8>(raw8), ShiftLeft<6>(raw6)); 738 packed1 = Xor3(packed1, ShiftLeft<8>(raw9), ShiftLeft<6>(raw7)); 739 740 packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<10>(rawA)); 741 packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<10>(rawB)); 742 743 packed0 = Or(packed0, ShiftLeft<14>(rawE)); 744 packed1 = Or(packed1, ShiftLeft<14>(rawF)); 745 StoreU(packed0, d, packed_out + 0 * N); 746 StoreU(packed1, d, packed_out + 1 * N); 747 } 748 749 template <class D> 750 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, 751 uint16_t* HWY_RESTRICT raw) const { 752 using VU16 = Vec<decltype(d)>; 753 const 
size_t N = Lanes(d); 754 const VU16 mask = Set(d, 0x3u); // Lowest 2 bits 755 756 const VU16 packed0 = LoadU(d, packed_in + 0 * N); 757 const VU16 packed1 = LoadU(d, packed_in + 1 * N); 758 759 const VU16 raw0 = And(packed0, mask); 760 StoreU(raw0, d, raw + 0 * N); 761 762 const VU16 raw1 = And(packed1, mask); 763 StoreU(raw1, d, raw + 1 * N); 764 765 const VU16 raw2 = And(ShiftRight<2>(packed0), mask); 766 StoreU(raw2, d, raw + 2 * N); 767 768 const VU16 raw3 = And(ShiftRight<2>(packed1), mask); 769 StoreU(raw3, d, raw + 3 * N); 770 771 const VU16 raw4 = And(ShiftRight<4>(packed0), mask); 772 StoreU(raw4, d, raw + 4 * N); 773 774 const VU16 raw5 = And(ShiftRight<4>(packed1), mask); 775 StoreU(raw5, d, raw + 5 * N); 776 777 const VU16 raw6 = And(ShiftRight<6>(packed0), mask); 778 StoreU(raw6, d, raw + 6 * N); 779 780 const VU16 raw7 = And(ShiftRight<6>(packed1), mask); 781 StoreU(raw7, d, raw + 7 * N); 782 783 const VU16 raw8 = And(ShiftRight<8>(packed0), mask); 784 StoreU(raw8, d, raw + 8 * N); 785 786 const VU16 raw9 = And(ShiftRight<8>(packed1), mask); 787 StoreU(raw9, d, raw + 9 * N); 788 789 const VU16 rawA = And(ShiftRight<0xA>(packed0), mask); 790 StoreU(rawA, d, raw + 0xA * N); 791 792 const VU16 rawB = And(ShiftRight<0xA>(packed1), mask); 793 StoreU(rawB, d, raw + 0xB * N); 794 795 const VU16 rawC = And(ShiftRight<0xC>(packed0), mask); 796 StoreU(rawC, d, raw + 0xC * N); 797 798 const VU16 rawD = And(ShiftRight<0xC>(packed1), mask); 799 StoreU(rawD, d, raw + 0xD * N); 800 801 const VU16 rawE = ShiftRight<0xE>(packed0); 802 StoreU(rawE, d, raw + 0xE * N); 803 804 const VU16 rawF = ShiftRight<0xE>(packed1); 805 StoreU(rawF, d, raw + 0xF * N); 806 } 807 }; // Pack16<2> 808 809 template <> 810 struct Pack16<3> { 811 template <class D> 812 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, 813 uint16_t* HWY_RESTRICT packed_out) const { 814 using VU16 = Vec<decltype(d)>; 815 const size_t N = Lanes(d); 816 const VU16 raw0 = LoadU(d, raw + 0 * N); 817 
const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // We can fit 15 raw vectors in three packed vectors (five each).
    VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<3>(raw3), raw0);
    VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<3>(raw4), raw1);
    VU16 packed2 = Xor3(ShiftLeft<6>(raw8), ShiftLeft<3>(raw5), raw2);

    // rawF will be scattered into the upper bit of these three.
    packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<9>(raw9));
    packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<9>(rawA));
    packed2 = Xor3(packed2, ShiftLeft<12>(rawE), ShiftLeft<9>(rawB));

    const VU16 hi1 = Set(d, 0x8000u);
    packed0 = Or(packed0, ShiftLeft<15>(rawF));  // MSB only, no mask
    packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
    packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
  }

  // Loads 3 packed vectors and stores the 16 raw vectors they decompress to.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 0x7u);  // Lowest 3 bits

    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);

    const VU16 raw0 = And(mask, packed0);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(mask, packed1);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(mask, packed2);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(mask, ShiftRight<3>(packed0));
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(mask, ShiftRight<3>(packed1));
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(mask, ShiftRight<3>(packed2));
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(mask, ShiftRight<6>(packed0));
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(mask, ShiftRight<6>(packed1));
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(mask, ShiftRight<6>(packed2));
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(mask, ShiftRight<9>(packed0));
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(mask, ShiftRight<9>(packed1));
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(mask, ShiftRight<9>(packed2));
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(mask, ShiftRight<12>(packed0));
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(mask, ShiftRight<12>(packed1));
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = And(mask, ShiftRight<12>(packed2));
    StoreU(rawE, d, raw + 0xE * N);

    // rawF is the concatenation of the upper bit of packed0..2.
    const VU16 down0 = ShiftRight<15>(packed0);
    const VU16 down1 = ShiftRight<15>(packed1);
    const VU16 down2 = ShiftRight<15>(packed2);
    // Add(down1, down1) is ShiftLeft<1>, which may be cheaper than a shift.
    const VU16 rawF = Xor3(ShiftLeft<2>(down2), Add(down1, down1), down0);
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<3>

// 4-bit fields: four fields per lane; 16 raw vectors fill exactly four
// packed vectors, with no leftover bits.
template <>
struct Pack16<4> {
  // Loads 16 raw vectors and stores the 4 packed vectors they compress to.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // Fields do not overlap after shifting, so Xor3 is equivalent to Or.
    VU16 packed0 = Xor3(ShiftLeft<8>(raw4), ShiftLeft<4>(raw2), raw0);
    VU16 packed1 = Xor3(ShiftLeft<8>(raw5), ShiftLeft<4>(raw3), raw1);
    packed0 = Or(packed0, ShiftLeft<12>(raw6));
    packed1 = Or(packed1, ShiftLeft<12>(raw7));
    VU16 packed2 =
Xor3(ShiftLeft<8>(rawC), ShiftLeft<4>(rawA), raw8);
    VU16 packed3 = Xor3(ShiftLeft<8>(rawD), ShiftLeft<4>(rawB), raw9);
    packed2 = Or(packed2, ShiftLeft<12>(rawE));
    packed3 = Or(packed3, ShiftLeft<12>(rawF));

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
  }

  // Loads 4 packed vectors and stores the 16 raw vectors they decompress to.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 0xFu);  // Lowest 4 bits

    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(ShiftRight<8>(packed0), mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(ShiftRight<8>(packed1), mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = ShiftRight<12>(packed0);  // no mask required
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = ShiftRight<12>(packed1);  // no mask required
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(packed2, mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(packed3, mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(ShiftRight<4>(packed2), mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(ShiftRight<4>(packed3), mask);
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(ShiftRight<8>(packed2), mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(ShiftRight<8>(packed3), mask);
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = ShiftRight<12>(packed2);  // no mask required
    StoreU(rawE, d, raw + 0xE * N);

    const VU16 rawF = ShiftRight<12>(packed3);  // no mask required
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<4>

// 5-bit fields: 15 raw vectors fill five packed vectors (three fields each);
// the 16th (rawF) is scattered into the leftover MSB of each packed vector.
template <>
struct Pack16<5> {
  // Loads 16 raw vectors and stores the 5 packed vectors they compress to.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // We can fit 15 raw vectors in five packed vectors (three each).
    VU16 packed0 = Xor3(ShiftLeft<10>(rawA), ShiftLeft<5>(raw5), raw0);
    VU16 packed1 = Xor3(ShiftLeft<10>(rawB), ShiftLeft<5>(raw6), raw1);
    VU16 packed2 = Xor3(ShiftLeft<10>(rawC), ShiftLeft<5>(raw7), raw2);
    VU16 packed3 = Xor3(ShiftLeft<10>(rawD), ShiftLeft<5>(raw8), raw3);
    VU16 packed4 = Xor3(ShiftLeft<10>(rawE), ShiftLeft<5>(raw9), raw4);

    // rawF will be scattered into the upper bits of these five.
const VU16 hi1 = Set(d, 0x8000u);
    packed0 = Or(packed0, ShiftLeft<15>(rawF));  // MSB only, no mask
    packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
    packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
    packed3 = OrAnd(packed3, ShiftLeft<12>(rawF), hi1);
    packed4 = OrAnd(packed4, ShiftLeft<11>(rawF), hi1);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
  }

  // Loads 5 packed vectors and stores the 16 raw vectors they decompress to.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);

    const VU16 mask = Set(d, 0x1Fu);  // Lowest 5 bits

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(ShiftRight<5>(packed0), mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(ShiftRight<5>(packed1), mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(ShiftRight<5>(packed2), mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(ShiftRight<5>(packed3), mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(ShiftRight<5>(packed4), mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(ShiftRight<10>(packed0), mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(ShiftRight<10>(packed1), mask);
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(ShiftRight<10>(packed2), mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(ShiftRight<10>(packed3), mask);
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = And(ShiftRight<10>(packed4), mask);
    StoreU(rawE, d, raw + 0xE * N);

    // rawF is the concatenation of the upper bit of packed0..4 (Pack stored
    // bit i of rawF in the MSB of packed_i).
    const VU16 down0 = ShiftRight<15>(packed0);
    const VU16 down1 = ShiftRight<15>(packed1);
    const VU16 hi1 = Set(d, 0x8000u);
    // Add(down1, down1) is ShiftLeft<1>, which may be cheaper than a shift.
    const VU16 p0 =
        Xor3(ShiftRight<13>(And(packed2, hi1)), Add(down1, down1), down0);
    const VU16 rawF = Xor3(ShiftRight<11>(And(packed4, hi1)),
                           ShiftRight<12>(And(packed3, hi1)), p0);
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<5>

// 6-bit fields: each packed vector holds two fields plus four remainder
// bits, which carry the spread-out bits of two further raw pairs
// (packed3 = raw3/raw7, packed7 = rawB/rawF).
template <>
struct Pack16<6> {
  // Loads 16 raw vectors and stores the 6 packed vectors they compress to.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw +
0xF * N);

    const VU16 packed3 = Or(ShiftLeft<6>(raw7), raw3);
    const VU16 packed7 = Or(ShiftLeft<6>(rawF), rawB);
    // Three vectors, two 6-bit raw each; packed3 (12 bits) is spread over the
    // four remainder bits at the top of each vector. packed3/packed7 are not
    // stored directly, so only six vectors are written out.
    const VU16 packed0 = Xor3(ShiftLeft<12>(packed3), ShiftLeft<6>(raw4), raw0);
    VU16 packed1 = Or(ShiftLeft<6>(raw5), raw1);
    VU16 packed2 = Or(ShiftLeft<6>(raw6), raw2);
    const VU16 packed4 = Xor3(ShiftLeft<12>(packed7), ShiftLeft<6>(rawC), raw8);
    VU16 packed5 = Or(ShiftLeft<6>(rawD), raw9);
    VU16 packed6 = Or(ShiftLeft<6>(rawE), rawA);

    const VU16 hi4 = Set(d, 0xF000u);
    packed1 = OrAnd(packed1, ShiftLeft<8>(packed3), hi4);
    packed2 = OrAnd(packed2, ShiftLeft<4>(packed3), hi4);
    packed5 = OrAnd(packed5, ShiftLeft<8>(packed7), hi4);
    packed6 = OrAnd(packed6, ShiftLeft<4>(packed7), hi4);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed4, d, packed_out + 3 * N);
    StoreU(packed5, d, packed_out + 4 * N);
    StoreU(packed6, d, packed_out + 5 * N);
  }

  // Loads 6 packed vectors and stores the 16 raw vectors they decompress to.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 0x3Fu);  // Lowest 6 bits

    // Names match Pack: packed3/packed7 were folded into the others.
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed4 = LoadU(d, packed_in + 3 * N);
    const VU16 packed5 = LoadU(d, packed_in + 4 * N);
    const VU16 packed6 = LoadU(d, packed_in + 5 * N);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw4 = And(ShiftRight<6>(packed0), mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(ShiftRight<6>(packed1), mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(ShiftRight<6>(packed2), mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw8 = And(packed4, mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(packed5, mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(packed6, mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawC = And(ShiftRight<6>(packed4), mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(ShiftRight<6>(packed5), mask);
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = And(ShiftRight<6>(packed6), mask);
    StoreU(rawE, d, raw + 0xE * N);

    // packed3 is the concatenation of the four upper bits in packed0..2;
    // packed7 likewise from packed4..6.
    const VU16 down0 = ShiftRight<12>(packed0);
    const VU16 down4 = ShiftRight<12>(packed4);
    const VU16 hi4 = Set(d, 0xF000u);
    const VU16 packed3 = Xor3(ShiftRight<4>(And(packed2, hi4)),
                              ShiftRight<8>(And(packed1, hi4)), down0);
    const VU16 packed7 = Xor3(ShiftRight<4>(And(packed6, hi4)),
                              ShiftRight<8>(And(packed5, hi4)), down4);
    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 rawB = And(packed7, mask);
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 raw7 = ShiftRight<6>(packed3);  // upper bits already zero
    StoreU(raw7, d, raw + 7 * N);

    const VU16 rawF = ShiftRight<6>(packed7);  // upper bits already zero
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<6>

// 7-bit fields: each packed vector holds two fields plus two remainder bits,
// which carry the spread-out bits of packed7 = raw7/rawF.
template <>
struct Pack16<7> {
  // Loads 16 raw vectors and stores the 7 packed vectors they compress to.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    const VU16 packed7 = Or(ShiftLeft<7>(rawF), raw7);
    // Seven vectors, two 7-bit raw each; packed7 (14 bits) is spread over the
    // two remainder bits at the top of each vector.
    const VU16 packed0 = Xor3(ShiftLeft<14>(packed7), ShiftLeft<7>(raw8), raw0);
    VU16 packed1 = Or(ShiftLeft<7>(raw9), raw1);
    VU16 packed2 = Or(ShiftLeft<7>(rawA), raw2);
    VU16 packed3 = Or(ShiftLeft<7>(rawB), raw3);
    VU16 packed4 = Or(ShiftLeft<7>(rawC), raw4);
    VU16 packed5 = Or(ShiftLeft<7>(rawD), raw5);
    VU16 packed6 = Or(ShiftLeft<7>(rawE), raw6);

    const VU16 hi2 = Set(d, 0xC000u);
    packed1 = OrAnd(packed1, ShiftLeft<12>(packed7), hi2);
    packed2 = OrAnd(packed2, ShiftLeft<10>(packed7), hi2);
    packed3 = OrAnd(packed3, ShiftLeft<8>(packed7), hi2);
    packed4 = OrAnd(packed4, ShiftLeft<6>(packed7), hi2);
    packed5 = OrAnd(packed5, ShiftLeft<4>(packed7), hi2);
    packed6 = OrAnd(packed6, ShiftLeft<2>(packed7), hi2);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
  }

  // Loads 7 packed vectors and stores the 16 raw vectors they decompress to.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    // BitCast is a no-op here (lane type already matches).
    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));

    const VU16 mask = Set(d, 0x7Fu);  // Lowest 7 bits

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw8 = And(ShiftRight<7>(packed0), mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(ShiftRight<7>(packed1), mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(ShiftRight<7>(packed2), mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(ShiftRight<7>(packed3), mask);
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(ShiftRight<7>(packed4), mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(ShiftRight<7>(packed5), mask);
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = And(ShiftRight<7>(packed6), mask);
    StoreU(rawE, d, raw + 0xE * N);

    // packed7 is the concatenation of the two upper bits in packed0..6.
    const VU16 down0 = ShiftRight<14>(packed0);
    const VU16 hi2 = Set(d, 0xC000u);
    const VU16 p0 = Xor3(ShiftRight<12>(And(packed1, hi2)),
                         ShiftRight<10>(And(packed2, hi2)), down0);
    const VU16 p1 = Xor3(ShiftRight<8>(And(packed3, hi2)),  //
                         ShiftRight<6>(And(packed4, hi2)),
                         ShiftRight<4>(And(packed5, hi2)));
    const VU16 packed7 = Xor3(ShiftRight<2>(And(packed6, hi2)), p1, p0);

    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 rawF = ShiftRight<7>(packed7);  // upper bits already zero
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<7>

// 8-bit fields: two bytes per 16-bit lane; 16 raw vectors fill exactly
// eight packed vectors.
template <>
struct Pack16<8> {
  // Loads 16 raw vectors and stores the 8 packed vectors they compress to.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // This is equivalent to ConcatEven with 8-bit lanes, but much more
    // efficient on RVV and slightly less efficient on SVE2.
const VU16 packed0 = Or(ShiftLeft<8>(raw2), raw0);
    const VU16 packed1 = Or(ShiftLeft<8>(raw3), raw1);
    const VU16 packed2 = Or(ShiftLeft<8>(raw6), raw4);
    const VU16 packed3 = Or(ShiftLeft<8>(raw7), raw5);
    const VU16 packed4 = Or(ShiftLeft<8>(rawA), raw8);
    const VU16 packed5 = Or(ShiftLeft<8>(rawB), raw9);
    const VU16 packed6 = Or(ShiftLeft<8>(rawE), rawC);
    const VU16 packed7 = Or(ShiftLeft<8>(rawF), rawD);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
  }

  // Loads 8 packed vectors and stores the 16 raw vectors they decompress to.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    // BitCast is a no-op here (lane type already matches).
    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
    const VU16 mask = Set(d, 0xFFu);  // Lowest 8 bits

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = ShiftRight<8>(packed0);  // upper bits already zero
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = ShiftRight<8>(packed1);  // upper bits already zero
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(packed2, mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(packed3, mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = ShiftRight<8>(packed2);  // upper bits already zero
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = ShiftRight<8>(packed3);  // upper bits already zero
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(packed4, mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(packed5, mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = ShiftRight<8>(packed4);  // upper bits already zero
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = ShiftRight<8>(packed5);  // upper bits already zero
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(packed6, mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(packed7, mask);
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = ShiftRight<8>(packed6);  // upper bits already zero
    StoreU(rawE, d, raw + 0xE * N);

    const VU16 rawF = ShiftRight<8>(packed7);  // upper bits already zero
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<8>

// 9-bit fields: eight packed vectors hold one 9-bit field plus the low 7
// bits of another; the remaining top-2 bits are concatenated into packed8.
template <>
struct Pack16<9> {
  // Loads 16 raw vectors and stores the 9 packed vectors they compress to.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw
+ 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);
    // 8 vectors, each with 9+7 bits; top 2 bits are concatenated into packed8.
    const VU16 packed0 = Or(ShiftLeft<9>(raw8), raw0);
    const VU16 packed1 = Or(ShiftLeft<9>(raw9), raw1);
    const VU16 packed2 = Or(ShiftLeft<9>(rawA), raw2);
    const VU16 packed3 = Or(ShiftLeft<9>(rawB), raw3);
    const VU16 packed4 = Or(ShiftLeft<9>(rawC), raw4);
    const VU16 packed5 = Or(ShiftLeft<9>(rawD), raw5);
    const VU16 packed6 = Or(ShiftLeft<9>(rawE), raw6);
    const VU16 packed7 = Or(ShiftLeft<9>(rawF), raw7);

    // We could shift down, OR and shift up, but two shifts are typically more
    // expensive than AND, shift into position, and OR (which can be further
    // reduced via Xor3).
    const VU16 mid2 = Set(d, 0x180u);  // top 2 in lower 9
    const VU16 part8 = ShiftRight<7>(And(raw8, mid2));
    const VU16 part9 = ShiftRight<5>(And(raw9, mid2));
    const VU16 partA = ShiftRight<3>(And(rawA, mid2));
    const VU16 partB = ShiftRight<1>(And(rawB, mid2));
    const VU16 partC = ShiftLeft<1>(And(rawC, mid2));
    const VU16 partD = ShiftLeft<3>(And(rawD, mid2));
    const VU16 partE = ShiftLeft<5>(And(rawE, mid2));
    const VU16 partF = ShiftLeft<7>(And(rawF, mid2));
    const VU16 packed8 = Xor3(Xor3(part8, part9, partA),
                              Xor3(partB, partC, partD), Or(partE, partF));

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
  }

  // Loads 9 packed vectors and stores the 16 raw vectors they decompress to.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    // BitCast is a no-op here (lane type already matches).
    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
    const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));

    const VU16 mask = Set(d, 0x1FFu);  // Lowest 9 bits

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);

    // Restore the top 2 bits of raw8..rawF from their slots in packed8.
    const VU16 mid2 = Set(d, 0x180u);  // top 2 in lower 9
    const VU16 raw8 =
        OrAnd(ShiftRight<9>(packed0), ShiftLeft<7>(packed8), mid2);
    const VU16 raw9 =
        OrAnd(ShiftRight<9>(packed1), ShiftLeft<5>(packed8), mid2);
    const VU16 rawA =
        OrAnd(ShiftRight<9>(packed2), ShiftLeft<3>(packed8), mid2);
    const VU16 rawB =
        OrAnd(ShiftRight<9>(packed3), ShiftLeft<1>(packed8), mid2);
    const VU16 rawC =
        OrAnd(ShiftRight<9>(packed4), ShiftRight<1>(packed8), mid2);
    const VU16 rawD =
        OrAnd(ShiftRight<9>(packed5), ShiftRight<3>(packed8), mid2);
    const VU16 rawE =
        OrAnd(ShiftRight<9>(packed6), ShiftRight<5>(packed8), mid2);
    const VU16 rawF =
        OrAnd(ShiftRight<9>(packed7), ShiftRight<7>(packed8), mid2);

    StoreU(raw8, d, raw + 8 * N);
    StoreU(raw9, d, raw + 9 * N);
    StoreU(rawA, d, raw + 0xA * N);
    StoreU(rawB, d, raw + 0xB * N);
    StoreU(rawC, d, raw + 0xC * N);
    StoreU(rawD, d, raw + 0xD * N);
    StoreU(rawE, d, raw + 0xE * N);
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<9>

// 10-bit fields: eight packed vectors hold one 10-bit field plus the low 6
// bits of another; the remaining top-4 bits fill packed8 and packed9.
template <>
struct Pack16<10> {
  // Loads 16 raw vectors and stores the 10 packed vectors they compress to.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // 8 vectors, each with 10+6 bits; top 4 bits are concatenated into
    // packed8 and packed9.
1651 const VU16 packed0 = Or(ShiftLeft<10>(raw8), raw0); 1652 const VU16 packed1 = Or(ShiftLeft<10>(raw9), raw1); 1653 const VU16 packed2 = Or(ShiftLeft<10>(rawA), raw2); 1654 const VU16 packed3 = Or(ShiftLeft<10>(rawB), raw3); 1655 const VU16 packed4 = Or(ShiftLeft<10>(rawC), raw4); 1656 const VU16 packed5 = Or(ShiftLeft<10>(rawD), raw5); 1657 const VU16 packed6 = Or(ShiftLeft<10>(rawE), raw6); 1658 const VU16 packed7 = Or(ShiftLeft<10>(rawF), raw7); 1659 1660 // We could shift down, OR and shift up, but two shifts are typically more 1661 // expensive than AND, shift into position, and OR (which can be further 1662 // reduced via Xor3). 1663 const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10 1664 const VU16 part8 = ShiftRight<6>(And(raw8, mid4)); 1665 const VU16 part9 = ShiftRight<2>(And(raw9, mid4)); 1666 const VU16 partA = ShiftLeft<2>(And(rawA, mid4)); 1667 const VU16 partB = ShiftLeft<6>(And(rawB, mid4)); 1668 const VU16 partC = ShiftRight<6>(And(rawC, mid4)); 1669 const VU16 partD = ShiftRight<2>(And(rawD, mid4)); 1670 const VU16 partE = ShiftLeft<2>(And(rawE, mid4)); 1671 const VU16 partF = ShiftLeft<6>(And(rawF, mid4)); 1672 const VU16 packed8 = Or(Xor3(part8, part9, partA), partB); 1673 const VU16 packed9 = Or(Xor3(partC, partD, partE), partF); 1674 1675 StoreU(packed0, d, packed_out + 0 * N); 1676 StoreU(packed1, d, packed_out + 1 * N); 1677 StoreU(packed2, d, packed_out + 2 * N); 1678 StoreU(packed3, d, packed_out + 3 * N); 1679 StoreU(packed4, d, packed_out + 4 * N); 1680 StoreU(packed5, d, packed_out + 5 * N); 1681 StoreU(packed6, d, packed_out + 6 * N); 1682 StoreU(packed7, d, packed_out + 7 * N); 1683 StoreU(packed8, d, packed_out + 8 * N); 1684 StoreU(packed9, d, packed_out + 9 * N); 1685 } 1686 1687 template <class D> 1688 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, 1689 uint16_t* HWY_RESTRICT raw) const { 1690 using VU16 = Vec<decltype(d)>; 1691 const size_t N = Lanes(d); 1692 1693 const VU16 packed0 = BitCast(d, 
LoadU(d, packed_in + 0 * N)); 1694 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); 1695 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); 1696 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); 1697 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); 1698 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); 1699 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); 1700 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); 1701 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); 1702 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); 1703 1704 const VU16 mask = Set(d, 0x3FFu); // Lowest 10 bits 1705 1706 const VU16 raw0 = And(packed0, mask); 1707 StoreU(raw0, d, raw + 0 * N); 1708 1709 const VU16 raw1 = And(packed1, mask); 1710 StoreU(raw1, d, raw + 1 * N); 1711 1712 const VU16 raw2 = And(packed2, mask); 1713 StoreU(raw2, d, raw + 2 * N); 1714 1715 const VU16 raw3 = And(packed3, mask); 1716 StoreU(raw3, d, raw + 3 * N); 1717 1718 const VU16 raw4 = And(packed4, mask); 1719 StoreU(raw4, d, raw + 4 * N); 1720 1721 const VU16 raw5 = And(packed5, mask); 1722 StoreU(raw5, d, raw + 5 * N); 1723 1724 const VU16 raw6 = And(packed6, mask); 1725 StoreU(raw6, d, raw + 6 * N); 1726 1727 const VU16 raw7 = And(packed7, mask); 1728 StoreU(raw7, d, raw + 7 * N); 1729 1730 const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10 1731 const VU16 raw8 = 1732 OrAnd(ShiftRight<10>(packed0), ShiftLeft<6>(packed8), mid4); 1733 const VU16 raw9 = 1734 OrAnd(ShiftRight<10>(packed1), ShiftLeft<2>(packed8), mid4); 1735 const VU16 rawA = 1736 OrAnd(ShiftRight<10>(packed2), ShiftRight<2>(packed8), mid4); 1737 const VU16 rawB = 1738 OrAnd(ShiftRight<10>(packed3), ShiftRight<6>(packed8), mid4); 1739 const VU16 rawC = 1740 OrAnd(ShiftRight<10>(packed4), ShiftLeft<6>(packed9), mid4); 1741 const VU16 rawD = 1742 OrAnd(ShiftRight<10>(packed5), ShiftLeft<2>(packed9), mid4); 1743 const VU16 rawE = 1744 
OrAnd(ShiftRight<10>(packed6), ShiftRight<2>(packed9), mid4); 1745 const VU16 rawF = 1746 OrAnd(ShiftRight<10>(packed7), ShiftRight<6>(packed9), mid4); 1747 1748 StoreU(raw8, d, raw + 8 * N); 1749 StoreU(raw9, d, raw + 9 * N); 1750 StoreU(rawA, d, raw + 0xA * N); 1751 StoreU(rawB, d, raw + 0xB * N); 1752 StoreU(rawC, d, raw + 0xC * N); 1753 StoreU(rawD, d, raw + 0xD * N); 1754 StoreU(rawE, d, raw + 0xE * N); 1755 StoreU(rawF, d, raw + 0xF * N); 1756 } 1757 }; // Pack16<10> 1758 1759 template <> 1760 struct Pack16<11> { 1761 template <class D> 1762 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, 1763 uint16_t* HWY_RESTRICT packed_out) const { 1764 using VU16 = Vec<decltype(d)>; 1765 const size_t N = Lanes(d); 1766 const VU16 raw0 = LoadU(d, raw + 0 * N); 1767 const VU16 raw1 = LoadU(d, raw + 1 * N); 1768 const VU16 raw2 = LoadU(d, raw + 2 * N); 1769 const VU16 raw3 = LoadU(d, raw + 3 * N); 1770 const VU16 raw4 = LoadU(d, raw + 4 * N); 1771 const VU16 raw5 = LoadU(d, raw + 5 * N); 1772 const VU16 raw6 = LoadU(d, raw + 6 * N); 1773 const VU16 raw7 = LoadU(d, raw + 7 * N); 1774 const VU16 raw8 = LoadU(d, raw + 8 * N); 1775 const VU16 raw9 = LoadU(d, raw + 9 * N); 1776 const VU16 rawA = LoadU(d, raw + 0xA * N); 1777 const VU16 rawB = LoadU(d, raw + 0xB * N); 1778 const VU16 rawC = LoadU(d, raw + 0xC * N); 1779 const VU16 rawD = LoadU(d, raw + 0xD * N); 1780 const VU16 rawE = LoadU(d, raw + 0xE * N); 1781 const VU16 rawF = LoadU(d, raw + 0xF * N); 1782 1783 // It is not obvious what the optimal partitioning looks like. To reduce the 1784 // number of constants, we want to minimize the number of distinct bit 1785 // lengths. 11+5 also requires 6-bit remnants with 4-bit leftovers. 1786 // 8+3 seems better: it is easier to scatter 3 bits into the MSBs. 
    const VU16 lo8 = Set(d, 0xFFu);

    // Lower 8 bits of all raw
    const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
    const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
    const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
    const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
    const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
    const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
    const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
    const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);

    // Three vectors, five 3bit remnants each, plus one 3bit in their MSB.
    const VU16 top0 = ShiftRight<8>(raw0);
    const VU16 top1 = ShiftRight<8>(raw1);
    const VU16 top2 = ShiftRight<8>(raw2);
    // Insert top raw bits into 3-bit groups within packed8..A. Moving the
    // mask along avoids masking each of raw0..E and enables OrAnd.
    VU16 next = Set(d, 0x38u);  // 0x7 << 3
    VU16 packed8 = OrAnd(top0, ShiftRight<5>(raw3), next);
    VU16 packed9 = OrAnd(top1, ShiftRight<5>(raw4), next);
    VU16 packedA = OrAnd(top2, ShiftRight<5>(raw5), next);
    next = ShiftLeft<3>(next);  // = 0x1C0
    packed8 = OrAnd(packed8, ShiftRight<2>(raw6), next);
    packed9 = OrAnd(packed9, ShiftRight<2>(raw7), next);
    packedA = OrAnd(packedA, ShiftRight<2>(raw8), next);
    next = ShiftLeft<3>(next);  // = 0xE00
    // Add(x, x) == ShiftLeft<1>(x).
    packed8 = OrAnd(packed8, Add(raw9, raw9), next);
    packed9 = OrAnd(packed9, Add(rawA, rawA), next);
    packedA = OrAnd(packedA, Add(rawB, rawB), next);
    next = ShiftLeft<3>(next);  // = 0x7000
    packed8 = OrAnd(packed8, ShiftLeft<4>(rawC), next);
    packed9 = OrAnd(packed9, ShiftLeft<4>(rawD), next);
    packedA = OrAnd(packedA, ShiftLeft<4>(rawE), next);

    // Scatter upper 3 bits of rawF into the upper bits.
    next = ShiftLeft<3>(next);  // = 0x8000u
    packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
    packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
    packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);

    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
  }

  // Inverse of Pack: 8-bit halves come from packed0..7, the upper 3 bits of
  // each raw vector from the 3-bit groups within packed8..A.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
    const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
    const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
    const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));

    const VU16 mask = Set(d, 0xFFu);  // Lowest 8 bits

    // downI = lower 8 bits of rawI, two per packed vector.
    const VU16 down0 = And(packed0, mask);
    const VU16 down1 = ShiftRight<8>(packed0);
    const VU16 down2 = And(packed1, mask);
    const VU16 down3 = ShiftRight<8>(packed1);
    const VU16 down4 = And(packed2, mask);
    const VU16 down5 = ShiftRight<8>(packed2);
    const VU16 down6 = And(packed3, mask);
    const VU16 down7 = ShiftRight<8>(packed3);
    const VU16 down8 = And(packed4, mask);
    const VU16 down9 = ShiftRight<8>(packed4);
    const VU16 downA = And(packed5, mask);
    const VU16 downB = ShiftRight<8>(packed5);
    const VU16 downC = And(packed6, mask);
    const VU16 downD = ShiftRight<8>(packed6);
    const VU16 downE = And(packed7, mask);
    const VU16 downF = ShiftRight<8>(packed7);

    // Three bits from packed8..A, eight bits from down0..F.
    const VU16 hi3 = Set(d, 0x700u);
    const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi3);
    const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi3);
    const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi3);

    const VU16 raw3 = OrAnd(down3, ShiftLeft<5>(packed8), hi3);
    const VU16 raw4 = OrAnd(down4, ShiftLeft<5>(packed9), hi3);
    const VU16 raw5 = OrAnd(down5, ShiftLeft<5>(packedA), hi3);

    const VU16 raw6 = OrAnd(down6, ShiftLeft<2>(packed8), hi3);
    const VU16 raw7 = OrAnd(down7, ShiftLeft<2>(packed9), hi3);
    const VU16 raw8 = OrAnd(down8, ShiftLeft<2>(packedA), hi3);

    const VU16 raw9 = OrAnd(down9, ShiftRight<1>(packed8), hi3);
    const VU16 rawA = OrAnd(downA, ShiftRight<1>(packed9), hi3);
    const VU16 rawB = OrAnd(downB, ShiftRight<1>(packedA), hi3);

    const VU16 rawC = OrAnd(downC, ShiftRight<4>(packed8), hi3);
    const VU16 rawD = OrAnd(downD, ShiftRight<4>(packed9), hi3);
    const VU16 rawE = OrAnd(downE, ShiftRight<4>(packedA), hi3);

    // Shift MSB into the top 3-of-11 and mask.
    const VU16 rawF = Or(downF, Xor3(And(ShiftRight<7>(packed8), hi3),
                                     And(ShiftRight<6>(packed9), hi3),
                                     And(ShiftRight<5>(packedA), hi3)));

    StoreU(raw0, d, raw + 0 * N);
    StoreU(raw1, d, raw + 1 * N);
    StoreU(raw2, d, raw + 2 * N);
    StoreU(raw3, d, raw + 3 * N);
    StoreU(raw4, d, raw + 4 * N);
    StoreU(raw5, d, raw + 5 * N);
    StoreU(raw6, d, raw + 6 * N);
    StoreU(raw7, d, raw + 7 * N);
    StoreU(raw8, d, raw + 8 * N);
    StoreU(raw9, d, raw + 9 * N);
    StoreU(rawA, d, raw + 0xA * N);
    StoreU(rawB, d, raw + 0xB * N);
    StoreU(rawC, d, raw + 0xC * N);
    StoreU(rawD, d, raw + 0xD * N);
    StoreU(rawE, d, raw + 0xE * N);
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<11>

template <>
struct Pack16<12> {
  // Packs 16 vectors of 12-bit values into 12 packed vectors.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // 8 vectors, each with 12+4 bits; top 8 bits are concatenated into
    // packed8 to packedB.
    const VU16 packed0 = Or(ShiftLeft<12>(raw8), raw0);
    const VU16 packed1 = Or(ShiftLeft<12>(raw9), raw1);
    const VU16 packed2 = Or(ShiftLeft<12>(rawA), raw2);
    const VU16 packed3 = Or(ShiftLeft<12>(rawB), raw3);
    const VU16 packed4 = Or(ShiftLeft<12>(rawC), raw4);
    const VU16 packed5 = Or(ShiftLeft<12>(rawD), raw5);
    const VU16 packed6 = Or(ShiftLeft<12>(rawE), raw6);
    const VU16 packed7 = Or(ShiftLeft<12>(rawF), raw7);

    // Masking after shifting left enables OrAnd.
    const VU16 hi8 = Set(d, 0xFF00u);
    const VU16 packed8 = OrAnd(ShiftRight<4>(raw8), ShiftLeft<4>(raw9), hi8);
    const VU16 packed9 = OrAnd(ShiftRight<4>(rawA), ShiftLeft<4>(rawB), hi8);
    const VU16 packedA = OrAnd(ShiftRight<4>(rawC), ShiftLeft<4>(rawD), hi8);
    const VU16 packedB = OrAnd(ShiftRight<4>(rawE), ShiftLeft<4>(rawF), hi8);
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);
  }

  // Inverse of Pack: low 12 bits of packed0..7 are raw0..7; raw8..F combine
  // the top 4 bits of packed0..7 with 8-bit halves of packed8..B.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
    const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
    const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
    const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
    const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));

    const VU16 mask = Set(d, 0xFFFu);  // Lowest 12 bits

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 mid8 = Set(d, 0xFF0u);  // upper 8 in lower 12
    const VU16 raw8 =
        OrAnd(ShiftRight<12>(packed0), ShiftLeft<4>(packed8), mid8);
    const VU16 raw9 =
        OrAnd(ShiftRight<12>(packed1), ShiftRight<4>(packed8), mid8);
    const VU16 rawA =
        OrAnd(ShiftRight<12>(packed2), ShiftLeft<4>(packed9), mid8);
    const VU16 rawB =
        OrAnd(ShiftRight<12>(packed3), ShiftRight<4>(packed9), mid8);
    const VU16 rawC =
        OrAnd(ShiftRight<12>(packed4), ShiftLeft<4>(packedA), mid8);
    const VU16 rawD =
        OrAnd(ShiftRight<12>(packed5), ShiftRight<4>(packedA), mid8);
    const VU16 rawE =
        OrAnd(ShiftRight<12>(packed6), ShiftLeft<4>(packedB), mid8);
    const VU16 rawF =
        OrAnd(ShiftRight<12>(packed7), ShiftRight<4>(packedB), mid8);
    StoreU(raw8, d, raw + 8 * N);
    StoreU(raw9, d, raw + 9 * N);
    StoreU(rawA, d, raw + 0xA * N);
    StoreU(rawB, d, raw + 0xB * N);
    StoreU(rawC, d, raw + 0xC * N);
    StoreU(rawD, d, raw + 0xD * N);
    StoreU(rawE, d, raw + 0xE * N);
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<12>

template <>
struct Pack16<13> {
  // Packs 16 vectors of 13-bit values into 13 packed vectors.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // As with 11 bits, it is not obvious what the optimal partitioning looks
    // like. We similarly go with an 8+5 split.
    const VU16 lo8 = Set(d, 0xFFu);

    // Lower 8 bits of all raw
    const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
    const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
    const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
    const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
    const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
    const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
    const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
    const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);

    // Five vectors, three 5bit remnants each, plus one 5bit in their MSB.
    const VU16 top0 = ShiftRight<8>(raw0);
    const VU16 top1 = ShiftRight<8>(raw1);
    const VU16 top2 = ShiftRight<8>(raw2);
    const VU16 top3 = ShiftRight<8>(raw3);
    const VU16 top4 = ShiftRight<8>(raw4);

    // Insert top raw bits into 5-bit groups within packed8..C. Moving the
    // mask along avoids masking each of raw0..E and enables OrAnd.
2109 VU16 next = Set(d, 0x3E0u); // 0x1F << 5 2110 VU16 packed8 = OrAnd(top0, ShiftRight<3>(raw5), next); 2111 VU16 packed9 = OrAnd(top1, ShiftRight<3>(raw6), next); 2112 VU16 packedA = OrAnd(top2, ShiftRight<3>(raw7), next); 2113 VU16 packedB = OrAnd(top3, ShiftRight<3>(raw8), next); 2114 VU16 packedC = OrAnd(top4, ShiftRight<3>(raw9), next); 2115 next = ShiftLeft<5>(next); 2116 packed8 = OrAnd(packed8, ShiftLeft<2>(rawA), next); 2117 packed9 = OrAnd(packed9, ShiftLeft<2>(rawB), next); 2118 packedA = OrAnd(packedA, ShiftLeft<2>(rawC), next); 2119 packedB = OrAnd(packedB, ShiftLeft<2>(rawD), next); 2120 packedC = OrAnd(packedC, ShiftLeft<2>(rawE), next); 2121 2122 // Scatter upper 5 bits of rawF into the upper bits. 2123 next = ShiftLeft<3>(next); // = 0x8000u 2124 packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next); 2125 packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next); 2126 packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next); 2127 packedB = OrAnd(packedB, ShiftLeft<4>(rawF), next); 2128 packedC = OrAnd(packedC, ShiftLeft<3>(rawF), next); 2129 2130 StoreU(packed8, d, packed_out + 8 * N); 2131 StoreU(packed9, d, packed_out + 9 * N); 2132 StoreU(packedA, d, packed_out + 0xA * N); 2133 StoreU(packedB, d, packed_out + 0xB * N); 2134 StoreU(packedC, d, packed_out + 0xC * N); 2135 } 2136 2137 template <class D> 2138 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, 2139 uint16_t* HWY_RESTRICT raw) const { 2140 using VU16 = Vec<decltype(d)>; 2141 const size_t N = Lanes(d); 2142 2143 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); 2144 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); 2145 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); 2146 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N)); 2147 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N)); 2148 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N)); 2149 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N)); 2150 const VU16 
packed7 = BitCast(d, LoadU(d, packed_in + 7 * N)); 2151 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N)); 2152 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N)); 2153 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N)); 2154 const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N)); 2155 const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N)); 2156 2157 const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits 2158 2159 const VU16 down0 = And(packed0, mask); 2160 const VU16 down1 = ShiftRight<8>(packed0); 2161 const VU16 down2 = And(packed1, mask); 2162 const VU16 down3 = ShiftRight<8>(packed1); 2163 const VU16 down4 = And(packed2, mask); 2164 const VU16 down5 = ShiftRight<8>(packed2); 2165 const VU16 down6 = And(packed3, mask); 2166 const VU16 down7 = ShiftRight<8>(packed3); 2167 const VU16 down8 = And(packed4, mask); 2168 const VU16 down9 = ShiftRight<8>(packed4); 2169 const VU16 downA = And(packed5, mask); 2170 const VU16 downB = ShiftRight<8>(packed5); 2171 const VU16 downC = And(packed6, mask); 2172 const VU16 downD = ShiftRight<8>(packed6); 2173 const VU16 downE = And(packed7, mask); 2174 const VU16 downF = ShiftRight<8>(packed7); 2175 2176 // Upper five bits from packed8..C, eight bits from down0..F. 
2177 const VU16 hi5 = Set(d, 0x1F00u); 2178 const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi5); 2179 const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi5); 2180 const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi5); 2181 const VU16 raw3 = OrAnd(down3, ShiftLeft<8>(packedB), hi5); 2182 const VU16 raw4 = OrAnd(down4, ShiftLeft<8>(packedC), hi5); 2183 2184 const VU16 raw5 = OrAnd(down5, ShiftLeft<3>(packed8), hi5); 2185 const VU16 raw6 = OrAnd(down6, ShiftLeft<3>(packed9), hi5); 2186 const VU16 raw7 = OrAnd(down7, ShiftLeft<3>(packedA), hi5); 2187 const VU16 raw8 = OrAnd(down8, ShiftLeft<3>(packed9), hi5); 2188 const VU16 raw9 = OrAnd(down9, ShiftLeft<3>(packedA), hi5); 2189 2190 const VU16 rawA = OrAnd(downA, ShiftRight<2>(packed8), hi5); 2191 const VU16 rawB = OrAnd(downB, ShiftRight<2>(packed9), hi5); 2192 const VU16 rawC = OrAnd(downC, ShiftRight<2>(packedA), hi5); 2193 const VU16 rawD = OrAnd(downD, ShiftRight<2>(packed9), hi5); 2194 const VU16 rawE = OrAnd(downE, ShiftRight<2>(packedA), hi5); 2195 2196 // Shift MSB into the top 5-of-11 and mask. 
2197 const VU16 p0 = Xor3(And(ShiftRight<7>(packed8), hi5), // 2198 And(ShiftRight<6>(packed9), hi5), 2199 And(ShiftRight<5>(packedA), hi5)); 2200 const VU16 p1 = Xor3(And(ShiftRight<4>(packedB), hi5), 2201 And(ShiftRight<3>(packedC), hi5), downF); 2202 const VU16 rawF = Or(p0, p1); 2203 2204 StoreU(raw0, d, raw + 0 * N); 2205 StoreU(raw1, d, raw + 1 * N); 2206 StoreU(raw2, d, raw + 2 * N); 2207 StoreU(raw3, d, raw + 3 * N); 2208 StoreU(raw4, d, raw + 4 * N); 2209 StoreU(raw5, d, raw + 5 * N); 2210 StoreU(raw6, d, raw + 6 * N); 2211 StoreU(raw7, d, raw + 7 * N); 2212 StoreU(raw8, d, raw + 8 * N); 2213 StoreU(raw9, d, raw + 9 * N); 2214 StoreU(rawA, d, raw + 0xA * N); 2215 StoreU(rawB, d, raw + 0xB * N); 2216 StoreU(rawC, d, raw + 0xC * N); 2217 StoreU(rawD, d, raw + 0xD * N); 2218 StoreU(rawE, d, raw + 0xE * N); 2219 StoreU(rawF, d, raw + 0xF * N); 2220 } 2221 }; // Pack16<13> 2222 2223 template <> 2224 struct Pack16<14> { 2225 template <class D> 2226 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw, 2227 uint16_t* HWY_RESTRICT packed_out) const { 2228 using VU16 = Vec<decltype(d)>; 2229 const size_t N = Lanes(d); 2230 const VU16 raw0 = LoadU(d, raw + 0 * N); 2231 const VU16 raw1 = LoadU(d, raw + 1 * N); 2232 const VU16 raw2 = LoadU(d, raw + 2 * N); 2233 const VU16 raw3 = LoadU(d, raw + 3 * N); 2234 const VU16 raw4 = LoadU(d, raw + 4 * N); 2235 const VU16 raw5 = LoadU(d, raw + 5 * N); 2236 const VU16 raw6 = LoadU(d, raw + 6 * N); 2237 const VU16 raw7 = LoadU(d, raw + 7 * N); 2238 const VU16 raw8 = LoadU(d, raw + 8 * N); 2239 const VU16 raw9 = LoadU(d, raw + 9 * N); 2240 const VU16 rawA = LoadU(d, raw + 0xA * N); 2241 const VU16 rawB = LoadU(d, raw + 0xB * N); 2242 const VU16 rawC = LoadU(d, raw + 0xC * N); 2243 const VU16 rawD = LoadU(d, raw + 0xD * N); 2244 const VU16 rawE = LoadU(d, raw + 0xE * N); 2245 const VU16 rawF = LoadU(d, raw + 0xF * N); 2246 2247 // 14 vectors, each with 14+2 bits; two raw vectors are scattered 2248 // across the upper 2 bits. 
2249 const VU16 hi2 = Set(d, 0xC000u); 2250 const VU16 packed0 = Or(raw0, ShiftLeft<14>(rawE)); 2251 const VU16 packed1 = OrAnd(raw1, ShiftLeft<12>(rawE), hi2); 2252 const VU16 packed2 = OrAnd(raw2, ShiftLeft<10>(rawE), hi2); 2253 const VU16 packed3 = OrAnd(raw3, ShiftLeft<8>(rawE), hi2); 2254 const VU16 packed4 = OrAnd(raw4, ShiftLeft<6>(rawE), hi2); 2255 const VU16 packed5 = OrAnd(raw5, ShiftLeft<4>(rawE), hi2); 2256 const VU16 packed6 = OrAnd(raw6, ShiftLeft<2>(rawE), hi2); 2257 const VU16 packed7 = Or(raw7, ShiftLeft<14>(rawF)); 2258 const VU16 packed8 = OrAnd(raw8, ShiftLeft<12>(rawF), hi2); 2259 const VU16 packed9 = OrAnd(raw9, ShiftLeft<10>(rawF), hi2); 2260 const VU16 packedA = OrAnd(rawA, ShiftLeft<8>(rawF), hi2); 2261 const VU16 packedB = OrAnd(rawB, ShiftLeft<6>(rawF), hi2); 2262 const VU16 packedC = OrAnd(rawC, ShiftLeft<4>(rawF), hi2); 2263 const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi2); 2264 2265 StoreU(packed0, d, packed_out + 0 * N); 2266 StoreU(packed1, d, packed_out + 1 * N); 2267 StoreU(packed2, d, packed_out + 2 * N); 2268 StoreU(packed3, d, packed_out + 3 * N); 2269 StoreU(packed4, d, packed_out + 4 * N); 2270 StoreU(packed5, d, packed_out + 5 * N); 2271 StoreU(packed6, d, packed_out + 6 * N); 2272 StoreU(packed7, d, packed_out + 7 * N); 2273 StoreU(packed8, d, packed_out + 8 * N); 2274 StoreU(packed9, d, packed_out + 9 * N); 2275 StoreU(packedA, d, packed_out + 0xA * N); 2276 StoreU(packedB, d, packed_out + 0xB * N); 2277 StoreU(packedC, d, packed_out + 0xC * N); 2278 StoreU(packedD, d, packed_out + 0xD * N); 2279 } 2280 2281 template <class D> 2282 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in, 2283 uint16_t* HWY_RESTRICT raw) const { 2284 using VU16 = Vec<decltype(d)>; 2285 const size_t N = Lanes(d); 2286 2287 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N)); 2288 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N)); 2289 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N)); 2290 
// (continuation of Pack16<14>::Unpack)
    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
    const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
    const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
    const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
    const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
    const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
    const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N));

    const VU16 mask = Set(d, 0x3FFFu);  // Lowest 14 bits

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(packed8, mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(packed9, mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(packedA, mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(packedB, mask);
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(packedC, mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(packedD, mask);
    StoreU(rawD, d, raw + 0xD * N);

    // rawE is the concatenation of the top two bits in packed0..6.
    // AndNot(mask, p) isolates the upper 2 bits before shifting into place;
    // packed0 needs no AndNot because ShiftRight<14> discards the low bits.
    const VU16 E0 = Xor3(ShiftRight<14>(packed0),  //
                         ShiftRight<12>(AndNot(mask, packed1)),
                         ShiftRight<10>(AndNot(mask, packed2)));
    const VU16 E1 = Xor3(ShiftRight<8>(AndNot(mask, packed3)),
                         ShiftRight<6>(AndNot(mask, packed4)),
                         ShiftRight<4>(AndNot(mask, packed5)));
    const VU16 rawE = Xor3(ShiftRight<2>(AndNot(mask, packed6)), E0, E1);
    // rawF likewise, from the top two bits of packed7..D.
    const VU16 F0 = Xor3(ShiftRight<14>(AndNot(mask, packed7)),
                         ShiftRight<12>(AndNot(mask, packed8)),
                         ShiftRight<10>(AndNot(mask, packed9)));
    const VU16 F1 = Xor3(ShiftRight<8>(AndNot(mask, packedA)),
                         ShiftRight<6>(AndNot(mask, packedB)),
                         ShiftRight<4>(AndNot(mask, packedC)));
    const VU16 rawF = Xor3(ShiftRight<2>(AndNot(mask, packedD)), F0, F1);
    StoreU(rawE, d, raw + 0xE * N);
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<14>

template <>
struct Pack16<15> {
  // Packs 16 vectors of 15-bit values from `raw` into 15 vectors at
  // `packed_out`. The lower 15 bits of packed0..E hold raw0..E; rawF is
  // spread across the upper bit of all 15 packed vectors.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // 15 vectors, each with 15+1 bits; one raw vector (rawF) is scattered
    // across the upper bit.
    const VU16 hi1 = Set(d, 0x8000u);
    // ShiftLeft<15> already clears the lower bits, so packed0 needs no mask;
    // elsewhere OrAnd(o, a, b) = o | (a & b) isolates the single bit.
    const VU16 packed0 = Or(raw0, ShiftLeft<15>(rawF));
    const VU16 packed1 = OrAnd(raw1, ShiftLeft<14>(rawF), hi1);
    const VU16 packed2 = OrAnd(raw2, ShiftLeft<13>(rawF), hi1);
    const VU16 packed3 = OrAnd(raw3, ShiftLeft<12>(rawF), hi1);
    const VU16 packed4 = OrAnd(raw4, ShiftLeft<11>(rawF), hi1);
    const VU16 packed5 = OrAnd(raw5, ShiftLeft<10>(rawF), hi1);
    const VU16 packed6 = OrAnd(raw6, ShiftLeft<9>(rawF), hi1);
    const VU16 packed7 = OrAnd(raw7, ShiftLeft<8>(rawF), hi1);
    const VU16 packed8 = OrAnd(raw8, ShiftLeft<7>(rawF), hi1);
    const VU16 packed9 = OrAnd(raw9, ShiftLeft<6>(rawF), hi1);
    const VU16 packedA = OrAnd(rawA, ShiftLeft<5>(rawF), hi1);
    const VU16 packedB = OrAnd(rawB, ShiftLeft<4>(rawF), hi1);
    const VU16 packedC = OrAnd(rawC, ShiftLeft<3>(rawF), hi1);
    const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi1);
    const VU16 packedE = OrAnd(rawE, ShiftLeft<1>(rawF), hi1);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);
    StoreU(packedC, d, packed_out + 0xC * N);
    StoreU(packedD, d, packed_out + 0xD * N);
    StoreU(packedE, d, packed_out + 0xE * N);
  }

  // Inverse of Pack: loads 15 packed vectors from `packed_in` and
  // reconstructs the 16 raw vectors of 15-bit values at `raw`.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
    const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
    const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
    const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
    const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
    const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
    const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N));
    const VU16 packedE = BitCast(d, LoadU(d, packed_in + 0xE * N));

    const VU16 mask = Set(d, 0x7FFFu);  // Lowest 15 bits

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(packed8, mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(packed9, mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(packedA, mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(packedB, mask);
    StoreU(rawB, d, raw + 0xB * N);
// (continuation of Pack16<15>::Unpack)
    const VU16 rawC = And(packedC, mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(packedD, mask);
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = And(packedE, mask);
    StoreU(rawE, d, raw + 0xE * N);

    // rawF is the concatenation of the top bit in packed0..E.
    // AndNot(mask, p) isolates the top bit before shifting into place;
    // packed0 needs no AndNot because ShiftRight<15> discards the low bits.
    const VU16 F0 = Xor3(ShiftRight<15>(packed0),  //
                         ShiftRight<14>(AndNot(mask, packed1)),
                         ShiftRight<13>(AndNot(mask, packed2)));
    const VU16 F1 = Xor3(ShiftRight<12>(AndNot(mask, packed3)),
                         ShiftRight<11>(AndNot(mask, packed4)),
                         ShiftRight<10>(AndNot(mask, packed5)));
    const VU16 F2 = Xor3(ShiftRight<9>(AndNot(mask, packed6)),
                         ShiftRight<8>(AndNot(mask, packed7)),
                         ShiftRight<7>(AndNot(mask, packed8)));
    const VU16 F3 = Xor3(ShiftRight<6>(AndNot(mask, packed9)),
                         ShiftRight<5>(AndNot(mask, packedA)),
                         ShiftRight<4>(AndNot(mask, packedB)));
    const VU16 F4 = Xor3(ShiftRight<3>(AndNot(mask, packedC)),
                         ShiftRight<2>(AndNot(mask, packedD)),
                         ShiftRight<1>(AndNot(mask, packedE)));
    const VU16 rawF = Xor3(F0, F1, Xor3(F2, F3, F4));
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<15>

// 16-bit "packing" is the identity transform: the 16 raw vectors are copied
// unchanged; provided so callers can use Pack16<kBits> uniformly.
template <>
struct Pack16<16> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    StoreU(raw0, d, packed_out + 0 * N);
    StoreU(raw1, d, packed_out + 1 * N);
    StoreU(raw2, d, packed_out + 2 * N);
    StoreU(raw3, d, packed_out + 3 * N);
    StoreU(raw4, d, packed_out + 4 * N);
    StoreU(raw5, d, packed_out + 5 * N);
    StoreU(raw6, d, packed_out + 6 * N);
    StoreU(raw7, d, packed_out + 7 * N);
    StoreU(raw8, d, packed_out + 8 * N);
    StoreU(raw9, d, packed_out + 9 * N);
    StoreU(rawA, d, packed_out + 0xA * N);
    StoreU(rawB, d, packed_out + 0xB * N);
    StoreU(rawC, d, packed_out + 0xC * N);
    StoreU(rawD, d, packed_out + 0xD * N);
    StoreU(rawE, d, packed_out + 0xE * N);
    StoreU(rawF, d, packed_out + 0xF * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    const VU16 raw0 = BitCast(d, LoadU(d, packed_in + 0 * N));
    const VU16 raw1 = BitCast(d, LoadU(d, packed_in + 1 * N));
    const VU16 raw2 = BitCast(d, LoadU(d, packed_in + 2 * N));
    const VU16 raw3 = BitCast(d, LoadU(d, packed_in + 3 * N));
    const VU16 raw4 = BitCast(d, LoadU(d, packed_in + 4 * N));
    const VU16 raw5 = BitCast(d, LoadU(d, packed_in + 5 * N));
    const VU16 raw6 = BitCast(d, LoadU(d, packed_in + 6 * N));
    const VU16 raw7 = BitCast(d, LoadU(d, packed_in + 7 * N));
    const VU16 raw8 = BitCast(d, LoadU(d, packed_in + 8 * N));
    const VU16 raw9 = BitCast(d, LoadU(d, packed_in + 9 * N));
    const VU16 rawA = BitCast(d, LoadU(d, packed_in + 0xA * N));
    const VU16 rawB = BitCast(d, LoadU(d, packed_in + 0xB * N));
    const VU16 rawC = BitCast(d, LoadU(d, packed_in + 0xC * N));
    const VU16 rawD = BitCast(d, LoadU(d, packed_in + 0xD * N));
    const VU16 rawE = BitCast(d, LoadU(d, packed_in + 0xE * N));
    const VU16 rawF = BitCast(d, LoadU(d, packed_in + 0xF * N));

    StoreU(raw0, d, raw + 0 * N);
    StoreU(raw1, d, raw + 1 * N);
    StoreU(raw2, d, raw + 2 * N);
    StoreU(raw3, d, raw + 3 * N);
    StoreU(raw4, d, raw + 4 * N);
    StoreU(raw5, d, raw + 5 * N);
    StoreU(raw6, d, raw + 6 * N);
    StoreU(raw7, d, raw + 7 * N);
    StoreU(raw8, d, raw + 8 * N);
    StoreU(raw9, d, raw + 9 * N);
    StoreU(rawA, d, raw + 0xA * N);
    StoreU(rawB, d, raw + 0xB * N);
    StoreU(rawC, d, raw + 0xC * N);
    StoreU(rawD, d, raw + 0xD * N);
    StoreU(rawE, d, raw + 0xE * N);
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<16>

// The supported packing types for 32/64 bits.
enum BlockPackingType {
  // Simple fixed bit-packing.
  kBitPacked,
  // Bit packing after subtracting a `frame of reference` value from input.
  kFoRBitPacked,
};

namespace detail {

// Generates the implementation for bit-packing/un-packing `T` type numbers
// where each number takes `kBits` bits.
// `S` is the remainder bits left from the previous bit-packed block.
// `kLoadPos` is the offset from which the next vector block should be loaded.
// `kStorePos` is the offset into which the next vector block should be stored.
// `BlockPackingType` is the type of packing/unpacking for this block.
template <typename T, size_t kBits, size_t S, size_t kLoadPos, size_t kStorePos,
          BlockPackingType block_packing_type>
struct BitPackUnroller {
  // Bits per lane of T (8/16/32/64).
  static constexpr size_t B = sizeof(T) * 8;

  // Recursively packs raw vectors into `packed_out`, accumulating partial
  // bits in `out` until a full lane's worth (B bits) is available to store.
  template <class D, typename V>
  static inline void Pack(D d, const T* HWY_RESTRICT raw,
                          T* HWY_RESTRICT packed_out, const V& mask,
                          const V& frame_of_reference, V& in, V& out) {
    // Avoid compilation errors and unnecessary template instantiation if
    // compiling in C++11 or C++14 mode
    using NextUnroller = BitPackUnroller<
        T, kBits, ((S <= B) ? (S + ((S < B) ? kBits : 0)) : (S % B)),
        kLoadPos + static_cast<size_t>(S < B),
        kStorePos + static_cast<size_t>(S > B), block_packing_type>;

    // Silence unused-parameter warnings in branches that do not use them.
    (void)raw;
    (void)mask;
    (void)in;

    const size_t N = Lanes(d);
    HWY_IF_CONSTEXPR(S >= B) {
      // `out` holds a full lane of packed bits; flush it.
      StoreU(out, d, packed_out + kStorePos * N);
      HWY_IF_CONSTEXPR(S == B) { return; }
      HWY_IF_CONSTEXPR(S != B) {
        // Carry the bits of `in` that did not fit into the flushed lane.
        constexpr size_t shr_amount = (kBits - S % B) % B;
        out = ShiftRight<shr_amount>(in);
        // NextUnroller is a typedef for
        // Unroller<T, kBits, S % B, kLoadPos, kStorePos + 1> if S > B is true
        return NextUnroller::Pack(d, raw, packed_out, mask, frame_of_reference,
                                  in, out);
      }
    }
    HWY_IF_CONSTEXPR(S < B) {
      HWY_IF_CONSTEXPR(block_packing_type == BlockPackingType::kBitPacked) {
        in = LoadU(d, raw + kLoadPos * N);
      }
      HWY_IF_CONSTEXPR(block_packing_type == BlockPackingType::kFoRBitPacked) {
        in = Sub(LoadU(d, raw + kLoadPos * N), frame_of_reference);
      }
      // Optimize for the case when `S` is zero.
      // We can skip `Or` + ShiftLeft` to align `in`.
      HWY_IF_CONSTEXPR(S == 0) { out = in; }
      HWY_IF_CONSTEXPR(S != 0) { out = Or(out, ShiftLeft<S % B>(in)); }
      // NextUnroller is a typedef for
      // Unroller<T, kBits, S + kBits, kLoadPos + 1, kStorePos> if S < B is true
      return NextUnroller::Pack(d, raw, packed_out, mask, frame_of_reference,
                                in, out);
    }
  }

  // Recursively unpacks packed vectors from `packed_in` into raw values,
  // splitting each loaded lane of packed bits into `kBits`-wide fields.
  template <class D, typename V>
  static inline void Unpack(D d, const T* HWY_RESTRICT packed_in,
                            T* HWY_RESTRICT raw, const V& mask,
                            const V& frame_of_reference, V& in, V& out) {
    // Avoid compilation errors and unnecessary template instantiation if
    // compiling in C++11 or C++14 mode
    using NextUnroller = BitPackUnroller<
        T, kBits, ((S <= B) ? (S + ((S < B) ? kBits : 0)) : (S % B)),
        kLoadPos + static_cast<size_t>(S > B),
        kStorePos + static_cast<size_t>(S < B), block_packing_type>;

    // Silence unused-parameter warnings in branches that do not use them.
    (void)packed_in;
    (void)mask;
    (void)in;

    const size_t N = Lanes(d);
    HWY_IF_CONSTEXPR(S >= B) {
      HWY_IF_CONSTEXPR(S == B) {
        V bitpacked_output = out;
        HWY_IF_CONSTEXPR(block_packing_type ==
                         BlockPackingType::kFoRBitPacked) {
          // Undo the frame-of-reference subtraction applied during packing.
          bitpacked_output = Add(bitpacked_output, frame_of_reference);
        }
        StoreU(bitpacked_output, d, raw + kStorePos * N);
        return;
      }
      HWY_IF_CONSTEXPR(S != B) {
        // A raw value straddles two packed lanes: merge the low bits already
        // in `out` with the high bits from the next packed lane.
        in = LoadU(d, packed_in + kLoadPos * N);
        constexpr size_t shl_amount = (kBits - S % B) % B;
        out = And(Or(out, ShiftLeft<shl_amount>(in)), mask);
        // NextUnroller is a typedef for
        // Unroller<T, kBits, S % B, kLoadPos + 1, kStorePos> if S > B is true
        return NextUnroller::Unpack(d, packed_in, raw, mask, frame_of_reference,
                                    in, out);
      }
    }
    HWY_IF_CONSTEXPR(S < B) {
      V bitpacked_output = out;
      HWY_IF_CONSTEXPR(block_packing_type == BlockPackingType::kFoRBitPacked) {
        // Undo the frame-of-reference subtraction applied during packing.
        bitpacked_output = Add(bitpacked_output, frame_of_reference);
      }
      StoreU(bitpacked_output, d, raw + kStorePos * N);
      HWY_IF_CONSTEXPR(S + kBits < B) {
        // Optimize for the case when `S` is zero.
        // We can skip the `ShiftRight` to align `in`.
        HWY_IF_CONSTEXPR(S == 0) { out = And(in, mask); }
        HWY_IF_CONSTEXPR(S != 0) { out = And(ShiftRight<S % B>(in), mask); }
      }
      HWY_IF_CONSTEXPR(S + kBits >= B) { out = ShiftRight<S % B>(in); }
      // NextUnroller is a typedef for
      // Unroller<T, kBits, S + kBits, kLoadPos, kStorePos + 1> if S < B is true
      return NextUnroller::Unpack(d, packed_in, raw, mask, frame_of_reference,
                                  in, out);
    }
  }
};

// Computes the highest power of two that divides `kBits` (lowest set bit).
template <size_t kBits>
constexpr size_t NumLoops() {
  return (kBits & ~(kBits - 1));
}

// Number of packed vectors consumed/produced per outer loop iteration.
template <size_t kBits>
constexpr size_t PackedIncr() {
  return kBits / NumLoops<kBits>();
}

// Number of raw (unpacked) vectors consumed/produced per outer loop iteration.
template <typename T, size_t kBits>
constexpr size_t UnpackedIncr() {
  return (sizeof(T) * 8) / NumLoops<kBits>();
}

// Mask with the lowest `kBits` bits set (32-bit lanes).
template <size_t kBits>
constexpr uint32_t MaskBits32() {
  return static_cast<uint32_t>((1ull << kBits) - 1);
}

// Mask with the lowest `kBits` bits set (64-bit lanes).
template <size_t kBits>
constexpr uint64_t MaskBits64() {
  return (uint64_t{1} << kBits) - 1;
}
// Specialization: shifting a 64-bit value by 64 would be undefined behavior.
template <>
constexpr uint64_t MaskBits64<64>() {
  return ~uint64_t{0};
}

}  // namespace detail

template <size_t kBits>  // <= 32
struct Pack32 {
  // Packs `kBits`-bit values from `raw` into `packed_out`. With
  // kFoRBitPacked, `frame_of_reference_value` is subtracted before packing.
  template <class D,
            BlockPackingType block_packing_type = BlockPackingType::kBitPacked>
  HWY_INLINE void Pack(D d, const uint32_t* HWY_RESTRICT raw,
                       uint32_t* HWY_RESTRICT packed_out,
                       const uint32_t frame_of_reference_value = 0) const {
    using V = VFromD<D>;
    const V mask = Set(d, detail::MaskBits32<kBits>());
    const V frame_of_reference = Set(d, frame_of_reference_value);
    for (size_t i = 0; i < detail::NumLoops<kBits>(); ++i) {
      V in = Zero(d);
      V out = Zero(d);
// (continuation of Pack32::Pack loop body)
      detail::BitPackUnroller<uint32_t, kBits, 0, 0, 0,
                              block_packing_type>::Pack(d, raw, packed_out,
                                                        mask,
                                                        frame_of_reference, in,
                                                        out);
      raw += detail::UnpackedIncr<uint32_t, kBits>() * Lanes(d);
      packed_out += detail::PackedIncr<kBits>() * Lanes(d);
    }
  }

  // Unpacks `kBits`-bit values from `packed_in` into `raw`. With
  // kFoRBitPacked, `frame_of_reference_value` is added back after unpacking.
  template <class D,
            BlockPackingType block_packing_type = BlockPackingType::kBitPacked>
  HWY_INLINE void Unpack(D d, const uint32_t* HWY_RESTRICT packed_in,
                         uint32_t* HWY_RESTRICT raw,
                         const uint32_t frame_of_reference_value = 0) const {
    using V = VFromD<D>;
    const V mask = Set(d, detail::MaskBits32<kBits>());
    const V frame_of_reference = Set(d, frame_of_reference_value);
    for (size_t i = 0; i < detail::NumLoops<kBits>(); ++i) {
      // Prime the unroller with the first packed vector and its low field.
      V in = LoadU(d, packed_in + 0 * Lanes(d));
      V out = And(in, mask);
      detail::BitPackUnroller<uint32_t, kBits, kBits, 1, 0,
                              block_packing_type>::Unpack(d, packed_in, raw,
                                                          mask,
                                                          frame_of_reference,
                                                          in, out);
      raw += detail::UnpackedIncr<uint32_t, kBits>() * Lanes(d);
      packed_in += detail::PackedIncr<kBits>() * Lanes(d);
    }
  }
};

template <size_t kBits>  // <= 64
struct Pack64 {
  // Packs `kBits`-bit values from `raw` into `packed_out`. With
  // kFoRBitPacked, `frame_of_reference_value` is subtracted before packing.
  template <class D,
            BlockPackingType block_packing_type = BlockPackingType::kBitPacked>
  HWY_INLINE void Pack(D d, const uint64_t* HWY_RESTRICT raw,
                       uint64_t* HWY_RESTRICT packed_out,
                       const uint64_t frame_of_reference_value = 0) const {
    using V = VFromD<D>;
    const V mask = Set(d, detail::MaskBits64<kBits>());
    const V frame_of_reference = Set(d, frame_of_reference_value);
    for (size_t i = 0; i < detail::NumLoops<kBits>(); ++i) {
      V in = Zero(d);
      V out = Zero(d);
      detail::BitPackUnroller<uint64_t, kBits, 0, 0, 0,
                              block_packing_type>::Pack(d, raw, packed_out,
                                                        mask,
                                                        frame_of_reference, in,
                                                        out);
      raw += detail::UnpackedIncr<uint64_t, kBits>() * Lanes(d);
      packed_out += detail::PackedIncr<kBits>() * Lanes(d);
    }
  }

  // Unpacks `kBits`-bit values from `packed_in` into `raw`. With
  // kFoRBitPacked, `frame_of_reference_value` is added back after unpacking.
  template <class D,
            BlockPackingType block_packing_type = BlockPackingType::kBitPacked>
  HWY_INLINE void Unpack(D d, const uint64_t* HWY_RESTRICT packed_in,
                         uint64_t* HWY_RESTRICT raw,
                         const uint64_t frame_of_reference_value = 0) const {
    using V = VFromD<D>;
    const V mask = Set(d, detail::MaskBits64<kBits>());
    const V frame_of_reference = Set(d, frame_of_reference_value);
    for (size_t i = 0; i < detail::NumLoops<kBits>(); ++i) {
      // Prime the unroller with the first packed vector and its low field.
      V in = LoadU(d, packed_in + 0 * Lanes(d));
      V out = And(in, mask);
      detail::BitPackUnroller<uint64_t, kBits, kBits, 1, 0,
                              block_packing_type>::Unpack(d, packed_in, raw,
                                                          mask,
                                                          frame_of_reference,
                                                          in, out);
      raw += detail::UnpackedIncr<uint64_t, kBits>() * Lanes(d);
      packed_in += detail::PackedIncr<kBits>() * Lanes(d);
    }
  }
};

// NOLINTNEXTLINE(google-readability-namespace-comments)
}  // namespace HWY_NAMESPACE
}  // namespace hwy
HWY_AFTER_NAMESPACE();

#endif  // HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_