tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

bit_pack-inl.h (114998B)


      1 // Copyright 2022 Google LLC
      2 // SPDX-License-Identifier: Apache-2.0
      3 //
      4 // Licensed under the Apache License, Version 2.0 (the "License");
      5 // you may not use this file except in compliance with the License.
      6 // You may obtain a copy of the License at
      7 //
      8 //      http://www.apache.org/licenses/LICENSE-2.0
      9 //
     10 // Unless required by applicable law or agreed to in writing, software
     11 // distributed under the License is distributed on an "AS IS" BASIS,
     12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
     13 // See the License for the specific language governing permissions and
     14 // limitations under the License.
     15 
     16 #include <stddef.h>
     17 #include <stdint.h>
     18 
     19 #include "hwy/base.h"
     20 
     21 // Per-target include guard
     22 // clang-format off
     23 #if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == defined(HWY_TARGET_TOGGLE)  // NOLINT
     24 // clang-format on
     25 #ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
     26 #undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
     27 #else
     28 #define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
     29 #endif
     30 
     31 #include "hwy/highway.h"
     32 
     33 HWY_BEFORE_NAMESPACE();
     34 namespace hwy {
     35 namespace HWY_NAMESPACE {
     36 
     37 // The entry points are class templates specialized below for each number of
     38 // bits. Each provides Pack and Unpack member functions which load (Pack) or
     39 // store (Unpack) B raw vectors, and store (Pack) or load (Unpack) a number of
     40 // packed vectors equal to kBits. B denotes the bits per lane: 8 for Pack8, 16
     41 // for Pack16, 32 for Pack32 which is also the upper bound for kBits.
// Primary template: deliberately empty. Only the specializations below
// provide Pack/Unpack, so instantiating an unsupported bit count fails to
// compile (no members to call).
template <size_t kBits>  // <= 8
struct Pack8 {};
// Primary template for 16-bit lanes: deliberately empty, same pattern as
// Pack8 — only specializations provide Pack/Unpack.
template <size_t kBits>  // <= 16
struct Pack16 {};
     46 
     47 template <>
     48 struct Pack8<1> {
     49  template <class D8>
     50  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
     51                       uint8_t* HWY_RESTRICT packed_out) const {
     52    const RepartitionToWide<decltype(d8)> d16;
     53    using VU16 = Vec<decltype(d16)>;
     54    const size_t N8 = Lanes(d8);
     55    // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
     56    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
     57    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
     58    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
     59    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
     60    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
     61    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
     62    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
     63    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
     64 
     65    const VU16 packed =
     66        Xor3(Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)),
     67             Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)),
     68             Xor3(ShiftLeft<2>(raw2), ShiftLeft<1>(raw1), raw0));
     69    StoreU(BitCast(d8, packed), d8, packed_out);
     70  }
     71 
     72  template <class D8>
     73  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
     74                         uint8_t* HWY_RESTRICT raw) const {
     75    const RepartitionToWide<decltype(d8)> d16;
     76    using VU16 = Vec<decltype(d16)>;
     77    const size_t N8 = Lanes(d8);
     78    const VU16 mask = Set(d16, 0x0101u);  // LSB in each byte
     79 
     80    const VU16 packed = BitCast(d16, LoadU(d8, packed_in));
     81 
     82    const VU16 raw0 = And(packed, mask);
     83    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
     84 
     85    const VU16 raw1 = And(ShiftRight<1>(packed), mask);
     86    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
     87 
     88    const VU16 raw2 = And(ShiftRight<2>(packed), mask);
     89    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
     90 
     91    const VU16 raw3 = And(ShiftRight<3>(packed), mask);
     92    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
     93 
     94    const VU16 raw4 = And(ShiftRight<4>(packed), mask);
     95    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
     96 
     97    const VU16 raw5 = And(ShiftRight<5>(packed), mask);
     98    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
     99 
    100    const VU16 raw6 = And(ShiftRight<6>(packed), mask);
    101    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
    102 
    103    const VU16 raw7 = And(ShiftRight<7>(packed), mask);
    104    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
    105  }
    106 };  // Pack8<1>
    107 
    108 template <>
    109 struct Pack8<2> {
    110  template <class D8>
    111  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
    112                       uint8_t* HWY_RESTRICT packed_out) const {
    113    const RepartitionToWide<decltype(d8)> d16;
    114    using VU16 = Vec<decltype(d16)>;
    115    const size_t N8 = Lanes(d8);
    116    // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
    117    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    118    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    119    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    120    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    121    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    122    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    123    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    124    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
    125 
    126    const VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<4>(raw4),
    127                              Or(ShiftLeft<2>(raw2), raw0));
    128    const VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<4>(raw5),
    129                              Or(ShiftLeft<2>(raw3), raw1));
    130    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    131    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    132  }
    133 
    134  template <class D8>
    135  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
    136                         uint8_t* HWY_RESTRICT raw) const {
    137    const RepartitionToWide<decltype(d8)> d16;
    138    using VU16 = Vec<decltype(d16)>;
    139    const size_t N8 = Lanes(d8);
    140    const VU16 mask = Set(d16, 0x0303u);  // Lowest 2 bits per byte
    141 
    142    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    143    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    144 
    145    const VU16 raw0 = And(packed0, mask);
    146    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
    147 
    148    const VU16 raw1 = And(packed1, mask);
    149    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
    150 
    151    const VU16 raw2 = And(ShiftRight<2>(packed0), mask);
    152    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
    153 
    154    const VU16 raw3 = And(ShiftRight<2>(packed1), mask);
    155    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
    156 
    157    const VU16 raw4 = And(ShiftRight<4>(packed0), mask);
    158    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
    159 
    160    const VU16 raw5 = And(ShiftRight<4>(packed1), mask);
    161    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
    162 
    163    const VU16 raw6 = And(ShiftRight<6>(packed0), mask);
    164    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
    165 
    166    const VU16 raw7 = And(ShiftRight<6>(packed1), mask);
    167    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
    168  }
    169 };  // Pack8<2>
    170 
template <>
struct Pack8<3> {
 // 3-bit packing: eight raw vectors are stored in three packed vectors.
 // raw0..2 and raw4..6 occupy the lower six bits of packed0..2; raw3/raw7
 // are first combined into a 6-bit temporary (packed3) whose bit pairs are
 // then spread across the upper two bits of packed0..2.
 template <class D8>
 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                      uint8_t* HWY_RESTRICT packed_out) const {
   const RepartitionToWide<decltype(d8)> d16;
   using VU16 = Vec<decltype(d16)>;
   const size_t N8 = Lanes(d8);
   const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
   const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
   const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
   const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
   const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
   const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
   const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
   const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

   // The upper two bits of these three will be filled with packed3 (6 bits).
   VU16 packed0 = Or(ShiftLeft<3>(raw4), raw0);
   VU16 packed1 = Or(ShiftLeft<3>(raw5), raw1);
   VU16 packed2 = Or(ShiftLeft<3>(raw6), raw2);
   const VU16 packed3 = Or(ShiftLeft<3>(raw7), raw3);

   const VU16 hi2 = Set(d16, 0xC0C0u);
   // Insert successive bit pairs of packed3 (bits 4-5, 2-3, 0-1) into the
   // upper two bits of packed0, packed1 and packed2, respectively.
   packed0 = OrAnd(packed0, ShiftLeft<2>(packed3), hi2);
   packed1 = OrAnd(packed1, ShiftLeft<4>(packed3), hi2);
   packed2 = OrAnd(packed2, ShiftLeft<6>(packed3), hi2);
   StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
   StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
   StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
 }

 template <class D8>
 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                        uint8_t* HWY_RESTRICT raw) const {
   const RepartitionToWide<decltype(d8)> d16;
   using VU16 = Vec<decltype(d16)>;
   const size_t N8 = Lanes(d8);
   const VU16 mask = Set(d16, 0x0707u);  // Lowest 3 bits per byte

   const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
   const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
   const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));

   // raw0..2 are the low three bits of packed0..2.
   const VU16 raw0 = And(packed0, mask);
   StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);

   const VU16 raw1 = And(packed1, mask);
   StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);

   const VU16 raw2 = And(packed2, mask);
   StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);

   // raw4..6 are bits 3-5 of packed0..2.
   const VU16 raw4 = And(ShiftRight<3>(packed0), mask);
   StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);

   const VU16 raw5 = And(ShiftRight<3>(packed1), mask);
   StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);

   const VU16 raw6 = And(ShiftRight<3>(packed2), mask);
   StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);

   // raw73 is the concatenation of the upper two bits in packed0..2.
   const VU16 hi2 = Set(d16, 0xC0C0u);
   const VU16 raw73 = Xor3(ShiftRight<6>(And(packed2, hi2)),  //
                           ShiftRight<4>(And(packed1, hi2)),
                           ShiftRight<2>(And(packed0, hi2)));

   // raw73's low three bits are raw3, the next three are raw7 (mirrors the
   // packed3 layout used by Pack).
   const VU16 raw3 = And(mask, raw73);
   StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);

   const VU16 raw7 = And(mask, ShiftRight<3>(raw73));
   StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
 }
};  // Pack8<3>
    246 
    247 template <>
    248 struct Pack8<4> {
    249  template <class D8>
    250  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
    251                       uint8_t* HWY_RESTRICT packed_out) const {
    252    const RepartitionToWide<decltype(d8)> d16;
    253    using VU16 = Vec<decltype(d16)>;
    254    const size_t N8 = Lanes(d8);
    255    // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
    256    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    257    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    258    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    259    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    260    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    261    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    262    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    263    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
    264 
    265    const VU16 packed0 = Or(ShiftLeft<4>(raw2), raw0);
    266    const VU16 packed1 = Or(ShiftLeft<4>(raw3), raw1);
    267    const VU16 packed2 = Or(ShiftLeft<4>(raw6), raw4);
    268    const VU16 packed3 = Or(ShiftLeft<4>(raw7), raw5);
    269 
    270    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    271    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    272    StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
    273    StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
    274  }
    275 
    276  template <class D8>
    277  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
    278                         uint8_t* HWY_RESTRICT raw) const {
    279    const RepartitionToWide<decltype(d8)> d16;
    280    using VU16 = Vec<decltype(d16)>;
    281    const size_t N8 = Lanes(d8);
    282    const VU16 mask = Set(d16, 0x0F0Fu);  // Lowest 4 bits per byte
    283 
    284    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    285    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    286    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    287    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    288 
    289    const VU16 raw0 = And(packed0, mask);
    290    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
    291 
    292    const VU16 raw1 = And(packed1, mask);
    293    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
    294 
    295    const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
    296    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
    297 
    298    const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
    299    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
    300 
    301    const VU16 raw4 = And(packed2, mask);
    302    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
    303 
    304    const VU16 raw5 = And(packed3, mask);
    305    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
    306 
    307    const VU16 raw6 = And(ShiftRight<4>(packed2), mask);
    308    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
    309 
    310    const VU16 raw7 = And(ShiftRight<4>(packed3), mask);
    311    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
    312  }
    313 };  // Pack8<4>
    314 
template <>
struct Pack8<5> {
 // 5-bit packing: eight raw vectors are stored in five packed vectors.
 // packed0..3 hold raw0..3 in their lower five bits plus the upper three
 // bits of raw4..7; packed4 concatenates the low bit pairs of raw4..7.
 template <class D8>
 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                      uint8_t* HWY_RESTRICT packed_out) const {
   const RepartitionToWide<decltype(d8)> d16;
   using VU16 = Vec<decltype(d16)>;
   const size_t N8 = Lanes(d8);
   const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
   const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
   const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
   const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
   const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
   const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
   const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
   const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

   // Fill upper three bits with upper bits from raw4..7.
   const VU16 hi3 = Set(d16, 0xE0E0u);
   const VU16 packed0 = OrAnd(raw0, ShiftLeft<3>(raw4), hi3);
   const VU16 packed1 = OrAnd(raw1, ShiftLeft<3>(raw5), hi3);
   const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw6), hi3);
   const VU16 packed3 = OrAnd(raw3, ShiftLeft<3>(raw7), hi3);

   StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
   StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
   StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
   StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);

   // Combine lower two bits of raw4..7 into packed4.
   const VU16 lo2 = Set(d16, 0x0303u);
   const VU16 packed4 = Or(And(raw4, lo2), Xor3(ShiftLeft<2>(And(raw5, lo2)),
                                                ShiftLeft<4>(And(raw6, lo2)),
                                                ShiftLeft<6>(And(raw7, lo2))));
   StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
 }

 template <class D8>
 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                        uint8_t* HWY_RESTRICT raw) const {
   const RepartitionToWide<decltype(d8)> d16;
   using VU16 = Vec<decltype(d16)>;
   const size_t N8 = Lanes(d8);

   const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
   const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
   const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
   const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
   const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));

   const VU16 mask = Set(d16, 0x1F1Fu);  // Lowest 5 bits per byte

   // raw0..3 are simply the low five bits of packed0..3.
   const VU16 raw0 = And(packed0, mask);
   StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);

   const VU16 raw1 = And(packed1, mask);
   StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);

   const VU16 raw2 = And(packed2, mask);
   StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);

   const VU16 raw3 = And(packed3, mask);
   StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);

   // The upper bits are the top 3 bits shifted right by three, giving
   // bits 2-4 of raw4..7.
   const VU16 top4 = ShiftRight<3>(AndNot(mask, packed0));
   const VU16 top5 = ShiftRight<3>(AndNot(mask, packed1));
   const VU16 top6 = ShiftRight<3>(AndNot(mask, packed2));
   const VU16 top7 = ShiftRight<3>(AndNot(mask, packed3));

   // Insert the lower 2 bits, which were concatenated into a byte.
   const VU16 lo2 = Set(d16, 0x0303u);
   const VU16 raw4 = OrAnd(top4, lo2, packed4);
   const VU16 raw5 = OrAnd(top5, lo2, ShiftRight<2>(packed4));
   const VU16 raw6 = OrAnd(top6, lo2, ShiftRight<4>(packed4));
   const VU16 raw7 = OrAnd(top7, lo2, ShiftRight<6>(packed4));

   StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
   StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
   StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
   StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
 }
};  // Pack8<5>
    398 
template <>
struct Pack8<6> {
 // 6-bit packing: eight raw vectors are stored in six packed vectors. raw3
 // and raw7 are split into bit pairs that fill the upper two bits of
 // packed0..2 and packed3..5, respectively.
 template <class D8>
 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                      uint8_t* HWY_RESTRICT packed_out) const {
   const RepartitionToWide<decltype(d8)> d16;
   using VU16 = Vec<decltype(d16)>;
   const size_t N8 = Lanes(d8);
   const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
   const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
   const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
   const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
   const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
   const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
   const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
   const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

   const VU16 hi2 = Set(d16, 0xC0C0u);
   // The first triplet stores the bit pairs of raw3 (6 bits) in the upper
   // two bits of packed0..2; the second triplet does the same with raw7.
   const VU16 packed0 = OrAnd(raw0, ShiftLeft<2>(raw3), hi2);
   const VU16 packed1 = OrAnd(raw1, ShiftLeft<4>(raw3), hi2);
   const VU16 packed2 = OrAnd(raw2, ShiftLeft<6>(raw3), hi2);
   const VU16 packed3 = OrAnd(raw4, ShiftLeft<2>(raw7), hi2);
   const VU16 packed4 = OrAnd(raw5, ShiftLeft<4>(raw7), hi2);
   const VU16 packed5 = OrAnd(raw6, ShiftLeft<6>(raw7), hi2);

   StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
   StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
   StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
   StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
   StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
   StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
 }

 template <class D8>
 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                        uint8_t* HWY_RESTRICT raw) const {
   const RepartitionToWide<decltype(d8)> d16;
   using VU16 = Vec<decltype(d16)>;
   const size_t N8 = Lanes(d8);
   const VU16 mask = Set(d16, 0x3F3Fu);  // Lowest 6 bits per byte

   const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
   const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
   const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
   const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
   const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
   const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));

   // raw0..2 and raw4..6 are the low six bits of packed0..5.
   const VU16 raw0 = And(packed0, mask);
   StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);

   const VU16 raw1 = And(packed1, mask);
   StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);

   const VU16 raw2 = And(packed2, mask);
   StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);

   const VU16 raw4 = And(packed3, mask);
   StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);

   const VU16 raw5 = And(packed4, mask);
   StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);

   const VU16 raw6 = And(packed5, mask);
   StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);

   // raw3/raw7 are the concatenation of the upper two bits in packed0..2 /
   // packed3..5, respectively.
   const VU16 raw3 = Xor3(ShiftRight<6>(AndNot(mask, packed2)),
                          ShiftRight<4>(AndNot(mask, packed1)),
                          ShiftRight<2>(AndNot(mask, packed0)));
   const VU16 raw7 = Xor3(ShiftRight<6>(AndNot(mask, packed5)),
                          ShiftRight<4>(AndNot(mask, packed4)),
                          ShiftRight<2>(AndNot(mask, packed3)));
   StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
   StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
 }
};  // Pack8<6>
    477 
template <>
struct Pack8<7> {
 // 7-bit packing: raw0..6 each keep their own packed vector; the eighth
 // vector (raw7) is split bit by bit into the top bit of packed0..6.
 template <class D8>
 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                      uint8_t* HWY_RESTRICT packed_out) const {
   const RepartitionToWide<decltype(d8)> d16;
   using VU16 = Vec<decltype(d16)>;
   const size_t N8 = Lanes(d8);
   const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
   const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
   const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
   const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
   const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
   const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
   const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
   // Inserted into top bit of packed0..6.
   const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

   const VU16 hi1 = Set(d16, 0x8080u);
   // Add(raw7, raw7) == ShiftLeft<1>(raw7). Shifting raw7 left by i + 1 and
   // keeping only the top bit stores bit (6 - i) of raw7 in packed_i.
   const VU16 packed0 = OrAnd(raw0, Add(raw7, raw7), hi1);
   const VU16 packed1 = OrAnd(raw1, ShiftLeft<2>(raw7), hi1);
   const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw7), hi1);
   const VU16 packed3 = OrAnd(raw3, ShiftLeft<4>(raw7), hi1);
   const VU16 packed4 = OrAnd(raw4, ShiftLeft<5>(raw7), hi1);
   const VU16 packed5 = OrAnd(raw5, ShiftLeft<6>(raw7), hi1);
   const VU16 packed6 = OrAnd(raw6, ShiftLeft<7>(raw7), hi1);

   StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
   StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
   StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
   StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
   StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
   StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
   StoreU(BitCast(d8, packed6), d8, packed_out + 6 * N8);
 }

 template <class D8>
 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                        uint8_t* HWY_RESTRICT raw) const {
   const RepartitionToWide<decltype(d8)> d16;
   using VU16 = Vec<decltype(d16)>;
   const size_t N8 = Lanes(d8);

   const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
   const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
   const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
   const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
   const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
   const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
   const VU16 packed6 = BitCast(d16, LoadU(d8, packed_in + 6 * N8));

   const VU16 mask = Set(d16, 0x7F7Fu);  // Lowest 7 bits per byte

   // raw0..6 are the low seven bits of packed0..6.
   const VU16 raw0 = And(packed0, mask);
   StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);

   const VU16 raw1 = And(packed1, mask);
   StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);

   const VU16 raw2 = And(packed2, mask);
   StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);

   const VU16 raw3 = And(packed3, mask);
   StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);

   const VU16 raw4 = And(packed4, mask);
   StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);

   const VU16 raw5 = And(packed5, mask);
   StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);

   const VU16 raw6 = And(packed6, mask);
   StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);

   // Reassemble raw7 from the top bits of packed0..6 (inverse of Pack).
   const VU16 p0 = Xor3(ShiftRight<7>(AndNot(mask, packed6)),
                        ShiftRight<6>(AndNot(mask, packed5)),
                        ShiftRight<5>(AndNot(mask, packed4)));
   const VU16 p1 = Xor3(ShiftRight<4>(AndNot(mask, packed3)),
                        ShiftRight<3>(AndNot(mask, packed2)),
                        ShiftRight<2>(AndNot(mask, packed1)));
   const VU16 raw7 = Xor3(ShiftRight<1>(AndNot(mask, packed0)), p0, p1);
   StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
 }
};  // Pack8<7>
    562 
    563 template <>
    564 struct Pack8<8> {
    565  template <class D8>
    566  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
    567                       uint8_t* HWY_RESTRICT packed_out) const {
    568    using VU8 = Vec<decltype(d8)>;
    569    const size_t N8 = Lanes(d8);
    570    const VU8 raw0 = LoadU(d8, raw + 0 * N8);
    571    const VU8 raw1 = LoadU(d8, raw + 1 * N8);
    572    const VU8 raw2 = LoadU(d8, raw + 2 * N8);
    573    const VU8 raw3 = LoadU(d8, raw + 3 * N8);
    574    const VU8 raw4 = LoadU(d8, raw + 4 * N8);
    575    const VU8 raw5 = LoadU(d8, raw + 5 * N8);
    576    const VU8 raw6 = LoadU(d8, raw + 6 * N8);
    577    const VU8 raw7 = LoadU(d8, raw + 7 * N8);
    578 
    579    StoreU(raw0, d8, packed_out + 0 * N8);
    580    StoreU(raw1, d8, packed_out + 1 * N8);
    581    StoreU(raw2, d8, packed_out + 2 * N8);
    582    StoreU(raw3, d8, packed_out + 3 * N8);
    583    StoreU(raw4, d8, packed_out + 4 * N8);
    584    StoreU(raw5, d8, packed_out + 5 * N8);
    585    StoreU(raw6, d8, packed_out + 6 * N8);
    586    StoreU(raw7, d8, packed_out + 7 * N8);
    587  }
    588 
    589  template <class D8>
    590  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
    591                         uint8_t* HWY_RESTRICT raw) const {
    592    using VU8 = Vec<decltype(d8)>;
    593    const size_t N8 = Lanes(d8);
    594    const VU8 raw0 = LoadU(d8, packed_in + 0 * N8);
    595    const VU8 raw1 = LoadU(d8, packed_in + 1 * N8);
    596    const VU8 raw2 = LoadU(d8, packed_in + 2 * N8);
    597    const VU8 raw3 = LoadU(d8, packed_in + 3 * N8);
    598    const VU8 raw4 = LoadU(d8, packed_in + 4 * N8);
    599    const VU8 raw5 = LoadU(d8, packed_in + 5 * N8);
    600    const VU8 raw6 = LoadU(d8, packed_in + 6 * N8);
    601    const VU8 raw7 = LoadU(d8, packed_in + 7 * N8);
    602 
    603    StoreU(raw0, d8, raw + 0 * N8);
    604    StoreU(raw1, d8, raw + 1 * N8);
    605    StoreU(raw2, d8, raw + 2 * N8);
    606    StoreU(raw3, d8, raw + 3 * N8);
    607    StoreU(raw4, d8, raw + 4 * N8);
    608    StoreU(raw5, d8, raw + 5 * N8);
    609    StoreU(raw6, d8, raw + 6 * N8);
    610    StoreU(raw7, d8, raw + 7 * N8);
    611  }
    612 };  // Pack8<8>
    613 
template <>
struct Pack16<1> {
 // 1-bit packing for 16-bit lanes: sixteen raw vectors fold into a single
 // packed vector, with raw vector i contributing bit i of every lane.
 template <class D>
 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                      uint16_t* HWY_RESTRICT packed_out) const {
   using VU16 = Vec<decltype(d)>;
   const size_t N = Lanes(d);
   const VU16 raw0 = LoadU(d, raw + 0 * N);
   const VU16 raw1 = LoadU(d, raw + 1 * N);
   const VU16 raw2 = LoadU(d, raw + 2 * N);
   const VU16 raw3 = LoadU(d, raw + 3 * N);
   const VU16 raw4 = LoadU(d, raw + 4 * N);
   const VU16 raw5 = LoadU(d, raw + 5 * N);
   const VU16 raw6 = LoadU(d, raw + 6 * N);
   const VU16 raw7 = LoadU(d, raw + 7 * N);
   const VU16 raw8 = LoadU(d, raw + 8 * N);
   const VU16 raw9 = LoadU(d, raw + 9 * N);
   const VU16 rawA = LoadU(d, raw + 0xA * N);
   const VU16 rawB = LoadU(d, raw + 0xB * N);
   const VU16 rawC = LoadU(d, raw + 0xC * N);
   const VU16 rawD = LoadU(d, raw + 0xD * N);
   const VU16 rawE = LoadU(d, raw + 0xE * N);
   const VU16 rawF = LoadU(d, raw + 0xF * N);

   // Combine in groups of three; Add(raw1, raw1) == ShiftLeft<1>(raw1).
   const VU16 p0 = Xor3(ShiftLeft<2>(raw2), Add(raw1, raw1), raw0);
   const VU16 p1 =
       Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3));
   const VU16 p2 =
       Xor3(ShiftLeft<8>(raw8), ShiftLeft<7>(raw7), ShiftLeft<6>(raw6));
   const VU16 p3 =
       Xor3(ShiftLeft<0xB>(rawB), ShiftLeft<0xA>(rawA), ShiftLeft<9>(raw9));
   const VU16 p4 =
       Xor3(ShiftLeft<0xE>(rawE), ShiftLeft<0xD>(rawD), ShiftLeft<0xC>(rawC));
   const VU16 packed =
       Or(Xor3(ShiftLeft<0xF>(rawF), p0, p1), Xor3(p2, p3, p4));
   StoreU(packed, d, packed_out);
 }

 template <class D>
 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                        uint16_t* HWY_RESTRICT raw) const {
   using VU16 = Vec<decltype(d)>;
   const size_t N = Lanes(d);
   const VU16 mask = Set(d, 1u);  // Lowest bit

   const VU16 packed = LoadU(d, packed_in);

   // Bit i of each lane becomes raw vector i.
   const VU16 raw0 = And(packed, mask);
   StoreU(raw0, d, raw + 0 * N);

   const VU16 raw1 = And(ShiftRight<1>(packed), mask);
   StoreU(raw1, d, raw + 1 * N);

   const VU16 raw2 = And(ShiftRight<2>(packed), mask);
   StoreU(raw2, d, raw + 2 * N);

   const VU16 raw3 = And(ShiftRight<3>(packed), mask);
   StoreU(raw3, d, raw + 3 * N);

   const VU16 raw4 = And(ShiftRight<4>(packed), mask);
   StoreU(raw4, d, raw + 4 * N);

   const VU16 raw5 = And(ShiftRight<5>(packed), mask);
   StoreU(raw5, d, raw + 5 * N);

   const VU16 raw6 = And(ShiftRight<6>(packed), mask);
   StoreU(raw6, d, raw + 6 * N);

   const VU16 raw7 = And(ShiftRight<7>(packed), mask);
   StoreU(raw7, d, raw + 7 * N);

   const VU16 raw8 = And(ShiftRight<8>(packed), mask);
   StoreU(raw8, d, raw + 8 * N);

   const VU16 raw9 = And(ShiftRight<9>(packed), mask);
   StoreU(raw9, d, raw + 9 * N);

   const VU16 rawA = And(ShiftRight<0xA>(packed), mask);
   StoreU(rawA, d, raw + 0xA * N);

   const VU16 rawB = And(ShiftRight<0xB>(packed), mask);
   StoreU(rawB, d, raw + 0xB * N);

   const VU16 rawC = And(ShiftRight<0xC>(packed), mask);
   StoreU(rawC, d, raw + 0xC * N);

   const VU16 rawD = And(ShiftRight<0xD>(packed), mask);
   StoreU(rawD, d, raw + 0xD * N);

   const VU16 rawE = And(ShiftRight<0xE>(packed), mask);
   StoreU(rawE, d, raw + 0xE * N);

   // No mask needed: the logical shift fills the upper bits with zeros.
   const VU16 rawF = ShiftRight<0xF>(packed);
   StoreU(rawF, d, raw + 0xF * N);
 }
};  // Pack16<1>
    710 
template <>
struct Pack16<2> {
  // 16 vectors of 2-bit values fit into two packed vectors: even-indexed raw
  // vectors go into packed0 and odd-indexed into packed1, with raw vector i
  // stored at bit offset 2 * (i / 2). Assumes only the lowest 2 bits of each
  // lane are set -- TODO(review): confirm against the Pack16 input contract.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // The 2-bit fields occupy disjoint positions, so Xor3 concatenates them.
    VU16 packed0 = Xor3(ShiftLeft<4>(raw4), ShiftLeft<2>(raw2), raw0);
    VU16 packed1 = Xor3(ShiftLeft<4>(raw5), ShiftLeft<2>(raw3), raw1);
    packed0 = Xor3(packed0, ShiftLeft<8>(raw8), ShiftLeft<6>(raw6));
    packed1 = Xor3(packed1, ShiftLeft<8>(raw9), ShiftLeft<6>(raw7));

    packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<10>(rawA));
    packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<10>(rawB));

    // Final (top) field per vector.
    packed0 = Or(packed0, ShiftLeft<14>(rawE));
    packed1 = Or(packed1, ShiftLeft<14>(rawF));
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
  }

  // Inverse of Pack: extracts eight 2-bit fields from each packed vector.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 0x3u);  // Lowest 2 bits

    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(ShiftRight<2>(packed0), mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(ShiftRight<2>(packed1), mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(ShiftRight<4>(packed0), mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(ShiftRight<4>(packed1), mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(ShiftRight<6>(packed0), mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(ShiftRight<6>(packed1), mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(ShiftRight<8>(packed0), mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(ShiftRight<8>(packed1), mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(ShiftRight<0xA>(packed0), mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(ShiftRight<0xA>(packed1), mask);
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(ShiftRight<0xC>(packed0), mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(ShiftRight<0xC>(packed1), mask);
    StoreU(rawD, d, raw + 0xD * N);

    // Shifting by 14 leaves only the top 2-bit field; no mask required.
    const VU16 rawE = ShiftRight<0xE>(packed0);
    StoreU(rawE, d, raw + 0xE * N);

    const VU16 rawF = ShiftRight<0xE>(packed1);
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<2>
    808 
template <>
struct Pack16<3> {
  // Packs 16 vectors of 3-bit values into three packed vectors. raw0..rawE
  // occupy five 3-bit fields per packed vector; the three bits of rawF are
  // scattered into the leftover MSB of each packed vector. Assumes only the
  // lowest 3 bits of each lane are set -- TODO(review): confirm against the
  // Pack16 input contract.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // We can fit 15 raw vectors in three packed vectors (five each).
    VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<3>(raw3), raw0);
    VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<3>(raw4), raw1);
    VU16 packed2 = Xor3(ShiftLeft<6>(raw8), ShiftLeft<3>(raw5), raw2);

    // rawF will be scattered into the upper bit of these three.
    packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<9>(raw9));
    packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<9>(rawA));
    packed2 = Xor3(packed2, ShiftLeft<12>(rawE), ShiftLeft<9>(rawB));

    // hi1 keeps only the MSB of the shifted rawF so that OrAnd deposits a
    // single bit of rawF into each packed vector.
    const VU16 hi1 = Set(d, 0x8000u);
    packed0 = Or(packed0, ShiftLeft<15>(rawF));  // MSB only, no mask
    packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
    packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
  }

  // Inverse of Pack: five 3-bit fields per packed vector, plus rawF
  // reassembled from the MSB of each packed vector.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 0x7u);  // Lowest 3 bits

    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);

    const VU16 raw0 = And(mask, packed0);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(mask, packed1);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(mask, packed2);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(mask, ShiftRight<3>(packed0));
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(mask, ShiftRight<3>(packed1));
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(mask, ShiftRight<3>(packed2));
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(mask, ShiftRight<6>(packed0));
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(mask, ShiftRight<6>(packed1));
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(mask, ShiftRight<6>(packed2));
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(mask, ShiftRight<9>(packed0));
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(mask, ShiftRight<9>(packed1));
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(mask, ShiftRight<9>(packed2));
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(mask, ShiftRight<12>(packed0));
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(mask, ShiftRight<12>(packed1));
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = And(mask, ShiftRight<12>(packed2));
    StoreU(rawE, d, raw + 0xE * N);

    // rawF is the concatenation of the upper bit of packed0..2.
    // Add(down1, down1) == ShiftLeft<1>(down1).
    const VU16 down0 = ShiftRight<15>(packed0);
    const VU16 down1 = ShiftRight<15>(packed1);
    const VU16 down2 = ShiftRight<15>(packed2);
    const VU16 rawF = Xor3(ShiftLeft<2>(down2), Add(down1, down1), down0);
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<3>
    916 
    917 template <>
    918 struct Pack16<4> {
    919  template <class D>
    920  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
    921                       uint16_t* HWY_RESTRICT packed_out) const {
    922    using VU16 = Vec<decltype(d)>;
    923    const size_t N = Lanes(d);
    924    const VU16 raw0 = LoadU(d, raw + 0 * N);
    925    const VU16 raw1 = LoadU(d, raw + 1 * N);
    926    const VU16 raw2 = LoadU(d, raw + 2 * N);
    927    const VU16 raw3 = LoadU(d, raw + 3 * N);
    928    const VU16 raw4 = LoadU(d, raw + 4 * N);
    929    const VU16 raw5 = LoadU(d, raw + 5 * N);
    930    const VU16 raw6 = LoadU(d, raw + 6 * N);
    931    const VU16 raw7 = LoadU(d, raw + 7 * N);
    932    const VU16 raw8 = LoadU(d, raw + 8 * N);
    933    const VU16 raw9 = LoadU(d, raw + 9 * N);
    934    const VU16 rawA = LoadU(d, raw + 0xA * N);
    935    const VU16 rawB = LoadU(d, raw + 0xB * N);
    936    const VU16 rawC = LoadU(d, raw + 0xC * N);
    937    const VU16 rawD = LoadU(d, raw + 0xD * N);
    938    const VU16 rawE = LoadU(d, raw + 0xE * N);
    939    const VU16 rawF = LoadU(d, raw + 0xF * N);
    940 
    941    VU16 packed0 = Xor3(ShiftLeft<8>(raw4), ShiftLeft<4>(raw2), raw0);
    942    VU16 packed1 = Xor3(ShiftLeft<8>(raw5), ShiftLeft<4>(raw3), raw1);
    943    packed0 = Or(packed0, ShiftLeft<12>(raw6));
    944    packed1 = Or(packed1, ShiftLeft<12>(raw7));
    945    VU16 packed2 = Xor3(ShiftLeft<8>(rawC), ShiftLeft<4>(rawA), raw8);
    946    VU16 packed3 = Xor3(ShiftLeft<8>(rawD), ShiftLeft<4>(rawB), raw9);
    947    packed2 = Or(packed2, ShiftLeft<12>(rawE));
    948    packed3 = Or(packed3, ShiftLeft<12>(rawF));
    949 
    950    StoreU(packed0, d, packed_out + 0 * N);
    951    StoreU(packed1, d, packed_out + 1 * N);
    952    StoreU(packed2, d, packed_out + 2 * N);
    953    StoreU(packed3, d, packed_out + 3 * N);
    954  }
    955 
    956  template <class D>
    957  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
    958                         uint16_t* HWY_RESTRICT raw) const {
    959    using VU16 = Vec<decltype(d)>;
    960    const size_t N = Lanes(d);
    961    const VU16 mask = Set(d, 0xFu);  // Lowest 4 bits
    962 
    963    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    964    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    965    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    966    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    967 
    968    const VU16 raw0 = And(packed0, mask);
    969    StoreU(raw0, d, raw + 0 * N);
    970 
    971    const VU16 raw1 = And(packed1, mask);
    972    StoreU(raw1, d, raw + 1 * N);
    973 
    974    const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
    975    StoreU(raw2, d, raw + 2 * N);
    976 
    977    const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
    978    StoreU(raw3, d, raw + 3 * N);
    979 
    980    const VU16 raw4 = And(ShiftRight<8>(packed0), mask);
    981    StoreU(raw4, d, raw + 4 * N);
    982 
    983    const VU16 raw5 = And(ShiftRight<8>(packed1), mask);
    984    StoreU(raw5, d, raw + 5 * N);
    985 
    986    const VU16 raw6 = ShiftRight<12>(packed0);  // no mask required
    987    StoreU(raw6, d, raw + 6 * N);
    988 
    989    const VU16 raw7 = ShiftRight<12>(packed1);  // no mask required
    990    StoreU(raw7, d, raw + 7 * N);
    991 
    992    const VU16 raw8 = And(packed2, mask);
    993    StoreU(raw8, d, raw + 8 * N);
    994 
    995    const VU16 raw9 = And(packed3, mask);
    996    StoreU(raw9, d, raw + 9 * N);
    997 
    998    const VU16 rawA = And(ShiftRight<4>(packed2), mask);
    999    StoreU(rawA, d, raw + 0xA * N);
   1000 
   1001    const VU16 rawB = And(ShiftRight<4>(packed3), mask);
   1002    StoreU(rawB, d, raw + 0xB * N);
   1003 
   1004    const VU16 rawC = And(ShiftRight<8>(packed2), mask);
   1005    StoreU(rawC, d, raw + 0xC * N);
   1006 
   1007    const VU16 rawD = And(ShiftRight<8>(packed3), mask);
   1008    StoreU(rawD, d, raw + 0xD * N);
   1009 
   1010    const VU16 rawE = ShiftRight<12>(packed2);  // no mask required
   1011    StoreU(rawE, d, raw + 0xE * N);
   1012 
   1013    const VU16 rawF = ShiftRight<12>(packed3);  // no mask required
   1014    StoreU(rawF, d, raw + 0xF * N);
   1015  }
   1016 };  // Pack16<4>
   1017 
template <>
struct Pack16<5> {
  // Packs 16 vectors of 5-bit values into five packed vectors. raw0..rawE
  // occupy three 5-bit fields per packed vector; the five bits of rawF are
  // scattered into the leftover MSB of each packed vector. Assumes only the
  // lowest 5 bits of each lane are set -- TODO(review): confirm against the
  // Pack16 input contract.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // We can fit 15 raw vectors in five packed vectors (three each).
    VU16 packed0 = Xor3(ShiftLeft<10>(rawA), ShiftLeft<5>(raw5), raw0);
    VU16 packed1 = Xor3(ShiftLeft<10>(rawB), ShiftLeft<5>(raw6), raw1);
    VU16 packed2 = Xor3(ShiftLeft<10>(rawC), ShiftLeft<5>(raw7), raw2);
    VU16 packed3 = Xor3(ShiftLeft<10>(rawD), ShiftLeft<5>(raw8), raw3);
    VU16 packed4 = Xor3(ShiftLeft<10>(rawE), ShiftLeft<5>(raw9), raw4);

    // rawF will be scattered into the upper bits of these five. hi1 keeps
    // only the MSB of the shifted rawF so OrAnd deposits one bit each.
    const VU16 hi1 = Set(d, 0x8000u);
    packed0 = Or(packed0, ShiftLeft<15>(rawF));  // MSB only, no mask
    packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
    packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
    packed3 = OrAnd(packed3, ShiftLeft<12>(rawF), hi1);
    packed4 = OrAnd(packed4, ShiftLeft<11>(rawF), hi1);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
  }

  // Inverse of Pack: three 5-bit fields per packed vector, plus rawF
  // reassembled from the MSB of each packed vector.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);

    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);

    const VU16 mask = Set(d, 0x1Fu);  // Lowest 5 bits

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(ShiftRight<5>(packed0), mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(ShiftRight<5>(packed1), mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(ShiftRight<5>(packed2), mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(ShiftRight<5>(packed3), mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(ShiftRight<5>(packed4), mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(ShiftRight<10>(packed0), mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(ShiftRight<10>(packed1), mask);
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(ShiftRight<10>(packed2), mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(ShiftRight<10>(packed3), mask);
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = And(ShiftRight<10>(packed4), mask);
    StoreU(rawE, d, raw + 0xE * N);

    // rawF is the concatenation of the upper (most-significant) bit of
    // packed0..4: bit i of rawF comes from the MSB of packed_i.
    // Add(down1, down1) == ShiftLeft<1>(down1).
    const VU16 down0 = ShiftRight<15>(packed0);
    const VU16 down1 = ShiftRight<15>(packed1);
    const VU16 hi1 = Set(d, 0x8000u);
    const VU16 p0 =
        Xor3(ShiftRight<13>(And(packed2, hi1)), Add(down1, down1), down0);
    const VU16 rawF = Xor3(ShiftRight<11>(And(packed4, hi1)),
                           ShiftRight<12>(And(packed3, hi1)), p0);
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<5>
   1134 
template <>
struct Pack16<6> {
  // Packs 16 vectors of 6-bit values into six packed vectors. raw3/raw7 and
  // rawB/rawF are first combined into 12-bit intermediates (packed3/packed7),
  // which are then spread over the four spare top bits of the other vectors.
  // Assumes only the lowest 6 bits of each lane are set -- TODO(review):
  // confirm against the Pack16 input contract.
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    const VU16 packed3 = Or(ShiftLeft<6>(raw7), raw3);
    const VU16 packed7 = Or(ShiftLeft<6>(rawF), rawB);
    // Three vectors, two 6-bit raw each; packed3 (12 bits) is spread over the
    // four remainder bits at the top of each vector.
    const VU16 packed0 = Xor3(ShiftLeft<12>(packed3), ShiftLeft<6>(raw4), raw0);
    VU16 packed1 = Or(ShiftLeft<6>(raw5), raw1);
    VU16 packed2 = Or(ShiftLeft<6>(raw6), raw2);
    const VU16 packed4 = Xor3(ShiftLeft<12>(packed7), ShiftLeft<6>(rawC), raw8);
    VU16 packed5 = Or(ShiftLeft<6>(rawD), raw9);
    VU16 packed6 = Or(ShiftLeft<6>(rawE), rawA);

    // hi4 keeps only the top nibble of the shifted intermediate, so OrAnd
    // deposits four bits of packed3/packed7 into each vector.
    const VU16 hi4 = Set(d, 0xF000u);
    packed1 = OrAnd(packed1, ShiftLeft<8>(packed3), hi4);
    packed2 = OrAnd(packed2, ShiftLeft<4>(packed3), hi4);
    packed5 = OrAnd(packed5, ShiftLeft<8>(packed7), hi4);
    packed6 = OrAnd(packed6, ShiftLeft<4>(packed7), hi4);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed4, d, packed_out + 3 * N);
    StoreU(packed5, d, packed_out + 4 * N);
    StoreU(packed6, d, packed_out + 5 * N);
  }

  // Inverse of Pack: two 6-bit fields per packed vector, then packed3 and
  // packed7 are reassembled from the top nibbles to recover raw3/7/B/F.
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 0x3Fu);  // Lowest 6 bits

    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed4 = LoadU(d, packed_in + 3 * N);
    const VU16 packed5 = LoadU(d, packed_in + 4 * N);
    const VU16 packed6 = LoadU(d, packed_in + 5 * N);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw4 = And(ShiftRight<6>(packed0), mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(ShiftRight<6>(packed1), mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(ShiftRight<6>(packed2), mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw8 = And(packed4, mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(packed5, mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(packed6, mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawC = And(ShiftRight<6>(packed4), mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(ShiftRight<6>(packed5), mask);
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = And(ShiftRight<6>(packed6), mask);
    StoreU(rawE, d, raw + 0xE * N);

    // packed3 is the concatenation of the four upper bits in packed0..2.
    const VU16 down0 = ShiftRight<12>(packed0);
    const VU16 down4 = ShiftRight<12>(packed4);
    const VU16 hi4 = Set(d, 0xF000u);
    const VU16 packed3 = Xor3(ShiftRight<4>(And(packed2, hi4)),
                              ShiftRight<8>(And(packed1, hi4)), down0);
    const VU16 packed7 = Xor3(ShiftRight<4>(And(packed6, hi4)),
                              ShiftRight<8>(And(packed5, hi4)), down4);
    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 rawB = And(packed7, mask);
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 raw7 = ShiftRight<6>(packed3);  // upper bits already zero
    StoreU(raw7, d, raw + 7 * N);

    const VU16 rawF = ShiftRight<6>(packed7);  // upper bits already zero
    StoreU(rawF, d, raw + 0xF * N);
  }
};  // Pack16<6>
   1255 
   1256 template <>
   1257 struct Pack16<7> {
   1258  template <class D>
   1259  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
   1260                       uint16_t* HWY_RESTRICT packed_out) const {
   1261    using VU16 = Vec<decltype(d)>;
   1262    const size_t N = Lanes(d);
   1263    const VU16 raw0 = LoadU(d, raw + 0 * N);
   1264    const VU16 raw1 = LoadU(d, raw + 1 * N);
   1265    const VU16 raw2 = LoadU(d, raw + 2 * N);
   1266    const VU16 raw3 = LoadU(d, raw + 3 * N);
   1267    const VU16 raw4 = LoadU(d, raw + 4 * N);
   1268    const VU16 raw5 = LoadU(d, raw + 5 * N);
   1269    const VU16 raw6 = LoadU(d, raw + 6 * N);
   1270    const VU16 raw7 = LoadU(d, raw + 7 * N);
   1271    const VU16 raw8 = LoadU(d, raw + 8 * N);
   1272    const VU16 raw9 = LoadU(d, raw + 9 * N);
   1273    const VU16 rawA = LoadU(d, raw + 0xA * N);
   1274    const VU16 rawB = LoadU(d, raw + 0xB * N);
   1275    const VU16 rawC = LoadU(d, raw + 0xC * N);
   1276    const VU16 rawD = LoadU(d, raw + 0xD * N);
   1277    const VU16 rawE = LoadU(d, raw + 0xE * N);
   1278    const VU16 rawF = LoadU(d, raw + 0xF * N);
   1279 
   1280    const VU16 packed7 = Or(ShiftLeft<7>(rawF), raw7);
   1281    // Seven vectors, two 7-bit raw each; packed7 (14 bits) is spread over the
   1282    // two remainder bits at the top of each vector.
   1283    const VU16 packed0 = Xor3(ShiftLeft<14>(packed7), ShiftLeft<7>(raw8), raw0);
   1284    VU16 packed1 = Or(ShiftLeft<7>(raw9), raw1);
   1285    VU16 packed2 = Or(ShiftLeft<7>(rawA), raw2);
   1286    VU16 packed3 = Or(ShiftLeft<7>(rawB), raw3);
   1287    VU16 packed4 = Or(ShiftLeft<7>(rawC), raw4);
   1288    VU16 packed5 = Or(ShiftLeft<7>(rawD), raw5);
   1289    VU16 packed6 = Or(ShiftLeft<7>(rawE), raw6);
   1290 
   1291    const VU16 hi2 = Set(d, 0xC000u);
   1292    packed1 = OrAnd(packed1, ShiftLeft<12>(packed7), hi2);
   1293    packed2 = OrAnd(packed2, ShiftLeft<10>(packed7), hi2);
   1294    packed3 = OrAnd(packed3, ShiftLeft<8>(packed7), hi2);
   1295    packed4 = OrAnd(packed4, ShiftLeft<6>(packed7), hi2);
   1296    packed5 = OrAnd(packed5, ShiftLeft<4>(packed7), hi2);
   1297    packed6 = OrAnd(packed6, ShiftLeft<2>(packed7), hi2);
   1298 
   1299    StoreU(packed0, d, packed_out + 0 * N);
   1300    StoreU(packed1, d, packed_out + 1 * N);
   1301    StoreU(packed2, d, packed_out + 2 * N);
   1302    StoreU(packed3, d, packed_out + 3 * N);
   1303    StoreU(packed4, d, packed_out + 4 * N);
   1304    StoreU(packed5, d, packed_out + 5 * N);
   1305    StoreU(packed6, d, packed_out + 6 * N);
   1306  }
   1307 
   1308  template <class D>
   1309  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
   1310                         uint16_t* HWY_RESTRICT raw) const {
   1311    using VU16 = Vec<decltype(d)>;
   1312    const size_t N = Lanes(d);
   1313 
   1314    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
   1315    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
   1316    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
   1317    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
   1318    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
   1319    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
   1320    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
   1321 
   1322    const VU16 mask = Set(d, 0x7Fu);  // Lowest 7 bits
   1323 
   1324    const VU16 raw0 = And(packed0, mask);
   1325    StoreU(raw0, d, raw + 0 * N);
   1326 
   1327    const VU16 raw1 = And(packed1, mask);
   1328    StoreU(raw1, d, raw + 1 * N);
   1329 
   1330    const VU16 raw2 = And(packed2, mask);
   1331    StoreU(raw2, d, raw + 2 * N);
   1332 
   1333    const VU16 raw3 = And(packed3, mask);
   1334    StoreU(raw3, d, raw + 3 * N);
   1335 
   1336    const VU16 raw4 = And(packed4, mask);
   1337    StoreU(raw4, d, raw + 4 * N);
   1338 
   1339    const VU16 raw5 = And(packed5, mask);
   1340    StoreU(raw5, d, raw + 5 * N);
   1341 
   1342    const VU16 raw6 = And(packed6, mask);
   1343    StoreU(raw6, d, raw + 6 * N);
   1344 
   1345    const VU16 raw8 = And(ShiftRight<7>(packed0), mask);
   1346    StoreU(raw8, d, raw + 8 * N);
   1347 
   1348    const VU16 raw9 = And(ShiftRight<7>(packed1), mask);
   1349    StoreU(raw9, d, raw + 9 * N);
   1350 
   1351    const VU16 rawA = And(ShiftRight<7>(packed2), mask);
   1352    StoreU(rawA, d, raw + 0xA * N);
   1353 
   1354    const VU16 rawB = And(ShiftRight<7>(packed3), mask);
   1355    StoreU(rawB, d, raw + 0xB * N);
   1356 
   1357    const VU16 rawC = And(ShiftRight<7>(packed4), mask);
   1358    StoreU(rawC, d, raw + 0xC * N);
   1359 
   1360    const VU16 rawD = And(ShiftRight<7>(packed5), mask);
   1361    StoreU(rawD, d, raw + 0xD * N);
   1362 
   1363    const VU16 rawE = And(ShiftRight<7>(packed6), mask);
   1364    StoreU(rawE, d, raw + 0xE * N);
   1365 
   1366    // packed7 is the concatenation of the two upper bits in packed0..6.
   1367    const VU16 down0 = ShiftRight<14>(packed0);
   1368    const VU16 hi2 = Set(d, 0xC000u);
   1369    const VU16 p0 = Xor3(ShiftRight<12>(And(packed1, hi2)),
   1370                         ShiftRight<10>(And(packed2, hi2)), down0);
   1371    const VU16 p1 = Xor3(ShiftRight<8>(And(packed3, hi2)),  //
   1372                         ShiftRight<6>(And(packed4, hi2)),
   1373                         ShiftRight<4>(And(packed5, hi2)));
   1374    const VU16 packed7 = Xor3(ShiftRight<2>(And(packed6, hi2)), p1, p0);
   1375 
   1376    const VU16 raw7 = And(packed7, mask);
   1377    StoreU(raw7, d, raw + 7 * N);
   1378 
   1379    const VU16 rawF = ShiftRight<7>(packed7);  // upper bits already zero
   1380    StoreU(rawF, d, raw + 0xF * N);
   1381  }
   1382 };  // Pack16<7>
   1383 
   1384 template <>
   1385 struct Pack16<8> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    // Load 16 vectors of 8-bit values; each packed vector holds two of them,
    // one in the low byte and one in the high byte. Assumes only the lowest
    // 8 bits of each lane are set -- TODO(review): confirm against the
    // Pack16 input contract.
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // This is equivalent to ConcatEven with 8-bit lanes, but much more
    // efficient on RVV and slightly less efficient on SVE2.
    const VU16 packed0 = Or(ShiftLeft<8>(raw2), raw0);
    const VU16 packed1 = Or(ShiftLeft<8>(raw3), raw1);
    const VU16 packed2 = Or(ShiftLeft<8>(raw6), raw4);
    const VU16 packed3 = Or(ShiftLeft<8>(raw7), raw5);
    const VU16 packed4 = Or(ShiftLeft<8>(rawA), raw8);
    const VU16 packed5 = Or(ShiftLeft<8>(rawB), raw9);
    const VU16 packed6 = Or(ShiftLeft<8>(rawE), rawC);
    const VU16 packed7 = Or(ShiftLeft<8>(rawF), rawD);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
  }
   1428 
   1429  template <class D>
   1430  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
   1431                         uint16_t* HWY_RESTRICT raw) const {
   1432    using VU16 = Vec<decltype(d)>;
   1433    const size_t N = Lanes(d);
   1434 
   1435    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
   1436    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
   1437    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
   1438    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
   1439    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
   1440    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
   1441    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
   1442    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
   1443    const VU16 mask = Set(d, 0xFFu);  // Lowest 8 bits
   1444 
   1445    const VU16 raw0 = And(packed0, mask);
   1446    StoreU(raw0, d, raw + 0 * N);
   1447 
   1448    const VU16 raw1 = And(packed1, mask);
   1449    StoreU(raw1, d, raw + 1 * N);
   1450 
   1451    const VU16 raw2 = ShiftRight<8>(packed0);  // upper bits already zero
   1452    StoreU(raw2, d, raw + 2 * N);
   1453 
   1454    const VU16 raw3 = ShiftRight<8>(packed1);  // upper bits already zero
   1455    StoreU(raw3, d, raw + 3 * N);
   1456 
   1457    const VU16 raw4 = And(packed2, mask);
   1458    StoreU(raw4, d, raw + 4 * N);
   1459 
   1460    const VU16 raw5 = And(packed3, mask);
   1461    StoreU(raw5, d, raw + 5 * N);
   1462 
   1463    const VU16 raw6 = ShiftRight<8>(packed2);  // upper bits already zero
   1464    StoreU(raw6, d, raw + 6 * N);
   1465 
   1466    const VU16 raw7 = ShiftRight<8>(packed3);  // upper bits already zero
   1467    StoreU(raw7, d, raw + 7 * N);
   1468 
   1469    const VU16 raw8 = And(packed4, mask);
   1470    StoreU(raw8, d, raw + 8 * N);
   1471 
   1472    const VU16 raw9 = And(packed5, mask);
   1473    StoreU(raw9, d, raw + 9 * N);
   1474 
   1475    const VU16 rawA = ShiftRight<8>(packed4);  // upper bits already zero
   1476    StoreU(rawA, d, raw + 0xA * N);
   1477 
   1478    const VU16 rawB = ShiftRight<8>(packed5);  // upper bits already zero
   1479    StoreU(rawB, d, raw + 0xB * N);
   1480 
   1481    const VU16 rawC = And(packed6, mask);
   1482    StoreU(rawC, d, raw + 0xC * N);
   1483 
   1484    const VU16 rawD = And(packed7, mask);
   1485    StoreU(rawD, d, raw + 0xD * N);
   1486 
   1487    const VU16 rawE = ShiftRight<8>(packed6);  // upper bits already zero
   1488    StoreU(rawE, d, raw + 0xE * N);
   1489 
   1490    const VU16 rawF = ShiftRight<8>(packed7);  // upper bits already zero
   1491    StoreU(rawF, d, raw + 0xF * N);
   1492  }
   1493 };  // Pack16<8>
   1494 
// 9-bit values: packed0..7 each hold all 9 bits of one of raw0..7 plus the
// low 7 bits of one of raw8..F; packed8 holds the top 2 bits of each of
// raw8..F (8 values * 2 bits = 16 bits).
template <>
struct Pack16<9> {
 // Loads 16 raw vectors (values must fit in 9 bits) and stores 9 packed
 // vectors.
 template <class D>
 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                      uint16_t* HWY_RESTRICT packed_out) const {
   using VU16 = Vec<decltype(d)>;
   const size_t N = Lanes(d);
   const VU16 raw0 = LoadU(d, raw + 0 * N);
   const VU16 raw1 = LoadU(d, raw + 1 * N);
   const VU16 raw2 = LoadU(d, raw + 2 * N);
   const VU16 raw3 = LoadU(d, raw + 3 * N);
   const VU16 raw4 = LoadU(d, raw + 4 * N);
   const VU16 raw5 = LoadU(d, raw + 5 * N);
   const VU16 raw6 = LoadU(d, raw + 6 * N);
   const VU16 raw7 = LoadU(d, raw + 7 * N);
   const VU16 raw8 = LoadU(d, raw + 8 * N);
   const VU16 raw9 = LoadU(d, raw + 9 * N);
   const VU16 rawA = LoadU(d, raw + 0xA * N);
   const VU16 rawB = LoadU(d, raw + 0xB * N);
   const VU16 rawC = LoadU(d, raw + 0xC * N);
   const VU16 rawD = LoadU(d, raw + 0xD * N);
   const VU16 rawE = LoadU(d, raw + 0xE * N);
   const VU16 rawF = LoadU(d, raw + 0xF * N);
   // 8 vectors, each with 9+7 bits; top 2 bits are concatenated into packed8.
   const VU16 packed0 = Or(ShiftLeft<9>(raw8), raw0);
   const VU16 packed1 = Or(ShiftLeft<9>(raw9), raw1);
   const VU16 packed2 = Or(ShiftLeft<9>(rawA), raw2);
   const VU16 packed3 = Or(ShiftLeft<9>(rawB), raw3);
   const VU16 packed4 = Or(ShiftLeft<9>(rawC), raw4);
   const VU16 packed5 = Or(ShiftLeft<9>(rawD), raw5);
   const VU16 packed6 = Or(ShiftLeft<9>(rawE), raw6);
   const VU16 packed7 = Or(ShiftLeft<9>(rawF), raw7);

   // We could shift down, OR and shift up, but two shifts are typically more
   // expensive than AND, shift into position, and OR (which can be further
   // reduced via Xor3).
   // part8..F place bits [8,7] of raw8..F at bit offsets 0,2,4,..,14 of
   // packed8; the shift amount decreases by 2 per value.
   const VU16 mid2 = Set(d, 0x180u);  // top 2 in lower 9
   const VU16 part8 = ShiftRight<7>(And(raw8, mid2));
   const VU16 part9 = ShiftRight<5>(And(raw9, mid2));
   const VU16 partA = ShiftRight<3>(And(rawA, mid2));
   const VU16 partB = ShiftRight<1>(And(rawB, mid2));
   const VU16 partC = ShiftLeft<1>(And(rawC, mid2));
   const VU16 partD = ShiftLeft<3>(And(rawD, mid2));
   const VU16 partE = ShiftLeft<5>(And(rawE, mid2));
   const VU16 partF = ShiftLeft<7>(And(rawF, mid2));
   // The parts occupy disjoint bits, so XOR acts as OR here.
   const VU16 packed8 = Xor3(Xor3(part8, part9, partA),
                             Xor3(partB, partC, partD), Or(partE, partF));

   StoreU(packed0, d, packed_out + 0 * N);
   StoreU(packed1, d, packed_out + 1 * N);
   StoreU(packed2, d, packed_out + 2 * N);
   StoreU(packed3, d, packed_out + 3 * N);
   StoreU(packed4, d, packed_out + 4 * N);
   StoreU(packed5, d, packed_out + 5 * N);
   StoreU(packed6, d, packed_out + 6 * N);
   StoreU(packed7, d, packed_out + 7 * N);
   StoreU(packed8, d, packed_out + 8 * N);
 }

 // Loads 9 packed vectors and stores 16 raw vectors, inverting Pack above.
 template <class D>
 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                        uint16_t* HWY_RESTRICT raw) const {
   using VU16 = Vec<decltype(d)>;
   const size_t N = Lanes(d);

   const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
   const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
   const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
   const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
   const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
   const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
   const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
   const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
   const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));

   const VU16 mask = Set(d, 0x1FFu);  // Lowest 9 bits

   const VU16 raw0 = And(packed0, mask);
   StoreU(raw0, d, raw + 0 * N);

   const VU16 raw1 = And(packed1, mask);
   StoreU(raw1, d, raw + 1 * N);

   const VU16 raw2 = And(packed2, mask);
   StoreU(raw2, d, raw + 2 * N);

   const VU16 raw3 = And(packed3, mask);
   StoreU(raw3, d, raw + 3 * N);

   const VU16 raw4 = And(packed4, mask);
   StoreU(raw4, d, raw + 4 * N);

   const VU16 raw5 = And(packed5, mask);
   StoreU(raw5, d, raw + 5 * N);

   const VU16 raw6 = And(packed6, mask);
   StoreU(raw6, d, raw + 6 * N);

   const VU16 raw7 = And(packed7, mask);
   StoreU(raw7, d, raw + 7 * N);

   // raw8..F: low 7 bits come from the top of packed0..7; the top 2 bits
   // come from packed8, shifted back to bits [8,7] (inverse of Pack).
   const VU16 mid2 = Set(d, 0x180u);  // top 2 in lower 9
   const VU16 raw8 =
       OrAnd(ShiftRight<9>(packed0), ShiftLeft<7>(packed8), mid2);
   const VU16 raw9 =
       OrAnd(ShiftRight<9>(packed1), ShiftLeft<5>(packed8), mid2);
   const VU16 rawA =
       OrAnd(ShiftRight<9>(packed2), ShiftLeft<3>(packed8), mid2);
   const VU16 rawB =
       OrAnd(ShiftRight<9>(packed3), ShiftLeft<1>(packed8), mid2);
   const VU16 rawC =
       OrAnd(ShiftRight<9>(packed4), ShiftRight<1>(packed8), mid2);
   const VU16 rawD =
       OrAnd(ShiftRight<9>(packed5), ShiftRight<3>(packed8), mid2);
   const VU16 rawE =
       OrAnd(ShiftRight<9>(packed6), ShiftRight<5>(packed8), mid2);
   const VU16 rawF =
       OrAnd(ShiftRight<9>(packed7), ShiftRight<7>(packed8), mid2);

   StoreU(raw8, d, raw + 8 * N);
   StoreU(raw9, d, raw + 9 * N);
   StoreU(rawA, d, raw + 0xA * N);
   StoreU(rawB, d, raw + 0xB * N);
   StoreU(rawC, d, raw + 0xC * N);
   StoreU(rawD, d, raw + 0xD * N);
   StoreU(rawE, d, raw + 0xE * N);
   StoreU(rawF, d, raw + 0xF * N);
 }
};  // Pack16<9>
   1624 
// 10-bit values: packed0..7 each hold all 10 bits of one of raw0..7 plus the
// low 6 bits of one of raw8..F; packed8 holds the top 4 bits of raw8..B and
// packed9 the top 4 bits of rawC..F (4 values * 4 bits each).
template <>
struct Pack16<10> {
 // Loads 16 raw vectors (values must fit in 10 bits) and stores 10 packed
 // vectors.
 template <class D>
 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                      uint16_t* HWY_RESTRICT packed_out) const {
   using VU16 = Vec<decltype(d)>;
   const size_t N = Lanes(d);
   const VU16 raw0 = LoadU(d, raw + 0 * N);
   const VU16 raw1 = LoadU(d, raw + 1 * N);
   const VU16 raw2 = LoadU(d, raw + 2 * N);
   const VU16 raw3 = LoadU(d, raw + 3 * N);
   const VU16 raw4 = LoadU(d, raw + 4 * N);
   const VU16 raw5 = LoadU(d, raw + 5 * N);
   const VU16 raw6 = LoadU(d, raw + 6 * N);
   const VU16 raw7 = LoadU(d, raw + 7 * N);
   const VU16 raw8 = LoadU(d, raw + 8 * N);
   const VU16 raw9 = LoadU(d, raw + 9 * N);
   const VU16 rawA = LoadU(d, raw + 0xA * N);
   const VU16 rawB = LoadU(d, raw + 0xB * N);
   const VU16 rawC = LoadU(d, raw + 0xC * N);
   const VU16 rawD = LoadU(d, raw + 0xD * N);
   const VU16 rawE = LoadU(d, raw + 0xE * N);
   const VU16 rawF = LoadU(d, raw + 0xF * N);

   // 8 vectors, each with 10+6 bits; top 4 bits are concatenated into
   // packed8 and packed9.
   const VU16 packed0 = Or(ShiftLeft<10>(raw8), raw0);
   const VU16 packed1 = Or(ShiftLeft<10>(raw9), raw1);
   const VU16 packed2 = Or(ShiftLeft<10>(rawA), raw2);
   const VU16 packed3 = Or(ShiftLeft<10>(rawB), raw3);
   const VU16 packed4 = Or(ShiftLeft<10>(rawC), raw4);
   const VU16 packed5 = Or(ShiftLeft<10>(rawD), raw5);
   const VU16 packed6 = Or(ShiftLeft<10>(rawE), raw6);
   const VU16 packed7 = Or(ShiftLeft<10>(rawF), raw7);

   // We could shift down, OR and shift up, but two shifts are typically more
   // expensive than AND, shift into position, and OR (which can be further
   // reduced via Xor3).
   // Bits [9,6] of each raw move to 4-bit groups at offsets 0,4,8,12.
   const VU16 mid4 = Set(d, 0x3C0u);  // top 4 in lower 10
   const VU16 part8 = ShiftRight<6>(And(raw8, mid4));
   const VU16 part9 = ShiftRight<2>(And(raw9, mid4));
   const VU16 partA = ShiftLeft<2>(And(rawA, mid4));
   const VU16 partB = ShiftLeft<6>(And(rawB, mid4));
   const VU16 partC = ShiftRight<6>(And(rawC, mid4));
   const VU16 partD = ShiftRight<2>(And(rawD, mid4));
   const VU16 partE = ShiftLeft<2>(And(rawE, mid4));
   const VU16 partF = ShiftLeft<6>(And(rawF, mid4));
   // Disjoint bits, so XOR acts as OR.
   const VU16 packed8 = Or(Xor3(part8, part9, partA), partB);
   const VU16 packed9 = Or(Xor3(partC, partD, partE), partF);

   StoreU(packed0, d, packed_out + 0 * N);
   StoreU(packed1, d, packed_out + 1 * N);
   StoreU(packed2, d, packed_out + 2 * N);
   StoreU(packed3, d, packed_out + 3 * N);
   StoreU(packed4, d, packed_out + 4 * N);
   StoreU(packed5, d, packed_out + 5 * N);
   StoreU(packed6, d, packed_out + 6 * N);
   StoreU(packed7, d, packed_out + 7 * N);
   StoreU(packed8, d, packed_out + 8 * N);
   StoreU(packed9, d, packed_out + 9 * N);
 }

 // Loads 10 packed vectors and stores 16 raw vectors, inverting Pack above.
 template <class D>
 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                        uint16_t* HWY_RESTRICT raw) const {
   using VU16 = Vec<decltype(d)>;
   const size_t N = Lanes(d);

   const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
   const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
   const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
   const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
   const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
   const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
   const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
   const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
   const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
   const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));

   const VU16 mask = Set(d, 0x3FFu);  // Lowest 10 bits

   const VU16 raw0 = And(packed0, mask);
   StoreU(raw0, d, raw + 0 * N);

   const VU16 raw1 = And(packed1, mask);
   StoreU(raw1, d, raw + 1 * N);

   const VU16 raw2 = And(packed2, mask);
   StoreU(raw2, d, raw + 2 * N);

   const VU16 raw3 = And(packed3, mask);
   StoreU(raw3, d, raw + 3 * N);

   const VU16 raw4 = And(packed4, mask);
   StoreU(raw4, d, raw + 4 * N);

   const VU16 raw5 = And(packed5, mask);
   StoreU(raw5, d, raw + 5 * N);

   const VU16 raw6 = And(packed6, mask);
   StoreU(raw6, d, raw + 6 * N);

   const VU16 raw7 = And(packed7, mask);
   StoreU(raw7, d, raw + 7 * N);

   // raw8..F: low 6 bits come from the top of packed0..7; the top 4 bits
   // come from packed8 (raw8..B) or packed9 (rawC..F), shifted back to
   // bits [9,6] (inverse of Pack).
   const VU16 mid4 = Set(d, 0x3C0u);  // top 4 in lower 10
   const VU16 raw8 =
       OrAnd(ShiftRight<10>(packed0), ShiftLeft<6>(packed8), mid4);
   const VU16 raw9 =
       OrAnd(ShiftRight<10>(packed1), ShiftLeft<2>(packed8), mid4);
   const VU16 rawA =
       OrAnd(ShiftRight<10>(packed2), ShiftRight<2>(packed8), mid4);
   const VU16 rawB =
       OrAnd(ShiftRight<10>(packed3), ShiftRight<6>(packed8), mid4);
   const VU16 rawC =
       OrAnd(ShiftRight<10>(packed4), ShiftLeft<6>(packed9), mid4);
   const VU16 rawD =
       OrAnd(ShiftRight<10>(packed5), ShiftLeft<2>(packed9), mid4);
   const VU16 rawE =
       OrAnd(ShiftRight<10>(packed6), ShiftRight<2>(packed9), mid4);
   const VU16 rawF =
       OrAnd(ShiftRight<10>(packed7), ShiftRight<6>(packed9), mid4);

   StoreU(raw8, d, raw + 8 * N);
   StoreU(raw9, d, raw + 9 * N);
   StoreU(rawA, d, raw + 0xA * N);
   StoreU(rawB, d, raw + 0xB * N);
   StoreU(rawC, d, raw + 0xC * N);
   StoreU(rawD, d, raw + 0xD * N);
   StoreU(rawE, d, raw + 0xE * N);
   StoreU(rawF, d, raw + 0xF * N);
 }
};  // Pack16<10>
   1758 
// 11-bit values, split 8+3: packed0..7 hold the low 8 bits of two raw values
// each; packed8..A hold the top 3 bits of all 16 values (16 * 3 = 48 bits
// across three vectors). See layout comments in Pack below.
template <>
struct Pack16<11> {
 // Loads 16 raw vectors (values must fit in 11 bits) and stores 11 packed
 // vectors.
 template <class D>
 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                      uint16_t* HWY_RESTRICT packed_out) const {
   using VU16 = Vec<decltype(d)>;
   const size_t N = Lanes(d);
   const VU16 raw0 = LoadU(d, raw + 0 * N);
   const VU16 raw1 = LoadU(d, raw + 1 * N);
   const VU16 raw2 = LoadU(d, raw + 2 * N);
   const VU16 raw3 = LoadU(d, raw + 3 * N);
   const VU16 raw4 = LoadU(d, raw + 4 * N);
   const VU16 raw5 = LoadU(d, raw + 5 * N);
   const VU16 raw6 = LoadU(d, raw + 6 * N);
   const VU16 raw7 = LoadU(d, raw + 7 * N);
   const VU16 raw8 = LoadU(d, raw + 8 * N);
   const VU16 raw9 = LoadU(d, raw + 9 * N);
   const VU16 rawA = LoadU(d, raw + 0xA * N);
   const VU16 rawB = LoadU(d, raw + 0xB * N);
   const VU16 rawC = LoadU(d, raw + 0xC * N);
   const VU16 rawD = LoadU(d, raw + 0xD * N);
   const VU16 rawE = LoadU(d, raw + 0xE * N);
   const VU16 rawF = LoadU(d, raw + 0xF * N);

   // It is not obvious what the optimal partitioning looks like. To reduce the
   // number of constants, we want to minimize the number of distinct bit
   // lengths. 11+5 also requires 6-bit remnants with 4-bit leftovers.
   // 8+3 seems better: it is easier to scatter 3 bits into the MSBs.
   const VU16 lo8 = Set(d, 0xFFu);

   // Lower 8 bits of all raw
   const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
   const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
   const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
   const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
   const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
   const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
   const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
   const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);

   StoreU(packed0, d, packed_out + 0 * N);
   StoreU(packed1, d, packed_out + 1 * N);
   StoreU(packed2, d, packed_out + 2 * N);
   StoreU(packed3, d, packed_out + 3 * N);
   StoreU(packed4, d, packed_out + 4 * N);
   StoreU(packed5, d, packed_out + 5 * N);
   StoreU(packed6, d, packed_out + 6 * N);
   StoreU(packed7, d, packed_out + 7 * N);

   // Three vectors, five 3bit remnants each, plus one 3bit in their MSB.
   // packed8 holds bits [10,8] of raw0,3,6,9,C at offsets 0,3,6,9,12;
   // packed9 likewise for raw1,4,7,A,D and packedA for raw2,5,8,B,E.
   const VU16 top0 = ShiftRight<8>(raw0);
   const VU16 top1 = ShiftRight<8>(raw1);
   const VU16 top2 = ShiftRight<8>(raw2);
   // Insert top raw bits into 3-bit groups within packed8..A. Moving the
   // mask along avoids masking each of raw0..E and enables OrAnd.
   VU16 next = Set(d, 0x38u);  // 0x7 << 3
   VU16 packed8 = OrAnd(top0, ShiftRight<5>(raw3), next);
   VU16 packed9 = OrAnd(top1, ShiftRight<5>(raw4), next);
   VU16 packedA = OrAnd(top2, ShiftRight<5>(raw5), next);
   next = ShiftLeft<3>(next);
   packed8 = OrAnd(packed8, ShiftRight<2>(raw6), next);
   packed9 = OrAnd(packed9, ShiftRight<2>(raw7), next);
   packedA = OrAnd(packedA, ShiftRight<2>(raw8), next);
   next = ShiftLeft<3>(next);
   // Add(x, x) == ShiftLeft<1>(x).
   packed8 = OrAnd(packed8, Add(raw9, raw9), next);
   packed9 = OrAnd(packed9, Add(rawA, rawA), next);
   packedA = OrAnd(packedA, Add(rawB, rawB), next);
   next = ShiftLeft<3>(next);
   packed8 = OrAnd(packed8, ShiftLeft<4>(rawC), next);
   packed9 = OrAnd(packed9, ShiftLeft<4>(rawD), next);
   packedA = OrAnd(packedA, ShiftLeft<4>(rawE), next);

   // Scatter upper 3 bits of rawF into the upper bits.
   // rawF bit 8 -> packed8 MSB, bit 9 -> packed9 MSB, bit 10 -> packedA MSB.
   next = ShiftLeft<3>(next);  // = 0x8000u
   packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
   packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
   packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);

   StoreU(packed8, d, packed_out + 8 * N);
   StoreU(packed9, d, packed_out + 9 * N);
   StoreU(packedA, d, packed_out + 0xA * N);
 }

 // Loads 11 packed vectors and stores 16 raw vectors, inverting Pack above.
 template <class D>
 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                        uint16_t* HWY_RESTRICT raw) const {
   using VU16 = Vec<decltype(d)>;
   const size_t N = Lanes(d);

   const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
   const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
   const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
   const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
   const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
   const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
   const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
   const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
   const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
   const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
   const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));

   const VU16 mask = Set(d, 0xFFu);  // Lowest 8 bits

   // Low 8 bits of each raw value, two per packed vector.
   const VU16 down0 = And(packed0, mask);
   const VU16 down1 = ShiftRight<8>(packed0);
   const VU16 down2 = And(packed1, mask);
   const VU16 down3 = ShiftRight<8>(packed1);
   const VU16 down4 = And(packed2, mask);
   const VU16 down5 = ShiftRight<8>(packed2);
   const VU16 down6 = And(packed3, mask);
   const VU16 down7 = ShiftRight<8>(packed3);
   const VU16 down8 = And(packed4, mask);
   const VU16 down9 = ShiftRight<8>(packed4);
   const VU16 downA = And(packed5, mask);
   const VU16 downB = ShiftRight<8>(packed5);
   const VU16 downC = And(packed6, mask);
   const VU16 downD = ShiftRight<8>(packed6);
   const VU16 downE = And(packed7, mask);
   const VU16 downF = ShiftRight<8>(packed7);

   // Three bits from packed8..A, eight bits from down0..F. Shift amounts
   // mirror the 3-bit group offsets used by Pack.
   const VU16 hi3 = Set(d, 0x700u);
   const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi3);
   const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi3);
   const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi3);

   const VU16 raw3 = OrAnd(down3, ShiftLeft<5>(packed8), hi3);
   const VU16 raw4 = OrAnd(down4, ShiftLeft<5>(packed9), hi3);
   const VU16 raw5 = OrAnd(down5, ShiftLeft<5>(packedA), hi3);

   const VU16 raw6 = OrAnd(down6, ShiftLeft<2>(packed8), hi3);
   const VU16 raw7 = OrAnd(down7, ShiftLeft<2>(packed9), hi3);
   const VU16 raw8 = OrAnd(down8, ShiftLeft<2>(packedA), hi3);

   const VU16 raw9 = OrAnd(down9, ShiftRight<1>(packed8), hi3);
   const VU16 rawA = OrAnd(downA, ShiftRight<1>(packed9), hi3);
   const VU16 rawB = OrAnd(downB, ShiftRight<1>(packedA), hi3);

   const VU16 rawC = OrAnd(downC, ShiftRight<4>(packed8), hi3);
   const VU16 rawD = OrAnd(downD, ShiftRight<4>(packed9), hi3);
   const VU16 rawE = OrAnd(downE, ShiftRight<4>(packedA), hi3);

   // Shift MSB into the top 3-of-11 and mask: rawF's bits 8..10 come from
   // the MSBs of packed8..A respectively.
   const VU16 rawF = Or(downF, Xor3(And(ShiftRight<7>(packed8), hi3),
                                    And(ShiftRight<6>(packed9), hi3),
                                    And(ShiftRight<5>(packedA), hi3)));

   StoreU(raw0, d, raw + 0 * N);
   StoreU(raw1, d, raw + 1 * N);
   StoreU(raw2, d, raw + 2 * N);
   StoreU(raw3, d, raw + 3 * N);
   StoreU(raw4, d, raw + 4 * N);
   StoreU(raw5, d, raw + 5 * N);
   StoreU(raw6, d, raw + 6 * N);
   StoreU(raw7, d, raw + 7 * N);
   StoreU(raw8, d, raw + 8 * N);
   StoreU(raw9, d, raw + 9 * N);
   StoreU(rawA, d, raw + 0xA * N);
   StoreU(rawB, d, raw + 0xB * N);
   StoreU(rawC, d, raw + 0xC * N);
   StoreU(rawD, d, raw + 0xD * N);
   StoreU(rawE, d, raw + 0xE * N);
   StoreU(rawF, d, raw + 0xF * N);
 }
};  // Pack16<11>
   1924 
// 12-bit values: packed0..7 each hold all 12 bits of one of raw0..7 plus the
// low 4 bits of one of raw8..F; packed8..B each hold the top 8 bits of two of
// raw8..F (one per byte).
template <>
struct Pack16<12> {
 // Loads 16 raw vectors (values must fit in 12 bits) and stores 12 packed
 // vectors.
 template <class D>
 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                      uint16_t* HWY_RESTRICT packed_out) const {
   using VU16 = Vec<decltype(d)>;
   const size_t N = Lanes(d);
   const VU16 raw0 = LoadU(d, raw + 0 * N);
   const VU16 raw1 = LoadU(d, raw + 1 * N);
   const VU16 raw2 = LoadU(d, raw + 2 * N);
   const VU16 raw3 = LoadU(d, raw + 3 * N);
   const VU16 raw4 = LoadU(d, raw + 4 * N);
   const VU16 raw5 = LoadU(d, raw + 5 * N);
   const VU16 raw6 = LoadU(d, raw + 6 * N);
   const VU16 raw7 = LoadU(d, raw + 7 * N);
   const VU16 raw8 = LoadU(d, raw + 8 * N);
   const VU16 raw9 = LoadU(d, raw + 9 * N);
   const VU16 rawA = LoadU(d, raw + 0xA * N);
   const VU16 rawB = LoadU(d, raw + 0xB * N);
   const VU16 rawC = LoadU(d, raw + 0xC * N);
   const VU16 rawD = LoadU(d, raw + 0xD * N);
   const VU16 rawE = LoadU(d, raw + 0xE * N);
   const VU16 rawF = LoadU(d, raw + 0xF * N);

   // 8 vectors, each with 12+4 bits; top 8 bits are concatenated into
   // packed8 to packedB.
   const VU16 packed0 = Or(ShiftLeft<12>(raw8), raw0);
   const VU16 packed1 = Or(ShiftLeft<12>(raw9), raw1);
   const VU16 packed2 = Or(ShiftLeft<12>(rawA), raw2);
   const VU16 packed3 = Or(ShiftLeft<12>(rawB), raw3);
   const VU16 packed4 = Or(ShiftLeft<12>(rawC), raw4);
   const VU16 packed5 = Or(ShiftLeft<12>(rawD), raw5);
   const VU16 packed6 = Or(ShiftLeft<12>(rawE), raw6);
   const VU16 packed7 = Or(ShiftLeft<12>(rawF), raw7);

   // Masking after shifting left enables OrAnd.
   // Low byte = bits [11,4] of the even value, high byte of the odd one.
   const VU16 hi8 = Set(d, 0xFF00u);
   const VU16 packed8 = OrAnd(ShiftRight<4>(raw8), ShiftLeft<4>(raw9), hi8);
   const VU16 packed9 = OrAnd(ShiftRight<4>(rawA), ShiftLeft<4>(rawB), hi8);
   const VU16 packedA = OrAnd(ShiftRight<4>(rawC), ShiftLeft<4>(rawD), hi8);
   const VU16 packedB = OrAnd(ShiftRight<4>(rawE), ShiftLeft<4>(rawF), hi8);
   StoreU(packed0, d, packed_out + 0 * N);
   StoreU(packed1, d, packed_out + 1 * N);
   StoreU(packed2, d, packed_out + 2 * N);
   StoreU(packed3, d, packed_out + 3 * N);
   StoreU(packed4, d, packed_out + 4 * N);
   StoreU(packed5, d, packed_out + 5 * N);
   StoreU(packed6, d, packed_out + 6 * N);
   StoreU(packed7, d, packed_out + 7 * N);
   StoreU(packed8, d, packed_out + 8 * N);
   StoreU(packed9, d, packed_out + 9 * N);
   StoreU(packedA, d, packed_out + 0xA * N);
   StoreU(packedB, d, packed_out + 0xB * N);
 }

 // Loads 12 packed vectors and stores 16 raw vectors, inverting Pack above.
 template <class D>
 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                        uint16_t* HWY_RESTRICT raw) const {
   using VU16 = Vec<decltype(d)>;
   const size_t N = Lanes(d);

   const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
   const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
   const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
   const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
   const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
   const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
   const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
   const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
   const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
   const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
   const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
   const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));

   const VU16 mask = Set(d, 0xFFFu);  // Lowest 12 bits

   const VU16 raw0 = And(packed0, mask);
   StoreU(raw0, d, raw + 0 * N);

   const VU16 raw1 = And(packed1, mask);
   StoreU(raw1, d, raw + 1 * N);

   const VU16 raw2 = And(packed2, mask);
   StoreU(raw2, d, raw + 2 * N);

   const VU16 raw3 = And(packed3, mask);
   StoreU(raw3, d, raw + 3 * N);

   const VU16 raw4 = And(packed4, mask);
   StoreU(raw4, d, raw + 4 * N);

   const VU16 raw5 = And(packed5, mask);
   StoreU(raw5, d, raw + 5 * N);

   const VU16 raw6 = And(packed6, mask);
   StoreU(raw6, d, raw + 6 * N);

   const VU16 raw7 = And(packed7, mask);
   StoreU(raw7, d, raw + 7 * N);

   // raw8..F: low 4 bits come from the top nibble of packed0..7; bits
   // [11,4] come from the low byte (ShiftLeft<4>) or high byte
   // (ShiftRight<4>) of packed8..B.
   const VU16 mid8 = Set(d, 0xFF0u);  // upper 8 in lower 12
   const VU16 raw8 =
       OrAnd(ShiftRight<12>(packed0), ShiftLeft<4>(packed8), mid8);
   const VU16 raw9 =
       OrAnd(ShiftRight<12>(packed1), ShiftRight<4>(packed8), mid8);
   const VU16 rawA =
       OrAnd(ShiftRight<12>(packed2), ShiftLeft<4>(packed9), mid8);
   const VU16 rawB =
       OrAnd(ShiftRight<12>(packed3), ShiftRight<4>(packed9), mid8);
   const VU16 rawC =
       OrAnd(ShiftRight<12>(packed4), ShiftLeft<4>(packedA), mid8);
   const VU16 rawD =
       OrAnd(ShiftRight<12>(packed5), ShiftRight<4>(packedA), mid8);
   const VU16 rawE =
       OrAnd(ShiftRight<12>(packed6), ShiftLeft<4>(packedB), mid8);
   const VU16 rawF =
       OrAnd(ShiftRight<12>(packed7), ShiftRight<4>(packedB), mid8);
   StoreU(raw8, d, raw + 8 * N);
   StoreU(raw9, d, raw + 9 * N);
   StoreU(rawA, d, raw + 0xA * N);
   StoreU(rawB, d, raw + 0xB * N);
   StoreU(rawC, d, raw + 0xC * N);
   StoreU(rawD, d, raw + 0xD * N);
   StoreU(rawE, d, raw + 0xE * N);
   StoreU(rawF, d, raw + 0xF * N);
 }
};  // Pack16<12>
   2052 
   2053 template <>
   2054 struct Pack16<13> {
   2055  template <class D>
   2056  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
   2057                       uint16_t* HWY_RESTRICT packed_out) const {
   2058    using VU16 = Vec<decltype(d)>;
   2059    const size_t N = Lanes(d);
   2060    const VU16 raw0 = LoadU(d, raw + 0 * N);
   2061    const VU16 raw1 = LoadU(d, raw + 1 * N);
   2062    const VU16 raw2 = LoadU(d, raw + 2 * N);
   2063    const VU16 raw3 = LoadU(d, raw + 3 * N);
   2064    const VU16 raw4 = LoadU(d, raw + 4 * N);
   2065    const VU16 raw5 = LoadU(d, raw + 5 * N);
   2066    const VU16 raw6 = LoadU(d, raw + 6 * N);
   2067    const VU16 raw7 = LoadU(d, raw + 7 * N);
   2068    const VU16 raw8 = LoadU(d, raw + 8 * N);
   2069    const VU16 raw9 = LoadU(d, raw + 9 * N);
   2070    const VU16 rawA = LoadU(d, raw + 0xA * N);
   2071    const VU16 rawB = LoadU(d, raw + 0xB * N);
   2072    const VU16 rawC = LoadU(d, raw + 0xC * N);
   2073    const VU16 rawD = LoadU(d, raw + 0xD * N);
   2074    const VU16 rawE = LoadU(d, raw + 0xE * N);
   2075    const VU16 rawF = LoadU(d, raw + 0xF * N);
   2076 
   2077    // As with 11 bits, it is not obvious what the optimal partitioning looks
   2078    // like. We similarly go with an 8+5 split.
   2079    const VU16 lo8 = Set(d, 0xFFu);
   2080 
   2081    // Lower 8 bits of all raw
   2082    const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
   2083    const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
   2084    const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
   2085    const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
   2086    const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
   2087    const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
   2088    const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
   2089    const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);
   2090 
   2091    StoreU(packed0, d, packed_out + 0 * N);
   2092    StoreU(packed1, d, packed_out + 1 * N);
   2093    StoreU(packed2, d, packed_out + 2 * N);
   2094    StoreU(packed3, d, packed_out + 3 * N);
   2095    StoreU(packed4, d, packed_out + 4 * N);
   2096    StoreU(packed5, d, packed_out + 5 * N);
   2097    StoreU(packed6, d, packed_out + 6 * N);
   2098    StoreU(packed7, d, packed_out + 7 * N);
   2099 
   2100    // Five vectors, three 5bit remnants each, plus one 5bit in their MSB.
   2101    const VU16 top0 = ShiftRight<8>(raw0);
   2102    const VU16 top1 = ShiftRight<8>(raw1);
   2103    const VU16 top2 = ShiftRight<8>(raw2);
   2104    const VU16 top3 = ShiftRight<8>(raw3);
   2105    const VU16 top4 = ShiftRight<8>(raw4);
   2106 
   2107    // Insert top raw bits into 5-bit groups within packed8..C. Moving the
   2108    // mask along avoids masking each of raw0..E and enables OrAnd.
   2109    VU16 next = Set(d, 0x3E0u);  // 0x1F << 5
   2110    VU16 packed8 = OrAnd(top0, ShiftRight<3>(raw5), next);
   2111    VU16 packed9 = OrAnd(top1, ShiftRight<3>(raw6), next);
   2112    VU16 packedA = OrAnd(top2, ShiftRight<3>(raw7), next);
   2113    VU16 packedB = OrAnd(top3, ShiftRight<3>(raw8), next);
   2114    VU16 packedC = OrAnd(top4, ShiftRight<3>(raw9), next);
   2115    next = ShiftLeft<5>(next);
   2116    packed8 = OrAnd(packed8, ShiftLeft<2>(rawA), next);
   2117    packed9 = OrAnd(packed9, ShiftLeft<2>(rawB), next);
   2118    packedA = OrAnd(packedA, ShiftLeft<2>(rawC), next);
   2119    packedB = OrAnd(packedB, ShiftLeft<2>(rawD), next);
   2120    packedC = OrAnd(packedC, ShiftLeft<2>(rawE), next);
   2121 
   2122    // Scatter upper 5 bits of rawF into the upper bits.
   2123    next = ShiftLeft<3>(next);  // = 0x8000u
   2124    packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
   2125    packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
   2126    packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);
   2127    packedB = OrAnd(packedB, ShiftLeft<4>(rawF), next);
   2128    packedC = OrAnd(packedC, ShiftLeft<3>(rawF), next);
   2129 
   2130    StoreU(packed8, d, packed_out + 8 * N);
   2131    StoreU(packed9, d, packed_out + 9 * N);
   2132    StoreU(packedA, d, packed_out + 0xA * N);
   2133    StoreU(packedB, d, packed_out + 0xB * N);
   2134    StoreU(packedC, d, packed_out + 0xC * N);
   2135  }
   2136 
   2137  template <class D>
   2138  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
   2139                         uint16_t* HWY_RESTRICT raw) const {
   2140    using VU16 = Vec<decltype(d)>;
   2141    const size_t N = Lanes(d);
   2142 
   2143    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
   2144    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
   2145    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
   2146    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
   2147    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
   2148    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
   2149    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
   2150    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
   2151    const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
   2152    const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
   2153    const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
   2154    const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
   2155    const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
   2156 
   2157    const VU16 mask = Set(d, 0xFFu);  // Lowest 8 bits
   2158 
   2159    const VU16 down0 = And(packed0, mask);
   2160    const VU16 down1 = ShiftRight<8>(packed0);
   2161    const VU16 down2 = And(packed1, mask);
   2162    const VU16 down3 = ShiftRight<8>(packed1);
   2163    const VU16 down4 = And(packed2, mask);
   2164    const VU16 down5 = ShiftRight<8>(packed2);
   2165    const VU16 down6 = And(packed3, mask);
   2166    const VU16 down7 = ShiftRight<8>(packed3);
   2167    const VU16 down8 = And(packed4, mask);
   2168    const VU16 down9 = ShiftRight<8>(packed4);
   2169    const VU16 downA = And(packed5, mask);
   2170    const VU16 downB = ShiftRight<8>(packed5);
   2171    const VU16 downC = And(packed6, mask);
   2172    const VU16 downD = ShiftRight<8>(packed6);
   2173    const VU16 downE = And(packed7, mask);
   2174    const VU16 downF = ShiftRight<8>(packed7);
   2175 
   2176    // Upper five bits from packed8..C, eight bits from down0..F.
   2177    const VU16 hi5 = Set(d, 0x1F00u);
   2178    const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi5);
   2179    const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi5);
   2180    const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi5);
   2181    const VU16 raw3 = OrAnd(down3, ShiftLeft<8>(packedB), hi5);
   2182    const VU16 raw4 = OrAnd(down4, ShiftLeft<8>(packedC), hi5);
   2183 
   2184    const VU16 raw5 = OrAnd(down5, ShiftLeft<3>(packed8), hi5);
   2185    const VU16 raw6 = OrAnd(down6, ShiftLeft<3>(packed9), hi5);
   2186    const VU16 raw7 = OrAnd(down7, ShiftLeft<3>(packedA), hi5);
   2187    const VU16 raw8 = OrAnd(down8, ShiftLeft<3>(packed9), hi5);
   2188    const VU16 raw9 = OrAnd(down9, ShiftLeft<3>(packedA), hi5);
   2189 
   2190    const VU16 rawA = OrAnd(downA, ShiftRight<2>(packed8), hi5);
   2191    const VU16 rawB = OrAnd(downB, ShiftRight<2>(packed9), hi5);
   2192    const VU16 rawC = OrAnd(downC, ShiftRight<2>(packedA), hi5);
   2193    const VU16 rawD = OrAnd(downD, ShiftRight<2>(packed9), hi5);
   2194    const VU16 rawE = OrAnd(downE, ShiftRight<2>(packedA), hi5);
   2195 
   2196    // Shift MSB into the top 5-of-11 and mask.
   2197    const VU16 p0 = Xor3(And(ShiftRight<7>(packed8), hi5),  //
   2198                         And(ShiftRight<6>(packed9), hi5),
   2199                         And(ShiftRight<5>(packedA), hi5));
   2200    const VU16 p1 = Xor3(And(ShiftRight<4>(packedB), hi5),
   2201                         And(ShiftRight<3>(packedC), hi5), downF);
   2202    const VU16 rawF = Or(p0, p1);
   2203 
   2204    StoreU(raw0, d, raw + 0 * N);
   2205    StoreU(raw1, d, raw + 1 * N);
   2206    StoreU(raw2, d, raw + 2 * N);
   2207    StoreU(raw3, d, raw + 3 * N);
   2208    StoreU(raw4, d, raw + 4 * N);
   2209    StoreU(raw5, d, raw + 5 * N);
   2210    StoreU(raw6, d, raw + 6 * N);
   2211    StoreU(raw7, d, raw + 7 * N);
   2212    StoreU(raw8, d, raw + 8 * N);
   2213    StoreU(raw9, d, raw + 9 * N);
   2214    StoreU(rawA, d, raw + 0xA * N);
   2215    StoreU(rawB, d, raw + 0xB * N);
   2216    StoreU(rawC, d, raw + 0xC * N);
   2217    StoreU(rawD, d, raw + 0xD * N);
   2218    StoreU(rawE, d, raw + 0xE * N);
   2219    StoreU(rawF, d, raw + 0xF * N);
   2220  }
   2221 };  // Pack16<13>
   2222 
   2223 template <>
   2224 struct Pack16<14> {
   2225  template <class D>
   2226  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
   2227                       uint16_t* HWY_RESTRICT packed_out) const {
   2228    using VU16 = Vec<decltype(d)>;
   2229    const size_t N = Lanes(d);
   2230    const VU16 raw0 = LoadU(d, raw + 0 * N);
   2231    const VU16 raw1 = LoadU(d, raw + 1 * N);
   2232    const VU16 raw2 = LoadU(d, raw + 2 * N);
   2233    const VU16 raw3 = LoadU(d, raw + 3 * N);
   2234    const VU16 raw4 = LoadU(d, raw + 4 * N);
   2235    const VU16 raw5 = LoadU(d, raw + 5 * N);
   2236    const VU16 raw6 = LoadU(d, raw + 6 * N);
   2237    const VU16 raw7 = LoadU(d, raw + 7 * N);
   2238    const VU16 raw8 = LoadU(d, raw + 8 * N);
   2239    const VU16 raw9 = LoadU(d, raw + 9 * N);
   2240    const VU16 rawA = LoadU(d, raw + 0xA * N);
   2241    const VU16 rawB = LoadU(d, raw + 0xB * N);
   2242    const VU16 rawC = LoadU(d, raw + 0xC * N);
   2243    const VU16 rawD = LoadU(d, raw + 0xD * N);
   2244    const VU16 rawE = LoadU(d, raw + 0xE * N);
   2245    const VU16 rawF = LoadU(d, raw + 0xF * N);
   2246 
   2247    // 14 vectors, each with 14+2 bits; two raw vectors are scattered
   2248    // across the upper 2 bits.
   2249    const VU16 hi2 = Set(d, 0xC000u);
   2250    const VU16 packed0 = Or(raw0, ShiftLeft<14>(rawE));
   2251    const VU16 packed1 = OrAnd(raw1, ShiftLeft<12>(rawE), hi2);
   2252    const VU16 packed2 = OrAnd(raw2, ShiftLeft<10>(rawE), hi2);
   2253    const VU16 packed3 = OrAnd(raw3, ShiftLeft<8>(rawE), hi2);
   2254    const VU16 packed4 = OrAnd(raw4, ShiftLeft<6>(rawE), hi2);
   2255    const VU16 packed5 = OrAnd(raw5, ShiftLeft<4>(rawE), hi2);
   2256    const VU16 packed6 = OrAnd(raw6, ShiftLeft<2>(rawE), hi2);
   2257    const VU16 packed7 = Or(raw7, ShiftLeft<14>(rawF));
   2258    const VU16 packed8 = OrAnd(raw8, ShiftLeft<12>(rawF), hi2);
   2259    const VU16 packed9 = OrAnd(raw9, ShiftLeft<10>(rawF), hi2);
   2260    const VU16 packedA = OrAnd(rawA, ShiftLeft<8>(rawF), hi2);
   2261    const VU16 packedB = OrAnd(rawB, ShiftLeft<6>(rawF), hi2);
   2262    const VU16 packedC = OrAnd(rawC, ShiftLeft<4>(rawF), hi2);
   2263    const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi2);
   2264 
   2265    StoreU(packed0, d, packed_out + 0 * N);
   2266    StoreU(packed1, d, packed_out + 1 * N);
   2267    StoreU(packed2, d, packed_out + 2 * N);
   2268    StoreU(packed3, d, packed_out + 3 * N);
   2269    StoreU(packed4, d, packed_out + 4 * N);
   2270    StoreU(packed5, d, packed_out + 5 * N);
   2271    StoreU(packed6, d, packed_out + 6 * N);
   2272    StoreU(packed7, d, packed_out + 7 * N);
   2273    StoreU(packed8, d, packed_out + 8 * N);
   2274    StoreU(packed9, d, packed_out + 9 * N);
   2275    StoreU(packedA, d, packed_out + 0xA * N);
   2276    StoreU(packedB, d, packed_out + 0xB * N);
   2277    StoreU(packedC, d, packed_out + 0xC * N);
   2278    StoreU(packedD, d, packed_out + 0xD * N);
   2279  }
   2280 
   2281  template <class D>
   2282  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
   2283                         uint16_t* HWY_RESTRICT raw) const {
   2284    using VU16 = Vec<decltype(d)>;
   2285    const size_t N = Lanes(d);
   2286 
   2287    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
   2288    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
   2289    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
   2290    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
   2291    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
   2292    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
   2293    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
   2294    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
   2295    const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
   2296    const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
   2297    const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
   2298    const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
   2299    const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
   2300    const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N));
   2301 
   2302    const VU16 mask = Set(d, 0x3FFFu);  // Lowest 14 bits
   2303 
   2304    const VU16 raw0 = And(packed0, mask);
   2305    StoreU(raw0, d, raw + 0 * N);
   2306 
   2307    const VU16 raw1 = And(packed1, mask);
   2308    StoreU(raw1, d, raw + 1 * N);
   2309 
   2310    const VU16 raw2 = And(packed2, mask);
   2311    StoreU(raw2, d, raw + 2 * N);
   2312 
   2313    const VU16 raw3 = And(packed3, mask);
   2314    StoreU(raw3, d, raw + 3 * N);
   2315 
   2316    const VU16 raw4 = And(packed4, mask);
   2317    StoreU(raw4, d, raw + 4 * N);
   2318 
   2319    const VU16 raw5 = And(packed5, mask);
   2320    StoreU(raw5, d, raw + 5 * N);
   2321 
   2322    const VU16 raw6 = And(packed6, mask);
   2323    StoreU(raw6, d, raw + 6 * N);
   2324 
   2325    const VU16 raw7 = And(packed7, mask);
   2326    StoreU(raw7, d, raw + 7 * N);
   2327 
   2328    const VU16 raw8 = And(packed8, mask);
   2329    StoreU(raw8, d, raw + 8 * N);
   2330 
   2331    const VU16 raw9 = And(packed9, mask);
   2332    StoreU(raw9, d, raw + 9 * N);
   2333 
   2334    const VU16 rawA = And(packedA, mask);
   2335    StoreU(rawA, d, raw + 0xA * N);
   2336 
   2337    const VU16 rawB = And(packedB, mask);
   2338    StoreU(rawB, d, raw + 0xB * N);
   2339 
   2340    const VU16 rawC = And(packedC, mask);
   2341    StoreU(rawC, d, raw + 0xC * N);
   2342 
   2343    const VU16 rawD = And(packedD, mask);
   2344    StoreU(rawD, d, raw + 0xD * N);
   2345 
   2346    // rawE is the concatenation of the top two bits in packed0..6.
   2347    const VU16 E0 = Xor3(ShiftRight<14>(packed0),  //
   2348                         ShiftRight<12>(AndNot(mask, packed1)),
   2349                         ShiftRight<10>(AndNot(mask, packed2)));
   2350    const VU16 E1 = Xor3(ShiftRight<8>(AndNot(mask, packed3)),
   2351                         ShiftRight<6>(AndNot(mask, packed4)),
   2352                         ShiftRight<4>(AndNot(mask, packed5)));
   2353    const VU16 rawE = Xor3(ShiftRight<2>(AndNot(mask, packed6)), E0, E1);
   2354    const VU16 F0 = Xor3(ShiftRight<14>(AndNot(mask, packed7)),
   2355                         ShiftRight<12>(AndNot(mask, packed8)),
   2356                         ShiftRight<10>(AndNot(mask, packed9)));
   2357    const VU16 F1 = Xor3(ShiftRight<8>(AndNot(mask, packedA)),
   2358                         ShiftRight<6>(AndNot(mask, packedB)),
   2359                         ShiftRight<4>(AndNot(mask, packedC)));
   2360    const VU16 rawF = Xor3(ShiftRight<2>(AndNot(mask, packedD)), F0, F1);
   2361    StoreU(rawE, d, raw + 0xE * N);
   2362    StoreU(rawF, d, raw + 0xF * N);
   2363  }
   2364 };  // Pack16<14>
   2365 
   2366 template <>
   2367 struct Pack16<15> {
   2368  template <class D>
   2369  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
   2370                       uint16_t* HWY_RESTRICT packed_out) const {
   2371    using VU16 = Vec<decltype(d)>;
   2372    const size_t N = Lanes(d);
   2373    const VU16 raw0 = LoadU(d, raw + 0 * N);
   2374    const VU16 raw1 = LoadU(d, raw + 1 * N);
   2375    const VU16 raw2 = LoadU(d, raw + 2 * N);
   2376    const VU16 raw3 = LoadU(d, raw + 3 * N);
   2377    const VU16 raw4 = LoadU(d, raw + 4 * N);
   2378    const VU16 raw5 = LoadU(d, raw + 5 * N);
   2379    const VU16 raw6 = LoadU(d, raw + 6 * N);
   2380    const VU16 raw7 = LoadU(d, raw + 7 * N);
   2381    const VU16 raw8 = LoadU(d, raw + 8 * N);
   2382    const VU16 raw9 = LoadU(d, raw + 9 * N);
   2383    const VU16 rawA = LoadU(d, raw + 0xA * N);
   2384    const VU16 rawB = LoadU(d, raw + 0xB * N);
   2385    const VU16 rawC = LoadU(d, raw + 0xC * N);
   2386    const VU16 rawD = LoadU(d, raw + 0xD * N);
   2387    const VU16 rawE = LoadU(d, raw + 0xE * N);
   2388    const VU16 rawF = LoadU(d, raw + 0xF * N);
   2389 
   2390    // 15 vectors, each with 15+1 bits; one packed vector is scattered
   2391    // across the upper bit.
   2392    const VU16 hi1 = Set(d, 0x8000u);
   2393    const VU16 packed0 = Or(raw0, ShiftLeft<15>(rawF));
   2394    const VU16 packed1 = OrAnd(raw1, ShiftLeft<14>(rawF), hi1);
   2395    const VU16 packed2 = OrAnd(raw2, ShiftLeft<13>(rawF), hi1);
   2396    const VU16 packed3 = OrAnd(raw3, ShiftLeft<12>(rawF), hi1);
   2397    const VU16 packed4 = OrAnd(raw4, ShiftLeft<11>(rawF), hi1);
   2398    const VU16 packed5 = OrAnd(raw5, ShiftLeft<10>(rawF), hi1);
   2399    const VU16 packed6 = OrAnd(raw6, ShiftLeft<9>(rawF), hi1);
   2400    const VU16 packed7 = OrAnd(raw7, ShiftLeft<8>(rawF), hi1);
   2401    const VU16 packed8 = OrAnd(raw8, ShiftLeft<7>(rawF), hi1);
   2402    const VU16 packed9 = OrAnd(raw9, ShiftLeft<6>(rawF), hi1);
   2403    const VU16 packedA = OrAnd(rawA, ShiftLeft<5>(rawF), hi1);
   2404    const VU16 packedB = OrAnd(rawB, ShiftLeft<4>(rawF), hi1);
   2405    const VU16 packedC = OrAnd(rawC, ShiftLeft<3>(rawF), hi1);
   2406    const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi1);
   2407    const VU16 packedE = OrAnd(rawE, ShiftLeft<1>(rawF), hi1);
   2408 
   2409    StoreU(packed0, d, packed_out + 0 * N);
   2410    StoreU(packed1, d, packed_out + 1 * N);
   2411    StoreU(packed2, d, packed_out + 2 * N);
   2412    StoreU(packed3, d, packed_out + 3 * N);
   2413    StoreU(packed4, d, packed_out + 4 * N);
   2414    StoreU(packed5, d, packed_out + 5 * N);
   2415    StoreU(packed6, d, packed_out + 6 * N);
   2416    StoreU(packed7, d, packed_out + 7 * N);
   2417    StoreU(packed8, d, packed_out + 8 * N);
   2418    StoreU(packed9, d, packed_out + 9 * N);
   2419    StoreU(packedA, d, packed_out + 0xA * N);
   2420    StoreU(packedB, d, packed_out + 0xB * N);
   2421    StoreU(packedC, d, packed_out + 0xC * N);
   2422    StoreU(packedD, d, packed_out + 0xD * N);
   2423    StoreU(packedE, d, packed_out + 0xE * N);
   2424  }
   2425 
   2426  template <class D>
   2427  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
   2428                         uint16_t* HWY_RESTRICT raw) const {
   2429    using VU16 = Vec<decltype(d)>;
   2430    const size_t N = Lanes(d);
   2431 
   2432    const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
   2433    const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
   2434    const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
   2435    const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
   2436    const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
   2437    const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
   2438    const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
   2439    const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
   2440    const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
   2441    const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
   2442    const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
   2443    const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
   2444    const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
   2445    const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N));
   2446    const VU16 packedE = BitCast(d, LoadU(d, packed_in + 0xE * N));
   2447 
   2448    const VU16 mask = Set(d, 0x7FFFu);  // Lowest 15 bits
   2449 
   2450    const VU16 raw0 = And(packed0, mask);
   2451    StoreU(raw0, d, raw + 0 * N);
   2452 
   2453    const VU16 raw1 = And(packed1, mask);
   2454    StoreU(raw1, d, raw + 1 * N);
   2455 
   2456    const VU16 raw2 = And(packed2, mask);
   2457    StoreU(raw2, d, raw + 2 * N);
   2458 
   2459    const VU16 raw3 = And(packed3, mask);
   2460    StoreU(raw3, d, raw + 3 * N);
   2461 
   2462    const VU16 raw4 = And(packed4, mask);
   2463    StoreU(raw4, d, raw + 4 * N);
   2464 
   2465    const VU16 raw5 = And(packed5, mask);
   2466    StoreU(raw5, d, raw + 5 * N);
   2467 
   2468    const VU16 raw6 = And(packed6, mask);
   2469    StoreU(raw6, d, raw + 6 * N);
   2470 
   2471    const VU16 raw7 = And(packed7, mask);
   2472    StoreU(raw7, d, raw + 7 * N);
   2473 
   2474    const VU16 raw8 = And(packed8, mask);
   2475    StoreU(raw8, d, raw + 8 * N);
   2476 
   2477    const VU16 raw9 = And(packed9, mask);
   2478    StoreU(raw9, d, raw + 9 * N);
   2479 
   2480    const VU16 rawA = And(packedA, mask);
   2481    StoreU(rawA, d, raw + 0xA * N);
   2482 
   2483    const VU16 rawB = And(packedB, mask);
   2484    StoreU(rawB, d, raw + 0xB * N);
   2485 
   2486    const VU16 rawC = And(packedC, mask);
   2487    StoreU(rawC, d, raw + 0xC * N);
   2488 
   2489    const VU16 rawD = And(packedD, mask);
   2490    StoreU(rawD, d, raw + 0xD * N);
   2491 
   2492    const VU16 rawE = And(packedE, mask);
   2493    StoreU(rawE, d, raw + 0xE * N);
   2494 
   2495    // rawF is the concatenation of the top bit in packed0..E.
   2496    const VU16 F0 = Xor3(ShiftRight<15>(packed0),  //
   2497                         ShiftRight<14>(AndNot(mask, packed1)),
   2498                         ShiftRight<13>(AndNot(mask, packed2)));
   2499    const VU16 F1 = Xor3(ShiftRight<12>(AndNot(mask, packed3)),
   2500                         ShiftRight<11>(AndNot(mask, packed4)),
   2501                         ShiftRight<10>(AndNot(mask, packed5)));
   2502    const VU16 F2 = Xor3(ShiftRight<9>(AndNot(mask, packed6)),
   2503                         ShiftRight<8>(AndNot(mask, packed7)),
   2504                         ShiftRight<7>(AndNot(mask, packed8)));
   2505    const VU16 F3 = Xor3(ShiftRight<6>(AndNot(mask, packed9)),
   2506                         ShiftRight<5>(AndNot(mask, packedA)),
   2507                         ShiftRight<4>(AndNot(mask, packedB)));
   2508    const VU16 F4 = Xor3(ShiftRight<3>(AndNot(mask, packedC)),
   2509                         ShiftRight<2>(AndNot(mask, packedD)),
   2510                         ShiftRight<1>(AndNot(mask, packedE)));
   2511    const VU16 rawF = Xor3(F0, F1, Xor3(F2, F3, F4));
   2512    StoreU(rawF, d, raw + 0xF * N);
   2513  }
   2514 };  // Pack16<15>
   2515 
   2516 template <>
   2517 struct Pack16<16> {
   2518  template <class D>
   2519  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
   2520                       uint16_t* HWY_RESTRICT packed_out) const {
   2521    using VU16 = Vec<decltype(d)>;
   2522    const size_t N = Lanes(d);
   2523    const VU16 raw0 = LoadU(d, raw + 0 * N);
   2524    const VU16 raw1 = LoadU(d, raw + 1 * N);
   2525    const VU16 raw2 = LoadU(d, raw + 2 * N);
   2526    const VU16 raw3 = LoadU(d, raw + 3 * N);
   2527    const VU16 raw4 = LoadU(d, raw + 4 * N);
   2528    const VU16 raw5 = LoadU(d, raw + 5 * N);
   2529    const VU16 raw6 = LoadU(d, raw + 6 * N);
   2530    const VU16 raw7 = LoadU(d, raw + 7 * N);
   2531    const VU16 raw8 = LoadU(d, raw + 8 * N);
   2532    const VU16 raw9 = LoadU(d, raw + 9 * N);
   2533    const VU16 rawA = LoadU(d, raw + 0xA * N);
   2534    const VU16 rawB = LoadU(d, raw + 0xB * N);
   2535    const VU16 rawC = LoadU(d, raw + 0xC * N);
   2536    const VU16 rawD = LoadU(d, raw + 0xD * N);
   2537    const VU16 rawE = LoadU(d, raw + 0xE * N);
   2538    const VU16 rawF = LoadU(d, raw + 0xF * N);
   2539 
   2540    StoreU(raw0, d, packed_out + 0 * N);
   2541    StoreU(raw1, d, packed_out + 1 * N);
   2542    StoreU(raw2, d, packed_out + 2 * N);
   2543    StoreU(raw3, d, packed_out + 3 * N);
   2544    StoreU(raw4, d, packed_out + 4 * N);
   2545    StoreU(raw5, d, packed_out + 5 * N);
   2546    StoreU(raw6, d, packed_out + 6 * N);
   2547    StoreU(raw7, d, packed_out + 7 * N);
   2548    StoreU(raw8, d, packed_out + 8 * N);
   2549    StoreU(raw9, d, packed_out + 9 * N);
   2550    StoreU(rawA, d, packed_out + 0xA * N);
   2551    StoreU(rawB, d, packed_out + 0xB * N);
   2552    StoreU(rawC, d, packed_out + 0xC * N);
   2553    StoreU(rawD, d, packed_out + 0xD * N);
   2554    StoreU(rawE, d, packed_out + 0xE * N);
   2555    StoreU(rawF, d, packed_out + 0xF * N);
   2556  }
   2557 
   2558  template <class D>
   2559  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
   2560                         uint16_t* HWY_RESTRICT raw) const {
   2561    using VU16 = Vec<decltype(d)>;
   2562    const size_t N = Lanes(d);
   2563 
   2564    const VU16 raw0 = BitCast(d, LoadU(d, packed_in + 0 * N));
   2565    const VU16 raw1 = BitCast(d, LoadU(d, packed_in + 1 * N));
   2566    const VU16 raw2 = BitCast(d, LoadU(d, packed_in + 2 * N));
   2567    const VU16 raw3 = BitCast(d, LoadU(d, packed_in + 3 * N));
   2568    const VU16 raw4 = BitCast(d, LoadU(d, packed_in + 4 * N));
   2569    const VU16 raw5 = BitCast(d, LoadU(d, packed_in + 5 * N));
   2570    const VU16 raw6 = BitCast(d, LoadU(d, packed_in + 6 * N));
   2571    const VU16 raw7 = BitCast(d, LoadU(d, packed_in + 7 * N));
   2572    const VU16 raw8 = BitCast(d, LoadU(d, packed_in + 8 * N));
   2573    const VU16 raw9 = BitCast(d, LoadU(d, packed_in + 9 * N));
   2574    const VU16 rawA = BitCast(d, LoadU(d, packed_in + 0xA * N));
   2575    const VU16 rawB = BitCast(d, LoadU(d, packed_in + 0xB * N));
   2576    const VU16 rawC = BitCast(d, LoadU(d, packed_in + 0xC * N));
   2577    const VU16 rawD = BitCast(d, LoadU(d, packed_in + 0xD * N));
   2578    const VU16 rawE = BitCast(d, LoadU(d, packed_in + 0xE * N));
   2579    const VU16 rawF = BitCast(d, LoadU(d, packed_in + 0xF * N));
   2580 
   2581    StoreU(raw0, d, raw + 0 * N);
   2582    StoreU(raw1, d, raw + 1 * N);
   2583    StoreU(raw2, d, raw + 2 * N);
   2584    StoreU(raw3, d, raw + 3 * N);
   2585    StoreU(raw4, d, raw + 4 * N);
   2586    StoreU(raw5, d, raw + 5 * N);
   2587    StoreU(raw6, d, raw + 6 * N);
   2588    StoreU(raw7, d, raw + 7 * N);
   2589    StoreU(raw8, d, raw + 8 * N);
   2590    StoreU(raw9, d, raw + 9 * N);
   2591    StoreU(rawA, d, raw + 0xA * N);
   2592    StoreU(rawB, d, raw + 0xB * N);
   2593    StoreU(rawC, d, raw + 0xC * N);
   2594    StoreU(rawD, d, raw + 0xD * N);
   2595    StoreU(rawE, d, raw + 0xE * N);
   2596    StoreU(rawF, d, raw + 0xF * N);
   2597  }
   2598 };  // Pack16<16>
   2599 
// The supported packing types for 32/64 bits.
enum BlockPackingType {
 // Simple fixed bit-packing: each value is stored verbatim in kBits bits.
 kBitPacked,
 // Bit packing after subtracting a `frame of reference` value from each
 // input; unpacking adds the value back (see BitPackUnroller below). Useful
 // when inputs share a large common offset, reducing the bits required.
 kFoRBitPacked,
};
   2607 
   2608 namespace detail {
   2609 
// Generates the implementation for bit-packing/un-packing `T` type numbers
// where each number takes `kBits` bits.
// `S` is the remainder bits left from the previous bit-packed block.
// `kLoadPos` is the offset from which the next vector block should be loaded.
// `kStorePos` is the offset into which the next vector block should be stored.
// `BlockPackingType` is the type of packing/unpacking for this block.
//
// Pack and Unpack are compile-time-unrolled state machines: each call handles
// one step (load/accumulate or store/extract) and then tail-calls the next
// specialization (`NextUnroller`) until `S == B` terminates the recursion.
// NOTE(review): the driver that seeds the initial `S`, `in` and `out` values
// is not visible in this chunk; the invariants below are inferred from the
// recursion itself — confirm against the 32/64-bit entry points.
template <typename T, size_t kBits, size_t S, size_t kLoadPos, size_t kStorePos,
         BlockPackingType block_packing_type>
struct BitPackUnroller {
 // Number of bits per lane of T (e.g. 32 for uint32_t).
 static constexpr size_t B = sizeof(T) * 8;

 // One step of packing: `out` is the packed word currently being assembled
 // (S bits of it produced so far, possibly > B when a value straddles two
 // packed words) and `in` is the most recently loaded raw vector.
 template <class D, typename V>
 static inline void Pack(D d, const T* HWY_RESTRICT raw,
                         T* HWY_RESTRICT packed_out, const V& mask,
                         const V& frame_of_reference, V& in, V& out) {
   // Avoid compilation errors and unnecessary template instantiation if
   // compiling in C++11 or C++14 mode
   using NextUnroller = BitPackUnroller<
       T, kBits, ((S <= B) ? (S + ((S < B) ? kBits : 0)) : (S % B)),
       kLoadPos + static_cast<size_t>(S < B),
       kStorePos + static_cast<size_t>(S > B), block_packing_type>;

   // Not every branch below uses these; silence unused-parameter warnings.
   (void)raw;
   (void)mask;
   (void)in;

   const size_t N = Lanes(d);
   // A full packed word has been assembled: store it.
   HWY_IF_CONSTEXPR(S >= B) {
     StoreU(out, d, packed_out + kStorePos * N);
     // S == B: the last value ended exactly on a word boundary — done.
     HWY_IF_CONSTEXPR(S == B) { return; }
     HWY_IF_CONSTEXPR(S != B) {
       // S - B bits of `in` spilled over; start the next packed word with
       // the high bits of `in` that did not fit.
       constexpr size_t shr_amount = (kBits - S % B) % B;
       out = ShiftRight<shr_amount>(in);
       // NextUnroller is a typedef for
       // Unroller<T, kBits, S % B, kLoadPos, kStorePos + 1> if S > B is true
       return NextUnroller::Pack(d, raw, packed_out, mask, frame_of_reference,
                                 in, out);
     }
   }
   // Room left in `out`: load the next raw value and append its kBits.
   HWY_IF_CONSTEXPR(S < B) {
     HWY_IF_CONSTEXPR(block_packing_type == BlockPackingType::kBitPacked) {
       in = LoadU(d, raw + kLoadPos * N);
     }
     HWY_IF_CONSTEXPR(block_packing_type == BlockPackingType::kFoRBitPacked) {
       // Frame-of-reference: pack the offset from the reference value.
       in = Sub(LoadU(d, raw + kLoadPos * N), frame_of_reference);
     }
     // Optimize for the case when `S` is zero.
     // We can skip `Or` + ShiftLeft` to align `in`.
     HWY_IF_CONSTEXPR(S == 0) { out = in; }
     HWY_IF_CONSTEXPR(S != 0) { out = Or(out, ShiftLeft<S % B>(in)); }
     // NextUnroller is a typedef for
     // Unroller<T, kBits, S + kBits, kLoadPos + 1, kStorePos> if S < B is true
     return NextUnroller::Pack(d, raw, packed_out, mask, frame_of_reference,
                               in, out);
   }
 }

 // One step of unpacking: the mirror image of Pack. `in` is the most
 // recently loaded packed word, `out` the raw value being reconstructed.
 template <class D, typename V>
 static inline void Unpack(D d, const T* HWY_RESTRICT packed_in,
                           T* HWY_RESTRICT raw, const V& mask,
                           const V& frame_of_reference, V& in, V& out) {
   // Avoid compilation errors and unnecessary template instantiation if
   // compiling in C++11 or C++14 mode
   using NextUnroller = BitPackUnroller<
       T, kBits, ((S <= B) ? (S + ((S < B) ? kBits : 0)) : (S % B)),
       kLoadPos + static_cast<size_t>(S > B),
       kStorePos + static_cast<size_t>(S < B), block_packing_type>;

   // Not every branch below uses these; silence unused-parameter warnings.
   (void)packed_in;
   (void)mask;
   (void)in;

   const size_t N = Lanes(d);
   HWY_IF_CONSTEXPR(S >= B) {
     // S == B: `out` is complete and ends the run — store and stop.
     HWY_IF_CONSTEXPR(S == B) {
       V bitpacked_output = out;
       HWY_IF_CONSTEXPR(block_packing_type ==
                        BlockPackingType::kFoRBitPacked) {
         // Undo the frame-of-reference subtraction applied by Pack.
         bitpacked_output = Add(bitpacked_output, frame_of_reference);
       }
       StoreU(bitpacked_output, d, raw + kStorePos * N);
       return;
     }
     HWY_IF_CONSTEXPR(S != B) {
       // The current value straddles a word boundary: fetch the next packed
       // word and splice its low bits onto the high end of `out`.
       in = LoadU(d, packed_in + kLoadPos * N);
       constexpr size_t shl_amount = (kBits - S % B) % B;
       out = And(Or(out, ShiftLeft<shl_amount>(in)), mask);
       // NextUnroller is a typedef for
       // Unroller<T, kBits, S % B, kLoadPos + 1, kStorePos> if S > B is true
       return NextUnroller::Unpack(d, packed_in, raw, mask, frame_of_reference,
                                   in, out);
     }
   }
   HWY_IF_CONSTEXPR(S < B) {
     V bitpacked_output = out;
     HWY_IF_CONSTEXPR(block_packing_type == BlockPackingType::kFoRBitPacked) {
       // Undo the frame-of-reference subtraction applied by Pack.
       bitpacked_output = Add(bitpacked_output, frame_of_reference);
     }
     StoreU(bitpacked_output, d, raw + kStorePos * N);
     // Extract the next value from the current packed word `in`.
     HWY_IF_CONSTEXPR(S + kBits < B) {
       // Optimize for the case when `S` is zero.
       // We can skip the `ShiftRight` to align `in`.
       HWY_IF_CONSTEXPR(S == 0) { out = And(in, mask); }
       HWY_IF_CONSTEXPR(S != 0) { out = And(ShiftRight<S % B>(in), mask); }
     }
     // Value reaches (or crosses) the word boundary: defer masking until the
     // remaining bits are spliced in by the S > B branch above.
     HWY_IF_CONSTEXPR(S + kBits >= B) { out = ShiftRight<S % B>(in); }
     // NextUnroller is a typedef for
     // Unroller<T, kBits, S + kBits, kLoadPos, kStorePos + 1> if S < B is true
     return NextUnroller::Unpack(d, packed_in, raw, mask, frame_of_reference,
                                 in, out);
   }
 }
};
   2723 
   2724 // Computes the highest power of two that divides `kBits`.
   2725 template <size_t kBits>
   2726 constexpr size_t NumLoops() {
   2727  return (kBits & ~(kBits - 1));
   2728 }
   2729 
   2730 template <size_t kBits>
   2731 constexpr size_t PackedIncr() {
   2732  return kBits / NumLoops<kBits>();
   2733 }
   2734 
   2735 template <typename T, size_t kBits>
   2736 constexpr size_t UnpackedIncr() {
   2737  return (sizeof(T) * 8) / NumLoops<kBits>();
   2738 }
   2739 
   2740 template <size_t kBits>
   2741 constexpr uint32_t MaskBits32() {
   2742  return static_cast<uint32_t>((1ull << kBits) - 1);
   2743 }
   2744 
   2745 template <size_t kBits>
   2746 constexpr uint64_t MaskBits64() {
   2747  return (uint64_t{1} << kBits) - 1;
   2748 }
   2749 template <>
   2750 constexpr uint64_t MaskBits64<64>() {
   2751  return ~uint64_t{0};
   2752 }
   2753 
   2754 }  // namespace detail
   2755 
   2756 template <size_t kBits>  // <= 32
   2757 struct Pack32 {
   2758  template <class D,
   2759            BlockPackingType block_packing_type = BlockPackingType::kBitPacked>
   2760  HWY_INLINE void Pack(D d, const uint32_t* HWY_RESTRICT raw,
   2761                       uint32_t* HWY_RESTRICT packed_out,
   2762                       const uint32_t frame_of_reference_value = 0) const {
   2763    using V = VFromD<D>;
   2764    const V mask = Set(d, detail::MaskBits32<kBits>());
   2765    const V frame_of_reference = Set(d, frame_of_reference_value);
   2766    for (size_t i = 0; i < detail::NumLoops<kBits>(); ++i) {
   2767      V in = Zero(d);
   2768      V out = Zero(d);
   2769      detail::BitPackUnroller<uint32_t, kBits, 0, 0, 0,
   2770                              block_packing_type>::Pack(d, raw, packed_out,
   2771                                                        mask,
   2772                                                        frame_of_reference, in,
   2773                                                        out);
   2774      raw += detail::UnpackedIncr<uint32_t, kBits>() * Lanes(d);
   2775      packed_out += detail::PackedIncr<kBits>() * Lanes(d);
   2776    }
   2777  }
   2778 
   2779  template <class D,
   2780            BlockPackingType block_packing_type = BlockPackingType::kBitPacked>
   2781  HWY_INLINE void Unpack(D d, const uint32_t* HWY_RESTRICT packed_in,
   2782                         uint32_t* HWY_RESTRICT raw,
   2783                         const uint32_t frame_of_reference_value = 0) const {
   2784    using V = VFromD<D>;
   2785    const V mask = Set(d, detail::MaskBits32<kBits>());
   2786    const V frame_of_reference = Set(d, frame_of_reference_value);
   2787    for (size_t i = 0; i < detail::NumLoops<kBits>(); ++i) {
   2788      V in = LoadU(d, packed_in + 0 * Lanes(d));
   2789      V out = And(in, mask);
   2790      detail::BitPackUnroller<uint32_t, kBits, kBits, 1, 0,
   2791                              block_packing_type>::Unpack(d, packed_in, raw,
   2792                                                          mask,
   2793                                                          frame_of_reference,
   2794                                                          in, out);
   2795      raw += detail::UnpackedIncr<uint32_t, kBits>() * Lanes(d);
   2796      packed_in += detail::PackedIncr<kBits>() * Lanes(d);
   2797    }
   2798  }
   2799 };
   2800 
   2801 template <size_t kBits>  // <= 64
   2802 struct Pack64 {
   2803  template <class D,
   2804            BlockPackingType block_packing_type = BlockPackingType::kBitPacked>
   2805  HWY_INLINE void Pack(D d, const uint64_t* HWY_RESTRICT raw,
   2806                       uint64_t* HWY_RESTRICT packed_out,
   2807                       const uint64_t frame_of_reference_value = 0) const {
   2808    using V = VFromD<D>;
   2809    const V mask = Set(d, detail::MaskBits64<kBits>());
   2810    const V frame_of_reference = Set(d, frame_of_reference_value);
   2811    for (size_t i = 0; i < detail::NumLoops<kBits>(); ++i) {
   2812      V in = Zero(d);
   2813      V out = Zero(d);
   2814      detail::BitPackUnroller<uint64_t, kBits, 0, 0, 0,
   2815                              block_packing_type>::Pack(d, raw, packed_out,
   2816                                                        mask,
   2817                                                        frame_of_reference, in,
   2818                                                        out);
   2819      raw += detail::UnpackedIncr<uint64_t, kBits>() * Lanes(d);
   2820      packed_out += detail::PackedIncr<kBits>() * Lanes(d);
   2821    }
   2822  }
   2823 
   2824  template <class D,
   2825            BlockPackingType block_packing_type = BlockPackingType::kBitPacked>
   2826  HWY_INLINE void Unpack(D d, const uint64_t* HWY_RESTRICT packed_in,
   2827                         uint64_t* HWY_RESTRICT raw,
   2828                         const uint64_t frame_of_reference_value = 0) const {
   2829    using V = VFromD<D>;
   2830    const V mask = Set(d, detail::MaskBits64<kBits>());
   2831    const V frame_of_reference = Set(d, frame_of_reference_value);
   2832    for (size_t i = 0; i < detail::NumLoops<kBits>(); ++i) {
   2833      V in = LoadU(d, packed_in + 0 * Lanes(d));
   2834      V out = And(in, mask);
   2835      detail::BitPackUnroller<uint64_t, kBits, kBits, 1, 0,
   2836                              block_packing_type>::Unpack(d, packed_in, raw,
   2837                                                          mask,
   2838                                                          frame_of_reference,
   2839                                                          in, out);
   2840      raw += detail::UnpackedIncr<uint64_t, kBits>() * Lanes(d);
   2841      packed_in += detail::PackedIncr<kBits>() * Lanes(d);
   2842    }
   2843  }
   2844 };
   2845 
   2846 // NOLINTNEXTLINE(google-readability-namespace-comments)
   2847 }  // namespace HWY_NAMESPACE
   2848 }  // namespace hwy
   2849 HWY_AFTER_NAMESPACE();
   2850 
   2851 #endif  // HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_