tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

simd_util-inl.h (12991B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 // Misc utilities for SIMD operations
      7 
      8 #if defined(LIB_JXL_SIMD_UTIL_INL_H_) == defined(HWY_TARGET_TOGGLE)
      9 #ifdef LIB_JXL_SIMD_UTIL_INL_H_
     10 #undef LIB_JXL_SIMD_UTIL_INL_H_
     11 #else
     12 #define LIB_JXL_SIMD_UTIL_INL_H_
     13 #endif
     14 
     15 #include <hwy/highway.h>
     16 
     17 #include "lib/jxl/base/compiler_specific.h"
     18 
     19 HWY_BEFORE_NAMESPACE();
     20 namespace jxl {
     21 namespace HWY_NAMESPACE {
     22 
     23 #if HWY_CAP_GE512
     24 using hwy::HWY_NAMESPACE::Half;
     25 using hwy::HWY_NAMESPACE::Vec;
     26 template <size_t i, class DF, class V>
     27 HWY_INLINE Vec<Half<Half<DF>>> Quarter(const DF df, V v) {
     28  using HF = Half<DF>;
     29  using HHF = Half<HF>;
     30  auto half = i >= 2 ? UpperHalf(HF(), v) : LowerHalf(HF(), v);
     31  return i & 1 ? UpperHalf(HHF(), half) : LowerHalf(HHF(), half);
     32 }
     33 
     34 template <class DF, class V>
     35 HWY_INLINE Vec<DF> Concat4(const DF df, V v0, V v1, V v2, V v3) {
     36  using HF = Half<DF>;
     37  return Combine(DF(), Combine(HF(), v3, v2), Combine(HF(), v1, v0));
     38 }
     39 
     40 #endif
     41 
     // Writes v0[0], v1[0], v0[1], v1[1], ... to mem, in this order. Mem must be
     // aligned. Only 4-byte element types are supported.
     template <class DF, class V, typename T>
     void StoreInterleaved(const DF df, V v0, V v1, T* mem) {
       static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
     #if HWY_TARGET == HWY_SCALAR
       // Scalar target: the two inputs are written to adjacent elements.
       Store(v0, df, mem);
       Store(v1, df, mem + 1);
     #else
       // Destination of the second output vector.
       T* const mem_second = mem + Lanes(df);
       // Every vector width starts from the same pair of interleaves.
       const auto mixed_lo = InterleaveLower(df, v0, v1);
       const auto mixed_hi = InterleaveUpper(df, v0, v1);
     #if !HWY_CAP_GE256
       // No 256-bit support: the interleaved vectors are already in final order.
       Store(mixed_lo, df, mem);
       Store(mixed_hi, df, mem_second);
     #else
       if (!HWY_CAP_GE512 || Lanes(df) == 8) {
         // 8 lanes: regroup the lower halves of both interleaves, then the upper
         // halves.
         Store(ConcatLowerLower(df, mixed_hi, mixed_lo), df, mem);
         Store(ConcatUpperUpper(df, mixed_hi, mixed_lo), df, mem_second);
       } else {
     #if HWY_CAP_GE512
         // Wider vectors: alternate quarters of the two interleaves. Quarter and
         // Concat4 only exist when HWY_CAP_GE512 is set, hence the guard.
         const auto out_first =
             Concat4(df, Quarter<0>(df, mixed_lo), Quarter<0>(df, mixed_hi),
                     Quarter<1>(df, mixed_lo), Quarter<1>(df, mixed_hi));
         Store(out_first, df, mem);
         const auto out_second =
             Concat4(df, Quarter<2>(df, mixed_lo), Quarter<2>(df, mixed_hi),
                     Quarter<3>(df, mixed_lo), Quarter<3>(df, mixed_hi));
         Store(out_second, df, mem_second);
     #endif
       }
     #endif
     #endif
     }
     73 
     // Writes v0[0], v1[0], v2[0], v3[0], v0[1] ... to mem, in this order. Mem must
     // be aligned. Only 4-byte element types are supported.
     template <class DF, class V, typename T>
     void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, T* mem) {
       static_assert(sizeof(T) == 4, "only use StoreInterleaved for 4-byte types");
     #if HWY_TARGET == HWY_SCALAR
       // Scalar target: the four inputs are written to adjacent elements.
       Store(v0, df, mem);
       Store(v1, df, mem + 1);
       Store(v2, df, mem + 2);
       Store(v3, df, mem + 3);
     #else
       const size_t lanes = Lanes(df);

       // Stage 1: interleave v0 with v2 and v1 with v3.
       const auto pair02_lo = InterleaveLower(df, v0, v2);
       const auto pair13_lo = InterleaveLower(df, v1, v3);
       const auto pair02_hi = InterleaveUpper(df, v0, v2);
       const auto pair13_hi = InterleaveUpper(df, v1, v3);

       // Stage 2: interleave the stage-1 results with each other. These four
       // vectors are shared by every vector width; only the final regrouping
       // before the stores differs.
       const auto quad0 = InterleaveLower(df, pair02_lo, pair13_lo);
       const auto quad1 = InterleaveUpper(df, pair02_lo, pair13_lo);
       const auto quad2 = InterleaveLower(df, pair02_hi, pair13_hi);
       const auto quad3 = InterleaveUpper(df, pair02_hi, pair13_hi);

     #if !HWY_CAP_GE256
       // No 256-bit support: the stage-2 vectors are stored as they are.
       Store(quad0, df, mem);
       Store(quad1, df, mem + lanes);
       Store(quad2, df, mem + 2 * lanes);
       Store(quad3, df, mem + 3 * lanes);
     #elif !HWY_CAP_GE512
       // 256-bit: first the lower halves of the stage-2 vectors, then the upper
       // halves.
       Store(ConcatLowerLower(df, quad1, quad0), df, mem);
       Store(ConcatLowerLower(df, quad3, quad2), df, mem + lanes);
       Store(ConcatUpperUpper(df, quad1, quad0), df, mem + 2 * lanes);
       Store(ConcatUpperUpper(df, quad3, quad2), df, mem + 3 * lanes);
     #else
       // 512-bit: output vector k is built from quarter k of each stage-2 vector.
       const auto out0 = Concat4(df, Quarter<0>(df, quad0), Quarter<0>(df, quad1),
                                 Quarter<0>(df, quad2), Quarter<0>(df, quad3));
       Store(out0, df, mem);
       const auto out1 = Concat4(df, Quarter<1>(df, quad0), Quarter<1>(df, quad1),
                                 Quarter<1>(df, quad2), Quarter<1>(df, quad3));
       Store(out1, df, mem + lanes);
       const auto out2 = Concat4(df, Quarter<2>(df, quad0), Quarter<2>(df, quad1),
                                 Quarter<2>(df, quad2), Quarter<2>(df, quad3));
       Store(out2, df, mem + 2 * lanes);
       const auto out3 = Concat4(df, Quarter<3>(df, quad0), Quarter<3>(df, quad1),
                                 Quarter<3>(df, quad2), Quarter<3>(df, quad3));
       Store(out3, df, mem + 3 * lanes);
     #endif
     #endif
     }
    133 
    // Writes v0[0], v1[0], v2[0], v3[0], v4[0], v5[0], v6[0], v7[0], v0[1] ... to
    // mem, in this order. Mem must be aligned.
    template <class DF, class V>
    void StoreInterleaved(const DF df, V v0, V v1, V v2, V v3, V v4, V v5, V v6,
                          V v7, float* mem) {
    #if HWY_TARGET == HWY_SCALAR
      // Scalar target: the eight inputs are written to adjacent elements.
      Store(v0, df, mem);
      Store(v1, df, mem + 1);
      Store(v2, df, mem + 2);
      Store(v3, df, mem + 3);
      Store(v4, df, mem + 4);
      Store(v5, df, mem + 5);
      Store(v6, df, mem + 6);
      Store(v7, df, mem + 7);
    #else
      const size_t lanes = Lanes(df);

      // Stage 1: interleave each input with the one four positions later.
      const auto a0 = InterleaveLower(df, v0, v4);
      const auto a1 = InterleaveLower(df, v1, v5);
      const auto a2 = InterleaveLower(df, v2, v6);
      const auto a3 = InterleaveLower(df, v3, v7);
      const auto a4 = InterleaveUpper(df, v0, v4);
      const auto a5 = InterleaveUpper(df, v1, v5);
      const auto a6 = InterleaveUpper(df, v2, v6);
      const auto a7 = InterleaveUpper(df, v3, v7);

      // Stage 2: interleave stage-1 results that are two positions apart.
      const auto b0 = InterleaveLower(df, a0, a2);
      const auto b1 = InterleaveLower(df, a1, a3);
      const auto b2 = InterleaveUpper(df, a0, a2);
      const auto b3 = InterleaveUpper(df, a1, a3);
      const auto b4 = InterleaveLower(df, a4, a6);
      const auto b5 = InterleaveLower(df, a5, a7);
      const auto b6 = InterleaveUpper(df, a4, a6);
      const auto b7 = InterleaveUpper(df, a5, a7);

      // Stage 3: interleave neighbouring stage-2 results. These eight vectors are
      // shared by every vector width; only the final regrouping before the stores
      // differs.
      const auto c0 = InterleaveLower(df, b0, b1);
      const auto c1 = InterleaveUpper(df, b0, b1);
      const auto c2 = InterleaveLower(df, b2, b3);
      const auto c3 = InterleaveUpper(df, b2, b3);
      const auto c4 = InterleaveLower(df, b4, b5);
      const auto c5 = InterleaveUpper(df, b4, b5);
      const auto c6 = InterleaveLower(df, b6, b7);
      const auto c7 = InterleaveUpper(df, b6, b7);

    #if !HWY_CAP_GE256
      // No 256-bit support: the stage-3 vectors are stored as they are.
      Store(c0, df, mem);
      Store(c1, df, mem + lanes);
      Store(c2, df, mem + 2 * lanes);
      Store(c3, df, mem + 3 * lanes);
      Store(c4, df, mem + 4 * lanes);
      Store(c5, df, mem + 5 * lanes);
      Store(c6, df, mem + 6 * lanes);
      Store(c7, df, mem + 7 * lanes);
    #elif !HWY_CAP_GE512
      // 256-bit: first the lower halves of the stage-3 vectors, then the upper
      // halves.
      Store(ConcatLowerLower(df, c1, c0), df, mem);
      Store(ConcatLowerLower(df, c3, c2), df, mem + lanes);
      Store(ConcatLowerLower(df, c5, c4), df, mem + 2 * lanes);
      Store(ConcatLowerLower(df, c7, c6), df, mem + 3 * lanes);
      Store(ConcatUpperUpper(df, c1, c0), df, mem + 4 * lanes);
      Store(ConcatUpperUpper(df, c3, c2), df, mem + 5 * lanes);
      Store(ConcatUpperUpper(df, c5, c4), df, mem + 6 * lanes);
      Store(ConcatUpperUpper(df, c7, c6), df, mem + 7 * lanes);
    #else
      // 512-bit: quarter q of c0..c3 forms output 2q and quarter q of c4..c7
      // forms output 2q + 1.
      const auto out0 = Concat4(df, Quarter<0>(df, c0), Quarter<0>(df, c1),
                                Quarter<0>(df, c2), Quarter<0>(df, c3));
      Store(out0, df, mem);
      const auto out1 = Concat4(df, Quarter<0>(df, c4), Quarter<0>(df, c5),
                                Quarter<0>(df, c6), Quarter<0>(df, c7));
      Store(out1, df, mem + lanes);
      const auto out2 = Concat4(df, Quarter<1>(df, c0), Quarter<1>(df, c1),
                                Quarter<1>(df, c2), Quarter<1>(df, c3));
      Store(out2, df, mem + 2 * lanes);
      const auto out3 = Concat4(df, Quarter<1>(df, c4), Quarter<1>(df, c5),
                                Quarter<1>(df, c6), Quarter<1>(df, c7));
      Store(out3, df, mem + 3 * lanes);
      const auto out4 = Concat4(df, Quarter<2>(df, c0), Quarter<2>(df, c1),
                                Quarter<2>(df, c2), Quarter<2>(df, c3));
      Store(out4, df, mem + 4 * lanes);
      const auto out5 = Concat4(df, Quarter<2>(df, c4), Quarter<2>(df, c5),
                                Quarter<2>(df, c6), Quarter<2>(df, c7));
      Store(out5, df, mem + 5 * lanes);
      const auto out6 = Concat4(df, Quarter<3>(df, c0), Quarter<3>(df, c1),
                                Quarter<3>(df, c2), Quarter<3>(df, c3));
      Store(out6, df, mem + 6 * lanes);
      const auto out7 = Concat4(df, Quarter<3>(df, c4), Quarter<3>(df, c5),
                                Quarter<3>(df, c6), Quarter<3>(df, c7));
      Store(out7, df, mem + 7 * lanes);
    #endif
    #endif
    }
    265 
    266 #if HWY_CAP_GE256
    267 JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from,
    268                                  int32_t* JXL_RESTRICT to, size_t fromstride) {
    269  const HWY_CAPPED(int32_t, 8) d;
    270  auto i0 = Load(d, from);
    271  auto i1 = Load(d, from + 1 * fromstride);
    272  auto i2 = Load(d, from + 2 * fromstride);
    273  auto i3 = Load(d, from + 3 * fromstride);
    274  auto i4 = Load(d, from + 4 * fromstride);
    275  auto i5 = Load(d, from + 5 * fromstride);
    276  auto i6 = Load(d, from + 6 * fromstride);
    277  auto i7 = Load(d, from + 7 * fromstride);
    278 
    279  const auto q0 = InterleaveLower(d, i0, i2);
    280  const auto q1 = InterleaveLower(d, i1, i3);
    281  const auto q2 = InterleaveUpper(d, i0, i2);
    282  const auto q3 = InterleaveUpper(d, i1, i3);
    283  const auto q4 = InterleaveLower(d, i4, i6);
    284  const auto q5 = InterleaveLower(d, i5, i7);
    285  const auto q6 = InterleaveUpper(d, i4, i6);
    286  const auto q7 = InterleaveUpper(d, i5, i7);
    287 
    288  const auto r0 = InterleaveLower(d, q0, q1);
    289  const auto r1 = InterleaveUpper(d, q0, q1);
    290  const auto r2 = InterleaveLower(d, q2, q3);
    291  const auto r3 = InterleaveUpper(d, q2, q3);
    292  const auto r4 = InterleaveLower(d, q4, q5);
    293  const auto r5 = InterleaveUpper(d, q4, q5);
    294  const auto r6 = InterleaveLower(d, q6, q7);
    295  const auto r7 = InterleaveUpper(d, q6, q7);
    296 
    297  i0 = ConcatLowerLower(d, r4, r0);
    298  i1 = ConcatLowerLower(d, r5, r1);
    299  i2 = ConcatLowerLower(d, r6, r2);
    300  i3 = ConcatLowerLower(d, r7, r3);
    301  i4 = ConcatUpperUpper(d, r4, r0);
    302  i5 = ConcatUpperUpper(d, r5, r1);
    303  i6 = ConcatUpperUpper(d, r6, r2);
    304  i7 = ConcatUpperUpper(d, r7, r3);
    305 
    306  Store(i0, d, to);
    307  Store(i1, d, to + 1 * 8);
    308  Store(i2, d, to + 2 * 8);
    309  Store(i3, d, to + 3 * 8);
    310  Store(i4, d, to + 4 * 8);
    311  Store(i5, d, to + 5 * 8);
    312  Store(i6, d, to + 6 * 8);
    313  Store(i7, d, to + 7 * 8);
    314 }
    315 #elif HWY_TARGET != HWY_SCALAR
    316 JXL_INLINE void Transpose8x8Block(const int32_t* JXL_RESTRICT from,
    317                                  int32_t* JXL_RESTRICT to, size_t fromstride) {
    318  const HWY_CAPPED(int32_t, 4) d;
    319  for (size_t n = 0; n < 8; n += 4) {
    320    for (size_t m = 0; m < 8; m += 4) {
    321      auto p0 = Load(d, from + n * fromstride + m);
    322      auto p1 = Load(d, from + (n + 1) * fromstride + m);
    323      auto p2 = Load(d, from + (n + 2) * fromstride + m);
    324      auto p3 = Load(d, from + (n + 3) * fromstride + m);
    325      const auto q0 = InterleaveLower(d, p0, p2);
    326      const auto q1 = InterleaveLower(d, p1, p3);
    327      const auto q2 = InterleaveUpper(d, p0, p2);
    328      const auto q3 = InterleaveUpper(d, p1, p3);
    329 
    330      const auto r0 = InterleaveLower(d, q0, q1);
    331      const auto r1 = InterleaveUpper(d, q0, q1);
    332      const auto r2 = InterleaveLower(d, q2, q3);
    333      const auto r3 = InterleaveUpper(d, q2, q3);
    334      Store(r0, d, to + m * 8 + n);
    335      Store(r1, d, to + (1 + m) * 8 + n);
    336      Store(r2, d, to + (2 + m) * 8 + n);
    337      Store(r3, d, to + (3 + m) * 8 + n);
    338    }
    339  }
    340 }
    341 
    342 #endif
    343 
    344 // NOLINTNEXTLINE(google-readability-namespace-comments)
    345 }  // namespace HWY_NAMESPACE
    346 }  // namespace jxl
    347 HWY_AFTER_NAMESPACE();
    348 
    349 #endif  // LIB_JXL_SIMD_UTIL_INL_H_