tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

transpose-inl.h (3684B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #if defined(LIB_JPEGLI_TRANSPOSE_INL_H_) == defined(HWY_TARGET_TOGGLE)
      7 #ifdef LIB_JPEGLI_TRANSPOSE_INL_H_
      8 #undef LIB_JPEGLI_TRANSPOSE_INL_H_
      9 #else
     10 #define LIB_JPEGLI_TRANSPOSE_INL_H_
     11 #endif
     12 
     13 #include "lib/jxl/base/compiler_specific.h"
     14 
     15 HWY_BEFORE_NAMESPACE();
     16 namespace jpegli {
     17 namespace HWY_NAMESPACE {
     18 namespace {
     19 
     20 #if HWY_CAP_GE256
     21 JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
     22                                  float* JXL_RESTRICT to) {
         // 8x8 float transpose, compiled when HWY_CAP_GE256 is set. Reads eight
         // rows of 8 floats from `from` (row stride 8) and writes eight rows of 8
         // floats to `to`. Intended to produce the same result as the scalar
         // fallback in this file: to[8 * n + m] = from[8 * m + n].
         // NOTE(review): Load/Store are presumably Highway's aligned variants, so
         // both pointers would need vector alignment - confirm against the Highway
         // docs and the callers. JXL_RESTRICT presumably means `from` and `to`
         // must not overlap - confirm.
     23  const HWY_CAPPED(float, 8) d;
         // Load the eight input rows; iK is row K.
     24  auto i0 = Load(d, from);
     25  auto i1 = Load(d, from + 1 * 8);
     26  auto i2 = Load(d, from + 2 * 8);
     27  auto i3 = Load(d, from + 3 * 8);
     28  auto i4 = Load(d, from + 4 * 8);
     29  auto i5 = Load(d, from + 5 * 8);
     30  auto i6 = Load(d, from + 6 * 8);
     31  auto i7 = Load(d, from + 7 * 8);
     32 
         // Stage 1: interleave rows that are two apart (i0/i2, i1/i3, i4/i6,
         // i5/i7). q0..q3 only depend on rows 0-3, q4..q7 only on rows 4-7.
     33  const auto q0 = InterleaveLower(d, i0, i2);
     34  const auto q1 = InterleaveLower(d, i1, i3);
     35  const auto q2 = InterleaveUpper(d, i0, i2);
     36  const auto q3 = InterleaveUpper(d, i1, i3);
     37  const auto q4 = InterleaveLower(d, i4, i6);
     38  const auto q5 = InterleaveLower(d, i5, i7);
     39  const auto q6 = InterleaveUpper(d, i4, i6);
     40  const auto q7 = InterleaveUpper(d, i5, i7);
     41 
         // Stage 2: interleave neighbouring stage-1 results (q0/q1, q2/q3, ...).
         // r0..r3 still only depend on rows 0-3, r4..r7 only on rows 4-7.
         // NOTE(review): Highway's Interleave ops presumably act independently
         // within each 128-bit block, so each rK should hold one 4-element column
         // piece per 128-bit half at this point - confirm against the Highway docs.
     42  const auto r0 = InterleaveLower(d, q0, q1);
     43  const auto r1 = InterleaveUpper(d, q0, q1);
     44  const auto r2 = InterleaveLower(d, q2, q3);
     45  const auto r3 = InterleaveUpper(d, q2, q3);
     46  const auto r4 = InterleaveLower(d, q4, q5);
     47  const auto r5 = InterleaveUpper(d, q4, q5);
     48  const auto r6 = InterleaveLower(d, q6, q7);
     49  const auto r7 = InterleaveUpper(d, q6, q7);
     50 
         // Stage 3: pair each rK (K < 4, from rows 0-3) with r(K+4) (from rows
         // 4-7). The lower halves form output rows 0-3 and the upper halves form
         // output rows 4-7. The i0..i7 variables are reused for the output rows.
         // NOTE(review): the (d, r4, r0) argument order presumably places r0's
         // half in the low lanes and r4's half in the high lanes - confirm against
         // the Highway ConcatLowerLower/ConcatUpperUpper documentation.
     51  i0 = ConcatLowerLower(d, r4, r0);
     52  i1 = ConcatLowerLower(d, r5, r1);
     53  i2 = ConcatLowerLower(d, r6, r2);
     54  i3 = ConcatLowerLower(d, r7, r3);
     55  i4 = ConcatUpperUpper(d, r4, r0);
     56  i5 = ConcatUpperUpper(d, r5, r1);
     57  i6 = ConcatUpperUpper(d, r6, r2);
     58  i7 = ConcatUpperUpper(d, r7, r3);
     59 
         // Write the eight output rows; iK goes to row K of `to`.
     60  Store(i0, d, to);
     61  Store(i1, d, to + 1 * 8);
     62  Store(i2, d, to + 2 * 8);
     63  Store(i3, d, to + 3 * 8);
     64  Store(i4, d, to + 4 * 8);
     65  Store(i5, d, to + 5 * 8);
     66  Store(i6, d, to + 6 * 8);
     67  Store(i7, d, to + 7 * 8);
     68 }
     69 #elif HWY_TARGET != HWY_SCALAR
     70 JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
     71                                  float* JXL_RESTRICT to) {
         // 8x8 float transpose, compiled when HWY_CAP_GE256 is not set and
         // HWY_TARGET != HWY_SCALAR. Works on four 4x4 tiles: the tile read from
         // rows n..n+3, columns m..m+3 of `from` is written to rows m..m+3,
         // columns n..n+3 of `to` (both use a row stride of 8). Intended to
         // produce the same result as the scalar fallback in this file:
         // to[8 * n + m] = from[8 * m + n].
         // NOTE(review): Load/Store are presumably Highway's aligned variants, so
         // `from + 4` and `to + 4` offsets would also need to satisfy the vector
         // alignment - confirm against the Highway docs and the callers.
     72  const HWY_CAPPED(float, 4) d;
     73  for (size_t n = 0; n < 8; n += 4) {
     74    for (size_t m = 0; m < 8; m += 4) {
             // Four consecutive rows of the source tile, 4 floats each.
     75      auto p0 = Load(d, from + n * 8 + m);
     76      auto p1 = Load(d, from + (n + 1) * 8 + m);
     77      auto p2 = Load(d, from + (n + 2) * 8 + m);
     78      auto p3 = Load(d, from + (n + 3) * 8 + m);
             // Stage 1: interleave rows two apart (p0/p2 and p1/p3).
     79      const auto q0 = InterleaveLower(d, p0, p2);
     80      const auto q1 = InterleaveLower(d, p1, p3);
     81      const auto q2 = InterleaveUpper(d, p0, p2);
     82      const auto q3 = InterleaveUpper(d, p1, p3);
     83 
             // Stage 2: interleave the stage-1 results; rK is stored as row
             // (m + K) of the destination tile, i.e. it should be column K of
             // the source tile.
     84      const auto r0 = InterleaveLower(d, q0, q1);
     85      const auto r1 = InterleaveUpper(d, q0, q1);
     86      const auto r2 = InterleaveLower(d, q2, q3);
     87      const auto r3 = InterleaveUpper(d, q2, q3);
             // Destination tile position has row and column offsets swapped
             // relative to the loads above (m selects the row, n the column).
     88      Store(r0, d, to + m * 8 + n);
     89      Store(r1, d, to + (1 + m) * 8 + n);
     90      Store(r2, d, to + (2 + m) * 8 + n);
     91      Store(r3, d, to + (3 + m) * 8 + n);
     92    }
     93  }
     94 }
     95 #else
     96 static JXL_INLINE void Transpose8x8Block(const float* JXL_RESTRICT from,
     97                                         float* JXL_RESTRICT to) {
         // Scalar fallback, compiled when HWY_CAP_GE256 is not set and
         // HWY_TARGET == HWY_SCALAR. Treats `from` and `to` as 8x8 row-major
         // float arrays (64 elements each) and writes the transpose of `from`
         // into `to`: element (row m, column n) of `from` becomes element
         // (row n, column m) of `to`.
         // NOTE(review): JXL_RESTRICT presumably means `from` and `to` must not
         // overlap, so this cannot be used in place - confirm.
         // NOTE(review): `static` is redundant here because the function already
         // sits inside the anonymous namespace opened above; the two SIMD
         // variants omit it.
     98  for (size_t n = 0; n < 8; ++n) {
     99    for (size_t m = 0; m < 8; ++m) {
    100      to[8 * n + m] = from[8 * m + n];
    101    }
    102  }
    103 }
    104 #endif
    105 
    106 // NOLINTNEXTLINE(google-readability-namespace-comments)
    107 }  // namespace
    108 }  // namespace HWY_NAMESPACE
    109 }  // namespace jpegli
    110 HWY_AFTER_NAMESPACE();
    111 #endif  // LIB_JPEGLI_TRANSPOSE_INL_H_