tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

dct-inl.h (8167B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #if defined(LIB_JPEGLI_DCT_INL_H_) == defined(HWY_TARGET_TOGGLE)
      7 #ifdef LIB_JPEGLI_DCT_INL_H_
      8 #undef LIB_JPEGLI_DCT_INL_H_
      9 #else
     10 #define LIB_JPEGLI_DCT_INL_H_
     11 #endif
     12 
     13 #include "lib/jpegli/transpose-inl.h"
     14 #include "lib/jxl/base/compiler_specific.h"
     15 
     16 HWY_BEFORE_NAMESPACE();
     17 namespace jpegli {
     18 namespace HWY_NAMESPACE {
     19 namespace {
     20 
     21 // These templates are not found via ADL.
     22 using hwy::HWY_NAMESPACE::Abs;
     23 using hwy::HWY_NAMESPACE::Add;
     24 using hwy::HWY_NAMESPACE::DemoteTo;
     25 using hwy::HWY_NAMESPACE::Ge;
     26 using hwy::HWY_NAMESPACE::IfThenElseZero;
     27 using hwy::HWY_NAMESPACE::Mul;
     28 using hwy::HWY_NAMESPACE::MulAdd;
     29 using hwy::HWY_NAMESPACE::Rebind;
     30 using hwy::HWY_NAMESPACE::Round;
     31 using hwy::HWY_NAMESPACE::Sub;
     32 using hwy::HWY_NAMESPACE::Vec;
     33 
     34 using D = HWY_FULL(float);
     35 using DI = HWY_FULL(int32_t);
     36 
     37 template <size_t N>
     38 void AddReverse(const float* JXL_RESTRICT a_in1,
     39                const float* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) {
     40  HWY_CAPPED(float, 8) d8;
     41  for (size_t i = 0; i < N; i++) {
     42    auto in1 = Load(d8, a_in1 + i * 8);
     43    auto in2 = Load(d8, a_in2 + (N - i - 1) * 8);
     44    Store(Add(in1, in2), d8, a_out + i * 8);
     45  }
     46 }
     47 
     48 template <size_t N>
     49 void SubReverse(const float* JXL_RESTRICT a_in1,
     50                const float* JXL_RESTRICT a_in2, float* JXL_RESTRICT a_out) {
     51  HWY_CAPPED(float, 8) d8;
     52  for (size_t i = 0; i < N; i++) {
     53    auto in1 = Load(d8, a_in1 + i * 8);
     54    auto in2 = Load(d8, a_in2 + (N - i - 1) * 8);
     55    Store(Sub(in1, in2), d8, a_out + i * 8);
     56  }
     57 }
     58 
     59 template <size_t N>
     60 void B(float* JXL_RESTRICT coeff) {
     61  HWY_CAPPED(float, 8) d8;
     62  constexpr float kSqrt2 = 1.41421356237f;
     63  auto sqrt2 = Set(d8, kSqrt2);
     64  auto in1 = Load(d8, coeff);
     65  auto in2 = Load(d8, coeff + 8);
     66  Store(MulAdd(in1, sqrt2, in2), d8, coeff);
     67  for (size_t i = 1; i + 1 < N; i++) {
     68    auto in1 = Load(d8, coeff + i * 8);
     69    auto in2 = Load(d8, coeff + (i + 1) * 8);
     70    Store(Add(in1, in2), d8, coeff + i * 8);
     71  }
     72 }
     73 
     74 // Ideally optimized away by compiler (except the multiply).
     75 template <size_t N>
     76 void InverseEvenOdd(const float* JXL_RESTRICT a_in, float* JXL_RESTRICT a_out) {
     77  HWY_CAPPED(float, 8) d8;
     78  for (size_t i = 0; i < N / 2; i++) {
     79    auto in1 = Load(d8, a_in + i * 8);
     80    Store(in1, d8, a_out + 2 * i * 8);
     81  }
     82  for (size_t i = N / 2; i < N; i++) {
     83    auto in1 = Load(d8, a_in + i * 8);
     84    Store(in1, d8, a_out + (2 * (i - N / 2) + 1) * 8);
     85  }
     86 }
     87 
     88 // Constants for DCT implementation. Generated by the following snippet:
     89 // for i in range(N // 2):
     90 //    print(1.0 / (2 * math.cos((i + 0.5) * math.pi / N)), end=", ")
     91 template <size_t N>
     92 struct WcMultipliers;
     93 
     94 template <>
     95 struct WcMultipliers<4> {
     96  static constexpr float kMultipliers[] = {
     97      0.541196100146197,
     98      1.3065629648763764,
     99  };
    100 };
    101 
    102 template <>
    103 struct WcMultipliers<8> {
    104  static constexpr float kMultipliers[] = {
    105      0.5097955791041592,
    106      0.6013448869350453,
    107      0.8999762231364156,
    108      2.5629154477415055,
    109  };
    110 };
    111 
    112 #if JXL_CXX_LANG < JXL_CXX_17
    113 constexpr float WcMultipliers<4>::kMultipliers[];
    114 constexpr float WcMultipliers<8>::kMultipliers[];
    115 #endif
    116 
    117 // Invoked on full vector.
    118 template <size_t N>
    119 void Multiply(float* JXL_RESTRICT coeff) {
    120  HWY_CAPPED(float, 8) d8;
    121  for (size_t i = 0; i < N / 2; i++) {
    122    auto in1 = Load(d8, coeff + (N / 2 + i) * 8);
    123    auto mul = Set(d8, WcMultipliers<N>::kMultipliers[i]);
    124    Store(Mul(in1, mul), d8, coeff + (N / 2 + i) * 8);
    125  }
    126 }
    127 
    128 void LoadFromBlock(const float* JXL_RESTRICT pixels, size_t pixels_stride,
    129                   size_t off, float* JXL_RESTRICT coeff) {
    130  HWY_CAPPED(float, 8) d8;
    131  for (size_t i = 0; i < 8; i++) {
    132    Store(LoadU(d8, pixels + i * pixels_stride + off), d8, coeff + i * 8);
    133  }
    134 }
    135 
    136 void StoreToBlockAndScale(const float* JXL_RESTRICT coeff, float* output,
    137                          size_t off) {
    138  HWY_CAPPED(float, 8) d8;
    139  auto mul = Set(d8, 1.0f / 8);
    140  for (size_t i = 0; i < 8; i++) {
    141    StoreU(Mul(mul, Load(d8, coeff + i * 8)), d8, output + i * 8 + off);
    142  }
    143 }
    144 
    145 template <size_t N>
    146 struct DCT1DImpl;
    147 
    148 template <>
    149 struct DCT1DImpl<1> {
    150  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {}
    151 };
    152 
    153 template <>
    154 struct DCT1DImpl<2> {
    155  JXL_INLINE void operator()(float* JXL_RESTRICT mem) {
    156    HWY_CAPPED(float, 8) d8;
    157    auto in1 = Load(d8, mem);
    158    auto in2 = Load(d8, mem + 8);
    159    Store(Add(in1, in2), d8, mem);
    160    Store(Sub(in1, in2), d8, mem + 8);
    161  }
    162 };
    163 
    164 template <size_t N>
    165 struct DCT1DImpl {
    166  void operator()(float* JXL_RESTRICT mem) {
    167    HWY_ALIGN float tmp[N * 8];
    168    AddReverse<N / 2>(mem, mem + N * 4, tmp);
    169    DCT1DImpl<N / 2>()(tmp);
    170    SubReverse<N / 2>(mem, mem + N * 4, tmp + N * 4);
    171    Multiply<N>(tmp);
    172    DCT1DImpl<N / 2>()(tmp + N * 4);
    173    B<N / 2>(tmp + N * 4);
    174    InverseEvenOdd<N>(tmp, mem);
    175  }
    176 };
    177 
    178 void DCT1D(const float* JXL_RESTRICT pixels, size_t pixels_stride,
    179           float* JXL_RESTRICT output) {
    180  HWY_CAPPED(float, 8) d8;
    181  HWY_ALIGN float tmp[64];
    182  for (size_t i = 0; i < 8; i += Lanes(d8)) {
    183    // TODO(veluca): consider removing the temporary memory here (as is done in
    184    // IDCT), if it turns out that some compilers don't optimize away the loads
    185    // and this is performance-critical.
    186    LoadFromBlock(pixels, pixels_stride, i, tmp);
    187    DCT1DImpl<8>()(tmp);
    188    StoreToBlockAndScale(tmp, output, i);
    189  }
    190 }
    191 
    192 JXL_INLINE JXL_MAYBE_UNUSED void TransformFromPixels(
    193    const float* JXL_RESTRICT pixels, size_t pixels_stride,
    194    float* JXL_RESTRICT coefficients, float* JXL_RESTRICT scratch_space) {
    195  DCT1D(pixels, pixels_stride, scratch_space);
    196  Transpose8x8Block(scratch_space, coefficients);
    197  DCT1D(coefficients, 8, scratch_space);
    198  Transpose8x8Block(scratch_space, coefficients);
    199 }
    200 
    201 JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
    202                                                     int16_t* out) {
    203  Rebind<int16_t, DI> di16;
    204  Store(DemoteTo(di16, ival), di16, out);
    205 }
    206 
    207 JXL_INLINE JXL_MAYBE_UNUSED void StoreQuantizedValue(const Vec<DI>& ival,
    208                                                     int32_t* out) {
    209  DI di;
    210  Store(ival, di, out);
    211 }
    212 
    213 template <typename T>
    214 void QuantizeBlock(const float* dct, const float* qmc, float aq_strength,
    215                   const float* zero_bias_offset, const float* zero_bias_mul,
    216                   T* block) {
    217  D d;
    218  DI di;
    219  const auto aq_mul = Set(d, aq_strength);
    220  for (size_t k = 0; k < DCTSIZE2; k += Lanes(d)) {
    221    const auto val = Load(d, dct + k);
    222    const auto q = Load(d, qmc + k);
    223    const auto qval = Mul(val, q);
    224    const auto zb_offset = Load(d, zero_bias_offset + k);
    225    const auto zb_mul = Load(d, zero_bias_mul + k);
    226    const auto threshold = Add(zb_offset, Mul(zb_mul, aq_mul));
    227    const auto nzero_mask = Ge(Abs(qval), threshold);
    228    const auto ival = ConvertTo(di, IfThenElseZero(nzero_mask, Round(qval)));
    229    StoreQuantizedValue(ival, block + k);
    230  }
    231 }
    232 
    233 template <typename T>
    234 void ComputeCoefficientBlock(const float* JXL_RESTRICT pixels, size_t stride,
    235                             const float* JXL_RESTRICT qmc,
    236                             int16_t last_dc_coeff, float aq_strength,
    237                             const float* zero_bias_offset,
    238                             const float* zero_bias_mul,
    239                             float* JXL_RESTRICT tmp, T* block) {
    240  float* JXL_RESTRICT dct = tmp;
    241  float* JXL_RESTRICT scratch_space = tmp + DCTSIZE2;
    242  TransformFromPixels(pixels, stride, dct, scratch_space);
    243  QuantizeBlock(dct, qmc, aq_strength, zero_bias_offset, zero_bias_mul, block);
    244  // Center DC values around zero.
    245  static constexpr float kDCBias = 128.0f;
    246  const float dc = (dct[0] - kDCBias) * qmc[0];
    247  float dc_threshold = zero_bias_offset[0] + aq_strength * zero_bias_mul[0];
    248  if (std::abs(dc - last_dc_coeff) < dc_threshold) {
    249    block[0] = last_dc_coeff;
    250  } else {
    251    block[0] = std::round(dc);
    252  }
    253 }
    254 
    255 // NOLINTNEXTLINE(google-readability-namespace-comments)
    256 }  // namespace
    257 }  // namespace HWY_NAMESPACE
    258 }  // namespace jpegli
    259 HWY_AFTER_NAMESPACE();
    260 #endif  // LIB_JPEGLI_DCT_INL_H_