tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

convolve_symmetric5.cc (7210B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jxl/base/status.h"
      7 #include "lib/jxl/convolve.h"
      8 
      9 #undef HWY_TARGET_INCLUDE
     10 #define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc"
     11 #include <hwy/foreach_target.h>
     12 #include <hwy/highway.h>
     13 
     14 #include "lib/jxl/base/common.h"
     15 #include "lib/jxl/base/rect.h"
     16 #include "lib/jxl/convolve-inl.h"
     17 
     18 HWY_BEFORE_NAMESPACE();
     19 namespace jxl {
     20 namespace HWY_NAMESPACE {
     21 
     22 // These templates are not found via ADL.
     23 using hwy::HWY_NAMESPACE::Add;
     24 using hwy::HWY_NAMESPACE::Mul;
     25 using hwy::HWY_NAMESPACE::Vec;
     26 
     27 // Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2].
     28 template <class WrapY>
     29 static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y,
     30                               const int64_t ix, const int64_t iy,
     31                               const size_t xsize, const size_t ysize,
     32                               const float wx0, const float wx1,
     33                               const float wx2) {
     34  const WrapMirror wrap_x;
     35  const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize));
     36  const float in_m2 = row[wrap_x(ix - 2, xsize)];
     37  const float in_p2 = row[wrap_x(ix + 2, xsize)];
     38  const float in_m1 = row[wrap_x(ix - 1, xsize)];
     39  const float in_p1 = row[wrap_x(ix + 1, xsize)];
     40  const float in_00 = row[ix];
     41  const float sum_2 = wx2 * (in_m2 + in_p2);
     42  const float sum_1 = wx1 * (in_m1 + in_p1);
     43  const float sum_0 = wx0 * in_00;
     44  return sum_2 + (sum_1 + sum_0);
     45 }
     46 
     47 template <class WrapY, class V>
     48 static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
     49                     const int64_t iy, const size_t ysize, const V wx0,
     50                     const V wx1, const V wx2) {
     51  const HWY_FULL(float) d;
     52  const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
     53  const auto in_m2 = LoadU(d, center - 2);
     54  const auto in_p2 = LoadU(d, center + 2);
     55  const auto in_m1 = LoadU(d, center - 1);
     56  const auto in_p1 = LoadU(d, center + 1);
     57  const auto in_00 = LoadU(d, center);
     58  const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
     59  const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
     60  const auto sum_0 = Mul(wx0, in_00);
     61  return Add(sum_2, Add(sum_1, sum_0));
     62 }
     63 
     64 // Produces result for one pixel
     65 template <class WrapY>
     66 float Symmetric5Border(const ImageF& in, const int64_t ix, const int64_t iy,
     67                       const WeightsSymmetric5& weights) {
     68  const float w0 = weights.c[0];
     69  const float w1 = weights.r[0];
     70  const float w2 = weights.R[0];
     71  const float w4 = weights.d[0];
     72  const float w5 = weights.L[0];
     73  const float w8 = weights.D[0];
     74 
     75  const size_t xsize = in.xsize();
     76  const size_t ysize = in.ysize();
     77  const WrapY wrap_y;
     78  // Unrolled loop over all 5 rows of the kernel.
     79  float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2);
     80 
     81  sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8);
     82  float sum1 =
     83      WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8);
     84 
     85  sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5);
     86  sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5);
     87 
     88  return sum0 + sum1;
     89 }
     90 
     91 // Produces result for one vector's worth of pixels
     92 template <class WrapY>
     93 static void Symmetric5Interior(const ImageF& in, const int64_t ix,
     94                               const int64_t rix, const int64_t iy,
     95                               const WeightsSymmetric5& weights,
     96                               float* JXL_RESTRICT row_out) {
     97  const HWY_FULL(float) d;
     98 
     99  const auto w0 = LoadDup128(d, weights.c);
    100  const auto w1 = LoadDup128(d, weights.r);
    101  const auto w2 = LoadDup128(d, weights.R);
    102  const auto w4 = LoadDup128(d, weights.d);
    103  const auto w5 = LoadDup128(d, weights.L);
    104  const auto w8 = LoadDup128(d, weights.D);
    105 
    106  const size_t ysize = in.ysize();
    107  const WrapY wrap_y;
    108  // Unrolled loop over all 5 rows of the kernel.
    109  auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2);
    110 
    111  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8));
    112  auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8);
    113 
    114  sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5));
    115  sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5));
    116 
    117  StoreU(Add(sum0, sum1), d, row_out + rix);
    118 }
    119 
    120 template <class WrapY>
    121 static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy,
    122                          const WeightsSymmetric5& weights,
    123                          float* JXL_RESTRICT row_out) {
    124  const int64_t kRadius = 2;
    125  const size_t xend = rect.x1();
    126 
    127  size_t rix = 0;
    128  size_t ix = rect.x0();
    129  const HWY_FULL(float) d;
    130  const size_t N = Lanes(d);
    131  const size_t aligned_x = RoundUpTo(kRadius, N);
    132  for (; ix < std::min(aligned_x, xend); ++ix, ++rix) {
    133    row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
    134  }
    135  for (; ix + N + kRadius <= xend; ix += N, rix += N) {
    136    Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out);
    137  }
    138  for (; ix < xend; ++ix, ++rix) {
    139    row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights);
    140  }
    141 }
    142 
    143 // Semi-vectorized (interior pixels Fonly); called directly like slow::, unlike
    144 // the fully vectorized strategies below.
    145 Status Symmetric5(const ImageF& in, const Rect& in_rect,
    146                  const WeightsSymmetric5& weights, ThreadPool* pool,
    147                  ImageF* JXL_RESTRICT out, const Rect& out_rect) {
    148  JXL_ENSURE(in_rect.xsize() == out_rect.xsize());
    149  JXL_ENSURE(in_rect.ysize() == out_rect.ysize());
    150  const size_t ysize = in_rect.ysize();
    151  const auto process_row = [&](const uint32_t task,
    152                               size_t /*thread*/) -> Status {
    153    const int64_t riy = task;
    154    const int64_t iy = in_rect.y0() + riy;
    155 
    156    if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) {
    157      Symmetric5Row<WrapMirror>(in, in_rect, iy, weights,
    158                                out_rect.Row(out, riy));
    159    } else {
    160      Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights,
    161                                   out_rect.Row(out, riy));
    162    }
    163    return true;
    164  };
    165  JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(ysize),
    166                                ThreadPool::NoInit, process_row,
    167                                "Symmetric5x5Convolution"));
    168  return true;
    169 }
    170 
    171 // NOLINTNEXTLINE(google-readability-namespace-comments)
    172 }  // namespace HWY_NAMESPACE
    173 }  // namespace jxl
    174 HWY_AFTER_NAMESPACE();
    175 
    176 #if HWY_ONCE
    177 namespace jxl {
    178 
    179 HWY_EXPORT(Symmetric5);
    180 Status Symmetric5(const ImageF& in, const Rect& in_rect,
    181                  const WeightsSymmetric5& weights, ThreadPool* pool,
    182                  ImageF* JXL_RESTRICT out, const Rect& out_rect) {
    183  return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, in_rect, weights, pool, out,
    184                                          out_rect);
    185 }
    186 
    187 Status Symmetric5(const ImageF& in, const Rect& rect,
    188                  const WeightsSymmetric5& weights, ThreadPool* pool,
    189                  ImageF* JXL_RESTRICT out) {
    190  return Symmetric5(in, rect, weights, pool, out, Rect(*out));
    191 }
    192 
    193 }  // namespace jxl
    194 #endif  // HWY_ONCE