tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

convolve_symmetric3.cc (7031B)


      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jxl/base/status.h"
      7 #include "lib/jxl/convolve.h"
      8 
      9 #undef HWY_TARGET_INCLUDE
     10 #define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric3.cc"
     11 #include <hwy/foreach_target.h>
     12 #include <hwy/highway.h>
     13 
     14 #include "lib/jxl/base/rect.h"
     15 #include "lib/jxl/convolve-inl.h"
     16 
     17 HWY_BEFORE_NAMESPACE();
     18 namespace jxl {
     19 namespace HWY_NAMESPACE {
     20 
     21 // These templates are not found via ADL.
     22 using hwy::HWY_NAMESPACE::Add;
     23 using hwy::HWY_NAMESPACE::Mul;
     24 using hwy::HWY_NAMESPACE::MulAdd;
     25 using hwy::HWY_NAMESPACE::Vec;
     26 
     27 template <class WrapY, class V>
     28 static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix,
     29                     const int64_t iy, const size_t ysize, const V wx0,
     30                     const V wx1, const V wx2) {
     31  const HWY_FULL(float) d;
     32  const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix;
     33  const auto in_m2 = LoadU(d, center - 2);
     34  const auto in_p2 = LoadU(d, center + 2);
     35  const auto in_m1 = LoadU(d, center - 1);
     36  const auto in_p1 = LoadU(d, center + 1);
     37  const auto in_00 = Load(d, center);
     38  const auto sum_2 = Mul(wx2, Add(in_m2, in_p2));
     39  const auto sum_1 = Mul(wx1, Add(in_m1, in_p1));
     40  const auto sum_0 = Mul(wx0, in_00);
     41  return Add(sum_2, Add(sum_1, sum_0));
     42 }
     43 
     44 // 3x3 convolution by symmetric kernel with a single scan through the input.
     45 class Symmetric3Strategy {
     46  using D = HWY_CAPPED(float, 16);
     47  using V = Vec<D>;
     48 
     49 public:
     50  static constexpr int64_t kRadius = 1;
     51 
     52  // Only accesses pixels in [0, xsize).
     53  template <size_t kSizeModN, class WrapRow>
     54  static JXL_MAYBE_INLINE void ConvolveRow(
     55      const float* const JXL_RESTRICT row_m, const size_t xsize,
     56      const int64_t stride, const WrapRow& wrap_row,
     57      const WeightsSymmetric3& weights, float* const JXL_RESTRICT row_out) {
     58    const D d;
     59    // t, m, b = top, middle, bottom row;
     60    const float* const JXL_RESTRICT row_t = wrap_row(row_m - stride, stride);
     61    const float* const JXL_RESTRICT row_b = wrap_row(row_m + stride, stride);
     62 
     63    // Must load in advance - compiler doesn't understand LoadDup128 and
     64    // schedules them too late.
     65    const V w0 = LoadDup128(d, weights.c);
     66    const V w1 = LoadDup128(d, weights.r);
     67    const V w2 = LoadDup128(d, weights.d);
     68 
     69    // l, c, r = left, center, right. Leftmost vector: need FirstL1.
     70    {
     71      const V tc = LoadU(d, row_t + 0);
     72      const V mc = LoadU(d, row_m + 0);
     73      const V bc = LoadU(d, row_b + 0);
     74      const V tl = Neighbors::FirstL1(tc);
     75      const V tr = LoadU(d, row_t + 0 + 1);
     76      const V ml = Neighbors::FirstL1(mc);
     77      const V mr = LoadU(d, row_m + 0 + 1);
     78      const V bl = Neighbors::FirstL1(bc);
     79      const V br = LoadU(d, row_b + 0 + 1);
     80      const V conv =
     81          WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
     82      Store(conv, d, row_out + 0);
     83    }
     84 
     85    // Loop as long as we can load enough new values:
     86    const size_t N = Lanes(d);
     87    size_t x = N;
     88    for (; x + N + kRadius <= xsize; x += N) {
     89      const auto conv = ConvolveValid(row_t, row_m, row_b, x, w0, w1, w2);
     90      Store(conv, d, row_out + x);
     91    }
     92 
     93    // For final (partial) vector:
     94    const V tc = LoadU(d, row_t + x);
     95    const V mc = LoadU(d, row_m + x);
     96    const V bc = LoadU(d, row_b + x);
     97 
     98    V tr;
     99    V mr;
    100    V br;
    101 #if HWY_TARGET == HWY_SCALAR
    102    tr = tc;  // Single-lane => mirrored right neighbor = center value.
    103    mr = mc;
    104    br = bc;
    105 #else
    106    if (kSizeModN == 0) {
    107      // The above loop didn't handle the last vector because it needs an
    108      // additional right neighbor (generated via mirroring).
    109      auto mirror = SetTableIndices(d, MirrorLanes(N - 1));
    110      tr = TableLookupLanes(tc, mirror);
    111      mr = TableLookupLanes(mc, mirror);
    112      br = TableLookupLanes(bc, mirror);
    113    } else {
    114      auto mirror = SetTableIndices(d, MirrorLanes((xsize % N) - 1));
    115      // Loads last valid value into uppermost lane and mirrors.
    116      tr = TableLookupLanes(LoadU(d, row_t + xsize - N), mirror);
    117      mr = TableLookupLanes(LoadU(d, row_m + xsize - N), mirror);
    118      br = TableLookupLanes(LoadU(d, row_b + xsize - N), mirror);
    119    }
    120 #endif
    121 
    122    const V tl = LoadU(d, row_t + x - 1);
    123    const V ml = LoadU(d, row_m + x - 1);
    124    const V bl = LoadU(d, row_b + x - 1);
    125    const V conv = WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
    126    Store(conv, d, row_out + x);
    127  }
    128 
    129 private:
    130  // Returns sum{x_i * w_i}.
    131  template <class V>
    132  static JXL_MAYBE_INLINE V WeightedSum(const V tl, const V tc, const V tr,
    133                                        const V ml, const V mc, const V mr,
    134                                        const V bl, const V bc, const V br,
    135                                        const V w0, const V w1, const V w2) {
    136    const V sum_tb = Add(tc, bc);
    137 
    138    // Faster than 5 mul + 4 FMA.
    139    const V mul0 = Mul(mc, w0);
    140    const V sum_lr = Add(ml, mr);
    141 
    142    const V x1 = Add(sum_tb, sum_lr);
    143    const V mul1 = MulAdd(x1, w1, mul0);
    144 
    145    const V sum_t2 = Add(tl, tr);
    146    const V sum_b2 = Add(bl, br);
    147    const V x2 = Add(sum_t2, sum_b2);
    148    const V mul2 = MulAdd(x2, w2, mul1);
    149    return mul2;
    150  }
    151 
    152  static JXL_MAYBE_INLINE V ConvolveValid(const float* JXL_RESTRICT row_t,
    153                                          const float* JXL_RESTRICT row_m,
    154                                          const float* JXL_RESTRICT row_b,
    155                                          const int64_t x, const V w0,
    156                                          const V w1, const V w2) {
    157    const D d;
    158    const V tc = LoadU(d, row_t + x);
    159    const V mc = LoadU(d, row_m + x);
    160    const V bc = LoadU(d, row_b + x);
    161    const V tl = LoadU(d, row_t + x - 1);
    162    const V tr = LoadU(d, row_t + x + 1);
    163    const V ml = LoadU(d, row_m + x - 1);
    164    const V mr = LoadU(d, row_m + x + 1);
    165    const V bl = LoadU(d, row_b + x - 1);
    166    const V br = LoadU(d, row_b + x + 1);
    167    return WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2);
    168  }
    169 };
    170 
    171 Status Symmetric3(const ImageF& in, const Rect& rect,
    172                  const WeightsSymmetric3& weights, ThreadPool* pool,
    173                  ImageF* out) {
    174  using Conv = ConvolveT<Symmetric3Strategy>;
    175  if (rect.xsize() >= Conv::MinWidth()) {
    176    JXL_ENSURE(SameSize(rect, *out));
    177    JXL_ENSURE(rect.xsize() >= Conv::MinWidth());
    178    Conv::Run(in, rect, weights, pool, out);
    179    return true;
    180  }
    181 
    182  JXL_RETURN_IF_ERROR(SlowSymmetric3(in, rect, weights, pool, out));
    183  return true;
    184 }
    185 
    186 // NOLINTNEXTLINE(google-readability-namespace-comments)
    187 }  // namespace HWY_NAMESPACE
    188 }  // namespace jxl
    189 HWY_AFTER_NAMESPACE();
    190 
    191 #if HWY_ONCE
    192 namespace jxl {
    193 
    194 HWY_EXPORT(Symmetric3);
    195 Status Symmetric3(const ImageF& in, const Rect& rect,
    196                  const WeightsSymmetric3& weights, ThreadPool* pool,
    197                  ImageF* out) {
    198  return HWY_DYNAMIC_DISPATCH(Symmetric3)(in, rect, weights, pool, out);
    199 }
    200 
    201 }  // namespace jxl
    202 #endif  // HWY_ONCE