[ tor-browser ].git.dasho

convolve_separable5.cc (9723B)
      1 // Copyright (c) the JPEG XL Project Authors. All rights reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style
      4 // license that can be found in the LICENSE file.
      5 
      6 #include "lib/jxl/convolve.h"
      7 
      8 #undef HWY_TARGET_INCLUDE
      9 #define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable5.cc"
     10 #include <hwy/foreach_target.h>
     11 #include <hwy/highway.h>
     12 
     13 #include "lib/jxl/base/rect.h"
     14 #include "lib/jxl/convolve-inl.h"
     15 
     16 HWY_BEFORE_NAMESPACE();
     17 namespace jxl {
     18 namespace HWY_NAMESPACE {
     19 
     20 // These templates are not found via ADL.
     21 using hwy::HWY_NAMESPACE::Add;
     22 using hwy::HWY_NAMESPACE::Mul;
     23 using hwy::HWY_NAMESPACE::MulAdd;
     24 using hwy::HWY_NAMESPACE::Vec;
     25 
     26 // 5x5 convolution by separable kernel with a single scan through the input.
     27 // This is more cache-efficient than separate horizontal/vertical passes, and
     28 // possibly faster (given enough registers) than tiling and/or transposing.
     29 //
     30 // Overview: imagine a 5x5 window around a central pixel. First convolve the
     31 // rows by multiplying the pixels with the corresponding weights from
     32 // WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these
     33 // intermediate results by the corresponding vertical weight, i.e.
     34 // vert[abs(y_offset) * 4]. Finally, store the sum of these values as the
     35 // convolution result at the position of the central pixel in the output.
     36 //
     37 // Each of these operations uses SIMD vectors. The central pixel and most
     38 // importantly the output are aligned, so neighnoring pixels (e.g. x_offset=1)
     39 // require unaligned loads. Because weights are supplied in identical groups of
     40 // 4, we can use LoadDup128 to load them (slightly faster).
     41 //
     42 // Uses mirrored boundary handling. Until x >= kRadius, the horizontal
     43 // convolution uses Neighbors class to shuffle vectors as if each of its lanes
     44 // had been loaded from the mirrored offset. Similarly, the last full vector to
     45 // write uses mirroring. In the case of scalar vectors, Neighbors is not usable
     46 // and the value is loaded directly. Otherwise, the number of valid pixels
     47 // modulo the vector size enables a small optimization: for smaller offsets,
     48 // a non-mirrored load is sufficient.
     49 class Separable5Strategy {
     50  using D = HWY_CAPPED(float, 16);
     51  using V = Vec<D>;
     52 
     53 public:
     54  static constexpr int64_t kRadius = 2;
     55 
     56  template <size_t kSizeModN, class WrapRow>
     57  static JXL_MAYBE_INLINE void ConvolveRow(
     58      const float* const JXL_RESTRICT row_m, const size_t xsize,
     59      const int64_t stride, const WrapRow& wrap_row,
     60      const WeightsSeparable5& weights, float* const JXL_RESTRICT row_out) {
     61    const D d;
     62    const int64_t neg_stride = -stride;  // allows LEA addressing.
     63    const float* const JXL_RESTRICT row_t2 =
     64        wrap_row(row_m + 2 * neg_stride, stride);
     65    const float* const JXL_RESTRICT row_t1 =
     66        wrap_row(row_m + 1 * neg_stride, stride);
     67    const float* const JXL_RESTRICT row_b1 =
     68        wrap_row(row_m + 1 * stride, stride);
     69    const float* const JXL_RESTRICT row_b2 =
     70        wrap_row(row_m + 2 * stride, stride);
     71 
     72    const V wh0 = LoadDup128(d, weights.horz + 0 * 4);
     73    const V wh1 = LoadDup128(d, weights.horz + 1 * 4);
     74    const V wh2 = LoadDup128(d, weights.horz + 2 * 4);
     75    const V wv0 = LoadDup128(d, weights.vert + 0 * 4);
     76    const V wv1 = LoadDup128(d, weights.vert + 1 * 4);
     77    const V wv2 = LoadDup128(d, weights.vert + 2 * 4);
     78 
     79    size_t x = 0;
     80 
     81    // More than one iteration for scalars.
     82    for (; x < kRadius; x += Lanes(d)) {
     83      const V conv0 =
     84          Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2), wv0);
     85 
     86      const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2);
     87      const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2);
     88      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
     89 
     90      const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2);
     91      const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2);
     92      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
     93      Store(conv2, d, row_out + x);
     94    }
     95 
     96    // Main loop: load inputs without padding
     97    for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) {
     98      const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2), wv0);
     99 
    100      const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2);
    101      const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2);
    102      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
    103 
    104      const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2);
    105      const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2);
    106      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
    107      Store(conv2, d, row_out + x);
    108    }
    109 
    110    // Last full vector to write (the above loop handled mod >= kRadius)
    111 #if HWY_TARGET == HWY_SCALAR
    112    while (x < xsize) {
    113 #else
    114    if (kSizeModN < kRadius) {
    115 #endif
    116      const V conv0 =
    117          Mul(HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2), wv0);
    118 
    119      const V conv1t =
    120          HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2);
    121      const V conv1b =
    122          HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2);
    123      const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0);
    124 
    125      const V conv2t =
    126          HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2);
    127      const V conv2b =
    128          HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2);
    129      const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1);
    130      Store(conv2, d, row_out + x);
    131      x += Lanes(d);
    132    }
    133 
    134    // If mod = 0, the above vector was the last.
    135    if (kSizeModN != 0) {
    136      for (; x < xsize; ++x) {
    137        float mul = 0.0f;
    138        for (int64_t dy = -kRadius; dy <= kRadius; ++dy) {
    139          const float wy = weights.vert[std::abs(dy) * 4];
    140          const float* clamped_row = wrap_row(row_m + dy * stride, stride);
    141          for (int64_t dx = -kRadius; dx <= kRadius; ++dx) {
    142            const float wx = weights.horz[std::abs(dx) * 4];
    143            const int64_t clamped_x = Mirror(x + dx, xsize);
    144            mul += clamped_row[clamped_x] * wx * wy;
    145          }
    146        }
    147        row_out[x] = mul;
    148      }
    149    }
    150  }
    151 
    152 private:
    153  // Same as HorzConvolve for the first/last vector in a row.
    154  static JXL_MAYBE_INLINE V HorzConvolveFirst(
    155      const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize,
    156      const V wh0, const V wh1, const V wh2) {
    157    const D d;
    158    const V c = LoadU(d, row + x);
    159    const V mul0 = Mul(c, wh0);
    160 
    161 #if HWY_TARGET == HWY_SCALAR
    162    const V l1 = LoadU(d, row + Mirror(x - 1, xsize));
    163    const V l2 = LoadU(d, row + Mirror(x - 2, xsize));
    164 #else
    165    (void)xsize;
    166    const V l1 = Neighbors::FirstL1(c);
    167    const V l2 = Neighbors::FirstL2(c);
    168 #endif
    169 
    170    const V r1 = LoadU(d, row + x + 1);
    171    const V r2 = LoadU(d, row + x + 2);
    172 
    173    const V mul1 = MulAdd(Add(l1, r1), wh1, mul0);
    174    const V mul2 = MulAdd(Add(l2, r2), wh2, mul1);
    175    return mul2;
    176  }
    177 
    178  template <size_t kSizeModN>
    179  static JXL_MAYBE_INLINE V
    180  HorzConvolveLast(const float* const JXL_RESTRICT row, const int64_t x,
    181                   const int64_t xsize, const V wh0, const V wh1, const V wh2) {
    182    const D d;
    183    const V c = LoadU(d, row + x);
    184    const V mul0 = Mul(c, wh0);
    185 
    186    const V l1 = LoadU(d, row + x - 1);
    187    const V l2 = LoadU(d, row + x - 2);
    188 
    189    V r1;
    190    V r2;
    191 #if HWY_TARGET == HWY_SCALAR
    192    r1 = LoadU(d, row + Mirror(x + 1, xsize));
    193    r2 = LoadU(d, row + Mirror(x + 2, xsize));
    194 #else
    195    const size_t N = Lanes(d);
    196    if (kSizeModN == 0) {
    197      r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2)));
    198      r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1)));
    199    } else {  // == 1
    200      const auto last = LoadU(d, row + xsize - N);
    201      r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1)));
    202      r1 = last;
    203    }
    204 #endif
    205 
    206    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
    207    const V sum1 = Add(l1, r1);
    208    const V mul1 = MulAdd(sum1, wh1, mul0);
    209    const V sum2 = Add(l2, r2);
    210    const V mul2 = MulAdd(sum2, wh2, mul1);
    211    return mul2;
    212  }
    213 
    214  // Requires kRadius valid pixels before/after pos.
    215  static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos,
    216                                         const V wh0, const V wh1,
    217                                         const V wh2) {
    218    const D d;
    219    const V c = LoadU(d, pos);
    220    const V mul0 = Mul(c, wh0);
    221 
    222    // Loading anew is faster than combining vectors.
    223    const V l1 = LoadU(d, pos - 1);
    224    const V r1 = LoadU(d, pos + 1);
    225    const V l2 = LoadU(d, pos - 2);
    226    const V r2 = LoadU(d, pos + 2);
    227    // Sum of pixels with Manhattan distance i, multiplied by weights[i].
    228    const V sum1 = Add(l1, r1);
    229    const V mul1 = MulAdd(sum1, wh1, mul0);
    230    const V sum2 = Add(l2, r2);
    231    const V mul2 = MulAdd(sum2, wh2, mul1);
    232    return mul2;
    233  }
    234 };
    235 
    236 Status Separable5(const ImageF& in, const Rect& rect,
    237                  const WeightsSeparable5& weights, ThreadPool* pool,
    238                  ImageF* out) {
    239  using Conv = ConvolveT<Separable5Strategy>;
    240  if (rect.xsize() >= Conv::MinWidth()) {
    241    JXL_ENSURE(SameSize(rect, *out));
    242    JXL_ENSURE(rect.xsize() >= Conv::MinWidth());
    243 
    244    Conv::Run(in, rect, weights, pool, out);
    245    return true;
    246  }
    247 
    248  return SlowSeparable5(in, rect, weights, pool, out, Rect(*out));
    249 }
    250 
    251 // NOLINTNEXTLINE(google-readability-namespace-comments)
    252 }  // namespace HWY_NAMESPACE
    253 }  // namespace jxl
    254 HWY_AFTER_NAMESPACE();
    255 
    256 #if HWY_ONCE
    257 namespace jxl {
    258 
    259 HWY_EXPORT(Separable5);
    260 Status Separable5(const ImageF& in, const Rect& rect,
    261                  const WeightsSeparable5& weights, ThreadPool* pool,
    262                  ImageF* out) {
    263  return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out);
    264 }
    265 
    266 }  // namespace jxl
    267 #endif  // HWY_ONCE
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE