convolve_symmetric3.cc (7031B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #include "lib/jxl/base/status.h" 7 #include "lib/jxl/convolve.h" 8 9 #undef HWY_TARGET_INCLUDE 10 #define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric3.cc" 11 #include <hwy/foreach_target.h> 12 #include <hwy/highway.h> 13 14 #include "lib/jxl/base/rect.h" 15 #include "lib/jxl/convolve-inl.h" 16 17 HWY_BEFORE_NAMESPACE(); 18 namespace jxl { 19 namespace HWY_NAMESPACE { 20 21 // These templates are not found via ADL. 22 using hwy::HWY_NAMESPACE::Add; 23 using hwy::HWY_NAMESPACE::Mul; 24 using hwy::HWY_NAMESPACE::MulAdd; 25 using hwy::HWY_NAMESPACE::Vec; 26 27 template <class WrapY, class V> 28 static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix, 29 const int64_t iy, const size_t ysize, const V wx0, 30 const V wx1, const V wx2) { 31 const HWY_FULL(float) d; 32 const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix; 33 const auto in_m2 = LoadU(d, center - 2); 34 const auto in_p2 = LoadU(d, center + 2); 35 const auto in_m1 = LoadU(d, center - 1); 36 const auto in_p1 = LoadU(d, center + 1); 37 const auto in_00 = Load(d, center); 38 const auto sum_2 = Mul(wx2, Add(in_m2, in_p2)); 39 const auto sum_1 = Mul(wx1, Add(in_m1, in_p1)); 40 const auto sum_0 = Mul(wx0, in_00); 41 return Add(sum_2, Add(sum_1, sum_0)); 42 } 43 44 // 3x3 convolution by symmetric kernel with a single scan through the input. 45 class Symmetric3Strategy { 46 using D = HWY_CAPPED(float, 16); 47 using V = Vec<D>; 48 49 public: 50 static constexpr int64_t kRadius = 1; 51 52 // Only accesses pixels in [0, xsize). 53 template <size_t kSizeModN, class WrapRow> 54 static JXL_MAYBE_INLINE void ConvolveRow( 55 const float* const JXL_RESTRICT row_m, const size_t xsize, 56 const int64_t stride, const WrapRow& wrap_row, 57 const WeightsSymmetric3& weights, float* const JXL_RESTRICT row_out) { 58 const D d; 59 // t, m, b = top, middle, bottom row; 60 const float* const JXL_RESTRICT row_t = wrap_row(row_m - stride, stride); 61 const float* const JXL_RESTRICT row_b = wrap_row(row_m + stride, stride); 62 63 // Must load in advance - compiler doesn't understand LoadDup128 and 64 // schedules them too late. 65 const V w0 = LoadDup128(d, weights.c); 66 const V w1 = LoadDup128(d, weights.r); 67 const V w2 = LoadDup128(d, weights.d); 68 69 // l, c, r = left, center, right. Leftmost vector: need FirstL1. 70 { 71 const V tc = LoadU(d, row_t + 0); 72 const V mc = LoadU(d, row_m + 0); 73 const V bc = LoadU(d, row_b + 0); 74 const V tl = Neighbors::FirstL1(tc); 75 const V tr = LoadU(d, row_t + 0 + 1); 76 const V ml = Neighbors::FirstL1(mc); 77 const V mr = LoadU(d, row_m + 0 + 1); 78 const V bl = Neighbors::FirstL1(bc); 79 const V br = LoadU(d, row_b + 0 + 1); 80 const V conv = 81 WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2); 82 Store(conv, d, row_out + 0); 83 } 84 85 // Loop as long as we can load enough new values: 86 const size_t N = Lanes(d); 87 size_t x = N; 88 for (; x + N + kRadius <= xsize; x += N) { 89 const auto conv = ConvolveValid(row_t, row_m, row_b, x, w0, w1, w2); 90 Store(conv, d, row_out + x); 91 } 92 93 // For final (partial) vector: 94 const V tc = LoadU(d, row_t + x); 95 const V mc = LoadU(d, row_m + x); 96 const V bc = LoadU(d, row_b + x); 97 98 V tr; 99 V mr; 100 V br; 101 #if HWY_TARGET == HWY_SCALAR 102 tr = tc; // Single-lane => mirrored right neighbor = center value. 103 mr = mc; 104 br = bc; 105 #else 106 if (kSizeModN == 0) { 107 // The above loop didn't handle the last vector because it needs an 108 // additional right neighbor (generated via mirroring). 109 auto mirror = SetTableIndices(d, MirrorLanes(N - 1)); 110 tr = TableLookupLanes(tc, mirror); 111 mr = TableLookupLanes(mc, mirror); 112 br = TableLookupLanes(bc, mirror); 113 } else { 114 auto mirror = SetTableIndices(d, MirrorLanes((xsize % N) - 1)); 115 // Loads last valid value into uppermost lane and mirrors. 116 tr = TableLookupLanes(LoadU(d, row_t + xsize - N), mirror); 117 mr = TableLookupLanes(LoadU(d, row_m + xsize - N), mirror); 118 br = TableLookupLanes(LoadU(d, row_b + xsize - N), mirror); 119 } 120 #endif 121 122 const V tl = LoadU(d, row_t + x - 1); 123 const V ml = LoadU(d, row_m + x - 1); 124 const V bl = LoadU(d, row_b + x - 1); 125 const V conv = WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2); 126 Store(conv, d, row_out + x); 127 } 128 129 private: 130 // Returns sum{x_i * w_i}. 131 template <class V> 132 static JXL_MAYBE_INLINE V WeightedSum(const V tl, const V tc, const V tr, 133 const V ml, const V mc, const V mr, 134 const V bl, const V bc, const V br, 135 const V w0, const V w1, const V w2) { 136 const V sum_tb = Add(tc, bc); 137 138 // Faster than 5 mul + 4 FMA. 139 const V mul0 = Mul(mc, w0); 140 const V sum_lr = Add(ml, mr); 141 142 const V x1 = Add(sum_tb, sum_lr); 143 const V mul1 = MulAdd(x1, w1, mul0); 144 145 const V sum_t2 = Add(tl, tr); 146 const V sum_b2 = Add(bl, br); 147 const V x2 = Add(sum_t2, sum_b2); 148 const V mul2 = MulAdd(x2, w2, mul1); 149 return mul2; 150 } 151 152 static JXL_MAYBE_INLINE V ConvolveValid(const float* JXL_RESTRICT row_t, 153 const float* JXL_RESTRICT row_m, 154 const float* JXL_RESTRICT row_b, 155 const int64_t x, const V w0, 156 const V w1, const V w2) { 157 const D d; 158 const V tc = LoadU(d, row_t + x); 159 const V mc = LoadU(d, row_m + x); 160 const V bc = LoadU(d, row_b + x); 161 const V tl = LoadU(d, row_t + x - 1); 162 const V tr = LoadU(d, row_t + x + 1); 163 const V ml = LoadU(d, row_m + x - 1); 164 const V mr = LoadU(d, row_m + x + 1); 165 const V bl = LoadU(d, row_b + x - 1); 166 const V br = LoadU(d, row_b + x + 1); 167 return WeightedSum(tl, tc, tr, ml, mc, mr, bl, bc, br, w0, w1, w2); 168 } 169 }; 170 171 Status Symmetric3(const ImageF& in, const Rect& rect, 172 const WeightsSymmetric3& weights, ThreadPool* pool, 173 ImageF* out) { 174 using Conv = ConvolveT<Symmetric3Strategy>; 175 if (rect.xsize() >= Conv::MinWidth()) { 176 JXL_ENSURE(SameSize(rect, *out)); 177 JXL_ENSURE(rect.xsize() >= Conv::MinWidth()); 178 Conv::Run(in, rect, weights, pool, out); 179 return true; 180 } 181 182 JXL_RETURN_IF_ERROR(SlowSymmetric3(in, rect, weights, pool, out)); 183 return true; 184 } 185 186 // NOLINTNEXTLINE(google-readability-namespace-comments) 187 } // namespace HWY_NAMESPACE 188 } // namespace jxl 189 HWY_AFTER_NAMESPACE(); 190 191 #if HWY_ONCE 192 namespace jxl { 193 194 HWY_EXPORT(Symmetric3); 195 Status Symmetric3(const ImageF& in, const Rect& rect, 196 const WeightsSymmetric3& weights, ThreadPool* pool, 197 ImageF* out) { 198 return HWY_DYNAMIC_DISPATCH(Symmetric3)(in, rect, weights, pool, out); 199 } 200 201 } // namespace jxl 202 #endif // HWY_ONCE