convolve_separable5.cc (9723B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #include "lib/jxl/convolve.h" 7 8 #undef HWY_TARGET_INCLUDE 9 #define HWY_TARGET_INCLUDE "lib/jxl/convolve_separable5.cc" 10 #include <hwy/foreach_target.h> 11 #include <hwy/highway.h> 12 13 #include "lib/jxl/base/rect.h" 14 #include "lib/jxl/convolve-inl.h" 15 16 HWY_BEFORE_NAMESPACE(); 17 namespace jxl { 18 namespace HWY_NAMESPACE { 19 20 // These templates are not found via ADL. 21 using hwy::HWY_NAMESPACE::Add; 22 using hwy::HWY_NAMESPACE::Mul; 23 using hwy::HWY_NAMESPACE::MulAdd; 24 using hwy::HWY_NAMESPACE::Vec; 25 26 // 5x5 convolution by separable kernel with a single scan through the input. 27 // This is more cache-efficient than separate horizontal/vertical passes, and 28 // possibly faster (given enough registers) than tiling and/or transposing. 29 // 30 // Overview: imagine a 5x5 window around a central pixel. First convolve the 31 // rows by multiplying the pixels with the corresponding weights from 32 // WeightsSeparable5.horz[abs(x_offset) * 4]. Then multiply each of these 33 // intermediate results by the corresponding vertical weight, i.e. 34 // vert[abs(y_offset) * 4]. Finally, store the sum of these values as the 35 // convolution result at the position of the central pixel in the output. 36 // 37 // Each of these operations uses SIMD vectors. The central pixel and most 38 // importantly the output are aligned, so neighnoring pixels (e.g. x_offset=1) 39 // require unaligned loads. Because weights are supplied in identical groups of 40 // 4, we can use LoadDup128 to load them (slightly faster). 41 // 42 // Uses mirrored boundary handling. Until x >= kRadius, the horizontal 43 // convolution uses Neighbors class to shuffle vectors as if each of its lanes 44 // had been loaded from the mirrored offset. Similarly, the last full vector to 45 // write uses mirroring. In the case of scalar vectors, Neighbors is not usable 46 // and the value is loaded directly. Otherwise, the number of valid pixels 47 // modulo the vector size enables a small optimization: for smaller offsets, 48 // a non-mirrored load is sufficient. 49 class Separable5Strategy { 50 using D = HWY_CAPPED(float, 16); 51 using V = Vec<D>; 52 53 public: 54 static constexpr int64_t kRadius = 2; 55 56 template <size_t kSizeModN, class WrapRow> 57 static JXL_MAYBE_INLINE void ConvolveRow( 58 const float* const JXL_RESTRICT row_m, const size_t xsize, 59 const int64_t stride, const WrapRow& wrap_row, 60 const WeightsSeparable5& weights, float* const JXL_RESTRICT row_out) { 61 const D d; 62 const int64_t neg_stride = -stride; // allows LEA addressing. 63 const float* const JXL_RESTRICT row_t2 = 64 wrap_row(row_m + 2 * neg_stride, stride); 65 const float* const JXL_RESTRICT row_t1 = 66 wrap_row(row_m + 1 * neg_stride, stride); 67 const float* const JXL_RESTRICT row_b1 = 68 wrap_row(row_m + 1 * stride, stride); 69 const float* const JXL_RESTRICT row_b2 = 70 wrap_row(row_m + 2 * stride, stride); 71 72 const V wh0 = LoadDup128(d, weights.horz + 0 * 4); 73 const V wh1 = LoadDup128(d, weights.horz + 1 * 4); 74 const V wh2 = LoadDup128(d, weights.horz + 2 * 4); 75 const V wv0 = LoadDup128(d, weights.vert + 0 * 4); 76 const V wv1 = LoadDup128(d, weights.vert + 1 * 4); 77 const V wv2 = LoadDup128(d, weights.vert + 2 * 4); 78 79 size_t x = 0; 80 81 // More than one iteration for scalars. 82 for (; x < kRadius; x += Lanes(d)) { 83 const V conv0 = 84 Mul(HorzConvolveFirst(row_m, x, xsize, wh0, wh1, wh2), wv0); 85 86 const V conv1t = HorzConvolveFirst(row_t1, x, xsize, wh0, wh1, wh2); 87 const V conv1b = HorzConvolveFirst(row_b1, x, xsize, wh0, wh1, wh2); 88 const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); 89 90 const V conv2t = HorzConvolveFirst(row_t2, x, xsize, wh0, wh1, wh2); 91 const V conv2b = HorzConvolveFirst(row_b2, x, xsize, wh0, wh1, wh2); 92 const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); 93 Store(conv2, d, row_out + x); 94 } 95 96 // Main loop: load inputs without padding 97 for (; x + Lanes(d) + kRadius <= xsize; x += Lanes(d)) { 98 const V conv0 = Mul(HorzConvolve(row_m + x, wh0, wh1, wh2), wv0); 99 100 const V conv1t = HorzConvolve(row_t1 + x, wh0, wh1, wh2); 101 const V conv1b = HorzConvolve(row_b1 + x, wh0, wh1, wh2); 102 const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); 103 104 const V conv2t = HorzConvolve(row_t2 + x, wh0, wh1, wh2); 105 const V conv2b = HorzConvolve(row_b2 + x, wh0, wh1, wh2); 106 const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); 107 Store(conv2, d, row_out + x); 108 } 109 110 // Last full vector to write (the above loop handled mod >= kRadius) 111 #if HWY_TARGET == HWY_SCALAR 112 while (x < xsize) { 113 #else 114 if (kSizeModN < kRadius) { 115 #endif 116 const V conv0 = 117 Mul(HorzConvolveLast<kSizeModN>(row_m, x, xsize, wh0, wh1, wh2), wv0); 118 119 const V conv1t = 120 HorzConvolveLast<kSizeModN>(row_t1, x, xsize, wh0, wh1, wh2); 121 const V conv1b = 122 HorzConvolveLast<kSizeModN>(row_b1, x, xsize, wh0, wh1, wh2); 123 const V conv1 = MulAdd(Add(conv1t, conv1b), wv1, conv0); 124 125 const V conv2t = 126 HorzConvolveLast<kSizeModN>(row_t2, x, xsize, wh0, wh1, wh2); 127 const V conv2b = 128 HorzConvolveLast<kSizeModN>(row_b2, x, xsize, wh0, wh1, wh2); 129 const V conv2 = MulAdd(Add(conv2t, conv2b), wv2, conv1); 130 Store(conv2, d, row_out + x); 131 x += Lanes(d); 132 } 133 134 // If mod = 0, the above vector was the last. 135 if (kSizeModN != 0) { 136 for (; x < xsize; ++x) { 137 float mul = 0.0f; 138 for (int64_t dy = -kRadius; dy <= kRadius; ++dy) { 139 const float wy = weights.vert[std::abs(dy) * 4]; 140 const float* clamped_row = wrap_row(row_m + dy * stride, stride); 141 for (int64_t dx = -kRadius; dx <= kRadius; ++dx) { 142 const float wx = weights.horz[std::abs(dx) * 4]; 143 const int64_t clamped_x = Mirror(x + dx, xsize); 144 mul += clamped_row[clamped_x] * wx * wy; 145 } 146 } 147 row_out[x] = mul; 148 } 149 } 150 } 151 152 private: 153 // Same as HorzConvolve for the first/last vector in a row. 154 static JXL_MAYBE_INLINE V HorzConvolveFirst( 155 const float* const JXL_RESTRICT row, const int64_t x, const int64_t xsize, 156 const V wh0, const V wh1, const V wh2) { 157 const D d; 158 const V c = LoadU(d, row + x); 159 const V mul0 = Mul(c, wh0); 160 161 #if HWY_TARGET == HWY_SCALAR 162 const V l1 = LoadU(d, row + Mirror(x - 1, xsize)); 163 const V l2 = LoadU(d, row + Mirror(x - 2, xsize)); 164 #else 165 (void)xsize; 166 const V l1 = Neighbors::FirstL1(c); 167 const V l2 = Neighbors::FirstL2(c); 168 #endif 169 170 const V r1 = LoadU(d, row + x + 1); 171 const V r2 = LoadU(d, row + x + 2); 172 173 const V mul1 = MulAdd(Add(l1, r1), wh1, mul0); 174 const V mul2 = MulAdd(Add(l2, r2), wh2, mul1); 175 return mul2; 176 } 177 178 template <size_t kSizeModN> 179 static JXL_MAYBE_INLINE V 180 HorzConvolveLast(const float* const JXL_RESTRICT row, const int64_t x, 181 const int64_t xsize, const V wh0, const V wh1, const V wh2) { 182 const D d; 183 const V c = LoadU(d, row + x); 184 const V mul0 = Mul(c, wh0); 185 186 const V l1 = LoadU(d, row + x - 1); 187 const V l2 = LoadU(d, row + x - 2); 188 189 V r1; 190 V r2; 191 #if HWY_TARGET == HWY_SCALAR 192 r1 = LoadU(d, row + Mirror(x + 1, xsize)); 193 r2 = LoadU(d, row + Mirror(x + 2, xsize)); 194 #else 195 const size_t N = Lanes(d); 196 if (kSizeModN == 0) { 197 r2 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 2))); 198 r1 = TableLookupLanes(c, SetTableIndices(d, MirrorLanes(N - 1))); 199 } else { // == 1 200 const auto last = LoadU(d, row + xsize - N); 201 r2 = TableLookupLanes(last, SetTableIndices(d, MirrorLanes(N - 1))); 202 r1 = last; 203 } 204 #endif 205 206 // Sum of pixels with Manhattan distance i, multiplied by weights[i]. 207 const V sum1 = Add(l1, r1); 208 const V mul1 = MulAdd(sum1, wh1, mul0); 209 const V sum2 = Add(l2, r2); 210 const V mul2 = MulAdd(sum2, wh2, mul1); 211 return mul2; 212 } 213 214 // Requires kRadius valid pixels before/after pos. 215 static JXL_MAYBE_INLINE V HorzConvolve(const float* const JXL_RESTRICT pos, 216 const V wh0, const V wh1, 217 const V wh2) { 218 const D d; 219 const V c = LoadU(d, pos); 220 const V mul0 = Mul(c, wh0); 221 222 // Loading anew is faster than combining vectors. 223 const V l1 = LoadU(d, pos - 1); 224 const V r1 = LoadU(d, pos + 1); 225 const V l2 = LoadU(d, pos - 2); 226 const V r2 = LoadU(d, pos + 2); 227 // Sum of pixels with Manhattan distance i, multiplied by weights[i]. 228 const V sum1 = Add(l1, r1); 229 const V mul1 = MulAdd(sum1, wh1, mul0); 230 const V sum2 = Add(l2, r2); 231 const V mul2 = MulAdd(sum2, wh2, mul1); 232 return mul2; 233 } 234 }; 235 236 Status Separable5(const ImageF& in, const Rect& rect, 237 const WeightsSeparable5& weights, ThreadPool* pool, 238 ImageF* out) { 239 using Conv = ConvolveT<Separable5Strategy>; 240 if (rect.xsize() >= Conv::MinWidth()) { 241 JXL_ENSURE(SameSize(rect, *out)); 242 JXL_ENSURE(rect.xsize() >= Conv::MinWidth()); 243 244 Conv::Run(in, rect, weights, pool, out); 245 return true; 246 } 247 248 return SlowSeparable5(in, rect, weights, pool, out, Rect(*out)); 249 } 250 251 // NOLINTNEXTLINE(google-readability-namespace-comments) 252 } // namespace HWY_NAMESPACE 253 } // namespace jxl 254 HWY_AFTER_NAMESPACE(); 255 256 #if HWY_ONCE 257 namespace jxl { 258 259 HWY_EXPORT(Separable5); 260 Status Separable5(const ImageF& in, const Rect& rect, 261 const WeightsSeparable5& weights, ThreadPool* pool, 262 ImageF* out) { 263 return HWY_DYNAMIC_DISPATCH(Separable5)(in, rect, weights, pool, out); 264 } 265 266 } // namespace jxl 267 #endif // HWY_ONCE