convolve_symmetric5.cc (7210B)
1 // Copyright (c) the JPEG XL Project Authors. All rights reserved. 2 // 3 // Use of this source code is governed by a BSD-style 4 // license that can be found in the LICENSE file. 5 6 #include "lib/jxl/base/status.h" 7 #include "lib/jxl/convolve.h" 8 9 #undef HWY_TARGET_INCLUDE 10 #define HWY_TARGET_INCLUDE "lib/jxl/convolve_symmetric5.cc" 11 #include <hwy/foreach_target.h> 12 #include <hwy/highway.h> 13 14 #include "lib/jxl/base/common.h" 15 #include "lib/jxl/base/rect.h" 16 #include "lib/jxl/convolve-inl.h" 17 18 HWY_BEFORE_NAMESPACE(); 19 namespace jxl { 20 namespace HWY_NAMESPACE { 21 22 // These templates are not found via ADL. 23 using hwy::HWY_NAMESPACE::Add; 24 using hwy::HWY_NAMESPACE::Mul; 25 using hwy::HWY_NAMESPACE::Vec; 26 27 // Weighted sum of 1x5 pixels around ix, iy with [wx2 wx1 wx0 wx1 wx2]. 28 template <class WrapY> 29 static float WeightedSumBorder(const ImageF& in, const WrapY wrap_y, 30 const int64_t ix, const int64_t iy, 31 const size_t xsize, const size_t ysize, 32 const float wx0, const float wx1, 33 const float wx2) { 34 const WrapMirror wrap_x; 35 const float* JXL_RESTRICT row = in.ConstRow(wrap_y(iy, ysize)); 36 const float in_m2 = row[wrap_x(ix - 2, xsize)]; 37 const float in_p2 = row[wrap_x(ix + 2, xsize)]; 38 const float in_m1 = row[wrap_x(ix - 1, xsize)]; 39 const float in_p1 = row[wrap_x(ix + 1, xsize)]; 40 const float in_00 = row[ix]; 41 const float sum_2 = wx2 * (in_m2 + in_p2); 42 const float sum_1 = wx1 * (in_m1 + in_p1); 43 const float sum_0 = wx0 * in_00; 44 return sum_2 + (sum_1 + sum_0); 45 } 46 47 template <class WrapY, class V> 48 static V WeightedSum(const ImageF& in, const WrapY wrap_y, const size_t ix, 49 const int64_t iy, const size_t ysize, const V wx0, 50 const V wx1, const V wx2) { 51 const HWY_FULL(float) d; 52 const float* JXL_RESTRICT center = in.ConstRow(wrap_y(iy, ysize)) + ix; 53 const auto in_m2 = LoadU(d, center - 2); 54 const auto in_p2 = LoadU(d, center + 2); 55 const auto in_m1 = LoadU(d, center - 1); 56 const auto in_p1 = LoadU(d, center + 1); 57 const auto in_00 = LoadU(d, center); 58 const auto sum_2 = Mul(wx2, Add(in_m2, in_p2)); 59 const auto sum_1 = Mul(wx1, Add(in_m1, in_p1)); 60 const auto sum_0 = Mul(wx0, in_00); 61 return Add(sum_2, Add(sum_1, sum_0)); 62 } 63 64 // Produces result for one pixel 65 template <class WrapY> 66 float Symmetric5Border(const ImageF& in, const int64_t ix, const int64_t iy, 67 const WeightsSymmetric5& weights) { 68 const float w0 = weights.c[0]; 69 const float w1 = weights.r[0]; 70 const float w2 = weights.R[0]; 71 const float w4 = weights.d[0]; 72 const float w5 = weights.L[0]; 73 const float w8 = weights.D[0]; 74 75 const size_t xsize = in.xsize(); 76 const size_t ysize = in.ysize(); 77 const WrapY wrap_y; 78 // Unrolled loop over all 5 rows of the kernel. 79 float sum0 = WeightedSumBorder(in, wrap_y, ix, iy, xsize, ysize, w0, w1, w2); 80 81 sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 2, xsize, ysize, w2, w5, w8); 82 float sum1 = 83 WeightedSumBorder(in, wrap_y, ix, iy + 2, xsize, ysize, w2, w5, w8); 84 85 sum0 += WeightedSumBorder(in, wrap_y, ix, iy - 1, xsize, ysize, w1, w4, w5); 86 sum1 += WeightedSumBorder(in, wrap_y, ix, iy + 1, xsize, ysize, w1, w4, w5); 87 88 return sum0 + sum1; 89 } 90 91 // Produces result for one vector's worth of pixels 92 template <class WrapY> 93 static void Symmetric5Interior(const ImageF& in, const int64_t ix, 94 const int64_t rix, const int64_t iy, 95 const WeightsSymmetric5& weights, 96 float* JXL_RESTRICT row_out) { 97 const HWY_FULL(float) d; 98 99 const auto w0 = LoadDup128(d, weights.c); 100 const auto w1 = LoadDup128(d, weights.r); 101 const auto w2 = LoadDup128(d, weights.R); 102 const auto w4 = LoadDup128(d, weights.d); 103 const auto w5 = LoadDup128(d, weights.L); 104 const auto w8 = LoadDup128(d, weights.D); 105 106 const size_t ysize = in.ysize(); 107 const WrapY wrap_y; 108 // Unrolled loop over all 5 rows of the kernel. 109 auto sum0 = WeightedSum(in, wrap_y, ix, iy, ysize, w0, w1, w2); 110 111 sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 2, ysize, w2, w5, w8)); 112 auto sum1 = WeightedSum(in, wrap_y, ix, iy + 2, ysize, w2, w5, w8); 113 114 sum0 = Add(sum0, WeightedSum(in, wrap_y, ix, iy - 1, ysize, w1, w4, w5)); 115 sum1 = Add(sum1, WeightedSum(in, wrap_y, ix, iy + 1, ysize, w1, w4, w5)); 116 117 StoreU(Add(sum0, sum1), d, row_out + rix); 118 } 119 120 template <class WrapY> 121 static void Symmetric5Row(const ImageF& in, const Rect& rect, const int64_t iy, 122 const WeightsSymmetric5& weights, 123 float* JXL_RESTRICT row_out) { 124 const int64_t kRadius = 2; 125 const size_t xend = rect.x1(); 126 127 size_t rix = 0; 128 size_t ix = rect.x0(); 129 const HWY_FULL(float) d; 130 const size_t N = Lanes(d); 131 const size_t aligned_x = RoundUpTo(kRadius, N); 132 for (; ix < std::min(aligned_x, xend); ++ix, ++rix) { 133 row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); 134 } 135 for (; ix + N + kRadius <= xend; ix += N, rix += N) { 136 Symmetric5Interior<WrapY>(in, ix, rix, iy, weights, row_out); 137 } 138 for (; ix < xend; ++ix, ++rix) { 139 row_out[rix] = Symmetric5Border<WrapY>(in, ix, iy, weights); 140 } 141 } 142 143 // Semi-vectorized (interior pixels Fonly); called directly like slow::, unlike 144 // the fully vectorized strategies below. 145 Status Symmetric5(const ImageF& in, const Rect& in_rect, 146 const WeightsSymmetric5& weights, ThreadPool* pool, 147 ImageF* JXL_RESTRICT out, const Rect& out_rect) { 148 JXL_ENSURE(in_rect.xsize() == out_rect.xsize()); 149 JXL_ENSURE(in_rect.ysize() == out_rect.ysize()); 150 const size_t ysize = in_rect.ysize(); 151 const auto process_row = [&](const uint32_t task, 152 size_t /*thread*/) -> Status { 153 const int64_t riy = task; 154 const int64_t iy = in_rect.y0() + riy; 155 156 if (iy < 2 || iy >= static_cast<ssize_t>(in.ysize()) - 2) { 157 Symmetric5Row<WrapMirror>(in, in_rect, iy, weights, 158 out_rect.Row(out, riy)); 159 } else { 160 Symmetric5Row<WrapUnchanged>(in, in_rect, iy, weights, 161 out_rect.Row(out, riy)); 162 } 163 return true; 164 }; 165 JXL_RETURN_IF_ERROR(RunOnPool(pool, 0, static_cast<uint32_t>(ysize), 166 ThreadPool::NoInit, process_row, 167 "Symmetric5x5Convolution")); 168 return true; 169 } 170 171 // NOLINTNEXTLINE(google-readability-namespace-comments) 172 } // namespace HWY_NAMESPACE 173 } // namespace jxl 174 HWY_AFTER_NAMESPACE(); 175 176 #if HWY_ONCE 177 namespace jxl { 178 179 HWY_EXPORT(Symmetric5); 180 Status Symmetric5(const ImageF& in, const Rect& in_rect, 181 const WeightsSymmetric5& weights, ThreadPool* pool, 182 ImageF* JXL_RESTRICT out, const Rect& out_rect) { 183 return HWY_DYNAMIC_DISPATCH(Symmetric5)(in, in_rect, weights, pool, out, 184 out_rect); 185 } 186 187 Status Symmetric5(const ImageF& in, const Rect& rect, 188 const WeightsSymmetric5& weights, ThreadPool* pool, 189 ImageF* JXL_RESTRICT out) { 190 return Symmetric5(in, rect, weights, pool, out, Rect(*out)); 191 } 192 193 } // namespace jxl 194 #endif // HWY_ONCE