looprestoration_inner.c (15203B)
1 /* 2 * Copyright © 2023, VideoLAN and dav1d authors 3 * Copyright © 2023, Loongson Technology Corporation Limited 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
 */

#include "src/loongarch/looprestoration.h"

#if BITDEPTH == 8

// Row stride (in pixels) of the padded scratch buffer used by all the
// restoration filters below; wide enough for a 256-px unit plus padding.
#define REST_UNIT_STRIDE (400)

// Hand-written LSX/LASX assembly kernels (see the matching .S files).
// Horizontal pass: reads the padded 8-bit rows, writes 32-bit intermediates.
void BF(dav1d_wiener_filter_h, lsx)(int32_t *hor_ptr,
                                    uint8_t *tmp_ptr,
                                    const int16_t filterh[8],
                                    const int w, const int h);

void BF(dav1d_wiener_filter_h, lasx)(int32_t *hor_ptr,
                                     uint8_t *tmp_ptr,
                                     const int16_t filterh[8],
                                     const int w, const int h);

// Vertical pass: consumes the 32-bit intermediates, writes final pixels to p.
void BF(dav1d_wiener_filter_v, lsx)(uint8_t *p,
                                    const ptrdiff_t p_stride,
                                    const int32_t *hor,
                                    const int16_t filterv[8],
                                    const int w, const int h);

void BF(dav1d_wiener_filter_v, lasx)(uint8_t *p,
                                     const ptrdiff_t p_stride,
                                     const int32_t *hor,
                                     const int16_t filterv[8],
                                     const int w, const int h);

// This function refers to the function in the ppc/looprestoration_init_tmpl.c.
//
// Builds a (stripe_h + 6)-row, REST_UNIT_STRIDE-pitched copy of the unit in
// dst with 3 pixels of padding on every side:
//   dst:      scratch buffer, at least (stripe_h + 6) * REST_UNIT_STRIDE bytes
//   p:        top-left of the current restoration unit in the frame
//   stride:   frame stride of p (in bytes; scaled via PXSTRIDE)
//   left:     per-row 4-pixel columns saved from before loop filtering;
//             bytes [1..3] of each entry are the 3 pixels used here
//   lpf:      pre-loop-filter rows (2 above; the "below" rows live at
//             lpf + 6 * stride, see LR_HAVE_BOTTOM branch)
//   unit_w:   unpadded unit width in pixels
//   stripe_h: unpadded unit height in rows
//   edges:    which neighbours exist; missing sides are padded by replication
static inline void padding(uint8_t *dst, const uint8_t *p,
                           const ptrdiff_t stride, const uint8_t (*left)[4],
                           const uint8_t *lpf, int unit_w, const int stripe_h,
                           const enum LrEdgeFlags edges)
{
    const int have_left = !!(edges & LR_HAVE_LEFT);
    const int have_right = !!(edges & LR_HAVE_RIGHT);

    // Copy more pixels if we don't have to pad them
    unit_w += 3 * have_left + 3 * have_right;
    // dst_l skips the 3 left-padding columns when they must be synthesized
    // later (the !have_left branch at the bottom fills them).
    uint8_t *dst_l = dst + 3 * !have_left;
    p -= 3 * have_left;
    lpf -= 3 * have_left;

    if (edges & LR_HAVE_TOP) {
        // Copy previous loop filtered rows
        const uint8_t *const above_1 = lpf;
        const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
        // Row 0 duplicates above_1 so there are 3 rows of top context.
        pixel_copy(dst_l, above_1, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
    } else {
        // Pad with first row
        pixel_copy(dst_l, p, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
        if (have_left) {
            // The 3 left pixels of the first row come from the saved
            // pre-loop-filter column, not from p.
            pixel_copy(dst_l, &left[0][1], 3);
            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
        }
    }

    // dst_tl points at the first "real" (non-top-padding) row.
    uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
    if (edges & LR_HAVE_BOTTOM) {
        // Copy next loop filtered rows
        const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
        const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
        // below_2 is duplicated to make up the 3 rows of bottom context.
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
    } else {
        // Pad with last row
        const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
        if (have_left) {
            // As above: left 3 pixels of the replicated last row come from
            // the saved column.
            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
        }
    }

    // Inner UNIT_WxSTRIPE_H
    // Bulk copy of the unit body; when have_left, the first 3 columns are
    // skipped here and filled from left[] in the final loop below.
    for (int j = 0; j < stripe_h; j++) {
        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
        dst_tl += REST_UNIT_STRIDE;
        p += PXSTRIDE(stride);
    }

    if (!have_right) {
        uint8_t *pad = dst_l + unit_w;
        uint8_t *row_last = &dst_l[unit_w - 1];
        // Pad 3x(STRIPE_H+6) with last column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(pad, *row_last, 3);
            pad += REST_UNIT_STRIDE;
            row_last += REST_UNIT_STRIDE;
        }
    }

    if (!have_left) {
        // Pad 3x(STRIPE_H+6) with first column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(dst, *dst_l, 3);
            dst += REST_UNIT_STRIDE;
            dst_l += REST_UNIT_STRIDE;
        }
    } else {
        // Fill the 3 left columns of the body rows from the saved
        // pre-loop-filter pixels (top/bottom padding rows were handled above).
        dst += 3 * REST_UNIT_STRIDE;
        for (int j = 0; j < stripe_h; j++) {
            pixel_copy(dst, &left[j][1], 3);
            dst += REST_UNIT_STRIDE;
        }
    }
}
// This function refers to the function in the ppc/looprestoration_init_tmpl.c.

// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory
// (should be possible to implement with only 6 rows of temp storage)
//
// 7-tap separable Wiener filter, LSX version: pad the unit into a scratch
// buffer, run the horizontal asm pass over all h + 6 rows, then the vertical
// asm pass to produce the final h output rows in place in p.
void dav1d_wiener_filter_lsx(uint8_t *p, const ptrdiff_t p_stride,
                             const uint8_t (*const left)[4],
                             const uint8_t *lpf,
                             const int w, const int h,
                             const LooprestorationParams *const params,
                             const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // filter[0] = horizontal taps, filter[1] = vertical taps.
    const int16_t (*const filter)[8] = params->filter;

    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
    // of padding above and below
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    // +64 gives the asm kernel overread slack at the end of the buffer.
    ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);

    BF(dav1d_wiener_filter_h, lsx)(hor, tmp, filter[0], w, h + 6);
    BF(dav1d_wiener_filter_v, lsx)(p, p_stride, hor, filter[1], w, h);
}

// Identical structure to dav1d_wiener_filter_lsx, but dispatching to the
// 256-bit LASX kernels.
void dav1d_wiener_filter_lasx(uint8_t *p, const ptrdiff_t p_stride,
                              const uint8_t (*const left)[4],
                              const uint8_t *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    const int16_t (*const filter)[8] = params->filter;

    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
    // of padding above and below
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);

    BF(dav1d_wiener_filter_h, lasx)(hor, tmp, filter[0], w, h + 6);
    BF(dav1d_wiener_filter_v, lasx)(p, p_stride, hor, filter[1], w, h);
}

// SGR 3x3 box-sum assembly kernels. Note only an LSX boxsum3_v and
// sgr_3x3_finish exist; the LASX paths below reuse them.
void BF(dav1d_boxsum3_h, lsx)(int32_t *sumsq, int16_t *sum, pixel *src,
                              const int w, const int h);
void BF(dav1d_boxsum3_v, lsx)(int32_t *sumsq, int16_t *sum,
                              const int w, const int h);

void BF(dav1d_boxsum3_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
                                  const int w, const int h, const int w1);
void BF(dav1d_boxsum3_sgf_v, lsx)(int16_t *dst, uint8_t *tmp,
                                  int32_t *sumsq, int16_t *sum,
                                  const int w, const int h);
void BF(dav1d_sgr_3x3_finish, lsx)(pixel *p, const ptrdiff_t p_stride,
                                   int16_t *dst, int w1,
                                   const int w, const int h);

void BF(dav1d_boxsum3_h, lasx)(int32_t *sumsq, int16_t *sum, pixel *src,
                               const int w, const int h);
void BF(dav1d_boxsum3_sgf_h, lasx)(int32_t *sumsq, int16_t *sum,
                                   const int w, const int h, const int w1);
void BF(dav1d_boxsum3_sgf_v, lasx)(int16_t *dst, uint8_t *tmp,
                                   int32_t *sumsq, int16_t *sum,
                                   const int w, const int h);

// 3x3 box sums (sum and sum-of-squares) over the padded w+6 x h+6 area.
static inline void boxsum3_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    BF(dav1d_boxsum3_h, lsx)(sumsq, sum, src, w + 6, h + 6);
    BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6);
}

static inline void boxsum3_lasx(int32_t *sumsq, coef *sum, pixel *src,
                                const int w, const int h)
{
    BF(dav1d_boxsum3_h, lasx)(sumsq, sum, src, w + 6, h + 6);
    // Intentional: no LASX boxsum3_v is declared in this file, so the
    // vertical pass falls back to the LSX kernel.
    BF(dav1d_boxsum3_v, lsx)(sumsq, sum, w + 6, h + 6);
}

// Self-guided restoration, 3x3 (radius-1) pass only, LSX version.
void dav1d_sgr_filter_3x3_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    // Filtered intermediate for the finish pass (max 64 rows x 384 cols).
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );

    boxsum3_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
}

// LASX variant of the 3x3 pass; the finish step reuses the LSX kernel
// (no LASX sgr_3x3_finish is declared in this file).
void dav1d_sgr_filter_3x3_lasx(pixel *p, const ptrdiff_t p_stride,
                               const pixel (*const left)[4],
                               const pixel *lpf,
                               const int w, const int h,
                               const LooprestorationParams *const params,
                               const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );

    boxsum3_lasx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lasx)(sumsq, sum, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lasx)(dst, tmp, sumsq, sum, w, h);
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w1, w, h);
}

// SGR 5x5 box-sum assembly kernels (LSX only).
void BF(dav1d_boxsum5_h, lsx)(int32_t *sumsq, int16_t *sum,
                              const uint8_t *const src,
                              const int w, const int h);

void BF(dav1d_boxsum5_v, lsx)(int32_t *sumsq, int16_t *sum,
                              const int w, const int h);

void BF(dav1d_boxsum5_sgf_h, lsx)(int32_t *sumsq, int16_t *sum,
                                  const int w, const int h,
                                  const unsigned s);

void BF(dav1d_boxsum5_sgf_v, lsx)(int16_t *dst, uint8_t *src,
                                  int32_t *sumsq, int16_t *sum,
                                  const int w, const int h);

// Blends the 5x5 (dst0/w0) and 3x3 (dst1/w1) filtered planes into p.
void BF(dav1d_sgr_mix_finish, lsx)(uint8_t *p, const ptrdiff_t stride,
                                   const int16_t *dst0, const int16_t *dst1,
                                   const int w0, const int w1,
                                   const int w, const int h);

// 5x5 box sums over the padded w+6 x h+6 area.
static inline void boxsum5_lsx(int32_t *sumsq, coef *sum, pixel *src,
                               const int w, const int h)
{
    BF(dav1d_boxsum5_h, lsx)(sumsq, sum, src, w + 6, h + 6);
    BF(dav1d_boxsum5_v, lsx)(sumsq, sum, w + 6, h + 6);
}

// Self-guided restoration, 5x5 (radius-2) pass only, LSX version.
// The shared sgr_3x3_finish kernel performs the final blend here too,
// just with the 5x5 weight (w0).
void dav1d_sgr_filter_5x5_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst[64 * 384];

    ALIGN_STK_16(int32_t, sumsq, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum, 68 * REST_UNIT_STRIDE + 16, );

    boxsum5_lsx(sumsq, sum, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq, sum, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst, tmp, sumsq, sum, w, h);
    BF(dav1d_sgr_3x3_finish, lsx)(p, p_stride, dst, params->sgr.w0, w, h);
}

// Combined self-guided restoration: run the 5x5 pass into dst0 and the 3x3
// pass into dst1, then mix both into p. The sumsq0/sum0 scratch buffers are
// deliberately reused between the two passes (each boxsum fully rewrites them).
void dav1d_sgr_filter_mix_lsx(pixel *p, const ptrdiff_t p_stride,
                              const pixel (*const left)[4],
                              const pixel *lpf,
                              const int w, const int h,
                              const LooprestorationParams *const params,
                              const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, p_stride, left, lpf, w, h, edges);
    coef dst0[64 * 384];
    coef dst1[64 * 384];

    ALIGN_STK_16(int32_t, sumsq0, 68 * REST_UNIT_STRIDE + 8, );
    ALIGN_STK_16(int16_t, sum0, 68 * REST_UNIT_STRIDE + 16, );

    boxsum5_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum5_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s0);
    BF(dav1d_boxsum5_sgf_v, lsx)(dst0, tmp, sumsq0, sum0, w, h);

    boxsum3_lsx(sumsq0, sum0, tmp, w, h);
    BF(dav1d_boxsum3_sgf_h, lsx)(sumsq0, sum0, w, h, params->sgr.s1);
    BF(dav1d_boxsum3_sgf_v, lsx)(dst1, tmp, sumsq0, sum0, w, h);

    BF(dav1d_sgr_mix_finish, lsx)(p, p_stride, dst0, dst1, params->sgr.w0,
                                  params->sgr.w1, w, h);
}
#endif