/* looprestoration.h (43694B) */
1 /* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * Copyright © 2018, Two Orioles, LLC 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
26 */ 27 28 #include "src/cpu.h" 29 #include "src/looprestoration.h" 30 31 #if ARCH_AARCH64 32 void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride, 33 const pixel (*left)[4], const pixel *lpf, 34 const int w, int h, 35 const LooprestorationParams *const params, 36 const enum LrEdgeFlags edges 37 HIGHBD_DECL_SUFFIX); 38 void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride, 39 const pixel (*left)[4], const pixel *lpf, 40 const int w, int h, 41 const LooprestorationParams *const params, 42 const enum LrEdgeFlags edges 43 HIGHBD_DECL_SUFFIX); 44 #else 45 46 // The 8bpc version calculates things slightly differently than the reference 47 // C version. That version calculates roughly this: 48 // int16_t sum = 0; 49 // for (int i = 0; i < 7; i++) 50 // sum += src[idx] * fh[i]; 51 // int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h; 52 // sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h; 53 // sum += 1 << (bitdepth + 6 - round_bits_h); 54 // Compared to the reference C version, this is the output of the first pass 55 // _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e. 56 // with round_offset precompensated. 57 // The 16bpc version calculates things pretty much the same way as the 58 // reference C version, but with the end result subtracted by 59 // 1 << (bitdepth + 6 - round_bits_h). 60 void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4], 61 const pixel *src, const int16_t fh[8], 62 const int w, const enum LrEdgeFlags edges 63 HIGHBD_DECL_SUFFIX); 64 // This calculates things slightly differently than the reference C version. 65 // This version calculates roughly this: 66 // int32_t sum = 0; 67 // for (int i = 0; i < 7; i++) 68 // sum += mid[idx] * fv[i]; 69 // sum = (sum + rounding_off_v) >> round_bits_v; 70 // This function assumes that the width is a multiple of 8. 
71 void BF(dav1d_wiener_filter_v, neon)(pixel *dst, int16_t **ptrs, 72 const int16_t fv[8], const int w 73 HIGHBD_DECL_SUFFIX); 74 75 void BF(dav1d_wiener_filter_hv, neon)(pixel *dst, const pixel (*left)[4], 76 const pixel *src, 77 const int16_t filter[2][8], 78 const int w, const enum LrEdgeFlags edges, 79 int16_t **ptrs 80 HIGHBD_DECL_SUFFIX); 81 82 static void wiener_filter_neon(pixel *p, const ptrdiff_t stride, 83 const pixel (*left)[4], const pixel *lpf, 84 const int w, int h, 85 const LooprestorationParams *const params, 86 const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) 87 { 88 ALIGN_STK_16(int16_t, hor, 6 * 384,); 89 int16_t *ptrs[7], *rows[6]; 90 for (int i = 0; i < 6; i++) 91 rows[i] = &hor[i * 384]; 92 const int16_t (*const filter)[8] = params->filter; 93 const int16_t *fh = params->filter[0]; 94 const int16_t *fv = params->filter[1]; 95 const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); 96 97 const pixel *src = p; 98 if (edges & LR_HAVE_TOP) { 99 ptrs[0] = rows[0]; 100 ptrs[1] = rows[0]; 101 ptrs[2] = rows[1]; 102 ptrs[3] = rows[2]; 103 ptrs[4] = rows[2]; 104 ptrs[5] = rows[2]; 105 106 BF(dav1d_wiener_filter_h, neon)(rows[0], NULL, lpf, fh, w, edges 107 HIGHBD_TAIL_SUFFIX); 108 lpf += PXSTRIDE(stride); 109 BF(dav1d_wiener_filter_h, neon)(rows[1], NULL, lpf, fh, w, edges 110 HIGHBD_TAIL_SUFFIX); 111 112 BF(dav1d_wiener_filter_h, neon)(rows[2], left, src, fh, w, edges 113 HIGHBD_TAIL_SUFFIX); 114 left++; 115 src += PXSTRIDE(stride); 116 117 if (--h <= 0) 118 goto v1; 119 120 ptrs[4] = ptrs[5] = rows[3]; 121 BF(dav1d_wiener_filter_h, neon)(rows[3], left, src, fh, w, edges 122 HIGHBD_TAIL_SUFFIX); 123 left++; 124 src += PXSTRIDE(stride); 125 126 if (--h <= 0) 127 goto v2; 128 129 ptrs[5] = rows[4]; 130 BF(dav1d_wiener_filter_h, neon)(rows[4], left, src, fh, w, edges 131 HIGHBD_TAIL_SUFFIX); 132 left++; 133 src += PXSTRIDE(stride); 134 135 if (--h <= 0) 136 goto v3; 137 } else { 138 ptrs[0] = rows[0]; 139 ptrs[1] = rows[0]; 140 ptrs[2] = rows[0]; 141 
ptrs[3] = rows[0]; 142 ptrs[4] = rows[0]; 143 ptrs[5] = rows[0]; 144 145 BF(dav1d_wiener_filter_h, neon)(rows[0], left, src, fh, w, edges 146 HIGHBD_TAIL_SUFFIX); 147 left++; 148 src += PXSTRIDE(stride); 149 150 if (--h <= 0) 151 goto v1; 152 153 ptrs[4] = ptrs[5] = rows[1]; 154 BF(dav1d_wiener_filter_h, neon)(rows[1], left, src, fh, w, edges 155 HIGHBD_TAIL_SUFFIX); 156 left++; 157 src += PXSTRIDE(stride); 158 159 if (--h <= 0) 160 goto v2; 161 162 ptrs[5] = rows[2]; 163 BF(dav1d_wiener_filter_h, neon)(rows[2], left, src, fh, w, edges 164 HIGHBD_TAIL_SUFFIX); 165 left++; 166 src += PXSTRIDE(stride); 167 168 if (--h <= 0) 169 goto v3; 170 171 ptrs[6] = rows[3]; 172 BF(dav1d_wiener_filter_hv, neon)(p, left, src, filter, w, edges, ptrs 173 HIGHBD_TAIL_SUFFIX); 174 left++; 175 src += PXSTRIDE(stride); 176 p += PXSTRIDE(stride); 177 178 if (--h <= 0) 179 goto v3; 180 181 ptrs[6] = rows[4]; 182 BF(dav1d_wiener_filter_hv, neon)(p, left, src, filter, w, edges, ptrs 183 HIGHBD_TAIL_SUFFIX); 184 left++; 185 src += PXSTRIDE(stride); 186 p += PXSTRIDE(stride); 187 188 if (--h <= 0) 189 goto v3; 190 } 191 192 ptrs[6] = ptrs[5] + 384; 193 do { 194 BF(dav1d_wiener_filter_hv, neon)(p, left, src, filter, w, edges, ptrs 195 HIGHBD_TAIL_SUFFIX); 196 left++; 197 src += PXSTRIDE(stride); 198 p += PXSTRIDE(stride); 199 } while (--h > 0); 200 201 if (!(edges & LR_HAVE_BOTTOM)) 202 goto v3; 203 204 BF(dav1d_wiener_filter_hv, neon)(p, NULL, lpf_bottom, filter, w, edges, ptrs 205 HIGHBD_TAIL_SUFFIX); 206 lpf_bottom += PXSTRIDE(stride); 207 p += PXSTRIDE(stride); 208 209 BF(dav1d_wiener_filter_hv, neon)(p, NULL, lpf_bottom, filter, w, edges, ptrs 210 HIGHBD_TAIL_SUFFIX); 211 p += PXSTRIDE(stride); 212 v1: 213 BF(dav1d_wiener_filter_v, neon)(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); 214 215 return; 216 217 v3: 218 BF(dav1d_wiener_filter_v, neon)(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); 219 p += PXSTRIDE(stride); 220 v2: 221 BF(dav1d_wiener_filter_v, neon)(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX); 222 p += 
PXSTRIDE(stride); 223 goto v1; 224 } 225 #endif 226 227 static void rotate_neon(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) { 228 int32_t *tmp32 = sumsq_ptrs[0]; 229 int16_t *tmp16 = sum_ptrs[0]; 230 for (int i = 0; i < n - 1; i++) { 231 sumsq_ptrs[i] = sumsq_ptrs[i + 1]; 232 sum_ptrs[i] = sum_ptrs[i + 1]; 233 } 234 sumsq_ptrs[n - 1] = tmp32; 235 sum_ptrs[n - 1] = tmp16; 236 } 237 static void rotate5_x2_neon(int32_t **sumsq_ptrs, int16_t **sum_ptrs) { 238 int32_t *tmp32[2]; 239 int16_t *tmp16[2]; 240 for (int i = 0; i < 2; i++) { 241 tmp32[i] = sumsq_ptrs[i]; 242 tmp16[i] = sum_ptrs[i]; 243 } 244 for (int i = 0; i < 3; i++) { 245 sumsq_ptrs[i] = sumsq_ptrs[i + 2]; 246 sum_ptrs[i] = sum_ptrs[i + 2]; 247 } 248 for (int i = 0; i < 2; i++) { 249 sumsq_ptrs[3 + i] = tmp32[i]; 250 sum_ptrs[3 + i] = tmp16[i]; 251 } 252 } 253 254 void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum, 255 const pixel (*left)[4], 256 const pixel *src, const int w, 257 const enum LrEdgeFlags edges); 258 void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum, 259 const pixel (*left)[4], 260 const pixel *src, const int w, 261 const enum LrEdgeFlags edges); 262 void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3, 263 int32_t *sumsq5, int16_t *sum5, 264 const pixel (*left)[4], 265 const pixel *src, const int w, 266 const enum LrEdgeFlags edges); 267 268 #if ARCH_ARM 269 void dav1d_sgr_box3_row_v_neon(int32_t **sumsq, int16_t **sum, 270 int32_t *sumsq_out, int16_t *sum_out, 271 const int w); 272 void dav1d_sgr_box5_row_v_neon(int32_t **sumsq, int16_t **sum, 273 int32_t *sumsq_out, int16_t *sum_out, 274 const int w); 275 void dav1d_sgr_calc_row_ab1_neon(int32_t *AA, int16_t *BB, int w, int s, 276 int bitdepth_max); 277 void dav1d_sgr_calc_row_ab2_neon(int32_t *AA, int16_t *BB, int w, int s, 278 int bitdepth_max); 279 void BF(dav1d_sgr_finish_filter_row1, neon)(int16_t *tmp, const pixel *src, 280 int32_t **A_ptrs, int16_t **B_ptrs, 281 const int w); 282 void 
BF(dav1d_sgr_weighted_row1, neon)(pixel *dst, const int16_t *t1, 283 const int w, const int wt 284 HIGHBD_DECL_SUFFIX); 285 #else 286 void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum, 287 int32_t *AA, int16_t *BB, 288 const int w, const int s, 289 const int bitdepth_max); 290 void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum, 291 int32_t *AA, int16_t *BB, 292 const int w, const int s, 293 const int bitdepth_max); 294 295 void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst, 296 int32_t **A_ptrs, int16_t **B_ptrs, 297 const int w, const int w1 298 HIGHBD_DECL_SUFFIX); 299 void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride, 300 int32_t **A_ptrs, int16_t **B_ptrs, 301 const int w, const int h, 302 const int w1 HIGHBD_DECL_SUFFIX); 303 304 void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src, 305 const ptrdiff_t src_stride, 306 int32_t **A_ptrs, 307 int16_t **B_ptrs, 308 const int w, const int h); 309 #endif 310 void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src, 311 const ptrdiff_t src_stride, 312 int32_t **A_ptrs, int16_t **B_ptrs, 313 const int w, const int h); 314 void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride, 315 const int16_t *t1, const int16_t *t2, 316 const int w, const int h, 317 const int16_t wt[2] HIGHBD_DECL_SUFFIX); 318 319 static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum, 320 int32_t *sumsq_out, int16_t *sum_out, 321 const int w, const int s, const int bitdepth_max) { 322 #if ARCH_ARM 323 dav1d_sgr_box3_row_v_neon(sumsq, sum, sumsq_out, sum_out, w); 324 dav1d_sgr_calc_row_ab1_neon(sumsq_out, sum_out, w, s, bitdepth_max); 325 #else 326 // box3_v + calc_ab1 327 dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max); 328 #endif 329 rotate_neon(sumsq, sum, 3); 330 } 331 332 static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum, 333 int32_t *sumsq_out, int16_t *sum_out, 334 const int w, const 
int s, const int bitdepth_max) { 335 #if ARCH_ARM 336 dav1d_sgr_box5_row_v_neon(sumsq, sum, sumsq_out, sum_out, w); 337 dav1d_sgr_calc_row_ab2_neon(sumsq_out, sum_out, w, s, bitdepth_max); 338 #else 339 // box5_v + calc_ab2 340 dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max); 341 #endif 342 rotate5_x2_neon(sumsq, sum); 343 } 344 345 static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum, 346 int32_t *AA, int16_t *BB, 347 const pixel (*left)[4], 348 const pixel *src, const int w, 349 const int s, 350 const enum LrEdgeFlags edges, 351 const int bitdepth_max) { 352 BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges); 353 sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max); 354 } 355 356 357 static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride, 358 int32_t **A_ptrs, int16_t **B_ptrs, const int w, 359 const int w1 HIGHBD_DECL_SUFFIX) { 360 #if ARCH_ARM 361 ALIGN_STK_16(int16_t, tmp, 384,); 362 363 BF(dav1d_sgr_finish_filter_row1, neon)(tmp, *dst, A_ptrs, B_ptrs, w); 364 BF(dav1d_sgr_weighted_row1, neon)(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX); 365 #else 366 BF(dav1d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs, 367 w, w1 HIGHBD_TAIL_SUFFIX); 368 #endif 369 *dst += PXSTRIDE(stride); 370 rotate_neon(A_ptrs, B_ptrs, 3); 371 } 372 373 #define ARM_FILTER_OUT_STRIDE 384 374 375 static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride, 376 int32_t **A_ptrs, int16_t **B_ptrs, 377 const int w, const int h, const int w1 378 HIGHBD_DECL_SUFFIX) { 379 #if ARCH_ARM 380 ALIGN_STK_16(int16_t, tmp, 2*ARM_FILTER_OUT_STRIDE,); 381 382 BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp, *dst, stride, A_ptrs, B_ptrs, w, h); 383 BF(dav1d_sgr_weighted_row1, neon)(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX); 384 *dst += PXSTRIDE(stride); 385 if (h > 1) { 386 BF(dav1d_sgr_weighted_row1, neon)(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX); 387 *dst += PXSTRIDE(stride); 388 } 389 #else 390 
BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs, 391 w, h, w1 HIGHBD_TAIL_SUFFIX); 392 *dst += 2*PXSTRIDE(stride); 393 #endif 394 rotate_neon(A_ptrs, B_ptrs, 2); 395 } 396 397 static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride, 398 int32_t **A5_ptrs, int16_t **B5_ptrs, 399 int32_t **A3_ptrs, int16_t **B3_ptrs, 400 const int w, const int h, 401 const int w0, const int w1 HIGHBD_DECL_SUFFIX) { 402 ALIGN_STK_16(int16_t, tmp5, 2*ARM_FILTER_OUT_STRIDE,); 403 ALIGN_STK_16(int16_t, tmp3, 2*ARM_FILTER_OUT_STRIDE,); 404 405 BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride, 406 A5_ptrs, B5_ptrs, w, h); 407 #if ARCH_ARM 408 BF(dav1d_sgr_finish_filter_row1, neon)(tmp3, *dst, A3_ptrs, B3_ptrs, w); 409 BF(dav1d_sgr_finish_filter_row1, neon)(tmp3 + FILTER_OUT_STRIDE, 410 *dst + PXSTRIDE(stride), 411 &A3_ptrs[1], &B3_ptrs[1], w); 412 #else 413 BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride, 414 A3_ptrs, B3_ptrs, w, h); 415 #endif 416 const int16_t wt[2] = { w0, w1 }; 417 BF(dav1d_sgr_weighted2, neon)(*dst, stride, 418 tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX); 419 *dst += h*PXSTRIDE(stride); 420 rotate_neon(A5_ptrs, B5_ptrs, 2); 421 rotate_neon(A3_ptrs, B3_ptrs, 4); 422 } 423 424 425 static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride, 426 const pixel (*left)[4], const pixel *lpf, 427 const int w, int h, 428 const LooprestorationParams *const params, 429 const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) 430 { 431 #define ARM_BUF_STRIDE (384 + 16) 432 ALIGN_STK_16(int32_t, sumsq_buf, ARM_BUF_STRIDE * 3 + 16,); 433 ALIGN_STK_16(int16_t, sum_buf, ARM_BUF_STRIDE * 3 + 16,); 434 int32_t *sumsq_ptrs[3], *sumsq_rows[3]; 435 int16_t *sum_ptrs[3], *sum_rows[3]; 436 for (int i = 0; i < 3; i++) { 437 sumsq_rows[i] = &sumsq_buf[i * ARM_BUF_STRIDE]; 438 sum_rows[i] = &sum_buf[i * ARM_BUF_STRIDE]; 439 } 440 441 ALIGN_STK_16(int32_t, A_buf, ARM_BUF_STRIDE * 3 + 16,); 442 ALIGN_STK_16(int16_t, B_buf, ARM_BUF_STRIDE * 3 + 
16,); 443 int32_t *A_ptrs[3]; 444 int16_t *B_ptrs[3]; 445 for (int i = 0; i < 3; i++) { 446 A_ptrs[i] = &A_buf[i * ARM_BUF_STRIDE]; 447 B_ptrs[i] = &B_buf[i * ARM_BUF_STRIDE]; 448 } 449 const pixel *src = dst; 450 const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); 451 452 if (edges & LR_HAVE_TOP) { 453 sumsq_ptrs[0] = sumsq_rows[0]; 454 sumsq_ptrs[1] = sumsq_rows[1]; 455 sumsq_ptrs[2] = sumsq_rows[2]; 456 sum_ptrs[0] = sum_rows[0]; 457 sum_ptrs[1] = sum_rows[1]; 458 sum_ptrs[2] = sum_rows[2]; 459 460 BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0], 461 NULL, lpf, w, edges); 462 lpf += PXSTRIDE(stride); 463 BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1], 464 NULL, lpf, w, edges); 465 466 sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], 467 left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); 468 left++; 469 src += PXSTRIDE(stride); 470 rotate_neon(A_ptrs, B_ptrs, 3); 471 472 if (--h <= 0) 473 goto vert_1; 474 475 sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], 476 left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); 477 left++; 478 src += PXSTRIDE(stride); 479 rotate_neon(A_ptrs, B_ptrs, 3); 480 481 if (--h <= 0) 482 goto vert_2; 483 } else { 484 sumsq_ptrs[0] = sumsq_rows[0]; 485 sumsq_ptrs[1] = sumsq_rows[0]; 486 sumsq_ptrs[2] = sumsq_rows[0]; 487 sum_ptrs[0] = sum_rows[0]; 488 sum_ptrs[1] = sum_rows[0]; 489 sum_ptrs[2] = sum_rows[0]; 490 491 BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0], 492 left, src, w, edges); 493 left++; 494 src += PXSTRIDE(stride); 495 496 sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], 497 w, params->sgr.s1, BITDEPTH_MAX); 498 rotate_neon(A_ptrs, B_ptrs, 3); 499 500 if (--h <= 0) 501 goto vert_1; 502 503 sumsq_ptrs[2] = sumsq_rows[1]; 504 sum_ptrs[2] = sum_rows[1]; 505 506 sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], 507 left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); 508 left++; 509 src += PXSTRIDE(stride); 510 rotate_neon(A_ptrs, B_ptrs, 3); 511 
512 if (--h <= 0) 513 goto vert_2; 514 515 sumsq_ptrs[2] = sumsq_rows[2]; 516 sum_ptrs[2] = sum_rows[2]; 517 } 518 519 do { 520 sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], 521 left, src, w, params->sgr.s1, edges, BITDEPTH_MAX); 522 left++; 523 src += PXSTRIDE(stride); 524 525 sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, 526 w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); 527 } while (--h > 0); 528 529 if (!(edges & LR_HAVE_BOTTOM)) 530 goto vert_2; 531 532 sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], 533 NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); 534 lpf_bottom += PXSTRIDE(stride); 535 536 sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, 537 w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); 538 539 sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], 540 NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX); 541 542 sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, 543 w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); 544 return; 545 546 vert_2: 547 sumsq_ptrs[2] = sumsq_ptrs[1]; 548 sum_ptrs[2] = sum_ptrs[1]; 549 sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], 550 w, params->sgr.s1, BITDEPTH_MAX); 551 552 sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, 553 w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); 554 555 output_1: 556 sumsq_ptrs[2] = sumsq_ptrs[1]; 557 sum_ptrs[2] = sum_ptrs[1]; 558 sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], 559 w, params->sgr.s1, BITDEPTH_MAX); 560 561 sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs, 562 w, params->sgr.w1 HIGHBD_TAIL_SUFFIX); 563 return; 564 565 vert_1: 566 sumsq_ptrs[2] = sumsq_ptrs[1]; 567 sum_ptrs[2] = sum_ptrs[1]; 568 sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2], 569 w, params->sgr.s1, BITDEPTH_MAX); 570 rotate_neon(A_ptrs, B_ptrs, 3); 571 goto output_1; 572 } 573 574 static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride, 575 const pixel (*left)[4], const pixel *lpf, 576 const int w, int h, 577 const LooprestorationParams *const params, 578 
const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) 579 { 580 ALIGN_STK_16(int32_t, sumsq_buf, ARM_BUF_STRIDE * 5 + 16,); 581 ALIGN_STK_16(int16_t, sum_buf, ARM_BUF_STRIDE * 5 + 16,); 582 int32_t *sumsq_ptrs[5], *sumsq_rows[5]; 583 int16_t *sum_ptrs[5], *sum_rows[5]; 584 for (int i = 0; i < 5; i++) { 585 sumsq_rows[i] = &sumsq_buf[i * ARM_BUF_STRIDE]; 586 sum_rows[i] = &sum_buf[i * ARM_BUF_STRIDE]; 587 } 588 589 ALIGN_STK_16(int32_t, A_buf, ARM_BUF_STRIDE * 2 + 16,); 590 ALIGN_STK_16(int16_t, B_buf, ARM_BUF_STRIDE * 2 + 16,); 591 int32_t *A_ptrs[2]; 592 int16_t *B_ptrs[2]; 593 for (int i = 0; i < 2; i++) { 594 A_ptrs[i] = &A_buf[i * ARM_BUF_STRIDE]; 595 B_ptrs[i] = &B_buf[i * ARM_BUF_STRIDE]; 596 } 597 const pixel *src = dst; 598 const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); 599 600 if (edges & LR_HAVE_TOP) { 601 sumsq_ptrs[0] = sumsq_rows[0]; 602 sumsq_ptrs[1] = sumsq_rows[0]; 603 sumsq_ptrs[2] = sumsq_rows[1]; 604 sumsq_ptrs[3] = sumsq_rows[2]; 605 sumsq_ptrs[4] = sumsq_rows[3]; 606 sum_ptrs[0] = sum_rows[0]; 607 sum_ptrs[1] = sum_rows[0]; 608 sum_ptrs[2] = sum_rows[1]; 609 sum_ptrs[3] = sum_rows[2]; 610 sum_ptrs[4] = sum_rows[3]; 611 612 BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0], 613 NULL, lpf, w, edges); 614 lpf += PXSTRIDE(stride); 615 BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1], 616 NULL, lpf, w, edges); 617 618 BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2], 619 left, src, w, edges); 620 left++; 621 src += PXSTRIDE(stride); 622 623 if (--h <= 0) 624 goto vert_1; 625 626 BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3], 627 left, src, w, edges); 628 left++; 629 src += PXSTRIDE(stride); 630 sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], 631 w, params->sgr.s0, BITDEPTH_MAX); 632 rotate_neon(A_ptrs, B_ptrs, 2); 633 634 if (--h <= 0) 635 goto vert_2; 636 637 // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set 638 // one of them to point at the previously unused rows[4]. 
639 sumsq_ptrs[3] = sumsq_rows[4]; 640 sum_ptrs[3] = sum_rows[4]; 641 } else { 642 sumsq_ptrs[0] = sumsq_rows[0]; 643 sumsq_ptrs[1] = sumsq_rows[0]; 644 sumsq_ptrs[2] = sumsq_rows[0]; 645 sumsq_ptrs[3] = sumsq_rows[0]; 646 sumsq_ptrs[4] = sumsq_rows[0]; 647 sum_ptrs[0] = sum_rows[0]; 648 sum_ptrs[1] = sum_rows[0]; 649 sum_ptrs[2] = sum_rows[0]; 650 sum_ptrs[3] = sum_rows[0]; 651 sum_ptrs[4] = sum_rows[0]; 652 653 BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0], 654 left, src, w, edges); 655 left++; 656 src += PXSTRIDE(stride); 657 658 if (--h <= 0) 659 goto vert_1; 660 661 sumsq_ptrs[4] = sumsq_rows[1]; 662 sum_ptrs[4] = sum_rows[1]; 663 664 BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1], 665 left, src, w, edges); 666 left++; 667 src += PXSTRIDE(stride); 668 669 sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], 670 w, params->sgr.s0, BITDEPTH_MAX); 671 rotate_neon(A_ptrs, B_ptrs, 2); 672 673 if (--h <= 0) 674 goto vert_2; 675 676 sumsq_ptrs[3] = sumsq_rows[2]; 677 sumsq_ptrs[4] = sumsq_rows[3]; 678 sum_ptrs[3] = sum_rows[2]; 679 sum_ptrs[4] = sum_rows[3]; 680 681 BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2], 682 left, src, w, edges); 683 left++; 684 src += PXSTRIDE(stride); 685 686 if (--h <= 0) 687 goto odd; 688 689 BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3], 690 left, src, w, edges); 691 left++; 692 src += PXSTRIDE(stride); 693 694 sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], 695 w, params->sgr.s0, BITDEPTH_MAX); 696 sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, 697 w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); 698 699 if (--h <= 0) 700 goto vert_2; 701 702 // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set 703 // one of them to point at the previously unused rows[4]. 
704 sumsq_ptrs[3] = sumsq_rows[4]; 705 sum_ptrs[3] = sum_rows[4]; 706 } 707 708 do { 709 BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3], 710 left, src, w, edges); 711 left++; 712 src += PXSTRIDE(stride); 713 714 if (--h <= 0) 715 goto odd; 716 717 BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4], 718 left, src, w, edges); 719 left++; 720 src += PXSTRIDE(stride); 721 722 sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], 723 w, params->sgr.s0, BITDEPTH_MAX); 724 sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, 725 w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); 726 } while (--h > 0); 727 728 if (!(edges & LR_HAVE_BOTTOM)) 729 goto vert_2; 730 731 BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3], 732 NULL, lpf_bottom, w, edges); 733 lpf_bottom += PXSTRIDE(stride); 734 BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4], 735 NULL, lpf_bottom, w, edges); 736 737 output_2: 738 sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], 739 w, params->sgr.s0, BITDEPTH_MAX); 740 sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, 741 w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); 742 return; 743 744 vert_2: 745 // Duplicate the last row twice more 746 sumsq_ptrs[3] = sumsq_ptrs[2]; 747 sumsq_ptrs[4] = sumsq_ptrs[2]; 748 sum_ptrs[3] = sum_ptrs[2]; 749 sum_ptrs[4] = sum_ptrs[2]; 750 goto output_2; 751 752 odd: 753 // Copy the last row as padding once 754 sumsq_ptrs[4] = sumsq_ptrs[3]; 755 sum_ptrs[4] = sum_ptrs[3]; 756 757 sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], 758 w, params->sgr.s0, BITDEPTH_MAX); 759 sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, 760 w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX); 761 762 output_1: 763 // Duplicate the last row twice more 764 sumsq_ptrs[3] = sumsq_ptrs[2]; 765 sumsq_ptrs[4] = sumsq_ptrs[2]; 766 sum_ptrs[3] = sum_ptrs[2]; 767 sum_ptrs[4] = sum_ptrs[2]; 768 769 sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], 770 w, params->sgr.s0, BITDEPTH_MAX); 771 // Output only one row 772 
sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs, 773 w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX); 774 return; 775 776 vert_1: 777 // Copy the last row as padding once 778 sumsq_ptrs[4] = sumsq_ptrs[3]; 779 sum_ptrs[4] = sum_ptrs[3]; 780 781 sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1], 782 w, params->sgr.s0, BITDEPTH_MAX); 783 rotate_neon(A_ptrs, B_ptrs, 2); 784 785 goto output_1; 786 } 787 788 static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride, 789 const pixel (*left)[4], const pixel *lpf, 790 const int w, int h, 791 const LooprestorationParams *const params, 792 const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX) 793 { 794 ALIGN_STK_16(int32_t, sumsq5_buf, ARM_BUF_STRIDE * 5 + 16,); 795 ALIGN_STK_16(int16_t, sum5_buf, ARM_BUF_STRIDE * 5 + 16,); 796 int32_t *sumsq5_ptrs[5], *sumsq5_rows[5]; 797 int16_t *sum5_ptrs[5], *sum5_rows[5]; 798 for (int i = 0; i < 5; i++) { 799 sumsq5_rows[i] = &sumsq5_buf[i * ARM_BUF_STRIDE]; 800 sum5_rows[i] = &sum5_buf[i * ARM_BUF_STRIDE]; 801 } 802 ALIGN_STK_16(int32_t, sumsq3_buf, ARM_BUF_STRIDE * 3 + 16,); 803 ALIGN_STK_16(int16_t, sum3_buf, ARM_BUF_STRIDE * 3 + 16,); 804 int32_t *sumsq3_ptrs[3], *sumsq3_rows[3]; 805 int16_t *sum3_ptrs[3], *sum3_rows[3]; 806 for (int i = 0; i < 3; i++) { 807 sumsq3_rows[i] = &sumsq3_buf[i * ARM_BUF_STRIDE]; 808 sum3_rows[i] = &sum3_buf[i * ARM_BUF_STRIDE]; 809 } 810 811 ALIGN_STK_16(int32_t, A5_buf, ARM_BUF_STRIDE * 2 + 16,); 812 ALIGN_STK_16(int16_t, B5_buf, ARM_BUF_STRIDE * 2 + 16,); 813 int32_t *A5_ptrs[2]; 814 int16_t *B5_ptrs[2]; 815 for (int i = 0; i < 2; i++) { 816 A5_ptrs[i] = &A5_buf[i * ARM_BUF_STRIDE]; 817 B5_ptrs[i] = &B5_buf[i * ARM_BUF_STRIDE]; 818 } 819 ALIGN_STK_16(int32_t, A3_buf, ARM_BUF_STRIDE * 4 + 16,); 820 ALIGN_STK_16(int16_t, B3_buf, ARM_BUF_STRIDE * 4 + 16,); 821 int32_t *A3_ptrs[4]; 822 int16_t *B3_ptrs[4]; 823 for (int i = 0; i < 4; i++) { 824 A3_ptrs[i] = &A3_buf[i * ARM_BUF_STRIDE]; 825 B3_ptrs[i] = &B3_buf[i * ARM_BUF_STRIDE]; 826 } 827 const 
pixel *src = dst; 828 const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride); 829 830 if (edges & LR_HAVE_TOP) { 831 sumsq5_ptrs[0] = sumsq5_rows[0]; 832 sumsq5_ptrs[1] = sumsq5_rows[0]; 833 sumsq5_ptrs[2] = sumsq5_rows[1]; 834 sumsq5_ptrs[3] = sumsq5_rows[2]; 835 sumsq5_ptrs[4] = sumsq5_rows[3]; 836 sum5_ptrs[0] = sum5_rows[0]; 837 sum5_ptrs[1] = sum5_rows[0]; 838 sum5_ptrs[2] = sum5_rows[1]; 839 sum5_ptrs[3] = sum5_rows[2]; 840 sum5_ptrs[4] = sum5_rows[3]; 841 842 sumsq3_ptrs[0] = sumsq3_rows[0]; 843 sumsq3_ptrs[1] = sumsq3_rows[1]; 844 sumsq3_ptrs[2] = sumsq3_rows[2]; 845 sum3_ptrs[0] = sum3_rows[0]; 846 sum3_ptrs[1] = sum3_rows[1]; 847 sum3_ptrs[2] = sum3_rows[2]; 848 849 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0], 850 sumsq5_rows[0], sum5_rows[0], 851 NULL, lpf, w, edges); 852 lpf += PXSTRIDE(stride); 853 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1], 854 sumsq5_rows[1], sum5_rows[1], 855 NULL, lpf, w, edges); 856 857 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2], 858 sumsq5_rows[2], sum5_rows[2], 859 left, src, w, edges); 860 left++; 861 src += PXSTRIDE(stride); 862 863 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 864 w, params->sgr.s1, BITDEPTH_MAX); 865 rotate_neon(A3_ptrs, B3_ptrs, 4); 866 867 if (--h <= 0) 868 goto vert_1; 869 870 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], 871 sumsq5_rows[3], sum5_rows[3], 872 left, src, w, edges); 873 left++; 874 src += PXSTRIDE(stride); 875 sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 876 w, params->sgr.s0, BITDEPTH_MAX); 877 rotate_neon(A5_ptrs, B5_ptrs, 2); 878 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 879 w, params->sgr.s1, BITDEPTH_MAX); 880 rotate_neon(A3_ptrs, B3_ptrs, 4); 881 882 if (--h <= 0) 883 goto vert_2; 884 885 // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set 886 // one of them to point at the previously unused rows[4]. 
887 sumsq5_ptrs[3] = sumsq5_rows[4]; 888 sum5_ptrs[3] = sum5_rows[4]; 889 } else { 890 sumsq5_ptrs[0] = sumsq5_rows[0]; 891 sumsq5_ptrs[1] = sumsq5_rows[0]; 892 sumsq5_ptrs[2] = sumsq5_rows[0]; 893 sumsq5_ptrs[3] = sumsq5_rows[0]; 894 sumsq5_ptrs[4] = sumsq5_rows[0]; 895 sum5_ptrs[0] = sum5_rows[0]; 896 sum5_ptrs[1] = sum5_rows[0]; 897 sum5_ptrs[2] = sum5_rows[0]; 898 sum5_ptrs[3] = sum5_rows[0]; 899 sum5_ptrs[4] = sum5_rows[0]; 900 901 sumsq3_ptrs[0] = sumsq3_rows[0]; 902 sumsq3_ptrs[1] = sumsq3_rows[0]; 903 sumsq3_ptrs[2] = sumsq3_rows[0]; 904 sum3_ptrs[0] = sum3_rows[0]; 905 sum3_ptrs[1] = sum3_rows[0]; 906 sum3_ptrs[2] = sum3_rows[0]; 907 908 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0], 909 sumsq5_rows[0], sum5_rows[0], 910 left, src, w, edges); 911 left++; 912 src += PXSTRIDE(stride); 913 914 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 915 w, params->sgr.s1, BITDEPTH_MAX); 916 rotate_neon(A3_ptrs, B3_ptrs, 4); 917 918 if (--h <= 0) 919 goto vert_1; 920 921 sumsq5_ptrs[4] = sumsq5_rows[1]; 922 sum5_ptrs[4] = sum5_rows[1]; 923 924 sumsq3_ptrs[2] = sumsq3_rows[1]; 925 sum3_ptrs[2] = sum3_rows[1]; 926 927 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1], 928 sumsq5_rows[1], sum5_rows[1], 929 left, src, w, edges); 930 left++; 931 src += PXSTRIDE(stride); 932 933 sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 934 w, params->sgr.s0, BITDEPTH_MAX); 935 rotate_neon(A5_ptrs, B5_ptrs, 2); 936 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 937 w, params->sgr.s1, BITDEPTH_MAX); 938 rotate_neon(A3_ptrs, B3_ptrs, 4); 939 940 if (--h <= 0) 941 goto vert_2; 942 943 sumsq5_ptrs[3] = sumsq5_rows[2]; 944 sumsq5_ptrs[4] = sumsq5_rows[3]; 945 sum5_ptrs[3] = sum5_rows[2]; 946 sum5_ptrs[4] = sum5_rows[3]; 947 948 sumsq3_ptrs[2] = sumsq3_rows[2]; 949 sum3_ptrs[2] = sum3_rows[2]; 950 951 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2], 952 sumsq5_rows[2], sum5_rows[2], 953 left, 
src, w, edges); 954 left++; 955 src += PXSTRIDE(stride); 956 957 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 958 w, params->sgr.s1, BITDEPTH_MAX); 959 rotate_neon(A3_ptrs, B3_ptrs, 4); 960 961 if (--h <= 0) 962 goto odd; 963 964 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], 965 sumsq5_rows[3], sum5_rows[3], 966 left, src, w, edges); 967 left++; 968 src += PXSTRIDE(stride); 969 970 sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 971 w, params->sgr.s0, BITDEPTH_MAX); 972 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 973 w, params->sgr.s1, BITDEPTH_MAX); 974 sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, 975 w, 2, params->sgr.w0, params->sgr.w1 976 HIGHBD_TAIL_SUFFIX); 977 978 if (--h <= 0) 979 goto vert_2; 980 981 // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set 982 // one of them to point at the previously unused rows[4]. 983 sumsq5_ptrs[3] = sumsq5_rows[4]; 984 sum5_ptrs[3] = sum5_rows[4]; 985 } 986 987 do { 988 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], 989 sumsq5_ptrs[3], sum5_ptrs[3], 990 left, src, w, edges); 991 left++; 992 src += PXSTRIDE(stride); 993 994 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 995 w, params->sgr.s1, BITDEPTH_MAX); 996 rotate_neon(A3_ptrs, B3_ptrs, 4); 997 998 if (--h <= 0) 999 goto odd; 1000 1001 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], 1002 sumsq5_ptrs[4], sum5_ptrs[4], 1003 left, src, w, edges); 1004 left++; 1005 src += PXSTRIDE(stride); 1006 1007 sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1008 w, params->sgr.s0, BITDEPTH_MAX); 1009 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1010 w, params->sgr.s1, BITDEPTH_MAX); 1011 sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, 1012 w, 2, params->sgr.w0, params->sgr.w1 1013 HIGHBD_TAIL_SUFFIX); 1014 } while (--h > 0); 1015 1016 if (!(edges & 
LR_HAVE_BOTTOM)) 1017 goto vert_2; 1018 1019 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], 1020 sumsq5_ptrs[3], sum5_ptrs[3], 1021 NULL, lpf_bottom, w, edges); 1022 lpf_bottom += PXSTRIDE(stride); 1023 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1024 w, params->sgr.s1, BITDEPTH_MAX); 1025 rotate_neon(A3_ptrs, B3_ptrs, 4); 1026 1027 BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2], 1028 sumsq5_ptrs[4], sum5_ptrs[4], 1029 NULL, lpf_bottom, w, edges); 1030 1031 output_2: 1032 sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1033 w, params->sgr.s0, BITDEPTH_MAX); 1034 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1035 w, params->sgr.s1, BITDEPTH_MAX); 1036 sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, 1037 w, 2, params->sgr.w0, params->sgr.w1 1038 HIGHBD_TAIL_SUFFIX); 1039 return; 1040 1041 vert_2: 1042 // Duplicate the last row twice more 1043 sumsq5_ptrs[3] = sumsq5_ptrs[2]; 1044 sumsq5_ptrs[4] = sumsq5_ptrs[2]; 1045 sum5_ptrs[3] = sum5_ptrs[2]; 1046 sum5_ptrs[4] = sum5_ptrs[2]; 1047 1048 sumsq3_ptrs[2] = sumsq3_ptrs[1]; 1049 sum3_ptrs[2] = sum3_ptrs[1]; 1050 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1051 w, params->sgr.s1, BITDEPTH_MAX); 1052 rotate_neon(A3_ptrs, B3_ptrs, 4); 1053 1054 sumsq3_ptrs[2] = sumsq3_ptrs[1]; 1055 sum3_ptrs[2] = sum3_ptrs[1]; 1056 1057 goto output_2; 1058 1059 odd: 1060 // Copy the last row as padding once 1061 sumsq5_ptrs[4] = sumsq5_ptrs[3]; 1062 sum5_ptrs[4] = sum5_ptrs[3]; 1063 1064 sumsq3_ptrs[2] = sumsq3_ptrs[1]; 1065 sum3_ptrs[2] = sum3_ptrs[1]; 1066 1067 sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1068 w, params->sgr.s0, BITDEPTH_MAX); 1069 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1070 w, params->sgr.s1, BITDEPTH_MAX); 1071 sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, 1072 w, 2, params->sgr.w0, params->sgr.w1 1073 
HIGHBD_TAIL_SUFFIX); 1074 1075 output_1: 1076 // Duplicate the last row twice more 1077 sumsq5_ptrs[3] = sumsq5_ptrs[2]; 1078 sumsq5_ptrs[4] = sumsq5_ptrs[2]; 1079 sum5_ptrs[3] = sum5_ptrs[2]; 1080 sum5_ptrs[4] = sum5_ptrs[2]; 1081 1082 sumsq3_ptrs[2] = sumsq3_ptrs[1]; 1083 sum3_ptrs[2] = sum3_ptrs[1]; 1084 1085 sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1086 w, params->sgr.s0, BITDEPTH_MAX); 1087 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1088 w, params->sgr.s1, BITDEPTH_MAX); 1089 rotate_neon(A3_ptrs, B3_ptrs, 4); 1090 // Output only one row 1091 sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, 1092 w, 1, params->sgr.w0, params->sgr.w1 1093 HIGHBD_TAIL_SUFFIX); 1094 return; 1095 1096 vert_1: 1097 // Copy the last row as padding once 1098 sumsq5_ptrs[4] = sumsq5_ptrs[3]; 1099 sum5_ptrs[4] = sum5_ptrs[3]; 1100 1101 sumsq3_ptrs[2] = sumsq3_ptrs[1]; 1102 sum3_ptrs[2] = sum3_ptrs[1]; 1103 1104 sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1105 w, params->sgr.s0, BITDEPTH_MAX); 1106 rotate_neon(A5_ptrs, B5_ptrs, 2); 1107 sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1108 w, params->sgr.s1, BITDEPTH_MAX); 1109 rotate_neon(A3_ptrs, B3_ptrs, 4); 1110 1111 goto output_1; 1112 } 1113 1114 1115 static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) { 1116 const unsigned flags = dav1d_get_cpu_flags(); 1117 1118 if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; 1119 1120 #if ARCH_AARCH64 1121 c->wiener[0] = BF(dav1d_wiener_filter7, neon); 1122 c->wiener[1] = BF(dav1d_wiener_filter5, neon); 1123 #else 1124 c->wiener[0] = c->wiener[1] = wiener_filter_neon; 1125 #endif 1126 if (BITDEPTH == 8 || bpc == 10) { 1127 c->sgr[0] = sgr_filter_5x5_neon; 1128 c->sgr[1] = sgr_filter_3x3_neon; 1129 c->sgr[2] = sgr_filter_mix_neon; 1130 } 1131 }