looprestoration_tmpl.c (47500B)
/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Two Orioles, LLC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "config.h"

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include "common/attributes.h"
#include "common/bitdepth.h"
#include "common/intops.h"

#include "src/looprestoration.h"
#include "src/tables.h"

// Number of uint16_t entries in one intermediate row of the Wiener filter.
// 256 * 1.5 + 3 + 3 = 390
#define REST_UNIT_STRIDE (390)

// Horizontal 7-tap pass of the Wiener filter over one row of w pixels.
//
// dst:   receives w intermediate values, each clipped to [0, clip_limit - 1].
// left:  optional separate buffer of left-edge pixels; taps at x = -3..-1
//        are read from left[0][1..3] when it is non-NULL and LR_HAVE_LEFT
//        is set.
// src:   the row being filtered. When LR_HAVE_LEFT is set and left is NULL,
//        src[-3..-1] are read directly; when LR_HAVE_RIGHT is set,
//        src[w..w+2] are read directly.
// fh:    horizontal taps; only fh[0..6] are read.
// edges: when LR_HAVE_LEFT / LR_HAVE_RIGHT is not set, the missing pixels are
//        replaced by src[0] / src[w - 1] respectively.
static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4],
                            const pixel *src, const int16_t fh[8],
                            const int w, const enum LrEdgeFlags edges
                            HIGHBD_DECL_SUFFIX)
{
    const int bitdepth = bitdepth_from_max(bitdepth_max);
    // 12-bit input uses two more bits of rounding in the horizontal pass.
    const int round_bits_h = 3 + (bitdepth == 12) * 2;
    const int rounding_off_h = 1 << (round_bits_h - 1);
    const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);

    if (w < 6) {
        // For small widths, do the fully conditional loop with
        // conditions on each access.
        for (int x = 0; x < w; x++) {
            int sum = (1 << (bitdepth + 6));
#if BITDEPTH == 8
            // In 8 bpc builds the centre pixel gets an extra weight of 128
            // on top of the tap in fh[3].
            sum += src[x] * 128;
#endif
            for (int i = 0; i < 7; i++) {
                int idx = x + i - 3;
                if (idx < 0) {
                    if (!(edges & LR_HAVE_LEFT))
                        sum += src[0] * fh[i];
                    else if (left)
                        sum += left[0][4 + idx] * fh[i];
                    else
                        sum += src[idx] * fh[i];
                } else if (idx >= w && !(edges & LR_HAVE_RIGHT)) {
                    sum += src[w - 1] * fh[i];
                } else
                    sum += src[idx] * fh[i];
            }
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
            dst[x] = sum;
        }

        return;
    }

    // For larger widths, do separate loops with less conditions; first
    // handle the start of the row.
    int start = 3;
    if (!(edges & LR_HAVE_LEFT)) {
        // If there's no left edge, pad using the leftmost pixel.
        for (int x = 0; x < 3; x++) {
            int sum = (1 << (bitdepth + 6));
#if BITDEPTH == 8
            sum += src[x] * 128;
#endif
            for (int i = 0; i < 7; i++) {
                int idx = x + i - 3;
                if (idx < 0)
                    sum += src[0] * fh[i];
                else
                    sum += src[idx] * fh[i];
            }
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
            dst[x] = sum;
        }
    } else if (left) {
        // If we have the left edge and a separate left buffer, pad using that.
        for (int x = 0; x < 3; x++) {
            int sum = (1 << (bitdepth + 6));
#if BITDEPTH == 8
            sum += src[x] * 128;
#endif
            for (int i = 0; i < 7; i++) {
                int idx = x + i - 3;
                if (idx < 0)
                    sum += left[0][4 + idx] * fh[i];
                else
                    sum += src[idx] * fh[i];
            }
            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
            dst[x] = sum;
        }
    } else {
        // If we have the left edge, but no separate left buffer, we're in the
        // top/bottom area (lpf) with the left edge existing in the same
        // buffer; just do the regular loop from the start.
        start = 0;
    }
    // Without a right edge, the last 3 outputs need padding and are handled
    // by the final loop below.
    int end = w - 3;
    if (edges & LR_HAVE_RIGHT)
        end = w;

    // Do a condition free loop for the bulk of the row.
    for (int x = start; x < end; x++) {
        int sum = (1 << (bitdepth + 6));
#if BITDEPTH == 8
        sum += src[x] * 128;
#endif
        for (int i = 0; i < 7; i++) {
            int idx = x + i - 3;
            sum += src[idx] * fh[i];
        }
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
        dst[x] = sum;
    }

    // If we need to, calculate the end of the row with a condition for
    // right edge padding.
    for (int x = end; x < w; x++) {
        int sum = (1 << (bitdepth + 6));
#if BITDEPTH == 8
        sum += src[x] * 128;
#endif
        for (int i = 0; i < 7; i++) {
            int idx = x + i - 3;
            if (idx >= w)
                sum += src[w - 1] * fh[i];
            else
                sum += src[idx] * fh[i];
        }
        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
        dst[x] = sum;
    }
}

// Vertical 7-tap pass of the Wiener filter for one output row, using only
// the 6 stored intermediate rows ptrs[0..5]; ptrs[5] is used for both the
// 6th and the 7th tap.
//
// p:    receives w output pixels (clipped with iclip_pixel()).
// ptrs: window of intermediate rows; on return ptrs[0..4] have been shifted
//       down by one and ptrs[5] is unchanged.
// fv:   vertical taps; only fv[0..6] are read.
static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8],
                            const int w HIGHBD_DECL_SUFFIX)
{
    const int bitdepth = bitdepth_from_max(bitdepth_max);

    const int round_bits_v = 11 - (bitdepth == 12) * 2;
    const int rounding_off_v = 1 << (round_bits_v - 1);
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));

    for (int i = 0; i < w; i++) {
        int sum = -round_offset;

        // Only filter using 6 input rows. The 7th row is assumed to be
        // identical to the last one.
        //
        // This function is assumed to only be called at the end, when doing
        // padding at the bottom.
        for (int k = 0; k < 6; k++)
            sum += ptrs[k][i] * fv[k];
        sum += ptrs[5][i] * fv[6];

        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
    }

    // Shift the pointers, but only update the first 5; the 6th pointer is kept
    // as it was before (and the 7th is implicitly identical to the 6th).
    for (int i = 0; i < 5; i++)
        ptrs[i] = ptrs[i + 1];
}

// Combined step: horizontally filter one new input row (src/left) and
// produce one output row in p by vertically filtering ptrs[0..5] plus the
// new row.
//
// ptrs:   7 row pointers. ptrs[0..5] are read as inputs; the newly filtered
//         row is stored in the buffer ptrs[6] points at. On return the window
//         has been shifted down by one and ptrs[6] is set equal to the new
//         ptrs[0].
// filter: filter[0] holds the horizontal taps, filter[1] the vertical taps.
// Remaining parameters are forwarded to wiener_filter_h().
static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4],
                             const pixel *src, const int16_t filter[2][8],
                             const int w, const enum LrEdgeFlags edges
                             HIGHBD_DECL_SUFFIX)
{
    const int bitdepth = bitdepth_from_max(bitdepth_max);

    const int round_bits_v = 11 - (bitdepth == 12) * 2;
    const int rounding_off_v = 1 << (round_bits_v - 1);
    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));

    const int16_t *fh = filter[0];
    const int16_t *fv = filter[1];

    // Do combined horizontal and vertical filtering; doing horizontal
    // filtering of one row, combined with vertical filtering of 6
    // preexisting rows and the newly filtered row.

    // For simplicity in the C implementation, just do a separate call
    // of the horizontal filter, into a temporary buffer.
    uint16_t tmp[REST_UNIT_STRIDE];
    wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);

    for (int i = 0; i < w; i++) {
        int sum = -round_offset;

        // Filter using the 6 stored preexisting rows, and the newly
        // filtered one in tmp[].
        for (int k = 0; k < 6; k++)
            sum += ptrs[k][i] * fv[k];
        sum += tmp[i] * fv[6];
        // At this point, after having read all inputs at point [i], we
        // could overwrite [i] with the newly filtered data.

        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
    }

    // For simplicity in the C implementation, just memcpy the newly
    // filtered row into ptrs[6]. Normally, in steady state filtering,
    // this output row, ptrs[6], is equal to ptrs[0]. However at startup,
    // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1],
    // so we can't assume we can write into ptrs[0] but we need to keep
    // a separate pointer for the next row to write into.
    // NOTE(review): this copies the full REST_UNIT_STRIDE entries although
    // wiener_filter_h() only wrote the first w entries of tmp[].
    memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE);

    // Rotate the window of pointers. Shift the 6 pointers downwards one step.
    for (int i = 0; i < 6; i++)
        ptrs[i] = ptrs[i + 1];
    // The topmost pointer, ptrs[6], which isn't used as input, is set to
    // ptrs[0], which will be used as output for the next _hv call.
    // At the start of the filtering, the caller may set ptrs[6] to the
    // right next buffer to fill in, instead.
    ptrs[6] = ptrs[0];
}

// Wiener filter entry point for a block of w x h pixels, filtered in place.
//
// p:      first pixel of the block; used both as input and as output.
// stride: row stride of p and lpf, converted with PXSTRIDE().
// left:   per-row left-edge pixels, advanced by one entry per input row.
// lpf:    rows used above the block when LR_HAVE_TOP is set (2 rows are
//         read); the rows used below the block when LR_HAVE_BOTTOM is set
//         start at lpf + 6 rows (2 rows are read).
// params: params->filter[0] / [1] are the horizontal / vertical taps.
// edges:  which neighbouring edges are available.
//
// Rows that are not available above the block are replaced by repeating the
// first filtered row in the pointer window; rows missing below are handled
// by wiener_filter_v(), which repeats the last stored row.
// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
static void wiener_c(pixel *p, const ptrdiff_t stride,
                     const pixel (*left)[4],
                     const pixel *lpf, const int w, int h,
                     const LooprestorationParams *const params,
                     const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Values stored between horizontal and vertical filtering don't
    // fit in a uint8_t.
    uint16_t hor[6 * REST_UNIT_STRIDE];
    // ptrs[0..5] is the window of input rows for the vertical filter and
    // ptrs[6] the row the next _hv call writes; rows[] are the fixed slots
    // inside hor[].
    uint16_t *ptrs[7], *rows[6];
    for (int i = 0; i < 6; i++)
        rows[i] = &hor[i * REST_UNIT_STRIDE];
    const int16_t (*const filter)[8] = params->filter;
    const int16_t *fh = params->filter[0];
    const int16_t *fv = params->filter[1];
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    const pixel *src = p;
    if (edges & LR_HAVE_TOP) {
        // Two rows from lpf go to rows[0] and rows[1]; rows[0] is used twice
        // in the window. rows[2] is the first row of the block itself.
        ptrs[0] = rows[0];
        ptrs[1] = rows[0];
        ptrs[2] = rows[1];
        ptrs[3] = rows[2];
        ptrs[4] = rows[2];
        ptrs[5] = rows[2];

        wiener_filter_h(rows[0], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
        lpf += PXSTRIDE(stride);
        wiener_filter_h(rows[1], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);

        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v1;

        ptrs[4] = ptrs[5] = rows[3];
        wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v2;

        ptrs[5] = rows[4];
        wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;
    } else {
        // No rows above: every window entry starts out as the first
        // filtered row of the block.
        ptrs[0] = rows[0];
        ptrs[1] = rows[0];
        ptrs[2] = rows[0];
        ptrs[3] = rows[0];
        ptrs[4] = rows[0];
        ptrs[5] = rows[0];

        wiener_filter_h(rows[0], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v1;

        ptrs[4] = ptrs[5] = rows[1];
        wiener_filter_h(rows[1], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v2;

        ptrs[5] = rows[2];
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;

        // The window still holds duplicates of rows[0], so the output slot
        // is pointed explicitly at the next unused row for these two calls.
        ptrs[6] = rows[3];
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;

        ptrs[6] = rows[4];
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;
    }

    // Both branches arrive here with ptrs[5] == rows[4], so this selects
    // rows[5] as the next row to write.
    ptrs[6] = ptrs[5] + REST_UNIT_STRIDE;
    do {
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto v3;

    // Two rows below the block are available; feed them through the
    // combined filter (no separate left buffer for these rows).
    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
                     HIGHBD_TAIL_SUFFIX);
    lpf_bottom += PXSTRIDE(stride);
    p += PXSTRIDE(stride);

    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
                     HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
    // v1 / v2 / v3: emit the final 1 / 2 / 3 output rows with the
    // vertical-only filter.
v1:
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);

    return;

v3:
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
v2:
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
    goto v1;
}

// SGR

// Rotate the first n entries of both pointer arrays by one position: every
// entry moves one index down and the former entry 0 becomes entry n - 1.
static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n)
{
    int32_t *tmp32 = sumsq_ptrs[0];
    coef *tmpc = sum_ptrs[0];
    for (int i = 0; i < n - 1; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
        sum_ptrs[i] = sum_ptrs[i + 1];
    }
    sumsq_ptrs[n - 1] = tmp32;
    sum_ptrs[n - 1] = tmpc;
}

// Rotate two 5-entry pointer arrays by two positions: entries 2..4 move to
// 0..2, and the former entries 0 and 1 become entries 3 and 4.
static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs)
{
    int32_t *tmp32[2];
    coef *tmpc[2];
    for (int i = 0; i < 2; i++) {
        tmp32[i] = sumsq_ptrs[i];
        tmpc[i] = sum_ptrs[i];
    }
    for (int i = 0; i < 3; i++) {
        sumsq_ptrs[i] = sumsq_ptrs[i + 2];
        sum_ptrs[i] = sum_ptrs[i + 2];
    }
    for (int i = 0; i < 2; i++) {
        sumsq_ptrs[3 + i] = tmp32[i];
        sum_ptrs[3 + i] = tmpc[i];
    }
}

// Horizontal 3-pixel box sums for one row.
//
// For x = -1..w this stores, at index x + 1 of the passed-in arrays, the sum
// (sum) and the sum of squares (sumsq) of pixels x-1, x and x+1, i.e. w + 2
// entries in total.
// Pixels left of the row come from left[0][2..3] (LR_HAVE_LEFT and left
// non-NULL), from src[-2..-1] (LR_HAVE_LEFT and left NULL) or repeat src[0].
// Pixels at or beyond w are read from src when LR_HAVE_RIGHT is set,
// otherwise src[w - 1] is repeated.
static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum,
                                    const pixel (*left)[4],
                                    const pixel *src, const int w,
                                    const enum LrEdgeFlags edges)
{
    // Offset the outputs by one so that index x = -1 is valid below.
    sumsq++;
    sum++;
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
    for (int x = -1; x < w + 1; x++) {
        int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1];
        sum[x] = a + b + c;
        sumsq[x] = a * a + b * b + c * c;
        // Slide the 3-pixel window one step to the right.
        a = b;
        b = c;
    }
}

// Horizontal 5-pixel box sums for one row.
//
// For x = -1..w this stores, at index x + 1 of the passed-in arrays, the sum
// and the sum of squares of pixels x-2..x+2 (w + 2 entries in total).
// Pixels left of the row come from left[0][1..3] (LR_HAVE_LEFT and left
// non-NULL), from src[-3..-1] (LR_HAVE_LEFT and left NULL) or repeat src[0].
// Pixels at or beyond w are read from src when LR_HAVE_RIGHT is set,
// otherwise src[w - 1] is repeated.
static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum,
                                    const pixel (*left)[4],
                                    const pixel *src, const int w,
                                    const enum LrEdgeFlags edges)
{
    // Offset the outputs by one so that index x = -1 is valid below.
    sumsq++;
    sum++;
    int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0];
    int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
    int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
    int d = src[0];
    for (int x = -1; x < w + 1; x++) {
        int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1];
        sum[x] = a + b + c + d + e;
        sumsq[x] = a * a + b * b + c * c + d * d + e * e;
        // Slide the 5-pixel window one step to the right.
        a = b;
        b = c;
        c = d;
        d = e;
    }
}

// Compute both the 3-pixel and the 5-pixel horizontal box sums of one row
// by calling sgr_box3_row_h() and sgr_box5_row_h() on the same input.
static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3,
                            int32_t *sumsq5, coef *sum5,
                            const pixel (*left)[4],
                            const pixel *src, const int w,
                            const enum LrEdgeFlags edges)
{
    sgr_box3_row_h(sumsq3, sum3, left, src, w, edges);
    sgr_box5_row_h(sumsq5, sum5, left, src, w, edges);
}

// Add up 3 rows of horizontal box sums column by column.
// Reads sumsq[0..2] and sum[0..2] and writes w + 2 entries to sumsq_out and
// sum_out.
static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum,
                                    int32_t *sumsq_out, coef *sum_out,
                                    const int w)
{
    for (int x = 0; x < w + 2; x++) {
        int sq_a = sumsq[0][x];
        int sq_b = sumsq[1][x];
        int sq_c = sumsq[2][x];
        int s_a = sum[0][x];
        int s_b = sum[1][x];
        int s_c = sum[2][x];
        sumsq_out[x] = sq_a + sq_b + sq_c;
        sum_out[x] = s_a + s_b + s_c;
    }
}

// Add up 5 rows of horizontal box sums column by column.
// Reads sumsq[0..4] and sum[0..4] and writes w + 2 entries to sumsq_out and
// sum_out.
static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum,
                                    int32_t *sumsq_out, coef *sum_out,
                                    const int w)
{
    for (int x = 0; x < w + 2; x++) {
        int sq_a = sumsq[0][x];
        int sq_b = sumsq[1][x];
        int sq_c = sumsq[2][x];
        int sq_d = sumsq[3][x];
        int sq_e = sumsq[4][x];
        int s_a = sum[0][x];
        int s_b = sum[1][x];
        int s_c = sum[2][x];
        int s_d = sum[3][x];
        int s_e = sum[4][x];
        sumsq_out[x] = sq_a + sq_b + sq_c + sq_d + sq_e;
        sum_out[x] = s_a + s_b + s_c + s_d + s_e;
    }
}

// Convert one row of box sums into the per-position coefficients used by
// the sgr_finish_* functions, in place, for w + 2 entries.
//
// AA: on entry the sums of squares, on exit the value
//     (x * BB_in * sgr_one_by_x + (1 << 11)) >> 12.
// BB: on entry the sums, on exit x, a value looked up in dav1d_sgr_x_by_x[].
// s:  strength multiplier applied to p before the table lookup.
// n, sgr_one_by_x: callers in this file pass (9, 455) for the 3x3 box and
//     (25, 164) for the 5x5 box.
static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s,
                                     int bitdepth_max, int n, int sgr_one_by_x)
{
    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    for (int i = 0; i < w + 2; i++) {
        // Scale the sums by the bit depth above 8, with rounding, before
        // computing p.
        const int a =
            (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
        const int b =
            (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;

        const unsigned p = imax(a * n - b * b, 0);
        const unsigned z = (p * s + (1 << 19)) >> 20;
        // The table index is capped at 255.
        const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];

        // This is where we invert A and B, so that B is of size coef.
        AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
        BB[i] = x;
    }
}

// Vertical step for the 3x3 box: sum the 3 rows in sumsq/sum into
// sumsq_out/sum_out, convert them with sgr_calc_row_ab() (n = 9), then
// rotate the 3 input row pointers by one.
static void sgr_box3_vert(int32_t **sumsq, coef **sum,
                          int32_t *sumsq_out, coef *sum_out,
                          const int w, const int s, const int bitdepth_max)
{
    sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w);
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9, 455);
    rotate(sumsq, sum, 3);
}

// Vertical step for the 5x5 box: sum the 5 rows in sumsq/sum into
// sumsq_out/sum_out, convert them with sgr_calc_row_ab() (n = 25), then
// rotate the 5 input row pointers by two.
static void sgr_box5_vert(int32_t **sumsq, coef **sum,
                          int32_t *sumsq_out, coef *sum_out,
                          const int w, const int s, const int bitdepth_max)
{
    sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w);
    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25, 164);
    rotate5_x2(sumsq, sum);
}

// Combined step for the 3x3 box: compute the horizontal sums of one new
// input row into sumsq[2]/sum[2], then run sgr_box3_vert() to produce one
// row of coefficients in AA/BB.
static void sgr_box3_hv(int32_t **sumsq, coef **sum,
                        int32_t *AA, coef *BB,
                        const pixel (*left)[4],
                        const pixel *src, const int w,
                        const int s,
                        const enum LrEdgeFlags edges,
                        const int bitdepth_max)
{
    sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges);
    sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max);
}

// Produce one row of filter output in tmp[0..w-1] from three rows of
// coefficients (A_ptrs[0..2], B_ptrs[0..2]).
//
// For each pixel the 3x3 neighbourhood of coefficients is combined with
// weight 4 for the centre and its 4 direct neighbours and weight 3 for the
// 4 diagonal ones; the coefficient arrays are indexed at i + 1.
static NOINLINE void sgr_finish_filter_row1(coef *tmp,
                                            const pixel *src,
                                            int32_t **A_ptrs, coef **B_ptrs,
                                            const int w)
{
#define EIGHT_NEIGHBORS(P, i)\
    ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \
     (P[0][i - 1] + P[2][i - 1] +                           \
      P[0][i + 1] + P[2][i + 1]) * 3)
    for (int i = 0; i < w; i++) {
        const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1);
        const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1);
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
    }
#undef EIGHT_NEIGHBORS
}

// Distance in coef entries between consecutive rows of the temporary
// buffers written by sgr_finish_filter2() and read by sgr_weighted2().
#define FILTER_OUT_STRIDE (384)

// Produce one or two rows of filter output from two rows of coefficients
// (A_ptrs[0..1], B_ptrs[0..1]).
//
// The first output row (tmp[0..w-1]) combines both coefficient rows with
// weights 6 (same column) and 5 (adjacent columns) and shifts by 9.
// If h > 1, a second output row is written at tmp + FILTER_OUT_STRIDE for
// the next source row, using only coefficient row 1 with the same weights
// and a shift of 8.
static NOINLINE void sgr_finish_filter2(coef *tmp,
                                        const pixel *src,
                                        const ptrdiff_t src_stride,
                                        int32_t **A_ptrs, coef **B_ptrs,
                                        const int w, const int h)
{
#define SIX_NEIGHBORS(P, i)\
    ((P[0][i]     + P[1][i]) * 6 +   \
     (P[0][i - 1] + P[1][i - 1] +    \
      P[0][i + 1] + P[1][i + 1]) * 5)
    for (int i = 0; i < w; i++) {
        const int a = SIX_NEIGHBORS(B_ptrs, i + 1);
        const int b = SIX_NEIGHBORS(A_ptrs, i + 1);
        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
    }
    if (h <= 1)
        return;
    tmp += FILTER_OUT_STRIDE;
    src += PXSTRIDE(src_stride);
    // Skip the first entry so that index i lines up with pixel i.
    const int32_t *A = &A_ptrs[1][1];
    const coef *B = &B_ptrs[1][1];
    for (int i = 0; i < w; i++) {
        const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
        const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
        tmp[i] = (b - a * src[i] + (1 << 7)) >> 8;
    }
#undef SIX_NEIGHBORS
}

// Apply one row of filter output to the destination in place:
// dst[i] = clip(dst[i] + ((w1 * t1[i] + (1 << 10)) >> 11)) for i < w.
static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1,
                                       const int w, const int w1 HIGHBD_DECL_SUFFIX)
{
    for (int i = 0; i < w; i++) {
        const int v = w1 * t1[i];
        dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11));
    }
}

// Apply h rows of two filter outputs to the destination in place:
// dst[i] = clip(dst[i] + ((w0 * t1[i] + w1 * t2[i] + (1 << 10)) >> 11)).
// t1 and t2 advance by FILTER_OUT_STRIDE per row, dst by dst_stride.
static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride,
                                   const coef *t1, const coef *t2,
                                   const int w, const int h,
                                   const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            const int v = w0 * t1[i] + w1 * t2[i];
            dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11));
        }
        dst += PXSTRIDE(dst_stride);
        t1 += FILTER_OUT_STRIDE;
        t2 += FILTER_OUT_STRIDE;
    }
}

// Finish one output row for the 3x3 box: run sgr_finish_filter_row1() on
// the row at *dst, blend the result into it with weight w1, advance *dst by
// one row and rotate the 3 coefficient row pointers by one.
static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride,
                                 int32_t **A_ptrs, coef **B_ptrs, const int w,
                                 const int w1 HIGHBD_DECL_SUFFIX)
{
    // Only one single row, no stride needed
    ALIGN_STK_16(coef, tmp, 384,);

    sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w);
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
    *dst += PXSTRIDE(stride);
    rotate(A_ptrs, B_ptrs, 3);
}

// Finish one (h == 1) or two (h > 1) output rows for the 5x5 box: run
// sgr_finish_filter2() on the rows at *dst, blend each result row into the
// destination with weight w1, advance *dst past the rows written and swap
// the 2 coefficient row pointers.
static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride,
                                 int32_t **A_ptrs, coef **B_ptrs,
                                 const int w, const int h, const int w1
                                 HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,);

    sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h);
    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
    *dst += PXSTRIDE(stride);
    if (h > 1) {
        sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX);
        *dst += PXSTRIDE(stride);
    }
    rotate(A_ptrs, B_ptrs, 2);
}

// Finish one or two output rows using both boxes: the 5x5 output comes from
// sgr_finish_filter2() and the 3x3 output from sgr_finish_filter_row1()
// (called once per row, the second time with the coefficient window
// advanced by one row). Both are blended into the destination with weights
// w0 (5x5) and w1 (3x3); *dst advances by h rows and both coefficient
// windows are rotated.
static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride,
                                    int32_t **A5_ptrs, coef **B5_ptrs,
                                    int32_t **A3_ptrs, coef **B3_ptrs,
                                    const int w, const int h,
                                    const int w0, const int w1 HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,);
    ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,);

    sgr_finish_filter2(tmp5, *dst, stride, A5_ptrs, B5_ptrs, w, h);
    sgr_finish_filter_row1(tmp3, *dst, A3_ptrs, B3_ptrs, w);
    if (h > 1)
        sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride),
                               &A3_ptrs[1], &B3_ptrs[1], w);
    sgr_weighted2(*dst, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX);
    *dst += h*PXSTRIDE(stride);
    rotate(A5_ptrs, B5_ptrs, 2);
    rotate(A3_ptrs, B3_ptrs, 4);
}


// SGR filter entry point using only the 3x3 box, for a block of w x h
// pixels filtered in place.
//
// dst:    first pixel of the block; used both as input and as output.
// stride: row stride of dst and lpf, converted with PXSTRIDE().
// left:   per-row left-edge pixels, advanced by one entry per input row.
// lpf:    rows used above the block when LR_HAVE_TOP is set (2 rows are
//         read); the rows used below the block when LR_HAVE_BOTTOM is set
//         start at lpf + 6 rows (2 rows are read).
// params: params->sgr.s1 is passed as the strength, params->sgr.w1 as the
//         blend weight.
// edges:  which neighbouring edges are available.
static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride,
                      const pixel (*left)[4], const pixel *lpf,
                      const int w, int h,
                      const LooprestorationParams *const params,
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Row stride of the sum/coefficient buffers; also used by the other
    // SGR entry points further down in this file.
#define BUF_STRIDE (384 + 16)
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,);
    // *_rows are the fixed slots inside the buffers; *_ptrs is the rotating
    // window of the 3 most recent rows of horizontal sums.
    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
    coef *sum_ptrs[3], *sum_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,);
    // Rotating window of 3 coefficient rows; new rows are always written
    // through index 2.
    int32_t *A_ptrs[3];
    coef *B_ptrs[3];
    for (int i = 0; i < 3; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[1];
        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[1];
        sum_ptrs[2] = sum_rows[2];

        // The two rows above the block come from lpf.
        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);

        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_1;

        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_2;
    } else {
        // No rows above: all three window entries start out as the first
        // row of the block.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];

        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                      w, params->sgr.s1, BITDEPTH_MAX);
        rotate(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[2] = sumsq_rows[1];
        sum_ptrs[2] = sum_rows[1];

        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);
        rotate(A_ptrs, B_ptrs, 3);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[2] = sumsq_rows[2];
        sum_ptrs[2] = sum_rows[2];
    }

    // Steady state: each iteration consumes one input row and emits one
    // output row.
    do {
        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
        left++;
        src += PXSTRIDE(stride);

        sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
                    w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    // Two rows below the block are available from lpf_bottom; use them to
    // emit the last two output rows.
    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    lpf_bottom += PXSTRIDE(stride);

    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);

    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

    // The labels below pad at the bottom by reusing the previous row of
    // sums (sumsq_ptrs[1]/sum_ptrs[1]) as the newest row.
vert_2:
    // Two output rows remain.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                  w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);

output_1:
    // One output row remains.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                  w, params->sgr.s1, BITDEPTH_MAX);

    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Reached when the block has a single row: compute one more padded
    // coefficient row without emitting output, then finish via output_1.
    sumsq_ptrs[2] = sumsq_ptrs[1];
    sum_ptrs[2] = sum_ptrs[1];
    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
                  w, params->sgr.s1, BITDEPTH_MAX);
    rotate(A_ptrs, B_ptrs, 3);
    goto output_1;
}

// SGR filter entry point using only the 5x5 box, for a block of w x h
// pixels filtered in place. Coefficient rows are computed once per two
// input rows, and output is normally emitted two rows at a time.
//
// dst:    first pixel of the block; used both as input and as output.
// stride: row stride of dst and lpf, converted with PXSTRIDE().
// left:   per-row left-edge pixels, advanced by one entry per input row.
// lpf:    rows used above the block when LR_HAVE_TOP is set (2 rows are
//         read); the rows used below the block when LR_HAVE_BOTTOM is set
//         start at lpf + 6 rows (2 rows are read).
// params: params->sgr.s0 is passed as the strength, params->sgr.w0 as the
//         blend weight.
// edges:  which neighbouring edges are available.
static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride,
                      const pixel (*left)[4], const pixel *lpf,
                      const int w, int h,
                      const LooprestorationParams *const params,
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,);
    // *_rows are the fixed slots inside the buffers; *_ptrs is the rotating
    // window of 5 rows of horizontal sums.
    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
    coef *sum_ptrs[5], *sum_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,);
    // Window of 2 coefficient rows; new rows are always written through
    // index 1.
    int32_t *A_ptrs[2];
    coef *B_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        // The first lpf row (rows[0]) is used twice in the window.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[1];
        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[1];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
        lpf += PXSTRIDE(stride);
        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);

        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);
        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        rotate(A_ptrs, B_ptrs, 2);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    } else {
        // No rows above: all five window entries start out as the first
        // row of the block.
        sumsq_ptrs[0] = sumsq_rows[0];
        sumsq_ptrs[1] = sumsq_rows[0];
        sumsq_ptrs[2] = sumsq_rows[0];
        sumsq_ptrs[3] = sumsq_rows[0];
        sumsq_ptrs[4] = sumsq_rows[0];
        sum_ptrs[0] = sum_rows[0];
        sum_ptrs[1] = sum_rows[0];
        sum_ptrs[2] = sum_rows[0];
        sum_ptrs[3] = sum_rows[0];
        sum_ptrs[4] = sum_rows[0];

        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto vert_1;

        sumsq_ptrs[4] = sumsq_rows[1];
        sum_ptrs[4] = sum_rows[1];

        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        rotate(A_ptrs, B_ptrs, 2);

        if (--h <= 0)
            goto vert_2;

        sumsq_ptrs[3] = sumsq_rows[2];
        sumsq_ptrs[4] = sumsq_rows[3];
        sum_ptrs[3] = sum_rows[2];
        sum_ptrs[4] = sum_rows[3];

        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

        if (--h <= 0)
            goto vert_2;

        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
        // one of them to point at the previously unused rows[4].
        sumsq_ptrs[3] = sumsq_rows[4];
        sum_ptrs[3] = sum_rows[4];
    }

    // Steady state: each iteration consumes two input rows and emits two
    // output rows; if the input ends after the first of the two, finish via
    // the odd label.
    do {
        sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto odd;

        sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges);
        left++;
        src += PXSTRIDE(stride);

        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                      w, params->sgr.s0, BITDEPTH_MAX);
        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    } while (--h > 0);

    if (!(edges & LR_HAVE_BOTTOM))
        goto vert_2;

    // Two rows below the block are available from lpf_bottom.
    sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges);
    lpf_bottom += PXSTRIDE(stride);
    sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges);

output_2:
    // Emit the final two output rows.
    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_2:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];
    goto output_2;

odd:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);

output_1:
    // Duplicate the last row twice more
    sumsq_ptrs[3] = sumsq_ptrs[2];
    sumsq_ptrs[4] = sumsq_ptrs[2];
    sum_ptrs[3] = sum_ptrs[2];
    sum_ptrs[4] = sum_ptrs[2];

    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    // Output only one row
    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
                w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    return;

vert_1:
    // Copy the last row as padding once
    sumsq_ptrs[4] = sumsq_ptrs[3];
    sum_ptrs[4] = sum_ptrs[3];

    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
                  w, params->sgr.s0, BITDEPTH_MAX);
    rotate(A_ptrs, B_ptrs, 2);

    goto output_1;
}

static void sgr_mix_c(pixel *dst, const ptrdiff_t stride,
                      const pixel (*left)[4], const pixel *lpf,
                      const int w, int h,
                      const LooprestorationParams *const params,
                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
    ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,);
    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
    coef *sum5_ptrs[5], *sum5_rows[5];
    for (int i = 0; i < 5; i++) {
        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
    ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,);
    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
    coef *sum3_ptrs[3], *sum3_rows[3];
    for (int i = 0; i < 3; i++) {
        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
    }

    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
    ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,);
    int32_t *A5_ptrs[2];
    coef *B5_ptrs[2];
    for (int i = 0; i < 2; i++) {
        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
    }
    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
    ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,);
    int32_t *A3_ptrs[4];
    coef *B3_ptrs[4];
    for (int i = 0; i < 4; i++) {
        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
    }
    const pixel *src = dst;
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    if (edges & LR_HAVE_TOP) {
        sumsq5_ptrs[0] = sumsq5_rows[0];
sumsq5_ptrs[1] = sumsq5_rows[0]; 1072 sumsq5_ptrs[2] = sumsq5_rows[1]; 1073 sumsq5_ptrs[3] = sumsq5_rows[2]; 1074 sumsq5_ptrs[4] = sumsq5_rows[3]; 1075 sum5_ptrs[0] = sum5_rows[0]; 1076 sum5_ptrs[1] = sum5_rows[0]; 1077 sum5_ptrs[2] = sum5_rows[1]; 1078 sum5_ptrs[3] = sum5_rows[2]; 1079 sum5_ptrs[4] = sum5_rows[3]; 1080 1081 sumsq3_ptrs[0] = sumsq3_rows[0]; 1082 sumsq3_ptrs[1] = sumsq3_rows[1]; 1083 sumsq3_ptrs[2] = sumsq3_rows[2]; 1084 sum3_ptrs[0] = sum3_rows[0]; 1085 sum3_ptrs[1] = sum3_rows[1]; 1086 sum3_ptrs[2] = sum3_rows[2]; 1087 1088 sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0], 1089 sumsq5_rows[0], sum5_rows[0], 1090 NULL, lpf, w, edges); 1091 lpf += PXSTRIDE(stride); 1092 sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1], 1093 sumsq5_rows[1], sum5_rows[1], 1094 NULL, lpf, w, edges); 1095 1096 sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2], 1097 sumsq5_rows[2], sum5_rows[2], 1098 left, src, w, edges); 1099 left++; 1100 src += PXSTRIDE(stride); 1101 1102 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1103 w, params->sgr.s1, BITDEPTH_MAX); 1104 rotate(A3_ptrs, B3_ptrs, 4); 1105 1106 if (--h <= 0) 1107 goto vert_1; 1108 1109 sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], 1110 sumsq5_rows[3], sum5_rows[3], 1111 left, src, w, edges); 1112 left++; 1113 src += PXSTRIDE(stride); 1114 sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1115 w, params->sgr.s0, BITDEPTH_MAX); 1116 rotate(A5_ptrs, B5_ptrs, 2); 1117 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1118 w, params->sgr.s1, BITDEPTH_MAX); 1119 rotate(A3_ptrs, B3_ptrs, 4); 1120 1121 if (--h <= 0) 1122 goto vert_2; 1123 1124 // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set 1125 // one of them to point at the previously unused rows[4]. 
1126 sumsq5_ptrs[3] = sumsq5_rows[4]; 1127 sum5_ptrs[3] = sum5_rows[4]; 1128 } else { 1129 sumsq5_ptrs[0] = sumsq5_rows[0]; 1130 sumsq5_ptrs[1] = sumsq5_rows[0]; 1131 sumsq5_ptrs[2] = sumsq5_rows[0]; 1132 sumsq5_ptrs[3] = sumsq5_rows[0]; 1133 sumsq5_ptrs[4] = sumsq5_rows[0]; 1134 sum5_ptrs[0] = sum5_rows[0]; 1135 sum5_ptrs[1] = sum5_rows[0]; 1136 sum5_ptrs[2] = sum5_rows[0]; 1137 sum5_ptrs[3] = sum5_rows[0]; 1138 sum5_ptrs[4] = sum5_rows[0]; 1139 1140 sumsq3_ptrs[0] = sumsq3_rows[0]; 1141 sumsq3_ptrs[1] = sumsq3_rows[0]; 1142 sumsq3_ptrs[2] = sumsq3_rows[0]; 1143 sum3_ptrs[0] = sum3_rows[0]; 1144 sum3_ptrs[1] = sum3_rows[0]; 1145 sum3_ptrs[2] = sum3_rows[0]; 1146 1147 sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0], 1148 sumsq5_rows[0], sum5_rows[0], 1149 left, src, w, edges); 1150 left++; 1151 src += PXSTRIDE(stride); 1152 1153 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1154 w, params->sgr.s1, BITDEPTH_MAX); 1155 rotate(A3_ptrs, B3_ptrs, 4); 1156 1157 if (--h <= 0) 1158 goto vert_1; 1159 1160 sumsq5_ptrs[4] = sumsq5_rows[1]; 1161 sum5_ptrs[4] = sum5_rows[1]; 1162 1163 sumsq3_ptrs[2] = sumsq3_rows[1]; 1164 sum3_ptrs[2] = sum3_rows[1]; 1165 1166 sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1], 1167 sumsq5_rows[1], sum5_rows[1], 1168 left, src, w, edges); 1169 left++; 1170 src += PXSTRIDE(stride); 1171 1172 sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1173 w, params->sgr.s0, BITDEPTH_MAX); 1174 rotate(A5_ptrs, B5_ptrs, 2); 1175 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1176 w, params->sgr.s1, BITDEPTH_MAX); 1177 rotate(A3_ptrs, B3_ptrs, 4); 1178 1179 if (--h <= 0) 1180 goto vert_2; 1181 1182 sumsq5_ptrs[3] = sumsq5_rows[2]; 1183 sumsq5_ptrs[4] = sumsq5_rows[3]; 1184 sum5_ptrs[3] = sum5_rows[2]; 1185 sum5_ptrs[4] = sum5_rows[3]; 1186 1187 sumsq3_ptrs[2] = sumsq3_rows[2]; 1188 sum3_ptrs[2] = sum3_rows[2]; 1189 1190 sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2], 1191 sumsq5_rows[2], sum5_rows[2], 1192 left, src, w, 
edges); 1193 left++; 1194 src += PXSTRIDE(stride); 1195 1196 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1197 w, params->sgr.s1, BITDEPTH_MAX); 1198 rotate(A3_ptrs, B3_ptrs, 4); 1199 1200 if (--h <= 0) 1201 goto odd; 1202 1203 sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], 1204 sumsq5_rows[3], sum5_rows[3], 1205 left, src, w, edges); 1206 left++; 1207 src += PXSTRIDE(stride); 1208 1209 sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1210 w, params->sgr.s0, BITDEPTH_MAX); 1211 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1212 w, params->sgr.s1, BITDEPTH_MAX); 1213 sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, 1214 w, 2, params->sgr.w0, params->sgr.w1 1215 HIGHBD_TAIL_SUFFIX); 1216 1217 if (--h <= 0) 1218 goto vert_2; 1219 1220 // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set 1221 // one of them to point at the previously unused rows[4]. 1222 sumsq5_ptrs[3] = sumsq5_rows[4]; 1223 sum5_ptrs[3] = sum5_rows[4]; 1224 } 1225 1226 do { 1227 sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], 1228 sumsq5_ptrs[3], sum5_ptrs[3], 1229 left, src, w, edges); 1230 left++; 1231 src += PXSTRIDE(stride); 1232 1233 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1234 w, params->sgr.s1, BITDEPTH_MAX); 1235 rotate(A3_ptrs, B3_ptrs, 4); 1236 1237 if (--h <= 0) 1238 goto odd; 1239 1240 sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], 1241 sumsq5_ptrs[4], sum5_ptrs[4], 1242 left, src, w, edges); 1243 left++; 1244 src += PXSTRIDE(stride); 1245 1246 sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1247 w, params->sgr.s0, BITDEPTH_MAX); 1248 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1249 w, params->sgr.s1, BITDEPTH_MAX); 1250 sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, 1251 w, 2, params->sgr.w0, params->sgr.w1 1252 HIGHBD_TAIL_SUFFIX); 1253 } while (--h > 0); 1254 1255 if (!(edges & LR_HAVE_BOTTOM)) 1256 goto vert_2; 1257 1258 
sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], 1259 sumsq5_ptrs[3], sum5_ptrs[3], 1260 NULL, lpf_bottom, w, edges); 1261 lpf_bottom += PXSTRIDE(stride); 1262 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1263 w, params->sgr.s1, BITDEPTH_MAX); 1264 rotate(A3_ptrs, B3_ptrs, 4); 1265 1266 sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2], 1267 sumsq5_ptrs[4], sum5_ptrs[4], 1268 NULL, lpf_bottom, w, edges); 1269 1270 output_2: 1271 sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1272 w, params->sgr.s0, BITDEPTH_MAX); 1273 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1274 w, params->sgr.s1, BITDEPTH_MAX); 1275 sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, 1276 w, 2, params->sgr.w0, params->sgr.w1 1277 HIGHBD_TAIL_SUFFIX); 1278 return; 1279 1280 vert_2: 1281 // Duplicate the last row twice more 1282 sumsq5_ptrs[3] = sumsq5_ptrs[2]; 1283 sumsq5_ptrs[4] = sumsq5_ptrs[2]; 1284 sum5_ptrs[3] = sum5_ptrs[2]; 1285 sum5_ptrs[4] = sum5_ptrs[2]; 1286 1287 sumsq3_ptrs[2] = sumsq3_ptrs[1]; 1288 sum3_ptrs[2] = sum3_ptrs[1]; 1289 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1290 w, params->sgr.s1, BITDEPTH_MAX); 1291 rotate(A3_ptrs, B3_ptrs, 4); 1292 1293 sumsq3_ptrs[2] = sumsq3_ptrs[1]; 1294 sum3_ptrs[2] = sum3_ptrs[1]; 1295 1296 goto output_2; 1297 1298 odd: 1299 // Copy the last row as padding once 1300 sumsq5_ptrs[4] = sumsq5_ptrs[3]; 1301 sum5_ptrs[4] = sum5_ptrs[3]; 1302 1303 sumsq3_ptrs[2] = sumsq3_ptrs[1]; 1304 sum3_ptrs[2] = sum3_ptrs[1]; 1305 1306 sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1307 w, params->sgr.s0, BITDEPTH_MAX); 1308 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1309 w, params->sgr.s1, BITDEPTH_MAX); 1310 sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, 1311 w, 2, params->sgr.w0, params->sgr.w1 1312 HIGHBD_TAIL_SUFFIX); 1313 1314 output_1: 1315 // Duplicate the last row twice more 1316 sumsq5_ptrs[3] = sumsq5_ptrs[2]; 1317 
sumsq5_ptrs[4] = sumsq5_ptrs[2]; 1318 sum5_ptrs[3] = sum5_ptrs[2]; 1319 sum5_ptrs[4] = sum5_ptrs[2]; 1320 1321 sumsq3_ptrs[2] = sumsq3_ptrs[1]; 1322 sum3_ptrs[2] = sum3_ptrs[1]; 1323 1324 sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1325 w, params->sgr.s0, BITDEPTH_MAX); 1326 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1327 w, params->sgr.s1, BITDEPTH_MAX); 1328 rotate(A3_ptrs, B3_ptrs, 4); 1329 // Output only one row 1330 sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs, 1331 w, 1, params->sgr.w0, params->sgr.w1 1332 HIGHBD_TAIL_SUFFIX); 1333 return; 1334 1335 vert_1: 1336 // Copy the last row as padding once 1337 sumsq5_ptrs[4] = sumsq5_ptrs[3]; 1338 sum5_ptrs[4] = sum5_ptrs[3]; 1339 1340 sumsq3_ptrs[2] = sumsq3_ptrs[1]; 1341 sum3_ptrs[2] = sum3_ptrs[1]; 1342 1343 sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1], 1344 w, params->sgr.s0, BITDEPTH_MAX); 1345 rotate(A5_ptrs, B5_ptrs, 2); 1346 sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3], 1347 w, params->sgr.s1, BITDEPTH_MAX); 1348 rotate(A3_ptrs, B3_ptrs, 4); 1349 1350 goto output_1; 1351 } 1352 1353 #if HAVE_ASM 1354 #if ARCH_AARCH64 || ARCH_ARM 1355 #include "src/arm/looprestoration.h" 1356 #elif ARCH_LOONGARCH64 1357 #include "src/loongarch/looprestoration.h" 1358 #elif ARCH_PPC64LE 1359 #include "src/ppc/looprestoration.h" 1360 #elif ARCH_X86 1361 #include "src/x86/looprestoration.h" 1362 #endif 1363 #endif 1364 1365 COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c, 1366 const int bpc) 1367 { 1368 c->wiener[0] = c->wiener[1] = wiener_c; 1369 c->sgr[0] = sgr_5x5_c; 1370 c->sgr[1] = sgr_3x3_c; 1371 c->sgr[2] = sgr_mix_c; 1372 1373 #if HAVE_ASM 1374 #if ARCH_AARCH64 || ARCH_ARM 1375 loop_restoration_dsp_init_arm(c, bpc); 1376 #elif ARCH_LOONGARCH64 1377 loop_restoration_dsp_init_loongarch(c, bpc); 1378 #elif ARCH_PPC64LE 1379 loop_restoration_dsp_init_ppc(c, bpc); 1380 #elif ARCH_X86 1381 
loop_restoration_dsp_init_x86(c, bpc); 1382 #endif 1383 #endif 1384 }