tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

looprestoration_tmpl.c (47500B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2018, Two Orioles, LLC
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "config.h"
     29 
     30 #include <stdint.h>
     31 #include <stdlib.h>
     32 #include <string.h>
     33 
     34 #include "common/attributes.h"
     35 #include "common/bitdepth.h"
     36 #include "common/intops.h"
     37 
     38 #include "src/looprestoration.h"
     39 #include "src/tables.h"
     40 
// Number of uint16_t elements in one intermediate row of the Wiener filter:
// it sizes tmp[] in wiener_filter_hv() and each of the six rows of hor[] in
// wiener_c().
// 256 * 1.5 + 3 + 3 = 390
#define REST_UNIT_STRIDE (390)
     43 
     44 static void wiener_filter_h(uint16_t *dst, const pixel (*left)[4],
     45                            const pixel *src, const int16_t fh[8],
     46                            const int w, const enum LrEdgeFlags edges
     47                            HIGHBD_DECL_SUFFIX)
     48 {
     49    const int bitdepth = bitdepth_from_max(bitdepth_max);
     50    const int round_bits_h = 3 + (bitdepth == 12) * 2;
     51    const int rounding_off_h = 1 << (round_bits_h - 1);
     52    const int clip_limit = 1 << (bitdepth + 1 + 7 - round_bits_h);
     53 
     54    if (w < 6) {
     55        // For small widths, do the fully conditional loop with
     56        // conditions on each access.
     57        for (int x = 0; x < w; x++) {
     58            int sum = (1 << (bitdepth + 6));
     59 #if BITDEPTH == 8
     60            sum += src[x] * 128;
     61 #endif
     62            for (int i = 0; i < 7; i++) {
     63                int idx = x + i - 3;
     64                if (idx < 0) {
     65                    if (!(edges & LR_HAVE_LEFT))
     66                        sum += src[0] * fh[i];
     67                    else if (left)
     68                        sum += left[0][4 + idx] * fh[i];
     69                    else
     70                        sum += src[idx] * fh[i];
     71                } else if (idx >= w && !(edges & LR_HAVE_RIGHT)) {
     72                    sum += src[w - 1] * fh[i];
     73                } else
     74                    sum += src[idx] * fh[i];
     75            }
     76            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
     77            dst[x] = sum;
     78        }
     79 
     80        return;
     81    }
     82 
     83    // For larger widths, do separate loops with less conditions; first
     84    // handle the start of the row.
     85    int start = 3;
     86    if (!(edges & LR_HAVE_LEFT)) {
     87        // If there's no left edge, pad using the leftmost pixel.
     88        for (int x = 0; x < 3; x++) {
     89            int sum = (1 << (bitdepth + 6));
     90 #if BITDEPTH == 8
     91            sum += src[x] * 128;
     92 #endif
     93            for (int i = 0; i < 7; i++) {
     94                int idx = x + i - 3;
     95                if (idx < 0)
     96                    sum += src[0] * fh[i];
     97                else
     98                    sum += src[idx] * fh[i];
     99            }
    100            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
    101            dst[x] = sum;
    102        }
    103    } else if (left) {
    104        // If we have the left edge and a separate left buffer, pad using that.
    105        for (int x = 0; x < 3; x++) {
    106            int sum = (1 << (bitdepth + 6));
    107 #if BITDEPTH == 8
    108            sum += src[x] * 128;
    109 #endif
    110            for (int i = 0; i < 7; i++) {
    111                int idx = x + i - 3;
    112                if (idx < 0)
    113                    sum += left[0][4 + idx] * fh[i];
    114                else
    115                    sum += src[idx] * fh[i];
    116            }
    117            sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
    118            dst[x] = sum;
    119        }
    120    } else {
    121        // If we have the left edge, but no separate left buffer, we're in the
    122        // top/bottom area (lpf) with the left edge existing in the same
    123        // buffer; just do the regular loop from the start.
    124        start = 0;
    125    }
    126    int end = w - 3;
    127    if (edges & LR_HAVE_RIGHT)
    128        end = w;
    129 
    130    // Do a condititon free loop for the bulk of the row.
    131    for (int x = start; x < end; x++) {
    132        int sum = (1 << (bitdepth + 6));
    133 #if BITDEPTH == 8
    134        sum += src[x] * 128;
    135 #endif
    136        for (int i = 0; i < 7; i++) {
    137            int idx = x + i - 3;
    138            sum += src[idx] * fh[i];
    139        }
    140        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
    141        dst[x] = sum;
    142    }
    143 
    144    // If we need to, calculate the end of the row with a condition for
    145    // right edge padding.
    146    for (int x = end; x < w; x++) {
    147        int sum = (1 << (bitdepth + 6));
    148 #if BITDEPTH == 8
    149        sum += src[x] * 128;
    150 #endif
    151        for (int i = 0; i < 7; i++) {
    152            int idx = x + i - 3;
    153            if (idx >= w)
    154                sum += src[w - 1] * fh[i];
    155            else
    156                sum += src[idx] * fh[i];
    157        }
    158        sum = iclip((sum + rounding_off_h) >> round_bits_h, 0, clip_limit - 1);
    159        dst[x] = sum;
    160    }
    161 }
    162 
    163 static void wiener_filter_v(pixel *p, uint16_t **ptrs, const int16_t fv[8],
    164                            const int w HIGHBD_DECL_SUFFIX)
    165 {
    166    const int bitdepth = bitdepth_from_max(bitdepth_max);
    167 
    168    const int round_bits_v = 11 - (bitdepth == 12) * 2;
    169    const int rounding_off_v = 1 << (round_bits_v - 1);
    170    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
    171 
    172    for (int i = 0; i < w; i++) {
    173        int sum = -round_offset;
    174 
    175        // Only filter using 6 input rows. The 7th row is assumed to be
    176        // identical to the last one.
    177        //
    178        // This function is assumed to only be called at the end, when doing
    179        // padding at the bottom.
    180        for (int k = 0; k < 6; k++)
    181            sum += ptrs[k][i] * fv[k];
    182        sum += ptrs[5][i] * fv[6];
    183 
    184        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
    185    }
    186 
    187    // Shift the pointers, but only update the first 5; the 6th pointer is kept
    188    // as it was before (and the 7th is implicitly identical to the 6th).
    189    for (int i = 0; i < 5; i++)
    190        ptrs[i] = ptrs[i + 1];
    191 }
    192 
    193 static void wiener_filter_hv(pixel *p, uint16_t **ptrs, const pixel (*left)[4],
    194                             const pixel *src, const int16_t filter[2][8],
    195                             const int w, const enum LrEdgeFlags edges
    196                             HIGHBD_DECL_SUFFIX)
    197 {
    198    const int bitdepth = bitdepth_from_max(bitdepth_max);
    199 
    200    const int round_bits_v = 11 - (bitdepth == 12) * 2;
    201    const int rounding_off_v = 1 << (round_bits_v - 1);
    202    const int round_offset = 1 << (bitdepth + (round_bits_v - 1));
    203 
    204    const int16_t *fh = filter[0];
    205    const int16_t *fv = filter[1];
    206 
    207    // Do combined horziontal and vertical filtering; doing horizontal
    208    // filtering of one row, combined with vertical filtering of 6
    209    // preexisting rows and the newly filtered row.
    210 
    211    // For simplicity in the C implementation, just do a separate call
    212    // of the horizontal filter, into a temporary buffer.
    213    uint16_t tmp[REST_UNIT_STRIDE];
    214    wiener_filter_h(tmp, left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
    215 
    216    for (int i = 0; i < w; i++) {
    217        int sum = -round_offset;
    218 
    219        // Filter using the 6 stored preexisting rows, and the newly
    220        // filtered one in tmp[].
    221        for (int k = 0; k < 6; k++)
    222            sum += ptrs[k][i] * fv[k];
    223        sum += tmp[i] * fv[6];
    224        // At this point, after having read all inputs at point [i], we
    225        // could overwrite [i] with the newly filtered data.
    226 
    227        p[i] = iclip_pixel((sum + rounding_off_v) >> round_bits_v);
    228    }
    229 
    230    // For simplicity in the C implementation, just memcpy the newly
    231    // filtered row into ptrs[6]. Normally, in steady state filtering,
    232    // this output row, ptrs[6], is equal to ptrs[0]. However at startup,
    233    // at the top of the filtered area, we may have ptrs[0] equal to ptrs[1],
    234    // so we can't assume we can write into ptrs[0] but we need to keep
    235    // a separate pointer for the next row to write into.
    236    memcpy(ptrs[6], tmp, sizeof(uint16_t) * REST_UNIT_STRIDE);
    237 
    238    // Rotate the window of pointers. Shift the 6 pointers downwards one step.
    239    for (int i = 0; i < 6; i++)
    240        ptrs[i] = ptrs[i + 1];
    241    // The topmost pointer, ptrs[6], which isn't used as input, is set to
    242    // ptrs[0], which will be used as output for the next _hv call.
    243    // At the start of the filtering, the caller may set ptrs[6] to the
    244    // right next buffer to fill in, instead.
    245    ptrs[6] = ptrs[0];
    246 }
    247 
    248 // FIXME Could split into luma and chroma specific functions,
    249 // (since first and last tops are always 0 for chroma)
// C implementation of the Wiener loop-restoration filter.
//
// Filters h rows of w pixels in place, starting at p; consecutive rows are
// PXSTRIDE(stride) pixels apart. Every input row is first filtered
// horizontally by wiener_filter_h() into one of the six rows of hor[]. Output
// rows are produced from a sliding window of row pointers (ptrs[]), either by
// wiener_filter_hv() while input rows remain or by wiener_filter_v() when
// the bottom has to be padded.
//
//   left   - per-row left-edge pixels, advanced once per row of p; NULL is
//            passed instead for the rows read from lpf.
//   lpf    - extra rows used above the first row when LR_HAVE_TOP is set
//            (lpf and lpf + stride) and below the last row when
//            LR_HAVE_BOTTOM is set (lpf + 6 * stride and the row after it).
//   params - supplies the horizontal (filter[0]) and vertical (filter[1])
//            taps.
//   edges  - LR_HAVE_* flags selecting which neighbours are available.
static void wiener_c(pixel *p, const ptrdiff_t stride,
                     const pixel (*left)[4],
                     const pixel *lpf, const int w, int h,
                     const LooprestorationParams *const params,
                     const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
{
    // Values stored between horizontal and vertical filtering don't
    // fit in a uint8_t.
    uint16_t hor[6 * REST_UNIT_STRIDE];
    // ptrs[0..5] is the window of rows fed to the vertical filter and ptrs[6]
    // is where wiener_filter_hv() stores its new row; rows[i] is the fixed
    // address of the i-th row inside hor[].
    uint16_t *ptrs[7], *rows[6];
    for (int i = 0; i < 6; i++)
        rows[i] = &hor[i * REST_UNIT_STRIDE];
    const int16_t (*const filter)[8] = params->filter;
    const int16_t *fh = params->filter[0];
    const int16_t *fv = params->filter[1];
    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);

    // src runs ahead of p: it is the next row to filter horizontally, while
    // p is the next row to output.
    const pixel *src = p;
    if (edges & LR_HAVE_TOP) {
        // Initial window: first lpf row (twice), second lpf row, then the
        // first row of p repeated three times. The repeated entries are
        // replaced below as more rows of p become available.
        ptrs[0] = rows[0];
        ptrs[1] = rows[0];
        ptrs[2] = rows[1];
        ptrs[3] = rows[2];
        ptrs[4] = rows[2];
        ptrs[5] = rows[2];

        wiener_filter_h(rows[0], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);
        lpf += PXSTRIDE(stride);
        wiener_filter_h(rows[1], NULL, lpf, fh, w, edges HIGHBD_TAIL_SUFFIX);

        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        // Only one row to filter: emit it with a single vertical-only pass.
        if (--h <= 0)
            goto v1;

        ptrs[4] = ptrs[5] = rows[3];
        wiener_filter_h(rows[3], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        // Two rows in total: two vertical-only passes.
        if (--h <= 0)
            goto v2;

        ptrs[5] = rows[4];
        wiener_filter_h(rows[4], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        // Three rows in total: three vertical-only passes.
        if (--h <= 0)
            goto v3;
    } else {
        // No rows above: the whole window starts out as the first row of p.
        ptrs[0] = rows[0];
        ptrs[1] = rows[0];
        ptrs[2] = rows[0];
        ptrs[3] = rows[0];
        ptrs[4] = rows[0];
        ptrs[5] = rows[0];

        wiener_filter_h(rows[0], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v1;

        ptrs[4] = ptrs[5] = rows[1];
        wiener_filter_h(rows[1], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v2;

        ptrs[5] = rows[2];
        wiener_filter_h(rows[2], left, src, fh, w, edges HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;

        // The next two rows are stored in rows[3] and rows[4], which are
        // still unused; ptrs[6] is set explicitly because the window still
        // contains duplicates of rows[0].
        ptrs[6] = rows[3];
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;

        ptrs[6] = rows[4];
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);

        if (--h <= 0)
            goto v3;
    }

    // In both branches ptrs[5] is rows[4] at this point, so the next row to
    // fill is the one right after it in hor[] (rows[5]). From here on
    // wiener_filter_hv() maintains ptrs[6] itself.
    ptrs[6] = ptrs[5] + REST_UNIT_STRIDE;
    // Steady state: one input row in, one output row out.
    do {
        wiener_filter_hv(p, ptrs, left, src, filter, w, edges
                         HIGHBD_TAIL_SUFFIX);
        left++;
        src += PXSTRIDE(stride);
        p += PXSTRIDE(stride);
    } while (--h > 0);

    // Three output rows are still pending. Without rows below, all three are
    // produced by vertical-only passes.
    if (!(edges & LR_HAVE_BOTTOM))
        goto v3;

    // With rows below: feed the two bottom lpf rows through the combined
    // filter, then fall through to v1 for the final row.
    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
                     HIGHBD_TAIL_SUFFIX);
    lpf_bottom += PXSTRIDE(stride);
    p += PXSTRIDE(stride);

    wiener_filter_hv(p, ptrs, NULL, lpf_bottom, filter, w, edges
                     HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
    // v1, v2 and v3 emit one, two and three rows respectively using
    // wiener_filter_v(), which reuses the last stored row in place of the
    // missing one.
v1:
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);

    return;

v3:
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
v2:
    wiener_filter_v(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    p += PXSTRIDE(stride);
    goto v1;
}
    387 
    388 // SGR
    389 static NOINLINE void rotate(int32_t **sumsq_ptrs, coef **sum_ptrs, int n)
    390 {
    391    int32_t *tmp32 = sumsq_ptrs[0];
    392    coef *tmpc = sum_ptrs[0];
    393    for (int i = 0; i < n - 1; i++) {
    394        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
    395        sum_ptrs[i] = sum_ptrs[i + 1];
    396    }
    397    sumsq_ptrs[n - 1] = tmp32;
    398    sum_ptrs[n - 1] = tmpc;
    399 }
    400 
    401 static NOINLINE void rotate5_x2(int32_t **sumsq_ptrs, coef **sum_ptrs)
    402 {
    403    int32_t *tmp32[2];
    404    coef *tmpc[2];
    405    for (int i = 0; i < 2; i++) {
    406        tmp32[i] = sumsq_ptrs[i];
    407        tmpc[i] = sum_ptrs[i];
    408    }
    409    for (int i = 0; i < 3; i++) {
    410        sumsq_ptrs[i] = sumsq_ptrs[i + 2];
    411        sum_ptrs[i] = sum_ptrs[i + 2];
    412    }
    413    for (int i = 0; i < 2; i++) {
    414        sumsq_ptrs[3 + i] = tmp32[i];
    415        sum_ptrs[3 + i] = tmpc[i];
    416    }
    417 }
    418 
    419 static NOINLINE void sgr_box3_row_h(int32_t *sumsq, coef *sum,
    420                                    const pixel (*left)[4],
    421                                    const pixel *src, const int w,
    422                                    const enum LrEdgeFlags edges)
    423 {
    424    sumsq++;
    425    sum++;
    426    int a = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
    427    int b = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
    428    for (int x = -1; x < w + 1; x++) {
    429        int c = (x + 1 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 1] : src[w - 1];
    430        sum[x] = a + b + c;
    431        sumsq[x] = a * a + b * b + c * c;
    432        a = b;
    433        b = c;
    434    }
    435 }
    436 
    437 static NOINLINE void sgr_box5_row_h(int32_t *sumsq, coef *sum,
    438                                    const pixel (*left)[4],
    439                                    const pixel *src, const int w,
    440                                    const enum LrEdgeFlags edges)
    441 {
    442    sumsq++;
    443    sum++;
    444    int a = edges & LR_HAVE_LEFT ? (left ? left[0][1] : src[-3]) : src[0];
    445    int b = edges & LR_HAVE_LEFT ? (left ? left[0][2] : src[-2]) : src[0];
    446    int c = edges & LR_HAVE_LEFT ? (left ? left[0][3] : src[-1]) : src[0];
    447    int d = src[0];
    448    for (int x = -1; x < w + 1; x++) {
    449        int e = (x + 2 < w || (edges & LR_HAVE_RIGHT)) ? src[x + 2] : src[w - 1];
    450        sum[x] = a + b + c + d + e;
    451        sumsq[x] = a * a + b * b + c * c + d * d + e * e;
    452        a = b;
    453        b = c;
    454        c = d;
    455        d = e;
    456    }
    457 }
    458 
    459 static void sgr_box35_row_h(int32_t *sumsq3, coef *sum3,
    460                            int32_t *sumsq5, coef *sum5,
    461                            const pixel (*left)[4],
    462                            const pixel *src, const int w,
    463                            const enum LrEdgeFlags edges)
    464 {
    465    sgr_box3_row_h(sumsq3, sum3, left, src, w, edges);
    466    sgr_box5_row_h(sumsq5, sum5, left, src, w, edges);
    467 }
    468 
    469 static NOINLINE void sgr_box3_row_v(int32_t **sumsq, coef **sum,
    470                                    int32_t *sumsq_out, coef *sum_out,
    471                                    const int w)
    472 {
    473    for (int x = 0; x < w + 2; x++) {
    474        int sq_a = sumsq[0][x];
    475        int sq_b = sumsq[1][x];
    476        int sq_c = sumsq[2][x];
    477        int s_a = sum[0][x];
    478        int s_b = sum[1][x];
    479        int s_c = sum[2][x];
    480        sumsq_out[x] = sq_a + sq_b + sq_c;
    481        sum_out[x] = s_a + s_b + s_c;
    482    }
    483 }
    484 
    485 static NOINLINE void sgr_box5_row_v(int32_t **sumsq, coef **sum,
    486                                    int32_t *sumsq_out, coef *sum_out,
    487                                    const int w)
    488 {
    489    for (int x = 0; x < w + 2; x++) {
    490        int sq_a = sumsq[0][x];
    491        int sq_b = sumsq[1][x];
    492        int sq_c = sumsq[2][x];
    493        int sq_d = sumsq[3][x];
    494        int sq_e = sumsq[4][x];
    495        int s_a = sum[0][x];
    496        int s_b = sum[1][x];
    497        int s_c = sum[2][x];
    498        int s_d = sum[3][x];
    499        int s_e = sum[4][x];
    500        sumsq_out[x] = sq_a + sq_b + sq_c + sq_d + sq_e;
    501        sum_out[x] = s_a + s_b + s_c + s_d + s_e;
    502    }
    503 }
    504 
    505 static NOINLINE void sgr_calc_row_ab(int32_t *AA, coef *BB, int w, int s,
    506                                     int bitdepth_max, int n, int sgr_one_by_x)
    507 {
    508    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    509    for (int i = 0; i < w + 2; i++) {
    510        const int a =
    511            (AA[i] + ((1 << (2 * bitdepth_min_8)) >> 1)) >> (2 * bitdepth_min_8);
    512        const int b =
    513            (BB[i] + ((1 << bitdepth_min_8) >> 1)) >> bitdepth_min_8;
    514 
    515        const unsigned p = imax(a * n - b * b, 0);
    516        const unsigned z = (p * s + (1 << 19)) >> 20;
    517        const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
    518 
    519        // This is where we invert A and B, so that B is of size coef.
    520        AA[i] = (x * BB[i] * sgr_one_by_x + (1 << 11)) >> 12;
    521        BB[i] = x;
    522    }
    523 }
    524 
    525 static void sgr_box3_vert(int32_t **sumsq, coef **sum,
    526                          int32_t *sumsq_out, coef *sum_out,
    527                          const int w, const int s, const int bitdepth_max)
    528 {
    529    sgr_box3_row_v(sumsq, sum, sumsq_out, sum_out, w);
    530    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 9, 455);
    531    rotate(sumsq, sum, 3);
    532 }
    533 
    534 static void sgr_box5_vert(int32_t **sumsq, coef **sum,
    535                          int32_t *sumsq_out, coef *sum_out,
    536                          const int w, const int s, const int bitdepth_max)
    537 {
    538    sgr_box5_row_v(sumsq, sum, sumsq_out, sum_out, w);
    539    sgr_calc_row_ab(sumsq_out, sum_out, w, s, bitdepth_max, 25, 164);
    540    rotate5_x2(sumsq, sum);
    541 }
    542 
    543 static void sgr_box3_hv(int32_t **sumsq, coef **sum,
    544                        int32_t *AA, coef *BB,
    545                        const pixel (*left)[4],
    546                        const pixel *src, const int w,
    547                        const int s,
    548                        const enum LrEdgeFlags edges,
    549                        const int bitdepth_max)
    550 {
    551    sgr_box3_row_h(sumsq[2], sum[2], left, src, w, edges);
    552    sgr_box3_vert(sumsq, sum, AA, BB, w, s, bitdepth_max);
    553 }
    554 
    555 static NOINLINE void sgr_finish_filter_row1(coef *tmp,
    556                                            const pixel *src,
    557                                            int32_t **A_ptrs, coef **B_ptrs,
    558                                            const int w)
    559 {
    560 #define EIGHT_NEIGHBORS(P, i)\
    561    ((P[1][i] + P[1][i - 1] + P[1][i + 1] + P[0][i] + P[2][i]) * 4 + \
    562     (P[0][i - 1] + P[2][i - 1] +                           \
    563      P[0][i + 1] + P[2][i + 1]) * 3)
    564    for (int i = 0; i < w; i++) {
    565        const int a = EIGHT_NEIGHBORS(B_ptrs, i + 1);
    566        const int b = EIGHT_NEIGHBORS(A_ptrs, i + 1);
    567        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
    568    }
    569 #undef EIGHT_NEIGHBORS
    570 }
    571 
// Distance, in coef elements, between consecutive rows of the temporary
// filter output buffers (tmp/tmp5/tmp3 in the sgr_finish* functions).
#define FILTER_OUT_STRIDE (384)
    573 
    574 static NOINLINE void sgr_finish_filter2(coef *tmp,
    575                                        const pixel *src,
    576                                        const ptrdiff_t src_stride,
    577                                        int32_t **A_ptrs, coef **B_ptrs,
    578                                        const int w, const int h)
    579 {
    580 #define SIX_NEIGHBORS(P, i)\
    581    ((P[0][i]     + P[1][i]) * 6 +   \
    582     (P[0][i - 1] + P[1][i - 1] +    \
    583      P[0][i + 1] + P[1][i + 1]) * 5)
    584    for (int i = 0; i < w; i++) {
    585        const int a = SIX_NEIGHBORS(B_ptrs, i + 1);
    586        const int b = SIX_NEIGHBORS(A_ptrs, i + 1);
    587        tmp[i] = (b - a * src[i] + (1 << 8)) >> 9;
    588    }
    589    if (h <= 1)
    590        return;
    591    tmp += FILTER_OUT_STRIDE;
    592    src += PXSTRIDE(src_stride);
    593    const int32_t *A = &A_ptrs[1][1];
    594    const coef *B = &B_ptrs[1][1];
    595    for (int i = 0; i < w; i++) {
    596        const int a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
    597        const int b = A[i] * 6 + (A[i - 1] + A[i + 1]) * 5;
    598        tmp[i] = (b - a * src[i] + (1 << 7)) >> 8;
    599    }
    600 #undef SIX_NEIGHBORS
    601 }
    602 
    603 static NOINLINE void sgr_weighted_row1(pixel *dst, const coef *t1,
    604                                       const int w, const int w1 HIGHBD_DECL_SUFFIX)
    605 {
    606    for (int i = 0; i < w; i++) {
    607        const int v = w1 * t1[i];
    608        dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11));
    609    }
    610 }
    611 
    612 static NOINLINE void sgr_weighted2(pixel *dst, const ptrdiff_t dst_stride,
    613                                   const coef *t1, const coef *t2,
    614                                   const int w, const int h,
    615                                   const int w0, const int w1 HIGHBD_DECL_SUFFIX)
    616 {
    617    for (int j = 0; j < h; j++) {
    618        for (int i = 0; i < w; i++) {
    619            const int v = w0 * t1[i] + w1 * t2[i];
    620            dst[i] = iclip_pixel(dst[i] + ((v + (1 << 10)) >> 11));
    621        }
    622        dst += PXSTRIDE(dst_stride);
    623        t1 += FILTER_OUT_STRIDE;
    624        t2 += FILTER_OUT_STRIDE;
    625    }
    626 }
    627 
    628 static NOINLINE void sgr_finish1(pixel **dst, const ptrdiff_t stride,
    629                                 int32_t **A_ptrs, coef **B_ptrs, const int w,
    630                                 const int w1 HIGHBD_DECL_SUFFIX)
    631 {
    632    // Only one single row, no stride needed
    633    ALIGN_STK_16(coef, tmp, 384,);
    634 
    635    sgr_finish_filter_row1(tmp, *dst, A_ptrs, B_ptrs, w);
    636    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
    637    *dst += PXSTRIDE(stride);
    638    rotate(A_ptrs, B_ptrs, 3);
    639 }
    640 
    641 static NOINLINE void sgr_finish2(pixel **dst, const ptrdiff_t stride,
    642                                 int32_t **A_ptrs, coef **B_ptrs,
    643                                 const int w, const int h, const int w1
    644                                 HIGHBD_DECL_SUFFIX)
    645 {
    646    ALIGN_STK_16(coef, tmp, 2*FILTER_OUT_STRIDE,);
    647 
    648    sgr_finish_filter2(tmp, *dst, stride, A_ptrs, B_ptrs, w, h);
    649    sgr_weighted_row1(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
    650    *dst += PXSTRIDE(stride);
    651    if (h > 1) {
    652        sgr_weighted_row1(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX);
    653        *dst += PXSTRIDE(stride);
    654    }
    655    rotate(A_ptrs, B_ptrs, 2);
    656 }
    657 
    658 static NOINLINE void sgr_finish_mix(pixel **dst, const ptrdiff_t stride,
    659                                    int32_t **A5_ptrs, coef **B5_ptrs,
    660                                    int32_t **A3_ptrs, coef **B3_ptrs,
    661                                    const int w, const int h,
    662                                    const int w0, const int w1 HIGHBD_DECL_SUFFIX)
    663 {
    664    ALIGN_STK_16(coef, tmp5, 2*FILTER_OUT_STRIDE,);
    665    ALIGN_STK_16(coef, tmp3, 2*FILTER_OUT_STRIDE,);
    666 
    667    sgr_finish_filter2(tmp5, *dst, stride, A5_ptrs, B5_ptrs, w, h);
    668    sgr_finish_filter_row1(tmp3, *dst, A3_ptrs, B3_ptrs, w);
    669    if (h > 1)
    670        sgr_finish_filter_row1(tmp3 + FILTER_OUT_STRIDE, *dst + PXSTRIDE(stride),
    671                               &A3_ptrs[1], &B3_ptrs[1], w);
    672    sgr_weighted2(*dst, stride, tmp5, tmp3, w, h, w0, w1 HIGHBD_TAIL_SUFFIX);
    673    *dst += h*PXSTRIDE(stride);
    674    rotate(A5_ptrs, B5_ptrs, 2);
    675    rotate(A3_ptrs, B3_ptrs, 4);
    676 }
    677 
    678 
    679 static void sgr_3x3_c(pixel *dst, const ptrdiff_t stride,
    680                      const pixel (*left)[4], const pixel *lpf,
    681                      const int w, int h,
    682                      const LooprestorationParams *const params,
    683                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
    684 {
    685 #define BUF_STRIDE (384 + 16)
    686    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 3 + 16,);
    687    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 3 + 16,);
    688    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
    689    coef *sum_ptrs[3], *sum_rows[3];
    690    for (int i = 0; i < 3; i++) {
    691        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
    692        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    693    }
    694 
    695    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 3 + 16,);
    696    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 3 + 16,);
    697    int32_t *A_ptrs[3];
    698    coef *B_ptrs[3];
    699    for (int i = 0; i < 3; i++) {
    700        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
    701        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    702    }
    703    const pixel *src = dst;
    704    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
    705 
    706    if (edges & LR_HAVE_TOP) {
    707        sumsq_ptrs[0] = sumsq_rows[0];
    708        sumsq_ptrs[1] = sumsq_rows[1];
    709        sumsq_ptrs[2] = sumsq_rows[2];
    710        sum_ptrs[0] = sum_rows[0];
    711        sum_ptrs[1] = sum_rows[1];
    712        sum_ptrs[2] = sum_rows[2];
    713 
    714        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
    715        lpf += PXSTRIDE(stride);
    716        sgr_box3_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
    717 
    718        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    719                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
    720        left++;
    721        src += PXSTRIDE(stride);
    722        rotate(A_ptrs, B_ptrs, 3);
    723 
    724        if (--h <= 0)
    725            goto vert_1;
    726 
    727        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    728                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
    729        left++;
    730        src += PXSTRIDE(stride);
    731        rotate(A_ptrs, B_ptrs, 3);
    732 
    733        if (--h <= 0)
    734            goto vert_2;
    735    } else {
    736        sumsq_ptrs[0] = sumsq_rows[0];
    737        sumsq_ptrs[1] = sumsq_rows[0];
    738        sumsq_ptrs[2] = sumsq_rows[0];
    739        sum_ptrs[0] = sum_rows[0];
    740        sum_ptrs[1] = sum_rows[0];
    741        sum_ptrs[2] = sum_rows[0];
    742 
    743        sgr_box3_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
    744        left++;
    745        src += PXSTRIDE(stride);
    746 
    747        sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    748                      w, params->sgr.s1, BITDEPTH_MAX);
    749        rotate(A_ptrs, B_ptrs, 3);
    750 
    751        if (--h <= 0)
    752            goto vert_1;
    753 
    754        sumsq_ptrs[2] = sumsq_rows[1];
    755        sum_ptrs[2] = sum_rows[1];
    756 
    757        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    758                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
    759        left++;
    760        src += PXSTRIDE(stride);
    761        rotate(A_ptrs, B_ptrs, 3);
    762 
    763        if (--h <= 0)
    764            goto vert_2;
    765 
    766        sumsq_ptrs[2] = sumsq_rows[2];
    767        sum_ptrs[2] = sum_rows[2];
    768    }
    769 
    770    do {
    771        sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    772                    left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
    773        left++;
    774        src += PXSTRIDE(stride);
    775 
    776        sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
    777                    w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    778    } while (--h > 0);
    779 
    780    if (!(edges & LR_HAVE_BOTTOM))
    781        goto vert_2;
    782 
    783    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    784                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    785    lpf_bottom += PXSTRIDE(stride);
    786 
    787    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
    788                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    789 
    790    sgr_box3_hv(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    791                NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    792 
    793    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
    794                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    795    return;
    796 
    797 vert_2:
    798    sumsq_ptrs[2] = sumsq_ptrs[1];
    799    sum_ptrs[2] = sum_ptrs[1];
    800    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    801                  w, params->sgr.s1, BITDEPTH_MAX);
    802 
    803    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
    804                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    805 
    806 output_1:
    807    sumsq_ptrs[2] = sumsq_ptrs[1];
    808    sum_ptrs[2] = sum_ptrs[1];
    809    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    810                  w, params->sgr.s1, BITDEPTH_MAX);
    811 
    812    sgr_finish1(&dst, stride, A_ptrs, B_ptrs,
    813                w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    814    return;
    815 
    816 vert_1:
    817    sumsq_ptrs[2] = sumsq_ptrs[1];
    818    sum_ptrs[2] = sum_ptrs[1];
    819    sgr_box3_vert(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    820                  w, params->sgr.s1, BITDEPTH_MAX);
    821    rotate(A_ptrs, B_ptrs, 3);
    822    goto output_1;
    823 }
    824 
    825 static void sgr_5x5_c(pixel *dst, const ptrdiff_t stride,
    826                      const pixel (*left)[4], const pixel *lpf,
    827                      const int w, int h,
    828                      const LooprestorationParams *const params,
    829                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
    830 {
           // 5x5 variant of the SGR filter. Input rows are read through src
           // (which starts at dst and advances one stride per row); output is
           // produced by sgr_finish2(), which is handed &dst.
           //
           //   dst, stride - picture rows to filter; also the source of input rows.
           //   left        - per-row 4-pixel entries passed to sgr_box5_row_h()
           //                 with each picture row (NULL is passed for lpf rows).
           //   lpf         - extra rows: the rows at lpf and lpf + 1 stride are
           //                 used above the block when LR_HAVE_TOP is set, the
           //                 rows at lpf + 6 and lpf + 7 strides below it when
           //                 LR_HAVE_BOTTOM is set.
           //   w, h        - width, and number of picture rows still to read
           //                 (h is counted down as rows are consumed).
           //   params      - only sgr.s0 and sgr.w0 are read here.
           //   edges       - LR_HAVE_TOP / LR_HAVE_BOTTOM are tested here; the
           //                 whole mask is forwarded to sgr_box5_row_h().
           //
           // Horizontal box sums: five row buffers (sumsq_rows/sum_rows) and a
           // five-entry window (sumsq_ptrs/sum_ptrs) that selects which of those
           // rows are handed to sgr_box5_vert().
    831    ALIGN_STK_16(int32_t, sumsq_buf, BUF_STRIDE * 5 + 16,);
    832    ALIGN_STK_16(coef, sum_buf, BUF_STRIDE * 5 + 16,);
    833    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
    834    coef *sum_ptrs[5], *sum_rows[5];
    835    for (int i = 0; i < 5; i++) {
    836        sumsq_rows[i] = &sumsq_buf[i * BUF_STRIDE];
    837        sum_rows[i] = &sum_buf[i * BUF_STRIDE];
    838    }
    839 
           // Two rows of sgr_box5_vert() output; sgr_finish2() receives the pair.
    840    ALIGN_STK_16(int32_t, A_buf, BUF_STRIDE * 2 + 16,);
    841    ALIGN_STK_16(coef, B_buf, BUF_STRIDE * 2 + 16,);
    842    int32_t *A_ptrs[2];
    843    coef *B_ptrs[2];
    844    for (int i = 0; i < 2; i++) {
    845        A_ptrs[i] = &A_buf[i * BUF_STRIDE];
    846        B_ptrs[i] = &B_buf[i * BUF_STRIDE];
    847    }
    848    const pixel *src = dst;
           // The rows used below the block start six strides after lpf.
    849    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
    850 
    851    if (edges & LR_HAVE_TOP) {
               // Window slots [0] and [1] both select rows[0], i.e. the first
               // lpf row is used twice; slots [2..4] select rows[1..3].
    852        sumsq_ptrs[0] = sumsq_rows[0];
    853        sumsq_ptrs[1] = sumsq_rows[0];
    854        sumsq_ptrs[2] = sumsq_rows[1];
    855        sumsq_ptrs[3] = sumsq_rows[2];
    856        sumsq_ptrs[4] = sumsq_rows[3];
    857        sum_ptrs[0] = sum_rows[0];
    858        sum_ptrs[1] = sum_rows[0];
    859        sum_ptrs[2] = sum_rows[1];
    860        sum_ptrs[3] = sum_rows[2];
    861        sum_ptrs[4] = sum_rows[3];
    862 
               // Two lpf rows go into rows[0] and rows[1] ...
    863        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], NULL, lpf, w, edges);
    864        lpf += PXSTRIDE(stride);
    865        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], NULL, lpf, w, edges);
    866 
               // ... followed by the first picture row in rows[2].
    867        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
    868        left++;
    869        src += PXSTRIDE(stride);
    870 
               // Only one picture row available: finish through the vert_1 path.
    871        if (--h <= 0)
    872            goto vert_1;
    873 
    874        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
    875        left++;
    876        src += PXSTRIDE(stride);
    877        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    878                      w, params->sgr.s0, BITDEPTH_MAX);
    879        rotate(A_ptrs, B_ptrs, 2);
    880 
               // Only two picture rows available: finish through the vert_2 path.
    881        if (--h <= 0)
    882            goto vert_2;
    883 
    884        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
    885        // one of them to point at the previously unused rows[4].
    886        sumsq_ptrs[3] = sumsq_rows[4];
    887        sum_ptrs[3] = sum_rows[4];
    888    } else {
               // No rows above the block: every window slot starts out selecting
               // rows[0], which receives the first picture row.
    889        sumsq_ptrs[0] = sumsq_rows[0];
    890        sumsq_ptrs[1] = sumsq_rows[0];
    891        sumsq_ptrs[2] = sumsq_rows[0];
    892        sumsq_ptrs[3] = sumsq_rows[0];
    893        sumsq_ptrs[4] = sumsq_rows[0];
    894        sum_ptrs[0] = sum_rows[0];
    895        sum_ptrs[1] = sum_rows[0];
    896        sum_ptrs[2] = sum_rows[0];
    897        sum_ptrs[3] = sum_rows[0];
    898        sum_ptrs[4] = sum_rows[0];
    899 
    900        sgr_box5_row_h(sumsq_rows[0], sum_rows[0], left, src, w, edges);
    901        left++;
    902        src += PXSTRIDE(stride);
    903 
    904        if (--h <= 0)
    905            goto vert_1;
    906 
               // Second picture row goes into rows[1], selected by slot [4].
    907        sumsq_ptrs[4] = sumsq_rows[1];
    908        sum_ptrs[4] = sum_rows[1];
    909 
    910        sgr_box5_row_h(sumsq_rows[1], sum_rows[1], left, src, w, edges);
    911        left++;
    912        src += PXSTRIDE(stride);
    913 
    914        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    915                      w, params->sgr.s0, BITDEPTH_MAX);
    916        rotate(A_ptrs, B_ptrs, 2);
    917 
    918        if (--h <= 0)
    919            goto vert_2;
    920 
               // Third and fourth picture rows go into rows[2] and rows[3].
    921        sumsq_ptrs[3] = sumsq_rows[2];
    922        sumsq_ptrs[4] = sumsq_rows[3];
    923        sum_ptrs[3] = sum_rows[2];
    924        sum_ptrs[4] = sum_rows[3];
    925 
    926        sgr_box5_row_h(sumsq_rows[2], sum_rows[2], left, src, w, edges);
    927        left++;
    928        src += PXSTRIDE(stride);
    929 
    930        if (--h <= 0)
    931            goto odd;
    932 
    933        sgr_box5_row_h(sumsq_rows[3], sum_rows[3], left, src, w, edges);
    934        left++;
    935        src += PXSTRIDE(stride);
    936 
               // First place in this branch where output rows are produced.
    937        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    938                      w, params->sgr.s0, BITDEPTH_MAX);
    939        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
    940                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    941 
    942        if (--h <= 0)
    943            goto vert_2;
    944 
    945        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
    946        // one of them to point at the previously unused rows[4].
    947        sumsq_ptrs[3] = sumsq_rows[4];
    948        sum_ptrs[3] = sum_rows[4];
    949    }
    950 
           // Steady state: read two picture rows into window slots [3] and [4],
           // then do one vertical box step and one sgr_finish2() call with a row
           // count of 2. Running out of rows after the first of the pair leaves
           // through the odd path.
    951    do {
    952        sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], left, src, w, edges);
    953        left++;
    954        src += PXSTRIDE(stride);
    955 
    956        if (--h <= 0)
    957            goto odd;
    958 
    959        sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], left, src, w, edges);
    960        left++;
    961        src += PXSTRIDE(stride);
    962 
    963        sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    964                      w, params->sgr.s0, BITDEPTH_MAX);
    965        sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
    966                    w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    967    } while (--h > 0);
    968 
           // All picture rows consumed on an even boundary. Without rows below,
           // pad by duplication; otherwise take the two rows at lpf_bottom.
    969    if (!(edges & LR_HAVE_BOTTOM))
    970        goto vert_2;
    971 
    972    sgr_box5_row_h(sumsq_ptrs[3], sum_ptrs[3], NULL, lpf_bottom, w, edges);
    973    lpf_bottom += PXSTRIDE(stride);
    974    sgr_box5_row_h(sumsq_ptrs[4], sum_ptrs[4], NULL, lpf_bottom, w, edges);
    975 
        // Final vertical step and final pair of output rows.
    976 output_2:
    977    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    978                  w, params->sgr.s0, BITDEPTH_MAX);
    979    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
    980                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    981    return;
    982 
        // NOTE(review): the vert_1 / vert_2 / odd exits taken from the
        // `--h <= 0` checks never read lpf_bottom, whether or not
        // LR_HAVE_BOTTOM is set - presumably callers only pass such heights
        // when there are no rows below; confirm against the caller.
    983 vert_2:
    984    // Duplicate the last row twice more
    985    sumsq_ptrs[3] = sumsq_ptrs[2];
    986    sumsq_ptrs[4] = sumsq_ptrs[2];
    987    sum_ptrs[3] = sum_ptrs[2];
    988    sum_ptrs[4] = sum_ptrs[2];
    989    goto output_2;
    990 
    991 odd:
    992    // Copy the last row as padding once
    993    sumsq_ptrs[4] = sumsq_ptrs[3];
    994    sum_ptrs[4] = sum_ptrs[3];
    995 
    996    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    997                  w, params->sgr.s0, BITDEPTH_MAX);
    998    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
    999                w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
   1000 
   1001 output_1:
   1002    // Duplicate the last row twice more
   1003    sumsq_ptrs[3] = sumsq_ptrs[2];
   1004    sumsq_ptrs[4] = sumsq_ptrs[2];
   1005    sum_ptrs[3] = sum_ptrs[2];
   1006    sum_ptrs[4] = sum_ptrs[2];
   1007 
   1008    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
   1009                  w, params->sgr.s0, BITDEPTH_MAX);
   1010    // Output only one row
   1011    sgr_finish2(&dst, stride, A_ptrs, B_ptrs,
   1012                w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
   1013    return;
   1014 
        // Reached when only a single picture row was available: nothing has
        // been output yet, so do the first vertical step here and let
        // output_1 do the second one and emit the single row.
   1015 vert_1:
   1016    // Copy the last row as padding once
   1017    sumsq_ptrs[4] = sumsq_ptrs[3];
   1018    sum_ptrs[4] = sum_ptrs[3];
   1019 
   1020    sgr_box5_vert(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
   1021                  w, params->sgr.s0, BITDEPTH_MAX);
   1022    rotate(A_ptrs, B_ptrs, 2);
   1023 
   1024    goto output_1;
   1025 }
   1026 
   1027 static void sgr_mix_c(pixel *dst, const ptrdiff_t stride,
   1028                      const pixel (*left)[4], const pixel *lpf,
   1029                      const int w, int h,
   1030                      const LooprestorationParams *const params,
   1031                      const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
   1032 {
           // Mixed SGR filter: drives the 5x5 and the 3x3 box pipelines side by
           // side over the same input rows and hands both results to
           // sgr_finish_mix() together with params->sgr.w0 and params->sgr.w1.
           //
           //   dst, stride - picture rows to filter; src starts at dst and
           //                 advances one stride per row, &dst is passed to
           //                 sgr_finish_mix().
           //   left        - per-row 4-pixel entries passed to sgr_box35_row_h()
           //                 with each picture row (NULL is passed for lpf rows).
           //   lpf         - extra rows: the rows at lpf and lpf + 1 stride are
           //                 used above the block when LR_HAVE_TOP is set, the
           //                 rows at lpf + 6 and lpf + 7 strides below it when
           //                 LR_HAVE_BOTTOM is set.
           //   w, h        - width, and number of picture rows still to read
           //                 (h is counted down as rows are consumed).
           //   params      - sgr.s0 / sgr.w0 are used with the 5x5 helpers,
           //                 sgr.s1 / sgr.w1 with the 3x3 helpers.
           //   edges       - LR_HAVE_TOP / LR_HAVE_BOTTOM are tested here; the
           //                 whole mask is forwarded to sgr_box35_row_h().
           //
           // 5x5 horizontal sums: five row buffers and a five-entry window.
   1033    ALIGN_STK_16(int32_t, sumsq5_buf, BUF_STRIDE * 5 + 16,);
   1034    ALIGN_STK_16(coef, sum5_buf, BUF_STRIDE * 5 + 16,);
   1035    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
   1036    coef *sum5_ptrs[5], *sum5_rows[5];
   1037    for (int i = 0; i < 5; i++) {
   1038        sumsq5_rows[i] = &sumsq5_buf[i * BUF_STRIDE];
   1039        sum5_rows[i] = &sum5_buf[i * BUF_STRIDE];
   1040    }
           // 3x3 horizontal sums: three row buffers and a three-entry window.
   1041    ALIGN_STK_16(int32_t, sumsq3_buf, BUF_STRIDE * 3 + 16,);
   1042    ALIGN_STK_16(coef, sum3_buf, BUF_STRIDE * 3 + 16,);
   1043    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
   1044    coef *sum3_ptrs[3], *sum3_rows[3];
   1045    for (int i = 0; i < 3; i++) {
   1046        sumsq3_rows[i] = &sumsq3_buf[i * BUF_STRIDE];
   1047        sum3_rows[i] = &sum3_buf[i * BUF_STRIDE];
   1048    }
   1049 
           // Output of sgr_box5_vert(): two rows.
   1050    ALIGN_STK_16(int32_t, A5_buf, BUF_STRIDE * 2 + 16,);
   1051    ALIGN_STK_16(coef, B5_buf, BUF_STRIDE * 2 + 16,);
   1052    int32_t *A5_ptrs[2];
   1053    coef *B5_ptrs[2];
   1054    for (int i = 0; i < 2; i++) {
   1055        A5_ptrs[i] = &A5_buf[i * BUF_STRIDE];
   1056        B5_ptrs[i] = &B5_buf[i * BUF_STRIDE];
   1057    }
           // Output of sgr_box3_vert(): four rows; new results are always written
           // through slot [3].
   1058    ALIGN_STK_16(int32_t, A3_buf, BUF_STRIDE * 4 + 16,);
   1059    ALIGN_STK_16(coef, B3_buf, BUF_STRIDE * 4 + 16,);
   1060    int32_t *A3_ptrs[4];
   1061    coef *B3_ptrs[4];
   1062    for (int i = 0; i < 4; i++) {
   1063        A3_ptrs[i] = &A3_buf[i * BUF_STRIDE];
   1064        B3_ptrs[i] = &B3_buf[i * BUF_STRIDE];
   1065    }
   1066    const pixel *src = dst;
           // The rows used below the block start six strides after lpf.
   1067    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
   1068 
   1069    if (edges & LR_HAVE_TOP) {
               // 5x5 window: slots [0] and [1] both select rows[0] (the first lpf
               // row is used twice); slots [2..4] select rows[1..3].
   1070        sumsq5_ptrs[0] = sumsq5_rows[0];
   1071        sumsq5_ptrs[1] = sumsq5_rows[0];
   1072        sumsq5_ptrs[2] = sumsq5_rows[1];
   1073        sumsq5_ptrs[3] = sumsq5_rows[2];
   1074        sumsq5_ptrs[4] = sumsq5_rows[3];
   1075        sum5_ptrs[0] = sum5_rows[0];
   1076        sum5_ptrs[1] = sum5_rows[0];
   1077        sum5_ptrs[2] = sum5_rows[1];
   1078        sum5_ptrs[3] = sum5_rows[2];
   1079        sum5_ptrs[4] = sum5_rows[3];
   1080 
               // 3x3 window: three distinct rows (two lpf rows + first picture row).
   1081        sumsq3_ptrs[0] = sumsq3_rows[0];
   1082        sumsq3_ptrs[1] = sumsq3_rows[1];
   1083        sumsq3_ptrs[2] = sumsq3_rows[2];
   1084        sum3_ptrs[0] = sum3_rows[0];
   1085        sum3_ptrs[1] = sum3_rows[1];
   1086        sum3_ptrs[2] = sum3_rows[2];
   1087 
               // Each sgr_box35_row_h() call fills one 3x3 row and one 5x5 row
               // from the same input row.
   1088        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
   1089                        sumsq5_rows[0], sum5_rows[0],
   1090                        NULL, lpf, w, edges);
   1091        lpf += PXSTRIDE(stride);
   1092        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
   1093                        sumsq5_rows[1], sum5_rows[1],
   1094                        NULL, lpf, w, edges);
   1095 
   1096        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
   1097                        sumsq5_rows[2], sum5_rows[2],
   1098                        left, src, w, edges);
   1099        left++;
   1100        src += PXSTRIDE(stride);
   1101 
               // The 3x3 vertical step runs after every input row; the 5x5 one
               // only after every second row.
   1102        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1103                      w, params->sgr.s1, BITDEPTH_MAX);
   1104        rotate(A3_ptrs, B3_ptrs, 4);
   1105 
   1106        if (--h <= 0)
   1107            goto vert_1;
   1108 
               // From here on the 3x3 row is written through window slot [2]
               // (presumably sgr_box3_vert() has rotated the window so that slot
               // holds the row that may be overwritten - helper not visible here).
   1109        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
   1110                        sumsq5_rows[3], sum5_rows[3],
   1111                        left, src, w, edges);
   1112        left++;
   1113        src += PXSTRIDE(stride);
   1114        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
   1115                      w, params->sgr.s0, BITDEPTH_MAX);
   1116        rotate(A5_ptrs, B5_ptrs, 2);
   1117        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1118                      w, params->sgr.s1, BITDEPTH_MAX);
   1119        rotate(A3_ptrs, B3_ptrs, 4);
   1120 
   1121        if (--h <= 0)
   1122            goto vert_2;
   1123 
   1124        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
   1125        // one of them to point at the previously unused rows[4].
   1126        sumsq5_ptrs[3] = sumsq5_rows[4];
   1127        sum5_ptrs[3] = sum5_rows[4];
   1128    } else {
               // No rows above the block: every slot of both windows starts out
               // selecting rows[0], which receives the first picture row.
   1129        sumsq5_ptrs[0] = sumsq5_rows[0];
   1130        sumsq5_ptrs[1] = sumsq5_rows[0];
   1131        sumsq5_ptrs[2] = sumsq5_rows[0];
   1132        sumsq5_ptrs[3] = sumsq5_rows[0];
   1133        sumsq5_ptrs[4] = sumsq5_rows[0];
   1134        sum5_ptrs[0] = sum5_rows[0];
   1135        sum5_ptrs[1] = sum5_rows[0];
   1136        sum5_ptrs[2] = sum5_rows[0];
   1137        sum5_ptrs[3] = sum5_rows[0];
   1138        sum5_ptrs[4] = sum5_rows[0];
   1139 
   1140        sumsq3_ptrs[0] = sumsq3_rows[0];
   1141        sumsq3_ptrs[1] = sumsq3_rows[0];
   1142        sumsq3_ptrs[2] = sumsq3_rows[0];
   1143        sum3_ptrs[0] = sum3_rows[0];
   1144        sum3_ptrs[1] = sum3_rows[0];
   1145        sum3_ptrs[2] = sum3_rows[0];
   1146 
   1147        sgr_box35_row_h(sumsq3_rows[0], sum3_rows[0],
   1148                        sumsq5_rows[0], sum5_rows[0],
   1149                        left, src, w, edges);
   1150        left++;
   1151        src += PXSTRIDE(stride);
   1152 
   1153        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1154                      w, params->sgr.s1, BITDEPTH_MAX);
   1155        rotate(A3_ptrs, B3_ptrs, 4);
   1156 
   1157        if (--h <= 0)
   1158            goto vert_1;
   1159 
               // Second picture row goes into rows[1] of both pipelines.
   1160        sumsq5_ptrs[4] = sumsq5_rows[1];
   1161        sum5_ptrs[4] = sum5_rows[1];
   1162 
   1163        sumsq3_ptrs[2] = sumsq3_rows[1];
   1164        sum3_ptrs[2] = sum3_rows[1];
   1165 
   1166        sgr_box35_row_h(sumsq3_rows[1], sum3_rows[1],
   1167                        sumsq5_rows[1], sum5_rows[1],
   1168                        left, src, w, edges);
   1169        left++;
   1170        src += PXSTRIDE(stride);
   1171 
   1172        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
   1173                      w, params->sgr.s0, BITDEPTH_MAX);
   1174        rotate(A5_ptrs, B5_ptrs, 2);
   1175        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1176                      w, params->sgr.s1, BITDEPTH_MAX);
   1177        rotate(A3_ptrs, B3_ptrs, 4);
   1178 
   1179        if (--h <= 0)
   1180            goto vert_2;
   1181 
               // Third and fourth picture rows go into 5x5 rows[2] and rows[3];
               // the third row also goes into 3x3 rows[2].
   1182        sumsq5_ptrs[3] = sumsq5_rows[2];
   1183        sumsq5_ptrs[4] = sumsq5_rows[3];
   1184        sum5_ptrs[3] = sum5_rows[2];
   1185        sum5_ptrs[4] = sum5_rows[3];
   1186 
   1187        sumsq3_ptrs[2] = sumsq3_rows[2];
   1188        sum3_ptrs[2] = sum3_rows[2];
   1189 
   1190        sgr_box35_row_h(sumsq3_rows[2], sum3_rows[2],
   1191                        sumsq5_rows[2], sum5_rows[2],
   1192                        left, src, w, edges);
   1193        left++;
   1194        src += PXSTRIDE(stride);
   1195 
   1196        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1197                      w, params->sgr.s1, BITDEPTH_MAX);
   1198        rotate(A3_ptrs, B3_ptrs, 4);
   1199 
   1200        if (--h <= 0)
   1201            goto odd;
   1202 
   1203        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
   1204                        sumsq5_rows[3], sum5_rows[3],
   1205                        left, src, w, edges);
   1206        left++;
   1207        src += PXSTRIDE(stride);
   1208 
               // First place in this branch where output rows are produced.
   1209        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
   1210                      w, params->sgr.s0, BITDEPTH_MAX);
   1211        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1212                      w, params->sgr.s1, BITDEPTH_MAX);
   1213        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
   1214                       w, 2, params->sgr.w0, params->sgr.w1
   1215                       HIGHBD_TAIL_SUFFIX);
   1216 
   1217        if (--h <= 0)
   1218            goto vert_2;
   1219 
   1220        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
   1221        // one of them to point at the previously unused rows[4].
   1222        sumsq5_ptrs[3] = sumsq5_rows[4];
   1223        sum5_ptrs[3] = sum5_rows[4];
   1224    }
   1225 
           // Steady state: two picture rows per iteration. The first row gets a
           // 3x3 vertical step of its own; after the second row both vertical
           // steps run and sgr_finish_mix() is called with a row count of 2.
           // Running out of rows after the first of the pair leaves through odd.
   1226    do {
   1227        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
   1228                        sumsq5_ptrs[3], sum5_ptrs[3],
   1229                        left, src, w, edges);
   1230        left++;
   1231        src += PXSTRIDE(stride);
   1232 
   1233        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1234                      w, params->sgr.s1, BITDEPTH_MAX);
   1235        rotate(A3_ptrs, B3_ptrs, 4);
   1236 
   1237        if (--h <= 0)
   1238            goto odd;
   1239 
   1240        sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
   1241                        sumsq5_ptrs[4], sum5_ptrs[4],
   1242                        left, src, w, edges);
   1243        left++;
   1244        src += PXSTRIDE(stride);
   1245 
   1246        sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
   1247                      w, params->sgr.s0, BITDEPTH_MAX);
   1248        sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1249                      w, params->sgr.s1, BITDEPTH_MAX);
   1250        sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
   1251                       w, 2, params->sgr.w0, params->sgr.w1
   1252                       HIGHBD_TAIL_SUFFIX);
   1253    } while (--h > 0);
   1254 
           // All picture rows consumed on an even boundary. Without rows below,
           // pad by duplication; otherwise take the two rows at lpf_bottom.
   1255    if (!(edges & LR_HAVE_BOTTOM))
   1256        goto vert_2;
   1257 
   1258    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
   1259                    sumsq5_ptrs[3], sum5_ptrs[3],
   1260                    NULL, lpf_bottom, w, edges);
   1261    lpf_bottom += PXSTRIDE(stride);
   1262    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1263                  w, params->sgr.s1, BITDEPTH_MAX);
   1264    rotate(A3_ptrs, B3_ptrs, 4);
   1265 
   1266    sgr_box35_row_h(sumsq3_ptrs[2], sum3_ptrs[2],
   1267                    sumsq5_ptrs[4], sum5_ptrs[4],
   1268                    NULL, lpf_bottom, w, edges);
   1269 
        // Final vertical steps and final pair of output rows.
   1270 output_2:
   1271    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
   1272                  w, params->sgr.s0, BITDEPTH_MAX);
   1273    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1274                  w, params->sgr.s1, BITDEPTH_MAX);
   1275    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
   1276                   w, 2, params->sgr.w0, params->sgr.w1
   1277                   HIGHBD_TAIL_SUFFIX);
   1278    return;
   1279 
        // NOTE(review): the vert_1 / vert_2 / odd exits taken from the
        // `--h <= 0` checks never read lpf_bottom, whether or not
        // LR_HAVE_BOTTOM is set - presumably callers only pass such heights
        // when there are no rows below; confirm against the caller.
   1280 vert_2:
   1281    // Duplicate the last row twice more
   1282    sumsq5_ptrs[3] = sumsq5_ptrs[2];
   1283    sumsq5_ptrs[4] = sumsq5_ptrs[2];
   1284    sum5_ptrs[3] = sum5_ptrs[2];
   1285    sum5_ptrs[4] = sum5_ptrs[2];
   1286 
           // 3x3: pad with the last row and run the extra vertical step here,
           // then pad once more for the step that output_2 performs.
   1287    sumsq3_ptrs[2] = sumsq3_ptrs[1];
   1288    sum3_ptrs[2] = sum3_ptrs[1];
   1289    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1290                  w, params->sgr.s1, BITDEPTH_MAX);
   1291    rotate(A3_ptrs, B3_ptrs, 4);
   1292 
   1293    sumsq3_ptrs[2] = sumsq3_ptrs[1];
   1294    sum3_ptrs[2] = sum3_ptrs[1];
   1295 
   1296    goto output_2;
   1297 
   1298 odd:
   1299    // Copy the last row as padding once
   1300    sumsq5_ptrs[4] = sumsq5_ptrs[3];
   1301    sum5_ptrs[4] = sum5_ptrs[3];
   1302 
   1303    sumsq3_ptrs[2] = sumsq3_ptrs[1];
   1304    sum3_ptrs[2] = sum3_ptrs[1];
   1305 
   1306    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
   1307                  w, params->sgr.s0, BITDEPTH_MAX);
   1308    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1309                  w, params->sgr.s1, BITDEPTH_MAX);
   1310    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
   1311                   w, 2, params->sgr.w0, params->sgr.w1
   1312                   HIGHBD_TAIL_SUFFIX);
   1313 
   1314 output_1:
   1315    // Duplicate the last row twice more
   1316    sumsq5_ptrs[3] = sumsq5_ptrs[2];
   1317    sumsq5_ptrs[4] = sumsq5_ptrs[2];
   1318    sum5_ptrs[3] = sum5_ptrs[2];
   1319    sum5_ptrs[4] = sum5_ptrs[2];
   1320 
   1321    sumsq3_ptrs[2] = sumsq3_ptrs[1];
   1322    sum3_ptrs[2] = sum3_ptrs[1];
   1323 
   1324    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
   1325                  w, params->sgr.s0, BITDEPTH_MAX);
   1326    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1327                  w, params->sgr.s1, BITDEPTH_MAX);
           // Unlike the two-row calls above, the A3/B3 window is rotated here
           // before the one-row sgr_finish_mix() call.
   1328    rotate(A3_ptrs, B3_ptrs, 4);
   1329    // Output only one row
   1330    sgr_finish_mix(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
   1331                   w, 1, params->sgr.w0, params->sgr.w1
   1332                   HIGHBD_TAIL_SUFFIX);
   1333    return;
   1334 
        // Reached when only a single picture row was available: nothing has
        // been output yet, so do one vertical step per pipeline here and let
        // output_1 do the next ones and emit the single row.
   1335 vert_1:
   1336    // Copy the last row as padding once
   1337    sumsq5_ptrs[4] = sumsq5_ptrs[3];
   1338    sum5_ptrs[4] = sum5_ptrs[3];
   1339 
   1340    sumsq3_ptrs[2] = sumsq3_ptrs[1];
   1341    sum3_ptrs[2] = sum3_ptrs[1];
   1342 
   1343    sgr_box5_vert(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
   1344                  w, params->sgr.s0, BITDEPTH_MAX);
   1345    rotate(A5_ptrs, B5_ptrs, 2);
   1346    sgr_box3_vert(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
   1347                  w, params->sgr.s1, BITDEPTH_MAX);
   1348    rotate(A3_ptrs, B3_ptrs, 4);
   1349 
   1350    goto output_1;
   1351 }
   1352 
   1353 #if HAVE_ASM
   1354 #if ARCH_AARCH64 || ARCH_ARM
   1355 #include "src/arm/looprestoration.h"
   1356 #elif ARCH_LOONGARCH64
   1357 #include "src/loongarch/looprestoration.h"
   1358 #elif ARCH_PPC64LE
   1359 #include "src/ppc/looprestoration.h"
   1360 #elif ARCH_X86
   1361 #include "src/x86/looprestoration.h"
   1362 #endif
   1363 #endif
   1364 
   1365 COLD void bitfn(dav1d_loop_restoration_dsp_init)(Dav1dLoopRestorationDSPContext *const c,
   1366                                                 const int bpc)
   1367 {
           // Populate the loop-restoration function table in c with the C
           // implementations, then give the architecture-specific init (if any
           // was compiled in) a chance to run on the same table.
           //
           //   c   - DSP context whose wiener[] and sgr[] slots are assigned.
           //   bpc - only forwarded to the architecture-specific init below.
           //
           // Both wiener slots share the same C function.
   1368    c->wiener[0] = c->wiener[1] = wiener_c;
           // sgr[0] = 5x5, sgr[1] = 3x3, sgr[2] = mixed 5x5 + 3x3.
   1369    c->sgr[0] = sgr_5x5_c;
   1370    c->sgr[1] = sgr_3x3_c;
   1371    c->sgr[2] = sgr_mix_c;
   1372 
           // Must stay after the assignments above: the arch init receives the
           // already-filled table, presumably to replace entries with optimized
           // versions (those headers are not visible here).
   1373 #if HAVE_ASM
   1374 #if ARCH_AARCH64 || ARCH_ARM
   1375    loop_restoration_dsp_init_arm(c, bpc);
   1376 #elif ARCH_LOONGARCH64
   1377    loop_restoration_dsp_init_loongarch(c, bpc);
   1378 #elif ARCH_PPC64LE
   1379    loop_restoration_dsp_init_ppc(c, bpc);
   1380 #elif ARCH_X86
   1381    loop_restoration_dsp_init_x86(c, bpc);
   1382 #endif
   1383 #endif
   1384 }