tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

looprestoration.h (43694B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2018, Two Orioles, LLC
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/cpu.h"
     29 #include "src/looprestoration.h"
     30 
     31 #if ARCH_AARCH64
        // AArch64 build: the two Wiener filter variants are only declared here;
        // their definitions live outside this file. Both share one signature
        // (pixel buffer + stride, left-edge pixels, lpf rows, width/height,
        // filter parameters and edge flags), so no C row driver is needed in
        // this branch, unlike the #else branch below.
        // NOTE(review): "7"/"5" presumably refer to the filter tap count -
        // confirm against the implementation.
     32 void BF(dav1d_wiener_filter7, neon)(pixel *p, const ptrdiff_t stride,
     33                                    const pixel (*left)[4], const pixel *lpf,
     34                                    const int w, int h,
     35                                    const LooprestorationParams *const params,
     36                                    const enum LrEdgeFlags edges
     37                                    HIGHBD_DECL_SUFFIX);
     38 void BF(dav1d_wiener_filter5, neon)(pixel *p, const ptrdiff_t stride,
     39                                    const pixel (*left)[4], const pixel *lpf,
     40                                    const int w, int h,
     41                                    const LooprestorationParams *const params,
     42                                    const enum LrEdgeFlags edges
     43                                    HIGHBD_DECL_SUFFIX);
     44 #else
     45 
        // Non-AArch64 build: the Wiener filter is split into per-row kernels
        // (declared here, defined outside this file) that are sequenced by the
        // C driver wiener_filter_neon() further down.
        //
     46 // The 8bpc version calculates things slightly differently than the reference
     47 // C version. That version calculates roughly this:
     48 // int16_t sum = 0;
     49 // for (int i = 0; i < 7; i++)
     50 //     sum += src[idx] * fh[i];
     51 // int16_t sum2 = (src[x] << 7) - (1 << (bitdepth + 6)) + rounding_off_h;
     52 // sum = iclip(sum + sum2, INT16_MIN, INT16_MAX) >> round_bits_h;
     53 // sum += 1 << (bitdepth + 6 - round_bits_h);
     54 // Compared to the reference C version, this is the output of the first pass
     55 // _subtracted_ by 1 << (bitdepth + 6 - round_bits_h) = 2048, i.e.
     56 // with round_offset precompensated.
     57 // The 16bpc version calculates things pretty much the same way as the
     58 // reference C version, but with the end result subtracted by
     59 // 1 << (bitdepth + 6 - round_bits_h).
     60 void BF(dav1d_wiener_filter_h, neon)(int16_t *dst, const pixel (*left)[4],
     61                                     const pixel *src, const int16_t fh[8],
     62                                     const int w, const enum LrEdgeFlags edges
     63                                     HIGHBD_DECL_SUFFIX);
     64 // This calculates things slightly differently than the reference C version.
     65 // This version calculates roughly this:
     66 // int32_t sum = 0;
     67 // for (int i = 0; i < 7; i++)
     68 //     sum += mid[idx] * fv[i];
     69 // sum = (sum + rounding_off_v) >> round_bits_v;
     70 // This function assumes that the width is a multiple of 8.
     71 void BF(dav1d_wiener_filter_v, neon)(pixel *dst, int16_t **ptrs,
     72                                     const int16_t fv[8], const int w
     73                                     HIGHBD_DECL_SUFFIX);
     74 
        // Takes one source row (with its left-edge pixels), both filter rows
        // (filter[0] and filter[1], the same arrays the driver passes separately
        // as fh/fv to the two kernels above) and the row-pointer window `ptrs`,
        // and writes to `dst`.
        // NOTE(review): the driver sets ptrs[6] before each call and never calls
        // the _h kernel for these rows, so this presumably filters `src`
        // horizontally into ptrs[6] and then emits one vertically filtered
        // output row - confirm against the assembly.
     75 void BF(dav1d_wiener_filter_hv, neon)(pixel *dst, const pixel (*left)[4],
     76                                      const pixel *src,
     77                                      const int16_t filter[2][8],
     78                                      const int w, const enum LrEdgeFlags edges,
     79                                      int16_t **ptrs
     80                                      HIGHBD_DECL_SUFFIX);
     81 
        // Row-by-row driver for the Wiener filter (non-AArch64 build).
        //
        //   p, stride  - pixel buffer that is read as the source and overwritten
        //                with the filtered output
        //   left       - per-row left-edge pixels, advanced once per picture row
        //   lpf        - separately stored rows used above the unit when
        //                LR_HAVE_TOP is set; the rows used below the unit start
        //                6 strides further into the same buffer
        //   w, h       - width and number of rows to filter
        //   params     - supplies filter[0] (horizontal) and filter[1] (vertical)
        //   edges      - LR_HAVE_TOP / LR_HAVE_BOTTOM select the edge handling
        //                done here; the flags are also forwarded to the kernels
        //
        // Horizontally filtered rows are kept in `hor` (6 rows of 384 int16_t);
        // `ptrs` is the window of row pointers handed to the _v/_hv kernels.
        // Where a row is not available, the window references a neighbouring
        // row more than once.
        // NOTE(review): this function never rotates `ptrs` itself, and the
        // kernels take it as a non-const int16_t **, so they presumably advance
        // the window on every call - confirm against the assembly.
     82 static void wiener_filter_neon(pixel *p, const ptrdiff_t stride,
     83                               const pixel (*left)[4], const pixel *lpf,
     84                               const int w, int h,
     85                               const LooprestorationParams *const params,
     86                               const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
     87 {
     88    ALIGN_STK_16(int16_t, hor, 6 * 384,);
     89    int16_t *ptrs[7], *rows[6];
     90    for (int i = 0; i < 6; i++)
     91        rows[i] = &hor[i * 384];
     92    const int16_t (*const filter)[8] = params->filter;
     93    const int16_t *fh = params->filter[0];
     94    const int16_t *fv = params->filter[1];
     95    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
     96 
     97    const pixel *src = p;
     98    if (edges & LR_HAVE_TOP) {
               // Rows above the unit exist: two rows from lpf go to rows[0] and
               // rows[1] (rows[0] is referenced twice in the window), and the
               // first picture row goes to rows[2], which initially also fills
               // the remaining window slots.
     99        ptrs[0] = rows[0];
    100        ptrs[1] = rows[0];
    101        ptrs[2] = rows[1];
    102        ptrs[3] = rows[2];
    103        ptrs[4] = rows[2];
    104        ptrs[5] = rows[2];
    105 
    106        BF(dav1d_wiener_filter_h, neon)(rows[0], NULL, lpf, fh, w, edges
    107                                        HIGHBD_TAIL_SUFFIX);
    108        lpf += PXSTRIDE(stride);
    109        BF(dav1d_wiener_filter_h, neon)(rows[1], NULL, lpf, fh, w, edges
    110                                        HIGHBD_TAIL_SUFFIX);
    111 
    112        BF(dav1d_wiener_filter_h, neon)(rows[2], left, src, fh, w, edges
    113                                        HIGHBD_TAIL_SUFFIX);
    114        left++;
    115        src += PXSTRIDE(stride);
    116 
               // Each early exit jumps to the label that still emits as many
               // rows as have been read so far (v1: 1, v2: 2, v3: 3).
    117        if (--h <= 0)
    118            goto v1;
    119 
    120        ptrs[4] = ptrs[5] = rows[3];
    121        BF(dav1d_wiener_filter_h, neon)(rows[3], left, src, fh, w, edges
    122                                        HIGHBD_TAIL_SUFFIX);
    123        left++;
    124        src += PXSTRIDE(stride);
    125 
    126        if (--h <= 0)
    127            goto v2;
    128 
    129        ptrs[5] = rows[4];
    130        BF(dav1d_wiener_filter_h, neon)(rows[4], left, src, fh, w, edges
    131                                        HIGHBD_TAIL_SUFFIX);
    132        left++;
    133        src += PXSTRIDE(stride);
    134 
    135        if (--h <= 0)
    136            goto v3;
    137    } else {
               // No rows above the unit: the first picture row (rows[0]) fills
               // every window slot until further rows have been filtered.
    138        ptrs[0] = rows[0];
    139        ptrs[1] = rows[0];
    140        ptrs[2] = rows[0];
    141        ptrs[3] = rows[0];
    142        ptrs[4] = rows[0];
    143        ptrs[5] = rows[0];
    144 
    145        BF(dav1d_wiener_filter_h, neon)(rows[0], left, src, fh, w, edges
    146                                        HIGHBD_TAIL_SUFFIX);
    147        left++;
    148        src += PXSTRIDE(stride);
    149 
    150        if (--h <= 0)
    151            goto v1;
    152 
    153        ptrs[4] = ptrs[5] = rows[1];
    154        BF(dav1d_wiener_filter_h, neon)(rows[1], left, src, fh, w, edges
    155                                        HIGHBD_TAIL_SUFFIX);
    156        left++;
    157        src += PXSTRIDE(stride);
    158 
    159        if (--h <= 0)
    160            goto v2;
    161 
    162        ptrs[5] = rows[2];
    163        BF(dav1d_wiener_filter_h, neon)(rows[2], left, src, fh, w, edges
    164                                        HIGHBD_TAIL_SUFFIX);
    165        left++;
    166        src += PXSTRIDE(stride);
    167 
    168        if (--h <= 0)
    169            goto v3;
    170 
               // rows[3] and rows[4] have not been used yet in this branch;
               // they are supplied explicitly as the newest window slot for
               // the first two combined calls.
    171        ptrs[6] = rows[3];
    172        BF(dav1d_wiener_filter_hv, neon)(p, left, src, filter, w, edges, ptrs
    173                                         HIGHBD_TAIL_SUFFIX);
    174        left++;
    175        src += PXSTRIDE(stride);
    176        p += PXSTRIDE(stride);
    177 
    178        if (--h <= 0)
    179            goto v3;
    180 
    181        ptrs[6] = rows[4];
    182        BF(dav1d_wiener_filter_hv, neon)(p, left, src, filter, w, edges, ptrs
    183                                         HIGHBD_TAIL_SUFFIX);
    184        left++;
    185        src += PXSTRIDE(stride);
    186        p += PXSTRIDE(stride);
    187 
    188        if (--h <= 0)
    189            goto v3;
    190    }
    191 
           // The newest slot is the row that follows ptrs[5] in `hor` (rows are
           // 384 elements apart). It is set only once; the loop relies on the
           // kernel to keep the window up to date (see NOTE(review) above).
    192    ptrs[6] = ptrs[5] + 384;
    193    do {
    194        BF(dav1d_wiener_filter_hv, neon)(p, left, src, filter, w, edges, ptrs
    195                                         HIGHBD_TAIL_SUFFIX);
    196        left++;
    197        src += PXSTRIDE(stride);
    198        p += PXSTRIDE(stride);
    199    } while (--h > 0);
    200 
           // Without rows below the unit, the last three output rows are
           // produced by vertical-only passes.
    201    if (!(edges & LR_HAVE_BOTTOM))
    202        goto v3;
    203 
           // Rows below the unit exist: feed two rows from lpf_bottom through
           // the combined kernel (no left-edge pixels are passed for them),
           // then fall through to v1 for the final output row.
    204    BF(dav1d_wiener_filter_hv, neon)(p, NULL, lpf_bottom, filter, w, edges, ptrs
    205                                     HIGHBD_TAIL_SUFFIX);
    206    lpf_bottom += PXSTRIDE(stride);
    207    p += PXSTRIDE(stride);
    208 
    209    BF(dav1d_wiener_filter_hv, neon)(p, NULL, lpf_bottom, filter, w, edges, ptrs
    210                                     HIGHBD_TAIL_SUFFIX);
    211    p += PXSTRIDE(stride);
           // v1: one vertical-only pass, then done.
    212 v1:
    213    BF(dav1d_wiener_filter_v, neon)(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    214 
    215    return;
    216 
           // v3: three vertical-only passes in total (this one, v2's, v1's).
    217 v3:
    218    BF(dav1d_wiener_filter_v, neon)(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    219    p += PXSTRIDE(stride);
           // v2: two vertical-only passes in total (this one and v1's).
    220 v2:
    221    BF(dav1d_wiener_filter_v, neon)(p, ptrs, fv, w HIGHBD_TAIL_SUFFIX);
    222    p += PXSTRIDE(stride);
    223    goto v1;
    224 }
    225 #endif
    226 
    227 static void rotate_neon(int32_t **sumsq_ptrs, int16_t **sum_ptrs, int n) {
    228    int32_t *tmp32 = sumsq_ptrs[0];
    229    int16_t *tmp16 = sum_ptrs[0];
    230    for (int i = 0; i < n - 1; i++) {
    231        sumsq_ptrs[i] = sumsq_ptrs[i + 1];
    232        sum_ptrs[i] = sum_ptrs[i + 1];
    233    }
    234    sumsq_ptrs[n - 1] = tmp32;
    235    sum_ptrs[n - 1] = tmp16;
    236 }
    237 static void rotate5_x2_neon(int32_t **sumsq_ptrs, int16_t **sum_ptrs) {
    238    int32_t *tmp32[2];
    239    int16_t *tmp16[2];
    240    for (int i = 0; i < 2; i++) {
    241        tmp32[i] = sumsq_ptrs[i];
    242        tmp16[i] = sum_ptrs[i];
    243    }
    244    for (int i = 0; i < 3; i++) {
    245        sumsq_ptrs[i] = sumsq_ptrs[i + 2];
    246        sum_ptrs[i] = sum_ptrs[i + 2];
    247    }
    248    for (int i = 0; i < 2; i++) {
    249        sumsq_ptrs[3 + i] = tmp32[i];
    250        sum_ptrs[3 + i] = tmp16[i];
    251    }
    252 }
    253 
        // Kernels used by the sgr_* C helpers in this file. They are only
        // declared here; the definitions live outside this file.
        //
        // Horizontal box-sum kernels: each call takes one source row (plus its
        // left-edge pixels and the edge flags) and writes one row of `sumsq`
        // (int32_t) and `sum` (int16_t). The box35 variant writes separate
        // 3- and 5-row-filter outputs from the same source row in one call.
    254 void BF(dav1d_sgr_box3_row_h, neon)(int32_t *sumsq, int16_t *sum,
    255                                    const pixel (*left)[4],
    256                                    const pixel *src, const int w,
    257                                    const enum LrEdgeFlags edges);
    258 void BF(dav1d_sgr_box5_row_h, neon)(int32_t *sumsq, int16_t *sum,
    259                                    const pixel (*left)[4],
    260                                    const pixel *src, const int w,
    261                                    const enum LrEdgeFlags edges);
    262 void BF(dav1d_sgr_box35_row_h, neon)(int32_t *sumsq3, int16_t *sum3,
    263                                     int32_t *sumsq5, int16_t *sum5,
    264                                     const pixel (*left)[4],
    265                                     const pixel *src, const int w,
    266                                     const enum LrEdgeFlags edges);
    267 
        // ARCH_ARM build: the vertical sum (row_v) and the calc_row_ab step are
        // separate kernels, called back to back by sgr_box3_vert_neon() and
        // sgr_box5_vert_neon(); finishing is likewise split into a filter step
        // writing an int16_t temporary row and a weighting step writing pixels.
    268 #if ARCH_ARM
    269 void dav1d_sgr_box3_row_v_neon(int32_t **sumsq, int16_t **sum,
    270                               int32_t *sumsq_out, int16_t *sum_out,
    271                               const int w);
    272 void dav1d_sgr_box5_row_v_neon(int32_t **sumsq, int16_t **sum,
    273                               int32_t *sumsq_out, int16_t *sum_out,
    274                               const int w);
    275 void dav1d_sgr_calc_row_ab1_neon(int32_t *AA, int16_t *BB, int w, int s,
    276                                 int bitdepth_max);
    277 void dav1d_sgr_calc_row_ab2_neon(int32_t *AA, int16_t *BB, int w, int s,
    278                                 int bitdepth_max);
    279 void BF(dav1d_sgr_finish_filter_row1, neon)(int16_t *tmp, const pixel *src,
    280                                            int32_t **A_ptrs, int16_t **B_ptrs,
    281                                            const int w);
    282 void BF(dav1d_sgr_weighted_row1, neon)(pixel *dst, const int16_t *t1,
    283                                       const int w, const int wt
    284                                       HIGHBD_DECL_SUFFIX);
    285 #else
        // Other builds: one kernel per box size replaces each row_v + calc_row_ab
        // pair (the C wrappers describe them as "box3_v + calc_ab1" and
        // "box5_v + calc_ab2"), and the finish_weighted kernels write pixels
        // directly without a caller-provided temporary.
    286 void dav1d_sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
    287                              int32_t *AA, int16_t *BB,
    288                              const int w, const int s,
    289                              const int bitdepth_max);
    290 void dav1d_sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
    291                              int32_t *AA, int16_t *BB,
    292                              const int w, const int s,
    293                              const int bitdepth_max);
    294 
    295 void BF(dav1d_sgr_finish_weighted1, neon)(pixel *dst,
    296                                          int32_t **A_ptrs, int16_t **B_ptrs,
    297                                          const int w, const int w1
    298                                          HIGHBD_DECL_SUFFIX);
    299 void BF(dav1d_sgr_finish_weighted2, neon)(pixel *dst, const ptrdiff_t stride,
    300                                          int32_t **A_ptrs, int16_t **B_ptrs,
    301                                          const int w, const int h,
    302                                          const int w1 HIGHBD_DECL_SUFFIX);
    303 
    304 void BF(dav1d_sgr_finish_filter1_2rows, neon)(int16_t *tmp, const pixel *src,
    305                                              const ptrdiff_t src_stride,
    306                                              int32_t **A_ptrs,
    307                                              int16_t **B_ptrs,
    308                                              const int w, const int h);
    309 #endif
        // Declared for all builds: a two-row filter step writing to an int16_t
        // temporary, and a two-row weighting step that combines two temporaries
        // (t1, t2) with the weight pair wt[2].
    310 void BF(dav1d_sgr_finish_filter2_2rows, neon)(int16_t *tmp, const pixel *src,
    311                                              const ptrdiff_t src_stride,
    312                                              int32_t **A_ptrs, int16_t **B_ptrs,
    313                                              const int w, const int h);
    314 void BF(dav1d_sgr_weighted2, neon)(pixel *dst, const ptrdiff_t dst_stride,
    315                                   const int16_t *t1, const int16_t *t2,
    316                                   const int w, const int h,
    317                                   const int16_t wt[2] HIGHBD_DECL_SUFFIX);
    318 
    319 static void sgr_box3_vert_neon(int32_t **sumsq, int16_t **sum,
    320                               int32_t *sumsq_out, int16_t *sum_out,
    321                               const int w, const int s, const int bitdepth_max) {
    322 #if ARCH_ARM
    323    dav1d_sgr_box3_row_v_neon(sumsq, sum, sumsq_out, sum_out, w);
    324    dav1d_sgr_calc_row_ab1_neon(sumsq_out, sum_out, w, s, bitdepth_max);
    325 #else
    326    // box3_v + calc_ab1
    327    dav1d_sgr_box3_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
    328 #endif
    329    rotate_neon(sumsq, sum, 3);
    330 }
    331 
    332 static void sgr_box5_vert_neon(int32_t **sumsq, int16_t **sum,
    333                               int32_t *sumsq_out, int16_t *sum_out,
    334                               const int w, const int s, const int bitdepth_max) {
    335 #if ARCH_ARM
    336    dav1d_sgr_box5_row_v_neon(sumsq, sum, sumsq_out, sum_out, w);
    337    dav1d_sgr_calc_row_ab2_neon(sumsq_out, sum_out, w, s, bitdepth_max);
    338 #else
    339    // box5_v + calc_ab2
    340    dav1d_sgr_box5_vert_neon(sumsq, sum, sumsq_out, sum_out, w, s, bitdepth_max);
    341 #endif
    342    rotate5_x2_neon(sumsq, sum);
    343 }
    344 
    345 static void sgr_box3_hv_neon(int32_t **sumsq, int16_t **sum,
    346                             int32_t *AA, int16_t *BB,
    347                             const pixel (*left)[4],
    348                             const pixel *src, const int w,
    349                             const int s,
    350                             const enum LrEdgeFlags edges,
    351                             const int bitdepth_max) {
    352    BF(dav1d_sgr_box3_row_h, neon)(sumsq[2], sum[2], left, src, w, edges);
    353    sgr_box3_vert_neon(sumsq, sum, AA, BB, w, s, bitdepth_max);
    354 }
    355 
    356 
    357 static void sgr_finish1_neon(pixel **dst, const ptrdiff_t stride,
    358                             int32_t **A_ptrs, int16_t **B_ptrs, const int w,
    359                             const int w1 HIGHBD_DECL_SUFFIX) {
    360 #if ARCH_ARM
    361    ALIGN_STK_16(int16_t, tmp, 384,);
    362 
    363    BF(dav1d_sgr_finish_filter_row1, neon)(tmp, *dst, A_ptrs, B_ptrs, w);
    364    BF(dav1d_sgr_weighted_row1, neon)(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
    365 #else
    366    BF(dav1d_sgr_finish_weighted1, neon)(*dst, A_ptrs, B_ptrs,
    367                                         w, w1 HIGHBD_TAIL_SUFFIX);
    368 #endif
    369    *dst += PXSTRIDE(stride);
    370    rotate_neon(A_ptrs, B_ptrs, 3);
    371 }
    372 
    373 #define ARM_FILTER_OUT_STRIDE 384
    374 
    375 static void sgr_finish2_neon(pixel **dst, const ptrdiff_t stride,
    376                             int32_t **A_ptrs, int16_t **B_ptrs,
    377                             const int w, const int h, const int w1
    378                             HIGHBD_DECL_SUFFIX) {
    379 #if ARCH_ARM
    380    ALIGN_STK_16(int16_t, tmp, 2*ARM_FILTER_OUT_STRIDE,);
    381 
    382    BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp, *dst, stride, A_ptrs, B_ptrs, w, h);
    383    BF(dav1d_sgr_weighted_row1, neon)(*dst, tmp, w, w1 HIGHBD_TAIL_SUFFIX);
    384    *dst += PXSTRIDE(stride);
    385    if (h > 1) {
    386        BF(dav1d_sgr_weighted_row1, neon)(*dst, tmp + FILTER_OUT_STRIDE, w, w1 HIGHBD_TAIL_SUFFIX);
    387        *dst += PXSTRIDE(stride);
    388    }
    389 #else
    390    BF(dav1d_sgr_finish_weighted2, neon)(*dst, stride, A_ptrs, B_ptrs,
    391                                         w, h, w1 HIGHBD_TAIL_SUFFIX);
    392    *dst += 2*PXSTRIDE(stride);
    393 #endif
    394    rotate_neon(A_ptrs, B_ptrs, 2);
    395 }
    396 
    397 static void sgr_finish_mix_neon(pixel **dst, const ptrdiff_t stride,
    398                                int32_t **A5_ptrs, int16_t **B5_ptrs,
    399                                int32_t **A3_ptrs, int16_t **B3_ptrs,
    400                                const int w, const int h,
    401                                const int w0, const int w1 HIGHBD_DECL_SUFFIX) {
    402    ALIGN_STK_16(int16_t, tmp5, 2*ARM_FILTER_OUT_STRIDE,);
    403    ALIGN_STK_16(int16_t, tmp3, 2*ARM_FILTER_OUT_STRIDE,);
    404 
    405    BF(dav1d_sgr_finish_filter2_2rows, neon)(tmp5, *dst, stride,
    406                                             A5_ptrs, B5_ptrs, w, h);
    407 #if ARCH_ARM
    408    BF(dav1d_sgr_finish_filter_row1, neon)(tmp3, *dst, A3_ptrs, B3_ptrs, w);
    409    BF(dav1d_sgr_finish_filter_row1, neon)(tmp3 + FILTER_OUT_STRIDE,
    410                                           *dst + PXSTRIDE(stride),
    411                                           &A3_ptrs[1], &B3_ptrs[1], w);
    412 #else
    413    BF(dav1d_sgr_finish_filter1_2rows, neon)(tmp3, *dst, stride,
    414                                             A3_ptrs, B3_ptrs, w, h);
    415 #endif
    416    const int16_t wt[2] = { w0, w1 };
    417    BF(dav1d_sgr_weighted2, neon)(*dst, stride,
    418                                  tmp5, tmp3, w, h, wt HIGHBD_TAIL_SUFFIX);
    419    *dst += h*PXSTRIDE(stride);
    420    rotate_neon(A5_ptrs, B5_ptrs, 2);
    421    rotate_neon(A3_ptrs, B3_ptrs, 4);
    422 }
    423 
    424 
        // Row-by-row driver for the 3x3 self-guided filter.
        //
        //   dst, stride - pixel buffer that is read as the source and overwritten
        //                 with the filtered output
        //   left        - per-row left-edge pixels, advanced once per picture row
        //   lpf         - separately stored rows used above the unit when
        //                 LR_HAVE_TOP is set; the rows used below the unit start
        //                 6 strides further into the same buffer
        //   w, h        - width and number of rows to filter
        //   params      - supplies sgr.s1 (strength) and sgr.w1 (weight)
        //   edges       - LR_HAVE_TOP / LR_HAVE_BOTTOM select the edge handling
        //                 done here; the flags are also forwarded to the kernels
        //
        // Two sets of 3-entry rings are maintained: sumsq_ptrs/sum_ptrs hold
        // the horizontal box sums of the most recent rows (rotated inside
        // sgr_box3_vert_neon()), and A_ptrs/B_ptrs hold the per-row results
        // consumed by sgr_finish1_neon() (rotated there, or explicitly here
        // while the rings are still being primed).
    425 static void sgr_filter_3x3_neon(pixel *dst, const ptrdiff_t stride,
    426                                const pixel (*left)[4], const pixel *lpf,
    427                                const int w, int h,
    428                                const LooprestorationParams *const params,
    429                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
    430 {
           // Being a macro, this stays defined after the function and is reused
           // by sgr_filter_5x5_neon() below.
    431 #define ARM_BUF_STRIDE (384 + 16)
    432    ALIGN_STK_16(int32_t, sumsq_buf, ARM_BUF_STRIDE * 3 + 16,);
    433    ALIGN_STK_16(int16_t, sum_buf, ARM_BUF_STRIDE * 3 + 16,);
    434    int32_t *sumsq_ptrs[3], *sumsq_rows[3];
    435    int16_t *sum_ptrs[3], *sum_rows[3];
    436    for (int i = 0; i < 3; i++) {
    437        sumsq_rows[i] = &sumsq_buf[i * ARM_BUF_STRIDE];
    438        sum_rows[i] = &sum_buf[i * ARM_BUF_STRIDE];
    439    }
    440 
    441    ALIGN_STK_16(int32_t, A_buf, ARM_BUF_STRIDE * 3 + 16,);
    442    ALIGN_STK_16(int16_t, B_buf, ARM_BUF_STRIDE * 3 + 16,);
    443    int32_t *A_ptrs[3];
    444    int16_t *B_ptrs[3];
    445    for (int i = 0; i < 3; i++) {
    446        A_ptrs[i] = &A_buf[i * ARM_BUF_STRIDE];
    447        B_ptrs[i] = &B_buf[i * ARM_BUF_STRIDE];
    448    }
    449    const pixel *src = dst;
    450    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
    451 
    452    if (edges & LR_HAVE_TOP) {
               // Rows above the unit exist: the ring starts out with three
               // distinct rows; the two lpf rows fill slots 0 and 1.
    453        sumsq_ptrs[0] = sumsq_rows[0];
    454        sumsq_ptrs[1] = sumsq_rows[1];
    455        sumsq_ptrs[2] = sumsq_rows[2];
    456        sum_ptrs[0] = sum_rows[0];
    457        sum_ptrs[1] = sum_rows[1];
    458        sum_ptrs[2] = sum_rows[2];
    459 
    460        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
    461                                       NULL, lpf, w, edges);
    462        lpf += PXSTRIDE(stride);
    463        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[1], sum_rows[1],
    464                                       NULL, lpf, w, edges);
    465 
               // First picture row: horizontal sums into slot 2, row result
               // into A_ptrs[2]/B_ptrs[2]. No output yet, so the A/B rings are
               // rotated here rather than by sgr_finish1_neon().
    466        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    467                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
    468        left++;
    469        src += PXSTRIDE(stride);
    470        rotate_neon(A_ptrs, B_ptrs, 3);
    471 
    472        if (--h <= 0)
    473            goto vert_1;
    474 
    475        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    476                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
    477        left++;
    478        src += PXSTRIDE(stride);
    479        rotate_neon(A_ptrs, B_ptrs, 3);
    480 
    481        if (--h <= 0)
    482            goto vert_2;
    483    } else {
               // No rows above the unit: all three ring slots reference the
               // first picture row until further rows have been summed.
    484        sumsq_ptrs[0] = sumsq_rows[0];
    485        sumsq_ptrs[1] = sumsq_rows[0];
    486        sumsq_ptrs[2] = sumsq_rows[0];
    487        sum_ptrs[0] = sum_rows[0];
    488        sum_ptrs[1] = sum_rows[0];
    489        sum_ptrs[2] = sum_rows[0];
    490 
    491        BF(dav1d_sgr_box3_row_h, neon)(sumsq_rows[0], sum_rows[0],
    492                                       left, src, w, edges);
    493        left++;
    494        src += PXSTRIDE(stride);
    495 
    496        sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    497                           w, params->sgr.s1, BITDEPTH_MAX);
    498        rotate_neon(A_ptrs, B_ptrs, 3);
    499 
    500        if (--h <= 0)
    501            goto vert_1;
    502 
               // Point the newest slot at a fresh row buffer before each of
               // the next rows is summed into it.
    503        sumsq_ptrs[2] = sumsq_rows[1];
    504        sum_ptrs[2] = sum_rows[1];
    505 
    506        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    507                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
    508        left++;
    509        src += PXSTRIDE(stride);
    510        rotate_neon(A_ptrs, B_ptrs, 3);
    511 
    512        if (--h <= 0)
    513            goto vert_2;
    514 
    515        sumsq_ptrs[2] = sumsq_rows[2];
    516        sum_ptrs[2] = sum_rows[2];
    517    }
    518 
           // Steady state: each iteration reads one new picture row and writes
           // one output row (sgr_finish1_neon() advances dst and rotates the
           // A/B rings).
    519    do {
    520        sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    521                         left, src, w, params->sgr.s1, edges, BITDEPTH_MAX);
    522        left++;
    523        src += PXSTRIDE(stride);
    524 
    525        sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
    526                         w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    527    } while (--h > 0);
    528 
           // Without rows below the unit, the last two output rows are
           // produced at vert_2 from the rows already in the ring.
    529    if (!(edges & LR_HAVE_BOTTOM))
    530        goto vert_2;
    531 
           // Rows below the unit exist: feed the two lpf_bottom rows through
           // the same hv + finish sequence (no left-edge pixels are passed
           // for them) to produce the last two output rows.
    532    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    533                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    534    lpf_bottom += PXSTRIDE(stride);
    535 
    536    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
    537                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    538 
    539    sgr_box3_hv_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    540                     NULL, lpf_bottom, w, params->sgr.s1, edges, BITDEPTH_MAX);
    541 
    542    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
    543                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    544    return;
    545 
           // vert_2: two output rows remain and no new source row is available.
           // The newest ring slot is aliased to the previous row before the
           // vertical-only step; one row is written here, the other at
           // output_1.
    546 vert_2:
    547    sumsq_ptrs[2] = sumsq_ptrs[1];
    548    sum_ptrs[2] = sum_ptrs[1];
    549    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    550                       w, params->sgr.s1, BITDEPTH_MAX);
    551 
    552    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
    553                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    554 
           // output_1: same aliasing and vertical-only step, writing the final
           // output row.
    555 output_1:
    556    sumsq_ptrs[2] = sumsq_ptrs[1];
    557    sum_ptrs[2] = sum_ptrs[1];
    558    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    559                       w, params->sgr.s1, BITDEPTH_MAX);
    560 
    561    sgr_finish1_neon(&dst, stride, A_ptrs, B_ptrs,
    562                     w, params->sgr.w1 HIGHBD_TAIL_SUFFIX);
    563    return;
    564 
           // vert_1: reached when only one picture row was read. One more
           // vertical-only step is run without writing output (the A/B rings
           // are rotated by hand), then output_1 writes the single output row.
    565 vert_1:
    566    sumsq_ptrs[2] = sumsq_ptrs[1];
    567    sum_ptrs[2] = sum_ptrs[1];
    568    sgr_box3_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[2], B_ptrs[2],
    569                       w, params->sgr.s1, BITDEPTH_MAX);
    570    rotate_neon(A_ptrs, B_ptrs, 3);
    571    goto output_1;
    572 }
    573 
    574 static void sgr_filter_5x5_neon(pixel *dst, const ptrdiff_t stride,
    575                                const pixel (*left)[4], const pixel *lpf,
    576                                const int w, int h,
    577                                const LooprestorationParams *const params,
    578                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
    579 {
    580    ALIGN_STK_16(int32_t, sumsq_buf, ARM_BUF_STRIDE * 5 + 16,);
    581    ALIGN_STK_16(int16_t, sum_buf, ARM_BUF_STRIDE * 5 + 16,);
    582    int32_t *sumsq_ptrs[5], *sumsq_rows[5];
    583    int16_t *sum_ptrs[5], *sum_rows[5];
    584    for (int i = 0; i < 5; i++) {
    585        sumsq_rows[i] = &sumsq_buf[i * ARM_BUF_STRIDE];
    586        sum_rows[i] = &sum_buf[i * ARM_BUF_STRIDE];
    587    }
    588 
    589    ALIGN_STK_16(int32_t, A_buf, ARM_BUF_STRIDE * 2 + 16,);
    590    ALIGN_STK_16(int16_t, B_buf, ARM_BUF_STRIDE * 2 + 16,);
    591    int32_t *A_ptrs[2];
    592    int16_t *B_ptrs[2];
    593    for (int i = 0; i < 2; i++) {
    594        A_ptrs[i] = &A_buf[i * ARM_BUF_STRIDE];
    595        B_ptrs[i] = &B_buf[i * ARM_BUF_STRIDE];
    596    }
    597    const pixel *src = dst;
    598    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
    599 
    600    if (edges & LR_HAVE_TOP) {
    601        sumsq_ptrs[0] = sumsq_rows[0];
    602        sumsq_ptrs[1] = sumsq_rows[0];
    603        sumsq_ptrs[2] = sumsq_rows[1];
    604        sumsq_ptrs[3] = sumsq_rows[2];
    605        sumsq_ptrs[4] = sumsq_rows[3];
    606        sum_ptrs[0] = sum_rows[0];
    607        sum_ptrs[1] = sum_rows[0];
    608        sum_ptrs[2] = sum_rows[1];
    609        sum_ptrs[3] = sum_rows[2];
    610        sum_ptrs[4] = sum_rows[3];
    611 
    612        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
    613                                       NULL, lpf, w, edges);
    614        lpf += PXSTRIDE(stride);
    615        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
    616                                       NULL, lpf, w, edges);
    617 
    618        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
    619                                       left, src, w, edges);
    620        left++;
    621        src += PXSTRIDE(stride);
    622 
    623        if (--h <= 0)
    624            goto vert_1;
    625 
    626        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
    627                                       left, src, w, edges);
    628        left++;
    629        src += PXSTRIDE(stride);
    630        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    631                           w, params->sgr.s0, BITDEPTH_MAX);
    632        rotate_neon(A_ptrs, B_ptrs, 2);
    633 
    634        if (--h <= 0)
    635            goto vert_2;
    636 
    637        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
    638        // one of them to point at the previously unused rows[4].
    639        sumsq_ptrs[3] = sumsq_rows[4];
    640        sum_ptrs[3] = sum_rows[4];
    641    } else {
    642        sumsq_ptrs[0] = sumsq_rows[0];
    643        sumsq_ptrs[1] = sumsq_rows[0];
    644        sumsq_ptrs[2] = sumsq_rows[0];
    645        sumsq_ptrs[3] = sumsq_rows[0];
    646        sumsq_ptrs[4] = sumsq_rows[0];
    647        sum_ptrs[0] = sum_rows[0];
    648        sum_ptrs[1] = sum_rows[0];
    649        sum_ptrs[2] = sum_rows[0];
    650        sum_ptrs[3] = sum_rows[0];
    651        sum_ptrs[4] = sum_rows[0];
    652 
    653        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[0], sum_rows[0],
    654                                       left, src, w, edges);
    655        left++;
    656        src += PXSTRIDE(stride);
    657 
    658        if (--h <= 0)
    659            goto vert_1;
    660 
    661        sumsq_ptrs[4] = sumsq_rows[1];
    662        sum_ptrs[4] = sum_rows[1];
    663 
    664        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[1], sum_rows[1],
    665                                       left, src, w, edges);
    666        left++;
    667        src += PXSTRIDE(stride);
    668 
    669        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    670                           w, params->sgr.s0, BITDEPTH_MAX);
    671        rotate_neon(A_ptrs, B_ptrs, 2);
    672 
    673        if (--h <= 0)
    674            goto vert_2;
    675 
    676        sumsq_ptrs[3] = sumsq_rows[2];
    677        sumsq_ptrs[4] = sumsq_rows[3];
    678        sum_ptrs[3] = sum_rows[2];
    679        sum_ptrs[4] = sum_rows[3];
    680 
    681        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[2], sum_rows[2],
    682                                       left, src, w, edges);
    683        left++;
    684        src += PXSTRIDE(stride);
    685 
    686        if (--h <= 0)
    687            goto odd;
    688 
    689        BF(dav1d_sgr_box5_row_h, neon)(sumsq_rows[3], sum_rows[3],
    690                                       left, src, w, edges);
    691        left++;
    692        src += PXSTRIDE(stride);
    693 
    694        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    695                           w, params->sgr.s0, BITDEPTH_MAX);
    696        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
    697                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    698 
    699        if (--h <= 0)
    700            goto vert_2;
    701 
    702        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
    703        // one of them to point at the previously unused rows[4].
    704        sumsq_ptrs[3] = sumsq_rows[4];
    705        sum_ptrs[3] = sum_rows[4];
    706    }
    707 
    708    do {
    709        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
    710                                       left, src, w, edges);
    711        left++;
    712        src += PXSTRIDE(stride);
    713 
    714        if (--h <= 0)
    715            goto odd;
    716 
    717        BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
    718                                       left, src, w, edges);
    719        left++;
    720        src += PXSTRIDE(stride);
    721 
    722        sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    723                           w, params->sgr.s0, BITDEPTH_MAX);
    724        sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
    725                         w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    726    } while (--h > 0);
    727 
    728    if (!(edges & LR_HAVE_BOTTOM))
    729        goto vert_2;
    730 
    731    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[3], sum_ptrs[3],
    732                                   NULL, lpf_bottom, w, edges);
    733    lpf_bottom += PXSTRIDE(stride);
    734    BF(dav1d_sgr_box5_row_h, neon)(sumsq_ptrs[4], sum_ptrs[4],
    735                                   NULL, lpf_bottom, w, edges);
    736 
    737 output_2:
    738    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    739                       w, params->sgr.s0, BITDEPTH_MAX);
    740    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
    741                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    742    return;
    743 
    744 vert_2:
    745    // Duplicate the last row twice more
    746    sumsq_ptrs[3] = sumsq_ptrs[2];
    747    sumsq_ptrs[4] = sumsq_ptrs[2];
    748    sum_ptrs[3] = sum_ptrs[2];
    749    sum_ptrs[4] = sum_ptrs[2];
    750    goto output_2;
    751 
    752 odd:
    753    // Copy the last row as padding once
    754    sumsq_ptrs[4] = sumsq_ptrs[3];
    755    sum_ptrs[4] = sum_ptrs[3];
    756 
    757    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    758                       w, params->sgr.s0, BITDEPTH_MAX);
    759    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
    760                     w, 2, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    761 
    762 output_1:
    763    // Duplicate the last row twice more
    764    sumsq_ptrs[3] = sumsq_ptrs[2];
    765    sumsq_ptrs[4] = sumsq_ptrs[2];
    766    sum_ptrs[3] = sum_ptrs[2];
    767    sum_ptrs[4] = sum_ptrs[2];
    768 
    769    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    770                       w, params->sgr.s0, BITDEPTH_MAX);
    771    // Output only one row
    772    sgr_finish2_neon(&dst, stride, A_ptrs, B_ptrs,
    773                     w, 1, params->sgr.w0 HIGHBD_TAIL_SUFFIX);
    774    return;
    775 
    776 vert_1:
    777    // Copy the last row as padding once
    778    sumsq_ptrs[4] = sumsq_ptrs[3];
    779    sum_ptrs[4] = sum_ptrs[3];
    780 
    781    sgr_box5_vert_neon(sumsq_ptrs, sum_ptrs, A_ptrs[1], B_ptrs[1],
    782                       w, params->sgr.s0, BITDEPTH_MAX);
    783    rotate_neon(A_ptrs, B_ptrs, 2);
    784 
    785    goto output_1;
    786 }
    787 
     // NEON driver for the SGR "mix" filter: runs a box5 and a box3 pipeline
     // side by side over the rows of the unit and hands both sets of A/B rows,
     // plus params->sgr.w0 and params->sgr.w1, to sgr_finish_mix_neon().
     //
     //   dst     rows are read starting here (src is initialised to dst) and
     //           &dst is what sgr_finish_mix_neon() receives; presumably the
     //           result is written back in place - helper is outside this view.
     //   stride  row pitch, always applied through PXSTRIDE(stride).
     //   left    advanced by one entry per row read from src; NULL is passed
     //           instead for rows that come from lpf.
     //   lpf     edge rows: two rows at lpf are used when LR_HAVE_TOP is set,
     //           two rows at lpf + 6 * PXSTRIDE(stride) when LR_HAVE_BOTTOM is.
     //   w       width forwarded unchanged to every helper.
     //   h       rows still to read from src; decremented once per src row.
     //   params  sgr.s0 goes to the box5 vertical pass, sgr.s1 to the box3
     //           vertical pass, sgr.w0 / sgr.w1 to the finish step.
     //   edges   LR_HAVE_TOP / LR_HAVE_BOTTOM are tested here; the full flag
     //           set is also forwarded to dav1d_sgr_box35_row_h.
     //
     // NOTE(review): sgr_box3_vert_neon(), sgr_box5_vert_neon(),
     // sgr_finish_mix_neon() and rotate_neon() are defined outside this chunk.
     // The existing "ptrs are rotated by 2" comments below imply that the
     // vertical helpers rotate the sum pointer windows as a side effect, which
     // is why statement order in this function must not be changed - confirm
     // against the helper definitions.
     788 static void sgr_filter_mix_neon(pixel *dst, const ptrdiff_t stride,
     789                                const pixel (*left)[4], const pixel *lpf,
     790                                const int w, int h,
     791                                const LooprestorationParams *const params,
     792                                const enum LrEdgeFlags edges HIGHBD_DECL_SUFFIX)
     793 {
            // Row storage for the row_h results: five rows for box5, three for
            // box3, ARM_BUF_STRIDE elements apart. *_rows[] is the fixed mapping
            // onto that storage; *_ptrs[] is the working window that gets
            // reassigned below and passed to the vertical helpers.
     794    ALIGN_STK_16(int32_t, sumsq5_buf, ARM_BUF_STRIDE * 5 + 16,);
     795    ALIGN_STK_16(int16_t, sum5_buf, ARM_BUF_STRIDE * 5 + 16,);
     796    int32_t *sumsq5_ptrs[5], *sumsq5_rows[5];
     797    int16_t *sum5_ptrs[5], *sum5_rows[5];
     798    for (int i = 0; i < 5; i++) {
     799        sumsq5_rows[i] = &sumsq5_buf[i * ARM_BUF_STRIDE];
     800        sum5_rows[i] = &sum5_buf[i * ARM_BUF_STRIDE];
     801    }
     802    ALIGN_STK_16(int32_t, sumsq3_buf, ARM_BUF_STRIDE * 3 + 16,);
     803    ALIGN_STK_16(int16_t, sum3_buf, ARM_BUF_STRIDE * 3 + 16,);
     804    int32_t *sumsq3_ptrs[3], *sumsq3_rows[3];
     805    int16_t *sum3_ptrs[3], *sum3_rows[3];
     806    for (int i = 0; i < 3; i++) {
     807        sumsq3_rows[i] = &sumsq3_buf[i * ARM_BUF_STRIDE];
     808        sum3_rows[i] = &sum3_buf[i * ARM_BUF_STRIDE];
     809    }
     810 
            // A/B rows handed to the vertical helpers and to the finish step:
            // two rows for box5, four rows for box3.
     811    ALIGN_STK_16(int32_t, A5_buf, ARM_BUF_STRIDE * 2 + 16,);
     812    ALIGN_STK_16(int16_t, B5_buf, ARM_BUF_STRIDE * 2 + 16,);
     813    int32_t *A5_ptrs[2];
     814    int16_t *B5_ptrs[2];
     815    for (int i = 0; i < 2; i++) {
     816        A5_ptrs[i] = &A5_buf[i * ARM_BUF_STRIDE];
     817        B5_ptrs[i] = &B5_buf[i * ARM_BUF_STRIDE];
     818    }
     819    ALIGN_STK_16(int32_t, A3_buf, ARM_BUF_STRIDE * 4 + 16,);
     820    ALIGN_STK_16(int16_t, B3_buf, ARM_BUF_STRIDE * 4 + 16,);
     821    int32_t *A3_ptrs[4];
     822    int16_t *B3_ptrs[4];
     823    for (int i = 0; i < 4; i++) {
     824        A3_ptrs[i] = &A3_buf[i * ARM_BUF_STRIDE];
     825        B3_ptrs[i] = &B3_buf[i * ARM_BUF_STRIDE];
     826    }
            // Input rows are read from dst itself; the bottom edge rows start
            // six rows into lpf.
     827    const pixel *src = dst;
     828    const pixel *lpf_bottom = lpf + 6*PXSTRIDE(stride);
     829 
     830    if (edges & LR_HAVE_TOP) {
                // Top edge present: the box5 window starts with rows[0] in both
                // slot 0 and slot 1; the box3 window maps 1:1 onto its rows.
     831        sumsq5_ptrs[0] = sumsq5_rows[0];
     832        sumsq5_ptrs[1] = sumsq5_rows[0];
     833        sumsq5_ptrs[2] = sumsq5_rows[1];
     834        sumsq5_ptrs[3] = sumsq5_rows[2];
     835        sumsq5_ptrs[4] = sumsq5_rows[3];
     836        sum5_ptrs[0] = sum5_rows[0];
     837        sum5_ptrs[1] = sum5_rows[0];
     838        sum5_ptrs[2] = sum5_rows[1];
     839        sum5_ptrs[3] = sum5_rows[2];
     840        sum5_ptrs[4] = sum5_rows[3];
     841 
     842        sumsq3_ptrs[0] = sumsq3_rows[0];
     843        sumsq3_ptrs[1] = sumsq3_rows[1];
     844        sumsq3_ptrs[2] = sumsq3_rows[2];
     845        sum3_ptrs[0] = sum3_rows[0];
     846        sum3_ptrs[1] = sum3_rows[1];
     847        sum3_ptrs[2] = sum3_rows[2];
     848 
                // Two rows from lpf (no left column, hence NULL) ...
     849        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
     850                                        sumsq5_rows[0], sum5_rows[0],
     851                                        NULL, lpf, w, edges);
     852        lpf += PXSTRIDE(stride);
     853        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
     854                                        sumsq5_rows[1], sum5_rows[1],
     855                                        NULL, lpf, w, edges);
     856 
                // ... followed by the first row of the unit itself.
     857        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
     858                                        sumsq5_rows[2], sum5_rows[2],
     859                                        left, src, w, edges);
     860        left++;
     861        src += PXSTRIDE(stride);
     862 
     863        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
     864                           w, params->sgr.s1, BITDEPTH_MAX);
     865        rotate_neon(A3_ptrs, B3_ptrs, 4);
     866 
                // Only one src row existed: finish through the vert_1 tail.
     867        if (--h <= 0)
     868            goto vert_1;
     869 
     870        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
     871                                        sumsq5_rows[3], sum5_rows[3],
     872                                        left, src, w, edges);
     873        left++;
     874        src += PXSTRIDE(stride);
     875        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
     876                           w, params->sgr.s0, BITDEPTH_MAX);
     877        rotate_neon(A5_ptrs, B5_ptrs, 2);
     878        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
     879                           w, params->sgr.s1, BITDEPTH_MAX);
     880        rotate_neon(A3_ptrs, B3_ptrs, 4);
     881 
     882        if (--h <= 0)
     883            goto vert_2;
     884 
     885        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
     886        // one of them to point at the previously unused rows[4].
     887        sumsq5_ptrs[3] = sumsq5_rows[4];
     888        sum5_ptrs[3] = sum5_rows[4];
     889    } else {
                // No top edge: every window slot aliases rows[0], so the first
                // src row (written into rows[0] below) fills all of them.
     890        sumsq5_ptrs[0] = sumsq5_rows[0];
     891        sumsq5_ptrs[1] = sumsq5_rows[0];
     892        sumsq5_ptrs[2] = sumsq5_rows[0];
     893        sumsq5_ptrs[3] = sumsq5_rows[0];
     894        sumsq5_ptrs[4] = sumsq5_rows[0];
     895        sum5_ptrs[0] = sum5_rows[0];
     896        sum5_ptrs[1] = sum5_rows[0];
     897        sum5_ptrs[2] = sum5_rows[0];
     898        sum5_ptrs[3] = sum5_rows[0];
     899        sum5_ptrs[4] = sum5_rows[0];
     900 
     901        sumsq3_ptrs[0] = sumsq3_rows[0];
     902        sumsq3_ptrs[1] = sumsq3_rows[0];
     903        sumsq3_ptrs[2] = sumsq3_rows[0];
     904        sum3_ptrs[0] = sum3_rows[0];
     905        sum3_ptrs[1] = sum3_rows[0];
     906        sum3_ptrs[2] = sum3_rows[0];
     907 
     908        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[0], sum3_rows[0],
     909                                        sumsq5_rows[0], sum5_rows[0],
     910                                        left, src, w, edges);
     911        left++;
     912        src += PXSTRIDE(stride);
     913 
     914        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
     915                           w, params->sgr.s1, BITDEPTH_MAX);
     916        rotate_neon(A3_ptrs, B3_ptrs, 4);
     917 
     918        if (--h <= 0)
     919            goto vert_1;
     920 
                // Second src row: give it its own storage row in both windows.
     921        sumsq5_ptrs[4] = sumsq5_rows[1];
     922        sum5_ptrs[4] = sum5_rows[1];
     923 
     924        sumsq3_ptrs[2] = sumsq3_rows[1];
     925        sum3_ptrs[2] = sum3_rows[1];
     926 
     927        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[1], sum3_rows[1],
     928                                        sumsq5_rows[1], sum5_rows[1],
     929                                        left, src, w, edges);
     930        left++;
     931        src += PXSTRIDE(stride);
     932 
     933        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
     934                           w, params->sgr.s0, BITDEPTH_MAX);
     935        rotate_neon(A5_ptrs, B5_ptrs, 2);
     936        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
     937                           w, params->sgr.s1, BITDEPTH_MAX);
     938        rotate_neon(A3_ptrs, B3_ptrs, 4);
     939 
     940        if (--h <= 0)
     941            goto vert_2;
     942 
                // Third and fourth src rows use storage rows 2 and 3.
     943        sumsq5_ptrs[3] = sumsq5_rows[2];
     944        sumsq5_ptrs[4] = sumsq5_rows[3];
     945        sum5_ptrs[3] = sum5_rows[2];
     946        sum5_ptrs[4] = sum5_rows[3];
     947 
     948        sumsq3_ptrs[2] = sumsq3_rows[2];
     949        sum3_ptrs[2] = sum3_rows[2];
     950 
     951        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_rows[2], sum3_rows[2],
     952                                        sumsq5_rows[2], sum5_rows[2],
     953                                        left, src, w, edges);
     954        left++;
     955        src += PXSTRIDE(stride);
     956 
     957        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
     958                           w, params->sgr.s1, BITDEPTH_MAX);
     959        rotate_neon(A3_ptrs, B3_ptrs, 4);
     960 
     961        if (--h <= 0)
     962            goto odd;
     963 
     964        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
     965                                        sumsq5_rows[3], sum5_rows[3],
     966                                        left, src, w, edges);
     967        left++;
     968        src += PXSTRIDE(stride);
     969 
                // First point in this branch where two output rows are requested.
     970        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
     971                           w, params->sgr.s0, BITDEPTH_MAX);
     972        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
     973                           w, params->sgr.s1, BITDEPTH_MAX);
     974        sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
     975                            w, 2, params->sgr.w0, params->sgr.w1
     976                            HIGHBD_TAIL_SUFFIX);
     977 
     978        if (--h <= 0)
     979            goto vert_2;
     980 
     981        // ptrs are rotated by 2; both [3] and [4] now point at rows[0]; set
     982        // one of them to point at the previously unused rows[4].
     983        sumsq5_ptrs[3] = sumsq5_rows[4];
     984        sum5_ptrs[3] = sum5_rows[4];
     985    }
     986 
            // Steady state: two src rows per iteration. The box3 vertical pass
            // runs after every row, the box5 vertical pass and the finish step
            // (two output rows) only after the second row of each pair.
     987    do {
     988        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
     989                                        sumsq5_ptrs[3], sum5_ptrs[3],
     990                                        left, src, w, edges);
     991        left++;
     992        src += PXSTRIDE(stride);
     993 
     994        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
     995                           w, params->sgr.s1, BITDEPTH_MAX);
     996        rotate_neon(A3_ptrs, B3_ptrs, 4);
     997 
                // src ran out in the middle of a pair.
     998        if (--h <= 0)
     999            goto odd;
    1000 
    1001        BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
    1002                                        sumsq5_ptrs[4], sum5_ptrs[4],
    1003                                        left, src, w, edges);
    1004        left++;
    1005        src += PXSTRIDE(stride);
    1006 
    1007        sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
    1008                           w, params->sgr.s0, BITDEPTH_MAX);
    1009        sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
    1010                           w, params->sgr.s1, BITDEPTH_MAX);
    1011        sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
    1012                            w, 2, params->sgr.w0, params->sgr.w1
    1013                            HIGHBD_TAIL_SUFFIX);
    1014    } while (--h > 0);
    1015 
            // All src rows consumed on a pair boundary. Without a bottom edge the
            // last row is reused (vert_2); otherwise two rows come from lpf_bottom.
    1016    if (!(edges & LR_HAVE_BOTTOM))
    1017        goto vert_2;
    1018 
    1019    BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
    1020                                    sumsq5_ptrs[3], sum5_ptrs[3],
    1021                                    NULL, lpf_bottom, w, edges);
    1022    lpf_bottom += PXSTRIDE(stride);
    1023    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
    1024                       w, params->sgr.s1, BITDEPTH_MAX);
    1025    rotate_neon(A3_ptrs, B3_ptrs, 4);
    1026 
    1027    BF(dav1d_sgr_box35_row_h, neon)(sumsq3_ptrs[2], sum3_ptrs[2],
    1028                                    sumsq5_ptrs[4], sum5_ptrs[4],
    1029                                    NULL, lpf_bottom, w, edges);
    1030 
         // Final vertical passes and a two-row finish; reached by fall-through
         // from the bottom-edge path above or by jump from vert_2.
    1031 output_2:
    1032    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
    1033                       w, params->sgr.s0, BITDEPTH_MAX);
    1034    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
    1035                       w, params->sgr.s1, BITDEPTH_MAX);
    1036    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
    1037                        w, 2, params->sgr.w0, params->sgr.w1
    1038                        HIGHBD_TAIL_SUFFIX);
    1039    return;
    1040 
         // Padding path for a pair boundary with no further input rows: the box5
         // window reuses slot 2 for slots 3 and 4, and the box3 window reuses
         // slot 1 for slot 2 both before and after one extra box3 vertical pass.
    1041 vert_2:
    1042    // Duplicate the last row twice more
    1043    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    1044    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    1045    sum5_ptrs[3] = sum5_ptrs[2];
    1046    sum5_ptrs[4] = sum5_ptrs[2];
    1047 
    1048    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    1049    sum3_ptrs[2] = sum3_ptrs[1];
    1050    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
    1051                       w, params->sgr.s1, BITDEPTH_MAX);
    1052    rotate_neon(A3_ptrs, B3_ptrs, 4);
    1053 
    1054    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    1055    sum3_ptrs[2] = sum3_ptrs[1];
    1056 
    1057    goto output_2;
    1058 
         // src ended after the first row of a pair: pad once, emit two rows, then
         // fall through into output_1 for the single remaining row.
    1059 odd:
    1060    // Copy the last row as padding once
    1061    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    1062    sum5_ptrs[4] = sum5_ptrs[3];
    1063 
    1064    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    1065    sum3_ptrs[2] = sum3_ptrs[1];
    1066 
    1067    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
    1068                       w, params->sgr.s0, BITDEPTH_MAX);
    1069    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
    1070                       w, params->sgr.s1, BITDEPTH_MAX);
    1071    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
    1072                        w, 2, params->sgr.w0, params->sgr.w1
    1073                        HIGHBD_TAIL_SUFFIX);
    1074 
         // Emits exactly one final row; reached from odd (fall-through) and from
         // vert_1 (jump).
    1075 output_1:
    1076    // Duplicate the last row twice more
    1077    sumsq5_ptrs[3] = sumsq5_ptrs[2];
    1078    sumsq5_ptrs[4] = sumsq5_ptrs[2];
    1079    sum5_ptrs[3] = sum5_ptrs[2];
    1080    sum5_ptrs[4] = sum5_ptrs[2];
    1081 
    1082    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    1083    sum3_ptrs[2] = sum3_ptrs[1];
    1084 
    1085    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
    1086                       w, params->sgr.s0, BITDEPTH_MAX);
    1087    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
    1088                       w, params->sgr.s1, BITDEPTH_MAX);
    1089    rotate_neon(A3_ptrs, B3_ptrs, 4);
    1090    // Output only one row
    1091    sgr_finish_mix_neon(&dst, stride, A5_ptrs, B5_ptrs, A3_ptrs, B3_ptrs,
    1092                        w, 1, params->sgr.w0, params->sgr.w1
    1093                        HIGHBD_TAIL_SUFFIX);
    1094    return;
    1095 
         // The unit had a single src row: pad once, run both vertical passes with
         // explicit A/B rotation (no finish step here), then emit that one row
         // through output_1.
    1096 vert_1:
    1097    // Copy the last row as padding once
    1098    sumsq5_ptrs[4] = sumsq5_ptrs[3];
    1099    sum5_ptrs[4] = sum5_ptrs[3];
    1100 
    1101    sumsq3_ptrs[2] = sumsq3_ptrs[1];
    1102    sum3_ptrs[2] = sum3_ptrs[1];
    1103 
    1104    sgr_box5_vert_neon(sumsq5_ptrs, sum5_ptrs, A5_ptrs[1], B5_ptrs[1],
    1105                       w, params->sgr.s0, BITDEPTH_MAX);
    1106    rotate_neon(A5_ptrs, B5_ptrs, 2);
    1107    sgr_box3_vert_neon(sumsq3_ptrs, sum3_ptrs, A3_ptrs[3], B3_ptrs[3],
    1108                       w, params->sgr.s1, BITDEPTH_MAX);
    1109    rotate_neon(A3_ptrs, B3_ptrs, 4);
    1110 
    1111    goto output_1;
    1112 }
   1113 
   1114 
    // Installs the ARM NEON loop-restoration entry points into `c`.
    //
    //   c    DSP context whose wiener[] and sgr[] slots are overwritten.
    //   bpc  bit depth; only consulted for the SGR gate below.
    //
    // Leaves `c` untouched when dav1d_get_cpu_flags() does not report
    // DAV1D_ARM_CPU_FLAG_NEON.
    1115 static ALWAYS_INLINE void loop_restoration_dsp_init_arm(Dav1dLoopRestorationDSPContext *const c, int bpc) {
    1116    const unsigned flags = dav1d_get_cpu_flags();
    1117 
            // Without NEON there is nothing to install.
    1118    if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return;
    1119 
            // AArch64 builds install separate filter7 / filter5 Wiener entry
            // points; other ARM builds point both slots at wiener_filter_neon.
    1120 #if ARCH_AARCH64
    1121    c->wiener[0] = BF(dav1d_wiener_filter7, neon);
    1122    c->wiener[1] = BF(dav1d_wiener_filter5, neon);
    1123 #else
    1124    c->wiener[0] = c->wiener[1] = wiener_filter_neon;
    1125 #endif
            // The SGR NEON functions are installed only for 8-bit builds or when
            // bpc is 10; in every other case sgr[] is left as the caller set it.
    1126    if (BITDEPTH == 8 || bpc == 10) {
    1127        c->sgr[0] = sgr_filter_5x5_neon;
    1128        c->sgr[1] = sgr_filter_3x3_neon;
    1129        c->sgr[2] = sgr_filter_mix_neon;
    1130    }
    1131 }