looprestoration16.S


/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2020, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
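
// The buffer above provides a sliding window of 0x00 bytes followed by
// 0xff bytes: loading at (right_ext_mask - 2*w) yields masks whose clear
// lanes keep the loaded pixels and whose set lanes, via the bit (bitwise
// insert if true) instructions used below, splat a replicated padding
// pixel past the right edge. A hedged C sketch of the effect
// (illustrative names, not dav1d's reference code):
//
//     #include <stdint.h>
//
//     // Replicate the last valid pixel into lanes [w..n): what the
//     // mask + bit combination achieves without a scalar loop.
//     static void pad_right(uint16_t *row, int w, int n) {
//         uint16_t pad = row[w - 1];
//         for (int i = w; i < n; i++)
//             row[i] = pad;
//     }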

// void dav1d_wiener_filter7_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
//                                      const pixel (*left)[4], const pixel *lpf,
//                                      const int w, int h,
//                                      const int16_t filter[2][8],
//                                      const enum LrEdgeFlags edges,
//                                      const int bitdepth_max);
function wiener_filter7_16bpc_neon, export=1
        ldr             w8,  [sp]
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-32]!
        stp             d8,  d9,  [sp, #16]
        mov             x29, sp
        ld1             {v0.8h, v1.8h},  [x6]
        tst             w7,  #4               // LR_HAVE_TOP
        sub_sp          384*2*6

        dup             v28.8h,  w8           // bitdepth_max
        clz             w8,  w8
        movi            v30.4s,  #1
        sub             w10, w8,  #38         // -(bitdepth + 6)
        sub             w11, w8,  #11         // round_bits_v
        sub             w8,  w8,  #25         // -round_bits_h
        neg             w10, w10              // bitdepth + 6
        neg             w11, w11              // -round_bits_v
        dup             v2.4s,   w10
        dup             v29.4s,  w8           // -round_bits_h
        dup             v27.4s,  w11          // -round_bits_v
        movi            v31.8h,  #0x20, lsl #8  // 1 << 13 = 8192
        ushl            v30.4s,  v30.4s,  v2.4s // 1 << (bitdepth + 6)

        zip1            v0.2d,   v0.2d,   v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1

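// How the shift amounts above fall out of clz(bitdepth_max): for 10 bpc,
// bitdepth_max = 0x3ff gives clz = 22, so round_bits_h = 3, round_bits_v = 11
// and the horizontal bias is 1 << 16; the 8192 constant recenters the
// clamped horizontal output so it fits in int16_t. A hedged C sketch of
// the same derivation (names are illustrative, not dav1d's):
//
//     #include <stdint.h>
//
//     typedef struct {
//         int round_bits_h, round_bits_v;
//         int32_t bias_h;                        // 1 << (bitdepth + 6)
//     } wiener_rounding;
//
//     static wiener_rounding wiener_get_rounding(int bitdepth_max) {
//         int lz = __builtin_clz(bitdepth_max);  // 22 for 10 bpc, 20 for 12 bpc
//         wiener_rounding r;
//         r.round_bits_h = 25 - lz;              // 3 or 5
//         r.round_bits_v = lz - 11;              // 11 or 9
//         r.bias_h       = 1 << (38 - lz);       // 1 << 16 or 1 << 18
//         return r;
//     }
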
        // x9  - t6
        // x10 - t5
        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp               // t1
        b.eq            L(no_top_7)

        mov             x16, x2               // backup left
        mov             x2,  #0
        bl              wiener_filter7_h_16bpc_neon
        add             x3,  x3,  x1          // lpf += stride
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        add             x3,  x3,  x1,  lsl #2
        add             x3,  x3,  x1          // lpf += stride*5
        mov             x11, x14              // t4
        add             x14, x14, #384*2      // t1 += 384*2
        mov             x2,  x16              // left
        mov             x16, x3               // backup lpf
        mov             x3,  x0               // lpf = p
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        mov             x13, x14              // t2
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += stride

L(main_7):
        add             x15, x14, #384*2      // t0 = t1 + 384*2
L(main_loop_7):
        bl              wiener_filter7_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_loop_7)
        tst             w7,  #8 // LR_HAVE_BOTTOM
        b.eq            L(v3_7)

        mov             x3,  x16              // restore lpf
        mov             x2,  #0               // left = NULL
        bl              wiener_filter7_hv_16bpc_neon
        bl              wiener_filter7_hv_16bpc_neon
L(v1_7):
        bl              wiener_filter7_v_16bpc_neon

        mov             sp,  x29
        ldp             d8,  d9,  [sp, #16]
        ldp             x29, x30, [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_7):
        add             x3,  x3,  x1,  lsl #2
        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
        mov             x3,  x0               // lpf = p

        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        mov             x11, x14              // t4
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += p_stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x13, x14              // t2
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += p_stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += p_stride
        add             x15, x14, #384*2      // t0 = t1 + 384*2
        bl              wiener_filter7_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x15, x15, #384*2*4    // t0 += 384*2*4
        bl              wiener_filter7_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_7)
L(v3_7):
        bl              wiener_filter7_v_16bpc_neon
L(v2_7):
        bl              wiener_filter7_v_16bpc_neon
        b               L(v1_7)
endfunc
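
// The t6..t0 pointers walk a ring of six 384-pixel temporary rows on the
// stack (sub_sp 384*2*6): each hv call shifts the window down one row via
// the callees' shifted register save/restore, and the new t0 output row
// reuses the storage of the row that just became t6 (each position is
// read before it is overwritten, per the "x15==x9" comment below). A
// hedged C sketch of that rotation (layout illustrative):
//
//     #include <stdint.h>
//
//     // t[0] = t6 (oldest) ... t[6] = t0 (next output row).
//     static void rotate_rows(int16_t *t[7]) {
//         for (int i = 0; i < 6; i++)
//             t[i] = t[i + 1];
//         t[6] = t[0];   // new t0 aliases new t6; safe because reads
//                        // from t6 precede writes to t0 per position
//     }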


function wiener_filter7_h_16bpc_neon
        stp             x3,  x4,  [sp, #-32]!
        str             x14,      [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #6
        ld1             {v2.8h, v3.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v2.8h, v3.8h}, [x3], #32
        ld1             {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #6
        ext             v3.16b,  v2.16b,  v3.16b,  #10
        ext             v2.16b,  v4.16b,  v2.16b,  #10
        b               2f

1:
        ld1             {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v2/v3 to have 3x the first pixel at the front.
        dup             v4.8h,  v2.h[0]
        // Move x3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #6
        ext             v3.16b,  v2.16b,  v3.16b,  #10
        ext             v2.16b,  v4.16b,  v2.16b,  #10

2:
        ld1             {v4.8h}, [x3], #16

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             h26, [x3,  w17, sxtw #1]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v26.8h,  v26.h[0]
        ld1             {v23.16b, v24.16b, v25.16b}, [x6]

        bit             v2.16b,  v26.16b, v23.16b
        bit             v3.16b,  v26.16b, v24.16b
        bit             v4.16b,  v26.16b, v25.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        smull           v6.4s,   v18.4h,  v0.h[3]
        smlal           v6.4s,   v19.4h,  v0.h[2]
        smlal           v6.4s,   v20.4h,  v0.h[1]
        smlal           v6.4s,   v21.4h,  v0.h[0]
        smull2          v7.4s,   v18.8h,  v0.h[3]
        smlal2          v7.4s,   v19.8h,  v0.h[2]
        smlal2          v7.4s,   v20.8h,  v0.h[1]
        smlal2          v7.4s,   v21.8h,  v0.h[0]

        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        smull           v16.4s,  v18.4h,  v0.h[3]
        smlal           v16.4s,  v19.4h,  v0.h[2]
        smlal           v16.4s,  v20.4h,  v0.h[1]
        smlal           v16.4s,  v21.4h,  v0.h[0]
        smull2          v17.4s,  v18.8h,  v0.h[3]
        smlal2          v17.4s,  v19.8h,  v0.h[2]
        smlal2          v17.4s,  v20.8h,  v0.h[1]
        smlal2          v17.4s,  v21.8h,  v0.h[0]

        mvni            v24.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        add             v16.4s,  v16.4s,  v30.4s
        add             v17.4s,  v17.4s,  v30.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        srshl           v16.4s,  v16.4s,  v29.4s
        srshl           v17.4s,  v17.4s,  v29.4s
        sqxtun          v6.4h,   v6.4s
        sqxtun2         v6.8h,   v7.4s
        sqxtun          v7.4h,   v16.4s
        sqxtun2         v7.8h,   v17.4s
        umin            v6.8h,   v6.8h,   v24.8h
        umin            v7.8h,   v7.8h,   v24.8h
        sub             v6.8h,   v6.8h,   v31.8h
        sub             v7.8h,   v7.8h,   v31.8h

        subs            w4,  w4,  #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        tst             w7,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8h, v4.8h}, [x3], #32
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14,      [sp, #16]
        ldp             x3,  x4,  [sp], #32
        ret
endfunc
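
// Scalar view of the horizontal pass above: the 7-tap filter is
// symmetric, so mirrored samples are summed first and each coefficient is
// multiplied once; the result is biased, rounded down by round_bits_h,
// clamped to [0, 0x7fff] and recentered by subtracting 8192 so the
// intermediate fits in int16_t. A hedged C sketch (illustrative; dav1d's
// C reference is laid out differently):
//
//     #include <stdint.h>
//
//     static int16_t wiener7_h_px(const uint16_t *s, const int16_t f[8],
//                                 int32_t bias, int round_bits_h) {
//         int32_t sum = f[3] * s[3]                // center tap
//                     + f[2] * (s[2] + s[4])       // mirrored pairs
//                     + f[1] * (s[1] + s[5])
//                     + f[0] * (s[0] + s[6]);
//         sum = (sum + bias + (1 << (round_bits_h - 1))) >> round_bits_h;
//         if (sum < 0)      sum = 0;               // sqxtun clamps low
//         if (sum > 0x7fff) sum = 0x7fff;          // umin with (1 << 15) - 1
//         return (int16_t)(sum - 8192);            // sub v31 recenters
//     }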

function wiener_filter7_v_16bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, afterwards.
        stp             x10, x11, [sp, #-64]!
        stp             x12, x13, [sp, #16]
        stp             x14, x14, [sp, #32]
        stp             x0,  x4,  [sp, #48]
1:
        ld1             {v16.8h, v17.8h}, [x9],  #32
        ld1             {v18.8h, v19.8h}, [x10], #32
        ld1             {v20.8h, v21.8h}, [x11], #32
        ld1             {v22.8h, v23.8h}, [x12], #32
        ld1             {v24.8h, v25.8h}, [x13], #32
        ld1             {v6.8h,  v7.8h},  [x14], #32

        smull           v2.4s,   v16.4h,  v0.h[4]
        smlal           v2.4s,   v18.4h,  v0.h[5]
        smlal           v2.4s,   v20.4h,  v0.h[6]
        smlal           v2.4s,   v22.4h,  v0.h[7]
        smlal           v2.4s,   v24.4h,  v0.h[6]
        smlal           v2.4s,   v6.4h,   v0.h[5]
        smlal           v2.4s,   v6.4h,   v0.h[4]
        smull2          v3.4s,   v16.8h,  v0.h[4]
        smlal2          v3.4s,   v18.8h,  v0.h[5]
        smlal2          v3.4s,   v20.8h,  v0.h[6]
        smlal2          v3.4s,   v22.8h,  v0.h[7]
        smlal2          v3.4s,   v24.8h,  v0.h[6]
        smlal2          v3.4s,   v6.8h,   v0.h[5]
        smlal2          v3.4s,   v6.8h,   v0.h[4]
        smull           v4.4s,   v17.4h,  v0.h[4]
        smlal           v4.4s,   v19.4h,  v0.h[5]
        smlal           v4.4s,   v21.4h,  v0.h[6]
        smlal           v4.4s,   v23.4h,  v0.h[7]
        smlal           v4.4s,   v25.4h,  v0.h[6]
        smlal           v4.4s,   v7.4h,   v0.h[5]
        smlal           v4.4s,   v7.4h,   v0.h[4]
        smull2          v5.4s,   v17.8h,  v0.h[4]
        smlal2          v5.4s,   v19.8h,  v0.h[5]
        smlal2          v5.4s,   v21.8h,  v0.h[6]
        smlal2          v5.4s,   v23.8h,  v0.h[7]
        smlal2          v5.4s,   v25.8h,  v0.h[6]
        smlal2          v5.4s,   v7.8h,   v0.h[5]
        smlal2          v5.4s,   v7.8h,   v0.h[4]
        srshl           v2.4s,   v2.4s,   v27.4s  // -round_bits_v
        srshl           v3.4s,   v3.4s,   v27.4s
        srshl           v4.4s,   v4.4s,   v27.4s
        srshl           v5.4s,   v5.4s,   v27.4s
        sqxtun          v2.4h,   v2.4s
        sqxtun2         v2.8h,   v3.4s
        sqxtun          v3.4h,   v4.4s
        sqxtun2         v3.8h,   v5.4s
        umin            v2.8h,   v2.8h,   v28.8h  // bitdepth_max
        umin            v3.8h,   v3.8h,   v28.8h
        subs            w4,  w4,  #16
        st1             {v2.8h, v3.8h}, [x0], #32
        b.gt            1b

        ldp             x0,  x4,  [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9,  x10, [sp], #64

        add             x0,  x0,  x1
        ret
endfunc
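
// The pure vertical pass reads the six buffered rows t6..t1 and applies
// the t1 row twice for the bottom taps (the stp x14, x14 above likewise
// duplicates t1 into the rotated t2 slot), which is how bottom-edge row
// replication falls out for free. Hedged C sketch (illustrative):
//
//     #include <stdint.h>
//
//     // rows[0] = t6 ... rows[5] = t1; f[4..7] are the vertical taps
//     // that zip1 packed into v0.h[4-7].
//     static uint16_t wiener7_v_px(int16_t *const rows[6], int x,
//                                  const int16_t f[8], int round_bits_v,
//                                  int bitdepth_max) {
//         int32_t sum = f[4] * rows[0][x] + f[5] * rows[1][x]
//                     + f[6] * rows[2][x] + f[7] * rows[3][x]
//                     + f[6] * rows[4][x] + f[5] * rows[5][x]
//                     + f[4] * rows[5][x];            // t1 reused
//         sum = (sum + (1 << (round_bits_v - 1))) >> round_bits_v;
//         if (sum < 0)            sum = 0;            // sqxtun
//         if (sum > bitdepth_max) sum = bitdepth_max; // umin with v28
//         return (uint16_t)sum;
//     }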

function wiener_filter7_hv_16bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, and x15==x9, afterwards.
        stp             x10, x11, [sp, #-80]!
        stp             x12, x13, [sp, #16]
        stp             x14, x15, [sp, #32]
        stp             x10, x0,  [sp, #48]
        stp             x3,  x4,  [sp, #64]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #6
        ld1             {v2.8h, v3.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v2.8h, v3.8h}, [x3], #32
        ld1             {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #6
        ext             v3.16b,  v2.16b,  v3.16b,  #10
        ext             v2.16b,  v4.16b,  v2.16b,  #10
        b               2f
1:
        ld1             {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v2/v3 to have 3x the first pixel at the front.
        dup             v4.8h,  v2.h[0]
        // Move x3 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #6
        ext             v3.16b,  v2.16b,  v3.16b,  #10
        ext             v2.16b,  v4.16b,  v2.16b,  #10

2:
        ld1             {v4.8h}, [x3], #16

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             h26, [x3,  w17, sxtw #1]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v26.8h,  v26.h[0]
        ld1             {v23.16b, v24.16b, v25.16b}, [x6]

        bit             v2.16b,  v26.16b, v23.16b
        bit             v3.16b,  v26.16b, v24.16b
        bit             v4.16b,  v26.16b, v25.16b

4:      // Loop horizontally
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        smull           v6.4s,   v18.4h,  v0.h[3]
        smlal           v6.4s,   v19.4h,  v0.h[2]
        smlal           v6.4s,   v20.4h,  v0.h[1]
        smlal           v6.4s,   v21.4h,  v0.h[0]
        smull2          v7.4s,   v18.8h,  v0.h[3]
        smlal2          v7.4s,   v19.8h,  v0.h[2]
        smlal2          v7.4s,   v20.8h,  v0.h[1]
        smlal2          v7.4s,   v21.8h,  v0.h[0]

        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        smull           v24.4s,  v18.4h,  v0.h[3]
        smlal           v24.4s,  v19.4h,  v0.h[2]
        smlal           v24.4s,  v20.4h,  v0.h[1]
        smlal           v24.4s,  v21.4h,  v0.h[0]
        smull2          v25.4s,  v18.8h,  v0.h[3]
        smlal2          v25.4s,  v19.8h,  v0.h[2]
        smlal2          v25.4s,  v20.8h,  v0.h[1]
        smlal2          v25.4s,  v21.8h,  v0.h[0]

        ld1             {v16.8h, v17.8h}, [x9],  #32

        mvni            v26.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        add             v24.4s,  v24.4s,  v30.4s
        add             v25.4s,  v25.4s,  v30.4s
        ld1             {v18.8h, v19.8h}, [x10], #32
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        srshl           v24.4s,  v24.4s,  v29.4s
        srshl           v25.4s,  v25.4s,  v29.4s
        ld1             {v20.8h, v21.8h}, [x11], #32
        sqxtun          v6.4h,   v6.4s
        sqxtun2         v6.8h,   v7.4s
        sqxtun          v7.4h,   v24.4s
        sqxtun2         v7.8h,   v25.4s
        ld1             {v22.8h, v23.8h}, [x12], #32
        umin            v6.8h,   v6.8h,   v26.8h
        umin            v7.8h,   v7.8h,   v26.8h
        ld1             {v24.8h, v25.8h}, [x13], #32
        sub             v6.8h,   v6.8h,   v31.8h
        sub             v7.8h,   v7.8h,   v31.8h

        ld1             {v8.8h,  v9.8h},  [x14], #32

        smull           v1.4s,   v16.4h,  v0.h[4]
        smlal           v1.4s,   v18.4h,  v0.h[5]
        smlal           v1.4s,   v20.4h,  v0.h[6]
        smlal           v1.4s,   v22.4h,  v0.h[7]
        smlal           v1.4s,   v24.4h,  v0.h[6]
        smlal           v1.4s,   v8.4h,   v0.h[5]
        smlal           v1.4s,   v6.4h,   v0.h[4]
        smull2          v5.4s,   v16.8h,  v0.h[4]
        smlal2          v5.4s,   v18.8h,  v0.h[5]
        smlal2          v5.4s,   v20.8h,  v0.h[6]
        smlal2          v5.4s,   v22.8h,  v0.h[7]
        smlal2          v5.4s,   v24.8h,  v0.h[6]
        smlal2          v5.4s,   v8.8h,   v0.h[5]
        smlal2          v5.4s,   v6.8h,   v0.h[4]
        smull           v26.4s,  v17.4h,  v0.h[4]
        smlal           v26.4s,  v19.4h,  v0.h[5]
        smlal           v26.4s,  v21.4h,  v0.h[6]
        smlal           v26.4s,  v23.4h,  v0.h[7]
        smlal           v26.4s,  v25.4h,  v0.h[6]
        smlal           v26.4s,  v9.4h,   v0.h[5]
        smlal           v26.4s,  v7.4h,   v0.h[4]
        smull2          v16.4s,  v17.8h,  v0.h[4]
        smlal2          v16.4s,  v19.8h,  v0.h[5]
        smlal2          v16.4s,  v21.8h,  v0.h[6]
        smlal2          v16.4s,  v23.8h,  v0.h[7]
        smlal2          v16.4s,  v25.8h,  v0.h[6]
        smlal2          v16.4s,  v9.8h,   v0.h[5]
        smlal2          v16.4s,  v7.8h,   v0.h[4]
        srshl           v1.4s,   v1.4s,   v27.4s  // -round_bits_v
        srshl           v5.4s,   v5.4s,   v27.4s
        srshl           v26.4s,  v26.4s,  v27.4s
        srshl           v16.4s,  v16.4s,  v27.4s
        sqxtun          v18.4h,  v1.4s
        sqxtun2         v18.8h,  v5.4s
        sqxtun          v19.4h,  v26.4s
        sqxtun2         v19.8h,  v16.4s
        st1             {v6.8h, v7.8h}, [x15], #32
        umin            v18.8h,  v18.8h,  v28.8h  // bitdepth_max
        umin            v19.8h,  v19.8h,  v28.8h
        subs            w4,  w4,  #16

        st1             {v18.8h, v19.8h}, [x0], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        tst             w7,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8h, v4.8h}, [x3], #32
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldp             x3,  x4,  [sp, #64]
        ldp             x15, x0,  [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9,  x10, [sp], #80

        add             x3,  x3,  x1
        add             x0,  x0,  x1

        ret
endfunc
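
// wiener_filter7_hv fuses one horizontal pass (written to t0) with one
// vertical pass (written to the output row), interleaving the vertical
// row loads with the horizontal arithmetic to hide latency. A hedged C
// outline of one fused row, reusing the sketches above (illustrative):
//
//     #include <stdint.h>
//
//     static void wiener7_hv_row(uint16_t *dst, int16_t *t[7],
//                                const uint16_t *src, int w,
//                                const int16_t f[8], int32_t bias,
//                                int rb_h, int rb_v, int bitdepth_max) {
//         for (int x = 0; x < w; x++) {
//             int16_t h = wiener7_h_px(&src[x], f, bias, rb_h);
//             int32_t sum = f[4] * t[0][x] + f[5] * t[1][x]
//                         + f[6] * t[2][x] + f[7] * t[3][x]
//                         + f[6] * t[4][x] + f[5] * t[5][x]
//                         + f[4] * h;             // fresh t0 value
//             t[6][x] = h;  // store after reading t[0][x]: t0 may alias t6
//             sum = (sum + (1 << (rb_v - 1))) >> rb_v;
//             if (sum < 0)            sum = 0;
//             if (sum > bitdepth_max) sum = bitdepth_max;
//             dst[x] = (uint16_t)sum;
//         }
//     }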

// void dav1d_wiener_filter5_16bpc_neon(pixel *p, const ptrdiff_t p_stride,
//                                      const pixel (*left)[4], const pixel *lpf,
//                                      const int w, int h,
//                                      const int16_t filter[2][8],
//                                      const enum LrEdgeFlags edges,
//                                      const int bitdepth_max);
function wiener_filter5_16bpc_neon, export=1
        ldr             w8,  [sp]
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-32]!
        stp             d8,  d9,  [sp, #16]
        mov             x29, sp
        ld1             {v0.8h, v1.8h},  [x6]
        tst             w7,  #4               // LR_HAVE_TOP
        sub_sp          384*2*4

        dup             v28.8h,  w8           // bitdepth_max
        clz             w8,  w8
        movi            v30.4s,  #1
        sub             w10, w8,  #38         // -(bitdepth + 6)
        sub             w11, w8,  #11         // round_bits_v
        sub             w8,  w8,  #25         // -round_bits_h
        neg             w10, w10              // bitdepth + 6
        neg             w11, w11              // -round_bits_v
        dup             v2.4s,   w10
        dup             v29.4s,  w8           // -round_bits_h
        dup             v27.4s,  w11          // -round_bits_v
        movi            v31.8h,  #0x20, lsl #8  // 1 << 13 = 8192
        ushl            v30.4s,  v30.4s,  v2.4s // 1 << (bitdepth + 6)

        zip1            v0.2d,   v0.2d,   v1.2d // move vertical coeffs to v0.h[4-7], freeing up v1

        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp               // t1
        b.eq            L(no_top_5)

        mov             x16, x2               // backup left
        mov             x2,  #0
        bl              wiener_filter5_h_16bpc_neon
        add             x3,  x3,  x1          // lpf += stride
        mov             x11, x14              // t4
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_16bpc_neon
        add             x3,  x3,  x1,  lsl #2
        add             x3,  x3,  x1          // lpf += stride*5
        mov             x12, x14              // t3
        add             x14, x14, #384*2      // t1 += 384*2
        mov             x2,  x16              // left
        mov             x16, x3               // backup lpf
        mov             x3,  x0               // lpf = p
        bl              wiener_filter5_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x13, x14              // t2
        b.eq            L(v1_5)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x3,  x3,  x1          // src += stride

L(main_5):
        mov             x15, x11              // t0 = t4
L(main_loop_5):
        bl              wiener_filter5_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_loop_5)
        tst             w7,  #8 // LR_HAVE_BOTTOM
        b.eq            L(v2_5)

        mov             x3,  x16              // restore lpf
        mov             x2,  #0               // left = NULL
        bl              wiener_filter5_hv_16bpc_neon
        bl              wiener_filter5_hv_16bpc_neon
L(end_5):

        mov             sp,  x29
        ldp             d8,  d9,  [sp, #16]
        ldp             x29, x30, [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_5):
        add             x3,  x3,  x1,  lsl #2
        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
        mov             x3,  x0               // lpf = p

        bl              wiener_filter5_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x11, x14              // t4
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_5)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x3,  x3,  x1          // src += stride
        add             x15, x14, #384*2      // t0 = t1 + 384*2
        bl              wiener_filter5_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x15, x15, #384*2*3    // t0 += 384*2*3
        bl              wiener_filter5_hv_16bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_5)
L(v2_5):
        bl              wiener_filter5_v_16bpc_neon
        add             x0,  x0,  x1
        mov             x11, x12
        mov             x12, x13
        mov             x13, x14
L(v1_5):
        bl              wiener_filter5_v_16bpc_neon
        b               L(end_5)
endfunc


function wiener_filter5_h_16bpc_neon
        stp             x3,  x4,  [sp, #-32]!
        str             x14,      [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #4
        ld1             {v2.8h, v3.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v2.8h, v3.8h}, [x3], #32
        ld1             {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #4
        ext             v3.16b,  v2.16b,  v3.16b,  #12
        ext             v2.16b,  v4.16b,  v2.16b,  #12
        b               2f

1:
        ld1             {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v2/v3 to have 2x the first pixel at the front.
        dup             v4.8h,  v2.h[0]
        // Move x3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #4
        ext             v3.16b,  v2.16b,  v3.16b,  #12
        ext             v2.16b,  v4.16b,  v2.16b,  #12

2:
        ld1             {v4.8h}, [x3], #16

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #18
        b.ge            4f   // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub             w17, w4,  #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -4
        ldr             h26, [x3,  w17, sxtw #1]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v26.8h,  v26.h[0]
        ld1             {v23.16b, v24.16b, v25.16b}, [x6]

        bit             v2.16b,  v26.16b, v23.16b
        bit             v3.16b,  v26.16b, v24.16b
        bit             v4.16b,  v26.16b, v25.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v18.16b, v2.16b,  v3.16b, #6
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v17.16b, v2.16b,  v3.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v2.8h
        smull           v6.4s,   v17.4h,  v0.h[3]
        smlal           v6.4s,   v18.4h,  v0.h[2]
        smlal           v6.4s,   v19.4h,  v0.h[1]
        smull2          v7.4s,   v17.8h,  v0.h[3]
        smlal2          v7.4s,   v18.8h,  v0.h[2]
        smlal2          v7.4s,   v19.8h,  v0.h[1]

        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v18.16b, v3.16b,  v4.16b, #6
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v17.16b, v3.16b,  v4.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v3.8h
        smull           v16.4s,  v17.4h,  v0.h[3]
        smlal           v16.4s,  v18.4h,  v0.h[2]
        smlal           v16.4s,  v19.4h,  v0.h[1]
        smull2          v17.4s,  v17.8h,  v0.h[3]
        smlal2          v17.4s,  v18.8h,  v0.h[2]
        smlal2          v17.4s,  v19.8h,  v0.h[1]

        mvni            v24.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
        add             v6.4s,   v6.4s,   v30.4s
        add             v7.4s,   v7.4s,   v30.4s
        add             v16.4s,  v16.4s,  v30.4s
        add             v17.4s,  v17.4s,  v30.4s
        srshl           v6.4s,   v6.4s,   v29.4s
        srshl           v7.4s,   v7.4s,   v29.4s
        srshl           v16.4s,  v16.4s,  v29.4s
        srshl           v17.4s,  v17.4s,  v29.4s
        sqxtun          v6.4h,   v6.4s
        sqxtun2         v6.8h,   v7.4s
        sqxtun          v7.4h,   v16.4s
        sqxtun2         v7.8h,   v17.4s
        umin            v6.8h,   v6.8h,   v24.8h
        umin            v7.8h,   v7.8h,   v24.8h
        sub             v6.8h,   v6.8h,   v31.8h
        sub             v7.8h,   v7.8h,   v31.8h

        subs            w4,  w4,  #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        tst             w7,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8h, v4.8h}, [x3], #32
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14,      [sp, #16]
        ldp             x3,  x4,  [sp], #32
        ret
endfunc
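
// The 5-tap horizontal pass folds one fewer mirrored pair and indexes
// f[1..3], sharing the coefficient layout with the 7-tap filter. Hedged
// C sketch (illustrative):
//
//     #include <stdint.h>
//
//     static int16_t wiener5_h_px(const uint16_t *s, const int16_t f[8],
//                                 int32_t bias, int round_bits_h) {
//         int32_t sum = f[3] * s[2]                // center tap
//                     + f[2] * (s[1] + s[3])
//                     + f[1] * (s[0] + s[4]);
//         sum = (sum + bias + (1 << (round_bits_h - 1))) >> round_bits_h;
//         if (sum < 0)      sum = 0;
//         if (sum > 0x7fff) sum = 0x7fff;
//         return (int16_t)(sum - 8192);
//     }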

function wiener_filter5_v_16bpc_neon
        stp             x11, x12, [sp, #-48]!
        stp             x13, x14, [sp, #16]
        stp             x0,  x4,  [sp, #32]
1:
        ld1             {v16.8h, v17.8h}, [x11], #32
        ld1             {v18.8h, v19.8h}, [x12], #32
        ld1             {v20.8h, v21.8h}, [x13], #32
        ld1             {v22.8h, v23.8h}, [x14], #32

        smull           v2.4s,   v16.4h,  v0.h[5]
        smlal           v2.4s,   v18.4h,  v0.h[6]
        smlal           v2.4s,   v20.4h,  v0.h[7]
        smlal           v2.4s,   v22.4h,  v0.h[6]
        smlal           v2.4s,   v22.4h,  v0.h[5]
        smull2          v3.4s,   v16.8h,  v0.h[5]
        smlal2          v3.4s,   v18.8h,  v0.h[6]
        smlal2          v3.4s,   v20.8h,  v0.h[7]
        smlal2          v3.4s,   v22.8h,  v0.h[6]
        smlal2          v3.4s,   v22.8h,  v0.h[5]
        smull           v4.4s,   v17.4h,  v0.h[5]
        smlal           v4.4s,   v19.4h,  v0.h[6]
        smlal           v4.4s,   v21.4h,  v0.h[7]
        smlal           v4.4s,   v23.4h,  v0.h[6]
        smlal           v4.4s,   v23.4h,  v0.h[5]
        smull2          v5.4s,   v17.8h,  v0.h[5]
        smlal2          v5.4s,   v19.8h,  v0.h[6]
        smlal2          v5.4s,   v21.8h,  v0.h[7]
        smlal2          v5.4s,   v23.8h,  v0.h[6]
        smlal2          v5.4s,   v23.8h,  v0.h[5]
        srshl           v2.4s,   v2.4s,   v27.4s  // -round_bits_v
        srshl           v3.4s,   v3.4s,   v27.4s
        srshl           v4.4s,   v4.4s,   v27.4s
        srshl           v5.4s,   v5.4s,   v27.4s
        sqxtun          v2.4h,   v2.4s
        sqxtun2         v2.8h,   v3.4s
        sqxtun          v3.4h,   v4.4s
        sqxtun2         v3.8h,   v5.4s
        umin            v2.8h,   v2.8h,   v28.8h  // bitdepth_max
        umin            v3.8h,   v3.8h,   v28.8h

        subs            w4,  w4,  #16
        st1             {v2.8h, v3.8h}, [x0], #32
        b.gt            1b

        ldp             x0,  x4,  [sp, #32]
        ldp             x13, x14, [sp, #16]
        ldp             x11, x12, [sp], #48

        ret
endfunc

function wiener_filter5_hv_16bpc_neon
        // Backing up/restoring registers shifted, so that x11 gets the value
        // of x12, etc, and x15==x11, afterwards.
        stp             x12, x13, [sp, #-64]!
        stp             x14, x15, [sp, #16]
        stp             x12, x0,  [sp, #32]
        stp             x3,  x4,  [sp, #48]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #4
        ld1             {v2.8h, v3.8h}, [x3], #32
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v2.8h, v3.8h}, [x3], #32
        ld1             {v4.d}[1], [x2], #8
        // Move x3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #4
        ext             v3.16b,  v2.16b,  v3.16b,  #12
        ext             v2.16b,  v4.16b,  v2.16b,  #12
        b               2f
1:
        ld1             {v2.8h, v3.8h}, [x3], #32
        // !LR_HAVE_LEFT, fill v4 with the leftmost pixel
        // and shift v2/v3 to have 2x the first pixel at the front.
    931        dup             v4.8h,   v2.h[0]
    932        // Move x3 back to account for the last 2 pixels we loaded before,
    933        // which we shifted out.
    934        sub             x3,  x3,  #4
    935        ext             v3.16b,  v2.16b,  v3.16b,  #12
    936        ext             v2.16b,  v4.16b,  v2.16b,  #12
    937 
    938 2:
    939        ld1             {v4.8h}, [x3], #16
    940 
    941        tst             w7,  #2 // LR_HAVE_RIGHT
    942        b.ne            4f
    943 
    944 3:      // !LR_HAVE_RIGHT
    945 
    946        // Check whether we need to pad the right edge
    947        cmp             w4,  #18
    948        b.ge            4f   // If w >= 18, all used input pixels are valid
    949 
    950        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
    951        // this ends up called again; it's not strictly needed in those
    952        // cases (we pad enough here), but keeping the code as simple as possible.
    953 
    954        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
    955        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
    956        sub             w17, w4,  #23
    957        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
    958        // buffer pointer.
    959        movrel          x6,  right_ext_mask, -4
    960        ldr             h26, [x3,  w17, sxtw #1]
    961        sub             x6,  x6,  w4,  uxtw #1
    962        dup             v26.8h,  v26.h[0]
    963        ld1             {v23.16b, v24.16b, v25.16b}, [x6]
    964 
    965        bit             v2.16b,  v26.16b, v23.16b
    966        bit             v3.16b,  v26.16b, v24.16b
    967        bit             v4.16b,  v26.16b, v25.16b
    968 
    969 4:      // Loop horizontally
    970        ext             v16.16b, v2.16b,  v3.16b, #2
    971        ext             v18.16b, v2.16b,  v3.16b, #6
    972        ext             v19.16b, v2.16b,  v3.16b, #8
    973        ext             v17.16b, v2.16b,  v3.16b, #4
    974        add             v18.8h,  v18.8h,  v16.8h
    975        add             v19.8h,  v19.8h,  v2.8h
    976        smull           v6.4s,   v17.4h,  v0.h[3]
    977        smlal           v6.4s,   v18.4h,  v0.h[2]
    978        smlal           v6.4s,   v19.4h,  v0.h[1]
    979        smull2          v7.4s,   v17.8h,  v0.h[3]
    980        smlal2          v7.4s,   v18.8h,  v0.h[2]
    981        smlal2          v7.4s,   v19.8h,  v0.h[1]
    982 
    983        ext             v16.16b, v3.16b,  v4.16b, #2
    984        ext             v18.16b, v3.16b,  v4.16b, #6
    985        ext             v19.16b, v3.16b,  v4.16b, #8
    986        ext             v17.16b, v3.16b,  v4.16b, #4
    987        add             v18.8h,  v18.8h,  v16.8h
    988        add             v19.8h,  v19.8h,  v3.8h
    989        smull           v24.4s,  v17.4h,  v0.h[3]
    990        smlal           v24.4s,  v18.4h,  v0.h[2]
    991        smlal           v24.4s,  v19.4h,  v0.h[1]
    992        smull2          v25.4s,  v17.8h,  v0.h[3]
    993        smlal2          v25.4s,  v18.8h,  v0.h[2]
    994        smlal2          v25.4s,  v19.8h,  v0.h[1]
    995 
    996        ld1             {v16.8h, v17.8h}, [x11], #32
    997        mvni            v26.8h,  #0x80, lsl #8 // 0x7fff = (1 << 15) - 1
    998        add             v6.4s,   v6.4s,   v30.4s
    999        add             v7.4s,   v7.4s,   v30.4s
   1000        add             v24.4s,  v24.4s,  v30.4s
   1001        add             v25.4s,  v25.4s,  v30.4s
   1002        ld1             {v18.8h, v19.8h}, [x12], #32
   1003        srshl           v6.4s,   v6.4s,   v29.4s
   1004        srshl           v7.4s,   v7.4s,   v29.4s
   1005        srshl           v24.4s,  v24.4s,  v29.4s
   1006        srshl           v25.4s,  v25.4s,  v29.4s
   1007        ld1             {v20.8h, v21.8h}, [x13], #32
   1008        sqxtun          v6.4h,   v6.4s
   1009        sqxtun2         v6.8h,   v7.4s
   1010        sqxtun          v7.4h,   v24.4s
   1011        sqxtun2         v7.8h,   v25.4s
   1012        ld1             {v22.8h, v23.8h}, [x14], #32
   1013        umin            v6.8h,   v6.8h,   v26.8h
   1014        umin            v7.8h,   v7.8h,   v26.8h
   1015        sub             v6.8h,   v6.8h,   v31.8h
   1016        sub             v7.8h,   v7.8h,   v31.8h
   1017 
   1018        smull           v8.4s,   v16.4h,  v0.h[5]
   1019        smlal           v8.4s,   v18.4h,  v0.h[6]
   1020        smlal           v8.4s,   v20.4h,  v0.h[7]
   1021        smlal           v8.4s,   v22.4h,  v0.h[6]
   1022        smlal           v8.4s,   v6.4h,   v0.h[5]
   1023        smull2          v9.4s,   v16.8h,  v0.h[5]
   1024        smlal2          v9.4s,   v18.8h,  v0.h[6]
   1025        smlal2          v9.4s,   v20.8h,  v0.h[7]
   1026        smlal2          v9.4s,   v22.8h,  v0.h[6]
   1027        smlal2          v9.4s,   v6.8h,   v0.h[5]
   1028        smull           v1.4s,   v17.4h,  v0.h[5]
   1029        smlal           v1.4s,   v19.4h,  v0.h[6]
   1030        smlal           v1.4s,   v21.4h,  v0.h[7]
   1031        smlal           v1.4s,   v23.4h,  v0.h[6]
   1032        smlal           v1.4s,   v7.4h,   v0.h[5]
   1033        smull2          v5.4s,   v17.8h,  v0.h[5]
   1034        smlal2          v5.4s,   v19.8h,  v0.h[6]
   1035        smlal2          v5.4s,   v21.8h,  v0.h[7]
   1036        smlal2          v5.4s,   v23.8h,  v0.h[6]
   1037        smlal2          v5.4s,   v7.8h,   v0.h[5]
   1038        srshl           v8.4s,   v8.4s,   v27.4s  // -round_bits_v
   1039        srshl           v9.4s,   v9.4s,   v27.4s
   1040        srshl           v1.4s,   v1.4s,   v27.4s
   1041        srshl           v5.4s,   v5.4s,   v27.4s
   1042        sqxtun          v8.4h,   v8.4s
   1043        sqxtun2         v8.8h,   v9.4s
   1044        sqxtun          v9.4h,   v1.4s
   1045        sqxtun2         v9.8h,   v5.4s
   1046        st1             {v6.8h, v7.8h}, [x15], #32
   1047        umin            v8.8h,   v8.8h,   v28.8h  // bitdepth_max
   1048        umin            v9.8h,   v9.8h,   v28.8h
   1049 
   1050        subs            w4,  w4,  #16
   1051 
   1052        st1             {v8.8h, v9.8h}, [x0], #32
   1053 
   1054        b.le            0f
   1055        mov             v2.16b,  v4.16b
   1056        tst             w7,  #2 // LR_HAVE_RIGHT
   1057        ld1             {v3.8h, v4.8h}, [x3], #32
   1058        b.ne            4b // If we don't need to pad, just keep filtering.
   1059        b               3b // If we need to pad, check how many pixels we have left.
   1060 
   1061 0:
   1062        ldp             x3,  x4,  [sp, #48]
   1063        ldp             x15, x0,  [sp, #32]
   1064        ldp             x13, x14, [sp, #16]
   1065        ldp             x11, x12, [sp], #64
   1066 
   1067        add             x3,  x3,  x1
   1068        add             x0,  x0,  x1
   1069 
   1070        ret
   1071 endfunc
   1072 
   1073 #include "looprestoration_tmpl.S"
   1074 
   1075 // void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
   1076 //                                      const pixel (*left)[4],
   1077 //                                      const pixel *src, const int w,
   1078 //                                      const enum LrEdgeFlags edges);
   1079 function sgr_box3_row_h_16bpc_neon, export=1
   1080        add             w4,  w4,  #2 // w += 2
   1081 
   1082        tst             w5,  #1 // LR_HAVE_LEFT
   1083        b.eq            1f
   1084        cbnz            x2,  0f
   1085 
   1086        // LR_HAVE_LEFT && left == NULL
   1087        sub             x3,  x3,  #4
   1088        ld1             {v0.8h, v1.8h}, [x3], #32
   1089        b               2f
   1090 
   1091 0:
   1092        // LR_HAVE_LEFT, left != NULL
   1093        ld1             {v0.8h, v1.8h}, [x3], #32
   1094        ld1             {v2.d}[1], [x2]
   1095        // Move x3 back to account for the last 2 pixels we loaded earlier,
   1096        // which we'll shift out.
   1097        sub             x3,  x3,  #4
   1098        ext             v1.16b, v0.16b, v1.16b, #12
   1099        ext             v0.16b, v2.16b, v0.16b, #12
   1100        b               2f
   1101 
   1102 1:
   1103        ld1             {v0.8h, v1.8h}, [x3], #32
   1104        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
   1105        // and shift v0/v1 to have 2x the first pixel at the front.
   1106        dup             v2.8h, v0.h[0]
   1107        // Move x3 back to account for the last 2 pixels we loaded before,
   1108        // which we shifted out.
   1109        sub             x3,  x3,  #4
   1110        ext             v1.16b, v0.16b, v1.16b, #12
   1111        ext             v0.16b, v2.16b, v0.16b, #12
   1112 
   1113 2:
   1114        tst             w5,  #2 // LR_HAVE_RIGHT
   1115        b.ne            4f
   1116        // If we'll need to pad the right edge, load that pixel to pad with
   1117        // here since we can find it pretty easily from here.
   1118        sub             w13, w4, #(2 + 16 - 2 + 1)
   1119        ldr             h30, [x3,  w13, sxtw #1]
   1120        // Fill v30 with the right padding pixel
   1121        dup             v30.8h,  v30.h[0]
   1122 3:      // !LR_HAVE_RIGHT
   1123 
   1124        // Check whether we need to pad the right edge
   1125        cmp             w4,  #10
   1126        b.ge            4f   // If w >= 10, all used input pixels are valid
   1127 
   1128        // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
   1129        // again; it's not strictly needed in those cases (we pad enough here),
   1130        // but keeping the code as simple as possible.
   1131 
   1132        // Insert padding in v0.h[w] onwards
   1133        movrel          x13, right_ext_mask
   1134        sub             x13, x13, w4,  uxtw #1
   1135        ld1             {v28.16b, v29.16b}, [x13]
   1136 
   1137        bit             v0.16b,  v30.16b, v28.16b
   1138        bit             v1.16b,  v30.16b, v29.16b
   1139 
   1140 4:      // Loop horizontally
   1141        ext             v26.16b, v0.16b,  v1.16b,  #2
   1142        ext             v27.16b, v0.16b,  v1.16b,  #4
   1143 
   1144        add             v6.8h,   v0.8h,   v26.8h
   1145        umull           v22.4s,  v0.4h,   v0.4h
   1146        umlal           v22.4s,  v26.4h,  v26.4h
   1147        umlal           v22.4s,  v27.4h,  v27.4h
   1148        add             v6.8h,   v6.8h,   v27.8h
   1149        umull2          v23.4s,  v0.8h,   v0.8h
   1150        umlal2          v23.4s,  v26.8h,  v26.8h
   1151        umlal2          v23.4s,  v27.8h,  v27.8h
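               // Per output lane i, with p the shifted row in v0/v1:
               //   v6[i]      = p[i] + p[i+1] + p[i+2]
               //   v22/v23[i] = p[i]^2 + p[i+1]^2 + p[i+2]^2  (widened to 32 bit)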
   1152 
   1153        subs            w4,  w4,  #8
   1154 
   1155        st1             {v6.8h},         [x1],  #16
   1156        st1             {v22.4s,v23.4s}, [x0],  #32
   1157 
   1158        b.le            9f
   1159        tst             w5,  #2 // LR_HAVE_RIGHT
   1160        mov             v0.16b,  v1.16b
   1161        ld1             {v1.8h},  [x3],  #16
   1162 
   1163        b.ne            4b // If we don't need to pad, just keep summing.
   1164        b               3b // If we need to pad, check how many pixels we have left.
   1165 
   1166 9:
   1167        ret
   1168 endfunc
   1169 
   1170 // void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
   1171 //                                      const pixel (*left)[4],
   1172 //                                      const pixel *src, const int w,
   1173 //                                      const enum LrEdgeFlags edges);
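               // As with box3 above, a rough C model for orientation only
               // (illustrative, not dav1d's actual C code), now over a
               // 5-pixel window:
               //
               //   for (int i = 0; i < w; i++) {
               //       int32_t s = 0, sq = 0;
               //       for (int j = 0; j < 5; j++) {
               //           int32_t px = src[i + j];
               //           s  += px;
               //           sq += px * px;
               //       }
               //       sum[i]   = (int16_t)s;
               //       sumsq[i] = sq;
               //   }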
   1174 function sgr_box5_row_h_16bpc_neon, export=1
   1175        add             w4,  w4,  #2 // w += 2
   1176 
   1177        tst             w5,  #1 // LR_HAVE_LEFT
   1178        b.eq            1f
   1179        cbnz            x2,  0f
   1180 
   1181        // LR_HAVE_LEFT && left == NULL
   1182        sub             x3,  x3,  #6
   1183        ld1             {v0.8h, v1.8h}, [x3], #32
   1184        b               2f
   1185 
   1186 0:
   1187        // LR_HAVE_LEFT, left != NULL
   1188        ld1             {v0.8h, v1.8h}, [x3], #32
   1189        ld1             {v2.d}[1], [x2], #8
   1190        // Move x3 back to account for the last 3 pixels we loaded earlier,
   1191        // which we'll shift out.
   1192        sub             x3,  x3,  #6
   1193        ext             v1.16b,  v0.16b,  v1.16b,  #10
   1194        ext             v0.16b,  v2.16b,  v0.16b,  #10
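               // As in box3 above, but prepending the last 3 left pixels:
               //   v0 = { l1, l2, l3, s0, s1, s2, s3, s4 }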
   1195        b               2f
   1196 
   1197 1:
   1198        ld1             {v0.8h, v1.8h}, [x3], #32
   1199        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
   1200        // and shift v0/v1 to have 3x the first pixel at the front.
   1201        dup             v2.8h,  v0.h[0]
   1202        // Move x3 back to account for the last 3 pixels we loaded before,
   1203        // which we shifted out.
   1204        sub             x3,  x3,  #6
   1205        ext             v1.16b,  v0.16b,  v1.16b,  #10
   1206        ext             v0.16b,  v2.16b,  v0.16b,  #10
   1207 
   1208 2:
   1209        tst             w5,  #2 // LR_HAVE_RIGHT
   1210        b.ne            4f
   1211        // If we'll need to pad the right edge, load that pixel to pad with
   1212        // here since we can find it pretty easily from here.
   1213        sub             w13, w4, #(2 + 16 - 3 + 1)
   1214        ldr             h30, [x3,  w13, sxtw #1]
   1215        // Fill v30 with the right padding pixel
   1216        dup             v30.8h,  v30.h[0]
   1217 3:      // !LR_HAVE_RIGHT
   1218 
   1219        // Check whether we need to pad the right edge
   1220        cmp             w4,  #11
   1221        b.ge            4f   // If w >= 11, all used input pixels are valid
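               // (8 outputs of the 5-tap window consume input elements 0..11;
               // with w+1 valid pixels, w >= 11 is enough.)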
   1222 
   1223        // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10,
   1224        // this ends up called again; it's not strictly needed in those
   1225        // cases (we pad enough here), but it keeps the code as simple as possible.
   1226 
   1227        // Insert padding in v0.h[w+1] onwards; fuse the +1 into the
   1228        // buffer pointer.
   1229        movrel          x13, right_ext_mask, -2
   1230        sub             x13, x13, w4,  uxtw #1
   1231        ld1             {v28.16b, v29.16b}, [x13]
   1232 
   1233        bit             v0.16b,  v30.16b, v28.16b
   1234        bit             v1.16b,  v30.16b, v29.16b
   1235 
   1236 4:      // Loop horizontally
   1237        ext             v26.16b, v0.16b,  v1.16b,  #2
   1238        ext             v27.16b, v0.16b,  v1.16b,  #4
   1239 
   1240        add             v6.8h,   v0.8h,   v26.8h
   1241        umull           v22.4s,  v0.4h,   v0.4h
   1242        umlal           v22.4s,  v26.4h,  v26.4h
   1243        umlal           v22.4s,  v27.4h,  v27.4h
   1244        add             v6.8h,   v6.8h,   v27.8h
   1245        umull2          v23.4s,  v0.8h,   v0.8h
   1246        umlal2          v23.4s,  v26.8h,  v26.8h
   1247        umlal2          v23.4s,  v27.8h,  v27.8h
   1248 
   1249        ext             v26.16b, v0.16b,  v1.16b,  #6
   1250        ext             v27.16b, v0.16b,  v1.16b,  #8
   1251 
   1252        add             v6.8h,   v6.8h,   v26.8h
   1253        umlal           v22.4s,  v26.4h,  v26.4h
   1254        umlal           v22.4s,  v27.4h,  v27.4h
   1255        add             v6.8h,   v6.8h,   v27.8h
   1256        umlal2          v23.4s,  v26.8h,  v26.8h
   1257        umlal2          v23.4s,  v27.8h,  v27.8h
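               // v6 and v22/v23 now hold the full 5-tap sums: the first ext pair
               // covered taps 1..2 and this one taps 3..4, on top of tap 0 in v0.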
   1258 
   1259        subs            w4,  w4,  #8
   1260 
   1261        st1             {v6.8h},         [x1],  #16
   1262        st1             {v22.4s,v23.4s}, [x0],  #32
   1263 
   1264        b.le            9f
   1265        tst             w5,  #2 // LR_HAVE_RIGHT
   1266        mov             v0.16b,  v1.16b
   1267        ld1             {v1.8h}, [x3], #16
   1268 
   1269        b.ne            4b // If we don't need to pad, just keep summing.
   1270        b               3b // If we need to pad, check how many pixels we have left.
   1271 
   1272 9:
   1273        ret
   1274 endfunc
   1275 
   1276 // void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
   1277 //                                       int32_t *sumsq5, int16_t *sum5,
   1278 //                                       const pixel (*left)[4],
   1279 //                                       const pixel *src, const int w,
   1280 //                                       const enum LrEdgeFlags edges);
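               // Rough C model for orientation only (illustrative; sq() is a
               // hypothetical helper, sq(x) = x * x): one pass producing both
               // the 3-tap and the 5-tap row sums, sharing the middle taps:
               //
               //   for (int i = 0; i < w; i++) {
               //       int32_t m   = src[i+1] + src[i+2] + src[i+3];
               //       int32_t msq = sq(src[i+1]) + sq(src[i+2]) + sq(src[i+3]);
               //       sum3[i]   = (int16_t)m;
               //       sumsq3[i] = msq;
               //       sum5[i]   = (int16_t)(m + src[i] + src[i+4]);
               //       sumsq5[i] = msq + sq(src[i]) + sq(src[i+4]);
               //   }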
   1281 function sgr_box35_row_h_16bpc_neon, export=1
   1282        add             w6,  w6,  #2 // w += 2
   1283 
   1284        tst             w7,  #1 // LR_HAVE_LEFT
   1285        b.eq            1f
   1286        cbnz            x4,  0f
   1287 
   1288        // LR_HAVE_LEFT && left == NULL
   1289        sub             x5,  x5,  #6
   1290        ld1             {v0.8h, v1.8h}, [x5], #32
   1291        b               2f
   1292 
   1293 0:
   1294        // LR_HAVE_LEFT, left != NULL
   1295        ld1             {v0.8h, v1.8h}, [x5], #32
   1296        ld1             {v2.d}[1], [x4], #8
   1297        // Move x5 back to account for the last 3 pixels we loaded earlier,
   1298        // which we'll shift out.
   1299        sub             x5,  x5,  #6
   1300        ext             v1.16b,  v0.16b,  v1.16b,  #10
   1301        ext             v0.16b,  v2.16b,  v0.16b,  #10
   1302        b               2f
   1303 
   1304 1:
   1305        ld1             {v0.8h, v1.8h}, [x5], #32
   1306        // !LR_HAVE_LEFT, fill v2 with the leftmost pixel
   1307        // and shift v0/v1 to have 3x the first pixel at the front.
   1308        dup             v2.8h,  v0.h[0]
   1309        // Move x5 back to account for the last 3 pixels we loaded before,
   1310        // which we shifted out.
   1311        sub             x5,  x5,  #6
   1312        ext             v1.16b,  v0.16b,  v1.16b,  #10
   1313        ext             v0.16b,  v2.16b,  v0.16b,  #10
   1314 
   1315 2:
   1316        tst             w7,  #2 // LR_HAVE_RIGHT
   1317        b.ne            4f
   1318        // If we'll need to pad the right edge, load that pixel to pad with
   1319        // here since we can find it pretty easily from here.
   1320        sub             w13, w6, #(2 + 16 - 3 + 1)
   1321        ldr             h30, [x5,  w13, sxtw #1]
   1322        // Fill v30 with the right padding pixel
   1323        dup             v30.8h,  v30.h[0]
   1324 3:      // !LR_HAVE_RIGHT
   1325 
   1326        // Check whether we need to pad the right edge
   1327        cmp             w6,  #11
   1328        b.ge            4f   // If w >= 11, all used input pixels are valid
   1329 
   1330        // 1 <= w < 11, w+1 pixels valid in v0-v1. For w=9 or w=10,
   1331        // this ends up called again; it's not strictly needed in those
   1332        // cases (we pad enough here), but it keeps the code as simple as possible.
   1333 
   1334        // Insert padding in v0.h[w+1] onwards; fuse the +1 into the
   1335        // buffer pointer.
   1336        movrel          x13, right_ext_mask, -2
   1337        sub             x13, x13, w6,  uxtw #1
   1338        ld1             {v28.16b, v29.16b}, [x13]
   1339 
   1340        bit             v0.16b,  v30.16b, v28.16b
   1341        bit             v1.16b,  v30.16b, v29.16b
   1342 
   1343 4:      // Loop horizontally
   1344        ext             v16.16b, v0.16b,  v1.16b,  #2
   1345        ext             v17.16b, v0.16b,  v1.16b,  #4
   1346        ext             v19.16b, v0.16b,  v1.16b,  #8
   1347        ext             v18.16b, v0.16b,  v1.16b,  #6
   1348 
   1349        add             v20.8h,  v16.8h,  v17.8h
   1350        add             v21.8h,  v0.8h,   v19.8h
   1351        add             v20.8h,  v20.8h,  v18.8h
   1352 
   1353        umull           v22.4s,  v16.4h,  v16.4h
   1354        umlal           v22.4s,  v17.4h,  v17.4h
   1355        umlal           v22.4s,  v18.4h,  v18.4h
   1356 
   1357        umull2          v23.4s,  v16.8h,  v16.8h
   1358        umlal2          v23.4s,  v17.8h,  v17.8h
   1359        umlal2          v23.4s,  v18.8h,  v18.8h
   1360 
   1361        add             v21.8h,  v21.8h,  v20.8h
   1362        st1             {v20.8h},        [x1], #16
   1363        st1             {v22.4s,v23.4s}, [x0], #32
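               // The 3-tap sums of squares just stored are still live in
               // v22/v23; accumulate the two outer taps into them to form
               // the 5-tap sums of squares.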
   1364 
   1365        umlal           v22.4s,  v0.4h,   v0.4h
   1366        umlal           v22.4s,  v19.4h,  v19.4h
   1367 
   1368        umlal2          v23.4s,  v0.8h,   v0.8h
   1369        umlal2          v23.4s,  v19.8h,  v19.8h
   1370 
   1371        subs            w6,  w6,  #8
   1372 
   1373        st1             {v21.8h},        [x3], #16
   1374        st1             {v22.4s,v23.4s}, [x2], #32
   1375 
   1376        b.le            9f
   1377        tst             w7,  #2 // LR_HAVE_RIGHT
   1378        mov             v0.16b,  v1.16b
   1379        ld1             {v1.8h}, [x5], #16
   1380 
   1381        b.ne            4b // If we don't need to pad, just keep summing.
   1382        b               3b // If we need to pad, check how many pixels we have left.
   1383 
   1384 9:
   1385        ret
   1386 endfunc
   1387 
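               // Instantiate the bitdepth-templated SGR functions from
               // looprestoration_tmpl.S for 16 bpc.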
   1388 sgr_funcs 16