tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

looprestoration.S (52203B)


/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"
#include "util.S"

const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
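
// The zero/one split above gives a sliding byte mask: loading a vector
// from right_ext_mask minus an offset proportional to w yields 0x00 in
// the lanes that still hold valid pixels and 0xff in the lanes past the
// right edge. The right-edge padding code below feeds this mask to bit
// (bitwise insert if true), so only the out-of-range lanes get
// overwritten with the replicated edge pixel.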

// void dav1d_wiener_filter7_8bpc_neon(pixel *p, const ptrdiff_t stride,
//                                     const pixel (*left)[4], const pixel *lpf,
//                                     const int w, int h,
//                                     const int16_t filter[2][8],
//                                     const enum LrEdgeFlags edges);
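// Overview: six temporary rows of 384*2 bytes each are kept on the stack,
// and the row pointers x9-x15 (t6-t0, see the register map below) rotate
// through them as the filter slides down the block.
// wiener_filter7_h_8bpc_neon produces one horizontally filtered row,
// wiener_filter7_v_8bpc_neon combines seven buffered rows into one output
// row, and the fused _hv_ helper does both per main-loop iteration.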
function wiener_filter7_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp
        ld1             {v0.8h, v1.8h},  [x6]
        tst             w7,  #4               // LR_HAVE_TOP
        sub_sp          384*2*6

        mov             w17, #(1 << 14) - (1 << 2)
        dup             v30.8h,  w17
        movi            v31.8h,  #8, lsl #8
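        // v30 = (1 << 14) - (1 << 2) and v31 = 1 << 11 are bias constants
        // for the horizontal pass: v30 is subtracted from the pixel << 7
        // center-tap term so the 16 bit intermediates stay in the signed
        // range, and v31 is added back after the >> 3 narrowing shift to
        // rebias the stored rows for the vertical pass.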

        // x9  - t6
        // x10 - t5
        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp               // t1
        b.eq            L(no_top_7)

        mov             x16, x2               // backup left
        mov             x2,  #0
        bl              wiener_filter7_h_8bpc_neon
        add             x3,  x3,  x1          // lpf += stride
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        add             x3,  x3,  x1,  lsl #2
        add             x3,  x3,  x1          // lpf += stride*5
        mov             x11, x14              // t4
        add             x14, x14, #384*2      // t1 += 384*2
        mov             x2,  x16              // left
        mov             x16, x3               // backup lpf
        mov             x3,  x0               // lpf = p
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        mov             x13, x14              // t2
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += stride

L(main_7):
        add             x15, x14, #384*2      // t0 = t1 + 384*2
L(main_loop_7):
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_loop_7)
        tst             w7,  #8 // LR_HAVE_BOTTOM
        b.eq            L(v3_7)

        mov             x3,  x16              // restore lpf
        mov             x2,  #0               // left = NULL
        bl              wiener_filter7_hv_8bpc_neon
        bl              wiener_filter7_hv_8bpc_neon
L(v1_7):
        bl              wiener_filter7_v_8bpc_neon

        mov             sp,  x29
        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_7):
        add             x3,  x3,  x1,  lsl #2
        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
        mov             x3,  x0               // lpf = p

        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x9,  x14              // t6
        mov             x10, x14              // t5
        mov             x11, x14              // t4
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x13, x14              // t2
        b.eq            L(v2_7)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter7_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x3,  x3,  x1          // src += stride
        add             x15, x14, #384*2      // t0 = t1 + 384*2
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v3_7)
        add             x15, x15, #384*2*4    // t0 += 384*2*4
        bl              wiener_filter7_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_7)
L(v3_7):
        bl              wiener_filter7_v_8bpc_neon
L(v2_7):
        bl              wiener_filter7_v_8bpc_neon
        b               L(v1_7)
endfunc


function wiener_filter7_h_8bpc_neon
        stp             x3,  x4,  [sp, #-32]!
        str             x14,      [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #3
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b},  [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13
        b               2f

1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 3x the first byte at the front.
        dup             v2.16b,  v3.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h,   v3.8b
        uxtl2           v3.8h,   v3.16b
        uxtl            v4.8h,   v4.8b

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             b28, [x3,  w17, sxtw]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v28.8h,  v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b,  v28.16b, v25.16b
        bit             v3.16b,  v28.16b, v26.16b
        bit             v4.16b,  v28.16b, v27.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
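        // The 7-tap filter is symmetric, so the pixel pairs at equal
        // distance from the center tap are added before multiplying; only
        // the coefficients v0.h[3..6] are needed. The caller provides the
        // center tap with its 128 component removed so the products fit
        // in 16 bits; that component is restored as pixel << 7 (v22/v23).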
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        shl             v22.8h,  v18.8h,  #7
        mul             v6.8h,   v18.8h,  v0.h[3]
        mla             v6.8h,   v19.8h,  v0.h[4]
        mla             v6.8h,   v20.8h,  v0.h[5]
        mla             v6.8h,   v21.8h,  v0.h[6]

        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        shl             v23.8h,  v18.8h,  #7
        mul             v7.8h,   v18.8h,  v0.h[3]
        mla             v7.8h,   v19.8h,  v0.h[4]
        mla             v7.8h,   v20.8h,  v0.h[5]
        mla             v7.8h,   v21.8h,  v0.h[6]

        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        sqadd           v6.8h,   v6.8h,   v22.8h
        sqadd           v7.8h,   v7.8h,   v23.8h
        sshr            v6.8h,   v6.8h,   #3
        sshr            v7.8h,   v7.8h,   #3
        add             v6.8h,   v6.8h,   v31.8h
        add             v7.8h,   v7.8h,   v31.8h

        subs            w4,  w4,  #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7,  #2 // LR_HAVE_RIGHT
        uxtl            v3.8h,   v4.8b
        uxtl2           v4.8h,   v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14,      [sp, #16]
        ldp             x3,  x4,  [sp], #32
        ret
endfunc

function wiener_filter7_v_8bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, afterwards.
        stp             x10, x11, [sp, #-64]!
        stp             x12, x13, [sp, #16]
        stp             x14, x14, [sp, #32]
        stp             x0,  x4,  [sp, #48]
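        // (x14 is deliberately stored twice above so it keeps its value
        // across the shifted restore.)
        // 7-tap column filter over the buffered rows: t3 (x12) is the
        // center row (v1.h[3]), t2+t4 and t1+t5 form the +-1 and +-2
        // pairs. The row below the center (t0) does not exist in this
        // vertical-only tail, so t1 (x14) also stands in for it when
        // pairing with t6, i.e. the nearest available row is repeated.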
1:
        ld1             {v20.8h, v21.8h}, [x11], #32
        ld1             {v24.8h, v25.8h}, [x13], #32

        ld1             {v18.8h, v19.8h}, [x10], #32
        add             v24.8h,  v24.8h,  v20.8h
        ld1             {v26.8h, v27.8h}, [x14], #32

        ld1             {v16.8h, v17.8h}, [x9],  #32
        add             v28.8h,  v26.8h,  v18.8h
        ld1             {v22.8h, v23.8h}, [x12], #32

        add             v16.8h,  v26.8h,  v16.8h
        add             v25.8h,  v25.8h,  v21.8h

        smull           v2.4s,   v22.4h,  v1.h[3]
        smlal           v2.4s,   v24.4h,  v1.h[4]
        smlal           v2.4s,   v28.4h,  v1.h[5]
        smlal           v2.4s,   v16.4h,  v1.h[6]
        add             v29.8h,  v27.8h,  v19.8h
        smull2          v3.4s,   v22.8h,  v1.h[3]
        smlal2          v3.4s,   v24.8h,  v1.h[4]
        smlal2          v3.4s,   v28.8h,  v1.h[5]
        smlal2          v3.4s,   v16.8h,  v1.h[6]
        add             v17.8h,  v27.8h,  v17.8h
        smull           v4.4s,   v23.4h,  v1.h[3]
        smlal           v4.4s,   v25.4h,  v1.h[4]
        smlal           v4.4s,   v29.4h,  v1.h[5]
        smlal           v4.4s,   v17.4h,  v1.h[6]
        smull2          v5.4s,   v23.8h,  v1.h[3]
        smlal2          v5.4s,   v25.8h,  v1.h[4]
        smlal2          v5.4s,   v29.8h,  v1.h[5]
        smlal2          v5.4s,   v17.8h,  v1.h[6]
        sqrshrun        v2.4h,   v2.4s,   #11
        sqrshrun2       v2.8h,   v3.4s,   #11
        sqrshrun        v3.4h,   v4.4s,   #11
        sqrshrun2       v3.8h,   v5.4s,   #11
        sqxtun          v2.8b,   v2.8h
        sqxtun2         v2.16b,  v3.8h
        subs            w4,  w4,  #16
        st1             {v2.16b}, [x0], #16
        b.gt            1b

        ldp             x0,  x4,  [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9,  x10, [sp], #64

        add             x0,  x0,  x1
        ret
endfunc

function wiener_filter7_hv_8bpc_neon
        // Backing up/restoring registers shifted, so that x9 gets the value
        // of x10, etc, and x15==x9, afterwards.
        stp             x10, x11, [sp, #-80]!
        stp             x12, x13, [sp, #16]
        stp             x14, x15, [sp, #32]
        stp             x10, x0,  [sp, #48]
        stp             x3,  x4,  [sp, #64]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #3
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b},  [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13
        b               2f
1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 3x the first byte at the front.
        dup             v2.16b,  v3.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #3
        ext             v3.16b,  v2.16b,  v3.16b, #13

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h,   v3.8b
        uxtl2           v3.8h,   v3.16b
        uxtl            v4.8h,   v4.8b

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #19
        b.ge            4f   // If w >= 19, all used input pixels are valid

        // 1 <= w < 19, w+3 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+2]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-22] to find the padding pixel.
        sub             w17, w4,  #22
        // Insert padding in v2/3/4.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -6
        ldr             b28, [x3,  w17, sxtw]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v28.8h,  v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b,  v28.16b, v25.16b
        bit             v3.16b,  v28.16b, v26.16b
        bit             v4.16b,  v28.16b, v27.16b

4:      // Loop horizontally
        ext             v17.16b, v2.16b,  v3.16b, #4
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v20.16b, v2.16b,  v3.16b, #10
        ext             v21.16b, v2.16b,  v3.16b, #12
        ext             v18.16b, v2.16b,  v3.16b, #6
        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v2.8h
        shl             v22.8h,  v18.8h,  #7
        mul             v6.8h,   v18.8h,  v0.h[3]
        mla             v6.8h,   v19.8h,  v0.h[4]
        mla             v6.8h,   v20.8h,  v0.h[5]
        mla             v6.8h,   v21.8h,  v0.h[6]

        ext             v17.16b, v3.16b,  v4.16b, #4
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v20.16b, v3.16b,  v4.16b, #10
        ext             v21.16b, v3.16b,  v4.16b, #12
        ext             v18.16b, v3.16b,  v4.16b, #6

        add             v19.8h,  v19.8h,  v17.8h
        add             v20.8h,  v20.8h,  v16.8h
        add             v21.8h,  v21.8h,  v3.8h
        shl             v23.8h,  v18.8h,  #7
        mul             v7.8h,   v18.8h,  v0.h[3]
        mla             v7.8h,   v19.8h,  v0.h[4]
        mla             v7.8h,   v20.8h,  v0.h[5]
        mla             v7.8h,   v21.8h,  v0.h[6]

        ld1             {v20.8h, v21.8h}, [x11], #32

        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        ld1             {v26.8h, v27.8h}, [x13], #32
        sqadd           v6.8h,   v6.8h,   v22.8h
        sqadd           v7.8h,   v7.8h,   v23.8h
        ld1             {v18.8h, v19.8h}, [x10], #32
        sshr            v6.8h,   v6.8h,   #3
        sshr            v7.8h,   v7.8h,   #3
        ld1             {v28.8h, v29.8h}, [x14], #32
        add             v6.8h,   v6.8h,   v31.8h
        add             v7.8h,   v7.8h,   v31.8h

        ld1             {v16.8h, v17.8h}, [x9],  #32
        add             v26.8h,  v20.8h,  v26.8h

        ld1             {v24.8h, v25.8h}, [x12], #32
        add             v28.8h,  v18.8h,  v28.8h

        add             v16.8h,  v16.8h,  v6.8h
        add             v27.8h,  v21.8h,  v27.8h

        smull           v18.4s,  v24.4h,  v1.h[3]
        smlal           v18.4s,  v26.4h,  v1.h[4]
        smlal           v18.4s,  v28.4h,  v1.h[5]
        smlal           v18.4s,  v16.4h,  v1.h[6]
        add             v29.8h,  v19.8h,  v29.8h
        smull2          v19.4s,  v24.8h,  v1.h[3]
        smlal2          v19.4s,  v26.8h,  v1.h[4]
        smlal2          v19.4s,  v28.8h,  v1.h[5]
        smlal2          v19.4s,  v16.8h,  v1.h[6]
        add             v17.8h,  v17.8h,  v7.8h
        smull           v20.4s,  v25.4h,  v1.h[3]
        smlal           v20.4s,  v27.4h,  v1.h[4]
        smlal           v20.4s,  v29.4h,  v1.h[5]
        smlal           v20.4s,  v17.4h,  v1.h[6]
        smull2          v21.4s,  v25.8h,  v1.h[3]
        smlal2          v21.4s,  v27.8h,  v1.h[4]
        smlal2          v21.4s,  v29.8h,  v1.h[5]
        smlal2          v21.4s,  v17.8h,  v1.h[6]
        sqrshrun        v18.4h,  v18.4s,  #11
        sqrshrun2       v18.8h,  v19.4s,  #11
        sqrshrun        v19.4h,  v20.4s,  #11
        sqrshrun2       v19.8h,  v21.4s,  #11
        st1             {v6.8h, v7.8h}, [x15], #32
        sqxtun          v18.8b,  v18.8h
        sqxtun2         v18.16b, v19.8h
        subs            w4,  w4,  #16

        st1             {v18.16b}, [x0], #16

        b.le            0f
        mov             v2.16b,  v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7,  #2 // LR_HAVE_RIGHT
        uxtl            v3.8h,   v4.8b
        uxtl2           v4.8h,   v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldp             x3,  x4,  [sp, #64]
        ldp             x15, x0,  [sp, #48]
        ldp             x13, x14, [sp, #32]
        ldp             x11, x12, [sp, #16]
        ldp             x9,  x10, [sp], #80

        add             x3,  x3,  x1
        add             x0,  x0,  x1

        ret
endfunc

// void dav1d_wiener_filter5_8bpc_neon(pixel *p, const ptrdiff_t stride,
//                                     const pixel (*left)[4], const pixel *lpf,
//                                     const int w, int h,
//                                     const int16_t filter[2][8],
//                                     const enum LrEdgeFlags edges);
function wiener_filter5_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x29, x30, [sp, #-16]!
        mov             x29, sp
        ld1             {v0.8h, v1.8h},  [x6]
        tst             w7,  #4               // LR_HAVE_TOP
        sub_sp          384*2*4

        mov             w17, #(1 << 14) - (1 << 2)
        dup             v30.8h,  w17
        movi            v31.8h,  #8, lsl #8

        // x11 - t4
        // x12 - t3
        // x13 - t2
        // x14 - t1
        // x15 - t0
        mov             x14, sp               // t1
        b.eq            L(no_top_5)

        mov             x16, x2               // backup left
        mov             x2,  #0
        bl              wiener_filter5_h_8bpc_neon
        add             x3,  x3,  x1          // lpf += stride
        mov             x11, x14              // t4
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_8bpc_neon
        add             x3,  x3,  x1,  lsl #2
        add             x3,  x3,  x1          // lpf += stride*5
        mov             x12, x14              // t3
        add             x14, x14, #384*2      // t1 += 384*2
        mov             x2,  x16              // left
        mov             x16, x3               // backup lpf
        mov             x3,  x0               // lpf = p
        bl              wiener_filter5_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x13, x14              // t2
        b.eq            L(v1_5)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x3,  x3,  x1          // src += stride

L(main_5):
        mov             x15, x11              // t0 = t4
L(main_loop_5):
        bl              wiener_filter5_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_loop_5)
        tst             w7,  #8 // LR_HAVE_BOTTOM
        b.eq            L(v2_5)

        mov             x3,  x16              // restore lpf
        mov             x2,  #0               // left = NULL
        bl              wiener_filter5_hv_8bpc_neon
        bl              wiener_filter5_hv_8bpc_neon
L(end_5):

        mov             sp,  x29
        ldp             x29, x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(no_top_5):
        add             x3,  x3,  x1,  lsl #2
        add             x16, x3,  x1,  lsl #1 // lpf += stride*6, backup
        mov             x3,  x0               // lpf = p

        bl              wiener_filter5_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        mov             x11, x14              // t4
        mov             x12, x14              // t3
        mov             x13, x14              // t2
        b.eq            L(v1_5)
        add             x3,  x3,  x1          // src += stride
        add             x14, x14, #384*2      // t1 += 384*2
        bl              wiener_filter5_h_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x3,  x3,  x1          // src += stride
        add             x15, x14, #384*2      // t0 = t1 + 384*2
        bl              wiener_filter5_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.eq            L(v2_5)
        add             x15, x15, #384*2*3    // t0 += 384*2*3
        bl              wiener_filter5_hv_8bpc_neon
        subs            w5,  w5,  #1          // h--
        b.ne            L(main_5)
L(v2_5):
        bl              wiener_filter5_v_8bpc_neon
        add             x0,  x0,  x1
        mov             x11, x12
        mov             x12, x13
        mov             x13, x14
L(v1_5):
        bl              wiener_filter5_v_8bpc_neon
        b               L(end_5)
endfunc


function wiener_filter5_h_8bpc_neon
        stp             x3,  x4,  [sp, #-32]!
        str             x14,      [sp, #16]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #2
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b},  [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #2
        ext             v3.16b,  v2.16b,  v3.16b, #14
        b               2f

1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 2x the first byte at the front.
        dup             v2.16b,  v3.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #2
        ext             v3.16b,  v2.16b,  v3.16b, #14

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h,   v3.8b
        uxtl2           v3.8h,   v3.16b
        uxtl            v4.8h,   v4.8b

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #18
        b.ge            4f   // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub             w17, w4,  #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -4
        ldr             b28, [x3,  w17, sxtw]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v28.8h,  v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b,  v28.16b, v25.16b
        bit             v3.16b,  v28.16b, v26.16b
        bit             v4.16b,  v28.16b, v27.16b

4:      // Loop horizontally
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
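        // 5-tap variant of the same scheme: the center tap v0.h[3] applies
        // to ext #4, the +-1 pair to v0.h[4] and the +-2 pair to v0.h[5],
        // with the center's 128 component again added as pixel << 7.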
        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v18.16b, v2.16b,  v3.16b, #6
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v17.16b, v2.16b,  v3.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v2.8h
        shl             v22.8h,  v17.8h,  #7
        mul             v6.8h,   v17.8h,  v0.h[3]
        mla             v6.8h,   v18.8h,  v0.h[4]
        mla             v6.8h,   v19.8h,  v0.h[5]

        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v18.16b, v3.16b,  v4.16b, #6
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v17.16b, v3.16b,  v4.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v3.8h
        shl             v23.8h,  v17.8h,  #7
        mul             v7.8h,   v17.8h,  v0.h[3]
        mla             v7.8h,   v18.8h,  v0.h[4]
        mla             v7.8h,   v19.8h,  v0.h[5]

        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        sqadd           v6.8h,   v6.8h,   v22.8h
        sqadd           v7.8h,   v7.8h,   v23.8h
        sshr            v6.8h,   v6.8h,   #3
        sshr            v7.8h,   v7.8h,   #3
        add             v6.8h,   v6.8h,   v31.8h
        add             v7.8h,   v7.8h,   v31.8h

        subs            w4,  w4,  #16

        st1             {v6.8h, v7.8h}, [x14], #32

        b.le            0f
        mov             v2.16b,  v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7,  #2 // LR_HAVE_RIGHT
        uxtl            v3.8h,   v4.8b
        uxtl2           v4.8h,   v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldr             x14,      [sp, #16]
        ldp             x3,  x4,  [sp], #32
        ret
endfunc

function wiener_filter5_v_8bpc_neon
        stp             x11, x12, [sp, #-48]!
        stp             x13, x14, [sp, #16]
        stp             x0,  x4,  [sp, #32]
1:
        ld1             {v18.8h, v19.8h}, [x12], #32
        ld1             {v22.8h, v23.8h}, [x14], #32
        ld1             {v16.8h, v17.8h}, [x11], #32

        add             v24.8h,  v22.8h,  v18.8h
        ld1             {v20.8h, v21.8h}, [x13], #32
        add             v16.8h,  v22.8h,  v16.8h
        add             v25.8h,  v23.8h,  v19.8h

        smull           v2.4s,   v20.4h,  v1.h[3]
        smlal           v2.4s,   v24.4h,  v1.h[4]
        smlal           v2.4s,   v16.4h,  v1.h[5]
        add             v17.8h,  v23.8h,  v17.8h
        smull2          v3.4s,   v20.8h,  v1.h[3]
        smlal2          v3.4s,   v24.8h,  v1.h[4]
        smlal2          v3.4s,   v16.8h,  v1.h[5]
        smull           v4.4s,   v21.4h,  v1.h[3]
        smlal           v4.4s,   v25.4h,  v1.h[4]
        smlal           v4.4s,   v17.4h,  v1.h[5]
        smull2          v5.4s,   v21.8h,  v1.h[3]
        smlal2          v5.4s,   v25.8h,  v1.h[4]
        smlal2          v5.4s,   v17.8h,  v1.h[5]
        sqrshrun        v2.4h,   v2.4s,   #11
        sqrshrun2       v2.8h,   v3.4s,   #11
        sqrshrun        v3.4h,   v4.4s,   #11
        sqrshrun2       v3.8h,   v5.4s,   #11
        sqxtun          v2.8b,   v2.8h
        sqxtun2         v2.16b,  v3.8h
        subs            w4,  w4,  #16
        st1             {v2.16b}, [x0], #16
        b.gt            1b

        ldp             x0,  x4,  [sp, #32]
        ldp             x13, x14, [sp, #16]
        ldp             x11, x12, [sp], #48

        ret
endfunc

function wiener_filter5_hv_8bpc_neon
        // Backing up/restoring registers shifted, so that x11 gets the value
        // of x12, etc, and x15==x11, afterwards.
        stp             x12, x13, [sp, #-64]!
        stp             x14, x15, [sp, #16]
        stp             x12, x0,  [sp, #32]
        stp             x3,  x4,  [sp, #48]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        // LR_HAVE_LEFT
        cbnz            x2,  0f
        // left == NULL
        sub             x3,  x3,  #2
        ld1             {v3.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v3.16b},  [x3], #16
        ld1             {v2.s}[3], [x2], #4
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #2
        ext             v3.16b,  v2.16b,  v3.16b, #14
        b               2f
1:
        ld1             {v3.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v2 with the leftmost byte
        // and shift v3 to have 2x the first byte at the front.
        dup             v2.16b,  v3.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #2
        ext             v3.16b, v2.16b, v3.16b, #14

2:
        ld1             {v4.8b}, [x3], #8
        uxtl            v2.8h,  v3.8b
        uxtl2           v3.8h,  v3.16b
        uxtl            v4.8h,  v4.8b

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #18
        b.ge            4f   // If w >= 18, all used input pixels are valid

        // 1 <= w < 18, w+2 pixels valid in v2-v4. For w>=9,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is v2/3/4.h[w+1]. x3 points at the next input, ie
        // v2/3/4.h[24]. Thus read from x3[w-23] to find the padding pixel.
        sub             w17, w4,  #23
        // Insert padding in v2/3/4.h[w+2] onwards; fuse the +2 (*2) into the
        // buffer pointer.
        movrel          x6,  right_ext_mask, -4
        ldr             b28, [x3,  w17, sxtw]
        sub             x6,  x6,  w4,  uxtw #1
        dup             v28.8h,  v28.h[0]
        ld1             {v25.16b, v26.16b, v27.16b}, [x6]

        bit             v2.16b,  v28.16b, v25.16b
        bit             v3.16b,  v28.16b, v26.16b
        bit             v4.16b,  v28.16b, v27.16b

4:      // Loop horizontally

        ext             v16.16b, v2.16b,  v3.16b, #2
        ext             v18.16b, v2.16b,  v3.16b, #6
        ext             v19.16b, v2.16b,  v3.16b, #8
        ext             v17.16b, v2.16b,  v3.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v2.8h
        shl             v22.8h,  v17.8h,  #7
        mul             v6.8h,   v17.8h,  v0.h[3]
        mla             v6.8h,   v18.8h,  v0.h[4]
        mla             v6.8h,   v19.8h,  v0.h[5]

        ext             v16.16b, v3.16b,  v4.16b, #2
        ext             v18.16b, v3.16b,  v4.16b, #6
        ext             v19.16b, v3.16b,  v4.16b, #8
        ext             v17.16b, v3.16b,  v4.16b, #4
        add             v18.8h,  v18.8h,  v16.8h
        add             v19.8h,  v19.8h,  v3.8h
        shl             v23.8h,  v17.8h,  #7
        mul             v7.8h,   v17.8h,  v0.h[3]
        mla             v7.8h,   v18.8h,  v0.h[4]
        mla             v7.8h,   v19.8h,  v0.h[5]

        ld1             {v18.8h, v19.8h}, [x12], #32

        sub             v22.8h,  v22.8h,  v30.8h
        sub             v23.8h,  v23.8h,  v30.8h
        ld1             {v24.8h, v25.8h}, [x14], #32
        sqadd           v6.8h,   v6.8h,   v22.8h
        sqadd           v7.8h,   v7.8h,   v23.8h
        ld1             {v16.8h, v17.8h}, [x11], #32
        sshr            v6.8h,   v6.8h,   #3
        sshr            v7.8h,   v7.8h,   #3
        ld1             {v20.8h, v21.8h}, [x13], #32
        add             v6.8h,   v6.8h,   v31.8h
        add             v7.8h,   v7.8h,   v31.8h

        add             v24.8h,  v24.8h,  v18.8h
        add             v16.8h,  v16.8h,  v6.8h

        smull           v18.4s,  v20.4h,  v1.h[3]
        smlal           v18.4s,  v24.4h,  v1.h[4]
        smlal           v18.4s,  v16.4h,  v1.h[5]
        add             v25.8h,  v25.8h,  v19.8h
        smull2          v19.4s,  v20.8h,  v1.h[3]
        smlal2          v19.4s,  v24.8h,  v1.h[4]
        smlal2          v19.4s,  v16.8h,  v1.h[5]
        add             v17.8h,  v17.8h,  v7.8h
        smull           v20.4s,  v21.4h,  v1.h[3]
        smlal           v20.4s,  v25.4h,  v1.h[4]
        smlal           v20.4s,  v17.4h,  v1.h[5]
        smull2          v21.4s,  v21.8h,  v1.h[3]
        smlal2          v21.4s,  v25.8h,  v1.h[4]
        smlal2          v21.4s,  v17.8h,  v1.h[5]
        sqrshrun        v18.4h,  v18.4s,  #11
        sqrshrun2       v18.8h,  v19.4s,  #11
        sqrshrun        v19.4h,  v20.4s,  #11
        sqrshrun2       v19.8h,  v21.4s,  #11
        st1             {v6.8h, v7.8h}, [x15], #32
        sqxtun          v18.8b,  v18.8h
        sqxtun2         v18.16b, v19.8h
        subs            w4,  w4,  #16

        st1             {v18.16b}, [x0], #16

        b.le            0f
        mov             v2.16b,  v4.16b
        ld1             {v4.16b}, [x3], #16
        tst             w7,  #2 // LR_HAVE_RIGHT
        uxtl            v3.8h,   v4.8b
        uxtl2           v4.8h,   v4.16b
        b.ne            4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

0:
        ldp             x3,  x4,  [sp, #48]
        ldp             x15, x0,  [sp, #32]
        ldp             x13, x14, [sp, #16]
        ldp             x11, x12, [sp], #64

        add             x3,  x3,  x1
        add             x0,  x0,  x1

        ret
endfunc

#include "looprestoration_tmpl.S"

// void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
function sgr_box3_row_h_8bpc_neon, export=1
        add             w4,  w4,  #2 // w += 2

        tst             w5,  #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x2,  0f

        // LR_HAVE_LEFT && left == NULL
        sub             x3,  x3,  #2
        ld1             {v0.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b},  [x3], #16
        ld1             {v1.s}[3], [x2]
        // Move x3 back to account for the last 2 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #2
        ext             v0.16b, v1.16b, v0.16b, #14
        b               2f

1:
        ld1             {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 2x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x3 back to account for the last 2 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #2
        ext             v0.16b, v1.16b, v0.16b, #14

2:
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

        tst             w5,  #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             w13, w4, #(2 + 16 - 2 + 1)
        ldr             b30, [x3,  w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #10
        b.ge            4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in v0. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in v0.b[w] onwards
        movrel          x13, right_ext_mask
        sub             x13, x13, w4,  uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b,  v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

4:      // Loop horizontally
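        // v0 holds the current 16 source pixels and v1/v2 their 16 bit
        // squares. Each iteration emits, for 8 output positions, the
        // sliding sums of 3 consecutive pixels (v3, 16 bit) and of 3
        // consecutive squared pixels (v26/v27, 32 bit).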
        ext             v16.16b, v0.16b,  v0.16b, #1
        ext             v17.16b, v0.16b,  v0.16b, #2
        uaddl           v3.8h,   v0.8b,   v16.8b
        ext             v20.16b, v1.16b,  v2.16b, #2
        uaddw           v3.8h,   v3.8h,   v17.8b

        ext             v21.16b, v1.16b,  v2.16b, #4

        uaddl           v26.4s,  v1.4h,   v20.4h
        uaddl2          v27.4s,  v1.8h,   v20.8h
        uaddw           v26.4s,  v26.4s,  v21.4h
        uaddw2          v27.4s,  v27.4s,  v21.8h

        subs            w4,  w4,  #8

        st1             {v3.8h},         [x1],  #16
        st1             {v26.4s,v27.4s}, [x0],  #32

        b.le            9f
        tst             w5,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8b},  [x3],  #8
        mov             v1.16b,  v2.16b
        ext             v0.16b,  v0.16b,  v3.16b, #8
        umull           v2.8h,   v3.8b,   v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
//                                     const pixel (*left)[4],
//                                     const pixel *src, const int w,
//                                     const enum LrEdgeFlags edges);
function sgr_box5_row_h_8bpc_neon, export=1
        add             w4,  w4,  #2 // w += 2

        tst             w5,  #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x2,  0f

        // LR_HAVE_LEFT && left == NULL
        sub             x3,  x3,  #3
        ld1             {v0.16b}, [x3], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b},  [x3], #16
        ld1             {v1.s}[3], [x2], #4
        // Move x3 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x3,  x3,  #3
        ext             v0.16b, v1.16b, v0.16b, #13
        b               2f

1:
        ld1             {v0.16b}, [x3], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x3 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x3,  x3,  #3
        ext             v0.16b, v1.16b, v0.16b, #13

2:
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

        tst             w5,  #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             w13, w4, #(2 + 16 - 3 + 1)
        ldr             b30, [x3,  w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w4,  #11
        b.ge            4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel          x13, right_ext_mask, -1
        sub             x13, x13, w4,  uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b,  v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

4:      // Loop horizontally
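        // Same scheme with a 5-wide window: 8 sliding sums of 5
        // consecutive pixels (v3) and of 5 consecutive squares (v26/v27)
        // per iteration.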
        ext             v16.16b, v0.16b,  v0.16b, #1
        ext             v17.16b, v0.16b,  v0.16b, #2
        ext             v18.16b, v0.16b,  v0.16b, #3
        ext             v19.16b, v0.16b,  v0.16b, #4
        uaddl           v3.8h,   v0.8b,   v16.8b
        uaddl           v24.8h,  v17.8b,  v18.8b
        uaddw           v3.8h,   v3.8h,   v19.8b
        add             v3.8h,   v3.8h,   v24.8h

        ext             v16.16b, v1.16b,  v2.16b, #2
        ext             v17.16b, v1.16b,  v2.16b, #4
        ext             v18.16b, v1.16b,  v2.16b, #6
        ext             v19.16b, v1.16b,  v2.16b, #8

        uaddl           v26.4s,  v1.4h,   v16.4h
        uaddl2          v27.4s,  v1.8h,   v16.8h
        uaddl           v16.4s,  v17.4h,  v18.4h
        uaddl2          v17.4s,  v17.8h,  v18.8h
        uaddw           v26.4s,  v26.4s,  v19.4h
        uaddw2          v27.4s,  v27.4s,  v19.8h
        add             v26.4s,  v26.4s,  v16.4s
        add             v27.4s,  v27.4s,  v17.4s

        subs            w4,  w4,  #8

        st1             {v3.8h},         [x1],  #16
        st1             {v26.4s,v27.4s}, [x0],  #32

        b.le            9f
        tst             w5,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8b},  [x3],  #8
        mov             v1.16b,  v2.16b
        ext             v0.16b,  v0.16b,  v3.16b, #8
        umull           v2.8h,   v3.8b,   v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

// void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                      int32_t *sumsq5, int16_t *sum5,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
function sgr_box35_row_h_8bpc_neon, export=1
        add             w6,  w6,  #2 // w += 2

        tst             w7,  #1 // LR_HAVE_LEFT
        b.eq            1f
        cbnz            x4,  0f

        // LR_HAVE_LEFT && left == NULL
        sub             x5,  x5,  #3
        ld1             {v0.16b},  [x5], #16
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        ld1             {v0.16b},  [x5], #16
        ld1             {v1.s}[3], [x4], #4
        // Move x5 back to account for the last 3 bytes we loaded earlier,
        // which we'll shift out.
        sub             x5,  x5,  #3
        ext             v0.16b, v1.16b, v0.16b, #13
        b               2f

1:
        ld1             {v0.16b}, [x5], #16
        // !LR_HAVE_LEFT, fill v1 with the leftmost byte
        // and shift v0 to have 3x the first byte at the front.
        dup             v1.16b, v0.b[0]
        // Move x5 back to account for the last 3 bytes we loaded before,
        // which we shifted out.
        sub             x5,  x5,  #3
        ext             v0.16b, v1.16b, v0.16b, #13

2:
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

        tst             w7,  #2 // LR_HAVE_RIGHT
        b.ne            4f
        // If we'll need to pad the right edge, load that byte to pad with
        // here since we can find it pretty easily from here.
        sub             w13, w6, #(2 + 16 - 3 + 1)
        ldr             b30, [x5,  w13, sxtw]
        // Fill v30 with the right padding pixel
        dup             v30.16b, v30.b[0]
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             w6,  #11
        b.ge            4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in v0. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in v0.b[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel          x13, right_ext_mask, -1
        sub             x13, x13, w6,  uxtw
        ld1             {v29.16b}, [x13]

        bit             v0.16b,  v30.16b, v29.16b

        // Update the precalculated squares
        umull           v1.8h,   v0.8b,   v0.8b
        umull2          v2.8h,   v0.16b,  v0.16b

4:      // Loop horizontally
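        // Fused variant for when both box sizes are needed per row: the
        // 3-wide sums go to sum3/sumsq3 (x1/x0), then the two outer
        // pixels/squares are added on top and the 5-wide sums go to
        // sum5/sumsq5 (x3/x2), sharing the loads and the precalculated
        // squares between the two.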
        ext             v16.16b, v0.16b,  v0.16b, #1
        ext             v17.16b, v0.16b,  v0.16b, #2
        ext             v19.16b, v0.16b,  v0.16b, #4
        ext             v18.16b, v0.16b,  v0.16b, #3
        uaddl           v3.8h,   v16.8b,  v17.8b
        uaddl           v24.8h,  v0.8b,   v19.8b
        uaddw           v3.8h,   v3.8h,   v18.8b

        ext             v16.16b, v1.16b,  v2.16b, #2
        ext             v17.16b, v1.16b,  v2.16b, #4
        ext             v19.16b, v1.16b,  v2.16b, #8
        ext             v18.16b, v1.16b,  v2.16b, #6

        st1             {v3.8h},         [x1], #16
        add             v3.8h,   v3.8h,   v24.8h

        uaddl           v26.4s,  v16.4h,  v17.4h
        uaddl2          v27.4s,  v16.8h,  v17.8h
        uaddl           v16.4s,  v1.4h,   v19.4h
        uaddl2          v17.4s,  v1.8h,   v19.8h
        uaddw           v26.4s,  v26.4s,  v18.4h
        uaddw2          v27.4s,  v27.4s,  v18.8h

        st1             {v26.4s,v27.4s}, [x0], #32
        add             v26.4s,  v26.4s,  v16.4s
        add             v27.4s,  v27.4s,  v17.4s

        subs            w6,  w6,  #8

        st1             {v3.8h},         [x3], #16
        st1             {v26.4s,v27.4s}, [x2], #32

        b.le            9f
        tst             w7,  #2 // LR_HAVE_RIGHT
        ld1             {v3.8b},  [x5],  #8
        mov             v1.16b,  v2.16b
        ext             v0.16b,  v0.16b,  v3.16b, #8
        umull           v2.8h,   v3.8b,   v3.8b

        b.ne            4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        ret
endfunc

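// Instantiate the remaining SGR entry points for 8 bpc from the shared
// template included above (looprestoration_tmpl.S).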
sgr_funcs 8