tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

looprestoration_tmpl.S (30150B)


/*
 * Copyright © 2018, VideoLAN and dav1d authors
 * Copyright © 2018, Martin Storsjo
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/arm/asm.S"

#define FILTER_OUT_STRIDE 384

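// FILTER_OUT_STRIDE is the row pitch, in int16_t elements, of the
// intermediate filter output buffers written by the *_finish_filter*
// functions below (their second-row pointers advance by
// FILTER_OUT_STRIDE << 1 bytes).

// sgr_funcs is expanded once per bitdepth: \bpc selects between the
// 8 bpc and 16 bpc paths in the .if \bpc == 8 blocks below (pixel
// loads/stores and the final clamp differ; the filter arithmetic is
// shared).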
.macro sgr_funcs bpc
// void dav1d_sgr_finish_filter1_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t src_stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
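// The per-pixel factor (tagged "* 3 -> a" below) is built from the
// int16_t **b rows as 4*(centre cross) + 3*(the four corners) of each
// 3x3 neighbourhood; the additive term ("* 3 -> b") is built the same
// way from the int32_t **a rows. Each output is then
// (b - a * src + (1 << 8)) >> 9 (the umlsl/rshrn #9 pairs), producing
// two tmp rows per pass.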
function sgr_finish_filter1_2rows_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        ldp             x7,  x8,  [x3]
        ldp             x9,  x3,  [x3, #16]
        ldp             x10, x11, [x4]
        ldp             x12, x4,  [x4, #16]

        mov             x13, #FILTER_OUT_STRIDE
        cmp             w6,  #1
        add             x2,  x1,  x2 // src + stride
        csel            x2,  x1,  x2,  le // if (h <= 1) x2 = x1
        add             x13, x0,  x13, lsl #1

        movi            v30.8h, #3
        movi            v31.4s, #3
1:
        ld1             {v0.8h, v1.8h}, [x10], #32
        ld1             {v2.8h, v3.8h}, [x11], #32
        ld1             {v4.8h, v5.8h}, [x12], #32
        ld1             {v6.8h, v7.8h}, [x4],  #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x9], #48
        ld1             {v25.4s, v26.4s, v27.4s}, [x3], #48

2:
        ext             v8.16b,  v0.16b,  v1.16b, #2  // [0][1]
        ext             v9.16b,  v2.16b,  v3.16b, #2  // [1][1]
        ext             v10.16b, v4.16b,  v5.16b, #2  // [2][1]
        ext             v11.16b, v0.16b,  v1.16b, #4  // [0][2]
        ext             v12.16b, v2.16b,  v3.16b, #4  // [1][2]
        ext             v13.16b, v4.16b,  v5.16b, #4  // [2][2]

        add             v14.8h,  v2.8h,   v8.8h       // [1][0] + [0][1]
        add             v15.8h,  v9.8h,   v10.8h      // [1][1] + [2][1]

        add             v28.8h,  v0.8h,   v11.8h      // [0][0] + [0][2]
        add             v14.8h,  v14.8h,  v12.8h      // () + [1][2]
        add             v29.8h,  v4.8h,   v13.8h      // [2][0] + [2][2]

        ext             v8.16b,  v6.16b,  v7.16b, #2  // [3][1]
        ext             v11.16b, v6.16b,  v7.16b, #4  // [3][2]

        add             v14.8h,  v14.8h,  v15.8h      // mid
        add             v15.8h,  v28.8h,  v29.8h      // corners

        add             v28.8h,  v4.8h,   v9.8h       // [2][0] + [1][1]
        add             v29.8h,  v10.8h,  v8.8h       // [2][1] + [3][1]

        add             v2.8h,   v2.8h,   v12.8h      // [1][0] + [1][2]
        add             v28.8h,  v28.8h,  v13.8h      // () + [2][2]
        add             v4.8h,   v6.8h,   v11.8h      // [3][0] + [3][2]

        add             v0.8h,   v28.8h,  v29.8h      // mid
        add             v2.8h,   v2.8h,   v4.8h       // corners

        shl             v4.8h,   v14.8h,  #2
        mla             v4.8h,   v15.8h,  v30.8h      // * 3 -> a

        shl             v0.8h,   v0.8h,   #2
        mla             v0.8h,   v2.8h,   v30.8h      // * 3 -> a

        ext             v8.16b,  v16.16b, v17.16b, #4 // [0][1]
        ext             v9.16b,  v17.16b, v18.16b, #4
        ext             v10.16b, v16.16b, v17.16b, #8 // [0][2]
        ext             v11.16b, v17.16b, v18.16b, #8
        ext             v12.16b, v19.16b, v20.16b, #4 // [1][1]
        ext             v13.16b, v20.16b, v21.16b, #4
        add             v8.4s,   v8.4s,   v19.4s      // [0][1] + [1][0]
        add             v9.4s,   v9.4s,   v20.4s
        add             v16.4s,  v16.4s,  v10.4s      // [0][0] + [0][2]
        add             v17.4s,  v17.4s,  v11.4s
        ext             v14.16b, v19.16b, v20.16b, #8 // [1][2]
        ext             v15.16b, v20.16b, v21.16b, #8
        add             v16.4s,  v16.4s,  v22.4s      // () + [2][0]
        add             v17.4s,  v17.4s,  v23.4s
        add             v28.4s,  v12.4s,  v14.4s      // [1][1] + [1][2]
        add             v29.4s,  v13.4s,  v15.4s
        ext             v10.16b, v22.16b, v23.16b, #4 // [2][1]
        ext             v11.16b, v23.16b, v24.16b, #4
        add             v8.4s,   v8.4s,   v28.4s      // mid (incomplete)
        add             v9.4s,   v9.4s,   v29.4s

        add             v19.4s,  v19.4s,  v14.4s      // [1][0] + [1][2]
        add             v20.4s,  v20.4s,  v15.4s
        add             v14.4s,  v22.4s,  v12.4s      // [2][0] + [1][1]
        add             v15.4s,  v23.4s,  v13.4s

        ext             v12.16b, v22.16b, v23.16b, #8 // [2][2]
        ext             v13.16b, v23.16b, v24.16b, #8
        ext             v28.16b, v25.16b, v26.16b, #4 // [3][1]
        ext             v29.16b, v26.16b, v27.16b, #4
        add             v8.4s,   v8.4s,   v10.4s      // () + [2][1] = mid
        add             v9.4s,   v9.4s,   v11.4s
        add             v14.4s,  v14.4s,  v10.4s      // () + [2][1]
        add             v15.4s,  v15.4s,  v11.4s
        ext             v10.16b, v25.16b, v26.16b, #8 // [3][2]
        ext             v11.16b, v26.16b, v27.16b, #8
        add             v16.4s,  v16.4s,  v12.4s      // () + [2][2] = corner
        add             v17.4s,  v17.4s,  v13.4s

        add             v12.4s,  v12.4s,  v28.4s      // [2][2] + [3][1]
        add             v13.4s,  v13.4s,  v29.4s
        add             v25.4s,  v25.4s,  v10.4s      // [3][0] + [3][2]
        add             v26.4s,  v26.4s,  v11.4s

        add             v14.4s,  v14.4s,  v12.4s      // mid
        add             v15.4s,  v15.4s,  v13.4s
        add             v19.4s,  v19.4s,  v25.4s      // corner
        add             v20.4s,  v20.4s,  v26.4s

.if \bpc == 8
        ld1             {v25.8b}, [x1], #8            // src
        ld1             {v26.8b}, [x2], #8
.else
        ld1             {v25.8h}, [x1], #16           // src
        ld1             {v26.8h}, [x2], #16
.endif

        shl             v8.4s,   v8.4s,   #2
        shl             v9.4s,   v9.4s,   #2
        mla             v8.4s,   v16.4s,  v31.4s      // * 3 -> b
        mla             v9.4s,   v17.4s,  v31.4s

.if \bpc == 8
        uxtl            v25.8h,  v25.8b               // src
        uxtl            v26.8h,  v26.8b
.endif

        shl             v14.4s,  v14.4s,  #2
        shl             v15.4s,  v15.4s,  #2
        mla             v14.4s,  v19.4s,  v31.4s      // * 3 -> b
        mla             v15.4s,  v20.4s,  v31.4s

        umlsl           v8.4s,   v4.4h,   v25.4h      // b - a * src
        umlsl2          v9.4s,   v4.8h,   v25.8h
        umlsl           v14.4s,  v0.4h,   v26.4h      // b - a * src
        umlsl2          v15.4s,  v0.8h,   v26.8h
        mov             v0.16b,  v1.16b
        rshrn           v8.4h,   v8.4s,   #9
        rshrn2          v8.8h,   v9.4s,   #9
        mov             v2.16b,  v3.16b
        rshrn           v14.4h,  v14.4s,  #9
        rshrn2          v14.8h,  v15.4s,  #9
        subs            w5,  w5,  #8
        mov             v4.16b,  v5.16b
        st1             {v8.8h},  [x0],  #16
        mov             v6.16b,  v7.16b
        st1             {v14.8h}, [x13], #16

        b.le            3f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        mov             v25.16b, v27.16b
        ld1             {v1.8h}, [x10], #16
        ld1             {v3.8h}, [x11], #16
        ld1             {v5.8h}, [x12], #16
        ld1             {v7.8h}, [x4],  #16
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x9], #32
        ld1             {v26.4s, v27.4s}, [x3], #32
        b               2b

3:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted1_Xbpc_neon(pixel *dst,
//                                           const int32_t **a, const int16_t **b,
//                                           const int w, const int w1,
//                                           const int bitdepth_max);
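// Same box3 filtering as sgr_finish_filter1_2rows above, but the result t
// is blended straight into dst instead of being stored as an int16 tmp
// row: v = (t * w1 + (1 << 10)) >> 11 is added to the source pixel with
// saturation (usqadd), then narrowed to 8 bpc (sqxtun) or clamped to
// bitdepth_max (umin).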
function sgr_finish_weighted1_\bpc\()bpc_neon, export=1
        ldp             x7,  x8,  [x1]
        ldr             x1,       [x1, #16]
        ldp             x9,  x10, [x2]
        ldr             x2,       [x2, #16]

        dup             v31.8h, w4
        dup             v30.8h, w5

        movi            v6.8h,  #3
        movi            v7.4s,  #3
1:
        ld1             {v0.8h, v1.8h}, [x9],  #32
        ld1             {v2.8h, v3.8h}, [x10], #32
        ld1             {v4.8h, v5.8h}, [x2],  #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x7], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x8], #48
        ld1             {v22.4s, v23.4s, v24.4s}, [x1], #48

2:
        ext             v25.16b, v0.16b,  v1.16b, #2  // -stride
        ext             v26.16b, v2.16b,  v3.16b, #2  // 0
        ext             v27.16b, v4.16b,  v5.16b, #2  // +stride
        ext             v28.16b, v0.16b,  v1.16b, #4  // +1-stride
        ext             v29.16b, v2.16b,  v3.16b, #4  // +1
        add             v2.8h,   v2.8h,   v25.8h      // -1, -stride
        ext             v25.16b, v4.16b,  v5.16b, #4  // +1+stride
        add             v26.8h,  v26.8h,  v27.8h      // 0, +stride
        add             v0.8h,   v0.8h,   v28.8h      // -1-stride, +1-stride
        add             v2.8h,   v2.8h,   v26.8h
        add             v4.8h,   v4.8h,   v25.8h      // -1+stride, +1+stride
        add             v2.8h,   v2.8h,   v29.8h      // +1
        add             v0.8h,   v0.8h,   v4.8h

        ext             v25.16b, v16.16b, v17.16b, #4 // -stride
        ext             v26.16b, v17.16b, v18.16b, #4
        shl             v2.8h,   v2.8h,   #2
        ext             v27.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v28.16b, v17.16b, v18.16b, #8
        ext             v29.16b, v19.16b, v20.16b, #4 // 0
        ext             v4.16b,  v20.16b, v21.16b, #4
        mla             v2.8h,   v0.8h,   v6.8h       // * 3 -> a
        add             v25.4s,  v25.4s,  v19.4s      // -stride, -1
        add             v26.4s,  v26.4s,  v20.4s
        add             v16.4s,  v16.4s,  v27.4s      // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v28.4s
        ext             v27.16b, v19.16b, v20.16b, #8 // +1
        ext             v28.16b, v20.16b, v21.16b, #8
        add             v16.4s,  v16.4s,  v22.4s      // -1+stride
        add             v17.4s,  v17.4s,  v23.4s
        add             v29.4s,  v29.4s,  v27.4s      // 0, +1
        add             v4.4s,   v4.4s,   v28.4s
        add             v25.4s,  v25.4s,  v29.4s
        add             v26.4s,  v26.4s,  v4.4s
        ext             v27.16b, v22.16b, v23.16b, #4 // +stride
        ext             v28.16b, v23.16b, v24.16b, #4
        ext             v29.16b, v22.16b, v23.16b, #8 // +1+stride
        ext             v4.16b,  v23.16b, v24.16b, #8
.if \bpc == 8
        ld1             {v19.8b}, [x0]                // src
.else
        ld1             {v19.8h}, [x0]                // src
.endif
        add             v25.4s,  v25.4s,  v27.4s      // +stride
        add             v26.4s,  v26.4s,  v28.4s
        add             v16.4s,  v16.4s,  v29.4s      // +1+stride
        add             v17.4s,  v17.4s,  v4.4s
        shl             v25.4s,  v25.4s,  #2
        shl             v26.4s,  v26.4s,  #2
        mla             v25.4s,  v16.4s,  v7.4s       // * 3 -> b
        mla             v26.4s,  v17.4s,  v7.4s
.if \bpc == 8
        uxtl            v19.8h,  v19.8b               // src
.endif
        mov             v0.16b,  v1.16b
        umlsl           v25.4s,  v2.4h,   v19.4h      // b - a * src
        umlsl2          v26.4s,  v2.8h,   v19.8h
        mov             v2.16b,  v3.16b
        rshrn           v25.4h,  v25.4s,  #9
        rshrn2          v25.8h,  v26.4s,  #9

        subs            w3,  w3,  #8

        // weighted1
        mov             v4.16b,  v5.16b

        ld1             {v1.8h}, [x9],  #16
        ld1             {v3.8h}, [x10], #16
        smull           v26.4s,  v25.4h,  v31.4h // v = t1 * w1
        smull2          v27.4s,  v25.8h,  v31.8h
        ld1             {v5.8h}, [x2],  #16
        rshrn           v26.4h,  v26.4s,  #11
        rshrn2          v26.8h,  v27.4s,  #11
        usqadd          v19.8h,  v26.8h
.if \bpc == 8
        mov             v16.16b, v18.16b
        sqxtun          v26.8b,  v19.8h
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8b}, [x0], #8
.else
        mov             v16.16b, v18.16b
        umin            v26.8h,  v19.8h,  v30.8h
        mov             v19.16b, v21.16b
        mov             v22.16b, v24.16b
        st1             {v26.8h}, [x0], #16
.endif

        b.le            3f
        ld1             {v17.4s, v18.4s}, [x7], #32
        ld1             {v20.4s, v21.4s}, [x8], #32
        ld1             {v23.4s, v24.4s}, [x1], #32
        b               2b

3:
        ret
endfunc

// void dav1d_sgr_finish_filter2_2rows_Xbpc_neon(int16_t *tmp,
//                                               const pixel *src,
//                                               const ptrdiff_t stride,
//                                               const int32_t **a,
//                                               const int16_t **b,
//                                               const int w, const int h);
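// box5 counterpart of sgr_finish_filter1_2rows: the box5 sums are only
// computed for every second row, so the even output row combines corner
// and vertical-neighbour sums with the 5x/6x weights (the movi #5/#6
// constants) and narrows with >> 9, while the odd row is built from the
// second input row's sums alone (again 5x/6x) and narrows with >> 8.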
function sgr_finish_filter2_2rows_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        ldp             x3,  x7,  [x3]
        ldp             x4,  x8,  [x4]
        mov             x10, #FILTER_OUT_STRIDE
        cmp             w6,  #1
        add             x2,  x1,  x2 // src + stride
        csel            x2,  x1,  x2,  le // if (h <= 1) x2 = x1
        add             x10, x0,  x10, lsl #1
        movi            v4.8h,  #5
        movi            v5.4s,  #5
        movi            v6.8h,  #6
        movi            v7.4s,  #6
1:
        ld1             {v0.8h, v1.8h}, [x4], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x3], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
        ext             v24.16b, v0.16b,  v1.16b, #4  // +1-stride
        ext             v25.16b, v2.16b,  v3.16b, #4  // +1+stride
        ext             v22.16b, v0.16b,  v1.16b, #2  // -stride
        ext             v23.16b, v2.16b,  v3.16b, #2  // +stride
        add             v0.8h,   v0.8h,   v24.8h      // -1-stride, +1-stride
        add             v25.8h,  v2.8h,   v25.8h      // -1+stride, +1+stride
        add             v2.8h,   v22.8h,  v23.8h      // -stride, +stride
        add             v0.8h,   v0.8h,   v25.8h

        mul             v8.8h,   v25.8h,  v4.8h       // * 5
        mla             v8.8h,   v23.8h,  v6.8h       // * 6

        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        mul             v0.8h,   v0.8h,   v4.8h       // * 5
        mla             v0.8h,   v2.8h,   v6.8h       // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x1], #8
        ld1             {v30.8b}, [x2], #8
.else
        ld1             {v31.8h}, [x1], #16
        ld1             {v30.8h}, [x2], #16
.endif
        add             v16.4s,  v16.4s,  v26.4s      // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v27.4s
        add             v19.4s,  v19.4s,  v28.4s      // -1+stride, +1+stride
        add             v20.4s,  v20.4s,  v29.4s
        add             v16.4s,  v16.4s,  v19.4s
        add             v17.4s,  v17.4s,  v20.4s

        mul             v9.4s,   v19.4s,  v5.4s       // * 5
        mla             v9.4s,   v24.4s,  v7.4s       // * 6
        mul             v10.4s,  v20.4s,  v5.4s       // * 5
        mla             v10.4s,  v25.4s,  v7.4s       // * 6

        add             v22.4s,  v22.4s,  v24.4s      // -stride, +stride
        add             v23.4s,  v23.4s,  v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s,  v16.4s,  v5.4s       // * 5
        mla             v16.4s,  v22.4s,  v7.4s       // * 6
        mul             v17.4s,  v17.4s,  v5.4s       // * 5
        mla             v17.4s,  v23.4s,  v7.4s       // * 6

.if \bpc == 8
        uxtl            v31.8h,  v31.8b
        uxtl            v30.8h,  v30.8b
.endif
        umlsl           v16.4s,  v0.4h,   v31.4h      // b - a * src
        umlsl2          v17.4s,  v0.8h,   v31.8h
        umlsl           v9.4s,   v8.4h,   v30.4h      // b - a * src
        umlsl2          v10.4s,  v8.8h,   v30.8h
        mov             v0.16b,  v1.16b
        rshrn           v16.4h,  v16.4s,  #9
        rshrn2          v16.8h,  v17.4s,  #9
        rshrn           v9.4h,   v9.4s,   #8
        rshrn2          v9.8h,   v10.4s,  #8
        subs            w5,  w5,  #8
        mov             v2.16b,  v3.16b
        st1             {v16.8h}, [x0],  #16
        st1             {v9.8h},  [x10], #16

        b.le            9f
        mov             v16.16b, v18.16b
        mov             v19.16b, v21.16b
        ld1             {v1.8h}, [x4], #16
        ld1             {v3.8h}, [x8], #16
        ld1             {v17.4s, v18.4s}, [x3], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b

9:
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc

// void dav1d_sgr_finish_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                           const int32_t **a,
//                                           const int16_t **b,
//                                           const int w, const int h,
//                                           const int w1,
//                                           const int bitdepth_max);
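// box5 filtering as in sgr_finish_filter2_2rows, fused with the
// single-weight blend of sgr_finish_weighted1: both filtered rows are
// scaled by w1 with >> 11 rounding, saturating-added to the source
// pixels and written directly to dst.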
function sgr_finish_weighted2_\bpc\()bpc_neon, export=1
        stp             d8,  d9,  [sp, #-0x30]!
        str             d10,      [sp, #0x10]
        stp             d14, d15, [sp, #0x20]

        dup             v14.8h, w6
        dup             v15.8h, w7

        ldp             x2,  x7,  [x2]
        ldp             x3,  x8,  [x3]
        cmp             w5,  #1
        add             x1,  x0,  x1 // src + stride
        // if (h <= 1), set the pointer to the second row to any dummy buffer
        // we can clobber (x2 in this case)
        csel            x1,  x2,  x1,  le
        movi            v4.8h,  #5
        movi            v5.4s,  #5
        movi            v6.8h,  #6
        movi            v7.4s,  #6
1:
        ld1             {v0.8h, v1.8h}, [x3], #32
        ld1             {v2.8h, v3.8h}, [x8], #32
        ld1             {v16.4s, v17.4s, v18.4s}, [x2], #48
        ld1             {v19.4s, v20.4s, v21.4s}, [x7], #48

2:
        ext             v24.16b, v0.16b,  v1.16b, #4  // +1-stride
        ext             v25.16b, v2.16b,  v3.16b, #4  // +1+stride
        ext             v22.16b, v0.16b,  v1.16b, #2  // -stride
        ext             v23.16b, v2.16b,  v3.16b, #2  // +stride
        add             v0.8h,   v0.8h,   v24.8h      // -1-stride, +1-stride
        add             v25.8h,  v2.8h,   v25.8h      // -1+stride, +1+stride
        add             v2.8h,   v22.8h,  v23.8h      // -stride, +stride
        add             v0.8h,   v0.8h,   v25.8h

        mul             v8.8h,   v25.8h,  v4.8h       // * 5
        mla             v8.8h,   v23.8h,  v6.8h       // * 6

        ext             v22.16b, v16.16b, v17.16b, #4 // -stride
        ext             v23.16b, v17.16b, v18.16b, #4
        ext             v24.16b, v19.16b, v20.16b, #4 // +stride
        ext             v25.16b, v20.16b, v21.16b, #4
        ext             v26.16b, v16.16b, v17.16b, #8 // +1-stride
        ext             v27.16b, v17.16b, v18.16b, #8
        ext             v28.16b, v19.16b, v20.16b, #8 // +1+stride
        ext             v29.16b, v20.16b, v21.16b, #8
        mul             v0.8h,   v0.8h,   v4.8h       // * 5
        mla             v0.8h,   v2.8h,   v6.8h       // * 6
.if \bpc == 8
        ld1             {v31.8b}, [x0]
        ld1             {v30.8b}, [x1]
.else
        ld1             {v31.8h}, [x0]
        ld1             {v30.8h}, [x1]
.endif
        add             v16.4s,  v16.4s,  v26.4s      // -1-stride, +1-stride
        add             v17.4s,  v17.4s,  v27.4s
        add             v19.4s,  v19.4s,  v28.4s      // -1+stride, +1+stride
        add             v20.4s,  v20.4s,  v29.4s
        add             v16.4s,  v16.4s,  v19.4s
        add             v17.4s,  v17.4s,  v20.4s

        mul             v9.4s,   v19.4s,  v5.4s       // * 5
        mla             v9.4s,   v24.4s,  v7.4s       // * 6
        mul             v10.4s,  v20.4s,  v5.4s       // * 5
        mla             v10.4s,  v25.4s,  v7.4s       // * 6

        add             v22.4s,  v22.4s,  v24.4s      // -stride, +stride
        add             v23.4s,  v23.4s,  v25.4s
        // This is, surprisingly, faster than other variants where the
        // mul+mla pairs are further apart, on Cortex A53.
        mul             v16.4s,  v16.4s,  v5.4s       // * 5
        mla             v16.4s,  v22.4s,  v7.4s       // * 6
        mul             v17.4s,  v17.4s,  v5.4s       // * 5
        mla             v17.4s,  v23.4s,  v7.4s       // * 6

.if \bpc == 8
        uxtl            v31.8h,  v31.8b
        uxtl            v30.8h,  v30.8b
.endif
        umlsl           v16.4s,  v0.4h,   v31.4h      // b - a * src
        umlsl2          v17.4s,  v0.8h,   v31.8h
        umlsl           v9.4s,   v8.4h,   v30.4h      // b - a * src
        umlsl2          v10.4s,  v8.8h,   v30.8h
        mov             v0.16b,  v1.16b
        rshrn           v16.4h,  v16.4s,  #9
        rshrn2          v16.8h,  v17.4s,  #9
        rshrn           v9.4h,   v9.4s,   #8
        rshrn2          v9.8h,   v10.4s,  #8

        subs            w4,  w4,  #8

        // weighted1
        mov             v2.16b,  v3.16b

        ld1             {v1.8h}, [x3], #16
        ld1             {v3.8h}, [x8], #16
        smull           v22.4s,  v16.4h,  v14.4h // v
        smull2          v23.4s,  v16.8h,  v14.8h
        mov             v16.16b, v18.16b
        smull           v24.4s,  v9.4h,   v14.4h
        smull2          v25.4s,  v9.8h,   v14.8h
        mov             v19.16b, v21.16b
        rshrn           v22.4h,  v22.4s,  #11
        rshrn2          v22.8h,  v23.4s,  #11
        rshrn           v23.4h,  v24.4s,  #11
        rshrn2          v23.8h,  v25.4s,  #11
        usqadd          v31.8h,  v22.8h
        usqadd          v30.8h,  v23.8h
.if \bpc == 8
        sqxtun          v22.8b,  v31.8h
        sqxtun          v23.8b,  v30.8h
        st1             {v22.8b}, [x0], #8
        st1             {v23.8b}, [x1], #8
.else
        umin            v22.8h,  v31.8h,  v15.8h
        umin            v23.8h,  v30.8h,  v15.8h
        st1             {v22.8h}, [x0], #16
        st1             {v23.8h}, [x1], #16
.endif

        b.le            3f
        ld1             {v17.4s, v18.4s}, [x2], #32
        ld1             {v20.4s, v21.4s}, [x7], #32
        b               2b

3:
        ldp             d14, d15, [sp, #0x20]
        ldr             d10,      [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x30
        ret
endfunc

// void dav1d_sgr_weighted2_Xbpc_neon(pixel *dst, const ptrdiff_t stride,
//                                    const int16_t *t1, const int16_t *t2,
//                                    const int w, const int h,
//                                    const int16_t wt[2], const int bitdepth_max);
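// Blends the two intermediate filter outputs into dst:
// dst = clip(src + ((t1 * wt[0] + t2 * wt[1] + (1 << 10)) >> 11)),
// processing two rows per iteration, with a single-row tail loop (2:)
// for odd heights.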
function sgr_weighted2_\bpc\()bpc_neon, export=1
        cmp             w5,  #2
        add             x10, x0,  x1
        add             x12, x2,  #2*FILTER_OUT_STRIDE
        add             x13, x3,  #2*FILTER_OUT_STRIDE
        ld2r            {v30.8h, v31.8h}, [x6] // wt[0], wt[1]
.if \bpc == 16
        dup             v29.8h,  w7
.endif
        mov             x8,  #4*FILTER_OUT_STRIDE
        lsl             x1,  x1,  #1
        add             w9,  w4,  #7
        bic             x9,  x9,  #7 // Aligned width
.if \bpc == 8
        sub             x1,  x1,  x9
.else
        sub             x1,  x1,  x9, lsl #1
.endif
        sub             x8,  x8,  x9, lsl #1
        mov             w9,  w4
        b.lt            2f
1:
.if \bpc == 8
        ld1             {v0.8b},  [x0]
        ld1             {v16.8b}, [x10]
.else
        ld1             {v0.8h},  [x0]
        ld1             {v16.8h}, [x10]
.endif
        ld1             {v1.8h},  [x2],  #16
        ld1             {v17.8h}, [x12], #16
        ld1             {v2.8h},  [x3],  #16
        ld1             {v18.8h}, [x13], #16
        subs            w4,  w4,  #8
.if \bpc == 8
        uxtl            v0.8h,  v0.8b
        uxtl            v16.8h, v16.8b
.endif
        smull           v3.4s,  v1.4h,  v30.4h // wt[0] * t1
        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * t2
        smull2          v4.4s,  v1.8h,  v30.8h // wt[0] * t1
        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * t2
        smull           v19.4s, v17.4h, v30.4h // wt[0] * t1
        smlal           v19.4s, v18.4h, v31.4h // wt[1] * t2
        smull2          v20.4s, v17.8h, v30.8h // wt[0] * t1
        smlal2          v20.4s, v18.8h, v31.8h // wt[1] * t2
        rshrn           v3.4h,  v3.4s,  #11
        rshrn2          v3.8h,  v4.4s,  #11
        rshrn           v19.4h, v19.4s, #11
        rshrn2          v19.8h, v20.4s, #11
        usqadd          v0.8h,  v3.8h
        usqadd          v16.8h, v19.8h
.if \bpc == 8
        sqxtun          v3.8b,  v0.8h
        sqxtun          v19.8b, v16.8h
        st1             {v3.8b},  [x0],  #8
        st1             {v19.8b}, [x10], #8
.else
        umin            v3.8h,  v0.8h,  v29.8h
        umin            v19.8h, v16.8h, v29.8h
        st1             {v3.8h},  [x0],  #16
        st1             {v19.8h}, [x10], #16
.endif
        b.gt            1b

        subs            w5,  w5,  #2
        cmp             w5,  #1
        b.lt            0f
        mov             w4,  w9
        add             x0,  x0,  x1
        add             x10, x10, x1
        add             x2,  x2,  x8
        add             x12, x12, x8
        add             x3,  x3,  x8
        add             x13, x13, x8
        b.eq            2f
        b               1b

2:
.if \bpc == 8
        ld1             {v0.8b}, [x0]
.else
        ld1             {v0.8h}, [x0]
.endif
        ld1             {v1.8h}, [x2], #16
        ld1             {v2.8h}, [x3], #16
        subs            w4,  w4,  #8
.if \bpc == 8
        uxtl            v0.8h,  v0.8b
.endif
        smull           v3.4s,  v1.4h,  v30.4h // wt[0] * t1
        smlal           v3.4s,  v2.4h,  v31.4h // wt[1] * t2
        smull2          v4.4s,  v1.8h,  v30.8h // wt[0] * t1
        smlal2          v4.4s,  v2.8h,  v31.8h // wt[1] * t2
        rshrn           v3.4h,  v3.4s,  #11
        rshrn2          v3.8h,  v4.4s,  #11
        usqadd          v0.8h,  v3.8h
.if \bpc == 8
        sqxtun          v3.8b,  v0.8h
        st1             {v3.8b}, [x0], #8
.else
        umin            v3.8h,  v0.8h,  v29.8h
        st1             {v3.8h}, [x0], #16
.endif
        b.gt            2b
0:
        ret
endfunc
.endm
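// Note: sgr_funcs is presumably instantiated (sgr_funcs 8 / sgr_funcs 16)
// by the per-bitdepth files that include this template.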