tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc.S (131851B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2018, Janne Grunau
      4 * Copyright © 2018, Martin Storsjo
      5 * All rights reserved.
      6 *
      7 * Redistribution and use in source and binary forms, with or without
      8 * modification, are permitted provided that the following conditions are met:
      9 *
     10 * 1. Redistributions of source code must retain the above copyright notice, this
     11 *    list of conditions and the following disclaimer.
     12 *
     13 * 2. Redistributions in binary form must reproduce the above copyright notice,
     14 *    this list of conditions and the following disclaimer in the documentation
     15 *    and/or other materials provided with the distribution.
     16 *
     17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 */
     28 
     29 #include "src/arm/asm.S"
     30 #include "util.S"
     31 
     32 .macro avg dst, t0, t1, t2, t3
     33        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
     34        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
     35        add             \t0\().8h,   \t0\().8h,   \t2\().8h
     36        add             \t1\().8h,   \t1\().8h,   \t3\().8h
     37        sqrshrun        \dst\().8b,  \t0\().8h,   #5
     38        sqrshrun2       \dst\().16b, \t1\().8h,   #5
     39 .endm
     40 
     41 .macro w_avg dst, t0, t1, t2, t3
     42        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
     43        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
     44        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
     45        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
     46        sqdmulh         \t0\().8h,   \t0\().8h,   v30.8h
     47        sqdmulh         \t1\().8h,   \t1\().8h,   v30.8h
     48        add             \t0\().8h,   \t2\().8h,   \t0\().8h
     49        add             \t1\().8h,   \t3\().8h,   \t1\().8h
     50        sqrshrun        \dst\().8b,  \t0\().8h,   #4
     51        sqrshrun2       \dst\().16b, \t1\().8h,   #4
     52 .endm
     53 
     54 .macro mask dst, t0, t1, t2, t3
     55        ld1             {v30.16b}, [x6],  16
     56        ld1             {\t0\().8h,\t1\().8h},   [x2],  32
     57        mul             v30.16b, v30.16b, v31.16b
     58        ld1             {\t2\().8h,\t3\().8h},   [x3],  32
     59        shll            v28.8h, v30.8b,  #8
     60        shll2           v29.8h, v30.16b, #8
     61        sub             \t0\().8h,   \t2\().8h,   \t0\().8h
     62        sub             \t1\().8h,   \t3\().8h,   \t1\().8h
     63        sqdmulh         \t0\().8h,   \t0\().8h,   v28.8h
     64        sqdmulh         \t1\().8h,   \t1\().8h,   v29.8h
     65        add             \t0\().8h,   \t2\().8h,   \t0\().8h
     66        add             \t1\().8h,   \t3\().8h,   \t1\().8h
     67        sqrshrun        \dst\().8b,  \t0\().8h,   #4
     68        sqrshrun2       \dst\().16b, \t1\().8h,   #4
     69 .endm
     70 
     71 .macro bidir_fn type
     72 function \type\()_8bpc_neon, export=1
     73        clz             w4,  w4
     74 .ifc \type, w_avg
     75        dup             v30.8h, w6
     76        neg             v30.8h, v30.8h
     77        shl             v30.8h, v30.8h, #11
     78 .endif
     79 .ifc \type, mask
     80        movi            v31.16b, #256-2
     81 .endif
     82        movrel          x7,  \type\()_tbl
     83        sub             w4,  w4,  #24
     84        ldrsw           x4,  [x7, x4, lsl #2]
     85        \type           v4,  v0,  v1,  v2,  v3
     86        add             x7,  x7,  x4
     87        br              x7
     88 40:
     89        AARCH64_VALID_JUMP_TARGET
     90        add             x7,  x0,  x1
     91        lsl             x1,  x1,  #1
     92 4:
     93        cmp             w5,  #4
     94        st1             {v4.s}[0],  [x0], x1
     95        st1             {v4.s}[1],  [x7], x1
     96        st1             {v4.s}[2],  [x0], x1
     97        st1             {v4.s}[3],  [x7], x1
     98        b.eq            0f
     99        \type           v5,  v0,  v1,  v2,  v3
    100        cmp             w5,  #8
    101        st1             {v5.s}[0],  [x0], x1
    102        st1             {v5.s}[1],  [x7], x1
    103        st1             {v5.s}[2],  [x0], x1
    104        st1             {v5.s}[3],  [x7], x1
    105        b.eq            0f
    106        \type           v4,  v0,  v1,  v2,  v3
    107        st1             {v4.s}[0],  [x0], x1
    108        st1             {v4.s}[1],  [x7], x1
    109        \type           v5,  v0,  v1,  v2,  v3
    110        st1             {v4.s}[2],  [x0], x1
    111        st1             {v4.s}[3],  [x7], x1
    112        st1             {v5.s}[0],  [x0], x1
    113        st1             {v5.s}[1],  [x7], x1
    114        st1             {v5.s}[2],  [x0], x1
    115        st1             {v5.s}[3],  [x7], x1
    116        ret
    117 80:
    118        AARCH64_VALID_JUMP_TARGET
    119        add             x7,  x0,  x1
    120        lsl             x1,  x1,  #1
    121 8:
    122        st1             {v4.8b},    [x0], x1
    123        \type           v5,  v0,  v1,  v2,  v3
    124        st1             {v4.d}[1],  [x7], x1
    125        st1             {v5.8b},    [x0], x1
    126        subs            w5,  w5,  #4
    127        st1             {v5.d}[1],  [x7], x1
    128        b.le            0f
    129        \type           v4,  v0,  v1,  v2,  v3
    130        b               8b
    131 160:
    132        AARCH64_VALID_JUMP_TARGET
    133 16:
    134        \type           v5,  v0,  v1,  v2,  v3
    135        st1             {v4.16b}, [x0], x1
    136        \type           v6,  v0,  v1,  v2,  v3
    137        st1             {v5.16b}, [x0], x1
    138        \type           v7,  v0,  v1,  v2,  v3
    139        st1             {v6.16b}, [x0], x1
    140        subs            w5,  w5,  #4
    141        st1             {v7.16b}, [x0], x1
    142        b.le            0f
    143        \type           v4,  v0,  v1,  v2,  v3
    144        b               16b
    145 320:
    146        AARCH64_VALID_JUMP_TARGET
    147        add             x7,  x0,  x1
    148        lsl             x1,  x1,  #1
    149 32:
    150        \type           v5,  v0,  v1,  v2,  v3
    151        \type           v6,  v0,  v1,  v2,  v3
    152        st1             {v4.16b,v5.16b}, [x0], x1
    153        \type           v7,  v0,  v1,  v2,  v3
    154        subs            w5,  w5,  #2
    155        st1             {v6.16b,v7.16b}, [x7], x1
    156        b.le            0f
    157        \type           v4,  v0,  v1,  v2,  v3
    158        b               32b
    159 640:
    160        AARCH64_VALID_JUMP_TARGET
    161        add             x7,  x0,  x1
    162        lsl             x1,  x1,  #1
    163 64:
    164        \type           v5,  v0,  v1,  v2,  v3
    165        \type           v6,  v0,  v1,  v2,  v3
    166        \type           v7,  v0,  v1,  v2,  v3
    167        \type           v16, v0,  v1,  v2,  v3
    168        \type           v17, v0,  v1,  v2,  v3
    169        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
    170        \type           v18, v0,  v1,  v2,  v3
    171        \type           v19, v0,  v1,  v2,  v3
    172        subs            w5,  w5,  #2
    173        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
    174        b.le            0f
    175        \type           v4, v0,  v1,  v2,  v3
    176        b               64b
    177 1280:
    178        AARCH64_VALID_JUMP_TARGET
    179        add             x7,  x0,  #64
    180 128:
    181        \type           v5,  v0,  v1,  v2,  v3
    182        \type           v6,  v0,  v1,  v2,  v3
    183        \type           v7,  v0,  v1,  v2,  v3
    184        \type           v16, v0,  v1,  v2,  v3
    185        \type           v17, v0,  v1,  v2,  v3
    186        st1             {v4.16b,v5.16b,v6.16b,v7.16b}, [x0], x1
    187        \type           v18, v0,  v1,  v2,  v3
    188        \type           v19, v0,  v1,  v2,  v3
    189        subs            w5,  w5,  #1
    190        st1             {v16.16b,v17.16b,v18.16b,v19.16b}, [x7], x1
    191        b.le            0f
    192        \type           v4, v0,  v1,  v2,  v3
    193        b               128b
    194 0:
    195        ret
    196 endfunc
    197 
    198 jumptable \type\()_tbl
    199        .word 1280b - \type\()_tbl
    200        .word 640b  - \type\()_tbl
    201        .word 320b  - \type\()_tbl
    202        .word 160b  - \type\()_tbl
    203        .word 80b   - \type\()_tbl
    204        .word 40b   - \type\()_tbl
    205 endjumptable
    206 .endm
    207 
    208 bidir_fn avg
    209 bidir_fn w_avg
    210 bidir_fn mask
    211 
    212 
    213 .macro w_mask_fn type
    214 function w_mask_\type\()_8bpc_neon, export=1
    215        clz             w8,  w4
    216        movrel          x9,  w_mask_\type\()_tbl
    217        sub             w8,  w8,  #24
    218        ldrsw           x8,  [x9,  x8,  lsl #2]
    219        add             x9,  x9,  x8
    220        mov             w10, #6903
    221        dup             v0.8h,   w10
    222 .if \type == 444
    223        movi            v1.16b,  #64
    224 .elseif \type == 422
    225        dup             v2.8b,   w7
    226        movi            v3.8b,   #129
    227        sub             v3.8b,   v3.8b,   v2.8b
    228 .elseif \type == 420
    229        dup             v2.8h,   w7
    230        movi            v3.8h,   #1, lsl #8
    231        sub             v3.8h,   v3.8h,   v2.8h
    232 .endif
    233        add             x12,  x0,  x1
    234        lsl             x1,   x1,  #1
    235        br              x9
    236 40:
    237        AARCH64_VALID_JUMP_TARGET
    238 4:
    239        ld1             {v4.8h,   v5.8h},   [x2],  #32  // tmp1 (four rows at once)
    240        ld1             {v6.8h,   v7.8h},   [x3],  #32  // tmp2 (four rows at once)
    241        subs            w5,  w5,  #4
    242        sub             v16.8h,  v6.8h,   v4.8h
    243        sub             v17.8h,  v7.8h,   v5.8h
    244        sabd            v18.8h,  v4.8h,   v6.8h
    245        sabd            v19.8h,  v5.8h,   v7.8h
    246        uqsub           v18.8h,  v0.8h,   v18.8h
    247        uqsub           v19.8h,  v0.8h,   v19.8h
    248        ushr            v18.8h,  v18.8h,  #8
    249        ushr            v19.8h,  v19.8h,  #8
    250        shl             v20.8h,  v18.8h,  #9
    251        shl             v21.8h,  v19.8h,  #9
    252        sqdmulh         v20.8h,  v20.8h,  v16.8h
    253        sqdmulh         v21.8h,  v21.8h,  v17.8h
    254        add             v20.8h,  v20.8h,  v4.8h
    255        add             v21.8h,  v21.8h,  v5.8h
    256        sqrshrun        v22.8b,  v20.8h,  #4
    257        sqrshrun        v23.8b,  v21.8h,  #4
    258 .if \type == 444
    259        uzp1            v18.16b,  v18.16b, v19.16b      // Same as xtn, xtn2
    260        sub             v18.16b,  v1.16b,  v18.16b
    261        st1             {v18.16b}, [x6],  #16
    262 .elseif \type == 422
    263        addp            v18.8h,   v18.8h,  v19.8h
    264        xtn             v18.8b,   v18.8h
    265        uhsub           v18.8b,   v3.8b,   v18.8b
    266        st1             {v18.8b},  [x6],  #8
    267 .elseif \type == 420
    268        trn1            v24.2d,   v18.2d,  v19.2d
    269        trn2            v25.2d,   v18.2d,  v19.2d
    270        add             v24.8h,   v24.8h,  v25.8h
    271        addp            v18.8h,   v24.8h,  v24.8h
    272        sub             v18.4h,   v3.4h,   v18.4h
    273        rshrn           v18.8b,   v18.8h,  #2
    274        str             s18,         [x6],  #4
    275 .endif
    276        st1             {v22.s}[0],  [x0],  x1
    277        st1             {v22.s}[1],  [x12], x1
    278        st1             {v23.s}[0],  [x0],  x1
    279        st1             {v23.s}[1],  [x12], x1
    280        b.gt            4b
    281        ret
    282 80:
    283        AARCH64_VALID_JUMP_TARGET
    284 8:
    285        ld1             {v4.8h,   v5.8h},   [x2],  #32
    286        ld1             {v6.8h,   v7.8h},   [x3],  #32
    287        subs            w5,  w5,  #2
    288        sub             v16.8h,  v6.8h,   v4.8h
    289        sub             v17.8h,  v7.8h,   v5.8h
    290        sabd            v18.8h,  v4.8h,   v6.8h
    291        sabd            v19.8h,  v5.8h,   v7.8h
    292        uqsub           v18.8h,  v0.8h,   v18.8h
    293        uqsub           v19.8h,  v0.8h,   v19.8h
    294        ushr            v18.8h,  v18.8h,  #8
    295        ushr            v19.8h,  v19.8h,  #8
    296        shl             v20.8h,  v18.8h,  #9
    297        shl             v21.8h,  v19.8h,  #9
    298        sqdmulh         v20.8h,  v20.8h,  v16.8h
    299        sqdmulh         v21.8h,  v21.8h,  v17.8h
    300        add             v20.8h,  v20.8h,  v4.8h
    301        add             v21.8h,  v21.8h,  v5.8h
    302        sqrshrun        v22.8b,  v20.8h,  #4
    303        sqrshrun        v23.8b,  v21.8h,  #4
    304 .if \type == 444
    305        uzp1            v18.16b, v18.16b, v19.16b       // Same as xtn, xtn2
    306        sub             v18.16b, v1.16b,  v18.16b
    307        st1             {v18.16b}, [x6],  #16
    308 .elseif \type == 422
    309        addp            v18.8h,  v18.8h,  v19.8h
    310        xtn             v18.8b,  v18.8h
    311        uhsub           v18.8b,  v3.8b,   v18.8b
    312        st1             {v18.8b},  [x6],  #8
    313 .elseif \type == 420
    314        add             v18.8h,  v18.8h,  v19.8h
    315        addp            v18.8h,  v18.8h,  v18.8h
    316        sub             v18.4h,  v3.4h,   v18.4h
    317        rshrn           v18.8b,  v18.8h,  #2
    318        str             s18,       [x6],  #4
    319 .endif
    320        st1             {v22.8b},  [x0],  x1
    321        st1             {v23.8b},  [x12], x1
    322        b.gt            8b
    323        ret
    324 1280:
    325 640:
    326 320:
    327 160:
    328        AARCH64_VALID_JUMP_TARGET
    329        mov             w11, w4
    330        sub             x1,  x1,  w4,  uxtw
    331 .if \type == 444
    332        add             x10, x6,  w4,  uxtw
    333 .elseif \type == 422
    334        add             x10, x6,  x11, lsr #1
    335 .endif
    336        add             x9,  x3,  w4,  uxtw #1
    337        add             x7,  x2,  w4,  uxtw #1
    338 161:
    339        mov             w8,  w4
    340 16:
    341        ld1             {v4.8h,   v5.8h},   [x2],  #32
    342        ld1             {v6.8h,   v7.8h},   [x3],  #32
    343        ld1             {v16.8h,  v17.8h},  [x7],  #32
    344        ld1             {v18.8h,  v19.8h},  [x9],  #32
    345        subs            w8,  w8,  #16
    346        sub             v6.8h,   v6.8h,   v4.8h
    347        sub             v7.8h,   v7.8h,   v5.8h
    348        sub             v18.8h,  v18.8h,  v16.8h
    349        sub             v19.8h,  v19.8h,  v17.8h
    350        abs             v20.8h,  v6.8h
    351        abs             v21.8h,  v7.8h
    352        abs             v22.8h,  v18.8h
    353        abs             v23.8h,  v19.8h
    354        uqsub           v20.8h,  v0.8h,   v20.8h
    355        uqsub           v21.8h,  v0.8h,   v21.8h
    356        uqsub           v22.8h,  v0.8h,   v22.8h
    357        uqsub           v23.8h,  v0.8h,   v23.8h
    358        ushr            v20.8h,  v20.8h,  #8
    359        ushr            v21.8h,  v21.8h,  #8
    360        ushr            v22.8h,  v22.8h,  #8
    361        ushr            v23.8h,  v23.8h,  #8
    362        shl             v24.8h,  v20.8h,  #9
    363        shl             v25.8h,  v21.8h,  #9
    364        shl             v26.8h,  v22.8h,  #9
    365        shl             v27.8h,  v23.8h,  #9
    366        sqdmulh         v24.8h,  v24.8h,  v6.8h
    367        sqdmulh         v25.8h,  v25.8h,  v7.8h
    368        sqdmulh         v26.8h,  v26.8h,  v18.8h
    369        sqdmulh         v27.8h,  v27.8h,  v19.8h
    370        add             v24.8h,  v24.8h,  v4.8h
    371        add             v25.8h,  v25.8h,  v5.8h
    372        add             v26.8h,  v26.8h,  v16.8h
    373        add             v27.8h,  v27.8h,  v17.8h
    374        sqrshrun        v24.8b,  v24.8h,  #4
    375        sqrshrun        v25.8b,  v25.8h,  #4
    376        sqrshrun        v26.8b,  v26.8h,  #4
    377        sqrshrun        v27.8b,  v27.8h,  #4
    378 .if \type == 444
    379        uzp1            v20.16b, v20.16b, v21.16b       // Same as xtn, xtn2
    380        uzp1            v21.16b, v22.16b, v23.16b       // Ditto
    381        sub             v20.16b, v1.16b,  v20.16b
    382        sub             v21.16b, v1.16b,  v21.16b
    383        st1             {v20.16b}, [x6],  #16
    384        st1             {v21.16b}, [x10], #16
    385 .elseif \type == 422
    386        addp            v20.8h,  v20.8h,  v21.8h
    387        addp            v21.8h,  v22.8h,  v23.8h
    388        xtn             v20.8b,  v20.8h
    389        xtn             v21.8b,  v21.8h
    390        uhsub           v20.8b,  v3.8b,   v20.8b
    391        uhsub           v21.8b,  v3.8b,   v21.8b
    392        st1             {v20.8b},  [x6],  #8
    393        st1             {v21.8b},  [x10], #8
    394 .elseif \type == 420
    395        add             v20.8h,  v20.8h,  v22.8h
    396        add             v21.8h,  v21.8h,  v23.8h
    397        addp            v20.8h,  v20.8h,  v21.8h
    398        sub             v20.8h,  v3.8h,   v20.8h
    399        rshrn           v20.8b,  v20.8h,  #2
    400        st1             {v20.8b},  [x6],  #8
    401 .endif
    402        st1             {v24.8b,  v25.8b},  [x0],  #16
    403        st1             {v26.8b,  v27.8b},  [x12], #16
    404        b.gt            16b
    405        subs            w5,  w5,  #2
    406        add             x2,  x2,  w4,  uxtw #1
    407        add             x3,  x3,  w4,  uxtw #1
    408        add             x7,  x7,  w4,  uxtw #1
    409        add             x9,  x9,  w4,  uxtw #1
    410 .if \type == 444
    411        add             x6,  x6,  w4,  uxtw
    412        add             x10, x10, w4,  uxtw
    413 .elseif \type == 422
    414        add             x6,  x6,  x11, lsr #1
    415        add             x10, x10, x11, lsr #1
    416 .endif
    417        add             x0,  x0,  x1
    418        add             x12, x12, x1
    419        b.gt            161b
    420        ret
    421 endfunc
    422 
    423 jumptable w_mask_\type\()_tbl
    424        .word 1280b - w_mask_\type\()_tbl
    425        .word 640b  - w_mask_\type\()_tbl
    426        .word 320b  - w_mask_\type\()_tbl
    427        .word 160b  - w_mask_\type\()_tbl
    428        .word 80b   - w_mask_\type\()_tbl
    429        .word 40b   - w_mask_\type\()_tbl
    430 endjumptable
    431 .endm
    432 
    433 w_mask_fn 444
    434 w_mask_fn 422
    435 w_mask_fn 420
    436 
    437 
    438 function blend_8bpc_neon, export=1
    439        movrel          x6,  blend_tbl
    440        clz             w3,  w3
    441        sub             w3,  w3,  #26
    442        ldrsw           x3,  [x6,  x3,  lsl #2]
    443        add             x6,  x6,  x3
    444        movi            v4.16b,  #64
    445        add             x8,  x0,  x1
    446        lsl             x1,  x1,  #1
    447        br              x6
    448 40:
    449        AARCH64_VALID_JUMP_TARGET
    450 4:
    451        ld1             {v2.8b},  [x5],  #8
    452        ldr             d1,       [x2],  #8
    453        ldr             s0,       [x0]
    454        subs            w4,  w4,  #2
    455        ld1             {v0.s}[1],   [x8]
    456        sub             v3.8b,   v4.8b,   v2.8b
    457        umull           v5.8h,   v1.8b,   v2.8b
    458        umlal           v5.8h,   v0.8b,   v3.8b
    459        rshrn           v6.8b,   v5.8h,   #6
    460        st1             {v6.s}[0],   [x0],  x1
    461        st1             {v6.s}[1],   [x8],  x1
    462        b.gt            4b
    463        ret
    464 80:
    465        AARCH64_VALID_JUMP_TARGET
    466 8:
    467        ld1             {v2.16b},  [x5],  #16
    468        ld1             {v1.16b},  [x2],  #16
    469        ldr             d0,        [x0]
    470        ld1             {v0.d}[1], [x8]
    471        sub             v3.16b,  v4.16b,  v2.16b
    472        subs            w4,  w4,  #2
    473        umull           v5.8h,   v1.8b,   v2.8b
    474        umlal           v5.8h,   v0.8b,   v3.8b
    475        umull2          v6.8h,   v1.16b,  v2.16b
    476        umlal2          v6.8h,   v0.16b,  v3.16b
    477        rshrn           v7.8b,   v5.8h,   #6
    478        rshrn           v16.8b,  v6.8h,   #6
    479        st1             {v7.8b},   [x0],  x1
    480        st1             {v16.8b},  [x8],  x1
    481        b.gt            8b
    482        ret
    483 160:
    484        AARCH64_VALID_JUMP_TARGET
    485 16:
    486        ld1             {v1.16b,  v2.16b},  [x5],  #32
    487        ld1             {v5.16b,  v6.16b},  [x2],  #32
    488        ld1             {v0.16b},  [x0]
    489        subs            w4,  w4,  #2
    490        sub             v7.16b,  v4.16b,  v1.16b
    491        sub             v20.16b, v4.16b,  v2.16b
    492        ld1             {v3.16b},  [x8]
    493        umull           v16.8h,  v5.8b,   v1.8b
    494        umlal           v16.8h,  v0.8b,   v7.8b
    495        umull2          v17.8h,  v5.16b,  v1.16b
    496        umlal2          v17.8h,  v0.16b,  v7.16b
    497        umull           v21.8h,  v6.8b,   v2.8b
    498        umlal           v21.8h,  v3.8b,   v20.8b
    499        umull2          v22.8h,  v6.16b,  v2.16b
    500        umlal2          v22.8h,  v3.16b,  v20.16b
    501        rshrn           v18.8b,  v16.8h,  #6
    502        rshrn2          v18.16b, v17.8h,  #6
    503        rshrn           v19.8b,  v21.8h,  #6
    504        rshrn2          v19.16b, v22.8h,  #6
    505        st1             {v18.16b}, [x0],  x1
    506        st1             {v19.16b}, [x8],  x1
    507        b.gt            16b
    508        ret
    509 320:
    510        AARCH64_VALID_JUMP_TARGET
    511 32:
    512        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5],  #64
    513        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
    514        ld1             {v20.16b, v21.16b}, [x0]
    515        subs            w4,  w4,  #2
    516        ld1             {v22.16b, v23.16b}, [x8]
    517        sub             v5.16b,  v4.16b,  v0.16b
    518        sub             v6.16b,  v4.16b,  v1.16b
    519        sub             v30.16b, v4.16b,  v2.16b
    520        sub             v31.16b, v4.16b,  v3.16b
    521        umull           v24.8h,  v16.8b,  v0.8b
    522        umlal           v24.8h,  v20.8b,  v5.8b
    523        umull2          v26.8h,  v16.16b, v0.16b
    524        umlal2          v26.8h,  v20.16b, v5.16b
    525        umull           v28.8h,  v17.8b,  v1.8b
    526        umlal           v28.8h,  v21.8b,  v6.8b
    527        umull2          v7.8h,   v17.16b, v1.16b
    528        umlal2          v7.8h,   v21.16b, v6.16b
    529        umull           v27.8h,  v18.8b,  v2.8b
    530        umlal           v27.8h,  v22.8b,  v30.8b
    531        umull2          v1.8h,   v18.16b, v2.16b
    532        umlal2          v1.8h,   v22.16b, v30.16b
    533        umull           v29.8h,  v19.8b,  v3.8b
    534        umlal           v29.8h,  v23.8b,  v31.8b
    535        umull2          v21.8h,  v19.16b, v3.16b
    536        umlal2          v21.8h,  v23.16b, v31.16b
    537        rshrn           v24.8b,  v24.8h,  #6
    538        rshrn2          v24.16b, v26.8h,  #6
    539        rshrn           v25.8b,  v28.8h,  #6
    540        rshrn2          v25.16b, v7.8h,   #6
    541        rshrn           v27.8b,  v27.8h,  #6
    542        rshrn2          v27.16b, v1.8h,   #6
    543        rshrn           v28.8b,  v29.8h,  #6
    544        rshrn2          v28.16b, v21.8h,  #6
    545        st1             {v24.16b, v25.16b}, [x0],  x1
    546        st1             {v27.16b, v28.16b}, [x8],  x1
    547        b.gt            32b
    548        ret
    549 endfunc
    550 
    551 jumptable blend_tbl
    552        .word 320b - blend_tbl
    553        .word 160b - blend_tbl
    554        .word 80b  - blend_tbl
    555        .word 40b  - blend_tbl
    556 endjumptable
    557 
    558 function blend_h_8bpc_neon, export=1
    559        movrel          x6,  blend_h_tbl
    560        movrel          x5,  X(obmc_masks)
    561        add             x5,  x5,  w4,  uxtw
    562        sub             w4,  w4,  w4,  lsr #2
    563        clz             w7,  w3
    564        movi            v4.16b,  #64
    565        add             x8,  x0,  x1
    566        lsl             x1,  x1,  #1
    567        sub             w7,  w7,  #24
    568        ldrsw           x7,  [x6,  x7,  lsl #2]
    569        add             x6,  x6,  x7
    570        br              x6
    571 20:
    572        AARCH64_VALID_JUMP_TARGET
    573 2:
    574        ldr             h0,  [x5],  #2
    575        ldr             s1,  [x2],  #4
    576        subs            w4,  w4,  #2
    577        ldr             h2,  [x0]
    578        zip1            v0.8b,   v0.8b,   v0.8b
    579        sub             v3.8b,   v4.8b,   v0.8b
    580        ld1             {v2.h}[1],   [x8]
    581        umull           v5.8h,   v1.8b,   v0.8b
    582        umlal           v5.8h,   v2.8b,   v3.8b
    583        rshrn           v5.8b,   v5.8h,   #6
    584        st1             {v5.h}[0],   [x0],  x1
    585        st1             {v5.h}[1],   [x8],  x1
    586        b.gt            2b
    587        ret
    588 40:
    589        AARCH64_VALID_JUMP_TARGET
    590 4:
    591        ld2r            {v0.8b,   v1.8b},   [x5],  #2
    592        ld1             {v2.8b},   [x2],  #8
    593        subs            w4,  w4,  #2
    594        ext             v0.8b,   v0.8b,   v1.8b,   #4
    595        ldr             s3,          [x0]
    596        sub             v5.8b,   v4.8b,   v0.8b
    597        ld1             {v3.s}[1],   [x8]
    598        umull           v6.8h,   v2.8b,   v0.8b
    599        umlal           v6.8h,   v3.8b,   v5.8b
    600        rshrn           v6.8b,   v6.8h,   #6
    601        st1             {v6.s}[0],   [x0],  x1
    602        st1             {v6.s}[1],   [x8],  x1
    603        b.gt            4b
    604        ret
    605 80:
    606        AARCH64_VALID_JUMP_TARGET
    607 8:
    608        ld2r            {v0.16b,  v1.16b},  [x5],  #2
    609        ld1             {v2.16b},  [x2],  #16
    610        ldr             d3,        [x0]
    611        ext             v0.16b,  v0.16b,  v1.16b,  #8
    612        sub             v5.16b,  v4.16b,  v0.16b
    613        ld1             {v3.d}[1], [x8]
    614        subs            w4,  w4,  #2
    615        umull           v6.8h,   v0.8b,   v2.8b
    616        umlal           v6.8h,   v3.8b,   v5.8b
    617        umull2          v7.8h,   v0.16b,  v2.16b
    618        umlal2          v7.8h,   v3.16b,  v5.16b
    619        rshrn           v16.8b,  v6.8h,   #6
    620        rshrn           v17.8b,  v7.8h,   #6
    621        st1             {v16.8b},  [x0],  x1
    622        st1             {v17.8b},  [x8],  x1
    623        b.gt            8b
    624        ret
    625 160:
    626        AARCH64_VALID_JUMP_TARGET
    627 16:
    628        ld2r            {v0.16b,  v1.16b},  [x5],  #2
    629        ld1             {v2.16b,  v3.16b},  [x2],  #32
    630        ld1             {v5.16b},  [x0]
    631        sub             v7.16b,  v4.16b,  v0.16b
    632        sub             v16.16b, v4.16b,  v1.16b
    633        ld1             {v6.16b},  [x8]
    634        subs            w4,  w4,  #2
    635        umull           v17.8h,  v0.8b,   v2.8b
    636        umlal           v17.8h,  v5.8b,   v7.8b
    637        umull2          v18.8h,  v0.16b,  v2.16b
    638        umlal2          v18.8h,  v5.16b,  v7.16b
    639        umull           v19.8h,  v1.8b,   v3.8b
    640        umlal           v19.8h,  v6.8b,   v16.8b
    641        umull2          v20.8h,  v1.16b,  v3.16b
    642        umlal2          v20.8h,  v6.16b,  v16.16b
    643        rshrn           v21.8b,  v17.8h,  #6
    644        rshrn2          v21.16b, v18.8h,  #6
    645        rshrn           v22.8b,  v19.8h,  #6
    646        rshrn2          v22.16b, v20.8h,  #6
    647        st1             {v21.16b}, [x0],  x1
    648        st1             {v22.16b}, [x8],  x1
    649        b.gt            16b
    650        ret
    651 1280:
    652 640:
    653 320:
    654        AARCH64_VALID_JUMP_TARGET
    655        sub             x1,  x1,  w3,  uxtw
    656        add             x7,  x2,  w3,  uxtw
    657 321:
    658        ld2r            {v0.16b,  v1.16b},  [x5],  #2
    659        mov             w6,  w3
    660        sub             v20.16b, v4.16b,  v0.16b
    661        sub             v21.16b, v4.16b,  v1.16b
    662 32:
    663        ld1             {v16.16b, v17.16b}, [x2],  #32
    664        ld1             {v2.16b,  v3.16b},  [x0]
    665        subs            w6,  w6,  #32
    666        umull           v23.8h,  v0.8b,   v16.8b
    667        umlal           v23.8h,  v2.8b,   v20.8b
    668        ld1             {v18.16b, v19.16b}, [x7],  #32
    669        umull2          v27.8h,  v0.16b,  v16.16b
    670        umlal2          v27.8h,  v2.16b,  v20.16b
    671        ld1             {v6.16b,  v7.16b},  [x8]
    672        umull           v24.8h,  v0.8b,   v17.8b
    673        umlal           v24.8h,  v3.8b,   v20.8b
    674        umull2          v28.8h,  v0.16b,  v17.16b
    675        umlal2          v28.8h,  v3.16b,  v20.16b
    676        umull           v25.8h,  v1.8b,   v18.8b
    677        umlal           v25.8h,  v6.8b,   v21.8b
    678        umull2          v5.8h,   v1.16b,  v18.16b
    679        umlal2          v5.8h,   v6.16b,  v21.16b
    680        rshrn           v29.8b,  v23.8h,  #6
    681        rshrn2          v29.16b, v27.8h,  #6
    682        umull           v26.8h,  v1.8b,   v19.8b
    683        umlal           v26.8h,  v7.8b,   v21.8b
    684        umull2          v31.8h,  v1.16b,  v19.16b
    685        umlal2          v31.8h,  v7.16b,  v21.16b
    686        rshrn           v30.8b,  v24.8h,  #6
    687        rshrn2          v30.16b, v28.8h,  #6
    688        rshrn           v23.8b,  v25.8h,  #6
    689        rshrn2          v23.16b, v5.8h,   #6
    690        rshrn           v24.8b,  v26.8h,  #6
    691        st1             {v29.16b, v30.16b}, [x0],  #32
    692        rshrn2          v24.16b, v31.8h,  #6
    693        st1             {v23.16b, v24.16b}, [x8],  #32
    694        b.gt            32b
    695        subs            w4,  w4,  #2
    696        add             x0,  x0,  x1
    697        add             x8,  x8,  x1
    698        add             x2,  x2,  w3,  uxtw
    699        add             x7,  x7,  w3,  uxtw
    700        b.gt            321b
    701        ret
    702 endfunc
    703 
    704 jumptable blend_h_tbl
        // Width-dispatch table for blend_h: signed 32-bit offsets
        // relative to the table base, ordered widest (128) to
        // narrowest (2) so it can be indexed by clz(w).
    705        .word 1280b - blend_h_tbl
    706        .word 640b  - blend_h_tbl
    707        .word 320b  - blend_h_tbl
    708        .word 160b  - blend_h_tbl
    709        .word 80b   - blend_h_tbl
    710        .word 40b   - blend_h_tbl
    711        .word 20b   - blend_h_tbl
    712 endjumptable
    713 
    714 function blend_v_8bpc_neon, export=1
        // blend_v, 8 bpc: blend the left edge of dst with tmp using the
        // fixed obmc_masks weights.  Register roles as used below:
        // x0 = dst, x1 = dst stride, x2 = tmp (packed w*h bytes),
        // w3 = width, w4 = height (matches dav1d's blend_v signature —
        // confirm against the C prototype).
        // Each row computes out = (tmp*m + dst*(64-m) + 32) >> 6
        // (umull/umlal + rshrn #6).  Two rows are processed per
        // iteration via x0 and x8 = x0 + stride, stepping 2*stride.
        // Note the partial-row stores: only the leftmost pixels are
        // written (1 of 2, 3 of 4, 6 of 8, 12 of 16, 24 of 32); the
        // remaining dst columns are left untouched.
    715        movrel          x6,  blend_v_tbl
    716        movrel          x5,  X(obmc_masks)
    717        add             x5,  x5,  w3,  uxtw          // mask row for this width
    718        clz             w3,  w3
    719        movi            v4.16b,  #64                 // weights sum to 64
    720        add             x8,  x0,  x1                 // x8 = second output row
    721        lsl             x1,  x1,  #1                 // advance two rows per loop
    722        sub             w3,  w3,  #26                // table index = clz(w) - 26
    723        ldrsw           x3,  [x6,  x3,  lsl #2]
    724        add             x6,  x6,  x3
    725        br              x6
    726 20:    // w == 2: writes 1 pixel per row
    727        AARCH64_VALID_JUMP_TARGET
    728        ld1r            {v0.8b},   [x5]              // v0 = mask
    729        sub             v1.8b,   v4.8b,   v0.8b      // v1 = 64 - mask
    730 2:
    731        ldr             h2,          [x2],  #2
    732        ldr             b3,          [x0]
    733        subs            w4,  w4,  #2
    734        ld1             {v2.b}[1],   [x2]            // lane1 = row 2 of tmp
    735        ld1             {v3.b}[1],   [x8]            // lane1 = row 2 of dst
    736        umull           v5.8h,   v2.8b,   v0.8b
    737        umlal           v5.8h,   v3.8b,   v1.8b
    738        rshrn           v5.8b,   v5.8h,   #6
    739        add             x2,  x2,  #2
    740        st1             {v5.b}[0],   [x0],  x1
    741        st1             {v5.b}[1],   [x8],  x1
    742        b.gt            2b
    743        ret
    744 40:    // w == 4: writes 3 pixels per row
    745        AARCH64_VALID_JUMP_TARGET
    746        ld1r            {v0.2s},   [x5]
    747        sub             x1,  x1,  #2                 // stride minus the 2 bytes stored first
    748        sub             v1.8b,   v4.8b,   v0.8b
    749 4:
    750        ld1             {v2.8b},   [x2],  #8
    751        ldr             s3,          [x0]
    752        ld1             {v3.s}[1],   [x8]
    753        subs            w4,  w4,  #2
    754        umull           v5.8h,   v2.8b,   v0.8b
    755        umlal           v5.8h,   v3.8b,   v1.8b
    756        rshrn           v5.8b,   v5.8h,   #6
    757        str             h5,          [x0],  #2
    758        st1             {v5.h}[2],   [x8],  #2
    759        st1             {v5.b}[2],   [x0],  x1
    760        st1             {v5.b}[6],   [x8],  x1
    761        b.gt            4b
    762        ret
    763 80:    // w == 8: writes 6 pixels per row
    764        AARCH64_VALID_JUMP_TARGET
    765        ld1r            {v0.2d},   [x5]
    766        sub             x1,  x1,  #4                 // stride minus the 4 bytes stored first
    767        sub             v1.16b,  v4.16b,  v0.16b
    768        zip2            v16.2d,  v1.2d,   v1.2d      // (64-mask) for the second row's half
    769 8:
    770        ld1             {v2.16b},  [x2],  #16
    771        ldr             d3,          [x0]
    772        ldr             d4,          [x8]            // clobbers v4(#64); v1/v16 already derived
    773        subs            w4,  w4,  #2
    774        umull           v5.8h,  v0.8b,  v2.8b
    775        umlal           v5.8h,  v3.8b,  v1.8b
    776        umull2          v6.8h,  v0.16b, v2.16b
    777        umlal           v6.8h,  v4.8b,  v16.8b
    778        rshrn           v7.8b,  v5.8h,  #6
    779        rshrn           v17.8b, v6.8h,  #6
    780        str             s7,          [x0],  #4
    781        str             s17,         [x8],  #4
    782        st1             {v7.h}[2],   [x0],  x1
    783        st1             {v17.h}[2],  [x8],  x1
    784        b.gt            8b
    785        ret
    786 160:   // w == 16: writes 12 pixels per row
    787        AARCH64_VALID_JUMP_TARGET
    788        ld1             {v0.16b},  [x5]
    789        sub             x1,  x1,  #8                 // stride minus the 8 bytes stored first
    790        sub             v2.16b,  v4.16b,  v0.16b
    791 16:
    792        ld1             {v5.16b,  v6.16b},  [x2],  #32
    793        ld1             {v7.16b},  [x0]
    794        subs            w4,  w4,  #2
    795        ld1             {v16.16b}, [x8]
    796        umull           v17.8h,  v5.8b,   v0.8b
    797        umlal           v17.8h,  v7.8b,   v2.8b
    798        umull2          v18.8h,  v5.16b,  v0.16b
    799        umlal2          v18.8h,  v7.16b,  v2.16b
    800        umull           v20.8h,  v6.8b,   v0.8b
    801        umlal           v20.8h,  v16.8b,  v2.8b
    802        umull2          v21.8h,  v6.16b,  v0.16b
    803        umlal2          v21.8h,  v16.16b, v2.16b
    804        rshrn           v19.8b,  v17.8h,  #6
    805        rshrn2          v19.16b, v18.8h,  #6
    806        rshrn           v22.8b,  v20.8h,  #6
    807        rshrn2          v22.16b, v21.8h,  #6
    808        st1             {v19.8b},  [x0],  #8
    809        st1             {v22.8b},  [x8],  #8
    810        st1             {v19.s}[2],  [x0],  x1
    811        st1             {v22.s}[2],  [x8],  x1
    812        b.gt            16b
    813        ret
    814 320:   // w == 32: writes 24 pixels per row
    815        AARCH64_VALID_JUMP_TARGET
    816        ld1             {v0.16b,  v1.16b},  [x5]
    817        sub             x1,  x1,  #16                // stride minus the 16 bytes stored first
    818        sub             v2.16b,  v4.16b,  v0.16b
    819        sub             v3.8b,   v4.8b,   v1.8b      // only low 8 mask bytes needed for cols 16..23
    820 32:
    821        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x2],  #64
    822        ld1             {v5.16b,  v6.16b},  [x0]
    823        subs            w4,  w4,  #2
    824        ld1             {v20.16b, v21.16b}, [x8]
    825        umull           v22.8h,  v16.8b,  v0.8b
    826        umlal           v22.8h,  v5.8b,   v2.8b
    827        umull2          v23.8h,  v16.16b, v0.16b
    828        umlal2          v23.8h,  v5.16b,  v2.16b
    829        umull           v28.8h,  v17.8b,  v1.8b
    830        umlal           v28.8h,  v6.8b,   v3.8b
    831        umull           v30.8h,  v18.8b,  v0.8b
    832        umlal           v30.8h,  v20.8b,  v2.8b
    833        umull2          v31.8h,  v18.16b, v0.16b
    834        umlal2          v31.8h,  v20.16b, v2.16b
    835        umull           v25.8h,  v19.8b,  v1.8b
    836        umlal           v25.8h,  v21.8b,  v3.8b
    837        rshrn           v24.8b,  v22.8h,  #6
    838        rshrn2          v24.16b, v23.8h,  #6
    839        rshrn           v28.8b,  v28.8h,  #6
    840        rshrn           v30.8b,  v30.8h,  #6
    841        rshrn2          v30.16b, v31.8h,  #6
    842        rshrn           v27.8b,  v25.8h,  #6
    843        st1             {v24.16b}, [x0],  #16
    844        st1             {v30.16b}, [x8],  #16
    845        st1             {v28.8b},  [x0],  x1
    846        st1             {v27.8b},  [x8],  x1
    847        b.gt            32b
    848        ret
    849 endfunc
    850 
    851 jumptable blend_v_tbl
        // Width-dispatch table for blend_v above: signed 32-bit offsets
        // from the table base, indexed by clz(w) - 26 (w = 32 .. 2).
    852        .word 320b - blend_v_tbl
    853        .word 160b - blend_v_tbl
    854        .word 80b  - blend_v_tbl
    855        .word 40b  - blend_v_tbl
    856        .word 20b  - blend_v_tbl
    857 endjumptable
    858 
    859 
    860 // This has got the same signature as the put_8tap functions,
    861 // and assumes that x8 is set to (clz(w)-24).
    862 function put_neon, export=1
        // Plain copy (no subpel filtering): dst = src, dispatched on
        // width via put_tbl.  Register roles as used below:
        // x0 = dst, x1 = dst stride, x2 = src, x3 = src stride,
        // w5 = height; x8 = clz(w) - 24 (preset by caller, see note above).
        // Widths <= 16 copy two rows per iteration; 32 copies two rows
        // with ldp/stp; 64/128 copy one row per iteration.
    863        movrel          x9,  put_tbl
    864        ldrsw           x8,  [x9, x8, lsl #2]
    865        add             x9,  x9,  x8
    866        br              x9
    867 
    868 20:    // 2xN
    869        AARCH64_VALID_JUMP_TARGET
    870 2:
    871        ldrh            w9, [x2]
    872        ldrh            w10, [x2, x3]
    873        add             x2, x2, x3, lsl #1
    874        subs            w5, w5, #2
    875        strh            w9, [x0]
    876        strh            w10, [x0, x1]
    877        add             x0, x0, x1, lsl #1
    878        b.gt            2b
    879        ret
    880 40:    // 4xN
    881        AARCH64_VALID_JUMP_TARGET
    882 4:
    883        ldr             w9, [x2]
    884        ldr             w10, [x2, x3]
    885        add             x2, x2, x3, lsl #1
    886        subs            w5, w5, #2
    887        str             w9, [x0]
    888        str             w10, [x0, x1]
    889        add             x0, x0, x1, lsl #1
    890        b.gt            4b
    891        ret
    892 80:    // 8xN
    893        AARCH64_VALID_JUMP_TARGET
    894 8:
    895        ldr             x9, [x2]
    896        ldr             x10, [x2, x3]
    897        add             x2, x2, x3, lsl #1
    898        subs            w5, w5, #2
    899        str             x9, [x0]
    900        str             x10, [x0, x1]
    901        add             x0, x0, x1, lsl #1
    902        b.gt            8b
    903        ret
    904 160:   // 16xN
    905        AARCH64_VALID_JUMP_TARGET
    906 16:
    907        ldr             q0, [x2]
    908        ldr             q1, [x2, x3]
    909        add             x2, x2, x3, lsl #1
    910        subs            w5, w5, #2
    911        str             q0, [x0]
    912        str             q1, [x0, x1]
    913        add             x0, x0, x1, lsl #1
    914        b.gt            16b
    915        ret
    916 320:   // 32xN, two rows per iteration
    917        AARCH64_VALID_JUMP_TARGET
    918 32:
    919        ldp             q0, q1, [x2]
    920        add             x2, x2, x3
    921        stp             q0, q1, [x0]
    922        add             x0, x0, x1
    923        ldp             q2, q3, [x2]
    924        add             x2, x2, x3
    925        stp             q2, q3, [x0]
    926        subs            w5, w5, #2
    927        add             x0, x0, x1
    928        b.gt            32b
    929        ret
    930 640:   // 64xN, one row per iteration
    931        AARCH64_VALID_JUMP_TARGET
    932 64:
    933        ldp             q0, q1, [x2]
    934        stp             q0, q1, [x0]
    935        ldp             q2, q3, [x2, #32]
    936        add             x2, x2, x3
    937        stp             q2, q3, [x0, #32]
    938        subs            w5, w5, #1
    939        add             x0, x0, x1
    940        b.gt            64b
    941        ret
    942 1280:  // 128xN, one row per iteration
    943        AARCH64_VALID_JUMP_TARGET
    944 128:
    945        ldp             q0, q1, [x2]
    946        stp             q0, q1, [x0]
    947        ldp             q2, q3, [x2, #32]
    948        stp             q2, q3, [x0, #32]
    949        ldp             q4, q5, [x2, #64]
    950        stp             q4, q5, [x0, #64]
    951        ldp             q6, q7, [x2, #96]
    952        add             x2, x2, x3
    953        stp             q6, q7, [x0, #96]
    954        subs            w5, w5, #1
    955        add             x0, x0, x1
    956        b.gt            128b
    957        ret
    958 endfunc
    959 
    960 jumptable put_tbl
        // Width-dispatch table for put_neon: signed 32-bit offsets from
        // the table base, indexed by clz(w) - 24 (w = 128 .. 2).
    961        .word 1280b - put_tbl
    962        .word 640b  - put_tbl
    963        .word 320b  - put_tbl
    964        .word 160b  - put_tbl
    965        .word 80b   - put_tbl
    966        .word 40b   - put_tbl
    967        .word 20b   - put_tbl
    968 endjumptable
    969 
    970 
    971 // This has got the same signature as the prep_8tap functions,
    972 // and assumes that x8 is set to (clz(w)-24), and x7 to w*2.
    973 function prep_neon, export=1
        // Plain prep (no subpel filtering): widen 8-bit source pixels to
        // the 16-bit intermediate format scaled by 16 (<< 4) and store
        // rows contiguously.  Register roles as used below:
        // x0 = tmp out, x1 = src, x2 = src stride, w4 = height;
        // x8 = clz(w) - 24 (preset by caller, see note above).
        // v24 = 16 so that "umull vD, vS, v24" computes the same
        // src * 16 as "ushll vD, vS, #4"; both encodings are used.
    974        movrel          x9,  prep_tbl
    975        ldrsw           x8,  [x9, x8, lsl #2]
    976        movi            v24.16b, #16
    977        add             x9,  x9,  x8
    978        br              x9
    979 
    980 40:    // 4xN, four rows per iteration
    981        AARCH64_VALID_JUMP_TARGET
    982 4:
    983        ldr             s0, [x1]
    984        ldr             s2, [x1, x2]
    985        add             x1, x1, x2, lsl #1
    986        ldr             s1, [x1]
    987        ldr             s3, [x1, x2]
    988        add             x1, x1, x2, lsl #1
    989        mov             v0.s[1], v2.s[0]             // pack rows 0+1 into one d reg
    990        mov             v1.s[1], v3.s[0]             // pack rows 2+3
    991        ushll           v0.8h, v0.8b, #4
    992        ushll           v1.8h, v1.8b, #4
    993        subs            w4, w4, #4
    994        stp             q0, q1, [x0], #32
    995        b.gt            4b
    996        ret
    997 80:    // 8xN, four rows per iteration
    998        AARCH64_VALID_JUMP_TARGET
    999 8:
   1000        ldr             d0, [x1]
   1001        ldr             d1, [x1, x2]
   1002        add             x1, x1, x2, lsl #1
   1003        ldr             d2, [x1]
   1004        ldr             d3, [x1, x2]
   1005        add             x1, x1, x2, lsl #1
   1006        ushll           v0.8h, v0.8b, #4
   1007        ushll           v1.8h, v1.8b, #4
   1008        umull           v2.8h, v2.8b, v24.8b         // == ushll #4
   1009        umull           v3.8h, v3.8b, v24.8b
   1010        subs            w4, w4, #4
   1011        stp             q0, q1, [x0]
   1012        stp             q2, q3, [x0, #32]
   1013        add             x0, x0, #64
   1014        b.gt            8b
   1015        ret
   1016 160:   // 16xN, four rows per iteration
   1017        AARCH64_VALID_JUMP_TARGET
   1018 16:
   1019        ldr             q1, [x1]
   1020        ldr             q3, [x1, x2]
   1021        add             x1, x1, x2, lsl #1
   1022        ushll           v0.8h, v1.8b, #4
   1023        ushll2          v1.8h, v1.16b, #4
   1024        ldr             q5, [x1]
   1025        ldr             q7, [x1, x2]
   1026        add             x1, x1, x2, lsl #1
   1027        umull           v2.8h, v3.8b, v24.8b
   1028        umull2          v3.8h, v3.16b, v24.16b
   1029        ushll           v4.8h, v5.8b, #4
   1030        ushll2          v5.8h, v5.16b, #4
   1031        umull           v6.8h, v7.8b, v24.8b
   1032        umull2          v7.8h, v7.16b, v24.16b
   1033        subs            w4, w4, #4
   1034        stp             q0, q1, [x0]
   1035        stp             q2, q3, [x0, #32]
   1036        stp             q4, q5, [x0, #64]
   1037        stp             q6, q7, [x0, #96]
   1038        add             x0, x0, #128
   1039        b.gt            16b
   1040        ret
   1041 320:   // 32xN, two rows per iteration
   1042        AARCH64_VALID_JUMP_TARGET
   1043 32:
   1044        ldp             q4, q5, [x1]
   1045        add             x1, x1, x2
   1046        ldp             q6, q7, [x1]
   1047        add             x1, x1, x2
   1048        ushll           v0.8h, v4.8b, #4
   1049        ushll2          v1.8h, v4.16b, #4
   1050        umull           v2.8h, v5.8b, v24.8b
   1051        umull2          v3.8h, v5.16b, v24.16b
   1052        ushll           v4.8h, v6.8b, #4
   1053        ushll2          v5.8h, v6.16b, #4
   1054        umull           v6.8h, v7.8b, v24.8b
   1055        umull2          v7.8h, v7.16b, v24.16b
   1056        subs            w4, w4, #2
   1057        stp             q0, q1, [x0]
   1058        stp             q2, q3, [x0, #32]
   1059        stp             q4, q5, [x0, #64]
   1060        stp             q6, q7, [x0, #96]
   1061        add             x0, x0, #128
   1062        b.gt            32b
   1063        ret
   1064 640:   // 64xN, one row per iteration
   1065        AARCH64_VALID_JUMP_TARGET
   1066 64:
   1067        ldp             q4, q5, [x1]
   1068        ldp             q6, q7, [x1, #32]
   1069        add             x1, x1, x2
   1070        ushll           v0.8h, v4.8b, #4
   1071        ushll2          v1.8h, v4.16b, #4
   1072        umull           v2.8h, v5.8b, v24.8b
   1073        umull2          v3.8h, v5.16b, v24.16b
   1074        ushll           v4.8h, v6.8b, #4
   1075        ushll2          v5.8h, v6.16b, #4
   1076        umull           v6.8h, v7.8b, v24.8b
   1077        umull2          v7.8h, v7.16b, v24.16b
   1078        subs            w4, w4, #1
   1079        stp             q0, q1, [x0]
   1080        stp             q2, q3, [x0, #32]
   1081        stp             q4, q5, [x0, #64]
   1082        stp             q6, q7, [x0, #96]
   1083        add             x0, x0, #128
   1084        b.gt            64b
   1085        ret
   1086 1280:  // 128xN, one row (two 64-byte halves) per iteration
   1087        AARCH64_VALID_JUMP_TARGET
   1088 128:
   1089        ldp             q28, q29, [x1]
   1090        ldp             q30, q31, [x1, #32]
   1091        ushll           v16.8h, v28.8b, #4
   1092        ushll2          v17.8h, v28.16b, #4
   1093        umull           v18.8h, v29.8b, v24.8b
   1094        umull2          v19.8h, v29.16b, v24.16b
   1095        ushll           v20.8h, v30.8b, #4
   1096        ushll2          v21.8h, v30.16b, #4
   1097        umull           v22.8h, v31.8b, v24.8b
   1098        umull2          v23.8h, v31.16b, v24.16b
   1099        ldp             q28, q29, [x1, #64]
   1100        ldp             q30, q31, [x1, #96]
   1101        add             x1, x1, x2
   1102        stp             q16, q17, [x0]
   1103        stp             q18, q19, [x0, #32]
   1104        stp             q20, q21, [x0, #64]
   1105        stp             q22, q23, [x0, #96]
   1106        ushll           v16.8h, v28.8b, #4
   1107        ushll2          v17.8h, v28.16b, #4
   1108        umull           v18.8h, v29.8b, v24.8b
   1109        umull2          v19.8h, v29.16b, v24.16b
   1110        ushll           v20.8h, v30.8b, #4
   1111        ushll2          v21.8h, v30.16b, #4
   1112        umull           v22.8h, v31.8b, v24.8b
   1113        umull2          v23.8h, v31.16b, v24.16b
   1114        subs            w4, w4, #1
   1115        stp             q16, q17, [x0, #128]
   1116        stp             q18, q19, [x0, #160]
   1117        stp             q20, q21, [x0, #192]
   1118        stp             q22, q23, [x0, #224]
   1119        add             x0, x0, #256
   1120        b.gt            128b
   1121        ret
   1122 endfunc
   1123 
   1124 jumptable prep_tbl
        // Width-dispatch table for prep_neon; no 2-wide entry
        // (smallest prep block is w == 4).
   1125        .word 1280b - prep_tbl
   1126        .word 640b  - prep_tbl
   1127        .word 320b  - prep_tbl
   1128        .word 160b  - prep_tbl
   1129        .word 80b   - prep_tbl
   1130        .word 40b   - prep_tbl
   1131 endjumptable
   1132 
   1133 
   1134 .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        // Load one lane ([0]) of 2-7 vector registers, alternating
        // between the two row pointers \s0/\s1 and post-incrementing
        // each by \strd.  \wd is the lane width suffix (.h/.s).
        // \d2..\d6 are optional (skipped when the argument is empty).
   1135        ld1             {\d0\wd}[0], [\s0], \strd
   1136        ld1             {\d1\wd}[0], [\s1], \strd
   1137 .ifnb \d2
   1138        ld1             {\d2\wd}[0], [\s0], \strd
   1139        ld1             {\d3\wd}[0], [\s1], \strd
   1140 .endif
   1141 .ifnb \d4
   1142        ld1             {\d4\wd}[0], [\s0], \strd
   1143 .endif
   1144 .ifnb \d5
   1145        ld1             {\d5\wd}[0], [\s1], \strd
   1146 .endif
   1147 .ifnb \d6
   1148        ld1             {\d6\wd}[0], [\s0], \strd
   1149 .endif
   1150 .endm
   1151 .macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
        // Like load_slice, but loads whole registers (\wd = .8b/.16b)
        // instead of single lanes, alternating \s0/\s1 with stride \strd.
   1152        ld1             {\d0\wd}, [\s0], \strd
   1153        ld1             {\d1\wd}, [\s1], \strd
   1154 .ifnb \d2
   1155        ld1             {\d2\wd}, [\s0], \strd
   1156        ld1             {\d3\wd}, [\s1], \strd
   1157 .endif
   1158 .ifnb \d4
   1159        ld1             {\d4\wd}, [\s0], \strd
   1160 .endif
   1161 .ifnb \d5
   1162        ld1             {\d5\wd}, [\s1], \strd
   1163 .endif
   1164 .ifnb \d6
   1165        ld1             {\d6\wd}, [\s0], \strd
   1166 .endif
   1167 .endm
   1168 .macro load_h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
        // Width-specific wrappers: lane loads (.h/.s) via load_slice,
        // full-register loads (.8b/.16b) via load_reg.
   1169        load_slice      \s0, \s1, \strd, .h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
   1170 .endm
   1171 .macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
   1172        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
   1173 .endm
   1174 .macro load_8b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
   1175        load_reg        \s0, \s1, \strd, .8b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
   1176 .endm
   1177 .macro load_16b s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
   1178        load_reg        \s0, \s1, \strd, .16b, \d0, \d1, \d2, \d3, \d4, \d5, \d6
   1179 .endm
   1180 .macro interleave_1 wd, r0, r1, r2, r3, r4
        // Combine each register with the following one (trn1), so rN's
        // low lane pairs with rN+1's: rN = {rN[0], rN+1[0]}.
        // \r3/\r4 are optional.
   1181        trn1            \r0\wd, \r0\wd, \r1\wd
   1182        trn1            \r1\wd, \r1\wd, \r2\wd
   1183 .ifnb \r3
   1184        trn1            \r2\wd, \r2\wd, \r3\wd
   1185        trn1            \r3\wd, \r3\wd, \r4\wd
   1186 .endif
   1187 .endm
   1188 .macro interleave_1_h r0, r1, r2, r3, r4
   1189        interleave_1    .4h, \r0, \r1, \r2, \r3, \r4
   1190 .endm
   1191 .macro interleave_1_s r0, r1, r2, r3, r4
   1192        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
   1193 .endm
   1194 .macro interleave_2 wd, r0, r1, r2, r3, r4, r5
        // Combine each register with the one two places later:
        // rN = {rN[0], rN+2[0]} via trn1.
   1195        trn1            \r0\wd,  \r0\wd, \r2\wd
   1196        trn1            \r1\wd,  \r1\wd, \r3\wd
   1197        trn1            \r2\wd,  \r2\wd, \r4\wd
   1198        trn1            \r3\wd,  \r3\wd, \r5\wd
   1199 .endm
   1200 .macro interleave_2_s r0, r1, r2, r3, r4, r5
   1201        interleave_2    .2s, \r0, \r1, \r2, \r3, \r4, \r5
   1202 .endm
   1203 .macro uxtl_b r0, r1, r2, r3, r4, r5, r6
        // Zero-extend the low 8 bytes of 2-7 registers to 16-bit lanes
        // in place.  \r2..\r6 are optional.
   1204        uxtl            \r0\().8h, \r0\().8b
   1205        uxtl            \r1\().8h, \r1\().8b
   1206 .ifnb \r2
   1207        uxtl            \r2\().8h, \r2\().8b
   1208        uxtl            \r3\().8h, \r3\().8b
   1209 .endif
   1210 .ifnb \r4
   1211        uxtl            \r4\().8h, \r4\().8b
   1212 .endif
   1213 .ifnb \r5
   1214        uxtl            \r5\().8h, \r5\().8b
   1215 .endif
   1216 .ifnb \r6
   1217        uxtl            \r6\().8h, \r6\().8b
   1218 .endif
   1219 .endm
   1220 .macro mul_mla_4tap d, s0, s1, s2, s3, wd
        // 4-tap filter: \d = s0*c0 + s1*c1 + s2*c2 + s3*c3 with the
        // coefficients broadcast from v0.h[0..3].
   1221        mul             \d\wd,  \s0\wd,  v0.h[0]
   1222        mla             \d\wd,  \s1\wd,  v0.h[1]
   1223        mla             \d\wd,  \s2\wd,  v0.h[2]
   1224        mla             \d\wd,  \s3\wd,  v0.h[3]
   1225 .endm
   1226 // Interleaving the mul/mla chains actually hurts performance
   1227 // significantly on Cortex A53, thus keeping mul/mla tightly
   1228 // chained like this.
   1229 .macro mul_mla_6tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
        // 6-tap filters use coefficient lanes v0.h[1..6] only; \s0 and
        // \s7 are accepted but unused so callers can pass the same
        // argument layout as the 8-tap variants.
        // _0: one output; _1: second output shifted by one source reg;
        // _2: second output shifted by two.
   1230        mul             \d0\().4h, \s1\().4h, v0.h[1]
   1231        mla             \d0\().4h, \s2\().4h, v0.h[2]
   1232        mla             \d0\().4h, \s3\().4h, v0.h[3]
   1233        mla             \d0\().4h, \s4\().4h, v0.h[4]
   1234        mla             \d0\().4h, \s5\().4h, v0.h[5]
   1235        mla             \d0\().4h, \s6\().4h, v0.h[6]
   1236 .endm
   1237 .macro mul_mla_6tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
   1238        mul             \d0\().8h, \s1\().8h, v0.h[1]
   1239        mla             \d0\().8h, \s2\().8h, v0.h[2]
   1240        mla             \d0\().8h, \s3\().8h, v0.h[3]
   1241        mla             \d0\().8h, \s4\().8h, v0.h[4]
   1242        mla             \d0\().8h, \s5\().8h, v0.h[5]
   1243        mla             \d0\().8h, \s6\().8h, v0.h[6]
   1244 .endm
   1245 .macro mul_mla_6tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
   1246        mul             \d0\().8h, \s1\().8h, v0.h[1]
   1247        mla             \d0\().8h, \s2\().8h, v0.h[2]
   1248        mla             \d0\().8h, \s3\().8h, v0.h[3]
   1249        mla             \d0\().8h, \s4\().8h, v0.h[4]
   1250        mla             \d0\().8h, \s5\().8h, v0.h[5]
   1251        mla             \d0\().8h, \s6\().8h, v0.h[6]
   1252        mul             \d1\().8h, \s2\().8h, v0.h[1]
   1253        mla             \d1\().8h, \s3\().8h, v0.h[2]
   1254        mla             \d1\().8h, \s4\().8h, v0.h[3]
   1255        mla             \d1\().8h, \s5\().8h, v0.h[4]
   1256        mla             \d1\().8h, \s6\().8h, v0.h[5]
   1257        mla             \d1\().8h, \s7\().8h, v0.h[6]
   1258 .endm
   1259 .macro mul_mla_6tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
   1260        mul             \d0\().8h, \s1\().8h, v0.h[1]
   1261        mla             \d0\().8h, \s2\().8h, v0.h[2]
   1262        mla             \d0\().8h, \s3\().8h, v0.h[3]
   1263        mla             \d0\().8h, \s4\().8h, v0.h[4]
   1264        mla             \d0\().8h, \s5\().8h, v0.h[5]
   1265        mla             \d0\().8h, \s6\().8h, v0.h[6]
   1266        mul             \d1\().8h, \s3\().8h, v0.h[1]
   1267        mla             \d1\().8h, \s4\().8h, v0.h[2]
   1268        mla             \d1\().8h, \s5\().8h, v0.h[3]
   1269        mla             \d1\().8h, \s6\().8h, v0.h[4]
   1270        mla             \d1\().8h, \s7\().8h, v0.h[5]
   1271        mla             \d1\().8h, \s8\().8h, v0.h[6]
   1272 .endm
   1273 .macro mul_mla_8tap_0_4h d0, s0, s1, s2, s3, s4, s5, s6, s7
        // 8-tap filters: \d = sum(sN * v0.h[N]) over all 8 coefficient
        // lanes.  _0: one output; _1: second output shifted by one
        // source reg; _2: second output shifted by two.
   1274        mul             \d0\().4h, \s0\().4h, v0.h[0]
   1275        mla             \d0\().4h, \s1\().4h, v0.h[1]
   1276        mla             \d0\().4h, \s2\().4h, v0.h[2]
   1277        mla             \d0\().4h, \s3\().4h, v0.h[3]
   1278        mla             \d0\().4h, \s4\().4h, v0.h[4]
   1279        mla             \d0\().4h, \s5\().4h, v0.h[5]
   1280        mla             \d0\().4h, \s6\().4h, v0.h[6]
   1281        mla             \d0\().4h, \s7\().4h, v0.h[7]
   1282 .endm
   1283 .macro mul_mla_8tap_0 d0, s0, s1, s2, s3, s4, s5, s6, s7
   1284        mul             \d0\().8h, \s0\().8h, v0.h[0]
   1285        mla             \d0\().8h, \s1\().8h, v0.h[1]
   1286        mla             \d0\().8h, \s2\().8h, v0.h[2]
   1287        mla             \d0\().8h, \s3\().8h, v0.h[3]
   1288        mla             \d0\().8h, \s4\().8h, v0.h[4]
   1289        mla             \d0\().8h, \s5\().8h, v0.h[5]
   1290        mla             \d0\().8h, \s6\().8h, v0.h[6]
   1291        mla             \d0\().8h, \s7\().8h, v0.h[7]
   1292 .endm
   1293 .macro mul_mla_8tap_1 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8
   1294        mul             \d0\().8h, \s0\().8h, v0.h[0]
   1295        mla             \d0\().8h, \s1\().8h, v0.h[1]
   1296        mla             \d0\().8h, \s2\().8h, v0.h[2]
   1297        mla             \d0\().8h, \s3\().8h, v0.h[3]
   1298        mla             \d0\().8h, \s4\().8h, v0.h[4]
   1299        mla             \d0\().8h, \s5\().8h, v0.h[5]
   1300        mla             \d0\().8h, \s6\().8h, v0.h[6]
   1301        mla             \d0\().8h, \s7\().8h, v0.h[7]
   1302        mul             \d1\().8h, \s1\().8h, v0.h[0]
   1303        mla             \d1\().8h, \s2\().8h, v0.h[1]
   1304        mla             \d1\().8h, \s3\().8h, v0.h[2]
   1305        mla             \d1\().8h, \s4\().8h, v0.h[3]
   1306        mla             \d1\().8h, \s5\().8h, v0.h[4]
   1307        mla             \d1\().8h, \s6\().8h, v0.h[5]
   1308        mla             \d1\().8h, \s7\().8h, v0.h[6]
   1309        mla             \d1\().8h, \s8\().8h, v0.h[7]
   1310 .endm
   1311 .macro mul_mla_8tap_2 d0, d1, s0, s1, s2, s3, s4, s5, s6, s7, s8, s9
   1312        mul             \d0\().8h, \s0\().8h, v0.h[0]
   1313        mla             \d0\().8h, \s1\().8h, v0.h[1]
   1314        mla             \d0\().8h, \s2\().8h, v0.h[2]
   1315        mla             \d0\().8h, \s3\().8h, v0.h[3]
   1316        mla             \d0\().8h, \s4\().8h, v0.h[4]
   1317        mla             \d0\().8h, \s5\().8h, v0.h[5]
   1318        mla             \d0\().8h, \s6\().8h, v0.h[6]
   1319        mla             \d0\().8h, \s7\().8h, v0.h[7]
   1320        mul             \d1\().8h, \s2\().8h, v0.h[0]
   1321        mla             \d1\().8h, \s3\().8h, v0.h[1]
   1322        mla             \d1\().8h, \s4\().8h, v0.h[2]
   1323        mla             \d1\().8h, \s5\().8h, v0.h[3]
   1324        mla             \d1\().8h, \s6\().8h, v0.h[4]
   1325        mla             \d1\().8h, \s7\().8h, v0.h[5]
   1326        mla             \d1\().8h, \s8\().8h, v0.h[6]
   1327        mla             \d1\().8h, \s9\().8h, v0.h[7]
   1328 .endm
   1329 .macro sqrshrun_b shift, r0, r1, r2, r3
        // Saturating, rounding narrow to unsigned bytes (>> \shift)
        // for 1, 2 or 4 registers in place.
   1330        sqrshrun        \r0\().8b, \r0\().8h,  #\shift
   1331 .ifnb \r1
   1332        sqrshrun        \r1\().8b, \r1\().8h,  #\shift
   1333 .endif
   1334 .ifnb \r2
   1335        sqrshrun        \r2\().8b, \r2\().8h,  #\shift
   1336        sqrshrun        \r3\().8b, \r3\().8h,  #\shift
   1337 .endif
   1338 .endm
   1339 .macro srshr_h shift, r0, r1, r2, r3
        // Signed rounding shift right by \shift for 1, 2 or 4
        // 16-bit-lane registers in place.
   1340        srshr           \r0\().8h, \r0\().8h,  #\shift
   1341 .ifnb \r1
   1342        srshr           \r1\().8h, \r1\().8h,  #\shift
   1343 .endif
   1344 .ifnb \r2
   1345        srshr           \r2\().8h, \r2\().8h,  #\shift
   1346        srshr           \r3\().8h, \r3\().8h,  #\shift
   1347 .endif
   1348 .endm
   1349 .macro st_h strd, reg, lanes
        // Lane stores to the two row pointers x0/x8, each advanced by
        // \strd: st_h = 16-bit lanes, st_s = 32-bit lanes,
        // st_d = low half + high 64-bit lane.
   1350        st1             {\reg\().h}[0], [x0], \strd
   1351        st1             {\reg\().h}[1], [x8], \strd
   1352 .if \lanes > 2
   1353        st1             {\reg\().h}[2], [x0], \strd
   1354        st1             {\reg\().h}[3], [x8], \strd
   1355 .endif
   1356 .endm
   1357 .macro st_s strd, r0, r1
   1358        st1             {\r0\().s}[0], [x0], \strd
   1359        st1             {\r0\().s}[1], [x8], \strd
   1360 .ifnb \r1
   1361        st1             {\r1\().s}[0], [x0], \strd
   1362        st1             {\r1\().s}[1], [x8], \strd
   1363 .endif
   1364 .endm
   1365 .macro st_d strd, r0, r1
   1366        st1             {\r0\().8b},   [x0], \strd
   1367        st1             {\r0\().d}[1], [x8], \strd
   1368 .ifnb \r1
   1369        st1             {\r1\().8b},   [x0], \strd
   1370        st1             {\r1\().d}[1], [x8], \strd
   1371 .endif
   1372 .endm
   1373 .macro shift_store_4 type, strd, r0, r1
        // 4-wide output: put narrows+saturates to bytes (>> 6) and
        // stores 32-bit lanes; prep keeps 16-bit values (>> 2) and
        // stores 64-bit halves.
   1374 .ifc \type, put
   1375        sqrshrun_b      6,     \r0, \r1
   1376        st_s            \strd, \r0, \r1
   1377 .else
   1378        srshr_h         2,     \r0, \r1
   1379        st_d            \strd, \r0, \r1
   1380 .endif
   1381 .endm
   1382 .macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
        // Store 2, 4 or 8 whole registers, alternating the two row
        // pointers x0/x8, each advanced by \strd.
   1383        st1             {\r0\wd}, [x0], \strd
   1384        st1             {\r1\wd}, [x8], \strd
   1385 .ifnb \r2
   1386        st1             {\r2\wd}, [x0], \strd
   1387        st1             {\r3\wd}, [x8], \strd
   1388 .endif
   1389 .ifnb \r4
   1390        st1             {\r4\wd}, [x0], \strd
   1391        st1             {\r5\wd}, [x8], \strd
   1392        st1             {\r6\wd}, [x0], \strd
   1393        st1             {\r7\wd}, [x8], \strd
   1394 .endif
   1395 .endm
   1396 .macro st_8b strd, r0, r1, r2, r3, r4, r5, r6, r7
   1397        st_reg          \strd, .8b,  \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
   1398 .endm
   1399 .macro st_16b strd, r0, r1, r2, r3, r4, r5, r6, r7
   1400        st_reg          \strd, .16b, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
   1401 .endm
   1402 .macro shift_store_8 type, strd, r0, r1, r2, r3
        // 8-wide output: put narrows+saturates to bytes (>> 6);
        // prep keeps 16-bit values (>> 2).
   1403 .ifc \type, put
   1404        sqrshrun_b      6,     \r0, \r1, \r2, \r3
   1405        st_8b           \strd, \r0, \r1, \r2, \r3
   1406 .else
   1407        srshr_h         2,     \r0, \r1, \r2, \r3
   1408        st_16b          \strd, \r0, \r1, \r2, \r3
   1409 .endif
   1410 .endm
   1411 .macro shift_store_16 type, strd, r0, r1, r2, r3
        // 16-wide output: put packs each register pair into one 16-byte
        // row (>> 6 with saturation); prep stores two 16-bit vectors
        // per row (>> 2).
   1412 .ifc \type, put
   1413        sqrshrun        \r0\().8b,  \r0\().8h, #6
   1414        sqrshrun2       \r0\().16b, \r1\().8h, #6
   1415        sqrshrun        \r2\().8b,  \r2\().8h, #6
   1416        sqrshrun2       \r2\().16b, \r3\().8h, #6
   1417        st_16b          \strd, \r0, \r2
   1418 .else
   1419        srshr_h         2,     \r0, \r1, \r2, \r3
   1420        st1             {\r0\().8h, \r1\().8h}, [x0], \strd
   1421        st1             {\r2\().8h, \r3\().8h}, [x8], \strd
   1422 .endif
   1423 .endm
   1424 
   1425 .macro make_8tap_fn op, type, type_h, type_v, taps
        // Emit an exported \op (put/prep) entry point for one filter
        // combination: loads the packed h/v filter-type codes into
        // x8/x9 and tail-calls the shared \op\()_\taps\()_neon body.
   1426 function \op\()_8tap_\type\()_8bpc_neon, export=1
   1427        mov             x8,  \type_h
   1428        mov             x9,  \type_v
   1429        b               \op\()_\taps\()_neon
   1430 endfunc
   1431 .endm
   1432 
   1433 // No spaces in these expressions, due to gas-preprocessor.
// Each constant packs two 7-bit row offsets into X(mc_subpel_filters), in
// multiples of 15 (one filter row per subpel position per filter set):
// bits 13:7 hold the offset used for sizes > 4 and bits 6:0 the offset used
// for sizes <= 4 — per the "8tap_h, 4tap_h" notes in filter_fn, presumably
// the full 8-tap set vs. a cheaper 4-tap variant (confirm against the table).
   1434 #define REGULAR ((0*15<<7)|3*15)
   1435 #define SMOOTH  ((1*15<<7)|4*15)
   1436 #define SHARP   ((2*15<<7)|3*15)
   1437 
   1438 .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv, taps
   1439 function \type\()_\taps\()_neon
   1440        mov             w10,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
   1441        mul             \mx,  \mx, w10
   1442        mul             \my,  \my, w10
   1443        add             \mx,  \mx, w8 // mx, 8tap_h, 4tap_h
   1444        add             \my,  \my, w9 // my, 8tap_v, 4tap_v
   1445 .ifc \type, prep
   1446        uxtw            \d_strd, \w
   1447        lsl             \d_strd, \d_strd, #1
   1448 .endif
   1449 
   1450        clz             w8,  \w
   1451        tst             \mx, #(0x7f << 14)
   1452        sub             w8,  w8,  #24
   1453        movrel          x10, X(mc_subpel_filters), -8
   1454        b.ne            L(\type\()_\taps\()_h)
   1455        tst             \my, #(0x7f << 14)
   1456        b.ne            L(\type\()_\taps\()_v)
   1457        b               \type\()_neon
   1458 
   1459 L(\type\()_\taps\()_h):
   1460        cmp             \w,  #4
   1461        ubfx            w9,  \mx, #7, #7
   1462        and             \mx, \mx, #0x7f
   1463        b.le            4f
   1464        mov             \mx,  w9
   1465 4:
   1466        tst             \my,  #(0x7f << 14)
   1467        add             \xmx, x10, \mx, uxtw #3
   1468        b.ne            L(\type\()_\taps\()_hv)
   1469 
   1470        movrel          x9,  \type\()_\taps\()_h_tbl
   1471        ldrsw           x8,  [x9, x8, lsl #2]
   1472        add             x9,  x9,  x8
   1473        br              x9
   1474 
   1475 20:     // 2xN h
   1476        AARCH64_VALID_JUMP_TARGET
   1477 .ifc \type, put
   1478        ldur            s0,  [\xmx, #2]
   1479        sub             \src,  \src,  #1
   1480        add             \ds2,  \dst,  \d_strd
   1481        add             \sr2,  \src,  \s_strd
   1482        lsl             \d_strd,  \d_strd,  #1
   1483        lsl             \s_strd,  \s_strd,  #1
   1484        sxtl            v0.8h,  v0.8b
   1485 2:
   1486        ld1             {v4.8b},  [\src], \s_strd
   1487        ld1             {v6.8b},  [\sr2], \s_strd
   1488        uxtl            v4.8h,  v4.8b
   1489        uxtl            v6.8h,  v6.8b
   1490        ext             v5.16b, v4.16b, v4.16b, #2
   1491        ext             v7.16b, v6.16b, v6.16b, #2
   1492        subs            \h,  \h,  #2
   1493        trn1            v3.2s,  v4.2s,  v6.2s
   1494        trn2            v6.2s,  v4.2s,  v6.2s
   1495        trn1            v4.2s,  v5.2s,  v7.2s
   1496        trn2            v7.2s,  v5.2s,  v7.2s
   1497        mul             v3.4h,  v3.4h,  v0.h[0]
   1498        mla             v3.4h,  v4.4h,  v0.h[1]
   1499        mla             v3.4h,  v6.4h,  v0.h[2]
   1500        mla             v3.4h,  v7.4h,  v0.h[3]
   1501        srshr           v3.4h,  v3.4h,  #2
   1502        sqrshrun        v3.8b,  v3.8h,  #4
   1503        st1             {v3.h}[0], [\dst], \d_strd
   1504        st1             {v3.h}[1], [\ds2], \d_strd
   1505        b.gt            2b
   1506        ret
   1507 .endif
   1508 
   1509 40:     // 4xN h
   1510        AARCH64_VALID_JUMP_TARGET
   1511        ldur            s0,  [\xmx, #2]
   1512        sub             \src,  \src,  #1
   1513        add             \ds2,  \dst,  \d_strd
   1514        add             \sr2,  \src,  \s_strd
   1515        lsl             \d_strd,  \d_strd,  #1
   1516        lsl             \s_strd,  \s_strd,  #1
   1517        sxtl            v0.8h,  v0.8b
   1518 4:
   1519        ld1             {v16.8b}, [\src], \s_strd
   1520        ld1             {v20.8b}, [\sr2], \s_strd
   1521        uxtl            v16.8h,  v16.8b
   1522        uxtl            v20.8h,  v20.8b
   1523        ext             v17.16b, v16.16b, v16.16b, #2
   1524        ext             v18.16b, v16.16b, v16.16b, #4
   1525        ext             v19.16b, v16.16b, v16.16b, #6
   1526        ext             v21.16b, v20.16b, v20.16b, #2
   1527        ext             v22.16b, v20.16b, v20.16b, #4
   1528        ext             v23.16b, v20.16b, v20.16b, #6
   1529        subs            \h,  \h,  #2
   1530        mul             v16.4h,  v16.4h,  v0.h[0]
   1531        mla             v16.4h,  v17.4h,  v0.h[1]
   1532        mla             v16.4h,  v18.4h,  v0.h[2]
   1533        mla             v16.4h,  v19.4h,  v0.h[3]
   1534        mul             v20.4h,  v20.4h,  v0.h[0]
   1535        mla             v20.4h,  v21.4h,  v0.h[1]
   1536        mla             v20.4h,  v22.4h,  v0.h[2]
   1537        mla             v20.4h,  v23.4h,  v0.h[3]
   1538        srshr           v16.4h,  v16.4h,  #2
   1539        srshr           v20.4h,  v20.4h,  #2
   1540 .ifc \type, put
   1541        sqrshrun        v16.8b,  v16.8h,  #4
   1542        sqrshrun        v20.8b,  v20.8h,  #4
   1543        str             s16,  [\dst]
   1544        str             s20,  [\ds2]
   1545        add             \dst, \dst, \d_strd
   1546        add             \ds2, \ds2, \d_strd
   1547 .else
   1548        st1             {v16.4h}, [\dst], \d_strd
   1549        st1             {v20.4h}, [\ds2], \d_strd
   1550 .endif
   1551        b.gt            4b
   1552        ret
   1553 
   1554 80:     // 8xN h
   1555        AARCH64_VALID_JUMP_TARGET
   1556        ld1             {v0.8b}, [\xmx]
   1557 .ifc \taps, 6tap
   1558        sub             \src,  \src,  #2
   1559 .else
   1560        sub             \src,  \src,  #3
   1561 .endif
   1562        add             \ds2,  \dst,  \d_strd
   1563        add             \sr2,  \src,  \s_strd
   1564        lsl             \d_strd,  \d_strd,  #1
   1565        lsl             \s_strd,  \s_strd,  #1
   1566        sxtl            v0.8h, v0.8b
   1567 8:
   1568        ld1             {v16.8b, v17.8b},  [\src], \s_strd
   1569        ld1             {v20.8b, v21.8b},  [\sr2], \s_strd
   1570        uxtl            v16.8h,  v16.8b
   1571        uxtl            v17.8h,  v17.8b
   1572        uxtl            v20.8h,  v20.8b
   1573        uxtl            v21.8h,  v21.8b
   1574 
   1575 .ifc \taps, 6tap
   1576        mul             v18.8h,  v16.8h,  v0.h[1]
   1577        mul             v22.8h,  v20.8h,  v0.h[1]
   1578    .irpc i, 23456
   1579        ext             v19.16b, v16.16b, v17.16b, #(2*\i-2)
   1580        ext             v23.16b, v20.16b, v21.16b, #(2*\i-2)
   1581        mla             v18.8h,  v19.8h,  v0.h[\i]
   1582        mla             v22.8h,  v23.8h,  v0.h[\i]
   1583    .endr
   1584 .else   // 8tap
   1585        mul             v18.8h,  v16.8h,  v0.h[0]
   1586        mul             v22.8h,  v20.8h,  v0.h[0]
   1587    .irpc i, 1234567
   1588        ext             v19.16b, v16.16b, v17.16b, #(2*\i)
   1589        ext             v23.16b, v20.16b, v21.16b, #(2*\i)
   1590        mla             v18.8h,  v19.8h,  v0.h[\i]
   1591        mla             v22.8h,  v23.8h,  v0.h[\i]
   1592    .endr
   1593 .endif
   1594        subs            \h,  \h,  #2
   1595        srshr           v18.8h,  v18.8h, #2
   1596        srshr           v22.8h,  v22.8h, #2
   1597 .ifc \type, put
   1598        sqrshrun        v18.8b,  v18.8h, #4
   1599        sqrshrun        v22.8b,  v22.8h, #4
   1600        st1             {v18.8b}, [\dst], \d_strd
   1601        st1             {v22.8b}, [\ds2], \d_strd
   1602 .else
   1603        st1             {v18.8h}, [\dst], \d_strd
   1604        st1             {v22.8h}, [\ds2], \d_strd
   1605 .endif
   1606        b.gt            8b
   1607        ret
   1608 160:
   1609 320:
   1610 640:
   1611 1280:   // 16xN, 32xN, ... h
   1612        AARCH64_VALID_JUMP_TARGET
   1613        ld1             {v0.8b}, [\xmx]
   1614 .ifc \taps, 6tap
   1615        sub             \src,  \src,  #2
   1616 .else
   1617        sub             \src,  \src,  #3
   1618 .endif
   1619        add             \ds2,  \dst,  \d_strd
   1620        add             \sr2,  \src,  \s_strd
   1621        lsl             \s_strd,  \s_strd,  #1
   1622        sxtl            v0.8h, v0.8b
   1623 
   1624        sub             \s_strd,  \s_strd,  \w, uxtw
   1625        sub             \s_strd,  \s_strd,  #8
   1626 .ifc \type, put
   1627        lsl             \d_strd,  \d_strd,  #1
   1628        sub             \d_strd,  \d_strd,  \w, uxtw
   1629 .endif
   1630 161:
   1631        ld1             {v16.8b, v17.8b, v18.8b},  [\src], #24
   1632        ld1             {v20.8b, v21.8b, v22.8b},  [\sr2], #24
   1633        mov             \mx, \w
   1634        uxtl            v16.8h,  v16.8b
   1635        uxtl            v17.8h,  v17.8b
   1636        uxtl            v18.8h,  v18.8b
   1637        uxtl            v20.8h,  v20.8b
   1638        uxtl            v21.8h,  v21.8b
   1639        uxtl            v22.8h,  v22.8b
   1640 
   1641 16:
   1642 .ifc \taps, 6tap
   1643        mul             v24.8h,  v16.8h,  v0.h[1]
   1644        mul             v25.8h,  v17.8h,  v0.h[1]
   1645        mul             v26.8h,  v20.8h,  v0.h[1]
   1646        mul             v27.8h,  v21.8h,  v0.h[1]
   1647    .irpc i, 23456
   1648        ext             v28.16b, v16.16b, v17.16b, #(2*\i-2)
   1649        ext             v29.16b, v17.16b, v18.16b, #(2*\i-2)
   1650        ext             v30.16b, v20.16b, v21.16b, #(2*\i-2)
   1651        ext             v31.16b, v21.16b, v22.16b, #(2*\i-2)
   1652        mla             v24.8h,  v28.8h,  v0.h[\i]
   1653        mla             v25.8h,  v29.8h,  v0.h[\i]
   1654        mla             v26.8h,  v30.8h,  v0.h[\i]
   1655        mla             v27.8h,  v31.8h,  v0.h[\i]
   1656    .endr
   1657 .else   // 8tap
   1658        mul             v24.8h,  v16.8h,  v0.h[0]
   1659        mul             v25.8h,  v17.8h,  v0.h[0]
   1660        mul             v26.8h,  v20.8h,  v0.h[0]
   1661        mul             v27.8h,  v21.8h,  v0.h[0]
   1662    .irpc i, 1234567
   1663        ext             v28.16b, v16.16b, v17.16b, #(2*\i)
   1664        ext             v29.16b, v17.16b, v18.16b, #(2*\i)
   1665        ext             v30.16b, v20.16b, v21.16b, #(2*\i)
   1666        ext             v31.16b, v21.16b, v22.16b, #(2*\i)
   1667        mla             v24.8h,  v28.8h,  v0.h[\i]
   1668        mla             v25.8h,  v29.8h,  v0.h[\i]
   1669        mla             v26.8h,  v30.8h,  v0.h[\i]
   1670        mla             v27.8h,  v31.8h,  v0.h[\i]
   1671    .endr
   1672 .endif
   1673        srshr           v24.8h,  v24.8h, #2
   1674        srshr           v25.8h,  v25.8h, #2
   1675        srshr           v26.8h,  v26.8h, #2
   1676        srshr           v27.8h,  v27.8h, #2
   1677        subs            \mx, \mx, #16
   1678 .ifc \type, put
   1679        sqrshrun        v24.8b,  v24.8h, #4
   1680        sqrshrun2       v24.16b, v25.8h, #4
   1681        sqrshrun        v26.8b,  v26.8h, #4
   1682        sqrshrun2       v26.16b, v27.8h, #4
   1683        st1             {v24.16b}, [\dst], #16
   1684        st1             {v26.16b}, [\ds2], #16
   1685 .else
   1686        st1             {v24.8h, v25.8h}, [\dst], #32
   1687        st1             {v26.8h, v27.8h}, [\ds2], #32
   1688 .endif
   1689        b.le            9f
   1690 
   1691        mov             v16.16b, v18.16b
   1692        mov             v20.16b, v22.16b
   1693        ld1             {v17.8b, v18.8b}, [\src], #16
   1694        ld1             {v21.8b, v22.8b}, [\sr2], #16
   1695        uxtl            v17.8h,  v17.8b
   1696        uxtl            v18.8h,  v18.8b
   1697        uxtl            v21.8h,  v21.8b
   1698        uxtl            v22.8h,  v22.8b
   1699        b               16b
   1700 
   1701 9:
   1702        add             \dst,  \dst,  \d_strd
   1703        add             \ds2,  \ds2,  \d_strd
   1704        add             \src,  \src,  \s_strd
   1705        add             \sr2,  \sr2,  \s_strd
   1706 
   1707        subs            \h,  \h,  #2
   1708        b.gt            161b
   1709        ret
   1710 endfunc
   1711 
   1712 jumptable \type\()_\taps\()_h_tbl
   1713        .word 1280b - \type\()_\taps\()_h_tbl
   1714        .word 640b  - \type\()_\taps\()_h_tbl
   1715        .word 320b  - \type\()_\taps\()_h_tbl
   1716        .word 160b  - \type\()_\taps\()_h_tbl
   1717        .word 80b   - \type\()_\taps\()_h_tbl
   1718        .word 40b   - \type\()_\taps\()_h_tbl
   1719        .word 20b   - \type\()_\taps\()_h_tbl
   1720 endjumptable
   1721 
   1722 function L(\type\()_\taps\()_v)
   1723        cmp             \h,  #4
   1724        ubfx            w9,  \my, #7, #7
   1725        and             \my, \my, #0x7f
   1726        b.le            4f
   1727        mov             \my, w9
   1728 4:
   1729        add             \xmy, x10, \my, uxtw #3
   1730 
   1731        movrel          x9,  \type\()_\taps\()_v_tbl
   1732        ldrsw           x8,  [x9, x8, lsl #2]
   1733        add             x9,  x9,  x8
   1734        br              x9
   1735 
   1736 20:     // 2xN v
   1737        AARCH64_VALID_JUMP_TARGET
   1738 .ifc \type, put
   1739        b.gt            28f
   1740 
   1741        cmp             \h,  #2
   1742        ldur            s0,  [\xmy, #2]
   1743        sub             \src,  \src,  \s_strd
   1744        add             \ds2,  \dst,  \d_strd
   1745        add             \sr2,  \src,  \s_strd
   1746        lsl             \s_strd,  \s_strd,  #1
   1747        lsl             \d_strd,  \d_strd,  #1
   1748        sxtl            v0.8h, v0.8b
   1749 
   1750        // 2x2 v
   1751        load_h          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
   1752        interleave_1_h  v1, v2, v3, v4, v5
   1753        b.gt            24f
   1754        uxtl_b          v1, v2, v3, v4
   1755        mul_mla_4tap    v6, v1, v2, v3, v4, .4h
   1756        sqrshrun_b      6,  v6
   1757        st_h            \d_strd, v6, 2
   1758        ret
   1759 
   1760 24:     // 2x4 v
   1761        load_h          \sr2, \src, \s_strd, v6, v7
   1762        interleave_1_h  v5, v6, v7
   1763        interleave_2_s  v1, v2, v3, v4, v5, v6
   1764        uxtl_b          v1, v2, v3, v4
   1765        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
   1766        sqrshrun_b      6,  v6
   1767        st_h            \d_strd, v6, 4
   1768        ret
   1769 
   1770 28:     // 2x6, 2x8, 2x12, 2x16 v
   1771        ld1             {v0.8b}, [\xmy]
   1772        sub             \sr2,  \src,  \s_strd, lsl #1
   1773        add             \ds2,  \dst,  \d_strd
   1774        sub             \src,  \sr2,  \s_strd
   1775        lsl             \d_strd,  \d_strd,  #1
   1776        lsl             \s_strd,  \s_strd,  #1
   1777        sxtl            v0.8h, v0.8b
   1778 
   1779        load_h          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
   1780        interleave_1_h  v1,  v2,  v3,  v4,  v5
   1781        interleave_1_h  v5,  v6,  v7
   1782        interleave_2_s  v1,  v2,  v3,  v4,  v5,  v6
   1783        uxtl_b          v1,  v2,  v3,  v4
   1784 216:
   1785        subs            \h,  \h,  #4
   1786        load_h          \sr2, \src, \s_strd, v16, v17, v18, v19
   1787        interleave_1_h  v7,  v16, v17, v18, v19
   1788        interleave_2_s  v5,  v6,  v7,  v16, v17, v18
   1789        uxtl_b          v5,  v6,  v7,  v16
   1790        mul_mla_\taps\()_0 v30, v1, v2, v3, v4, v5, v6, v7, v16
   1791        sqrshrun_b      6,   v30
   1792        st_h            \d_strd, v30, 4
   1793        b.le            0f
   1794        cmp             \h,  #2
   1795        mov             v1.16b,  v5.16b
   1796        mov             v2.16b,  v6.16b
   1797        mov             v3.16b,  v7.16b
   1798        mov             v4.16b,  v16.16b
   1799        mov             v5.16b,  v17.16b
   1800        mov             v6.16b,  v18.16b
   1801        mov             v7.16b,  v19.16b
   1802        b.eq            26f
   1803        b               216b
   1804 26:
   1805        load_h          \sr2, \src, \s_strd, v16, v17
   1806        interleave_1_h  v7,  v16, v17
   1807        uxtl_b          v5,  v6,  v7,  v16
   1808        mul_mla_\taps\()_0_4h v30, v1, v2, v3, v4, v5, v6, v7, v16
   1809        sqrshrun_b      6,   v30
   1810        st_h            \d_strd, v30, 2
   1811 0:
   1812        ret
   1813 .endif
   1814 
   1815 40:
   1816        AARCH64_VALID_JUMP_TARGET
   1817        b.gt            480f
   1818 
   1819        // 4x2, 4x4 v
   1820        cmp             \h,  #2
   1821        ldur            s0,  [\xmy, #2]
   1822        sub             \src, \src, \s_strd
   1823        add             \ds2, \dst, \d_strd
   1824        add             \sr2, \src, \s_strd
   1825        lsl             \s_strd, \s_strd, #1
   1826        lsl             \d_strd, \d_strd, #1
   1827        sxtl            v0.8h, v0.8b
   1828 
   1829        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
   1830        interleave_1_s  v1, v2, v3, v4, v5
   1831        uxtl_b          v1, v2, v3, v4
   1832        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
   1833        shift_store_4   \type, \d_strd, v6
   1834        b.le            0f
   1835        load_s          \sr2, \src, \s_strd, v6, v7
   1836        interleave_1_s  v5, v6, v7
   1837        uxtl_b          v5, v6
   1838        mul_mla_4tap    v7, v3, v4, v5, v6, .8h
   1839        shift_store_4   \type, \d_strd, v7
   1840 0:
   1841        ret
   1842 
   1843 480:    // 4x6, 4x8, 4x12, 4x16 v
   1844        ld1             {v0.8b}, [\xmy]
   1845        sub             \sr2, \src, \s_strd, lsl #1
   1846        add             \ds2, \dst, \d_strd
   1847        sub             \src, \sr2, \s_strd
   1848        lsl             \s_strd, \s_strd, #1
   1849        lsl             \d_strd, \d_strd, #1
   1850        sxtl            v0.8h, v0.8b
   1851 
   1852        load_s          \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
   1853        interleave_1_s  v16, v17, v18
   1854        interleave_1_s  v18, v19, v20, v21, v22
   1855        uxtl_b          v16, v17
   1856        uxtl_b          v18, v19, v20, v21
   1857 
   1858 48:
   1859        subs            \h,  \h,  #4
   1860        load_s          \sr2, \src, \s_strd, v23, v24, v25, v26
   1861        interleave_1_s  v22, v23, v24, v25, v26
   1862        uxtl_b          v22, v23, v24, v25
   1863        mul_mla_\taps\()_2 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25
   1864        shift_store_4   \type, \d_strd, v1, v2
   1865        b.le            0f
   1866        load_s          \sr2,  \src, \s_strd, v27, v16
   1867        subs            \h,  \h,  #2
   1868        interleave_1_s  v26, v27, v16
   1869        uxtl_b          v26, v27
   1870        mul_mla_\taps\()_0 v1, v20, v21, v22, v23, v24, v25, v26, v27
   1871        shift_store_4   \type, \d_strd, v1
   1872        b.le            0f
   1873        load_s          \sr2,  \src, \s_strd, v17, v18
   1874        subs            \h,  \h,  #2
   1875        interleave_1_s  v16, v17, v18
   1876        uxtl_b          v16, v17
   1877        mul_mla_\taps\()_0 v2, v22, v23, v24, v25, v26, v27, v16, v17
   1878        shift_store_4   \type, \d_strd, v2
   1879        b.le            0f
   1880        subs            \h,  \h,  #4
   1881        load_s          \sr2, \src, \s_strd, v19, v20, v21, v22
   1882        interleave_1_s  v18, v19, v20, v21, v22
   1883        uxtl_b          v18, v19, v20, v21
   1884        mul_mla_\taps\()_2 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20, v21
   1885        shift_store_4   \type, \d_strd, v1, v2
   1886        b.gt            48b
   1887 0:
   1888        ret
   1889 
   1890 80:
   1891        AARCH64_VALID_JUMP_TARGET
   1892        b.gt            880f
   1893 
   1894        // 8x2, 8x4 v
   1895        cmp             \h,  #2
   1896        ldur            s0,  [\xmy, #2]
   1897        sub             \src, \src, \s_strd
   1898        add             \ds2, \dst, \d_strd
   1899        add             \sr2, \src, \s_strd
   1900        lsl             \s_strd, \s_strd, #1
   1901        lsl             \d_strd, \d_strd, #1
   1902        sxtl            v0.8h, v0.8b
   1903 
   1904        load_8b         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
   1905        uxtl_b          v1, v2, v3, v4, v5
   1906        mul_mla_4tap    v6, v1, v2, v3, v4, .8h
   1907        mul_mla_4tap    v7, v2, v3, v4, v5, .8h
   1908        shift_store_8   \type, \d_strd, v6, v7
   1909        b.le            0f
   1910        load_8b         \sr2, \src, \s_strd, v6, v7
   1911        uxtl_b          v6, v7
   1912        mul_mla_4tap    v1, v3, v4, v5, v6, .8h
   1913        mul_mla_4tap    v2, v4, v5, v6, v7, .8h
   1914        shift_store_8   \type, \d_strd, v1, v2
   1915 0:
   1916        ret
   1917 
   1918 880:    // 8x6, 8x8, 8x16, 8x32 v
   1919 1680:   // 16x8, 16x16, ...
   1920 320:    // 32x8, 32x16, ...
   1921 640:
   1922 1280:
   1923        AARCH64_VALID_JUMP_TARGET
   1924        ld1             {v0.8b}, [\xmy]
   1925        sub             \src, \src, \s_strd
   1926        sub             \src, \src, \s_strd, lsl #1
   1927        sxtl            v0.8h, v0.8b
   1928        mov             \my,  \h
   1929 168:
   1930        add             \ds2, \dst, \d_strd
   1931        add             \sr2, \src, \s_strd
   1932        lsl             \s_strd, \s_strd, #1
   1933        lsl             \d_strd, \d_strd, #1
   1934 
   1935        load_8b         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
   1936        uxtl_b          v16, v17, v18, v19, v20, v21, v22
   1937 
   1938 88:
   1939        subs            \h,  \h,  #2
   1940        load_8b         \sr2, \src, \s_strd, v23, v24
   1941        uxtl_b          v23, v24
   1942        mul_mla_\taps\()_1 v1, v2, v16, v17, v18, v19, v20, v21, v22, v23, v24
   1943        shift_store_8   \type, \d_strd, v1, v2
   1944        b.le            9f
   1945        subs            \h,  \h,  #2
   1946        load_8b         \sr2, \src, \s_strd, v25, v26
   1947        uxtl_b          v25, v26
   1948        mul_mla_\taps\()_1 v3, v4, v18, v19, v20, v21, v22, v23, v24, v25, v26
   1949        shift_store_8   \type, \d_strd, v3, v4
   1950        b.le            9f
   1951        subs            \h,  \h,  #2
   1952        load_8b         \sr2, \src, \s_strd, v27, v16
   1953        uxtl_b          v27, v16
   1954        mul_mla_\taps\()_1 v1, v2, v20, v21, v22, v23, v24, v25, v26, v27, v16
   1955        shift_store_8   \type, \d_strd, v1, v2
   1956        b.le            9f
   1957        subs            \h,  \h,  #2
   1958        load_8b         \sr2, \src, \s_strd, v17, v18
   1959        uxtl_b          v17, v18
   1960        mul_mla_\taps\()_1 v3, v4, v22, v23, v24, v25, v26, v27, v16, v17, v18
   1961        shift_store_8   \type, \d_strd, v3, v4
   1962        b.le            9f
   1963        subs            \h,  \h,  #4
   1964        load_8b         \sr2, \src, \s_strd, v19, v20, v21, v22
   1965        uxtl_b          v19, v20, v21, v22
   1966        mul_mla_\taps\()_1 v1, v2, v24, v25, v26, v27, v16, v17, v18, v19, v20
   1967        mul_mla_\taps\()_1 v3, v4, v26, v27, v16, v17, v18, v19, v20, v21, v22
   1968        shift_store_8   \type, \d_strd, v1, v2, v3, v4
   1969        b.gt            88b
   1970 9:
   1971        subs            \w,  \w,  #8
   1972        b.le            0f
   1973        asr             \s_strd, \s_strd, #1
   1974        asr             \d_strd, \d_strd, #1
   1975        msub            \src, \s_strd, \xmy, \src
   1976        msub            \dst, \d_strd, \xmy, \dst
   1977        sub             \src, \src, \s_strd, lsl #3
   1978        mov             \h,  \my
   1979        add             \src, \src, #8
   1980 .ifc \type, put
   1981        add             \dst, \dst, #8
   1982 .else
   1983        add             \dst, \dst, #16
   1984 .endif
   1985        b               168b
   1986 0:
   1987        ret
   1988 
   1989 160:
   1990        AARCH64_VALID_JUMP_TARGET
   1991        b.gt            1680b
   1992 
   1993        // 16x2, 16x4 v
   1994        ldur            s0,  [\xmy, #2]
   1995        sub             \src, \src, \s_strd
   1996        add             \ds2, \dst, \d_strd
   1997        add             \sr2, \src, \s_strd
   1998        lsl             \s_strd, \s_strd, #1
   1999        lsl             \d_strd, \d_strd, #1
   2000        sxtl            v0.8h, v0.8b
   2001 
   2002        cmp             \h,  #2
   2003        load_16b        \src, \sr2, \s_strd, v1,  v2,  v3,  v4,  v5
   2004        uxtl            v16.8h, v1.8b
   2005        uxtl            v17.8h, v2.8b
   2006        uxtl            v18.8h, v3.8b
   2007        uxtl            v19.8h, v4.8b
   2008        uxtl            v20.8h, v5.8b
   2009        uxtl2           v23.8h, v1.16b
   2010        uxtl2           v24.8h, v2.16b
   2011        uxtl2           v25.8h, v3.16b
   2012        uxtl2           v26.8h, v4.16b
   2013        uxtl2           v27.8h, v5.16b
   2014        mul_mla_4tap    v1,  v16, v17, v18, v19, .8h
   2015        mul_mla_4tap    v16, v17, v18, v19, v20, .8h
   2016        mul_mla_4tap    v2,  v23, v24, v25, v26, .8h
   2017        mul_mla_4tap    v17, v24, v25, v26, v27, .8h
   2018        shift_store_16  \type, \d_strd, v1, v2, v16, v17
   2019        b.le            0f
   2020        load_16b        \sr2, \src, \s_strd, v6,  v7
   2021        uxtl            v21.8h, v6.8b
   2022        uxtl            v22.8h, v7.8b
   2023        uxtl2           v28.8h, v6.16b
   2024        uxtl2           v29.8h, v7.16b
   2025        mul_mla_4tap    v1,  v18, v19, v20, v21, .8h
   2026        mul_mla_4tap    v3,  v19, v20, v21, v22, .8h
   2027        mul_mla_4tap    v2,  v25, v26, v27, v28, .8h
   2028        mul_mla_4tap    v4,  v26, v27, v28, v29, .8h
   2029        shift_store_16  \type, \d_strd, v1, v2, v3, v4
   2030 0:
   2031        ret
   2032 endfunc
   2033 
   2034 jumptable \type\()_\taps\()_v_tbl
   2035        .word 1280b - \type\()_\taps\()_v_tbl
   2036        .word 640b  - \type\()_\taps\()_v_tbl
   2037        .word 320b  - \type\()_\taps\()_v_tbl
   2038        .word 160b  - \type\()_\taps\()_v_tbl
   2039        .word 80b   - \type\()_\taps\()_v_tbl
   2040        .word 40b   - \type\()_\taps\()_v_tbl
   2041        .word 20b   - \type\()_\taps\()_v_tbl
   2042 endjumptable
   2043 
   2044 function L(\type\()_\taps\()_hv)
   2045        cmp             \h,  #4
   2046        ubfx            w9,  \my, #7, #7
   2047        and             \my, \my, #0x7f
   2048        b.le            4f
   2049        mov             \my,  w9
   2050 4:
   2051        add             \xmy,  x10, \my, uxtw #3
   2052 
   2053        movrel          x9,  \type\()_\taps\()_hv_tbl
   2054        ldrsw           x8,  [x9, x8, lsl #2]
   2055        add             x9,  x9,  x8
   2056        br              x9
   2057 
   2058 20:
   2059        AARCH64_VALID_JUMP_TARGET
   2060 .ifc \type, put
   2061        ldur            s0,  [\xmx, #2]
   2062        b.gt            280f
   2063        ldur            s1,  [\xmy, #2]
   2064 
   2065        // 2x2, 2x4 hv
   2066        sub             \sr2, \src, #1
   2067        sub             \src, \sr2, \s_strd
   2068        add             \ds2, \dst, \d_strd
   2069        lsl             \s_strd, \s_strd, #1
   2070        lsl             \d_strd, \d_strd, #1
   2071        sxtl            v0.8h,  v0.8b
   2072        sxtl            v1.8h,  v1.8b
   2073        mov             x15, x30
   2074 
   2075        ld1             {v28.8b}, [\src], \s_strd
   2076        uxtl            v28.8h,  v28.8b
   2077        ext             v29.16b, v28.16b, v28.16b, #2
   2078        mul             v28.4h,  v28.4h,  v0.4h
   2079        mul             v29.4h,  v29.4h,  v0.4h
   2080        addp            v28.4h,  v28.4h,  v29.4h
   2081        addp            v16.4h,  v28.4h,  v28.4h
   2082        srshr           v16.4h,  v16.4h,  #2
   2083        bl              L(\type\()_\taps\()_filter_2)
   2084 
   2085        trn1            v16.2s, v16.2s, v28.2s
   2086        mov             v17.8b, v28.8b
   2087 
   2088 2:
   2089        bl              L(\type\()_\taps\()_filter_2)
   2090 
   2091        ext             v18.8b, v17.8b, v28.8b, #4
   2092        smull           v2.4s,  v16.4h, v1.h[0]
   2093        smlal           v2.4s,  v17.4h, v1.h[1]
   2094        smlal           v2.4s,  v18.4h, v1.h[2]
   2095        smlal           v2.4s,  v28.4h, v1.h[3]
   2096 
   2097        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
   2098        sqxtun          v2.8b,  v2.8h
   2099        subs            \h,  \h,  #2
   2100        st1             {v2.h}[0], [\dst], \d_strd
   2101        st1             {v2.h}[1], [\ds2], \d_strd
   2102        b.le            0f
   2103        mov             v16.8b, v18.8b
   2104        mov             v17.8b, v28.8b
   2105        b               2b
   2106 
   2107 280:    // 2x8, 2x16, 2x32 hv
   2108        ld1             {v1.8b},  [\xmy]
   2109        sub             \src, \src, #1
   2110        sub             \sr2, \src, \s_strd, lsl #1
   2111        sub             \src, \sr2, \s_strd
   2112        add             \ds2, \dst, \d_strd
   2113        lsl             \s_strd, \s_strd, #1
   2114        lsl             \d_strd, \d_strd, #1
   2115        sxtl            v0.8h,  v0.8b
   2116        sxtl            v1.8h,  v1.8b
   2117        mov             x15, x30
   2118 
   2119        ld1             {v28.8b}, [\src], \s_strd
   2120        uxtl            v28.8h,  v28.8b
        // Finish the horizontal filter for the single setup row of the 2xN
        // hv case: combine the row with its 1-pixel-shifted copy, multiply
        // by the horizontal coefficients in v0, sum the taps with two
        // pairwise adds and round the intermediate down by #2 into v16.4h.
        ext             v29.16b, v28.16b, v28.16b, #2
        mul             v28.4h,  v28.4h,  v0.4h
        mul             v29.4h,  v29.4h,  v0.4h
        addp            v28.4h,  v28.4h,  v29.4h
        addp            v16.4h,  v28.4h,  v28.4h
        srshr           v16.4h,  v16.4h,  #2

        // Prime the vertical filter history: each filter_2 call returns two
        // more horizontally filtered 2-pixel rows packed into v28.4h.
        bl              L(\type\()_\taps\()_filter_2)
        trn1            v16.2s, v16.2s, v28.2s
        mov             v17.8b, v28.8b
        bl              L(\type\()_\taps\()_filter_2)
        ext             v18.8b, v17.8b, v28.8b, #4
        mov             v19.8b, v28.8b
        bl              L(\type\()_\taps\()_filter_2)
        ext             v20.8b, v19.8b, v28.8b, #4
        mov             v21.8b, v28.8b

28:     // Main 2xN hv loop: two output rows per iteration.
        bl              L(\type\()_\taps\()_filter_2)
        ext             v22.8b, v21.8b, v28.8b, #4
.ifc \taps, 6tap
        // 6-tap vertical filter: uses coefficients v1.h[1..6]; the oldest
        // row (v16) does not participate.
        smull           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
.else   // 8tap
        // Full 8-tap vertical filter over the row history v16..v22 plus
        // the freshest pair of rows in v28.
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal           v2.4s,  v28.4h, v1.h[7]
.endif

        // Round/narrow by the combined hv shift, saturate to u8 and store
        // one 2-pixel row into each of the two interleaved dst pointers.
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqxtun          v2.8b,  v2.8h
        subs            \h,  \h,  #2
        st1             {v2.h}[0], [\dst], \d_strd
        st1             {v2.h}[1], [\ds2], \d_strd
        b.le            0f
        // Slide the row history down by the two rows just consumed.
        mov             v16.8b, v18.8b
        mov             v17.8b, v19.8b
        mov             v18.8b, v20.8b
        mov             v19.8b, v21.8b
        mov             v20.8b, v22.8b
        mov             v21.8b, v28.8b
        b               28b

0:
        ret             x15
   2175 
L(\type\()_\taps\()_filter_2):
        // Horizontal filter helper for the 2xN hv case.
        // Loads one row from each of the two interleaved source pointers,
        // widens u8->s16, then transposes so that v27/v28/v30/v31 each hold
        // the same tap position (pixel offsets 0/1/2/3) for both rows'
        // output pixels.  Applies the 4 coefficients v0.h[0..3] and returns
        // both filtered 2-pixel rows, rounded by #2, packed in v28.4h.
        // Clobbers v27-v31; advances \src and \sr2 by \s_strd each.
        ld1             {v28.8b},  [\sr2], \s_strd
        ld1             {v30.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v30.8h,  v30.8b
        ext             v29.16b, v28.16b, v28.16b, #2
        ext             v31.16b, v30.16b, v30.16b, #2
        trn1            v27.2s,  v28.2s,  v30.2s
        trn2            v30.2s,  v28.2s,  v30.2s
        trn1            v28.2s,  v29.2s,  v31.2s
        trn2            v31.2s,  v29.2s,  v31.2s
        mul             v27.4h,  v27.4h,  v0.h[0]
        mla             v27.4h,  v28.4h,  v0.h[1]
        mla             v27.4h,  v30.4h,  v0.h[2]
        mla             v27.4h,  v31.4h,  v0.h[3]
        srshr           v28.4h,  v27.4h,  #2
        ret
.endif
   2194 
40:     // 4-wide hv: only the 4 mid coefficients of the horizontal filter
        // are loaded (offset #2 into the 8-entry coefficient array).
        AARCH64_VALID_JUMP_TARGET
        ldur            s0,  [\xmx, #2]
        b.gt            480f            // taller than 4 rows: full vertical filter
        ldur            s1,  [\xmy, #2] // 4 mid vertical coefficients suffice
        sub             \sr2, \src, #1
        sub             \src, \sr2, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b   // widen coefficients to s16
        sxtl            v1.8h,  v1.8b
        mov             x15, x30        // save LR across the bl helpers

        // 4x2, 4x4 hv
        // Horizontally filter the single setup row: taps at pixel offsets
        // 0/2/4/6 bytes (= 1 pixel apart after widening), round by #2.
        ld1             {v26.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        srshr           v16.4h,  v31.4h,  #2

        bl              L(\type\()_\taps\()_filter_4)
        mov             v17.8b, v28.8b
        mov             v18.8b, v29.8b

4:      // 4-tap vertical filter over rows v16..v18 plus the two rows
        // (v28/v29) returned by the helper; two output rows per iteration.
        bl              L(\type\()_\taps\()_filter_4)
        // Interleaving the mul/mla chains actually hurts performance
        // significantly on Cortex A53, thus keeping mul/mla tightly
        // chained like this.
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v28.4h, v1.h[3]
        smull           v3.4s,  v17.4h, v1.h[0]
        smlal           v3.4s,  v18.4h, v1.h[1]
        smlal           v3.4s,  v28.4h, v1.h[2]
        smlal           v3.4s,  v29.4h, v1.h[3]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        // put: saturate to u8 and store 4 pixels per row.
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
        str             s2,  [\dst]
        str             s3,  [\ds2]
        add             \dst, \dst, \d_strd
        add             \ds2, \ds2, \d_strd
.else
        // prep: store the 16-bit intermediate directly.
        st1             {v2.4h}, [\dst], \d_strd
        st1             {v3.4h}, [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b,  v18.8b
        mov             v17.8b,  v28.8b
        mov             v18.8b,  v29.8b
        b               4b
480:    // 4x8, 4x16, 4x32 hv: full vertical filter (all 8 coefficients).
        ld1             {v1.8b},  [\xmy]
        sub             \src, \src, #1
.ifc \taps, 6tap
        // 6tap needs only 2 rows of context above the first output row.
        sub             \sr2, \src, \s_strd
        sub             \src, \src, \s_strd, lsl #1
.else
        // 8tap needs 3 rows of context above the first output row.
        sub             \sr2, \src, \s_strd, lsl #1
        sub             \src, \sr2, \s_strd
.endif
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30        // save LR across the bl helpers

        // Horizontally filter the first setup row (same 4-tap pattern as
        // the 40: case), rounding the intermediate by #2.
        ld1             {v26.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
.ifc \taps, 6tap
        // 6tap vertical pipeline is two rows shorter, so the setup row
        // lands in v18 and fewer priming calls are needed.
        srshr           v18.4h,  v31.4h,  #2
.else
        srshr           v16.4h,  v31.4h,  #2

        bl              L(\type\()_\taps\()_filter_4)
        mov             v17.8b, v28.8b
        mov             v18.8b, v29.8b
.endif
        bl              L(\type\()_\taps\()_filter_4)
        mov             v19.8b, v28.8b
        mov             v20.8b, v29.8b
        bl              L(\type\()_\taps\()_filter_4)
        mov             v21.8b, v28.8b
        mov             v22.8b, v29.8b

48:     // Main tall-4-wide hv loop: two output rows per iteration.
        bl              L(\type\()_\taps\()_filter_4)
.ifc \taps, 6tap
        // 6-tap vertical: coefficients v1.h[1..6] over rows v18..v28/v29.
        smull           v2.4s,  v18.4h, v1.h[1]
        smlal           v2.4s,  v19.4h, v1.h[2]
        smlal           v2.4s,  v20.4h, v1.h[3]
        smlal           v2.4s,  v21.4h, v1.h[4]
        smlal           v2.4s,  v22.4h, v1.h[5]
        smlal           v2.4s,  v28.4h, v1.h[6]
        smull           v3.4s,  v19.4h, v1.h[1]
        smlal           v3.4s,  v20.4h, v1.h[2]
        smlal           v3.4s,  v21.4h, v1.h[3]
        smlal           v3.4s,  v22.4h, v1.h[4]
        smlal           v3.4s,  v28.4h, v1.h[5]
        smlal           v3.4s,  v29.4h, v1.h[6]
.else   // 8tap
        // 8-tap vertical: v2 filters rows v16..v28, v3 the rows one below.
        smull           v2.4s,  v16.4h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal           v2.4s,  v28.4h, v1.h[7]
        smull           v3.4s,  v17.4h, v1.h[0]
        smlal           v3.4s,  v18.4h, v1.h[1]
        smlal           v3.4s,  v19.4h, v1.h[2]
        smlal           v3.4s,  v20.4h, v1.h[3]
        smlal           v3.4s,  v21.4h, v1.h[4]
        smlal           v3.4s,  v22.4h, v1.h[5]
        smlal           v3.4s,  v28.4h, v1.h[6]
        smlal           v3.4s,  v29.4h, v1.h[7]
.endif
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn         v3.4h,  v3.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v3.8b,  v3.8h
        str             s2,  [\dst]
        str             s3,  [\ds2]
        add             \dst, \dst, \d_strd
        add             \ds2, \ds2, \d_strd
.else
        st1             {v2.4h}, [\dst], \d_strd
        st1             {v3.4h}, [\ds2], \d_strd
.endif
        b.le            0f
        // Shift the row history down by two; 6tap's window is shorter so
        // v16/v17 only exist in the 8tap variant.
.ifc \taps, 8tap
        mov             v16.8b,  v18.8b
        mov             v17.8b,  v19.8b
.endif
        mov             v18.8b,  v20.8b
        mov             v19.8b,  v21.8b
        mov             v20.8b,  v22.8b
        mov             v21.8b,  v28.8b
        mov             v22.8b,  v29.8b
        b               48b
0:
        ret             x15
   2360 
L(\type\()_\taps\()_filter_4):
        // Horizontal filter helper for the 4-wide hv cases.
        // Loads one row from each interleaved source pointer, widens
        // u8->s16 and applies the 4 coefficients v0.h[0..3] at pixel
        // offsets 0..3.  Returns the two filtered 4-pixel rows, rounded by
        // #2, in v28.4h (row from \sr2) and v29.4h (row from \src).
        // Clobbers v26-v31; advances \src and \sr2 by \s_strd each.
        ld1             {v26.8b}, [\sr2], \s_strd
        ld1             {v27.8b}, [\src], \s_strd
        uxtl            v26.8h,  v26.8b
        uxtl            v27.8h,  v27.8b
        ext             v28.16b, v26.16b, v26.16b, #2
        ext             v29.16b, v26.16b, v26.16b, #4
        ext             v30.16b, v26.16b, v26.16b, #6
        mul             v31.4h,  v26.4h,  v0.h[0]
        mla             v31.4h,  v28.4h,  v0.h[1]
        mla             v31.4h,  v29.4h,  v0.h[2]
        mla             v31.4h,  v30.4h,  v0.h[3]
        ext             v28.16b, v27.16b, v27.16b, #2
        ext             v29.16b, v27.16b, v27.16b, #4
        ext             v30.16b, v27.16b, v27.16b, #6
        mul             v27.4h,  v27.4h,  v0.h[0]
        mla             v27.4h,  v28.4h,  v0.h[1]
        mla             v27.4h,  v29.4h,  v0.h[2]
        mla             v27.4h,  v30.4h,  v0.h[3]
        srshr           v28.4h,  v31.4h,  #2
        srshr           v29.4h,  v27.4h,  #2
        ret
   2383 
80:     // 8-, 16- and 32-wide hv, short heights: processed as columns of 8.
160:
320:
        AARCH64_VALID_JUMP_TARGET
        b.gt            880f            // taller blocks: full vertical filter
        ld1             {v0.8b},  [\xmx]
        ldur            s1,  [\xmy, #2] // 4 mid vertical coefficients suffice
.ifc \taps, 6tap
        sub             \src,  \src,  #2 // 6tap: 2 pixels of left context
.else
        sub             \src,  \src,  #3 // 8tap: 3 pixels of left context
.endif
        sub             \src,  \src,  \s_strd
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30        // save LR across the bl helpers
        mov             \my,  \h        // remember the full height for the column loop

164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        // Prime the 4-deep vertical history: v16 from the setup row, then
        // v17/v18 from the first helper call (which returns v24/v25).
        bl              L(\type\()_\taps\()_filter_8_first)
        bl              L(\type\()_\taps\()_filter_8)
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b

8:      // 4-tap vertical filter (v1.h[0..3]), 8 pixels wide, two rows per
        // iteration; low halves in v2/v4, high halves in v3/v5.
        smull           v2.4s,  v16.4h, v1.h[0]
        smull2          v3.4s,  v16.8h, v1.h[0]
        bl              L(\type\()_\taps\()_filter_8)
        smull           v4.4s,  v17.4h, v1.h[0]
        smull2          v5.4s,  v17.8h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal2          v3.4s,  v17.8h, v1.h[1]
        smlal           v4.4s,  v18.4h, v1.h[1]
        smlal2          v5.4s,  v18.8h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal2          v3.4s,  v18.8h, v1.h[2]
        smlal           v4.4s,  v24.4h, v1.h[2]
        smlal2          v5.4s,  v24.8h, v1.h[2]
        smlal           v2.4s,  v24.4h, v1.h[3]
        smlal2          v3.4s,  v24.8h, v1.h[3]
        smlal           v4.4s,  v25.4h, v1.h[3]
        smlal2          v5.4s,  v25.8h, v1.h[3]
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v4.8b,  v4.8h
        st1             {v2.8b}, [\dst], \d_strd
        st1             {v4.8b}, [\ds2], \d_strd
.else
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v4.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b
        b               8b
9:      // Column done; rewind to the top and step 8 pixels to the right.
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd,  \s_strd,  #1   // back to single-row strides
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src  // undo h rows of advance
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #2 // undo the priming rows
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8         // 8 output bytes per column
.else
        add             \dst,  \dst,  #16        // 8 s16 intermediates per column
.endif
        b               164b
   2466 
880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
640:    // Tall wide blocks: full vertical filter, processed as 8-wide columns.
1280:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [\xmx]
        ld1             {v1.8b},  [\xmy]
.ifc \taps, 6tap
        sub             \src,  \src,  #2 // 6tap: 2 pixels left, 2 rows up (below)
.else
        sub             \src,  \src,  #3 // 8tap: 3 pixels left, 3 rows up
        sub             \src,  \src,  \s_strd
.endif
        sub             \src,  \src,  \s_strd, lsl #1
        sxtl            v0.8h,  v0.8b
        sxtl            v1.8h,  v1.8b
        mov             x15, x30        // save LR across the bl helpers
        mov             \my,  \h        // remember the full height for the column loop

168:
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd, \d_strd, #1
        lsl             \s_strd, \s_strd, #1

        // Prime the vertical history: 6tap needs two fewer rows, so the
        // setup row is duplicated into v18 instead of calling the helper.
        bl              L(\type\()_\taps\()_filter_8_first)
.ifc \taps, 6tap
        mov             v18.16b, v16.16b
.else
        bl              L(\type\()_\taps\()_filter_8)
        mov             v17.16b, v24.16b
        mov             v18.16b, v25.16b
.endif
        bl              L(\type\()_\taps\()_filter_8)
        mov             v19.16b, v24.16b
        mov             v20.16b, v25.16b
        bl              L(\type\()_\taps\()_filter_8)
        mov             v21.16b, v24.16b
        mov             v22.16b, v25.16b

88:     // Main loop: two 8-pixel output rows per iteration; low halves in
        // v2/v4, high halves in v3/v5.
.ifc \taps, 6tap
        // 6-tap vertical: coefficients v1.h[1..6] over rows v18..v24/v25.
        smull           v2.4s,  v18.4h, v1.h[1]
        smull2          v3.4s,  v18.8h, v1.h[1]
        bl              L(\type\()_\taps\()_filter_8)
        smull           v4.4s,  v19.4h, v1.h[1]
        smull2          v5.4s,  v19.8h, v1.h[1]
        smlal           v2.4s,  v19.4h, v1.h[2]
        smlal2          v3.4s,  v19.8h, v1.h[2]
        smlal           v4.4s,  v20.4h, v1.h[2]
        smlal2          v5.4s,  v20.8h, v1.h[2]
        smlal           v2.4s,  v20.4h, v1.h[3]
        smlal2          v3.4s,  v20.8h, v1.h[3]
        smlal           v4.4s,  v21.4h, v1.h[3]
        smlal2          v5.4s,  v21.8h, v1.h[3]
        smlal           v2.4s,  v21.4h, v1.h[4]
        smlal2          v3.4s,  v21.8h, v1.h[4]
        smlal           v4.4s,  v22.4h, v1.h[4]
        smlal2          v5.4s,  v22.8h, v1.h[4]
        smlal           v2.4s,  v22.4h, v1.h[5]
        smlal2          v3.4s,  v22.8h, v1.h[5]
        smlal           v4.4s,  v24.4h, v1.h[5]
        smlal2          v5.4s,  v24.8h, v1.h[5]
        smlal           v2.4s,  v24.4h, v1.h[6]
        smlal2          v3.4s,  v24.8h, v1.h[6]
        smlal           v4.4s,  v25.4h, v1.h[6]
        smlal2          v5.4s,  v25.8h, v1.h[6]
.else   // 8tap
        // 8-tap vertical: v2/v3 filter rows v16..v24, v4/v5 one row below.
        smull           v2.4s,  v16.4h, v1.h[0]
        smull2          v3.4s,  v16.8h, v1.h[0]
        bl              L(\type\()_\taps\()_filter_8)
        smull           v4.4s,  v17.4h, v1.h[0]
        smull2          v5.4s,  v17.8h, v1.h[0]
        smlal           v2.4s,  v17.4h, v1.h[1]
        smlal2          v3.4s,  v17.8h, v1.h[1]
        smlal           v4.4s,  v18.4h, v1.h[1]
        smlal2          v5.4s,  v18.8h, v1.h[1]
        smlal           v2.4s,  v18.4h, v1.h[2]
        smlal2          v3.4s,  v18.8h, v1.h[2]
        smlal           v4.4s,  v19.4h, v1.h[2]
        smlal2          v5.4s,  v19.8h, v1.h[2]
        smlal           v2.4s,  v19.4h, v1.h[3]
        smlal2          v3.4s,  v19.8h, v1.h[3]
        smlal           v4.4s,  v20.4h, v1.h[3]
        smlal2          v5.4s,  v20.8h, v1.h[3]
        smlal           v2.4s,  v20.4h, v1.h[4]
        smlal2          v3.4s,  v20.8h, v1.h[4]
        smlal           v4.4s,  v21.4h, v1.h[4]
        smlal2          v5.4s,  v21.8h, v1.h[4]
        smlal           v2.4s,  v21.4h, v1.h[5]
        smlal2          v3.4s,  v21.8h, v1.h[5]
        smlal           v4.4s,  v22.4h, v1.h[5]
        smlal2          v5.4s,  v22.8h, v1.h[5]
        smlal           v2.4s,  v22.4h, v1.h[6]
        smlal2          v3.4s,  v22.8h, v1.h[6]
        smlal           v4.4s,  v24.4h, v1.h[6]
        smlal2          v5.4s,  v24.8h, v1.h[6]
        smlal           v2.4s,  v24.4h, v1.h[7]
        smlal2          v3.4s,  v24.8h, v1.h[7]
        smlal           v4.4s,  v25.4h, v1.h[7]
        smlal2          v5.4s,  v25.8h, v1.h[7]
.endif
        sqrshrn         v2.4h,  v2.4s,  #\shift_hv
        sqrshrn2        v2.8h,  v3.4s,  #\shift_hv
        sqrshrn         v4.4h,  v4.4s,  #\shift_hv
        sqrshrn2        v4.8h,  v5.4s,  #\shift_hv
        subs            \h,  \h,  #2
.ifc \type, put
        sqxtun          v2.8b,  v2.8h
        sqxtun          v4.8b,  v4.8h
        st1             {v2.8b}, [\dst], \d_strd
        st1             {v4.8b}, [\ds2], \d_strd
.else
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v4.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        // Shift the row history down by two; v16/v17 only exist for 8tap.
.ifc \taps, 8tap
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
.endif
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v24.16b
        mov             v22.16b, v25.16b
        b               88b
9:      // Column done; rewind to the top and step 8 pixels to the right.
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd,  \s_strd,  #1   // back to single-row strides
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src  // undo h rows of advance
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #3 // undo the priming rows
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8         // 8 output bytes per column
.else
        add             \dst,  \dst,  #16        // 8 s16 intermediates per column
.endif
.ifc \taps, 6tap
        add             \src,  \src,  \s_strd,  lsl #1 // 6tap primed 2 fewer rows
.endif
        b               168b
0:
        ret             x15
   2614 
L(\type\()_\taps\()_filter_8_first):
        // Horizontal filter for the first (setup) row of the 8-wide hv
        // cases.  Loads 16 source bytes, widens u8->s16 and filters 8
        // output pixels by shifting the window one pixel (#2 bytes after
        // widening) per tap with ext.  Result, rounded by #2, in v16.8h.
        // Clobbers v24-v29; advances \src by \s_strd.
        ld1             {v28.8b, v29.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
.ifc \taps, 6tap
        // 6tap: the 6 mid coefficients v0.h[1..6] over pixel offsets 0..5.
        mul             v16.8h,  v28.8h,  v0.h[1]
        ext             v25.16b, v28.16b, v29.16b, #(2*1)
        ext             v26.16b, v28.16b, v29.16b, #(2*2)
        ext             v27.16b, v28.16b, v29.16b, #(2*3)
        mla             v16.8h,  v25.8h,  v0.h[2]
        mla             v16.8h,  v26.8h,  v0.h[3]
        mla             v16.8h,  v27.8h,  v0.h[4]
        ext             v24.16b, v28.16b, v29.16b, #(2*4)
        ext             v25.16b, v28.16b, v29.16b, #(2*5)
        mla             v16.8h,  v24.8h,  v0.h[5]
        mla             v16.8h,  v25.8h,  v0.h[6]
.else   // 8tap
        // 8tap: all 8 coefficients v0.h[0..7] over pixel offsets 0..7.
        mul             v16.8h,  v28.8h,  v0.h[0]
        ext             v24.16b, v28.16b, v29.16b, #(2*1)
        ext             v25.16b, v28.16b, v29.16b, #(2*2)
        ext             v26.16b, v28.16b, v29.16b, #(2*3)
        ext             v27.16b, v28.16b, v29.16b, #(2*4)
        mla             v16.8h,  v24.8h,  v0.h[1]
        mla             v16.8h,  v25.8h,  v0.h[2]
        mla             v16.8h,  v26.8h,  v0.h[3]
        mla             v16.8h,  v27.8h,  v0.h[4]
        ext             v24.16b, v28.16b, v29.16b, #(2*5)
        ext             v25.16b, v28.16b, v29.16b, #(2*6)
        ext             v26.16b, v28.16b, v29.16b, #(2*7)
        mla             v16.8h,  v24.8h,  v0.h[5]
        mla             v16.8h,  v25.8h,  v0.h[6]
        mla             v16.8h,  v26.8h,  v0.h[7]
.endif
        srshr           v16.8h,  v16.8h,  #2
        ret
   2650 
L(\type\()_\taps\()_filter_8):
        // Horizontal filter helper for the 8-wide hv cases: filters one
        // 8-pixel row from each interleaved source pointer.  Same sliding
        // ext-window scheme as filter_8_first, generated by .irpc over the
        // tap indices.  Results, rounded by #2, in v24.8h (row from \sr2)
        // and v25.8h (row from \src).  Clobbers v24-v31; advances both
        // source pointers by \s_strd.
        ld1             {v28.8b, v29.8b},  [\sr2], \s_strd
        ld1             {v30.8b, v31.8b},  [\src], \s_strd
        uxtl            v28.8h,  v28.8b
        uxtl            v29.8h,  v29.8b
        uxtl            v30.8h,  v30.8b
        uxtl            v31.8h,  v31.8b
.ifc \taps, 6tap
        mul             v24.8h,  v28.8h,  v0.h[1]
        mul             v25.8h,  v30.8h,  v0.h[1]
    .irpc i, 23456
        ext             v26.16b, v28.16b, v29.16b, #(2*\i-2)
        ext             v27.16b, v30.16b, v31.16b, #(2*\i-2)
        mla             v24.8h,  v26.8h,  v0.h[\i]
        mla             v25.8h,  v27.8h,  v0.h[\i]
    .endr
.else   // 8tap
        mul             v24.8h,  v28.8h,  v0.h[0]
        mul             v25.8h,  v30.8h,  v0.h[0]
    .irpc i, 1234567
        ext             v26.16b, v28.16b, v29.16b, #(2*\i)
        ext             v27.16b, v30.16b, v31.16b, #(2*\i)
        mla             v24.8h,  v26.8h,  v0.h[\i]
        mla             v25.8h,  v27.8h,  v0.h[\i]
    .endr
.endif
        srshr           v24.8h,  v24.8h, #2
        srshr           v25.8h,  v25.8h, #2
        ret
endfunc
   2681 
// Width-dispatch table for the hv path: PC-relative .word offsets, entries
// ordered from widest (1280) to narrowest (20) so that an index computed
// from clz(w) selects the right one.
jumptable \type\()_\taps\()_hv_tbl
        .word 1280b - \type\()_\taps\()_hv_tbl
        .word 640b  - \type\()_\taps\()_hv_tbl
        .word 320b  - \type\()_\taps\()_hv_tbl
        .word 160b  - \type\()_\taps\()_hv_tbl
        .word 80b   - \type\()_\taps\()_hv_tbl
        .word 40b   - \type\()_\taps\()_hv_tbl
        .word 20b   - \type\()_\taps\()_hv_tbl
endjumptable
.endm
   2692 
   2693 
// Bilinear put/prep entry point (8 bpc).  The fractional position mx/my is
// in 1/16ths; weights are kept as byte vectors:
//   v0 = 16-mx, v1 = mx  (horizontal)   v2 = 16-my, v3 = my  (vertical)
.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, ds2, sr2, shift_hv
function \type\()_bilin_8bpc_neon, export=1
        dup             v1.16b, \mx
        dup             v3.16b, \my
        mov             w9,  #16
        sub             w8, w9, \mx
        sub             w9, w9, \my
        dup             v0.16b, w8
        dup             v2.16b, w9
.ifc \type, prep
        // prep writes 16-bit intermediates to a packed buffer: stride = 2*w.
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        // Jump-table index from the block width: clz(w)-24, so wider blocks
        // map to earlier table entries.
        clz             w8,  \w
        sub             w8,  w8,  #24
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_neon   // mx == my == 0: plain copy

L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

        // Horizontal-only: dispatch on width.
        movrel          x9,  \type\()_bilin_h_tbl
        ldrsw           x8,  [x9, x8, lsl #2]
        add             x9,  x9,  x8
        br              x9
   2721 
20:     // 2xN h
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        // Width 2 only exists for put.  Two rows per iteration: each row
        // and its 1-pixel-shifted copy are blended with the (16-mx, mx)
        // weights, then rounded by #4 back to pixel range.
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
2:
        ld1r            {v4.4s},  [\src], \s_strd
        ld1r            {v6.4s},  [\sr2], \s_strd
        ext             v5.8b,  v4.8b,  v4.8b, #1
        ext             v7.8b,  v6.8b,  v6.8b, #1
        trn1            v4.4h,  v4.4h,  v6.4h   // pack both rows into one vector
        trn1            v5.4h,  v5.4h,  v7.4h
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif
   2745 
40:     // 4xN h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
4:      // Two 4-pixel rows per iteration, packed into one 8-lane vector.
        ld1             {v4.8b}, [\src], \s_strd
        ld1             {v6.8b}, [\sr2], \s_strd
        ext             v5.8b,  v4.8b,  v4.8b, #1   // the row shifted one pixel
        ext             v7.8b,  v6.8b,  v6.8b, #1
        trn1            v4.2s,  v4.2s,  v6.2s
        trn1            v5.2s,  v5.2s,  v7.2s
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b   // (16-mx)*p + mx*p[+1]
        umlal           v4.8h,  v5.8b,  v1.8b
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4      // round back to pixel range
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        // prep: store the 16-bit intermediate (4 halfwords per row).
        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.gt            4b
        ret
   2772 
80:     // 8xN h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
8:      // Two 8-pixel rows per iteration.
        ld1             {v4.16b}, [\src], \s_strd
        ld1             {v6.16b}, [\sr2], \s_strd
        ext             v5.16b, v4.16b, v4.16b, #1  // the row shifted one pixel
        ext             v7.16b, v6.16b, v6.16b, #1
        subs            \h,  \h,  #2
        umull           v4.8h,  v4.8b,  v0.8b   // (16-mx)*p + mx*p[+1]
        umull           v6.8h,  v6.8b,  v0.8b
        umlal           v4.8h,  v5.8b,  v1.8b
        umlal           v6.8h,  v7.8b,  v1.8b
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4      // round back to pixel range
        uqrshrn         v6.8b,  v6.8h,  #4
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v6.8b}, [\ds2], \d_strd
.else
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
.endif
        b.gt            8b
        ret
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1

        // The inner loop advances src by w+8 bytes (8-byte preload plus
        // 16-byte chunks), so pre-subtract that from the doubled stride.
        sub             \s_strd,  \s_strd,  \w, uxtw
        sub             \s_strd,  \s_strd,  #8
.ifc \type, put
        lsl             \d_strd,  \d_strd,  #1
        sub             \d_strd,  \d_strd,  \w, uxtw
.endif
161:    // Per-row-pair loop.  Preload 8 bytes into the upper halves of
        // v16/v20 so each 16-byte chunk can be combined with its
        // predecessor via ext (offsets #8 and #9 = pixel positions 0 and 1).
        ld1             {v16.d}[1],  [\src], #8
        ld1             {v20.d}[1],  [\sr2], #8
        mov             \mx, \w                 // \mx reused as the column counter

16:
        ld1             {v18.16b},  [\src], #16
        ld1             {v22.16b},  [\sr2], #16
        ext             v17.16b, v16.16b, v18.16b, #8
        ext             v19.16b, v16.16b, v18.16b, #9
        ext             v21.16b, v20.16b, v22.16b, #8
        ext             v23.16b, v20.16b, v22.16b, #9
        umull           v16.8h,  v17.8b,  v0.8b  // (16-mx)*p + mx*p[+1]
        umull2          v17.8h,  v17.16b, v0.16b
        umull           v20.8h,  v21.8b,  v0.8b
        umull2          v21.8h,  v21.16b, v0.16b
        umlal           v16.8h,  v19.8b,  v1.8b
        umlal2          v17.8h,  v19.16b, v1.16b
        umlal           v20.8h,  v23.8b,  v1.8b
        umlal2          v21.8h,  v23.16b, v1.16b
        subs            \mx, \mx, #16
.ifc \type, put
        uqrshrn         v16.8b,  v16.8h, #4     // round back to pixel range
        uqrshrn2        v16.16b, v17.8h, #4
        uqrshrn         v20.8b,  v20.8h, #4
        uqrshrn2        v20.16b, v21.8h, #4
        st1             {v16.16b}, [\dst], #16
        st1             {v20.16b}, [\ds2], #16
.else
        st1             {v16.8h, v17.8h}, [\dst], #32
        st1             {v20.8h, v21.8h}, [\ds2], #32
.endif
        b.le            9f

        // Carry the last chunk over as the next iteration's predecessor.
        mov             v16.16b, v18.16b
        mov             v20.16b, v22.16b
        b               16b

9:      // Advance both row pointers to the next pair of rows.
        add             \dst,  \dst,  \d_strd
        add             \ds2,  \ds2,  \d_strd
        add             \src,  \src,  \s_strd
        add             \sr2,  \sr2,  \s_strd

        subs            \h,  \h,  #2
        b.gt            161b
        ret
endfunc
   2863 
// Width-dispatch table for the horizontal bilinear path: PC-relative .word
// offsets, widest entry first, indexed by clz(w)-24 (computed at entry).
jumptable \type\()_bilin_h_tbl
        .word 1280b - \type\()_bilin_h_tbl
        .word 640b  - \type\()_bilin_h_tbl
        .word 320b  - \type\()_bilin_h_tbl
        .word 160b  - \type\()_bilin_h_tbl
        .word 80b   - \type\()_bilin_h_tbl
        .word 40b   - \type\()_bilin_h_tbl
        .word 20b   - \type\()_bilin_h_tbl
endjumptable
   2873 
   2874 
// Vertical-only bilinear filter. v2.8b/v3.8b hold the two vertical
// taps (set up by the caller). For put, each output pixel is the
// rounded, saturating (tap0*row[y] + tap1*row[y+1]) >> 4; for prep,
// the unshifted 16-bit sum is stored instead.
function L(\type\()_bilin_v)
        cmp             \h,  #4
        // Dispatch on the width class via the relative-offset table.
        movrel          x9,  \type\()_bilin_v_tbl
        ldrsw           x8,  [x9, x8, lsl #2]
        add             x9,  x9,  x8
        br              x9

20:     // 2xN v
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        // Two output rows per iteration: run a second src/dst pointer
        // one row ahead and double both strides.
        cmp             \h,  #2
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1

        // 2x2 v
        ld1r            {v16.8h}, [\src], \s_strd
        b.gt            24f
22:
        ld1r            {v17.8h}, [\sr2], \s_strd
        ld1r            {v18.8h}, [\src], \s_strd
        // Pair consecutive rows so one umull/umlal covers both
        // output rows at once.
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        uqrshrn         v4.8b,  v4.8h,  #4
        str             h4,        [\dst]
        st1             {v4.h}[1], [\ds2]
        ret
24:     // 2x4, 2x6, 2x8, ... v
        ld1r            {v17.8h}, [\sr2], \s_strd
        ld1r            {v18.8h}, [\src], \s_strd
        ld1r            {v19.8h}, [\sr2], \s_strd
        ld1r            {v20.8h}, [\src], \s_strd
        sub             \h,  \h,  #4
        // Interleave five consecutive rows into two vectors of 2-pixel
        // pairs, producing four output rows per pass.
        trn1            v16.4h, v16.4h, v17.4h
        trn1            v17.4h, v17.4h, v18.4h
        trn1            v18.4h, v18.4h, v19.4h
        trn1            v19.4h, v19.4h, v20.4h
        trn1            v16.2s, v16.2s, v18.2s
        trn1            v17.2s, v17.2s, v19.2s
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        cmp             \h,  #2
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        st1             {v4.h}[2], [\dst], \d_strd
        st1             {v4.h}[3], [\ds2], \d_strd
        b.lt            0f
        // The last loaded row is the lead-in for the next group;
        // h == 2 left -> finish via the 2-row tail at 22.
        mov             v16.8b, v20.8b
        b.eq            22b
        b               24b
0:
        ret
.endif

40:     // 4xN v
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1r            {v16.4s}, [\src], \s_strd
4:
        ld1r            {v17.4s}, [\sr2], \s_strd
        ld1r            {v18.4s}, [\src], \s_strd
        // Two 4-pixel rows packed per vector; one multiply pair
        // computes both output rows.
        trn1            v16.2s, v16.2s, v17.2s
        trn1            v17.2s, v17.2s, v18.2s
        umull           v4.8h,  v16.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        // prep: store the full 16-bit intermediates (8 bytes per row).
        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b, v18.8b
        b               4b
0:
        ret

80:     // 8xN v
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.8b}, [\src], \s_strd
8:
        ld1             {v17.8b}, [\sr2], \s_strd
        ld1             {v18.8b}, [\src], \s_strd
        umull           v4.8h,  v16.8b,  v2.8b
        umull           v5.8h,  v17.8b,  v2.8b
        umlal           v4.8h,  v17.8b,  v3.8b
        umlal           v5.8h,  v18.8b,  v3.8b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn         v5.8b,  v5.8h,  #4
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v5.8b}, [\ds2], \d_strd
.else
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
.endif
        b.le            0f
        mov             v16.8b, v18.8b
        b               8b
0:
        ret

160:    // 16xN, 32xN, ...
320:
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        // Wider blocks are processed as vertical strips of 16 pixels;
        // \my backs up the block height for the following strips.
        mov             \my,  \h
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.16b}, [\src], \s_strd
2:
        ld1             {v17.16b}, [\sr2], \s_strd
        ld1             {v18.16b}, [\src], \s_strd
        umull           v4.8h,  v16.8b,  v2.8b
        umull2          v5.8h,  v16.16b, v2.16b
        umull           v6.8h,  v17.8b,  v2.8b
        umull2          v7.8h,  v17.16b, v2.16b
        umlal           v4.8h,  v17.8b,  v3.8b
        umlal2          v5.8h,  v17.16b, v3.16b
        umlal           v6.8h,  v18.8b,  v3.8b
        umlal2          v7.8h,  v18.16b, v3.16b
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #4
        uqrshrn2        v4.16b, v5.8h,  #4
        uqrshrn         v6.8b,  v6.8h,  #4
        uqrshrn2        v6.16b, v7.8h,  #4
        st1             {v4.16b}, [\dst], \d_strd
        st1             {v6.16b}, [\ds2], \d_strd
.else
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        // Strip done; advance 16 pixels to the right if any remain.
        subs            \w,  \w,  #16
        b.le            0f
        // Restore single-row strides, then rewind src/dst to the top
        // of the block: h rows via msub (\xmy holds the saved height),
        // plus the extra rows consumed by the lead-in load of src.
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src
        msub            \dst, \d_strd, \xmy, \dst
        sub             \src, \src, \s_strd, lsl #1
        mov             \h,  \my
        add             \src, \src, #16
.ifc \type, put
        add             \dst, \dst, #16
.else
        // prep output is 16 bit, so 16 pixels = 32 bytes.
        add             \dst, \dst, #32
.endif
        b               1b
0:
        ret
endfunc
   3050 
// Width-dispatch table for the vertical bilin filter; same layout as
// the horizontal table (relative offsets, widest case first).
jumptable \type\()_bilin_v_tbl
        .word 1280b - \type\()_bilin_v_tbl
        .word 640b  - \type\()_bilin_v_tbl
        .word 320b  - \type\()_bilin_v_tbl
        .word 160b  - \type\()_bilin_v_tbl
        .word 80b   - \type\()_bilin_v_tbl
        .word 40b   - \type\()_bilin_v_tbl
        .word 20b   - \type\()_bilin_v_tbl
endjumptable
   3060 
// Combined horizontal+vertical bilinear filter. v0/v1 hold the two
// horizontal taps (8 bit); v2/v3 hold the vertical taps, widened to
// 16 bit here because they multiply the 16-bit horizontal
// intermediates in the second pass.
function L(\type\()_bilin_hv)
        uxtl            v2.8h, v2.8b
        uxtl            v3.8h, v3.8b
        movrel          x9,  \type\()_bilin_hv_tbl
        ldrsw           x8,  [x9, x8, lsl #2]
        add             x9,  x9,  x8
        br              x9

20:     // 2xN hv
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Prime the pipeline: horizontal filter of the first row
        // (ext shifts the row left by one pixel for the second tap).
        ld1r            {v28.4s},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

2:
        ld1r            {v28.4s},  [\sr2], \s_strd
        ld1r            {v30.4s},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        // Pack two rows per vector so one multiply pair filters both.
        trn1            v28.4h, v28.4h, v30.4h
        trn1            v29.4h, v29.4h, v31.4h
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b

        trn1            v16.2s, v16.2s, v17.2s

        // Vertical pass over the 16-bit intermediates; the #8 shift
        // folds together the horizontal and vertical #4 shifts.
        mul             v4.4h,  v16.4h, v2.4h
        mla             v4.4h,  v17.4h, v3.4h
        uqrshrn         v4.8b,  v4.8h,  #8
        subs            \h,  \h,  #2
        st1             {v4.h}[0], [\dst], \d_strd
        st1             {v4.h}[1], [\ds2], \d_strd
        b.le            0f
        // Carry the newest row's intermediate into the next iteration.
        trn2            v16.2s, v17.2s, v17.2s
        b               2b
0:
        ret
.endif

40:     // 4xN hv
        AARCH64_VALID_JUMP_TARGET
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Prime with the horizontally filtered first row.
        ld1             {v28.8b},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

4:
        ld1             {v28.8b},  [\sr2], \s_strd
        ld1             {v30.8b},  [\src], \s_strd
        ext             v29.8b, v28.8b, v28.8b, #1
        ext             v31.8b, v30.8b, v30.8b, #1
        trn1            v28.2s, v28.2s, v30.2s
        trn1            v29.2s, v29.2s, v31.2s
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b

        trn1            v16.2d, v16.2d, v17.2d

        mul             v4.8h,  v16.8h, v2.8h
        mla             v4.8h,  v17.8h, v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #8
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
.else
        // prep: only round off the vertical #4, keeping 16-bit output.
        urshr           v4.8h,  v4.8h,  #4
        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
.endif
        b.le            0f
        trn2            v16.2d, v17.2d, v17.2d
        b               4b
0:
        ret

80:     // 8xN, 16xN, ... hv
160:
320:
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        // Wider blocks run as 8-pixel column strips; \my backs up the
        // height for the following strips.
        mov             \my,  \h

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Prime with the horizontally filtered first row; the 16-byte
        // load provides the extra pixel needed by the ext.
        ld1             {v28.16b},  [\src], \s_strd
        ext             v29.16b, v28.16b, v28.16b, #1
        umull           v16.8h, v28.8b, v0.8b
        umlal           v16.8h, v29.8b, v1.8b

2:
        ld1             {v28.16b},  [\sr2], \s_strd
        ld1             {v30.16b},  [\src], \s_strd
        ext             v29.16b, v28.16b, v28.16b, #1
        ext             v31.16b, v30.16b, v30.16b, #1
        umull           v17.8h, v28.8b, v0.8b
        umlal           v17.8h, v29.8b, v1.8b
        umull           v18.8h, v30.8b, v0.8b
        umlal           v18.8h, v31.8b, v1.8b

        // Vertical pass: v16/v17/v18 are three consecutive rows of
        // horizontal intermediates.
        mul             v4.8h,  v16.8h, v2.8h
        mla             v4.8h,  v17.8h, v3.8h
        mul             v5.8h,  v17.8h, v2.8h
        mla             v5.8h,  v18.8h, v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        uqrshrn         v4.8b,  v4.8h,  #8
        uqrshrn         v5.8b,  v5.8h,  #8
        st1             {v4.8b}, [\dst], \d_strd
        st1             {v5.8b}, [\ds2], \d_strd
.else
        urshr           v4.8h,  v4.8h,  #4
        urshr           v5.8h,  v5.8h,  #4
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
.endif
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        // Strip done; advance 8 pixels to the right if any remain.
        subs            \w,  \w,  #8
        b.le            0f
        // Restore single-row strides, rewind src/dst to the top of the
        // block (h rows via msub with the saved height in \xmy, plus
        // the rows consumed by the lead-in load of src).
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src
        msub            \dst,  \d_strd,  \xmy,  \dst
        sub             \src,  \src,  \s_strd,  lsl #1
        mov             \h,  \my
        add             \src,  \src,  #8
.ifc \type, put
        add             \dst,  \dst,  #8
.else
        // prep output is 16 bit, so 8 pixels = 16 bytes.
        add             \dst,  \dst,  #16
.endif
        b               1b
0:
        ret
endfunc
   3216 
// Width-dispatch table for the combined hv bilin filter; same layout
// as the h/v tables (relative offsets, widest case first).
jumptable \type\()_bilin_hv_tbl
        .word 1280b - \type\()_bilin_hv_tbl
        .word 640b  - \type\()_bilin_hv_tbl
        .word 320b  - \type\()_bilin_hv_tbl
        .word 160b  - \type\()_bilin_hv_tbl
        .word 80b   - \type\()_bilin_hv_tbl
        .word 40b   - \type\()_bilin_hv_tbl
        .word 20b   - \type\()_bilin_hv_tbl
endjumptable
   3226 .endm
   3227 
// Instantiate the put/prep mc filter functions. Filter combinations
// that involve a SHARP filter need the full 8-tap kernels; the
// remaining combinations get by with the 6-tap kernels. The register
// arguments map dst/dst_stride/src/src_stride, w/h, mx/my and scratch
// registers onto the macro parameters (prep has no dst stride, hence
// the shifted assignment); the numeric argument (10 for put, 6 for
// prep) is presumably an intermediate-precision/shift parameter of
// filter_fn — see the macro definition earlier in this file.
make_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
make_8tap_fn    put,  smooth_sharp,   SMOOTH,  SHARP,   8tap
make_8tap_fn    put,  sharp,          SHARP,   SHARP,   8tap
make_8tap_fn    put,  sharp_regular,  SHARP,   REGULAR, 8tap
make_8tap_fn    put,  sharp_smooth,   SHARP,   SMOOTH,  8tap
filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 8tap

make_8tap_fn    put,  regular,        REGULAR, REGULAR, 6tap
make_8tap_fn    put,  regular_smooth, REGULAR, SMOOTH,  6tap
make_8tap_fn    put,  smooth,         SMOOTH,  SMOOTH,  6tap
make_8tap_fn    put,  smooth_regular, SMOOTH,  REGULAR, 6tap
filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10, 6tap
filter_bilin_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, x8, x9, 10

make_8tap_fn    prep, regular_sharp,  REGULAR, SHARP,   8tap
make_8tap_fn    prep, smooth_sharp,   SMOOTH,  SHARP,   8tap
make_8tap_fn    prep, sharp,          SHARP,   SHARP,   8tap
make_8tap_fn    prep, sharp_regular,  SHARP,   REGULAR, 8tap
make_8tap_fn    prep, sharp_smooth,   SHARP,   SMOOTH,  8tap
filter_fn       prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6,  8tap

make_8tap_fn    prep, regular,        REGULAR, REGULAR, 6tap
make_8tap_fn    prep, regular_smooth, REGULAR, SMOOTH,  6tap
make_8tap_fn    prep, smooth,         SMOOTH,  SMOOTH,  6tap
make_8tap_fn    prep, smooth_regular, SMOOTH,  REGULAR, 6tap
filter_fn       prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6,  6tap
filter_bilin_fn prep, x0, x7, x1, x2, w3, w4, w5, x5, w6, x6, x8, x9, 6
   3255 
   3256 
// Load one 8-byte warp filter row into \dst: the filter index is the
// integer part (\src >> 10) of the fixed-point position in \src, and
// \src is advanced by \inc for the next pixel/row. x11 must point at
// the 8-bytes-per-entry filter table (mc_warp_filter).
// Clobbers w13.
.macro load_filter_row dst, src, inc
        asr             w13, \src, #10
        add             \src, \src, \inc
        ldr             \dst, [x11, w13, sxtw #3]
.endm
   3262 
// Horizontal filter pass for one source row of the 8x8 warp:
// produces the eight 16-bit (unshifted) filter sums in v0.8h.
// In:  x2  = src row (advanced by the stride in x3)
//      w5  = fixed-point filter position for this row (advanced by w8,
//            i.e. beta, on return); w7 = per-pixel step (alpha)
//      x11 = filter table; v22.8b = 128 (pixel sign bias)
// Clobbers v0-v7, v16-v20, w12, w13.
function warp_filter_horz_neon
        add             w12, w5,  #512         // round the position for >>10

        ld1             {v16.8b, v17.8b}, [x2], x3

        load_filter_row d0, w12, w7
        load_filter_row d1, w12, w7
        load_filter_row d2, w12, w7
        load_filter_row d3, w12, w7
        load_filter_row d4, w12, w7
        load_filter_row d5, w12, w7
        load_filter_row d6, w12, w7
        // subtract by 128 to allow using smull
        eor             v16.8b,  v16.8b,  v22.8b
        eor             v17.8b,  v17.8b,  v22.8b
        load_filter_row d7, w12, w7

        // Each output pixel n filters src[n..n+7] with its own 8-tap
        // filter vN; ext produces the shifted source windows.
        ext             v18.8b,  v16.8b,  v17.8b,  #1
        ext             v19.8b,  v16.8b,  v17.8b,  #2
        smull           v0.8h,   v0.8b,   v16.8b
        smull           v1.8h,   v1.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #3
        ext             v20.8b,  v16.8b,  v17.8b,  #4
        smull           v2.8h,   v2.8b,   v19.8b
        smull           v3.8h,   v3.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #5
        ext             v19.8b,  v16.8b,  v17.8b,  #6
        smull           v4.8h,   v4.8b,   v20.8b
        smull           v5.8h,   v5.8b,   v18.8b
        ext             v18.8b,  v16.8b,  v17.8b,  #7
        smull           v6.8h,   v6.8b,   v19.8b
        smull           v7.8h,   v7.8b,   v18.8b

        // Pairwise-add tree: reduce the eight 8-term products down to
        // one horizontal sum per output pixel.
        addp            v0.8h,   v0.8h,   v1.8h
        addp            v2.8h,   v2.8h,   v3.8h
        addp            v4.8h,   v4.8h,   v5.8h
        addp            v6.8h,   v6.8h,   v7.8h

        addp            v0.8h,   v0.8h,   v2.8h
        addp            v4.8h,   v4.8h,   v6.8h

        addp            v0.8h,   v0.8h,   v4.8h

        add             w5,  w5,  w8           // advance position by beta

        ret
endfunc
   3310 
// void dav1d_warp_affine_8x8_8bpc_neon(
//         pixel *dst, const ptrdiff_t dst_stride,
//         const pixel *src, const ptrdiff_t src_stride,
//         const int16_t *const abcd, int mx, int my)
//
// Instantiated twice: \t empty produces the 8-bit "put" output with
// final shift 11, \t = t produces the 16-bit intermediate output
// with final shift 7 (see the two "warp" invocations below).
.macro warp t, shift
function warp_affine_8x8\t\()_8bpc_neon, export=1
        // Unpack the four packed int16 parameters from *abcd:
        // x7/x8/x9/x4 = abcd[0..3] (per-pixel/per-row position steps
        // for the horizontal and vertical filters).
        ldr             x4,  [x4]
        sbfx            x7,  x4, #0,  #16
        sbfx            x8,  x4, #16, #16
        sbfx            x9,  x4, #32, #16
        sbfx            x4,  x4, #48, #16
        mov             w10, #8                // 8 output rows
        // Step back 3 rows and 3 pixels for the 8-tap filter support.
        sub             x2,  x2,  x3, lsl #1
        sub             x2,  x2,  x3
        sub             x2,  x2,  #3
        movrel          x11, X(mc_warp_filter), 64*8
        mov             x15, x30               // save LR across bl calls
.ifnb \t
        lsl             x1,  x1,  #1           // 16-bit output stride
.endif

        movi            v22.8b,  #128          // pixel sign bias for smull
.ifb \t
        movi            v23.8h,  #128          // output offset (put)
.else
        movi            v23.8h,  #8, lsl #8    // output offset (prep)
.endif

        // Prime the first 7 rows of the 8-row vertical filter window
        // in v24-v30 (each srshr'd by 3 to narrow the intermediates).
        bl              warp_filter_horz_neon
        srshr           v24.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v25.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v26.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v27.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v28.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v29.8h,  v0.8h,  #3
        bl              warp_filter_horz_neon
        srshr           v30.8h,  v0.8h,  #3

1:
        // Fetch the 8th window row, then the 8 per-column vertical
        // filters (w14 steps by w9 per column; w6 advances by w4,
        // the per-row step, at the bottom of the loop).
        add             w14, w6,  #512
        bl              warp_filter_horz_neon
        srshr           v31.8h,  v0.8h,  #3

        load_filter_row d0, w14, w9
        load_filter_row d1, w14, w9
        load_filter_row d2, w14, w9
        load_filter_row d3, w14, w9
        load_filter_row d4, w14, w9
        load_filter_row d5, w14, w9
        load_filter_row d6, w14, w9
        load_filter_row d7, w14, w9
        // Transpose so vN holds tap N of all 8 columns, sign-extended
        // to 16 bit.
        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl

        // This ordering of smull/smlal/smull2/smlal2 is highly
        // beneficial for Cortex A53 here.
        smull           v16.4s,  v24.4h,  v0.4h
        smlal           v16.4s,  v25.4h,  v1.4h
        smlal           v16.4s,  v26.4h,  v2.4h
        smlal           v16.4s,  v27.4h,  v3.4h
        smlal           v16.4s,  v28.4h,  v4.4h
        smlal           v16.4s,  v29.4h,  v5.4h
        smlal           v16.4s,  v30.4h,  v6.4h
        smlal           v16.4s,  v31.4h,  v7.4h
        smull2          v17.4s,  v24.8h,  v0.8h
        smlal2          v17.4s,  v25.8h,  v1.8h
        smlal2          v17.4s,  v26.8h,  v2.8h
        smlal2          v17.4s,  v27.8h,  v3.8h
        smlal2          v17.4s,  v28.8h,  v4.8h
        smlal2          v17.4s,  v29.8h,  v5.8h
        smlal2          v17.4s,  v30.8h,  v6.8h
        smlal2          v17.4s,  v31.8h,  v7.8h

        // Slide the 8-row window down by one row (interleaved with the
        // narrowing for scheduling).
        mov             v24.16b, v25.16b
        mov             v25.16b, v26.16b
        sqrshrn         v16.4h,  v16.4s,  #\shift
        mov             v26.16b, v27.16b
        sqrshrn2        v16.8h,  v17.4s,  #\shift
        mov             v27.16b, v28.16b
        mov             v28.16b, v29.16b
        // v23 compensates for the -128 pixel bias applied in the
        // horizontal pass (and provides the prep output offset).
        add             v16.8h,  v16.8h,  v23.8h
.ifb \t
        sqxtun          v16.8b,  v16.8h
.endif
        mov             v29.16b, v30.16b
        mov             v30.16b, v31.16b
        subs            w10, w10, #1
.ifnb \t
        st1             {v16.8h}, [x0], x1
.else
        st1             {v16.8b}, [x0], x1
.endif

        add             w6,  w6,  w4
        b.gt            1b

        ret             x15
endfunc
.endm
   3414 
// put variant: narrow to 8-bit pixels (final shift 11);
// "t" (prep) variant: keep 16-bit intermediates (final shift 7).
warp  , 11
warp t, 7
   3417 
   3418 // void dav1d_emu_edge_8bpc_neon(
   3419 //         const intptr_t bw, const intptr_t bh,
   3420 //         const intptr_t iw, const intptr_t ih,
   3421 //         const intptr_t x, const intptr_t y,
   3422 //         pixel *dst, const ptrdiff_t dst_stride,
   3423 //         const pixel *ref, const ptrdiff_t ref_stride)
// Builds a bw x bh block at dst by copying the part of ref that lies
// inside the iw x ih image and replicating the image edges elsewhere.
// Register map on entry (see the C prototype above):
//   x0 = bw, x1 = bh, x2 = iw, x3 = ih, x4 = x, x5 = y,
//   x6 = dst, x7 = dst_stride; x8 = ref, x9 = ref_stride (from stack).
function emu_edge_8bpc_neon, export=1
        ldp             x8,  x9,  [sp]

        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
        // ref += iclip(x, 0, iw - 1)
        // (bic with the arithmetic-shift sign mask implements max(v,0))
        sub             x12, x3,  #1           // ih - 1
        cmp             x5,  x3
        sub             x13, x2,  #1           // iw - 1
        csel            x12, x12, x5,  ge      // min(y, ih - 1)
        cmp             x4,  x2
        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
        csel            x13, x13, x4,  ge      // min(x, iw - 1)
        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
        add             x8,  x8,  x13          // ref += iclip()

        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
        // top_ext = iclip(-y, 0, bh - 1)
        add             x10, x5,  x1           // y + bh
        neg             x5,  x5                // -y
        sub             x10, x10, x3           // y + bh - ih
        sub             x12, x1,  #1           // bh - 1
        cmp             x10, x1
        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
        cmp             x5,  x1
        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)

        // right_ext = iclip(x + bw - iw, 0, bw - 1)
        // left_ext = iclip(-x, 0, bw - 1)
        add             x11, x4,  x0           // x + bw
        neg             x4,  x4                // -x
        sub             x11, x11, x2           // x + bw - iw
        sub             x13, x0,  #1           // bw - 1
        cmp             x11, x0
        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
        cmp             x4,  x0
        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)

        // center_h = bh - top_ext - bottom_ext
        // dst += top_ext * PXSTRIDE(dst_stride)
        // center_w = bw - left_ext - right_ext
        sub             x1,  x1,  x5           // bh - top_ext
        madd            x6,  x5,  x7,  x6
        sub             x2,  x0,  x4           // bw - left_ext
        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext

        mov             x14, x6                // backup of dst

// Horizontal pass for the center_h middle rows: optionally replicate
// the leftmost source pixel into the left extension, copy center_w
// pixels, optionally replicate the rightmost pixel into the right
// extension. Stores run in 16/32-byte chunks, so they can overshoot
// the exact widths; presumably the dst buffer is padded to allow
// this — confirm against the C caller.
.macro v_loop need_left, need_right
0:
.if \need_left
        ld1r            {v0.16b}, [x8]         // splat leftmost pixel
        mov             x12, x6                // out = dst
        mov             x3,  x4
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif
        mov             x13, x8
        add             x12, x6,  x4           // out = dst + left_ext
        mov             x3,  x2
1:
        ld1             {v0.16b, v1.16b}, [x13], #32
        subs            x3,  x3,  #32
        st1             {v0.16b, v1.16b}, [x12], #32
        b.gt            1b
.if \need_right
        add             x3,  x8,  x2           // in + center_w
        sub             x3,  x3,  #1           // in + center_w - 1
        add             x12, x6,  x4           // dst + left_ext
        ld1r            {v0.16b}, [x3]         // splat rightmost pixel
        add             x12, x12, x2           // out = dst + left_ext + center_w
        mov             x3,  x11
1:
        subs            x3,  x3,  #16
        st1             {v0.16b}, [x12], #16
        b.gt            1b
.endif

        subs            x1,  x1,  #1           // center_h--
        add             x6,  x6,  x7
        add             x8,  x8,  x9
        b.gt            0b
.endm

        // Pick the v_loop variant matching which side extensions are
        // needed (left_ext in x4, right_ext in x11).
        cbz             x4,  2f
        // need_left
        cbz             x11, 3f
        // need_left + need_right
        v_loop          1,   1
        b               5f

2:
        // !need_left
        cbz             x11, 4f
        // !need_left + need_right
        v_loop          0,   1
        b               5f

3:
        // need_left + !need_right
        v_loop          1,   0
        b               5f

4:
        // !need_left + !need_right
        v_loop          0,   0

5:

        // Vertical extension: replicate the last written row downwards
        // bottom_ext (x10) times, 32 bytes per column chunk.
        cbz             x10, 3f
        // need_bottom
        sub             x8,  x6,  x7           // ref = dst - stride
        mov             x4,  x0
1:
        ld1             {v0.16b, v1.16b}, [x8], #32
        mov             x3,  x10
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
        subs            x4,  x4,  #32          // bw -= 32
        add             x6,  x6,  #32          // dst += 32
        b.gt            1b

3:
        // Replicate the first written row upwards top_ext (x5) times;
        // x14 still holds the original dst (first center row).
        cbz             x5,  3f
        // need_top
        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
1:
        ld1             {v0.16b, v1.16b}, [x14], #32
        mov             x3,  x5
2:
        subs            x3,  x3,  #1
        st1             {v0.16b, v1.16b}, [x6], x7
        b.gt            2b
        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
        subs            x0,  x0,  #32          // bw -= 32
        add             x6,  x6,  #32          // dst += 32
        b.gt            1b

3:
        ret
endfunc