tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc16.S (151422B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2018, Janne Grunau
      4 * Copyright © 2020, Martin Storsjo
      5 * All rights reserved.
      6 *
      7 * Redistribution and use in source and binary forms, with or without
      8 * modification, are permitted provided that the following conditions are met:
      9 *
     10 * 1. Redistributions of source code must retain the above copyright notice, this
     11 *    list of conditions and the following disclaimer.
     12 *
     13 * 2. Redistributions in binary form must reproduce the above copyright notice,
     14 *    this list of conditions and the following disclaimer in the documentation
     15 *    and/or other materials provided with the distribution.
     16 *
     17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     27 */
     28 
     29 #include "src/arm/asm.S"
     30 #include "util.S"
     31 
     32 #define PREP_BIAS 8192
     33 
// avg: plain average of two rows of intermediate ("prep") values.
// Loads 16 halfwords from each of the two intermediate buffers (x2, x3,
// both post-incremented by 32 bytes), saturating-adds the pairs, clamps
// from below at v28, then saturating-subtracts v28 again (removing the
// bias) and arithmetic-right-shifts via sshl with a negative count,
// producing the two output vectors \d0/\d1.
// Expects (set up by bidir_fn): v28 = -2*PREP_BIAS - (1 << intermediate_bits),
// v29 = -(intermediate_bits+1). \t0-\t3 are clobbered as scratch.
      34 .macro avg d0, d1, t0, t1, t2, t3
      35        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
      36        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
      37        sqadd           \t0\().8h,  \t0\().8h,  \t2\().8h
      38        sqadd           \t1\().8h,  \t1\().8h,  \t3\().8h
      39        smax            \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
      40        smax            \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
      41        sqsub           \t0\().8h,  \t0\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
      42        sqsub           \t1\().8h,  \t1\().8h,  v28.8h // -2*PREP_BIAS - 1 << intermediate_bits
      43        sshl            \d0\().8h,  \t0\().8h,  v29.8h // -(intermediate_bits+1)
      44        sshl            \d1\().8h,  \t1\().8h,  v29.8h // -(intermediate_bits+1)
      45 .endm
     46 
// w_avg: weighted average of two rows of intermediate values.
// Computes tmp2 + (((tmp2 - tmp1) * v27) >> 4) in 32-bit lanes (the
// 17-bit difference forces the widening), narrows with uzp1, then
// rounding-right-shifts by intermediate_bits, adds the bias and clamps
// to [0, bitdepth_max].
// Expects (set up by bidir_fn): v27 = -w6 (negated blend weight, .4s),
// v28 = PREP_BIAS >> intermediate_bits, v29 = -intermediate_bits,
// v30 = 0, v31 = bitdepth_max. \t0-\t3 are clobbered as scratch.
      47 .macro w_avg d0, d1, t0, t1, t2, t3
      48        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
      49        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
      50        // This difference requires a 17 bit range, and all bits are
      51        // significant for the following multiplication.
      52        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
      53        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
      54        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
      55        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
      56        mul             \d0\().4s,  \d0\().4s,  v27.4s
      57        mul             \t0\().4s,  \t0\().4s,  v27.4s
      58        mul             \d1\().4s,  \d1\().4s,  v27.4s
      59        mul             \t1\().4s,  \t1\().4s,  v27.4s
      60        sshr            \d0\().4s,  \d0\().4s,  #4
      61        sshr            \t0\().4s,  \t0\().4s,  #4
      62        sshr            \d1\().4s,  \d1\().4s,  #4
      63        sshr            \t1\().4s,  \t1\().4s,  #4
      64        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
      65        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
      66        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
      67        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
      68        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h // Same as xtn, xtn2
      69        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h // Ditto
      70        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
      71        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
      72        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
      73        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
      74        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
      75        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
      76        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
      77        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
      78 .endm
     79 
// mask: per-pixel masked blend of two rows of intermediate values.
// Loads 16 mask bytes from x6 (post-incremented by 16), negates and
// sign-extends them up to 32-bit lanes (v24-v27), then computes
// tmp2 + (((tmp2 - tmp1) * (-m)) >> 6), narrows with uzp1,
// rounding-right-shifts by intermediate_bits, adds the bias and clamps
// to [0, bitdepth_max].
// Expects (set up by bidir_fn): v28 = PREP_BIAS >> intermediate_bits,
// v29 = -intermediate_bits, v30 = 0, v31 = bitdepth_max.
// Clobbers v24-v27 and \t0-\t3 as scratch.
      80 .macro mask d0, d1, t0, t1, t2, t3
      81        ld1             {v27.16b}, [x6],  16
      82        ld1             {\t0\().8h,\t1\().8h},  [x2],  32
      83        neg             v27.16b, v27.16b
      84        ld1             {\t2\().8h,\t3\().8h},  [x3],  32
      85        sxtl            v26.8h,  v27.8b
      86        sxtl2           v27.8h,  v27.16b
      87        sxtl            v24.4s,  v26.4h
      88        sxtl2           v25.4s,  v26.8h
      89        sxtl            v26.4s,  v27.4h
      90        sxtl2           v27.4s,  v27.8h
      91        ssubl           \d0\().4s,  \t2\().4h,  \t0\().4h
      92        ssubl2          \t0\().4s,  \t2\().8h,  \t0\().8h
      93        ssubl           \d1\().4s,  \t3\().4h,  \t1\().4h
      94        ssubl2          \t1\().4s,  \t3\().8h,  \t1\().8h
      95        mul             \d0\().4s,  \d0\().4s,  v24.4s
      96        mul             \t0\().4s,  \t0\().4s,  v25.4s
      97        mul             \d1\().4s,  \d1\().4s,  v26.4s
      98        mul             \t1\().4s,  \t1\().4s,  v27.4s
      99        sshr            \d0\().4s,  \d0\().4s,  #6
     100        sshr            \t0\().4s,  \t0\().4s,  #6
     101        sshr            \d1\().4s,  \d1\().4s,  #6
     102        sshr            \t1\().4s,  \t1\().4s,  #6
     103        saddw           \d0\().4s,  \d0\().4s,  \t2\().4h
     104        saddw2          \t0\().4s,  \t0\().4s,  \t2\().8h
     105        saddw           \d1\().4s,  \d1\().4s,  \t3\().4h
     106        saddw2          \t1\().4s,  \t1\().4s,  \t3\().8h
     107        uzp1            \d0\().8h,  \d0\().8h,  \t0\().8h  // Same as xtn, xtn2
     108        uzp1            \d1\().8h,  \d1\().8h,  \t1\().8h  // Ditto
     109        srshl           \d0\().8h,  \d0\().8h,  v29.8h // -intermediate_bits
     110        srshl           \d1\().8h,  \d1\().8h,  v29.8h // -intermediate_bits
     111        add             \d0\().8h,  \d0\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
     112        add             \d1\().8h,  \d1\().8h,  v28.8h // PREP_BIAS >> intermediate_bits
     113        smin            \d0\().8h,  \d0\().8h,  v31.8h // bitdepth_max
     114        smin            \d1\().8h,  \d1\().8h,  v31.8h // bitdepth_max
     115        smax            \d0\().8h,  \d0\().8h,  v30.8h // 0
     116        smax            \d1\().8h,  \d1\().8h,  v30.8h // 0
     117 .endm
    118 
// bidir_fn: emits one bi-directional compound function,
// \type\()_16bpc_neon (\type in {avg, w_avg, mask}). \bdmax is the
// register holding bitdepth_max for that signature (w6 for avg,
// w7 for w_avg/mask — see the instantiations at the end of the file).
// Register use visible below: x0/x1 = destination pointer/stride,
// x2/x3 = the two intermediate buffers (consumed by the \type macro),
// w4 = block width (dispatched via clz through \type\()_tbl),
// w5 = height (row counter). Each width case computes rows with the
// \type macro and stores them; the 4/8-wide cases write two rows per
// iteration through x0 and x7 (stride doubled), the 64/128-wide cases
// split each row across x0 and x7 (+64 bytes).
     119 .macro bidir_fn type, bdmax
     120 function \type\()_16bpc_neon, export=1
     121        clz             w4,  w4
     122 .ifnc \type, avg
     123        dup             v31.8h,  \bdmax // bitdepth_max
     124        movi            v30.8h,  #0
     125 .endif
     126        clz             w7,  \bdmax
     127        sub             w7,  w7,  #18   // intermediate_bits = clz(bitdepth_max) - 18
     128 .ifc \type, avg
     129        mov             w9,  #1
     130        mov             w8,  #-2*PREP_BIAS
     131        lsl             w9,  w9,  w7    // 1 << intermediate_bits
     132        add             w7,  w7,  #1
     133        sub             w8,  w8,  w9    // -2*PREP_BIAS - 1 << intermediate_bits
     134        neg             w7,  w7         // -(intermediate_bits+1)
     135        dup             v28.8h,   w8    // -2*PREP_BIAS - 1 << intermediate_bits
     136        dup             v29.8h,   w7    // -(intermediate_bits+1)
     137 .else
     138        mov             w8,  #PREP_BIAS
     139        lsr             w8,  w8,  w7    // PREP_BIAS >> intermediate_bits
     140        neg             w7,  w7         // -intermediate_bits
     141        dup             v28.8h,  w8     // PREP_BIAS >> intermediate_bits
     142        dup             v29.8h,  w7     // -intermediate_bits
     143 .endif
     144 .ifc \type, w_avg
     145        dup             v27.4s,  w6
     146        neg             v27.4s,  v27.4s
     147 .endif
     148        movrel          x7,  \type\()_tbl
     149        sub             w4,  w4,  #24
     150        \type           v4,  v5,  v0,  v1,  v2,  v3
     151        ldrsw           x4,  [x7, x4, lsl #2]
     152        add             x7,  x7,  x4
     153        br              x7
     154 40: // width == 4
     155        AARCH64_VALID_JUMP_TARGET
     156        add             x7,  x0,  x1
     157        lsl             x1,  x1,  #1
     158 4:
     159        subs            w5,  w5,  #4
     160        st1             {v4.8b},    [x0], x1
     161        st1             {v4.d}[1],  [x7], x1
     162        st1             {v5.8b},    [x0], x1
     163        st1             {v5.d}[1],  [x7], x1
     164        b.le            0f
     165        \type           v4,  v5,  v0,  v1,  v2,  v3
     166        b               4b
     167 80: // width == 8
     168        AARCH64_VALID_JUMP_TARGET
     169        add             x7,  x0,  x1
     170        lsl             x1,  x1,  #1
     171 8:
     172        st1             {v4.8h},  [x0], x1
     173        subs            w5,  w5,  #2
     174        st1             {v5.8h},  [x7], x1
     175        b.le            0f
     176        \type           v4,  v5,  v0,  v1,  v2,  v3
     177        b               8b
     178 160: // width == 16
     179        AARCH64_VALID_JUMP_TARGET
     180 16:
     181        \type           v6,  v7,  v0,  v1,  v2,  v3
     182        st1             {v4.8h, v5.8h}, [x0], x1
     183        subs            w5,  w5,  #2
     184        st1             {v6.8h, v7.8h}, [x0], x1
     185        b.le            0f
     186        \type           v4,  v5,  v0,  v1,  v2,  v3
     187        b               16b
     188 320: // width == 32
     189        AARCH64_VALID_JUMP_TARGET
     190 32:
     191        \type           v6,  v7,  v0,  v1,  v2,  v3
     192        subs            w5,  w5,  #1
     193        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
     194        b.le            0f
     195        \type           v4,  v5,  v0,  v1,  v2,  v3
     196        b               32b
     197 640: // width == 64 (row split across x0 and x7 = x0+64)
     198        AARCH64_VALID_JUMP_TARGET
     199        add             x7,  x0,  #64
     200 64:
     201        \type           v6,  v7,  v0,  v1,  v2,  v3
     202        \type           v16, v17, v0,  v1,  v2,  v3
     203        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
     204        \type           v18, v19, v0,  v1,  v2,  v3
     205        subs            w5,  w5,  #1
     206        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
     207        b.le            0f
     208        \type           v4,  v5,  v0,  v1,  v2,  v3
     209        b               64b
     210 1280: // width == 128 (two 64-byte-offset stores per half row)
     211        AARCH64_VALID_JUMP_TARGET
     212        add             x7,  x0,  #64
     213        mov             x8,  #128
     214        sub             x1,  x1,  #128
     215 128:
     216        \type           v6,  v7,  v0,  v1,  v2,  v3
     217        \type           v16, v17, v0,  v1,  v2,  v3
     218        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x8
     219        \type           v18, v19, v0,  v1,  v2,  v3
     220        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x8
     221        \type           v4,  v5,  v0,  v1,  v2,  v3
     222        \type           v6,  v7,  v0,  v1,  v2,  v3
     223        \type           v16, v17, v0,  v1,  v2,  v3
     224        subs            w5,  w5,  #1
     225        st1             {v4.8h, v5.8h, v6.8h, v7.8h},  [x0], x1
     226        \type           v18, v19, v0,  v1,  v2,  v3
     227        st1             {v16.8h,v17.8h,v18.8h,v19.8h}, [x7], x1
     228        b.le            0f
     229        \type           v4,  v5,  v0,  v1,  v2,  v3
     230        b               128b
     231 0:
     232        ret
     233 endfunc
     234 
// Per-type jump table: .word offsets from the table to the width labels,
// indexed by clz(width) - 24 (width 128 first, width 4 last).
     235 jumptable \type\()_tbl
     236        .word 1280b - \type\()_tbl
     237        .word 640b  - \type\()_tbl
     238        .word 320b  - \type\()_tbl
     239        .word 160b  - \type\()_tbl
     240        .word 80b   - \type\()_tbl
     241        .word 40b   - \type\()_tbl
     242 endjumptable
     243 .endm
    244 
// Instantiate the three bi-directional compound functions. The second
// argument names the register carrying bitdepth_max in each signature
// (avg has no mask/weight argument, so it arrives one register earlier).
     245 bidir_fn avg, w6
     246 bidir_fn w_avg, w7
     247 bidir_fn mask, w7
    248 
    249 
// w_mask_fn: emits w_mask_\type\()_16bpc_neon (\type in {444, 422, 420}).
// Blends the two intermediate buffers (x2 = tmp1, x3 = tmp2) into the
// destination x0 (stride x1) using a mask derived from |tmp1 - tmp2|
// (64-m = (27615 - abs(diff)) >> mask_sh), and stores the derived mask
// to x6: full resolution for 444, halved horizontally for 422, halved
// in both dimensions for 420.
// Register use visible below: w4 = width (jump-table dispatch via clz),
// w5 = height, w7 = sign (folded into the 422/420 mask rounding
// constants v3), [sp] = bitdepth_max. The wide path (>= 16) processes
// two rows at once through x0/x12 and x2/x7, x3/x9, with x10 as the
// second-row mask pointer for 444/422.
     250 .macro w_mask_fn type
     251 function w_mask_\type\()_16bpc_neon, export=1
     252        ldr             w8,  [sp]
     253        clz             w9,  w4
     254        movrel          x10, w_mask_\type\()_tbl
     255        dup             v31.8h,  w8   // bitdepth_max
     256        sub             w9,  w9,  #24
     257        clz             w8,  w8       // clz(bitdepth_max)
     258        ldrsw           x9,  [x10,  x9,  lsl #2]
     259        add             x10, x10, x9
     260        sub             w8,  w8,  #12 // sh = intermediate_bits + 6 = clz(bitdepth_max) - 12
     261        mov             w9,  #PREP_BIAS*64
     262        neg             w8,  w8       // -sh
     263        mov             w11, #27615   // (64 + 1 - 38)<<mask_sh - 1 - mask_rnd
     264        dup             v30.4s,  w9   // PREP_BIAS*64
     265        dup             v29.4s,  w8   // -sh
     266        dup             v0.8h,   w11
     267 .if \type == 444
     268        movi            v1.16b,  #64
     269 .elseif \type == 422
     270        dup             v2.8b,   w7
     271        movi            v3.8b,   #129
     272        sub             v3.8b,   v3.8b,   v2.8b
     273 .elseif \type == 420
     274        dup             v2.8h,   w7
     275        movi            v3.8h,   #1, lsl #8
     276        sub             v3.8h,   v3.8h,   v2.8h
     277 .endif
     278        add             x12,  x0,  x1
     279        lsl             x1,   x1,  #1
     280        br              x10
     281 40: // width == 4 (four rows per iteration)
     282        AARCH64_VALID_JUMP_TARGET
     283 4:
     284        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1 (four rows at once)
     285        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2 (four rows at once)
     286        subs            w5,  w5,  #4
     287        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
     288        sabd            v21.8h,  v5.8h,   v7.8h
     289        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
     290        ssubl2          v17.4s,  v6.8h,   v4.8h
     291        ssubl           v18.4s,  v7.4h,   v5.4h
     292        ssubl2          v19.4s,  v7.8h,   v5.8h
     293        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
     294        uqsub           v21.8h,  v0.8h,   v21.8h
     295        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
     296        sshll           v6.4s,   v5.4h,   #6
     297        sshll2          v5.4s,   v4.8h,   #6
     298        sshll           v4.4s,   v4.4h,   #6
     299        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
     300        ushr            v21.8h,  v21.8h,  #10
     301        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
     302        add             v5.4s,   v5.4s,   v30.4s
     303        add             v6.4s,   v6.4s,   v30.4s
     304        add             v7.4s,   v7.4s,   v30.4s
     305        uxtl            v22.4s,  v20.4h
     306        uxtl2           v23.4s,  v20.8h
     307        uxtl            v24.4s,  v21.4h
     308        uxtl2           v25.4s,  v21.8h
     309        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
     310        mla             v5.4s,   v17.4s,  v23.4s
     311        mla             v6.4s,   v18.4s,  v24.4s
     312        mla             v7.4s,   v19.4s,  v25.4s
     313        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
     314        srshl           v5.4s,   v5.4s,   v29.4s
     315        srshl           v6.4s,   v6.4s,   v29.4s
     316        srshl           v7.4s,   v7.4s,   v29.4s
     317        sqxtun          v4.4h,   v4.4s            // iclip_pixel
     318        sqxtun2         v4.8h,   v5.4s
     319        sqxtun          v5.4h,   v6.4s
     320        sqxtun2         v5.8h,   v7.4s
     321        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
     322        umin            v5.8h,   v5.8h,   v31.8h
     323 .if \type == 444
     324        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
     325        sub             v20.16b, v1.16b,  v20.16b // m
     326        st1             {v20.16b}, [x6], #16
     327 .elseif \type == 422
     328        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
     329        xtn             v20.8b,  v20.8h
     330        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
     331        st1             {v20.8b}, [x6], #8
     332 .elseif \type == 420
     333        trn1            v24.2d,  v20.2d,  v21.2d
     334        trn2            v25.2d,  v20.2d,  v21.2d
     335        add             v24.8h,  v24.8h,  v25.8h  // (64 - my1) + (64 - my2) (row wise addition)
     336        addp            v20.8h,  v24.8h,  v24.8h  // (128 - m) + (128 - n) (column wise addition)
     337        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
     338        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
     339        str             s20,        [x6],  #4
     340 .endif
     341        st1             {v4.8b},    [x0],  x1
     342        st1             {v4.d}[1],  [x12], x1
     343        st1             {v5.8b},    [x0],  x1
     344        st1             {v5.d}[1],  [x12], x1
     345        b.gt            4b
     346        ret
     347 80: // width == 8 (two rows per iteration)
     348        AARCH64_VALID_JUMP_TARGET
     349 8:
     350        ld1             {v4.8h, v5.8h}, [x2], #32 // tmp1
     351        ld1             {v6.8h, v7.8h}, [x3], #32 // tmp2
     352        subs            w5,  w5,  #2
     353        sabd            v20.8h,  v4.8h,   v6.8h   // abs(tmp1 - tmp2)
     354        sabd            v21.8h,  v5.8h,   v7.8h
     355        ssubl           v16.4s,  v6.4h,   v4.4h   // tmp2 - tmp1 (requires 17 bit)
     356        ssubl2          v17.4s,  v6.8h,   v4.8h
     357        ssubl           v18.4s,  v7.4h,   v5.4h
     358        ssubl2          v19.4s,  v7.8h,   v5.8h
     359        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
     360        uqsub           v21.8h,  v0.8h,   v21.8h
     361        sshll2          v7.4s,   v5.8h,   #6      // tmp1 << 6
     362        sshll           v6.4s,   v5.4h,   #6
     363        sshll2          v5.4s,   v4.8h,   #6
     364        sshll           v4.4s,   v4.4h,   #6
     365        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
     366        ushr            v21.8h,  v21.8h,  #10
     367        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
     368        add             v5.4s,   v5.4s,   v30.4s
     369        add             v6.4s,   v6.4s,   v30.4s
     370        add             v7.4s,   v7.4s,   v30.4s
     371        uxtl            v22.4s,  v20.4h
     372        uxtl2           v23.4s,  v20.8h
     373        uxtl            v24.4s,  v21.4h
     374        uxtl2           v25.4s,  v21.8h
     375        mla             v4.4s,   v16.4s,  v22.4s  // (tmp2-tmp1)*(64-m)
     376        mla             v5.4s,   v17.4s,  v23.4s
     377        mla             v6.4s,   v18.4s,  v24.4s
     378        mla             v7.4s,   v19.4s,  v25.4s
     379        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
     380        srshl           v5.4s,   v5.4s,   v29.4s
     381        srshl           v6.4s,   v6.4s,   v29.4s
     382        srshl           v7.4s,   v7.4s,   v29.4s
     383        sqxtun          v4.4h,   v4.4s            // iclip_pixel
     384        sqxtun2         v4.8h,   v5.4s
     385        sqxtun          v5.4h,   v6.4s
     386        sqxtun2         v5.8h,   v7.4s
     387        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
     388        umin            v5.8h,   v5.8h,   v31.8h
     389 .if \type == 444
     390        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
     391        sub             v20.16b, v1.16b,  v20.16b // m
     392        st1             {v20.16b}, [x6], #16
     393 .elseif \type == 422
     394        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
     395        xtn             v20.8b,  v20.8h
     396        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
     397        st1             {v20.8b}, [x6], #8
     398 .elseif \type == 420
     399        add             v20.8h,  v20.8h,  v21.8h  // (64 - my1) + (64 - my2) (row wise addition)
     400        addp            v20.8h,  v20.8h,  v20.8h  // (128 - m) + (128 - n) (column wise addition)
     401        sub             v20.4h,  v3.4h,   v20.4h  // (256 - sign) - ((128 - m) + (128 - n))
     402        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
     403        str             s20,     [x6],  #4
     404 .endif
     405        st1             {v4.8h}, [x0],  x1
     406        st1             {v5.8h}, [x12], x1
     407        b.gt            8b
     408        ret
     409 1280: // widths 16..128 share one two-row, 16-pixels-per-step loop
     410 640:
     411 320:
     412 160:
     413        AARCH64_VALID_JUMP_TARGET
     414        mov             w11, w4
     415        sub             x1,  x1,  w4,  uxtw #1
     416 .if \type == 444
     417        add             x10, x6,  w4,  uxtw
     418 .elseif \type == 422
     419        add             x10, x6,  x11, lsr #1
     420 .endif
     421        add             x9,  x3,  w4,  uxtw #1
     422        add             x7,  x2,  w4,  uxtw #1
     423 161: // outer loop: one pair of rows
     424        mov             w8,  w4
     425 16: // inner loop: 16 pixels of both rows
     426        ld1             {v4.8h,   v5.8h},  [x2], #32 // tmp1
     427        ld1             {v16.8h,  v17.8h}, [x3], #32 // tmp2
     428        ld1             {v6.8h,   v7.8h},  [x7], #32
     429        ld1             {v18.8h,  v19.8h}, [x9], #32
     430        subs            w8,  w8,  #16
     431        sabd            v20.8h,  v4.8h,   v16.8h  // abs(tmp1 - tmp2)
     432        sabd            v21.8h,  v5.8h,   v17.8h
     433        ssubl           v22.4s,  v16.4h,  v4.4h   // tmp2 - tmp1 (requires 17 bit)
     434        ssubl2          v23.4s,  v16.8h,  v4.8h
     435        ssubl           v24.4s,  v17.4h,  v5.4h
     436        ssubl2          v25.4s,  v17.8h,  v5.8h
     437        uqsub           v20.8h,  v0.8h,   v20.8h  // 27615 - abs()
     438        uqsub           v21.8h,  v0.8h,   v21.8h
     439        sshll2          v27.4s,  v5.8h,   #6      // tmp1 << 6
     440        sshll           v26.4s,  v5.4h,   #6
     441        sshll2          v5.4s,   v4.8h,   #6
     442        sshll           v4.4s,   v4.4h,   #6
     443        ushr            v20.8h,  v20.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
     444        ushr            v21.8h,  v21.8h,  #10
     445        add             v4.4s,   v4.4s,   v30.4s  // += PREP_BIAS*64
     446        add             v5.4s,   v5.4s,   v30.4s
     447        add             v26.4s,  v26.4s,  v30.4s
     448        add             v27.4s,  v27.4s,  v30.4s
     449        uxtl            v16.4s,  v20.4h
     450        uxtl2           v17.4s,  v20.8h
     451        uxtl            v28.4s,  v21.4h
     452        mla             v4.4s,   v22.4s,  v16.4s  // (tmp2-tmp1)*(64-m)
     453        uxtl2           v16.4s,  v21.8h
     454        mla             v5.4s,   v23.4s,  v17.4s
     455        mla             v26.4s,  v24.4s,  v28.4s
     456        mla             v27.4s,  v25.4s,  v16.4s
     457        srshl           v4.4s,   v4.4s,   v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
     458        srshl           v5.4s,   v5.4s,   v29.4s
     459        srshl           v26.4s,  v26.4s,  v29.4s
     460        srshl           v27.4s,  v27.4s,  v29.4s
     461        sqxtun          v4.4h,   v4.4s            // iclip_pixel
     462        sqxtun2         v4.8h,   v5.4s
     463        sqxtun          v5.4h,   v26.4s
     464        sqxtun2         v5.8h,   v27.4s
     465 
     466        // Start of other half
     467        sabd            v22.8h,  v6.8h,   v18.8h  // abs(tmp1 - tmp2)
     468        sabd            v23.8h,  v7.8h,   v19.8h
     469 
     470        umin            v4.8h,   v4.8h,   v31.8h  // iclip_pixel
     471        umin            v5.8h,   v5.8h,   v31.8h
     472 
     473        ssubl           v16.4s,  v18.4h,  v6.4h   // tmp2 - tmp1 (requires 17 bit)
     474        ssubl2          v17.4s,  v18.8h,  v6.8h
     475        ssubl           v18.4s,  v19.4h,  v7.4h
     476        ssubl2          v19.4s,  v19.8h,  v7.8h
     477        uqsub           v22.8h,  v0.8h,   v22.8h  // 27615 - abs()
     478        uqsub           v23.8h,  v0.8h,   v23.8h
     479        sshll           v24.4s,  v6.4h,   #6      // tmp1 << 6
     480        sshll2          v25.4s,  v6.8h,   #6
     481        sshll           v26.4s,  v7.4h,   #6
     482        sshll2          v27.4s,  v7.8h,   #6
     483        ushr            v22.8h,  v22.8h,  #10     // 64-m = (27615 - abs()) >> mask_sh
     484        ushr            v23.8h,  v23.8h,  #10
     485        add             v24.4s,  v24.4s,  v30.4s  // += PREP_BIAS*64
     486        add             v25.4s,  v25.4s,  v30.4s
     487        add             v26.4s,  v26.4s,  v30.4s
     488        add             v27.4s,  v27.4s,  v30.4s
     489        uxtl            v6.4s,   v22.4h
     490        uxtl2           v7.4s,   v22.8h
     491        uxtl            v28.4s,  v23.4h
     492        mla             v24.4s,  v16.4s,  v6.4s   // (tmp2-tmp1)*(64-m)
     493        uxtl2           v6.4s,   v23.8h
     494        mla             v25.4s,  v17.4s,  v7.4s
     495        mla             v26.4s,  v18.4s,  v28.4s
     496        mla             v27.4s,  v19.4s,  v6.4s
     497        srshl           v24.4s,  v24.4s,  v29.4s  // (tmp1<<6 + (tmp2-tmp1)*(64-m) + (1 << (sh-1)) + PREP_BIAS*64) >> sh
     498        srshl           v25.4s,  v25.4s,  v29.4s
     499        srshl           v26.4s,  v26.4s,  v29.4s
     500        srshl           v27.4s,  v27.4s,  v29.4s
     501        sqxtun          v6.4h,   v24.4s           // iclip_pixel
     502        sqxtun2         v6.8h,   v25.4s
     503        sqxtun          v7.4h,   v26.4s
     504        sqxtun2         v7.8h,   v27.4s
     505        umin            v6.8h,   v6.8h,   v31.8h  // iclip_pixel
     506        umin            v7.8h,   v7.8h,   v31.8h
     507 .if \type == 444
     508        uzp1            v20.16b, v20.16b, v21.16b // 64 - m
     509        uzp1            v21.16b, v22.16b, v23.16b
     510        sub             v20.16b, v1.16b,  v20.16b // m
     511        sub             v21.16b, v1.16b,  v21.16b
     512        st1             {v20.16b}, [x6],  #16
     513        st1             {v21.16b}, [x10], #16
     514 .elseif \type == 422
     515        addp            v20.8h,  v20.8h,  v21.8h  // (64 - m) + (64 - n) (column wise addition)
     516        addp            v21.8h,  v22.8h,  v23.8h
     517        xtn             v20.8b,  v20.8h
     518        xtn             v21.8b,  v21.8h
     519        uhsub           v20.8b,  v3.8b,   v20.8b  // ((129 - sign) - ((64 - m) + (64 - n)) >> 1
     520        uhsub           v21.8b,  v3.8b,   v21.8b
     521        st1             {v20.8b}, [x6],  #8
     522        st1             {v21.8b}, [x10], #8
     523 .elseif \type == 420
     524        add             v20.8h,  v20.8h,  v22.8h  // (64 - my1) + (64 - my2) (row wise addition)
     525        add             v21.8h,  v21.8h,  v23.8h
     526        addp            v20.8h,  v20.8h,  v21.8h  // (128 - m) + (128 - n) (column wise addition)
     527        sub             v20.8h,  v3.8h,   v20.8h  // (256 - sign) - ((128 - m) + (128 - n))
     528        rshrn           v20.8b,  v20.8h,  #2      // ((256 - sign) - ((128 - m) + (128 - n)) + 2) >> 2
     529        st1             {v20.8b}, [x6], #8
     530 .endif
     531        st1             {v4.8h, v5.8h}, [x0],  #32
     532        st1             {v6.8h, v7.8h}, [x12], #32
     533        b.gt            16b
     534        subs            w5,  w5,  #2
     535        add             x2,  x2,  w4,  uxtw #1
     536        add             x3,  x3,  w4,  uxtw #1
     537        add             x7,  x7,  w4,  uxtw #1
     538        add             x9,  x9,  w4,  uxtw #1
     539 .if \type == 444
     540        add             x6,  x6,  w4,  uxtw
     541        add             x10, x10, w4,  uxtw
     542 .elseif \type == 422
     543        add             x6,  x6,  x11, lsr #1
     544        add             x10, x10, x11, lsr #1
     545 .endif
     546        add             x0,  x0,  x1
     547        add             x12, x12, x1
     548        b.gt            161b
     549        ret
     550 endfunc
     551 
// Jump table indexed by clz(width) - 24 (width 128 first, width 4 last).
     552 jumptable w_mask_\type\()_tbl
     553        .word 1280b - w_mask_\type\()_tbl
     554        .word 640b  - w_mask_\type\()_tbl
     555        .word 320b  - w_mask_\type\()_tbl
     556        .word 160b  - w_mask_\type\()_tbl
     557        .word 80b   - w_mask_\type\()_tbl
     558        .word 40b   - w_mask_\type\()_tbl
     559 endjumptable
     560 .endm
    561 
// Instantiate w_mask for the three mask subsamplings: 444 (full-res
// mask), 422 (horizontally halved), 420 (halved in both dimensions).
     562 w_mask_fn 444
     563 w_mask_fn 422
     564 w_mask_fn 420
    565 
    566 
    // 16 bpc masked blend of an int16 temporary buffer into dst.
    // Per pixel (see the inline comments below):
    //   dst += ((dst - tmp) * -m + 32) >> 6
    // which is effectively dst = (dst*(64-m) + tmp*m + 32) >> 6.
    // The (x + 32) >> 6 rounding is obtained with sqrdmulh against
    // (-m << 9): sqrdmulh(d, -m<<9) = (2*d*(-m<<9) + 2^15) >> 16
    //                               = (d*-m + 32) >> 6.
    // NOTE(review): register roles inferred from the code below —
    // x0 = dst, x1 = dst stride (bytes), x2 = int16 tmp, w3 = width,
    // w4 = height, x5 = per-pixel byte mask; confirm against the C
    // prototype of the blend function.
    567 function blend_16bpc_neon, export=1
    568        movrel          x6,  blend_tbl
        // Jump-table index = clz(w) - 26, so w in {4, 8, 16, 32}.
    569        clz             w3,  w3
    570        sub             w3,  w3,  #26
    571        ldrsw           x3,  [x6,  x3,  lsl #2]
    572        add             x6,  x6,  x3
        // x8 = second dst row pointer; most cases process 2 rows/iter.
    573        add             x8,  x0,  x1
    574        br              x6
        // w == 4: two rows (8 pixels) per iteration.
    575 40:
    576        AARCH64_VALID_JUMP_TARGET
    577        lsl             x1,  x1,  #1
    578 4:
    579        ld1             {v2.8b},   [x5], #8
    580        ld1             {v1.8h},   [x2], #16
    581        ldr             d0,        [x0]
    582        neg             v2.8b,   v2.8b            // -m
    583        subs            w4,  w4,  #2
    584        ld1             {v0.d}[1], [x8]
    585        sxtl            v2.8h,   v2.8b
    586        shl             v2.8h,   v2.8h,   #9      // -m << 9
    587        sub             v1.8h,   v0.8h,   v1.8h   // a - b
    588        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
    589        add             v0.8h,   v0.8h,   v1.8h
    590        st1             {v0.8b},   [x0], x1
    591        st1             {v0.d}[1], [x8], x1
    592        b.gt            4b
    593        ret
        // w == 8: two rows per iteration, one 8h vector per row.
    594 80:
    595        AARCH64_VALID_JUMP_TARGET
    596        lsl             x1,  x1,  #1
    597 8:
    598        ld1             {v4.16b},       [x5], #16
    599        ld1             {v2.8h, v3.8h}, [x2], #32
    600        neg             v5.16b,  v4.16b           // -m
    601        ld1             {v0.8h},   [x0]
    602        ld1             {v1.8h},   [x8]
    603        sxtl            v4.8h,   v5.8b
    604        sxtl2           v5.8h,   v5.16b
    605        shl             v4.8h,   v4.8h,   #9      // -m << 9
    606        shl             v5.8h,   v5.8h,   #9
    607        sub             v2.8h,   v0.8h,   v2.8h   // a - b
    608        sub             v3.8h,   v1.8h,   v3.8h
    609        subs            w4,  w4,  #2
    610        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
    611        sqrdmulh        v3.8h,   v3.8h,   v5.8h
    612        add             v0.8h,   v0.8h,   v2.8h
    613        add             v1.8h,   v1.8h,   v3.8h
    614        st1             {v0.8h}, [x0], x1
    615        st1             {v1.8h}, [x8], x1
    616        b.gt            8b
    617        ret
        // w == 16: two rows per iteration, two 8h vectors per row.
    618 160:
    619        AARCH64_VALID_JUMP_TARGET
    620        lsl             x1,  x1,  #1
    621 16:
    622        ld1             {v16.16b, v17.16b},           [x5], #32
    623        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
    624        subs            w4,  w4,  #2
    625        neg             v18.16b, v16.16b          // -m
    626        neg             v19.16b, v17.16b
    627        ld1             {v0.8h, v1.8h}, [x0]
    628        sxtl            v16.8h,  v18.8b
    629        sxtl2           v17.8h,  v18.16b
    630        sxtl            v18.8h,  v19.8b
    631        sxtl2           v19.8h,  v19.16b
    632        ld1             {v2.8h, v3.8h}, [x8]
    633        shl             v16.8h,  v16.8h,  #9      // -m << 9
    634        shl             v17.8h,  v17.8h,  #9
    635        shl             v18.8h,  v18.8h,  #9
    636        shl             v19.8h,  v19.8h,  #9
    637        sub             v4.8h,   v0.8h,   v4.8h   // a - b
    638        sub             v5.8h,   v1.8h,   v5.8h
    639        sub             v6.8h,   v2.8h,   v6.8h
    640        sub             v7.8h,   v3.8h,   v7.8h
    641        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
    642        sqrdmulh        v5.8h,   v5.8h,   v17.8h
    643        sqrdmulh        v6.8h,   v6.8h,   v18.8h
    644        sqrdmulh        v7.8h,   v7.8h,   v19.8h
    645        add             v0.8h,   v0.8h,   v4.8h
    646        add             v1.8h,   v1.8h,   v5.8h
    647        add             v2.8h,   v2.8h,   v6.8h
    648        add             v3.8h,   v3.8h,   v7.8h
    649        st1             {v0.8h, v1.8h}, [x0], x1
    650        st1             {v2.8h, v3.8h}, [x8], x1
    651        b.gt            16b
    652        ret
        // w == 32: one row (four 8h vectors) per iteration; only x0 is
        // used for dst here, x8 is ignored.
    653 320:
    654        AARCH64_VALID_JUMP_TARGET
    655 32:
    656        ld1             {v16.16b, v17.16b},           [x5], #32
    657        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
    658        subs            w4,  w4,  #1
    659        neg             v18.16b, v16.16b          // -m
    660        neg             v19.16b, v17.16b
    661        sxtl            v16.8h,  v18.8b
    662        sxtl2           v17.8h,  v18.16b
    663        sxtl            v18.8h,  v19.8b
    664        sxtl2           v19.8h,  v19.16b
    665        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
    666        shl             v16.8h,  v16.8h,  #9      // -m << 9
    667        shl             v17.8h,  v17.8h,  #9
    668        shl             v18.8h,  v18.8h,  #9
    669        shl             v19.8h,  v19.8h,  #9
    670        sub             v4.8h,   v0.8h,   v4.8h   // a - b
    671        sub             v5.8h,   v1.8h,   v5.8h
    672        sub             v6.8h,   v2.8h,   v6.8h
    673        sub             v7.8h,   v3.8h,   v7.8h
    674        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
    675        sqrdmulh        v5.8h,   v5.8h,   v17.8h
    676        sqrdmulh        v6.8h,   v6.8h,   v18.8h
    677        sqrdmulh        v7.8h,   v7.8h,   v19.8h
    678        add             v0.8h,   v0.8h,   v4.8h
    679        add             v1.8h,   v1.8h,   v5.8h
    680        add             v2.8h,   v2.8h,   v6.8h
    681        add             v3.8h,   v3.8h,   v7.8h
    682        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    683        b.gt            32b
    684        ret
    685 endfunc
    686 
    // Position-independent offsets from blend_tbl to the per-width
    // entry points above, ordered for indexing by clz(w)-26
    // (largest width first).
    687 jumptable blend_tbl
    688        .word 320b - blend_tbl
    689        .word 160b - blend_tbl
    690        .word 80b  - blend_tbl
    691        .word 40b  - blend_tbl
    692 endjumptable
    693 
    // 16 bpc horizontal-edge OBMC blend: same per-pixel formula as
    // blend_16bpc_neon, but the mask value is constant per ROW and is
    // taken from the obmc_masks table indexed by height. Only the top
    // h - h/4 rows are blended (see the w4 adjustment below). Two rows
    // are processed per iteration with per-row masks broadcast by ld2r.
    // NOTE(review): register roles inferred from the code —
    // x0 = dst, x1 = stride, x2 = int16 tmp, w3 = width, w4 = height;
    // x5 is overwritten with the obmc_masks pointer. Confirm against
    // the C prototype.
    694 function blend_h_16bpc_neon, export=1
    695        movrel          x6,  blend_h_tbl
    696        movrel          x5,  X(obmc_masks)
    697        add             x5,  x5,  w4,  uxtw
        // Blend only 3/4 of the rows: h -= h >> 2.
    698        sub             w4,  w4,  w4,  lsr #2
    699        clz             w7,  w3
    700        add             x8,  x0,  x1
    701        lsl             x1,  x1,  #1
        // Jump-table index = clz(w) - 24, so w in {2 .. 128}.
    702        sub             w7,  w7,  #24
    703        ldrsw           x7,  [x6,  x7,  lsl #2]
    704        add             x6,  x6,  x7
    705        br              x6
        // w == 2: two rows per iteration; masks packed with ext so the
        // low 4h lanes hold {m0, m0, m1, m1}.
    706 20:
    707        AARCH64_VALID_JUMP_TARGET
    708 2:
    709        ld2r            {v2.8b, v3.8b}, [x5], #2
    710        ld1             {v1.4h},        [x2], #8
    711        ext             v2.8b,   v2.8b,   v3.8b,   #6
    712        subs            w4,  w4,  #2
    713        neg             v2.8b,   v2.8b            // -m
    714        ldr             s0,        [x0]
    715        ld1             {v0.s}[1], [x8]
    716        sxtl            v2.8h,   v2.8b
    717        shl             v2.4h,   v2.4h,   #9      // -m << 9
    718        sub             v1.4h,   v0.4h,   v1.4h   // a - b
    719        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
    720        add             v0.4h,   v0.4h,   v1.4h
    721        st1             {v0.s}[0], [x0], x1
    722        st1             {v0.s}[1], [x8], x1
    723        b.gt            2b
    724        ret
        // w == 4: two rows per iteration; ext packs 4 lanes of m0
        // followed by 4 lanes of m1.
    725 40:
    726        AARCH64_VALID_JUMP_TARGET
    727 4:
    728        ld2r            {v2.8b, v3.8b}, [x5], #2
    729        ld1             {v1.8h},        [x2], #16
    730        ext             v2.8b,   v2.8b,   v3.8b,   #4
    731        subs            w4,  w4,  #2
    732        neg             v2.8b,   v2.8b            // -m
    733        ldr             d0,          [x0]
    734        ld1             {v0.d}[1],   [x8]
    735        sxtl            v2.8h,   v2.8b
    736        shl             v2.8h,   v2.8h,   #9      // -m << 9
    737        sub             v1.8h,   v0.8h,   v1.8h   // a - b
    738        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
    739        add             v0.8h,   v0.8h,   v1.8h
    740        st1             {v0.8b},   [x0], x1
    741        st1             {v0.d}[1], [x8], x1
    742        b.gt            4b
    743        ret
        // w == 8: two rows per iteration, one row-constant mask each.
    744 80:
    745        AARCH64_VALID_JUMP_TARGET
    746 8:
    747        ld2r            {v4.8b, v5.8b}, [x5], #2
    748        ld1             {v2.8h, v3.8h}, [x2], #32
    749        neg             v4.8b,   v4.8b            // -m
    750        neg             v5.8b,   v5.8b
    751        ld1             {v0.8h}, [x0]
    752        subs            w4,  w4,  #2
    753        sxtl            v4.8h,   v4.8b
    754        sxtl            v5.8h,   v5.8b
    755        ld1             {v1.8h}, [x8]
    756        shl             v4.8h,   v4.8h,   #9      // -m << 9
    757        shl             v5.8h,   v5.8h,   #9
    758        sub             v2.8h,   v0.8h,   v2.8h   // a - b
    759        sub             v3.8h,   v1.8h,   v3.8h
    760        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
    761        sqrdmulh        v3.8h,   v3.8h,   v5.8h
    762        add             v0.8h,   v0.8h,   v2.8h
    763        add             v1.8h,   v1.8h,   v3.8h
    764        st1             {v0.8h}, [x0], x1
    765        st1             {v1.8h}, [x8], x1
    766        b.gt            8b
    767        ret
        // w == 16: two rows per iteration; v16 masks row 0, v17 row 1.
    768 160:
    769        AARCH64_VALID_JUMP_TARGET
    770 16:
    771        ld2r            {v16.8b, v17.8b}, [x5], #2
    772        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
    773        neg             v16.8b,  v16.8b           // -m
    774        neg             v17.8b,  v17.8b
    775        ld1             {v0.8h, v1.8h},  [x0]
    776        ld1             {v2.8h, v3.8h},  [x8]
    777        subs            w4,  w4,  #2
    778        sxtl            v16.8h,  v16.8b
    779        sxtl            v17.8h,  v17.8b
    780        shl             v16.8h,  v16.8h,  #9      // -m << 9
    781        shl             v17.8h,  v17.8h,  #9
    782        sub             v4.8h,   v0.8h,   v4.8h   // a - b
    783        sub             v5.8h,   v1.8h,   v5.8h
    784        sub             v6.8h,   v2.8h,   v6.8h
    785        sub             v7.8h,   v3.8h,   v7.8h
    786        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
    787        sqrdmulh        v5.8h,   v5.8h,   v16.8h
    788        sqrdmulh        v6.8h,   v6.8h,   v17.8h
    789        sqrdmulh        v7.8h,   v7.8h,   v17.8h
    790        add             v0.8h,   v0.8h,   v4.8h
    791        add             v1.8h,   v1.8h,   v5.8h
    792        add             v2.8h,   v2.8h,   v6.8h
    793        add             v3.8h,   v3.8h,   v7.8h
    794        st1             {v0.8h, v1.8h}, [x0], x1
    795        st1             {v2.8h, v3.8h}, [x8], x1
    796        b.gt            16b
    797        ret
        // w >= 32: shared path; inner loop (32:) blends 32 columns at a
        // time for two rows, w6 counts remaining columns in the row.
        // x7 = tmp pointer for the second row; x1 was pre-adjusted so
        // the post-row addition skips to the next row pair.
    798 1280:
    799 640:
    800 320:
    801        AARCH64_VALID_JUMP_TARGET
    802        sub             x1,  x1,  w3,  uxtw #1
    803        add             x7,  x2,  w3,  uxtw #1
    804 321:
    805        ld2r            {v24.8b, v25.8b}, [x5], #2
    806        mov             w6,  w3
    807        neg             v24.8b,  v24.8b           // -m
    808        neg             v25.8b,  v25.8b
    809        sxtl            v24.8h,  v24.8b
    810        sxtl            v25.8h,  v25.8b
    811        shl             v24.8h,  v24.8h,  #9      // -m << 9
    812        shl             v25.8h,  v25.8h,  #9
    813 32:
    814        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
    815        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0]
    816        subs            w6,  w6,  #32
    817        sub             v16.8h,  v0.8h,   v16.8h  // a - b
    818        sub             v17.8h,  v1.8h,   v17.8h
    819        sub             v18.8h,  v2.8h,   v18.8h
    820        sub             v19.8h,  v3.8h,   v19.8h
    821        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x7], #64
    822        ld1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8]
    823        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
    824        sqrdmulh        v17.8h,  v17.8h,  v24.8h
    825        sqrdmulh        v18.8h,  v18.8h,  v24.8h
    826        sqrdmulh        v19.8h,  v19.8h,  v24.8h
    827        sub             v20.8h,  v4.8h,   v20.8h  // a - b
    828        sub             v21.8h,  v5.8h,   v21.8h
    829        sub             v22.8h,  v6.8h,   v22.8h
    830        sub             v23.8h,  v7.8h,   v23.8h
    831        add             v0.8h,   v0.8h,   v16.8h
    832        add             v1.8h,   v1.8h,   v17.8h
    833        add             v2.8h,   v2.8h,   v18.8h
    834        add             v3.8h,   v3.8h,   v19.8h
    835        sqrdmulh        v20.8h,  v20.8h,  v25.8h  // ((a-b)*-m + 32) >> 6
    836        sqrdmulh        v21.8h,  v21.8h,  v25.8h
    837        sqrdmulh        v22.8h,  v22.8h,  v25.8h
    838        sqrdmulh        v23.8h,  v23.8h,  v25.8h
    839        st1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x0], #64
    840        add             v4.8h,   v4.8h,   v20.8h
    841        add             v5.8h,   v5.8h,   v21.8h
    842        add             v6.8h,   v6.8h,   v22.8h
    843        add             v7.8h,   v7.8h,   v23.8h
    844        st1             {v4.8h,  v5.8h,  v6.8h,  v7.8h},  [x8], #64
    845        b.gt            32b
        // Advance past the interleaved second-row tmp data and to the
        // next pair of dst rows.
    846        subs            w4,  w4,  #2
    847        add             x0,  x0,  x1
    848        add             x8,  x8,  x1
    849        add             x2,  x2,  w3,  uxtw #1
    850        add             x7,  x7,  w3,  uxtw #1
    851        b.gt            321b
    852        ret
    853 endfunc
    854 
    // Position-independent offsets from blend_h_tbl to the per-width
    // entry points above, indexed by clz(w)-24 (w = 128 first).
    855 jumptable blend_h_tbl
    856        .word 1280b - blend_h_tbl
    857        .word 640b  - blend_h_tbl
    858        .word 320b  - blend_h_tbl
    859        .word 160b  - blend_h_tbl
    860        .word 80b   - blend_h_tbl
    861        .word 40b   - blend_h_tbl
    862        .word 20b   - blend_h_tbl
    863 endjumptable
    864 
    // 16 bpc vertical-edge OBMC blend: same per-pixel formula as
    // blend_16bpc_neon, but the mask is constant per COLUMN, taken
    // from obmc_masks indexed by width, and prepared once before the
    // row loop. Only the leftmost part of each row is written back
    // (1 of 2, 3 of 4, 6 of 8, 12 of 16, 24 of 32 pixels — see the
    // partial stores in each case).
    // NOTE(review): register roles inferred from the code —
    // x0 = dst, x1 = stride, x2 = int16 tmp, w3 = width, w4 = height;
    // x5 is overwritten with the obmc_masks pointer. Confirm against
    // the C prototype.
    865 function blend_v_16bpc_neon, export=1
    866        movrel          x6,  blend_v_tbl
    867        movrel          x5,  X(obmc_masks)
    868        add             x5,  x5,  w3,  uxtw
        // Jump-table index = clz(w) - 26, so w in {2 .. 32}.
    869        clz             w3,  w3
    870        add             x8,  x0,  x1
    871        lsl             x1,  x1,  #1
    872        sub             w3,  w3,  #26
    873        ldrsw           x3,  [x6,  x3,  lsl #2]
    874        add             x6,  x6,  x3
    875        br              x6
        // w == 2: two rows per iteration; only column 0 is stored.
    876 20:
    877        AARCH64_VALID_JUMP_TARGET
    878        ld1r            {v2.8b}, [x5]
    879        neg             v2.8b,   v2.8b            // -m
    880        sxtl            v2.8h,   v2.8b
    881        shl             v2.4h,   v2.4h,   #9      // -m << 9
    882 2:
    883        ldr             s1,  [x2],  #4
    884        ldr             h0,  [x0]
    885        subs            w4,  w4,  #2
    886        ld1             {v1.h}[1], [x2]
    887        ld1             {v0.h}[1], [x8]
    888        add             x2,  x2,  #4
    889        sub             v1.4h,   v0.4h,   v1.4h   // a - b
    890        sqrdmulh        v1.4h,   v1.4h,   v2.4h   // ((a-b)*-m + 32) >> 6
    891        add             v0.4h,   v0.4h,   v1.4h
    892        st1             {v0.h}[0], [x0],  x1
    893        st1             {v0.h}[1], [x8],  x1
    894        b.gt            2b
    895        ret
        // w == 4: two rows per iteration; 3 of 4 columns stored
        // (one 32-bit store plus one 16-bit store per row).
    896 40:
    897        AARCH64_VALID_JUMP_TARGET
    898        ld1r            {v2.2s}, [x5]
    899        sub             x1,  x1,  #4
    900        neg             v2.8b,   v2.8b            // -m
    901        sxtl            v2.8h,   v2.8b
    902        shl             v2.8h,   v2.8h,   #9      // -m << 9
    903 4:
    904        ld1             {v1.8h},   [x2], #16
    905        ldr             d0,        [x0]
    906        ld1             {v0.d}[1], [x8]
    907        subs            w4,  w4,  #2
    908        sub             v1.8h,   v0.8h,   v1.8h   // a - b
    909        sqrdmulh        v1.8h,   v1.8h,   v2.8h   // ((a-b)*-m + 32) >> 6
    910        add             v0.8h,   v0.8h,   v1.8h
    911        str             s0,        [x0], #4
    912        st1             {v0.s}[2], [x8], #4
    913        st1             {v0.h}[2], [x0], x1
    914        st1             {v0.h}[6], [x8], x1
    915        b.gt            4b
    916        ret
        // w == 8: two rows per iteration; 6 of 8 columns stored.
    917 80:
    918        AARCH64_VALID_JUMP_TARGET
    919        ld1             {v4.8b}, [x5]
    920        sub             x1,  x1,  #8
    921        neg             v4.8b,   v4.8b            // -m
    922        sxtl            v4.8h,   v4.8b
    923        shl             v4.8h,   v4.8h,   #9      // -m << 9
    924 8:
    925        ld1             {v2.8h, v3.8h}, [x2], #32
    926        ld1             {v0.8h}, [x0]
    927        ld1             {v1.8h}, [x8]
    928        subs            w4,  w4,  #2
    929        sub             v2.8h,   v0.8h,   v2.8h   // a - b
    930        sub             v3.8h,   v1.8h,   v3.8h
    931        sqrdmulh        v2.8h,   v2.8h,   v4.8h   // ((a-b)*-m + 32) >> 6
    932        sqrdmulh        v3.8h,   v3.8h,   v4.8h
    933        add             v0.8h,   v0.8h,   v2.8h
    934        add             v1.8h,   v1.8h,   v3.8h
    935        str             d0,        [x0], #8
    936        str             d1,        [x8], #8
    937        st1             {v0.s}[2], [x0], x1
    938        st1             {v1.s}[2], [x8], x1
    939        b.gt            8b
    940        ret
        // w == 16: two rows per iteration; 12 of 16 columns stored,
        // hence the 4h operations on the second half of each row.
    941 160:
    942        AARCH64_VALID_JUMP_TARGET
    943        ld1             {v16.16b}, [x5]
    944        sub             x1,  x1,  #16
    945        neg             v17.16b, v16.16b          // -m
    946        sxtl            v16.8h,  v17.8b
    947        sxtl2           v17.8h,  v17.16b
    948        shl             v16.8h,  v16.8h,  #9      // -m << 9
    949        shl             v17.4h,  v17.4h,  #9
    950 16:
    951        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
    952        ld1             {v0.8h, v1.8h}, [x0]
    953        subs            w4,  w4,  #2
    954        ld1             {v2.8h, v3.8h}, [x8]
    955        sub             v4.8h,   v0.8h,   v4.8h   // a - b
    956        sub             v5.4h,   v1.4h,   v5.4h
    957        sub             v6.8h,   v2.8h,   v6.8h
    958        sub             v7.4h,   v3.4h,   v7.4h
    959        sqrdmulh        v4.8h,   v4.8h,   v16.8h  // ((a-b)*-m + 32) >> 6
    960        sqrdmulh        v5.4h,   v5.4h,   v17.4h
    961        sqrdmulh        v6.8h,   v6.8h,   v16.8h
    962        sqrdmulh        v7.4h,   v7.4h,   v17.4h
    963        add             v0.8h,   v0.8h,   v4.8h
    964        add             v1.4h,   v1.4h,   v5.4h
    965        add             v2.8h,   v2.8h,   v6.8h
    966        add             v3.4h,   v3.4h,   v7.4h
    967        st1             {v0.8h}, [x0], #16
    968        st1             {v2.8h}, [x8], #16
    969        st1             {v1.4h}, [x0], x1
    970        st1             {v3.4h}, [x8], x1
    971        b.gt            16b
    972        ret
        // w == 32: two rows per iteration; 24 of 32 columns stored
        // (three 8h vectors per row), masks in v24/v25/v26.
    973 320:
    974        AARCH64_VALID_JUMP_TARGET
    975        ld1             {v24.16b, v25.16b},  [x5]
    976        neg             v26.16b, v24.16b          // -m
    977        neg             v27.8b,  v25.8b
    978        sxtl            v24.8h,  v26.8b
    979        sxtl2           v25.8h,  v26.16b
    980        sxtl            v26.8h,  v27.8b
    981        shl             v24.8h,  v24.8h,  #9      // -m << 9
    982        shl             v25.8h,  v25.8h,  #9
    983        shl             v26.8h,  v26.8h,  #9
    984 32:
    985        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x2], #64
    986        ld1             {v0.8h, v1.8h, v2.8h}, [x0]
    987        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2], #64
    988        ld1             {v4.8h, v5.8h, v6.8h}, [x8]
    989        subs            w4,  w4,  #2
    990        sub             v16.8h,  v0.8h,   v16.8h  // a - b
    991        sub             v17.8h,  v1.8h,   v17.8h
    992        sub             v18.8h,  v2.8h,   v18.8h
    993        sub             v20.8h,  v4.8h,   v20.8h
    994        sub             v21.8h,  v5.8h,   v21.8h
    995        sub             v22.8h,  v6.8h,   v22.8h
    996        sqrdmulh        v16.8h,  v16.8h,  v24.8h  // ((a-b)*-m + 32) >> 6
    997        sqrdmulh        v17.8h,  v17.8h,  v25.8h
    998        sqrdmulh        v18.8h,  v18.8h,  v26.8h
    999        sqrdmulh        v20.8h,  v20.8h,  v24.8h
   1000        sqrdmulh        v21.8h,  v21.8h,  v25.8h
   1001        sqrdmulh        v22.8h,  v22.8h,  v26.8h
   1002        add             v0.8h,   v0.8h,   v16.8h
   1003        add             v1.8h,   v1.8h,   v17.8h
   1004        add             v2.8h,   v2.8h,   v18.8h
   1005        add             v4.8h,   v4.8h,   v20.8h
   1006        add             v5.8h,   v5.8h,   v21.8h
   1007        add             v6.8h,   v6.8h,   v22.8h
   1008        st1             {v0.8h, v1.8h, v2.8h}, [x0], x1
   1009        st1             {v4.8h, v5.8h, v6.8h}, [x8], x1
   1010        b.gt            32b
   1011        ret
   1012 endfunc
   1013 
    // Position-independent offsets from blend_v_tbl to the per-width
    // entry points above, indexed by clz(w)-26 (w = 32 first).
   1014 jumptable blend_v_tbl
   1015        .word 320b - blend_v_tbl
   1016        .word 160b - blend_v_tbl
   1017        .word 80b  - blend_v_tbl
   1018        .word 40b  - blend_v_tbl
   1019        .word 20b  - blend_v_tbl
   1020 endjumptable
   1021 
   1022 
   1023 // This has got the same signature as the put_8tap functions,
   1024 // and assumes that x9 is set to (clz(w)-24).
    // Plain (unfiltered) 16 bpc block copy: src x2 / stride x3 to
    // dst x0 / stride x1, w5 = height. The width is selected via
    // put_16bpc_tbl with x9 pre-set to clz(w)-24 by the caller (see
    // the comment above this function). Narrow widths use NEON
    // loads/stores; 16/32 use GPR ldp/stp pairs; 64/128 use q-register
    // ldp/stp to move 128/256 bytes per row.
   1025 function put_16bpc_neon, export=1
   1026        movrel          x10, put_16bpc_tbl
   1027        ldrsw           x9, [x10, x9, lsl #2]
   1028        add             x10, x10, x9
   1029        br              x10
   1030 
        // w == 2 (4 bytes/row): ld1r loads 4 bytes and replicates;
        // only lane [0] is stored.
   1031 20:
   1032        AARCH64_VALID_JUMP_TARGET
   1033 2:
   1034        ld1r            {v0.4s},   [x2], x3
   1035        ld1r            {v1.4s},   [x2], x3
   1036        subs            w5,  w5,  #2
   1037        st1             {v0.s}[0], [x0], x1
   1038        st1             {v1.s}[0], [x0], x1
   1039        b.gt            2b
   1040        ret
        // w == 4 (8 bytes/row), two rows per iteration.
   1041 40:
   1042        AARCH64_VALID_JUMP_TARGET
   1043 4:
   1044        ld1             {v0.4h}, [x2], x3
   1045        ld1             {v1.4h}, [x2], x3
   1046        subs            w5,  w5,  #2
   1047        st1             {v0.4h}, [x0], x1
   1048        st1             {v1.4h}, [x0], x1
   1049        b.gt            4b
   1050        ret
        // w == 8 (16 bytes/row), two rows per iteration via x0/x8 and
        // x2/x9 row-pair pointers.
   1051 80:
   1052        AARCH64_VALID_JUMP_TARGET
   1053        add             x8,  x0,  x1
   1054        lsl             x1,  x1,  #1
   1055        add             x9,  x2,  x3
   1056        lsl             x3,  x3,  #1
   1057 8:
   1058        ld1             {v0.8h}, [x2], x3
   1059        ld1             {v1.8h}, [x9], x3
   1060        subs            w5,  w5,  #2
   1061        st1             {v0.8h}, [x0], x1
   1062        st1             {v1.8h}, [x8], x1
   1063        b.gt            8b
   1064        ret
        // w == 16 (32 bytes/row): copy via GPR pairs.
   1065 160:
   1066        AARCH64_VALID_JUMP_TARGET
   1067 16:
   1068        ldp             x6,  x7,  [x2]
   1069        ldp             x8,  x9,  [x2, #16]
   1070        stp             x6,  x7,  [x0]
   1071        subs            w5,  w5,  #1
   1072        stp             x8,  x9,  [x0, #16]
   1073        add             x2,  x2,  x3
   1074        add             x0,  x0,  x1
   1075        b.gt            16b
   1076        ret
        // w == 32 (64 bytes/row): copy via GPR pairs.
   1077 320:
   1078        AARCH64_VALID_JUMP_TARGET
   1079 32:
   1080        ldp             x6,  x7,  [x2]
   1081        ldp             x8,  x9,  [x2, #16]
   1082        stp             x6,  x7,  [x0]
   1083        ldp             x10, x11, [x2, #32]
   1084        stp             x8,  x9,  [x0, #16]
   1085        subs            w5,  w5,  #1
   1086        ldp             x12, x13, [x2, #48]
   1087        stp             x10, x11, [x0, #32]
   1088        stp             x12, x13, [x0, #48]
   1089        add             x2,  x2,  x3
   1090        add             x0,  x0,  x1
   1091        b.gt            32b
   1092        ret
        // w == 64 (128 bytes/row): copy via q-register pairs.
   1093 640:
   1094        AARCH64_VALID_JUMP_TARGET
   1095 64:
   1096        ldp             q0,  q1,  [x2]
   1097        ldp             q2,  q3,  [x2, #32]
   1098        stp             q0,  q1,  [x0]
   1099        ldp             q4,  q5,  [x2, #64]
   1100        stp             q2,  q3,  [x0, #32]
   1101        ldp             q6,  q7,  [x2, #96]
   1102        subs            w5,  w5,  #1
   1103        stp             q4,  q5,  [x0, #64]
   1104        stp             q6,  q7,  [x0, #96]
   1105        add             x2,  x2,  x3
   1106        add             x0,  x0,  x1
   1107        b.gt            64b
   1108        ret
        // w == 128 (256 bytes/row): copy via q-register pairs.
   1109 1280:
   1110        AARCH64_VALID_JUMP_TARGET
   1111 128:
   1112        ldp             q0,  q1,  [x2]
   1113        ldp             q2,  q3,  [x2, #32]
   1114        stp             q0,  q1,  [x0]
   1115        ldp             q4,  q5,  [x2, #64]
   1116        stp             q2,  q3,  [x0, #32]
   1117        ldp             q6,  q7,  [x2, #96]
   1118        subs            w5,  w5,  #1
   1119        stp             q4,  q5,  [x0, #64]
   1120        ldp             q16, q17, [x2, #128]
   1121        stp             q6,  q7,  [x0, #96]
   1122        ldp             q18, q19, [x2, #160]
   1123        stp             q16, q17, [x0, #128]
   1124        ldp             q20, q21, [x2, #192]
   1125        stp             q18, q19, [x0, #160]
   1126        ldp             q22, q23, [x2, #224]
   1127        stp             q20, q21, [x0, #192]
   1128        stp             q22, q23, [x0, #224]
   1129        add             x2,  x2,  x3
   1130        add             x0,  x0,  x1
   1131        b.gt            128b
   1132        ret
   1133 endfunc
   1134 
    // Position-independent offsets from put_16bpc_tbl to the per-width
    // entry points above, indexed by clz(w)-24 (w = 128 first).
   1135 jumptable put_16bpc_tbl
   1136        .word 1280b - put_16bpc_tbl
   1137        .word 640b  - put_16bpc_tbl
   1138        .word 320b  - put_16bpc_tbl
   1139        .word 160b  - put_16bpc_tbl
   1140        .word 80b   - put_16bpc_tbl
   1141        .word 40b   - put_16bpc_tbl
   1142        .word 20b   - put_16bpc_tbl
   1143 endjumptable
   1144 
   1145 
   1146 // This has got the same signature as the prep_8tap functions,
   1147 // and assumes that x9 is set to (clz(w)-24), w7 to intermediate_bits and
   1148 // x8 to w*2.
    // Plain (unfiltered) 16 bpc prep: convert pixels from src x1 /
    // stride x2 into the packed intermediate buffer at x0, computing
    //   tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS
    // per element (sshl by v31 = w7 broadcast; sub of v30 = PREP_BIAS).
    // Caller pre-sets x9 = clz(w)-24, w7 = intermediate_bits and
    // x8 = w*2 (output row stride in bytes for the wide cases), per
    // the comment above this function. w4 = height.
   1149 function prep_16bpc_neon
   1150        movrel          x10, prep_16bpc_tbl
   1151        ldrsw           x9, [x10, x9, lsl #2]
   1152        dup             v31.8h,  w7   // intermediate_bits
        // v30 = PREP_BIAS (8192) in every 16-bit lane: #32 << 8.
   1153        movi            v30.8h,  #(PREP_BIAS >> 8), lsl #8
   1154        add             x10, x10, x9
   1155        br              x10
   1156 
        // w == 4: two rows (8 pixels) per iteration via x1/x9.
   1157 40:
   1158        AARCH64_VALID_JUMP_TARGET
   1159        add             x9,  x1,  x2
   1160        lsl             x2,  x2,  #1
   1161 4:
   1162        ld1             {v0.8b},   [x1], x2
   1163        ld1             {v0.d}[1], [x9], x2
   1164        subs            w4,  w4,  #2
   1165        sshl            v0.8h,   v0.8h,   v31.8h
   1166        sub             v0.8h,   v0.8h,   v30.8h
   1167        st1             {v0.8h}, [x0], #16
   1168        b.gt            4b
   1169        ret
        // w == 8: two rows per iteration.
   1170 80:
   1171        AARCH64_VALID_JUMP_TARGET
   1172        add             x9,  x1,  x2
   1173        lsl             x2,  x2,  #1
   1174 8:
   1175        ld1             {v0.8h}, [x1], x2
   1176        ld1             {v1.8h}, [x9], x2
   1177        subs            w4,  w4,  #2
   1178        sshl            v0.8h,   v0.8h,   v31.8h
   1179        sshl            v1.8h,   v1.8h,   v31.8h
   1180        sub             v0.8h,   v0.8h,   v30.8h
   1181        sub             v1.8h,   v1.8h,   v30.8h
   1182        st1             {v0.8h, v1.8h}, [x0], #32
   1183        b.gt            8b
   1184        ret
        // w == 16: two rows per iteration, two q-registers each.
   1185 160:
   1186        AARCH64_VALID_JUMP_TARGET
   1187 16:
   1188        ldp             q0,  q1,  [x1]
   1189        add             x1,  x1,  x2
   1190        sshl            v0.8h,   v0.8h,   v31.8h
   1191        ldp             q2,  q3,  [x1]
   1192        add             x1,  x1,  x2
   1193        subs            w4,  w4,  #2
   1194        sshl            v1.8h,   v1.8h,   v31.8h
   1195        sshl            v2.8h,   v2.8h,   v31.8h
   1196        sshl            v3.8h,   v3.8h,   v31.8h
   1197        sub             v0.8h,   v0.8h,   v30.8h
   1198        sub             v1.8h,   v1.8h,   v30.8h
   1199        sub             v2.8h,   v2.8h,   v30.8h
   1200        sub             v3.8h,   v3.8h,   v30.8h
   1201        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   1202        b.gt            16b
   1203        ret
        // w == 32: one row (four q-registers) per iteration.
   1204 320:
   1205        AARCH64_VALID_JUMP_TARGET
   1206 32:
   1207        ldp             q0,  q1,  [x1]
   1208        sshl            v0.8h,   v0.8h,   v31.8h
   1209        ldp             q2,  q3,  [x1, #32]
   1210        add             x1,  x1,  x2
   1211        sshl            v1.8h,   v1.8h,   v31.8h
   1212        sshl            v2.8h,   v2.8h,   v31.8h
   1213        sshl            v3.8h,   v3.8h,   v31.8h
   1214        subs            w4,  w4,  #1
   1215        sub             v0.8h,   v0.8h,   v30.8h
   1216        sub             v1.8h,   v1.8h,   v30.8h
   1217        sub             v2.8h,   v2.8h,   v30.8h
   1218        sub             v3.8h,   v3.8h,   v30.8h
   1219        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   1220        b.gt            32b
   1221        ret
        // w == 64: one row (eight q-registers) per iteration; output
        // advanced by x8 (= w*2) per row.
   1222 640:
   1223        AARCH64_VALID_JUMP_TARGET
   1224 64:
   1225        ldp             q0,  q1,  [x1]
   1226        subs            w4,  w4,  #1
   1227        sshl            v0.8h,   v0.8h,   v31.8h
   1228        ldp             q2,  q3,  [x1, #32]
   1229        sshl            v1.8h,   v1.8h,   v31.8h
   1230        ldp             q4,  q5,  [x1, #64]
   1231        sshl            v2.8h,   v2.8h,   v31.8h
   1232        sshl            v3.8h,   v3.8h,   v31.8h
   1233        ldp             q6,  q7,  [x1, #96]
   1234        add             x1,  x1,  x2
   1235        sshl            v4.8h,   v4.8h,   v31.8h
   1236        sshl            v5.8h,   v5.8h,   v31.8h
   1237        sshl            v6.8h,   v6.8h,   v31.8h
   1238        sshl            v7.8h,   v7.8h,   v31.8h
   1239        sub             v0.8h,   v0.8h,   v30.8h
   1240        sub             v1.8h,   v1.8h,   v30.8h
   1241        sub             v2.8h,   v2.8h,   v30.8h
   1242        sub             v3.8h,   v3.8h,   v30.8h
   1243        stp             q0,  q1,  [x0]
   1244        sub             v4.8h,   v4.8h,   v30.8h
   1245        sub             v5.8h,   v5.8h,   v30.8h
   1246        stp             q2,  q3,  [x0, #32]
   1247        sub             v6.8h,   v6.8h,   v30.8h
   1248        sub             v7.8h,   v7.8h,   v30.8h
   1249        stp             q4,  q5,  [x0, #64]
   1250        stp             q6,  q7,  [x0, #96]
   1251        add             x0,  x0,  x8
   1252        b.gt            64b
   1253        ret
        // w == 128: one row (sixteen q-registers) per iteration;
        // output advanced by x8 (= w*2) per row.
   1254 1280:
   1255        AARCH64_VALID_JUMP_TARGET
   1256 128:
   1257        ldp             q0,  q1,  [x1]
   1258        subs            w4,  w4,  #1
   1259        sshl            v0.8h,   v0.8h,   v31.8h
   1260        ldp             q2,  q3,  [x1, #32]
   1261        sshl            v1.8h,   v1.8h,   v31.8h
   1262        ldp             q4,  q5,  [x1, #64]
   1263        sshl            v2.8h,   v2.8h,   v31.8h
   1264        sshl            v3.8h,   v3.8h,   v31.8h
   1265        ldp             q6,  q7,  [x1, #96]
   1266        sshl            v4.8h,   v4.8h,   v31.8h
   1267        sshl            v5.8h,   v5.8h,   v31.8h
   1268        ldp             q16, q17, [x1, #128]
   1269        sshl            v6.8h,   v6.8h,   v31.8h
   1270        sshl            v7.8h,   v7.8h,   v31.8h
   1271        ldp             q18, q19, [x1, #160]
   1272        sshl            v16.8h,  v16.8h,  v31.8h
   1273        sshl            v17.8h,  v17.8h,  v31.8h
   1274        ldp             q20, q21, [x1, #192]
   1275        sshl            v18.8h,  v18.8h,  v31.8h
   1276        sshl            v19.8h,  v19.8h,  v31.8h
   1277        ldp             q22, q23, [x1, #224]
   1278        add             x1,  x1,  x2
   1279        sshl            v20.8h,  v20.8h,  v31.8h
   1280        sshl            v21.8h,  v21.8h,  v31.8h
   1281        sshl            v22.8h,  v22.8h,  v31.8h
   1282        sshl            v23.8h,  v23.8h,  v31.8h
   1283        sub             v0.8h,   v0.8h,   v30.8h
   1284        sub             v1.8h,   v1.8h,   v30.8h
   1285        sub             v2.8h,   v2.8h,   v30.8h
   1286        sub             v3.8h,   v3.8h,   v30.8h
   1287        stp             q0,  q1,  [x0]
   1288        sub             v4.8h,   v4.8h,   v30.8h
   1289        sub             v5.8h,   v5.8h,   v30.8h
   1290        stp             q2,  q3,  [x0, #32]
   1291        sub             v6.8h,   v6.8h,   v30.8h
   1292        sub             v7.8h,   v7.8h,   v30.8h
   1293        stp             q4,  q5,  [x0, #64]
   1294        sub             v16.8h,  v16.8h,  v30.8h
   1295        sub             v17.8h,  v17.8h,  v30.8h
   1296        stp             q6,  q7,  [x0, #96]
   1297        sub             v18.8h,  v18.8h,  v30.8h
   1298        sub             v19.8h,  v19.8h,  v30.8h
   1299        stp             q16, q17, [x0, #128]
   1300        sub             v20.8h,  v20.8h,  v30.8h
   1301        sub             v21.8h,  v21.8h,  v30.8h
   1302        stp             q18, q19, [x0, #160]
   1303        sub             v22.8h,  v22.8h,  v30.8h
   1304        sub             v23.8h,  v23.8h,  v30.8h
   1305        stp             q20, q21, [x0, #192]
   1306        stp             q22, q23, [x0, #224]
   1307        add             x0,  x0,  x8
   1308        b.gt            128b
   1309        ret
   1310 endfunc
   1311 
   1312 jumptable prep_16bpc_tbl
// Relative jump table for the prep dispatcher: each entry is the offset from
// the table base to the per-width loop labelled above (1280/640/.../40).
// Presumably indexed by clz(w)-derived value like the h/v tables in
// filter_fn below — TODO confirm against the dispatcher (outside this view).
   1313        .word 1280b - prep_16bpc_tbl
   1314        .word 640b  - prep_16bpc_tbl
   1315        .word 320b  - prep_16bpc_tbl
   1316        .word 160b  - prep_16bpc_tbl
   1317        .word 80b   - prep_16bpc_tbl
   1318        .word 40b   - prep_16bpc_tbl
   1319 endjumptable
   1320 
   1321 
   1322 .macro load_slice s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
// Load lane 0 (element width \wd) of 2-7 vectors, alternating between the
// two row pointers \s0/\s1 and post-incrementing each by \strd, so that
// consecutive source rows land in consecutive destination registers.
// Trailing destinations are optional (.ifnb = "if argument not blank").
   1323        ld1             {\d0\wd}[0], [\s0], \strd
   1324        ld1             {\d1\wd}[0], [\s1], \strd
   1325 .ifnb \d2
   1326        ld1             {\d2\wd}[0], [\s0], \strd
   1327        ld1             {\d3\wd}[0], [\s1], \strd
   1328 .endif
   1329 .ifnb \d4
   1330        ld1             {\d4\wd}[0], [\s0], \strd
   1331 .endif
   1332 .ifnb \d5
   1333        ld1             {\d5\wd}[0], [\s1], \strd
   1334 .endif
   1335 .ifnb \d6
   1336        ld1             {\d6\wd}[0], [\s0], \strd
   1337 .endif
   1338 .endm
   1339 .macro load_reg s0, s1, strd, wd, d0, d1, d2, d3, d4, d5, d6
// Like load_slice, but load whole vectors (arrangement \wd) instead of a
// single lane: 2-7 registers from the alternating row pointers \s0/\s1,
// each pointer advanced by \strd per load.
   1340        ld1             {\d0\wd}, [\s0], \strd
   1341        ld1             {\d1\wd}, [\s1], \strd
   1342 .ifnb \d2
   1343        ld1             {\d2\wd}, [\s0], \strd
   1344        ld1             {\d3\wd}, [\s1], \strd
   1345 .endif
   1346 .ifnb \d4
   1347        ld1             {\d4\wd}, [\s0], \strd
   1348 .endif
   1349 .ifnb \d5
   1350        ld1             {\d5\wd}, [\s1], \strd
   1351 .endif
   1352 .ifnb \d6
   1353        ld1             {\d6\wd}, [\s0], \strd
   1354 .endif
   1355 .endm
   1356 .macro load_regpair s0, s1, strd, wd, d0, d1, d2, d3, d4, d5
// Load one to three two-register groups (one row of 2*vector width each),
// alternating the row pointer \s0, \s1, \s0 and advancing it by \strd.
   1357        ld1             {\d0\wd, \d1\wd}, [\s0], \strd
   1358 .ifnb \d2
   1359        ld1             {\d2\wd, \d3\wd}, [\s1], \strd
   1360 .endif
   1361 .ifnb \d4
   1362        ld1             {\d4\wd, \d5\wd}, [\s0], \strd
   1363 .endif
   1364 .endm
   1365 .macro load_s s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
// One 32-bit element (two 16-bit pixels) per row, into lane 0 of each dest.
   1366        load_slice      \s0, \s1, \strd, .s, \d0, \d1, \d2, \d3, \d4, \d5, \d6
   1367 .endm
   1368 .macro load_4h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
// Four 16-bit pixels per row (64-bit loads).
   1369        load_reg        \s0, \s1, \strd, .4h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
   1370 .endm
   1371 .macro load_8h s0, s1, strd, d0, d1, d2, d3, d4, d5, d6
// Eight 16-bit pixels per row (128-bit loads).
   1372        load_reg        \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5, \d6
   1373 .endm
   1374 .macro load_16h s0, s1, strd, d0, d1, d2, d3, d4, d5
// Sixteen 16-bit pixels per row (two-register 256-bit loads).
   1375        load_regpair    \s0, \s1, \strd, .8h, \d0, \d1, \d2, \d3, \d4, \d5
   1376 .endm
   1377 .macro interleave_1 wd, r0, r1, r2, r3, r4
// Pair up consecutive rows: after this, lane 0 of \rN holds row N and
// lane 1 holds row N+1 (trn1 of \rN with \rN+1), letting the narrow
// vertical filter process two output rows per multiply chain.
// Note \r4 is only a source; it is left unmodified.
   1378        trn1            \r0\wd, \r0\wd, \r1\wd
   1379        trn1            \r1\wd, \r1\wd, \r2\wd
   1380 .ifnb \r3
   1381        trn1            \r2\wd, \r2\wd, \r3\wd
   1382        trn1            \r3\wd, \r3\wd, \r4\wd
   1383 .endif
   1384 .endm
   1385 .macro interleave_1_s r0, r1, r2, r3, r4
// 32-bit-element variant of interleave_1 (one 2-pixel row per lane).
   1386        interleave_1    .2s, \r0, \r1, \r2, \r3, \r4
   1387 .endm
   1388 .macro umin_h c, wd, r0, r1, r2, r3
// Clamp 1, 2 or 4 registers to the per-lane maximum in \c (callers pass
// v31 = bitdepth_max, i.e. the final pixel clamp of the put path).
   1389        umin            \r0\wd,  \r0\wd,  \c\wd
   1390 .ifnb \r1
   1391        umin            \r1\wd,  \r1\wd,  \c\wd
   1392 .endif
   1393 .ifnb \r2
   1394        umin            \r2\wd,  \r2\wd,  \c\wd
   1395        umin            \r3\wd,  \r3\wd,  \c\wd
   1396 .endif
   1397 .endm
   1398 .macro sub_h c, wd, r0, r1, r2, r3
// Subtract \c from 1, 2 or 4 registers (callers pass v29 holding
// PREP_BIAS, applied on the prep-path output).
   1399        sub             \r0\wd,  \r0\wd,  \c\wd
   1400 .ifnb \r1
   1401        sub             \r1\wd,  \r1\wd,  \c\wd
   1402 .endif
   1403 .ifnb \r2
   1404        sub             \r2\wd,  \r2\wd,  \c\wd
   1405        sub             \r3\wd,  \r3\wd,  \c\wd
   1406 .endif
   1407 .endm
   1408 .macro smull_smlal_4tap d, s0, s1, s2, s3
// 4-tap FIR on the low halves: \d.4s = sum(\sN.4h * v0.h[N]) for N=0..3,
// widening 16-bit samples to 32-bit accumulators (coeffs in v0).
   1409        smull           \d\().4s,  \s0\().4h,  v0.h[0]
   1410        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
   1411        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
   1412        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
   1413 .endm
   1414 .macro smull2_smlal2_4tap d, s0, s1, s2, s3
// High-half companion of smull_smlal_4tap (upper 4 lanes of each source).
   1415        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
   1416        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
   1417        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
   1418        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
   1419 .endm
   1420 .macro smull_smlal_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
// 6-tap FIR: coefficients v0.h[1..6] applied to \s1..\s6.  \s0 and \s7
// are accepted but unused (their taps are zero) so call sites can expand
// smull_smlal_\taps with the same 8-argument list as the 8-tap form.
   1421        smull           \d\().4s,  \s1\().4h,  v0.h[1]
   1422        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
   1423        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
   1424        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
   1425        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
   1426        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
   1427 .endm
   1428 .macro smull2_smlal2_6tap d, s0, s1, s2, s3, s4, s5, s6, s7
// High-half companion of smull_smlal_6tap (\s0/\s7 likewise unused).
   1429        smull2          \d\().4s,  \s1\().8h,  v0.h[1]
   1430        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
   1431        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
   1432        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
   1433        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
   1434        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
   1435 .endm
   1436 .macro smull_smlal_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
// Full 8-tap FIR on the low halves: \d.4s = sum(\sN.4h * v0.h[N]), N=0..7.
   1437        smull           \d\().4s,  \s0\().4h,  v0.h[0]
   1438        smlal           \d\().4s,  \s1\().4h,  v0.h[1]
   1439        smlal           \d\().4s,  \s2\().4h,  v0.h[2]
   1440        smlal           \d\().4s,  \s3\().4h,  v0.h[3]
   1441        smlal           \d\().4s,  \s4\().4h,  v0.h[4]
   1442        smlal           \d\().4s,  \s5\().4h,  v0.h[5]
   1443        smlal           \d\().4s,  \s6\().4h,  v0.h[6]
   1444        smlal           \d\().4s,  \s7\().4h,  v0.h[7]
   1445 .endm
   1446 .macro smull2_smlal2_8tap d, s0, s1, s2, s3, s4, s5, s6, s7
// High-half companion of smull_smlal_8tap (upper 4 lanes of each source).
   1447        smull2          \d\().4s,  \s0\().8h,  v0.h[0]
   1448        smlal2          \d\().4s,  \s1\().8h,  v0.h[1]
   1449        smlal2          \d\().4s,  \s2\().8h,  v0.h[2]
   1450        smlal2          \d\().4s,  \s3\().8h,  v0.h[3]
   1451        smlal2          \d\().4s,  \s4\().8h,  v0.h[4]
   1452        smlal2          \d\().4s,  \s5\().8h,  v0.h[5]
   1453        smlal2          \d\().4s,  \s6\().8h,  v0.h[6]
   1454        smlal2          \d\().4s,  \s7\().8h,  v0.h[7]
   1455 .endm
   1456 .macro sqrshrun_h shift, r0, r1, r2, r3
// Saturating rounding shift-right-unsigned-narrow by #\shift, packing the
// 32-bit pair \r0/\r1 into \r0.8h (and \r2/\r3 into \r2.8h when given).
// With only \r0 present, just the low 4 lanes of \r0 are produced.
   1457        sqrshrun        \r0\().4h, \r0\().4s,  #\shift
   1458 .ifnb \r1
   1459        sqrshrun2       \r0\().8h, \r1\().4s,  #\shift
   1460 .endif
   1461 .ifnb \r2
   1462        sqrshrun        \r2\().4h, \r2\().4s,  #\shift
   1463        sqrshrun2       \r2\().8h, \r3\().4s,  #\shift
   1464 .endif
   1465 .endm
   1466 .macro xtn_h r0, r1, r2, r3
// Truncating narrow of 32-bit pairs to 16 bits: uzp1 keeps the even
// (low) halfword of every 32-bit lane across \r0:\r1, which matches
// xtn into the low half plus xtn2 into the high half, in one insn.
   1467        uzp1            \r0\().8h,  \r0\().8h,  \r1\().8h // Same as xtn, xtn2
   1468 .ifnb \r2
   1469        uzp1            \r2\().8h,  \r2\().8h,  \r3\().8h // Ditto
   1470 .endif
   1471 .endm
   1472 .macro srshl_s shift, r0, r1, r2, r3
// Signed rounding shift-left by the per-lane amount in \shift.  Callers
// load \shift with a negative value (e.g. -(6-intermediate_bits)), making
// this a rounding right shift.
   1473        srshl           \r0\().4s,  \r0\().4s,  \shift\().4s
   1474        srshl           \r1\().4s,  \r1\().4s,  \shift\().4s
   1475 .ifnb \r2
   1476        srshl           \r2\().4s,  \r2\().4s,  \shift\().4s
   1477        srshl           \r3\().4s,  \r3\().4s,  \shift\().4s
   1478 .endif
   1479 .endm
   1480 .macro st_s strd, reg, lanes
// Store 2 or 4 two-pixel rows: one 32-bit lane of \reg per output row,
// alternating between the row pointers x0/x9, each advanced by \strd.
   1481        st1             {\reg\().s}[0], [x0], \strd
   1482        st1             {\reg\().s}[1], [x9], \strd
   1483 .if \lanes > 2
   1484        st1             {\reg\().s}[2], [x0], \strd
   1485        st1             {\reg\().s}[3], [x9], \strd
   1486 .endif
   1487 .endm
   1488 .macro st_d strd, r0, r1
// Store 2 or 4 four-pixel rows: the low 64 bits of each register go to
// the x0 row, the high doubleword lane to the x9 row; both pointers are
// advanced by \strd per store.
   1489        st1             {\r0\().8b},   [x0], \strd
   1490        st1             {\r0\().d}[1], [x9], \strd
   1491 .ifnb \r1
   1492        st1             {\r1\().8b},   [x0], \strd
   1493        st1             {\r1\().d}[1], [x9], \strd
   1494 .endif
   1495 .endm
   1496 .macro shift_store_4 type, strd, r0, r1, r2, r3
// Final scale + store for 4-pixel-wide rows (two rows per 8h register).
// put:  rounding-narrow the 32-bit accumulators by 6 with unsigned
//       saturation, then clamp to bitdepth_max (v31).
// prep: rounding shift by v30 = -(6-intermediate_bits), truncate to 16
//       bits, then subtract PREP_BIAS (v29).
   1497 .ifc \type, put
   1498        sqrshrun_h      6,   \r0, \r1, \r2, \r3
   1499        umin_h          v31, .8h, \r0, \r2
   1500 .else
   1501        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
   1502        xtn_h           \r0, \r1, \r2, \r3
   1503        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
   1504 .endif
   1505        st_d            \strd, \r0, \r2
   1506 .endm
   1507 .macro st_reg strd, wd, r0, r1, r2, r3, r4, r5, r6, r7
// Store 2, 4 or 8 full-vector rows, alternating between the row pointers
// x0/x9 and advancing each by \strd.
   1508        st1             {\r0\wd}, [x0], \strd
   1509        st1             {\r1\wd}, [x9], \strd
   1510 .ifnb \r2
   1511        st1             {\r2\wd}, [x0], \strd
   1512        st1             {\r3\wd}, [x9], \strd
   1513 .endif
   1514 .ifnb \r4
   1515        st1             {\r4\wd}, [x0], \strd
   1516        st1             {\r5\wd}, [x9], \strd
   1517        st1             {\r6\wd}, [x0], \strd
   1518        st1             {\r7\wd}, [x9], \strd
   1519 .endif
   1520 .endm
   1521 .macro st_8h strd, r0, r1, r2, r3, r4, r5, r6, r7
// Eight-pixel-row (128-bit) variant of st_reg.
   1522        st_reg          \strd, .8h, \r0, \r1, \r2, \r3, \r4, \r5, \r6, \r7
   1523 .endm
   1524 .macro shift_store_8 type, strd, r0, r1, r2, r3
// Final scale + store for 8-pixel-wide rows; same put/prep arithmetic as
// shift_store_4 (narrow+clamp vs. shift+narrow+PREP_BIAS), but the two
// packed results \r0/\r2 are stored as whole 8h rows via st_8h.
   1525 .ifc \type, put
   1526        sqrshrun_h      6,   \r0, \r1, \r2, \r3
   1527        umin_h          v31, .8h, \r0, \r2
   1528 .else
   1529        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
   1530        xtn_h           \r0, \r1, \r2, \r3
   1531        sub_h           v29, .8h, \r0, \r2       // PREP_BIAS
   1532 .endif
   1533        st_8h           \strd, \r0, \r2
   1534 .endm
   1535 .macro shift_store_16 type, strd, dst, r0, r1, r2, r3
// Final scale + store for one 16-pixel row; same put/prep arithmetic as
// shift_store_4/8.  After packing, \r1 is reused as scratch to hold the
// second 8 pixels (originally in \r2) so a single two-register st1 can
// write the whole row to \dst, advancing it by \strd.
   1536 .ifc \type, put
   1537        sqrshrun_h      6,   \r0, \r1, \r2, \r3
   1538        umin            \r0\().8h, \r0\().8h, v31.8h
   1539        umin            \r1\().8h, \r2\().8h, v31.8h
   1540 .else
   1541        srshl_s         v30, \r0, \r1, \r2, \r3  // -(6-intermediate_bits)
   1542        xtn_h           \r0, \r1, \r2, \r3
   1543        sub             \r0\().8h, \r0\().8h, v29.8h
   1544        sub             \r1\().8h, \r2\().8h, v29.8h
   1545 .endif
   1546        st1             {\r0\().8h, \r1\().8h}, [\dst], \strd
   1547 .endm
   1548 
   1549 .macro make_8tap_fn op, type, type_h, type_v, taps
// Emit an exported \op()_8tap_\type()_16bpc entry point: stash the packed
// horizontal/vertical filter-type selectors (REGULAR/SMOOTH/SHARP below)
// in w9/w10 and tail-call the shared \op()_\taps implementation.
   1550 function \op\()_8tap_\type\()_16bpc_neon, export=1
   1551        mov             w9,  \type_h
   1552        mov             w10, \type_v
   1553        b               \op\()_\taps\()_neon
   1554 endfunc
   1555 .endm
   1556 
   1557 // No spaces in these expressions, due to gas-preprocessor.
// Packed filter-table offsets, two 7-bit fields per value: bits 7-13 hold
// the selector used for larger blocks and bits 0-6 the one for w<=4 (see
// the ubfx/and unpacking in filter_fn).  Each field is filter_index*15,
// presumably because mc_subpel_filters has 15 subpel rows per filter —
// TODO confirm against the table definition.
   1558 #define REGULAR ((0*15<<7)|3*15)
   1559 #define SMOOTH  ((1*15<<7)|4*15)
   1560 #define SHARP   ((2*15<<7)|3*15)
   1561 
   1562 .macro filter_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2, taps
   1563 function \type\()_\taps\()_neon
   1564 .ifc \bdmax, w8
   1565        ldr             w8,  [sp]
   1566 .endif
   1567        mov             w11,  #0x4081  // (1 << 14) | (1 << 7) | (1 << 0)
   1568        mul             \mx,  \mx, w11
   1569        mul             \my,  \my, w11
   1570        add             \mx,  \mx, w9  // mx, 8tap_h, 4tap_h
   1571        add             \my,  \my, w10 // my, 8tap_v, 4tap_v
   1572 .ifc \type, prep
   1573        uxtw            \d_strd, \w
   1574        lsl             \d_strd, \d_strd, #1
   1575 .endif
   1576 
   1577        dup             v31.8h,  \bdmax        // bitdepth_max
   1578        clz             \bdmax,  \bdmax
   1579        clz             w9,  \w
   1580        sub             \bdmax,  \bdmax,  #18  // intermediate_bits = clz(bitdepth_max) - 18
   1581        mov             w12, #6
   1582        tst             \mx, #(0x7f << 14)
   1583        sub             w9,  w9,  #24
   1584        add             w13, w12, \bdmax       // 6 + intermediate_bits
   1585        sub             w12, w12, \bdmax       // 6 - intermediate_bits
   1586        movrel          x11, X(mc_subpel_filters), -8
   1587        b.ne            L(\type\()_\taps\()_h)
   1588        tst             \my, #(0x7f << 14)
   1589        b.ne            L(\type\()_\taps\()_v)
   1590        b               \type\()_16bpc_neon
   1591 
   1592 L(\type\()_\taps\()_h):
   1593        cmp             \w,   #4
   1594        ubfx            w10,  \mx, #7, #7
   1595        and             \mx,  \mx, #0x7f
   1596        b.le            4f
   1597        mov             \mx,  w10
   1598 4:
   1599        tst             \my,  #(0x7f << 14)
   1600        add             \xmx, x11, \mx, uxtw #3
   1601        b.ne            L(\type\()_\taps\()_hv)
   1602 
   1603        movrel          x10, \type\()_\taps\()_h_tbl
   1604        ldrsw           x9,  [x10, x9, lsl #2]
   1605 .ifc \type, put
   1606        mov             w12,  #34              // rounding for 10-bit
   1607        mov             w13,  #40              // rounding for 12-bit
   1608        cmp             \bdmax, #2             // 10-bit: 4, 12-bit: 2
   1609        csel            w12,  w12,  w13,  ne   // select rounding based on \bdmax
   1610 .else
   1611        neg             w12,  w12              // -(6 - intermediate_bits)
   1612        movi            v28.8h,  #(PREP_BIAS >> 8), lsl #8
   1613 .endif
   1614        add             x10, x10, x9
   1615        dup             v30.4s,  w12           // rounding or shift amount
   1616        br              x10
   1617 
   1618 20:     // 2xN h
   1619        AARCH64_VALID_JUMP_TARGET
   1620 .ifc \type, put
   1621        ldur            s0,  [\xmx, #2]
   1622        sub             \src,  \src,  #2
   1623        add             \ds2,  \dst,  \d_strd
   1624        add             \sr2,  \src,  \s_strd
   1625        lsl             \d_strd,  \d_strd,  #1
   1626        lsl             \s_strd,  \s_strd,  #1
   1627        sxtl            v0.8h,   v0.8b
   1628 2:
   1629        ld1             {v4.8h},  [\src], \s_strd
   1630        ld1             {v6.8h},  [\sr2], \s_strd
   1631        mov             v2.16b,  v30.16b
   1632        ext             v5.16b,  v4.16b,  v4.16b,  #2
   1633        ext             v7.16b,  v6.16b,  v6.16b,  #2
   1634        subs            \h,  \h,  #2
   1635        trn1            v3.2s,   v4.2s,   v6.2s
   1636        trn2            v6.2s,   v4.2s,   v6.2s
   1637        trn1            v4.2s,   v5.2s,   v7.2s
   1638        trn2            v7.2s,   v5.2s,   v7.2s
   1639        smlal           v2.4s,   v3.4h,   v0.h[0]
   1640        smlal           v2.4s,   v4.4h,   v0.h[1]
   1641        smlal           v2.4s,   v6.4h,   v0.h[2]
   1642        smlal           v2.4s,   v7.4h,   v0.h[3]
   1643        sqshrun         v2.4h,   v2.4s,   #6
   1644        umin            v2.4h,   v2.4h,   v31.4h
   1645        st1             {v2.s}[0], [\dst], \d_strd
   1646        st1             {v2.s}[1], [\ds2], \d_strd
   1647        b.gt            2b
   1648        ret
   1649 .endif
   1650 
   1651 40:     // 4xN h
   1652        AARCH64_VALID_JUMP_TARGET
   1653        ldur            s0,  [\xmx, #2]
   1654        sub             \src,  \src,  #2
   1655        add             \ds2,  \dst,  \d_strd
   1656        add             \sr2,  \src,  \s_strd
   1657        lsl             \d_strd,  \d_strd,  #1
   1658        lsl             \s_strd,  \s_strd,  #1
   1659        sxtl            v0.8h,   v0.8b
   1660 4:
   1661        ld1             {v16.8h}, [\src], \s_strd
   1662        ld1             {v20.8h}, [\sr2], \s_strd
   1663 .ifc \type, put
   1664        mov             v2.16b,  v30.16b
   1665        mov             v3.16b,  v30.16b
   1666 .endif
   1667        ext             v17.16b, v16.16b, v16.16b, #2
   1668        ext             v18.16b, v16.16b, v16.16b, #4
   1669        ext             v19.16b, v16.16b, v16.16b, #6
   1670        ext             v21.16b, v20.16b, v20.16b, #2
   1671        ext             v22.16b, v20.16b, v20.16b, #4
   1672        ext             v23.16b, v20.16b, v20.16b, #6
   1673        subs            \h,  \h,  #2
   1674 .ifc \type, put
   1675        smlal           v2.4s,   v16.4h,  v0.h[0]
   1676 .else
   1677        smull           v2.4s,   v16.4h,  v0.h[0]
   1678 .endif
   1679        smlal           v2.4s,   v17.4h,  v0.h[1]
   1680        smlal           v2.4s,   v18.4h,  v0.h[2]
   1681        smlal           v2.4s,   v19.4h,  v0.h[3]
   1682 .ifc \type, put
   1683        smlal           v3.4s,   v20.4h,  v0.h[0]
   1684 .else
   1685        smull           v3.4s,   v20.4h,  v0.h[0]
   1686 .endif
   1687        smlal           v3.4s,   v21.4h,  v0.h[1]
   1688        smlal           v3.4s,   v22.4h,  v0.h[2]
   1689        smlal           v3.4s,   v23.4h,  v0.h[3]
   1690 .ifc \type, put
   1691        sqshrun         v16.4h,  v2.4s,   #6
   1692        sqshrun2        v16.8h,  v3.4s,   #6
   1693        umin            v16.8h,  v16.8h,  v31.8h
   1694 .else
   1695        srshl           v16.4s,  v2.4s,   v30.4s // -(6-intermediate_bits)
   1696        srshl           v20.4s,  v3.4s,   v30.4s // -(6-intermediate_bits)
   1697        uzp1            v16.8h,  v16.8h,  v20.8h // Same as xtn, xtn2
   1698        sub             v16.8h,  v16.8h,  v28.8h // PREP_BIAS
   1699 .endif
   1700        st1             {v16.8b},   [\dst], \d_strd
   1701        st1             {v16.d}[1], [\ds2], \d_strd
   1702        b.gt            4b
   1703        ret
   1704 
   1705 80:
   1706 160:
   1707 320:
   1708 640:
   1709 1280:   // 8xN, 16xN, 32xN, ... h
   1710        AARCH64_VALID_JUMP_TARGET
   1711        ld1             {v0.8b}, [\xmx]
   1712 .ifc \taps, 6tap
   1713        sub             \src,  \src,  #4
   1714 .else
   1715        sub             \src,  \src,  #6
   1716 .endif
   1717        add             \ds2,  \dst,  \d_strd
   1718        add             \sr2,  \src,  \s_strd
   1719        lsl             \s_strd,  \s_strd,  #1
   1720        sxtl            v0.8h,   v0.8b
   1721 
   1722        sub             \s_strd,  \s_strd,  \w, uxtw #1
   1723        sub             \s_strd,  \s_strd,  #16
   1724 .ifc \type, put
   1725        lsl             \d_strd,  \d_strd,  #1
   1726        sub             \d_strd,  \d_strd,  \w, uxtw #1
   1727 .endif
   1728 81:
   1729        ld1             {v16.8h, v17.8h},  [\src], #32
   1730        ld1             {v20.8h, v21.8h},  [\sr2], #32
   1731        mov             \mx, \w
   1732 
   1733 8:
   1734 .ifc \taps, 6tap
   1735    .ifc \type, put
   1736        mov             v18.16b, v30.16b
   1737        mov             v19.16b, v30.16b
   1738        smlal           v18.4s,  v16.4h,  v0.h[1]
   1739        smlal2          v19.4s,  v16.8h,  v0.h[1]
   1740        mov             v22.16b, v30.16b
   1741        mov             v23.16b, v30.16b
   1742        smlal           v22.4s,  v20.4h,  v0.h[1]
   1743        smlal2          v23.4s,  v20.8h,  v0.h[1]
   1744    .else
   1745        smull           v18.4s,  v16.4h,  v0.h[1]
   1746        smull2          v19.4s,  v16.8h,  v0.h[1]
   1747        smull           v22.4s,  v20.4h,  v0.h[1]
   1748        smull2          v23.4s,  v20.8h,  v0.h[1]
   1749    .endif
   1750    .irpc i, 23456
   1751        ext             v24.16b, v16.16b, v17.16b, #(2*\i-2)
   1752        ext             v25.16b, v20.16b, v21.16b, #(2*\i-2)
   1753        smlal           v18.4s,  v24.4h,  v0.h[\i]
   1754        smlal2          v19.4s,  v24.8h,  v0.h[\i]
   1755        smlal           v22.4s,  v25.4h,  v0.h[\i]
   1756        smlal2          v23.4s,  v25.8h,  v0.h[\i]
   1757    .endr
   1758 .else   // 8tap
   1759    .ifc \type, put
   1760        mov             v18.16b, v30.16b
   1761        mov             v19.16b, v30.16b
   1762        smlal           v18.4s,  v16.4h,  v0.h[0]
   1763        smlal2          v19.4s,  v16.8h,  v0.h[0]
   1764        mov             v22.16b, v30.16b
   1765        mov             v23.16b, v30.16b
   1766        smlal           v22.4s,  v20.4h,  v0.h[0]
   1767        smlal2          v23.4s,  v20.8h,  v0.h[0]
   1768    .else
   1769        smull           v18.4s,  v16.4h,  v0.h[0]
   1770        smull2          v19.4s,  v16.8h,  v0.h[0]
   1771        smull           v22.4s,  v20.4h,  v0.h[0]
   1772        smull2          v23.4s,  v20.8h,  v0.h[0]
   1773    .endif
   1774    .irpc i, 1234567
   1775        ext             v24.16b, v16.16b, v17.16b, #(2*\i)
   1776        ext             v25.16b, v20.16b, v21.16b, #(2*\i)
   1777        smlal           v18.4s,  v24.4h,  v0.h[\i]
   1778        smlal2          v19.4s,  v24.8h,  v0.h[\i]
   1779        smlal           v22.4s,  v25.4h,  v0.h[\i]
   1780        smlal2          v23.4s,  v25.8h,  v0.h[\i]
   1781    .endr
   1782 .endif
   1783        subs            \mx, \mx, #8
   1784 .ifc \type, put
   1785        sqshrun         v18.4h,  v18.4s,  #6
   1786        sqshrun2        v18.8h,  v19.4s,  #6
   1787        sqshrun         v22.4h,  v22.4s,  #6
   1788        sqshrun2        v22.8h,  v23.4s,  #6
   1789        umin            v18.8h,  v18.8h,  v31.8h
   1790        umin            v22.8h,  v22.8h,  v31.8h
   1791 .else
   1792        srshl           v18.4s,  v18.4s,  v30.4s // -(6-intermediate_bits)
   1793        srshl           v19.4s,  v19.4s,  v30.4s // -(6-intermediate_bits)
   1794        srshl           v22.4s,  v22.4s,  v30.4s // -(6-intermediate_bits)
   1795        srshl           v23.4s,  v23.4s,  v30.4s // -(6-intermediate_bits)
   1796        uzp1            v18.8h,  v18.8h,  v19.8h // Same as xtn, xtn2
   1797        uzp1            v22.8h,  v22.8h,  v23.8h // Ditto
   1798        sub             v18.8h,  v18.8h,  v28.8h // PREP_BIAS
   1799        sub             v22.8h,  v22.8h,  v28.8h // PREP_BIAS
   1800 .endif
   1801        st1             {v18.8h}, [\dst], #16
   1802        st1             {v22.8h}, [\ds2], #16
   1803        b.le            9f
   1804 
   1805        mov             v16.16b, v17.16b
   1806        mov             v20.16b, v21.16b
   1807        ld1             {v17.8h}, [\src], #16
   1808        ld1             {v21.8h}, [\sr2], #16
   1809        b               8b
   1810 
   1811 9:
   1812        add             \dst,  \dst,  \d_strd
   1813        add             \ds2,  \ds2,  \d_strd
   1814        add             \src,  \src,  \s_strd
   1815        add             \sr2,  \sr2,  \s_strd
   1816 
   1817        subs            \h,  \h,  #2
   1818        b.gt            81b
   1819        ret
   1820 endfunc
   1821 
   1822 jumptable \type\()_\taps\()_h_tbl
   1823        .word 1280b - \type\()_\taps\()_h_tbl
   1824        .word 640b  - \type\()_\taps\()_h_tbl
   1825        .word 320b  - \type\()_\taps\()_h_tbl
   1826        .word 160b  - \type\()_\taps\()_h_tbl
   1827        .word 80b   - \type\()_\taps\()_h_tbl
   1828        .word 40b   - \type\()_\taps\()_h_tbl
   1829        .word 20b   - \type\()_\taps\()_h_tbl
   1830 endjumptable
   1831 
   1832 
   1833 function L(\type\()_\taps\()_v)
   1834        cmp             \h,  #4
   1835        ubfx            w10, \my, #7, #7
   1836        and             \my, \my, #0x7f
   1837        b.le            4f
   1838        mov             \my, w10
   1839 4:
   1840        add             \xmy, x11, \my, uxtw #3
   1841 
   1842 .ifc \type, prep
   1843        dup             v30.4s,  w12           // 6 - intermediate_bits
   1844        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
   1845 .endif
   1846        movrel          x10, \type\()_\taps\()_v_tbl
   1847        ldrsw           x9,  [x10, x9, lsl #2]
   1848 .ifc \type, prep
   1849        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
   1850 .endif
   1851        add             x10, x10, x9
   1852        br              x10
   1853 
   1854 20:     // 2xN v
   1855        AARCH64_VALID_JUMP_TARGET
   1856 .ifc \type, put
   1857        b.gt            28f
   1858 
   1859        cmp             \h,  #2
   1860        ldur            s0,  [\xmy, #2]
   1861        sub             \src,  \src,  \s_strd
   1862        add             \ds2,  \dst,  \d_strd
   1863        add             \sr2,  \src,  \s_strd
   1864        lsl             \s_strd,  \s_strd,  #1
   1865        lsl             \d_strd,  \d_strd,  #1
   1866        sxtl            v0.8h,   v0.8b
   1867 
   1868        // 2x2 v
   1869        load_s          \src, \sr2, \s_strd, v1, v2, v3, v4, v5
   1870        interleave_1_s  v1,  v2,  v3,  v4,  v5
   1871        b.gt            24f
   1872        smull_smlal_4tap v6, v1,  v2,  v3,  v4
   1873        sqrshrun_h      6,   v6
   1874        umin_h          v31, .8h, v6
   1875        st_s            \d_strd, v6, 2
   1876        ret
   1877 
   1878 24:     // 2x4 v
   1879        load_s          \sr2, \src, \s_strd, v6, v7
   1880        interleave_1_s  v5,  v6,  v7
   1881        smull_smlal_4tap v16, v1, v2, v3, v4
   1882        smull_smlal_4tap v17, v3, v4, v5, v6
   1883        sqrshrun_h      6,   v16, v17
   1884        umin_h          v31, .8h, v16
   1885        st_s            \d_strd, v16, 4
   1886        ret
   1887 
   1888 28:     // 2x6, 2x8, 2x12, 2x16 v
   1889        ld1             {v0.8b}, [\xmy]
   1890        sub             \sr2,  \src,  \s_strd, lsl #1
   1891        add             \ds2,  \dst,  \d_strd
   1892        sub             \src,  \sr2,  \s_strd
   1893        lsl             \d_strd,  \d_strd,  #1
   1894        lsl             \s_strd,  \s_strd,  #1
   1895        sxtl            v0.8h,   v0.8b
   1896 
   1897        load_s          \src, \sr2, \s_strd, v1,  v2,  v3,  v4, v5, v6, v7
   1898        interleave_1_s  v1,  v2,  v3,  v4,  v5
   1899        interleave_1_s  v5,  v6,  v7
   1900 216:
   1901        subs            \h,  \h,  #4
   1902        load_s          \sr2, \src, \s_strd, v16, v17, v18, v19
   1903        interleave_1_s  v7,  v16, v17, v18, v19
   1904        smull_smlal_\taps v24, v1,  v2,  v3,  v4,  v5,  v6,  v7, v16
   1905        smull_smlal_\taps v25, v3,  v4,  v5,  v6,  v7, v16, v17, v18
   1906        sqrshrun_h      6,   v24, v25
   1907        umin_h          v31, .8h, v24
   1908        st_s            \d_strd, v24, 4
   1909        b.le            0f
   1910        cmp             \h,  #2
   1911        mov             v1.16b,  v5.16b
   1912        mov             v2.16b,  v6.16b
   1913        mov             v3.16b,  v7.16b
   1914        mov             v4.16b,  v16.16b
   1915        mov             v5.16b,  v17.16b
   1916        mov             v6.16b,  v18.16b
   1917        mov             v7.16b,  v19.16b
   1918        b.eq            26f
   1919        b               216b
   1920 26:
   1921        load_s          \sr2, \src, \s_strd, v16, v17
   1922        interleave_1_s  v7,  v16, v17
   1923        smull_smlal_\taps v24, v1, v2,  v3,  v4,  v5,  v6,  v7, v16
   1924        sqrshrun_h      6,   v24
   1925        umin_h          v31, .4h, v24
   1926        st_s            \d_strd, v24, 2
   1927 0:
   1928        ret
   1929 .endif
   1930 
   1931 40:
   1932        AARCH64_VALID_JUMP_TARGET
   1933        b.gt            480f
   1934 
   1935        // 4x2, 4x4 v
   1936        cmp             \h,  #2
   1937        ldur            s0,  [\xmy, #2]
   1938        sub             \src, \src, \s_strd
   1939        add             \ds2, \dst, \d_strd
   1940        add             \sr2, \src, \s_strd
   1941        lsl             \s_strd, \s_strd, #1
   1942        lsl             \d_strd, \d_strd, #1
   1943        sxtl            v0.8h,   v0.8b
   1944 
   1945        load_4h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
   1946        smull_smlal_4tap v6,  v1,  v2,  v3,  v4
   1947        smull_smlal_4tap v7,  v2,  v3,  v4,  v5
   1948        shift_store_4   \type, \d_strd, v6, v7
   1949        b.le            0f
   1950        load_4h         \sr2, \src, \s_strd, v6, v7
   1951        smull_smlal_4tap v1,  v3,  v4,  v5,  v6
   1952        smull_smlal_4tap v2,  v4,  v5,  v6,  v7
   1953        shift_store_4   \type, \d_strd, v1, v2
   1954 0:
   1955        ret
   1956 
   1957 480:    // 4x6, 4x8, 4x12, 4x16 v
   1958        ld1             {v0.8b}, [\xmy]
   1959        sub             \sr2, \src, \s_strd, lsl #1
   1960        add             \ds2, \dst, \d_strd
   1961        sub             \src, \sr2, \s_strd
   1962        lsl             \s_strd, \s_strd, #1
   1963        lsl             \d_strd, \d_strd, #1
   1964        sxtl            v0.8h,   v0.8b
   1965 
   1966        load_4h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
   1967 
   1968 48:
   1969        subs            \h,  \h,  #4
   1970        load_4h         \sr2, \src, \s_strd, v23, v24, v25, v26
   1971        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
   1972        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
   1973        smull_smlal_\taps v3, v18, v19, v20, v21, v22, v23, v24, v25
   1974        smull_smlal_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
   1975        shift_store_4   \type, \d_strd, v1, v2, v3, v4
   1976        b.le            0f
   1977        cmp             \h,  #2
   1978        mov             v16.8b,  v20.8b
   1979        mov             v17.8b,  v21.8b
   1980        mov             v18.8b,  v22.8b
   1981        mov             v19.8b,  v23.8b
   1982        mov             v20.8b,  v24.8b
   1983        mov             v21.8b,  v25.8b
   1984        mov             v22.8b,  v26.8b
   1985        b.eq            46f
   1986        b               48b
   1987 46:
   1988        load_4h         \sr2, \src, \s_strd, v23, v24
   1989        smull_smlal_\taps v1, v16, v17, v18, v19, v20, v21, v22, v23
   1990        smull_smlal_\taps v2, v17, v18, v19, v20, v21, v22, v23, v24
   1991        shift_store_4   \type, \d_strd, v1, v2
   1992 0:
   1993        ret
   1994 
   1995 80:
   1996        AARCH64_VALID_JUMP_TARGET
   1997        b.gt            880f
   1998 
   1999        // 8x2, 8x4 v
   2000        cmp             \h,  #2
   2001        ldur            s0,  [\xmy, #2]
   2002        sub             \src, \src, \s_strd
   2003        add             \ds2, \dst, \d_strd
   2004        add             \sr2, \src, \s_strd
   2005        lsl             \s_strd, \s_strd, #1
   2006        lsl             \d_strd, \d_strd, #1
   2007        sxtl            v0.8h,   v0.8b
   2008 
   2009        load_8h         \src, \sr2, \s_strd, v1, v2, v3, v4, v5
   2010        smull_smlal_4tap   v16, v1,  v2,  v3,  v4
   2011        smull2_smlal2_4tap v17, v1,  v2,  v3,  v4
   2012        smull_smlal_4tap   v18, v2,  v3,  v4,  v5
   2013        smull2_smlal2_4tap v19, v2,  v3,  v4,  v5
   2014        shift_store_8   \type, \d_strd, v16, v17, v18, v19
   2015        b.le            0f
   2016        load_8h         \sr2, \src, \s_strd, v6, v7
   2017        smull_smlal_4tap   v16, v3,  v4,  v5,  v6
   2018        smull2_smlal2_4tap v17, v3,  v4,  v5,  v6
   2019        smull_smlal_4tap   v18, v4,  v5,  v6,  v7
   2020        smull2_smlal2_4tap v19, v4,  v5,  v6,  v7
   2021        shift_store_8   \type, \d_strd, v16, v17, v18, v19
   2022 0:
   2023        ret
   2024 
   2025 880:    // 8x6, 8x8, 8x16, 8x32 v
   2026 1680:   // 16x8, 16x16, ...
   2027 320:    // 32x8, 32x16, ...
   2028 640:
   2029 1280:
   2030        AARCH64_VALID_JUMP_TARGET
               // General vertical path: the block is processed as 8-pixel-wide
               // column strips (168: restarts per strip, 88: is the row loop).
               // v16-v22 hold a sliding window of 7 input rows; each 88:
               // iteration loads up to 4 new rows and emits up to 4 output rows.
   2031        ld1             {v0.8b}, [\xmy]         // full 8-tap vertical filter
   2032        sub             \src, \src, \s_strd     // back up 3 rows of context
   2033        sub             \src, \src, \s_strd, lsl #1
   2034        sxtl            v0.8h,   v0.8b          // widen s8 coefficients to s16
   2035        mov             \my,  \h                // save height for per-strip restart
   2036 168:
   2037        add             \ds2, \dst, \d_strd
   2038        add             \sr2, \src, \s_strd
   2039        lsl             \s_strd, \s_strd, #1    // two-row stepping within the strip
   2040        lsl             \d_strd, \d_strd, #1
   2041 
   2042        load_8h         \src, \sr2, \s_strd, v16, v17, v18, v19, v20, v21, v22
   2043 
   2044 88:
   2045        subs            \h,  \h,  #2
   2046        load_8h         \sr2, \src, \s_strd, v23, v24
   2047        smull_smlal_\taps   v1, v16, v17, v18, v19, v20, v21, v22, v23
   2048        smull2_smlal2_\taps v2, v16, v17, v18, v19, v20, v21, v22, v23
   2049        smull_smlal_\taps   v3, v17, v18, v19, v20, v21, v22, v23, v24
   2050        smull2_smlal2_\taps v4, v17, v18, v19, v20, v21, v22, v23, v24
   2051        shift_store_8   \type, \d_strd, v1, v2, v3, v4
   2052        b.le            9f
   2053        subs            \h,  \h,  #2
   2054        load_8h         \sr2, \src, \s_strd, v25, v26
   2055        smull_smlal_\taps   v1, v18, v19, v20, v21, v22, v23, v24, v25
   2056        smull2_smlal2_\taps v2, v18, v19, v20, v21, v22, v23, v24, v25
   2057        smull_smlal_\taps   v3, v19, v20, v21, v22, v23, v24, v25, v26
   2058        smull2_smlal2_\taps v4, v19, v20, v21, v22, v23, v24, v25, v26
   2059        shift_store_8   \type, \d_strd, v1, v2, v3, v4
   2060        b.le            9f
               // Slide the 7-row window down by 4 rows for the next iteration.
   2061        mov             v16.16b, v20.16b
   2062        mov             v17.16b, v21.16b
   2063        mov             v18.16b, v22.16b
   2064        mov             v19.16b, v23.16b
   2065        mov             v20.16b, v24.16b
   2066        mov             v21.16b, v25.16b
   2067        mov             v22.16b, v26.16b
   2068        b               88b
   2069 9:
               // Finished one 8-pixel-wide strip; move right for w > 8.
   2070        subs            \w,  \w,  #8
   2071        b.le            0f
   2072        asr             \s_strd, \s_strd, #1    // undo the doubling from 168:
   2073        asr             \d_strd, \d_strd, #1
   2074        msub            \src, \s_strd, \xmy, \src // rewind by the block height
   2075        msub            \dst, \d_strd, \xmy, \dst // (\xmy presumably the 64-bit view of \my - confirm)
   2076        sub             \src, \src, \s_strd, lsl #3 // also rewind the extra context/look-ahead rows
   2077        mov             \h,  \my                // restore full height
   2078        add             \src, \src, #16         // advance 8 pixels (16-bit pixels)
   2079        add             \dst, \dst, #16
   2080        b               168b
   2081 0:
   2082        ret
   2083 
   2084 160:
   2085        AARCH64_VALID_JUMP_TARGET
   2086        b.gt            1680b                   // heights > 4 use the general loop
   2087 
   2088        // 16x2, 16x4 v
               // 16-wide, 4-tap vertical filter; one output row per iteration.
               // Each input row occupies a register pair (v16/v17, v18/v19, ...),
               // which is why the tap inputs below step by two registers.
   2089        ldur            s0,  [\xmy, #2]         // middle 4 taps of the 8-tap entry
   2090        sub             \src, \src, \s_strd     // one row of context above the block
   2091        sxtl            v0.8h,   v0.8b          // widen s8 coefficients to s16
   2092 
   2093        load_16h        \src, \src, \s_strd, v16, v17, v18, v19, v20, v21
   2094 16:
   2095        load_16h        \src, \src, \s_strd, v22, v23
   2096        subs            \h,  \h,  #1
   2097        smull_smlal_4tap   v1, v16, v18, v20, v22
   2098        smull2_smlal2_4tap v2, v16, v18, v20, v22
   2099        smull_smlal_4tap   v3, v17, v19, v21, v23
   2100        smull2_smlal2_4tap v4, v17, v19, v21, v23
   2101        shift_store_16  \type, \d_strd, x0, v1, v2, v3, v4
   2102        b.le            0f
               // Slide the window down by one row (one register pair).
   2103        mov             v16.16b, v18.16b
   2104        mov             v17.16b, v19.16b
   2105        mov             v18.16b, v20.16b
   2106        mov             v19.16b, v21.16b
   2107        mov             v20.16b, v22.16b
   2108        mov             v21.16b, v23.16b
   2109        b               16b
   2110 0:
   2111        ret
   2112 endfunc
   2113 
   2114 jumptable \type\()_\taps\()_v_tbl
               // Width-indexed dispatch table for the vertical-only function,
               // ordered from widest (128) to narrowest (2); the order must
               // match the jump index computed by the caller.
   2115        .word 1280b - \type\()_\taps\()_v_tbl
   2116        .word 640b  - \type\()_\taps\()_v_tbl
   2117        .word 320b  - \type\()_\taps\()_v_tbl
   2118        .word 160b  - \type\()_\taps\()_v_tbl
   2119        .word 80b   - \type\()_\taps\()_v_tbl
   2120        .word 40b   - \type\()_\taps\()_v_tbl
   2121        .word 20b   - \type\()_\taps\()_v_tbl
endjumptable
   2123 
   2124 function L(\type\()_\taps\()_hv)
               // Combined horizontal+vertical (2-D) subpel filter entry point.
               // \my packs two 7-bit vertical filter indices: heights <= 4 use
               // the low 7 bits, taller blocks the next 7 bits.
   2125        cmp             \h,  #4
   2126        ubfx            w10, \my, #7, #7        // high 7-bit field (h > 4)
   2127        and             \my, \my, #0x7f         // low 7-bit field (h <= 4)
   2128        b.le            4f
   2129        mov             \my,  w10
   2130 4:
               // x11 presumably points at the vertical filter coefficient table,
               // 8 bytes per entry - confirm against the macro caller.
   2131        add             \xmy, x11, \my, uxtw #3
   2132 
   2133        movrel          x10, \type\()_\taps\()_hv_tbl
   2134        dup             v30.4s,  w12           // 6 - intermediate_bits
   2135        ldrsw           x9,  [x10, x9, lsl #2] // x9: width-class index (set by caller)
   2136        neg             v30.4s,  v30.4s        // -(6-intermediate_bits)
   2137 .ifc \type, put
   2138        dup             v29.4s,  w13           // 6 + intermediate_bits
   2139 .else
               // prep has no final downshift; it subtracts PREP_BIAS instead.
   2140        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
   2141 .endif
   2142        add             x10, x10, x9
   2143 .ifc \type, put
   2144        neg             v29.4s,  v29.4s        // -(6+intermediate_bits)
   2145 .endif
   2146        br              x10                    // dispatch on block width
   2147 
   2148 20:
   2149        AARCH64_VALID_JUMP_TARGET
               // 2-pixel-wide hv paths only exist for put (hence the .ifc guard).
   2150 .ifc \type, put
   2151        ldur            s0,  [\xmx, #2]         // middle 4 horizontal taps
   2152        b.gt            280f                    // heights > 4
   2153        ldur            s1,  [\xmy, #2]         // middle 4 vertical taps
   2154 
   2155        // 2x2, 2x4 hv
   2156        sub             \sr2, \src, #2          // 1 pixel of left context
   2157        sub             \src, \sr2, \s_strd     // 1 row of top context
   2158        add             \ds2, \dst, \d_strd
   2159        lsl             \s_strd, \s_strd, #1
   2160        lsl             \d_strd, \d_strd, #1
   2161        sxtl            v0.8h,   v0.8b          // widen horizontal taps to s16
   2162        sxtl            v1.8h,   v1.8b          // widen vertical taps to s16
   2163        mov             x15, x30                // save LR across the bl calls below
   2164 
               // Horizontal 4-tap filter of the single top context row, done with
               // pairwise adds: smull per shifted copy, then addp to reduce.
   2165        ld1             {v27.8h}, [\src], \s_strd
   2166        ext             v28.16b, v27.16b, v27.16b, #2
   2167        smull           v27.4s,  v27.4h,  v0.4h
   2168        smull           v28.4s,  v28.4h,  v0.4h
   2169        addp            v27.4s,  v27.4s,  v28.4s
   2170        addp            v16.4s,  v27.4s,  v27.4s
   2171        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
   2172        bl              L(\type\()_\taps\()_filter_2)
   2173        // The intermediates from the horizontal pass fit in 16 bit without
   2174        // any bias; we could just as well keep them as .4s, but narrowing
   2175        // them to .4h gives a significant speedup on out of order cores
   2176        // (at the cost of a smaller slowdown on in-order cores such as A53).
   2177        xtn             v16.4h,  v16.4s
   2178 
   2179        trn1            v16.2s,  v16.2s,  v24.2s
   2180        mov             v17.8b,  v24.8b
   2181 
   2182 2:
               // Each filter_2 call horizontally filters two more rows into v24;
               // v16/v17/v18/v24 are four consecutive row pairs for the vertical
               // 4-tap filter below.
   2183        bl              L(\type\()_\taps\()_filter_2)
   2184 
   2185        ext             v18.8b,  v17.8b,  v24.8b,  #4
   2186        smull           v2.4s,   v16.4h,  v1.h[0]
   2187        smlal           v2.4s,   v17.4h,  v1.h[1]
   2188        smlal           v2.4s,   v18.4h,  v1.h[2]
   2189        smlal           v2.4s,   v24.4h,  v1.h[3]
   2190 
   2191        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
   2192        sqxtun          v2.4h,   v2.4s           // saturate to unsigned 16 bit
   2193        umin            v2.4h,   v2.4h,   v31.4h // clamp to pixel max (v31 set by caller)
   2194        subs            \h,  \h,  #2
   2195        st1             {v2.s}[0], [\dst], \d_strd
   2196        st1             {v2.s}[1], [\ds2], \d_strd
   2197        b.le            0f
   2198        mov             v16.8b,  v18.8b
   2199        mov             v17.8b,  v24.8b
   2200        b               2b
   2201 
   2202 280:    // 2x8, 2x16, 2x32 hv
               // Tall 2-wide blocks: full \taps vertical filter, so prime a
               // 7-row-pair window (v16..v22) before entering the 28: loop.
   2203        ld1             {v1.8b},  [\xmy]        // all vertical taps
   2204        sub             \src, \src, #2          // 1 pixel of left context
   2205        sub             \sr2, \src, \s_strd, lsl #1
   2206        sub             \src, \sr2, \s_strd     // 3 rows of top context
   2207        add             \ds2, \dst, \d_strd
   2208        lsl             \s_strd, \s_strd, #1
   2209        lsl             \d_strd, \d_strd, #1
   2210        sxtl            v0.8h,   v0.8b
   2211        sxtl            v1.8h,   v1.8b
   2212        mov             x15, x30                // save LR across the bl calls
   2213 
   2214        ld1             {v27.8h}, [\src], \s_strd
   2215        ext             v28.16b, v27.16b, v27.16b, #2
   2216        smull           v27.4s,  v27.4h,  v0.4h
   2217        smull           v28.4s,  v28.4h,  v0.4h
   2218        addp            v27.4s,  v27.4s,  v28.4s
   2219        addp            v16.4s,  v27.4s,  v27.4s
   2220        srshl           v16.2s,  v16.2s,  v30.2s // -(6-intermediate_bits)
   2221        // The intermediates from the horizontal pass fit in 16 bit without
   2222        // any bias; we could just as well keep them as .4s, but narrowing
   2223        // them to .4h gives a significant speedup on out of order cores
   2224        // (at the cost of a smaller slowdown on in-order cores such as A53).
   2225 
   2226        bl              L(\type\()_\taps\()_filter_2)
   2227        xtn             v16.4h,  v16.4s
   2228        trn1            v16.2s,  v16.2s,  v24.2s
   2229        mov             v17.8b,  v24.8b
   2230        bl              L(\type\()_\taps\()_filter_2)
   2231        ext             v18.8b,  v17.8b,  v24.8b,  #4
   2232        mov             v19.8b,  v24.8b
   2233        bl              L(\type\()_\taps\()_filter_2)
   2234        ext             v20.8b,  v19.8b,  v24.8b,  #4
   2235        mov             v21.8b,  v24.8b
   2236 
   2237 28:
   2238        bl              L(\type\()_\taps\()_filter_2)
   2239        ext             v22.8b,  v21.8b,  v24.8b,  #4
   2240 .ifc \taps, 6tap
               // 6tap skips the outermost taps h[0]/h[7] of the 8-entry table.
   2241        smull           v3.4s,   v17.4h,  v1.h[1]
   2242        smlal           v3.4s,   v18.4h,  v1.h[2]
   2243        smlal           v3.4s,   v19.4h,  v1.h[3]
   2244        smlal           v3.4s,   v20.4h,  v1.h[4]
   2245        smlal           v3.4s,   v21.4h,  v1.h[5]
   2246        smlal           v3.4s,   v22.4h,  v1.h[6]
   2247 .else   // 8tap
   2248        smull           v3.4s,   v16.4h,  v1.h[0]
   2249        smlal           v3.4s,   v17.4h,  v1.h[1]
   2250        smlal           v3.4s,   v18.4h,  v1.h[2]
   2251        smlal           v3.4s,   v19.4h,  v1.h[3]
   2252        smlal           v3.4s,   v20.4h,  v1.h[4]
   2253        smlal           v3.4s,   v21.4h,  v1.h[5]
   2254        smlal           v3.4s,   v22.4h,  v1.h[6]
   2255        smlal           v3.4s,   v24.4h,  v1.h[7]
   2256 .endif
   2257 
   2258        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
   2259        sqxtun          v3.4h,   v3.4s
   2260        umin            v3.4h,   v3.4h,   v31.4h // clamp to pixel max
   2261        subs            \h,  \h,  #2
   2262        st1             {v3.s}[0], [\dst], \d_strd
   2263        st1             {v3.s}[1], [\ds2], \d_strd
   2264        b.le            0f
               // Slide the vertical window down by two rows.
   2265        mov             v16.8b,  v18.8b
   2266        mov             v17.8b,  v19.8b
   2267        mov             v18.8b,  v20.8b
   2268        mov             v19.8b,  v21.8b
   2269        mov             v20.8b,  v22.8b
   2270        mov             v21.8b,  v24.8b
   2271        b               28b
   2272 
   2273 0:
   2274        ret             x15                     // return via the saved LR
   2275 
   2276 L(\type\()_\taps\()_filter_2):
               // Horizontally filter one new row from each of \sr2 and \src
               // (advancing both by \s_strd). The trn1/trn2 pairs interleave the
               // two rows into 32-bit pixel pairs so a single smull/smlal chain
               // applies the 4 horizontal taps to both rows at once. Result:
               // v24.4h = 2 pixels x 2 rows, already shifted and narrowed.
   2277        ld1             {v25.8h},  [\sr2], \s_strd
   2278        ld1             {v27.8h},  [\src], \s_strd
   2279        ext             v26.16b, v25.16b, v25.16b, #2
   2280        ext             v28.16b, v27.16b, v27.16b, #2
   2281        trn1            v24.2s,  v25.2s,  v27.2s
   2282        trn2            v27.2s,  v25.2s,  v27.2s
   2283        trn1            v25.2s,  v26.2s,  v28.2s
   2284        trn2            v28.2s,  v26.2s,  v28.2s
   2285        smull           v24.4s,  v24.4h,  v0.h[0]
   2286        smlal           v24.4s,  v25.4h,  v0.h[1]
   2287        smlal           v24.4s,  v27.4h,  v0.h[2]
   2288        smlal           v24.4s,  v28.4h,  v0.h[3]
   2289        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
   2290        xtn             v24.4h,  v24.4s
   2291        ret
   2292 .endif
   2293 
   2294 40:
   2295        AARCH64_VALID_JUMP_TARGET
   2296        ldur            s0,  [\xmx, #2]         // middle 4 horizontal taps
   2297        b.gt            480f                    // heights > 4
   2298        ldur            s1,  [\xmy, #2]         // middle 4 vertical taps
   2299        sub             \sr2, \src, #2          // 1 pixel of left context
   2300        sub             \src, \sr2, \s_strd     // 1 row of top context
   2301        add             \ds2, \dst, \d_strd
   2302        lsl             \s_strd, \s_strd, #1
   2303        lsl             \d_strd, \d_strd, #1
   2304        sxtl            v0.8h,   v0.8b          // widen horizontal taps to s16
   2305        sxtl            v1.8h,   v1.8b          // widen vertical taps to s16
   2306        mov             x15, x30                // save LR across the bl calls
   2307 
   2308        // 4x2, 4x4 hv
               // Horizontal 4-tap filter of the top context row; ext #2/#4/#6
               // shift by 1, 2, 3 pixels (16-bit pixels) to line up the taps.
   2309        ld1             {v25.8h}, [\src], \s_strd
   2310        ext             v26.16b, v25.16b, v25.16b, #2
   2311        ext             v27.16b, v25.16b, v25.16b, #4
   2312        ext             v28.16b, v25.16b, v25.16b, #6
   2313        smull           v25.4s,  v25.4h,  v0.h[0]
   2314        smlal           v25.4s,  v26.4h,  v0.h[1]
   2315        smlal           v25.4s,  v27.4h,  v0.h[2]
   2316        smlal           v25.4s,  v28.4h,  v0.h[3]
   2317        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
   2318        // The intermediates from the horizontal pass fit in 16 bit without
   2319        // any bias; we could just as well keep them as .4s, but narrowing
   2320        // them to .4h gives a significant speedup on out of order cores
   2321        // (at the cost of a smaller slowdown on in-order cores such as A53).
   2322        xtn             v16.4h,  v16.4s
   2323 
   2324        bl              L(\type\()_\taps\()_filter_4)
   2325        mov             v17.8b,  v24.8b
   2326        mov             v18.8b,  v25.8b
   2327 
   2328 4:
               // filter_4 yields two new horizontally-filtered rows (v24, v25);
               // apply the vertical 4-tap filter to produce two output rows.
   2329        bl              L(\type\()_\taps\()_filter_4)
   2330        smull           v2.4s,   v16.4h,  v1.h[0]
   2331        smlal           v2.4s,   v17.4h,  v1.h[1]
   2332        smlal           v2.4s,   v18.4h,  v1.h[2]
   2333        smlal           v2.4s,   v24.4h,  v1.h[3]
   2334        smull           v3.4s,   v17.4h,  v1.h[0]
   2335        smlal           v3.4s,   v18.4h,  v1.h[1]
   2336        smlal           v3.4s,   v24.4h,  v1.h[2]
   2337        smlal           v3.4s,   v25.4h,  v1.h[3]
   2338 .ifc \type, put
   2339        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
   2340        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
   2341        sqxtun          v2.4h,   v2.4s
   2342        sqxtun2         v2.8h,   v3.4s
   2343        umin            v2.8h,   v2.8h,   v31.8h // clamp to pixel max
   2344 .else
   2345        rshrn           v2.4h,   v2.4s,   #6
   2346        rshrn2          v2.8h,   v3.4s,   #6
   2347        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
   2348 .endif
   2349        subs            \h,  \h,  #2
   2350 
   2351        st1             {v2.8b},   [\dst], \d_strd
   2352        st1             {v2.d}[1], [\ds2], \d_strd
   2353        b.le            0f
               // Slide the vertical window down by two rows.
   2354        mov             v16.8b,  v18.8b
   2355        mov             v17.8b,  v24.8b
   2356        mov             v18.8b,  v25.8b
   2357        b               4b
   2358 
   2359 480:    // 4x8, 4x16, 4x32 hv
               // Tall 4-wide blocks: full \taps vertical filter. The 6tap
               // variant needs one less row of top context (2 rows vs 3) and
               // skips taps h[0]/h[7] in the vertical pass.
   2360        ld1             {v1.8b},  [\xmy]        // all vertical taps
   2361        sub             \src, \src, #2          // 1 pixel of left context
   2362 .ifc \taps, 6tap
   2363        sub             \sr2, \src, \s_strd
   2364        sub             \src, \src, \s_strd, lsl #1
   2365 .else
   2366        sub             \sr2, \src, \s_strd, lsl #1
   2367        sub             \src, \sr2, \s_strd
   2368 .endif
   2369        add             \ds2, \dst, \d_strd
   2370        lsl             \s_strd, \s_strd, #1
   2371        lsl             \d_strd, \d_strd, #1
   2372        sxtl            v0.8h,   v0.8b
   2373        sxtl            v1.8h,   v1.8b
   2374        mov             x15, x30                // save LR across the bl calls
   2375 
               // Horizontal 4-tap filter of the first context row.
   2376        ld1             {v25.8h}, [\src], \s_strd
   2377        ext             v26.16b, v25.16b, v25.16b, #2
   2378        ext             v27.16b, v25.16b, v25.16b, #4
   2379        ext             v28.16b, v25.16b, v25.16b, #6
   2380        smull           v25.4s,  v25.4h,  v0.h[0]
   2381        smlal           v25.4s,  v26.4h,  v0.h[1]
   2382        smlal           v25.4s,  v27.4h,  v0.h[2]
   2383        smlal           v25.4s,  v28.4h,  v0.h[3]
   2384        srshl           v16.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
   2385        // The intermediates from the horizontal pass fit in 16 bit without
   2386        // any bias; we could just as well keep them as .4s, but narrowing
   2387        // them to .4h gives a significant speedup on out of order cores
   2388        // (at the cost of a smaller slowdown on in-order cores such as A53).
   2389 .ifc \taps, 6tap
   2390        xtn             v18.4h,  v16.4s
   2391 .else
   2392        xtn             v16.4h,  v16.4s
   2393 
   2394        bl              L(\type\()_\taps\()_filter_4)
   2395        mov             v17.8b,  v24.8b
   2396        mov             v18.8b,  v25.8b
   2397 .endif
   2398        bl              L(\type\()_\taps\()_filter_4)
   2399        mov             v19.8b,  v24.8b
   2400        mov             v20.8b,  v25.8b
   2401        bl              L(\type\()_\taps\()_filter_4)
   2402        mov             v21.8b,  v24.8b
   2403        mov             v22.8b,  v25.8b
   2404 
   2405 48:
               // Two new horizontally-filtered rows per iteration (v24, v25);
               // vertical \taps filter over the v16..v22 (+v24/v25) window.
   2406        bl              L(\type\()_\taps\()_filter_4)
   2407 .ifc \taps, 6tap
   2408        smull           v3.4s,   v18.4h,  v1.h[1]
   2409        smlal           v3.4s,   v19.4h,  v1.h[2]
   2410        smlal           v3.4s,   v20.4h,  v1.h[3]
   2411        smlal           v3.4s,   v21.4h,  v1.h[4]
   2412        smlal           v3.4s,   v22.4h,  v1.h[5]
   2413        smlal           v3.4s,   v24.4h,  v1.h[6]
   2414        smull           v4.4s,   v19.4h,  v1.h[1]
   2415        smlal           v4.4s,   v20.4h,  v1.h[2]
   2416        smlal           v4.4s,   v21.4h,  v1.h[3]
   2417        smlal           v4.4s,   v22.4h,  v1.h[4]
   2418        smlal           v4.4s,   v24.4h,  v1.h[5]
   2419        smlal           v4.4s,   v25.4h,  v1.h[6]
   2420 .else   // 8tap
   2421        smull           v3.4s,   v16.4h,  v1.h[0]
   2422        smlal           v3.4s,   v17.4h,  v1.h[1]
   2423        smlal           v3.4s,   v18.4h,  v1.h[2]
   2424        smlal           v3.4s,   v19.4h,  v1.h[3]
   2425        smlal           v3.4s,   v20.4h,  v1.h[4]
   2426        smlal           v3.4s,   v21.4h,  v1.h[5]
   2427        smlal           v3.4s,   v22.4h,  v1.h[6]
   2428        smlal           v3.4s,   v24.4h,  v1.h[7]
   2429        smull           v4.4s,   v17.4h,  v1.h[0]
   2430        smlal           v4.4s,   v18.4h,  v1.h[1]
   2431        smlal           v4.4s,   v19.4h,  v1.h[2]
   2432        smlal           v4.4s,   v20.4h,  v1.h[3]
   2433        smlal           v4.4s,   v21.4h,  v1.h[4]
   2434        smlal           v4.4s,   v22.4h,  v1.h[5]
   2435        smlal           v4.4s,   v24.4h,  v1.h[6]
   2436        smlal           v4.4s,   v25.4h,  v1.h[7]
   2437 .endif
   2438 .ifc \type, put
   2439        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
   2440        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
   2441        sqxtun          v3.4h,   v3.4s
   2442        sqxtun2         v3.8h,   v4.4s
   2443        umin            v3.8h,   v3.8h,   v31.8h // clamp to pixel max
   2444 .else
   2445        rshrn           v3.4h,   v3.4s,   #6
   2446        rshrn2          v3.8h,   v4.4s,   #6
   2447        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
   2448 .endif
   2449        subs            \h,  \h,  #2
   2450        st1             {v3.8b},   [\dst], \d_strd
   2451        st1             {v3.d}[1], [\ds2], \d_strd
   2452        b.le            0f
               // Slide the vertical window down by two rows (6tap never reads
               // v16/v17, so it skips those moves).
   2453 .ifc \taps, 8tap
   2454        mov             v16.8b,  v18.8b
   2455        mov             v17.8b,  v19.8b
   2456 .endif
   2457        mov             v18.8b,  v20.8b
   2458        mov             v19.8b,  v21.8b
   2459        mov             v20.8b,  v22.8b
   2460        mov             v21.8b,  v24.8b
   2461        mov             v22.8b,  v25.8b
   2462        b               48b
   2463 0:
   2464        ret             x15                     // return via the saved LR
   2465 
L(\type\()_\taps\()_filter_4):
               // Horizontally filter (4 taps) one new row from each of \sr2 and
               // \src, advancing both by \s_strd. Outputs: v24.4h (row from
               // \sr2) and v25.4h (row from \src), shifted by
               // -(6-intermediate_bits) and narrowed to 16 bit.
   2467        ld1             {v24.8h}, [\sr2], \s_strd
   2468        ld1             {v25.8h}, [\src], \s_strd
   2469        ext             v26.16b, v24.16b, v24.16b, #2
   2470        ext             v27.16b, v24.16b, v24.16b, #4
   2471        ext             v28.16b, v24.16b, v24.16b, #6
   2472        smull           v24.4s,  v24.4h,  v0.h[0]
   2473        smlal           v24.4s,  v26.4h,  v0.h[1]
   2474        smlal           v24.4s,  v27.4h,  v0.h[2]
   2475        smlal           v24.4s,  v28.4h,  v0.h[3]
   2476        ext             v26.16b, v25.16b, v25.16b, #2
   2477        ext             v27.16b, v25.16b, v25.16b, #4
   2478        ext             v28.16b, v25.16b, v25.16b, #6
   2479        smull           v25.4s,  v25.4h,  v0.h[0]
   2480        smlal           v25.4s,  v26.4h,  v0.h[1]
   2481        smlal           v25.4s,  v27.4h,  v0.h[2]
   2482        smlal           v25.4s,  v28.4h,  v0.h[3]
   2483        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
   2484        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
   2485        xtn             v24.4h,  v24.4s
   2486        xtn             v25.4h,  v25.4s
   2487        ret
   2488 
   2489 80:
   2490 160:
   2491 320:
   2492        AARCH64_VALID_JUMP_TARGET
   2493        b.gt            880f                    // heights > 4 use the general loop
               // 8/16/32-wide, height <= 4: full \taps horizontal filter,
               // 4-tap vertical filter, processed as 8-pixel-wide strips.
   2494        ld1             {v0.8b},  [\xmx]        // all horizontal taps
   2495        ldur            s1,  [\xmy, #2]         // middle 4 vertical taps
   2496 .ifc \taps, 6tap
               // 6tap uses taps 1..6, so one less pixel of left context.
   2497        sub             \src,  \src,  #4
   2498 .else
   2499        sub             \src,  \src,  #6        // 3 pixels of left context
   2500 .endif
   2501        sub             \src,  \src,  \s_strd   // 1 row of top context
   2502        sxtl            v0.8h,   v0.8b
   2503        sxtl            v1.8h,   v1.8b
   2504        mov             x15, x30                // save LR across the bl calls
   2505        mov             \my, \h                 // save height for per-strip restart
   2506 
   2507 164:    // 8x2, 8x4, 16x2, 16x4, 32x2, 32x4 hv
   2508        add             \ds2,  \dst,  \d_strd
   2509        add             \sr2,  \src,  \s_strd
   2510        lsl             \d_strd, \d_strd, #1
   2511        lsl             \s_strd, \s_strd, #1
   2512 
               // Horizontal \taps filter of the top context row; the .irpc loop
               // generates one ext/smlal pair per tap position.
   2513        ld1             {v27.8h, v28.8h},  [\src], \s_strd
   2514 .ifc \taps, 6tap
   2515        smull           v24.4s,  v27.4h,  v0.h[1]
   2516        smull2          v25.4s,  v27.8h,  v0.h[1]
   2517    .irpc i, 23456
   2518        ext             v26.16b, v27.16b, v28.16b, #(2*\i-2)
   2519        smlal           v24.4s,  v26.4h,  v0.h[\i]
   2520        smlal2          v25.4s,  v26.8h,  v0.h[\i]
   2521    .endr
   2522 .else
   2523        smull           v24.4s,  v27.4h,  v0.h[0]
   2524        smull2          v25.4s,  v27.8h,  v0.h[0]
   2525    .irpc i, 1234567
   2526        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
   2527        smlal           v24.4s,  v26.4h,  v0.h[\i]
   2528        smlal2          v25.4s,  v26.8h,  v0.h[\i]
   2529    .endr
   2530 .endif
   2531        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
   2532        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
   2533        // The intermediates from the horizontal pass fit in 16 bit without
   2534        // any bias; we could just as well keep them as .4s, but narrowing
   2535        // them to .4h gives a significant speedup on out of order cores
   2536        // (at the cost of a smaller slowdown on in-order cores such as A53),
   2537        // and conserves register space (no need to clobber v8-v15).
   2538        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
   2539 
   2540        bl              L(\type\()_\taps\()_filter_8)
   2541        mov             v17.16b, v23.16b
   2542        mov             v18.16b, v24.16b
   2543 
   2544 8:
               // filter_8 yields two new rows (v23, v24); 4-tap vertical filter
               // over v16..v18 plus the new rows, two output rows per iteration.
   2545        smull           v2.4s,   v16.4h,  v1.h[0]
   2546        smull2          v3.4s,   v16.8h,  v1.h[0]
   2547        bl              L(\type\()_\taps\()_filter_8)
   2548        smull           v4.4s,   v17.4h,  v1.h[0]
   2549        smull2          v5.4s,   v17.8h,  v1.h[0]
   2550        smlal           v2.4s,   v17.4h,  v1.h[1]
   2551        smlal2          v3.4s,   v17.8h,  v1.h[1]
   2552        smlal           v4.4s,   v18.4h,  v1.h[1]
   2553        smlal2          v5.4s,   v18.8h,  v1.h[1]
   2554        smlal           v2.4s,   v18.4h,  v1.h[2]
   2555        smlal2          v3.4s,   v18.8h,  v1.h[2]
   2556        smlal           v4.4s,   v23.4h,  v1.h[2]
   2557        smlal2          v5.4s,   v23.8h,  v1.h[2]
   2558        smlal           v2.4s,   v23.4h,  v1.h[3]
   2559        smlal2          v3.4s,   v23.8h,  v1.h[3]
   2560        smlal           v4.4s,   v24.4h,  v1.h[3]
   2561        smlal2          v5.4s,   v24.8h,  v1.h[3]
   2562 .ifc \type, put
   2563        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
   2564        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
   2565        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
   2566        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
   2567        sqxtun          v2.4h,   v2.4s
   2568        sqxtun2         v2.8h,   v3.4s
   2569        sqxtun          v3.4h,   v4.4s
   2570        sqxtun2         v3.8h,   v5.4s
   2571        umin            v2.8h,   v2.8h,   v31.8h // clamp to pixel max
   2572        umin            v3.8h,   v3.8h,   v31.8h
   2573 .else
   2574        rshrn           v2.4h,   v2.4s,   #6
   2575        rshrn2          v2.8h,   v3.4s,   #6
   2576        rshrn           v3.4h,   v4.4s,   #6
   2577        rshrn2          v3.8h,   v5.4s,   #6
   2578        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
   2579        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
   2580 .endif
   2581        subs            \h,  \h,  #2
   2582        st1             {v2.8h}, [\dst], \d_strd
   2583        st1             {v3.8h}, [\ds2], \d_strd
   2584        b.le            9f
   2585        mov             v16.16b, v18.16b
   2586        mov             v17.16b, v23.16b
   2587        mov             v18.16b, v24.16b
   2588        b               8b
   2589 9:
               // Finished one 8-pixel-wide strip; move right for w > 8.
   2590        subs            \w,  \w,  #8
   2591        b.le            0f
   2592        asr             \s_strd,  \s_strd,  #1  // undo the doubling from 164:
   2593        asr             \d_strd,  \d_strd,  #1
   2594        msub            \src,  \s_strd,  \xmy,  \src // rewind by the block height
   2595        msub            \dst,  \d_strd,  \xmy,  \dst // (\xmy presumably aliases \my - confirm)
   2596        sub             \src,  \src,  \s_strd,  lsl #2 // also rewind context/look-ahead rows
   2597        mov             \h,  \my                // restore full height
   2598        add             \src,  \src,  #16       // advance 8 pixels (16-bit pixels)
   2599        add             \dst,  \dst,  #16
   2600        b               164b
   2601 
   2602 880:    // 8x8, 8x16, ..., 16x8, ..., 32x8, ... hv
   2603 640:
   2604 1280:
   2605        AARCH64_VALID_JUMP_TARGET
   2606        ld1             {v0.8b},  [\xmx]
   2607        ld1             {v1.8b},  [\xmy]
   2608 .ifc \taps, 6tap
   2609        sub             \src,  \src,  #4
   2610 .else
   2611        sub             \src,  \src,  #6
   2612        sub             \src,  \src,  \s_strd
   2613 .endif
   2614        sub             \src,  \src,  \s_strd, lsl #1
   2615        sxtl            v0.8h,   v0.8b
   2616        sxtl            v1.8h,   v1.8b
   2617        mov             x15, x30
   2618        mov             \my, \h
   2619 
   2620 168:
   2621        add             \ds2,  \dst,  \d_strd
   2622        add             \sr2,  \src,  \s_strd
   2623        lsl             \d_strd, \d_strd, #1
   2624        lsl             \s_strd, \s_strd, #1
   2625 
   2626        ld1             {v27.8h, v28.8h},  [\src], \s_strd
   2627 .ifc \taps, 6tap
   2628        smull           v24.4s,  v27.4h,  v0.h[1]
   2629        smull2          v25.4s,  v27.8h,  v0.h[1]
   2630    .irpc i, 23456
   2631        ext             v26.16b, v27.16b, v28.16b, #(2*\i-2)
   2632        smlal           v24.4s,  v26.4h,  v0.h[\i]
   2633        smlal2          v25.4s,  v26.8h,  v0.h[\i]
   2634    .endr
   2635 .else   // 8tap
   2636        smull           v24.4s,  v27.4h,  v0.h[0]
   2637        smull2          v25.4s,  v27.8h,  v0.h[0]
   2638    .irpc i, 1234567
   2639        ext             v26.16b, v27.16b, v28.16b, #(2*\i)
   2640        smlal           v24.4s,  v26.4h,  v0.h[\i]
   2641        smlal2          v25.4s,  v26.8h,  v0.h[\i]
   2642    .endr
   2643 .endif
   2644        srshl           v24.4s,  v24.4s,  v30.4s // -(6-intermediate_bits)
   2645        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
   2646        // The intermediates from the horizontal pass fit in 16 bit without
   2647        // any bias; we could just as well keep them as .4s, but narrowing
   2648        // them to .4h gives a significant speedup on out of order cores
   2649        // (at the cost of a smaller slowdown on in-order cores such as A53),
   2650        // and conserves register space (no need to clobber v8-v15).
   2651 .ifc \taps, 6tap
   2652        uzp1            v18.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
   2653 .else
   2654        uzp1            v16.8h,  v24.8h,  v25.8h // Same as xtn, xtn2
   2655 
   2656        bl              L(\type\()_\taps\()_filter_8)
   2657        mov             v17.16b, v23.16b
   2658        mov             v18.16b, v24.16b
   2659 .endif
   2660        bl              L(\type\()_\taps\()_filter_8)
        // (Continuation of the 8-pixel-wide h+v path; the function entry is
        // earlier in the file.  v16-v22 hold horizontally filtered rows from
        // the priming code above; each call to ..._filter_8 returns two more
        // filtered rows in v23/v24.)
        mov             v19.16b, v23.16b
        mov             v20.16b, v24.16b
        bl              L(\type\()_\taps\()_filter_8)
        mov             v21.16b, v23.16b
        mov             v22.16b, v24.16b

88:
.ifc \taps, 6tap
        // Vertical 6-tap MAC over rows v18..v24 using coefficients
        // v1.h[1..6].  v2/v3 accumulate the first output row (low/high
        // halves), v4/v5 the second row, one source row further down.
        smull           v2.4s,   v18.4h,  v1.h[1]
        smull2          v3.4s,   v18.8h,  v1.h[1]
        bl              L(\type\()_\taps\()_filter_8)
        smull           v4.4s,   v19.4h,  v1.h[1]
        smull2          v5.4s,   v19.8h,  v1.h[1]
        smlal           v2.4s,   v19.4h,  v1.h[2]
        smlal2          v3.4s,   v19.8h,  v1.h[2]
        smlal           v4.4s,   v20.4h,  v1.h[2]
        smlal2          v5.4s,   v20.8h,  v1.h[2]
        smlal           v2.4s,   v20.4h,  v1.h[3]
        smlal2          v3.4s,   v20.8h,  v1.h[3]
        smlal           v4.4s,   v21.4h,  v1.h[3]
        smlal2          v5.4s,   v21.8h,  v1.h[3]
        smlal           v2.4s,   v21.4h,  v1.h[4]
        smlal2          v3.4s,   v21.8h,  v1.h[4]
        smlal           v4.4s,   v22.4h,  v1.h[4]
        smlal2          v5.4s,   v22.8h,  v1.h[4]
        smlal           v2.4s,   v22.4h,  v1.h[5]
        smlal2          v3.4s,   v22.8h,  v1.h[5]
        smlal           v4.4s,   v23.4h,  v1.h[5]
        smlal2          v5.4s,   v23.8h,  v1.h[5]
        smlal           v2.4s,   v23.4h,  v1.h[6]
        smlal2          v3.4s,   v23.8h,  v1.h[6]
        smlal           v4.4s,   v24.4h,  v1.h[6]
        smlal2          v5.4s,   v24.8h,  v1.h[6]
.else   // 8tap
        // Vertical 8-tap MAC over rows v16..v24 using coefficients
        // v1.h[0..7]; same accumulator layout as the 6-tap branch.
        smull           v2.4s,   v16.4h,  v1.h[0]
        smull2          v3.4s,   v16.8h,  v1.h[0]
        bl              L(\type\()_\taps\()_filter_8)
        smull           v4.4s,   v17.4h,  v1.h[0]
        smull2          v5.4s,   v17.8h,  v1.h[0]
        smlal           v2.4s,   v17.4h,  v1.h[1]
        smlal2          v3.4s,   v17.8h,  v1.h[1]
        smlal           v4.4s,   v18.4h,  v1.h[1]
        smlal2          v5.4s,   v18.8h,  v1.h[1]
        smlal           v2.4s,   v18.4h,  v1.h[2]
        smlal2          v3.4s,   v18.8h,  v1.h[2]
        smlal           v4.4s,   v19.4h,  v1.h[2]
        smlal2          v5.4s,   v19.8h,  v1.h[2]
        smlal           v2.4s,   v19.4h,  v1.h[3]
        smlal2          v3.4s,   v19.8h,  v1.h[3]
        smlal           v4.4s,   v20.4h,  v1.h[3]
        smlal2          v5.4s,   v20.8h,  v1.h[3]
        smlal           v2.4s,   v20.4h,  v1.h[4]
        smlal2          v3.4s,   v20.8h,  v1.h[4]
        smlal           v4.4s,   v21.4h,  v1.h[4]
        smlal2          v5.4s,   v21.8h,  v1.h[4]
        smlal           v2.4s,   v21.4h,  v1.h[5]
        smlal2          v3.4s,   v21.8h,  v1.h[5]
        smlal           v4.4s,   v22.4h,  v1.h[5]
        smlal2          v5.4s,   v22.8h,  v1.h[5]
        smlal           v2.4s,   v22.4h,  v1.h[6]
        smlal2          v3.4s,   v22.8h,  v1.h[6]
        smlal           v4.4s,   v23.4h,  v1.h[6]
        smlal2          v5.4s,   v23.8h,  v1.h[6]
        smlal           v2.4s,   v23.4h,  v1.h[7]
        smlal2          v3.4s,   v23.8h,  v1.h[7]
        smlal           v4.4s,   v24.4h,  v1.h[7]
        smlal2          v5.4s,   v24.8h,  v1.h[7]
.endif
.ifc \type, put
        // put: scale down, narrow with unsigned saturation, clamp to
        // bitdepth_max (held in v31).
        srshl           v2.4s,   v2.4s,   v29.4s // -(6+intermediate_bits)
        srshl           v3.4s,   v3.4s,   v29.4s // -(6+intermediate_bits)
        srshl           v4.4s,   v4.4s,   v29.4s // -(6+intermediate_bits)
        srshl           v5.4s,   v5.4s,   v29.4s // -(6+intermediate_bits)
        sqxtun          v2.4h,   v2.4s
        sqxtun2         v2.8h,   v3.4s
        sqxtun          v3.4h,   v4.4s
        sqxtun2         v3.8h,   v5.4s
        umin            v2.8h,   v2.8h,   v31.8h
        umin            v3.8h,   v3.8h,   v31.8h
.else
        // prep: round-narrow by 6 and remove the bias (v29 = PREP_BIAS).
        rshrn           v2.4h,   v2.4s,   #6
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        sub             v2.8h,   v2.8h,   v29.8h // PREP_BIAS
        sub             v3.8h,   v3.8h,   v29.8h // PREP_BIAS
.endif
        subs            \h,  \h,  #2             // two output rows per iteration
        st1             {v2.8h}, [\dst], \d_strd
        st1             {v3.8h}, [\ds2], \d_strd
        b.le            9f
        // Slide the vertical row window down by two rows and loop.
.ifc \taps, 8tap
        mov             v16.16b, v18.16b
        mov             v17.16b, v19.16b
.endif
        mov             v18.16b, v20.16b
        mov             v19.16b, v21.16b
        mov             v20.16b, v22.16b
        mov             v21.16b, v23.16b
        mov             v22.16b, v24.16b
        b               88b
9:
        // Column done; move 8 pixels (16 bytes) to the right and restart
        // from the top of the block for the next column.
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd,  \s_strd,  #1   // undo the earlier stride doubling
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src // src -= s_strd * \xmy
        msub            \dst,  \d_strd,  \xmy,  \dst // dst -= d_strd * \xmy
        sub             \src,  \src,  \s_strd,  lsl #3
        mov             \h,  \my                 // reload row count
        add             \src,  \src,  #16
        add             \dst,  \dst,  #16
.ifc \taps, 6tap
        // 6-tap needs one less row of context above; skip two rows.
        add             \src,  \src,  \s_strd,  lsl #1
.endif
        b               168b                     // label 168 is earlier in this function
0:
        ret             x15
   2779 
// Horizontal filter helper for the 8-pixel-wide h+v path: loads one row
// each from \sr2 and \src (advancing both by \s_strd), applies the 6- or
// 8-tap horizontal filter with the coefficients in v0, scales down and
// returns the two intermediate 16-bit rows in v23 and v24.
// Clobbers v4-v7 and v25-v28.
L(\type\()_\taps\()_filter_8):
        ld1             {v4.8h, v5.8h},  [\sr2], \s_strd
        ld1             {v6.8h, v7.8h},  [\src], \s_strd
.ifc \taps, 6tap
        // 6-tap: only coefficients v0.h[1..6] are used.
        smull           v25.4s,  v4.4h,   v0.h[1]
        smull2          v26.4s,  v4.8h,   v0.h[1]
        smull           v27.4s,  v6.4h,   v0.h[1]
        smull2          v28.4s,  v6.8h,   v0.h[1]
.irpc i, 23456
        // ext slides the source window one pixel (2 bytes) per tap.
        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i-2)
        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i-2)
        smlal           v25.4s,  v23.4h,  v0.h[\i]
        smlal2          v26.4s,  v23.8h,  v0.h[\i]
        smlal           v27.4s,  v24.4h,  v0.h[\i]
        smlal2          v28.4s,  v24.8h,  v0.h[\i]
.endr
.else   // 8tap
        // 8-tap: all coefficients v0.h[0..7] are used.
        smull           v25.4s,  v4.4h,   v0.h[0]
        smull2          v26.4s,  v4.8h,   v0.h[0]
        smull           v27.4s,  v6.4h,   v0.h[0]
        smull2          v28.4s,  v6.8h,   v0.h[0]
.irpc i, 1234567
        ext             v23.16b, v4.16b,  v5.16b,  #(2*\i)
        ext             v24.16b, v6.16b,  v7.16b,  #(2*\i)
        smlal           v25.4s,  v23.4h,  v0.h[\i]
        smlal2          v26.4s,  v23.8h,  v0.h[\i]
        smlal           v27.4s,  v24.4h,  v0.h[\i]
        smlal2          v28.4s,  v24.8h,  v0.h[\i]
.endr
.endif
        srshl           v25.4s,  v25.4s,  v30.4s // -(6-intermediate_bits)
        srshl           v26.4s,  v26.4s,  v30.4s // -(6-intermediate_bits)
        srshl           v27.4s,  v27.4s,  v30.4s // -(6-intermediate_bits)
        srshl           v28.4s,  v28.4s,  v30.4s // -(6-intermediate_bits)
        uzp1            v23.8h,  v25.8h,  v26.8h // Same as xtn, xtn2
        uzp1            v24.8h,  v27.8h,  v28.8h // Ditto
        ret
endfunc
   2818 
// Per-width entry points for the h+v path, stored as offsets from the
// table base (largest width first); indexed by the log2(width)-derived
// value computed in the function prologue.
jumptable \type\()_\taps\()_hv_tbl
        .word 1280b - \type\()_\taps\()_hv_tbl
        .word 640b  - \type\()_\taps\()_hv_tbl
        .word 320b  - \type\()_\taps\()_hv_tbl
        .word 160b  - \type\()_\taps\()_hv_tbl
        .word 80b   - \type\()_\taps\()_hv_tbl
        .word 40b   - \type\()_\taps\()_hv_tbl
        .word 20b   - \type\()_\taps\()_hv_tbl
endjumptable
   2828 .endm
   2829 
   2830 
// Bilinear (2-tap) motion compensation for 16 bpc.  \mx/\my are the
// subpel fractions in [0, 15]; dispatches to h-only, v-only, h+v, or a
// plain copy when both fractions are zero.
.macro filter_bilin_fn type, dst, d_strd, src, s_strd, w, h, mx, xmx, my, xmy, bdmax, ds2, sr2
function \type\()_bilin_16bpc_neon, export=1
.ifc \bdmax, w8
        ldr             w8,  [sp]        // bitdepth_max passed on the stack
.endif
        // Bilinear weights: v0 = 16-mx, v1 = mx, v2 = 16-my, v3 = my.
        dup             v1.8h,   \mx
        dup             v3.8h,   \my
        mov             w10, #16
        sub             w9,  w10, \mx
        sub             w10, w10, \my
        dup             v0.8h,   w9
        dup             v2.8h,   w10
.ifc \type, prep
        // prep writes to a packed buffer: d_strd = 2*w bytes.
        uxtw            \d_strd, \w
        lsl             \d_strd, \d_strd, #1
.endif

        clz             \bdmax,   \bdmax       // bitdepth_max
        clz             w9,  \w
        sub             \bdmax,   \bdmax,  #18 // intermediate_bits = clz(bitdepth_max) - 18
        mov             w11, #4
        sub             w9,  w9,  #24          // jump table index derived from log2(w)
        sub             w11, w11, \bdmax  // 4 - intermediate_bits
        add             w12, \bdmax, #4   // 4 + intermediate_bits
        cbnz            \mx, L(\type\()_bilin_h)
        cbnz            \my, L(\type\()_bilin_v)
        b               \type\()_16bpc_neon    // mx == my == 0: plain copy

L(\type\()_bilin_h):
        cbnz            \my, L(\type\()_bilin_hv)

        // Horizontal-only: dispatch on width via the jump table.
        movrel          x10, \type\()_bilin_h_tbl
        dup             v31.8h,  w11      // 4 - intermediate_bits
        ldrsw           x9,  [x10, x9, lsl #2]
        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.8h,  \bdmax   // intermediate_bits
.else
        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
.endif
        add             x10, x10, x9
.ifc \type, put
        neg             v30.8h,  v30.8h   // -intermediate_bits
.endif
        br              x10

20:     // 2xN h
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        // Process two rows per iteration; ext makes the x+1 neighbours.
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
2:
        ld1             {v4.4h},  [\src], \s_strd
        ld1             {v6.4h},  [\sr2], \s_strd
        ext             v5.8b,   v4.8b,   v4.8b,   #2
        ext             v7.8b,   v6.8b,   v6.8b,   #2
        trn1            v4.2s,   v4.2s,   v6.2s    // pack both rows into one vector
        trn1            v5.2s,   v5.2s,   v7.2s
        subs            \h,  \h,  #2
        mul             v4.4h,   v4.4h,   v0.4h    // (16-mx) * px[x]
        mla             v4.4h,   v5.4h,   v1.4h    // + mx * px[x+1]
        urshl           v4.4h,   v4.4h,   v31.4h
        urshl           v4.4h,   v4.4h,   v30.4h
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.gt            2b
        ret
.endif

40:     // 4xN h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
4:
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b,  v4.16b,  v4.16b,  #2
        ext             v7.16b,  v6.16b,  v6.16b,  #2
        trn1            v4.2d,   v4.2d,   v6.2d    // two 4-pixel rows per vector
        trn1            v5.2d,   v5.2d,   v7.2d
        subs            \h,  \h,  #2
        mul             v4.8h,   v4.8h,   v0.8h
        mla             v4.8h,   v5.8h,   v1.8h
        urshl           v4.8h,   v4.8h,   v31.8h
.ifc \type, put
        urshl           v4.8h,   v4.8h,   v30.8h
.else
        sub             v4.8h,   v4.8h,   v29.8h   // PREP_BIAS
.endif
        st1             {v4.8b},   [\dst], \d_strd // 8 bytes = 4 pixels
        st1             {v4.d}[1], [\ds2], \d_strd
        b.gt            4b
        ret

80:     // 8xN h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \d_strd,  \d_strd,  #1
        lsl             \s_strd,  \s_strd,  #1
8:
        // Load one extra pixel (h5/h7) for the x+1 window of the last lane.
        ldr             h5,  [\src, #16]
        ldr             h7,  [\sr2, #16]
        ld1             {v4.8h}, [\src], \s_strd
        ld1             {v6.8h}, [\sr2], \s_strd
        ext             v5.16b,  v4.16b,  v5.16b,  #2
        ext             v7.16b,  v6.16b,  v7.16b,  #2
        subs            \h,  \h,  #2
        mul             v4.8h,   v4.8h,   v0.8h
        mla             v4.8h,   v5.8h,   v1.8h
        mul             v6.8h,   v6.8h,   v0.8h
        mla             v6.8h,   v7.8h,   v1.8h
        urshl           v4.8h,   v4.8h,   v31.8h
        urshl           v6.8h,   v6.8h,   v31.8h
.ifc \type, put
        urshl           v4.8h,   v4.8h,   v30.8h
        urshl           v6.8h,   v6.8h,   v30.8h
.else
        sub             v4.8h,   v4.8h,   v29.8h
        sub             v6.8h,   v6.8h,   v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v6.8h}, [\ds2], \d_strd
        b.gt            8b
        ret
160:
320:
640:
1280:   // 16xN, 32xN, ... h
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1

        // Strides become the remainder after walking one full row (the
        // inner loop advances the pointers by the row width itself).
        sub             \s_strd,  \s_strd,  \w, uxtw #1
        sub             \s_strd,  \s_strd,  #16
.ifc \type, put
        lsl             \d_strd,  \d_strd,  #1
        sub             \d_strd,  \d_strd,  \w, uxtw #1
.endif
161:
        // Prime the first 8 pixels of each row; \mx is reused as the
        // per-row column counter.
        ld1             {v16.8h},  [\src], #16
        ld1             {v21.8h},  [\sr2], #16
        mov             \mx, \w

16:
        ld1             {v17.8h, v18.8h},  [\src], #32
        ld1             {v22.8h, v23.8h},  [\sr2], #32
        ext             v19.16b, v16.16b, v17.16b, #2
        ext             v20.16b, v17.16b, v18.16b, #2
        ext             v24.16b, v21.16b, v22.16b, #2
        ext             v25.16b, v22.16b, v23.16b, #2
        mul             v16.8h,  v16.8h,  v0.8h
        mla             v16.8h,  v19.8h,  v1.8h
        mul             v17.8h,  v17.8h,  v0.8h
        mla             v17.8h,  v20.8h,  v1.8h
        mul             v21.8h,  v21.8h,  v0.8h
        mla             v21.8h,  v24.8h,  v1.8h
        mul             v22.8h,  v22.8h,  v0.8h
        mla             v22.8h,  v25.8h,  v1.8h
        urshl           v16.8h,  v16.8h,  v31.8h
        urshl           v17.8h,  v17.8h,  v31.8h
        urshl           v21.8h,  v21.8h,  v31.8h
        urshl           v22.8h,  v22.8h,  v31.8h
        subs            \mx, \mx, #16
.ifc \type, put
        urshl           v16.8h,  v16.8h,  v30.8h
        urshl           v17.8h,  v17.8h,  v30.8h
        urshl           v21.8h,  v21.8h,  v30.8h
        urshl           v22.8h,  v22.8h,  v30.8h
.else
        sub             v16.8h,  v16.8h,  v29.8h
        sub             v17.8h,  v17.8h,  v29.8h
        sub             v21.8h,  v21.8h,  v29.8h
        sub             v22.8h,  v22.8h,  v29.8h
.endif
        st1             {v16.8h, v17.8h}, [\dst], #32
        st1             {v21.8h, v22.8h}, [\ds2], #32
        b.le            9f

        // Carry the last loaded chunk over as the next window's start.
        mov             v16.16b, v18.16b
        mov             v21.16b, v23.16b
        b               16b

9:
        // Row pair done: step to the next pair of rows.
        add             \dst,  \dst,  \d_strd
        add             \ds2,  \ds2,  \d_strd
        add             \src,  \src,  \s_strd
        add             \sr2,  \sr2,  \s_strd

        subs            \h,  \h,  #2
        b.gt            161b
        ret
endfunc
   3029 
// Per-width entry points for the horizontal-only bilinear path, as
// offsets from the table base; indexed by clz(w)-24 (so the largest
// width, 128, is entry 0).
jumptable \type\()_bilin_h_tbl
        .word 1280b - \type\()_bilin_h_tbl
        .word 640b  - \type\()_bilin_h_tbl
        .word 320b  - \type\()_bilin_h_tbl
        .word 160b  - \type\()_bilin_h_tbl
        .word 80b   - \type\()_bilin_h_tbl
        .word 40b   - \type\()_bilin_h_tbl
        .word 20b   - \type\()_bilin_h_tbl
endjumptable
   3039 
   3040 
// Vertical-only bilinear path: out = ((16-my)*row[y] + my*row[y+1]),
// rounded.  put shifts by 4 (urshr #4); prep shifts by 4-intermediate_bits
// (v31) and subtracts PREP_BIAS (v29).
function L(\type\()_bilin_v)
        cmp             \h,  #4
        movrel          x10, \type\()_bilin_v_tbl
.ifc \type, prep
        dup             v31.8h,  w11      // 4 - intermediate_bits
.endif
        ldrsw           x9,  [x10, x9, lsl #2]
.ifc \type, prep
        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
.endif
        add             x10, x10, x9
        br              x10

20:     // 2xN v
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        cmp             \h,  #2
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1

        // 2x2 v
        ld1r            {v16.4s}, [\src], \s_strd
        b.gt            24f
22:
        ld1r            {v17.4s}, [\sr2], \s_strd
        ld1r            {v18.4s}, [\src], \s_strd
        // trn1 pairs each row with its successor for a single mul/mla.
        trn1            v16.2s,  v16.2s,  v17.2s
        trn1            v17.2s,  v17.2s,  v18.2s
        mul             v4.4h,   v16.4h,  v2.4h
        mla             v4.4h,   v17.4h,  v3.4h
        urshr           v4.8h,   v4.8h,   #4
        str             s4,        [\dst]
        st1             {v4.s}[1], [\ds2]
        ret
24:     // 2x4, 2x6, 2x8, ... v
        ld1r            {v17.4s}, [\sr2], \s_strd
        ld1r            {v18.4s}, [\src], \s_strd
        ld1r            {v19.4s}, [\sr2], \s_strd
        ld1r            {v20.4s}, [\src], \s_strd
        sub             \h,  \h,  #4
        trn1            v16.2s,  v16.2s,  v17.2s
        trn1            v17.2s,  v17.2s,  v18.2s
        trn1            v18.2s,  v18.2s,  v19.2s
        trn1            v19.2s,  v19.2s,  v20.2s
        trn1            v16.2d,  v16.2d,  v18.2d
        trn1            v17.2d,  v17.2d,  v19.2d
        mul             v4.8h,   v16.8h,  v2.8h
        mla             v4.8h,   v17.8h,  v3.8h
        cmp             \h,  #2
        urshr           v4.8h,   v4.8h,   #4
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        st1             {v4.s}[2], [\dst], \d_strd
        st1             {v4.s}[3], [\ds2], \d_strd
        b.lt            0f
        mov             v16.8b,  v20.8b   // last row becomes next iteration's first
        b.eq            22b               // exactly 2 rows left
        b               24b
0:
        ret
.endif

40:     // 4xN v
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.4h}, [\src], \s_strd
4:
        ld1             {v17.4h}, [\sr2], \s_strd
        ld1             {v18.4h}, [\src], \s_strd
        trn1            v16.2d,  v16.2d,  v17.2d
        trn1            v17.2d,  v17.2d,  v18.2d
        mul             v4.8h,   v16.8h,  v2.8h
        mla             v4.8h,   v17.8h,  v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        urshr           v4.8h,   v4.8h,   #4
.else
        urshl           v4.8h,   v4.8h,   v31.8h
        sub             v4.8h,   v4.8h,   v29.8h
.endif
        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        mov             v16.8b,  v18.8b
        b               4b
0:
        ret

80:     // 8xN v
        AARCH64_VALID_JUMP_TARGET
        add             \ds2,  \dst,  \d_strd
        add             \sr2,  \src,  \s_strd
        lsl             \s_strd,  \s_strd,  #1
        lsl             \d_strd,  \d_strd,  #1
        ld1             {v16.8h}, [\src], \s_strd
8:
        ld1             {v17.8h}, [\sr2], \s_strd
        ld1             {v18.8h}, [\src], \s_strd
        mul             v4.8h,   v16.8h,  v2.8h
        mla             v4.8h,   v17.8h,  v3.8h
        mul             v5.8h,   v17.8h,  v2.8h
        mla             v5.8h,   v18.8h,  v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        urshr           v4.8h,   v4.8h,   #4
        urshr           v5.8h,   v5.8h,   #4
.else
        urshl           v4.8h,   v4.8h,   v31.8h
        urshl           v5.8h,   v5.8h,   v31.8h
        sub             v4.8h,   v4.8h,   v29.8h
        sub             v5.8h,   v5.8h,   v29.8h
.endif
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            0f
        mov             v16.16b, v18.16b
        b               8b
0:
        ret

160:    // 16xN, 32xN, ...
320:
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        mov             \my, \h           // remember h to restart each 16-wide column
1:
        add             \ds2, \dst, \d_strd
        add             \sr2, \src, \s_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v16.8h, v17.8h}, [\src], \s_strd
2:
        ld1             {v18.8h, v19.8h}, [\sr2], \s_strd
        ld1             {v20.8h, v21.8h}, [\src], \s_strd
        mul             v4.8h,   v16.8h,  v2.8h
        mla             v4.8h,   v18.8h,  v3.8h
        mul             v5.8h,   v17.8h,  v2.8h
        mla             v5.8h,   v19.8h,  v3.8h
        mul             v6.8h,   v18.8h,  v2.8h
        mla             v6.8h,   v20.8h,  v3.8h
        mul             v7.8h,   v19.8h,  v2.8h
        mla             v7.8h,   v21.8h,  v3.8h
        subs            \h,  \h,  #2
.ifc \type, put
        urshr           v4.8h,   v4.8h,   #4
        urshr           v5.8h,   v5.8h,   #4
        urshr           v6.8h,   v6.8h,   #4
        urshr           v7.8h,   v7.8h,   #4
.else
        urshl           v4.8h,   v4.8h,   v31.8h
        urshl           v5.8h,   v5.8h,   v31.8h
        urshl           v6.8h,   v6.8h,   v31.8h
        urshl           v7.8h,   v7.8h,   v31.8h
        sub             v4.8h,   v4.8h,   v29.8h
        sub             v5.8h,   v5.8h,   v29.8h
        sub             v6.8h,   v6.8h,   v29.8h
        sub             v7.8h,   v7.8h,   v29.8h
.endif
        st1             {v4.8h, v5.8h}, [\dst], \d_strd
        st1             {v6.8h, v7.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        b               2b
9:
        // Advance to the next 16-pixel (32-byte) column.
        subs            \w,  \w,  #16
        b.le            0f
        asr             \s_strd, \s_strd, #1
        asr             \d_strd, \d_strd, #1
        msub            \src, \s_strd, \xmy, \src  // src -= s_strd * \xmy
        msub            \dst, \d_strd, \xmy, \dst  // dst -= d_strd * \xmy
        sub             \src, \src, \s_strd, lsl #1
        mov             \h,  \my
        add             \src, \src, #32
        add             \dst, \dst, #32
        b               1b
0:
        ret
endfunc
   3228 
// Per-width entry points for the vertical-only bilinear path, as
// offsets from the table base; indexed by clz(w)-24.
jumptable \type\()_bilin_v_tbl
        .word 1280b - \type\()_bilin_v_tbl
        .word 640b  - \type\()_bilin_v_tbl
        .word 320b  - \type\()_bilin_v_tbl
        .word 160b  - \type\()_bilin_v_tbl
        .word 80b   - \type\()_bilin_v_tbl
        .word 40b   - \type\()_bilin_v_tbl
        .word 20b   - \type\()_bilin_v_tbl
endjumptable
   3238 
// Combined horizontal + vertical bilinear path.  Each row is first
// filtered horizontally into a 16-bit intermediate (scaled by
// -(4-intermediate_bits), v31), then pairs of intermediates are blended
// vertically with 32-bit accumulation; put scales by
// -(4+intermediate_bits) (v30), prep round-narrows by 4 and subtracts
// PREP_BIAS (v29).
function L(\type\()_bilin_hv)
        movrel          x10, \type\()_bilin_hv_tbl
        dup             v31.8h,  w11      // 4 - intermediate_bits
        ldrsw           x9,  [x10, x9, lsl #2]
        neg             v31.8h,  v31.8h   // -(4-intermediate_bits)
.ifc \type, put
        dup             v30.4s,  w12      // 4 + intermediate_bits
.else
        movi            v29.8h,  #(PREP_BIAS >> 8), lsl #8
.endif
        add             x10, x10, x9
.ifc \type, put
        neg             v30.4s,  v30.4s   // -(4+intermediate_bits)
.endif
        br              x10

20:     // 2xN hv
        AARCH64_VALID_JUMP_TARGET
.ifc \type, put
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Prime v16 with the horizontally filtered first row.
        ld1             {v20.4h},  [\src], \s_strd
        ext             v21.8b,  v20.8b,  v20.8b,  #2
        mul             v16.4h,  v20.4h,  v0.4h
        mla             v16.4h,  v21.4h,  v1.4h
        urshl           v16.4h,  v16.4h,  v31.4h

2:
        ld1             {v22.4h},  [\sr2], \s_strd
        ld1             {v24.4h},  [\src], \s_strd
        ext             v23.8b,  v22.8b,  v22.8b,  #2
        ext             v25.8b,  v24.8b,  v24.8b,  #2
        trn1            v22.2s,  v22.2s,  v24.2s
        trn1            v23.2s,  v23.2s,  v25.2s
        mul             v17.4h,  v22.4h,  v0.4h
        mla             v17.4h,  v23.4h,  v1.4h
        urshl           v17.4h,  v17.4h,  v31.4h

        trn1            v16.2s,  v16.2s,  v17.2s   // v16 = rows y, y+1

        umull           v4.4s,   v16.4h,  v2.4h    // (16-my) * row[y]
        umlal           v4.4s,   v17.4h,  v3.4h    // + my * row[y+1]
        urshl           v4.4s,   v4.4s,   v30.4s
        xtn             v4.4h,   v4.4s
        subs            \h,  \h,  #2
        st1             {v4.s}[0], [\dst], \d_strd
        st1             {v4.s}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2s,  v17.2s,  v17.2s   // carry the last row over
        b               2b
0:
        ret
.endif

40:     // 4xN hv
        AARCH64_VALID_JUMP_TARGET
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        ld1             {v20.8h},  [\src], \s_strd
        ext             v21.16b, v20.16b, v20.16b, #2
        mul             v16.4h,  v20.4h,  v0.4h
        mla             v16.4h,  v21.4h,  v1.4h
        urshl           v16.4h,  v16.4h,  v31.4h

4:
        ld1             {v22.8h},  [\sr2], \s_strd
        ld1             {v24.8h},  [\src], \s_strd
        ext             v23.16b, v22.16b, v22.16b, #2
        ext             v25.16b, v24.16b, v24.16b, #2
        trn1            v22.2d,  v22.2d,  v24.2d
        trn1            v23.2d,  v23.2d,  v25.2d
        mul             v17.8h,  v22.8h,  v0.8h
        mla             v17.8h,  v23.8h,  v1.8h
        urshl           v17.8h,  v17.8h,  v31.8h

        trn1            v16.2d,  v16.2d,  v17.2d

        umull           v4.4s,   v16.4h,  v2.4h
        umlal           v4.4s,   v17.4h,  v3.4h
        umull2          v5.4s,   v16.8h,  v2.8h
        umlal2          v5.4s,   v17.8h,  v3.8h
.ifc \type, put
        urshl           v4.4s,   v4.4s,   v30.4s
        urshl           v5.4s,   v5.4s,   v30.4s
        uzp1            v4.8h,   v4.8h,   v5.8h  // Same as xtn, xtn2
.else
        rshrn           v4.4h,   v4.4s,   #4
        rshrn2          v4.8h,   v5.4s,   #4
        sub             v4.8h,   v4.8h,   v29.8h
.endif
        subs            \h,  \h,  #2
        st1             {v4.8b},   [\dst], \d_strd
        st1             {v4.d}[1], [\ds2], \d_strd
        b.le            0f
        trn2            v16.2d,  v17.2d,  v17.2d
        b               4b
0:
        ret

80:     // 8xN, 16xN, ... hv
160:
320:
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        mov             \my, \h           // remember h to restart each 8-wide column

1:
        add             \sr2, \src, \s_strd
        add             \ds2, \dst, \d_strd
        lsl             \s_strd, \s_strd, #1
        lsl             \d_strd, \d_strd, #1

        // Prime v16; the extra pixel for the last lane comes from h21.
        ldr             h21, [\src, #16]
        ld1             {v20.8h},  [\src], \s_strd
        ext             v21.16b, v20.16b, v21.16b, #2
        mul             v16.8h,  v20.8h,  v0.8h
        mla             v16.8h,  v21.8h,  v1.8h
        urshl           v16.8h,  v16.8h,  v31.8h

2:
        ldr             h23, [\sr2, #16]
        ld1             {v22.8h},  [\sr2], \s_strd
        ldr             h25, [\src, #16]
        ld1             {v24.8h},  [\src], \s_strd
        ext             v23.16b, v22.16b, v23.16b, #2
        ext             v25.16b, v24.16b, v25.16b, #2
        mul             v17.8h,  v22.8h,  v0.8h
        mla             v17.8h,  v23.8h,  v1.8h
        mul             v18.8h,  v24.8h,  v0.8h
        mla             v18.8h,  v25.8h,  v1.8h
        urshl           v17.8h,  v17.8h,  v31.8h
        urshl           v18.8h,  v18.8h,  v31.8h

        // Vertical blend: v4/v5 = rows (16,17); v6/v7 = rows (17,18).
        umull           v4.4s,   v16.4h,  v2.4h
        umlal           v4.4s,   v17.4h,  v3.4h
        umull2          v5.4s,   v16.8h,  v2.8h
        umlal2          v5.4s,   v17.8h,  v3.8h
        umull           v6.4s,   v17.4h,  v2.4h
        umlal           v6.4s,   v18.4h,  v3.4h
        umull2          v7.4s,   v17.8h,  v2.8h
        umlal2          v7.4s,   v18.8h,  v3.8h
.ifc \type, put
        urshl           v4.4s,   v4.4s,   v30.4s
        urshl           v5.4s,   v5.4s,   v30.4s
        urshl           v6.4s,   v6.4s,   v30.4s
        urshl           v7.4s,   v7.4s,   v30.4s
        uzp1            v4.8h,   v4.8h,   v5.8h  // Same as xtn, xtn2
        uzp1            v5.8h,   v6.8h,   v7.8h  // Ditto
.else
        rshrn           v4.4h,   v4.4s,   #4
        rshrn2          v4.8h,   v5.4s,   #4
        rshrn           v5.4h,   v6.4s,   #4
        rshrn2          v5.8h,   v7.4s,   #4
        sub             v4.8h,   v4.8h,   v29.8h
        sub             v5.8h,   v5.8h,   v29.8h
.endif
        subs            \h,  \h,  #2
        st1             {v4.8h}, [\dst], \d_strd
        st1             {v5.8h}, [\ds2], \d_strd
        b.le            9f
        mov             v16.16b, v18.16b
        b               2b
9:
        // Advance to the next 8-pixel (16-byte) column.
        subs            \w,  \w,  #8
        b.le            0f
        asr             \s_strd,  \s_strd,  #1
        asr             \d_strd,  \d_strd,  #1
        msub            \src,  \s_strd,  \xmy,  \src // src -= s_strd * \xmy
        msub            \dst,  \d_strd,  \xmy,  \dst // dst -= d_strd * \xmy
        sub             \src,  \src,  \s_strd,  lsl #1
        mov             \h,  \my
        add             \src,  \src,  #16
        add             \dst,  \dst,  #16
        b               1b
0:
        ret
endfunc
   3423 
   3424 jumptable \type\()_bilin_hv_tbl
   3425        .word 1280b - \type\()_bilin_hv_tbl
   3426        .word 640b  - \type\()_bilin_hv_tbl
   3427        .word 320b  - \type\()_bilin_hv_tbl
   3428        .word 160b  - \type\()_bilin_hv_tbl
   3429        .word 80b   - \type\()_bilin_hv_tbl
   3430        .word 40b   - \type\()_bilin_hv_tbl
   3431        .word 20b   - \type\()_bilin_hv_tbl
   3432 endjumptable
   3433 .endm
   3434 
// Instantiate the motion-compensation entry points. make_8tap_fn emits a
// small named wrapper per (horizontal, vertical) filter-type pair; filter_fn
// then emits the shared core it pairs with. Combinations involving SHARP use
// the 8-tap core; the remaining combinations use the cheaper 6-tap core.
// The long register lists bind the macros' operands (pointers, strides,
// sizes, mx/my, scratch) to concrete registers; the exact operand meanings
// come from the filter_fn/filter_bilin_fn definitions earlier in this file.

// put variants: write clipped output pixels.
   3435 make_8tap_fn    put,  regular_sharp,  REGULAR, SHARP,   8tap
   3436 make_8tap_fn    put,  smooth_sharp,   SMOOTH,  SHARP,   8tap
   3437 make_8tap_fn    put,  sharp,          SHARP,   SHARP,   8tap
   3438 make_8tap_fn    put,  sharp_regular,  SHARP,   REGULAR, 8tap
   3439 make_8tap_fn    put,  sharp_smooth,   SHARP,   SMOOTH,  8tap
   3440 filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 8tap
   3441 
   3442 make_8tap_fn    put,  regular,        REGULAR, REGULAR, 6tap
   3443 make_8tap_fn    put,  regular_smooth, REGULAR, SMOOTH,  6tap
   3444 make_8tap_fn    put,  smooth,         SMOOTH,  SMOOTH,  6tap
   3445 make_8tap_fn    put,  smooth_regular, SMOOTH,  REGULAR, 6tap
   3446 filter_fn       put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10, 6tap
   3447 filter_bilin_fn put,  x0, x1, x2, x3, w4, w5, w6, x6, w7, x7, w8, x9, x10

// prep variants: write 16-bit intermediates (note the different register
// assignments — prep has no separate dst stride argument).
   3449 make_8tap_fn    prep,  regular_sharp,  REGULAR, SHARP,   8tap
   3450 make_8tap_fn    prep,  smooth_sharp,   SMOOTH,  SHARP,   8tap
   3451 make_8tap_fn    prep,  sharp,          SHARP,   SHARP,   8tap
   3452 make_8tap_fn    prep,  sharp_regular,  SHARP,   REGULAR, 8tap
   3453 make_8tap_fn    prep,  sharp_smooth,   SHARP,   SMOOTH,  8tap
   3454 filter_fn       prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 8tap
   3455 
   3456 make_8tap_fn    prep,  regular,        REGULAR, REGULAR, 6tap
   3457 make_8tap_fn    prep,  regular_smooth, REGULAR, SMOOTH,  6tap
   3458 make_8tap_fn    prep,  smooth,         SMOOTH,  SMOOTH,  6tap
   3459 make_8tap_fn    prep,  smooth_regular, SMOOTH,  REGULAR, 6tap
   3460 filter_fn       prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10, 6tap
   3461 filter_bilin_fn prep, x0, x8, x1, x2, w3, w4, w5, x5, w6, x6, w7, x9, x10
   3462 
   3463 
// Load one 8-byte row of warp filter coefficients:
//   w13  = \src >> 10   — the high bits of the fixed-point position select
//                         the filter row (callers add a +512 bias first, so
//                         this rounds to the nearest of the 1024 phases)
//   \src += \inc        — advance the position for the next tap/pixel
//   \dst = 8 bytes from x11 + (w13 << 3); x11 points into mc_warp_filter
//          (offset by 64*8 at setup, so w13 may be negative — hence sxtw)
// Clobbers: w13.
   3464 .macro load_filter_row dst, src, inc
   3465        asr             w13, \src, #10
   3466        add             \src, \src, \inc
   3467        ldr             \dst, [x11, w13, sxtw #3]
   3468 .endm
   3469 
// Horizontal 8-tap warp filter for one source row, producing 8 output sums.
//
// In:  x2  = src pointer (advanced by one row, x3 bytes, each call)
//      x3  = src stride in bytes
//      w5  = horizontal filter position accumulator (advanced by w8 here)
//      w7  = per-output-pixel position step, w8 = per-row step
//      x11 = warp filter table pointer (set up via movrel in the warp macro)
//      v14.4s = -(7 - intermediate_bits), the rounding right-shift for srshl
// Out: v16.4s, v17.4s = the 8 filtered, rounded 32-bit horizontal sums
// Clobbers: v0-v11, v18-v23, w12, w13 (caller saves d8-d15 in its prologue).
//
// The instruction interleaving (sxtl/ext/smull/addp) is deliberate
// scheduling; do not reorder without benchmarking on in-order cores.
   3470 function warp_filter_horz_neon
   3471        add             w12, w5,  #512 // round-to-nearest bias for the >>10 in load_filter_row
   3472 
   3473        ld1             {v16.8h, v17.8h}, [x2], x3 // 16 pixels: 8 windows of 8 taps at offsets 0..7
   3474 
        // Fetch the 8 per-output-pixel filter rows and widen them to 16 bit.
   3475        load_filter_row d0, w12, w7
   3476        load_filter_row d1, w12, w7
   3477        load_filter_row d2, w12, w7
   3478        sxtl            v0.8h,   v0.8b
   3479        load_filter_row d3, w12, w7
   3480        sxtl            v1.8h,   v1.8b
   3481        load_filter_row d4, w12, w7
   3482        sxtl            v2.8h,   v2.8b
   3483        load_filter_row d5, w12, w7
   3484        sxtl            v3.8h,   v3.8b
   3485        load_filter_row d6, w12, w7
   3486        sxtl            v4.8h,   v4.8b
   3487        load_filter_row d7, w12, w7
   3488        sxtl            v5.8h,   v5.8b
        // For each window offset i (ext #2*i), multiply the 8 pixels by the
        // 8 taps (smull/smull2) and pairwise-add (addp) until every window
        // collapses to a single 32-bit sum.
   3489        ext             v18.16b, v16.16b, v17.16b, #2*1
   3490        smull           v8.4s,   v16.4h,  v0.4h
   3491        smull2          v9.4s,   v16.8h,  v0.8h
   3492        sxtl            v6.8h,   v6.8b
   3493        ext             v19.16b, v16.16b, v17.16b, #2*2
   3494        smull           v10.4s,  v18.4h,  v1.4h
   3495        smull2          v11.4s,  v18.8h,  v1.8h
   3496        sxtl            v7.8h,   v7.8b
   3497        ext             v20.16b, v16.16b, v17.16b, #2*3
   3498        smull           v0.4s,   v19.4h,  v2.4h
   3499        smull2          v1.4s,   v19.8h,  v2.8h
   3500        ext             v21.16b, v16.16b, v17.16b, #2*4
   3501        addp            v8.4s,   v8.4s,   v9.4s
   3502        smull           v2.4s,   v20.4h,  v3.4h
   3503        smull2          v3.4s,   v20.8h,  v3.8h
   3504        ext             v22.16b, v16.16b, v17.16b, #2*5
   3505        addp            v9.4s,   v10.4s,  v11.4s
   3506        smull           v10.4s,  v21.4h,  v4.4h
   3507        smull2          v11.4s,  v21.8h,  v4.8h
   3508        ext             v23.16b, v16.16b, v17.16b, #2*6
   3509        addp            v0.4s,   v0.4s,   v1.4s
   3510        smull           v18.4s,  v22.4h,  v5.4h
   3511        smull2          v19.4s,  v22.8h,  v5.8h
   3512        ext             v16.16b, v16.16b, v17.16b, #2*7
   3513        addp            v1.4s,   v2.4s,   v3.4s
   3514        addp            v2.4s,   v10.4s,  v11.4s
   3515        smull           v20.4s,  v23.4h,  v6.4h
   3516        smull2          v21.4s,  v23.8h,  v6.8h
   3517        addp            v3.4s,   v18.4s,  v19.4s
   3518        smull           v22.4s,  v16.4h,  v7.4h
   3519        smull2          v23.4s,  v16.8h,  v7.8h
   3520        addp            v4.4s,   v20.4s,  v21.4s
   3521        addp            v5.4s,   v22.4s,  v23.4s
   3522 
        // Final pairwise reductions: v16/v17 hold the eight per-window sums.
   3523        addp            v8.4s,   v8.4s,   v9.4s
   3524        addp            v0.4s,   v0.4s,   v1.4s
   3525        addp            v2.4s,   v2.4s,   v3.4s
   3526        addp            v4.4s,   v4.4s,   v5.4s
   3527 
   3528        addp            v16.4s,  v8.4s,   v0.4s
   3529        addp            v17.4s,  v2.4s,   v4.4s
   3530 
   3531        add             w5,  w5,  w8 // advance the position for the next row (abcd[1] step)
   3532 
   3533        srshl           v16.4s,  v16.4s,  v14.4s // -(7 - intermediate_bits)
   3534        srshl           v17.4s,  v17.4s,  v14.4s // -(7 - intermediate_bits)
   3535 
   3536        ret
   3537 endfunc
   3538 
   3539 // void dav1d_warp_affine_8x8_16bpc_neon(
   3540 //         pixel *dst, const ptrdiff_t dst_stride,
   3541 //         const pixel *src, const ptrdiff_t src_stride,
   3542 //         const int16_t *const abcd, int mx, int my,
   3543 //         const int bitdepth_max)
// \t selects the variant: empty emits the "put" function (prototype above,
// writes clipped pixels) and "t" emits warp_affine_8x8t (the "prep" variant,
// writing int16_t intermediates offset by -PREP_BIAS).
   3544 .macro warp t
   3545 function warp_affine_8x8\t\()_16bpc_neon, export=1
        // d8-d15 are callee-saved per AAPCS64; they are used as scratch below.
   3546        stp             d8,  d9,  [sp, #-0x40]!
   3547        stp             d10, d11, [sp, #0x10]
   3548        stp             d12, d13, [sp, #0x20]
   3549        stp             d14, d15, [sp, #0x30]
   3550 
   3551 .ifb \t
   3552        dup             v15.8h,  w7        // bitdepth_max
   3553 .else
   3554        movi            v15.8h,  #(PREP_BIAS >> 8), lsl #8
   3555 .endif
        // Derive the shift amounts from bitdepth_max via clz.
   3556        clz             w7,  w7
   3557                                           // intermediate_bits = clz(bitdepth_max) - 18
   3558 .ifb \t
   3559        sub             w8,  w7,  #11      // 7 + intermediate_bits = clz(bitdepth_max) - 18 + 7
   3560 .endif
   3561        sub             w7,  w7,  #25      // -(7 - intermediate_bits)
   3562 .ifb \t
   3563        neg             w8,  w8            // -(7 + intermediate_bits)
   3564 .endif
   3565        dup             v14.4s,  w7        // -(7 - intermediate_bits)
   3566 .ifb \t
   3567        dup             v13.4s,  w8        // -(7 + intermediate_bits)
   3568 .endif
   3569 
        // Unpack the four packed int16_t abcd[] entries:
        // x7 = abcd[0], x8 = abcd[1], x9 = abcd[2], x4 = abcd[3].
   3570        ldr             x4,  [x4]
   3571        sbfx            x7,  x4, #0,  #16
   3572        sbfx            x8,  x4, #16, #16
   3573        sbfx            x9,  x4, #32, #16
   3574        sbfx            x4,  x4, #48, #16
   3575        mov             w10, #8            // 8 output rows
        // Step src back 3 rows and 3 pixels (2 bytes each) to the first tap.
   3576        sub             x2,  x2,  x3, lsl #1
   3577        sub             x2,  x2,  x3
   3578        sub             x2,  x2,  #6
   3579        movrel          x11, X(mc_warp_filter), 64*8
   3580        mov             x15, x30           // save LR across the bl calls below
   3581 .ifnb \t
        // prep: stride appears to be given in elements here; convert to
        // bytes. NOTE(review): confirm against the C-side caller.
   3582        lsl             x1,  x1,  #1
   3583 .endif
 
        // Prime the vertical filter history: rows 0..6 -> v24..v30,
        // narrowed from the 32-bit horizontal sums to 16 bit.
   3585        bl              warp_filter_horz_neon
   3586        uzp1            v24.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
   3587        bl              warp_filter_horz_neon
   3588        uzp1            v25.8h,  v16.8h,  v17.8h // Ditto
   3589        bl              warp_filter_horz_neon
   3590        uzp1            v26.8h,  v16.8h,  v17.8h // Ditto
   3591        bl              warp_filter_horz_neon
   3592        uzp1            v27.8h,  v16.8h,  v17.8h // Ditto
   3593        bl              warp_filter_horz_neon
   3594        uzp1            v28.8h,  v16.8h,  v17.8h // Ditto
   3595        bl              warp_filter_horz_neon
   3596        uzp1            v29.8h,  v16.8h,  v17.8h // Ditto
   3597        bl              warp_filter_horz_neon
   3598        uzp1            v30.8h,  v16.8h,  v17.8h // Ditto
 
        // Per output row: filter one more source row horizontally (v31),
        // then apply the 8-tap vertical filter across v24..v31.
   3600 1:
   3601        add             w14, w6,  #512     // my position + rounding bias
   3602        bl              warp_filter_horz_neon
   3603        uzp1            v31.8h,  v16.8h,  v17.8h // Same as xtn, xtn2
 
        // Vertical filter coefficients for the 8 columns of this row;
        // transpose so vN holds tap N for all columns.
   3605        load_filter_row d0, w14, w9
   3606        load_filter_row d1, w14, w9
   3607        load_filter_row d2, w14, w9
   3608        load_filter_row d3, w14, w9
   3609        load_filter_row d4, w14, w9
   3610        load_filter_row d5, w14, w9
   3611        load_filter_row d6, w14, w9
   3612        load_filter_row d7, w14, w9
   3613        transpose_8x8b_xtl v0, v1, v2, v3, v4, v5, v6, v7, sxtl
   3614 
   3615        // This ordering of smull/smlal/smull2/smlal2 is highly
   3616        // beneficial for Cortex A53 here.
   3617        smull           v16.4s,  v24.4h,  v0.4h
   3618        smlal           v16.4s,  v25.4h,  v1.4h
   3619        smlal           v16.4s,  v26.4h,  v2.4h
   3620        smlal           v16.4s,  v27.4h,  v3.4h
   3621        smlal           v16.4s,  v28.4h,  v4.4h
   3622        smlal           v16.4s,  v29.4h,  v5.4h
   3623        smlal           v16.4s,  v30.4h,  v6.4h
   3624        smlal           v16.4s,  v31.4h,  v7.4h
   3625        smull2          v17.4s,  v24.8h,  v0.8h
   3626        smlal2          v17.4s,  v25.8h,  v1.8h
   3627        smlal2          v17.4s,  v26.8h,  v2.8h
   3628        smlal2          v17.4s,  v27.8h,  v3.8h
   3629        smlal2          v17.4s,  v28.8h,  v4.8h
   3630        smlal2          v17.4s,  v29.8h,  v5.8h
   3631        smlal2          v17.4s,  v30.8h,  v6.8h
   3632        smlal2          v17.4s,  v31.8h,  v7.8h
 
        // Shift the row history up by one; the moves are interleaved with
        // the narrowing/clipping below for scheduling.
   3634        mov             v24.16b, v25.16b
   3635        mov             v25.16b, v26.16b
   3636 .ifb \t
   3637        srshl           v16.4s,  v16.4s,  v13.4s // -(7 + intermediate_bits)
   3638        srshl           v17.4s,  v17.4s,  v13.4s // -(7 + intermediate_bits)
   3639 .else
   3640        rshrn           v16.4h,  v16.4s,  #7
   3641        rshrn2          v16.8h,  v17.4s,  #7
   3642 .endif
   3643        mov             v26.16b, v27.16b
   3644 .ifb \t
   3645        sqxtun          v16.4h,  v16.4s
   3646        sqxtun2         v16.8h,  v17.4s
   3647 .else
   3648        sub             v16.8h,  v16.8h,  v15.8h // PREP_BIAS
   3649 .endif
   3650        mov             v27.16b, v28.16b
   3651        mov             v28.16b, v29.16b
   3652 .ifb \t
   3653        umin            v16.8h,  v16.8h,  v15.8h // bitdepth_max
   3654 .endif
   3655        mov             v29.16b, v30.16b
   3656        mov             v30.16b, v31.16b
   3657        subs            w10, w10, #1
   3658        st1             {v16.8h}, [x0], x1
   3659 
   3660        add             w6,  w6,  w4 // my += abcd[3]
   3661        b.gt            1b
 
   3663        ldp             d14, d15, [sp, #0x30]
   3664        ldp             d12, d13, [sp, #0x20]
   3665        ldp             d10, d11, [sp, #0x10]
   3666        ldp             d8,  d9,  [sp], 0x40
   3667 
   3668        ret             x15
   3669 endfunc
   3670 .endm
   3671 
// put variant: warp_affine_8x8_16bpc_neon
   3672 warp
// prep variant: warp_affine_8x8t_16bpc_neon
   3673 warp t
   3674 
   3675 // void dav1d_emu_edge_16bpc_neon(
   3676 //         const intptr_t bw, const intptr_t bh,
   3677 //         const intptr_t iw, const intptr_t ih,
   3678 //         const intptr_t x, const intptr_t y,
   3679 //         pixel *dst, const ptrdiff_t dst_stride,
   3680 //         const pixel *ref, const ptrdiff_t ref_stride)
   3681 function emu_edge_16bpc_neon, export=1
        // Register map: x0=bw, x1=bh, x2=iw, x3=ih, x4=x, x5=y,
        // x6=dst, x7=dst_stride; the last two args come off the stack.
   3682        ldp             x8,  x9,  [sp] // x8=ref, x9=ref_stride
 
   3684        // ref += iclip(y, 0, ih - 1) * PXSTRIDE(ref_stride)
   3685        // ref += iclip(x, 0, iw - 1)
   3686        sub             x12, x3,  #1           // ih - 1
   3687        cmp             x5,  x3
   3688        sub             x13, x2,  #1           // iw - 1
   3689        csel            x12, x12, x5,  ge      // min(y, ih - 1)
   3690        cmp             x4,  x2
   3691        bic             x12, x12, x12, asr #63 // max(min(y, ih - 1), 0)
   3692        csel            x13, x13, x4,  ge      // min(x, iw - 1)
   3693        bic             x13, x13, x13, asr #63 // max(min(x, iw - 1), 0)
   3694        madd            x8,  x12, x9,  x8      // ref += iclip() * stride
   3695        add             x8,  x8,  x13, lsl #1  // ref += iclip()
 
   3697        // bottom_ext = iclip(y + bh - ih, 0, bh - 1)
   3698        // top_ext = iclip(-y, 0, bh - 1)
   3699        add             x10, x5,  x1           // y + bh
   3700        neg             x5,  x5                // -y
   3701        sub             x10, x10, x3           // y + bh - ih
   3702        sub             x12, x1,  #1           // bh - 1
   3703        cmp             x10, x1
   3704        bic             x5,  x5,  x5,  asr #63 // max(-y, 0)
   3705        csel            x10, x10, x12, lt      // min(y + bh - ih, bh-1)
   3706        cmp             x5,  x1
   3707        bic             x10, x10, x10, asr #63 // max(min(y + bh - ih, bh-1), 0)
   3708        csel            x5,  x5,  x12, lt      // min(max(-y, 0), bh-1)
 
   3710        // right_ext = iclip(x + bw - iw, 0, bw - 1)
   3711        // left_ext = iclip(-x, 0, bw - 1)
   3712        add             x11, x4,  x0           // x + bw
   3713        neg             x4,  x4                // -x
   3714        sub             x11, x11, x2           // x + bw - iw
   3715        sub             x13, x0,  #1           // bw - 1
   3716        cmp             x11, x0
   3717        bic             x4,  x4,  x4,  asr #63 // max(-x, 0)
   3718        csel            x11, x11, x13, lt      // min(x + bw - iw, bw-1)
   3719        cmp             x4,  x0
   3720        bic             x11, x11, x11, asr #63 // max(min(x + bw - iw, bw-1), 0)
   3721        csel            x4,  x4,  x13, lt      // min(max(-x, 0), bw - 1)
 
   3723        // center_h = bh - top_ext - bottom_ext
   3724        // dst += top_ext * PXSTRIDE(dst_stride)
   3725        // center_w = bw - left_ext - right_ext
   3726        sub             x1,  x1,  x5           // bh - top_ext
   3727        madd            x6,  x5,  x7,  x6
   3728        sub             x2,  x0,  x4           // bw - left_ext
   3729        sub             x1,  x1,  x10          // center_h = bh - top_ext - bottom_ext
   3730        sub             x2,  x2,  x11          // center_w = bw - left_ext - right_ext
 
   3732        mov             x14, x6                // backup of dst
 
// Emit the loop over the center_h middle lines: optionally replicate the
// leftmost/rightmost source pixel across the left_ext/right_ext borders,
// then copy the center_w middle pixels. The store loops work in 16/32-pixel
// chunks and can overshoot their exact width. NOTE(review): this relies on
// later writes / the destination buffer tolerating the overrun — confirm
// with the C-side buffer layout.
   3734 .macro v_loop need_left, need_right
   3735 0:
   3736 .if \need_left
   3737        ld1r            {v0.8h}, [x8]
   3738        mov             x12, x6                // out = dst
   3739        mov             x3,  x4
   3740        mov             v1.16b,  v0.16b
   3741 1:
   3742        subs            x3,  x3,  #16
   3743        st1             {v0.8h, v1.8h}, [x12], #32
   3744        b.gt            1b
   3745 .endif
   3746        mov             x13, x8
   3747        add             x12, x6,  x4, lsl #1   // out = dst + left_ext
   3748        mov             x3,  x2
   3749 1:
   3750        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x13], #64
   3751        subs            x3,  x3,  #32
   3752        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x12], #64
   3753        b.gt            1b
   3754 .if \need_right
   3755        add             x3,  x8,  x2, lsl #1   // in + center_w
   3756        sub             x3,  x3,  #2           // in + center_w - 1
   3757        add             x12, x6,  x4, lsl #1   // dst + left_ext
   3758        ld1r            {v0.8h}, [x3]
   3759        add             x12, x12, x2, lsl #1   // out = dst + left_ext + center_w
   3760        mov             x3,  x11
   3761        mov             v1.16b,  v0.16b
   3762 1:
   3763        subs            x3,  x3,  #16
   3764        st1             {v0.8h, v1.8h}, [x12], #32
   3765        b.gt            1b
   3766 .endif
 
   3768        subs            x1,  x1,  #1           // center_h--
   3769        add             x6,  x6,  x7
   3770        add             x8,  x8,  x9
   3771        b.gt            0b
   3772 .endm
 
        // Dispatch to the specialization matching left_ext/right_ext != 0.
   3774        cbz             x4,  2f
   3775        // need_left
   3776        cbz             x11, 3f
   3777        // need_left + need_right
   3778        v_loop          1,   1
   3779        b               5f
 
   3781 2:
   3782        // !need_left
   3783        cbz             x11, 4f
   3784        // !need_left + need_right
   3785        v_loop          0,   1
   3786        b               5f
 
   3788 3:
   3789        // need_left + !need_right
   3790        v_loop          1,   0
   3791        b               5f
 
   3793 4:
   3794        // !need_left + !need_right
   3795        v_loop          0,   0
 
   3797 5:
 
   3799        cbz             x10, 3f
        // need_bottom: replicate the last written row downwards,
        // processing 32-pixel (64-byte) column strips.
   3801        sub             x8,  x6,  x7           // ref = dst - stride
   3802        mov             x4,  x0
   3803 1:
   3804        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x8], #64
   3805        mov             x3,  x10
   3806 2:
   3807        subs            x3,  x3,  #1
   3808        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
   3809        b.gt            2b
   3810        msub            x6,  x7,  x10,  x6     // dst -= bottom_ext * stride
   3811        subs            x4,  x4,  #32          // bw -= 32
   3812        add             x6,  x6,  #64          // dst += 32
   3813        b.gt            1b
 
   3815 3:
   3816        cbz             x5,  3f
        // need_top: replicate the first centre row upwards, same strip scheme.
   3818        msub            x6,  x7,  x5,  x14     // dst = stored_dst - top_ext * stride
   3819 1:
   3820        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x14], #64
   3821        mov             x3,  x5
   3822 2:
   3823        subs            x3,  x3,  #1
   3824        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x7
   3825        b.gt            2b
   3826        msub            x6,  x7,  x5,  x6      // dst -= top_ext * stride
   3827        subs            x0,  x0,  #32          // bw -= 32
   3828        add             x6,  x6,  #64          // dst += 32
   3829        b.gt            1b
 
   3831 3:
   3832        ret
   3833 endfunc