tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc_dotprod.S (64824B)


      1 /*
      2 * Copyright © 2024, VideoLAN and dav1d authors
      3 * Copyright © 2024, Janne Grunau
      4 * Copyright © 2024, Martin Storsjo
      5 * Copyright © 2024, Arm Limited
      6 * All rights reserved.
      7 *
      8 * Redistribution and use in source and binary forms, with or without
      9 * modification, are permitted provided that the following conditions are met:
     10 *
     11 * 1. Redistributions of source code must retain the above copyright notice, this
     12 *    list of conditions and the following disclaimer.
     13 *
     14 * 2. Redistributions in binary form must reproduce the above copyright notice,
     15 *    this list of conditions and the following disclaimer in the documentation
     16 *    and/or other materials provided with the distribution.
     17 *
     18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     19 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     20 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     21 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     22 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     23 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     24 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     25 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     26 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     27 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     28 */
     29 
     30 #include "src/arm/asm.S"
     31 #include "util.S"
     32 
     33 
     34 #if HAVE_DOTPROD
     35 ENABLE_DOTPROD
     36 
     37 // No spaces in these expressions, due to gas-preprocessor. Each value is
     38 // biased by -1 so the adjustment is folded into the computation of the
     39 // `mc_subpel_filters` entry address.
     39 #define REGULAR1        (((0*15-1)<<7)|(3*15-1))
     40 #define SMOOTH1         (((1*15-1)<<7)|(4*15-1))
     41 #define SHARP1          (((2*15-1)<<7)|(3*15-1))
     42 
     43 #define FUNC_ALIGN      2
     44 #define JUMP_ALIGN      2
     45 #define LOOP_ALIGN      2
     46 
     47 
     48 const h_tbl_neon_dotprod, align=4
     49        // Shuffle indices to permute horizontal samples in preparation for
     50        // input to SDOT instructions. The 8-tap horizontal convolution uses
     51        // sample indices in the interval of [-3, 4] relative to the current
     52        // sample position.
     53        .byte  0,  1,  2,  3,   1,  2,  3,  4,   2,  3,  4,  5,   3,  4,  5,  6
     54        .byte  4,  5,  6,  7,   5,  6,  7,  8,   6,  7,  8,  9,   7,  8,  9, 10
     55        .byte  8,  9, 10, 11,   9, 10, 11, 12,  10, 11, 12, 13,  11, 12, 13, 14
     56 
     57        // Shuffle indices to permute horizontal samples in preparation for
     58        // input to USMMLA instructions.
     59 #define OFFSET_USMMLA 48 // byte offset of this sub-table (3 x 16B rows above)
     60        .byte  0,  1,  2,  3,   4,  5,  6,  7,   2,  3,  4,  5,   6,  7,  8,  9
     61        .byte  4,  5,  6,  7,   8,  9, 10, 11,   6,  7,  8,  9,  10, 11, 12, 13
     62 
     63        // Lookup table used to help conversion of shifted 32-bit values to 8-bit.
     64 #define OFFSET_CVT_32_8 80 // byte offset of this sub-table (48 + 2 x 16B rows)
     65        .byte  1,  2,  5,  6,   9, 10, 13, 14,  17, 18, 21, 22,  25, 26, 29, 30
     66 endconst
     67 
     68 const v_tbl_neon_dotprod, align=4
     69        // Vertical convolutions are also using SDOT instructions, where a
     70        // 128-bit register contains a transposed 4x4 matrix of values.
     71        // Subsequent iterations of the vertical convolution can reuse the
     72        // 3x4 sub-matrix from the previous loop iteration. These shuffle
     73        // indices shift and merge this 4x4 matrix with the values of a new
     74        // line.
        // These are consumed by two-register TBL instructions: indices 0-15
        // select bytes from the first (previous-matrix) register, indices
        // 16-31 from the second (newly loaded line) register.
     75        .byte  1,  2,  3, 16,   5,  6,  7, 20,   9, 10, 11, 24,  13, 14, 15, 28
     76        .byte  1,  2,  3, 16,   5,  6,  7, 17,   9, 10, 11, 18,  13, 14, 15, 19
     77        .byte  1,  2,  3, 20,   5,  6,  7, 21,   9, 10, 11, 22,  13, 14, 15, 23
     78        .byte  1,  2,  3, 24,   5,  6,  7, 25,   9, 10, 11, 26,  13, 14, 15, 27
     79        .byte  1,  2,  3, 28,   5,  6,  7, 29,   9, 10, 11, 30,  13, 14, 15, 31
     80 endconst
     81 
     82 
     83 .macro make_8tap_fn op, type, type_h, type_v, isa, jump=1
        // Emit one exported entry point per H/V filter-type combination.
        // It loads the packed horizontal/vertical filter selectors into
        // x9/x10 and branches to the shared \op\()_8tap_\isa implementation;
        // with jump=0 (used for the variant emitted last) it falls through
        // into that implementation instead.
     84 function \op\()_8tap_\type\()_8bpc_\isa, export=1, align=FUNC_ALIGN
     85        mov             x9,  \type_h    // horizontal filter selector (REGULAR1/SMOOTH1/SHARP1)
     86        mov             x10, \type_v    // vertical filter selector
     87    .if \jump
     88        b               \op\()_8tap_\isa
     89    .endif
     90 endfunc
     91 .endm
     92 
     93 .macro filter_8tap_fn type, dot, isa, dst, d_strd, src, s_strd, w, h, mx, my, xmx, xmy, ldst, lsrc, wd_strd
     94 make_8tap_fn \type, sharp,          SHARP1,   SHARP1,   \isa
     95 make_8tap_fn \type, sharp_smooth,   SHARP1,   SMOOTH1,  \isa
     96 make_8tap_fn \type, sharp_regular,  SHARP1,   REGULAR1, \isa
     97 make_8tap_fn \type, smooth_sharp,   SMOOTH1,  SHARP1,   \isa
     98 make_8tap_fn \type, smooth,         SMOOTH1,  SMOOTH1,  \isa
     99 make_8tap_fn \type, smooth_regular, SMOOTH1,  REGULAR1, \isa
    100 make_8tap_fn \type, regular_sharp,  REGULAR1, SHARP1,   \isa
    101 make_8tap_fn \type, regular_smooth, REGULAR1, SMOOTH1,  \isa
    102 make_8tap_fn \type, regular,        REGULAR1, REGULAR1, \isa, jump=0
    103 
    104 function \type\()_8tap_\isa, align=FUNC_ALIGN
    105        clz             w8, \w
    106        mov             w11,  #0x4081   // (1 << 14) | (1 << 7) | (1 << 0)
    107        sub             w8, w8, #24     // for jump tables
    108        movrel          x12, X(mc_subpel_filters)
    109        cbnz            \mx, L(\type\()_8tap_h_hv_\isa)
    110        cbnz            \my, L(\type\()_8tap_v_\isa)
    111 .ifc \type, prep
    112        add             \wd_strd, \w, \w    // prep_neon needs w * 2 as stride
    113 .endif
    114        b               X(\type\()_neon)
    115 
    116        .align JUMP_ALIGN
    117 L(\type\()_8tap_v_\isa):
    118        madd            \my, \my, w11, w10
    119        movrel          x13, v_tbl_neon_dotprod
    120        sub             \src, \src, \s_strd
    121 .ifc \isa, neon_dotprod
    122    .ifc \type, prep
    123        mov             w8, #0x2002         // FILTER_WEIGHT * 128 + rounding
    124        dup             v4.4s, w8
    125    .else
    126        movi            v4.4s, #32, lsl #8  // FILTER_WEIGHT * 128, bias for SDOT
    127    .endif
    128 .endif
    129        ubfx            w11, \my, #7, #7
    130        and             \my, \my, #0x7F
    131        ldp             q6, q28, [x13]
    132        cmp             \h, #4
    133        csel            \my, \my, w11, le
    134        sub             \src, \src, \s_strd, lsl #1     // src - s_strd * 3
    135        add             \xmy, x12, \xmy, lsl #3         // subpel V filter address
    136        ldr             q29, [x13, #32]
    137 .ifc \isa, neon_dotprod
    138        movi            v5.16b, #128
    139 .endif
    140        ldr             d7, [\xmy]
    141        cmp             \w, #8
    142        b.eq            80f
    143        b.lt            40f
    144 
    145        // .align JUMP_ALIGN    // fallthrough
    146 160:    // V - 16xN+
    147        ldp             q30, q31, [x13, #48]
    148 .ifc \type, prep
    149        add             \wd_strd, \w, \w
    150 .endif
    151        .align LOOP_ALIGN
    152 161:
    153        mov             \lsrc, \src
    154        mov             \ldst, \dst
    155        sub             w8, \h, #1
    156 
    157        ldr             q16, [\lsrc]
    158        ldr             q17, [\lsrc, \s_strd]
    159        add             \lsrc, \lsrc, \s_strd, lsl #1
    160        ldr             q18, [\lsrc]
    161        ldr             q19, [\lsrc, \s_strd]
    162        add             \lsrc, \lsrc, \s_strd, lsl #1
    163 
    164        zip1            v0.16b, v16.16b, v17.16b
    165        zip2            v1.16b, v16.16b, v17.16b
    166        zip1            v2.16b, v18.16b, v19.16b
    167        zip2            v3.16b, v18.16b, v19.16b
    168 
    169        ldr             q20, [\lsrc]
    170        ldr             q21, [\lsrc, \s_strd]
    171        add             \lsrc, \lsrc, \s_strd, lsl #1
    172        ldr             q22, [\lsrc]
    173        ldr             q23, [\lsrc, \s_strd]
    174        add             \lsrc, \lsrc, \s_strd, lsl #1
    175 
    176        zip1            v18.16b, v20.16b, v21.16b
    177        zip2            v21.16b, v20.16b, v21.16b
    178        zip1            v24.16b, v22.16b, v23.16b
    179        zip2            v27.16b, v22.16b, v23.16b
    180 
    181        zip1            v16.8h, v0.8h, v2.8h
    182        zip2            v19.8h, v0.8h, v2.8h
    183        zip1            v22.8h, v1.8h, v3.8h
    184        zip2            v25.8h, v1.8h, v3.8h
    185 
    186        zip1            v17.8h, v18.8h, v24.8h
    187        zip2            v20.8h, v18.8h, v24.8h
    188        zip1            v23.8h, v21.8h, v27.8h
    189        zip2            v26.8h, v21.8h, v27.8h
    190 .ifc \isa, neon_dotprod
    191        sub             v16.16b, v16.16b, v5.16b
    192        sub             v19.16b, v19.16b, v5.16b
    193        sub             v22.16b, v22.16b, v5.16b
    194        sub             v25.16b, v25.16b, v5.16b
    195 
    196        sub             v17.16b, v17.16b, v5.16b
    197        sub             v20.16b, v20.16b, v5.16b
    198        sub             v23.16b, v23.16b, v5.16b
    199        sub             v26.16b, v26.16b, v5.16b
    200 .endif
    201        .align LOOP_ALIGN
    202 16:
    203 .ifc \isa, neon_i8mm
    204        ld1             {v18.16b}, [\lsrc], \s_strd
    205        movi            v0.4s, #0
    206        movi            v1.4s, #0
    207        movi            v2.4s, #0
    208        movi            v3.4s, #0
    209        mov             v21.16b, v18.16b
    210        mov             v24.16b, v18.16b
    211        mov             v27.16b, v18.16b
    212 .else   // neon_dotprod
    213        ld1             {v27.16b}, [\lsrc], \s_strd
    214        mov             v0.16b, v4.16b
    215        mov             v1.16b, v4.16b
    216        mov             v2.16b, v4.16b
    217        mov             v3.16b, v4.16b
    218        sub             v18.16b, v27.16b, v5.16b
    219        sub             v21.16b, v27.16b, v5.16b
    220        sub             v24.16b, v27.16b, v5.16b
    221        sub             v27.16b, v27.16b, v5.16b
    222 .endif
    223        \dot            v0.4s, v16.16b, v7.4b[0]
    224        \dot            v1.4s, v19.16b, v7.4b[0]
    225        \dot            v2.4s, v22.16b, v7.4b[0]
    226        \dot            v3.4s, v25.16b, v7.4b[0]
    227 
    228        tbl             v16.16b, {v16.16b, v17.16b}, v6.16b
    229        tbl             v19.16b, {v19.16b, v20.16b}, v6.16b
    230        tbl             v22.16b, {v22.16b, v23.16b}, v6.16b
    231        tbl             v25.16b, {v25.16b, v26.16b}, v6.16b
    232 
    233        \dot            v0.4s, v17.16b, v7.4b[1]
    234        \dot            v1.4s, v20.16b, v7.4b[1]
    235        \dot            v2.4s, v23.16b, v7.4b[1]
    236        \dot            v3.4s, v26.16b, v7.4b[1]
    237 
    238        tbl             v17.16b, {v17.16b, v18.16b}, v28.16b
    239        tbl             v20.16b, {v20.16b, v21.16b}, v29.16b
    240        tbl             v23.16b, {v23.16b, v24.16b}, v30.16b
    241        tbl             v26.16b, {v26.16b, v27.16b}, v31.16b
    242 
    243        subs            w8, w8, #1
    244        uzp1            v0.8h, v0.8h, v1.8h
    245        uzp1            v2.8h, v2.8h, v3.8h
    246 .ifc \type, prep
    247    .ifc \isa, neon_i8mm
    248        srshr           v0.8h, v0.8h, #2
    249        srshr           v1.8h, v2.8h, #2
    250    .else
    251        sshr            v0.8h, v0.8h, #2
    252        sshr            v1.8h, v2.8h, #2
    253    .endif
    254        st1             {v0.8h, v1.8h}, [\ldst], \d_strd
    255 .else   // put
    256        sqrshrun        v0.8b, v0.8h, #6
    257        sqrshrun2       v0.16b, v2.8h, #6
    258        st1             {v0.16b}, [\ldst], \d_strd
    259 .endif
    260        b.gt            16b
    261 
    262 .ifc \isa, neon_i8mm
    263        movi            v0.4s, #0
    264        movi            v1.4s, #0
    265        movi            v2.4s, #0
    266        movi            v3.4s, #0
    267 .else   // neon_dotprod
    268        mov             v0.16b, v4.16b
    269        mov             v1.16b, v4.16b
    270        mov             v2.16b, v4.16b
    271        mov             v3.16b, v4.16b
    272 .endif
    273        \dot            v0.4s, v16.16b, v7.4b[0]
    274        \dot            v1.4s, v19.16b, v7.4b[0]
    275        \dot            v2.4s, v22.16b, v7.4b[0]
    276        \dot            v3.4s, v25.16b, v7.4b[0]
    277 
    278        \dot            v0.4s, v17.16b, v7.4b[1]
    279        \dot            v1.4s, v20.16b, v7.4b[1]
    280        \dot            v2.4s, v23.16b, v7.4b[1]
    281        \dot            v3.4s, v26.16b, v7.4b[1]
    282 
    283        subs            \w, \w, #16
    284        uzp1            v0.8h, v0.8h, v1.8h
    285        uzp1            v2.8h, v2.8h, v3.8h
    286 .ifc \type, prep
    287    .ifc \isa, neon_i8mm
    288        srshr           v0.8h, v0.8h, #2
    289        srshr           v1.8h, v2.8h, #2
    290    .else
    291        sshr            v0.8h, v0.8h, #2
    292        sshr            v1.8h, v2.8h, #2
    293    .endif
    294        stp             q0, q1, [\ldst]
    295        add             \dst, \dst, #32
    296 .else   // put
    297        sqrshrun        v0.8b, v0.8h, #6
    298        sqrshrun2       v0.16b, v2.8h, #6
    299        str             q0, [\ldst]
    300        add             \dst, \dst, #16
    301 .endif
    302        add             \src, \src, #16
    303        b.gt            161b
    304        ret
    305 
    306        .align JUMP_ALIGN
    307 80:     // V - 8xN
    308        ldr             d16, [\src]
    309        ldr             d17, [\src, \s_strd]
    310        add             \src, \src, \s_strd, lsl #1
    311        ldr             d18, [\src]
    312        ldr             d19, [\src, \s_strd]
    313        add             \src, \src, \s_strd, lsl #1
    314 
    315        ldr             d20, [\src]
    316        ldr             d21, [\src, \s_strd]
    317        add             \src, \src, \s_strd, lsl #1
    318        ldr             d22, [\src]
    319        ldr             d23, [\src, \s_strd]
    320        add             \src, \src, \s_strd, lsl #1
    321        subs            \h, \h, #2  // for prep: sub is enough
    322 
    323        zip1            v0.16b, v16.16b, v17.16b
    324        zip1            v2.16b, v18.16b, v19.16b
    325        zip1            v18.16b, v20.16b, v21.16b
    326        zip1            v24.16b, v22.16b, v23.16b
    327 
    328        zip1            v16.8h,  v0.8h,  v2.8h
    329        zip2            v19.8h,  v0.8h,  v2.8h
    330        zip1            v17.8h, v18.8h, v24.8h
    331        zip2            v20.8h, v18.8h, v24.8h
    332 .ifc \isa, neon_dotprod
    333        sub             v16.16b, v16.16b, v5.16b
    334        sub             v19.16b, v19.16b, v5.16b
    335        sub             v17.16b, v17.16b, v5.16b
    336        sub             v20.16b, v20.16b, v5.16b
    337 .endif
    338 .ifc \type, put
    339        b.eq            82f
    340 .endif
    341        .align LOOP_ALIGN
    342 8:
    343 .ifc \isa, neon_i8mm
    344        ldr             d18, [\src]
    345        movi            v0.4s, #0
    346        movi            v1.4s, #0
    347        ldr             d24, [\src, \s_strd]
    348        add             \src, \src, \s_strd, lsl #1
    349        movi            v2.4s, #0
    350        movi            v3.4s, #0
    351        mov             v21.8b, v18.8b
    352        mov             v27.8b, v24.8b
    353 .else   // neon_dotprod
    354        ldr             d21, [\src]
    355        ldr             d27, [\src, \s_strd]
    356        add             \src, \src, \s_strd, lsl #1
    357        mov             v0.16b, v4.16b
    358        mov             v1.16b, v4.16b
    359        mov             v2.16b, v4.16b
    360        mov             v3.16b, v4.16b
    361        sub             v18.16b, v21.16b, v5.16b
    362        sub             v21.16b, v21.16b, v5.16b
    363        sub             v24.16b, v27.16b, v5.16b
    364        sub             v27.16b, v27.16b, v5.16b
    365 .endif
    366        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
    367        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
    368        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
    369        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
    370 
    371        \dot            v0.4s, v16.16b, v7.4b[0]
    372        \dot            v0.4s, v17.16b, v7.4b[1]
    373        \dot            v1.4s, v19.16b, v7.4b[0]
    374        \dot            v1.4s, v20.16b, v7.4b[1]
    375 
    376        tbl             v16.16b, {v22.16b, v23.16b}, v6.16b
    377        tbl             v19.16b, {v25.16b, v26.16b}, v6.16b
    378        tbl             v17.16b, {v23.16b, v24.16b}, v28.16b
    379        tbl             v20.16b, {v26.16b, v27.16b}, v29.16b
    380 
    381        \dot            v2.4s, v22.16b, v7.4b[0]
    382        \dot            v2.4s, v23.16b, v7.4b[1]
    383        \dot            v3.4s, v25.16b, v7.4b[0]
    384        \dot            v3.4s, v26.16b, v7.4b[1]
    385 
    386        subs            \h, \h, #2
    387        uzp1            v0.8h, v0.8h, v1.8h
    388        uzp1            v2.8h, v2.8h, v3.8h
    389 .ifc \type, prep
    390    .ifc \isa, neon_i8mm
    391        srshr           v0.8h, v0.8h, #2
    392        srshr           v1.8h, v2.8h, #2
    393    .else
    394        sshr            v0.8h, v0.8h, #2
    395        sshr            v1.8h, v2.8h, #2
    396    .endif
    397        stp             q0, q1, [\dst], #32
    398 .else   // put
    399        sqrshrun        v0.8b, v0.8h, #6
    400        sqrshrun        v1.8b, v2.8h, #6
    401        str             d0, [\dst]
    402        str             d1, [\dst, \d_strd]
    403        add             \dst, \dst, \d_strd, lsl #1
    404 .endif
    405        b.gt            8b
    406 
    407 .ifc \type, put
    408        .align JUMP_ALIGN
    409 82:
    410 .endif
    411 .ifc \isa, neon_i8mm
    412        ldr             d18, [\src]
    413        movi            v0.4s, #0
    414        movi            v1.4s, #0
    415        movi            v2.4s, #0
    416        movi            v3.4s, #0
    417        mov             v21.8b, v18.8b
    418 .else   // neon_dotprod
    419        ldr             d21, [\src]
    420        mov             v0.16b, v4.16b
    421        mov             v1.16b, v4.16b
    422        mov             v2.16b, v4.16b
    423        mov             v3.16b, v4.16b
    424        sub             v18.16b, v21.16b, v5.16b
    425        sub             v21.16b, v21.16b, v5.16b
    426 .endif
    427        tbl             v22.16b, {v16.16b, v17.16b}, v6.16b
    428        tbl             v25.16b, {v19.16b, v20.16b}, v6.16b
    429        tbl             v23.16b, {v17.16b, v18.16b}, v28.16b
    430        tbl             v26.16b, {v20.16b, v21.16b}, v29.16b
    431 
    432        \dot            v0.4s, v16.16b, v7.4b[0]
    433        \dot            v0.4s, v17.16b, v7.4b[1]
    434        \dot            v1.4s, v19.16b, v7.4b[0]
    435        \dot            v1.4s, v20.16b, v7.4b[1]
    436 
    437        \dot            v2.4s, v22.16b, v7.4b[0]
    438        \dot            v2.4s, v23.16b, v7.4b[1]
    439        \dot            v3.4s, v25.16b, v7.4b[0]
    440        \dot            v3.4s, v26.16b, v7.4b[1]
    441 
    442        uzp1            v0.8h, v0.8h, v1.8h
    443        uzp1            v2.8h, v2.8h, v3.8h
    444 .ifc \type, prep
    445    .ifc \isa, neon_i8mm
    446        srshr           v0.8h, v0.8h, #2
    447        srshr           v1.8h, v2.8h, #2
    448    .else
    449        sshr            v0.8h, v0.8h, #2
    450        sshr            v1.8h, v2.8h, #2
    451    .endif
    452        stp             q0, q1, [\dst]
    453 .else   // put
    454        sqrshrun        v0.8b, v0.8h, #6
    455        sqrshrun        v1.8b, v2.8h, #6
    456        str             d0, [\dst]
    457        str             d1, [\dst, \d_strd]
    458 .endif
    459        ret
    460 
    461        .align JUMP_ALIGN
    462 40:     // V - 4xN or 2xN (put only)
    463 .ifc \type, put
    464        cmp             \w, #2
    465        b.eq            20f
    466 .endif
    467        ldr             s16, [\src]
    468        ldr             s17, [\src, \s_strd]
    469        add             \src, \src, \s_strd, lsl #1
    470        ldr             s18, [\src]
    471        ldr             s19, [\src, \s_strd]
    472        add             \src, \src, \s_strd, lsl #1
    473 
    474        ldr             s20, [\src]
    475        ldr             s21, [\src, \s_strd]
    476        add             \src, \src, \s_strd, lsl #1
    477        ldr             s22, [\src]
    478        ldr             s23, [\src, \s_strd]
    479        add             \src, \src, \s_strd, lsl #1
    480        subs            \h, \h, #2  // for prep: sub is enough
    481 
    482        zip1            v0.8b, v16.8b, v17.8b
    483        zip1            v2.8b, v18.8b, v19.8b
    484        zip1            v18.8b, v20.8b, v21.8b
    485        zip1            v24.8b, v22.8b, v23.8b
    486 
    487        zip1            v16.8h, v0.8h, v2.8h
    488        zip1            v17.8h, v18.8h, v24.8h
    489 .ifc \isa, neon_dotprod
    490        sub             v16.16b, v16.16b, v5.16b
    491        sub             v17.16b, v17.16b, v5.16b
    492 .endif
    493 .ifc \type, put
    494        b.eq            42f
    495 .endif
    496        .align LOOP_ALIGN
    497 4:
    498        ldr             s18, [\src]
    499        ldr             s21, [\src, \s_strd]
    500        add             \src, \src, \s_strd, lsl #1
    501 .ifc \isa, neon_i8mm
    502        movi            v0.4s, #0
    503        movi            v1.4s, #0
    504 .else   // neon_dotprod
    505        mov             v0.16b, v4.16b
    506        mov             v1.16b, v4.16b
    507        sub             v18.16b, v18.16b, v5.16b
    508        sub             v21.16b, v21.16b, v5.16b
    509 .endif
    510        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
    511        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b
    512 
    513        \dot            v0.4s, v16.16b, v7.4b[0]
    514        \dot            v0.4s, v17.16b, v7.4b[1]
    515 
    516        tbl             v16.16b, {v19.16b, v20.16b}, v6.16b
    517        tbl             v17.16b, {v20.16b, v21.16b}, v28.16b
    518 
    519        \dot            v1.4s, v19.16b, v7.4b[0]
    520        \dot            v1.4s, v20.16b, v7.4b[1]
    521 .ifc \type, prep
    522        subs            \h, \h, #2
    523    .ifc \isa, neon_i8mm
    524        rshrn           v0.4h, v0.4s, #2
    525        rshrn2          v0.8h, v1.4s, #2
    526    .else
    527        shrn            v0.4h, v0.4s, #2
    528        shrn2           v0.8h, v1.4s, #2
    529    .endif
    530        str             q0, [\dst], #16
    531 .else
    532        uzp1            v0.8h, v0.8h, v1.8h
    533        sqrshrun        v0.8b, v0.8h, #6
    534        subs            \h, \h, #2
    535        fmov            x8, d0
    536        lsr             x9, x8, #32
    537        str             w8, [\dst]
    538        str             w9, [\dst, \d_strd]
    539        add             \dst, \dst, \d_strd, lsl #1
    540 .endif
    541        b.gt            4b
    542 
    543 .ifc \type, put
    544        .align JUMP_ALIGN
    545 42:
    546 .endif
    547        ldr             s18, [\src]
    548 .ifc \isa, neon_i8mm
    549        movi            v0.4s, #0
    550        movi            v1.4s, #0
    551 .else   // neon_dotprod
    552        mov             v0.16b, v4.16b
    553        mov             v1.16b, v4.16b
    554        sub             v18.16b, v18.16b, v5.16b
    555 .endif
    556        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
    557        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b
    558 
    559        \dot            v0.4s, v16.16b, v7.4b[0]
    560        \dot            v0.4s, v17.16b, v7.4b[1]
    561 
    562        \dot            v1.4s, v19.16b, v7.4b[0]
    563        \dot            v1.4s, v20.16b, v7.4b[1]
    564 .ifc \type, prep
    565    .ifc \isa, neon_i8mm
    566        rshrn           v0.4h, v0.4s, #2
    567        rshrn2          v0.8h, v1.4s, #2
    568    .else
    569        shrn            v0.4h, v0.4s, #2
    570        shrn2           v0.8h, v1.4s, #2
    571    .endif
    572        str             q0, [\dst]
    573 .else
    574        uzp1            v0.8h, v0.8h, v1.8h
    575        sqrshrun        v0.8b, v0.8h, #6
    576        fmov            x8, d0
    577        lsr             x9, x8, #32
    578        str             w8, [\dst]
    579        str             w9, [\dst, \d_strd]
    580 .endif
    581        ret
    582 
    583 .ifc \type, put
    584        .align JUMP_ALIGN
    585 20:     // V - 2xN
    586        ldr             h16, [\src]
    587        ldr             h17, [\src, \s_strd]
    588        add             \src, \src, \s_strd, lsl #1
    589        ldr             h18, [\src]
    590        ldr             h19, [\src, \s_strd]
    591        add             \src, \src, \s_strd, lsl #1
    592 
    593        ldr             h20, [\src]
    594        ldr             h21, [\src, \s_strd]
    595        add             \src, \src, \s_strd, lsl #1
    596        ldr             h22, [\src]
    597        ldr             h23, [\src, \s_strd]
    598        add             \src, \src, \s_strd, lsl #1
    599        subs            \h, \h, #2
    600 
    601        zip1            v0.8b, v16.8b, v17.8b
    602        zip1            v2.8b, v18.8b, v19.8b
    603        zip1            v18.8b, v20.8b, v21.8b
    604        zip1            v24.8b, v22.8b, v23.8b
    605 
    606        zip1            v16.4h, v0.4h, v2.4h
    607        zip1            v17.4h, v18.4h, v24.4h
    608    .ifc \isa, neon_dotprod
    609        sub             v16.8b, v16.8b, v5.8b
    610        sub             v17.8b, v17.8b, v5.8b
    611    .endif
    612        b.eq            22f
    613 
    614        .align LOOP_ALIGN
    615 2:
    616        ldr             h18, [\src]
    617        ldr             h21, [\src, \s_strd]
    618        add             \src, \src, \s_strd, lsl #1
    619    .ifc \isa, neon_i8mm
    620        movi            v0.4s, #0
    621        movi            v1.4s, #0
    622    .else   // put
    623        mov             v0.16b, v4.16b
    624        mov             v1.16b, v4.16b
    625        sub             v18.8b, v18.8b, v5.8b
    626        sub             v21.8b, v21.8b, v5.8b
    627    .endif
    628        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
    629        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b
    630 
    631        \dot            v0.4s, v16.16b, v7.4b[0]
    632        \dot            v0.4s, v17.16b, v7.4b[1]
    633 
    634        tbl             v16.16b, {v19.16b, v20.16b}, v6.16b
    635        tbl             v17.16b, {v20.16b, v21.16b}, v28.16b
    636 
    637        \dot            v1.4s, v19.16b, v7.4b[0]
    638        \dot            v1.4s, v20.16b, v7.4b[1]
    639 
    640        uzp1            v0.8h, v0.8h, v1.8h
    641        sqrshrun        v0.8b, v0.8h, #6
    642 
    643        subs            \h, \h, #2
    644        fmov            x8, d0
    645        lsr             x9, x8, #32
    646        strh            w8, [\dst]
    647        strh            w9, [\dst, \d_strd]
    648        add             \dst, \dst, \d_strd, lsl #1
    649        b.gt            2b
    650 
    651        .align JUMP_ALIGN
    652 22:
    653        ldr             h18, [\src]
    654    .ifc \isa, neon_i8mm
    655        movi            v0.4s, #0
    656        movi            v1.4s, #0
    657    .else   // put
    658        mov             v0.16b, v4.16b
    659        mov             v1.16b, v4.16b
    660        sub             v18.8b, v18.8b, v5.8b
    661    .endif
    662        tbl             v19.16b, {v16.16b, v17.16b}, v6.16b
    663        tbl             v20.16b, {v17.16b, v18.16b}, v28.16b
    664 
    665        \dot            v0.4s, v16.16b, v7.4b[0]
    666        \dot            v0.4s, v17.16b, v7.4b[1]
    667 
    668        \dot            v1.4s, v19.16b, v7.4b[0]
    669        \dot            v1.4s, v20.16b, v7.4b[1]
    670 
    671        uzp1            v0.8h, v0.8h, v1.8h
    672        sqrshrun        v0.8b, v0.8h, #6
    673 
    674        fmov            x8, d0
    675        lsr             x9, x8, #32
    676        strh            w8, [\dst]
    677        strh            w9, [\dst, \d_strd]
    678        ret
    679 .endif
    680 
    681        .align JUMP_ALIGN
    682 L(\type\()_8tap_h_hv_\isa):
    683        madd            \mx, \mx, w11, w9
    684        madd            w14, \my, w11, w10      // for HV
    685 .ifc \isa, neon_dotprod
    686        mov             w13, #0x2002            // FILTER_WEIGHT * 128 + rounding
    687        dup             v27.4s, w13             // put H overrides this
    688 .endif
    689        movrel          x13, h_tbl_neon_dotprod
    690        sub             \src, \src, #3          // src - 3
    691        ldr             q28, [x13]              // for 4-tap & 8-tap H filters
    692        ubfx            w15, \mx, #7, #7
    693        and             \mx, \mx, #0x7F
    694        ubfx            w11, w14, #7, #7        // for HV
    695        and             w14, w14, #0x7F         // for HV
    696        cmp             \w, #4
    697        csel            \mx, \mx, w15, le
    698        add             \xmx, x12, \xmx, lsl #3 // subpel H filter address
    699 .ifc \isa, neon_dotprod
    700        movi            v24.16b, #128
    701 .endif
    702        cbz             \my, L(\type\()_8tap_h_\isa)
    703 
    704        // HV cases
    705        cmp             \h, #4
    706        csel            w14, w14, w11, le
    707        sub             \src, \src, \s_strd, lsl #1 // src - s_strd * 2 - 3
    708        add             \xmy, x12, x14, lsl #3      // subpel V filter address
    709        mov             x15, x30
    710        ldr             d7, [\xmy]
    711 .ifc \type, put
    712        ldr             q25, [x13, #(OFFSET_CVT_32_8)] // LUT to help conversion
    713 .endif                                                 // of 32b values to 8b
    714        sxtl            v7.8h, v7.8b
    715        cmp             w10, #SHARP1
    716        b.ne            L(\type\()_6tap_hv_\isa)    // vertical != SHARP1
    717 
    718        // HV 8-tap cases
    719        sub             \src, \src, \s_strd         // src - s_strd * 3 - 3
    720        cmp             \w, #4
    721        b.eq            40f
    722 .ifc \type, put
    723        b.lt            20f
    724 .endif
    725 
    726        // .align JUMP_ALIGN    // fallthrough
80:     // HV8 - 8xN+
        // 8-tap H and 8-tap V. v16-v22 hold a sliding window of seven
        // horizontally filtered rows (i16, scaled down by 2 bits); v7 holds
        // the widened vertical taps.
        ldp             q29, q30, [x13, #16]    // remaining tbl shuffles for 8-tap H
        ldr             d26, [\xmx]             // v26 = horizontal filter taps
.ifc \type, prep
        add             \wd_strd, \w, \w        // prep dst stride: w * 2 (i16 samples)
.endif
        .align LOOP_ALIGN
81:     // outer loop: one 8-pixel-wide column strip
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h
.ifc \isa, neon_i8mm
        // Prime the 7-row window. i8mm accumulators start at zero, so the
        // intermediate >>2 scaling needs rounding shifts (srshr).
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v20.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v21.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v22.8h, v22.8h, #2
.else
        // dotprod accumulators were pre-biased with the rounding term
        // (v27), so plain arithmetic shifts suffice here.
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v20.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v21.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v22.8h, v22.8h, #2
.endif
        .align LOOP_ALIGN
8:      // inner loop: one output row per iteration
        ldr             q23, [\lsrc]
        add             \lsrc, \lsrc, \s_strd

        // Vertical 8-tap MAC over the window, interleaved with the
        // horizontal filtering (tbl + dot products) of the new row v23.
        smull           v0.4s, v16.4h, v7.h[0]
        smull2          v1.4s, v16.8h, v7.h[0]
        mov             v16.16b, v17.16b        // slide window up one row
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
        movi            v6.4s, #0
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b
.else   // neon_dotprod
        sub             v23.16b, v23.16b, v24.16b
        mov             v5.16b, v27.16b
        mov             v6.16b, v27.16b
.endif
        smlal           v0.4s, v17.4h, v7.h[1]
        smlal2          v1.4s, v17.8h, v7.h[1]
.ifc \isa, neon_i8mm
        tbl             v4.16b, {v23.16b}, v30.16b
        mov             v17.16b, v18.16b
.else   // neon_dotprod
        mov             v17.16b, v18.16b
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b
        tbl             v4.16b, {v23.16b}, v30.16b
.endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal2          v1.4s, v18.8h, v7.h[2]
        mov             v18.16b, v19.16b

        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v6.4s, v3.16b, v26.4b[0]

        smlal           v0.4s, v19.4h, v7.h[3]
        smlal2          v1.4s, v19.8h, v7.h[3]
        mov             v19.16b, v20.16b

        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v6.4s, v4.16b, v26.4b[1]

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal2          v1.4s, v20.8h, v7.h[4]
        mov             v20.16b, v21.16b

        smlal           v0.4s, v21.4h, v7.h[5]
        smlal2          v1.4s, v21.8h, v7.h[5]
.ifc \type, prep
        uzp1            v23.8h, v5.8h, v6.8h    // pack low halves of 32b H sums
.endif
        mov             v21.16b, v22.16b
        smlal           v0.4s, v22.4h, v7.h[6]
        smlal2          v1.4s, v22.8h, v7.h[6]
.ifc \isa, neon_i8mm
        subs            w8, w8, #1
.endif
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v22.8h, v23.8h, #2
    .else
        sshr            v22.8h, v23.8h, #2      // rounding already in v27 bias
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        smlal2          v1.4s, v22.8h, v7.h[7]
        rshrn           v0.4h, v0.4s, #6        // prep: narrow to i16 intermediate
        rshrn2          v0.8h, v1.4s, #6
.else   // put
    .ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
        rshrn2          v22.8h, v6.4s, #2
    .else
        shrn            v22.4h, v5.4s, #2
        shrn2           v22.8h, v6.4s, #2
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        smlal2          v1.4s, v22.8h, v7.h[7]
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b   // 32b -> 16b via byte LUT
        sqrshrun        v0.8b, v0.8h, #2        // final narrow + clamp to u8
.endif
.ifc \isa, neon_dotprod
        subs            w8, w8, #1
.endif
.ifc \type, prep
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16         // advance to next 8-wide strip
.else
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
.endif
        add             \src, \src, #8
        subs            \w, \w, #8
        b.gt            81b
        ret             x15                     // return via saved LR
    866 
        .align JUMP_ALIGN
40:     // HV8 - 4xN
        // 4-tap H (middle taps of the 8-tap filter) + 8-tap V, 4 px wide.
        ldur            s26, [\xmx, #2]         // middle 4 horizontal taps
        add             \src, \src, #2          // compensate narrower H support

        // Prime the 7-row window; filter4 pre-adds the rounding bias,
        // so a plain shrn gives rounded >>2 for both ISA variants.
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v21.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v22.4h, v22.4s, #2

        .align LOOP_ALIGN
4:      // one output row per iteration
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[0]
        smlal           v0.4s, v17.4h, v7.h[1]
        mov             v16.16b, v17.16b        // slide window up one row
        mov             v17.16b, v18.16b
.ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b // u8 -> s8 for sdot
.endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal           v0.4s, v19.4h, v7.h[3]
        tbl             v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
.else
        mov             v5.16b, v27.16b         // pre-biased accumulator
.endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal           v0.4s, v21.4h, v7.h[5]

        \dot            v5.4s, v2.16b, v26.4b[0]
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b
        smlal           v0.4s, v22.4h, v7.h[6]
.ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
.else
        shrn            v22.4h, v5.4s, #2       // rounding already in v27 bias
.endif
        smlal           v0.4s, v22.4h, v7.h[7]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6        // prep: narrow to i16 intermediate
        str             d0, [\dst], #8
        subs            \h, \h, #1
.else
        subs            \h, \h, #1
        tbl             v0.8b, {v0.16b}, v25.8b // 32b -> 16b via byte LUT
        sqrshrun        v0.8b, v0.8h, #2        // final narrow + clamp to u8
        str             s0, [\dst]
        add             \dst, \dst, \d_strd
.endif
        b.gt            4b
        ret             x15
    935 
.ifc \type, put
        .align JUMP_ALIGN
20:     // HV8 - 2xN
        // Same structure as the 4xN case (4-tap H + 8-tap V), but only the
        // low two output pixels are stored. put only.
        ldur            s26, [\xmx, #2]         // middle 4 horizontal taps
        add             \src, \src, #2

        // Prime the 7-row window; filter4 pre-adds the rounding bias.
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v21.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v22.4h, v22.4s, #2

        .align LOOP_ALIGN
2:      // one output row per iteration
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[0]
        smlal           v0.4s, v17.4h, v7.h[1]
        mov             v16.16b, v17.16b        // slide window up one row
        mov             v17.16b, v18.16b
    .ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b // u8 -> s8 for sdot
    .endif
        smlal           v0.4s, v18.4h, v7.h[2]
        smlal           v0.4s, v19.4h, v7.h[3]
        tbl             v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi            v5.4s, #0
    .else
        mov             v5.16b, v27.16b         // pre-biased accumulator
    .endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[4]
        smlal           v0.4s, v21.4h, v7.h[5]

        \dot            v5.4s, v2.16b, v26.4b[0]
        mov             v20.16b, v21.16b
        mov             v21.16b, v22.16b

        smlal           v0.4s, v22.4h, v7.h[6]
    .ifc \isa, neon_i8mm
        rshrn           v22.4h, v5.4s, #2
    .else
        shrn            v22.4h, v5.4s, #2       // rounding already in v27 bias
    .endif
        smlal           v0.4s, v22.4h, v7.h[7]
        subs            \h, \h, #1

        tbl             v0.8b, {v0.16b}, v25.8b // 32b -> 16b via byte LUT
        sqrshrun        v0.8b, v0.8h, #2        // final narrow + clamp to u8

        str             h0, [\dst]              // store 2 pixels
        add             \dst, \dst, \d_strd
        b.gt            2b
        ret             x15
.endif
   1003 
        .align JUMP_ALIGN
L(\type\()_6tap_hv_\isa):
        // HV with a 6-tap vertical filter (taps v7.h[1..6]); the sliding
        // window only needs five rows (v16-v20).
        cmp             \w, #4
        b.eq            40f
.ifc \type, put
        b.lt            20f
.endif

        // .align JUMP_ALIGN    // fallthrough
80:     // HV6 - 8xN+
        ldr             d26, [\xmx]             // v26 = horizontal filter taps
.ifc \type, prep
        add             \wd_strd, \w, \w        // prep dst stride: w * 2 (i16 samples)
.endif
.ifc \isa, neon_i8mm
        cmp             w9, #SHARP1
        b.eq            88f             // horizontal == SHARP1
        // Non-SHARP1 horizontal filters fit in 6 taps: use usmmla with the
        // taps duplicated at two byte alignments (v26.d[1] = taps shifted
        // by one) so each matrix multiply produces two adjacent outputs.
        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
        ext             v0.8b, v26.8b, v26.8b, #7
        ins             v26.d[1], v0.d[0]

        .align LOOP_ALIGN
81:     // outer loop: one 8-pixel-wide column strip
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h

        // Prime the 5-row window of H-filtered rows.
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter6_neon_i8mm)
        srshr           v20.8h, v22.8h, #2

        .align LOOP_ALIGN
8:      // inner loop: one output row per iteration
        ld1             {v23.16b}, [\lsrc], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
        mov             v16.16b, v17.16b        // slide window up one row
        movi            v5.4s, #0
        movi            v6.4s, #0
        tbl             v2.16b, {v23.16b}, v29.16b
        tbl             v3.16b, {v23.16b}, v30.16b

        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        mov             v17.16b, v18.16b

        usmmla          v5.4s, v2.16b, v26.16b
        usmmla          v6.4s, v3.16b, v26.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b
        subs            w8, w8, #1

        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        uzp1            v23.8h, v5.8h, v6.8h    // pack low halves of 32b H sums
        mov             v19.16b, v20.16b

        smlal           v0.4s, v20.4h, v7.h[5]
        smlal2          v1.4s, v20.8h, v7.h[5]
        srshr           v20.8h, v23.8h, #2      // newest H-filtered row
        smlal           v0.4s, v20.4h, v7.h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]
    .ifc \type, prep
        rshrn           v0.4h, v0.4s, #6        // prep: narrow to i16 intermediate
        rshrn2          v0.8h, v1.4s, #6
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16
    .else
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b   // 32b -> 16b via byte LUT
        sqrshrun        v0.8b, v0.8h, #2        // final narrow + clamp to u8
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
    .endif
        add             \src, \src, #8
        subs            \w, \w, #8
        b.gt            81b
        ret             x15

        .align JUMP_ALIGN
88:
.endif  // neon_i8mm
        // HV6 - 8xN+, tbl + dot-product horizontal path (always used for
        // neon_dotprod; used by neon_i8mm only when horizontal == SHARP1).
        ldp             q29, q30, [x13, #16]    // remaining tbl shuffles for 8-tap H

        .align LOOP_ALIGN
81:     // outer loop: one 8-pixel-wide column strip
        mov             \lsrc, \src
        mov             \ldst, \dst
        mov             w8, \h
.ifc \isa, neon_i8mm
        // Prime the 5-row window; zero-init accumulators need rounding shifts.
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        srshr           v20.8h, v22.8h, #2
.else
        // dotprod accumulators carry the rounding bias (v27) already.
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v16.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v17.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v18.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v19.8h, v22.8h, #2
        bl              L(\type\()_hv_filter8_\isa)
        sshr            v20.8h, v22.8h, #2
.endif
        .align LOOP_ALIGN
8:      // inner loop: one output row per iteration
        ldr             q23, [\lsrc]
        add             \lsrc, \lsrc, \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smull2          v1.4s, v16.8h, v7.h[1]
.ifc \isa, neon_dotprod
        sub             v23.16b, v23.16b, v24.16b   // u8 -> s8 for sdot
.endif
        mov             v16.16b, v17.16b        // slide window up one row
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
        movi            v6.4s, #0
.else
        mov             v5.16b, v27.16b         // pre-biased accumulators
        mov             v6.16b, v27.16b
.endif
        tbl             v2.16b, {v23.16b}, v28.16b
        tbl             v3.16b, {v23.16b}, v29.16b

        smlal           v0.4s, v17.4h, v7.h[2]
        smlal2          v1.4s, v17.8h, v7.h[2]
        tbl             v4.16b, {v23.16b}, v30.16b
        mov             v17.16b, v18.16b

        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v6.4s, v3.16b, v26.4b[0]

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal2          v1.4s, v18.8h, v7.h[3]
        mov             v18.16b, v19.16b

        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v6.4s, v4.16b, v26.4b[1]

        smlal           v0.4s, v19.4h, v7.h[4]
        smlal2          v1.4s, v19.8h, v7.h[4]
        mov             v19.16b, v20.16b
        uzp1            v23.8h, v5.8h, v6.8h    // pack low halves of 32b H sums

        smlal           v0.4s, v20.4h, v7.h[5]
        smlal2          v1.4s, v20.8h, v7.h[5]
.ifc \isa, neon_i8mm
        srshr           v20.8h, v23.8h, #2      // newest H-filtered row
.else
        sshr            v20.8h, v23.8h, #2      // rounding already in v27 bias
.endif
        subs            w8, w8, #1
        smlal           v0.4s, v20.4h, v7.h[6]
        smlal2          v1.4s, v20.8h, v7.h[6]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6        // prep: narrow to i16 intermediate
        rshrn2          v0.8h, v1.4s, #6
        st1             {v0.8h}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #16
.else
        tbl             v0.16b, {v0.16b, v1.16b}, v25.16b   // 32b -> 16b via byte LUT
        sqrshrun        v0.8b, v0.8h, #2        // final narrow + clamp to u8
        st1             {v0.8b}, [\ldst], \d_strd
        b.gt            8b
        add             \dst, \dst, #8
.endif
        add             \src, \src, #8
        subs            \w, \w, #8
        b.gt            81b
        ret             x15
   1196 
        .align FUNC_ALIGN
L(\type\()_hv_filter8_\isa):
        // 8-tap horizontal filter for one 16-byte row.
        // In:  \lsrc (post-incremented by \s_strd), v26 = H taps,
        //      v28-v30 = tbl shuffles (dotprod: v24 = 128, v27 = accum seed).
        // Out: v22.8h = eight filtered pixels; callers apply the >>2 scaling.
        // Clobbers: v2-v4, v23.
        ld1             {v4.16b}, [\lsrc], \s_strd
.ifc \isa, neon_i8mm
        movi            v22.4s, #0
        movi            v23.4s, #0
.else   // neon_dotprod
        sub             v4.16b, v4.16b, v24.16b // u8 -> s8 for sdot
        mov             v22.16b, v27.16b        // pre-biased accumulators
        mov             v23.16b, v27.16b
.endif
        tbl             v2.16b, {v4.16b}, v28.16b
        tbl             v3.16b, {v4.16b}, v29.16b
        tbl             v4.16b, {v4.16b}, v30.16b
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]
        uzp1            v22.8h, v22.8h, v23.8h  // keep low halves of the 32b sums
        ret
   1217 
.ifc \isa, neon_i8mm
        .align FUNC_ALIGN
L(\type\()_hv_filter6_neon_i8mm):
        // 6-tap horizontal filter for one row via usmmla; v26 holds the taps
        // at two byte alignments (set up at the HV6 8xN entry), so each
        // matrix multiply yields two adjacent outputs.
        // Out: v22.8h = eight filtered pixels. Clobbers: v2-v4, v23.
        ld1             {v4.16b}, [\lsrc], \s_strd
        movi            v22.4s, #0
        movi            v23.4s, #0
        tbl             v2.16b, {v4.16b}, v29.16b
        tbl             v3.16b, {v4.16b}, v30.16b
        usmmla          v22.4s, v2.16b, v26.16b
        usmmla          v23.4s, v3.16b, v26.16b
        uzp1            v22.8h, v22.8h, v23.8h  // keep low halves of the 32b sums
        ret
.endif
   1231 
        .align FUNC_ALIGN
L(\type\()_hv_filter4_\isa):
        // 4-tap horizontal filter for one 8-byte row (4 output pixels).
        // Out: v22.4s; callers narrow with a plain shrn #2 (the rounding
        // term is seeded into the accumulator below). Clobbers: v2.
        ld1             {v4.8b}, [\src], \s_strd
.ifc \isa, neon_i8mm
        movi            v22.4s, #2              // bake in the >>2 rounding term
.else
        mov             v22.16b, v27.16b        // v27 already contains bias + rounding
        sub             v4.16b, v4.16b, v24.16b // u8 -> s8 for sdot
.endif
        tbl             v2.16b, {v4.16b}, v28.16b
        \dot            v22.4s, v2.16b, v26.4b[0]
        ret
   1244 
        .align JUMP_ALIGN
40:     // HV6 - 4xN
        // 4-tap H + 6-tap V (taps v7.h[1..6]), 4 pixels wide.
        ldur            s26, [\xmx, #2]         // middle 4 horizontal taps
        add             \src, \src, #2          // compensate narrower H support

        // Prime the 5-row window; filter4 pre-adds the rounding bias,
        // so a plain shrn gives rounded >>2 for both ISA variants.
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2

        .align LOOP_ALIGN
4:      // one output row per iteration
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smlal           v0.4s, v17.4h, v7.h[2]
.ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b // u8 -> s8 for sdot
.endif
        mov             v16.16b, v17.16b        // slide window up one row
        mov             v17.16b, v18.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal           v0.4s, v19.4h, v7.h[4]
        tbl             v2.16b, {v4.16b}, v28.16b
.ifc \isa, neon_i8mm
        movi            v5.4s, #0
.else
        mov             v5.16b, v27.16b         // pre-biased accumulator
.endif
        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        \dot            v5.4s, v2.16b, v26.4b[0]

        smlal           v0.4s, v20.4h, v7.h[5]
.ifc \isa, neon_i8mm
        rshrn           v20.4h, v5.4s, #2       // newest H-filtered row
.else
        shrn            v20.4h, v5.4s, #2       // rounding already in v27 bias
.endif
        subs            \h, \h, #1
        smlal           v0.4s, v20.4h, v7.h[6]
.ifc \type, prep
        rshrn           v0.4h, v0.4s, #6        // prep: narrow to i16 intermediate
        str             d0, [\dst], #8
.else
        tbl             v0.8b, {v0.16b}, v25.8b // 32b -> 16b via byte LUT
        sqrshrun        v0.8b, v0.8h, #2        // final narrow + clamp to u8
        str             s0, [\dst]
        add             \dst, \dst, \d_strd
.endif
        b.gt            4b
        ret             x15
   1304 
.ifc \type, put
        .align JUMP_ALIGN
20:     // HV6 - 2xN
        // Same as the 4xN case but only 2 pixels are stored. put only.
        ldur            s26, [\xmx, #2]         // middle 4 horizontal taps
        add             \src, \src, #2

        // Prime the 5-row window; filter4 pre-adds the rounding bias.
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v16.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v17.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v18.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v19.4h, v22.4s, #2
        bl              L(\type\()_hv_filter4_\isa)
        shrn            v20.4h, v22.4s, #2

        .align LOOP_ALIGN
2:      // one output row per iteration
        ld1             {v4.8b}, [\src], \s_strd

        smull           v0.4s, v16.4h, v7.h[1]
        smlal           v0.4s, v17.4h, v7.h[2]
    .ifc \isa, neon_dotprod
        sub             v4.16b, v4.16b, v24.16b // u8 -> s8 for sdot
    .endif
        mov             v16.16b, v17.16b        // slide window up one row
        mov             v17.16b, v18.16b

        smlal           v0.4s, v18.4h, v7.h[3]
        smlal           v0.4s, v19.4h, v7.h[4]
        tbl             v2.16b, {v4.16b}, v28.16b
    .ifc \isa, neon_i8mm
        movi            v5.4s, #0
    .else
        mov             v5.16b, v27.16b         // pre-biased accumulator
    .endif

        mov             v18.16b, v19.16b
        mov             v19.16b, v20.16b
        \dot            v5.4s, v2.16b, v26.4b[0]

        smlal           v0.4s, v20.4h, v7.h[5]
    .ifc \isa, neon_i8mm
        rshrn           v20.4h, v5.4s, #2       // newest H-filtered row
    .else
        shrn            v20.4h, v5.4s, #2       // rounding already in v27 bias
    .endif

        subs            \h, \h, #1
        smlal           v0.4s, v20.4h, v7.h[6]

        tbl             v0.8b, {v0.16b}, v25.8b // 32b -> 16b via byte LUT
        sqrshrun        v0.8b, v0.8h, #2        // final narrow + clamp to u8

        str             h0, [\dst]              // store 2 pixels
        add             \dst, \dst, \d_strd
        b.gt            2b
        ret             x15
.endif
   1365 
        .align JUMP_ALIGN
L(\type\()_8tap_h_\isa):
        // Horizontal-only path: dispatch through a per-width jump table
        // (x8 = table index computed earlier, outside this section).
        movrel          x11, \type\()_8tap_h_\isa\()_tbl
        ldrsw           x8, [x11, x8, lsl #2]
.ifc \type, put
    .ifc \isa, neon_i8mm
        movi            v27.4s, #34     // special rounding
    .else
        mov             w10, #0x2022    // 64 * 128 + 34, bias and rounding for SDOT
        dup             v27.4s, w10
    .endif
.endif
        add             x11, x11, x8
        br              x11
   1380 
.ifc \type, put
        .align JUMP_ALIGN
20:     // H - 2xN
        // 4-tap horizontal filter, two rows per iteration. put only.
        AARCH64_VALID_JUMP_TARGET
        add             \src, \src, #2          // compensate narrower H support
        ldur            s26, [\xmx, #2]         // middle 4 horizontal taps

        .align LOOP_ALIGN
2:
        ldr             d0, [\src]
        ldr             d1, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
    .ifc \isa, neon_dotprod
        sub             v0.8b, v0.8b, v24.8b    // u8 -> s8 for sdot
        sub             v1.8b, v1.8b, v24.8b
    .endif
        mov             v4.16b, v27.16b         // accumulators seeded with bias/rounding
        mov             v5.16b, v27.16b

        tbl             v2.16b, {v0.16b}, v28.16b
        tbl             v3.16b, {v1.16b}, v28.16b

        \dot            v4.4s, v2.16b, v26.4b[0]
        \dot            v5.4s, v3.16b, v26.4b[0]

        uzp1            v4.8h, v4.8h, v5.8h     // pack low halves of 32b sums
        sqshrun         v4.8b, v4.8h, #6        // final scale + clamp to u8

        subs            \h, \h, #2
        fmov            x8, d4                  // both rows now in one GPR:
        lsr             x9, x8, #32             // row 0 in w8, row 1 in w9
        strh            w8, [\dst]
        strh            w9, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
        b.gt            2b
        ret
.endif
   1418 
        .align JUMP_ALIGN
40:     // H - 4xN
        // 4-tap horizontal filter, two rows per iteration.
        AARCH64_VALID_JUMP_TARGET
        add             \src, \src, #2          // compensate narrower H support
        ldur            s26, [\xmx, #2]         // middle 4 horizontal taps

        .align LOOP_ALIGN
4:
        ldr             d0, [\src]
        ldr             d1, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v4.4s, #0               // prep/i8mm rounds later with srshr
        movi            v5.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v0.8b, v0.8b, v24.8b    // u8 -> s8 for sdot
        sub             v1.8b, v1.8b, v24.8b
    .endif
        mov             v4.16b, v27.16b         // accumulators seeded with bias/rounding
        mov             v5.16b, v27.16b
.endif
        tbl             v2.16b, {v0.16b}, v28.16b
        tbl             v3.16b, {v1.16b}, v28.16b

        \dot            v4.4s, v2.16b, v26.4b[0]
        \dot            v5.4s, v3.16b, v26.4b[0]
.ifc \type, prep
        subs            \h, \h, #2
    .ifc \isa, neon_i8mm
        uzp1            v4.8h, v4.8h, v5.8h
        srshr           v4.8h, v4.8h, #2        // rounded >>2 (zero-seeded accums)
    .else
        shrn            v4.4h, v4.4s, #2        // rounding already in v27 bias
        shrn2           v4.8h, v5.4s, #2
    .endif
        str             q4, [\dst], #16
.else   // put
        uzp1            v4.8h, v4.8h, v5.8h     // pack low halves of 32b sums
        sqshrun         v4.8b, v4.8h, #6        // final scale + clamp to u8
        subs            \h, \h, #2
        fmov            x8, d4                  // row 0 in w8, row 1 in w9
        lsr             x9, x8, #32
        str             w8, [\dst]
        str             w9, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            4b
        ret
   1468 
   1469        .align JUMP_ALIGN
   1470 80:     // H - 8xN
   1471        AARCH64_VALID_JUMP_TARGET
   1472        ldr             d26, [\xmx]
   1473 .ifc \isa, neon_i8mm
   1474        cmp             w9, #SHARP1
        b.eq            88f             // horizontal == SHARP1: take generic dot-product path

        // 8xN USMMLA fast path (neon_i8mm, non-SHARP1 filters only).
        // v29/v30: TBL indices (at OFFSET_USMMLA in the table at x13) that
        // arrange source bytes into 2x8 matrix operands.  v26 low half holds
        // the 8 filter taps; the high half gets the same taps rotated by one
        // byte (EXT #7), so a single USMMLA (2x8 * 8x2 -> 2x2 i32 matmul)
        // produces two adjacent output pixels per accumulator.
        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
        ext             v0.8b, v26.8b, v26.8b, #7
        ins             v26.d[1], v0.d[0]

        .align LOOP_ALIGN
8:      // two rows per iteration
        ldr             q0, [\src]
        ldr             q16, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
    .ifc \type, prep
        movi            v4.4s, #0               // prep: accumulate from zero
        movi            v5.4s, #0
        movi            v20.4s, #0
        movi            v21.4s, #0
    .else
        mov             v4.16b, v27.16b         // put: preset accumulators from v27
        mov             v5.16b, v27.16b         // (constant prepared before this view;
        mov             v20.16b, v27.16b        // presumably the rounding offset for
        mov             v21.16b, v27.16b        // the final >>6 -- TODO confirm)
    .endif
        tbl             v1.16b, {v0.16b}, v29.16b       // gather row 0 operand matrices
        tbl             v2.16b, {v0.16b}, v30.16b
        tbl             v17.16b, {v16.16b}, v29.16b     // gather row 1 operand matrices
        tbl             v18.16b, {v16.16b}, v30.16b

        usmmla          v4.4s, v1.16b, v26.16b  // unsigned pixels x signed taps
        usmmla          v5.4s, v2.16b, v26.16b
        usmmla          v20.4s, v17.16b, v26.16b
        usmmla          v21.4s, v18.16b, v26.16b

        uzp1            v4.8h, v4.8h, v5.8h     // keep low 16 bits of each i32 lane
        uzp1            v20.8h, v20.8h, v21.8h
    .ifc \type, prep
        srshr           v4.8h, v4.8h, #2        // round to intermediate precision
        srshr           v20.8h, v20.8h, #2
        subs            \h, \h, #2
        stp             q4, q20, [\dst], #32    // prep: packed 16-bit rows, dst advances linearly
    .else   // put
        sqshrun         v4.8b, v4.8h, #6        // saturating narrow to 8-bit pixels
        sqshrun         v20.8b, v20.8h, #6
        subs            \h, \h, #2
        str             d4, [\dst]
        str             d20, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
    .endif
        b.gt            8b
        ret
   1524 
        .align JUMP_ALIGN
88:     // 8xN generic dot-product path: always used by neon_dotprod,
        // used by neon_i8mm only when horizontal == SHARP1
.endif  // neon_i8mm
        ldp             q29, q30, [x13, #16]    // remaining TBL windows (v28 loaded before this view)

        .align LOOP_ALIGN
8:      // two rows per iteration
        ldr             q0, [\src]
        ldr             q16, [\src, \s_strd]
        add             \src, \src, \s_strd, lsl #1
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v4.4s, #0               // usdot takes unsigned pixels directly: zero accumulators
        movi            v5.4s, #0
        movi            v20.4s, #0
        movi            v21.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v0.16b, v0.16b, v24.16b // sdot is signed x signed: re-center pixels
        sub             v16.16b, v16.16b, v24.16b       // (v24 set before this view; presumably 128 -- confirm)
    .endif
        mov             v4.16b, v27.16b         // preset accumulators from v27 (bias/rounding
        mov             v5.16b, v27.16b         // constant prepared before this view; for
        mov             v20.16b, v27.16b        // neon_dotprod it presumably also compensates
        mov             v21.16b, v27.16b        // the re-centering above -- TODO confirm)
.endif
        tbl             v1.16b, {v0.16b}, v28.16b       // sliding 4-byte windows, row 0
        tbl             v2.16b, {v0.16b}, v29.16b
        tbl             v3.16b, {v0.16b}, v30.16b
        tbl             v17.16b, {v16.16b}, v28.16b     // sliding 4-byte windows, row 1
        tbl             v18.16b, {v16.16b}, v29.16b
        tbl             v19.16b, {v16.16b}, v30.16b

        \dot            v4.4s, v1.16b, v26.4b[0]        // taps 0-3
        \dot            v5.4s, v2.16b, v26.4b[0]
        \dot            v20.4s, v17.16b, v26.4b[0]
        \dot            v21.4s, v18.16b, v26.4b[0]
        \dot            v4.4s, v2.16b, v26.4b[1]        // taps 4-7
        \dot            v5.4s, v3.16b, v26.4b[1]
        \dot            v20.4s, v18.16b, v26.4b[1]
        \dot            v21.4s, v19.16b, v26.4b[1]

        uzp1            v4.8h, v4.8h, v5.8h     // keep low 16 bits of each i32 lane
        uzp1            v20.8h, v20.8h, v21.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v4.8h, v4.8h, #2        // rounding shift
        srshr           v20.8h, v20.8h, #2
    .else
        sshr            v4.8h, v4.8h, #2        // non-rounding: rounding presumably folded into v27 preset
        sshr            v20.8h, v20.8h, #2
    .endif
        subs            \h, \h, #2
        stp             q4, q20, [\dst], #32    // prep: packed 16-bit rows
.else   // put
        sqshrun         v4.8b, v4.8h, #6        // saturating narrow to 8-bit pixels
        sqshrun         v20.8b, v20.8h, #6
        subs            \h, \h, #2
        str             d4, [\dst]
        str             d20, [\dst, \d_strd]
        add             \dst, \dst, \d_strd, lsl #1
.endif
        b.gt            8b
        ret
   1588 
        .align JUMP_ALIGN
160:    // H - 16xN
        AARCH64_VALID_JUMP_TARGET
        ldr             d26, [\xmx]             // 8 horizontal filter taps
.ifc \isa, neon_i8mm
        cmp             w9, #SHARP1
        b.eq            168f            // horizontal == SHARP1: generic path

        // Same USMMLA layout trick as the 8xN path: taps duplicated with a
        // one-byte rotation so each matmul yields two adjacent pixels.
        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
        ext             v0.8b, v26.8b, v26.8b, #7
        ins             v26.d[1], v0.d[0]

        .align LOOP_ALIGN
16:     // one 16-pixel row per iteration
        ldr             q16, [\src]
        ldur            q17, [\src, #8] // avoid 2 register TBL for small cores
        add             \src, \src, \s_strd
    .ifc \type, prep
        movi            v6.4s, #0               // prep: accumulate from zero
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
    .else
        mov             v6.16b, v27.16b         // put: preset from v27 (rounding offset
        mov             v7.16b, v27.16b         // prepared before this view -- confirm)
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
    .endif
        tbl             v0.16b, {v16.16b}, v29.16b      // left 8 outputs
        tbl             v1.16b, {v16.16b}, v30.16b
        tbl             v2.16b, {v17.16b}, v29.16b      // right 8 outputs
        tbl             v3.16b, {v17.16b}, v30.16b

        usmmla          v6.4s, v0.16b, v26.16b  // unsigned pixels x signed taps
        usmmla          v7.4s, v1.16b, v26.16b
        usmmla          v22.4s, v2.16b, v26.16b
        usmmla          v23.4s, v3.16b, v26.16b

        uzp1            v6.8h, v6.8h, v7.8h     // keep low 16 bits of each i32 lane
        uzp1            v22.8h, v22.8h, v23.8h
    .ifc \type, prep
        srshr           v6.8h, v6.8h, #2        // round to intermediate precision
        srshr           v22.8h, v22.8h, #2
        subs            \h, \h, #1
        stp             q6, q22, [\dst], #32    // prep: packed 16-bit row
    .else   // put
        sqshrun         v6.8b, v6.8h, #6        // saturating narrow to 8-bit pixels
        sqshrun2        v6.16b, v22.8h, #6
        subs            \h, \h, #1
        st1             {v6.16b}, [\dst], \d_strd
    .endif
        b.gt            16b
        ret
   1642 
        .align JUMP_ALIGN
168:    // 16xN generic dot-product path (neon_dotprod always; neon_i8mm for SHARP1)
.endif  // neon_i8mm
        ldp             q29, q30, [x13, #16]    // remaining TBL windows (v28 loaded before this view)

        .align LOOP_ALIGN
16:     // one 16-pixel row per iteration
        ldr             q16, [\src]
        ldur            q17, [\src, #12]  // avoid 2 register TBL for small cores
        add             \src, \src, \s_strd
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v6.4s, #0               // usdot takes unsigned pixels directly: zero accumulators
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v24.16b       // sdot is signed x signed: re-center
        sub             v17.16b, v17.16b, v24.16b       // pixels (v24 presumably 128 -- confirm)
    .endif
        mov             v6.16b, v27.16b         // preset accumulators from v27
        mov             v7.16b, v27.16b         // (bias/rounding constant prepared
        mov             v22.16b, v27.16b        // before this view -- confirm)
        mov             v23.16b, v27.16b
.endif
        tbl             v0.16b, {v16.16b}, v28.16b      // sliding 4-byte windows
        tbl             v1.16b, {v16.16b}, v29.16b
        tbl             v2.16b, {v16.16b}, v30.16b
        tbl             v3.16b, {v17.16b}, v28.16b
        tbl             v4.16b, {v17.16b}, v29.16b

        \dot            v6.4s, v0.16b, v26.4b[0]        // taps 0-3
        \dot            v7.4s, v1.16b, v26.4b[0]
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v6.4s, v1.16b, v26.4b[1]        // taps 4-7
        \dot            v7.4s, v2.16b, v26.4b[1]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]

        uzp1            v6.8h, v6.8h, v7.8h     // keep low 16 bits of each i32 lane
        uzp1            v22.8h, v22.8h, v23.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v6.8h, v6.8h, #2        // rounding shift
        srshr           v22.8h, v22.8h, #2
    .else
        sshr            v6.8h, v6.8h, #2        // non-rounding: rounding presumably folded into v27 preset
        sshr            v22.8h, v22.8h, #2
    .endif
        subs            \h, \h, #1
        stp             q6, q22, [\dst], #32    // prep: packed 16-bit row
.else   // put
        sqshrun         v6.8b, v6.8h, #6        // saturating narrow to 8-bit pixels
        sqshrun2        v6.16b, v22.8h, #6
        subs            \h, \h, #1
        st1             {v6.16b}, [\dst], \d_strd
.endif
        b.gt            16b
        ret
   1703 
        .align JUMP_ALIGN
320:    // H - 32xN+
640:
1280:
        AARCH64_VALID_JUMP_TARGET
        ldr             d26, [\xmx]             // 8 horizontal filter taps
.ifc \type, put
        sub             \d_strd, \d_strd, \w, uxtw      // strides become "remainder after one
.endif
        sub             \s_strd, \s_strd, \w, uxtw      // row", since pointers advance 16 bytes
        mov             w8, \w                          // per chunk; w8 = column counter
.ifc \isa, neon_i8mm
        cmp             w9, #SHARP1
        b.eq            328f            // horizontal == SHARP1: generic path

        // Same USMMLA layout trick as the narrower paths.
        ldp             q29, q30, [x13, #(OFFSET_USMMLA)]
        ext             v0.8b, v26.8b, v26.8b, #7
        ins             v26.d[1], v0.d[0]

        .align LOOP_ALIGN
32:     // inner loop: one 16-pixel chunk of the current row
        ldr             q16, [\src]
        ldur            q17, [\src, #8] // avoid 2 register TBL for small cores
        add             \src, \src, #16
    .ifc \type, prep
        movi            v6.4s, #0               // prep: accumulate from zero
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
    .else
        mov             v6.16b, v27.16b         // put: preset from v27 (rounding offset
        mov             v7.16b, v27.16b         // prepared before this view -- confirm)
        mov             v22.16b, v27.16b
        mov             v23.16b, v27.16b
    .endif
        tbl             v0.16b, {v16.16b}, v29.16b      // left 8 outputs
        tbl             v1.16b, {v16.16b}, v30.16b
        tbl             v2.16b, {v17.16b}, v29.16b      // right 8 outputs
        tbl             v3.16b, {v17.16b}, v30.16b

        usmmla          v6.4s, v0.16b, v26.16b  // unsigned pixels x signed taps
        usmmla          v7.4s, v1.16b, v26.16b
        usmmla          v22.4s, v2.16b, v26.16b
        usmmla          v23.4s, v3.16b, v26.16b

        uzp1            v6.8h, v6.8h, v7.8h     // keep low 16 bits of each i32 lane
        uzp1            v22.8h, v22.8h, v23.8h
    .ifc \type, prep
        srshr           v6.8h, v6.8h, #2        // round to intermediate precision
        srshr           v22.8h, v22.8h, #2
        subs            w8, w8, #16
        stp             q6, q22, [\dst], #32    // prep: packed 16-bit chunk
    .else   // put
        sqshrun         v6.8b, v6.8h, #6        // saturating narrow to 8-bit pixels
        sqshrun2        v6.16b, v22.8h, #6
        subs            w8, w8, #16
        str             q6, [\dst], #16
    .endif
        b.gt            32b

        // end of row: apply the remainder strides, reset the column counter
        add             \src, \src, \s_strd
    .ifc \type, put
        add             \dst, \dst, \d_strd
    .endif
        mov             w8, \w
        subs            \h, \h, #1
        b.gt            32b             // next row re-enters the chunk loop
        ret
   1773 
        .align JUMP_ALIGN
328:    // 32xN+ generic dot-product path (neon_dotprod always; neon_i8mm for SHARP1)
.endif  // neon_i8mm
        ldp             q29, q30, [x13, #16]    // remaining TBL windows (v28 loaded before this view)

        .align LOOP_ALIGN
32:     // inner loop: one 16-pixel chunk of the current row
        ldr             q16, [\src]
        ldur            q17, [\src, #12]  // avoid 2 register TBL for small cores
        add             \src, \src, #16
.ifc \type\()_\isa, prep_neon_i8mm
        movi            v6.4s, #0               // usdot takes unsigned pixels directly: zero accumulators
        movi            v7.4s, #0
        movi            v22.4s, #0
        movi            v23.4s, #0
.else
    .ifc \isa, neon_dotprod
        sub             v16.16b, v16.16b, v24.16b       // sdot is signed x signed: re-center
        sub             v17.16b, v17.16b, v24.16b       // pixels (v24 presumably 128 -- confirm)
    .endif
        mov             v6.16b, v27.16b         // preset accumulators from v27
        mov             v7.16b, v27.16b         // (bias/rounding constant prepared
        mov             v22.16b, v27.16b        // before this view -- confirm)
        mov             v23.16b, v27.16b
.endif
        tbl             v0.16b, {v16.16b}, v28.16b      // sliding 4-byte windows
        tbl             v1.16b, {v16.16b}, v29.16b
        tbl             v2.16b, {v16.16b}, v30.16b
        tbl             v3.16b, {v17.16b}, v28.16b
        tbl             v4.16b, {v17.16b}, v29.16b

        \dot            v6.4s, v0.16b, v26.4b[0]        // taps 0-3
        \dot            v7.4s, v1.16b, v26.4b[0]
        \dot            v22.4s, v2.16b, v26.4b[0]
        \dot            v23.4s, v3.16b, v26.4b[0]
        \dot            v6.4s, v1.16b, v26.4b[1]        // taps 4-7
        \dot            v7.4s, v2.16b, v26.4b[1]
        \dot            v22.4s, v3.16b, v26.4b[1]
        \dot            v23.4s, v4.16b, v26.4b[1]

        uzp1            v6.8h, v6.8h, v7.8h     // keep low 16 bits of each i32 lane
        uzp1            v22.8h, v22.8h, v23.8h
.ifc \type, prep
    .ifc \isa, neon_i8mm
        srshr           v6.8h, v6.8h, #2        // rounding shift
        srshr           v22.8h, v22.8h, #2
    .else
        sshr            v6.8h, v6.8h, #2        // non-rounding: rounding presumably folded into v27 preset
        sshr            v22.8h, v22.8h, #2
    .endif
        subs            w8, w8, #16
        stp             q6, q22, [\dst], #32    // prep: packed 16-bit chunk
.else   // put
        sqshrun         v6.8b, v6.8h, #6        // saturating narrow to 8-bit pixels
        sqshrun2        v6.16b, v22.8h, #6
        subs            w8, w8, #16
        str             q6, [\dst], #16
.endif
        b.gt            32b

        // end of row: apply the remainder strides, reset the column counter
        add             \src, \src, \s_strd
.ifc \type, put
        add             \dst, \dst, \d_strd
.endif
        mov             w8, \w
        subs            \h, \h, #1
        b.gt            32b             // next row re-enters the chunk loop
        ret
endfunc
   1843 
// Relative-offset jump table indexed by width class; each entry is a backward
// reference to the matching handler above, stored as an offset from the table.
jumptable \type\()_8tap_h_\isa\()_tbl
        .word 1280b - \type\()_8tap_h_\isa\()_tbl
        .word 640b  - \type\()_8tap_h_\isa\()_tbl
        .word 320b  - \type\()_8tap_h_\isa\()_tbl
        .word 160b  - \type\()_8tap_h_\isa\()_tbl
        .word 80b   - \type\()_8tap_h_\isa\()_tbl
        .word 40b   - \type\()_8tap_h_\isa\()_tbl
.ifc \type, put
        .word 20b   - \type\()_8tap_h_\isa\()_tbl      // 2-wide entry exists only for put
.endif
endjumptable
.endm
   1856 
// Instantiate the 8-tap filter functions for each (type, isa) combination.
// Arguments after the dot-product mnemonic map ABI registers to the macro's
// named operands; prep and put have different C signatures, hence different
// register assignments.

// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, sdot, neon_dotprod, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn  put, sdot, neon_dotprod, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

#if HAVE_I8MM
ENABLE_I8MM

// i8mm variants use USDOT/USMMLA (unsigned x signed), so they skip the
// pixel re-centering the sdot variants need.
// dst(x0), d_strd(x7), src(x1), s_strd(x2), w(w3), h(w4), mx(w5), my(w6)
// xmx(x5), xmy(x6), ldst(x5), lsrc(x6), wd_strd(w7)
filter_8tap_fn prep, usdot, neon_i8mm, x0, x7, x1, x2, w3, w4, w5, w6, x5, x6, x5, x6, w7

// dst(x0) d_strd(x1) src(x2) s_strd(x3) w(w4) h(w5) mx(w6) my(w7)
// xmx(x6), xmy(x7), ldst(x6), lsrc(x7), wd_strd(w1)
filter_8tap_fn  put, usdot, neon_i8mm, x0, x1, x2, x3, w4, w5, w6, w7, x6, x7, x6, x7, w1

DISABLE_I8MM
#endif  // HAVE_I8MM

DISABLE_DOTPROD
#endif  // HAVE_DOTPROD