tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc.S (222547B)


      1 /*
      2 * Copyright © 2023, VideoLAN and dav1d authors
      3 * Copyright © 2023, Loongson Technology Corporation Limited
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/loongarch/loongson_asm.S"
     29 
     30 /*
     31 static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
     32                              const pixel *src, const ptrdiff_t src_stride,
     33                              const int16_t *const abcd, int mx, int my
     34                              HIGHBD_DECL_SUFFIX)
     35 */
     36 .macro vld_filter_row dst, src, inc
        // Load one 8-byte row of dav1d_mc_warp_filter (table base in t4)
        // for the filter position in \src: row index = ((\src + 512) >> 10) + 64,
        // scaled by 8 bytes per row.  Also steps \src by \inc.  Clobbers t3.
     37    addi.w          t3,       \src,     512       // +512: round before >>10
     38    srai.w          t3,       t3,       10
     39    add.w           \src,     \src,     \inc      // advance filter position
     40    addi.w          t3,       t3,       64        // table rows are biased by 64
     41    slli.w          t3,       t3,       3         // 8 bytes per filter row
     42    fldx.d          \dst,     t4,       t3
     43 .endm
     44 
     45 .macro warp_filter_horz_lsx
        // Horizontal 8-tap warp filter for one source row (LSX).
        //   a2  = src row pointer (advanced by stride a3 here)
        //   a5  = mx filter-position accumulator (advanced by abcd[1] in t1)
        //   t0  = abcd[0] per-pixel step, t4 = dav1d_mc_warp_filter base
        //   vr20 = 0x80 byte splat (set up by the caller)
        // Out: vr11 = eight un-rounded horizontal sums (caller applies >>3).
        // Clobbers t3, t5, vr0-vr19.
     46    addi.w          t5,       a5,       0
     47    vld             vr10,     a2,       0
     48    add.d           a2,       a2,       a3
     49 
        // eight 8-tap filter rows, position stepped by abcd[0]
     50    vld_filter_row f0, t5, t0
     51    vld_filter_row f1, t5, t0
     52    vld_filter_row f2, t5, t0
     53    vld_filter_row f3, t5, t0
     54    vld_filter_row f4, t5, t0
     55    vld_filter_row f5, t5, t0
     56    vld_filter_row f6, t5, t0
     57    vld_filter_row f7, t5, t0
     58 
     59    vxor.v          vr10,     vr10,     vr20      // pixels ^ 0x80: u8 -> signed
     60 
        // build the 8 byte-shifted pixel windows, pair them up, and multiply
        // each window by its (paired) filter row
     61    vbsrl.v         vr8,      vr10,     1
     62    vbsrl.v         vr9,      vr10,     2
     63    vilvl.d         vr8,      vr8,      vr10
     64    vilvl.d         vr0,      vr1,      vr0
     65    vmulwev.h.b     vr11,     vr8,      vr0
     66    vmulwod.h.b     vr12,     vr8,      vr0
     67    vbsrl.v         vr8,      vr10,     3
     68    vbsrl.v         vr19,     vr10,     4
     69    vilvl.d         vr8,      vr8,      vr9
     70    vilvl.d         vr2,      vr3,      vr2
     71    vmulwev.h.b     vr13,     vr8,      vr2
     72    vmulwod.h.b     vr14,     vr8,      vr2
     73    vbsrl.v         vr8,      vr10,     5
     74    vbsrl.v         vr9,      vr10,     6
     75    vilvl.d         vr8,      vr8,      vr19
     76    vilvl.d         vr4,      vr5,      vr4
     77    vmulwev.h.b     vr15,     vr8,      vr4
     78    vmulwod.h.b     vr16,     vr8,      vr4
     79    vbsrl.v         vr8,      vr10,     7
     80    vilvl.d         vr8,      vr8,      vr9
     81    vilvl.d         vr6,      vr7,      vr6
     82    vmulwev.h.b     vr17,     vr8,      vr6
     83    vmulwod.h.b     vr18,     vr8,      vr6
     84 
        // pairwise reduction: fold each window's 8 products to one i16 sum,
        // leaving the 8 pixel results packed in vr11
     85    vadd.h          vr11,     vr11,     vr12
     86    vadd.h          vr13,     vr13,     vr14
     87    vadd.h          vr15,     vr15,     vr16
     88    vadd.h          vr17,     vr17,     vr18
     89    vpickev.h       vr12,     vr13,     vr11
     90    vpickod.h       vr14,     vr13,     vr11
     91    vpickev.h       vr16,     vr17,     vr15
     92    vpickod.h       vr18,     vr17,     vr15
     93    vadd.h          vr11,     vr12,     vr14
     94    vadd.h          vr15,     vr16,     vr18
     95    vpickev.h       vr12,     vr15,     vr11
     96    vpickod.h       vr14,     vr15,     vr11
     97    vadd.h          vr11,     vr12,     vr14
     98 
     99    add.d           a5,       a5,       t1        // mx += abcd[1]
    100 .endm
    101 
    102 .macro transpose_8x8b_extend_lsx in0, in1, in2, in3, in4, in5, in6, in7
        // Transpose the 8x8 byte matrix held in the low 64 bits of
        // \in0-\in7, then sign-extend every byte to 16 bits, leaving the
        // transposed data as eight i16 vectors in \in0-\in7 (lane order
        // follows the pack/pick sequence below, matching the vertical
        // filter in warp_affine_8x8*_8bpc_lsx).
    103    vilvl.b         \in0,     \in1,     \in0      // byte interleave step
    104    vilvl.b         \in2,     \in3,     \in2
    105    vilvl.b         \in4,     \in5,     \in4
    106    vilvl.b         \in6,     \in7,     \in6
    107 
    108    vpackev.h       \in1,     \in2,     \in0      // 16-bit pack step
    109    vpackod.h       \in3,     \in2,     \in0
    110    vpackev.h       \in5,     \in6,     \in4
    111    vpackod.h       \in7,     \in6,     \in4
    112 
    113    vpackev.w       \in0,     \in5,     \in1      // 32-bit pack step
    114    vpackod.w       \in2,     \in5,     \in1
    115    vpackev.w       \in1,     \in7,     \in3
    116    vpackod.w       \in3,     \in7,     \in3
    117 
        // split each 16-byte result into two sign-extended i16 vectors:
        // vexth takes the high 8 bytes, vsllwil (shift 0) the low 8 bytes
    118    vexth.h.b       \in4,     \in0
    119    vsllwil.h.b     \in0,     \in0,     0
    120    vexth.h.b       \in5,     \in1
    121    vsllwil.h.b     \in1,     \in1,     0
    122    vexth.h.b       \in6,     \in2
    123    vsllwil.h.b     \in2,     \in2,     0
    124    vexth.h.b       \in7,     \in3
    125    vsllwil.h.b     \in3,     \in3,     0
    126 .endm
    127 
    128 .macro warp t, shift
        // warp_affine_8x8\t\()_8bpc_lsx(dst, dst_stride, src, src_stride,
        //                               abcd, mx, my)
        //   a0=dst  a1=dst_stride  a2=src  a3=src_stride  a4=abcd  a5=mx  a6=my
        // \t empty: 8-bit "put" store, \shift = 11 (see instantiations below);
        // \t = t:   16-bit intermediate store, \shift = 7.
    129 function warp_affine_8x8\t\()_8bpc_lsx
    130    addi.d          sp,       sp,      -64
        // f24-f31 are callee-saved; they hold the 8-row vertical window below
    131    fst.d           f24,      sp,      0
    132    fst.d           f25,      sp,      8
    133    fst.d           f26,      sp,      16
    134    fst.d           f27,      sp,      24
    135    fst.d           f28,      sp,      32
    136    fst.d           f29,      sp,      40
    137    fst.d           f30,      sp,      48
    138    fst.d           f31,      sp,      56
    139 
    140    ld.h            t0,       a4,      0         // t0 = abcd[0]
    141    ld.h            t1,       a4,      2         // t1 = abcd[1]
    142    ld.h            t2,       a4,      4         // t2 = abcd[2]
    143    ld.h            a4,       a4,      6         // a4 = abcd[3]
    144 
    145    li.d            t7,       8                  // 8 output rows
    146    alsl.w          t3,       a3,      a3,     1
    147    sub.d           a2,       a2,      t3        // src -= 3 * stride
    148    addi.d          a2,       a2,      -3        // src -= 3 (8-tap border)
    149    la.local        t4,       dav1d_mc_warp_filter
    150 
    151 .ifnb \t
    152    slli.d          a1,       a1,      1         // 16-bit output: stride to bytes
    153 .endif
    154 
    155    li.w            t3,       128
    156    vreplgr2vr.b    vr20,     t3                 // 0x80 splat for the h-filter bias
    157 .ifb \t
    158    vreplgr2vr.h    vr21,     t3                 // put: +128 undoes the bias
    159 .else
    160    li.w            t3,       2048
    161    vreplgr2vr.h    vr21,     t3                 // prep: +2048 undoes the bias
    162 .endif
        // prime the first 7 rows of the vertical window in vr24-vr30
    163    warp_filter_horz_lsx
    164    vsrari.h        vr24,     vr11,    3
    165    warp_filter_horz_lsx
    166    vsrari.h        vr25,     vr11,    3
    167    warp_filter_horz_lsx
    168    vsrari.h        vr26,     vr11,    3
    169    warp_filter_horz_lsx
    170    vsrari.h        vr27,     vr11,    3
    171    warp_filter_horz_lsx
    172    vsrari.h        vr28,     vr11,    3
    173    warp_filter_horz_lsx
    174    vsrari.h        vr29,     vr11,    3
    175    warp_filter_horz_lsx
    176    vsrari.h        vr30,     vr11,    3
    177 
    178 1:                                              // per-output-row loop
    179    addi.d          t6,       a6,      0         // t6 = my for this row
    180    warp_filter_horz_lsx                         // last row of the window
    181    vsrari.h        vr31,     vr11,    3
    182 
        // eight vertical filter rows, position stepped by abcd[2]
    183    vld_filter_row f0, t6, t2
    184    vld_filter_row f1, t6, t2
    185    vld_filter_row f2, t6, t2
    186    vld_filter_row f3, t6, t2
    187    vld_filter_row f4, t6, t2
    188    vld_filter_row f5, t6, t2
    189    vld_filter_row f6, t6, t2
    190    vld_filter_row f7, t6, t2
    191 
    192    transpose_8x8b_extend_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    193 
        // 8-tap vertical filter: accumulate across the window rows vr24-vr31
    194    vmulwev.w.h     vr16,     vr24,    vr0
    195    vmulwod.w.h     vr17,     vr24,    vr0
    196    vmaddwev.w.h    vr16,     vr25,    vr1
    197    vmaddwod.w.h    vr17,     vr25,    vr1
    198    vmaddwev.w.h    vr16,     vr26,    vr2
    199    vmaddwod.w.h    vr17,     vr26,    vr2
    200    vmaddwev.w.h    vr16,     vr27,    vr3
    201    vmaddwod.w.h    vr17,     vr27,    vr3
    202    vmaddwev.w.h    vr16,     vr28,    vr4
    203    vmaddwod.w.h    vr17,     vr28,    vr4
    204    vmaddwev.w.h    vr16,     vr29,    vr5
    205    vmaddwod.w.h    vr17,     vr29,    vr5
    206    vmaddwev.w.h    vr16,     vr30,    vr6
    207    vmaddwod.w.h    vr17,     vr30,    vr6
    208    vmaddwev.w.h    vr16,     vr31,    vr7
    209    vmaddwod.w.h    vr17,     vr31,    vr7
    210 
    211    vssrarni.h.w    vr16,     vr16,    \shift    // round + saturate-narrow
    212    vssrarni.h.w    vr17,     vr17,    \shift
    213    vilvl.h         vr16,     vr17,    vr16      // re-interleave even/odd lanes
    214    vadd.h          vr16,     vr16,    vr21      // remove the pixel bias
    215 
        // slide the vertical window up by one row
    216    vor.v           vr24,     vr25,    vr25
    217    vor.v           vr25,     vr26,    vr26
    218    vor.v           vr26,     vr27,    vr27
    219    vor.v           vr27,     vr28,    vr28
    220    vor.v           vr28,     vr29,    vr29
    221    vor.v           vr29,     vr30,    vr30
    222    vor.v           vr30,     vr31,    vr31
    223 
    224 .ifb \t
    225    vssrarni.bu.h   vr16,     vr16,    0         // put: saturate to u8
    226 .endif
    227 
    228    addi.d          t7,       t7,      -1
    229 .ifnb \t
    230    vst             vr16,     a0,      0         // one 16-bit row
    231 .else
    232    vstelm.d        vr16,     a0,      0,   0    // one 8-bit row
    233 .endif
    234    add.d           a0,       a1,      a0
    235 
    236    add.d           a6,       a6,      a4        // my += abcd[3]
    237    blt             zero,     t7,      1b
    238 
    239    fld.d           f24,      sp,      0
    240    fld.d           f25,      sp,      8
    241    fld.d           f26,      sp,      16
    242    fld.d           f27,      sp,      24
    243    fld.d           f28,      sp,      32
    244    fld.d           f29,      sp,      40
    245    fld.d           f30,      sp,      48
    246    fld.d           f31,      sp,      56
    247    addi.d          sp,       sp,      64
    248 endfunc
    249 .endm
    250 
    251 warp  , 11
        // ^ put form:  8-bit stores, final shift 11
        // v prep form: 16-bit stores, final shift 7
    252 warp t, 7
    253 
    254 .macro FILTER_WARP_RND_P_LASX in0, in1, in2, out0, out1, out2, out3
        // Horizontal warp filtering of four pixel positions (LASX).
        //   \in0 = source row duplicated into both 128-bit halves
        //   \in1 = mx position of the first tap set; t3 = running mx,
        //          stepped by t0 (abcd[0]) for the remaining three
        //   \in2 = shuf0-derived control selecting four 8-pixel windows
        //   t5   = dav1d_mc_warp_filter base
        // Two dot-product results are inserted into \out0 (field \out1)
        // and two into \out2 (field \out3).  Clobbers t4, xr2-xr6.
    255    xvshuf.b        xr2,    \in0,     \in0,     \in2
    256 
    257    addi.w          t4,     \in1,     512
    258    srai.w          t4,     t4,       10
    259    addi.w          t4,     t4,       64
    260    slli.w          t4,     t4,       3
    261    vldx            vr3,    t5,       t4
    262    add.w           t3,     t3,       t0   // tmx += abcd[0]
    263 
    264    addi.w          t4,     t3,       512
    265    srai.w          t4,     t4,       10
    266    addi.w          t4,     t4,       64
    267    slli.w          t4,     t4,       3
    268    vldx            vr4,    t5,       t4
    269    add.w           t3,     t3,       t0   // tmx += abcd[0]
    270 
    271    addi.w          t4,     t3,       512
    272    srai.w          t4,     t4,       10
    273    addi.w          t4,     t4,       64
    274    slli.w          t4,     t4,       3
    275    vldx            vr5,    t5,       t4
    276    add.w           t3,     t3,       t0   // tmx += abcd[0]
    277 
    278    addi.w          t4,     t3,       512
    279    srai.w          t4,     t4,       10
    280    addi.w          t4,     t4,       64
    281    slli.w          t4,     t4,       3
    282    vldx            vr6,    t5,       t4
    283    add.w           t3,     t3,       t0   // tmx += abcd[0]
    284 
        // gather the four filter rows into xr3, ordered to match the
        // window layout produced by the shuffle above
    285    xvinsve0.d      xr3,    xr5,      1
    286    xvinsve0.d      xr3,    xr4,      2
    287    xvinsve0.d      xr3,    xr6,      3
    288 
        // u8 pixel * i8 coeff, then fold each 8-tap row to a single sum
    289    xvmulwev.h.bu.b xr4,    xr2,      xr3
    290    xvmulwod.h.bu.b xr5,    xr2,      xr3
    291    xvilvl.d        xr2,    xr5,      xr4
    292    xvilvh.d        xr3,    xr5,      xr4
    293    xvhaddw.w.h     xr2,    xr2,      xr2
    294    xvhaddw.w.h     xr3,    xr3,      xr3
    295    xvhaddw.d.w     xr2,    xr2,      xr2
    296    xvhaddw.d.w     xr3,    xr3,      xr3
    297    xvhaddw.q.d     xr2,    xr2,      xr2
    298    xvhaddw.q.d     xr3,    xr3,      xr3
    299 
    300    xvextrins.w     \out0,  xr2,      \out1
    301    xvextrins.w     \out2,  xr3,      \out3
    302 .endm
    303 
    304 .macro FILTER_WARP_CLIP_LASX in0, in1, in2, out0, out1
        // Vertical 8-tap warp filter step (LASX).
        //   \in0 = my accumulator register, advanced by \in1 on entry;
        //          t2 is stepped by t7 (abcd[2]) for the second tap set
        //   \in2 = one register of horizontally filtered columns (i16)
        //   t5   = dav1d_mc_warp_filter base
        // The dot-product results are inserted into \out0 (field \out1).
        // Clobbers t6, vr1, vr2, xr0, xr3.
    305    add.w           \in0,     \in0,    \in1
    306    addi.w          t6,       \in0,    512
    307    srai.w          t6,       t6,      10
    308    addi.w          t6,       t6,      64
    309    slli.w          t6,       t6,      3
    310    fldx.d          f1,       t5,      t6   // first 8-tap filter row
    311 
    312    add.w           t2,       t2,      t7   // tmy += abcd[2]
    313    addi.w          t6,       t2,      512
    314    srai.w          t6,       t6,      10
    315    addi.w          t6,       t6,      64
    316    slli.w          t6,       t6,      3
    317    fldx.d          f2,       t5,      t6   // second 8-tap filter row
    318 
    319    vilvl.d         vr0,      vr2,     vr1
    320    vext2xv.h.b     xr0,      xr0          // widen coeffs to i16
    321    xvmulwev.w.h    xr3,      \in2,    xr0
    322    xvmaddwod.w.h   xr3,      \in2,    xr0  // even + odd lanes: dot product
    323    xvhaddw.d.w     xr3,      xr3,     xr3
    324    xvhaddw.q.d     xr3,      xr3,     xr3
    325    xvextrins.w     \out0,    xr3,     \out1
    326 .endm
    327 
    328 const shuf0
        // xvshuf.b control for FILTER_WARP_RND_P_LASX: per 128-bit half,
        // two 8-byte pixel windows.  Low half: windows at byte offsets
        // 0 and 2; high half: offsets 1 and 3.  (+4 on every index gives
        // the companion control for offsets 4,6 / 5,7.)
    329 .byte  0, 1, 2, 3, 4, 5, 6, 7, 2, 3, 4, 5, 6, 7, 8, 9
    330 .byte  1, 2, 3, 4, 5, 6, 7, 8, 3, 4, 5, 6, 7, 8, 9, 10
    331 endconst
    332 
    333 const warp_sh
        // Shuffle controls for sliding the vertical 8-row window:
        // indices 2..17 shift each row buffer left by one 16-bit lane
        // (indices >= 16 select bytes from the first xvshuf.b source,
        // i.e. the register holding the later rows); the 18,19 pair is
        // stepped by 2 per row to pull in successive replacement lanes.
    334 .rept 2
    335 .byte 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17
    336 .endr
    337 .rept 2
    338 .byte 18, 19, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
    339 .endr
    340 endconst
    341 
    342 .macro warp_lasx t, shift
        // warp_affine_8x8\t\()_8bpc_lasx(dst, dst_stride, src, src_stride,
        //                                abcd, mx, my)
        //   a0=dst  a1=dst_stride  a2=src  a3=src_stride  a4=abcd  a5=mx  a6=my
        // \t empty: 8-bit "put" store, \shift = 11 (see instantiations below);
        // \t = t:   16-bit intermediate store, \shift = 7.
        // Fix: the put-path stores now use \shift consistently instead of a
        // hard-coded 11 (identical behavior for the existing instantiation).
    343 function warp_affine_8x8\t\()_8bpc_lasx
    344    addi.d          sp,       sp,      -16
    345    ld.h            t0,       a4,      0   // abcd[0]
    346    ld.h            t1,       a4,      2   // abcd[1]
    347    fst.d           f24,      sp,      0   // callee-saved: shuffle controls
    348    fst.d           f25,      sp,      8
    349 
    350    alsl.w          t2,       a3,      a3,     1
    351    addi.w          t3,       a5,      0   // t3 = running mx copy
    352    la.local        t4,       warp_sh
    353    la.local        t5,       dav1d_mc_warp_filter
    354    sub.d           a2,       a2,      t2  // src -= 3 * stride
    355    addi.d          a2,       a2,      -3  // src -= 3 (8-tap border)
    356    vld             vr0,      a2,      0
    357    xvld            xr24,     t4,      0   // window-slide shuffle control
    358    xvld            xr25,     t4,      32  // replacement-index pairs
    359    la.local        t2,       shuf0
    360    xvld            xr1,      t2,      0   // windows 0,2 / 1,3
    361    xvpermi.q       xr0,      xr0,     0x00 // duplicate row into both halves
    362    xvaddi.bu        xr9,    xr1,      4   // windows 4,6 / 5,7
        // horizontal pass over the 15 source rows: rows 0-7 end up in
        // xr12-xr15, rows 8-14 in xr16-xr19 (each rounded >>3 when the
        // xvsrarni steps combine the 32-bit accumulators)
    363    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
    364    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
    365 
    366    add.w           a5,       a5,      t1  // mx += abcd[1]
    367    or              t3,       a5,      a5
    368    add.d           a2,       a2,      a3
    369    vld             vr0,      a2,      0
    370    xvpermi.q       xr0,      xr0,     0x00
    371    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
    372    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
    373 
    374    add.w           a5,       a5,      t1
    375    or              t3,       a5,      a5
    376    add.d           a2,       a2,      a3
    377    vld             vr0,      a2,      0
    378    xvpermi.q       xr0,      xr0,     0x00
    379    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
    380    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
    381 
    382    add.w           a5,       a5,      t1
    383    or              t3,       a5,      a5
    384    add.d           a2,       a2,      a3
    385    vld             vr0,      a2,      0
    386    xvpermi.q       xr0,      xr0,     0x00
    387    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
    388    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
    389 
    390    add.w           a5,       a5,      t1
    391    or              t3,       a5,      a5
    392    add.d           a2,       a2,      a3
    393    vld             vr0,      a2,      0
    394    xvpermi.q       xr0,      xr0,     0x00
    395    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x00, xr13, 0x00
    396    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x00, xr15, 0x00
    397 
    398    add.w           a5,       a5,      t1
    399    or              t3,       a5,      a5
    400    add.d           a2,       a2,      a3
    401    vld             vr0,      a2,      0
    402    xvpermi.q       xr0,      xr0,     0x00
    403    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x10, xr13, 0x10
    404    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x10, xr15, 0x10
    405 
    406    add.w           a5,       a5,      t1
    407    or              t3,       a5,      a5
    408    add.d           a2,       a2,      a3
    409    vld             vr0,      a2,      0
    410    xvpermi.q       xr0,      xr0,     0x00
    411    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x20, xr13, 0x20
    412    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x20, xr15, 0x20
    413 
    414    add.w           a5,       a5,      t1
    415    or              t3,       a5,      a5
    416    add.d           a2,       a2,      a3
    417    vld             vr0,      a2,      0
    418    xvpermi.q       xr0,      xr0,     0x00
    419    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr12, 0x30, xr13, 0x30
    420    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr14, 0x30, xr15, 0x30
    421 
        // round + combine rows 0-3 with rows 4-7
    422    xvsrarni.h.w    xr12,     xr7,     3
    423    xvsrarni.h.w    xr13,     xr8,     3
    424    xvsrarni.h.w    xr14,     xr10,    3
    425    xvsrarni.h.w    xr15,     xr11,    3
    426 
    427    add.w           a5,       a5,      t1
    428    or              t3,       a5,      a5
    429    add.d           a2,       a2,      a3
    430    vld             vr0,      a2,      0
    431    xvpermi.q       xr0,      xr0,     0x00
    432    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x00, xr8, 0x00
    433    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x00, xr11, 0x00
    434 
    435    add.w           a5,       a5,      t1
    436    or              t3,       a5,      a5
    437    add.d           a2,       a2,      a3
    438    vld             vr0,      a2,      0
    439    xvpermi.q       xr0,      xr0,     0x00
    440    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x10, xr8, 0x10
    441    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x10, xr11, 0x10
    442 
    443    add.w           a5,       a5,      t1
    444    or              t3,       a5,      a5
    445    add.d           a2,       a2,      a3
    446    vld             vr0,      a2,      0
    447    xvpermi.q       xr0,      xr0,     0x00
    448    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x20, xr8, 0x20
    449    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x20, xr11, 0x20
    450 
    451    add.w           a5,       a5,      t1
    452    or              t3,       a5,      a5
    453    add.d           a2,       a2,      a3
    454    vld             vr0,      a2,      0
    455    xvpermi.q       xr0,      xr0,     0x00
    456    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr7, 0x30, xr8, 0x30
    457    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr10, 0x30, xr11, 0x30
    458 
    459    add.w           a5,       a5,      t1
    460    or              t3,       a5,      a5
    461    add.d           a2,       a2,      a3
    462    vld             vr0,      a2,      0
    463    xvpermi.q       xr0,      xr0,     0x00
    464    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x00, xr17, 0x00
    465    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x00, xr19, 0x00
    466 
    467    add.w           a5,       a5,      t1
    468    or              t3,       a5,      a5
    469    add.d           a2,       a2,      a3
    470    vld             vr0,      a2,      0
    471    xvpermi.q       xr0,      xr0,     0x00
    472    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x10, xr17, 0x10
    473    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x10, xr19, 0x10
    474 
    475    add.w           a5,       a5,      t1
    476    or              t3,       a5,      a5
    477    add.d           a2,       a2,      a3
    478    vld             vr0,      a2,      0
    479    xvpermi.q       xr0,      xr0,     0x00
    480    FILTER_WARP_RND_P_LASX xr0, a5, xr1, xr16, 0x20, xr17, 0x20
    481    FILTER_WARP_RND_P_LASX xr0, t3, xr9, xr18, 0x20, xr19, 0x20
    482 
        // round + combine rows 8-11 with rows 12-14
    483    xvsrarni.h.w    xr16,     xr7,     3
    484    xvsrarni.h.w    xr17,     xr8,     3
    485    xvsrarni.h.w    xr18,     xr10,    3
    486    xvsrarni.h.w    xr19,     xr11,    3
    487 
    488    addi.w          t2,       a6,      0   // my
    489    ld.h            t7,       a4,      4   // abcd[2]
    490    ld.h            t8,       a4,      6   // abcd[3]
    491 
    492 .ifnb \t
    493    slli.d          a1,       a1,      1   // 16-bit output: stride to bytes
    494 .endif
    495 
    496    // y = 0
    497    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    498    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    499    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    500    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30
    501 
        // slide the 8-row window down by one (warp_sh control in xr24;
        // xr25 advances the index of the lane pulled from xr16-xr19)
    502    xvshuf.b         xr12,     xr16,    xr12,   xr24
    503    xvshuf.b         xr13,     xr17,    xr13,   xr24
    504    xvshuf.b         xr14,     xr18,    xr14,   xr24
    505    xvshuf.b         xr15,     xr19,    xr15,   xr24
    506    xvextrins.h      xr24,     xr25,    0x70
    507 
    508    add.w           a6,       a6,      t8   // my += abcd[3]
    509    addi.w          t2,       a6,      0
    510    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    511    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    512    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    513    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30
    514 
    515 .ifnb \t
        // store two 16-bit rows
    516    xvssrarni.h.w   xr21,     xr20,     \shift
    517    xvpermi.q       xr22,     xr21,     0x01
    518    vilvl.h         vr23,     vr22,     vr21
    519    vilvh.h         vr21,     vr22,     vr21
    520    vst             vr23,     a0,       0
    521    vstx            vr21,     a0,       a1
    522 .else
        // store two 8-bit rows
    523    xvssrarni.hu.w   xr21,    xr20,     \shift
    524    xvssrlni.bu.h    xr22,    xr21,     0
    525    xvpermi.q        xr23,    xr22,     0x01
    526    vilvl.b          vr21,    vr23,     vr22
    527    fst.d            f21,     a0,       0
    528    add.d            a0,      a0,       a1
    529    vstelm.d         vr21,    a0,       0,     1
    530 .endif
    531 
    532    xvaddi.bu        xr25,     xr25,    2
    533    xvshuf.b         xr12,     xr16,    xr12,   xr24
    534    xvshuf.b         xr13,     xr17,    xr13,   xr24
    535    xvshuf.b         xr14,     xr18,    xr14,   xr24
    536    xvshuf.b         xr15,     xr19,    xr15,   xr24
    537    xvextrins.h      xr24,     xr25,    0x70
    538 
    539    add.w           a6,       a6,      t8
    540    addi.w          t2,       a6,      0
    541    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    542    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    543    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    544    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30
    545 
    546    xvaddi.bu        xr25,     xr25,    2
    547    xvshuf.b         xr12,     xr16,    xr12,   xr24
    548    xvshuf.b         xr13,     xr17,    xr13,   xr24
    549    xvshuf.b         xr14,     xr18,    xr14,   xr24
    550    xvshuf.b         xr15,     xr19,    xr15,   xr24
    551    xvextrins.h      xr24,     xr25,    0x70
    552 
    553    add.w           a6,       a6,      t8
    554    addi.w          t2,       a6,      0
    555    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    556    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    557    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    558    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30
    559 
    560 .ifnb \t
    561    xvssrarni.h.w   xr21,     xr20,     \shift
    562    alsl.d          a0,       a1,       a0,     1
    563    xvpermi.q       xr22,     xr21,     0x01
    564    vilvl.h         vr23,     vr22,     vr21
    565    vilvh.h         vr21,     vr22,     vr21
    566    vst             vr23,     a0,       0
    567    vstx            vr21,     a0,       a1
    568 .else
    569    xvssrarni.hu.w   xr21,    xr20,     \shift
    570    xvssrlni.bu.h    xr22,    xr21,     0
    571    xvpermi.q        xr23,    xr22,     0x01
    572    vilvl.b          vr21,    vr23,     vr22
    573    add.d            a0,      a0,       a1
    574    fst.d            f21,     a0,       0
    575    add.d            a0,      a0,       a1
    576    vstelm.d         vr21,    a0,       0,     1
    577 .endif
    578 
    579    xvaddi.bu        xr25,     xr25,    2
    580    xvshuf.b         xr12,     xr16,    xr12,   xr24
    581    xvshuf.b         xr13,     xr17,    xr13,   xr24
    582    xvshuf.b         xr14,     xr18,    xr14,   xr24
    583    xvshuf.b         xr15,     xr19,    xr15,   xr24
    584    xvextrins.h      xr24,     xr25,    0x70
    585 
    586    add.w           a6,       a6,      t8
    587    addi.w          t2,       a6,      0
    588    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    589    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    590    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    591    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30
    592 
    593    xvaddi.bu        xr25,     xr25,    2
    594    xvshuf.b         xr12,     xr16,    xr12,   xr24
    595    xvshuf.b         xr13,     xr17,    xr13,   xr24
    596    xvshuf.b         xr14,     xr18,    xr14,   xr24
    597    xvshuf.b         xr15,     xr19,    xr15,   xr24
    598    xvextrins.h      xr24,     xr25,    0x70
    599 
    600    add.w           a6,       a6,      t8
    601    addi.w          t2,       a6,      0
    602    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    603    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    604    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    605    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30
    606 
    607 .ifnb \t
    608    xvssrarni.h.w   xr21,     xr20,     \shift
    609    alsl.d          a0,       a1,       a0,     1
    610    xvpermi.q       xr22,     xr21,     0x01
    611    vilvl.h         vr23,     vr22,     vr21
    612    vilvh.h         vr21,     vr22,     vr21
    613    vst             vr23,     a0,       0
    614    vstx            vr21,     a0,       a1
    615 .else
    616    xvssrarni.hu.w   xr21,    xr20,     \shift
    617    xvssrlni.bu.h    xr22,    xr21,     0
    618    xvpermi.q        xr23,    xr22,     0x01
    619    vilvl.b          vr21,    vr23,     vr22
    620    add.d            a0,      a0,       a1
    621    fst.d            f21,     a0,       0
    622    add.d            a0,      a0,       a1
    623    vstelm.d         vr21,    a0,       0,     1
    624 .endif
    625 
    626    xvaddi.bu        xr25,     xr25,    2
    627    xvshuf.b         xr12,     xr16,    xr12,   xr24
    628    xvshuf.b         xr13,     xr17,    xr13,   xr24
    629    xvshuf.b         xr14,     xr18,    xr14,   xr24
    630    xvshuf.b         xr15,     xr19,    xr15,   xr24
    631    xvextrins.h      xr24,     xr25,    0x70
    632 
    633    add.w           a6,       a6,      t8
    634    addi.w          t2,       a6,      0
    635    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr20, 0x00
    636    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr20, 0x10
    637    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr20, 0x20
    638    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr20, 0x30
    639 
        // final slide: no control update needed after the last row
    640    xvshuf.b         xr12,     xr16,    xr12,   xr24
    641    xvshuf.b         xr13,     xr17,    xr13,   xr24
    642    xvshuf.b         xr14,     xr18,    xr14,   xr24
    643    xvshuf.b         xr15,     xr19,    xr15,   xr24
    644 
    645    add.w           a6,       a6,      t8
    646    addi.w          t2,       a6,      0
    647    FILTER_WARP_CLIP_LASX  t2, zero, xr12,  xr21, 0x00
    648    FILTER_WARP_CLIP_LASX  t2, t7,   xr13,  xr21, 0x10
    649    FILTER_WARP_CLIP_LASX  t2, t7,   xr14,  xr21, 0x20
    650    FILTER_WARP_CLIP_LASX  t2, t7,   xr15,  xr21, 0x30
    651 
    652 .ifnb \t
    653    xvssrarni.h.w   xr21,     xr20,     \shift
    654    alsl.d          a0,       a1,       a0,     1
    655    xvpermi.q       xr22,     xr21,     0x01
    656    vilvl.h         vr23,     vr22,     vr21
    657    vilvh.h         vr21,     vr22,     vr21
    658    vst             vr23,     a0,       0
    659    vstx            vr21,     a0,       a1
    660 .else
    661    xvssrarni.hu.w   xr21,    xr20,     \shift
    662    xvssrlni.bu.h    xr22,    xr21,     0
    663    xvpermi.q        xr23,    xr22,     0x01
    664    vilvl.b          vr21,    vr23,     vr22
    665    add.d            a0,      a0,       a1
    666    fst.d            f21,     a0,       0
    667    add.d            a0,      a0,       a1
    668    vstelm.d         vr21,    a0,       0,     1
    669 .endif
    670    fld.d            f24,     sp,       0
    671    fld.d            f25,     sp,       8
    672    addi.d           sp,      sp,       16
    673 endfunc
    674 .endm
    675 
    676 warp_lasx , 11
        // ^ put form:  8-bit stores, final shift 11
        // v prep form: 16-bit stores, final shift 7
    677 warp_lasx t, 7
    678 
    679 /*
    680 static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
    681                    const int16_t *tmp1, const int16_t *tmp2,
    682                    const int w, int h,
    683                    const int weight HIGHBD_DECL_SUFFIX)
    684 */
    685 
    686 #define bpc8_sh     5     // sh = intermediate_bits + 1
    687 #define bpcw8_sh    8     // sh = intermediate_bits + 4
    688 
        // this file only builds *_8bpc_* functions, so select the 8bpc shifts
    689 #define bpc_sh   bpc8_sh
    690 #define bpcw_sh  bpcw8_sh
    691 
/*
 * void avg_8bpc_lsx(pixel *dst, ptrdiff_t dst_stride,
 *                   const int16_t *tmp1, const int16_t *tmp2,
 *                   int w, int h)
 *
 * Plain bi-prediction average, 8 bpc, 128-bit LSX vectors:
 *   dst[x] = sat_u8((tmp1[x] + tmp2[x] + rnd) >> bpc_sh)
 * The rounding shift (bpc_sh == 5) and the u8 saturation are both folded
 * into vssrarni.bu.h.
 *
 * Registers: a0 = dst, a1 = dst_stride, a2 = tmp1, a3 = tmp2, a4 = w, a5 = h.
 * t8 keeps the start-of-row dst pointer for the W64/W128 paths, where a0
 * walks across the row in 16-byte steps.
 */
function avg_8bpc_lsx
   addi.d        t8,     a0,     0      // t8 = dst row base (used by W64/W128)

   // Width dispatch: w is a power of two in [4,128], so clz(w) - 24 gives
   // index 0 (w=128) .. 5 (w=4) into the halfword offset table below.
   clz.w         t0,     a4
   li.w          t1,     24
   sub.w         t0,     t0,      t1
   la.local      t1,     .AVG_LSX_JRTABLE
   alsl.d        t0,     t0,      t1,    1
   ld.h          t2,     t0,      0  // The jump addresses are relative to AVG_LSX_JRTABLE
   add.d         t1,     t1,      t2 // Get absolute address
   jirl          $r0,    t1,      0

   .align   3
.AVG_LSX_JRTABLE:
   .hword .AVG_W128_LSX - .AVG_LSX_JRTABLE
   .hword .AVG_W64_LSX  - .AVG_LSX_JRTABLE
   .hword .AVG_W32_LSX  - .AVG_LSX_JRTABLE
   .hword .AVG_W16_LSX  - .AVG_LSX_JRTABLE
   .hword .AVG_W8_LSX   - .AVG_LSX_JRTABLE
   .hword .AVG_W4_LSX   - .AVG_LSX_JRTABLE

.AVG_W4_LSX:                            // 2 rows of 4 pixels per iteration
   vld           vr0,    a2,     0
   vld           vr1,    a3,     0
   vadd.h        vr2,    vr0,    vr1    // tmp1 + tmp2, 8 x i16
   vssrarni.bu.h vr3,    vr2,    bpc_sh // round-shift-narrow, saturate to u8
   vstelm.w      vr3,    a0,     0,    0 // row 0: low 4 bytes
   add.d         a0,     a0,     a1
   vstelm.w      vr3,    a0,     0,    1 // row 1: next 4 bytes
   addi.w        a5,     a5,     -2
   addi.d        a2,     a2,     16
   addi.d        a3,     a3,     16
   add.d         a0,     a0,     a1
   blt           zero,   a5,     .AVG_W4_LSX
   b             .AVG_END_LSX

.AVG_W8_LSX:                            // 2 rows of 8 pixels per iteration
   vld           vr0,    a2,     0
   vld           vr2,    a2,     16
   vld           vr1,    a3,     0
   vld           vr3,    a3,     16
   vadd.h        vr4,    vr0,    vr1
   vadd.h        vr5,    vr2,    vr3
   vssrarni.bu.h vr5,    vr4,    bpc_sh // vr5 = {row0 bytes | row1 bytes}
   addi.w        a5,     a5,     -2
   addi.d        a2,     a2,     32
   vstelm.d      vr5,    a0,     0,    0
   add.d         a0,     a0,     a1
   vstelm.d      vr5,    a0,     0,    1
   addi.d        a3,     a3,     32
   add.d         a0,     a0,     a1
   blt           zero,   a5,     .AVG_W8_LSX
   b             .AVG_END_LSX

.AVG_W16_LSX:                           // 1 row of 16 pixels per iteration
   vld           vr0,    a2,     0
   vld           vr2,    a2,     16
   vld           vr1,    a3,     0
   vld           vr3,    a3,     16
   vadd.h        vr4,    vr0,    vr1
   vadd.h        vr5,    vr2,    vr3
   vssrarni.bu.h vr5,    vr4,    bpc_sh
   addi.w        a5,     a5,     -1
   addi.d        a2,     a2,     32
   vst           vr5,    a0,     0
   addi.d        a3,     a3,     32
   add.d         a0,     a0,     a1
   blt           zero,   a5,     .AVG_W16_LSX
   b             .AVG_END_LSX

.AVG_W32_LSX:                           // 1 row of 32 pixels per iteration
   vld           vr0,    a2,     0
   vld           vr2,    a2,     16
   vld           vr4,    a2,     32
   vld           vr6,    a2,     48
   vld           vr1,    a3,     0
   vld           vr3,    a3,     16
   vld           vr5,    a3,     32
   vld           vr7,    a3,     48
   vadd.h        vr0,    vr0,    vr1
   vadd.h        vr2,    vr2,    vr3
   vadd.h        vr4,    vr4,    vr5
   vadd.h        vr6,    vr6,    vr7
   vssrarni.bu.h vr2,    vr0,    bpc_sh
   vssrarni.bu.h vr6,    vr4,    bpc_sh
   addi.w        a5,     a5,     -1
   addi.d        a2,     a2,     64
   vst           vr2,    a0,     0
   vst           vr6,    a0,     16
   addi.d        a3,     a3,     64
   add.d         a0,     a0,     a1
   blt           zero,   a5,     .AVG_W32_LSX
   b             .AVG_END_LSX

.AVG_W64_LSX:                           // 1 row = 4 x 16-pixel chunks
.rept 4
   vld           vr0,    a2,     0
   vld           vr2,    a2,     16
   vld           vr1,    a3,     0
   vld           vr3,    a3,     16
   vadd.h        vr0,    vr0,    vr1
   vadd.h        vr2,    vr2,    vr3
   vssrarni.bu.h vr2,    vr0,    bpc_sh
   addi.d        a2,     a2,     32
   addi.d        a3,     a3,     32
   vst           vr2,    a0,     0
   addi.d        a0,     a0,     16
.endr
   addi.w        a5,     a5,     -1
   add.d         t8,     t8,     a1     // advance saved row base by one stride
   add.d         a0,     t8,     zero   // a0 = start of next dst row
   blt           zero,   a5,     .AVG_W64_LSX
   b             .AVG_END_LSX

.AVG_W128_LSX:                          // 1 row = 8 x 16-pixel chunks
.rept 8
   vld           vr0,    a2,     0
   vld           vr2,    a2,     16
   vld           vr1,    a3,     0
   vld           vr3,    a3,     16
   vadd.h        vr0,    vr0,    vr1
   vadd.h        vr2,    vr2,    vr3
   vssrarni.bu.h vr2,    vr0,    bpc_sh
   addi.d        a2,     a2,     32
   addi.d        a3,     a3,     32
   vst           vr2,    a0,     0
   addi.d        a0,     a0,     16
.endr
   addi.w        a5,     a5,     -1
   add.d         t8,     t8,     a1
   add.d         a0,     t8,     zero
   blt           zero,   a5,     .AVG_W128_LSX
.AVG_END_LSX:                           // endfunc emits the return
endfunc
    826 
/*
 * void avg_8bpc_lasx(pixel *dst, ptrdiff_t dst_stride,
 *                    const int16_t *tmp1, const int16_t *tmp2,
 *                    int w, int h)
 *
 * Same operation as avg_8bpc_lsx but using 256-bit LASX vectors:
 *   dst[x] = sat_u8((tmp1[x] + tmp2[x] + rnd) >> bpc_sh)
 * xvssrarni.bu.h narrows per 128-bit half, so the wide paths need an
 * xvpermi.d lane shuffle (0xd8) to restore linear byte order before storing.
 *
 * Registers: a0 = dst, a1 = dst_stride, a2 = tmp1, a3 = tmp2, a4 = w, a5 = h.
 */
function avg_8bpc_lasx
   // Width dispatch, identical scheme to the LSX version:
   // clz(w) - 24 -> index 0 (w=128) .. 5 (w=4).
   clz.w         t0,     a4
   li.w          t1,     24
   sub.w         t0,     t0,      t1
   la.local      t1,     .AVG_LASX_JRTABLE
   alsl.d        t0,     t0,      t1,    1
   ld.h          t2,     t0,      0     // table holds offsets relative to the table base
   add.d         t1,     t1,      t2
   jirl          $r0,    t1,      0

   .align   3
.AVG_LASX_JRTABLE:
   .hword .AVG_W128_LASX - .AVG_LASX_JRTABLE
   .hword .AVG_W64_LASX  - .AVG_LASX_JRTABLE
   .hword .AVG_W32_LASX  - .AVG_LASX_JRTABLE
   .hword .AVG_W16_LASX  - .AVG_LASX_JRTABLE
   .hword .AVG_W8_LASX   - .AVG_LASX_JRTABLE
   .hword .AVG_W4_LASX   - .AVG_LASX_JRTABLE

.AVG_W4_LASX:                           // w=4 fits in 128 bits: plain LSX ops
   vld            vr0,    a2,     0
   vld            vr1,    a3,     0
   vadd.h         vr0,    vr0,    vr1   // tmp1 + tmp2
   vssrarni.bu.h  vr1,    vr0,    bpc_sh
   vstelm.w       vr1,    a0,     0,    0
   add.d          a0,     a0,     a1
   vstelm.w       vr1,    a0,     0,    1
   addi.w         a5,     a5,     -2
   addi.d         a2,     a2,     16
   addi.d         a3,     a3,     16
   add.d          a0,     a0,     a1
   blt            zero,   a5,     .AVG_W4_LASX
   b              .AVG_END_LASX
.AVG_W8_LASX:                           // 2 rows of 8 pixels per iteration
   xvld           xr0,    a2,     0
   xvld           xr1,    a3,     0
   xvadd.h        xr2,    xr0,    xr1
   xvssrarni.bu.h xr1,    xr2,    bpc_sh
   xvstelm.d      xr1,    a0,     0,    0 // row 0: bytes from low 128-bit half
   add.d          a0,     a0,     a1
   xvstelm.d      xr1,    a0,     0,    2 // row 1: bytes from high 128-bit half
   addi.w         a5,     a5,     -2
   addi.d         a2,     a2,     32
   addi.d         a3,     a3,     32
   add.d          a0,     a1,     a0
   blt            zero,   a5,     .AVG_W8_LASX
   b              .AVG_END_LASX
.AVG_W16_LASX:                          // 2 rows of 16 pixels per iteration
   xvld           xr0,    a2,     0
   xvld           xr2,    a2,     32
   xvld           xr1,    a3,     0
   xvld           xr3,    a3,     32
   xvadd.h        xr4,    xr0,    xr1
   xvadd.h        xr5,    xr2,    xr3
   xvssrarni.bu.h xr5,    xr4,    bpc_sh
   xvpermi.d      xr2,    xr5,    0xd8  // gather row 0 bytes into low 128 bits
   xvpermi.d      xr3,    xr5,    0x8d  // gather row 1 bytes into low 128 bits
   vst            vr2,    a0,     0
   vstx           vr3,    a0,     a1
   addi.w         a5,     a5,     -2
   addi.d         a2,     a2,     64
   addi.d         a3,     a3,     64
   alsl.d         a0,     a1,     a0,   1 // dst += 2 * stride
   blt            zero,   a5,     .AVG_W16_LASX
   b              .AVG_END_LASX
.AVG_W32_LASX:                          // 1 row of 32 pixels per iteration
   xvld           xr0,    a2,     0
   xvld           xr2,    a2,     32
   xvld           xr1,    a3,     0
   xvld           xr3,    a3,     32
   xvadd.h        xr4,    xr0,    xr1
   xvadd.h        xr5,    xr2,    xr3
   xvssrarni.bu.h xr5,    xr4,    bpc_sh
   xvpermi.d      xr6,    xr5,    0xd8  // fix per-half narrowing order
   xvst           xr6,    a0,     0
   addi.w         a5,     a5,     -1
   addi.d         a2,     a2,     64
   addi.d         a3,     a3,     64
   add.d          a0,     a0,     a1
   blt            zero,   a5,     .AVG_W32_LASX
   b              .AVG_END_LASX
.AVG_W64_LASX:                          // 1 row of 64 pixels per iteration
   xvld           xr0,    a2,     0
   xvld           xr2,    a2,     32
   xvld           xr4,    a2,     64
   xvld           xr6,    a2,     96
   xvld           xr1,    a3,     0
   xvld           xr3,    a3,     32
   xvld           xr5,    a3,     64
   xvld           xr7,    a3,     96
   xvadd.h        xr0,    xr0,    xr1
   xvadd.h        xr2,    xr2,    xr3
   xvadd.h        xr4,    xr4,    xr5
   xvadd.h        xr6,    xr6,    xr7
   xvssrarni.bu.h xr2,    xr0,    bpc_sh
   xvssrarni.bu.h xr6,    xr4,    bpc_sh
   xvpermi.d      xr1,    xr2,    0xd8
   xvpermi.d      xr3,    xr6,    0xd8
   xvst           xr1,    a0,     0
   xvst           xr3,    a0,     32
   addi.w         a5,     a5,     -1
   addi.d         a2,     a2,     128
   addi.d         a3,     a3,     128
   add.d          a0,     a0,     a1
   blt            zero,   a5,     .AVG_W64_LASX
   b              .AVG_END_LASX
.AVG_W128_LASX:                         // 1 row of 128 pixels per iteration
   xvld           xr0,    a2,     0
   xvld           xr2,    a2,     32
   xvld           xr4,    a2,     64
   xvld           xr6,    a2,     96
   xvld           xr8,    a2,     128
   xvld           xr10,   a2,     160
   xvld           xr12,   a2,     192
   xvld           xr14,   a2,     224
   xvld           xr1,    a3,     0
   xvld           xr3,    a3,     32
   xvld           xr5,    a3,     64
   xvld           xr7,    a3,     96
   xvld           xr9,    a3,     128
   xvld           xr11,   a3,     160
   xvld           xr13,   a3,     192
   xvld           xr15,   a3,     224
   xvadd.h        xr0,    xr0,    xr1
   xvadd.h        xr2,    xr2,    xr3
   xvadd.h        xr4,    xr4,    xr5
   xvadd.h        xr6,    xr6,    xr7
   xvadd.h        xr8,    xr8,    xr9
   xvadd.h        xr10,   xr10,   xr11
   xvadd.h        xr12,   xr12,   xr13
   xvadd.h        xr14,   xr14,   xr15
   xvssrarni.bu.h xr2,    xr0,    bpc_sh
   xvssrarni.bu.h xr6,    xr4,    bpc_sh
   xvssrarni.bu.h xr10,   xr8,    bpc_sh
   xvssrarni.bu.h xr14,   xr12,   bpc_sh
   xvpermi.d      xr1,    xr2,    0xd8
   xvpermi.d      xr3,    xr6,    0xd8
   xvpermi.d      xr5,    xr10,   0xd8
   xvpermi.d      xr7,    xr14,   0xd8
   xvst           xr1,    a0,     0
   xvst           xr3,    a0,     32
   xvst           xr5,    a0,     64
   xvst           xr7,    a0,     96
   addi.w         a5,     a5,     -1
   addi.d         a2,     a2,     256
   addi.d         a3,     a3,     256
   add.d          a0,     a0,     a1
   blt            zero,   a5,     .AVG_W128_LASX
.AVG_END_LASX:                          // endfunc emits the return
endfunc
    977 
/*
 * void w_avg_8bpc_lsx(pixel *dst, ptrdiff_t dst_stride,
 *                     const int16_t *tmp1, const int16_t *tmp2,
 *                     int w, int h, int weight)
 *
 * Weighted bi-prediction average, 8 bpc, LSX:
 *   dst[x] = sat_u8((tmp1[x]*weight + tmp2[x]*(16-weight) + rnd) >> bpcw_sh)
 * with bpcw_sh == 8.  Each path multiplies even/odd i16 lanes into separate
 * 32-bit accumulators (vmulwev/vmulwod + vmaddwev/vmaddwod), narrows with
 * rounding and saturation, then re-interleaves even/odd bytes into linear
 * pixel order.
 *
 * Registers: a0 = dst, a1 = dst_stride, a2 = tmp1, a3 = tmp2, a4 = w,
 * a5 = h, a6 = weight.  vr21 = weight splat, vr22 = (16 - weight) splat.
 * t8 keeps the start-of-row dst pointer for the W32/W64/W128 paths.
 */
function w_avg_8bpc_lsx
   addi.d        t8,     a0,     0
   li.w          t2,     16
   sub.w         t2,     t2,     a6  // 16 - weight
   vreplgr2vr.h  vr21,   a6           // vr21 = weight in every i16 lane
   vreplgr2vr.h  vr22,   t2           // vr22 = 16 - weight in every i16 lane

   // Width dispatch: clz(w) - 24 -> index 0 (w=128) .. 5 (w=4).
   clz.w         t0,     a4
   li.w          t1,     24
   sub.w         t0,     t0,      t1
   la.local      t1,     .W_AVG_LSX_JRTABLE
   alsl.d        t0,     t0,      t1,    1
   ld.h          t2,     t0,      0     // table holds offsets relative to the table base
   add.d         t1,     t1,      t2
   jirl          $r0,    t1,      0

   .align   3
.W_AVG_LSX_JRTABLE:
   .hword .W_AVG_W128_LSX - .W_AVG_LSX_JRTABLE
   .hword .W_AVG_W64_LSX  - .W_AVG_LSX_JRTABLE
   .hword .W_AVG_W32_LSX  - .W_AVG_LSX_JRTABLE
   .hword .W_AVG_W16_LSX  - .W_AVG_LSX_JRTABLE
   .hword .W_AVG_W8_LSX   - .W_AVG_LSX_JRTABLE
   .hword .W_AVG_W4_LSX   - .W_AVG_LSX_JRTABLE

.W_AVG_W4_LSX:                          // 2 rows of 4 pixels per iteration
   vld           vr0,    a2,     0
   vld           vr1,    a3,     0
   vmulwev.w.h   vr2,    vr0,    vr21  // even lanes: tmp1 * weight
   vmulwod.w.h   vr3,    vr0,    vr21  // odd lanes:  tmp1 * weight
   vmaddwev.w.h  vr2,    vr1,    vr22  // += tmp2 * (16 - weight), even
   vmaddwod.w.h  vr3,    vr1,    vr22  // += tmp2 * (16 - weight), odd
   vssrarni.hu.w vr3,    vr2,    bpcw_sh // round/shift/narrow: {even u16 | odd u16}
   vssrlni.bu.h  vr1,    vr3,    0     // narrow to u8
   vpickod.w     vr4,    vr2,    vr1   // move odd-pixel bytes down
   vilvl.b       vr0,    vr4,    vr1   // interleave even/odd back to pixel order
   fst.s         f0,     a0,     0
   add.d         a0,     a0,     a1
   vstelm.w      vr0,    a0,     0,   1
   addi.w        a5,     a5,     -2
   addi.d        a2,     a2,     16
   addi.d        a3,     a3,     16
   add.d         a0,     a1,     a0
   blt           zero,   a5,     .W_AVG_W4_LSX
   b             .W_AVG_END_LSX
.W_AVG_W8_LSX:                          // 1 row of 8 pixels per iteration
   vld           vr0,    a2,     0
   vld           vr1,    a3,     0
   vmulwev.w.h   vr2,    vr0,    vr21
   vmulwod.w.h   vr3,    vr0,    vr21
   vmaddwev.w.h  vr2,    vr1,    vr22
   vmaddwod.w.h  vr3,    vr1,    vr22
   vssrarni.hu.w vr3,    vr2,    bpcw_sh
   vssrlni.bu.h  vr1,    vr3,    0
   vpickod.w     vr4,    vr2,    vr1
   vilvl.b       vr0,    vr4,    vr1
   fst.d         f0,     a0,     0
   addi.w        a5,     a5,     -1
   addi.d        a2,     a2,     16
   addi.d        a3,     a3,     16
   add.d         a0,     a0,     a1
   blt           zero,   a5,     .W_AVG_W8_LSX
   b             .W_AVG_END_LSX
.W_AVG_W16_LSX:                         // 1 row of 16 pixels per iteration
   vld           vr0,    a2,     0
   vld           vr2,    a2,     16
   vld           vr1,    a3,     0
   vld           vr3,    a3,     16
   vmulwev.w.h   vr4,    vr0,    vr21
   vmulwod.w.h   vr5,    vr0,    vr21
   vmulwev.w.h   vr6,    vr2,    vr21
   vmulwod.w.h   vr7,    vr2,    vr21
   vmaddwev.w.h  vr4,    vr1,    vr22
   vmaddwod.w.h  vr5,    vr1,    vr22
   vmaddwev.w.h  vr6,    vr3,    vr22
   vmaddwod.w.h  vr7,    vr3,    vr22
   vssrarni.hu.w vr6,    vr4,    bpcw_sh // all even pixels
   vssrarni.hu.w vr7,    vr5,    bpcw_sh // all odd pixels
   vssrlrni.bu.h vr7,    vr6,    0     // vr7 = {even bytes | odd bytes}
   vshuf4i.w     vr8,    vr7,    0x4E  // swap halves so odd bytes align below
   vilvl.b       vr0,    vr8,    vr7   // interleave back to pixel order
   vst           vr0,    a0,     0
   addi.w        a5,     a5,     -1
   addi.d        a2,     a2,     32
   addi.d        a3,     a3,     32
   add.d         a0,     a0,     a1
   blt           zero,   a5,     .W_AVG_W16_LSX
   b             .W_AVG_END_LSX
.W_AVG_W32_LSX:                         // 1 row = 2 x 16-pixel chunks
.rept 2
   vld           vr0,    a2,     0
   vld           vr2,    a2,     16
   vld           vr1,    a3,     0
   vld           vr3,    a3,     16
   vmulwev.w.h   vr4,    vr0,    vr21
   vmulwod.w.h   vr5,    vr0,    vr21
   vmulwev.w.h   vr6,    vr2,    vr21
   vmulwod.w.h   vr7,    vr2,    vr21
   vmaddwev.w.h  vr4,    vr1,    vr22
   vmaddwod.w.h  vr5,    vr1,    vr22
   vmaddwev.w.h  vr6,    vr3,    vr22
   vmaddwod.w.h  vr7,    vr3,    vr22
   vssrarni.hu.w vr6,    vr4,    bpcw_sh
   vssrarni.hu.w vr7,    vr5,    bpcw_sh
   vssrlrni.bu.h vr7,    vr6,    0
   vshuf4i.w     vr8,    vr7,    0x4E
   vilvl.b       vr0,    vr8,    vr7
   vst           vr0,    a0,     0
   addi.d        a2,     a2,     32
   addi.d        a3,     a3,     32
   addi.d        a0,     a0,     16
.endr
   addi.w        a5,     a5,     -1
   add.d         t8,     t8,     a1    // next dst row base
   add.d         a0,     t8,     zero
   blt           zero,   a5,     .W_AVG_W32_LSX
   b             .W_AVG_END_LSX

.W_AVG_W64_LSX:                         // 1 row = 4 x 16-pixel chunks
.rept 4
   vld           vr0,    a2,     0
   vld           vr2,    a2,     16
   vld           vr1,    a3,     0
   vld           vr3,    a3,     16
   vmulwev.w.h   vr4,    vr0,    vr21
   vmulwod.w.h   vr5,    vr0,    vr21
   vmulwev.w.h   vr6,    vr2,    vr21
   vmulwod.w.h   vr7,    vr2,    vr21
   vmaddwev.w.h  vr4,    vr1,    vr22
   vmaddwod.w.h  vr5,    vr1,    vr22
   vmaddwev.w.h  vr6,    vr3,    vr22
   vmaddwod.w.h  vr7,    vr3,    vr22
   vssrarni.hu.w vr6,    vr4,    bpcw_sh
   vssrarni.hu.w vr7,    vr5,    bpcw_sh
   vssrlrni.bu.h vr7,    vr6,    0
   vshuf4i.w     vr8,    vr7,    0x4E
   vilvl.b       vr0,    vr8,    vr7
   vst           vr0,    a0,     0
   addi.d        a2,     a2,     32
   addi.d        a3,     a3,     32
   addi.d        a0,     a0,     16
.endr
   addi.w        a5,     a5,     -1
   add.d         t8,     t8,     a1
   add.d         a0,     t8,     zero
   blt           zero,   a5,     .W_AVG_W64_LSX
   b             .W_AVG_END_LSX

.W_AVG_W128_LSX:                        // 1 row = 8 x 16-pixel chunks
.rept 8
   vld           vr0,    a2,     0
   vld           vr2,    a2,     16
   vld           vr1,    a3,     0
   vld           vr3,    a3,     16
   vmulwev.w.h   vr4,    vr0,    vr21
   vmulwod.w.h   vr5,    vr0,    vr21
   vmulwev.w.h   vr6,    vr2,    vr21
   vmulwod.w.h   vr7,    vr2,    vr21
   vmaddwev.w.h  vr4,    vr1,    vr22
   vmaddwod.w.h  vr5,    vr1,    vr22
   vmaddwev.w.h  vr6,    vr3,    vr22
   vmaddwod.w.h  vr7,    vr3,    vr22
   vssrarni.hu.w vr6,    vr4,    bpcw_sh
   vssrarni.hu.w vr7,    vr5,    bpcw_sh
   vssrlrni.bu.h vr7,    vr6,    0
   vshuf4i.w     vr8,    vr7,    0x4E
   vilvl.b       vr0,    vr8,    vr7
   vst           vr0,    a0,     0
   addi.d        a2,     a2,     32
   addi.d        a3,     a3,     32
   addi.d        a0,     a0,     16
.endr
   addi.w        a5,     a5,     -1
   add.d         t8,     t8,     a1
   add.d         a0,     t8,     zero
   blt           zero,   a5,     .W_AVG_W128_LSX
.W_AVG_END_LSX:                         // endfunc emits the return
endfunc
   1156 
   1157 function w_avg_8bpc_lasx
   1158    addi.d        t8,     a0,     0
   1159    li.w          t2,     16
   1160    sub.w         t2,     t2,     a6  // 16 - weight
   1161    xvreplgr2vr.h xr21,   a6
   1162    xvreplgr2vr.h xr22,   t2
   1163 
   1164    clz.w         t0,     a4
   1165    li.w          t1,     24
   1166    sub.w         t0,     t0,      t1
   1167    la.local      t1,     .W_AVG_LASX_JRTABLE
   1168    alsl.d        t0,     t0,      t1,    1
   1169    ld.h          t2,     t0,      0
   1170    add.d         t1,     t1,      t2
   1171    jirl          $r0,    t1,      0
   1172 
   1173    .align   3
   1174 .W_AVG_LASX_JRTABLE:
   1175    .hword .W_AVG_W128_LASX - .W_AVG_LASX_JRTABLE
   1176    .hword .W_AVG_W64_LASX  - .W_AVG_LASX_JRTABLE
   1177    .hword .W_AVG_W32_LASX  - .W_AVG_LASX_JRTABLE
   1178    .hword .W_AVG_W16_LASX  - .W_AVG_LASX_JRTABLE
   1179    .hword .W_AVG_W8_LASX   - .W_AVG_LASX_JRTABLE
   1180    .hword .W_AVG_W4_LASX   - .W_AVG_LASX_JRTABLE
   1181 
   1182 .W_AVG_W4_LASX:
   1183    vld            vr0,    a2,     0
   1184    vld            vr1,    a3,     0
   1185    xvpermi.d      xr2,    xr0,    0xD8
   1186    xvpermi.d      xr3,    xr1,    0xD8
   1187    xvilvl.h       xr4,    xr3,    xr2
   1188    xvmulwev.w.h   xr0,    xr4,    xr21
   1189    xvmaddwod.w.h  xr0,    xr4,    xr22
   1190    xvssrarni.hu.w xr1,    xr0,    bpcw_sh
   1191    xvssrlni.bu.h  xr0,    xr1,    0
   1192    fst.s          f0,     a0,     0
   1193    add.d          a0,     a0,     a1
   1194    xvstelm.w      xr0,    a0,     0,     4
   1195    addi.w         a5,     a5,     -2
   1196    addi.d         a2,     a2,     16
   1197    addi.d         a3,     a3,     16
   1198    add.d          a0,     a1,     a0
   1199    blt            zero,   a5,     .W_AVG_W4_LASX
   1200    b              .W_AVG_END_LASX
   1201 
   1202 .W_AVG_W8_LASX:
   1203    xvld           xr0,    a2,     0
   1204    xvld           xr1,    a3,     0
   1205    xvmulwev.w.h   xr2,    xr0,    xr21
   1206    xvmulwod.w.h   xr3,    xr0,    xr21
   1207    xvmaddwev.w.h  xr2,    xr1,    xr22
   1208    xvmaddwod.w.h  xr3,    xr1,    xr22
   1209    xvssrarni.hu.w xr3,    xr2,    bpcw_sh
   1210    xvssrlni.bu.h  xr1,    xr3,    0
   1211    xvpickod.w     xr4,    xr2,    xr1
   1212    xvilvl.b       xr0,    xr4,    xr1
   1213    xvstelm.d      xr0,    a0,     0,     0
   1214    add.d          a0,     a0,     a1
   1215    xvstelm.d      xr0,    a0,     0,     2
   1216    addi.w         a5,     a5,     -2
   1217    addi.d         a2,     a2,     32
   1218    addi.d         a3,     a3,     32
   1219    add.d          a0,     a0,     a1
   1220    blt            zero,   a5,     .W_AVG_W8_LASX
   1221    b              .W_AVG_END_LASX
   1222 
   1223 .W_AVG_W16_LASX:
   1224    xvld           xr0,    a2,     0
   1225    xvld           xr1,    a3,     0
   1226    xvmulwev.w.h   xr2,    xr0,    xr21
   1227    xvmulwod.w.h   xr3,    xr0,    xr21
   1228    xvmaddwev.w.h  xr2,    xr1,    xr22
   1229    xvmaddwod.w.h  xr3,    xr1,    xr22
   1230    xvssrarni.hu.w xr3,    xr2,    bpcw_sh
   1231    xvssrlni.bu.h  xr1,    xr3,    0
   1232    xvpickod.w     xr4,    xr2,    xr1
   1233    xvilvl.b       xr0,    xr4,    xr1
   1234    xvpermi.d      xr1,    xr0,    0xD8
   1235    vst            vr1,    a0,     0
   1236    addi.w         a5,     a5,     -1
   1237    addi.d         a2,     a2,     32
   1238    addi.d         a3,     a3,     32
   1239    add.d          a0,     a0,     a1
   1240    blt            zero,   a5,     .W_AVG_W16_LASX
   1241    b              .W_AVG_END_LSX
   1242 
   1243 .W_AVG_W32_LASX:
   1244    xvld           xr0,    a2,     0
   1245    xvld           xr2,    a2,     32
   1246    xvld           xr1,    a3,     0
   1247    xvld           xr3,    a3,     32
   1248    xvmulwev.w.h   xr4,    xr0,    xr21
   1249    xvmulwod.w.h   xr5,    xr0,    xr21
   1250    xvmulwev.w.h   xr6,    xr2,    xr21
   1251    xvmulwod.w.h   xr7,    xr2,    xr21
   1252    xvmaddwev.w.h  xr4,    xr1,    xr22
   1253    xvmaddwod.w.h  xr5,    xr1,    xr22
   1254    xvmaddwev.w.h  xr6,    xr3,    xr22
   1255    xvmaddwod.w.h  xr7,    xr3,    xr22
   1256    xvssrarni.hu.w xr6,    xr4,    bpcw_sh
   1257    xvssrarni.hu.w xr7,    xr5,    bpcw_sh
   1258    xvssrlni.bu.h  xr7,    xr6,    0
   1259    xvshuf4i.w     xr8,    xr7,    0x4E
   1260    xvilvl.b       xr9,    xr8,    xr7
   1261    xvpermi.d      xr0,    xr9,    0xD8
   1262    xvst           xr0,    a0,     0
   1263    addi.w         a5,     a5,     -1
   1264    addi.d         a2,     a2,     64
   1265    addi.d         a3,     a3,     64
   1266    add.d          a0,     a0,     a1
   1267    blt            zero,   a5,     .W_AVG_W32_LASX
   1268    b              .W_AVG_END_LASX
   1269 
   1270 .W_AVG_W64_LASX:
   1271 .rept 2
   1272    xvld           xr0,    a2,     0
   1273    xvld           xr2,    a2,     32
   1274    xvld           xr1,    a3,     0
   1275    xvld           xr3,    a3,     32
   1276    xvmulwev.w.h   xr4,    xr0,    xr21
   1277    xvmulwod.w.h   xr5,    xr0,    xr21
   1278    xvmulwev.w.h   xr6,    xr2,    xr21
   1279    xvmulwod.w.h   xr7,    xr2,    xr21
   1280    xvmaddwev.w.h  xr4,    xr1,    xr22
   1281    xvmaddwod.w.h  xr5,    xr1,    xr22
   1282    xvmaddwev.w.h  xr6,    xr3,    xr22
   1283    xvmaddwod.w.h  xr7,    xr3,    xr22
   1284    xvssrarni.hu.w xr6,    xr4,    bpcw_sh
   1285    xvssrarni.hu.w xr7,    xr5,    bpcw_sh
   1286    xvssrlni.bu.h  xr7,    xr6,    0
   1287    xvshuf4i.w     xr8,    xr7,    0x4E
   1288    xvilvl.b       xr9,    xr8,    xr7
   1289    xvpermi.d      xr0,    xr9,    0xD8
   1290    xvst           xr0,    a0,     0
   1291    addi.d         a2,     a2,     64
   1292    addi.d         a3,     a3,     64
   1293    addi.d         a0,     a0,     32
   1294 .endr
   1295    addi.w         a5,     a5,     -1
   1296    add.d          t8,     t8,     a1
   1297    add.d          a0,     t8,     zero
   1298    blt            zero,   a5,     .W_AVG_W64_LASX
   1299    b              .W_AVG_END_LASX
   1300 
   1301 .W_AVG_W128_LASX:
   1302 .rept 4
   1303    xvld           xr0,    a2,     0
   1304    xvld           xr2,    a2,     32
   1305    xvld           xr1,    a3,     0
   1306    xvld           xr3,    a3,     32
   1307    xvmulwev.w.h   xr4,    xr0,    xr21
   1308    xvmulwod.w.h   xr5,    xr0,    xr21
   1309    xvmulwev.w.h   xr6,    xr2,    xr21
   1310    xvmulwod.w.h   xr7,    xr2,    xr21
   1311    xvmaddwev.w.h  xr4,    xr1,    xr22
   1312    xvmaddwod.w.h  xr5,    xr1,    xr22
   1313    xvmaddwev.w.h  xr6,    xr3,    xr22
   1314    xvmaddwod.w.h  xr7,    xr3,    xr22
   1315    xvssrarni.hu.w xr6,    xr4,    bpcw_sh
   1316    xvssrarni.hu.w xr7,    xr5,    bpcw_sh
   1317    xvssrlni.bu.h  xr7,    xr6,    0
   1318    xvshuf4i.w     xr8,    xr7,    0x4E
   1319    xvilvl.b       xr9,    xr8,    xr7
   1320    xvpermi.d      xr0,    xr9,    0xD8
   1321    xvst           xr0,    a0,     0
   1322    addi.d         a2,     a2,     64
   1323    addi.d         a3,     a3,     64
   1324    addi.d         a0,     a0,     32
   1325 .endr
   1326 
   1327    addi.w         a5,     a5,     -1
   1328    add.d          t8,     t8,     a1
   1329    add.d          a0,     t8,     zero
   1330    blt            zero,   a5,     .W_AVG_W128_LASX
   1331 .W_AVG_END_LASX:
   1332 endfunc
   1333 
   1334 #undef bpc_sh
   1335 #undef bpcw_sh
   1336 
   1337 #define mask_sh         10
   1338 /*
   1339 static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
   1340                   const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
   1341                   const uint8_t *mask HIGHBD_DECL_SUFFIX)
   1342 */
   1343 function mask_8bpc_lsx
   1344    vldi          vr21,   0x440   // 64
   1345    vxor.v        vr19,   vr19,   vr19
   1346    addi.d        t8,     a0,     0
   1347    clz.w         t0,     a4
   1348    li.w          t1,     24
   1349    sub.w         t0,     t0,      t1
   1350    la.local      t1,     .MASK_LSX_JRTABLE
   1351    alsl.d        t0,     t0,      t1,    1
   1352    ld.h          t2,     t0,      0
   1353    add.d         t1,     t1,      t2
   1354    jirl          $r0,    t1,      0
   1355 
   1356    .align   3
   1357 .MASK_LSX_JRTABLE:
   1358    .hword .MASK_W128_LSX - .MASK_LSX_JRTABLE
   1359    .hword .MASK_W64_LSX  - .MASK_LSX_JRTABLE
   1360    .hword .MASK_W32_LSX  - .MASK_LSX_JRTABLE
   1361    .hword .MASK_W16_LSX  - .MASK_LSX_JRTABLE
   1362    .hword .MASK_W8_LSX   - .MASK_LSX_JRTABLE
   1363    .hword .MASK_W4_LSX   - .MASK_LSX_JRTABLE
   1364 
   1365 .MASK_W4_LSX:
   1366    vld           vr0,     a2,     0
   1367    vld           vr1,     a3,     0
   1368    fld.d         f22,     a6,     0
   1369 
   1370    vilvl.b       vr2,    vr19,   vr22
   1371    vsub.h        vr3,    vr21,   vr2
   1372 
   1373    vmulwev.w.h   vr4,    vr0,    vr2
   1374    vmulwod.w.h   vr5,    vr0,    vr2
   1375    vmaddwev.w.h  vr4,    vr1,    vr3
   1376    vmaddwod.w.h  vr5,    vr1,    vr3
   1377    vssrarni.hu.w vr5,    vr4,    mask_sh
   1378    vssrlrni.bu.h vr1,    vr5,    0
   1379    vpickod.w     vr4,    vr2,    vr1
   1380    vilvl.b       vr0,    vr4,    vr1
   1381    fst.s         f0,     a0,     0
   1382    add.d         a0,     a0,     a1
   1383    vstelm.w      vr0,    a0,     0,    1
   1384    addi.d        a2,     a2,     16
   1385    addi.d        a3,     a3,     16
   1386    addi.d        a6,     a6,     8
   1387    add.d         a0,     a0,     a1
   1388    addi.w        a5,     a5,     -2
   1389    blt           zero,   a5,     .MASK_W4_LSX
   1390    b             .MASK_END_LSX
   1391 .MASK_W8_LSX:
   1392    vld           vr0,    a2,     0
   1393    vld           vr10,   a2,     16
   1394    vld           vr1,    a3,     0
   1395    vld           vr11,   a3,     16
   1396    vld           vr22,   a6,     0
   1397 
   1398    vilvl.b       vr2,    vr19,   vr22
   1399    vilvh.b       vr12,   vr19,   vr22
   1400    vsub.h        vr3,    vr21,   vr2
   1401    vsub.h        vr13,   vr21,   vr12
   1402 
   1403    vmulwev.w.h   vr4,    vr0,    vr2
   1404    vmulwod.w.h   vr5,    vr0,    vr2
   1405    vmulwev.w.h   vr14,   vr10,   vr12
   1406    vmulwod.w.h   vr15,   vr10,   vr12
   1407    vmaddwev.w.h  vr4,    vr1,    vr3
   1408    vmaddwod.w.h  vr5,    vr1,    vr3
   1409    vmaddwev.w.h  vr14,   vr11,   vr13
   1410    vmaddwod.w.h  vr15,   vr11,   vr13
   1411    vssrarni.hu.w vr14,   vr4,    mask_sh
   1412    vssrarni.hu.w vr15,   vr5,    mask_sh
   1413    vssrlrni.bu.h vr15,   vr14,   0
   1414    vshuf4i.w     vr6,    vr15,   0x4E
   1415    vilvl.b       vr0,    vr6,    vr15
   1416    fst.d         f0,     a0,     0
   1417    add.d         a0,     a0,     a1
   1418    vstelm.d      vr0,    a0,     0,   1
   1419    addi.d        a2,     a2,     32
   1420    addi.d        a3,     a3,     32
   1421    addi.d        a6,     a6,     16
   1422    add.d         a0,     a0,     a1
   1423    addi.w        a5,     a5,     -2
   1424    blt           zero,   a5,     .MASK_W8_LSX
   1425    b             .MASK_END_LSX
   1426 
   1427 .MASK_W16_LSX:
   1428    vld           vr0,    a2,     0
   1429    vld           vr10,   a2,     16
   1430    vld           vr1,    a3,     0
   1431    vld           vr11,   a3,     16
   1432    vld           vr22,   a6,     0
   1433 
   1434    vilvl.b       vr2,    vr19,   vr22
   1435    vilvh.b       vr12,   vr19,   vr22
   1436    vsub.h        vr3,    vr21,   vr2
   1437    vsub.h        vr13,   vr21,   vr12
   1438 
   1439    vmulwev.w.h   vr4,    vr0,    vr2
   1440    vmulwod.w.h   vr5,    vr0,    vr2
   1441    vmulwev.w.h   vr14,   vr10,   vr12
   1442    vmulwod.w.h   vr15,   vr10,   vr12
   1443    vmaddwev.w.h  vr4,    vr1,    vr3
   1444    vmaddwod.w.h  vr5,    vr1,    vr3
   1445    vmaddwev.w.h  vr14,   vr11,   vr13
   1446    vmaddwod.w.h  vr15,   vr11,   vr13
   1447    vssrarni.hu.w vr14,   vr4,    mask_sh
   1448    vssrarni.hu.w vr15,   vr5,    mask_sh
   1449    vssrlrni.bu.h vr15,   vr14,   0
   1450    vshuf4i.w     vr6,    vr15,   0x4E
   1451    vilvl.b       vr0,    vr6,    vr15
   1452    vst           vr0,    a0,     0
   1453    addi.d        a2,     a2,     32
   1454    addi.d        a3,     a3,     32
   1455    addi.d        a6,     a6,     16
   1456    add.d         a0,     a0,     a1
   1457    addi.w        a5,     a5,     -1
   1458    blt           zero,   a5,     .MASK_W16_LSX
   1459    b             .MASK_END_LSX
   1460 .MASK_W32_LSX:
   1461 .rept 2
   1462    vld           vr0,    a2,     0
   1463    vld           vr10,   a2,     16
   1464    vld           vr1,    a3,     0
   1465    vld           vr11,   a3,     16
   1466    vld           vr22,   a6,     0
   1467    vilvl.b       vr2,    vr19,   vr22
   1468    vilvh.b       vr12,   vr19,   vr22
   1469    vsub.h        vr3,    vr21,   vr2
   1470    vsub.h        vr13,   vr21,   vr12
   1471    vmulwev.w.h   vr4,    vr0,    vr2
   1472    vmulwod.w.h   vr5,    vr0,    vr2
   1473    vmulwev.w.h   vr14,   vr10,   vr12
   1474    vmulwod.w.h   vr15,   vr10,   vr12
   1475    vmaddwev.w.h  vr4,    vr1,    vr3
   1476    vmaddwod.w.h  vr5,    vr1,    vr3
   1477    vmaddwev.w.h  vr14,   vr11,   vr13
   1478    vmaddwod.w.h  vr15,   vr11,   vr13
   1479    vssrarni.hu.w vr14,   vr4,    mask_sh
   1480    vssrarni.hu.w vr15,   vr5,    mask_sh
   1481    vssrlrni.bu.h vr15,   vr14,   0
   1482    vshuf4i.w     vr6,    vr15,   0x4E
   1483    vilvl.b       vr0,    vr6,    vr15
   1484    vst           vr0,    a0,     0
   1485    addi.d        a2,     a2,     32
   1486    addi.d        a3,     a3,     32
   1487    addi.d        a6,     a6,     16
   1488    addi.d        a0,     a0,     16
   1489 .endr
   1490    add.d         t8,     t8,     a1
   1491    add.d         a0,     t8,     zero
   1492    addi.w        a5,     a5,     -1
   1493    blt           zero,   a5,     .MASK_W32_LSX
   1494    b             .MASK_END_LSX
   1495 .MASK_W64_LSX:
   1496 .rept 4
   1497    vld           vr0,    a2,     0
   1498    vld           vr10,   a2,     16
   1499    vld           vr1,    a3,     0
   1500    vld           vr11,   a3,     16
   1501    vld           vr22,   a6,     0
   1502    vilvl.b       vr2,    vr19,   vr22
   1503    vilvh.b       vr12,   vr19,   vr22
   1504    vsub.h        vr3,    vr21,   vr2
   1505    vsub.h        vr13,   vr21,   vr12
   1506    vmulwev.w.h   vr4,    vr0,    vr2
   1507    vmulwod.w.h   vr5,    vr0,    vr2
   1508    vmulwev.w.h   vr14,   vr10,   vr12
   1509    vmulwod.w.h   vr15,   vr10,   vr12
   1510    vmaddwev.w.h  vr4,    vr1,    vr3
   1511    vmaddwod.w.h  vr5,    vr1,    vr3
   1512    vmaddwev.w.h  vr14,   vr11,   vr13
   1513    vmaddwod.w.h  vr15,   vr11,   vr13
   1514    vssrarni.hu.w vr14,   vr4,    mask_sh
   1515    vssrarni.hu.w vr15,   vr5,    mask_sh
   1516    vssrlrni.bu.h vr15,   vr14,   0
   1517    vshuf4i.w     vr6,    vr15,   0x4E
   1518    vilvl.b       vr0,    vr6,    vr15
   1519    vst           vr0,    a0,     0
   1520    addi.d        a2,     a2,     32
   1521    addi.d        a3,     a3,     32
   1522    addi.d        a6,     a6,     16
   1523    addi.d        a0,     a0,     16
   1524 .endr
   1525    add.d         t8,     t8,     a1
   1526    add.d         a0,     t8,     zero
   1527    addi.w        a5,     a5,     -1
   1528    blt           zero,   a5,     .MASK_W64_LSX
   1529    b             .MASK_END_LSX
   1530 .MASK_W128_LSX:
   1531 .rept 8
   1532    vld           vr0,    a2,     0
   1533    vld           vr10,   a2,     16
   1534    vld           vr1,    a3,     0
   1535    vld           vr11,   a3,     16
   1536    vld           vr22,   a6,     0
   1537    vilvl.b       vr2,    vr19,   vr22
   1538    vilvh.b       vr12,   vr19,   vr22
   1539    vsub.h        vr3,    vr21,   vr2
   1540    vsub.h        vr13,   vr21,   vr12
   1541    vmulwev.w.h   vr4,    vr0,    vr2
   1542    vmulwod.w.h   vr5,    vr0,    vr2
   1543    vmulwev.w.h   vr14,   vr10,   vr12
   1544    vmulwod.w.h   vr15,   vr10,   vr12
   1545    vmaddwev.w.h  vr4,    vr1,    vr3
   1546    vmaddwod.w.h  vr5,    vr1,    vr3
   1547    vmaddwev.w.h  vr14,   vr11,   vr13
   1548    vmaddwod.w.h  vr15,   vr11,   vr13
   1549    vssrarni.hu.w vr14,   vr4,    mask_sh
   1550    vssrarni.hu.w vr15,   vr5,    mask_sh
   1551    vssrlrni.bu.h vr15,   vr14,   0
   1552    vshuf4i.w     vr6,    vr15,   0x4E
   1553    vilvl.b       vr0,    vr6,    vr15
   1554    vst           vr0,    a0,     0
   1555    addi.d        a2,     a2,     32
   1556    addi.d        a3,     a3,     32
   1557    addi.d        a6,     a6,     16
   1558    addi.d        a0,     a0,     16
   1559 .endr
   1560    add.d         t8,     t8,     a1
   1561    add.d         a0,     t8,     zero
   1562    addi.w        a5,     a5,     -1
   1563    blt           zero,   a5,     .MASK_W128_LSX
   1564 .MASK_END_LSX:
   1565 endfunc
   1566 
   1567 function mask_8bpc_lasx
        // void mask(uint8_t *dst, ptrdiff_t dst_stride,
        //           const int16_t *tmp1, const int16_t *tmp2,
        //           int w, int h, const uint8_t *mask)
        // a0=dst a1=dst_stride a2=tmp1 a3=tmp2 a4=w a5=h a6=mask
        // Per pixel: dst = sat_u8((tmp1*m + tmp2*(64 - m)) >> mask_sh)
        // with m = *mask++; mask_sh is an assembler constant defined
        // earlier in this file (not visible in this chunk).
        // LASX (256-bit) variant of the mask blend kernel.
   1568    xvldi         xr21,   0x440   // 64
        // xr19 = 0, used to zero-extend mask bytes to halfwords.
   1569    xvxor.v       xr19,   xr19,   xr19
        // t8 mirrors the dst row pointer for the tiled w>=64 paths,
        // where a0 advances by 32 inside the unrolled bodies.
   1570    addi.d        t8,     a0,     0
        // Width dispatch: clz(w) - 24 yields 0..5 for w = 128,64,32,16,8,4,
        // matching the order of the 16-bit offset table below.
   1571    clz.w         t0,     a4
   1572    li.w          t1,     24
   1573    sub.w         t0,     t0,      t1
   1574    la.local      t1,     .MASK_LASX_JRTABLE
        // entry address = table base + 2*index (hword entries).
   1575    alsl.d        t0,     t0,      t1,    1
   1576    ld.h          t2,     t0,      0
   1577    add.d         t1,     t1,      t2
   1578    jirl          $r0,    t1,      0
   1579 
   1580    .align   3
   1581 .MASK_LASX_JRTABLE:
   1582    .hword .MASK_W128_LASX - .MASK_LASX_JRTABLE
   1583    .hword .MASK_W64_LASX  - .MASK_LASX_JRTABLE
   1584    .hword .MASK_W32_LASX  - .MASK_LASX_JRTABLE
   1585    .hword .MASK_W16_LASX  - .MASK_LASX_JRTABLE
   1586    .hword .MASK_W8_LASX   - .MASK_LASX_JRTABLE
   1587    .hword .MASK_W4_LASX   - .MASK_LASX_JRTABLE
   1588 
        // w == 4: two rows (8 pixels) per iteration.
   1589 .MASK_W4_LASX:
   1590    vld            vr0,    a2,     0
   1591    vld            vr1,    a3,     0
   1592    fld.d          f22,    a6,     0
   1593 
        // Pair tmp1/tmp2 and m/(64-m) per lane so one widening multiply
        // plus a widening MAC yields tmp1*m + tmp2*(64-m) in 32 bits.
   1594    vilvl.h        vr4,    vr1,    vr0
   1595    vilvh.h        vr14,   vr1,    vr0
   1596    vilvl.b        vr2,    vr19,   vr22
   1597    vsub.h         vr3,    vr21,   vr2
   1598    xvpermi.q      xr14,   xr4,    0x20
   1599    vilvl.h        vr5,    vr3,    vr2
   1600    vilvh.h        vr15,   vr3,    vr2
   1601    xvpermi.q      xr15,   xr5,    0x20
   1602    xvmulwev.w.h   xr0,    xr14,   xr15
   1603    xvmaddwod.w.h  xr0,    xr14,   xr15
        // Round+shift by mask_sh with u16 saturation, then narrow to u8.
   1604    xvssrarni.hu.w xr1,    xr0,    mask_sh
   1605    xvssrlni.bu.h  xr2,    xr1,    0
   1606    fst.s          f2,     a0,     0
   1607    add.d          a0,     a0,     a1
        // Second row lives in the high 128-bit lane (word element 4).
   1608    xvstelm.w      xr2,    a0,     0,    4
   1609 
   1610    addi.d         a2,     a2,     16
   1611    addi.d         a3,     a3,     16
   1612    addi.d         a6,     a6,     8
   1613    add.d          a0,     a0,     a1
   1614    addi.w         a5,     a5,     -2
   1615    blt            zero,   a5,     .MASK_W4_LASX
   1616    b              .MASK_END_LASX
   1617 
        // w == 8: two rows (16 pixels) per iteration.
   1618 .MASK_W8_LASX:
   1619    xvld           xr0,    a2,      0
   1620    xvld           xr1,    a3,      0
   1621    vld            vr22,   a6,      0
   1622 
        // m zero-extended to u16, 64-m, then even/odd widening MAC.
   1623    vext2xv.hu.bu  xr2,    xr22
   1624    xvsub.h        xr3,    xr21,    xr2
   1625    xvmulwev.w.h   xr4,    xr0,     xr2
   1626    xvmulwod.w.h   xr5,    xr0,     xr2
   1627    xvmaddwev.w.h  xr4,    xr1,     xr3
   1628    xvmaddwod.w.h  xr5,    xr1,     xr3
   1629    xvssrarni.hu.w xr5,    xr4,     mask_sh
   1630    xvssrlni.bu.h  xr1,    xr5,     0
        // Re-interleave even/odd result bytes back into pixel order.
   1631    xvpickod.w     xr4,    xr2,     xr1
   1632    xvilvl.b       xr0,    xr4,     xr1
   1633    fst.d          f0,     a0,      0
   1634    add.d          a0,     a0,      a1
   1635    xvstelm.d      xr0,    a0,      0,    2
   1636 
   1637    addi.d         a2,     a2,      32
   1638    addi.d         a3,     a3,      32
   1639    addi.d         a6,     a6,      16
   1640    add.d          a0,     a0,      a1
   1641    addi.w         a5,     a5,      -2
   1642    blt            zero,   a5,      .MASK_W8_LASX
   1643    b              .MASK_END_LASX
   1644 
        // w == 16: one row per iteration.
   1645 .MASK_W16_LASX:
   1646    xvld           xr0,    a2,      0
   1647    xvld           xr1,    a3,      0
   1648    vld            vr22,   a6,      0
   1649 
   1650    vext2xv.hu.bu  xr2,    xr22
   1651    xvsub.h        xr3,    xr21,    xr2
   1652    xvmulwev.w.h   xr4,    xr0,     xr2
   1653    xvmulwod.w.h   xr5,    xr0,     xr2
   1654    xvmaddwev.w.h  xr4,    xr1,     xr3
   1655    xvmaddwod.w.h  xr5,    xr1,     xr3
   1656    xvssrarni.hu.w xr5,    xr4,     mask_sh
   1657    xvssrlni.bu.h  xr1,    xr5,     0
   1658    xvpickod.w     xr4,    xr2,    xr1
   1659    xvilvl.b       xr0,    xr4,    xr1
        // 0xD8 permute fixes the 128-bit lane interleave before the store.
   1660    xvpermi.d      xr1,    xr0,     0xD8
   1661    vst            vr1,    a0,      0
   1662 
   1663    addi.d         a2,     a2,      32
   1664    addi.d         a3,     a3,      32
   1665    addi.d         a6,     a6,      16
   1666    add.d          a0,     a0,      a1
   1667    addi.w         a5,     a5,      -1
   1668    blt            zero,   a5,      .MASK_W16_LASX
   1669    b              .MASK_END_LASX
        // w == 32: one full row per iteration (two 32-byte tmp loads,
        // 32 mask bytes zero-extended in two halves).
   1670 .MASK_W32_LASX:
   1671    xvld           xr0,    a2,      0
   1672    xvld           xr10,   a2,      32
   1673    xvld           xr1,    a3,      0
   1674    xvld           xr11,   a3,      32
   1675    xvld           xr22,   a6,      0
   1676    vext2xv.hu.bu  xr2,    xr22
   1677    xvpermi.q      xr4,    xr22,    0x01
   1678    vext2xv.hu.bu  xr12,   xr4
   1679    xvsub.h        xr3,    xr21,    xr2
   1680    xvsub.h        xr13,   xr21,    xr12
   1681 
   1682    xvmulwev.w.h   xr4,    xr0,     xr2
   1683    xvmulwod.w.h   xr5,    xr0,     xr2
   1684    xvmulwev.w.h   xr14,   xr10,    xr12
   1685    xvmulwod.w.h   xr15,   xr10,    xr12
   1686    xvmaddwev.w.h  xr4,    xr1,     xr3
   1687    xvmaddwod.w.h  xr5,    xr1,     xr3
   1688    xvmaddwev.w.h  xr14,   xr11,    xr13
   1689    xvmaddwod.w.h  xr15,   xr11,    xr13
   1690    xvssrarni.hu.w xr14,   xr4,     mask_sh
   1691    xvssrarni.hu.w xr15,   xr5,     mask_sh
   1692    xvssrlni.bu.h  xr15,   xr14,    0
        // Shuffle even/odd halves back into sequential pixel order.
   1693    xvshuf4i.w     xr6,    xr15,    0x4E
   1694    xvilvl.b       xr1,    xr6,     xr15
   1695    xvpermi.d      xr0,    xr1,     0xD8
   1696    xvst           xr0,    a0,      0
   1697 
   1698    addi.d         a2,     a2,      64
   1699    addi.d         a3,     a3,      64
   1700    addi.d         a6,     a6,      32
   1701    add.d          a0,     a0,      a1
   1702    addi.w         a5,     a5,      -1
   1703    blt            zero,   a5,      .MASK_W32_LASX
   1704    b              .MASK_END_LASX
   1705 
        // w == 64: the 32-pixel body unrolled twice per row; t8 keeps the
        // row start because a0 advances by 32 inside the unrolled body.
   1706 .MASK_W64_LASX:
   1707 .rept 2
   1708    xvld           xr0,    a2,      0
   1709    xvld           xr10,   a2,      32
   1710    xvld           xr1,    a3,      0
   1711    xvld           xr11,   a3,      32
   1712    xvld           xr22,   a6,      0
   1713    vext2xv.hu.bu  xr2,    xr22
   1714    xvpermi.q      xr4,    xr22,    0x01
   1715    vext2xv.hu.bu  xr12,   xr4
   1716    xvsub.h        xr3,    xr21,    xr2
   1717    xvsub.h        xr13,   xr21,    xr12
   1718 
   1719    xvmulwev.w.h   xr4,    xr0,     xr2
   1720    xvmulwod.w.h   xr5,    xr0,     xr2
   1721    xvmulwev.w.h   xr14,   xr10,    xr12
   1722    xvmulwod.w.h   xr15,   xr10,    xr12
   1723    xvmaddwev.w.h  xr4,    xr1,     xr3
   1724    xvmaddwod.w.h  xr5,    xr1,     xr3
   1725    xvmaddwev.w.h  xr14,   xr11,    xr13
   1726    xvmaddwod.w.h  xr15,   xr11,    xr13
   1727    xvssrarni.hu.w xr14,   xr4,     mask_sh
   1728    xvssrarni.hu.w xr15,   xr5,     mask_sh
   1729    xvssrlni.bu.h  xr15,   xr14,    0
   1730    xvshuf4i.w     xr6,    xr15,    0x4E
   1731    xvilvl.b       xr1,    xr6,     xr15
   1732    xvpermi.d      xr0,    xr1,     0xD8
   1733    xvst           xr0,    a0,      0
   1734    addi.d         a2,     a2,      64
   1735    addi.d         a3,     a3,      64
   1736    addi.d         a6,     a6,      32
   1737    addi.d         a0,     a0,      32
   1738 .endr
        // Advance to the next dst row via the saved row base in t8.
   1739    add.d          t8,     t8,     a1
   1740    add.d          a0,     t8,     zero
   1741    addi.w         a5,     a5,      -1
   1742    blt            zero,   a5,      .MASK_W64_LASX
   1743    b              .MASK_END_LASX
   1744 
        // w == 128: same body unrolled four times per row.
   1745 .MASK_W128_LASX:
   1746 .rept 4
   1747    xvld           xr0,    a2,      0
   1748    xvld           xr10,   a2,      32
   1749    xvld           xr1,    a3,      0
   1750    xvld           xr11,   a3,      32
   1751    xvld           xr22,   a6,      0
   1752    vext2xv.hu.bu  xr2,    xr22
   1753    xvpermi.q      xr4,    xr22,    0x01
   1754    vext2xv.hu.bu  xr12,   xr4
   1755    xvsub.h        xr3,    xr21,    xr2
   1756    xvsub.h        xr13,   xr21,    xr12
   1757 
   1758    xvmulwev.w.h   xr4,    xr0,     xr2
   1759    xvmulwod.w.h   xr5,    xr0,     xr2
   1760    xvmulwev.w.h   xr14,   xr10,    xr12
   1761    xvmulwod.w.h   xr15,   xr10,    xr12
   1762    xvmaddwev.w.h  xr4,    xr1,     xr3
   1763    xvmaddwod.w.h  xr5,    xr1,     xr3
   1764    xvmaddwev.w.h  xr14,   xr11,    xr13
   1765    xvmaddwod.w.h  xr15,   xr11,    xr13
   1766    xvssrarni.hu.w xr14,   xr4,     mask_sh
   1767    xvssrarni.hu.w xr15,   xr5,     mask_sh
   1768    xvssrlni.bu.h  xr15,   xr14,    0
   1769    xvshuf4i.w     xr6,    xr15,    0x4E
   1770    xvilvl.b       xr1,    xr6,     xr15
   1771    xvpermi.d      xr0,    xr1,     0xD8
   1772    xvst           xr0,    a0,      0
   1773 
   1774    addi.d         a2,     a2,      64
   1775    addi.d         a3,     a3,      64
   1776    addi.d         a6,     a6,      32
   1777    addi.d         a0,     a0,      32
   1778 .endr
   1779    add.d          t8,     t8,     a1
   1780    add.d          a0,     t8,     zero
   1781    addi.w         a5,     a5,      -1
   1782    blt            zero,   a5,      .MASK_W128_LASX
   1783 .MASK_END_LASX:
   1784 endfunc
   1785 
   1786 /*
   1787 static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
   1788                     const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
   1789                     uint8_t *mask, const int sign,
   1790                     const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
   1791 */
   1792 function w_mask_420_8bpc_lsx
        // LSX variant of w_mask_c (prototype above) for 4:2:0, i.e.
        // ss_hor = ss_ver = 1, so the output mask is 2x2-subsampled.
        // a0=dst a1=dst_stride a2=tmp1 a3=tmp2 a4=w a5=h a6=mask a7=sign
        // Per pixel: m   = min(38 + ((|tmp1 - tmp2| + 8) >> 8), 64)
        //            dst = sat_u8((tmp1*m + tmp2*(64 - m)) >> 10)
        // Per 2x2 block: *mask++ = (m00 + m01 + m10 + m11 + 2 - sign) >> 2
        // f24-f26 hold the low halves of vr24-vr26 and are callee-saved
        // FPRs per the LoongArch psABI, hence the spill below.
   1793    addi.d        sp,      sp,    -24
   1794    fst.d         f24,     sp,    0
   1795    fst.d         f25,     sp,    8
   1796    fst.d         f26,     sp,    16
        // vr20 = 64 per halfword (vldi 0x440, same encoding as the
        // "// 64" constant used elsewhere in this file); vr21 = sign
        // replicated; vr22 = 38 (0x26) per halfword, the m bias.
   1797    vldi          vr20,    0x440
   1798    vreplgr2vr.h  vr21,    a7
   1799    vldi          vr22,    0x426
   1800 
        // Width dispatch: clz(w) - 24 = 0..5 for w = 128,64,32,16,8,4,
        // indexing the 16-bit offset table below.
   1801    clz.w         t0,      a4
   1802    li.w          t1,      24
   1803    sub.w         t0,      t0,      t1
   1804    la.local      t1,      .WMASK420_LSX_JRTABLE
   1805    alsl.d        t0,      t0,      t1,    1
   1806    ld.h          t8,      t0,      0
   1807    add.d         t1,      t1,      t8
   1808    jirl          $r0,     t1,      0
   1809 
   1810    .align   3
   1811 .WMASK420_LSX_JRTABLE:
   1812    .hword .WMASK420_W128_LSX - .WMASK420_LSX_JRTABLE
   1813    .hword .WMASK420_W64_LSX  - .WMASK420_LSX_JRTABLE
   1814    .hword .WMASK420_W32_LSX  - .WMASK420_LSX_JRTABLE
   1815    .hword .WMASK420_W16_LSX  - .WMASK420_LSX_JRTABLE
   1816    .hword .WMASK420_W8_LSX   - .WMASK420_LSX_JRTABLE
   1817    .hword .WMASK420_W4_LSX   - .WMASK420_LSX_JRTABLE
   1818 
        // w == 4: four rows (16 pixels) per iteration; emits one 2x2
        // subsampled mask row pair (4 bytes).
   1819 .WMASK420_W4_LSX:
   1820    vld           vr0,     a2,       0
   1821    vld           vr1,     a2,       16
   1822    vld           vr2,     a3,       0
   1823    vld           vr3,     a3,       16
   1824    addi.w        a5,      a5,       -4
   1825 
        // vr6/vr7 = m = min(38 + ((|tmp1-tmp2|+8) >> 8), 64);
        // vr8/vr9 = 64 - m.
   1826    vabsd.h       vr4,     vr0,      vr2
   1827    vabsd.h       vr5,     vr1,      vr3
   1828    vaddi.hu      vr4,     vr4,      8
   1829    vaddi.hu      vr5,     vr5,      8
   1830    vsrli.h       vr4,     vr4,      8
   1831    vsrli.h       vr5,     vr5,      8
   1832    vadd.h        vr4,     vr4,      vr22
   1833    vadd.h        vr5,     vr5,      vr22
   1834    vmin.hu       vr6,     vr4,      vr20
   1835    vmin.hu       vr7,     vr5,      vr20
   1836    vsub.h        vr8,     vr20,     vr6
   1837    vsub.h        vr9,     vr20,     vr7
        // tmp1*m + tmp2*(64-m), even/odd lanes separately.
   1838    vmulwev.w.h   vr4,     vr6,      vr0
   1839    vmulwod.w.h   vr5,     vr6,      vr0
   1840    vmulwev.w.h   vr10,    vr7,      vr1
   1841    vmulwod.w.h   vr11,    vr7,      vr1
   1842    vmaddwev.w.h  vr4,     vr8,      vr2
   1843    vmaddwod.w.h  vr5,     vr8,      vr2
   1844    vmaddwev.w.h  vr10,    vr9,      vr3
   1845    vmaddwod.w.h  vr11,    vr9,      vr3
        // Restore pixel order, round-shift by 10, narrow to u8,
        // then store the four 4-pixel rows.
   1846    vilvl.w       vr0,     vr5,      vr4
   1847    vilvh.w       vr1,     vr5,      vr4
   1848    vilvl.w       vr2,     vr11,     vr10
   1849    vilvh.w       vr3,     vr11,     vr10
   1850    vssrarni.hu.w vr1,     vr0,      10
   1851    vssrarni.hu.w vr3,     vr2,      10
   1852    vssrlni.bu.h  vr3,     vr1,      0
   1853    vstelm.w      vr3,     a0,       0,    0
   1854    add.d         a0,      a0,       a1
   1855    vstelm.w      vr3,     a0,       0,    1
   1856    add.d         a0,      a0,       a1
   1857    vstelm.w      vr3,     a0,       0,    2
   1858    add.d         a0,      a0,       a1
   1859    vstelm.w      vr3,     a0,       0,    3
   1860    add.d         a0,      a0,       a1
        // Mask output: horizontal pair sums (pickev+pickod+add), then
        // vertical pair sums (vhaddw), minus sign, +2, >> 2.
   1861    vpickev.h     vr0,     vr7,      vr6
   1862    vpickod.h     vr1,     vr7,      vr6
   1863    vadd.h        vr0,     vr0,      vr1
   1864    vshuf4i.h     vr0,     vr0,      0xd8
   1865    vhaddw.w.h    vr2,     vr0,      vr0
   1866    vpickev.h     vr2,     vr2,      vr2
   1867    vsub.h        vr2,     vr2,      vr21
   1868    vaddi.hu      vr2,     vr2,      2
   1869    vssrani.bu.h  vr2,     vr2,      2
   1870    vstelm.w      vr2,     a6,       0,    0
   1871 
   1872    addi.d        a2,      a2,       32
   1873    addi.d        a3,      a3,       32
   1874    addi.d        a6,      a6,       4
   1875    blt           zero,    a5,       .WMASK420_W4_LSX
   1876    b             .END_W420
   1877 
        // w == 8: two rows (16 pixels) per iteration; 4 mask bytes.
   1878 .WMASK420_W8_LSX:
   1879    vld           vr0,     a2,       0
   1880    vld           vr1,     a2,       16
   1881    vld           vr2,     a3,       0
   1882    vld           vr3,     a3,       16
   1883    addi.w        a5,      a5,       -2
   1884 
        // Same m / 64-m computation as the w==4 path.
   1885    vabsd.h       vr4,     vr0,      vr2
   1886    vabsd.h       vr5,     vr1,      vr3
   1887    vaddi.hu      vr4,     vr4,      8
   1888    vaddi.hu      vr5,     vr5,      8
   1889    vsrli.h       vr4,     vr4,      8
   1890    vsrli.h       vr5,     vr5,      8
   1891    vadd.h        vr4,     vr4,      vr22
   1892    vadd.h        vr5,     vr5,      vr22
   1893    vmin.hu       vr6,     vr4,      vr20
   1894    vmin.hu       vr7,     vr5,      vr20
   1895    vsub.h        vr8,     vr20,     vr6
   1896    vsub.h        vr9,     vr20,     vr7
   1897    vmulwev.w.h   vr4,     vr6,      vr0
   1898    vmulwod.w.h   vr5,     vr6,      vr0
   1899    vmulwev.w.h   vr10,    vr7,      vr1
   1900    vmulwod.w.h   vr11,    vr7,      vr1
   1901    vmaddwev.w.h  vr4,     vr8,      vr2
   1902    vmaddwod.w.h  vr5,     vr8,      vr2
   1903    vmaddwev.w.h  vr10,    vr9,      vr3
   1904    vmaddwod.w.h  vr11,    vr9,      vr3
   1905    vssrarni.hu.w vr10,    vr4,      10
   1906    vssrarni.hu.w vr11,    vr5,      10
   1907    vssrlni.bu.h  vr11,    vr10,     0
        // Merge even/odd bytes back into pixel order, store two rows.
   1908    vshuf4i.w     vr0,     vr11,     0x4E
   1909    vilvl.b       vr3,     vr0,      vr11
   1910    vstelm.d      vr3,     a0,       0,     0
   1911    add.d         a0,      a0,       a1
   1912    vstelm.d      vr3,     a0,       0,     1
   1913    add.d         a0,      a0,       a1
        // Mask: horizontal pair sums, then add row 1 (high half) to row 0.
   1914    vpickev.h     vr0,     vr7,      vr6
   1915    vpickod.h     vr1,     vr7,      vr6
   1916    vadd.h        vr0,     vr0,      vr1
   1917    vilvh.d       vr2,     vr0,      vr0
   1918    vadd.h        vr2,     vr2,      vr0
   1919    vsub.h        vr2,     vr2,      vr21
   1920    vaddi.hu      vr2,     vr2,      2
   1921    vssrani.bu.h  vr2,     vr2,      2
   1922    vstelm.w      vr2,     a6,       0,     0
   1923 
   1924    addi.d        a2,      a2,       32
   1925    addi.d        a3,      a3,       32
   1926    addi.d        a6,      a6,       4
   1927    blt           zero,    a5,       .WMASK420_W8_LSX
   1928    b             .END_W420
   1929 
        // w == 16: two rows per iteration (the second row is loaded at
        // tmp + 2*w via alsl); emits 8 mask bytes.
   1930 .WMASK420_W16_LSX:
   1931    vld           vr0,     a2,       0
   1932    vld           vr1,     a2,       16
   1933    alsl.d        a2,      a4,       a2,    1
   1934    vld           vr2,     a2,       0
   1935    vld           vr3,     a2,       16
   1936    vld           vr4,     a3,       0
   1937    vld           vr5,     a3,       16
   1938    alsl.d        a3,      a4,       a3,    1
   1939    vld           vr6,     a3,       0
   1940    vld           vr7,     a3,       16
   1941 
        // vr12..vr15 = m for both rows; vr16..vr19 = 64 - m.
   1942    vabsd.h       vr8,     vr0,      vr4
   1943    vabsd.h       vr9,     vr1,      vr5
   1944    vabsd.h       vr10,    vr2,      vr6
   1945    vabsd.h       vr11,    vr3,      vr7
   1946    vaddi.hu      vr8,     vr8,      8
   1947    vaddi.hu      vr9,     vr9,      8
   1948    vaddi.hu      vr10,    vr10,     8
   1949    vaddi.hu      vr11,    vr11,     8
   1950    vsrli.h       vr8,     vr8,      8
   1951    vsrli.h       vr9,     vr9,      8
   1952    vsrli.h       vr10,    vr10,     8
   1953    vsrli.h       vr11,    vr11,     8
   1954    vadd.h        vr8,     vr8,      vr22
   1955    vadd.h        vr9,     vr9,      vr22
   1956    vadd.h        vr10,    vr10,     vr22
   1957    vadd.h        vr11,    vr11,     vr22
   1958    vmin.hu       vr12,    vr8,      vr20
   1959    vmin.hu       vr13,    vr9,      vr20
   1960    vmin.hu       vr14,    vr10,     vr20
   1961    vmin.hu       vr15,    vr11,     vr20
   1962    vsub.h        vr16,    vr20,     vr12
   1963    vsub.h        vr17,    vr20,     vr13
   1964    vsub.h        vr18,    vr20,     vr14
   1965    vsub.h        vr19,    vr20,     vr15
        // Blend both rows (vr23-vr26 scratch needs the saved FPRs).
   1966    vmulwev.w.h   vr8,     vr12,     vr0
   1967    vmulwod.w.h   vr9,     vr12,     vr0
   1968    vmulwev.w.h   vr10,    vr13,     vr1
   1969    vmulwod.w.h   vr11,    vr13,     vr1
   1970    vmulwev.w.h   vr23,    vr14,     vr2
   1971    vmulwod.w.h   vr24,    vr14,     vr2
   1972    vmulwev.w.h   vr25,    vr15,     vr3
   1973    vmulwod.w.h   vr26,    vr15,     vr3
   1974    vmaddwev.w.h  vr8,     vr16,     vr4
   1975    vmaddwod.w.h  vr9,     vr16,     vr4
   1976    vmaddwev.w.h  vr10,    vr17,     vr5
   1977    vmaddwod.w.h  vr11,    vr17,     vr5
   1978    vmaddwev.w.h  vr23,    vr18,     vr6
   1979    vmaddwod.w.h  vr24,    vr18,     vr6
   1980    vmaddwev.w.h  vr25,    vr19,     vr7
   1981    vmaddwod.w.h  vr26,    vr19,     vr7
   1982    vssrarni.hu.w vr10,    vr8,      10
   1983    vssrarni.hu.w vr11,    vr9,      10
   1984    vssrarni.hu.w vr25,    vr23,     10
   1985    vssrarni.hu.w vr26,    vr24,     10
   1986    vssrlni.bu.h  vr11,    vr10,     0
   1987    vssrlni.bu.h  vr26,    vr25,     0
   1988    vshuf4i.w     vr0,     vr11,     0x4E
   1989    vshuf4i.w     vr1,     vr26,     0x4E
   1990    vilvl.b       vr3,     vr0,      vr11
   1991    vilvl.b       vr7,     vr1,      vr26
   1992    vst           vr3,     a0,       0
   1993    vstx          vr7,     a0,       a1
        // Mask: horizontal pair sums per row, add the two rows; the
        // rounding of vssrarni supplies the +2 before the >> 2.
   1994    vpickev.h     vr0,     vr13,     vr12
   1995    vpickod.h     vr1,     vr13,     vr12
   1996    vpickev.h     vr2,     vr15,     vr14
   1997    vpickod.h     vr3,     vr15,     vr14
   1998    vadd.h        vr4,     vr0,      vr1
   1999    vadd.h        vr5,     vr2,      vr3
   2000    vadd.h        vr4,     vr4,      vr5
   2001    vsub.h        vr4,     vr4,      vr21
   2002    vssrarni.bu.h vr4,     vr4,      2
   2003    vstelm.d      vr4,     a6,       0,    0
 
   2005    alsl.d        a2,      a4,       a2,   1
   2006    alsl.d        a3,      a4,       a3,   1
   2007    alsl.d        a0,      a1,       a0,   1
   2008    addi.d        a6,      a6,       8
   2009    addi.w        a5,      a5,       -2
   2010    blt           zero,    a5,       .WMASK420_W16_LSX
   2011    b    .END_W420
 
        // w >= 32: generic two-row tiling, 16 pixels per inner step.
   2013 .WMASK420_W32_LSX:
   2014 .WMASK420_W64_LSX:
   2015 .WMASK420_W128_LSX:
 
        // t1/t2 = row-0 tmp1/tmp2, t5/t6 = row-1 tmp1/tmp2 (offset 2*w),
        // t3 = dst, t4 = mask out, t7 = remaining width.
   2017 .LOOP_W32_420_LSX:
   2018    add.d         t1,       a2,       zero
   2019    add.d         t2,       a3,       zero
   2020    add.d         t3,       a0,       zero
   2021    add.d         t4,       a6,       zero
   2022    alsl.d        t5,       a4,       t1,     1
   2023    alsl.d        t6,       a4,       t2,     1
   2024    or            t7,       a4,       a4
 
   2026 .W32_420_LSX:
   2027    vld           vr0,      t1,       0
   2028    vld           vr1,      t1,       16
   2029    vld           vr2,      t2,       0
   2030    vld           vr3,      t2,       16
   2031    vld           vr4,      t5,       0
   2032    vld           vr5,      t5,       16
   2033    vld           vr6,      t6,       0
   2034    vld           vr7,      t6,       16
   2035    addi.d        t1,       t1,       32
   2036    addi.d        t2,       t2,       32
   2037    addi.d        t5,       t5,       32
   2038    addi.d        t6,       t6,       32
   2039    addi.w        t7,       t7,       -16
        // m and 64-m for 16 pixels of each of the two rows.
   2040    vabsd.h       vr8,      vr0,      vr2
   2041    vabsd.h       vr9,      vr1,      vr3
   2042    vabsd.h       vr10,     vr4,      vr6
   2043    vabsd.h       vr11,     vr5,      vr7
   2044    vaddi.hu      vr8,      vr8,      8
   2045    vaddi.hu      vr9,      vr9,      8
   2046    vaddi.hu      vr10,     vr10,     8
   2047    vaddi.hu      vr11,     vr11,     8
   2048    vsrli.h       vr8,      vr8,      8
   2049    vsrli.h       vr9,      vr9,      8
   2050    vsrli.h       vr10,     vr10,     8
   2051    vsrli.h       vr11,     vr11,     8
   2052    vadd.h        vr8,      vr8,      vr22
   2053    vadd.h        vr9,      vr9,      vr22
   2054    vadd.h        vr10,     vr10,     vr22
   2055    vadd.h        vr11,     vr11,     vr22
   2056    vmin.hu       vr12,     vr8,      vr20
   2057    vmin.hu       vr13,     vr9,      vr20
   2058    vmin.hu       vr14,     vr10,     vr20
   2059    vmin.hu       vr15,     vr11,     vr20
   2060    vsub.h        vr16,     vr20,     vr12
   2061    vsub.h        vr17,     vr20,     vr13
   2062    vsub.h        vr18,     vr20,     vr14
   2063    vsub.h        vr19,     vr20,     vr15
   2064    vmulwev.w.h   vr8,      vr12,     vr0
   2065    vmulwod.w.h   vr9,      vr12,     vr0
   2066    vmulwev.w.h   vr10,     vr13,     vr1
   2067    vmulwod.w.h   vr11,     vr13,     vr1
   2068    vmulwev.w.h   vr23,     vr14,     vr4
   2069    vmulwod.w.h   vr24,     vr14,     vr4
   2070    vmulwev.w.h   vr25,     vr15,     vr5
   2071    vmulwod.w.h   vr26,     vr15,     vr5
   2072    vmaddwev.w.h  vr8,      vr16,     vr2
   2073    vmaddwod.w.h  vr9,      vr16,     vr2
   2074    vmaddwev.w.h  vr10,     vr17,     vr3
   2075    vmaddwod.w.h  vr11,     vr17,     vr3
   2076    vmaddwev.w.h  vr23,     vr18,     vr6
   2077    vmaddwod.w.h  vr24,     vr18,     vr6
   2078    vmaddwev.w.h  vr25,     vr19,     vr7
   2079    vmaddwod.w.h  vr26,     vr19,     vr7
   2080    vssrarni.hu.w vr10,     vr8,      10
   2081    vssrarni.hu.w vr11,     vr9,      10
   2082    vssrarni.hu.w vr25,     vr23,     10
   2083    vssrarni.hu.w vr26,     vr24,     10
   2084    vssrlni.bu.h  vr11,     vr10,     0
   2085    vssrlni.bu.h  vr26,     vr25,     0
   2086    vshuf4i.w     vr8,      vr11,     0x4E
   2087    vshuf4i.w     vr9,      vr26,     0x4E
   2088    vilvl.b       vr3,      vr8,      vr11
   2089    vilvl.b       vr7,      vr9,      vr26
        // 16 blended pixels for each row (row 1 at t3 + stride).
   2090    vst           vr3,      t3,       0
   2091    vstx          vr7,      a1,       t3
   2092    addi.d        t3,       t3,       16
        // 8 subsampled mask bytes for this 16-pixel column strip.
   2093    vpickev.h     vr8,      vr13,     vr12
   2094    vpickod.h     vr9,      vr13,     vr12
   2095    vpickev.h     vr10,     vr15,     vr14
   2096    vpickod.h     vr11,     vr15,     vr14
   2097    vadd.h        vr8,      vr8,      vr9
   2098    vadd.h        vr10,     vr10,     vr11
   2099    vadd.h        vr12,     vr8,      vr10
   2100    vsub.h        vr12,     vr12,     vr21
   2101    vssrarni.bu.h vr12,     vr12,     2
   2102    vstelm.d      vr12,     t4,       0,     0
   2103    addi.d        t4,       t4,       8
   2104    bne           t7,       zero,     .W32_420_LSX
 
        // Skip the two consumed rows: tmp += 4*w bytes, dst += 2*stride,
        // mask += w/2.
   2106    alsl.d        a2,       a4,       a2,     2
   2107    alsl.d        a3,       a4,       a3,     2
   2108    alsl.d        a0,       a1,       a0,     1
   2109    srai.w        t8,       a4,       1
   2110    add.d         a6,       a6,       t8
   2111    addi.w        a5,       a5,       -2
   2112    blt           zero,     a5,       .LOOP_W32_420_LSX
 
        // Restore callee-saved FPRs and release the stack frame.
   2114 .END_W420:
   2115    fld.d            f24,     sp,    0
   2116    fld.d            f25,     sp,    8
   2117    fld.d            f26,     sp,    16
   2118    addi.d           sp,      sp,    24
   2119 endfunc
   2120 
   2121 function w_mask_420_8bpc_lasx
   2122    xvldi          xr20,    0x440
   2123    xvreplgr2vr.h  xr21,    a7
   2124    xvldi          xr22,    0x426
   2125 
   2126    clz.w          t0,      a4
   2127    li.w           t1,      24
   2128    sub.w          t0,      t0,      t1
   2129    la.local       t1,      .WMASK420_LASX_JRTABLE
   2130    alsl.d         t0,      t0,      t1,    1
   2131    ld.h           t8,      t0,      0
   2132    add.d          t1,      t1,      t8
   2133    jirl           $r0,     t1,      0
   2134 
   2135    .align   3
   2136 .WMASK420_LASX_JRTABLE:
   2137    .hword .WMASK420_W128_LASX - .WMASK420_LASX_JRTABLE
   2138    .hword .WMASK420_W64_LASX  - .WMASK420_LASX_JRTABLE
   2139    .hword .WMASK420_W32_LASX  - .WMASK420_LASX_JRTABLE
   2140    .hword .WMASK420_W16_LASX  - .WMASK420_LASX_JRTABLE
   2141    .hword .WMASK420_W8_LASX   - .WMASK420_LASX_JRTABLE
   2142    .hword .WMASK420_W4_LASX   - .WMASK420_LASX_JRTABLE
   2143 
   2144 .WMASK420_W4_LASX:
   2145    xvld           xr0,     a2,     0
   2146    xvld           xr1,     a3,     0
   2147    addi.w         a5,      a5,     -4
   2148 
   2149    xvabsd.h       xr2,     xr0,    xr1
   2150    xvaddi.hu      xr2,     xr2,    8
   2151    xvsrli.h       xr2,     xr2,    8
   2152    xvadd.h        xr2,     xr2,    xr22
   2153    xvmin.hu       xr3,     xr2,    xr20
   2154    xvsub.h        xr4,     xr20,   xr3
   2155    xvmulwev.w.h   xr5,     xr3,    xr0
   2156    xvmulwod.w.h   xr6,     xr3,    xr0
   2157    xvmaddwev.w.h  xr5,     xr4,    xr1
   2158    xvmaddwod.w.h  xr6,     xr4,    xr1
   2159    xvilvl.w       xr7,     xr6,    xr5
   2160    xvilvh.w       xr8,     xr6,    xr5
   2161    xvssrarni.hu.w xr8,     xr7,    10
   2162    xvssrlni.bu.h  xr9,     xr8,    0
   2163    vstelm.w       vr9,     a0,     0,     0
   2164    add.d          a0,      a0,     a1
   2165    vstelm.w       vr9,     a0,     0,     1
   2166    add.d          a0,      a0,     a1
   2167    xvstelm.w      xr9,     a0,     0,     4
   2168    add.d          a0,      a0,     a1
   2169    xvstelm.w      xr9,     a0,     0,     5
   2170    add.d          a0,      a0,     a1
   2171 
   2172    xvhaddw.w.h    xr3,     xr3,    xr3
   2173    xvpermi.d      xr4,     xr3,    0xb1
   2174    xvadd.h        xr3,     xr3,    xr4
   2175    xvpickev.h     xr3,     xr3,    xr3
   2176    xvsub.h        xr3,     xr3,    xr21
   2177    xvssrarni.bu.h xr3,     xr3,    2
   2178    vstelm.h       vr3,     a6,     0,     0
   2179    xvstelm.h      xr3,     a6,     2,     8
   2180 
   2181    addi.d         a2,     a2,      32
   2182    addi.d         a3,     a3,      32
   2183    addi.d         a6,     a6,      4
   2184    blt            zero,   a5,      .WMASK420_W4_LASX
   2185    b              .END_W420_LASX
   2186 
   2187 .WMASK420_W8_LASX:
   2188    xvld           xr0,      a2,     0
   2189    xvld           xr1,      a2,     32
   2190    xvld           xr2,      a3,     0
   2191    xvld           xr3,      a3,     32
   2192    addi.w         a5,       a5,     -4
   2193 
   2194    xvabsd.h       xr4,      xr0,    xr2
   2195    xvabsd.h       xr5,      xr1,    xr3
   2196    xvaddi.hu      xr4,      xr4,    8
   2197    xvaddi.hu      xr5,      xr5,    8
   2198    xvsrli.h       xr4,      xr4,    8
   2199    xvsrli.h       xr5,      xr5,    8
   2200    xvadd.h        xr4,      xr4,    xr22
   2201    xvadd.h        xr5,      xr5,    xr22
   2202    xvmin.hu       xr6,      xr4,    xr20
   2203    xvmin.hu       xr7,      xr5,    xr20
   2204    xvsub.h        xr8,      xr20,   xr6
   2205    xvsub.h        xr9,      xr20,   xr7
   2206    xvmulwev.w.h   xr10,     xr6,    xr0
   2207    xvmulwod.w.h   xr11,     xr6,    xr0
   2208    xvmulwev.w.h   xr12,     xr7,    xr1
   2209    xvmulwod.w.h   xr13,     xr7,    xr1
   2210    xvmaddwev.w.h  xr10,     xr8,    xr2
   2211    xvmaddwod.w.h  xr11,     xr8,    xr2
   2212    xvmaddwev.w.h  xr12,     xr9,    xr3
   2213    xvmaddwod.w.h  xr13,     xr9,    xr3
   2214    xvssrarni.hu.w xr12,     xr10,   10
   2215    xvssrarni.hu.w xr13,     xr11,   10
   2216    xvssrlni.bu.h  xr13,     xr12,   0
   2217    xvshuf4i.w     xr1,      xr13,   0x4E
   2218    xvilvl.b       xr17,     xr1,    xr13
   2219    vstelm.d       vr17,     a0,     0,     0
   2220    add.d          a0,       a0,     a1
   2221    xvstelm.d      xr17,     a0,     0,     2
   2222    add.d          a0,       a0,     a1
   2223    xvstelm.d      xr17,     a0,     0,     1
   2224    add.d          a0,       a0,     a1
   2225    xvstelm.d      xr17,     a0,     0,     3
   2226    add.d          a0,       a0,     a1
   2227 
   2228    xvhaddw.w.h    xr6,      xr6,    xr6
   2229    xvhaddw.w.h    xr7,      xr7,    xr7
   2230    xvpickev.h     xr8,      xr7,    xr6
   2231    xvpermi.q      xr9,      xr8,    0x01
   2232    vadd.h         vr8,      vr8,    vr9
   2233    vsub.h         vr8,      vr8,    vr21
   2234    vssrarni.bu.h  vr8,      vr8,    2
   2235    vstelm.d       vr8,      a6,     0,    0
   2236    addi.d         a2,       a2,     64
   2237    addi.d         a3,       a3,     64
   2238    addi.d         a6,       a6,     8
   2239    blt            zero,     a5,     .WMASK420_W8_LASX
   2240    b              .END_W420_LASX
   2241 
   2242 .WMASK420_W16_LASX:
   2243    xvld           xr0,      a2,     0
   2244    xvld           xr1,      a2,     32
   2245    xvld           xr2,      a3,     0
   2246    xvld           xr3,      a3,     32
   2247    addi.w         a5,       a5,     -2
   2248 
   2249    xvabsd.h       xr4,      xr0,    xr2
   2250    xvabsd.h       xr5,      xr1,    xr3
   2251    xvaddi.hu      xr4,      xr4,    8
   2252    xvaddi.hu      xr5,      xr5,    8
   2253    xvsrli.h       xr4,      xr4,    8
   2254    xvsrli.h       xr5,      xr5,    8
   2255    xvadd.h        xr4,      xr4,    xr22
   2256    xvadd.h        xr5,      xr5,    xr22
   2257    xvmin.hu       xr4,      xr4,    xr20
   2258    xvmin.hu       xr5,      xr5,    xr20
   2259    xvsub.h        xr6,      xr20,   xr4
   2260    xvsub.h        xr7,      xr20,   xr5
   2261    xvmulwev.w.h   xr8,      xr4,    xr0
   2262    xvmulwod.w.h   xr9,      xr4,    xr0
   2263    xvmulwev.w.h   xr10,     xr5,    xr1
   2264    xvmulwod.w.h   xr11,     xr5,    xr1
   2265    xvmaddwev.w.h  xr8,      xr6,    xr2
   2266    xvmaddwod.w.h  xr9,      xr6,    xr2
   2267    xvmaddwev.w.h  xr10,     xr7,    xr3
   2268    xvmaddwod.w.h  xr11,     xr7,    xr3
   2269    xvssrarni.hu.w xr10,     xr8,    10
   2270    xvssrarni.hu.w xr11,     xr9,    10
   2271    xvssrlni.bu.h  xr11,     xr10,   0
   2272    xvshuf4i.w     xr8,      xr11,   0x4E
   2273    xvilvl.b       xr15,     xr8,    xr11
   2274    xvpermi.d      xr16,     xr15,   0xd8
   2275    vst            vr16,     a0,     0
   2276    add.d          a0,       a0,     a1
   2277    xvpermi.q      xr16,     xr16,   0x01
   2278    vst            vr16,     a0,     0
   2279    add.d          a0,       a0,     a1
   2280 
   2281    xvhaddw.w.h    xr4,      xr4,    xr4
   2282    xvhaddw.w.h    xr5,      xr5,    xr5
   2283    xvadd.h        xr4,      xr5,    xr4
   2284    xvpickev.h     xr6,      xr4,    xr4
   2285    xvpermi.d      xr7,      xr6,    0x08
   2286    vsub.h         vr7,      vr7,    vr21
   2287    vssrarni.bu.h  vr7,      vr7,    2
   2288    vstelm.d       vr7,      a6,     0,    0
   2289 
   2290    addi.d         a2,       a2,     64
   2291    addi.d         a3,       a3,     64
   2292    addi.d         a6,       a6,     8
   2293    blt            zero,     a5,     .WMASK420_W16_LASX
   2294    b              .END_W420_LASX
   2295 
   2296 .WMASK420_W32_LASX:
   2297 .WMASK420_W64_LASX:
   2298 .WMASK420_W128_LASX:
   2299 
   2300 .LOOP_W32_420_LASX:
   2301    add.d          t1,       a2,       zero
   2302    add.d          t2,       a3,       zero
   2303    add.d          t3,       a0,       zero
   2304    add.d          t4,       a6,       zero
   2305    alsl.d         t5,       a4,       t1,     1
   2306    alsl.d         t6,       a4,       t2,     1
   2307    or             t7,       a4,       a4
   2308 .W32_420_LASX:
   2309    xvld           xr0,      t1,       0
   2310    xvld           xr1,      t2,       0
   2311    xvld           xr2,      t5,       0
   2312    xvld           xr3,      t6,       0
   2313    addi.d         t1,       t1,       32
   2314    addi.d         t2,       t2,       32
   2315    addi.d         t5,       t5,       32
   2316    addi.d         t6,       t6,       32
   2317    addi.w         t7,       t7,       -16
   2318    xvabsd.h       xr4,      xr0,      xr1
   2319    xvabsd.h       xr5,      xr2,      xr3
   2320    xvaddi.hu      xr4,      xr4,      8
   2321    xvaddi.hu      xr5,      xr5,      8
   2322    xvsrli.h       xr4,      xr4,      8
   2323    xvsrli.h       xr5,      xr5,      8
   2324    xvadd.h        xr4,      xr4,      xr22
   2325    xvadd.h        xr5,      xr5,      xr22
   2326    xvmin.hu       xr6,      xr4,      xr20
   2327    xvmin.hu       xr7,      xr5,      xr20
   2328    xvsub.h        xr8,      xr20,     xr6
   2329    xvsub.h        xr9,      xr20,     xr7
   2330    xvmulwev.w.h   xr10,     xr6,      xr0
   2331    xvmulwod.w.h   xr11,     xr6,      xr0
   2332    xvmulwev.w.h   xr12,     xr7,      xr2
   2333    xvmulwod.w.h   xr13,     xr7,      xr2
   2334    xvmaddwev.w.h  xr10,     xr8,      xr1
   2335    xvmaddwod.w.h  xr11,     xr8,      xr1
   2336    xvmaddwev.w.h  xr12,     xr9,      xr3
   2337    xvmaddwod.w.h  xr13,     xr9,      xr3
   2338    xvssrarni.hu.w xr12,     xr10,     10
   2339    xvssrarni.hu.w xr13,     xr11,     10
   2340    xvssrlni.bu.h  xr13,     xr12,     0
   2341    xvshuf4i.w     xr10,     xr13,     0x4E
   2342    xvilvl.b       xr17,     xr10,     xr13
   2343    xvpermi.d      xr18,     xr17,     0x08
   2344    xvpermi.d      xr19,     xr17,     0x0d
   2345    vst            vr18,     t3,       0
   2346    vstx           vr19,     t3,       a1
   2347    addi.d         t3,       t3,       16
   2348 
   2349    xvhaddw.w.h    xr6,      xr6,      xr6
   2350    xvhaddw.w.h    xr7,      xr7,      xr7
   2351    xvadd.h        xr6,      xr7,      xr6
   2352    xvpickev.h     xr7,      xr6,      xr6
   2353    xvpermi.d      xr8,      xr7,      0x08
   2354    vsub.h         vr9,      vr8,      vr21
   2355    vssrarni.bu.h  vr9,      vr9,      2
   2356    vstelm.d       vr9,      t4,       0,      0
   2357    addi.d         t4,       t4,       8
   2358    bne            t7,       zero,     .W32_420_LASX
   2359 
   2360    alsl.d         a2,       a4,       a2,     2
   2361    alsl.d         a3,       a4,       a3,     2
   2362    alsl.d         a0,       a1,       a0,     1
   2363    srai.w         t8,       a4,       1
   2364    add.d          a6,       a6,       t8
   2365    addi.w         a5,       a5,       -2
   2366    blt            zero,     a5,       .LOOP_W32_420_LASX
   2367 
   2368 .END_W420_LASX:
   2369 endfunc
   2370 
   2371 #undef bpc_sh
   2372 #undef bpcw_sh
   2373 
   // 4-to-1 widening reduction: sums each group of four adjacent signed h
   // (16-bit) lanes of \in0 into one d (64-bit) lane, in place.
   // NOTE(review): despite the name, this is NOT the single vhaddw.d.h
   // instruction (which only pairs adjacent lanes once) — it applies two
   // pairwise stages.
   2374 .macro  vhaddw.d.h  in0
   2375    vhaddw.w.h  \in0,  \in0,  \in0   // pairwise h -> w sums
   2376    vhaddw.d.w  \in0,  \in0,  \in0   // pairwise w -> d sums
   2377 .endm
   // 4-to-1 widening reduction: sums each group of four adjacent signed w
   // (32-bit) lanes of \in0 into one q (128-bit) lane, in place.
   // NOTE(review): as with the macro above, this shadows a real mnemonic but
   // performs two pairwise stages, not one.
   2378 .macro  vhaddw.q.w  in0
   2379    vhaddw.d.w  \in0,  \in0,  \in0   // pairwise w -> d sums
   2380    vhaddw.q.d  \in0,  \in0,  \in0   // pairwise d -> q sums
   2381 .endm
   // PUT_H_8W: horizontal 8-tap filter for 8 output pixels; input row bytes in
   // \in0, result left in \in0 as 8 halfword sums (caller still narrows with
   // a >>6 shift, e.g. vssrani.bu.h ..., 6 — see put_h_8w/put_h_16w loops).
   //
   // Register contract (as set up by the put_h_8w / put_h_16w paths):
   //   vr6/vr7/vr8 = shuffle masks subpel_h_shuf1 +0 / +4 / +8, producing the
   //                 sliding 4-byte tap windows for each output pixel
   //   vr10        = filter taps 0-3 (one tap pair per 32-bit lane)
   //   vr11        = filter taps 4-7
   //   vr9         = rounding bias (34 replicated per halfword; set at put_h)
   // Clobbers: vr2-vr4, vr12-vr15.
   2382 .macro PUT_H_8W in0
   2383    vshuf.b          vr2,    \in0,  \in0,   vr6   // windows for pixels 0-3, taps 0-3
   2384    vshuf.b          vr3,    \in0,  \in0,   vr7   // windows shared by taps 4-7 / next pixels
   2385    vshuf.b          vr4,    \in0,  \in0,   vr8   // windows for pixels 4-7, taps 4-7
   2386    vmulwev.h.bu.b   vr12,   vr2,   vr10          // u8 src * s8 coef, even byte lanes
   2387    vmulwev.h.bu.b   vr13,   vr3,   vr11
   2388    vmulwev.h.bu.b   vr14,   vr3,   vr10
   2389    vmulwev.h.bu.b   vr15,   vr4,   vr11
   2390    vmaddwod.h.bu.b  vr12,   vr2,   vr10          // accumulate odd byte lanes:
   2391    vmaddwod.h.bu.b  vr13,   vr3,   vr11          // each h lane = one tap-pair product sum
   2392    vmaddwod.h.bu.b  vr14,   vr3,   vr10
   2393    vmaddwod.h.bu.b  vr15,   vr4,   vr11
   2394    vadd.h           vr12,   vr12,  vr13          // taps 0-3 + taps 4-7 (pixels 0-3)
   2395    vadd.h           vr14,   vr14,  vr15          // taps 0-3 + taps 4-7 (pixels 4-7)
   2396    vhaddw.w.h       vr12,   vr12,  vr12          // fold remaining pairs -> one w per pixel
   2397    vhaddw.w.h       vr14,   vr14,  vr14
   2398    vpickev.h        \in0,   vr14,  vr12          // keep low halfword of each w sum
   2399    vadd.h           \in0,   \in0,  vr9           // + rounding bias
   2400 .endm
   2401 
// vshuf.b mask for the 2-wide horizontal filter (put_h_2w): two overlapping
// 4-byte tap windows (offsets 0 and 1) taken from the first source vector
// (indices 0-15) and the same two windows from the second source vector
// (indices 16-31), so two rows are filtered with a single shuffle.
const subpel_h_shuf0
.byte 0, 1, 2, 3, 1, 2, 3, 4, 16, 17, 18, 19, 17, 18, 19, 20
endconst
// vshuf.b mask: four overlapping 4-byte tap windows at offsets 0..3 within a
// single source vector. Used directly by put_h_4w, and with +4/+8 byte offsets
// (vaddi.bu) to build the masks for the 8-wide and 16-wide paths.
const subpel_h_shuf1
.byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
endconst
// 32-byte shuffle mask: overlapping 4-byte tap windows at offsets 0/1 (row 1)
// and 2/3 (row 2), each paired with the matching windows 8 bytes further in.
// NOTE(review): no use site is visible in this excerpt — presumably consumed
// as a 256-bit LASX mask by an 8-tap path further down; confirm at use site.
const subpel_h_shuf2
.byte 0, 1, 2, 3, 1, 2, 3, 4,  8,  9, 10, 11,  9, 10, 11, 12
.byte 2, 3, 4, 5, 3, 4, 5, 6, 10, 11, 12, 13, 11, 12, 13, 14
endconst
// 32-byte shuffle mask interleaving byte i with byte i+4 (0,4, 1,5, ...),
// identical in both 128-bit halves.
// NOTE(review): no use site is visible in this excerpt — looks like it pairs
// source bytes 4 apart for a tap-pair multiply; confirm at use site.
const subpel_h_shuf3
.byte 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
.byte 0, 4, 1, 5, 2, 6, 3, 7, 4, 8, 5, 9, 6, 10, 7, 11
endconst
   2416 
   // FILTER_8TAP_8W: horizontal 8-tap filter for 8 pixels, same dataflow as
   // PUT_H_8W but in different registers, and it finishes with a rounding
   // arithmetic shift right by 2 (vssrarni.h.w), keeping higher intermediate
   // precision instead of adding a final-output bias.
   //
   // Register contract (caller setup is outside this excerpt — presumably
   // analogous to PUT_H_8W; confirm at use site):
   //   vr7/vr11/vr12 = shuffle masks for the sliding tap windows
   //   vr8 / vr10    = filter taps 0-3 / 4-7 (one tap pair per 32-bit lane)
   // Input row bytes in \in0; result (8 halfwords) left in \in0.
   // Clobbers: vr13-vr19.
   2417 .macro FILTER_8TAP_8W in0
   2418    vshuf.b         vr13,    \in0,  \in0,  vr7    // tap windows, first group
   2419    vshuf.b         vr14,    \in0,  \in0,  vr11   // shared middle windows
   2420    vshuf.b         vr15,    \in0,  \in0,  vr12   // tap windows, last group
   2421    vmulwev.h.bu.b  vr16,    vr13,  vr8           // u8 src * s8 coef, even lanes
   2422    vmulwev.h.bu.b  vr17,    vr14,  vr10
   2423    vmulwev.h.bu.b  vr18,    vr14,  vr8
   2424    vmulwev.h.bu.b  vr19,    vr15,  vr10
   2425    vmaddwod.h.bu.b vr16,    vr13,  vr8           // + odd lanes: one tap-pair sum per h
   2426    vmaddwod.h.bu.b vr17,    vr14,  vr10
   2427    vmaddwod.h.bu.b vr18,    vr14,  vr8
   2428    vmaddwod.h.bu.b vr19,    vr15,  vr10
   2429    vadd.h          vr16,    vr16,  vr17          // combine tap groups (pixels 0-3)
   2430    vadd.h          vr18,    vr18,  vr19          // combine tap groups (pixels 4-7)
   2431    vhaddw.w.h      vr16,    vr16,  vr16          // fold pairs -> one w sum per pixel
   2432    vhaddw.w.h      \in0,    vr18,  vr18
   2433    vssrarni.h.w    \in0,    vr16,  2             // round, >>2, saturate-narrow w -> h
   2434 .endm
   2435 
   2436 .macro PUT_8TAP_8BPC_LSX lable
   2437    li.w             t0,     4
   2438    la.local         t6,     dav1d_mc_subpel_filters
   2439    slli.d           t2,     a3,    1  //src_stride*2
   2440    add.d            t3,     t2,    a3 //src_stride*3
   2441    slli.d           t4,     t2,    1  //src_stride*4
   2442 
   2443    bnez             a6,     .l_\lable\()put_h //mx
   2444    bnez             a7,     .l_\lable\()put_v //my
   2445 
   2446    clz.w            t1,     a4
   2447    li.w             t5,     24
   2448    sub.w            t1,     t1,    t5
   2449    la.local         t5,     .l_\lable\()put_hv0_jtable
   2450    alsl.d           t1,     t1,    t5,   3
   2451    ld.d             t6,     t1,    0
   2452    add.d            t5,     t5,    t6
   2453    jirl             $r0,    t5,    0
   2454 
   2455    .align   3
   2456 .l_\lable\()put_hv0_jtable:
   2457    .dword .l_\lable\()put_hv0_128w - .l_\lable\()put_hv0_jtable
   2458    .dword .l_\lable\()put_hv0_64w  - .l_\lable\()put_hv0_jtable
   2459    .dword .l_\lable\()put_hv0_32w  - .l_\lable\()put_hv0_jtable
   2460    .dword .l_\lable\()put_hv0_16w  - .l_\lable\()put_hv0_jtable
   2461    .dword .l_\lable\()put_hv0_8w   - .l_\lable\()put_hv0_jtable
   2462    .dword .l_\lable\()put_hv0_4w   - .l_\lable\()put_hv0_jtable
   2463    .dword .l_\lable\()put_hv0_2w   - .l_\lable\()put_hv0_jtable
   2464 
   2465 .l_\lable\()put_hv0_2w:
   2466    vldrepl.h        vr0,    a2,    0
   2467    add.d            a2,     a2,    a3
   2468    vldrepl.h        vr1,    a2,    0
   2469    vstelm.h         vr0,    a0,    0,     0
   2470    add.d            a0,     a0,    a1
   2471    vstelm.h         vr1,    a0,    0,     0
   2472    add.d            a2,     a2,    a3
   2473    add.d            a0,     a0,    a1
   2474    addi.w           a5,     a5,    -2
   2475    bnez             a5,     .l_\lable\()put_hv0_2w
   2476    b                .l_\lable\()end_put_8tap
   2477 .l_\lable\()put_hv0_4w:
   2478    fld.s            f0,     a2,    0
   2479    fldx.s           f1,     a2,    a3
   2480    fst.s            f0,     a0,    0
   2481    fstx.s           f1,     a0,    a1
   2482    alsl.d           a2,     a3,    a2,    1
   2483    alsl.d           a0,     a1,    a0,    1
   2484    addi.w           a5,     a5,    -2
   2485    bnez             a5,     .l_\lable\()put_hv0_4w
   2486    b                .l_\lable\()end_put_8tap
   2487 .l_\lable\()put_hv0_8w:
   2488    fld.d            f0,     a2,    0
   2489    fldx.d           f1,     a2,    a3
   2490    fst.d            f0,     a0,    0
   2491    fstx.d           f1,     a0,    a1
   2492    alsl.d           a2,     a3,    a2,    1
   2493    alsl.d           a0,     a1,    a0,    1
   2494    addi.w           a5,     a5,    -2
   2495    bnez             a5,     .l_\lable\()put_hv0_8w
   2496    b                .l_\lable\()end_put_8tap
   2497 .l_\lable\()put_hv0_16w:
   2498    vld              vr0,    a2,    0
   2499    vldx             vr1,    a2,    a3
   2500    vst              vr0,    a0,    0
   2501    vstx             vr1,    a0,    a1
   2502    alsl.d           a2,     a3,    a2,    1
   2503    alsl.d           a0,     a1,    a0,    1
   2504    addi.w           a5,     a5,    -2
   2505    bnez             a5,     .l_\lable\()put_hv0_16w
   2506    b                .l_\lable\()end_put_8tap
   2507 .l_\lable\()put_hv0_32w:
   2508    vld              vr0,    a2,    0
   2509    vld              vr1,    a2,    16
   2510    add.d            a2,     a2,    a3
   2511    vld              vr2,    a2,    0
   2512    vld              vr3,    a2,    16
   2513    vst              vr0,    a0,    0
   2514    vst              vr1,    a0,    16
   2515    add.d            a0,     a0,    a1
   2516    vst              vr2,    a0,    0
   2517    vst              vr3,    a0,    16
   2518    add.d            a2,     a2,    a3
   2519    add.d            a0,     a0,    a1
   2520    addi.w           a5,     a5,    -2
   2521    bnez             a5,     .l_\lable\()put_hv0_32w
   2522    b                .l_\lable\()end_put_8tap
   2523 .l_\lable\()put_hv0_64w:
   2524    vld              vr0,    a2,    0
   2525    vld              vr1,    a2,    16
   2526    vld              vr2,    a2,    32
   2527    vld              vr3,    a2,    48
   2528    add.d            a2,     a2,    a3
   2529    vld              vr4,    a2,    0
   2530    vld              vr5,    a2,    16
   2531    vld              vr6,    a2,    32
   2532    vld              vr7,    a2,    48
   2533    add.d            a2,     a2,    a3
   2534    vst              vr0,    a0,    0
   2535    vst              vr1,    a0,    16
   2536    vst              vr2,    a0,    32
   2537    vst              vr3,    a0,    48
   2538    add.d            a0,     a0,    a1
   2539    vst              vr4,    a0,    0
   2540    vst              vr5,    a0,    16
   2541    vst              vr6,    a0,    32
   2542    vst              vr7,    a0,    48
   2543    add.d            a0,     a0,    a1
   2544    addi.w           a5,     a5,    -2
   2545    bnez             a5,     .l_\lable\()put_hv0_64w
   2546    b                .l_\lable\()end_put_8tap
   2547 .l_\lable\()put_hv0_128w:
   2548    vld              vr0,    a2,    0
   2549    vld              vr1,    a2,    16
   2550    vld              vr2,    a2,    32
   2551    vld              vr3,    a2,    48
   2552    vld              vr4,    a2,    64
   2553    vld              vr5,    a2,    80
   2554    vld              vr6,    a2,    96
   2555    vld              vr7,    a2,    112
   2556    add.d            a2,     a2,    a3
   2557    vld              vr8,    a2,    0
   2558    vld              vr9,    a2,    16
   2559    vld              vr10,   a2,    32
   2560    vld              vr11,   a2,    48
   2561    vld              vr12,   a2,    64
   2562    vld              vr13,   a2,    80
   2563    vld              vr14,   a2,    96
   2564    vld              vr15,   a2,    112
   2565    add.d            a2,     a2,    a3
   2566    vst              vr0,    a0,    0
   2567    vst              vr1,    a0,    16
   2568    vst              vr2,    a0,    32
   2569    vst              vr3,    a0,    48
   2570    vst              vr4,    a0,    64
   2571    vst              vr5,    a0,    80
   2572    vst              vr6,    a0,    96
   2573    vst              vr7,    a0,    112
   2574    add.d            a0,     a0,    a1
   2575    vst              vr8,    a0,    0
   2576    vst              vr9,    a0,    16
   2577    vst              vr10,   a0,    32
   2578    vst              vr11,   a0,    48
   2579    vst              vr12,   a0,    64
   2580    vst              vr13,   a0,    80
   2581    vst              vr14,   a0,    96
   2582    vst              vr15,   a0,    112
   2583    add.d            a0,     a0,    a1
   2584    addi.w           a5,     a5,    -2
   2585    bnez             a5,     .l_\lable\()put_hv0_128w
   2586    b                .l_\lable\()end_put_8tap
   2587 
   2588 .l_\lable\()put_h:
   2589    bnez             a7,     .l_\lable\()put_hv //if(fh) && if (fv)
   2590    ld.d             t5,     sp,    0  //filter_type
   2591    andi             t1,     t5,    3
   2592    blt              t0,     a4,    .l_\lable\()put_h_idx_fh
   2593    andi             t1,     t5,    1
   2594    addi.w           t1,     t1,    3
   2595 
   2596 .l_\lable\()put_h_idx_fh:
   2597    addi.w           t5,     zero,  120
   2598    mul.w            t1,     t1,    t5
   2599    addi.w           t5,     a6,    -1
   2600    slli.w           t5,     t5,    3
   2601    add.w            t1,     t1,    t5
   2602    add.d            t7,     t6,    t1 //fh's offset
   2603    li.w             t1,     34
   2604    vreplgr2vr.h     vr9,    t1
   2605 
   2606    clz.w            t1,     a4
   2607    li.w             t5,     24
   2608    sub.w            t1,     t1,    t5
   2609    la.local         t5,     .l_\lable\()put_h_jtable
   2610    alsl.d           t1,     t1,    t5,   3
   2611    ld.d             t6,     t1,    0
   2612    add.d            t5,     t5,    t6
   2613    jirl             $r0,    t5,    0
   2614 
   2615    .align   3
   2616 .l_\lable\()put_h_jtable:
   2617    .dword .l_\lable\()put_h_128w - .l_\lable\()put_h_jtable
   2618    .dword .l_\lable\()put_h_64w  - .l_\lable\()put_h_jtable
   2619    .dword .l_\lable\()put_h_32w  - .l_\lable\()put_h_jtable
   2620    .dword .l_\lable\()put_h_16w  - .l_\lable\()put_h_jtable
   2621    .dword .l_\lable\()put_h_8w   - .l_\lable\()put_h_jtable
   2622    .dword .l_\lable\()put_h_4w   - .l_\lable\()put_h_jtable
   2623    .dword .l_\lable\()put_h_2w   - .l_\lable\()put_h_jtable
   2624 
   2625 .l_\lable\()put_h_2w:
   2626    addi.d           t7,     t7,    2
   2627    addi.d           a2,     a2,    -1
   2628    vldrepl.w        vr8,    t7,    0
   2629    la.local         t7,     subpel_h_shuf0
   2630    vld              vr7,    t7,    0
   2631 .l_\lable\()put_h_2w_loop:
   2632    vld              vr0,    a2,    0
   2633    vldx             vr1,    a2,    a3
   2634    add.d            a2,     a2,    t2
   2635 
   2636    vshuf.b          vr0,    vr1,   vr0,   vr7
   2637    vdp2.h.bu.b      vr1,    vr0,   vr8
   2638    vhaddw.w.h       vr0,    vr1,   vr1
   2639    vpickev.h        vr0,    vr0,   vr0
   2640    vadd.h           vr0,    vr0,   vr9
   2641    vssrani.bu.h     vr0,    vr0,   6
   2642 
   2643    vstelm.h         vr0,    a0,    0,     0
   2644    add.d            a0,     a0,    a1
   2645    vstelm.h         vr0,    a0,    0,     1
   2646    add.d            a0,     a0,    a1
   2647    addi.w           a5,     a5,    -2
   2648    bnez             a5,     .l_\lable\()put_h_2w_loop
   2649    b                .l_\lable\()end_put_8tap
   2650 
   2651 .l_\lable\()put_h_4w:
   2652    addi.d           t7,     t7,    2
   2653    addi.d           a2,     a2,    -1
   2654    vldrepl.w        vr8,    t7,    0
   2655    la.local         t7,     subpel_h_shuf1
   2656    vld              vr7,    t7,    0
   2657 .l_\lable\()put_h_4w_loop:
   2658    vld              vr0,    a2,    0
   2659    vldx             vr1,    a2,    a3
   2660    add.d            a2,     a2,    t2
   2661 
   2662    vshuf.b          vr0,    vr0,   vr0,   vr7
   2663    vshuf.b          vr1,    vr1,   vr1,   vr7
   2664    vmulwev.h.bu.b   vr2,    vr0,   vr8
   2665    vmulwev.h.bu.b   vr3,    vr1,   vr8
   2666    vmaddwod.h.bu.b  vr2,    vr0,   vr8
   2667    vmaddwod.h.bu.b  vr3,    vr1,   vr8
   2668    vhaddw.w.h       vr0,    vr2,   vr2
   2669    vhaddw.w.h       vr1,    vr3,   vr3
   2670    vpickev.h        vr0,    vr1,   vr0
   2671    vadd.h           vr0,    vr0,   vr9
   2672    vssrani.bu.h     vr0,    vr0,   6
   2673 
   2674    vstelm.w         vr0,    a0,    0,     0
   2675    add.d            a0,     a0,    a1
   2676    vstelm.w         vr0,    a0,    0,     1
   2677    add.d            a0,     a0,    a1
   2678    addi.d           a5,     a5,    -2
   2679    bnez             a5,     .l_\lable\()put_h_4w_loop
   2680    b                .l_\lable\()end_put_8tap
   2681 
   2682 .l_\lable\()put_h_8w:
   2683    fld.d            f10,    t7,    0
   2684    vreplvei.w       vr11,   vr10,  1
   2685    vreplvei.w       vr10,   vr10,  0
   2686    la.local         t7,     subpel_h_shuf1
   2687    vld              vr6,    t7,    0
   2688    vaddi.bu         vr7,    vr6,   4
   2689    vaddi.bu         vr8,    vr6,   8
   2690    addi.d           a2,     a2,    -3
   2691 .l_\lable\()put_h_8w_loop:
   2692    vld              vr0,    a2,    0
   2693    vldx             vr1,    a2,    a3
   2694    add.d            a2,     a2,    t2
   2695    PUT_H_8W         vr0
   2696    PUT_H_8W         vr1
   2697    vssrani.bu.h     vr1,    vr0,   6
   2698    vstelm.d         vr1,    a0,    0,    0
   2699    add.d            a0,     a0,    a1
   2700    vstelm.d         vr1,    a0,    0,    1
   2701    add.d            a0,     a0,    a1
   2702    addi.w           a5,     a5,    -2
   2703    bnez             a5,     .l_\lable\()put_h_8w_loop
   2704    b                .l_\lable\()end_put_8tap
   2705 
   2706 .l_\lable\()put_h_16w:
   2707 .l_\lable\()put_h_32w:
   2708 .l_\lable\()put_h_64w:
   2709 .l_\lable\()put_h_128w:
   2710    fld.d            f10,    t7,    0
   2711    vreplvei.w       vr11,   vr10,  1
   2712    vreplvei.w       vr10,   vr10,  0
   2713    la.local         t7,     subpel_h_shuf1
   2714    vld              vr6,    t7,    0
   2715    vaddi.bu         vr7,    vr6,   4
   2716    vaddi.bu         vr8,    vr6,   8
   2717    addi.d           a2,     a2,    -3
   2718    addi.d           t0,     a2,    0 //src
   2719    addi.w           t5,     a5,    0 //h
   2720    addi.d           t8,     a0,    0 //dst
   2721 .l_\lable\()put_h_16w_loop:
   2722    vld              vr0,    a2,    0
   2723    vld              vr1,    a2,    8
   2724    add.d            a2,     a2,    a3
   2725    PUT_H_8W         vr0
   2726    PUT_H_8W         vr1
   2727    vssrani.bu.h     vr1,    vr0,   6
   2728    vst              vr1,    a0,    0
   2729    add.d            a0,     a0,    a1
   2730    addi.d           a5,     a5,    -1
   2731    bnez             a5,     .l_\lable\()put_h_16w_loop
   2732    addi.d           a2,     t0,    16
   2733    addi.d           t0,     t0,    16
   2734    addi.d           a0,     t8,    16
   2735    addi.d           t8,     t8,    16
   2736    addi.w           a5,     t5,    0
   2737    addi.w           a4,     a4,    -16
   2738    bnez             a4,     .l_\lable\()put_h_16w_loop
   2739    b                .l_\lable\()end_put_8tap
   2740 
   2741 .l_\lable\()put_v:
   2742    ld.d             t1,     sp,    0  //filter_type
   2743    srli.w           t1,     t1,    2
   2744    blt              t0,     a5,    .l_\lable\()put_v_idx_fv
   2745    andi             t1,     t1,    1
   2746    addi.w           t1,     t1,    3
   2747 
   2748 .l_\lable\()put_v_idx_fv:
   2749    addi.w           t5,     zero,  120
   2750    mul.w            t1,     t1,    t5
   2751    addi.w           t5,     a7,    -1
   2752    slli.w           t5,     t5,    3
   2753    add.w            t1,     t1,    t5
   2754    add.d            t1,     t6,    t1 //fv's offset
   2755    vldrepl.d        vr8,    t1,    0
   2756    sub.d            a2,     a2,    t3
   2757 
   2758    vilvl.h          vr8,    vr8,   vr8
   2759    vreplvei.w       vr9,    vr8,   1
   2760    vreplvei.w       vr10,   vr8,   2
   2761    vreplvei.w       vr11,   vr8,   3
   2762    vreplvei.w       vr8,    vr8,   0
   2763 
   2764    clz.w            t1,     a4
   2765    li.w             t5,     24
   2766    sub.w            t1,     t1,    t5
   2767    la.local         t5,     .l_\lable\()put_v_jtable
   2768    alsl.d           t1,     t1,    t5,   3
   2769    ld.d             t6,     t1,    0
   2770    add.d            t5,     t5,    t6
   2771    jirl             $r0,    t5,    0
   2772 
   2773    .align   3
   2774 .l_\lable\()put_v_jtable:
   2775    .dword .l_\lable\()put_v_128w - .l_\lable\()put_v_jtable
   2776    .dword .l_\lable\()put_v_64w  - .l_\lable\()put_v_jtable
   2777    .dword .l_\lable\()put_v_32w  - .l_\lable\()put_v_jtable
   2778    .dword .l_\lable\()put_v_16w  - .l_\lable\()put_v_jtable
   2779    .dword .l_\lable\()put_v_8w   - .l_\lable\()put_v_jtable
   2780    .dword .l_\lable\()put_v_4w   - .l_\lable\()put_v_jtable
   2781    .dword .l_\lable\()put_v_2w   - .l_\lable\()put_v_jtable
   2782 
   2783 .l_\lable\()put_v_2w:
   2784    fld.s            f0,     a2,    0
   2785    fldx.s           f1,     a2,    a3
   2786    fldx.s           f2,     a2,    t2
   2787    add.d            a2,     a2,    t3
   2788    fld.s            f3,     a2,    0
   2789    fldx.s           f4,     a2,    a3
   2790    fldx.s           f5,     a2,    t2
   2791    fldx.s           f6,     a2,    t3
   2792    add.d            a2,     a2,    t4
   2793 
   2794    vilvl.h          vr0,    vr1,   vr0 //0 1
   2795    vilvl.h          vr1,    vr2,   vr1 //1 2
   2796    vilvl.b          vr0,    vr1,   vr0 //01 12
   2797    vilvl.h          vr2,    vr3,   vr2 //2 3
   2798    vilvl.h          vr3,    vr4,   vr3 //3 4
   2799    vilvl.b          vr1,    vr3,   vr2 //23 34
   2800    vilvl.h          vr2,    vr5,   vr4 //4 5
   2801    vilvl.h          vr3,    vr6,   vr5 //5 6
   2802    vilvl.b          vr2,    vr3,   vr2 //45 56
   2803 .l_\lable\()put_v_2w_loop:
   2804    fld.s            f7,     a2,    0
   2805    vilvl.h          vr3,    vr7,   vr6 //6 7
   2806    fldx.s           f6,     a2,    a3
   2807    add.d            a2,     a2,    t2
   2808    vilvl.h          vr4,    vr6,   vr7 //7 8
   2809    vilvl.b          vr3,    vr4,   vr3 //67 78
   2810 
   2811    vmulwev.h.bu.b   vr12,   vr0,   vr8
   2812    vmulwev.h.bu.b   vr13,   vr1,   vr9
   2813    vmulwev.h.bu.b   vr14,   vr2,   vr10
   2814    vmulwev.h.bu.b   vr15,   vr3,   vr11
   2815    vmaddwod.h.bu.b  vr12,   vr0,   vr8
   2816    vmaddwod.h.bu.b  vr13,   vr1,   vr9
   2817    vmaddwod.h.bu.b  vr14,   vr2,   vr10
   2818    vmaddwod.h.bu.b  vr15,   vr3,   vr11
   2819    vaddi.hu         vr0,    vr1,   0
   2820    vaddi.hu         vr1,    vr2,   0
   2821    vaddi.hu         vr2,    vr3,   0
   2822    vadd.h           vr12,   vr12,  vr13
   2823    vadd.h           vr12,   vr12,  vr14
   2824    vadd.h           vr12,   vr12,  vr15
   2825 
   2826    vssrarni.bu.h    vr12,   vr12,  6
   2827    vstelm.h         vr12,   a0,    0,   0
   2828    add.d            a0,     a0,    a1
   2829    vstelm.h         vr12,   a0,    0,   1
   2830    add.d            a0,     a0,    a1
   2831    addi.w           a5,     a5,    -2
   2832    bnez             a5,     .l_\lable\()put_v_2w_loop
   2833    b                .l_\lable\()end_put_8tap
   2834 
   2835 .l_\lable\()put_v_4w:
   2836    fld.s            f0,     a2,    0
   2837    fldx.s           f1,     a2,    a3
   2838    fldx.s           f2,     a2,    t2
   2839    add.d            a2,     a2,    t3
   2840    fld.s            f3,     a2,    0
   2841    fldx.s           f4,     a2,    a3
   2842    fldx.s           f5,     a2,    t2
   2843    fldx.s           f6,     a2,    t3
   2844    add.d            a2,     a2,    t4
   2845 
   2846    vilvl.w          vr0,    vr1,   vr0
   2847    vilvl.w          vr1,    vr2,   vr1
   2848    vilvl.b          vr0,    vr1,   vr0
   2849    vilvl.w          vr1,    vr3,   vr2
   2850    vilvl.w          vr2,    vr4,   vr3
   2851    vilvl.b          vr1,    vr2,   vr1
   2852    vilvl.w          vr2,    vr5,   vr4
   2853    vilvl.w          vr3,    vr6,   vr5
   2854    vilvl.b          vr2,    vr3,   vr2
   2855 .l_\lable\()put_v_4w_loop:
   2856    fld.s            f7,     a2,    0
   2857 
   2858    vilvl.w          vr3,    vr7,   vr6
   2859    fldx.s           f6,     a2,    a3
   2860    add.d            a2,     a2,    t2
   2861    vilvl.w          vr4,    vr6,   vr7
   2862    vilvl.b          vr3,    vr4,   vr3
   2863 
   2864    vmulwev.h.bu.b   vr12,   vr0,   vr8
   2865    vmulwev.h.bu.b   vr13,   vr1,   vr9
   2866    vmulwev.h.bu.b   vr14,   vr2,   vr10
   2867    vmulwev.h.bu.b   vr15,   vr3,   vr11
   2868    vmaddwod.h.bu.b  vr12,   vr0,   vr8
   2869    vmaddwod.h.bu.b  vr13,   vr1,   vr9
   2870    vmaddwod.h.bu.b  vr14,   vr2,   vr10
   2871    vmaddwod.h.bu.b  vr15,   vr3,   vr11
   2872    vaddi.hu         vr0,    vr1,   0
   2873    vaddi.hu         vr1,    vr2,   0
   2874    vaddi.hu         vr2,    vr3,   0
   2875    vadd.h           vr12,   vr12,  vr13
   2876    vadd.h           vr12,   vr12,  vr14
   2877    vadd.h           vr12,   vr12,  vr15
   2878 
   2879    vssrarni.bu.h    vr12,   vr12,  6
   2880    vstelm.w         vr12,   a0,    0,   0
   2881    add.d            a0,     a0,    a1
   2882    vstelm.w         vr12,   a0,    0,   1
   2883    add.d            a0,     a0,    a1
   2884    addi.w           a5,     a5,    -2
   2885    bnez             a5,     .l_\lable\()put_v_4w_loop
   2886    b                .l_\lable\()end_put_8tap
   2887 
   2888 .l_\lable\()put_v_8w:
   2889 .l_\lable\()put_v_16w:
   2890 .l_\lable\()put_v_32w:
   2891 .l_\lable\()put_v_64w:
   2892 .l_\lable\()put_v_128w:
   2893    addi.d           t0,     a2,    0 //src
   2894    addi.d           t5,     a5,    0 //h
   2895    addi.d           t8,     a0,    0 //dst
   2896 .l_\lable\()put_v_8w_loop0:
   2897    fld.d            f0,     a2,    0
   2898    fldx.d           f1,     a2,    a3
   2899    fldx.d           f2,     a2,    t2
   2900    add.d            a2,     a2,    t3
   2901    fld.d            f3,     a2,    0
   2902    fldx.d           f4,     a2,    a3
   2903    fldx.d           f5,     a2,    t2
   2904    fldx.d           f6,     a2,    t3
   2905    add.d            a2,     a2,    t4
   2906 
   2907    vilvl.b          vr0,    vr1,   vr0 //0 1
   2908    vilvl.b          vr1,    vr2,   vr1 //1 2
   2909    vilvl.b          vr2,    vr3,   vr2 //2 3
   2910    vilvl.b          vr3,    vr4,   vr3 //3 4
   2911    vilvl.b          vr4,    vr5,   vr4 //4 5
   2912    vilvl.b          vr5,    vr6,   vr5 //5 6
   2913 .l_\lable\()put_v_8w_loop:
   2914    fld.d            f7,     a2,    0
   2915    vilvl.b          vr12,   vr7,   vr6 //6 7
   2916    fldx.d           f6,     a2,    a3
   2917    add.d            a2,     a2,    t2
   2918    vilvl.b          vr13,   vr6,   vr7 //7 8
   2919 
   2920    vmulwev.h.bu.b   vr14,   vr0,   vr8
   2921    vmulwev.h.bu.b   vr15,   vr1,   vr8
   2922    vmulwev.h.bu.b   vr16,   vr2,   vr9
   2923    vmulwev.h.bu.b   vr17,   vr3,   vr9
   2924    vmulwev.h.bu.b   vr18,   vr4,   vr10
   2925    vmulwev.h.bu.b   vr19,   vr5,   vr10
   2926    vmulwev.h.bu.b   vr20,   vr12,  vr11
   2927    vmulwev.h.bu.b   vr21,   vr13,  vr11
   2928    vmaddwod.h.bu.b  vr14,   vr0,   vr8
   2929    vmaddwod.h.bu.b  vr15,   vr1,   vr8
   2930    vmaddwod.h.bu.b  vr16,   vr2,   vr9
   2931    vmaddwod.h.bu.b  vr17,   vr3,   vr9
   2932    vmaddwod.h.bu.b  vr18,   vr4,   vr10
   2933    vmaddwod.h.bu.b  vr19,   vr5,   vr10
   2934    vmaddwod.h.bu.b  vr20,   vr12,  vr11
   2935    vmaddwod.h.bu.b  vr21,   vr13,  vr11
   2936 
   2937    vaddi.hu         vr0,    vr2,   0
   2938    vaddi.hu         vr1,    vr3,   0
   2939    vaddi.hu         vr2,    vr4,   0
   2940    vaddi.hu         vr3,    vr5,   0
   2941    vaddi.hu         vr4,    vr12,  0
   2942    vaddi.hu         vr5,    vr13,  0
   2943    vadd.h           vr14,   vr14,  vr16
   2944    vadd.h           vr14,   vr14,  vr18
   2945    vadd.h           vr14,   vr14,  vr20
   2946    vadd.h           vr15,   vr15,  vr17
   2947    vadd.h           vr15,   vr15,  vr19
   2948    vadd.h           vr15,   vr15,  vr21
   2949 
   2950    vssrarni.bu.h    vr15,   vr14,  6
   2951    vstelm.d         vr15,   a0,    0,   0
   2952    add.d            a0,     a0,    a1
   2953    vstelm.d         vr15,   a0,    0,   1
   2954    add.d            a0,     a0,    a1
   2955    addi.w           a5,     a5,    -2
   2956    bnez             a5,     .l_\lable\()put_v_8w_loop
   2957    addi.d           a2,     t0,    8
   2958    addi.d           t0,     t0,    8
   2959    addi.d           a0,     t8,    8
   2960    addi.d           t8,     t8,    8
   2961    addi.d           a5,     t5,    0
   2962    addi.w           a4,     a4,    -8
   2963    bnez             a4,     .l_\lable\()put_v_8w_loop0
   2964    b                .l_\lable\()end_put_8tap
   2965 
   2966 .l_\lable\()put_hv:
   2967    ld.d             t5,     sp,    0  //filter_type
   2968    andi             t1,     t5,    3
   2969    blt              t0,     a4,    .l_\lable\()put_hv_idx_fh
   2970    andi             t1,     t5,    1
   2971    addi.w           t1,     t1,    3
   2972 .l_\lable\()put_hv_idx_fh:
   2973    addi.w           t5,     zero,  120
   2974    mul.w            t1,     t1,    t5
   2975    addi.w           t5,     a6,    -1
   2976    slli.w           t5,     t5,    3
   2977    add.w            t1,     t1,    t5
   2978    add.d            t1,     t6,    t1 //fh's offset
   2979    vldrepl.d        vr8,    t1,    0
   2980    ld.d             t1,     sp,    0  //filter_type
   2981    srli.w           t1,     t1,    2
   2982    blt              t0,     a5,    .l_\lable\()put_hv_idx_fv
   2983    andi             t1,     t1,    1
   2984    addi.w           t1,     t1,    3
   2985 .l_\lable\()put_hv_idx_fv:
   2986    addi.w           t5,     zero,  120
   2987    mul.w            t1,     t1,    t5
   2988    addi.w           t5,     a7,    -1
   2989    slli.w           t5,     t5,    3
   2990    add.w            t1,     t1,    t5
   2991    add.d            t1,     t6,    t1 //fv's offset
   2992    vldrepl.d        vr9,    t1,    0
   2993    vexth.h.b        vr9,    vr9
   2994 
   2995    sub.d            a2,     a2,    t3
   2996    addi.d           a2,     a2,    -3
   2997 
   2998    clz.w            t1,     a4
   2999    li.w             t5,     24
   3000    sub.w            t1,     t1,    t5
   3001    la.local         t5,     .l_\lable\()put_hv_jtable
   3002    alsl.d           t1,     t1,    t5,   3
   3003    ld.d             t6,     t1,    0
   3004    add.d            t5,     t5,    t6
   3005    jirl             $r0,    t5,    0
   3006 
   3007    .align   3
   3008 .l_\lable\()put_hv_jtable:
   3009    .dword .l_\lable\()put_hv_128w - .l_\lable\()put_hv_jtable
   3010    .dword .l_\lable\()put_hv_64w  - .l_\lable\()put_hv_jtable
   3011    .dword .l_\lable\()put_hv_32w  - .l_\lable\()put_hv_jtable
   3012    .dword .l_\lable\()put_hv_16w  - .l_\lable\()put_hv_jtable
   3013    .dword .l_\lable\()put_hv_8w   - .l_\lable\()put_hv_jtable
   3014    .dword .l_\lable\()put_hv_4w   - .l_\lable\()put_hv_jtable
   3015    .dword .l_\lable\()put_hv_2w   - .l_\lable\()put_hv_jtable
   3016 
   3017 .l_\lable\()put_hv_2w:
   3018    addi.d           a2,     a2,    2
   3019    vld              vr0,    a2,    0
   3020    vldx             vr1,    a2,    a3
   3021    vldx             vr2,    a2,    t2
   3022    add.d            a2,     a2,    t3
   3023    vld              vr3,    a2,    0
   3024    vldx             vr4,    a2,    a3
   3025    vldx             vr5,    a2,    t2
   3026    vldx             vr6,    a2,    t3
   3027    add.d            a2,     a2,    t4
   3028 
   3029    la.local         t1,     subpel_h_shuf0
   3030    vld              vr7,    t1,    0
   3031    vbsrl.v          vr8,    vr8,   2
   3032    vreplvei.w       vr8,    vr8,   0
   3033 
   3034    //fv
   3035    vreplvei.w       vr14,   vr9,   1
   3036    vreplvei.w       vr15,   vr9,   2
   3037    vreplvei.w       vr16,   vr9,   3
   3038    vreplvei.w       vr9,    vr9,   0
   3039 
   3040    vshuf.b          vr0,    vr1,   vr0,  vr7
   3041    vshuf.b          vr1,    vr3,   vr2,  vr7
   3042    vshuf.b          vr2,    vr5,   vr4,  vr7
   3043    vshuf.b          vr3,    vr6,   vr6,  vr7
   3044    vmulwev.h.bu.b   vr10,   vr0,   vr8
   3045    vmulwev.h.bu.b   vr11,   vr1,   vr8
   3046    vmulwev.h.bu.b   vr12,   vr2,   vr8
   3047    vmulwev.h.bu.b   vr13,   vr3,   vr8
   3048    vmaddwod.h.bu.b  vr10,   vr0,   vr8
   3049    vmaddwod.h.bu.b  vr11,   vr1,   vr8
   3050    vmaddwod.h.bu.b  vr12,   vr2,   vr8
   3051    vmaddwod.h.bu.b  vr13,   vr3,   vr8
   3052    vhaddw.w.h       vr0,    vr10,  vr10
   3053    vhaddw.w.h       vr1,    vr11,  vr11
   3054    vssrarni.h.w     vr1,    vr0,   2 //h0 h1 h2 h3
   3055    vhaddw.w.h       vr2,    vr12,  vr12
   3056    vhaddw.w.h       vr3,    vr13,  vr13
   3057    vssrarni.h.w     vr3,    vr2,   2 //h4 h5 h6 ~
   3058    vbsrl.v          vr2,    vr1,   4
   3059    vextrins.w       vr2,    vr3,   0x30 //h1 h2 h3 h4
   3060    vilvl.h          vr4,    vr2,   vr1 //h0 h1 h1 h2 --
   3061    vilvh.h          vr5,    vr2,   vr1 //h2 h3 h3 h4 --
   3062    vbsrl.v          vr6,    vr3,   4
   3063    vilvl.h          vr6,    vr6,   vr3 //h4 h5 h5 h6 --
   3064    vbsrl.v          vr3,    vr3,   8  //h6 ~
   3065 .l_\lable\()put_hv_2w_loop:
   3066    vld              vr0,    a2,    0
   3067    vldx             vr2,    a2,    a3
   3068    add.d            a2,     a2,    t2
   3069    vshuf.b          vr0,    vr2,   vr0,  vr7
   3070    vdp2.h.bu.b      vr17,   vr0,   vr8
   3071    vhaddw.w.h       vr17,   vr17,  vr17
   3072    vssrarni.h.w     vr17,   vr17,  2 //h7 h8
   3073    vextrins.w       vr3,    vr17,  0x10 //h6 h7
   3074    vilvl.h          vr3,    vr17,  vr3  //h6 h7 h7 h8 --
   3075 
   3076    vmulwev.w.h      vr18,   vr4,   vr9
   3077    vmulwev.w.h      vr19,   vr5,   vr14
   3078    vmulwev.w.h      vr20,   vr6,   vr15
   3079    vmulwev.w.h      vr21,   vr3,   vr16
   3080    vmaddwod.w.h     vr18,   vr4,   vr9
   3081    vmaddwod.w.h     vr19,   vr5,   vr14
   3082    vmaddwod.w.h     vr20,   vr6,   vr15
   3083    vmaddwod.w.h     vr21,   vr3,   vr16
   3084    vaddi.hu         vr4,    vr5,   0
   3085    vaddi.hu         vr5,    vr6,   0
   3086    vaddi.hu         vr6,    vr3,   0
   3087    vbsrl.v          vr3,    vr17,  4 //h8 ~
   3088    vadd.w           vr18,   vr18,  vr19
   3089    vadd.w           vr18,   vr18,  vr20
   3090    vadd.w           vr18,   vr18,  vr21
   3091 
   3092    vssrarni.hu.w    vr0,    vr18,  10
   3093    vssrani.bu.h     vr0,    vr0,   0
   3094    vstelm.h         vr0,    a0,    0,   0
   3095    add.d            a0,     a0,    a1
   3096    vstelm.h         vr0,    a0,    0,   1
   3097    add.d            a0,     a0,    a1
   3098    addi.d           a5,     a5,    -2
   3099    bnez             a5,     .l_\lable\()put_hv_2w_loop
   3100    b                .l_\lable\()end_put_8tap
   3101 
   3102 .l_\lable\()put_hv_4w:
   3103    addi.d           a2,     a2,    2 //ignore leading 0
   3104    vld              vr0,    a2,    0
   3105    vldx             vr1,    a2,    a3
   3106    vldx             vr2,    a2,    t2
   3107    add.d            a2,     a2,    t3
   3108    vld              vr3,    a2,    0
   3109    vldx             vr4,    a2,    a3
   3110    vldx             vr5,    a2,    t2
   3111    vldx             vr6,    a2,    t3
   3112    add.d            a2,     a2,    t4
   3113 
   3114    la.local         t1,     subpel_h_shuf1
   3115    vld              vr7,    t1,    0
   3116    vbsrl.v          vr8,    vr8,   2
   3117    vreplvei.w       vr8,    vr8,   0
   3118 
   3119    //fv
   3120    vreplvei.w       vr17,   vr9,   0
   3121    vreplvei.w       vr18,   vr9,   1
   3122    vreplvei.w       vr19,   vr9,   2
   3123    vreplvei.w       vr20,   vr9,   3
   3124 
   3125    //DAV1D_FILTER_8TAP_RND
   3126    vshuf.b          vr0,    vr0,   vr0,  vr7
   3127    vshuf.b          vr1,    vr1,   vr1,  vr7
   3128    vshuf.b          vr2,    vr2,   vr2,  vr7
   3129    vshuf.b          vr3,    vr3,   vr3,  vr7
   3130    vshuf.b          vr4,    vr4,   vr4,  vr7
   3131    vshuf.b          vr5,    vr5,   vr5,  vr7
   3132    vshuf.b          vr6,    vr6,   vr6,  vr7
   3133 
   3134    vmulwev.h.bu.b   vr10,   vr0,   vr8
   3135    vmulwev.h.bu.b   vr11,   vr1,   vr8
   3136    vmulwev.h.bu.b   vr12,   vr2,   vr8
   3137    vmulwev.h.bu.b   vr13,   vr3,   vr8
   3138    vmulwev.h.bu.b   vr14,   vr4,   vr8
   3139    vmulwev.h.bu.b   vr15,   vr5,   vr8
   3140    vmulwev.h.bu.b   vr16,   vr6,   vr8
   3141    vmaddwod.h.bu.b  vr10,   vr0,   vr8
   3142    vmaddwod.h.bu.b  vr11,   vr1,   vr8
   3143    vmaddwod.h.bu.b  vr12,   vr2,   vr8
   3144    vmaddwod.h.bu.b  vr13,   vr3,   vr8
   3145    vmaddwod.h.bu.b  vr14,   vr4,   vr8
   3146    vmaddwod.h.bu.b  vr15,   vr5,   vr8
   3147    vmaddwod.h.bu.b  vr16,   vr6,   vr8
   3148 
   3149    vhaddw.w.h       vr10,   vr10,  vr10
   3150    vhaddw.w.h       vr11,   vr11,  vr11
   3151    vhaddw.w.h       vr12,   vr12,  vr12
   3152    vhaddw.w.h       vr13,   vr13,  vr13
   3153    vhaddw.w.h       vr14,   vr14,  vr14
   3154    vhaddw.w.h       vr15,   vr15,  vr15
   3155    vhaddw.w.h       vr16,   vr16,  vr16
   3156 
   3157    vssrarni.h.w     vr10,   vr10,  2 //h0
   3158    vssrarni.h.w     vr11,   vr11,  2 //h1
   3159    vssrarni.h.w     vr12,   vr12,  2 //h2
   3160    vssrarni.h.w     vr13,   vr13,  2 //h3
   3161    vssrarni.h.w     vr14,   vr14,  2 //h4
   3162    vssrarni.h.w     vr15,   vr15,  2 //h5
   3163    vssrarni.h.w     vr16,   vr16,  2 //h6
   3164 
   3165    //h0
   3166    vilvl.h          vr0,    vr11,  vr10 //01
   3167    vilvl.h          vr1,    vr13,  vr12 //23
   3168    vilvl.h          vr2,    vr15,  vr14 //45
   3169    //h1
   3170    vilvl.h          vr4,    vr12,  vr11 //12
   3171    vilvl.h          vr5,    vr14,  vr13 //34
   3172    vilvl.h          vr6,    vr16,  vr15 //56
   3173 
   3174 .l_\lable\()put_hv_4w_loop:
   3175    vld              vr9,    a2,    0
   3176    vldx             vr10,   a2,    a3
   3177    add.d            a2,     a2,    t2
   3178 
   3179    //DAV1D_FILTER_8TAP_CLIP
   3180    vshuf.b          vr9,    vr9,   vr9,  vr7
   3181    vshuf.b          vr10,   vr10,  vr10, vr7
   3182    vmulwev.h.bu.b   vr11,   vr9,   vr8
   3183    vmulwev.h.bu.b   vr12,   vr10,  vr8
   3184    vmaddwod.h.bu.b  vr11,   vr9,   vr8
   3185    vmaddwod.h.bu.b  vr12,   vr10,  vr8
   3186    vhaddw.w.h       vr11,   vr11,  vr11
   3187    vhaddw.w.h       vr12,   vr12,  vr12
   3188    vssrarni.h.w     vr11,   vr11,  2 //h7
   3189    vssrarni.h.w     vr12,   vr12,  2 //h8
   3190    vilvl.h          vr3,    vr11,  vr16 //67
   3191    vilvl.h          vr13,   vr12,  vr11 //78
   3192 
   3193    vmulwev.w.h      vr9,    vr0,   vr17
   3194    vmulwev.w.h      vr10,   vr1,   vr18
   3195    vmulwev.w.h      vr14,   vr2,   vr19
   3196    vmulwev.w.h      vr15,   vr3,   vr20
   3197    vmaddwod.w.h     vr9,    vr0,   vr17
   3198    vmaddwod.w.h     vr10,   vr1,   vr18
   3199    vmaddwod.w.h     vr14,   vr2,   vr19
   3200    vmaddwod.w.h     vr15,   vr3,   vr20
   3201    vadd.w           vr16,   vr9,   vr10
   3202    vadd.w           vr16,   vr16,  vr14
   3203    vadd.w           vr16,   vr16,  vr15
   3204 
   3205    vmulwev.w.h      vr9,    vr4,   vr17
   3206    vmulwev.w.h      vr10,   vr5,   vr18
   3207    vmulwev.w.h      vr14,   vr6,   vr19
   3208    vmulwev.w.h      vr15,   vr13,  vr20
   3209    vmaddwod.w.h     vr9,    vr4,   vr17
   3210    vmaddwod.w.h     vr10,   vr5,   vr18
   3211    vmaddwod.w.h     vr14,   vr6,   vr19
   3212    vmaddwod.w.h     vr15,   vr13,  vr20
   3213    vadd.w           vr21,   vr9,   vr10
   3214    vadd.w           vr21,   vr21,  vr14
   3215    vadd.w           vr21,   vr21,  vr15
   3216 
   3217    vssrarni.hu.w    vr21,   vr16,  10
   3218    vssrani.bu.h     vr21,   vr21,  0
   3219    //cache
   3220    vaddi.hu         vr0,    vr1,   0
   3221    vaddi.hu         vr1,    vr2,   0
   3222    vaddi.hu         vr2,    vr3,   0
   3223    vaddi.hu         vr4,    vr5,   0
   3224    vaddi.hu         vr5,    vr6,   0
   3225    vaddi.hu         vr6,    vr13,  0
   3226    vaddi.hu         vr16,   vr12,  0
   3227 
   3228    vstelm.w         vr21,   a0,    0,    0
   3229    add.d            a0,     a0,    a1
   3230    vstelm.w         vr21,   a0,    0,    1
   3231    add.d            a0,     a0,    a1
   3232    addi.w           a5,     a5,    -2
   3233    bnez             a5,     .l_\lable\()put_hv_4w_loop
   3234    b                .l_\lable\()end_put_8tap
   3235 
   3236 .l_\lable\()put_hv_8w:
   3237 .l_\lable\()put_hv_16w:
   3238 .l_\lable\()put_hv_32w:
   3239 .l_\lable\()put_hv_64w:
   3240 .l_\lable\()put_hv_128w:
   3241    addi.d          sp,      sp,    -8*8
   3242    fst.d           f24,     sp,    0
   3243    fst.d           f25,     sp,    8
   3244    fst.d           f26,     sp,    16
   3245    fst.d           f27,     sp,    24
   3246    fst.d           f28,     sp,    32
   3247    fst.d           f29,     sp,    40
   3248    fst.d           f30,     sp,    48
   3249    fst.d           f31,     sp,    56
   3250    addi.d          t0,      a2,    0 //src
   3251    addi.d          t5,      a5,    0 //h
   3252    addi.d          t8,      a0,    0 //dst
   3253    la.local        t1,      subpel_h_shuf1
   3254    vld             vr7,     t1,    0
   3255    vaddi.bu        vr11,    vr7,   4
   3256    vaddi.bu        vr12,    vr7,   8
   3257    vreplvei.w      vr10,    vr8,   1
   3258    vreplvei.w      vr8,     vr8,   0
   3259    vreplvei.w      vr20,    vr9,   1
   3260    vreplvei.w      vr21,    vr9,   2
   3261    vreplvei.w      vr22,    vr9,   3
   3262    vreplvei.w      vr9,     vr9,   0
   3263 .l_\lable\()put_hv_8w_loop0:
   3264    vld             vr0,     a2,    0
   3265    vldx            vr1,     a2,    a3
   3266    vldx            vr2,     a2,    t2
   3267    add.d           a2,      a2,    t3
   3268    vld             vr3,     a2,    0
   3269    vldx            vr4,     a2,    a3
   3270    vldx            vr5,     a2,    t2
   3271    vldx            vr6,     a2,    t3
   3272    add.d           a2,      a2,    t4
   3273 
   3274    FILTER_8TAP_8W  vr0 //h0
   3275    FILTER_8TAP_8W  vr1 //h1
   3276    FILTER_8TAP_8W  vr2 //h2
   3277    FILTER_8TAP_8W  vr3 //h3
   3278    FILTER_8TAP_8W  vr4 //h4
   3279    FILTER_8TAP_8W  vr5 //h5
   3280    FILTER_8TAP_8W  vr6 //h6
   3281 
   3282    //h0' low part
   3283    vilvl.h         vr23,    vr1,   vr0 //01
   3284    vilvl.h         vr24,    vr3,   vr2 //23
   3285    vilvl.h         vr25,    vr5,   vr4 //45
   3286    //h0' high part
   3287    vilvh.h         vr26,    vr1,   vr0 //01
   3288    vilvh.h         vr27,    vr3,   vr2 //23
   3289    vilvh.h         vr28,    vr5,   vr4 //45
   3290 
   3291    //h1' low part
   3292    vilvl.h         vr29,    vr2,   vr1 //12
   3293    vilvl.h         vr30,    vr4,   vr3 //34
   3294    vilvl.h         vr31,    vr6,   vr5 //56
   3295    //h1' high part
   3296    vilvh.h         vr0,     vr2,   vr1 //12
   3297    vilvh.h         vr1,     vr4,   vr3 //34
   3298    vilvh.h         vr2,     vr6,   vr5 //56
   3299 
   3300 .l_\lable\()put_hv_8w_loop:
   3301    vld             vr3,     a2,    0
   3302    vldx            vr4,     a2,    a3
   3303    add.d           a2,      a2,    t2
   3304 
   3305    FILTER_8TAP_8W  vr3 //h7
   3306    FILTER_8TAP_8W  vr4 //h8
   3307 
   3308    //h0' low part
   3309    vilvl.h         vr16,    vr3,   vr6 //67 ~low
   3310    vmulwev.w.h     vr13,    vr23,  vr9
   3311    vmulwev.w.h     vr14,    vr24,  vr20
   3312    vmulwev.w.h     vr15,    vr25,  vr21
   3313    vmulwev.w.h     vr17,    vr16,  vr22
   3314    vmaddwod.w.h    vr13,    vr23,  vr9
   3315    vmaddwod.w.h    vr14,    vr24,  vr20
   3316    vmaddwod.w.h    vr15,    vr25,  vr21
   3317    vmaddwod.w.h    vr17,    vr16,  vr22
   3318    vadd.w          vr13,    vr13,  vr14
   3319    vadd.w          vr13,    vr13,  vr15
   3320    vadd.w          vr13,    vr13,  vr17
   3321    //cache
   3322    vaddi.hu        vr23,    vr24,  0
   3323    vaddi.hu        vr24,    vr25,  0
   3324    vaddi.hu        vr25,    vr16,  0
   3325 
   3326    //h0' high part
   3327    vilvh.h         vr17,    vr3,   vr6 //67 ~high
   3328    vmulwev.w.h     vr14,    vr26,  vr9
   3329    vmulwev.w.h     vr15,    vr27,  vr20
   3330    vmulwev.w.h     vr16,    vr28,  vr21
   3331    vmulwev.w.h     vr18,    vr17,  vr22
   3332    vmaddwod.w.h    vr14,    vr26,  vr9
   3333    vmaddwod.w.h    vr15,    vr27,  vr20
   3334    vmaddwod.w.h    vr16,    vr28,  vr21
   3335    vmaddwod.w.h    vr18,    vr17,  vr22
   3336    vadd.w          vr14,    vr14,  vr15
   3337    vadd.w          vr14,    vr14,  vr16
   3338    vadd.w          vr14,    vr14,  vr18
   3339    vssrarni.hu.w   vr14,    vr13,  10
   3340    vssrarni.bu.h   vr5,     vr14,  0
   3341    vstelm.d        vr5,     a0,    0,   0
   3342    add.d           a0,      a0,    a1
   3343    //cache
   3344    vaddi.hu        vr26,    vr27,  0
   3345    vaddi.hu        vr27,    vr28,  0
   3346    vaddi.hu        vr28,    vr17,  0
   3347    vaddi.hu        vr6,     vr4,   0
   3348 
   3349    vilvl.h         vr5,     vr4,   vr3 //78 ~low
   3350    vilvh.h         vr4,     vr4,   vr3 //78 ~high
   3351 
   3352    //h1' low part
   3353    vmulwev.w.h     vr13,    vr29,  vr9
   3354    vmulwev.w.h     vr14,    vr30,  vr20
   3355    vmulwev.w.h     vr15,    vr31,  vr21
   3356    vmulwev.w.h     vr16,    vr5,   vr22
   3357    vmaddwod.w.h    vr13,    vr29,  vr9
   3358    vmaddwod.w.h    vr14,    vr30,  vr20
   3359    vmaddwod.w.h    vr15,    vr31,  vr21
   3360    vmaddwod.w.h    vr16,    vr5,   vr22
   3361    vadd.w          vr13,    vr13,  vr14
   3362    vadd.w          vr13,    vr13,  vr15
   3363    vadd.w          vr13,    vr13,  vr16
   3364    //cache
   3365    vaddi.hu        vr29,    vr30,  0
   3366    vaddi.hu        vr30,    vr31,  0
   3367    vaddi.hu        vr31,    vr5,   0
   3368 
   3369    //h1' high part
   3370    vmulwev.w.h     vr14,    vr0,   vr9
   3371    vmulwev.w.h     vr15,    vr1,   vr20
   3372    vmulwev.w.h     vr16,    vr2,   vr21
   3373    vmulwev.w.h     vr17,    vr4,   vr22
   3374    vmaddwod.w.h    vr14,    vr0,   vr9
   3375    vmaddwod.w.h    vr15,    vr1,   vr20
   3376    vmaddwod.w.h    vr16,    vr2,   vr21
   3377    vmaddwod.w.h    vr17,    vr4,   vr22
   3378    vadd.w          vr14,    vr14,  vr15
   3379    vadd.w          vr14,    vr14,  vr16
   3380    vadd.w          vr14,    vr14,  vr17
   3381    vssrarni.hu.w   vr14,    vr13,  10
   3382    vssrarni.bu.h   vr5,     vr14,  0
   3383    vstelm.d        vr5,     a0,    0,   0
   3384    add.d           a0,      a0,    a1
   3385    //cache
   3386    vaddi.hu        vr0,     vr1,   0
   3387    vaddi.hu        vr1,     vr2,   0
   3388    vaddi.hu        vr2,     vr4,   0
   3389 
   3390    addi.w          a5,      a5,    -2
   3391    bnez            a5,      .l_\lable\()put_hv_8w_loop
   3392    addi.d          a2,      t0,    8
   3393    addi.d          t0,      t0,    8
   3394    addi.d          a0,      t8,    8
   3395    addi.d          t8,      t8,    8
   3396    addi.d          a5,      t5,    0
   3397    addi.w          a4,      a4,    -8
   3398    bnez            a4,      .l_\lable\()put_hv_8w_loop0
   3399    fld.d           f24,     sp,    0
   3400    fld.d           f25,     sp,    8
   3401    fld.d           f26,     sp,    16
   3402    fld.d           f27,     sp,    24
   3403    fld.d           f28,     sp,    32
   3404    fld.d           f29,     sp,    40
   3405    fld.d           f30,     sp,    48
   3406    fld.d           f31,     sp,    56
   3407    addi.d          sp,      sp,    8*8
   3408 .l_\lable\()end_put_8tap:
   3409 .endm
   3410 
// 8-bpc 8-tap put, regular horizontal / regular vertical filter (LSX).
// The shared macro body reads a "filter_type" selector from sp+0:
// bits [1:0] pick the horizontal filter row and bits [3:2] the vertical
// row of dav1d_mc_subpel_filters (regular=0, smooth=1, sharp=2).
function put_8tap_regular_8bpc_lsx
   addi.d   sp, sp,  -16          // scratch slot; 16 bytes keeps sp 16-aligned
   st.d   zero, sp,  0            // filter_type = 0: regular h, regular v
   PUT_8TAP_8BPC_LSX 0            // expand shared body (unique label id 0)
   addi.d   sp, sp,  16           // release scratch slot
endfunc
   3417 
// 8-bpc 8-tap put, smooth horizontal / regular vertical filter (LSX).
// filter_type at sp+0: bits [1:0] = horizontal row, bits [3:2] = vertical row.
function put_8tap_smooth_regular_8bpc_lsx
   addi.d   sp, sp,  -16          // scratch slot; 16 bytes keeps sp 16-aligned
   li.w     t0, 1                 // filter_type = 1: smooth h, regular v
   st.d     t0, sp,  0            // macro body reads this at sp+0
   PUT_8TAP_8BPC_LSX 1            // expand shared body (unique label id 1)
   addi.d   sp, sp,  16           // release scratch slot
endfunc
   3425 
// 8-bpc 8-tap put, sharp horizontal / regular vertical filter (LSX).
// filter_type at sp+0: bits [1:0] = horizontal row, bits [3:2] = vertical row.
function put_8tap_sharp_regular_8bpc_lsx
   addi.d   sp, sp,  -16          // scratch slot; 16 bytes keeps sp 16-aligned
   li.w     t0, 2                 // filter_type = 2: sharp h, regular v
   st.d     t0, sp,  0            // macro body reads this at sp+0
   PUT_8TAP_8BPC_LSX 2            // expand shared body (unique label id 2)
   addi.d   sp, sp,  16           // release scratch slot
endfunc
   3433 
// 8-bpc 8-tap put, regular horizontal / smooth vertical filter (LSX).
// filter_type at sp+0: bits [1:0] = horizontal row, bits [3:2] = vertical row.
function put_8tap_regular_smooth_8bpc_lsx
   addi.d   sp, sp,  -16          // scratch slot; 16 bytes keeps sp 16-aligned
   li.w     t0, 4                 // filter_type = 4: regular h, smooth v
   st.d     t0, sp,  0            // macro body reads this at sp+0
   PUT_8TAP_8BPC_LSX 4            // expand shared body (unique label id 4)
   addi.d   sp, sp,  16           // release scratch slot
endfunc
   3441 
// 8-bpc 8-tap put, smooth horizontal / smooth vertical filter (LSX).
// filter_type at sp+0: bits [1:0] = horizontal row, bits [3:2] = vertical row.
function put_8tap_smooth_8bpc_lsx
   addi.d   sp, sp,  -16          // scratch slot; 16 bytes keeps sp 16-aligned
   li.w     t0, 5                 // filter_type = 5: smooth h, smooth v
   st.d     t0, sp,  0            // macro body reads this at sp+0
   PUT_8TAP_8BPC_LSX 5            // expand shared body (unique label id 5)
   addi.d   sp, sp,  16           // release scratch slot
endfunc
   3449 
// 8-bpc 8-tap put, sharp horizontal / smooth vertical filter (LSX).
// filter_type at sp+0: bits [1:0] = horizontal row, bits [3:2] = vertical row.
function put_8tap_sharp_smooth_8bpc_lsx
   addi.d   sp, sp,  -16          // scratch slot; 16 bytes keeps sp 16-aligned
   li.w     t0, 6                 // filter_type = 6: sharp h, smooth v
   st.d     t0, sp,  0            // macro body reads this at sp+0
   PUT_8TAP_8BPC_LSX 6            // expand shared body (unique label id 6)
   addi.d   sp, sp,  16           // release scratch slot
endfunc
   3457 
// 8-bpc 8-tap put, regular horizontal / sharp vertical filter (LSX).
// filter_type at sp+0: bits [1:0] = horizontal row, bits [3:2] = vertical row.
function put_8tap_regular_sharp_8bpc_lsx
   addi.d   sp, sp,  -16          // scratch slot; 16 bytes keeps sp 16-aligned
   li.w     t0, 8                 // filter_type = 8: regular h, sharp v
   st.d     t0, sp,  0            // macro body reads this at sp+0
   PUT_8TAP_8BPC_LSX 8            // expand shared body (unique label id 8)
   addi.d   sp, sp,  16           // release scratch slot
endfunc
   3465 
// 8-bpc 8-tap put, smooth horizontal / sharp vertical filter (LSX).
// filter_type at sp+0: bits [1:0] = horizontal row, bits [3:2] = vertical row.
function put_8tap_smooth_sharp_8bpc_lsx
   addi.d   sp, sp,  -16          // scratch slot; 16 bytes keeps sp 16-aligned
   li.w     t0, 9                 // filter_type = 9: smooth h, sharp v
   st.d     t0, sp,  0            // macro body reads this at sp+0
   PUT_8TAP_8BPC_LSX 9            // expand shared body (unique label id 9)
   addi.d   sp, sp,  16           // release scratch slot
endfunc
   3473 
// 8-bpc 8-tap put, sharp horizontal / sharp vertical filter (LSX).
// filter_type at sp+0: bits [1:0] = horizontal row, bits [3:2] = vertical row.
function put_8tap_sharp_8bpc_lsx
   addi.d   sp, sp,  -16          // scratch slot; 16 bytes keeps sp 16-aligned
   li.w     t0, 10                // filter_type = 10: sharp h, sharp v
   st.d     t0, sp,  0            // macro body reads this at sp+0
   PUT_8TAP_8BPC_LSX 10           // expand shared body (unique label id 10)
   addi.d   sp, sp,  16           // release scratch slot
endfunc
   3481 
// Byte-shuffle control: the two 8-byte halves of the result are the source
// windows bytes 0..7 and bytes 1..8, i.e. two copies of an 8-pixel window
// offset by one pixel (presumably feeding an 8-tap horizontal filter —
// consumer is outside this view, confirm against its use site).
const shufb1
.byte 0,1,2,3,4,5,6,7,1,2,3,4,5,6,7,8
endconst
   3485 
// Horizontal 8-tap filter of 8 output pixels for the "prep" path (LSX).
//
// \in0       in:  vector of source pixel bytes; out: 8 filtered results
//                 as signed 16-bit values (rounded-shifted right by 2).
// vr6/vr7/vr8:    shuffle controls set up by the caller, selecting the
//                 sliding source windows fed to the taps.
// vr22/vr23:      filter coefficients (signed bytes) set up by the caller;
//                 used as the two halves of the 8 taps.
// Clobbers: vr2-vr4, vr12-vr15.
.macro PREP_H_8W in0
   // Gather the three shifted source-byte windows needed by the taps.
   vshuf.b          vr2,    \in0,  \in0,   vr6
   vshuf.b          vr3,    \in0,  \in0,   vr7
   vshuf.b          vr4,    \in0,  \in0,   vr8
   // Unsigned-pixel x signed-coefficient dot products: widening multiply
   // of the even byte lanes, then accumulate the odd lanes on top.
   vmulwev.h.bu.b   vr12,   vr2,   vr22
   vmulwev.h.bu.b   vr13,   vr3,   vr23
   vmulwev.h.bu.b   vr14,   vr3,   vr22
   vmulwev.h.bu.b   vr15,   vr4,   vr23
   vmaddwod.h.bu.b  vr12,   vr2,   vr22
   vmaddwod.h.bu.b  vr13,   vr3,   vr23
   vmaddwod.h.bu.b  vr14,   vr3,   vr22
   vmaddwod.h.bu.b  vr15,   vr4,   vr23
   // Combine the two coefficient halves for each output group.
   vadd.h           vr12,   vr12,  vr13
   vadd.h           vr14,   vr14,  vr15
   // Pairwise widen-add to finish each 8-tap sum as 32-bit values.
   vhaddw.w.h       vr12,   vr12,  vr12
   vhaddw.w.h       \in0,   vr14,  vr14
   // Saturating rounding shift right by 2, narrowing to 16-bit results.
   vssrarni.h.w     \in0,   vr12,  2
.endm
   3504 
// 256-bit (LASX) variant of the horizontal 8-tap "prep" filter: same
// dataflow as PREP_H_8W but processing both 128-bit lanes at once.
//
// \in0          in:  source pixel bytes; out: filtered results as signed
//                    16-bit values (rounded-shifted right by 2).
// xr19/xr20/xr21:   shuffle controls (sliding source windows), caller-set.
// xr22/xr23:        filter coefficients (signed bytes), caller-set.
// Clobbers: xr4-xr7, xr9-xr11.
.macro PREP_HV_8W_LASX in0
   // Gather the three shifted source-byte windows needed by the taps.
   xvshuf.b         xr4,   \in0,  \in0,   xr19
   xvshuf.b         xr5,   \in0,  \in0,   xr20
   xvshuf.b         xr6,   \in0,  \in0,   xr21
   // Unsigned-pixel x signed-coefficient dot products: widening multiply
   // of the even byte lanes, then accumulate the odd lanes on top.
   xvmulwev.h.bu.b  xr7,   xr4,   xr22
   xvmulwev.h.bu.b  xr9,   xr5,   xr23
   xvmulwev.h.bu.b  xr10,  xr5,   xr22
   xvmulwev.h.bu.b  xr11,  xr6,   xr23
   xvmaddwod.h.bu.b xr7,   xr4,   xr22
   xvmaddwod.h.bu.b xr9,   xr5,   xr23
   xvmaddwod.h.bu.b xr10,  xr5,   xr22
   xvmaddwod.h.bu.b xr11,  xr6,   xr23
   // Combine the two coefficient halves for each output group.
   xvadd.h          xr7,   xr7,   xr9
   xvadd.h          xr9,   xr10,  xr11
   // Pairwise widen-add to finish each 8-tap sum as 32-bit values.
   xvhaddw.w.h      xr7,   xr7,   xr7
   xvhaddw.w.h      \in0,  xr9,   xr9
   // Saturating rounding shift right by 2, narrowing to 16-bit results.
   xvssrarni.h.w    \in0,  xr7,   2
.endm
   3523 
   3524 .macro PREP_8TAP_8BPC_LASX lable
   3525    li.w             t0,     4
   3526    la.local         t6,     dav1d_mc_subpel_filters
   3527    slli.d           t2,     a2,    1  //src_stride*2
   3528    add.d            t3,     t2,    a2 //src_stride*3
   3529    slli.d           t4,     t2,    1
   3530 
   3531    bnez             a5,     .l_\lable\()h_lasx //mx
   3532    bnez             a6,     .l_\lable\()v_lasx
   3533 
   3534    clz.w            t1,     a3
   3535    li.w             t5,     24
   3536    sub.w            t1,     t1,    t5
   3537    la.local         t5,     .l_\lable\()prep_hv0_jtable_lasx
   3538    alsl.d           t1,     t1,    t5,   1
   3539    ld.h             t8,     t1,    0
   3540    add.d            t5,     t5,    t8
   3541    jirl             $r0,    t5,    0
   3542 
   3543    .align   3
   3544 .l_\lable\()prep_hv0_jtable_lasx:
   3545    .hword .l_\lable\()hv0_128w_lasx - .l_\lable\()prep_hv0_jtable_lasx
   3546    .hword .l_\lable\()hv0_64w_lasx  - .l_\lable\()prep_hv0_jtable_lasx
   3547    .hword .l_\lable\()hv0_32w_lasx  - .l_\lable\()prep_hv0_jtable_lasx
   3548    .hword .l_\lable\()hv0_16w_lasx  - .l_\lable\()prep_hv0_jtable_lasx
   3549    .hword .l_\lable\()hv0_8w_lasx   - .l_\lable\()prep_hv0_jtable_lasx
   3550    .hword .l_\lable\()hv0_4w_lasx   - .l_\lable\()prep_hv0_jtable_lasx
   3551 
   3552 .l_\lable\()hv0_4w_lasx:
   3553    fld.s            f0,     a1,    0
   3554    fldx.s           f1,     a1,    a2
   3555    fldx.s           f2,     a1,    t2
   3556    fldx.s           f3,     a1,    t3
   3557    add.d            a1,     a1,    t4
   3558    xvpackev.w       xr0,    xr1,   xr0
   3559    xvpackev.w       xr1,    xr3,   xr2
   3560    xvpermi.q        xr0,    xr1,   0x02
   3561    xvsllwil.hu.bu   xr0,    xr0,   4
   3562    xvst             xr0,    a0,    0
   3563    addi.d           a0,     a0,    32
   3564    addi.d           a4,     a4,    -4
   3565    bnez             a4,     .l_\lable\()hv0_4w_lasx
   3566    b                .l_\lable\()end_pre_8tap_lasx
   3567 .l_\lable\()hv0_8w_lasx:
   3568    fld.d            f0,     a1,    0
   3569    fldx.d           f1,     a1,    a2
   3570    fldx.d           f2,     a1,    t2
   3571    fldx.d           f3,     a1,    t3
   3572    add.d            a1,     a1,    t4
   3573    xvpermi.q        xr0,    xr1,   0x02
   3574    xvpermi.q        xr2,    xr3,   0x02
   3575    xvsllwil.hu.bu   xr0,    xr0,   4
   3576    xvsllwil.hu.bu   xr2,    xr2,   4
   3577    xvst             xr0,    a0,    0
   3578    xvst             xr2,    a0,    32
   3579    addi.d           a0,     a0,    64
   3580    addi.d           a4,     a4,    -4
   3581    bnez             a4,     .l_\lable\()hv0_8w_lasx
   3582    b                .l_\lable\()end_pre_8tap_lasx
   3583 .l_\lable\()hv0_16w_lasx:
   3584    vld              vr0,    a1,    0
   3585    vldx             vr1,    a1,    a2
   3586    vldx             vr2,    a1,    t2
   3587    vldx             vr3,    a1,    t3
   3588    add.d            a1,     a1,    t4
   3589    vext2xv.hu.bu    xr0,    xr0
   3590    vext2xv.hu.bu    xr1,    xr1
   3591    vext2xv.hu.bu    xr2,    xr2
   3592    vext2xv.hu.bu    xr3,    xr3
   3593    xvslli.h         xr0,    xr0,   4
   3594    xvslli.h         xr1,    xr1,   4
   3595    xvslli.h         xr2,    xr2,   4
   3596    xvslli.h         xr3,    xr3,   4
   3597    xvst             xr0,    a0,    0
   3598    xvst             xr1,    a0,    32
   3599    xvst             xr2,    a0,    64
   3600    xvst             xr3,    a0,    96
   3601    addi.d           a0,     a0,    128
   3602    addi.d           a4,     a4,    -4
   3603    bnez             a4,     .l_\lable\()hv0_16w_lasx
   3604    b                .l_\lable\()end_pre_8tap_lasx
   3605 .l_\lable\()hv0_32w_lasx:
   3606    xvld             xr0,    a1,    0
   3607    xvldx            xr1,    a1,    a2
   3608    xvldx            xr2,    a1,    t2
   3609    xvldx            xr3,    a1,    t3
   3610    add.d            a1,     a1,    t4
   3611    xvpermi.d        xr4,    xr0,   0xD8
   3612    xvpermi.d        xr5,    xr1,   0xD8
   3613    xvpermi.d        xr6,    xr2,   0xD8
   3614    xvpermi.d        xr7,    xr3,   0xD8
   3615    xvpermi.d        xr10,   xr0,   0x32
   3616    xvpermi.d        xr11,   xr1,   0x32
   3617    xvpermi.d        xr12,   xr2,   0x32
   3618    xvpermi.d        xr13,   xr3,   0x32
   3619    xvsllwil.hu.bu   xr0,    xr4,   4
   3620    xvsllwil.hu.bu   xr1,    xr5,   4
   3621    xvsllwil.hu.bu   xr2,    xr6,   4
   3622    xvsllwil.hu.bu   xr3,    xr7,   4
   3623    xvsllwil.hu.bu   xr4,    xr10,  4
   3624    xvsllwil.hu.bu   xr5,    xr11,  4
   3625    xvsllwil.hu.bu   xr6,    xr12,  4
   3626    xvsllwil.hu.bu   xr7,    xr13,  4
   3627    xvst             xr0,    a0,    0
   3628    xvst             xr4,    a0,    32
   3629    xvst             xr1,    a0,    64
   3630    xvst             xr5,    a0,    96
   3631    xvst             xr2,    a0,    128
   3632    xvst             xr6,    a0,    160
   3633    xvst             xr3,    a0,    192
   3634    xvst             xr7,    a0,    224
   3635    addi.d           a0,     a0,    256
   // Tail of the 32-wide no-filter loop: 4 rows were emitted per iteration.
   addi.d           a4,     a4,    -4
   bnez             a4,     .l_\lable\()hv0_32w_lasx
   b                .l_\lable\()end_pre_8tap_lasx

// No horizontal and no vertical filter (mx == my == 0), width 64/128:
// walk the row in 32-pixel columns, 4 rows at a time.  Each pixel is
// widened u8 -> u16 and scaled by 16 (<<4), the 8bpc "prep" intermediate.
.l_\lable\()hv0_64w_lasx:
.l_\lable\()hv0_128w_lasx:
   addi.d           t0,     a1,    0   // t0 = saved src column base
   addi.d           t5,     a4,    0   // t5 = saved h
   srli.w           t7,     a3,    5
   slli.w           t7,     t7,    6   // t7 = (w>>5)<<6 = w*2: dst row stride in bytes (w is a multiple of 32 here)
   addi.d           t8,     a0,    0   // t8 = saved dst column base
.l_\lable\()hv0_32_loop_lasx:
   xvld             xr0,    a1,    0   // 4 source rows, 32 pixels each
   xvldx            xr1,    a1,    a2
   xvldx            xr2,    a1,    t2
   xvldx            xr3,    a1,    t3
   add.d            a1,     a1,    t4  // src += 4 * src_stride
   // 0xD8 / 0x32 permutes pair the correct 64-bit lanes so that the two
   // xvsllwil widenings below together cover all 32 pixels of a row.
   xvpermi.d        xr4,    xr0,   0xD8
   xvpermi.d        xr5,    xr1,   0xD8
   xvpermi.d        xr6,    xr2,   0xD8
   xvpermi.d        xr7,    xr3,   0xD8
   xvpermi.d        xr10,   xr0,   0x32
   xvpermi.d        xr11,   xr1,   0x32
   xvpermi.d        xr12,   xr2,   0x32
   xvpermi.d        xr13,   xr3,   0x32
   xvsllwil.hu.bu   xr0,    xr4,   4   // widen u8 -> u16 and <<4 (pixels 0..15)
   xvsllwil.hu.bu   xr1,    xr5,   4
   xvsllwil.hu.bu   xr2,    xr6,   4
   xvsllwil.hu.bu   xr3,    xr7,   4
   xvsllwil.hu.bu   xr4,    xr10,  4   // pixels 16..31
   xvsllwil.hu.bu   xr5,    xr11,  4
   xvsllwil.hu.bu   xr6,    xr12,  4
   xvsllwil.hu.bu   xr7,    xr13,  4
   xvst             xr0,    a0,    0   // store 4 rows of 32 int16, stepping t7 bytes per row
   xvst             xr4,    a0,    32
   add.d            t1,     a0,    t7
   xvst             xr1,    t1,    0
   xvst             xr5,    t1,    32
   add.d            t1,     t1,    t7
   xvst             xr2,    t1,    0
   xvst             xr6,    t1,    32
   add.d            t1,     t1,    t7
   xvst             xr3,    t1,    0
   xvst             xr7,    t1,    32
   add.d            a0,     t1,    t7
   addi.d           a4,     a4,   -4
   bnez             a4,     .l_\lable\()hv0_32_loop_lasx
   // Column done: advance to the next 32-pixel column and restore h.
   addi.d           a1,     t0,    32
   addi.d           t0,     t0,    32
   addi.d           a0,     t8,    64  // 32 int16 = 64 bytes per column
   addi.d           t8,     t8,    64
   addi.d           a4,     t5,    0
   addi.d           a3,     a3,   -32
   bnez             a3,     .l_\lable\()hv0_32_loop_lasx
   b                .l_\lable\()end_pre_8tap_lasx
   3690 
// Horizontal-only 8-tap path (mx != 0).  Falls through to the 2-D path
// when a vertical fraction is also present.
.l_\lable\()h_lasx:
   bnez             a6,     .l_\lable\()hv_lasx //if(fh) && if (fv)

   // Select the horizontal filter set: a7 & 3 for w > 4 (t0 == 4),
   // otherwise the 4-tap sets ((a7 & 1) + 3) for small widths.
   andi             t1,    a7,    3
   blt              t0,    a3,    .l_\lable\()h_idx_fh_lasx
   andi             t1,    a7,    1
   addi.w           t1,    t1,    3
.l_\lable\()h_idx_fh_lasx:
   addi.w           t5,    zero,  120 // 120 = 15 subpel positions * 8 taps per filter set
   mul.w            t1,    t1,    t5
   addi.w           t5,    a5,    -1
   slli.w           t5,    t5,    3   // (mx - 1) * 8
   add.w            t1,    t1,    t5
   add.d            t1,    t6,    t1 //fh's offset
   xvldrepl.d       xr22,  t1,    0  // broadcast the 8 int8 taps

   addi.d           a1,    a1,    -3 // back up to the first of the 8 taps' pixels
   // Dispatch on width via clz: clz(w) - 24 gives 0..5 for w = 128..4.
   clz.w            t1,    a3
   li.w             t5,    24
   sub.w            t1,    t1,    t5
   la.local         t5,    .l_\lable\()prep_h_jtable_lasx
   alsl.d           t1,    t1,    t5,   1
   ld.h             t8,    t1,    0
   add.d            t5,    t5,    t8
   jirl             $r0,   t5,    0

   .align   3
.l_\lable\()prep_h_jtable_lasx:
   .hword .l_\lable\()h_128w_lasx - .l_\lable\()prep_h_jtable_lasx
   .hword .l_\lable\()h_64w_lasx  - .l_\lable\()prep_h_jtable_lasx
   .hword .l_\lable\()h_32w_lasx  - .l_\lable\()prep_h_jtable_lasx
   .hword .l_\lable\()h_16w_lasx  - .l_\lable\()prep_h_jtable_lasx
   .hword .l_\lable\()h_8w_lasx   - .l_\lable\()prep_h_jtable_lasx
   .hword .l_\lable\()h_4w_lasx   - .l_\lable\()prep_h_jtable_lasx

// w == 4: 4-tap filter, 4 rows per iteration.
.l_\lable\()h_4w_lasx:
   addi.d           a1,    a1,    2  // undo 2 of the -3: 4-tap needs only 1 pixel of lead-in
   la.local         t7,    subpel_h_shuf1
   vld              vr7,   t7,    0
   xvreplve0.q      xr7,   xr7
   xvbsrl.v         xr22,  xr22,  2  // skip the two leading zero taps of the 4-tap filter
   xvreplve0.w      xr22,  xr22
.l_\lable\()h_4w_loop_lasx:
   vld              vr0,   a1,    0
   vldx             vr1,   a1,    a2
   vldx             vr2,   a1,    t2
   vldx             vr3,   a1,    t3
   add.d            a1,    a1,    t4
   xvpermi.q        xr1,   xr0,   0x20 // pack rows 0/1 and 2/3 into one xr each
   xvpermi.q        xr3,   xr2,   0x20
   xvshuf.b         xr1,   xr1,   xr1,   xr7 // gather sliding 4-pixel windows
   xvshuf.b         xr3,   xr3,   xr3,   xr7
   xvmulwev.h.bu.b  xr0,   xr1,   xr22
   xvmulwev.h.bu.b  xr2,   xr3,   xr22
   xvmaddwod.h.bu.b xr0,   xr1,   xr22
   xvmaddwod.h.bu.b xr2,   xr3,   xr22
   xvhaddw.w.h      xr0,   xr0,   xr0
   xvhaddw.w.h      xr2,   xr2,   xr2
   xvssrarni.h.w    xr2,   xr0,   2  // round >>2 to the prep intermediate range
   xvpermi.d        xr2,   xr2,   0xd8
   xvst             xr2,   a0,    0
   addi.d           a0,    a0,    32
   addi.w           a4,    a4,    -4
   bnez             a4,    .l_\lable\()h_4w_loop_lasx
   b                .l_\lable\()end_pre_8tap_lasx

// w == 8: two rows per iteration via PREP_HV_8W_LASX.
.l_\lable\()h_8w_lasx:
   la.local         t7,    subpel_h_shuf1
   vld              vr6,   t7,    0
   vbsrl.v          vr23,  vr22,  4 //fh
   xvreplve0.w      xr23,  xr23   // taps 4..7
   xvreplve0.w      xr22,  xr22   // taps 0..3
   xvreplve0.q      xr19,  xr6
   xvaddi.bu        xr20,  xr19,  4 // shuffle masks shifted by 4 and 8 pixels
   xvaddi.bu        xr21,  xr19,  8
.l_\lable\()h_8w_loop_lasx:
   xvld             xr0,   a1,    0
   xvldx            xr1,   a1,    a2
   add.d            a1,    a1,    t2
   xvpermi.q        xr0,   xr1,   0x02
   PREP_HV_8W_LASX  xr0
   xvst             xr0,   a0,    0
   addi.d           a0,    a0,    32
   addi.d           a4,    a4,   -2
   bnez             a4,    .l_\lable\()h_8w_loop_lasx
   b                .l_\lable\()end_pre_8tap_lasx

// w == 16: two 16-pixel halves per row, two rows per iteration.
.l_\lable\()h_16w_lasx:
   la.local         t7,    subpel_h_shuf1
   vld              vr6,   t7,    0
   vbsrl.v          vr23,  vr22,  4 //fh
   xvreplve0.w      xr23,  xr23
   xvreplve0.w      xr22,  xr22
   xvreplve0.q      xr19,  xr6
   xvaddi.bu        xr20,  xr19,  4
   xvaddi.bu        xr21,  xr19,  8
.l_\lable\()h_16w_loop_lasx:
   xvld             xr0,   a1,    0
   xvld             xr1,   a1,    8
   add.d            a1,    a1,    a2
   xvpermi.q        xr0,   xr1,   0x02
   PREP_HV_8W_LASX  xr0
   xvst             xr0,   a0,    0
   xvld             xr0,   a1,    0
   xvld             xr1,   a1,    8
   add.d            a1,    a1,    a2
   xvpermi.q        xr0,   xr1,   0x02
   PREP_HV_8W_LASX  xr0
   xvst             xr0,   a0,    32
   addi.d           a0,    a0,    64
   addi.w           a4,    a4,    -2
   bnez             a4,     .l_\lable\()h_16w_loop_lasx
   b                .l_\lable\()end_pre_8tap_lasx

// w >= 32: inner loop covers 32 pixels, outer loop walks rows.
.l_\lable\()h_32w_lasx:
.l_\lable\()h_64w_lasx:
.l_\lable\()h_128w_lasx:
   la.local         t7,    subpel_h_shuf1
   vld              vr6,   t7,    0
   vbsrl.v          vr23,  vr22,  4 //fh
   xvreplve0.w      xr23,  xr23
   xvreplve0.w      xr22,  xr22
   xvreplve0.q      xr19,  xr6
   xvaddi.bu        xr20,  xr19,  4
   xvaddi.bu        xr21,  xr19,  8
   addi.d           t5,    a1,    0 //src
   addi.d           t6,    a3,    0 //w
   slli.w           t7,    a3,    1 //store offset
   addi.d           t8,    a0,    0 //dst
.l_\lable\()h_16_loop_lasx:
   xvld             xr0,   a1,    0
   xvld             xr1,   a1,    8
   xvpermi.q        xr0,   xr1,   0x02
   PREP_HV_8W_LASX  xr0
   xvst             xr0,   a0,    0
   xvld             xr0,   a1,    16
   xvld             xr1,   a1,    24
   xvpermi.q        xr0,   xr1,   0x02
   PREP_HV_8W_LASX  xr0
   xvst             xr0,   a0,    32
   addi.d           a0,    a0,    64
   addi.d           a1,    a1,    32
   addi.d           a3,    a3,   -32
   bnez             a3,    .l_\lable\()h_16_loop_lasx
   // Row done: step src/dst to the next row, restore w.
   add.d            a1,    t5,    a2
   add.d            t5,    t5,    a2
   add.d            a0,    t8,    t7
   add.d            t8,    t8,    t7
   addi.d           a3,    t6,    0
   addi.d           a4,    a4,    -1
   bnez             a4,    .l_\lable\()h_16_loop_lasx
   b                .l_\lable\()end_pre_8tap_lasx
   3843 
// 2-D (horizontal then vertical) 8-tap path.
.l_\lable\()hv_lasx:
   // Select fh: filter set a7 & 3, or the 4-tap sets ((a7 & 1) + 3) when w <= 4.
   andi             t1,    a7,    3
   blt              t0,    a3,    .l_\lable\()hv_idx_fh_lasx
   andi             t1,    a7,    1
   addi.w           t1,    t1,    3
.l_\lable\()hv_idx_fh_lasx:
   addi.w           t5,    zero,  120 // 120 = 15 subpel positions * 8 taps per set
   mul.w            t1,    t1,    t5
   addi.w           t5,    a5,    -1
   slli.w           t5,    t5,    3   // (mx - 1) * 8
   add.w            t1,    t1,    t5
   add.d            t1,    t6,    t1 //fh's offset
   xvldrepl.d       xr22,  t1,    0
   // Select fv the same way from a7 >> 2, keyed on h (a4).
   srli.w           a7,    a7,    2
   blt              t0,    a4,    .l_\lable\()hv_idx_fv_lasx
   andi             a7,    a7,    1
   addi.w           a7,    a7,    3
.l_\lable\()hv_idx_fv_lasx:
   addi.w           t5,    zero,  120
   mul.w            a7,    a7,    t5
   addi.w           t5,    a6,    -1
   slli.w           t5,    t5,    3   // (my - 1) * 8
   add.w            a7,    a7,    t5
   add.d            a7,    t6,    a7 //fv's offset
   xvldrepl.d       xr8,   a7,    0
   xvsllwil.h.b     xr8,   xr8,   0  // sign-extend vertical taps i8 -> i16
   sub.d            a1,    a1,    t3 // back up 3 rows of vertical history
   addi.d           a1,    a1,    -1 //ignore leading 0s
   beq              a3,    t0,    .l_\lable\()hv_4w_lasx
   addi.d           a1,    a1,    -2 // 8-tap horizontal: 3 pixels of lead-in total
   b                .l_\lable\()hv_8w_lasx

// w == 4: horizontal 4-tap, vertical 8-tap; prime 7 rows, then 4 rows/iter.
.l_\lable\()hv_4w_lasx:
   xvld             xr0,   a1,    0
   xvldx            xr1,   a1,    a2
   xvldx            xr2,   a1,    t2
   xvldx            xr3,   a1,    t3
   add.d            a1,    a1,    t4
   xvld             xr4,   a1,    0
   xvldx            xr5,   a1,    a2
   xvldx            xr6,   a1,    t2
   la.local         t1,    subpel_h_shuf2
   xvld             xr7,   t1,    0
   vbsrl.v          vr22,  vr22,  2  // drop the two leading zero taps (4-tap h)
   xvreplve0.w      xr22,  xr22
   xvreplve0.q      xr8,   xr8
   xvrepl128vei.w   xr12,  xr8,   0  // vertical tap pairs 01/23/45/67
   xvrepl128vei.w   xr13,  xr8,   1
   xvrepl128vei.w   xr14,  xr8,   2
   xvrepl128vei.w   xr15,  xr8,   3
   xvilvl.d         xr0,   xr1,   xr0
   xvilvl.d         xr2,   xr3,   xr2
   xvilvl.d         xr4,   xr5,   xr4
   xvreplve0.q      xr0,   xr0
   xvreplve0.q      xr2,   xr2
   xvreplve0.q      xr4,   xr4
   xvreplve0.q      xr6,   xr6
   xvshuf.b         xr0,   xr0,   xr0,   xr7
   xvshuf.b         xr2,   xr2,   xr2,   xr7
   xvshuf.b         xr4,   xr4,   xr4,   xr7
   xvshuf.b         xr6,   xr6,   xr6,   xr7
   xvmulwev.h.bu.b  xr1,   xr0,   xr22
   xvmulwev.h.bu.b  xr3,   xr2,   xr22
   xvmulwev.h.bu.b  xr5,   xr4,   xr22
   xvmulwev.h.bu.b  xr9,   xr6,   xr22
   xvmaddwod.h.bu.b xr1,   xr0,   xr22
   xvmaddwod.h.bu.b xr3,   xr2,   xr22
   xvmaddwod.h.bu.b xr5,   xr4,   xr22
   xvmaddwod.h.bu.b xr9,   xr6,   xr22
   xvhaddw.w.h      xr1,   xr1,   xr1  // a0 b0 a1 b1  c0 d0 c1 d1
   xvhaddw.w.h      xr3,   xr3,   xr3  // a2 b2 a3 b3  c2 d2 c3 d3
   xvhaddw.w.h      xr5,   xr5,   xr5  // a4 b4 a5 b5  c4 d4 c5 d5
   xvhaddw.w.h      xr9,   xr9,   xr9  // a6 b6 -  -   c6 d6 -  -
   xvssrarni.h.w    xr3,   xr1,   2    // a0 b0 a1 b1  a2 b2 a3 b3  c0 d0 c1 d1  c2 d2 c3 d3
   xvssrarni.h.w    xr9,   xr5,   2    // a4 b4 a5 b5  a6 b6 -  -   c4 d4 c5 d5  c6 d6 -  -
   xvbsrl.v         xr4,   xr3,   4
   xvextrins.w      xr4,   xr9,   0x30 // a1 b1 a2 b2  a3 b3 a4 b4  c1 d1 c2 d2  c3 d3 c4 d4
   xvilvl.h         xr5,   xr4,   xr3  // a0 a1 b0 b1  a1 a2 b1 b2  c0 c1 d0 d1  c1 c2 d1 d2
   xvilvh.h         xr6,   xr4,   xr3  // a2 a3 b2 b3  a3 a4 b3 b4  c2 c3 d2 d3  c3 c4 d3 d4
   xvbsrl.v         xr10,  xr9,   4    // a5 b5 a6 b6  -  -  -  -   c5 d5 c6 d6  -  -  -  -
   xvilvl.h         xr11,  xr10,  xr9  // a4 a5 b4 b5  a5 a6 b5 b6  c4 c5 d4 d5  c5 c6 d5 d6
.l_\lable\()hv_w4_loop_lasx:
   // Vertical 8-tap as four pair-wise i16 multiplies, accumulated in i32.
   xvmulwev.w.h     xr16,  xr5,   xr12 //a0 a1 (h0)
   xvmulwev.w.h     xr17,  xr6,   xr12 //a2 a3 (h1)
   xvmulwev.w.h     xr18,  xr6,   xr13 //a2 a3 (h0)
   xvmulwev.w.h     xr19,  xr11,  xr13 //a4 a5 (h1)
   xvmulwev.w.h     xr20,  xr11,  xr14 //a4 a5 (h0)
   xvmaddwod.w.h    xr16,  xr5,   xr12 //
   xvmaddwod.w.h    xr17,  xr6,   xr12 //
   xvmaddwod.w.h    xr18,  xr6,   xr13 //
   xvmaddwod.w.h    xr19,  xr11,  xr13 //
   xvmaddwod.w.h    xr20,  xr11,  xr14 //
   xvaddi.wu        xr5,   xr11,   0   // rotate row history: 45 -> 01 slot
   xvadd.w          xr16,  xr16,  xr18 //a0 a1 + a2 a3
   xvldx            xr18,  a1,    t3   //a7 b7 c7 d7
   add.d            a1,    a1,    t4
   xvadd.w          xr17,  xr17,  xr19 //a2 a3 + a4 a5
   xvld             xr19,  a1,    0    //a8 b8 c8 d8
   xvadd.w          xr16,  xr16,  xr20 //a0 a1 + a2 a3 + a4 a5
   xvldx            xr20,  a1,    a2   //a9 b9 c9 d9
   xvilvl.d         xr18,  xr19,  xr18
   xvreplve0.q      xr18,  xr18
   xvldx            xr19,  a1,    t2   //aa ba ca da
   xvilvl.d         xr20,  xr19,  xr20
   xvreplve0.q      xr20,  xr20
   xvshuf.b         xr18,  xr18,  xr18,  xr7
   xvshuf.b         xr20,  xr20,  xr20,  xr7
   xvmulwev.h.bu.b  xr21,  xr18,  xr22
   xvmulwev.h.bu.b  xr23,  xr20,  xr22
   xvmaddwod.h.bu.b xr21,  xr18,  xr22
   xvmaddwod.h.bu.b xr23,  xr20,  xr22
   xvhaddw.w.h      xr21,  xr21,  xr21 //a7 b7 a8 b8 c7 d7 c8 d8
   xvhaddw.w.h      xr23,  xr23,  xr23 //a9 b9 aa ba c9 d9 ca da
   xvssrarni.h.w    xr23,  xr21,  2    //a7 b7 a8 b8  a9 b9 aa ba  c7 d7 c8 d8  c9 d9 ca da
   xvbsll.v         xr0,   xr23,  4
   xvextrins.w      xr0,   xr9,   0x02 //a6 b6 a7 b7  a8 b8 a9 b9  c6 d6 c7 d7  c8 d8 c9 d9
   xvilvl.h         xr6,   xr23,  xr0  //a6 a7 b6 b7  a7 a8 b7 b8  c6 c7 d6 d7  c7 c8 d7 d8
   xvilvh.h         xr11,  xr23,  xr0  //a8 a9 b8 b9  a9 aa b9 ba  c8 c9 d8 d9  c9 ca d9 da
   xvbsrl.v         xr9,   xr23,  4
   xvmulwev.w.h     xr1 ,  xr6,   xr14 //a6 a7 (h0)
   xvmulwev.w.h     xr2 ,  xr6,   xr15 //a6 a7 (h1)
   xvmulwev.w.h     xr3 ,  xr11,  xr15 //a8 a9 (h1)
   xvmaddwod.w.h    xr1 ,  xr6,   xr14
   xvmaddwod.w.h    xr2 ,  xr6,   xr15
   xvmaddwod.w.h    xr3 ,  xr11,  xr15
   xvadd.w          xr17,  xr17,  xr1  //a2 a3 + a4 a5 + a6 a7
   xvadd.w          xr16,  xr16,  xr2  //a0 a1 + a2 a3 + a4 a5 + a6 a7
   xvadd.w          xr17,  xr17,  xr3  //a2 a3 + a4 a5 + a6 a7 + a8 a9
   xvssrarni.h.w    xr17,  xr16,  6    //a01 b01 a12 b12  a23 b23 a34 b34  c01 d01 c12 d12  c23 d23 c34 d34
   xvpermi.d        xr17,  xr17,  0xd8 //a01 b01 a12 b12  c01 d01 c12 d12  a23 b23 a34 b34  c23 d23 c34 d34
   xvshuf4i.w       xr17,  xr17,  0xd8
   xvst             xr17,  a0,    0
   addi.d           a0,    a0,    32
   addi.d           a4,    a4,    -4
   bnez             a4,    .l_\lable\()hv_w4_loop_lasx
   b                .l_\lable\()end_pre_8tap_lasx

// w >= 8: 8-pixel columns; f24-f27 are callee-saved, spill them first.
.l_\lable\()hv_8w_lasx:
   addi.d           sp,    sp,   -4*8
   fst.d            f24,   sp,    0
   fst.d            f25,   sp,    8
   fst.d            f26,   sp,    16
   fst.d            f27,   sp,    24
   la.local         t1,    subpel_h_shuf1
   vld              vr19,  t1,    0
   addi.d           t0,    a1,    0  // saved src column base
   addi.d           t5,    a4,    0  // saved h
   slli.w           t7,    a3,    1 // store offset
   addi.d           t8,    a0,    0  // saved dst column base
   xvreplve0.q      xr19,  xr19
   xvaddi.bu        xr20,  xr19,  4
   xvaddi.bu        xr21,  xr19,  8
   vbsrl.v          vr23,  vr22,  4
   xvreplve0.w      xr22,  xr22 //f0f1f2f3
   xvreplve0.w      xr23,  xr23 //f4f5f6f7
   xvreplve0.q      xr8,   xr8
   xvrepl128vei.w   xr24,  xr8,   0  // vertical tap pairs 01/23/45/67
   xvrepl128vei.w   xr25,  xr8,   1
   xvrepl128vei.w   xr26,  xr8,   2
   xvrepl128vei.w   xr27,  xr8,   3
.l_\lable\()hv_8w_loop0_lasx:
   // Prime the 7-row vertical history for this 8-pixel column.
   xvld             xr0,   a1,    0
   xvldx            xr1,   a1,    a2
   xvldx            xr2,   a1,    t2
   add.d            a1,    a1,    t3
   xvld             xr3,   a1,    0
   xvldx            xr4,   a1,    a2
   xvldx            xr5,   a1,    t2
   xvldx            xr6,   a1,    t3
   add.d            a1,    a1,    t4
   xvpermi.q        xr0,   xr3,   0x02 //0 3
   xvpermi.q        xr1,   xr4,   0x02 //1 4
   xvpermi.q        xr2,   xr5,   0x02 //2 5
   xvpermi.q        xr3,   xr6,   0x02 //3 6
   PREP_HV_8W_LASX  xr0 //a0b0c0d0 e0f0g0h0 a3b3c3d3 e3f3g3h3
   PREP_HV_8W_LASX  xr1 //a1b1c1d1 e1f1g1h1 a4b4c4d4 e4f4g4h4
   PREP_HV_8W_LASX  xr2 //a2b2c2d2 e2f2g2h2 a5b5c5d5 e5f5g5h5
   PREP_HV_8W_LASX  xr3 //a3b3c3d3 e3f3g3h3 a6b6c6d6 e6f6g6h6
   xvpermi.d        xr0,   xr0,   0xd8
   xvpermi.d        xr1,   xr1,   0xd8
   xvpermi.d        xr2,   xr2,   0xd8
   xvpermi.d        xr18,  xr3,   0xd8
   xvilvl.h         xr12,  xr1,   xr0 //a0a1b0b1c0c1d0d1 e0e1f0f1g0g1h0h1
   xvilvh.h         xr13,  xr1,   xr0 //a3a4b3b4c3c4d3d4 e3e4f3f4g3g4h3h4
   xvilvl.h         xr14,  xr2,   xr1 //a1a2b1b2c1c2d1d2 e1e2f1f2g1g2h1h2
   xvilvh.h         xr15,  xr2,   xr1 //a4a5b4b5c4c5d4d5 e4e5f4f5g4g5h4h5
   xvilvl.h         xr16,  xr18,  xr2 //a2a3b2b3c2c3d2d3 e2e3f2f3g2g3h2h3
   xvilvh.h         xr17,  xr18,  xr2 //a5a6b5b6c5c6d5d6 e5e6f5f6g5g6h5h6
.l_\lable\()hv_8w_loop_lasx:
   xvld             xr0,   a1,    0
   xvldx            xr1,   a1,    a2
   add.d            a1,    a1,    t2
   xvpermi.q        xr0,   xr1,   0x02 //7 8
   PREP_HV_8W_LASX  xr0                //a7b7c7d7e7f7g7h7 a8b8c8d8e8f8g8h8
   xvpermi.q        xr3,   xr0,   0x03 //a6b6c6d6e6f6g6h6 a7b7c7d7e7f7g7h7
   xvpermi.d        xr3,   xr3,   0xd8 //a6b6c6d6a7b7c7d7 e6f6g6h6e7f7g7h7
   xvpermi.d        xr1,   xr0,   0xd8 //a7b7c7d7a8b8c8d8 e7f7g7h7e8f8g8h8
   xvilvl.h         xr18,  xr1,   xr3  //a6a7b6b7c6c7d6d7 e6e7f6f7g6g7h6h7
   xvilvh.h         xr2,   xr1,   xr3  //a7a8b7b8c7c8d7d8 e7e8f7f8g7g8h7h8
   xvaddi.hu        xr3,   xr0,   0    // keep rows 7/8 for the next iteration's row 6
   xvmulwev.w.h     xr4,   xr12,  xr24 //01
   xvmulwev.w.h     xr5,   xr14,  xr24 //12
   xvmulwev.w.h     xr6,   xr16,  xr25 //23
   xvmulwev.w.h     xr7,   xr13,  xr25 //34
   xvmulwev.w.h     xr8,   xr15,  xr26 //45
   xvmulwev.w.h     xr9,   xr17,  xr26 //56
   xvmulwev.w.h     xr10,  xr18,  xr27 //67
   xvmulwev.w.h     xr11,  xr2,   xr27 //78
   xvmaddwod.w.h    xr4,   xr12,  xr24 //01
   xvmaddwod.w.h    xr5,   xr14,  xr24 //12
   xvmaddwod.w.h    xr6,   xr16,  xr25 //23
   xvmaddwod.w.h    xr7,   xr13,  xr25 //34
   xvmaddwod.w.h    xr8,   xr15,  xr26 //45
   xvmaddwod.w.h    xr9,   xr17,  xr26 //56
   xvmaddwod.w.h    xr10,  xr18,  xr27 //67
   xvmaddwod.w.h    xr11,  xr2,   xr27 //78
   xvadd.w          xr4,   xr4,   xr6
   xvadd.w          xr5,   xr5,   xr7
   xvadd.w          xr4,   xr4,   xr8
   xvadd.w          xr5,   xr5,   xr9
   xvadd.w          xr4,   xr4,   xr10
   xvadd.w          xr5,   xr5,   xr11
   xvaddi.hu        xr12,  xr16,  0 //01 <-- 23
   xvaddi.hu        xr14,  xr13,  0 //12 <-- 34
   xvaddi.hu        xr16,  xr15,  0 //23 <-- 45
   xvaddi.hu        xr13,  xr17,  0 //34 <-- 56
   xvaddi.hu        xr15,  xr18,  0 //45 <-- 67
   xvaddi.hu        xr17,  xr2,   0 //56 <-- 78
   xvssrarni.h.w    xr5,   xr4,   6
   xvpermi.d        xr5,   xr5,   0xd8
   vst              vr5,   a0,    0
   xvpermi.q        xr5,   xr5,   0x11
   vstx             vr5,   a0,    t7
   alsl.d           a0,    t7,    a0,  1
   addi.d           a4,    a4,   -2
   bnez             a4,    .l_\lable\()hv_8w_loop_lasx
   // Next 8-pixel column.
   addi.d           a1,    t0,    8
   addi.d           t0,    t0,    8
   addi.d           a0,    t8,    16
   addi.d           t8,    t8,    16
   addi.d           a4,    t5,    0
   addi.d           a3,    a3,   -8
   bnez             a3,    .l_\lable\()hv_8w_loop0_lasx
   fld.d            f24,   sp,    0
   fld.d            f25,   sp,    8
   fld.d            f26,   sp,    16
   fld.d            f27,   sp,    24
   addi.d           sp,    sp,    4*8
   b                .l_\lable\()end_pre_8tap_lasx
   4092 
// Vertical-only 8-tap path (my != 0, mx == 0).
.l_\lable\()v_lasx:
   // Select fv: filter set a7 >> 2, or the 4-tap sets ((a7 & 1) + 3) when h <= 4.
   srli.w           a7,    a7,    2
   blt              t0,    a4,    .l_\lable\()v_idx_fv_lasx
   andi             a7,    a7,    1
   addi.w           a7,    a7,    3
.l_\lable\()v_idx_fv_lasx:
   addi.w           t5,    zero,  120 // 120 = 15 subpel positions * 8 taps per set
   mul.w            a7,    a7,    t5
   addi.w           t5,    a6,    -1
   slli.w           t5,    t5,    3   // (my - 1) * 8
   add.w            a7,    a7,    t5
   add.d            a7,    t6,    a7 //fv's offset
   xvldrepl.d       xr8,   a7,    0
   xvrepl128vei.h   xr12,  xr8,   0  // tap pairs 01/23/45/67 as i8 pairs
   xvrepl128vei.h   xr13,  xr8,   1
   xvrepl128vei.h   xr14,  xr8,   2
   xvrepl128vei.h   xr15,  xr8,   3
   sub.d            a1,    a1,    t3 // back up 3 rows of vertical history
   beq              a3,    t0,    .l_\lable\()v_4w_lasx
   addi.w           t0,    t0,    4
   beq              a3,    t0,    .l_\lable\()v_8w_lasx
   blt              t0,    a3,    .l_\lable\()v_16w_lasx
// w == 4: prime 7 rows, then produce 4 rows per iteration.
.l_\lable\()v_4w_lasx:
   la.local         t6,    subpel_h_shuf3
   xvld             xr11,  t6,    0
   fld.s            f0,    a1,    0   //a0b0c0d0
   fldx.s           f1,    a1,    a2  //a1b1c1d1
   fldx.s           f2,    a1,    t2  //a2b2c2d2
   add.d            a1,    a1,    t3
   fld.s            f3,    a1,    0   //a3b3c3d3
   fldx.s           f4,    a1,    a2  //a4b4c4d4
   fldx.s           f5,    a1,    t2  //a5b5c5d5
   fldx.s           f6,    a1,    t3  //a6b6c6d6
   vilvl.w          vr0,   vr1,   vr0 //01
   vilvl.w          vr1,   vr3,   vr2 //23
   vilvl.d          vr0,   vr1,   vr0 //0123
   vilvl.w          vr2,   vr5,   vr4 //45
   vilvl.d          vr1,   vr2,   vr1 //2345
   xvpermi.q        xr0,   xr1,   0x02 //0123 2345
   xvbsrl.v         xr1,   xr0,   4    //123- 345-
   xvpermi.q        xr4,   xr6,   0x02
   xvextrins.w      xr1,   xr4,   0x30 //1234 3456
   xvilvl.b         xr2,   xr1,   xr0  //0112 2334         //a0a1b0b1c0c1d0d1 a1a2b1b2c1c2d1d2 a2a3b2b3c2c3d2d3 a3a4b3b4c3c4d3d4
   xvilvh.b         xr3,   xr1,   xr0  //2334 4556         //a2a3b2b3c2c3d2d3 a3a4b3b4c3c4d3d4 a4a5b4b5c4c5d4d5 a5a6b5b6c5c6d5d6
.l_\lable\()v_4w_loop_lasx:
   add.d            a1,    a1,    t4
   fld.s            f0,    a1,    0  //a7b7c7d7
   fldx.s           f1,    a1,    a2 //a8b8c8d8
   fldx.s           f4,    a1,    t2 //a9b9c9d9
   fldx.s           f5,    a1,    t3 //aabacada
   vilvl.w          vr7,   vr0,   vr6 //67
   vilvl.w          vr10,  vr4,   vr1 //89
   vextrins.w       vr7,   vr1,   0x20//678-
   vextrins.w       vr10,  vr5,   0x20//89a-
   xvpermi.q        xr7,   xr10,  0x02//678- 89a-
   xvshuf.b         xr4,   xr7,   xr7,  xr11 //67 78 89 9a //a6a7b6b7c6c7d6d7 a7a8b7b8c7c8d7d8 a8a9b8b9c8c9d8d9 a9aab9bac9cad9da
   xvpermi.q        xr7,   xr3,   0x11 //4556
   xvpermi.q        xr7,   xr4,   0x02 //45 56 67 78       //a4a5b4b5c4c5d4d5 a5a6b5b6c5c6d5d6 a6a7b6b7c6c7d6d7 a7a8b7b8c7c8d7d8
   xvmulwev.h.bu.b  xr16,  xr2,   xr12
   xvmulwev.h.bu.b  xr17,  xr3,   xr13
   xvmulwev.h.bu.b  xr18,  xr7,   xr14
   xvmulwev.h.bu.b  xr19,  xr4,   xr15
   xvmaddwod.h.bu.b xr16,  xr2,   xr12
   xvmaddwod.h.bu.b xr17,  xr3,   xr13
   xvmaddwod.h.bu.b xr18,  xr7,   xr14
   xvmaddwod.h.bu.b xr19,  xr4,   xr15
   xvadd.h          xr16,  xr16,  xr17
   xvadd.h          xr16,  xr16,  xr18
   xvadd.h          xr16,  xr16,  xr19
   xvsrari.h        xr16,  xr16,  2    // round >>2 to the prep intermediate range
   xvaddi.bu        xr2,   xr7,   0    // rotate row-pair history for the next 4 rows
   xvaddi.bu        xr3,   xr4,   0
   xvaddi.bu        xr6,   xr5,   0
   xvst             xr16,  a0,    0
   addi.d           a0,    a0,    32
   addi.w           a4,    a4,   -4
   bnez             a4,    .l_\lable\()v_4w_loop_lasx
   b                .l_\lable\()end_pre_8tap_lasx

// w == 8: interleave consecutive rows bytewise and filter pairs per lane.
.l_\lable\()v_8w_lasx:
   fld.d            f0,    a1,    0
   fldx.d           f1,    a1,    a2
   fldx.d           f2,    a1,    t2
   add.d            a1,    a1,    t3
   fld.d            f3,    a1,    0
   fldx.d           f4,    a1,    a2
   fldx.d           f5,    a1,    t2
   fldx.d           f6,    a1,    t3
   xvpermi.q        xr0,   xr1,   0x02
   xvpermi.q        xr1,   xr2,   0x02
   xvilvl.b         xr0,   xr1,   xr0 //01 12
   xvpermi.q        xr2,   xr3,   0x02
   xvpermi.q        xr3,   xr4,   0x02
   xvilvl.b         xr2,   xr3,   xr2 //23 34
   xvpermi.q        xr4,   xr5,   0x02
   xvpermi.q        xr5,   xr6,   0x02
   xvilvl.b         xr4,   xr5,   xr4 //45 56
.l_\lable\()v_8w_loop_lasx:
   add.d            a1,    a1,    t4
   fld.d            f7,    a1,    0   //7
   fldx.d           f10,   a1,    a2  //8
   fldx.d           f11,   a1,    t2  //9
   fldx.d           f18,   a1,    t3  //a
   xvpermi.q        xr6,   xr7,   0x02
   xvpermi.q        xr7,   xr10,  0x02
   xvilvl.b         xr6,   xr7,   xr6  //67 78
   xvpermi.q        xr10,  xr11,  0x02
   xvpermi.q        xr11,  xr18,  0x02
   xvilvl.b         xr10,  xr11,  xr10 //89 9a
   xvmulwev.h.bu.b  xr1,   xr0,   xr12
   xvmulwev.h.bu.b  xr3,   xr2,   xr13
   xvmulwev.h.bu.b  xr5,   xr4,   xr14
   xvmulwev.h.bu.b  xr7,   xr6,   xr15
   xvmulwev.h.bu.b  xr9,   xr2,   xr12
   xvmulwev.h.bu.b  xr11,  xr4,   xr13
   xvmulwev.h.bu.b  xr16,  xr6,   xr14
   xvmulwev.h.bu.b  xr17,  xr10,  xr15
   xvmaddwod.h.bu.b xr1,   xr0,   xr12
   xvmaddwod.h.bu.b xr3,   xr2,   xr13
   xvmaddwod.h.bu.b xr5,   xr4,   xr14
   xvmaddwod.h.bu.b xr7,   xr6,   xr15
   xvmaddwod.h.bu.b xr9,   xr2,   xr12
   xvmaddwod.h.bu.b xr11,  xr4,   xr13
   xvmaddwod.h.bu.b xr16,  xr6,   xr14
   xvmaddwod.h.bu.b xr17,  xr10,  xr15
   xvadd.h          xr1,   xr1,   xr3  // rows 0/1 of this group
   xvadd.h          xr1,   xr1,   xr5
   xvadd.h          xr1,   xr1,   xr7
   xvadd.h          xr9,   xr9,   xr11 // rows 2/3 of this group
   xvadd.h          xr9,   xr9,   xr16
   xvadd.h          xr9,   xr9,   xr17
   xvaddi.bu        xr0,   xr4,   0    // rotate history: 45->01, 67->23, 89->45
   xvaddi.bu        xr2,   xr6,   0
   xvaddi.bu        xr4,   xr10,  0
   xvaddi.bu        xr6,   xr18,  0
   xvsrari.h        xr1,   xr1,   2
   xvsrari.h        xr9,   xr9,   2
   xvst             xr1,   a0,    0
   xvst             xr9,   a0,    32
   addi.d           a0,    a0,    64
   addi.w           a4,    a4,   -4
   bnez             a4,    .l_\lable\()v_8w_loop_lasx
   b                .l_\lable\()end_pre_8tap_lasx

// w >= 16: 16-pixel columns, 2 rows per inner iteration.
.l_\lable\()v_16w_lasx:
   addi.d           t0,    a0,    0 //dst
   addi.d           t5,    a1,    0 //src
   slli.w           t7,    a3,    1 //w
   addi.d           t8,    a4,    0 //h
.l_\lable\()v_16w_loop0_lasx:
   vld              vr0,   a1,    0
   vldx             vr1,   a1,    a2
   vldx             vr2,   a1,    t2
   add.d            a1,    a1,    t3
   vld              vr3,   a1,    0
   vldx             vr4,   a1,    a2
   vldx             vr5,   a1,    t2
   vldx             vr6,   a1,    t3
   add.d            a1,    a1,    t4
   xvpermi.d        xr0,   xr0,   0xd8
   xvpermi.d        xr1,   xr1,   0xd8
   xvpermi.d        xr2,   xr2,   0xd8
   xvpermi.d        xr3,   xr3,   0xd8
   xvpermi.d        xr4,   xr4,   0xd8
   xvpermi.d        xr5,   xr5,   0xd8
   xvpermi.d        xr6,   xr6,   0xd8
   xvilvl.b         xr0,   xr1,   xr0 //01
   xvilvl.b         xr1,   xr2,   xr1 //12
   xvilvl.b         xr2,   xr3,   xr2 //23
   xvilvl.b         xr3,   xr4,   xr3 //34
   xvilvl.b         xr4,   xr5,   xr4 //45
   xvilvl.b         xr5,   xr6,   xr5 //56
.l_\lable\()v_16w_loop_lasx:
   vld              vr7,   a1,    0   //7
   vldx             vr10,  a1,    a2  //8
   add.d            a1,    a1,    t2
   xvpermi.d        xr7,   xr7,   0xd8
   xvpermi.d        xr10,  xr10,  0xd8
   xvilvl.b         xr6,   xr7,   xr6 //67
   xvilvl.b         xr7,   xr10,  xr7 //78
   xvmulwev.h.bu.b  xr9,   xr0,   xr12
   xvmulwev.h.bu.b  xr11,  xr2,   xr13
   xvmulwev.h.bu.b  xr16,  xr4,   xr14
   xvmulwev.h.bu.b  xr17,  xr6,   xr15
   xvmulwev.h.bu.b  xr18,  xr1,   xr12
   xvmulwev.h.bu.b  xr19,  xr3,   xr13
   xvmulwev.h.bu.b  xr20,  xr5,   xr14
   xvmulwev.h.bu.b  xr21,  xr7,   xr15
   xvmaddwod.h.bu.b xr9,   xr0,   xr12
   xvmaddwod.h.bu.b xr11,  xr2,   xr13
   xvmaddwod.h.bu.b xr16,  xr4,   xr14
   xvmaddwod.h.bu.b xr17,  xr6,   xr15
   xvmaddwod.h.bu.b xr18,  xr1,   xr12
   xvmaddwod.h.bu.b xr19,  xr3,   xr13
   xvmaddwod.h.bu.b xr20,  xr5,   xr14
   xvmaddwod.h.bu.b xr21,  xr7,   xr15
   xvadd.h          xr9,   xr9,   xr11
   xvadd.h          xr9,   xr9,   xr16
   xvadd.h          xr9,   xr9,   xr17
   xvadd.h          xr11,  xr18,  xr19
   xvadd.h          xr11,  xr11,  xr20
   xvadd.h          xr11,  xr11,  xr21
   xvsrari.h        xr9,   xr9,   2
   xvsrari.h        xr11,  xr11,  2
   xvaddi.bu        xr0,   xr2,   0  // slide the 6-pair history window down 2 rows
   xvaddi.bu        xr1,   xr3,   0
   xvaddi.bu        xr2,   xr4,   0
   xvaddi.bu        xr3,   xr5,   0
   xvaddi.bu        xr4,   xr6,   0
   xvaddi.bu        xr5,   xr7,   0
   xvaddi.bu        xr6,   xr10,  0
   xvst             xr9,   a0,    0
   xvstx            xr11,  a0,    t7
   alsl.d           a0,    t7,    a0,  1
   addi.d           a4,    a4,   -2
   bnez             a4,    .l_\lable\()v_16w_loop_lasx
   // Next 16-pixel column.
   addi.d           a3,    a3,   -16
   addi.d           a0,    t0,    32
   addi.d           t0,    t0,    32
   addi.d           a1,    t5,    16
   addi.d           t5,    t5,    16
   addi.d           a4,    t8,    0
   bnez             a3,    .l_\lable\()v_16w_loop0_lasx
.l_\lable\()end_pre_8tap_lasx:
.endm
   4318 
// a7 encodes the filter pair; the macro decodes horizontal = a7 & 3,
// vertical = a7 >> 2 (0 = regular, 1 = smooth, 2 = sharp).
// 0 -> regular / regular.
function prep_8tap_regular_8bpc_lasx
   addi.w a7, zero, 0
   PREP_8TAP_8BPC_LASX 0
endfunc
   4323 
// a7 = 1: horizontal = smooth (a7 & 3), vertical = regular (a7 >> 2).
function prep_8tap_smooth_regular_8bpc_lasx
   addi.w a7, zero, 1
   PREP_8TAP_8BPC_LASX 1
endfunc
   4328 
// a7 = 2: horizontal = sharp (a7 & 3), vertical = regular (a7 >> 2).
function prep_8tap_sharp_regular_8bpc_lasx
   addi.w a7, zero, 2
   PREP_8TAP_8BPC_LASX 2
endfunc
   4333 
   4334 function prep_8tap_regular_smooth_8bpc_lasx
   4335    addi.w a7, zero, 4
   4336    PREP_8TAP_8BPC_LASX 4
   4337 endfunc
   4338 
   4339 function prep_8tap_smooth_8bpc_lasx
   4340    addi.w a7, zero, 5
   4341    PREP_8TAP_8BPC_LASX 5
   4342 endfunc
   4343 
   4344 function prep_8tap_sharp_smooth_8bpc_lasx
   4345    addi.w a7, zero, 6
   4346    PREP_8TAP_8BPC_LASX 6
   4347 endfunc
   4348 
   4349 function prep_8tap_regular_sharp_8bpc_lasx
   4350    addi.w a7, zero, 8
   4351    PREP_8TAP_8BPC_LASX 8
   4352 endfunc
   4353 
   4354 function prep_8tap_smooth_sharp_8bpc_lasx
   4355    addi.w a7, zero, 9
   4356    PREP_8TAP_8BPC_LASX 9
   4357 endfunc
   4358 
   4359 function prep_8tap_sharp_8bpc_lasx
   4360    addi.w a7, zero, 10
   4361    PREP_8TAP_8BPC_LASX 10
   4362 endfunc
   4363 
   4364 .macro PREP_8TAP_8BPC_LSX lable
   4365    li.w             t0,     4
   4366    la.local         t6,     dav1d_mc_subpel_filters
   4367    la.local         t7,     shufb1
   4368    vld              vr23,   t7,    0
   4369    slli.d           t2,     a2,    1  //src_stride*2
   4370    add.d            t3,     t2,    a2 //src_stride*3
   4371    slli.d           t4,     t2,    1
   4372 
   4373    bnez             a5,     .l_\lable\()h_lsx //mx
   4374    bnez             a6,     .l_\lable\()v_lsx
   4375 
   4376    clz.w            t1,     a3
   4377    li.w             t5,     24
   4378    sub.w            t1,     t1,    t5
   4379    la.local         t5,     .l_\lable\()prep_hv0_jtable_lsx
   4380    alsl.d           t1,     t1,    t5,   1
   4381    ld.h             t8,     t1,    0
   4382    add.d            t5,     t5,    t8
   4383    jirl             $r0,    t5,    0
   4384    .align   3
   4385 .l_\lable\()prep_hv0_jtable_lsx:
   4386    .hword .l_\lable\()hv0_128w_lsx - .l_\lable\()prep_hv0_jtable_lsx
   4387    .hword .l_\lable\()hv0_64w_lsx  - .l_\lable\()prep_hv0_jtable_lsx
   4388    .hword .l_\lable\()hv0_32w_lsx  - .l_\lable\()prep_hv0_jtable_lsx
   4389    .hword .l_\lable\()hv0_16w_lsx  - .l_\lable\()prep_hv0_jtable_lsx
   4390    .hword .l_\lable\()hv0_8w_lsx   - .l_\lable\()prep_hv0_jtable_lsx
   4391    .hword .l_\lable\()hv0_4w_lsx   - .l_\lable\()prep_hv0_jtable_lsx
   4392 
   4393 .l_\lable\()hv0_4w_lsx:
   4394    fld.s            f0,     a1,    0
   4395    fldx.s           f1,     a1,    a2
   4396    add.d            a1,     a1,    t2
   4397    vilvl.w          vr0,    vr1,   vr0
   4398    vsllwil.hu.bu    vr0,    vr0,   4
   4399    vst              vr0,    a0,    0
   4400    addi.d           a0,     a0,    16
   4401    addi.d           a4,     a4,    -2
   4402    bnez             a4,     .l_\lable\()hv0_4w_lsx
   4403    b                .l_\lable\()end_pre_8tap_lsx
   4404 .l_\lable\()hv0_8w_lsx:
   4405    fld.d            f0,     a1,    0
   4406    fldx.d           f1,     a1,    a2
   4407    add.d            a1,     a1,    t2
   4408    vsllwil.hu.bu    vr0,    vr0,   4
   4409    vsllwil.hu.bu    vr1,    vr1,   4
   4410    vst              vr0,    a0,    0
   4411    vst              vr1,    a0,    16
   4412    addi.d           a0,     a0,    32
   4413    addi.d           a4,     a4,    -2
   4414    bnez             a4,     .l_\lable\()hv0_8w_lsx
   4415    b                .l_\lable\()end_pre_8tap_lsx
   4416 .l_\lable\()hv0_16w_lsx:
   4417    vld              vr0,    a1,    0
   4418    vldx             vr1,    a1,    a2
   4419    add.d            a1,     a1,    t2
   4420    vsllwil.hu.bu    vr2,    vr0,   4
   4421    vsllwil.hu.bu    vr4,    vr1,   4
   4422    vexth.hu.bu      vr3,    vr0
   4423    vexth.hu.bu      vr5,    vr1
   4424    vslli.h          vr3,    vr3,   4
   4425    vslli.h          vr5,    vr5,   4
   4426    vst              vr2,    a0,    0
   4427    vst              vr3,    a0,    16
   4428    vst              vr4,    a0,    32
   4429    vst              vr5,    a0,    48
   4430    addi.d           a0,     a0,    64
   4431    addi.d           a4,     a4,    -2
   4432    bnez             a4,     .l_\lable\()hv0_16w_lsx
   4433    b                .l_\lable\()end_pre_8tap_lsx
   4434 .l_\lable\()hv0_32w_lsx:
   4435 .l_\lable\()hv0_64w_lsx:
   4436 .l_\lable\()hv0_128w_lsx:
   4437    addi.d           t0,     a1,    0
   4438    addi.d           t5,     a4,    0
   4439    srli.w           t7,     a3,    4
   4440    slli.w           t7,     t7,    5
   4441    addi.d           t8,     a0,    0
   4442 .l_\lable\()hv0_16_loop_lsx:
   4443    vld              vr0,    a1,    0
   4444    vldx             vr1,    a1,    a2
   4445    add.d            a1,     a1,    t2
   4446    vsllwil.hu.bu    vr2,    vr0,   4
   4447    vsllwil.hu.bu    vr3,    vr1,   4
   4448    vexth.hu.bu      vr0,    vr0
   4449    vexth.hu.bu      vr1,    vr1
   4450    vslli.h          vr0,    vr0,   4
   4451    vslli.h          vr1,    vr1,   4
   4452    vst              vr2,    a0,    0
   4453    vst              vr0,    a0,    16
   4454    add.d            a0,     a0,    t7
   4455    vst              vr3,    a0,    0
   4456    vst              vr1,    a0,    16
   4457    add.d            a0,     a0,    t7
   4458    addi.d           a4,     a4,    -2
   4459    bnez             a4,     .l_\lable\()hv0_16_loop_lsx
   4460    addi.d           a1,     t0,    16
   4461    addi.d           t0,     t0,    16
   4462    addi.d           a0,     t8,    32
   4463    addi.d           t8,     t8,    32
   4464    addi.d           a4,     t5,    0
   4465    addi.d           a3,     a3,    -16
   4466    bnez             a3,     .l_\lable\()hv0_16_loop_lsx
   4467    b                .l_\lable\()end_pre_8tap_lsx
   4468 .l_\lable\()h_lsx:
   4469    bnez             a6,     .l_\lable\()hv_lsx //if(fh) && if (fv)
   4470 
   4471    andi             t1,     a7,    3
   4472    blt              t0,     a3,    .l_\lable\()h_idx_fh_lsx
   4473    andi             t1,     a7,    1
   4474    addi.w           t1,     t1,    3
   4475 .l_\lable\()h_idx_fh_lsx:
   4476    addi.w           t5,     zero,  120
   4477    mul.w            t1,     t1,    t5
   4478    addi.w           t5,     a5,    -1
   4479    slli.w           t5,     t5,    3
   4480    add.w            t1,     t1,    t5
   4481    add.d            t1,     t6,    t1 //fh's offset
   4482    vldrepl.d        vr23,   t1,    0
   4483 
   4484    addi.d           a1,     a1,    -3
   4485    clz.w            t1,     a3
   4486    li.w             t5,     24
   4487    sub.w            t1,     t1,    t5
   4488    la.local         t5,     .l_\lable\()prep_h_jtable_lsx
   4489    alsl.d           t1,     t1,    t5,   1
   4490    ld.h             t8,     t1,    0
   4491    add.d            t5,     t5,    t8
   4492    jirl             $r0,    t5,    0
   4493 
   4494    .align   3
   4495 .l_\lable\()prep_h_jtable_lsx:
   4496    .hword .l_\lable\()h_128w_lsx - .l_\lable\()prep_h_jtable_lsx
   4497    .hword .l_\lable\()h_64w_lsx  - .l_\lable\()prep_h_jtable_lsx
   4498    .hword .l_\lable\()h_32w_lsx  - .l_\lable\()prep_h_jtable_lsx
   4499    .hword .l_\lable\()h_16w_lsx  - .l_\lable\()prep_h_jtable_lsx
   4500    .hword .l_\lable\()h_8w_lsx   - .l_\lable\()prep_h_jtable_lsx
   4501    .hword .l_\lable\()h_4w_lsx   - .l_\lable\()prep_h_jtable_lsx
   4502 
   4503 .l_\lable\()h_4w_lsx:
   4504    addi.d           a1,     a1,    2
   4505    la.local         t7,     subpel_h_shuf1
   4506    vld              vr7,    t7,    0
   4507    vbsrl.v          vr23,   vr23,  2
   4508    vreplvei.w       vr23,   vr23,  0
   4509 .l_\lable\()h_4w_loop_lsx:
   4510    vld              vr0,    a1,    0
   4511    vldx             vr1,    a1,    a2
   4512    add.d            a1,     a1,    t2
   4513    vshuf.b          vr0,    vr0,   vr0,   vr7
   4514    vshuf.b          vr1,    vr1,   vr1,   vr7
   4515    vmulwev.h.bu.b   vr2,    vr0,   vr23
   4516    vmulwev.h.bu.b   vr3,    vr1,   vr23
   4517    vmaddwod.h.bu.b  vr2,    vr0,   vr23
   4518    vmaddwod.h.bu.b  vr3,    vr1,   vr23
   4519    vhaddw.w.h       vr0,    vr2,   vr2
   4520    vhaddw.w.h       vr1,    vr3,   vr3
   4521    vssrarni.h.w     vr1,    vr0,   2
   4522    vst              vr1,    a0,    0
   4523    addi.d           a0,     a0,    16
   4524    addi.w           a4,     a4,    -2
   4525    bnez             a4,     .l_\lable\()h_4w_loop_lsx
   4526    b                .l_\lable\()end_pre_8tap_lsx
   4527 
   4528 .l_\lable\()h_8w_lsx:
   4529    vreplvei.w       vr22,   vr23,  0 //fh
   4530    vreplvei.w       vr23,   vr23,  1
   4531    la.local         t7,     subpel_h_shuf1
   4532    vld              vr6,    t7,    0
   4533    vaddi.bu         vr7,    vr6,   4
   4534    vaddi.bu         vr8,    vr6,   8
   4535 .l_\lable\()h_8w_loop_lsx:
   4536    vld              vr0,    a1,    0
   4537    vldx             vr1,    a1,    a2
   4538    add.d            a1,     a1,    t2
   4539    PREP_H_8W        vr0
   4540    PREP_H_8W        vr1
   4541    vst              vr0,    a0,    0
   4542    vst              vr1,    a0,    16
   4543    addi.d           a0,     a0,    32
   4544    addi.d           a4,     a4,    -2
   4545    bnez             a4,     .l_\lable\()h_8w_loop_lsx
   4546    b                .l_\lable\()end_pre_8tap_lsx
   4547 
   4548 .l_\lable\()h_16w_lsx:
   4549 .l_\lable\()h_32w_lsx:
   4550 .l_\lable\()h_64w_lsx:
   4551 .l_\lable\()h_128w_lsx:
   4552    vreplvei.w       vr22,   vr23,  0 //fh
   4553    vreplvei.w       vr23,   vr23,  1
   4554    la.local         t7,     subpel_h_shuf1
   4555    vld              vr6,    t7,    0
   4556    vaddi.bu         vr7,    vr6,   4
   4557    vaddi.bu         vr8,    vr6,   8
   4558    srli.w           t7,     a3,    4
   4559    slli.w           t6,     t7,    5
   4560 .l_\lable\()h_16w_loop0_lsx:
   4561    addi.d           t0,     a1,    0 //src
   4562    addi.d           t5,     a4,    0 //h
   4563    addi.d           t8,     a0,    0 //dst
   4564 .l_\lable\()h_16w_loop_lsx:
   4565    vld              vr0,    a1,    0
   4566    vld              vr1,    a1,    8
   4567    add.d            a1,     a1,    a2
   4568    PREP_H_8W        vr0
   4569    PREP_H_8W        vr1
   4570    vst              vr0,    a0,    0
   4571    vst              vr1,    a0,    16
   4572    add.d            a0,     a0,    t6
   4573    addi.d           t5,     t5,    -1
   4574    bnez             t5,     .l_\lable\()h_16w_loop_lsx
   4575    addi.d           a1,     t0,    16
   4576    addi.d           a0,     t8,    32
   4577    addi.w           t7,     t7,    -1
   4578    bnez             t7,     .l_\lable\()h_16w_loop0_lsx
   4579    b                .l_\lable\()end_pre_8tap_lsx
   4580 
   4581 .l_\lable\()hv_lsx:
   4582    andi             t1,     a7,    3
   4583    blt              t0,     a3,    .l_\lable\()hv_idx_fh_lsx
   4584    andi             t1,     a7,    1
   4585    addi.w           t1,     t1,    3
   4586 .l_\lable\()hv_idx_fh_lsx:
   4587    addi.w           t5,     zero,  120
   4588    mul.w            t1,     t1,    t5
   4589    addi.w           t5,     a5,    -1
   4590    slli.w           t5,     t5,    3
   4591    add.w            t1,     t1,    t5
   4592    add.d            t1,     t6,    t1 //fh's offset
   4593    vldrepl.d        vr8,    t1,    0
   4594    srli.w           a7,     a7,    2
   4595    blt              t0,     a4,    .l_\lable\()hv_idx_fv_lsx
   4596    andi             a7,     a7,    1
   4597    addi.w           a7,     a7,    3
   4598 .l_\lable\()hv_idx_fv_lsx:
   4599    addi.w           t5,     zero,  120
   4600    mul.w            a7,     a7,    t5
   4601    addi.w           t5,     a6,    -1
   4602    slli.w           t5,     t5,    3
   4603    add.w            a7,     a7,    t5
   4604    add.d            a7,     t6,    a7 //fv's offset
   4605    vldrepl.d        vr9,    a7,    0
   4606    vsllwil.h.b      vr9,    vr9,   0
   4607 
   4608    sub.d            a1,     a1,    t3
   4609    addi.d           a1,     a1,    -3
   4610    beq              a3,     t0,    .l_\lable\()hv_4w_lsx
   4611    b                .l_\lable\()hv_8w_lsx
   4612 .l_\lable\()hv_4w_lsx:
   4613    addi.d           a1,     a1,    2 //ignore leading 0s
   4614    vld              vr0,    a1,    0
   4615    vldx             vr1,    a1,    a2
   4616    vldx             vr2,    a1,    t2
   4617    add.d            a1,     a1,    t3
   4618    vld              vr3,    a1,    0
   4619    vldx             vr4,    a1,    a2
   4620    vldx             vr5,    a1,    t2
   4621    vldx             vr6,    a1,    t3
   4622    add.d            a1,     a1,    t4
   4623 
   4624    la.local         t1,     subpel_h_shuf1
   4625    vld              vr7,    t1,    0
   4626    vbsrl.v          vr8,    vr8,   2
   4627    vreplvei.w       vr8,    vr8,   0
   4628 
   4629    //fv
   4630    vreplvei.w       vr17,   vr9,   0
   4631    vreplvei.w       vr18,   vr9,   1
   4632    vreplvei.w       vr19,   vr9,   2
   4633    vreplvei.w       vr20,   vr9,   3
   4634 
   4635    //DAV1D_FILTER_8TAP_RND
   4636    vshuf.b          vr0,    vr0,   vr0,  vr7
   4637    vshuf.b          vr1,    vr1,   vr1,  vr7
   4638    vshuf.b          vr2,    vr2,   vr2,  vr7
   4639    vshuf.b          vr3,    vr3,   vr3,  vr7
   4640    vshuf.b          vr4,    vr4,   vr4,  vr7
   4641    vshuf.b          vr5,    vr5,   vr5,  vr7
   4642    vshuf.b          vr6,    vr6,   vr6,  vr7
   4643 
   4644    vmulwev.h.bu.b   vr10,   vr0,   vr8
   4645    vmulwev.h.bu.b   vr11,   vr1,   vr8
   4646    vmulwev.h.bu.b   vr12,   vr2,   vr8
   4647    vmulwev.h.bu.b   vr13,   vr3,   vr8
   4648    vmulwev.h.bu.b   vr14,   vr4,   vr8
   4649    vmulwev.h.bu.b   vr15,   vr5,   vr8
   4650    vmulwev.h.bu.b   vr16,   vr6,   vr8
   4651    vmaddwod.h.bu.b  vr10,   vr0,   vr8
   4652    vmaddwod.h.bu.b  vr11,   vr1,   vr8
   4653    vmaddwod.h.bu.b  vr12,   vr2,   vr8
   4654    vmaddwod.h.bu.b  vr13,   vr3,   vr8
   4655    vmaddwod.h.bu.b  vr14,   vr4,   vr8
   4656    vmaddwod.h.bu.b  vr15,   vr5,   vr8
   4657    vmaddwod.h.bu.b  vr16,   vr6,   vr8
   4658 
   4659    vhaddw.w.h       vr10,   vr10,  vr10
   4660    vhaddw.w.h       vr11,   vr11,  vr11
   4661    vhaddw.w.h       vr12,   vr12,  vr12
   4662    vhaddw.w.h       vr13,   vr13,  vr13
   4663    vhaddw.w.h       vr14,   vr14,  vr14
   4664    vhaddw.w.h       vr15,   vr15,  vr15
   4665    vhaddw.w.h       vr16,   vr16,  vr16
   4666 
   4667    vssrarni.h.w     vr10,   vr10,  2 //h0
   4668    vssrarni.h.w     vr11,   vr11,  2 //h1
   4669    vssrarni.h.w     vr12,   vr12,  2 //h2
   4670    vssrarni.h.w     vr13,   vr13,  2 //h3
   4671    vssrarni.h.w     vr14,   vr14,  2 //h4
   4672    vssrarni.h.w     vr15,   vr15,  2 //h5
   4673    vssrarni.h.w     vr16,   vr16,  2 //h6
   4674 
   4675    //h0
   4676    vilvl.h          vr0,    vr11,  vr10 //01
   4677    vilvl.h          vr1,    vr13,  vr12 //23
   4678    vilvl.h          vr2,    vr15,  vr14 //45
   4679    //h1
   4680    vilvl.h          vr4,    vr12,  vr11 //12
   4681    vilvl.h          vr5,    vr14,  vr13 //34
   4682    vilvl.h          vr6,    vr16,  vr15 //56
   4683 
   4684 .l_\lable\()hv_w4_loop_lsx:
   4685    vld              vr9,    a1,    0
   4686    vldx             vr10,   a1,    a2
   4687    add.d            a1,     a1,    t2
   4688 
   4689    //DAV1D_FILTER_8TAP_CLIP
   4690    vshuf.b          vr9,    vr9,   vr9,  vr7
   4691    vshuf.b          vr10,   vr10,  vr10, vr7
   4692    vmulwev.h.bu.b   vr11,   vr9,   vr8
   4693    vmulwev.h.bu.b   vr12,   vr10,  vr8
   4694    vmaddwod.h.bu.b  vr11,   vr9,   vr8
   4695    vmaddwod.h.bu.b  vr12,   vr10,  vr8
   4696    vhaddw.w.h       vr11,   vr11,  vr11
   4697    vhaddw.w.h       vr12,   vr12,  vr12
   4698    vssrarni.h.w     vr11,   vr11,  2 //7h
   4699    vssrarni.h.w     vr12,   vr12,  2 //h8
   4700    vilvl.h          vr3,    vr11,  vr16 //67
   4701    vilvl.h          vr13,   vr12,  vr11 //78
   4702 
   4703    vmulwev.w.h      vr9,    vr0,   vr17
   4704    vmulwev.w.h      vr10,   vr1,   vr18
   4705    vmulwev.w.h      vr14,   vr2,   vr19
   4706    vmulwev.w.h      vr15,   vr3,   vr20
   4707    vmaddwod.w.h     vr9,    vr0,   vr17
   4708    vmaddwod.w.h     vr10,   vr1,   vr18
   4709    vmaddwod.w.h     vr14,   vr2,   vr19
   4710    vmaddwod.w.h     vr15,   vr3,   vr20
   4711    vadd.w           vr16,   vr9,   vr10
   4712    vadd.w           vr16,   vr16,  vr14
   4713    vadd.w           vr16,   vr16,  vr15
   4714 
   4715    vmulwev.w.h      vr9,    vr4,   vr17
   4716    vmulwev.w.h      vr10,   vr5,   vr18
   4717    vmulwev.w.h      vr14,   vr6,   vr19
   4718    vmulwev.w.h      vr15,   vr13,  vr20
   4719    vmaddwod.w.h     vr9,    vr4,   vr17
   4720    vmaddwod.w.h     vr10,   vr5,   vr18
   4721    vmaddwod.w.h     vr14,   vr6,   vr19
   4722    vmaddwod.w.h     vr15,   vr13,  vr20
   4723    vadd.w           vr21,   vr9,   vr10
   4724    vadd.w           vr21,   vr21,  vr14
   4725    vadd.w           vr21,   vr21,  vr15
   4726 
   4727    vssrarni.h.w     vr21,   vr16,  6
   4728    //cache
   4729    vaddi.hu         vr0,    vr1,   0
   4730    vaddi.hu         vr1,    vr2,   0
   4731    vaddi.hu         vr2,    vr3,   0
   4732    vaddi.hu         vr4,    vr5,   0
   4733    vaddi.hu         vr5,    vr6,   0
   4734    vaddi.hu         vr6,    vr13,  0
   4735    vaddi.hu         vr16,   vr12,  0
   4736 
   4737    vst              vr21,   a0,    0
   4738    addi.d           a0,     a0,    16
   4739    addi.d           a4,     a4,    -2
   4740    bnez             a4,     .l_\lable\()hv_w4_loop_lsx
   4741    b                .l_\lable\()end_pre_8tap_lsx
   4742 
   4743 .l_\lable\()hv_8w_lsx:
   4744 .l_\lable\()hv_16w_lsx:
   4745 .l_\lable\()hv_32w_lsx:
   4746 .l_\lable\()hv_64w_lsx:
   4747 .l_\lable\()hv_128w_lsx:
   4748    addi.d          sp,      sp,    -8*8
   4749    fst.d           f24,     sp,    0
   4750    fst.d           f25,     sp,    8
   4751    fst.d           f26,     sp,    16
   4752    fst.d           f27,     sp,    24
   4753    fst.d           f28,     sp,    32
   4754    fst.d           f29,     sp,    40
   4755    fst.d           f30,     sp,    48
   4756    fst.d           f31,     sp,    56
   4757    addi.d          t0,      a1,    0 //src
   4758    addi.d          t5,      a4,    0 //h
   4759    addi.d          t8,      a0,    0 //dst
   4760    slli.w          t6,      a3,    1
   4761    la.local        t1,      subpel_h_shuf1
   4762    vld             vr7,     t1,    0
   4763    vaddi.bu        vr11,    vr7,   4
   4764    vaddi.bu        vr12,    vr7,   8
   4765    vreplvei.w      vr10,    vr8,   1
   4766    vreplvei.w      vr8,     vr8,   0
   4767    vreplvei.w      vr20,    vr9,   1
   4768    vreplvei.w      vr21,    vr9,   2
   4769    vreplvei.w      vr22,    vr9,   3
   4770    vreplvei.w      vr9,     vr9,   0
   4771 .l_\lable\()prep_hv_8w_loop0_lsx:
   4772    vld             vr0,     a1,    0
   4773    vldx            vr1,     a1,    a2
   4774    vldx            vr2,     a1,    t2
   4775    add.d           a1,      a1,    t3
   4776    vld             vr3,     a1,    0
   4777    vldx            vr4,     a1,    a2
   4778    vldx            vr5,     a1,    t2
   4779    vldx            vr6,     a1,    t3
   4780    add.d           a1,      a1,    t4
   4781 
   4782    FILTER_8TAP_8W  vr0 //h0
   4783    FILTER_8TAP_8W  vr1 //h1
   4784    FILTER_8TAP_8W  vr2 //h2
   4785    FILTER_8TAP_8W  vr3 //h3
   4786    FILTER_8TAP_8W  vr4 //h4
   4787    FILTER_8TAP_8W  vr5 //h5
   4788    FILTER_8TAP_8W  vr6 //h6
   4789 
   4790    //h0' low part
   4791    vilvl.h         vr23,    vr1,   vr0 //01
   4792    vilvl.h         vr24,    vr3,   vr2 //23
   4793    vilvl.h         vr25,    vr5,   vr4 //45
   4794    //h0' high part
   4795    vilvh.h         vr26,    vr1,   vr0 //01
   4796    vilvh.h         vr27,    vr3,   vr2 //23
   4797    vilvh.h         vr28,    vr5,   vr4 //45
   4798 
   4799    //h1' low part
   4800    vilvl.h         vr29,    vr2,   vr1 //12
   4801    vilvl.h         vr30,    vr4,   vr3 //34
   4802    vilvl.h         vr31,    vr6,   vr5 //56
   4803    //h1' high part
   4804    vilvh.h         vr0,     vr2,   vr1 //12
   4805    vilvh.h         vr1,     vr4,   vr3 //34
   4806    vilvh.h         vr2,     vr6,   vr5 //56
   4807 
   4808 .l_\lable\()prep_hv_8w_loop_lsx:
   4809    vld             vr3,     a1,    0
   4810    vldx            vr4,     a1,    a2
   4811    add.d           a1,      a1,    t2
   4812 
   4813    FILTER_8TAP_8W  vr3 //h7
   4814    FILTER_8TAP_8W  vr4 //h8
   4815 
   4816    //h0' low part
   4817    vilvl.h         vr16,    vr3,   vr6 //67 ~low
   4818    vmulwev.w.h     vr13,    vr23,  vr9
   4819    vmulwev.w.h     vr14,    vr24,  vr20
   4820    vmulwev.w.h     vr15,    vr25,  vr21
   4821    vmulwev.w.h     vr17,    vr16,  vr22
   4822    vmaddwod.w.h    vr13,    vr23,  vr9
   4823    vmaddwod.w.h    vr14,    vr24,  vr20
   4824    vmaddwod.w.h    vr15,    vr25,  vr21
   4825    vmaddwod.w.h    vr17,    vr16,  vr22
   4826    vadd.w          vr13,    vr13,  vr14
   4827    vadd.w          vr13,    vr13,  vr15
   4828    vadd.w          vr13,    vr13,  vr17
   4829    //cache
   4830    vaddi.hu        vr23,    vr24,  0
   4831    vaddi.hu        vr24,    vr25,  0
   4832    vaddi.hu        vr25,    vr16,  0
   4833 
   4834    //h0' high part
   4835    vilvh.h         vr17,    vr3,   vr6 //67 ~high
   4836    vmulwev.w.h     vr14,    vr26,  vr9
   4837    vmulwev.w.h     vr15,    vr27,  vr20
   4838    vmulwev.w.h     vr16,    vr28,  vr21
   4839    vmulwev.w.h     vr18,    vr17,  vr22
   4840    vmaddwod.w.h    vr14,    vr26,  vr9
   4841    vmaddwod.w.h    vr15,    vr27,  vr20
   4842    vmaddwod.w.h    vr16,    vr28,  vr21
   4843    vmaddwod.w.h    vr18,    vr17,  vr22
   4844    vadd.w          vr14,    vr14,  vr15
   4845    vadd.w          vr14,    vr14,  vr16
   4846    vadd.w          vr14,    vr14,  vr18
   4847    vssrarni.h.w    vr14,    vr13,  6
   4848    vst             vr14,    a0,    0
   4849    add.d           a0,      a0,    t6
   4850    //cache
   4851    vaddi.hu        vr26,    vr27,  0
   4852    vaddi.hu        vr27,    vr28,  0
   4853    vaddi.hu        vr28,    vr17,  0
   4854    vaddi.hu        vr6,     vr4,   0
   4855 
   4856    vilvl.h         vr5,     vr4,   vr3 //78 ~low
   4857    vilvh.h         vr4,     vr4,   vr3 //78 ~high
   4858 
   4859    //h1' low part
   4860    vmulwev.w.h     vr13,    vr29,  vr9
   4861    vmulwev.w.h     vr14,    vr30,  vr20
   4862    vmulwev.w.h     vr15,    vr31,  vr21
   4863    vmulwev.w.h     vr16,    vr5,   vr22
   4864    vmaddwod.w.h    vr13,    vr29,  vr9
   4865    vmaddwod.w.h    vr14,    vr30,  vr20
   4866    vmaddwod.w.h    vr15,    vr31,  vr21
   4867    vmaddwod.w.h    vr16,    vr5,   vr22
   4868    vadd.w          vr13,    vr13,  vr14
   4869    vadd.w          vr13,    vr13,  vr15
   4870    vadd.w          vr13,    vr13,  vr16
   4871    //cache
   4872    vaddi.hu        vr29,    vr30,  0
   4873    vaddi.hu        vr30,    vr31,  0
   4874    vaddi.hu        vr31,    vr5,   0
   4875 
   4876    //h1' high part
   4877    vmulwev.w.h     vr14,    vr0,   vr9
   4878    vmulwev.w.h     vr15,    vr1,   vr20
   4879    vmulwev.w.h     vr16,    vr2,   vr21
   4880    vmulwev.w.h     vr17,    vr4,   vr22
   4881    vmaddwod.w.h    vr14,    vr0,   vr9
   4882    vmaddwod.w.h    vr15,    vr1,   vr20
   4883    vmaddwod.w.h    vr16,    vr2,   vr21
   4884    vmaddwod.w.h    vr17,    vr4,   vr22
   4885    vadd.w          vr14,    vr14,  vr15
   4886    vadd.w          vr14,    vr14,  vr16
   4887    vadd.w          vr14,    vr14,  vr17
   4888    vssrarni.h.w    vr14,    vr13,  6
   4889    vst             vr14,    a0,    0
   4890    add.d           a0,      a0,    t6
   4891    //cache
   4892    vaddi.hu        vr0,     vr1,   0
   4893    vaddi.hu        vr1,     vr2,   0
   4894    vaddi.hu        vr2,     vr4,   0
   4895    addi.w          a4,      a4,    -2
   4896    bnez            a4,      .l_\lable\()prep_hv_8w_loop_lsx
   4897    addi.d          a1,      t0,    8
   4898    addi.d          t0,      t0,    8
   4899    addi.d          a0,      t8,    16
   4900    addi.d          t8,      t8,    16
   4901    addi.d          a4,      t5,    0
   4902    addi.w          a3,      a3,    -8
   4903    bnez            a3,      .l_\lable\()prep_hv_8w_loop0_lsx
   4904    fld.d           f24,     sp,    0
   4905    fld.d           f25,     sp,    8
   4906    fld.d           f26,     sp,    16
   4907    fld.d           f27,     sp,    24
   4908    fld.d           f28,     sp,    32
   4909    fld.d           f29,     sp,    40
   4910    fld.d           f30,     sp,    48
   4911    fld.d           f31,     sp,    56
   4912    addi.d          sp,      sp,    8*8
   4913    b                .l_\lable\()end_pre_8tap_lsx
   4914 
   4915 .l_\lable\()v_lsx:
   4916    srli.w           a7,    a7,     2
   4917    blt              t0,    a4,     .l_\lable\()v_idx_fv_lsx
   4918    andi             a7,    a7,     1
   4919    addi.w           a7,    a7,     3
   4920 .l_\lable\()v_idx_fv_lsx:
   4921    addi.w           t5,     zero,  120
   4922    mul.w            a7,     a7,    t5
   4923    addi.w           t5,     a6,    -1
   4924    slli.w           t5,     t5,    3
   4925    add.w            a7,     a7,    t5
   4926    add.d            a7,     t6,    a7 //fv's offset
   4927    vldrepl.d        vr8,    a7,    0
   4928 
   4929    vilvl.h          vr8,    vr8,   vr8
   4930    vreplvei.w       vr9,    vr8,   1
   4931    vreplvei.w       vr10,   vr8,   2
   4932    vreplvei.w       vr11,   vr8,   3
   4933    vreplvei.w       vr8,    vr8,   0
   4934 
   4935    sub.d            a1,     a1,    t3
   4936    beq              a3,     t0,    .l_\lable\()v_4w_lsx
   4937    blt              t0,     a3,    .l_\lable\()v_8w_lsx
   4938 .l_\lable\()v_4w_lsx:
   4939    fld.s            f0,     a1,    0
   4940    fldx.s           f1,     a1,    a2
   4941    fldx.s           f2,     a1,    t2
   4942    add.d            a1,     a1,    t3
   4943    fld.s            f3,     a1,    0
   4944    fldx.s           f4,     a1,    a2
   4945    fldx.s           f5,     a1,    t2
   4946    fldx.s           f6,     a1,    t3
   4947    add.d            a1,     a1,    t4
   4948 
   4949    vilvl.w          vr0,    vr1,   vr0
   4950    vilvl.w          vr1,    vr2,   vr1
   4951    vilvl.b          vr0,    vr1,   vr0 //0 1 1 2
   4952    vilvl.w          vr1,    vr3,   vr2
   4953    vilvl.w          vr2,    vr4,   vr3
   4954    vilvl.b          vr1,    vr2,   vr1 //2 3 3 4
   4955    vilvl.w          vr2,    vr5,   vr4
   4956    vilvl.w          vr3,    vr6,   vr5
   4957    vilvl.b          vr2,    vr3,   vr2 //4 5 5 6
   4958 .l_\lable\()v_4w_loop_lsx:
   4959    fld.s            f7,     a1,     0
   4960 
   4961    vilvl.w          vr3,    vr7,   vr6
   4962    fldx.s           f6,     a1,    a2
   4963    add.d            a1,     a1,    t2
   4964    vilvl.w          vr4,    vr6,   vr7
   4965    vilvl.b          vr3,    vr4,   vr3 //6 7 7 8
   4966 
   4967    vmulwev.h.bu.b   vr12,   vr0,   vr8
   4968    vmulwev.h.bu.b   vr13,   vr1,   vr9
   4969    vmulwev.h.bu.b   vr14,   vr2,   vr10
   4970    vmulwev.h.bu.b   vr15,   vr3,   vr11
   4971    vmaddwod.h.bu.b  vr12,   vr0,   vr8
   4972    vmaddwod.h.bu.b  vr13,   vr1,   vr9
   4973    vmaddwod.h.bu.b  vr14,   vr2,   vr10
   4974    vmaddwod.h.bu.b  vr15,   vr3,   vr11
   4975    vaddi.hu         vr0,    vr1,   0
   4976    vaddi.hu         vr1,    vr2,   0
   4977    vaddi.hu         vr2,    vr3,   0
   4978    vadd.h           vr12,   vr12,  vr13
   4979    vadd.h           vr12,   vr12,  vr14
   4980    vadd.h           vr12,   vr12,  vr15
   4981 
   4982    vsrari.h         vr12,   vr12,  2
   4983    vst              vr12,   a0,    0
   4984    addi.d           a0,     a0,    16
   4985    addi.w           a4,     a4,    -2
   4986    bnez             a4,     .l_\lable\()v_4w_loop_lsx
   4987    b                .l_\lable\()end_pre_8tap_lsx
   4988 
   4989 .l_\lable\()v_8w_lsx:
   4990    addi.d           t0,     a1,    0
   4991    addi.d           t5,     a4,    0
   4992    addi.d           t8,     a0,    0
   4993    slli.w           t6,     a3,    1
   4994 .l_\lable\()v_8w_loop0_lsx:
   4995    fld.d            f0,     a1,    0
   4996    fldx.d           f1,     a1,    a2
   4997    fldx.d           f2,     a1,    t2
   4998    add.d            a1,     a1,    t3
   4999    fld.d            f3,     a1,    0
   5000    fldx.d           f4,     a1,    a2
   5001    fldx.d           f5,     a1,    t2
   5002    fldx.d           f6,     a1,    t3
   5003    add.d            a1,     a1,    t4
   5004 
   5005    vilvl.b          vr0,    vr1,   vr0 //0 1
   5006    vilvl.b          vr1,    vr2,   vr1 //1 2
   5007    vilvl.b          vr2,    vr3,   vr2 //2 3
   5008    vilvl.b          vr3,    vr4,   vr3 //3 4
   5009    vilvl.b          vr4,    vr5,   vr4 //4 5
   5010    vilvl.b          vr5,    vr6,   vr5 //5 6
   5011 .l_\lable\()v_8w_loop_lsx:
   5012    fld.d            f7,     a1,    0
   5013    vilvl.b          vr12,   vr7,   vr6 //6 7
   5014    fldx.d           f6,     a1,    a2
   5015    add.d            a1,     a1,    t2
   5016    vilvl.b          vr13,   vr6,   vr7 //7 8
   5017 
   5018    vmulwev.h.bu.b   vr14,   vr0,   vr8
   5019    vmulwev.h.bu.b   vr15,   vr1,   vr8
   5020    vmulwev.h.bu.b   vr16,   vr2,   vr9
   5021    vmulwev.h.bu.b   vr17,   vr3,   vr9
   5022    vmulwev.h.bu.b   vr18,   vr4,   vr10
   5023    vmulwev.h.bu.b   vr19,   vr5,   vr10
   5024    vmulwev.h.bu.b   vr20,   vr12,  vr11
   5025    vmulwev.h.bu.b   vr21,   vr13,  vr11
   5026    vmaddwod.h.bu.b  vr14,   vr0,   vr8
   5027    vmaddwod.h.bu.b  vr15,   vr1,   vr8
   5028    vmaddwod.h.bu.b  vr16,   vr2,   vr9
   5029    vmaddwod.h.bu.b  vr17,   vr3,   vr9
   5030    vmaddwod.h.bu.b  vr18,   vr4,   vr10
   5031    vmaddwod.h.bu.b  vr19,   vr5,   vr10
   5032    vmaddwod.h.bu.b  vr20,   vr12,  vr11
   5033    vmaddwod.h.bu.b  vr21,   vr13,  vr11
   5034 
   5035    vaddi.hu         vr0,    vr2,   0
   5036    vaddi.hu         vr1,    vr3,   0
   5037    vaddi.hu         vr2,    vr4,   0
   5038    vaddi.hu         vr3,    vr5,   0
   5039    vaddi.hu         vr4,    vr12,  0
   5040    vaddi.hu         vr5,    vr13,  0
   5041    vadd.h           vr14,   vr14,  vr16
   5042    vadd.h           vr14,   vr14,  vr18
   5043    vadd.h           vr14,   vr14,  vr20
   5044    vadd.h           vr15,   vr15,  vr17
   5045    vadd.h           vr15,   vr15,  vr19
   5046    vadd.h           vr15,   vr15,  vr21
   5047 
   5048    vsrari.h         vr14,   vr14,  2
   5049    vsrari.h         vr15,   vr15,  2
   5050    vst              vr14,   a0,    0
   5051    add.d            a0,     a0,    t6
   5052    vst              vr15,   a0,    0
   5053    add.d            a0,     a0,    t6
   5054    addi.w           a4,     a4,    -2
   5055    bnez             a4,     .l_\lable\()v_8w_loop_lsx
   5056    addi.d           a1,     t0,    8
   5057    addi.d           t0,     t0,    8
   5058    addi.d           a0,     t8,    16
   5059    addi.d           t8,     t8,    16
   5060    addi.d           a4,     t5,    0
   5061    addi.d           a3,     a3,    -8
   5062    bnez             a3,     .l_\lable\()v_8w_loop0_lsx
   5063 .l_\lable\()end_pre_8tap_lsx:
   5064 .endm
   5065 
   5066 function prep_8tap_regular_8bpc_lsx
   5067    addi.w a7, zero, 0
   5068    PREP_8TAP_8BPC_LSX 0
   5069 endfunc
   5070 
   5071 function prep_8tap_smooth_regular_8bpc_lsx
   5072    addi.w a7, zero, 1
   5073    PREP_8TAP_8BPC_LSX 1
   5074 endfunc
   5075 
   5076 function prep_8tap_sharp_regular_8bpc_lsx
   5077    addi.w a7, zero, 2
   5078    PREP_8TAP_8BPC_LSX 2
   5079 endfunc
   5080 
   5081 function prep_8tap_regular_smooth_8bpc_lsx
   5082    addi.w a7, zero, 4
   5083    PREP_8TAP_8BPC_LSX 4
   5084 endfunc
   5085 
   5086 function prep_8tap_smooth_8bpc_lsx
   5087    addi.w a7, zero, 5
   5088    PREP_8TAP_8BPC_LSX 5
   5089 endfunc
   5090 
   5091 function prep_8tap_sharp_smooth_8bpc_lsx
   5092    addi.w a7, zero, 6
   5093    PREP_8TAP_8BPC_LSX 6
   5094 endfunc
   5095 
   5096 function prep_8tap_regular_sharp_8bpc_lsx
   5097    addi.w a7, zero, 8
   5098    PREP_8TAP_8BPC_LSX 8
   5099 endfunc
   5100 
   5101 function prep_8tap_smooth_sharp_8bpc_lsx
   5102    addi.w a7, zero, 9
   5103    PREP_8TAP_8BPC_LSX 9
   5104 endfunc
   5105 
   5106 function prep_8tap_sharp_8bpc_lsx
   5107    addi.w a7, zero, 10
   5108    PREP_8TAP_8BPC_LSX 10
   5109 endfunc
   5110 
   5111 /*
   5112 * static void blend_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
   5113                         const int w, int h, const uint8_t *mask)
   5114 */
/*
 * blend_8bpc_lsx: per-pixel blend of tmp into dst with an 8-bit mask:
 *     dst[x] = (tmp[x] * mask[x] + dst[x] * (64 - mask[x]) + 32) >> 6
 * In:  a0 = dst, a1 = dst_stride, a2 = tmp, a3 = w, a4 = h, a5 = mask
 * Width is a power of two in {4, 8, 16, 32}; one loop per width, selected
 * through a computed jump table indexed by clz(w).
 */
function blend_8bpc_lsx
   addi.d        t8,     zero,    64
   vreplgr2vr.b  vr23,   t8            // vr23 = {64,64,...}: mask weights sum to 64

   // Jump-table dispatch: clz(32)=26 -> index 0, clz(16)=27 -> 1, etc.
   clz.w         t0,     a3
   li.w          t1,     26
   sub.w         t0,     t0,      t1
   la.local      t1,     .BLEND_LSX_JRTABLE
   alsl.d        t0,     t0,      t1,    1
   ld.h          t2,     t0,      0  // The jump addresses are relative to JRTABLE
   add.d         t1,     t1,      t2 // Get absolute address
   jirl          $r0,    t1,      0

   .align   3
.BLEND_LSX_JRTABLE:
   .hword .BLEND_W32_LSX  - .BLEND_LSX_JRTABLE
   .hword .BLEND_W16_LSX  - .BLEND_LSX_JRTABLE
   .hword .BLEND_W8_LSX   - .BLEND_LSX_JRTABLE
   .hword .BLEND_W4_LSX   - .BLEND_LSX_JRTABLE

.BLEND_W4_LSX:
   // One row of 4 pixels per iteration (loads overread; only 4 bytes stored).
   vld             vr0,    a0,      0    // vr0 = dst row
   vld             vr1,    a2,      0    // vr1 = tmp row
   vld             vr2,    a5,      0    // vr2 = mask row

   vsllwil.hu.bu   vr1,    vr1,     0    // widen tmp  u8 -> u16 (low half)
   vsllwil.hu.bu   vr4,    vr2,     0    // widen mask u8 -> u16
   vmul.h          vr1,    vr1,     vr4  //b*m
   vsub.b          vr3,    vr23,    vr2  // vr3 = 64 - mask
   vsllwil.hu.bu   vr0,    vr0,     0    // widen dst u8 -> u16
   vsllwil.hu.bu   vr3,    vr3,     0
   vmadd.h         vr1,    vr0,     vr3  // += dst * (64 - mask)
   vssrarni.bu.h   vr1,    vr1,     6    // round-shift by 6, saturate to u8

   vstelm.w        vr1,    a0,      0,   0
   addi.w          a4,     a4,      -1
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      4
   addi.d          a5,     a5,      4

   blt             zero,   a4,     .BLEND_W4_LSX
   b              .BLEND_END_LSX
.BLEND_W8_LSX:
   // One row of 8 pixels per iteration; same math as W4.
   vld             vr0,    a0,      0
   vld             vr1,    a2,      0
   vld             vr2,    a5,      0

   vsllwil.hu.bu   vr1,    vr1,     0
   vsllwil.hu.bu   vr4,    vr2,     0
   vmul.h          vr1,    vr1,     vr4  //b*m
   vsub.b          vr3,    vr23,    vr2
   vsllwil.hu.bu   vr0,    vr0,     0
   vsllwil.hu.bu   vr3,    vr3,     0
   vmadd.h         vr1,    vr0,     vr3
   vssrarni.bu.h   vr1,    vr1,     6

   vstelm.d        vr1,    a0,      0,   0
   addi.w          a4,     a4,      -1
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      8
   addi.d          a5,     a5,      8

   blt             zero,   a4,     .BLEND_W8_LSX
   b               .BLEND_END_LSX
.BLEND_W16_LSX:
   // One row of 16 pixels: process low 8 lanes (vsllwil) and high 8 lanes
   // (vexth) as two u16 halves, then narrow both back with one vssrarni.
   vld             vr0,    a0,      0
   vld             vr1,    a2,      0
   vld             vr2,    a5,      0

   vexth.hu.bu     vr5,    vr1          // tmp  high 8 -> u16
   vsllwil.hu.bu   vr1,    vr1,     0   // tmp  low  8 -> u16
   vexth.hu.bu     vr6,    vr2          // mask high 8 -> u16
   vsllwil.hu.bu   vr4,    vr2,     0   // mask low  8 -> u16
   vmul.h          vr1,    vr1,     vr4  //b*m
   vmul.h          vr5,    vr5,     vr6  //b*m
   vsub.b          vr3,    vr23,    vr2  // 64 - mask
   vexth.hu.bu     vr7,    vr0
   vexth.hu.bu     vr8,    vr3
   vmadd.h         vr5,    vr7,     vr8  // high: += dst * (64 - mask)
   vsllwil.hu.bu   vr0,    vr0,     0
   vsllwil.hu.bu   vr3,    vr3,     0
   vmadd.h         vr1,    vr0,     vr3  // low:  += dst * (64 - mask)
   vssrarni.bu.h   vr5,    vr1,     6    // narrow both halves into vr5

   vst             vr5,    a0,      0
   addi.w          a4,     a4,      -1
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      16
   addi.d          a5,     a5,      16

   blt             zero,   a4,     .BLEND_W16_LSX
   b               .BLEND_END_LSX
.BLEND_W32_LSX:
   // One row of 32 pixels = two 16-pixel halves, each handled like W16.
   vld             vr0,    a0,      0
   vld             vr1,    a2,      0
   vld             vr2,    a5,      0

   vexth.hu.bu     vr5,    vr1
   vsllwil.hu.bu   vr1,    vr1,     0
   vexth.hu.bu     vr6,    vr2
   vsllwil.hu.bu   vr4,    vr2,     0
   vmul.h          vr1,    vr1,     vr4  //b*m
   vmul.h          vr5,    vr5,     vr6  //b*m
   vsub.b          vr3,    vr23,    vr2
   vexth.hu.bu     vr7,    vr0
   vexth.hu.bu     vr8,    vr3
   vmadd.h         vr5,    vr7,     vr8
   vsllwil.hu.bu   vr0,    vr0,     0
   vsllwil.hu.bu   vr3,    vr3,     0
   vmadd.h         vr1,    vr0,     vr3
   vssrarni.bu.h   vr5,    vr1,     6

   vst             vr5,    a0,      0

   /* second 16 pixels of the row */
   vld             vr0,    a0,      16
   vld             vr1,    a2,      16
   vld             vr2,    a5,      16

   vexth.hu.bu     vr5,    vr1
   vsllwil.hu.bu   vr1,    vr1,     0
   vexth.hu.bu     vr6,    vr2
   vsllwil.hu.bu   vr4,    vr2,     0
   vmul.h          vr1,    vr1,     vr4  //b*m
   vmul.h          vr5,    vr5,     vr6  //b*m
   vsub.b          vr3,    vr23,    vr2
   vexth.hu.bu     vr7,    vr0
   vexth.hu.bu     vr8,    vr3
   vmadd.h         vr5,    vr7,     vr8
   vsllwil.hu.bu   vr0,    vr0,     0
   vsllwil.hu.bu   vr3,    vr3,     0
   vmadd.h         vr1,    vr0,     vr3
   vssrarni.bu.h   vr5,    vr1,     6

   vst             vr5,    a0,      16
   addi.w          a4,     a4,      -1
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      32
   addi.d          a5,     a5,      32

   blt             zero,   a4,     .BLEND_W32_LSX
.BLEND_END_LSX:

endfunc
   5259 
/*
 * OBMC blending weight table. Each row (for block size 2/4/8/16/32) stores
 * interleaved (dst_weight, tmp_weight) byte pairs; each pair sums to 64,
 * matching the >> 6 normalization used by blend_v/blend_h. Row for size N
 * starts at byte offset N*2 (alsl.d/ld.bu in the consumers below).
 */
const obmc_masks_la
/* Unused */
.byte 0,  0,  0,  0
/* 2 */
.byte 45, 19, 64, 0
/* 4 */
.byte 39, 25, 50, 14, 59,  5, 64,  0
/* 8 */
.byte 36, 28, 42, 22, 48, 16, 53, 11, 57,  7, 61,  3, 64,  0, 64,  0
/* 16 */
.byte 34, 30, 37, 27, 40, 24, 43, 21, 46, 18, 49, 15, 52, 12, 54, 10
.byte 56,  8, 58,  6, 60,  4, 61,  3, 64,  0, 64,  0, 64,  0, 64,  0
/* 32 */
.byte 33, 31, 35, 29, 36, 28, 38, 26, 40, 24, 41, 23, 43, 21, 44, 20
.byte 45, 19, 47, 17, 48, 16, 50, 14, 51, 13, 52, 12, 53, 11, 55,  9
.byte 56,  8, 57,  7, 58,  6, 59,  5, 60,  4, 60,  4, 61,  3, 62,  2
endconst
   5277 
   5278 /*
   5279 * static void blend_v_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
   5280                           const int w, int h)
   5281 */
/*
 * blend_v_8bpc_lsx: vertical-edge OBMC blend using the fixed per-column
 * weights from obmc_masks_la:
 *     dst[x] = (dst[x] * dst_w[x] + tmp[x] * tmp_w[x] + 32) >> 6
 * Only the left 3/4 of each row is written (w*3/4 pixels; trailing table
 * pairs are 64/0 and the stores are sized accordingly).
 * In: a0 = dst, a1 = dst_stride, a2 = tmp, a3 = w, a4 = h
 */
function blend_v_8bpc_lsx
   la.local      t8,     obmc_masks_la

   // Jump-table dispatch: clz(32)=26 -> index 0 ... clz(2)=30 -> index 4.
   clz.w         t0,     a3
   li.w          t1,     26
   sub.w         t0,     t0,      t1
   la.local      t1,     .BLEND_V_LSX_JRTABLE
   alsl.d        t0,     t0,      t1,    1
   ld.h          t2,     t0,      0  // The jump addresses are relative to JRTABLE
   add.d         t1,     t1,      t2 // Get absolute address
   jirl          $r0,    t1,      0

   .align   3
.BLEND_V_LSX_JRTABLE:
   .hword .BLEND_V_W32_LSX  - .BLEND_V_LSX_JRTABLE
   .hword .BLEND_V_W16_LSX  - .BLEND_V_LSX_JRTABLE
   .hword .BLEND_V_W8_LSX   - .BLEND_V_LSX_JRTABLE
   .hword .BLEND_V_W4_LSX   - .BLEND_V_LSX_JRTABLE
   .hword .BLEND_V_W2_LSX   - .BLEND_V_LSX_JRTABLE
   .hword .BLEND_V_W2_LSX_1 - .BLEND_V_LSX_JRTABLE  //Instructions must be 4-byte aligned

.BLEND_V_W2_LSX:
   // w == 2: scalar path; only column 0 is written (w*3/4 = 1).
   ld.bu           t6,     t8,      4    // t6 = dst weight (45)
   ld.bu           t7,     t8,      5    // t7 = tmp weight (19)

.BLEND_V_W2_LSX_1:
   ld.bu           t0,     a0,      0
   ld.bu           t1,     a2,      0
   mul.d           t0,     t0,      t6
   mul.d           t1,     t1,      t7
   addi.d          t0,     t0,      32   // rounding bias for >> 6
   add.d           t0,     t0,      t1
   srli.d          t0,     t0,      6
   st.b            t0,     a0,      0

   addi.w          a4,     a4,      -1
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      2
   addi.d          a5,     a5,      2    // NOTE(review): a5 appears unused here (blend_v has no mask arg) — likely leftover; confirm

   blt             zero,   a4,     .BLEND_V_W2_LSX_1
   b               .BLEND_V_END_LSX

.BLEND_V_W4_LSX:
   vld             vr20,   t8,      8    // weight pairs for w==4

.BLEND_V_W4_LSX_1:
   vld             vr0,    a0,      0
   vld             vr1,    a2,      0

   vilvl.b         vr0,    vr1,     vr0  // interleave dst0,tmp0,dst1,tmp1,...
   vdp2.h.bu       vr1,    vr0,     vr20 // pair dot product: dst*w + tmp*w
   vssrarni.bu.h   vr1,    vr1,     6    // round-shift 6, saturate to u8

   vstelm.h        vr1,    a0,      0,   0    // store 3 pixels (w*3/4)
   vstelm.b        vr1,    a0,      2,   2
   addi.w          a4,     a4,      -1
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      4

   blt             zero,   a4,     .BLEND_V_W4_LSX_1
   b              .BLEND_V_END_LSX

.BLEND_V_W8_LSX:
   vld             vr20,   t8,      16   // weight pairs for w==8

.BLEND_V_W8_LSX_1:
   vld             vr0,    a0,      0
   vld             vr1,    a2,      0

   vilvl.b         vr0,    vr1,     vr0
   vdp2.h.bu       vr1,    vr0,     vr20
   vssrarni.bu.h   vr1,    vr1,     6

   vstelm.w        vr1,    a0,      0,   0    // store 6 pixels (w*3/4)
   vstelm.h        vr1,    a0,      4,   2
   addi.w          a4,     a4,      -1
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      8

   blt             zero,   a4,     .BLEND_V_W8_LSX_1
   b              .BLEND_V_END_LSX

.BLEND_V_W16_LSX:
   vld             vr20,   t8,      32   // weight pairs, columns 0-7
   vld             vr21,   t8,      48   // weight pairs, columns 8-15

.BLEND_V_W16_LSX_1:
   vld             vr0,    a0,      0
   vld             vr1,    a2,      0

   vilvl.b         vr2,    vr1,     vr0
   vilvh.b         vr3,    vr1,     vr0
   vmulwev.h.bu    vr4,    vr2,     vr20 // even lanes: dst * dst_w
   vmulwev.h.bu    vr5,    vr3,     vr21
   vmaddwod.h.bu   vr4,    vr2,     vr20 // odd lanes:  += tmp * tmp_w
   vmaddwod.h.bu   vr5,    vr3,     vr21
   vssrarni.bu.h   vr5,    vr4,     6

   vstelm.d        vr5,    a0,      0,   0    // store 12 pixels (w*3/4)
   vstelm.w        vr5,    a0,      8,   2
   addi.w          a4,     a4,      -1
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      16

   blt             zero,   a4,     .BLEND_V_W16_LSX_1
   b              .BLEND_V_END_LSX

.BLEND_V_W32_LSX:
   vld             vr20,   t8,      64   // weight pairs, columns 0-7
   vld             vr21,   t8,      80   // weight pairs, columns 8-15
   vld             vr22,   t8,      96   // weight pairs, columns 16-23

.BLEND_V_W32_LSX_1:
   vld             vr0,    a0,      0
   vld             vr1,    a0,      16
   vld             vr2,    a2,      0
   vld             vr3,    a2,      16

   vilvl.b         vr4,    vr2,     vr0
   vmulwev.h.bu    vr7,    vr4,     vr20
   vilvh.b         vr5,    vr2,     vr0
   vmulwev.h.bu    vr8,    vr5,     vr21
   vilvl.b         vr6,    vr3,     vr1
   vmulwev.h.bu    vr9,    vr6,     vr22
   vmaddwod.h.bu   vr7,    vr4,     vr20
   vmaddwod.h.bu   vr8,    vr5,     vr21
   vmaddwod.h.bu   vr9,    vr6,     vr22
   vssrarni.bu.h   vr8,    vr7,     6
   vssrarni.bu.h   vr9,    vr9,     6    // self-narrow; only low 8 bytes stored

   vst             vr8,    a0,      0
   vstelm.d        vr9,    a0,      16,   0   // store 24 pixels (w*3/4)
   addi.w          a4,     a4,      -1
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      32

   blt             zero,   a4,     .BLEND_V_W32_LSX_1

.BLEND_V_END_LSX:

endfunc
   5424 
   5425 /*
   5426 * static void blend_h_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
   5427                           const int w, int h)
   5428 */
/*
 * blend_h_8bpc_lsx: horizontal-edge OBMC blend using one weight pair per ROW
 * from obmc_masks_la:
 *     dst[x] = (dst[x] * dst_w[y] + tmp[x] * tmp_w[y] + 32) >> 6
 * Only the top 3/4 of the rows are blended (h = h*3/4 below).
 * In: a0 = dst, a1 = dst_stride, a2 = tmp, a3 = w, a4 = h
 */
function blend_h_8bpc_lsx
   la.local      t8,     obmc_masks_la
   alsl.d        t8,     a4,      t8,    1   // t8 = &obmc_masks_la[h * 2] (row weights)
   srli.d        t0,     a4,      1
   srli.d        t1,     a4,      2
   add.d         a4,     t0,      t1  // h = (h * 3) >> 2;
   slli.d        a4,     a4,      1
   add.d         a4,     a4,      t8           // a4 = end pointer into the weight table

   // Jump-table dispatch: clz(128)=24 -> index 0 ... clz(2)=30 -> index 6.
   clz.w         t0,     a3
   li.w          t1,     24
   sub.w         t0,     t0,      t1
   la.local      t1,     .BLEND_H_LSX_JRTABLE
   alsl.d        t0,     t0,      t1,    1
   ld.h          t2,     t0,      0  // The jump addresses are relative to JRTABLE
   add.d         t1,     t1,      t2 // Get absolute address
   jirl          $r0,    t1,      0

   .align   3
.BLEND_H_LSX_JRTABLE:
   .hword .BLEND_H_W128_LSX - .BLEND_H_LSX_JRTABLE
   .hword .BLEND_H_W64_LSX  - .BLEND_H_LSX_JRTABLE
   .hword .BLEND_H_W32_LSX  - .BLEND_H_LSX_JRTABLE
   .hword .BLEND_H_W16_LSX  - .BLEND_H_LSX_JRTABLE
   .hword .BLEND_H_W8_LSX   - .BLEND_H_LSX_JRTABLE
   .hword .BLEND_H_W4_LSX   - .BLEND_H_LSX_JRTABLE
   .hword .BLEND_H_W2_LSX   - .BLEND_H_LSX_JRTABLE
   .hword .BLEND_H_END_LSX  - .BLEND_H_LSX_JRTABLE  //Instructions must be 4-byte aligned

.BLEND_H_W2_LSX:
   vldrepl.h       vr20,   t8,      0    // broadcast this row's (dst_w, tmp_w) pair
   vld             vr0,    a0,      0
   vld             vr1,    a2,      0

   vilvl.b         vr0,    vr1,     vr0  // interleave dst0,tmp0,dst1,tmp1,...
   vdp2.h.bu       vr1,    vr0,     vr20 // pair dot product: dst*w + tmp*w
   vssrarni.bu.h   vr1,    vr1,     6    // round-shift 6, saturate to u8

   vstelm.h        vr1,    a0,      0,   0
   addi.d          t8,     t8,      2    // next row's weight pair
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      2

   blt             t8,     a4,     .BLEND_H_W2_LSX   // stop at end of 3/4-height span
   b               .BLEND_H_END_LSX

.BLEND_H_W4_LSX:
   vldrepl.h       vr20,   t8,      0
   vld             vr0,    a0,      0
   vld             vr1,    a2,      0

   vilvl.b         vr0,    vr1,     vr0
   vdp2.h.bu       vr1,    vr0,     vr20
   vssrarni.bu.h   vr1,    vr1,     6

   vstelm.w        vr1,    a0,      0,   0
   addi.d          t8,     t8,      2
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      4

   blt             t8,     a4,     .BLEND_H_W4_LSX
   b               .BLEND_H_END_LSX

.BLEND_H_W8_LSX:
   vldrepl.h       vr20,   t8,      0
   vld             vr0,    a0,      0
   vld             vr1,    a2,      0

   vilvl.b         vr0,    vr1,     vr0
   vdp2.h.bu       vr1,    vr0,     vr20
   vssrarni.bu.h   vr1,    vr1,     6

   vstelm.d        vr1,    a0,      0,   0
   addi.d          t8,     t8,      2
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      8

   blt             t8,     a4,     .BLEND_H_W8_LSX
   b               .BLEND_H_END_LSX

.BLEND_H_W16_LSX:
   // 16 pixels/row: low/high interleaved halves, even lanes carry dst,
   // odd lanes carry tmp; mulwev/maddwod apply the two weights.
   vldrepl.h       vr20,   t8,      0
   vld             vr0,    a0,      0
   vld             vr1,    a2,      0

   vilvl.b         vr2,    vr1,     vr0
   vilvh.b         vr3,    vr1,     vr0
   vmulwev.h.bu    vr4,    vr2,     vr20
   vmulwev.h.bu    vr5,    vr3,     vr20
   vmaddwod.h.bu   vr4,    vr2,     vr20
   vmaddwod.h.bu   vr5,    vr3,     vr20
   vssrarni.bu.h   vr5,    vr4,     6

   vst             vr5,    a0,      0
   addi.d          t8,     t8,      2
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      16

   blt             t8,     a4,     .BLEND_H_W16_LSX
   b               .BLEND_H_END_LSX

.BLEND_H_W32_LSX:
   vldrepl.h       vr20,   t8,      0

   vld             vr0,    a0,      0
   vld             vr1,    a0,      16
   vld             vr2,    a2,      0
   vld             vr3,    a2,      16

   vilvl.b         vr4,    vr2,     vr0
   vilvh.b         vr5,    vr2,     vr0
   vilvl.b         vr6,    vr3,     vr1
   vilvh.b         vr3,    vr3,     vr1
   vmulwev.h.bu    vr7,    vr4,     vr20
   vmulwev.h.bu    vr8,    vr5,     vr20
   vmulwev.h.bu    vr9,    vr6,     vr20
   vmulwev.h.bu    vr0,    vr3,     vr20
   vmaddwod.h.bu   vr7,    vr4,     vr20
   vmaddwod.h.bu   vr8,    vr5,     vr20
   vmaddwod.h.bu   vr9,    vr6,     vr20
   vmaddwod.h.bu   vr0,    vr3,     vr20
   vssrarni.bu.h   vr8,    vr7,     6
   vssrarni.bu.h   vr0,    vr9,     6

   vst             vr8,    a0,      0
   vst             vr0,    a0,      16
   addi.d          t8,     t8,      2
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      32

   blt             t8,     a4,     .BLEND_H_W32_LSX
   b               .BLEND_H_END_LSX

.BLEND_H_W64_LSX:
   vldrepl.h       vr20,   t8,      0

   vld             vr0,    a0,      0
   vld             vr1,    a0,      16
   vld             vr2,    a0,      32
   vld             vr3,    a0,      48
   vld             vr4,    a2,      0
   vld             vr5,    a2,      16
   vld             vr6,    a2,      32
   vld             vr7,    a2,      48

   vilvl.b         vr8,    vr4,     vr0
   vilvh.b         vr9,    vr4,     vr0
   vilvl.b         vr10,   vr5,     vr1
   vilvh.b         vr11,   vr5,     vr1
   vilvl.b         vr12,   vr6,     vr2
   vilvh.b         vr13,   vr6,     vr2
   vilvl.b         vr14,   vr7,     vr3
   vilvh.b         vr15,   vr7,     vr3
   vmulwev.h.bu    vr0,    vr8,     vr20
   vmulwev.h.bu    vr1,    vr9,     vr20
   vmulwev.h.bu    vr2,    vr10,    vr20
   vmulwev.h.bu    vr3,    vr11,    vr20
   vmulwev.h.bu    vr4,    vr12,    vr20
   vmulwev.h.bu    vr5,    vr13,    vr20
   vmulwev.h.bu    vr6,    vr14,    vr20
   vmulwev.h.bu    vr7,    vr15,    vr20

   vmaddwod.h.bu   vr0,    vr8,     vr20
   vmaddwod.h.bu   vr1,    vr9,     vr20
   vmaddwod.h.bu   vr2,    vr10,    vr20
   vmaddwod.h.bu   vr3,    vr11,    vr20
   vmaddwod.h.bu   vr4,    vr12,    vr20
   vmaddwod.h.bu   vr5,    vr13,    vr20
   vmaddwod.h.bu   vr6,    vr14,    vr20
   vmaddwod.h.bu   vr7,    vr15,    vr20

   vssrarni.bu.h   vr1,    vr0,     6
   vssrarni.bu.h   vr3,    vr2,     6
   vssrarni.bu.h   vr5,    vr4,     6
   vssrarni.bu.h   vr7,    vr6,     6

   vst             vr1,    a0,      0
   vst             vr3,    a0,      16
   vst             vr5,    a0,      32
   vst             vr7,    a0,      48
   addi.d          t8,     t8,      2
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      64

   blt             t8,     a4,     .BLEND_H_W64_LSX
   b               .BLEND_H_END_LSX

.BLEND_H_W128_LSX:
   // 128 pixels/row = two 64-pixel passes sharing the same row weight.
   vldrepl.h       vr20,   t8,      0

   vld             vr0,    a0,      0
   vld             vr1,    a0,      16
   vld             vr2,    a0,      32
   vld             vr3,    a0,      48
   vld             vr4,    a2,      0
   vld             vr5,    a2,      16
   vld             vr6,    a2,      32
   vld             vr7,    a2,      48

   vilvl.b         vr8,    vr4,     vr0
   vilvh.b         vr9,    vr4,     vr0
   vilvl.b         vr10,   vr5,     vr1
   vilvh.b         vr11,   vr5,     vr1
   vilvl.b         vr12,   vr6,     vr2
   vilvh.b         vr13,   vr6,     vr2
   vilvl.b         vr14,   vr7,     vr3
   vilvh.b         vr15,   vr7,     vr3
   vmulwev.h.bu    vr0,    vr8,     vr20
   vmulwev.h.bu    vr1,    vr9,     vr20
   vmulwev.h.bu    vr2,    vr10,    vr20
   vmulwev.h.bu    vr3,    vr11,    vr20
   vmulwev.h.bu    vr4,    vr12,    vr20
   vmulwev.h.bu    vr5,    vr13,    vr20
   vmulwev.h.bu    vr6,    vr14,    vr20
   vmulwev.h.bu    vr7,    vr15,    vr20

   vmaddwod.h.bu   vr0,    vr8,     vr20
   vmaddwod.h.bu   vr1,    vr9,     vr20
   vmaddwod.h.bu   vr2,    vr10,    vr20
   vmaddwod.h.bu   vr3,    vr11,    vr20
   vmaddwod.h.bu   vr4,    vr12,    vr20
   vmaddwod.h.bu   vr5,    vr13,    vr20
   vmaddwod.h.bu   vr6,    vr14,    vr20
   vmaddwod.h.bu   vr7,    vr15,    vr20

   vssrarni.bu.h   vr1,    vr0,     6
   vssrarni.bu.h   vr3,    vr2,     6
   vssrarni.bu.h   vr5,    vr4,     6
   vssrarni.bu.h   vr7,    vr6,     6

   vst             vr1,    a0,      0
   vst             vr3,    a0,      16
   vst             vr5,    a0,      32
   vst             vr7,    a0,      48

   /* second 64 pixels of the row */
   vld             vr0,    a0,      64
   vld             vr1,    a0,      80
   vld             vr2,    a0,      96
   vld             vr3,    a0,      112
   vld             vr4,    a2,      64
   vld             vr5,    a2,      80
   vld             vr6,    a2,      96
   vld             vr7,    a2,      112

   vilvl.b         vr8,    vr4,     vr0
   vilvh.b         vr9,    vr4,     vr0
   vilvl.b         vr10,   vr5,     vr1
   vilvh.b         vr11,   vr5,     vr1
   vilvl.b         vr12,   vr6,     vr2
   vilvh.b         vr13,   vr6,     vr2
   vilvl.b         vr14,   vr7,     vr3
   vilvh.b         vr15,   vr7,     vr3
   vmulwev.h.bu    vr0,    vr8,     vr20
   vmulwev.h.bu    vr1,    vr9,     vr20
   vmulwev.h.bu    vr2,    vr10,    vr20
   vmulwev.h.bu    vr3,    vr11,    vr20
   vmulwev.h.bu    vr4,    vr12,    vr20
   vmulwev.h.bu    vr5,    vr13,    vr20
   vmulwev.h.bu    vr6,    vr14,    vr20
   vmulwev.h.bu    vr7,    vr15,    vr20

   vmaddwod.h.bu   vr0,    vr8,     vr20
   vmaddwod.h.bu   vr1,    vr9,     vr20
   vmaddwod.h.bu   vr2,    vr10,    vr20
   vmaddwod.h.bu   vr3,    vr11,    vr20
   vmaddwod.h.bu   vr4,    vr12,    vr20
   vmaddwod.h.bu   vr5,    vr13,    vr20
   vmaddwod.h.bu   vr6,    vr14,    vr20
   vmaddwod.h.bu   vr7,    vr15,    vr20

   vssrarni.bu.h   vr1,    vr0,     6
   vssrarni.bu.h   vr3,    vr2,     6
   vssrarni.bu.h   vr5,    vr4,     6
   vssrarni.bu.h   vr7,    vr6,     6

   vst             vr1,    a0,      64
   vst             vr3,    a0,      80
   vst             vr5,    a0,      96
   vst             vr7,    a0,      112

   addi.d          t8,     t8,      2
   add.d           a0,     a0,      a1
   addi.d          a2,     a2,      128

   blt             t8,     a4,     .BLEND_H_W128_LSX
   b               .BLEND_H_END_LSX

.BLEND_H_END_LSX:

endfunc
   5720 
   5721 /*
   5722 * static void blend_h_lsx(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
   5723                           const int w, int h)
   5724 */
   5725 function blend_h_8bpc_lasx
   5726    la.local      t8,     obmc_masks_la
   5727    alsl.d        t8,     a4,      t8,    1
   5728    srli.d        t0,     a4,      1
   5729    srli.d        t1,     a4,      2
   5730    add.d         a4,     t0,      t1  // h = (h * 3) >> 2;
   5731    slli.d        a4,     a4,      1
   5732    add.d         a4,     a4,      t8
   5733 
   5734    clz.w         t0,     a3
   5735    li.w          t1,     24
   5736    sub.w         t0,     t0,      t1
   5737    la.local      t1,     .BLEND_H_LASX_JRTABLE
   5738    alsl.d        t0,     t0,      t1,    1
   5739    ld.h          t2,     t0,      0  // The jump addresses are relative to JRTABLE
   5740    add.d         t1,     t1,      t2 // Get absolute address
   5741    jirl          $r0,    t1,      0
   5742 
   5743    .align   3
   5744 .BLEND_H_LASX_JRTABLE:
   5745    .hword .BLEND_H_W128_LASX - .BLEND_H_LASX_JRTABLE
   5746    .hword .BLEND_H_W64_LASX  - .BLEND_H_LASX_JRTABLE
   5747    .hword .BLEND_H_W32_LASX  - .BLEND_H_LASX_JRTABLE
   5748    .hword .BLEND_H_W16_LASX  - .BLEND_H_LASX_JRTABLE
   5749    .hword .BLEND_H_W8_LASX   - .BLEND_H_LASX_JRTABLE
   5750    .hword .BLEND_H_W4_LASX   - .BLEND_H_LASX_JRTABLE
   5751    .hword .BLEND_H_W2_LASX   - .BLEND_H_LASX_JRTABLE
   5752    .hword .BLEND_H_END_LASX  - .BLEND_H_LASX_JRTABLE  //Instructions must be 4-byte aligned
   5753 
   5754 .BLEND_H_W2_LASX:
   5755    vldrepl.h       vr20,   t8,      0
   5756    vld             vr0,    a0,      0
   5757    vld             vr1,    a2,      0
   5758 
   5759    vilvl.b         vr0,    vr1,     vr0
   5760    vdp2.h.bu       vr1,    vr0,     vr20
   5761    vssrarni.bu.h   vr1,    vr1,     6
   5762 
   5763    vstelm.h        vr1,    a0,      0,   0
   5764    addi.d          t8,     t8,      2
   5765    add.d           a0,     a0,      a1
   5766    addi.d          a2,     a2,      2
   5767 
   5768    blt             t8,     a4,     .BLEND_H_W2_LASX
   5769    b               .BLEND_H_END_LASX
   5770 
   5771 .BLEND_H_W4_LASX:
   5772    vldrepl.h       vr20,   t8,      0
   5773    vld             vr0,    a0,      0
   5774    vld             vr1,    a2,      0
   5775 
   5776    vilvl.b         vr0,    vr1,     vr0
   5777    vdp2.h.bu       vr1,    vr0,     vr20
   5778    vssrarni.bu.h   vr1,    vr1,     6
   5779 
   5780    vstelm.w        vr1,    a0,      0,   0
   5781    addi.d          t8,     t8,      2
   5782    add.d           a0,     a0,      a1
   5783    addi.d          a2,     a2,      4
   5784 
   5785    blt             t8,     a4,     .BLEND_H_W4_LASX
   5786    b               .BLEND_H_END_LASX
   5787 
   5788 .BLEND_H_W8_LASX:
   5789    vldrepl.h       vr20,   t8,      0
   5790    vld             vr0,    a0,      0
   5791    vld             vr1,    a2,      0
   5792 
   5793    vilvl.b         vr0,    vr1,     vr0
   5794    vdp2.h.bu       vr1,    vr0,     vr20
   5795    vssrarni.bu.h   vr1,    vr1,     6
   5796 
   5797    vstelm.d        vr1,    a0,      0,   0
   5798    addi.d          t8,     t8,      2
   5799    add.d           a0,     a0,      a1
   5800    addi.d          a2,     a2,      8
   5801 
   5802    blt             t8,     a4,     .BLEND_H_W8_LASX
   5803    b               .BLEND_H_END_LASX
   5804 
   5805 .BLEND_H_W16_LASX:
   5806    vldrepl.h       vr20,   t8,      0
   5807    vld             vr0,    a0,      0
   5808    vld             vr1,    a2,      0
   5809 
   5810    vilvl.b         vr2,    vr1,     vr0
   5811    vilvh.b         vr3,    vr1,     vr0
   5812    vmulwev.h.bu    vr4,    vr2,     vr20
   5813    vmulwev.h.bu    vr5,    vr3,     vr20
   5814    vmaddwod.h.bu   vr4,    vr2,     vr20
   5815    vmaddwod.h.bu   vr5,    vr3,     vr20
   5816    vssrarni.bu.h   vr5,    vr4,     6
   5817 
   5818    vst             vr5,    a0,      0
   5819    addi.d          t8,     t8,      2
   5820    add.d           a0,     a0,      a1
   5821    addi.d          a2,     a2,      16
   5822 
   5823    blt             t8,     a4,     .BLEND_H_W16_LSX
   5824    b               .BLEND_H_END_LSX
   5825 
   5826 .BLEND_H_W32_LASX:
   5827    xvldrepl.h      xr20,   t8,      0
   5828 
   5829    xvld            xr0,    a0,      0
   5830    xvld            xr1,    a2,      0
   5831 
   5832    xvilvl.b        xr2,    xr1,     xr0
   5833    xvilvh.b        xr3,    xr1,     xr0
   5834 
   5835    xvmulwev.h.bu   xr4,    xr2,     xr20
   5836    xvmulwev.h.bu   xr5,    xr3,     xr20
   5837    xvmaddwod.h.bu  xr4,    xr2,     xr20
   5838    xvmaddwod.h.bu  xr5,    xr3,     xr20
   5839    xvssrarni.bu.h  xr5,    xr4,     6
   5840 
   5841    xvst            xr5,    a0,      0
   5842    addi.d          t8,     t8,      2
   5843    add.d           a0,     a0,      a1
   5844    addi.d          a2,     a2,      32
   5845 
   5846    blt             t8,     a4,     .BLEND_H_W32_LASX
   5847    b               .BLEND_H_END_LASX
   5848 
   5849 .BLEND_H_W64_LASX:
   5850    xvldrepl.h      xr20,   t8,      0
   5851 
   5852    xvld            xr0,    a0,      0
   5853    xvld            xr1,    a0,      32
   5854    xvld            xr2,    a2,      0
   5855    xvld            xr3,    a2,      32
   5856 
   5857    xvilvl.b        xr4,    xr2,     xr0
   5858    xvilvh.b        xr5,    xr2,     xr0
   5859    xvilvl.b        xr6,    xr3,     xr1
   5860    xvilvh.b        xr7,    xr3,     xr1
   5861 
   5862    xvmulwev.h.bu   xr0,    xr4,     xr20
   5863    xvmulwev.h.bu   xr1,    xr5,     xr20
   5864    xvmulwev.h.bu   xr2,    xr6,     xr20
   5865    xvmulwev.h.bu   xr3,    xr7,     xr20
   5866    xvmaddwod.h.bu  xr0,    xr4,     xr20
   5867    xvmaddwod.h.bu  xr1,    xr5,     xr20
   5868    xvmaddwod.h.bu  xr2,    xr6,     xr20
   5869    xvmaddwod.h.bu  xr3,    xr7,     xr20
   5870    xvssrarni.bu.h  xr1,    xr0,     6
   5871    xvssrarni.bu.h  xr3,    xr2,     6
   5872 
   5873    xvst            xr1,    a0,      0
   5874    xvst            xr3,    a0,      32
   5875    addi.d          t8,     t8,      2
   5876    add.d           a0,     a0,      a1
   5877    addi.d          a2,     a2,      64
   5878 
   5879    blt             t8,     a4,     .BLEND_H_W64_LASX
   5880    b               .BLEND_H_END_LASX
   5881 
   5882 .BLEND_H_W128_LASX:
   5883    xvldrepl.h      xr20,   t8,      0
   5884 
   5885    xvld            xr0,    a0,      0
   5886    xvld            xr1,    a0,      32
   5887    xvld            xr2,    a0,      64
   5888    xvld            xr3,    a0,      96
   5889    xvld            xr4,    a2,      0
   5890    xvld            xr5,    a2,      32
   5891    xvld            xr6,    a2,      64
   5892    xvld            xr7,    a2,      96
   5893 
   5894    xvilvl.b        xr8,    xr4,     xr0
   5895    xvilvh.b        xr9,    xr4,     xr0
   5896    xvilvl.b        xr10,   xr5,     xr1
   5897    xvilvh.b        xr11,   xr5,     xr1
   5898    xvilvl.b        xr12,   xr6,     xr2
   5899    xvilvh.b        xr13,   xr6,     xr2
   5900    xvilvl.b        xr14,   xr7,     xr3
   5901    xvilvh.b        xr15,   xr7,     xr3
   5902 
   5903    xvmulwev.h.bu   xr0,    xr8,     xr20
   5904    xvmulwev.h.bu   xr1,    xr9,     xr20
   5905    xvmulwev.h.bu   xr2,    xr10,    xr20
   5906    xvmulwev.h.bu   xr3,    xr11,    xr20
   5907    xvmulwev.h.bu   xr4,    xr12,    xr20
   5908    xvmulwev.h.bu   xr5,    xr13,    xr20
   5909    xvmulwev.h.bu   xr6,    xr14,    xr20
   5910    xvmulwev.h.bu   xr7,    xr15,    xr20
   5911    xvmaddwod.h.bu  xr0,    xr8,     xr20
   5912    xvmaddwod.h.bu  xr1,    xr9,     xr20
   5913    xvmaddwod.h.bu  xr2,    xr10,    xr20
   5914    xvmaddwod.h.bu  xr3,    xr11,    xr20
   5915    xvmaddwod.h.bu  xr4,    xr12,    xr20
   5916    xvmaddwod.h.bu  xr5,    xr13,    xr20
   5917    xvmaddwod.h.bu  xr6,    xr14,    xr20
   5918    xvmaddwod.h.bu  xr7,    xr15,    xr20
   5919    xvssrarni.bu.h  xr1,    xr0,     6
   5920    xvssrarni.bu.h  xr3,    xr2,     6
   5921    xvssrarni.bu.h  xr5,    xr4,     6
   5922    xvssrarni.bu.h  xr7,    xr6,     6
   5923 
   5924    xvst            xr1,    a0,      0
   5925    xvst            xr3,    a0,      32
   5926    xvst            xr5,    a0,      64
   5927    xvst            xr7,    a0,      96
   5928    addi.d          t8,     t8,      2
   5929    add.d           a0,     a0,      a1
   5930    addi.d          a2,     a2,      128
   5931 
   5932    blt             t8,     a4,     .BLEND_H_W128_LASX
   5933    b               .BLEND_H_END_LASX
   5934 
   5935 .BLEND_H_END_LASX:
   5936 
   5937 endfunc
   5938 
   5939 /*
   5940 *  a1=16 | a2=8 | a3=4
   5941 *  temp reg: a4
   5942 */
   5943 .macro PIXEL_COPY_LSX _dst, _src, _size
   5944    blt             \_size,  a1,     8f
   5945 16:
   5946    vld             vr0,     \_src,  0
   5947    vst             vr0,     \_dst,  0
   5948    addi.d          \_size,  \_size, -16
   5949    addi.d          \_dst,   \_dst,  16
   5950    addi.d          \_src,   \_src,  16
   5951    blt             a1,      \_size, 16b
   5952 8:
   5953    blt             \_size,  a2,     14f
   5954    ld.d            a4,      \_src,  0
   5955    st.d            a4,      \_dst,  0
   5956    addi.d          \_size,  \_size, -8
   5957    addi.d          \_dst,   \_dst,  8
   5958    addi.d          \_src,   \_src,  8
   5959 14:
   5960    blt             \_size,  a3,     11f
   5961    ld.w            a4,      \_src,  0
   5962    st.w            a4,      \_dst,  0
   5963    addi.d          \_size,  \_size, -4
   5964    addi.d          \_dst,   \_dst,  4
   5965    addi.d          \_src,   \_src,  4
   5966 11:
   5967    beqz            \_size,  110f
   5968 111:
   5969    ld.b            a4,      \_src,  0
   5970    st.b            a4,      \_dst,  0
   5971    addi.d          \_size,  \_size, -1
   5972    addi.d          \_dst,   \_dst,  1
   5973    addi.d          \_src,   \_src,  1
   5974    bnez            \_size,  111b
   5975 110:
   5976 .endm
   5977 
   5978 /*
   5979 *  a1=16 | a2=8 | a3=4
   5980 */
   5981 .macro PIXEL_SET_LSX _dst, _vsrc, _size
   5982    blt             \_size,  a1,     8f
   5983 16:
   5984    vst             \_vsrc,  \_dst,  0
   5985    addi.d          \_size,  \_size, -16
   5986    addi.d          \_dst,   \_dst,  16
   5987    blt             a1,      \_size, 16b
   5988 8:
   5989    blt             \_size,  a2,     14f
   5990    vstelm.d        \_vsrc,  \_dst,  0,   0
   5991    addi.d          \_size,  \_size, -8
   5992    addi.d          \_dst,   \_dst,  8
   5993 14:
   5994    blt             \_size,  a3,     11f
   5995    vstelm.w        \_vsrc,  \_dst,  0,   0
   5996    addi.d          \_size,  \_size, -4
   5997    addi.d          \_dst,   \_dst,  4
   5998 11:
   5999    beqz            \_size,  110f
   6000 111:
   6001    vstelm.b        \_vsrc,  \_dst,  0,   0
   6002    addi.d          \_size,  \_size, -1
   6003    addi.d          \_dst,   \_dst,  1
   6004    bnez            \_size,  111b
   6005 110:
   6006 .endm
   6007 
   6008 /*
   6009 *  temp reg: a4 a5 t2 t3 vr0
   6010 */
   6011 .macro DEGE_LOOP need_left, need_right
   6012 0:
   6013    addi.d          t2,      t6,     0   // dst
   6014    addi.d          t3,      t7,     0   // src
   6015 .if \need_left
   6016    vldrepl.b       vr0,     t3,     0
   6017    addi.d          a5,      t0,     0
   6018    PIXEL_SET_LSX t2, vr0, a5
   6019 .endif
   6020 
   6021    addi.d          a5,      t4,     0
   6022    PIXEL_COPY_LSX t2, t3, a5
   6023 
   6024 .if \need_right
   6025    vldrepl.b       vr0,     t3,     -1
   6026    addi.d          a5,      t1,     0
   6027    PIXEL_SET_LSX t2, vr0, a5
   6028 .endif
   6029 
   6030    addi.d          t5,      t5,     -1
   6031    add.d           t7,      t7,     t8
   6032    add.d           t6,      t6,     a7
   6033    bnez            t5,      0b
   6034 .endm
   6035 
   6036 /*
   6037 * static void emu_edge_c(const intptr_t bw, const intptr_t bh,
   6038 *                        const intptr_t iw, const intptr_t ih,
   6039 *                        const intptr_t x, const intptr_t y,
   6040 *                        pixel *dst, const ptrdiff_t dst_stride,
   6041 *                        const pixel *ref, const ptrdiff_t ref_stride)
   6042 */
   6043 function emu_edge_8bpc_lsx
   6044    vxor.v          vr23,   vr23,    vr23   // zero
   6045    addi.d          t0,     a3,      -1     // ih - 1
   6046    addi.d          t1,     a2,      -1     // iw - 1
   6047    vreplgr2vr.w    vr22,   t0
   6048    vinsgr2vr.w     vr22,   t1,        1
   6049    vreplgr2vr.w    vr0,    a5
   6050    vinsgr2vr.w     vr0,    a4,        1     // [0] - h | [1] - w
   6051 
   6052    vclip.w         vr2,    vr0,      vr23,    vr22
   6053    vpickve2gr.w    t0,     vr2,      0
   6054    ld.d            t2,     sp,       0
   6055    ld.d            t8,     sp,       8     // ref_stride
   6056    mul.w           t0,     t0,       t8
   6057    vpickve2gr.w    t1,     vr2,      1
   6058    add.d           t2,     t2,       t1
   6059    add.d           t7,     t0,       t2    // ref
   6060 
   6061    addi.d          t0,     a0,       -1     // bw - 1
   6062    addi.d          t1,     a1,       -1     // bh - 1
   6063    vreplgr2vr.w    vr21,   t0
   6064    vreplgr2vr.w    vr22,   t1
   6065    vilvl.d         vr21,   vr22,      vr21
   6066    sub.d           t2,     zero,      a4    // -x
   6067    add.d           t3,     a0,        a4
   6068    sub.d           t3,     t3,        a2    // x + bw - iw
   6069    sub.d           t4,     zero,      a5    // -y
   6070    add.d           t5,     a1,        a5
   6071    sub.d           t5,     t5,        a3    // y + bh - ih
   6072    vreplgr2vr.w    vr0,    t2
   6073    vinsgr2vr.w     vr0,    t3,        1
   6074    vinsgr2vr.w     vr0,    t4,        2
   6075    vinsgr2vr.w     vr0,    t5,        3
   6076    vclip.w         vr2,    vr0,       vr23,    vr21
   6077    vpickve2gr.w    t0,     vr2,       0     // left_ext
   6078    vpickve2gr.w    t1,     vr2,       1     // right_ext
   6079    vpickve2gr.w    t2,     vr2,       2     // top_ext
   6080    vpickve2gr.w    t3,     vr2,       3     // bottom_ext
   6081 
   6082    mul.w           t6,     t2,        a7
   6083    add.d           t4,     t0,        t1
   6084    add.d           t5,     t2,        t3
   6085    sub.d           t4,     a0,        t4    // center_w
   6086    sub.d           t5,     a1,        t5    // center_h
   6087 
   6088    addi.d          a1,     zero,      16
   6089    addi.d          a2,     zero,      8
   6090    addi.d          a3,     zero,      4
   6091    add.d           t6,     t6,        a6    // blk
   6092 
   6093    beqz            t0,     2f
   6094    // need_left
   6095    beqz            t1,     3f
   6096    // need_left + need_right
   6097    DEGE_LOOP       1,   1
   6098    b               5f
   6099 
   6100 2:
   6101    // !need_left
   6102    beqz            t1,     4f
   6103    // !need_left + need_right
   6104    DEGE_LOOP       0,   1
   6105    b               5f
   6106 
   6107 3:
   6108    // need_left + !need_right
   6109    DEGE_LOOP       1,   0
   6110    b               5f
   6111 
   6112 4:
   6113    // !need_left + !need_right
   6114    DEGE_LOOP       0,   0
   6115 
   6116 5:
   6117    vpickve2gr.w    t2,     vr2,       2     // top_ext
   6118    vpickve2gr.w    t3,     vr2,       3     // bottom_ext
   6119    sub.d           t7,     a7,        a0    // dst_stride - bw
   6120    mul.w           t8,     t2,        a7
   6121 
   6122    beqz            t3,     2f
   6123    // need_bottom
   6124    sub.d           t0,     t6,        a7    //  &dst[-PXSTRIDE(dst_stride)]
   6125 1:
   6126    addi.d          t1,     t0,        0
   6127    addi.d          a5,     a0,        0
   6128    PIXEL_COPY_LSX t6, t1, a5
   6129    add.d           t6,     t6,        t7
   6130    addi.d          t3,     t3,   -1
   6131    bnez            t3,     1b
   6132 2:
   6133    beqz            t2,     3f
   6134    // need_top
   6135    add.d           t8,     t8,        a6    // blk
   6136 1:
   6137    addi.d          t1,     t8,        0
   6138    addi.d          a5,     a0,        0
   6139    PIXEL_COPY_LSX a6, t1, a5
   6140    add.d           a6,     a6,        t7
   6141    addi.d          t2,     t2,   -1
   6142    bnez            t2,     1b
   6143 3:
   6144 
   6145 endfunc