tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

itx.S (182424B)


      1 /*
      2 * Copyright © 2023, VideoLAN and dav1d authors
      3 * Copyright © 2023, Loongson Technology Corporation Limited
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/loongarch/loongson_asm.S"
     29 #include "src/loongarch/loongson_util.S"
     30 
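// PUSH_REG/POP_REG spill and reload f24-f31 (fs0-fs7), the FP registers
// that are callee-saved in the LoongArch LP64 ABI; the LSX code below
// clobbers vr24-vr31, whose low 64 bits alias these registers.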
     31 .macro PUSH_REG
     32    addi.d           sp,     sp,    -64
     33    fst.d            f24,    sp,     0
     34    fst.d            f25,    sp,     8
     35    fst.d            f26,    sp,     16
     36    fst.d            f27,    sp,     24
     37    fst.d            f28,    sp,     32
     38    fst.d            f29,    sp,     40
     39    fst.d            f30,    sp,     48
     40    fst.d            f31,    sp,     56
     41 .endm
     42 
     43 .macro POP_REG
     44    fld.d            f24,    sp,     0
     45    fld.d            f25,    sp,     8
     46    fld.d            f26,    sp,     16
     47    fld.d            f27,    sp,     24
     48    fld.d            f28,    sp,     32
     49    fld.d            f29,    sp,     40
     50    fld.d            f30,    sp,     48
     51    fld.d            f31,    sp,     56
     52    addi.d           sp,     sp,     64
     53 .endm
     54 
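// malloc_space/free_space reserve \number (+64) bytes of stack scratch
// on top of the 64-byte save area pushed by PUSH_REG.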
     55 .macro malloc_space number
     56    li.w          t0,       \number
     57    sub.d         sp,       sp,       t0
     58    addi.d        sp,       sp,       -64
     59    PUSH_REG
     60 .endm
     61 
     62 .macro free_space number
     63    POP_REG
     64    li.w          t0,       \number
     65    add.d         sp,       sp,       t0
     66    addi.d        sp,       sp,       64
     67 .endm
     68 
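// One 4-point inverse Walsh-Hadamard pass, in place on vr0-vr3,
// following the scalar lifting steps:
//   t0 = in0 + in1;   t4 = in2 - in3;   t5 = (t0 - t4) >> 1;
//   out2 = t5 - in1;  out1 = t5 - in3;  out3 = t4 + out2;  out0 = t0 - out1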
     69 .macro iwht4
     70    vadd.h        vr0,       vr0,     vr1
     71    vsub.h        vr4,       vr2,     vr3
     72    vsub.h        vr5,       vr0,     vr4
     73    vsrai.h       vr5,       vr5,     1
     74    vsub.h        vr2,       vr5,     vr1
     75    vsub.h        vr1,       vr5,     vr3
     76    vadd.h        vr3,       vr4,     vr2
     77    vsub.h        vr0,       vr0,     vr1
     78 .endm
     79 
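// Add 16-bit residuals (in4: rows 0-1, in5: rows 2-3) to four 4-byte
// destination rows held in in0..in3, saturate to 8 bits and store the
// four words back at a0 with stride a1.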
     80 .macro DST_ADD_W4 in0, in1, in2, in3, in4, in5
     81    vilvl.w       \in0,     \in1,     \in0  // 0 1  2  3  4  5  6  7 x ...
     82    vilvl.w       \in2,     \in3,     \in2  // 8 9 10 11 12 13 14 15 x ...
     83    vsllwil.hu.bu \in0,     \in0,     0
     84    vsllwil.hu.bu \in2,     \in2,     0
     85    vadd.h        \in0,     \in4,     \in0
     86    vadd.h        \in2,     \in5,     \in2
     87    vssrani.bu.h  \in2,     \in0,     0
     88    vstelm.w      \in2,     a0,       0,    0
     89    vstelmx.w     \in2,     a0,       a1,   1
     90    vstelmx.w     \in2,     a0,       a1,   2
     91    vstelmx.w     \in2,     a0,       a1,   3
     92 .endm
     93 
     94 .macro VLD_DST_ADD_W4 in0, in1
     95    vld           vr0,      a0,       0
     96    vldx          vr1,      a0,       a1
     97    vld           vr2,      t2,       0
     98    vldx          vr3,      t2,       a1
     99 
    100    DST_ADD_W4    vr0, vr1, vr2, vr3, \in0, \in1
    101 .endm
    102 
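// 4x4 WHT_WHT: load the coefficients pre-shifted right by 2, clear the
// coefficient buffer, then run iwht4 over columns and, after a 4x4
// transpose, over rows before adding the result to the destination.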
    103 function inv_txfm_add_wht_wht_4x4_8bpc_lsx
    104    vld           vr0,       a2,      0
    105    vld           vr2,       a2,      16
    106 
    107    vxor.v        vr20,      vr20,    vr20
    108    vsrai.h       vr0,       vr0,     2
    109    vsrai.h       vr2,       vr2,     2
    110    vst           vr20,      a2,      0
    111    vpickod.d     vr1,       vr0,     vr0
    112    vpickod.d     vr3,       vr2,     vr2
    113    vst           vr20,      a2,      16
    114 
    115    iwht4
    116 
    117    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5
    118 
    119    iwht4
    120 
    121    vilvl.d       vr4,       vr1,     vr0
    122    vilvl.d       vr5,       vr3,     vr2
    123    alsl.d        t2,        a1,      a0,    1
    124    VLD_DST_ADD_W4 vr4, vr5
    125 endfunc
    126 
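// Q12 fixed-point constants for the inverse DCT butterflies:
// 2896 ~= 4096/sqrt(2); each remaining pair is 4096*(cos, sin) of one
// butterfly angle (e.g. 1567/3784, 799/4017).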
    127 const idct_coeffs, align=4
    128    .word          2896, 2896*8, 1567, 3784
    129    .word          799, 4017, 3406, 2276
    130    .word          401, 4076, 3166, 2598
    131    .word          1931, 3612, 3920, 1189
    132    .word          201, 4091, 3035, 2751
    133    .word          1751, 3703, 3857, 1380
    134    .word          995, 3973, 3513, 2106
    135    .word          2440, 3290, 4052, 601
    136 endconst
    137 
    138 .macro vsrari_h_x4 in0, in1, in2, in3, out0, out1, out2, out3, shift
    139    vsrari.h      \out0,    \in0,     \shift
    140    vsrari.h      \out1,    \in1,     \shift
    141    vsrari.h      \out2,    \in2,     \shift
    142    vsrari.h      \out3,    \in3,     \shift
    143 .endm
    144 
    145 .macro vsrari_h_x8 in0, in1, in2, in3, in4, in5, in6, in7, out0, \
    146                   out1, out2, out3, out4, out5, out6, out7, shift
    147    vsrari.h      \out0,    \in0,     \shift
    148    vsrari.h      \out1,    \in1,     \shift
    149    vsrari.h      \out2,    \in2,     \shift
    150    vsrari.h      \out3,    \in3,     \shift
    151    vsrari.h      \out4,    \in4,     \shift
    152    vsrari.h      \out5,    \in5,     \shift
    153    vsrari.h      \out6,    \in6,     \shift
    154    vsrari.h      \out7,    \in7,     \shift
    155 .endm
    156 
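// out0/out1 = in0*in2 + in1*in3 as 32-bit products, computed separately
// on the even and odd 16-bit lanes and then re-interleaved into element
// order (.4h keeps one register, .8h spreads the result over both).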
    157 .macro vmulev_vmaddod_lsx in0, in1, in2, in3, out0, out1, sz
    158    vmulwev.w.h   \out0,    \in0,     \in2
    159    vmulwod.w.h   \out1,    \in0,     \in2
    160    vmaddwev.w.h  \out0,    \in1,     \in3
    161    vmaddwod.w.h  \out1,    \in1,     \in3
    162 .ifc \sz, .4h
    163    vilvl.w       \out0,    \out1,    \out0
    164 .else
    165    vilvl.w       vr22,     \out1,    \out0
    166    vilvh.w       \out1,    \out1,    \out0
    167    vor.v         \out0,    vr22,     vr22
    168 .endif
    169 .endm
    170 
    171 const idct_coeffs_h, align=4
    172    .short          2896, 2896*8, 1567, 3784
    173    .short          799, 4017, 3406, 2276
    174    .short          401, 4076, 3166, 2598
    175    .short          1931, 3612, 3920, 1189
    176    .short          201, 4091, 3035, 2751
    177    .short          1751, 3703, 3857, 1380
    178    .short          995, 3973, 3513, 2106
    179    .short          2440, 3290, 4052, 601
    180 endconst
    181 
    182 const iadst4_coeffs, align=4
    183    .word          1321, 3803, 2482, 3344
    184 endconst
    185 
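// 4-point inverse DCT:
//   t0 = (in0 + in2)*2896 >> 12,      t1 = (in0 - in2)*2896 >> 12
//   t3 = (in1*3784 + in3*1567) >> 12, t2 = (in1*1567 - in3*3784) >> 12
//   out = { t0+t3, t1+t2, t1-t2, t0-t3 }, all saturating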
    186 .macro inv_dct4_lsx in0, in1, in2, in3, out0, out1, out2, out3, sz
    187    la.local      t0,       idct_coeffs_h
    188 
    189    vldrepl.h     vr20,     t0,       0    // 2896
    190    vmulev_vmaddod_lsx \in0, \in2, vr20, vr20, vr16, vr18, \sz
    191    vneg.h        vr21,     vr20
    192    vmulev_vmaddod_lsx \in0, \in2, vr20, vr21, vr17, vr19, \sz
    193    vssrarni.h.w  vr18,     vr16,     12   // t0
    194    vssrarni.h.w  vr19,     vr17,     12   // t1
    195 
    196    vldrepl.h     vr20,     t0,       4    // 1567
    197    vldrepl.h     vr21,     t0,       6    // 3784
    198    vmulev_vmaddod_lsx \in1, \in3, vr21, vr20, \in0, vr16, \sz
    199    vneg.h        vr21,     vr21
    200    vmulev_vmaddod_lsx \in1, \in3, vr20, vr21, \in2, vr17, \sz
    201    vssrarni.h.w  vr16,     \in0,     12   // t3
    202    vssrarni.h.w  vr17,     \in2,     12   // t2
    203 
    204    vsadd.h       \out0,    vr18,     vr16
    205    vsadd.h       \out1,    vr19,     vr17
    206    vssub.h       \out2,    vr19,     vr17
    207    vssub.h       \out3,    vr18,     vr16
    208 .endm
    209 
    210 functionl inv_dct_4h_x4_lsx
    211    inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .4h
    212 endfuncl
    213 
    214 functionl inv_dct_8h_x4_lsx
    215    inv_dct4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, .8h
    216 endfuncl
    217 
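// ADST4 core on 32-bit lanes; with s = in1*3344 the outputs are
//   out0 = in0*1321 + in2*3803 + in3*2482 + s
//   out1 = in0*2482 - in2*1321 - in3*3803 + s
//   out2 = (in0 - in2 + in3) * 3344
//   out3 = (out0 - s) + (out1 - s) - s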
    218 .macro inv_adst4_core_lsx in0, in1, in2, in3, out0, out1, out2, out3
    219    vsub.w        vr16,     \in0,    \in2  // in0-in2
    220    vmul.w        vr17,     \in0,    vr20  // in0*1321
    221    vmul.w        vr19,     \in0,    vr22  // in0*2482
    222    vmul.w        vr18,     \in1,    vr23  // in1*3344
    223    vmadd.w       vr17,     \in2,    vr21  // in0*1321+in2*3803
    224    vmsub.w       vr19,     \in2,    vr20  // in2*1321
    225    vadd.w        vr16,     vr16,    \in3  // in0-in2+in3
    226    vmadd.w       vr17,     \in3,    vr22  // in0*1321+in2*3803+in3*2482
    227    vmsub.w       vr19,     \in3,    vr21  // in0*2482-in2*1321-in3*3803
    228    vadd.w        vr15,     vr17,    vr19
    229    vmul.w        \out2,    vr16,    vr23  // out[2] 8  9  10 11
    230    vadd.w        \out0,    vr17,    vr18  // out[0] 0  1  2  3
    231    vadd.w        \out1,    vr19,    vr18  // out[1] 4  5  6  7
    232    vsub.w        \out3,    vr15,    vr18  // out[3] 12 13 14 15
    233 .endm
    234 
    235 .macro inv_adst4_lsx in0, in1, in2, in3, out0, out1, out2, out3
    236    la.local      t0,       iadst4_coeffs
    237 
    238    vldrepl.w     vr20,     t0,      0     // 1321
    239    vldrepl.w     vr21,     t0,      4     // 3803
    240    vldrepl.w     vr22,     t0,      8     // 2482
    241    vldrepl.w     vr23,     t0,      12    // 3344
    242 
    243    vsllwil.w.h   vr0,      \in0,    0
    244    vsllwil.w.h   vr1,      \in1,    0
    245    vsllwil.w.h   vr2,      \in2,    0
    246    vsllwil.w.h   vr3,      \in3,    0
    247    inv_adst4_core_lsx vr0, vr1, vr2, vr3, \out0, \out1, \out2, \out3
    248    vssrarni.h.w  \out0,    \out0,   12
    249    vssrarni.h.w  \out1,    \out1,   12
    250    vssrarni.h.w  \out2,    \out2,   12
    251    vssrarni.h.w  \out3,    \out3,   12
    252 .endm
    253 
    254 functionl inv_adst_4h_x4_lsx
    255    inv_adst4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3
    256 endfuncl
    257 
    258 functionl inv_flipadst_4h_x4_lsx
    259    inv_adst4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0
    260 endfuncl
    261 
    262 .macro inv_adst_8x4_lsx in0, in1, in2, in3, out0, out1, out2, out3
    263    la.local      t0,       iadst4_coeffs
    264    vldrepl.w     vr20,     t0,      0     // 1321
    265    vldrepl.w     vr21,     t0,      4     // 3803
    266    vldrepl.w     vr22,     t0,      8     // 2482
    267    vldrepl.w     vr23,     t0,      12    // 3344
    268 
    269    vsllwil.w.h   vr10,     \in0,     0     // in0
    270    vsllwil.w.h   vr11,     \in1,     0     // in1
    271    vsllwil.w.h   vr12,     \in2,     0     // in2
    272    vsllwil.w.h   vr13,     \in3,     0     // in3
    273    inv_adst4_core_lsx vr10, vr11, vr12, vr13, vr10, vr11, vr12, vr13
    274 
    275    vexth.w.h     \in0,      \in0           // in0
    276    vexth.w.h     \in1,      \in1           // in1
    277    vexth.w.h     \in2,      \in2           // in2
    278    vexth.w.h     \in3,      \in3           // in3
    279    inv_adst4_core_lsx \in0, \in1, \in2, \in3, \out0, \out1, \out2, \out3
    280 
    281    vssrarni.h.w  \out0,     vr10,    12
    282    vssrarni.h.w  \out1,     vr11,    12
    283    vssrarni.h.w  \out2,     vr12,    12
    284    vssrarni.h.w  \out3,     vr13,    12
    285 .endm
    286 
    287 functionl inv_adst_8h_x4_lsx
    288    inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3
    289 endfuncl
    290 
    291 functionl inv_flipadst_8h_x4_lsx
    292    inv_adst_8x4_lsx vr0, vr1, vr2, vr3, vr3, vr2, vr1, vr0
    293 endfuncl
    294 
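// The identity4 transforms scale by sqrt(2): out = x + round(x*1697/4096),
// since 1 + 1697/4096 ~= 1.41431 ~= sqrt(2).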
    295 functionl inv_identity_4h_x4_lsx
    296    li.w          t0,       1697
    297    vreplgr2vr.h  vr20,     t0
    298 
    299    vilvl.d       vr0,      vr1,      vr0
    300    vilvl.d       vr2,      vr3,      vr2
    301    vmulwev.w.h   vr16,     vr0,      vr20
    302    vmulwod.w.h   vr17,     vr0,      vr20
    303    vmulwev.w.h   vr18,     vr2,      vr20
    304    vmulwod.w.h   vr19,     vr2,      vr20
    305    vilvl.w       vr1,      vr17,     vr16
    306    vilvh.w       vr3,      vr17,     vr16
    307    vilvl.w       vr22,     vr19,     vr18
    308    vilvh.w       vr23,     vr19,     vr18
    309    vssrarni.h.w  vr3,      vr1,      12
    310    vssrarni.h.w  vr23,     vr22,     12
    311    vsadd.h       vr0,      vr3,      vr0  // t0
    312    vsadd.h       vr2,      vr23,     vr2  // t2
    313    vilvh.d       vr1,      vr0,      vr0  // t1
    314    vilvh.d       vr3,      vr2,      vr2  // t3
    315 endfuncl
    316 
    317 .macro inv_identity4_lsx1 in0, in1, in2, out0, out1
    318    vsllwil.w.h   vr16,     \in0,     0
    319    vexth.w.h     vr17,     \in1
    320    vmul.w        vr18,     vr16,     \in2
    321    vmul.w        vr19,     vr17,     \in2
    322    vsrari.w      vr18,     vr18,     12
    323    vsrari.w      vr19,     vr19,     12
    324    vadd.w        \out0,    vr18,     vr16
    325    vadd.w        \out1,    vr19,     vr17
    326    vssrarni.h.w  \out1,    \out0,    1
    327 .endm
    328 
    329 functionl inv_identity_8h_x4_lsx
    330    li.w          t0,        1697
    331    vreplgr2vr.h  vr20,      t0
    332    vmulwev.w.h   vr16,      vr0,     vr20
    333    vmulwod.w.h   vr17,      vr0,     vr20
    334    vmulwev.w.h   vr18,      vr1,     vr20
    335    vmulwod.w.h   vr19,      vr1,     vr20
    336    vilvl.w       vr21,      vr17,    vr16
    337    vilvh.w       vr22,      vr17,    vr16
    338    vilvl.w       vr23,      vr19,    vr18
    339    vilvh.w       vr16,      vr19,    vr18
    340    vssrarni.h.w  vr22,      vr21,    12
    341    vssrarni.h.w  vr16,      vr23,    12
    342    vsadd.h       vr0,       vr22,    vr0  // t0
    343    vsadd.h       vr1,       vr16,    vr1  // t1
    344    vmulwev.w.h   vr16,      vr2,     vr20
    345    vmulwod.w.h   vr17,      vr2,     vr20
    346    vmulwev.w.h   vr18,      vr3,     vr20
    347    vmulwod.w.h   vr19,      vr3,     vr20
    348    vilvl.w       vr21,      vr17,    vr16
    349    vilvh.w       vr22,      vr17,    vr16
    350    vilvl.w       vr23,      vr19,    vr18
    351    vilvh.w       vr16,      vr19,    vr18
    352    vssrarni.h.w  vr22,      vr21,    12
    353    vssrarni.h.w  vr16,      vr23,    12
    354    vsadd.h       vr2,       vr22,    vr2  // t2
    355    vsadd.h       vr3,       vr16,    vr3  // t3
    356 endfuncl
    357 
    358 functionl inv_identity_8h_x4_lsx1
    359    li.w          t0,        1697
    360    vreplgr2vr.w  vr20,      t0
    361 .irp i, vr0, vr1, vr2, vr3
    362    inv_identity4_lsx1 \i, \i, vr20, vr21, \i
    363 .endr
    364 endfuncl
    365 
    366 functionl inv_txfm_add_4x4_lsx
    367    vxor.v        vr23,     vr23,     vr23
    368    vld           vr0,      a2,       0
    369    vld           vr2,      a2,       16
    370    vilvh.d       vr1,      vr0,      vr0
    371    vilvh.d       vr3,      vr2,      vr2
    372    vst           vr23,     a2,       0
    373    vst           vr23,     a2,       16
    374 
    375    move          t6,       ra
    376    jirl          ra,       t7,       0
    377    move          ra,       t6
    378 
    379    LSX_TRANSPOSE4x4_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5
    380 
    381    move          t6,       ra
    382    jirl          ra,       t8,       0
    383    move          ra,       t6
    384 
    385    vilvl.d       vr4,      vr1,      vr0
    386    vilvl.d       vr5,      vr3,      vr2
    387    vsrari.h      vr4,      vr4,      4
    388    vsrari.h      vr5,      vr5,      4
    389    alsl.d        t2,       a1,       a0,    1
    390    VLD_DST_ADD_W4 vr4, vr5
    391 endfuncl
    392 
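// Shared DC-only fast path: dc' = round(dc*181/256), applied a second
// time for rectangular blocks (2*w == h or 2*h == w) to fold in the
// sqrt(2) scale, optionally rounded by \shift; the final vmadd/vssrarni
// leave the packed .h residual replicated in vr20, and four destination
// rows are preloaded into vr10-vr13 along the way.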
    393 .macro idct_dc w, h, shift
    394    ld.h          t2,       a2,       0      // dc
    395    vldi          vr0,      0x8b5            // 181
    396    vreplgr2vr.w  vr1,      t2
    397    vldi          vr20,     0x880            // 128
    398    vmul.w        vr2,      vr0,      vr1    // dc * 181
    399    st.h          zero,     a2,       0
    400    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    401    vld           vr10,     a0,       0      // 0 1 2 3 4 5 6 7
    402 
    403 .if (2*\w == \h) || (2*\h == \w)
    404    vmul.w        vr2,      vr0,      vr2
    405    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
    406 .endif
    407 .if \shift>0
    408    vsrari.w      vr2,      vr2,      \shift      // (dc + rnd) >> shift
    409 .endif
    410    vldx          vr11,     a0,       a1     // 8 9 10 11 12 13 14 15
    411    alsl.d        t2,       a1,       a0,    1
    412    vmadd.w       vr20,     vr2,      vr0
    413    vld           vr12,     t2,       0      // 16 17 18 19 20 21 22 23
    414    vssrarni.h.w  vr20,     vr20,     12
    415    vldx          vr13,     t2,       a1     // 24 25 26 27 28 29 30 31
    416 .endm
    417 
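// Emits one inv_txfm_add_<txfm1>_<txfm2>_4x4_8bpc entry per transform
// pair: dct_dct takes the DC-only shortcut when a3 (eob) is zero,
// otherwise the first/second pass helpers are dispatched via t7/t8.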
    418 .macro fun4x4 txfm1, txfm2
    419 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_lsx
    420 .ifc \txfm1\()_\txfm2, dct_dct
    421    bnez          a3,       1f
    422 
    423    idct_dc 4, 4, 0
    424 
    425    DST_ADD_W4    vr10, vr11, vr12, vr13, vr20, vr20
    426    b             .\txfm1\()_\txfm2\()_4X4_END
    427 1:
    428 .endif
    429 
    430    la.local     t7,    inv_\txfm1\()_4h_x4_lsx
    431    la.local     t8,    inv_\txfm2\()_4h_x4_lsx
    432 
    433    b            inv_txfm_add_4x4_lsx
    434 .\txfm1\()_\txfm2\()_4X4_END:
    435 endfunc
    436 .endm
    437 
    438 fun4x4 dct, dct
    439 fun4x4 identity, identity
    440 fun4x4 adst, dct
    441 fun4x4 dct, adst
    442 fun4x4 adst, adst
    443 fun4x4 dct, flipadst
    444 fun4x4 flipadst, adst
    445 fun4x4 adst, flipadst
    446 fun4x4 flipadst, dct
    447 fun4x4 flipadst, flipadst
    448 fun4x4 dct, identity
    449 fun4x4 identity, dct
    450 fun4x4 flipadst, identity
    451 fun4x4 identity, flipadst
    452 fun4x4 identity, adst
    453 fun4x4 adst, identity
    454 
    455 const iadst8_coeffs_h, align=4
    456    .short          4076, 401, 3612, 1931
    457    .short          2598, 3166, 1189, 3920
    458    .short          2896, 0, 1567, 3784, 0, 0, 0, 0
    459 endconst
    460 
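// 8-point inverse ADST: four coefficient-pair rotations build t0a-t7a,
// a +/- butterfly and two 1567/3784 rotations form the middle stage,
// and 2896 half-butterflies plus sign flips on out1/out3/out5/out7
// produce the final outputs.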
    461 .macro inv_adst8_lsx out0, out1, out2, out3, out4, out5, out6, out7, sz
    462    la.local      t0,       iadst8_coeffs_h
    463 
    464    vldrepl.h     vr20,     t0,       0     // 4076
    465    vldrepl.h     vr21,     t0,       2     // 401
    466    vmulev_vmaddod_lsx vr7, vr0, vr20, vr21, vr16, vr17, \sz
    467    vneg.h        vr20,     vr20
    468    vmulev_vmaddod_lsx vr7, vr0, vr21, vr20, vr18, vr19, \sz
    469    vssrarni.h.w  vr17,     vr16,     12    // t0a
    470    vssrarni.h.w  vr19,     vr18,     12    // t1a
    471 
    472    vldrepl.h     vr20,     t0,       4     // 3612
    473    vldrepl.h     vr21,     t0,       6     // 1931
    474    vmulev_vmaddod_lsx vr5, vr2, vr20, vr21, vr0, vr16, \sz
    475    vneg.h        vr20,     vr20
    476    vmulev_vmaddod_lsx vr5, vr2, vr21, vr20, vr7, vr18, \sz
    477    vssrarni.h.w  vr16,     vr0,      12    // t2a
    478    vssrarni.h.w  vr18,     vr7,      12    // t3a
    479 
    480    vldrepl.h     vr20,     t0,       8     // 2598
    481    vldrepl.h     vr21,     t0,       10    // 3166
    482    vmulev_vmaddod_lsx vr3, vr4, vr20, vr21, vr2, vr0, \sz
    483    vneg.h        vr20,     vr20
    484    vmulev_vmaddod_lsx vr3, vr4, vr21, vr20, vr5, vr7, \sz
    485    vssrarni.h.w  vr0,      vr2,      12    // t4a
    486    vssrarni.h.w  vr7,      vr5,      12    // t5a
    487 
    488    vldrepl.h     vr20,     t0,       12    // 1189
    489    vldrepl.h     vr21,     t0,       14    // 3920
    490    vmulev_vmaddod_lsx vr1, vr6, vr20, vr21, vr3, vr2, \sz
    491    vneg.h        vr20,     vr20
    492    vmulev_vmaddod_lsx vr1, vr6, vr21, vr20, vr4, vr5, \sz
    493    vssrarni.h.w  vr2,      vr3,      12    // t6a
    494    vssrarni.h.w  vr5,      vr4,      12    // t7a
    495 
    496    vsadd.h       vr3,      vr17,     vr0   // t0
    497    vssub.h       vr4,      vr17,     vr0   // t4
    498    vsadd.h       vr1,      vr19,     vr7   // t1
    499    vssub.h       vr6,      vr19,     vr7   // t5
    500    vsadd.h       vr17,     vr16,     vr2   // t2
    501    vssub.h       vr19,     vr16,     vr2   // t6
    502    vsadd.h       vr0,      vr18,     vr5   // t3
    503    vssub.h       vr7,      vr18,     vr5   // t7
    504 
    505    la.local      t0,       idct_coeffs_h
    506 
    507    vldrepl.h     vr20,     t0,       4     // 1567
    508    vldrepl.h     vr21,     t0,       6     // 3784
    509    vmulev_vmaddod_lsx vr4, vr6, vr21, vr20, vr16, vr5, \sz
    510    vneg.h        vr21,     vr21
    511    vmulev_vmaddod_lsx vr4, vr6, vr20, vr21, vr18, vr2, \sz
    512    vssrarni.h.w  vr5,      vr16,     12    // t4a
    513    vssrarni.h.w  vr2,      vr18,     12    // t5a
    514 
    515    vneg.h        vr21,     vr21
    516    vmulev_vmaddod_lsx vr7, vr19, vr20, vr21, vr4, vr16, \sz
    517    vneg.h        vr20,     vr20
    518    vmulev_vmaddod_lsx vr7, vr19, vr21, vr20, vr6, vr18, \sz
    519    vssrarni.h.w  vr16,     vr4,      12    // t7a
    520    vssrarni.h.w  vr18,     vr6,      12    // t6a
    521 
    522    vsadd.h       vr4,      vr5,      vr18  // out1
    523    vssub.h       vr19,     vr5,      vr18  // t6
    524    vsadd.h       vr20,     vr1,      vr0   // out7
    525    vssub.h       vr18,     vr1,      vr0   // t3
    526    vsadd.h       \out0,    vr3,      vr17  // out0
    527    vssub.h       vr5,      vr3,      vr17  // t2
    528    vsadd.h       \out6,    vr2,      vr16  // out6
    529    vssub.h       vr23,     vr2,      vr16  // t7
    530 
    531    vsllwil.w.h   vr3,      vr20,     0     // out7
    532    vexth.w.h     \out7,    vr20            // out7
    533    vsllwil.w.h   vr21,     vr4,      0     // out1
    534    vexth.w.h     \out1,    vr4             // out1
    535    vneg.w        vr3,      vr3
    536    vneg.w        \out7,    \out7
    537    vneg.w        vr21,     vr21
    538    vneg.w        \out1,    \out1
    539    vssrarni.h.w  \out7,    vr3,      0
    540    vssrarni.h.w  \out1,    vr21,     0
    541 
    542    la.local      t0,       idct_coeffs_h
    543 
    544    vldrepl.h     vr20,     t0,       0     // 2896
    545    vmulev_vmaddod_lsx vr5, vr18, vr20, vr20, vr16, \out3, \sz
    546    vneg.h        vr21,     vr20
    547    vmulev_vmaddod_lsx vr5, vr18, vr20, vr21, vr17, \out4, \sz
    548    vsrari.w      vr16,     vr16,     12
    549    vsrari.w      \out3,    \out3,    12
    550    vneg.w        vr16,     vr16
    551    vneg.w        \out3,    \out3
    552    vssrarni.h.w  \out3,    vr16,     0     // out3
    553    vssrarni.h.w  \out4,    vr17,     12    // out4
    554 
    555    vmulev_vmaddod_lsx vr19, vr23, vr20, vr20, vr16, \out2, \sz
    556    vmulev_vmaddod_lsx vr19, vr23, vr20, vr21, vr17, \out5, \sz
    557    vssrarni.h.w  \out2,    vr16,     12    // out2
    558    vsrari.w      vr17,     vr17,     12
    559    vsrari.w      \out5,    \out5,    12
    560    vneg.w        vr17,     vr17
    561    vneg.w        \out5,    \out5
    562    vssrarni.h.w  \out5,    vr17,     0     // out5
    563 .endm
    564 
    565 functionl inv_adst_8h_x8_lsx
    566    inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
    567 endfuncl
    568 
    569 functionl inv_flipadst_8h_x8_lsx
    570    inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h
    571 endfuncl
    572 
    573 functionl inv_adst_4h_x8_lsx
    574    inv_adst8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
    575 endfuncl
    576 
    577 functionl inv_flipadst_4h_x8_lsx
    578    inv_adst8_lsx vr7, vr6, vr5, vr4, vr3, vr2, vr1, vr0, .8h
    579 endfuncl
    580 
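// 8-point inverse DCT: the even inputs reuse inv_dct4_lsx; the odd
// inputs go through the 799/4017 and 3406/2276 rotations and a 2896
// half-butterfly before the final +/- butterfly forms c[0]..c[7].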
    581 .macro inv_dct8_lsx in0, in1, in2, in3, in4, in5, in6, in7, sz
    582    inv_dct4_lsx \in0, \in2, \in4, \in6, \in0, \in2, \in4, \in6, \sz
    583 
    584    la.local      t0,       idct_coeffs_h
    585 
    586    vldrepl.h     vr20,     t0,       8        // 799
    587    vldrepl.h     vr21,     t0,       10       // 4017
    588    vmulev_vmaddod_lsx  \in1, \in7, vr21, vr20, vr16, vr17, \sz
    589    vneg.h        vr21,     vr21
    590    vmulev_vmaddod_lsx  \in1, \in7, vr20, vr21, vr18, vr19, \sz
    591    vssrarni.h.w  vr17,     vr16,     12       // t7a
    592    vssrarni.h.w  vr19,     vr18,     12       // t4a
    593 
    594    vldrepl.h     vr20,     t0,       12       // 3406
    595    vldrepl.h     vr21,     t0,       14       // 2276
    596    vmulev_vmaddod_lsx  \in5, \in3, vr21, vr20, \in1, vr16, \sz
    597    vneg.h        vr21,     vr21
    598    vmulev_vmaddod_lsx  \in5, \in3, vr20, vr21, \in7, vr18, \sz
    599    vssrarni.h.w  vr16,     \in1,       12      // t6a
    600    vssrarni.h.w  vr18,     \in7,       12      // t5a
    601 
    602    vssub.h       \in7,     vr19,      vr18     // t5a
    603    vsadd.h       vr18,     vr19,      vr18     // t4
    604    vssub.h       \in5,     vr17,      vr16     // t6a
    605    vsadd.h       vr16,     vr17,      vr16     // t7
    606 
    607    vldrepl.h     vr20,     t0,        0        // 2896
    608    vmulev_vmaddod_lsx  \in5, \in7, vr20, vr20, \in1, vr17, \sz
    609    vneg.h        vr21,     vr20
    610    vmulev_vmaddod_lsx  \in5, \in7, vr20, vr21, vr23, vr19, \sz
    611    vssrarni.h.w  vr17,     \in1,      12       // t6
    612    vssrarni.h.w  vr19,     vr23,      12       // t5
    613 
    614    vssub.h       \in7,      \in0,     vr16     //c[7]
    615    vsadd.h       \in0,      \in0,     vr16     //c[0]
    616    vssub.h       \in5,      \in4,     vr19     //c[5]
    617    vsadd.h       vr23,      \in4,     vr19     //c[2]
    618    vssub.h       \in4,      \in6,     vr18     //c[4]
    619    vsadd.h       \in3,      \in6,     vr18     //c[3]
    620    vssub.h       \in6,      \in2,     vr17     //c[6]
    621    vsadd.h       \in1,      \in2,     vr17     //c[1]
    622    vor.v         \in2,      vr23,     vr23
    623 .endm
    624 
    625 functionl inv_dct_8h_x8_lsx
    626    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
    627 endfuncl
    628 
    629 functionl inv_dct_4h_x8_lsx
    630    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .4h
    631 endfuncl
    632 
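// 8-pixel-wide variant of DST_ADD_W4: widen four destination rows,
// add the residuals in in4-in7, saturate and store with stride a1.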
    633 .macro DST_ADD_W8 in0, in1, in2, in3, in4, in5, in6, in7
    634    vsllwil.hu.bu vr0,      \in0,     0
    635    vsllwil.hu.bu vr1,      \in1,     0
    636    vsllwil.hu.bu vr2,      \in2,     0
    637    vsllwil.hu.bu vr3,      \in3,     0
    638    vadd.h        vr0,      \in4,     vr0
    639    vadd.h        vr1,      \in5,     vr1
    640    vadd.h        vr2,      \in6,     vr2
    641    vadd.h        vr3,      \in7,     vr3
    642    vssrani.bu.h  vr1,      vr0,      0
    643    vssrani.bu.h  vr3,      vr2,      0
    644    vstelm.d      vr1,      a0,       0,    0
    645    vstelmx.d     vr1,      a0,       a1,   1
    646    vstelmx.d     vr3,      a0,       a1,   0
    647    vstelmx.d     vr3,      a0,       a1,   1
    648 .endm
    649 
    650 .macro VLD_DST_ADD_W8 in0, in1, in2, in3
    651    vld           vr0,      a0,       0
    652    vldx          vr1,      a0,       a1
    653    vld           vr2,      t2,       0
    654    vldx          vr3,      t2,       a1
    655 
    656    DST_ADD_W8 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3
    657 .endm
    658 
    659 functionl inv_identity_8h_x8_lsx
    660 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    661    vsadd.h       \i,       \i,       \i
    662 .endr
    663 endfuncl
    664 
    665 functionl inv_identity_4h_x8_lsx
    666 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    667    vsadd.h       \i,       \i,       \i
    668 .endr
    669 endfuncl
    670 
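// Common 8x8 add path: load eight rows, clear the coefficient buffer,
// run the first pass via t7 (skipped for identity, whose <<1 cancels
// the >>1 rounding), transpose, run the second pass via t8, then round
// by 4 and accumulate into the destination.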
    671 .macro def_fn_8x8_base variant
    672 functionl inv_txfm_\variant\()add_8x8_lsx
    673    vxor.v  vr23, vr23, vr23
    674    vld_x8 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    675 .irp i, 0, 16, 32, 48, 64, 80, 96, 112
    676    vst           vr23,     a2,       \i
    677 .endr
    678 
    679 .ifc \variant, identity_
    680    // The identity shl #1 and downshift srshr #1 cancel out
    681    b             .itx_8x8_epilog
    682 .else
    683 
    684    move          t6,       ra
    685    jirl          ra,       t7,       0
    686    move          ra,       t6
    687 
    688 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
    689    vsrari.h      \i,       \i,       1
    690 .endr
    691 
    692 .itx_8x8_epilog:
    693    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
    694                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
    695                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
    696 
    697    move          t6,       ra
    698    jirl          ra,       t8,       0
    699    move          ra,       t6
    700 
    701    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
    702                vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4
    703 
    704    alsl.d        t2,       a1,       a0,     1
    705    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
    706    add.d         a0,       a0,       a1
    707    alsl.d        t2,       a1,       a0,     1
    708    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
    709 .endif
    710 endfuncl
    711 .endm
    712 
    713 def_fn_8x8_base identity_
    714 def_fn_8x8_base
    715 
    716 .macro fn8x8 txfm1, txfm2
    717 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_lsx
    718 .ifc \txfm1\()_\txfm2, dct_dct
    719    bnez          a3,       .NO_HAS_DCONLY_8x8
    720 
    721    idct_dc 8, 8, 1
    722 
    723    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
    724 
    725    add.d         a0,       a1,       a0
    726    alsl.d        t2,       a1,       a0,     1
    727    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20
    728 
    729    b             .\txfm1\()_\txfm2\()_8X8_END
    730 .NO_HAS_DCONLY_8x8:
    731 .endif
    732    la.local      t8,       inv_\txfm2\()_8h_x8_lsx
    733 .ifc \txfm1, identity
    734    b             inv_txfm_identity_add_8x8_lsx
    735 .else
    736    la.local      t7,       inv_\txfm1\()_8h_x8_lsx
    737    b             inv_txfm_add_8x8_lsx
    738 .endif
    739 .\txfm1\()_\txfm2\()_8X8_END:
    740 endfunc
    741 .endm
    742 
    743 fn8x8 dct, dct
    744 fn8x8 identity, identity
    745 fn8x8 dct, adst
    746 fn8x8 dct, flipadst
    747 fn8x8 dct, identity
    748 fn8x8 adst, dct
    749 fn8x8 adst, adst
    750 fn8x8 adst, flipadst
    751 fn8x8 flipadst, dct
    752 fn8x8 flipadst, adst
    753 fn8x8 flipadst, flipadst
    754 fn8x8 identity, dct
    755 fn8x8 adst, identity
    756 fn8x8 flipadst, identity
    757 fn8x8 identity, adst
    758 fn8x8 identity, flipadst
    759 
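// Rectangular blocks rescale their coefficients by 1/sqrt(2):
// out = round(in * 2896 / 4096) on sign-extended 32-bit lanes.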
    760 .macro rect2_lsx in0, in1, out0
    761    vsllwil.w.h   vr22,     \in0,     0     // in1
    762    vexth.w.h     \in0,     \in0            // in1
    763    vmul.w        vr22,     vr22,     \in1
    764    vmul.w        \out0,    \in0,     \in1
    765    vssrarni.h.w  \out0,    vr22,     12
    766 .endm
    767 
    768 .macro LSX_TRANSPOSE8x4_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
    769                          out2, out3, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5
    770    vilvl.h       \tmp0,    \in1,     \in0
    771    vilvl.h       \tmp1,    \in3,     \in2
    772    vilvl.w       \tmp2,    \tmp1,    \tmp0
    773    vilvh.w       \tmp3,    \tmp1,    \tmp0
    774    vilvl.h       \tmp0,    \in5,     \in4
    775    vilvl.h       \tmp1,    \in7,     \in6
    776    vilvl.w       \tmp4,    \tmp1,    \tmp0
    777    vilvh.w       \tmp5,    \tmp1,    \tmp0
    778    vilvl.d       \out0,    \tmp4,    \tmp2
    779    vilvh.d       \out1,    \tmp4,    \tmp2
    780    vilvl.d       \out2,    \tmp5,    \tmp3
    781    vilvh.d       \out3,    \tmp5,    \tmp3
    782 .endm
    783 
    784 functionl inv_txfm_add_8x4_lsx
    785    vxor.v        vr23,     vr23,     vr23
    786    vld           vr0,      a2,       0
    787    vld           vr2,      a2,       16
    788    vld           vr4,      a2,       32
    789    vld           vr6,      a2,       48
    790 .irp i, 0, 16, 32, 48
    791    vst           vr23,     a2,       \i
    792 .endr
    793 
    794    li.w          t0,       2896
    795    vreplgr2vr.w  vr23,     t0
    796    rect2_lsx     vr0,      vr23,     vr0
    797    rect2_lsx     vr2,      vr23,     vr2
    798    rect2_lsx     vr4,      vr23,     vr4
    799    rect2_lsx     vr6,      vr23,     vr6
    800 
    801    vilvh.d       vr1,      vr0,      vr0
    802    vilvh.d       vr3,      vr2,      vr2
    803    vilvh.d       vr5,      vr4,      vr4
    804    vilvh.d       vr7,      vr6,      vr6
    805 
    806    move          t6,       ra
    807    jirl          ra,       t7,       0
    808    move          ra,       t6
    809 
    810    LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \
    811                       vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21
    812 
    813    move          t6,       ra
    814    jirl          ra,       t8,       0
    815    move          ra,       t6
    816 
    817    vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4
    818 
    819    alsl.d        t2,       a1,       a0,     1
    820    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
    821 endfuncl
    822 
    823 .macro LSX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, out4, \
    824                          out5, out6, out7, tmp0, tmp1, tmp2, tmp3
    825    vilvl.h       \tmp0,    \in1,     \in0
    826    vilvl.h       \tmp1,    \in3,     \in2
    827    vilvh.h       \tmp2,    \in1,     \in0
    828    vilvh.h       \tmp3,    \in3,     \in2
    829    vilvl.w       \out0,    \tmp1,    \tmp0
    830    vilvh.w       \out2,    \tmp1,    \tmp0
    831    vilvl.w       \out4,    \tmp3,    \tmp2
    832    vilvh.w       \out6,    \tmp3,    \tmp2
    833 
    834    vbsrl.v       \out1,    \out0,    8
    835    vbsrl.v       \out3,    \out2,    8
    836    vbsrl.v       \out5,    \out4,    8
    837    vbsrl.v       \out7,    \out6,    8
    838    vinsgr2vr.d   \out0,    zero,     1
    839    vinsgr2vr.d   \out2,    zero,     1
    840    vinsgr2vr.d   \out4,    zero,     1
    841    vinsgr2vr.d   \out6,    zero,     1
    842 .endm
    843 
    844 functionl inv_txfm_add_4x8_lsx
    845    vxor.v        vr23,     vr23,     vr23
    846    vld           vr0,      a2,       0
    847    vld           vr1,      a2,       16
    848    vld           vr2,      a2,       32
    849    vld           vr3,      a2,       48
    850 .irp i, 0, 16, 32, 48
    851    vst           vr23,     a2,       \i
    852 .endr
    853 
    854    li.w          t0,       2896
    855    vreplgr2vr.w  vr23,     t0
    856    rect2_lsx     vr0,      vr23,     vr0
    857    rect2_lsx     vr1,      vr23,     vr1
    858    rect2_lsx     vr2,      vr23,     vr2
    859    rect2_lsx     vr3,      vr23,     vr3
    860 
    861    move          t6,       ra
    862    jirl          ra,       t7,       0
    863    move          ra,       t6
    864 
    865    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \
    866                       vr6, vr7, vr16, vr17, vr18, vr19
    867 
    868    move          t6,       ra
    869    jirl          ra,       t8,       0
    870    move          ra,       t6
    871 
    872    vilvl.d       vr0,      vr1,      vr0
    873    vilvl.d       vr1,      vr3,      vr2
    874    vilvl.d       vr2,      vr5,      vr4
    875    vilvl.d       vr3,      vr7,      vr6
    876 
    877    vsrari_h_x4 vr0, vr1, vr2, vr3, vr16, vr17, vr18, vr19, 4
    878 
    879    alsl.d        t2,       a1,       a0,    1
    880    VLD_DST_ADD_W4 vr16, vr17
    881    add.d         a0,       a1,       a0
    882    alsl.d        t2,       a1,       a0,    1
    883    VLD_DST_ADD_W4 vr18, vr19
    884 endfuncl
    885 
    886 .macro fn8x4 txfm1, txfm2
    887 function inv_txfm_add_\txfm1\()_\txfm2\()_8x4_8bpc_lsx
    888 .ifc \txfm1\()_\txfm2, dct_dct
    889    bnez          a3,       .NO_HAS_DCONLY_8x4
    890 
    891    idct_dc 8, 4, 0
    892 
    893    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
    894 
    895    b             .\txfm1\()_\txfm2\()_8X4_END
    896 .NO_HAS_DCONLY_8x4:
    897 .endif
    898    la.local      t7,       inv_\txfm1\()_4h_x8_lsx
    899    la.local      t8,       inv_\txfm2\()_8h_x4_lsx
    900    b             inv_txfm_add_8x4_lsx
    901 .\txfm1\()_\txfm2\()_8X4_END:
    902 endfunc
    903 .endm
    904 
    905 fn8x4 dct, dct
    906 fn8x4 identity, identity
    907 fn8x4 dct, adst
    908 fn8x4 dct, flipadst
    909 fn8x4 dct, identity
    910 fn8x4 adst, dct
    911 fn8x4 adst, adst
    912 fn8x4 adst, flipadst
    913 fn8x4 flipadst, dct
    914 fn8x4 flipadst, adst
    915 fn8x4 flipadst, flipadst
    916 fn8x4 identity, dct
    917 fn8x4 adst, identity
    918 fn8x4 flipadst, identity
    919 fn8x4 identity, adst
    920 fn8x4 identity, flipadst
    921 
    922 .macro fn4x8 txfm1, txfm2
    923 function inv_txfm_add_\txfm1\()_\txfm2\()_4x8_8bpc_lsx
    924 .ifc \txfm1\()_\txfm2, dct_dct
    925    bnez          a3,       .NO_HAS_DCONLY_4x8
    926 
    927    idct_dc 4, 8, 0
    928 
    929    DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20
    930 
    931    add.d         a0,       a0,       a1
    932    alsl.d        t2,       a1,       a0,   1
    933    VLD_DST_ADD_W4 vr20, vr20
    934    b             .\txfm1\()_\txfm2\()_4X8_END
    935 .NO_HAS_DCONLY_4x8:
    936 .endif
    937    la.local      t7,       inv_\txfm1\()_8h_x4_lsx
    938    la.local      t8,       inv_\txfm2\()_4h_x8_lsx
    939    b             inv_txfm_add_4x8_lsx
    940 .\txfm1\()_\txfm2\()_4X8_END:
    941 endfunc
    942 .endm
    943 
    944 fn4x8 dct, dct
    945 fn4x8 identity, identity
    946 fn4x8 dct, adst
    947 fn4x8 dct, flipadst
    948 fn4x8 dct, identity
    949 fn4x8 adst, dct
    950 fn4x8 adst, adst
    951 fn4x8 adst, flipadst
    952 fn4x8 flipadst, dct
    953 fn4x8 flipadst, adst
    954 fn4x8 flipadst, flipadst
    955 fn4x8 identity, dct
    956 fn4x8 adst, identity
    957 fn4x8 flipadst, identity
    958 fn4x8 identity, adst
    959 fn4x8 identity, flipadst
    960 
    961 .macro inv_identity4_lsx_x2 in0, in1, in2, in3, in4, out0, out1
    962    vsllwil.w.h   vr4,      \in0,    0
    963    vexth.w.h     vr5,      \in0
    964    vsllwil.w.h   vr6,      \in1,    0
    965    vexth.w.h     vr7,      \in1
    966    vmul.w        vr4,      vr4,     \in2
    967    vmul.w        vr5,      vr5,     \in2
    968    vmul.w        vr6,      vr6,     \in2
    969    vmul.w        vr7,      vr7,     \in2
    970    vssrarni.h.w  vr5,      vr4,     12
    971    vssrarni.h.w  vr7,      vr6,     12
    972    vsadd.h       \out0,    vr5,     \in3
    973    vsadd.h       \out1,    vr7,     \in4
    974 .endm
    975 
    976 .macro vmul_vmadd_w in0, in1, in2, in3, out0, out1
    977    vsllwil.w.h   vr22,     \in0,     0
    978    vexth.w.h     vr23,     \in0
    979    vmul.w        \out0,    vr22,     \in2
    980    vmul.w        \out1,    vr23,     \in2
    981    vsllwil.w.h   vr22,     \in1,     0
    982    vexth.w.h     vr23,     \in1
    983    vmadd.w       \out0,    vr22,     \in3
    984    vmadd.w       \out1,    vr23,     \in3
    985 .endm
    986 
    987 .macro vmul_vmsub_w in0, in1, in2, in3, out0, out1
    988    vsllwil.w.h   vr22,     \in0,     0
    989    vexth.w.h     vr23,     \in0
    990    vmul.w        \out0,    vr22,     \in2
    991    vmul.w        \out1,    vr23,     \in2
    992    vsllwil.w.h   vr22,     \in1,     0
    993    vexth.w.h     vr23,     \in1
    994    vmsub.w       \out0,    vr22,     \in3
    995    vmsub.w       \out1,    vr23,     \in3
    996 .endm
    997 
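// 16-point inverse DCT: the even half reuses inv_dct8_lsx; the odd
// inputs pass through the 401/4076, 3166/2598, 1931/3612 and 3920/1189
// rotations, two correction stages (1567/3784 and 2896), and a final
// 16-output butterfly.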
    998 .macro inv_dct16_lsx sz
    999    inv_dct8_lsx vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14, \sz
   1000 
   1001    la.local      t0,       idct_coeffs_h
   1002    vldrepl.h     vr20,     t0,       16        // 401
   1003    vldrepl.h     vr21,     t0,       18        // 4076
   1004    vmulev_vmaddod_lsx vr1, vr15, vr21, vr20, vr16, vr17, \sz
   1005    vneg.h        vr21,     vr21
   1006    vmulev_vmaddod_lsx vr1, vr15, vr20, vr21, vr18, vr19, \sz
   1007    vssrarni.h.w  vr17,     vr16,     12        // t15a
   1008    vssrarni.h.w  vr19,     vr18,     12        // t8a
   1009    vldrepl.h     vr20,     t0,       20        // 3166 -> 1583
   1010    vldrepl.h     vr21,     t0,       22        // 2598 -> 1299
   1011    vmulev_vmaddod_lsx vr9, vr7, vr21, vr20, vr1, vr16, \sz
   1012    vneg.h        vr21,     vr21
   1013    vmulev_vmaddod_lsx vr9, vr7, vr20, vr21, vr15, vr18, \sz
   1014    vssrarni.h.w  vr16,     vr1,      12        // t14a
   1015    vssrarni.h.w  vr18,     vr15,     12        // t9a
   1016    vldrepl.h     vr20,     t0,       24        // 1931
   1017    vldrepl.h     vr21,     t0,       26        // 3612
   1018    vmulev_vmaddod_lsx vr5, vr11, vr21, vr20, vr7, vr1, \sz
   1019    vneg.h        vr21,     vr21
   1020    vmulev_vmaddod_lsx vr5, vr11, vr20, vr21, vr9, vr15, \sz
   1021    vssrarni.h.w  vr1,      vr7,      12        // t13a
   1022    vssrarni.h.w  vr15,     vr9,      12        // t10a
   1023    vldrepl.h     vr20,     t0,       28        // 3920
   1024    vldrepl.h     vr21,     t0,       30        // 1189
   1025    vmulev_vmaddod_lsx vr13, vr3, vr21, vr20, vr5, vr7, \sz
   1026    vneg.h        vr21,     vr21
   1027    vmulev_vmaddod_lsx vr13, vr3, vr20, vr21, vr11, vr9, \sz
   1028    vssrarni.h.w  vr7,      vr5,      12        // t12a
   1029    vssrarni.h.w  vr9,      vr11,     12        // t11a
   1030 
   1031    vsadd.h       vr5,      vr19,     vr18     // t8
   1032    vssub.h       vr11,     vr19,     vr18     // t9
   1033    vssub.h       vr3,      vr9,      vr15     // t10
   1034    vsadd.h       vr13,     vr9,      vr15     // t11
   1035    vsadd.h       vr18,     vr7,      vr1      // t12
   1036    vssub.h       vr19,     vr7,      vr1      // t13
   1037    vssub.h       vr9,      vr17,     vr16     // t14
   1038    vsadd.h       vr15,     vr17,     vr16     // t15
   1039 
   1040    vldrepl.h     vr20,     t0,       4        // 1567
   1041    vldrepl.h     vr21,     t0,       6        // 3784
   1042    vmulev_vmaddod_lsx vr9, vr11, vr21, vr20, vr1, vr16, \sz
   1043    vneg.h        vr21,     vr21
   1044    vmulev_vmaddod_lsx vr9, vr11, vr20, vr21, vr7, vr17, \sz
   1045    vssrarni.h.w  vr16,     vr1,      12       // t14a
   1046    vssrarni.h.w  vr17,     vr7,      12       // t9a
   1047 
   1048    vneg.h        vr21,     vr21
   1049    vmulev_vmaddod_lsx vr19, vr3, vr21, vr20, vr9, vr1, \sz
   1050    vneg.h        vr21,     vr21
   1051    vmulev_vmaddod_lsx vr19, vr3, vr20, vr21, vr11, vr7, \sz
   1052    vneg.w        vr1,      vr1
   1053    vneg.w        vr9,      vr9
   1054    vssrarni.h.w  vr7,      vr11,     12       // t13a
   1055    vssrarni.h.w  vr1,      vr9,      12       // t10a
   1056    vsadd.h       vr9,      vr5,      vr13     // t8a
   1057    vssub.h       vr11,     vr5,      vr13     // t11a
   1058    vssub.h       vr3,      vr15,     vr18     // t12a
   1059    vsadd.h       vr19,     vr15,     vr18     // t15a
   1060    vsadd.h       vr5,      vr17,     vr1      // t9
   1061    vssub.h       vr13,     vr17,     vr1      // t10
   1062    vssub.h       vr15,     vr16,     vr7      // t13
   1063    vsadd.h       vr18,     vr16,     vr7      // t14
   1064 
   1065    vldrepl.h     vr20,     t0,       0        // 2896
   1066    vmulev_vmaddod_lsx vr15, vr13, vr20, vr20, vr1, vr7, \sz
   1067    vneg.h        vr21,     vr20
   1068    vmulev_vmaddod_lsx vr15, vr13, vr20, vr21, vr17, vr16, \sz
   1069    vssrarni.h.w  vr7,      vr1,      12       // t13a
   1070    vssrarni.h.w  vr16,     vr17,     12       // t10a
   1071 
   1072    vmulev_vmaddod_lsx vr3, vr11, vr20, vr20, vr13, vr23, \sz
   1073    vmulev_vmaddod_lsx vr3, vr11, vr20, vr21, vr15, vr17, \sz
   1074    vssrarni.h.w  vr23,     vr13,     12       // t12
   1075    vssrarni.h.w  vr17,     vr15,     12       // t11
   1076 
   1077    vssub.h       vr15,     vr0,     vr19      // c[15]
   1078    vsadd.h       vr0,      vr0,     vr19      // c[0]
   1079    vsadd.h       vr1,      vr2,     vr18      // c[1]
   1080    vssub.h       vr20,     vr2,     vr18      // c[14]
   1081    vsadd.h       vr2,      vr4,     vr7       // c[2]
   1082    vssub.h       vr13,     vr4,     vr7       // c[13]
   1083    vsadd.h       vr3,      vr6,     vr23      // c[3]
   1084    vssub.h       vr21,     vr6,     vr23      // c[12]
   1085    vsadd.h       vr4,      vr8,     vr17      // c[4]
   1086    vssub.h       vr11,     vr8,     vr17      // c[11]
   1087    vsadd.h       vr7,      vr14,    vr9       // c[7]
   1088    vssub.h       vr8,      vr14,    vr9       // c[8]
   1089    vsadd.h       vr6,      vr12,    vr5       // c[6]
   1090    vssub.h       vr9,      vr12,    vr5       // c[9]
   1091    vsadd.h       vr5,      vr10,    vr16      // c[5]
   1092    vssub.h       vr10,     vr10,    vr16      // c[10]
   1093    vor.v         vr14,     vr20,    vr20
   1094    vor.v         vr12,     vr21,    vr21
   1095 .endm
   1096 
   1097 functionl inv_dct_8h_x16_lsx
   1098    inv_dct16_lsx .8h
   1099 endfuncl
   1100 
   1101 functionl inv_dct_4h_x16_lsx
   1102    inv_dct16_lsx .4h
   1103 endfuncl
   1104 
   1105 .macro VLD_DST_ADD_W4_x4 in0, in1, in2, in3, in4, in5, in6, in7
   1106    alsl.d        t2,       a1,       a0,    1
   1107 
   1108    VLD_DST_ADD_W4 \in0, \in1
   1109 
   1110    add.d         a0,       a1,       a0
   1111    alsl.d        t2,       a1,       a0,    1
   1112    VLD_DST_ADD_W4 \in2, \in3
   1113 
   1114    add.d         a0,       a1,       a0
   1115    alsl.d        t2,       a1,       a0,    1
   1116    VLD_DST_ADD_W4 \in4, \in5
   1117 
   1118    add.d         a0,       a1,       a0
   1119    alsl.d        t2,       a1,       a0,    1
   1120    VLD_DST_ADD_W4 \in6, \in7
   1121 .endm
   1122 
   1123 .macro def_fn_4x16_base txfm
   1124 functionl inv_txfm_\txfm\()add_4x16_lsx
   1125    PUSH_REG
   1126    blt           a3,       t5,       416f
   1127    vld           vr0,      a2,       16
   1128    vld           vr1,      a2,       48
   1129    vld           vr2,      a2,       80
   1130    vld           vr3,      a2,       112
   1131    vxor.v        vr23,     vr23,     vr23
   1132 .irp i, 16, 48, 80, 112
   1133    vst           vr23,     a2,       \i
   1134 .endr
   1135 
   1136    move          t6,       ra
   1137    jirl          ra,       t7,       0
   1138    move          ra,       t6
   1139 
   1140 .ifnc \txfm, identity_
   1141    vsrari.h      vr0,      vr0,      1
   1142    vsrari.h      vr1,      vr1,      1
   1143    vsrari.h      vr2,      vr2,      1
   1144    vsrari.h      vr3,      vr3,      1
   1145 .endif
   1146 
   1147    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr8, vr9, vr24, vr25, vr26, \
   1148                       vr27, vr14, vr28, vr10, vr11, vr12, vr13
   1149 
   1150 416:
   1151    bge           a3,       t5,       416416f
   1152 .irp i, vr8, vr9, vr24, vr25, vr26, vr27, vr14, vr28
   1153    vxor.v        \i,       \i,       \i
   1154 .endr
   1155 
   1156 416416:
   1157    vld           vr0,      a2,       0
   1158    vld           vr1,      a2,       32
   1159    vld           vr2,      a2,       64
   1160    vld           vr3,      a2,       96
   1161    vxor.v        vr23,     vr23,     vr23
   1162 .irp i, 0, 32, 64, 96
   1163    vst           vr23,     a2,       \i
   1164 .endr
   1165 
   1166    move          t6,       ra
   1167    jirl          ra,       t7,       0
   1168    move          ra,       t6
   1169 
   1170 .ifnc \txfm, identity_
   1171    vsrari.h      vr0,      vr0,      1
   1172    vsrari.h      vr1,      vr1,      1
   1173    vsrari.h      vr2,      vr2,      1
   1174    vsrari.h      vr3,      vr3,      1
   1175 .endif
   1176 
   1177    LSX_TRANSPOSE4x8_H vr0, vr1, vr2, vr3, vr0, vr1, vr2, vr3, vr4, vr5, \
   1178                       vr6, vr7, vr16, vr17, vr18, vr19
   1179 
   1180    vor.v         vr10,     vr24,     vr24
   1181    vor.v         vr11,     vr25,     vr25
   1182    vor.v         vr12,     vr26,     vr26
   1183    vor.v         vr13,     vr27,     vr27
   1184    vor.v         vr15,     vr28,     vr28
   1185 
   1186    move          t6,       ra
   1187    jirl          ra,       t8,       0
   1188    move          ra,       t6
   1189 
   1190    vilvl.d       vr16,     vr1,      vr0
   1191    vilvl.d       vr17,     vr3,      vr2
   1192    vilvl.d       vr18,     vr5,      vr4
   1193    vilvl.d       vr19,     vr7,      vr6
   1194    vilvl.d       vr20,     vr9,      vr8
   1195    vilvl.d       vr21,     vr11,     vr10
   1196    vilvl.d       vr22,     vr13,     vr12
   1197    vilvl.d       vr23,     vr15,     vr14
   1198 
   1199 .irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1200    vsrari.h     \i,       \i,       4
   1201 .endr
   1202 
   1203    VLD_DST_ADD_W4_x4 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1204    POP_REG
   1205 endfuncl
   1206 .endm
   1207 
   1208 def_fn_4x16_base identity_
   1209 def_fn_4x16_base
   1210 
   1211 .macro fn4x16 txfm1, txfm2, eob_half
   1212 function inv_txfm_add_\txfm1\()_\txfm2\()_4x16_8bpc_lsx
   1213 .ifc \txfm1\()_\txfm2, dct_dct
   1214    bnez          a3,       .NO_HAS_DCONLY_4x16
   1215 
   1216    idct_dc 4, 16, 1
   1217 
   1218    DST_ADD_W4 vr10, vr11, vr12, vr13, vr20, vr20
   1219 
   1220 .rept 3
   1221    add.d         a0,       a1,       a0
   1222    alsl.d        t2,       a1,       a0,   1
   1223 
   1224    VLD_DST_ADD_W4 vr20, vr20
   1225 .endr
   1226    b             .\txfm1\()_\txfm2\()_4X16_END
   1227 
   1228 .NO_HAS_DCONLY_4x16:
   1229 .endif
   1230    li.w          t5,       \eob_half
   1231    la.local      t7,       inv_\txfm1\()_8h_x4_lsx
   1232 .ifc \txfm1, identity
   1233    la.local      t7,       inv_\txfm1\()_8h_x4_lsx1
   1234 .endif
   1235    la.local      t8,       inv_\txfm2\()_4h_x16_lsx
   1236 
   1237 .ifc \txfm1, identity
   1238    b             inv_txfm_identity_add_4x16_lsx
   1239 .else
   1240    b             inv_txfm_add_4x16_lsx
   1241 .endif
   1242 .\txfm1\()_\txfm2\()_4X16_END:
   1243 endfunc
   1244 .endm
   1245 
   1246 fn4x16 dct, dct, 29
   1247 fn4x16 identity, identity, 29
   1248 fn4x16 dct, adst, 29
   1249 fn4x16 dct, flipadst, 29
   1250 fn4x16 dct, identity, 8
   1251 fn4x16 adst, dct, 29
   1252 fn4x16 adst, adst, 29
   1253 fn4x16 adst, flipadst, 29
   1254 fn4x16 flipadst, dct, 29
   1255 fn4x16 flipadst, adst, 29
   1256 fn4x16 flipadst, flipadst, 29
   1257 fn4x16 identity, dct, 32
   1258 fn4x16 adst, identity, 8
   1259 fn4x16 flipadst, identity, 8
   1260 fn4x16 identity, adst, 32
   1261 fn4x16 identity, flipadst, 32
   1262 
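// Identity16 scales by 2*sqrt(2): out = 2*x + round(x*1697/2048)
// (the >>11 instead of >>12 supplies the extra factor of two), and
// 2 + 1697/2048 ~= 2.8286 ~= 2*sqrt(2).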
   1263 .macro inv_identity16_lsx in0, in1, in2, out0, sz
   1264 .ifc \sz, .8h
   1265    vsllwil.w.h   vr16,     \in0,     0
   1266    vexth.w.h     vr17,     \in0
   1267    vmul.w        vr16,     vr16,     \in1
   1268    vmul.w        vr17,     vr17,     \in1
   1269    vsadd.h       \in2,     \in2,     \in2
   1270    vssrarni.h.w  vr17,     vr16,     11
   1271    vsadd.h       \out0,    vr17,     \in2
   1272 .else
   1273    vsllwil.w.h   vr16,     \in0,     0
   1274    vmul.w        vr16,     vr16,     \in1
   1275    vsadd.h       \in2,     \in2,     \in2
   1276    vssrarni.h.w  vr16,     vr16,     11
   1277    vsadd.h       \out0,    vr16,     \in2
   1278 .endif
   1279 .endm
   1280 
   1281 .macro inv_identity16_lsx1 in0, in1, in2, out0
   1282    vsllwil.w.h   vr16,     \in0,     0
   1283    vexth.w.h     vr17,     \in1
   1284    vmul.w        vr18,     vr16,     \in2
   1285    vmul.w        vr19,     vr17,     \in2
   1286    vsrari.w      vr18,     vr18,     11
   1287    vsrari.w      vr19,     vr19,     11
   1288    vslli.w       vr16,     vr16,     1
   1289    vslli.w       vr17,     vr17,     1
   1290    vadd.w        vr16,     vr18,     vr16
   1291    vadd.w        \out0,    vr19,     vr17
   1292    vssrarni.h.w  \out0,    vr16,     1
   1293 .endm
   1294 
   1295 functionl inv_identity_8h_x16_lsx
   1296    li.w          t0,       1697
   1297    vreplgr2vr.w  vr20,     t0
   1298 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
   1299    vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1300    inv_identity16_lsx \i, vr20, \i, \i, .8h
   1301 .endr
   1302 endfuncl
   1303 
   1304 functionl inv_identity_4h_x16_lsx
   1305    li.w          t0,       1697
   1306    vreplgr2vr.w  vr20,     t0
   1307 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
   1308    vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1309    inv_identity16_lsx \i, vr20, \i, \i, .4h
   1310 .endr
   1311 endfuncl
   1312 
   1313 functionl inv_identity_8h_x16_lsx1
   1314    li.w          t0,       1697
   1315    vreplgr2vr.w  vr20,     t0
   1316 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, \
   1317    vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1318    inv_identity16_lsx1 \i, \i, vr20, \i
   1319 .endr
   1320 endfuncl
   1321 
   1322 const iadst16_coeffs_h, align=4
   1323    .short         4091, 201, 3973, 995
   1324    .short         3703, 1751, 3290, 2440
   1325    .short         2751, 3035, 2106, 3513
   1326    .short         1380, 3857, 601, 4052
   1327 endconst
   1328 
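// 16-point inverse ADST: eight rotations with the iadst16_coeffs_h
// pairs produce t0..t15; the later correction stages reuse the
// 799/4017 and 1567/3784 constants from idct_coeffs_h.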
   1329 .macro inv_adst16_lsx txfm, sz
   1330    la.local      t0,       iadst16_coeffs_h
   1331    vldrepl.h     vr20,     t0,        0        // 4091
   1332    vldrepl.h     vr21,     t0,        2        // 201
   1333    vmulev_vmaddod_lsx vr15, vr0, vr20, vr21, vr16, vr18, \sz
   1334    vneg.h        vr20,     vr20
   1335    vmulev_vmaddod_lsx vr15, vr0, vr21, vr20, vr17, vr19, \sz
   1336    vssrarni.h.w  vr18,     vr16,      12       // t0
   1337    vssrarni.h.w  vr19,     vr17,      12       // t1
   1338    vldrepl.h     vr20,     t0,        4        // 3973
   1339    vldrepl.h     vr21,     t0,        6        // 995
   1340    vmulev_vmaddod_lsx vr13, vr2, vr20, vr21, vr16, vr0, \sz
   1341    vneg.h        vr20,     vr20
   1342    vmulev_vmaddod_lsx vr13, vr2, vr21, vr20, vr17, vr15, \sz
   1343    vssrarni.h.w  vr0,      vr16,      12       // t2
   1344    vssrarni.h.w  vr15,     vr17,      12       // t3
   1345    vldrepl.h     vr20,     t0,        8        // 3703
   1346    vldrepl.h     vr21,     t0,        10       // 1751
   1347    vmulev_vmaddod_lsx vr11, vr4, vr20, vr21, vr16, vr2, \sz
   1348    vneg.h        vr20,     vr20
   1349    vmulev_vmaddod_lsx vr11, vr4, vr21, vr20, vr17, vr13, \sz
   1350    vssrarni.h.w  vr2,      vr16,      12       // t4
   1351    vssrarni.h.w  vr13,     vr17,      12       // t5
   1352    vldrepl.h     vr20,     t0,        12       // 3290 -> 1645
   1353    vldrepl.h     vr21,     t0,        14       // 2440 -> 1220
   1354    vmulev_vmaddod_lsx vr9, vr6, vr20, vr21, vr16, vr4, \sz
   1355    vneg.h        vr20,     vr20
   1356    vmulev_vmaddod_lsx vr9, vr6, vr21, vr20, vr17, vr11, \sz
   1357    vssrarni.h.w  vr4,      vr16,      12       // t6
   1358    vssrarni.h.w  vr11,     vr17,      12       // t7
   1359    vldrepl.h     vr20,     t0,        16       // 2751
   1360    vldrepl.h     vr21,     t0,        18       // 3035
   1361    vmulev_vmaddod_lsx vr7, vr8, vr20, vr21, vr16, vr6, \sz
   1362    vneg.h        vr20,     vr20
   1363    vmulev_vmaddod_lsx vr7, vr8, vr21, vr20, vr17, vr9, \sz
   1364    vssrarni.h.w  vr6,      vr16,      12       // t8
   1365    vssrarni.h.w  vr9,      vr17,      12       // t9
   1366    vldrepl.h     vr20,     t0,        20       // 2106
   1367    vldrepl.h     vr21,     t0,        22       // 3513
   1368    vmulev_vmaddod_lsx vr5, vr10, vr20, vr21, vr16, vr7, \sz
   1369    vneg.h        vr20,     vr20
   1370    vmulev_vmaddod_lsx vr5, vr10, vr21, vr20, vr17, vr8, \sz
   1371    vssrarni.h.w  vr7,      vr16,      12       // t10
   1372    vssrarni.h.w  vr8,      vr17,      12       // t11
   1373    vldrepl.h     vr20,     t0,        24       // 1380
   1374    vldrepl.h     vr21,     t0,        26       // 3857
   1375    vmulev_vmaddod_lsx vr3, vr12, vr20, vr21, vr16, vr5, \sz
   1376    vneg.h        vr20,     vr20
   1377    vmulev_vmaddod_lsx vr3, vr12, vr21, vr20, vr17, vr10, \sz
   1378    vssrarni.h.w  vr5,      vr16,      12       // t12
   1379    vssrarni.h.w  vr10,     vr17,      12       // t13
   1380    vldrepl.h     vr20,     t0,        28       // 601
   1381    vldrepl.h     vr21,     t0,        30       // 4052
   1382    vmulev_vmaddod_lsx vr1, vr14, vr20, vr21, vr16, vr3, \sz
   1383    vneg.h        vr20,     vr20
   1384    vmulev_vmaddod_lsx vr1, vr14, vr21, vr20, vr17, vr12, \sz
   1385    vssrarni.h.w  vr3,      vr16,      12       // t14
   1386    vssrarni.h.w  vr12,     vr17,      12       // t15
   1387 
   1388    vsadd.h       vr1,      vr18,      vr6      // t0a
   1389    vssub.h       vr14,     vr18,      vr6      // t8a
   1390    vsadd.h       vr16,     vr19,      vr9      // t1a
   1391    vssub.h       vr17,     vr19,      vr9      // t9a
   1392    vsadd.h       vr6,      vr0,       vr7      // t2a
   1393    vssub.h       vr18,     vr0,       vr7      // t10a
   1394    vsadd.h       vr9,      vr15,      vr8      // t3a
   1395    vssub.h       vr19,     vr15,      vr8      // t11a
   1396    vsadd.h       vr0,      vr2,       vr5      // t4a
   1397    vssub.h       vr7,      vr2,       vr5      // t12a
   1398    vsadd.h       vr8,      vr13,      vr10     // t5a
   1399    vssub.h       vr15,     vr13,      vr10     // t13a
   1400    vsadd.h       vr2,      vr4,       vr3      // t6a
   1401    vssub.h       vr5,      vr4,       vr3      // t14a
   1402    vsadd.h       vr10,     vr11,      vr12     // t7a
   1403    vssub.h       vr13,     vr11,      vr12     // t15a
   1404 
   1405    la.local      t0,       idct_coeffs_h
   1406 
   1407    vldrepl.h     vr20,     t0,        8        // 799
   1408    vldrepl.h     vr21,     t0,        10       // 4017
   1409    vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr3, vr11, \sz
   1410    vneg.h        vr21,     vr21
   1411    vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr4, vr12, \sz
   1412    vssrarni.h.w  vr11,     vr3,       12       // t8
   1413    vssrarni.h.w  vr12,     vr4,       12       // t9
   1414    vneg.h        vr21,     vr21
   1415    vmulev_vmaddod_lsx vr15, vr7, vr20, vr21, vr3, vr14, \sz
   1416    vneg.h        vr20,     vr20
   1417    vmulev_vmaddod_lsx vr15, vr7, vr21, vr20, vr4, vr17, \sz
   1418    vssrarni.h.w  vr14,     vr3,       12       // t13
   1419    vssrarni.h.w  vr17,     vr4,       12       // t12
   1420    vldrepl.h     vr20,     t0,        12       // 3406
   1421    vldrepl.h     vr21,     t0,        14       // 2276
   1422    vmulev_vmaddod_lsx vr18, vr19, vr21, vr20, vr3, vr7, \sz
   1423    vneg.h        vr21,     vr21
   1424    vmulev_vmaddod_lsx vr18, vr19, vr20, vr21, vr4, vr15, \sz
   1425    vssrarni.h.w  vr7,      vr3,       12       // t10
   1426    vssrarni.h.w  vr15,     vr4,       12       // t11
   1427    vneg.h        vr21,     vr21
   1428    vmulev_vmaddod_lsx vr13, vr5, vr20, vr21, vr3, vr18, \sz
   1429    vneg.h        vr20,     vr20
   1430    vmulev_vmaddod_lsx vr13, vr5, vr21, vr20, vr4, vr19, \sz
   1431    vssrarni.h.w  vr18,     vr3,       12       // t15
   1432    vssrarni.h.w  vr19,     vr4,       12       // t14
   1433 
   1434    vsadd.h       vr5,      vr1,       vr0      // t0
   1435    vssub.h       vr13,     vr1,       vr0      // t4
   1436    vsadd.h       vr3,      vr16,      vr8      // t1
   1437    vssub.h       vr4,      vr16,      vr8      // t5
   1438    vsadd.h       vr0,      vr6,       vr2      // t2
   1439    vssub.h       vr1,      vr6,       vr2      // t6
   1440    vsadd.h       vr8,      vr9,       vr10     // t3
   1441    vssub.h       vr16,     vr9,       vr10     // t7
   1442    vsadd.h       vr2,      vr11,      vr17     // t8a
   1443    vssub.h       vr6,      vr11,      vr17     // t12a
   1444    vsadd.h       vr9,      vr12,      vr14     // t9a
   1445    vssub.h       vr10,     vr12,      vr14     // t13a
   1446    vsadd.h       vr11,     vr7,       vr19     // t10a
   1447    vssub.h       vr17,     vr7,       vr19     // t14a
   1448    vsadd.h       vr12,     vr15,      vr18     // t11a
   1449    vssub.h       vr14,     vr15,      vr18     // t15a
   1450 
   1451    vldrepl.h     vr20,     t0,        4        // 1567
   1452    vldrepl.h     vr21,     t0,        6        // 3784
   1453    vmulev_vmaddod_lsx vr13, vr4, vr21, vr20, vr7, vr18, \sz
   1454    vneg.h        vr21,     vr21
   1455    vmulev_vmaddod_lsx vr13, vr4, vr20, vr21, vr15, vr19, \sz
   1456    vssrarni.h.w  vr18,     vr7,       12       // t4a
   1457    vssrarni.h.w  vr19,     vr15,      12       // t5a
   1458    vneg.h        vr21,     vr21
   1459    vmulev_vmaddod_lsx vr16, vr1, vr20, vr21, vr7, vr4, \sz
   1460    vneg.h        vr20,     vr20
   1461    vmulev_vmaddod_lsx vr16, vr1, vr21, vr20, vr15, vr13, \sz
   1462    vssrarni.h.w  vr4,      vr7,       12       // t7a
   1463    vssrarni.h.w  vr13,     vr15,      12       // t6a
   1464    vneg.h        vr20,     vr20
   1465    vmulev_vmaddod_lsx vr6, vr10, vr21, vr20, vr7, vr1, \sz
   1466    vneg.h        vr21,     vr21
   1467    vmulev_vmaddod_lsx vr6, vr10, vr20, vr21, vr15, vr16, \sz
   1468    vssrarni.h.w  vr1,      vr7,       12       // t12
   1469    vssrarni.h.w  vr16,     vr15,      12       // t13
   1470    vneg.h        vr21,     vr21
   1471    vmulev_vmaddod_lsx vr14, vr17, vr20, vr21, vr7, vr6, \sz
   1472    vneg.h        vr20,     vr20
   1473    vmulev_vmaddod_lsx vr14, vr17, vr21, vr20, vr15, vr10, \sz
   1474    vssrarni.h.w  vr6,      vr7,       12       // t15
   1475    vssrarni.h.w  vr10,     vr15,      12       // t14
   1476 
   1477    vssub.h       vr17,     vr5,       vr0      // t2a
   1478    vsadd.h       vr14,     vr5,       vr0      // out[0]
   1479    vssub.h       vr7,      vr3,       vr8      // t3a
   1480    vsadd.h       vr15,     vr3,       vr8      // out[15]
   1481    vsllwil.w.h   vr22,     vr15,      0
   1482    vexth.w.h     vr15,     vr15
   1483    vneg.w        vr22,     vr22
   1484    vneg.w        vr15,     vr15
   1485    vssrarni.h.w  vr15,     vr22,      0        // out[15]
   1486 
   1487    vsadd.h       vr3,      vr19,      vr4      // out[12]
   1488    vssub.h       vr8,      vr19,      vr4      // t7
   1489    vssub.h       vr0,      vr18,      vr13     // t6
   1490    vsadd.h       vr5,      vr18,      vr13     // out[3]
   1491    vsllwil.w.h   vr22,     vr5,       0
   1492    vexth.w.h     vr5,      vr5
   1493    vneg.w        vr22,     vr22
   1494    vneg.w        vr5,      vr5
   1495    vssrarni.h.w  vr5,      vr22,      0        // out[3]
   1496 
   1497    vsadd.h       vr13,     vr9,       vr12     // out[14]
   1498    vssub.h       vr19,     vr9,       vr12     // t11
   1499    vssub.h       vr4,      vr2,       vr11     // t10
   1500    vsadd.h       vr18,     vr2,       vr11     // out[1]
   1501    vsllwil.w.h   vr22,     vr18,      0
   1502    vexth.w.h     vr18,     vr18
   1503    vneg.w        vr22,     vr22
   1504    vneg.w        vr18,     vr18
   1505    vssrarni.h.w  vr18,     vr22,      0        // out[1]
   1506 
   1507    vsadd.h       vr2,      vr1,       vr10     // out[2]
   1508    vssub.h       vr11,     vr1,       vr10     // t14a
   1509    vssub.h       vr12,     vr16,      vr6      // t15a
   1510    vsadd.h       vr9,      vr16,      vr6      // out[13]
   1511    vsllwil.w.h   vr22,     vr9,       0
   1512    vexth.w.h     vr9,      vr9
   1513    vneg.w        vr22,     vr22
   1514    vneg.w        vr9,      vr9
   1515    vssrarni.h.w  vr9,      vr22,      0        // out[13]
   1516 
   1517    vldrepl.h     vr20,     t0,        0        // 2896
   1518    vmulev_vmaddod_lsx vr17, vr7, vr20, vr20, vr6, vr10, \sz
   1519    vneg.h        vr21,     vr20
   1520    vmulev_vmaddod_lsx vr17, vr7, vr20, vr21, vr16, vr1, \sz
   1521    vssrarni.h.w  vr1,      vr16,      12       // out[8]
   1522    vsrari.w      vr6,      vr6,       12
   1523    vsrari.w      vr10,     vr10,      12
   1524    vneg.w        vr6,      vr6
   1525    vneg.w        vr10,     vr10
   1526    vssrarni.h.w  vr10,     vr6,       0        // out[7]
   1527    vmulev_vmaddod_lsx vr0, vr8, vr20, vr21, vr16, vr17, \sz
   1528    vmulev_vmaddod_lsx vr0, vr8, vr20, vr20, vr6, vr7, \sz
   1529    vssrarni.h.w  vr7,      vr6,       12       // out[4]
   1530    vsrari.w      vr16,     vr16,      12
   1531    vsrari.w      vr17,     vr17,      12
   1532    vneg.w        vr16,     vr16
   1533    vneg.w        vr17,     vr17
   1534    vssrarni.h.w  vr17,     vr16,      0        // out[11]
   1535 
   1536    vmulev_vmaddod_lsx vr4, vr19, vr20, vr21, vr16, vr0, \sz
   1537    vmulev_vmaddod_lsx vr4, vr19, vr20, vr20, vr6, vr8, \sz
   1538    vssrarni.h.w  vr8,      vr6,       12       // out[6]
   1539    vsrari.w      vr16,     vr16,      12
   1540    vsrari.w      vr0,      vr0,       12
   1541    vneg.w        vr16,     vr16
   1542    vneg.w        vr0,      vr0
   1543    vssrarni.h.w  vr0,      vr16,      0        // out[9]
   1544 
   1545    vmulev_vmaddod_lsx vr11, vr12, vr20, vr20, vr6, vr4, \sz
   1546    vmulev_vmaddod_lsx vr11, vr12, vr20, vr21, vr16, vr19, \sz
   1547    vssrarni.h.w  vr19,     vr16,      12       // out[10]
   1548    vsrari.w      vr6,      vr6,       12
   1549    vsrari.w      vr4,      vr4,       12
   1550    vneg.w        vr6,      vr6
   1551    vneg.w        vr4,      vr4
   1552    vssrarni.h.w  vr4,      vr6,       0        // out[5]
   1553 
   1554 .ifc \txfm, adst
   1555    vor.v         vr12,     vr3,       vr3
   1556    vor.v         vr3,      vr5,       vr5
   1557    vor.v         vr5,      vr4,       vr4
   1558    vor.v         vr4,      vr7,       vr7
   1559    vor.v         vr7,      vr10,      vr10
   1560    vor.v         vr10,     vr19,      vr19
   1561    vor.v         vr6,      vr8,       vr8
   1562    vor.v         vr8,      vr1,       vr1
   1563    vor.v         vr11,     vr17,      vr17
   1564    vor.v         vr20,     vr13,      vr13
   1565    vor.v         vr13,     vr9,       vr9
   1566    vor.v         vr9,      vr0,       vr0
   1567    vor.v         vr0,      vr14,      vr14
   1568    vor.v         vr14,     vr20,      vr20
   1569    vor.v         vr1,      vr18,      vr18
   1570 .else
   1571    vor.v         vr6,      vr0,       vr0
   1572    vor.v         vr0,      vr15,      vr15
   1573    vor.v         vr15,     vr14,      vr14
   1574    vor.v         vr14,     vr18,      vr18
   1575    vor.v         vr11,     vr7,       vr7
   1576    vor.v         vr7,      vr1,       vr1
   1577    vor.v         vr1,      vr13,      vr13
   1578    vor.v         vr13,     vr2,       vr2
   1579    vor.v         vr2,      vr9,       vr9
   1580    vor.v         vr9,      vr8,       vr8
   1581    vor.v         vr8,      vr10,      vr10
   1582    vor.v         vr10,     vr4,       vr4
   1583    vor.v         vr4,      vr17,      vr17
   1584    vor.v         vr12,     vr5,       vr5
   1585    vor.v         vr5,      vr19,      vr19
   1586 .endif
   1587 .endm // inv_adst16_lsx
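        /*
        * The two vor.v ladders above are register-to-register moves only:
        * they arrange the sixteen results in vr0-vr15 for the adst and
        * flipadst variants respectively, flipadst in effect returning the
        * outputs in reversed order relative to adst.
        */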
   1588 
   1589 functionl inv_adst_8h_x16_lsx
   1590    inv_adst16_lsx adst, 8h
   1591 endfuncl
   1592 
   1593 functionl inv_flipadst_8h_x16_lsx
   1594    inv_adst16_lsx flipadst, 8h
   1595 endfuncl
   1596 
   1597 functionl inv_adst_4h_x16_lsx
   1598    inv_adst16_lsx adst, 4h
   1599 endfuncl
   1600 
   1601 functionl inv_flipadst_4h_x16_lsx
   1602    inv_adst16_lsx flipadst, 4h
   1603 endfuncl
   1604 
   1605 .macro VLD_DST_ADD_W8_x4 in0, in1, in2, in3, in4, in5, in6, in7, in8, \
   1606                         in9, in10, in11, in12, in13, in14, in15
   1607 
   1608    alsl.d        t2,       a1,       a0,    1
   1609    VLD_DST_ADD_W8 \in0, \in1, \in2, \in3
   1610 
   1611    add.d         a0,       a1,       a0
   1612    alsl.d        t2,       a1,       a0,    1
   1613    VLD_DST_ADD_W8 \in4, \in5, \in6, \in7
   1614 
   1615    add.d         a0,       a1,       a0
   1616    alsl.d        t2,       a1,       a0,    1
   1617    VLD_DST_ADD_W8 \in8, \in9, \in10, \in11
   1618 
   1619    add.d         a0,       a1,       a0
   1620    alsl.d        t2,       a1,       a0,    1
   1621    VLD_DST_ADD_W8 \in12, \in13, \in14, \in15
   1622 .endm
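        /*
        * VLD_DST_ADD_W8_x4 adds sixteen rows of an 8-pixel-wide block of
        * residuals to the destination, four rows per VLD_DST_ADD_W8 call,
        * advancing a0 and t2 between calls.
        */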
   1623 
   1624 .macro def_base_8x16 txfm1
   1625 functionl inv_txfm_\txfm1\()add_8x16_lsx
   1626    blt           a3,       t5,       816f
   1627    vld_x8 a2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   1628    vxor.v        vr23,     vr23,     vr23
   1629 .irp i, 16, 48, 80, 112, 144, 176, 208, 240
   1630    vst           vr23,     a2,       \i
   1631 .endr
   1632 
   1633    li.w          t0,       2896
   1634    vreplgr2vr.w  vr23,     t0
   1635 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   1636    rect2_lsx     \i,       vr23,     \i
   1637 .endr
   1638 
   1639 .ifc \txfm1, identity_
   1640    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1641                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
   1642                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1643 .else
   1644    move          t6,       ra
   1645    jirl          ra,       t7,       0
   1646    move          ra,       t6
   1647 
   1648    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1649                vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 1
   1650 
   1651    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
   1652                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
   1653                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   1654 .endif
   1655 
   1656 816:
   1657    bge           a3,       t5,       816816f
   1658 .irp i, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1659    vxor.v        \i,       \i,       \i
   1660 .endr
   1661 
   1662 816816:
   1663    vld_x8 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   1664    vxor.v        vr23,     vr23,     vr23
   1665 .irp i, 0, 32, 64, 96, 128, 160, 192, 224
   1666    vst           vr23,     a2,       \i
   1667 .endr
   1668 
   1669    li.w          t0,       2896
   1670    vreplgr2vr.w  vr23,     t0
   1671 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   1672    rect2_lsx     \i,       vr23,     \i
   1673 .endr
   1674 
   1675 .ifc \txfm1, identity_
   1676 
   1677 .else
   1678    move          t6,       ra
   1679    jirl          ra,       t7,       0
   1680    move          ra,       t6
   1681 
   1682 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   1683    vsrari.h      \i,       \i,       1
   1684 .endr
   1685 .endif
   1686 
   1687    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1688                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1689                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1690 
   1691    move          t6,       ra
   1692    jirl          ra,       t8,       0
   1693    move          ra,       t6
   1694 
   1695    vor.v         vr0,      vr0,      vr0
   1696    vsrari_h_x8 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
   1697                vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4
   1698    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1699                vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4
   1700 
   1701    VLD_DST_ADD_W8_x4 vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
   1702                      vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1703 endfuncl
   1704 .endm
   1705 
   1706 def_base_8x16 identity_
   1707 def_base_8x16
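        /*
        * The two instantiations above define inv_txfm_identity_add_8x16_lsx
        * and inv_txfm_add_8x16_lsx; the fun8x16 wrappers below branch to one
        * of them with the first- and second-pass transform helpers in t7/t8.
        */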
   1708 
   1709 .macro DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11
   1710    vsllwil.hu.bu vr4,      \in0,     0
   1711    vexth.hu.bu   vr0,      \in0
   1712    vsllwil.hu.bu vr5,      \in1,     0
   1713    vexth.hu.bu   vr1,      \in1
   1714    vsllwil.hu.bu vr6,      \in2,     0
   1715    vexth.hu.bu   vr2,      \in2
   1716    vsllwil.hu.bu vr7,      \in3,     0
   1717    vexth.hu.bu   vr3,      \in3
   1718    vadd.h        vr4,      vr4,      \in4
   1719    vadd.h        vr0,      vr0,      \in5
   1720    vadd.h        vr5,      vr5,      \in6
   1721    vadd.h        vr1,      vr1,      \in7
   1722    vadd.h        vr6,      vr6,      \in8
   1723    vadd.h        vr2,      vr2,      \in9
   1724    vadd.h        vr7,      vr7,      \in10
   1725    vadd.h        vr3,      vr3,      \in11
   1726    vssrani.bu.h  vr0,      vr4,      0
   1727    vssrani.bu.h  vr1,      vr5,      0
   1728    vssrani.bu.h  vr2,      vr6,      0
   1729    vssrani.bu.h  vr3,      vr7,      0
   1730    vst           vr0,      a0,       0
   1731    vstx          vr1,      a0,       a1
   1732    vst           vr2,      t2,       0
   1733    vstx          vr3,      t2,       a1
   1734 .endm
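        /*
        * VLD_DST_ADD_W16 loads four 16-pixel destination rows (a0, a0 + a1,
        * t2, t2 + a1), widens them to 16 bit, adds the residuals and stores
        * back with unsigned saturation.
        */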
   1735 
   1736 .macro VLD_DST_ADD_W16 in0, in1, in2, in3, in4, in5, in6, in7
   1737    vld           vr0,      a0,       0
   1738    vldx          vr1,      a0,       a1
   1739    vld           vr2,      t2,       0
   1740    vldx          vr3,      t2,       a1
   1741    DST_ADD_W16 vr0, vr1, vr2, vr3, \in0, \in1, \in2, \in3, \
   1742                \in4, \in5, \in6, \in7
   1743 .endm
   1744 
   1745 .macro def_fn_16x8 txfm1
   1746 functionl inv_txfm_\txfm1\()add_16x8_lsx
   1747    PUSH_REG
   1748 
   1749    vld_x16 a2, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1750            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1751    vxor.v        vr23,     vr23,     vr23
   1752 .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, \
   1753    176, 192, 208, 224, 240
   1754    vst           vr23,     a2,       \i
   1755 .endr
   1756 
   1757    li.w          t0,       2896
   1758    vreplgr2vr.w  vr23,     t0
   1759 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1760    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1761    rect2_lsx     \i,       vr23,     \i
   1762 .endr
   1763 
   1764    move          t6,       ra
   1765    jirl          ra,       t7,       0
   1766    move          ra,       t6
   1767 
   1768 .ifnc \txfm1, identity_
   1769 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1770    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1771    vsrari.h       \i,       \i,       1
   1772 .endr
   1773 .endif
   1774 
   1775    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1776                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1777                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1778 
   1779    move          t6,       ra
   1780    jirl          ra,       t8,       0
   1781    move          ra,       t6
   1782 
   1783    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1784                vr24, vr25, vr26, vr27, vr28, vr29, vr30, vr31, 4
   1785 
   1786    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
   1787                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1788                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1789    move          t6,       ra
   1790    jirl          ra,       t8,       0
   1791    move          ra,       t6
   1792 
   1793    vsrari_h_x8 vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1794                vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, 4
   1795 
   1796    alsl.d        t2,       a1,       a0,    1
   1797    VLD_DST_ADD_W16 vr24, vr8, vr25, vr9, vr26, vr10, vr27, vr11
   1798 
   1799    alsl.d        a0,       a1,       a0,    2
   1800    alsl.d        t2,       a1,       a0,    1
   1801    VLD_DST_ADD_W16 vr28, vr12, vr29, vr13, vr30, vr14, vr31, vr15
   1802 
   1803    POP_REG
   1804 endfuncl
   1805 .endm
   1806 
   1807 def_fn_16x8 identity_
   1808 def_fn_16x8
   1809 
   1810 .macro fun16x8 txfm1, txfm2
   1811 function inv_txfm_add_\txfm1\()_\txfm2\()_16x8_8bpc_lsx
   1812 .ifc \txfm1\()_\txfm2, dct_dct
   1813    bnez          a3,       .NO_HAS_DCONLY_16x8
   1814 
   1815    idct_dc 16, 8, 1
   1816 
   1817    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
   1818                vr20, vr20, vr20, vr20, vr20
   1819 
   1820    alsl.d        a0,       a1,       a0,     2
   1821    alsl.d        t2,       a1,       a0,     1
   1822    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   1823    b             .\txfm1\()_\txfm2\()_16x8_END
   1824 .NO_HAS_DCONLY_16x8:
   1825 .endif
   1826 
   1827    la.local     t7,    inv_\txfm1\()_8h_x16_lsx
   1828 .ifc \txfm1, identity
   1829    la.local     t7,    inv_identity_8h_x16_lsx1
   1830 .endif
   1831 
   1832    la.local     t8,    inv_\txfm2\()_8h_x8_lsx
   1833 
   1834 .ifc \txfm1, identity
   1835    b            inv_txfm_identity_add_16x8_lsx
   1836 .else
   1837    b            inv_txfm_add_16x8_lsx
   1838 .endif
   1839 
   1840 .\txfm1\()_\txfm2\()_16x8_END:
   1841 endfunc
   1842 .endm
   1843 
   1844 fun16x8 dct, dct
   1845 fun16x8 identity, identity
   1846 fun16x8 dct, adst
   1847 fun16x8 dct, flipadst
   1848 fun16x8 dct, identity
   1849 fun16x8 adst, dct
   1850 fun16x8 adst, adst
   1851 fun16x8 adst, flipadst
   1852 fun16x8 flipadst, dct
   1853 fun16x8 flipadst, adst
   1854 fun16x8 flipadst, flipadst
   1855 fun16x8 identity, dct
   1856 fun16x8 adst, identity
   1857 fun16x8 flipadst, identity
   1858 fun16x8 identity, adst
   1859 fun16x8 identity, flipadst
   1860 
   1861 .macro fun8x16 txfm1, txfm2, eob_half
   1862 function inv_txfm_add_\txfm1\()_\txfm2\()_8x16_8bpc_lsx
   1863 .ifc \txfm1\()_\txfm2, dct_dct
   1864    bnez          a3,       .NO_HAS_DCONLY_8x16
   1865 
   1866    idct_dc 8, 16, 1
   1867 
   1868    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
   1869 .rept 3
   1870    add.d         a0,       a1,       a0
   1871    alsl.d        t2,       a1,       a0,     1
   1872    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20
   1873 .endr
   1874 
   1875    b             .\txfm1\()_\txfm2\()_8x16_END
   1876 .NO_HAS_DCONLY_8x16:
   1877 .endif
   1878    li.w         t5,    \eob_half
   1879 .ifnc \txfm1, identity
   1880    la.local     t7,    inv_\txfm1\()_8h_x8_lsx
   1881 .endif
   1882 
   1883    la.local     t8,    inv_\txfm2\()_8h_x16_lsx
   1884 .ifc \txfm1, identity
   1885    b            inv_txfm_identity_add_8x16_lsx
   1886 .else
   1887    b            inv_txfm_add_8x16_lsx
   1888 .endif
   1889 .\txfm1\()_\txfm2\()_8x16_END:
   1890 endfunc
   1891 .endm
   1892 
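        /*
        * eob_half is the eob threshold below which the second half of the
        * coefficients is known to be zero, so the first pass over it can be
        * skipped (the blt against t5 in inv_txfm_add_8x16_lsx).
        */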
   1893 fun8x16 dct, dct, 43
   1894 fun8x16 identity, identity, 43
   1895 fun8x16 dct, adst, 43
   1896 fun8x16 dct, flipadst, 43
   1897 fun8x16 dct, identity, 8
   1898 fun8x16 adst, dct, 43
   1899 fun8x16 adst, adst, 43
   1900 fun8x16 adst, flipadst, 43
   1901 fun8x16 flipadst, dct, 43
   1902 fun8x16 flipadst, adst, 43
   1903 fun8x16 flipadst, flipadst, 43
   1904 fun8x16 identity, dct, 64
   1905 fun8x16 adst, identity, 8
   1906 fun8x16 flipadst, identity, 8
   1907 fun8x16 identity, adst, 64
   1908 fun8x16 identity, flipadst, 64
   1909 
   1910 functionl inv_txfm_add_16x16_lsx
   1911    malloc_space 512
   1912 
   1913    addi.d        t1,       sp,       64
   1914    addi.d        t2,       a2,       0
   1915 .rept 2
   1916    vld_x16 a2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1917            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1918 
   1919    vxor.v        vr23,     vr23,     vr23
   1920 .irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, \
   1921    384, 416, 448, 480
   1922    vst           vr23,     a2,       \i
   1923 .endr
   1924 
   1925    move          t6,       ra
   1926    jirl          ra,       t7,       0
   1927    move          ra,       t6
   1928 
   1929    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1930                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1931                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1932 
   1933    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
   1934                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
   1935                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1936 
   1937 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1938    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1939    vsrari.h       \i,       \i,       2
   1940 .endr
   1941    vst_x8 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   1942    vst_x8 t1, 16, 32, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1943    addi.d         t1,       t1,       256
   1944    addi.d         a2,       a2,       16
   1945    blt            a3,       t5,       1616f
   1946 .endr
   1947 
   1948 1616:
   1949    bge           a3,       t5,       16161616f
   1950    addi.d        t1,       sp,       320
   1951    vxor.v        vr23,     vr23,     vr23
   1952 .irp i, 0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, \
   1953    240
   1954    vst           vr23,     t1,       \i
   1955 .endr
   1956 
   1957 16161616:
   1958    addi.d        t1,       sp,       64
   1959 .rept 2
   1960    vld_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1961            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1962 
   1963    move          t6,       ra
   1964    jirl          ra,       t8,       0
   1965    move          ra,       t6
   1966 
   1967    vst_x16 t1, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   1968            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   1969 
   1970    addi.d        t1,       t1,       16
   1971 .endr
   1972    alsl.d        t2,       a1,       a0,    1
   1973    addi.d        t1,       sp,       64
   1974 .rept 4
   1975    vld_x8 t1, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1976    vsrari_h_x8 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
   1977                vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, 4
   1978    VLD_DST_ADD_W16 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   1979    alsl.d        a0,       a1,       a0,    2
   1980    alsl.d        t2,       a1,       a0,    1
   1981    addi.d        t1,       t1,       128
   1982 .endr
   1983    free_space 512
   1984 endfuncl
   1985 
   1986 .macro fun16x16 txfm1, txfm2, eob_half
   1987 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_lsx
   1988 .ifc \txfm1\()_\txfm2, dct_dct
   1989    bnez          a3,       .NO_HAS_DCONLY_16x16
   1990 
   1991    idct_dc 16, 16, 2
   1992 
   1993    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
   1994                vr20, vr20, vr20, vr20, vr20
   1995 .rept 3
   1996    alsl.d        a0,       a1,       a0,     2
   1997    alsl.d        t2,       a1,       a0,     1
   1998 
   1999    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   2000 .endr
   2001    b             .\txfm1\()_\txfm2\()_16x16_END
   2002 .NO_HAS_DCONLY_16x16:
   2003 .endif
   2004    li.w         t5,    \eob_half
   2005    la.local     t7,    inv_\txfm1\()_8h_x16_lsx
   2006    la.local     t8,    inv_\txfm2\()_8h_x16_lsx
   2007 
   2008    b            inv_txfm_add_16x16_lsx
   2009 .\txfm1\()_\txfm2\()_16x16_END:
   2010 endfunc
   2011 .endm
   2012 
   2013 fun16x16 dct, dct, 36
   2014 fun16x16 adst, adst, 36
   2015 fun16x16 adst, dct, 36
   2016 fun16x16 dct, adst, 36
   2017 fun16x16 flipadst, dct, 36
   2018 fun16x16 dct, flipadst, 36
   2019 fun16x16 adst, flipadst, 36
   2020 fun16x16 flipadst, adst, 36
   2021 
   2022 .macro dct_8x32_core_lsx in1, in2, vld_st0, vld_st1, vld_stride, \
   2023                         vst_st0, vst_st1, vst_st2, vst_st3, vst_stride, \
   2024                         transpose8x8, shift
   2025    la.local      t0,       idct_coeffs
   2026    vldrepl.w     vr20,     t0,       64           // 201
   2027    vldrepl.w     vr21,     t0,       68           // 4091
   2028    vmul_vmadd_w vr0, vr30, vr21, vr20, vr8, vr9
   2029    vmul_vmsub_w vr0, vr30, vr20, vr21, vr11, vr10
   2030    vssrarni.h.w  vr9,      vr8,      12           // t31a
   2031    vssrarni.h.w  vr10,     vr11,     12           // t16a
   2032    vldrepl.w     vr20,     t0,       72           // 3035
   2033    vldrepl.w     vr21,     t0,       76           // 2751
   2034    vmul_vmadd_w vr19, vr7, vr21, vr20, vr8, vr0
   2035    vmul_vmsub_w vr19, vr7, vr20, vr21, vr11, vr30
   2036    vssrarni.h.w  vr0,      vr8,      12           // t30a
   2037    vssrarni.h.w  vr30,     vr11,     12           // t17a
   2038    vldrepl.w     vr20,     t0,       80           // 1751
   2039    vldrepl.w     vr21,     t0,       84           // 3703
   2040    vmul_vmadd_w vr4, vr26, vr21, vr20, vr8, vr7
   2041    vmul_vmsub_w vr4, vr26, vr20, vr21, vr11, vr19
   2042    vssrarni.h.w  vr7,      vr8,      12           // t29a
   2043    vssrarni.h.w  vr19,     vr11,     12           // t18a
   2044    vldrepl.w     vr20,     t0,       88           // 3857
   2045    vldrepl.w     vr21,     t0,       92           // 1380
   2046    vmul_vmadd_w vr27, vr3, vr21, vr20, vr8, vr4
   2047    vmul_vmsub_w vr27, vr3, vr20, vr21, vr11, vr26
   2048    vssrarni.h.w  vr4,      vr8,      12           // t28a
   2049    vssrarni.h.w  vr26,     vr11,     12           // t19a
   2050    vldrepl.w     vr20,     t0,       96           // 995
   2051    vldrepl.w     vr21,     t0,       100          // 3973
   2052    vmul_vmadd_w vr2, vr28, vr21, vr20, vr8, vr3
   2053    vmul_vmsub_w vr2, vr28, vr20, vr21, vr11, vr27
   2054    vssrarni.h.w  vr3,      vr8,      12           // t27a
   2055    vssrarni.h.w  vr27,     vr11,     12           // t20a
   2056    vldrepl.w     vr20,     t0,       104          // 3513
   2057    vldrepl.w     vr21,     t0,       108          // 2106
   2058    vmul_vmadd_w vr25, vr5, vr21, vr20, vr8, vr2
   2059    vmul_vmsub_w vr25, vr5, vr20, vr21, vr11, vr28
   2060    vssrarni.h.w  vr2,      vr8,      12           // t26a
   2061    vssrarni.h.w  vr28,     vr11,     12           // t21a
   2062    vldrepl.w     vr20,     t0,       112          // 2440 -> 1220
   2063    vldrepl.w     vr21,     t0,       116          // 3290 -> 1645
   2064    vmul_vmadd_w vr6, vr24, vr21, vr20, vr8, vr5
   2065    vmul_vmsub_w vr6, vr24, vr20, vr21, vr11, vr25
   2066    vssrarni.h.w  vr5,      vr8,      12           // t25a
   2067    vssrarni.h.w  vr25,     vr11,     12           // t22a
   2068    vldrepl.w     vr20,     t0,       120          // 4052
   2069    vldrepl.w     vr21,     t0,       124          // 601
   2070    vmul_vmadd_w vr29, vr1, vr21, vr20, vr8, vr6
   2071    vmul_vmsub_w vr29, vr1, vr20, vr21, vr11, vr24
   2072    vssrarni.h.w  vr6,      vr8,      12           // t24a
   2073    vssrarni.h.w  vr24,     vr11,     12           // t23a
   2074 
   2075    vsadd.h       vr1,      vr10,     vr30         // t16
   2076    vssub.h       vr29,     vr10,     vr30         // t17
   2077    vssub.h       vr8,      vr26,     vr19         // t18
   2078    vsadd.h       vr31,     vr26,     vr19         // t19
   2079    vsadd.h       vr10,     vr27,     vr28         // t20
   2080    vssub.h       vr30,     vr27,     vr28         // t21
   2081    vssub.h       vr19,     vr24,     vr25         // t22
   2082    vsadd.h       vr26,     vr24,     vr25         // t23
   2083    vsadd.h       vr27,     vr6,      vr5          // t24
   2084    vssub.h       vr28,     vr6,      vr5          // t25
   2085    vssub.h       vr24,     vr3,      vr2          // t26
   2086    vsadd.h       vr25,     vr3,      vr2          // t27
   2087    vsadd.h       vr5,      vr4,      vr7          // t28
   2088    vssub.h       vr6,      vr4,      vr7          // t29
   2089    vssub.h       vr2,      vr9,      vr0          // t30
   2090    vsadd.h       vr3,      vr9,      vr0          // t31
   2091 
   2092    vldrepl.w     vr20,     t0,       16           // 799
   2093    vldrepl.w     vr21,     t0,       20           // 4017
   2094    vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
   2095    vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
   2096    vssrarni.h.w  vr7,      vr4,      12           // t30a
   2097    vssrarni.h.w  vr0,      vr11,     12           // t17a
   2098    vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
   2099    vneg.w        vr4,      vr4
   2100    vneg.w        vr9,      vr9
   2101    vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
   2102    vssrarni.h.w  vr9,      vr4,      12           // t18a
   2103    vssrarni.h.w  vr2,      vr11,     12           // t29a
   2104    vldrepl.w     vr20,     t0,       24           // 3406 -> 1703
   2105    vldrepl.w     vr21,     t0,       28           // 2276 -> 1138
   2106    vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
   2107    vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
   2108    vssrarni.h.w  vr29,     vr4,      12           // t26a
   2109    vssrarni.h.w  vr6,      vr11,     12           // t21a
   2110    vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
   2111    vneg.w        vr4,      vr4
   2112    vneg.w        vr8,      vr8
   2113    vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
   2114    vssrarni.h.w  vr8,      vr4,      12           // t22a
   2115    vssrarni.h.w  vr24,     vr11,     12           // t25a
   2116 
   2117    vsadd.h       vr4,      vr1,      vr31         // t16a
   2118    vssub.h       vr30,     vr1,      vr31         // t19a
   2119    vsadd.h       vr19,     vr0,      vr9          // t17
   2120    vssub.h       vr28,     vr0,      vr9          // t18
   2121    vssub.h       vr1,      vr26,     vr10         // t20a
   2122    vsadd.h       vr31,     vr26,     vr10         // t23a
   2123    vssub.h       vr0,      vr8,      vr6          // t21
   2124    vsadd.h       vr9,      vr8,      vr6          // t22
   2125    vsadd.h       vr10,     vr27,     vr25         // t24a
   2126    vssub.h       vr26,     vr27,     vr25         // t27a
   2127    vsadd.h       vr6,      vr24,     vr29         // t25
   2128    vssub.h       vr8,      vr24,     vr29         // t26
   2129    vssub.h       vr25,     vr3,      vr5          // t28a
   2130    vsadd.h       vr27,     vr3,      vr5          // t31a
   2131    vssub.h       vr24,     vr7,      vr2          // t29
   2132    vsadd.h       vr29,     vr7,      vr2          // t30
   2133 
   2134    vldrepl.w     vr20,     t0,       8            // 1567
   2135    vldrepl.w     vr21,     t0,       12           // 3784
   2136    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
   2137    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
   2138    vssrarni.h.w  vr5,      vr3,      12           // t29a
   2139    vssrarni.h.w  vr2,      vr11,     12           // t18a
   2140    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
   2141    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
   2142    vssrarni.h.w  vr7,      vr3,      12           // t28
   2143    vssrarni.h.w  vr24,     vr11,     12           // t19
   2144    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
   2145    vneg.w        vr3,      vr3
   2146    vneg.w        vr28,     vr28
   2147    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
   2148    vssrarni.h.w  vr28,     vr3,      12           // t20
   2149    vssrarni.h.w  vr25,     vr11,     12           // t27
   2150    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
   2151    vneg.w        vr3,      vr3
   2152    vneg.w        vr30,     vr30
   2153    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
   2154    vssrarni.h.w  vr30,     vr3,      12           // t21a
   2155    vssrarni.h.w  vr1,      vr11,     12           // t26a
   2156 
   2157    vsadd.h       vr3,      vr4,      vr31         // t16
   2158    vssub.h       vr26,     vr4,      vr31         // t23
   2159    vsadd.h       vr0,      vr19,     vr9          // t17a
   2160    vssub.h       vr8,      vr19,     vr9          // t22a
   2161    vsadd.h       vr4,      vr2,      vr30         // t18
   2162    vssub.h       vr31,     vr2,      vr30         // t21
   2163    vsadd.h       vr9,      vr24,     vr28         // t19a
   2164    vssub.h       vr19,     vr24,     vr28         // t20a
   2165    vssub.h       vr2,      vr27,     vr10         // t24
   2166    vsadd.h       vr30,     vr27,     vr10         // t31
   2167    vssub.h       vr24,     vr29,     vr6          // t25a
   2168    vsadd.h       vr28,     vr29,     vr6          // t30a
   2169    vssub.h       vr10,     vr5,      vr1          // t26
   2170    vsadd.h       vr27,     vr5,      vr1          // t29
   2171    vssub.h       vr6,      vr7,      vr25         // t27a
   2172    vsadd.h       vr29,     vr7,      vr25         // t28a
   2173 
   2174    vldrepl.w     vr20,     t0,       0            // 2896
   2175    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
   2176    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
   2177    vssrarni.h.w  vr5,      vr1,      12           // t20
   2178    vssrarni.h.w  vr7,      vr11,     12           // t27
   2179    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
   2180    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
   2181    vssrarni.h.w  vr25,     vr1,      12           // t21a
   2182    vssrarni.h.w  vr6,      vr11,     12           // t26a
   2183    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
   2184    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
   2185    vssrarni.h.w  vr19,     vr1,      12           // t22
   2186    vssrarni.h.w  vr10,     vr11,     12           // t25
   2187    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
   2188    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
   2189    vssrarni.h.w  vr31,     vr1,      12           // t23a
   2190    vssrarni.h.w  vr8,      vr11,     12           // t24a
   2191 
   2192    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
   2193    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
   2194    vld_x8 \in2, \vld_st0, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
   2195 
   2196    vsadd.h       vr1,      vr11,     vr30         // c[0]
   2197    vssub.h       vr2,      vr11,     vr30         // c[31]
   2198    vsadd.h       vr24,     vr12,     vr28         // c[1]
   2199    vssub.h       vr26,     vr12,     vr28         // c[30]
   2200    vsadd.h       vr11,     vr13,     vr27         // c[2]
   2201    vssub.h       vr30,     vr13,     vr27         // c[29]
   2202    vsadd.h       vr12,     vr14,     vr29         // c[3]
   2203    vssub.h       vr28,     vr14,     vr29         // c[28]
   2204    vsadd.h       vr13,     vr15,     vr7          // c[4]
   2205    vssub.h       vr27,     vr15,     vr7          // c[27]
   2206    vsadd.h       vr14,     vr16,     vr6          // c[5]
   2207    vssub.h       vr29,     vr16,     vr6          // c[26]
   2208    vsadd.h       vr7,      vr17,     vr10         // c[6]
   2209    vssub.h       vr15,     vr17,     vr10         // c[25]
   2210    vsadd.h       vr6,      vr18,     vr8          // c[7]
   2211    vssub.h       vr16,     vr18,     vr8          // c[24]
   2212 
   2213 .ifnb \transpose8x8
   2214    LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
   2215                       vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
   2216                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
   2217 .endif
   2218 
   2219 .ifnb \shift
   2220 .irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
   2221    vsrari.h      \i,       \i,       \shift
   2222 .endr
   2223 .endif
   2224 
   2225    vst_x8 \in1, \vst_st0, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
   2226 
   2227 .ifnb \transpose8x8
   2228    LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
   2229                       vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
   2230                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
   2231 .endif
   2232 
   2233 .ifnb \shift
   2234 .irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
   2235    vsrari.h      \i,       \i,       \shift
   2236 .endr
   2237 .endif
   2238 
   2239    vst_x8 \in1, \vst_st1, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
   2240 
   2241    vld_x8 \in2, \vld_st1, \vld_stride, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
   2242 
   2243    vsadd.h       vr1,      vr11,     vr31         // c[8]
   2244    vssub.h       vr2,      vr11,     vr31         // c[23]
   2245    vsadd.h       vr24,     vr12,     vr19         // c[9]
   2246    vssub.h       vr26,     vr12,     vr19         // c[22]
   2247    vsadd.h       vr11,     vr13,     vr25         // c[10]
   2248    vssub.h       vr30,     vr13,     vr25         // c[21]
   2249    vsadd.h       vr12,     vr14,     vr5          // c[11]
   2250    vssub.h       vr28,     vr14,     vr5          // c[20]
   2251    vsadd.h       vr13,     vr15,     vr9          // c[12]
   2252    vssub.h       vr27,     vr15,     vr9          // c[19]
   2253    vsadd.h       vr14,     vr16,     vr4          // c[13]
   2254    vssub.h       vr29,     vr16,     vr4          // c[18]
   2255    vsadd.h       vr7,      vr17,     vr0          // c[14]
   2256    vssub.h       vr15,     vr17,     vr0          // c[17]
   2257    vsadd.h       vr6,      vr18,     vr3          // c[15]
   2258    vssub.h       vr16,     vr18,     vr3          // c[16]
   2259 
   2260 .ifnb \transpose8x8
   2261    LSX_TRANSPOSE8x8_H vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
   2262                       vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6, \
   2263                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
   2264 .endif
   2265 
   2266 .ifnb \shift
   2267 .irp i, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
   2268    vsrari.h      \i,       \i,       \shift
   2269 .endr
   2270 .endif
   2271 
   2272    vst_x8 \in1, \vst_st2, \vst_stride, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
   2273 
   2274 .ifnb \transpose8x8
   2275    LSX_TRANSPOSE8x8_H vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
   2276                       vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2, \
   2277                       vr8, vr10, vr17, vr18, vr20, vr21, vr22, vr23
   2278 .endif
   2279 
   2280 .ifnb \shift
   2281 .irp i, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
   2282    vsrari.h      \i,       \i,       \shift
   2283 .endr
   2284 .endif
   2285 
   2286    vst_x8 \in1, \vst_st3, \vst_stride, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
   2287 .endm
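        /*
        * Odd half of the 32-point inverse DCT: dct_8x32_core_lsx computes
        * t16a..t31a from the sixteen odd-frequency input rows, combines them
        * with the even-half (dct16) results previously stored at \in2, and
        * writes the 32 output rows in four groups of eight, optionally
        * transposed and shifted.
        */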
   2288 
   2289 const eob_32x32
   2290        .short 36, 136, 300, 1024
   2291 endconst
   2292 
   2293 const eob_8x32
   2294        .short 43, 107, 171, 256
   2295 endconst
   2296 
   2297 const eob_16x32
   2298        .short 36, 151, 279, 512
   2299 endconst
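        /*
        * Per-size eob thresholds: the strip loops below keep transforming
        * successive groups of coefficients while the eob (a3) is at least
        * the current entry, and zero-fill whatever was skipped.
        */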
   2300 
   2301 .macro DST_ADD_W32 in0, in1, in2, in3, in4, in5, in6, in7
   2302    vsllwil.hu.bu vr4,      vr10,     0
   2303    vsllwil.hu.bu vr5,      vr11,     0
   2304    vsllwil.hu.bu vr6,      vr12,     0
   2305    vsllwil.hu.bu vr7,      vr13,     0
   2306    vexth.hu.bu   vr10,     vr10
   2307    vexth.hu.bu   vr11,     vr11
   2308    vexth.hu.bu   vr12,     vr12
   2309    vexth.hu.bu   vr13,     vr13
   2310    vadd.h        vr4,      vr4,      \in0
   2311    vadd.h        vr10,     vr10,     \in1
   2312    vadd.h        vr5,      vr5,      \in2
   2313    vadd.h        vr11,     vr11,     \in3
   2314    vadd.h        vr6,      vr6,      \in4
   2315    vadd.h        vr12,     vr12,     \in5
   2316    vadd.h        vr7,      vr7,      \in6
   2317    vadd.h        vr13,     vr13,     \in7
   2318    vssrani.bu.h  vr10,     vr4,      0
   2319    vssrani.bu.h  vr11,     vr5,      0
   2320    vssrani.bu.h  vr12,     vr6,      0
   2321    vssrani.bu.h  vr13,     vr7,      0
   2322    vst           vr10,     a0,       0
   2323    vst           vr11,     a0,       16
   2324    vst           vr12,     t2,       0
   2325    vst           vr13,     t2,       16
   2326 .endm
   2327 
   2328 .macro idct_dc_w32 w, h, shift
   2329    ld.h          t2,       a2,       0      // dc
   2330    vldi          vr0,      0x8b5            // 181
   2331    vreplgr2vr.w  vr1,      t2
   2332    vldi          vr20,     0x880            // 128
   2333    vmul.w        vr2,      vr0,      vr1    // dc * 181
   2334    st.h          zero,     a2,       0
   2335    add.d         t2,       a0,       a1
   2336    vsrari.w      vr2,      vr2,      8      // (dc * 181 + 128) >> 8
   2337    vld           vr13,     t2,       16
   2338 
   2339 .if (2*\w == \h) || (2*\h == \w)
   2340    vmul.w        vr2,      vr2,      vr0
   2341    vsrari.w      vr2,      vr2,      8
   2342 .endif
   2343 
   2344 .if \shift>0
   2345    vsrari.w      vr2,      vr2,      \shift      // (dc + rnd) >> shift
   2346 .endif
   2347    vld           vr11,     a0,       16
   2348    vmadd.w       vr20,     vr2,      vr0
   2349    vld           vr12,     t2,       0
   2350    vssrarni.h.w  vr20,     vr20,     12
   2351    vld           vr10,     a0,       0
   2352 .endm
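        /*
        * DC-only path for 32-pixel-wide blocks: the DC coefficient is scaled
        * by 181 with a rounding shift of 8 (applied twice for 2:1
        * rectangular sizes), rounded by \shift, then multiplied by 181 once
        * more on top of a 128 bias and narrowed with a rounding shift of 12,
        * yielding the single residual that DST_ADD_W32 adds to every pixel.
        * The first four destination vectors are preloaded into vr10-vr13.
        */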
   2353 
   2354 function inv_txfm_add_dct_dct_32x8_8bpc_lsx
   2355    bnez          a3,       .NO_HAS_DCONLY_32x8
   2356 
   2357    idct_dc_w32 32, 8, 2
   2358 
   2359    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   2360 
   2361 .rept 3
   2362    alsl.d        a0,       a1,       a0,     1
   2363    add.d         t2,       a0,       a1
   2364    vld           vr10,     a0,       0
   2365    vld           vr11,     a0,       16
   2366    vld           vr12,     t2,       0
   2367    vld           vr13,     t2,       16
   2368    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   2369 .endr
   2370    b             .DCT_DCT_32X8_END
   2371 .NO_HAS_DCONLY_32x8:
   2372    malloc_space 512+256
   2373 
   2374    addi.d        t1,       sp,       64
   2375    addi.d        t2,       a2,       0
   2376    addi.d        t3,       sp,       64
   2377    addi.d        t3,       t3,       512
   2378 
   2379    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2380            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2381 
   2382    vxor.v        vr31,     vr31,     vr31
   2383    vst_x16 t2, 0, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2384            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2385 
   2386    inv_dct16_lsx .8h
   2387 
   2388    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2389            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2390 
   2391    vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2392            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
   2393 
   2394    vxor.v        vr31,     vr31,     vr31
   2395 
   2396    vst_x16 t2, 16, 32, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2397            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2398 
   2399    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2
   2400 
   2401    addi.d        t2,       sp,       64
   2402 .rept 4
   2403    vld_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   2404 
   2405    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
   2406 
   2407 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   2408    vsrari.h      \i,       \i,       4
   2409 .endr
   2410 
   2411    vst_x8 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   2412 
   2413    addi.d        t2,       t2,       16
   2414 .endr
   2415 
   2416    addi.d        t0,       sp,       64
   2417 .rept 4
   2418    add.d         t2,       a0,       a1
   2419    vld           vr10,     a0,       0
   2420    vld           vr11,     a0,       16
   2421    vld           vr12,     t2,       0
   2422    vld           vr13,     t2,       16
   2423    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   2424    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   2425    alsl.d        a0,       a1,       a0,     1
   2426    addi.d        t0,       t0,       128
   2427 .endr
   2428    free_space 512+256
   2429 .DCT_DCT_32X8_END:
   2430 endfunc
   2431 
   2432 function inv_txfm_add_dct_dct_32x16_8bpc_lsx
   2433    bnez          a3,       .NO_HAS_DCONLY_32x16
   2434 
   2435    idct_dc_w32 32, 16, 1
   2436 
   2437    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   2438 
   2439 .rept 7
   2440    alsl.d        a0,       a1,       a0,     1
   2441    add.d         t2,       a0,       a1
   2442    vld           vr10,     a0,       0
   2443    vld           vr11,     a0,       16
   2444    vld           vr12,     t2,       0
   2445    vld           vr13,     t2,       16
   2446    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   2447 .endr
   2448    b             .DCT_DCT_32X16_END
   2449 .NO_HAS_DCONLY_32x16:
   2450    malloc_space 1024+256                            // 32*32*2+512
   2451    addi.d        t1,       sp,       64
   2452    addi.d        t2,       a2,       0
   2453    addi.d        t3,       sp,       64
   2454    addi.d        t3,       t3,       1024
   2455 .rept 2
   2456    vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2457            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2458 
   2459    vxor.v        vr31,     vr31,     vr31
   2460    vst_x16 t2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2461            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2462 
   2463    li.w          t0,       2896
   2464    vreplgr2vr.w  vr23,     t0
   2465 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2466     vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2467    rect2_lsx   \i, vr23, \i
   2468 .endr
   2469 
   2470    inv_dct16_lsx .8h
   2471 
   2472    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2473            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2474 
   2475    vld_x16 t2, 32, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2476            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
   2477 
   2478    la.local      t0,       idct_coeffs
   2479    vldrepl.w     vr23,     t0,       0        // 2896
   2480 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2481    vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
   2482    rect2_lsx \i, vr23, \i
   2483 .endr
   2484    vxor.v        vr31,     vr31,     vr31
   2485    vst_x16 t2, 32, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2486            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2487 
   2488    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 1
   2489 
   2490    addi.d        t2,       t2,       16
   2491    addi.d        t1,       t1,       512
   2492 .endr
   2493 
   2494    addi.d        t2,       sp,       64
   2495 .rept 4
   2496    vld_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2497            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2498 
   2499    inv_dct16_lsx .8h
   2500 
   2501 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2502    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2503    vsrari.h      \i,       \i,       4
   2504 .endr
   2505 
   2506    vst_x16 t2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2507            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2508 
   2509    addi.d        t2,       t2,       16
   2510 .endr
   2511 
   2512    addi.d        t0,       sp,       64
   2513 .rept 8
   2514    add.d         t2,       a0,       a1
   2515    vld           vr10,     a0,       0
   2516    vld           vr11,     a0,       16
   2517    vld           vr12,     t2,       0
   2518    vld           vr13,     t2,       16
   2519    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   2520    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   2521 
   2522    alsl.d        a0,       a1,       a0,     1
   2523    addi.d        t0,       t0,       128
   2524 .endr
   2525    free_space 1024+256
   2526 .DCT_DCT_32X16_END:
   2527 endfunc
   2528 
   2529 function inv_txfm_add_dct_dct_32x32_8bpc_lsx
   2530    bnez          a3,       .NO_HAS_DCONLY_32x32
   2531 
   2532    idct_dc_w32 32, 32, 2
   2533 
   2534    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   2535 .rept 15
   2536    alsl.d        a0,       a1,       a0,     1
   2537    add.d         t2,       a0,       a1
   2538    vld           vr10,     a0,       0
   2539    vld           vr11,     a0,       16
   2540    vld           vr12,     t2,       0
   2541    vld           vr13,     t2,       16
   2542    DST_ADD_W32 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   2543 .endr
   2544    b             .DCT_DCT_32X32_END
   2545 .NO_HAS_DCONLY_32x32:
   2546    malloc_space 2560                              // 32*32*2+512
   2547 
   2548    addi.d        t1,       sp,       64
   2549    addi.d        t2,       a2,       0
   2550    addi.d        t3,       sp,       1024
   2551    addi.d        t3,       t3,       1024
   2552    addi.d        t3,       t3,       64
   2553 
   2554    la.local      t8,       eob_32x32
   2555 .DCT_DCT_EOB_32x32:
   2556    ld.h          t7,       t8,       0
   2557    addi.d        t8,       t8,       2
   2558 
   2559    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2560            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2561 
   2562    vxor.v        vr31,     vr31,     vr31
   2563    vst_x16 t2, 0, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2564            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2565 
   2566    inv_dct16_lsx .8h
   2567 
   2568    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2569            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2570 
   2571    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2572            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
   2573 
   2574    vxor.v        vr31,     vr31,     vr31
   2575 
   2576    vst_x16 t2, 64, 128, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2577            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2578 
   2579    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 48, 16, 32, 64, transpose8x8, 2
   2580 
   2581    addi.d        t2,       t2,       16
   2582    addi.d        t1,       t1,       512
   2583    bge           a3,       t7,       .DCT_DCT_EOB_32x32
   2584 
   2585    la.local      t8,       eob_32x32
   2586    vxor.v        vr31,     vr31,     vr31
   2587    ld.h          t7,       t8,       4
   2588    bge           a3,       t7,       .DCT_DCT_EOB_32x32_END   // a3>=t7
   2589    vst_x16 sp, 64+1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2590        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2591    addi.d        t1,       sp,       256+64
   2592    vst_x16 t1, 1536, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2593        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2594 
   2595    ld.h          t7,       t8,       2
   2596    bge           a3,       t7,       .DCT_DCT_EOB_32x32_END
   2597    vst_x16 sp, 64+1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2598        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2599    vst_x16 t1, 1024, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2600        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2601 
   2602    ld.h          t7,       t8,       0
   2603    bge           a3,       t7,       .DCT_DCT_EOB_32x32_END
   2604    vst_x16 sp, 64+512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2605        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2606 
   2607    vst_x16 t1, 512, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   2608        vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   2609 
   2610 .DCT_DCT_EOB_32x32_END:
   2611    addi.d        t2,       sp,       64
   2612    addi.d        t1,       sp,       64
   2613 .rept 4
   2614    vld_x16 t2, 0, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2615            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2616 
   2617    inv_dct16_lsx .8h
   2618 
   2619    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2620            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   2621 
   2622    vld_x16 t2, 64, 128, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   2623            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
   2624 
   2625    dct_8x32_core_lsx t1, t3, 0, 128, 16, 0, 1536, 512, 1024, 64, , 4
   2626 
   2627    addi.d        t2,       t2,       16
   2628    addi.d        t1,       t1,       16
   2629 .endr
   2630 
   2631    addi.d        t0,       sp,       64
   2632 .rept 16
   2633    add.d         t2,       a0,       a1
   2634    vld           vr10,     a0,       0
   2635    vld           vr11,     a0,       16
   2636    vld           vr12,     t2,       0
   2637    vld           vr13,     t2,       16
   2638    vld_x8 t0, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   2639    DST_ADD_W32 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   2640    alsl.d        a0,       a1,       a0,     1
   2641    addi.d        t0,       t0,       128
   2642 .endr
   2643 
   2644    free_space 2560                                // 32*32*2+512
   2645 .DCT_DCT_32X32_END:
   2646 endfunc
   2647 
   2648 /*
   2649 * temp: vr8, vr9, vr10, vr12, vr20, vr21, vr22, vr23
   2650 */
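/*
* 8-point inverse DCT specialised for tx64: transform inputs 4..7 are known
* to be zero, so only \in0..\in3 are read and every initial rotation
* degenerates to scaling a single input; \in4..\in7 are used as scratch and
* hold intermediates on exit.
*/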
   2651 .macro dct_8x8_tx64_core_lsx in0, in1, in2, in3, in4, in5, in6, in7, out0, \
   2652                             out1, out2, out3, out4, out5, out6, out7, rect2
   2653 
   2654    la.local      t0,       idct_coeffs
   2655 
   2656 .ifc \rect2, rect2_lsx
   2657    vldrepl.w     vr23,      t0,       0        // 2896
   2658 .irp i, \in0, \in1, \in2, \in3, \in4, \in5, \in6, \in7
   2659    rect2_lsx \i, vr23, \i
   2660 .endr
   2661 .endif
   2662 
   2663    la.local      t0,       idct_coeffs
   2664 
   2665    vldrepl.w     vr20,     t0,       8            // 1567
   2666    vldrepl.w     vr21,     t0,       12           // 3784
   2667    vsllwil.w.h   vr22,     \in2,     0
   2668    vexth.w.h     vr23,     \in2
   2669    vmul.w        vr8,      vr22,     vr20
   2670    vmul.w        vr10,     vr23,     vr20
   2671    vmul.w        \in2,     vr22,     vr21
   2672    vmul.w        vr9,      vr23,     vr21
   2673    vssrarni.h.w  vr10,     vr8,      12           // t2
   2674    vssrarni.h.w  vr9,      \in2,     12           // t3
   2675 
   2676    vldrepl.w     vr20,     t0,       0            // 2896
   2677    vsllwil.w.h   vr22,     \in0,     0
   2678    vexth.w.h     vr23,     \in0
   2679    vmul.w        vr8,      vr22,     vr20
   2680    vmul.w        \in2,     vr23,     vr20
   2681    vssrarni.h.w  \in2,     vr8,      12
   2682 
   2683    vsadd.h       vr8,      \in2,     vr9          // c[0]
   2684    vssub.h       vr9,      \in2,     vr9          // c[3]
   2685    vsadd.h       \in0,     \in2,     vr10         // c[1]
   2686    vssub.h       vr10,     \in2,     vr10         // c[2]
   2687 
   2688    // inv_dct8_1d_internal_c tx64
   2689    // in1 in3
   2690    vldrepl.w     vr20,     t0,       16           // 799
   2691    vldrepl.w     vr21,     t0,       20           // 4017
   2692 
   2693    vsllwil.w.h   vr22,     \in1,     0
   2694    vexth.w.h     vr23,     \in1
   2695    vmul.w        \in2,     vr22,     vr21
   2696    vmul.w        \in4,     vr23,     vr21
   2697    vmul.w        \in1,     vr22,     vr20
   2698    vmul.w        \in6,     vr23,     vr20
   2699    vssrarni.h.w  \in4,     \in2,     12           // t7a
   2700    vssrarni.h.w  \in6,     \in1,     12           // t4a
   2701 
   2702    vldrepl.w     vr20,     t0,       24           // 3406
   2703    vldrepl.w     vr21,     t0,       28           // 2276
   2704 
   2705    vsllwil.w.h   vr22,     \in3,     0
   2706    vexth.w.h     vr23,     \in3
   2707    vneg.w        vr21,     vr21
   2708    vmul.w        \in2,     vr22,     vr20
   2709    vmul.w        \in1,     vr23,     vr20
   2710    vmul.w        \in3,     vr22,     vr21
   2711    vmul.w        \in7,     vr23,     vr21
   2712    vssrarni.h.w  \in1,     \in2,     12           // t6a
   2713    vssrarni.h.w  \in7,     \in3,     12           // t5a
   2714 
   2715    vsadd.h       \in3,     \in6,     \in7         // t4
   2716    vssub.h       \in6,     \in6,     \in7         // t5a
   2717    vsadd.h       \in5,     \in4,     \in1         // t7
   2718    vssub.h       \in4,     \in4,     \in1         // t6a
   2719 
   2720    vldrepl.w     vr20,     t0,       0            // 2896
   2721    vmul_vmadd_w  \in4, \in6, vr20, vr20, vr21, \in1
   2722    vmul_vmsub_w  \in4, \in6, vr20, vr20, \in2, \in7
   2723    vssrarni.h.w  \in1,     vr21,     12           // t6
   2724    vssrarni.h.w  \in7,     \in2,     12           // t5
   2725 
   2726    vsadd.h       \out0,    vr8,      \in5         // c[0]
   2727    vssub.h       \out7,    vr8,      \in5         // c[7]
   2728    vsadd.h       \out1,    \in0,     \in1         // c[1]
   2729    vssub.h       \out6,    \in0,     \in1         // c[6]
   2730    vsadd.h       \out2,    vr10,     \in7         // c[2]
   2731    vssub.h       \out5,    vr10,     \in7         // c[5]
   2732    vsadd.h       \out3,    vr9,      \in3         // c[3]
   2733    vssub.h       \out4,    vr9,      \in3         // c[4]
   2734 .endm
   2735 
   2736 /*
   2737 * input:  in0,  in1,  in2,  in3,  in4,  in5,  in6,  in7       (fixed)
   2738 *         vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7
   2739 *         in8,  in9,  in10, in11, in12, in13, in14, in15
   2740 *         vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
   2741 * output: out0, out1, out2, out3, out4, out5, out6, out7      (fixed)
   2742 *         vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16
   2743 *         out8, out9, out10, out11, out12, out13, out14, out15
   2744 *         vr27, vr30, vr23,  vr21,  vr29,  vr26,  vr25,  vr24
   2745 */
   2746 .macro dct_8x16_tx64_core_lsx rect2
   2747    dct_8x8_tx64_core_lsx vr0, vr2, vr4, vr6, vr19, vr25, vr27, vr29, vr11, \
   2748                          vr12, vr13, vr14, vr15, vr16, vr17, vr18, \rect2
   2749 
   2750    // in1 in3 in5 in7 in9  in11 in13 in15
   2751    // vr1 vr3 vr5 vr7 vr24 vr26 vr28 vr30
   2752    la.local      t0,       idct_coeffs
   2753 
   2754 .ifc \rect2, rect2_lsx
   2755    vldrepl.w     vr23,      t0,       0        // 2896
   2756 .irp i, vr1, vr3, vr5, vr7, vr24, vr26, vr28, vr30
   2757    rect2_lsx \i, vr23, \i
   2758 .endr
   2759 .endif
   2760 
   2761    vldrepl.w     vr20,     t0,       32           // 401
   2762    vldrepl.w     vr21,     t0,       36           // 4076
   2763    vsllwil.w.h   vr22,     vr1,      0
   2764    vexth.w.h     vr23,     vr1
   2765    vmul.w        vr0,      vr22,     vr21
   2766    vmul.w        vr10,     vr23,     vr21
   2767    vmul.w        vr1,      vr22,     vr20
   2768    vmul.w        vr29,     vr23,     vr20
   2769    vssrarni.h.w  vr10,     vr0,      12           // t15a
   2770    vssrarni.h.w  vr29,     vr1,      12           // t8a
   2771 
   2772    vldrepl.w     vr20,     t0,       40           // 3166 -> 1583
   2773    vldrepl.w     vr21,     t0,       44           // 2598 -> 1299
   2774    vsllwil.w.h   vr22,     vr7,      0
   2775    vexth.w.h     vr23,     vr7
   2776    vneg.w        vr21,     vr21
   2777    vmul.w        vr0,      vr22,     vr20
   2778    vmul.w        vr30,     vr23,     vr20
   2779    vmul.w        vr7,      vr22,     vr21
   2780    vmul.w        vr31,     vr23,     vr21
   2781    vssrarni.h.w  vr30,     vr0,      12           // t14a
   2782    vssrarni.h.w  vr31,     vr7,      12           // t9a
   2783 
   2784    vldrepl.w     vr20,     t0,       48           // 1931
   2785    vldrepl.w     vr21,     t0,       52           // 3612
   2786    vsllwil.w.h   vr22,     vr5,      0
   2787    vexth.w.h     vr23,     vr5
   2788    vmul.w        vr0,      vr22,     vr21
   2789    vmul.w        vr24,     vr23,     vr21
   2790    vmul.w        vr5,      vr22,     vr20
   2791    vmul.w        vr25,     vr23,     vr20
   2792    vssrarni.h.w  vr24,     vr0,      12           // t13a
   2793    vssrarni.h.w  vr25,     vr5,      12           // t10a
   2794 
   2795    vldrepl.w     vr20,     t0,       56           // 3920
   2796    vldrepl.w     vr21,     t0,       60           // 1189
   2797    vsllwil.w.h   vr22,     vr3,      0
   2798    vexth.w.h     vr23,     vr3
   2799    vneg.w        vr21,     vr21
   2800    vmul.w        vr0,      vr22,     vr20
   2801    vmul.w        vr26,     vr23,     vr20
   2802    vmul.w        vr3,      vr22,     vr21
   2803    vmul.w        vr27,     vr23,     vr21
   2804    vssrarni.h.w  vr26,     vr0,      12           // t12a
   2805    vssrarni.h.w  vr27,     vr3,      12           // t11a
   2806 
   2807    // vr22 vr23 vr30 vr31 vr24 vr25 vr26 vr27
   2808    vsadd.h       vr28,     vr29,      vr31        // t8
   2809    vssub.h       vr19,     vr29,      vr31        // t9
   2810    vssub.h       vr29,     vr27,      vr25        // t10
   2811    vsadd.h       vr9,      vr27,      vr25        // t11
   2812    vsadd.h       vr31,     vr26,      vr24        // t12
   2813    vssub.h       vr25,     vr26,      vr24        // t13
   2814    vssub.h       vr27,     vr10,      vr30        // t14
   2815    vsadd.h       vr24,     vr10,      vr30        // t15
   2816 
   2817    vldrepl.w     vr20,     t0,       8            // 1567
   2818    vldrepl.w     vr21,     t0,       12           // 3784
   2819    vmul_vmadd_w vr27, vr19, vr21, vr20, vr0, vr26
   2820    vmul_vmsub_w vr27, vr19, vr20, vr21, vr1, vr30
   2821    vssrarni.h.w  vr26,     vr0,       12          // t14a
   2822    vssrarni.h.w  vr30,     vr1,       12          // t9a
   2823 
   2824    vmul_vmadd_w vr25, vr29, vr21, vr20, vr0, vr19
   2825    vneg.w        vr0,      vr0
   2826    vneg.w        vr19,     vr19
   2827    vmul_vmsub_w vr25, vr29, vr20, vr21, vr1, vr27
   2828    vssrarni.h.w  vr19,     vr0,       12          // t10a
   2829    vssrarni.h.w  vr27,     vr1,       12          // t13a
   2830 
   2831    vsadd.h       vr25,     vr28,     vr9          // t8a
   2832    vssub.h       vr29,     vr28,     vr9          // t11a
   2833    vssub.h       vr28,     vr24,     vr31         // t12a
   2834    vsadd.h       vr10,     vr24,     vr31         // t15a
   2835    vsadd.h       vr9,      vr30,     vr19         // t9
   2836    vssub.h       vr31,     vr30,     vr19         // t10
   2837    vssub.h       vr30,     vr26,     vr27         // t13
   2838    vsadd.h       vr24,     vr26,     vr27         // t14
   2839 
   2840    vldrepl.w     vr20,     t0,       0            // 2896
   2841    vmul_vmadd_w vr30, vr31, vr20, vr20, vr0, vr26
   2842    vmul_vmsub_w vr30, vr31, vr20, vr20, vr1, vr27
   2843    vssrarni.h.w  vr26,     vr0,      12           // t13a
   2844    vssrarni.h.w  vr27,     vr1,      12           // t10a
   2845 
   2846    vmul_vmadd_w vr28, vr29, vr20, vr20, vr0, vr31
   2847    vmul_vmsub_w vr28, vr29, vr20, vr20, vr1, vr30
   2848    vssrarni.h.w  vr31,     vr0,      12           // t12
   2849    vssrarni.h.w  vr30,     vr1,      12           // t11
   2850 
   2851    // vr11 vr12 ... vr18
   2852    vsadd.h       vr28,     vr14,     vr31         // c[3]
   2853    vssub.h       vr29,     vr14,     vr31         // c[12]
   2854    vsadd.h       vr20,     vr15,     vr30         // c[4]
   2855    vssub.h       vr21,     vr15,     vr30         // c[11]
   2856    vsadd.h       vr14,     vr16,     vr27         // c[5]
   2857    vssub.h       vr23,     vr16,     vr27         // c[10]
   2858    vsadd.h       vr15,     vr17,     vr9          // c[6]
   2859    vssub.h       vr30,     vr17,     vr9          // c[9]
   2860    vsadd.h       vr16,     vr18,     vr25         // c[7]
   2861    vssub.h       vr27,     vr18,     vr25         // c[8]
   2862    vsadd.h       vr17,     vr13,     vr26         // c[2]
   2863    vssub.h       vr26,     vr13,     vr26         // c[13]
   2864    vsadd.h       vr18,     vr12,     vr24         // c[1]
   2865    vssub.h       vr25,     vr12,     vr24         // c[14]
   2866    vsadd.h       vr22,     vr11,     vr10         // c[0]
   2867    vssub.h       vr24,     vr11,     vr10         // c[15]
   2868 .endm // dct_8x16_tx64_core_lsx
   2869 
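/*
* Widen \in0 to 32 bit, multiply the low/high halves by the replicated
* coefficients \in1 and \in2, and narrow back with rounding and saturation:
* \out0 = sat16((\in0 * \in1 + 2048) >> 12), \out1 likewise with \in2.
*/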
   2870 .macro vmul_vssrarni_hw in0, in1, in2, tmp0, tmp1, out0, out1
   2871    vsllwil.w.h   vr22,      \in0,     0
   2872    vexth.w.h     vr23,      \in0
   2873    vmul.w        \tmp0,     vr22,     \in1
   2874    vmul.w        \out0,     vr23,     \in1
   2875    vmul.w        \tmp1,     vr22,     \in2
   2876    vmul.w        \out1,     vr23,     \in2
   2877    vssrarni.h.w  \out0,     \tmp0,    12
   2878    vssrarni.h.w  \out1,     \tmp1,    12
   2879 .endm
   2880 
   2881 const idct64_coeffs, align=4
   2882    .word         101, 4095, 2967, -2824
   2883    .word         1660, 3745, 3822, -1474
   2884    .word         4076, 401, 4017, 799
   2885    .word         4036, -700, 2359, 3349
   2886    .word         3461, -2191, 897, 3996
   2887    .word         -3166, -2598, -799, -4017
   2888    .word         501, 4065, 3229, -2520
   2889    .word         2019, 3564, 3948, -1092
   2890    .word         3612, 1931, 2276, 3406
   2891    .word         4085, -301, 2675, 3102
   2892    .word         3659, -1842, 1285, 3889
   2893    .word         -3920, -1189, -3406, -2276
   2894 endconst
   2895 
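/*
* First stage of the odd half of the 64-point DCT: consumes one group of
* four inputs (vr0..vr3) together with the twelve idct64_coeffs words at t0
* and stores the resulting eight t32a..t63a-class intermediates at t6.
*/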
   2896 .macro dct64_step1_lsx
   2897    vldrepl.w     vr20,     t0,       0            // 101
   2898    vldrepl.w     vr21,     t0,       4            // 4095
   2899    vmul_vssrarni_hw vr0, vr20, vr21, vr16, vr0, vr8, vr9    // vr8 t32a vr9 t63a
   2900    vldrepl.w     vr20,     t0,       8            // 2967
   2901    vldrepl.w     vr21,     t0,       12           // -2824
   2902    vmul_vssrarni_hw vr1, vr20, vr21, vr16, vr1, vr10, vr11  // vr10 t62a vr11 t33a
   2903    vldrepl.w     vr20,     t0,       16           // 1660
   2904    vldrepl.w     vr21,     t0,       20           // 3745
   2905    vmul_vssrarni_hw vr2, vr20, vr21, vr16, vr2, vr12, vr13  // vr12 t34a vr13 t61a
   2906    vldrepl.w     vr20,     t0,       24           // 3822
   2907    vldrepl.w     vr21,     t0,       28           // -1474
   2908    vmul_vssrarni_hw vr3, vr20, vr21, vr16, vr3, vr14, vr15  // vr14 t60a vr15 t35a
   2909 
   2910    vsadd.h       vr0,      vr8,      vr11         // t32
   2911    vssub.h       vr1,      vr8,      vr11         // t33
   2912    vssub.h       vr2,      vr15,     vr12         // t34
   2913    vsadd.h       vr3,      vr15,     vr12         // t35
   2914    vsadd.h       vr4,      vr14,     vr13         // t60
   2915    vssub.h       vr5,      vr14,     vr13         // t61
   2916    vssub.h       vr6,      vr9,      vr10         // t62
   2917    vsadd.h       vr7,      vr9,      vr10         // t63
   2918 
   2919    vldrepl.w     vr20,     t0,       32           // 4076
   2920    vldrepl.w     vr21,     t0,       36           // 401
   2921    vmul_vmadd_w vr6, vr1, vr20, vr21, vr9, vr10
   2922    vmul_vmsub_w vr6, vr1, vr21, vr20, vr13, vr11
   2923    vssrarni.h.w  vr10,     vr9,      12           // t62a
   2924    vssrarni.h.w  vr11,     vr13,     12           // t33a
   2925 
   2926    vmul_vmadd_w vr5, vr2, vr20, vr21, vr9, vr1
   2927    vmul_vmsub_w vr5, vr2, vr21, vr20, vr13, vr6
   2928    vneg.w        vr9,      vr9
   2929    vneg.w        vr1,      vr1
   2930    vssrarni.h.w  vr6,      vr13,     12           // t61a
   2931    vssrarni.h.w  vr1,      vr9,      12           // t34a
   2932 
   2933    vsadd.h       vr2,      vr0,      vr3          // t32a
   2934    vssub.h       vr5,      vr0,      vr3          // t35a
   2935    vsadd.h       vr9,      vr11,     vr1          // t33
   2936    vssub.h       vr13,     vr11,     vr1          // t34
   2937    vssub.h       vr0,      vr7,      vr4          // t60a
   2938    vsadd.h       vr3,      vr7,      vr4          // t63a
   2939    vssub.h       vr1,      vr10,     vr6          // t61
   2940    vsadd.h       vr11,     vr10,     vr6          // t62
   2941 
   2942    vldrepl.w     vr20,     t0,       40           // 4017
   2943    vldrepl.w     vr21,     t0,       44           // 799
   2944    vmul_vmadd_w vr1, vr13, vr20, vr21, vr8, vr4
   2945    vmul_vmsub_w vr1, vr13, vr21, vr20, vr12, vr7
   2946    vssrarni.h.w  vr4,      vr8,      12           // t61a
   2947    vssrarni.h.w  vr7,      vr12,     12           // t34a
   2948 
   2949    vmul_vmadd_w vr0, vr5, vr20, vr21, vr8, vr6
   2950    vmul_vmsub_w vr0, vr5, vr21, vr20, vr12, vr10
   2951    vssrarni.h.w  vr6,      vr8,      12           // t60
   2952    vssrarni.h.w  vr10,     vr12,     12           // t35
   2953 
   2954    vst_x8 t6, 0, 16, vr2, vr9, vr7, vr10, vr6, vr4, vr11, vr3
   2955 .endm // dct64_step1_lsx
   2956 
   2957    // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
   2958    // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
   2959    // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
   2960    // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
   2961 .macro dct64_step2_lsx
   2962    vld           vr0,      t5,       0            // t32a
   2963    vld           vr2,      t4,       0            // t63a
   2964    vld           vr3,      t5,       16*8         // t56a
   2965    vld           vr1,      t4,       16*8         // t39a
   2966    vld           vr4,      t5,       16*16        // t40a
   2967    vld           vr6,      t4,       16*16        // t55a
   2968    vld           vr7,      t5,       16*24        // t48a
   2969    vld           vr5,      t4,       16*24        // t47a
   2970 
   2971    vsadd.h       vr8,      vr0,      vr1          // t32
   2972    vssub.h       vr9,      vr0,      vr1          // t39
   2973    vsadd.h       vr10,     vr2,      vr3          // t63
   2974    vssub.h       vr11,     vr2,      vr3          // t56
   2975    vssub.h       vr12,     vr5,      vr4          // t40
   2976    vsadd.h       vr13,     vr5,      vr4          // t47
   2977    vsadd.h       vr14,     vr7,      vr6          // t48
   2978    vssub.h       vr15,     vr7,      vr6          // t55
   2979    vldrepl.w     vr20,     t0,       8            // 1567
   2980    vldrepl.w     vr21,     t0,       12           // 3784
   2981    vmul_vmadd_w  vr11, vr9, vr21, vr20, vr0, vr2
   2982    vmul_vmsub_w  vr11, vr9, vr20, vr21, vr1, vr3
   2983    vssrarni.h.w  vr2,      vr0,      12           // t56a
   2984    vssrarni.h.w  vr3,      vr1,      12           // t39a
   2985    vmul_vmadd_w  vr15, vr12, vr21, vr20, vr0, vr4
   2986    vmul_vmsub_w  vr15, vr12, vr20, vr21, vr1, vr5
   2987    vneg.w        vr0,      vr0
   2988    vneg.w        vr4,      vr4
   2989    vssrarni.h.w  vr5,      vr1,      12           // t55a
   2990    vssrarni.h.w  vr4,      vr0,      12           // t40a
   2991    vsadd.h       vr9,      vr8,      vr13         // t32a
   2992    vssub.h       vr11,     vr8,      vr13         // t47a
   2993    vsadd.h       vr6,      vr3,      vr4          // t39
   2994    vssub.h       vr7,      vr3,      vr4          // t40
   2995    vssub.h       vr12,     vr10,     vr14         // t48a
   2996    vsadd.h       vr15,     vr10,     vr14         // t63a
   2997    vssub.h       vr0,      vr2,      vr5          // t55
   2998    vsadd.h       vr1,      vr2,      vr5          // t56
   2999 
   3000    vldrepl.w     vr20,     t0,       0            // 2896
   3001    vmul_vmsub_w vr0, vr7, vr20, vr20, vr8, vr13
   3002    vmul_vmadd_w vr0, vr7, vr20, vr20, vr3, vr4
   3003    vssrarni.h.w  vr13,     vr8,      12           // t40a
   3004    vssrarni.h.w  vr4,      vr3,      12           // t55a
   3005    vmul_vmsub_w vr12, vr11, vr20, vr20, vr8, vr10
   3006    vmul_vmadd_w vr12, vr11, vr20, vr20, vr3, vr14
   3007    vssrarni.h.w  vr10,     vr8,      12           // t47
   3008    vssrarni.h.w  vr14,     vr3,      12           // t48
   3009 
   3010    // t32a t39 t40a t47  t48  t55a t56 t63a
   3011    // vr9  vr6 vr13 vr10 vr14 vr4  vr1 vr15
   3012    vst           vr9,      t5,       0            // t32a
   3013    vst           vr6,      t4,       0            // t39
   3014    vst           vr13,     t5,       16*8         // t40a
   3015    vst           vr10,     t4,       16*8         // t47
   3016    vst           vr14,     t5,       16*16        // t48
   3017    vst           vr4,      t4,       16*16        // t55a
   3018    vst           vr1,      t5,       16*24        // t56
   3019    vst           vr15,     t4,       16*24        // t63a
   3020 .endm // dct64_step2_lsx
   3021 
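/*
* Merge the eight dct32 outputs at t3 with the t56..t63(a) values kept at
* t5/t4 into the final butterflies: c[0..7] end up in vr20, vr22, vr24,
* vr26, vr28, vr30, vr2, vr1 and c[56..63] in vr3, vr15, vr31, vr29, vr27,
* vr25, vr23, vr21.
*/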
   3022 .macro dct64_step3_lsx
   3023    //                t0   t1   t2   t3   t4    t5    t6    t7
   3024    vld_x8 t3, 0, 16, vr2, vr3, vr7, vr8, vr11, vr12, vr16, vr17
   3025    vld           vr9,      t5,       16*24    // t56
   3026    vld           vr6,      t5,       16*24+16 // t57a
   3027    vld           vr13,     t5,       16*24+32 // t58
   3028    vld           vr10,     t5,       16*24+48 // t59a
   3029    vld           vr14,     t4,       16*24-48 // t60
   3030    vld           vr4,      t4,       16*24-32 // t61a
   3031    vld           vr1,      t4,       16*24-16 // t62
   3032    vld           vr15,     t4,       16*24    // t63a
   3033    vsadd.h       vr20,     vr2,      vr15     // c[0]
   3034    vssub.h       vr21,     vr2,      vr15     // c[63]
   3035    vsadd.h       vr22,     vr3,      vr1      // c[1]
   3036    vssub.h       vr23,     vr3,      vr1      // c[62]
   3037    vsadd.h       vr24,     vr7,      vr4      // c[2]
   3038    vssub.h       vr25,     vr7,      vr4      // c[61]
   3039    vsadd.h       vr26,     vr8,      vr14     // c[3]
   3040    vssub.h       vr27,     vr8,      vr14     // c[60]
   3041    vsadd.h       vr28,     vr11,     vr10     // c[4]
   3042    vssub.h       vr29,     vr11,     vr10     // c[59]
   3043    vsadd.h       vr30,     vr12,     vr13     // c[5]
   3044    vssub.h       vr31,     vr12,     vr13     // c[58]
   3045    vsadd.h       vr2,      vr16,     vr6      // c[6]
   3046    vssub.h       vr15,     vr16,     vr6      // c[57]
   3047    vsadd.h       vr1,      vr17,     vr9      // c[7]
   3048    vssub.h       vr3,      vr17,     vr9      // c[56]
   3049 .endm // dct64_step3_lsx
   3050 
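/*
* dct64_step3_lsx plus the write-back used by the first pass: optionally
* transpose the two 8x8 result halves, optionally round them by \shift,
* then store them at t7 using \start0/\stride0 and \start1/\stride1.
*/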
   3051 .macro dct64_step4_lsx transpose8x8, shift, start0, stride0, start1, stride1
   3052    dct64_step3_lsx
   3053 
   3054 .ifnb \transpose8x8
   3055    LSX_TRANSPOSE8x8_H vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
   3056                       vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
   3057                       vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13
   3058 
   3059    LSX_TRANSPOSE8x8_H vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
   3060                       vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21, \
   3061                       vr4, vr7, vr8, vr14, vr10, vr11, vr12, vr13
   3062 .endif
   3063 
   3064 .ifnb \shift
   3065 .irp i, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1, \
   3066     vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
   3067     vsrari.h     \i,       \i,       \shift
   3068 .endr
   3069 .endif
   3070 
   3071    vst_x8 t7, \start0, \stride0, vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
   3072 
   3073    vst_x8 t7, \start1, \stride1, vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
   3074 .endm // dct64_step4_lsx
   3075 
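/*
* Final reconstruction of eight 8-pixel rows: load the destination rows via
* t0/t6, round the residuals \in0..\in7 by 4, add with a saturating narrow
* to bytes and store the packed rows through t1/t2.
*/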
   3076 .macro dct64_step5_lsx in0, in1, in2, in3, in4, in5, in6, in7
   3077    fld.d         f4,       t0,       0
   3078    fldx.d        f5,       t0,       a1
   3079    fld.d         f6,       t6,       0
   3080    fldx.d        f7,       t6,       a1
   3081    alsl.d        t0,       a1,       t0,    2
   3082    alsl.d        t6,       a1,       t6,    2
   3083    fld.d         f8,       t0,       0
   3084    fldx.d        f9,       t0,       a1
   3085    fld.d         f10,      t6,       0
   3086    fldx.d        f11,      t6,       a1
   3087 .irp i, vr4, vr5, vr6, vr7, vr8, vr9, vr10, vr11
   3088    vsllwil.hu.bu   \i,      \i,       0
   3089 .endr
   3090    vsrari.h      vr20,     \in0,     4
   3091    vsrari.h      vr22,     \in1,     4
   3092    vsrari.h      vr24,     \in2,     4
   3093    vsrari.h      vr26,     \in3,     4
   3094    vsrari.h      vr28,     \in4,     4
   3095    vsrari.h      vr30,     \in5,     4
   3096    vsrari.h      vr2,      \in6,     4
   3097    vsrari.h      vr1,      \in7,     4
   3098    vadd.h        vr4,      vr4,      vr20
   3099    vadd.h        vr5,      vr5,      vr22
   3100    vadd.h        vr6,      vr6,      vr24
   3101    vadd.h        vr7,      vr7,      vr26
   3102    vadd.h        vr8,      vr8,      vr28
   3103    vadd.h        vr9,      vr9,      vr30
   3104    vadd.h        vr10,     vr10,     vr2
   3105    vadd.h        vr11,     vr11,     vr1
   3106    vssrani.bu.h  vr5,      vr4,      0
   3107    vssrani.bu.h  vr7,      vr6,      0
   3108    vssrani.bu.h  vr9,      vr8,      0
   3109    vssrani.bu.h  vr11,     vr10,     0
   3110 
   3111    vstelm.d      vr5,      t1,       0,     0
   3112    vstelm.d      vr5,      t2,       0,     1
   3113    alsl.d        t1,       a1,       t1,    1
   3114    alsl.d        t2,       a1,       t2,    1
   3115    vstelm.d      vr7,      t1,       0,     0
   3116    vstelm.d      vr7,      t2,       0,     1
   3117    alsl.d        t1,       a1,       t1,    1
   3118    alsl.d        t2,       a1,       t2,    1
   3119    vstelm.d      vr9,      t1,       0,     0
   3120    vstelm.d      vr9,      t2,       0,     1
   3121    alsl.d        t1,       a1,       t1,    1
   3122    alsl.d        t2,       a1,       t2,    1
   3123    vstelm.d      vr11,     t1,       0,     0
   3124    vstelm.d      vr11,     t2,       0,     1
   3125 .endm // dct64_step5_lsx
   3126 
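/*
* tx64 variant of the 8x32 DCT: the even rows (from t2 + \vld_loc0) pass
* through dct_8x16_tx64_core_lsx, the odd rows (from t2 + \vld_loc1)
* through the reduced single-input rotations below; consumed coefficients
* are cleared with vr31 and the 32 results are stored back at t3.
*/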
   3127 .macro dct_8x32_tx64_new_lsx vld_loc0, stride0, vld_loc1, stride1, rect2
   3128    vld_x8 t2, \vld_loc0, \stride0, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   3129 
   3130    dct_8x16_tx64_core_lsx \rect2
   3131 
   3132    vst_x16 t3, 0, 16, vr22, vr18, vr17, vr28, vr20, vr14, vr15, vr16, \
   3133            vr27, vr30, vr23, vr21, vr29, vr26, vr25, vr24
   3134 
   3135    vxor.v        vr31,     vr31,     vr31
   3136    vst_x8 t2, \vld_loc0, \stride0, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   3137 
   3138    vld_x8 t2, \vld_loc1, \stride1, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   3139 
   3140    vst_x8 t2, \vld_loc1, \stride1, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   3141 
   3142    la.local      t0,       idct_coeffs
   3143 
   3144 .ifc \rect2, rect2_lsx
   3145    vldrepl.w     vr23,      t0,       0        // 2896
   3146 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   3147    rect2_lsx \i, vr23, \i
   3148 .endr
   3149 .endif
   3150 
   3151    vldrepl.w     vr20,     t0,       64           // 201
   3152    vldrepl.w     vr21,     t0,       68           // 4091
   3153    vsllwil.w.h   vr22,     vr0,      0
   3154    vexth.w.h     vr23,     vr0
   3155    vmul.w        vr8,      vr22,     vr21
   3156    vmul.w        vr9,      vr23,     vr21
   3157    vmul.w        vr0,      vr22,     vr20
   3158    vmul.w        vr10,     vr23,     vr20
   3159    vssrarni.h.w  vr9,      vr8,      12           // t31a
   3160    vssrarni.h.w  vr10,     vr0,      12           // t16a
   3161 
   3162    vldrepl.w     vr20,     t0,       72           // 3035
   3163    vldrepl.w     vr21,     t0,       76           // 2751
   3164    vsllwil.w.h   vr22,     vr7,      0
   3165    vexth.w.h     vr23,     vr7
   3166    vneg.w        vr21,     vr21
   3167    vmul.w        vr8,      vr22,     vr20
   3168    vmul.w        vr0,      vr23,     vr20
   3169    vmul.w        vr7,      vr22,     vr21
   3170    vmul.w        vr30,     vr23,     vr21
   3171    vssrarni.h.w  vr0,      vr8,      12           // t30a
   3172    vssrarni.h.w  vr30,     vr7,      12           // t17a
   3173 
   3174    vldrepl.w     vr20,     t0,       80           // 1751
   3175    vldrepl.w     vr21,     t0,       84           // 3703
   3176    vsllwil.w.h   vr22,     vr4,      0
   3177    vexth.w.h     vr23,     vr4
   3178    vmul.w        vr8,      vr22,     vr21
   3179    vmul.w        vr7,      vr23,     vr21
   3180    vmul.w        vr4,      vr22,     vr20
   3181    vmul.w        vr19,     vr23,     vr20
   3182    vssrarni.h.w  vr7,      vr8,      12           // t29a
   3183    vssrarni.h.w  vr19,     vr4,      12           // t18a
   3184 
   3185    vldrepl.w     vr20,     t0,       88           // 3857
   3186    vldrepl.w     vr21,     t0,       92           // 1380
   3187    vsllwil.w.h   vr22,     vr3,      0
   3188    vexth.w.h     vr23,     vr3
   3189    vneg.w        vr21,     vr21
   3190    vmul.w        vr8,      vr22,     vr20
   3191    vmul.w        vr4,      vr23,     vr20
   3192    vmul.w        vr3,      vr22,     vr21
   3193    vmul.w        vr26,     vr23,     vr21
   3194    vssrarni.h.w  vr4,      vr8,      12           // t28a
   3195    vssrarni.h.w  vr26,     vr3,      12           // t19a
   3196 
   3197    vldrepl.w     vr20,     t0,       96           // 995
   3198    vldrepl.w     vr21,     t0,       100          // 3973
   3199    vsllwil.w.h   vr22,     vr2,      0
   3200    vexth.w.h     vr23,     vr2
   3201    vmul.w        vr8,      vr22,     vr21
   3202    vmul.w        vr3,      vr23,     vr21
   3203    vmul.w        vr2,      vr22,     vr20
   3204    vmul.w        vr27,     vr23,     vr20
   3205    vssrarni.h.w  vr3,      vr8,      12           // t27a
   3206    vssrarni.h.w  vr27,     vr2,      12           // t20a
   3207 
   3208    vldrepl.w     vr20,     t0,       104          // 3513
   3209    vldrepl.w     vr21,     t0,       108          // 2106
   3210    vsllwil.w.h   vr22,     vr5,      0
   3211    vexth.w.h     vr23,     vr5
   3212    vneg.w        vr21,     vr21
   3213    vmul.w        vr8,      vr22,     vr20
   3214    vmul.w        vr2,      vr23,     vr20
   3215    vmul.w        vr5,      vr22,     vr21
   3216    vmul.w        vr28,     vr23,     vr21
   3217    vssrarni.h.w  vr2,      vr8,      12           // t26a
   3218    vssrarni.h.w  vr28,     vr5,      12           // t21a
   3219 
   3220    vldrepl.w     vr20,     t0,       112          // 2440 -> 1220
   3221    vldrepl.w     vr21,     t0,       116          // 3290 -> 1645
   3222    vsllwil.w.h   vr22,     vr6,      0
   3223    vexth.w.h     vr23,     vr6
   3224    vmul.w        vr8,      vr22,     vr21
   3225    vmul.w        vr5,      vr23,     vr21
   3226    vmul.w        vr6,      vr22,     vr20
   3227    vmul.w        vr25,     vr23,     vr20
   3228    vssrarni.h.w  vr5,      vr8,      12           // t25a
   3229    vssrarni.h.w  vr25,     vr6,      12           // t22a
   3230 
   3231    vldrepl.w     vr20,     t0,       120          // 4052
   3232    vldrepl.w     vr21,     t0,       124          // 601
   3233    vsllwil.w.h   vr22,     vr1,      0
   3234    vexth.w.h     vr23,     vr1
   3235    vneg.w        vr21,     vr21
   3236    vmul.w        vr8,      vr22,     vr20
   3237    vmul.w        vr6,      vr23,     vr20
   3238    vmul.w        vr1,      vr22,     vr21
   3239    vmul.w        vr24,     vr23,     vr21
   3240    vssrarni.h.w  vr6,      vr8,      12           // t24a
   3241    vssrarni.h.w  vr24,     vr1,      12           // t23a
   3242 
   3243    vsadd.h       vr1,      vr10,     vr30         // t16
   3244    vssub.h       vr29,     vr10,     vr30         // t17
   3245    vssub.h       vr8,      vr26,     vr19         // t18
   3246    vsadd.h       vr31,     vr26,     vr19         // t19
   3247    vsadd.h       vr10,     vr27,     vr28         // t20
   3248    vssub.h       vr30,     vr27,     vr28         // t21
   3249    vssub.h       vr19,     vr24,     vr25         // t22
   3250    vsadd.h       vr26,     vr24,     vr25         // t23
   3251    vsadd.h       vr27,     vr6,      vr5          // t24
   3252    vssub.h       vr28,     vr6,      vr5          // t25
   3253    vssub.h       vr24,     vr3,      vr2          // t26
   3254    vsadd.h       vr25,     vr3,      vr2          // t27
   3255    vsadd.h       vr5,      vr4,      vr7          // t28
   3256    vssub.h       vr6,      vr4,      vr7          // t29
   3257    vssub.h       vr2,      vr9,      vr0          // t30
   3258    vsadd.h       vr3,      vr9,      vr0          // t31
   3259 
   3260    vldrepl.w     vr20,     t0,       16           // 799
   3261    vldrepl.w     vr21,     t0,       20           // 4017
   3262    vmul_vmadd_w vr2, vr29, vr21, vr20, vr4, vr7
   3263    vmul_vmsub_w vr2, vr29, vr20, vr21, vr11, vr0
   3264    vssrarni.h.w  vr7,      vr4,      12           // t30a
   3265    vssrarni.h.w  vr0,      vr11,     12           // t17a
   3266    vmul_vmadd_w vr6, vr8, vr21, vr20, vr4, vr9
   3267    vneg.w        vr4,      vr4
   3268    vneg.w        vr9,      vr9
   3269    vmul_vmsub_w vr6, vr8, vr20, vr21, vr11, vr2
   3270    vssrarni.h.w  vr9,      vr4,      12           // t18a
   3271    vssrarni.h.w  vr2,      vr11,     12           // t29a
   3272 
   3273    vldrepl.w     vr20,     t0,       24           // 3406 -> 1703
   3274    vldrepl.w     vr21,     t0,       28           // 2276 -> 1138
   3275    vmul_vmadd_w vr24, vr30, vr21, vr20, vr4, vr29
   3276    vmul_vmsub_w vr24, vr30, vr20, vr21, vr11, vr6
   3277    vssrarni.h.w  vr29,     vr4,      12           // t26a
   3278    vssrarni.h.w  vr6,      vr11,     12           // t21a
   3279 
   3280    vmul_vmadd_w vr28, vr19, vr21, vr20, vr4, vr8
   3281    vneg.w        vr4,      vr4
   3282    vneg.w        vr8,      vr8
   3283    vmul_vmsub_w vr28, vr19, vr20, vr21, vr11, vr24
   3284    vssrarni.h.w  vr8,      vr4,      12           // t22a
   3285    vssrarni.h.w  vr24,     vr11,     12           // t25a
   3286 
   3287    vsadd.h       vr4,      vr1,      vr31         // t16a
   3288    vssub.h       vr30,     vr1,      vr31         // t19a
   3289    vsadd.h       vr19,     vr0,      vr9          // t17
   3290    vssub.h       vr28,     vr0,      vr9          // t18
   3291    vssub.h       vr1,      vr26,     vr10         // t20a
   3292    vsadd.h       vr31,     vr26,     vr10         // t23a
   3293    vssub.h       vr0,      vr8,      vr6          // t21
   3294    vsadd.h       vr9,      vr8,      vr6          // t22
   3295    vsadd.h       vr10,     vr27,     vr25         // t24a
   3296    vssub.h       vr26,     vr27,     vr25         // t27a
   3297    vsadd.h       vr6,      vr24,     vr29         // t25
   3298    vssub.h       vr8,      vr24,     vr29         // t26
   3299    vssub.h       vr25,     vr3,      vr5          // t28a
   3300    vsadd.h       vr27,     vr3,      vr5          // t31a
   3301    vssub.h       vr24,     vr7,      vr2          // t29
   3302    vsadd.h       vr29,     vr7,      vr2          // t30
   3303 
   3304    vldrepl.w     vr20,     t0,       8            // 1567
   3305    vldrepl.w     vr21,     t0,       12           // 3784
   3306    vmul_vmadd_w vr24, vr28, vr21, vr20, vr3, vr5
   3307    vmul_vmsub_w vr24, vr28, vr20, vr21, vr11, vr2
   3308    vssrarni.h.w  vr5,      vr3,      12           // t29a
   3309    vssrarni.h.w  vr2,      vr11,     12           // t18a
   3310 
   3311    vmul_vmadd_w vr25, vr30, vr21, vr20, vr3, vr7
   3312    vmul_vmsub_w vr25, vr30, vr20, vr21, vr11, vr24
   3313    vssrarni.h.w  vr7,      vr3,      12           // t28
   3314    vssrarni.h.w  vr24,     vr11,     12           // t19
   3315 
   3316    vmul_vmadd_w vr26, vr1, vr21, vr20, vr3, vr28
   3317    vneg.w        vr3,      vr3
   3318    vneg.w        vr28,     vr28
   3319    vmul_vmsub_w vr26, vr1, vr20, vr21, vr11, vr25
   3320    vssrarni.h.w  vr28,     vr3,      12           // t20
   3321    vssrarni.h.w  vr25,     vr11,     12           // t27
   3322 
   3323    vmul_vmadd_w vr8, vr0, vr21, vr20, vr3, vr30
   3324    vneg.w        vr3,      vr3
   3325    vneg.w        vr30,     vr30
   3326    vmul_vmsub_w vr8, vr0, vr20, vr21, vr11, vr1
   3327    vssrarni.h.w  vr30,     vr3,      12           // t21a
   3328    vssrarni.h.w  vr1,      vr11,     12           // t26a
   3329 
   3330    vsadd.h       vr3,      vr4,      vr31         // t16
   3331    vssub.h       vr26,     vr4,      vr31         // t23
   3332    vsadd.h       vr0,      vr19,     vr9          // t17a
   3333    vssub.h       vr8,      vr19,     vr9          // t22a
   3334    vsadd.h       vr4,      vr2,      vr30         // t18
   3335    vssub.h       vr31,     vr2,      vr30         // t21
   3336    vsadd.h       vr9,      vr24,     vr28         // t19a
   3337    vssub.h       vr19,     vr24,     vr28         // t20a
   3338    vssub.h       vr2,      vr27,     vr10         // t24
   3339    vsadd.h       vr30,     vr27,     vr10         // t31
   3340    vssub.h       vr24,     vr29,     vr6          // t25a
   3341    vsadd.h       vr28,     vr29,     vr6          // t30a
   3342    vssub.h       vr10,     vr5,      vr1          // t26
   3343    vsadd.h       vr27,     vr5,      vr1          // t29
   3344    vssub.h       vr6,      vr7,      vr25         // t27a
   3345    vsadd.h       vr29,     vr7,      vr25         // t28a
   3346 
   3347    vldrepl.w     vr20,     t0,       0            // 2896
   3348    vmul_vmsub_w vr6, vr19, vr20, vr20, vr1, vr5
   3349    vmul_vmadd_w vr6, vr19, vr20, vr20, vr11, vr7
   3350    vssrarni.h.w  vr5,      vr1,      12           // t20
   3351    vssrarni.h.w  vr7,      vr11,     12           // t27
   3352 
   3353    vmul_vmsub_w vr10, vr31, vr20, vr20, vr1, vr25
   3354    vmul_vmadd_w vr10, vr31, vr20, vr20, vr11, vr6
   3355    vssrarni.h.w  vr25,     vr1,      12           // t21a
   3356    vssrarni.h.w  vr6,      vr11,     12           // t26a
   3357 
   3358    vmul_vmsub_w vr24, vr8, vr20, vr20, vr1, vr19
   3359    vmul_vmadd_w vr24, vr8, vr20, vr20, vr11, vr10
   3360    vssrarni.h.w  vr19,     vr1,      12           // t22
   3361    vssrarni.h.w  vr10,     vr11,     12           // t25
   3362 
   3363    vmul_vmsub_w vr2, vr26, vr20, vr20, vr1, vr31
   3364    vmul_vmadd_w vr2, vr26, vr20, vr20, vr11, vr8
   3365    vssrarni.h.w  vr31,     vr1,      12           // t23a
   3366    vssrarni.h.w  vr8,      vr11,     12           // t24a
   3367 
   3368    // t31 t30a t29 t28a t27 t26a t25 t24a t23a t22 t21a t20 t19a t18 t17a t16
   3369    // vr30 vr28 vr27 vr29 vr7 vr6 vr10 vr8 vr31 vr19 vr25 vr5 vr9 vr4 vr0 vr3
   3370    vld_x8 t3, 0, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
   3371 
   3372    vsadd.h       vr1,      vr11,     vr30         // c[0]
   3373    vssub.h       vr2,      vr11,     vr30         // c[31]
   3374    vsadd.h       vr24,     vr12,     vr28         // c[1]
   3375    vssub.h       vr26,     vr12,     vr28         // c[30]
   3376    vsadd.h       vr11,     vr13,     vr27         // c[2]
   3377    vssub.h       vr30,     vr13,     vr27         // c[29]
   3378    vsadd.h       vr12,     vr14,     vr29         // c[3]
   3379    vssub.h       vr28,     vr14,     vr29         // c[28]
   3380    vsadd.h       vr13,     vr15,     vr7          // c[4]
   3381    vssub.h       vr27,     vr15,     vr7          // c[27]
   3382    vsadd.h       vr14,     vr16,     vr6          // c[5]
   3383    vssub.h       vr29,     vr16,     vr6          // c[26]
   3384    vsadd.h       vr7,      vr17,     vr10         // c[6]
   3385    vssub.h       vr15,     vr17,     vr10         // c[25]
   3386    vsadd.h       vr6,      vr18,     vr8          // c[7]
   3387    vssub.h       vr16,     vr18,     vr8          // c[24]
   3388 
   3389    vst_x8 t3, 0, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
   3390 
   3391    vst_x8 t3, 384, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
   3392 
   3393    vld_x8 t3, 128, 16, vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18
   3394 
   3395    vsadd.h       vr1,      vr11,     vr31         // c[8]
   3396    vssub.h       vr2,      vr11,     vr31         // c[23]
   3397    vsadd.h       vr24,     vr12,     vr19         // c[9]
   3398    vssub.h       vr26,     vr12,     vr19         // c[22]
   3399    vsadd.h       vr11,     vr13,     vr25         // c[10]
   3400    vssub.h       vr30,     vr13,     vr25         // c[21]
   3401    vsadd.h       vr12,     vr14,     vr5          // c[11]
   3402    vssub.h       vr28,     vr14,     vr5          // c[20]
   3403    vsadd.h       vr13,     vr15,     vr9          // c[12]
   3404    vssub.h       vr27,     vr15,     vr9          // c[19]
   3405    vsadd.h       vr14,     vr16,     vr4          // c[13]
   3406    vssub.h       vr29,     vr16,     vr4          // c[18]
   3407    vsadd.h       vr7,      vr17,     vr0          // c[14]
   3408    vssub.h       vr15,     vr17,     vr0          // c[17]
   3409    vsadd.h       vr6,      vr18,     vr3          // c[15]
   3410    vssub.h       vr16,     vr18,     vr3          // c[16]
   3411 
   3412    vst_x8 t3, 128, 16, vr1, vr24, vr11, vr12, vr13, vr14, vr7, vr6
   3413 
   3414    vst_x8 t3, 256, 16, vr16, vr15, vr29, vr27, vr28, vr30, vr26, vr2
   3415 .endm // dct_8x32_tx64_new_lsx
   3416 
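/*
* Add the eight residual vectors \in0..\in7 to one 64-pixel destination
* row: vr10..vr13 hold the row's pixels, which are widened to 16 bit,
* summed with the residuals, packed back with unsigned saturation and
* stored at a0.
*/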
   3417 .macro DST_ADD_W64 in0, in1, in2, in3, in4, in5, in6, in7
   3418    vsllwil.hu.bu vr4,      vr10,     0
   3419    vsllwil.hu.bu vr5,      vr11,     0
   3420    vsllwil.hu.bu vr6,      vr12,     0
   3421    vsllwil.hu.bu vr7,      vr13,     0
   3422    vexth.hu.bu   vr10,     vr10
   3423    vexth.hu.bu   vr11,     vr11
   3424    vexth.hu.bu   vr12,     vr12
   3425    vexth.hu.bu   vr13,     vr13
   3426    vadd.h        vr4,      vr4,      \in0
   3427    vadd.h        vr10,     vr10,     \in1
   3428    vadd.h        vr5,      vr5,      \in2
   3429    vadd.h        vr11,     vr11,     \in3
   3430    vadd.h        vr6,      vr6,      \in4
   3431    vadd.h        vr12,     vr12,     \in5
   3432    vadd.h        vr7,      vr7,      \in6
   3433    vadd.h        vr13,     vr13,     \in7
   3434    vssrani.bu.h  vr10,     vr4,      0
   3435    vssrani.bu.h  vr11,     vr5,      0
   3436    vssrani.bu.h  vr12,     vr6,      0
   3437    vssrani.bu.h  vr13,     vr7,      0
   3438    vst           vr10,     a0,       0
   3439    vst           vr11,     a0,       16
   3440    vst           vr12,     a0,       32
   3441    vst           vr13,     a0,       48
   3442 .endm
   3443 
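/*
* DC-only path for 64-pixel-wide blocks: scale c[0] by 181 (vldi 0x8b5
* replicates 181, i.e. 2896 >> 4, per 32-bit lane), repeat the rounded >> 8
* step for rectangular sizes, apply the transform's \shift, and leave the
* replicated pixel offset in vr20; loads of the first destination row
* (vr10..vr13) are interleaved to hide latency.
*/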
   3444 .macro idct_dc_w64 w, h, shift
   3445    ld.h          t2,       a2,       0
   3446    vldi          vr0,      0x8b5
   3447    vreplgr2vr.w  vr1,      t2
   3448    vldi          vr20,     0x880
   3449    vmul.w        vr2,      vr0,      vr1
   3450    st.h          zero,     a2,       0
   3451    vsrari.w      vr2,      vr2,      8
   3452    vld           vr13,     a0,       48
   3453 
   3454 .if (2*\w == \h) || (2*\h == \w)
   3455    vmul.w        vr2,      vr2,      vr0
   3456    vsrari.w      vr2,      vr2,      8
   3457 .endif
   3458 
   3459 .if \shift>0
   3460    vsrari.w      vr2,      vr2,      \shift
   3461 .endif
   3462    vld           vr11,     a0,       16
   3463    vmadd.w       vr20,     vr2,      vr0
   3464    vld           vr12,     a0,       32
   3465    vssrarni.h.w  vr20,     vr20,     12
   3466    vld           vr10,     a0,       0
   3467 .endm
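/*
* For reference only (not assembled): a C sketch of the value idct_dc_w64
* leaves in vr20, assuming the usual dav1d DC-only rounding, where 181/256
* approximates 1/sqrt(2):
*
*    int dc = (c[0] * 181 + 128) >> 8;              // vmul.w + vsrari.w 8
*    if (2 * w == h || 2 * h == w)
*        dc = (dc * 181 + 128) >> 8;                // extra rect2 scaling
*    if (shift > 0)
*        dc = (dc + (1 << (shift - 1))) >> shift;   // vsrari.w \shift
*    dc = (dc * 181 + 128 + 2048) >> 12;            // vmadd.w + vssrarni.h.w
*    // DST_ADD_W64 then computes clip_pixel(dst[x] + dc) for each pixel
*/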
   3468 
   3469 function inv_txfm_add_dct_dct_64x64_8bpc_lsx
   3470    bnez          a3,       .NO_HAS_DCONLY_64x64
   3471 
   3472    idct_dc_w64 64, 64, 2
   3473 
   3474    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   3475 
   3476    li.w          t3,       63
   3477 .loop63:
   3478    add.d         a0,       a0,       a1
   3479    vld           vr10,     a0,       0
   3480    vld           vr11,     a0,       16
   3481    vld           vr12,     a0,       32
   3482    vld           vr13,     a0,       48
   3483    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   3484    addi.d        t3,       t3,       -1
   3485    blt           zero,     t3,       .loop63
   3486    b             .DCT_DCT_64X64_END
   3487 .NO_HAS_DCONLY_64x64:
   3488 
   3489    malloc_space  64*32*2+512+512
   3490 
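/*
* First (column) pass over one 8-column strip: dct32 of the even inputs via
* dct_8x32_tx64_new_lsx into the buffer at t3, the four odd-input groups
* through dct64_step1_lsx/dct64_step2_lsx, then dct64_step4_lsx transposes,
* rounds by \shift and stores the strip's 64 rows at t7.
*/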
   3491 .macro dct64x64_core1_lsx shift, rect2
   3492    //addi.d        t2,       a2,       \in0
   3493    //addi.d        t7,       t7,       \in1
   3494    li.w          t4,       64*32*2+64
   3495    add.d         t3,       sp,       t4
   3496    addi.d        t6,       t3,       512
   3497    add.d         t5,       t6,       zero
   3498 
   3499    dct_8x32_tx64_new_lsx 0, 256, 128, 256, \rect2
   3500 
   3501    la.local      t0,       idct64_coeffs
   3502    vxor.v        vr31,     vr31,     vr31
   3503 
   3504    //addi.d        a4,       a2,       \in2         // 32 ...
   3505    // in1/31/17/15 -> t32a/33/34a/35/60/61a/62/63a
   3506    vld           vr0,      a4,       128*0        // in1
   3507    vld           vr1,      a4,       128*15       // in31
   3508    vld           vr2,      a4,       128*8        // in17
   3509    vld           vr3,      a4,       128*7        // in15
   3510    la.local      a6,       idct_coeffs
   3511 .ifc \rect2, rect2_lsx
   3512    vldrepl.w     vr23,      a6,       0        // 2896
   3513 .irp i, vr0, vr1, vr2, vr3
   3514    rect2_lsx \i, vr23, \i
   3515 .endr
   3516 .endif
   3517    vst           vr31,     a4,       128*0
   3518    vst           vr31,     a4,       128*15
   3519    vst           vr31,     a4,       128*8
   3520    vst           vr31,     a4,       128*7
   3521    dct64_step1_lsx
   3522 
   3523    addi.d        t0,       t0,       48
   3524    addi.d        t6,       t6,       128
   3525    // in7/25/23/ 9 -> t56a/57/58a/59/36/37a/38/39a
   3526    vld           vr0,      a4,       128*3        // in7
   3527    vld           vr1,      a4,       128*12       // in25
   3528    vld           vr2,      a4,       128*11       // in23
   3529    vld           vr3,      a4,       128*4        // in9
   3530    la.local      a6,       idct_coeffs
   3531 .ifc \rect2, rect2_lsx
   3532    vldrepl.w     vr23,      a6,       0        // 2896
   3533 .irp i, vr0, vr1, vr2, vr3
   3534    rect2_lsx \i, vr23, \i
   3535 .endr
   3536 .endif
   3537    vst           vr31,     a4,       128*3
   3538    vst           vr31,     a4,       128*12
   3539    vst           vr31,     a4,       128*11
   3540    vst           vr31,     a4,       128*4
   3541    dct64_step1_lsx
   3542 
   3543    addi.d        t0,       t0,       48
   3544    addi.d        t6,       t6,       128
   3545    // in5/27/21/11 -> t40a/41/42a/43/52/53a/54/55a
   3546    vld           vr0,      a4,       128*2        // in5
   3547    vld           vr1,      a4,       128*13       // in27
   3548    vld           vr2,      a4,       128*10       // in21
   3549    vld           vr3,      a4,       128*5        // in11
   3550    la.local      a6,       idct_coeffs
   3551 .ifc \rect2, rect2_lsx
   3552    vldrepl.w     vr23,      a6,      0        // 2896
   3553 .irp i, vr0, vr1, vr2, vr3
   3554    rect2_lsx \i, vr23, \i
   3555 .endr
   3556 .endif
   3557    vst           vr31,     a4,       128*2
   3558    vst           vr31,     a4,       128*13
   3559    vst           vr31,     a4,       128*10
   3560    vst           vr31,     a4,       128*5
   3561    dct64_step1_lsx
   3562 
   3563    addi.d        t0,       t0,       48
   3564    addi.d        t6,       t6,       128
   3565    // in3/29/19/13 -> t48a/49/50a/51/44/45a/46/47a
   3566    vld           vr0,      a4,       128*1        // in3
   3567    vld           vr1,      a4,       128*14       // in29
   3568    vld           vr2,      a4,       128*9        // in19
   3569    vld           vr3,      a4,       128*6        // in13
   3570    la.local      a6,       idct_coeffs
   3571 .ifc \rect2, rect2_lsx
   3572    vldrepl.w     vr23,      a6,       0        // 2896
   3573 .irp i, vr0, vr1, vr2, vr3
   3574    rect2_lsx \i, vr23, \i
   3575 .endr
   3576 .endif
   3577    vst           vr31,     a4,       128*1
   3578    vst           vr31,     a4,       128*14
   3579    vst           vr31,     a4,       128*9
   3580    vst           vr31,     a4,       128*6
   3581    dct64_step1_lsx
   3582 
   3583    la.local      t0,       idct_coeffs
   3584    addi.d        t4,       t5,       16*7
   3585    // t32a/t39/t40a/t47/t48/t55a/t56/t63a
   3586    dct64_step2_lsx
   3587 
   3588    addi.d        t5,       t5,       16
   3589    addi.d        t4,       t4,       -16
   3590    // t33/t38a/t41/t46a/t49a/t54/t57a/t62
   3591    dct64_step2_lsx
   3592 
   3593    addi.d        t5,       t5,       16
   3594    addi.d        t4,       t4,       -16
   3595    // t34a/t37/t42a/t45/t50/t53a/t58/t61a
   3596    dct64_step2_lsx
   3597 
   3598    addi.d        t5,       t5,       16
   3599    addi.d        t4,       t4,       -16
   3600    // t35/t36a/t43/t44a/t51a/t52/t59a/t60
   3601    dct64_step2_lsx
   3602 
   3603    li.w          t4,       64*32*2+64+512
   3604    add.d         t5,       t4,       sp
   3605    addi.d        t4,       t5,       16*7
   3606    dct64_step4_lsx transpose8x8, \shift, 0, 128, 112, 128
   3607 
   3608    addi.d        t3,       t3,       128
   3609    addi.d        t4,       t4,       -16*8
   3610    addi.d        t5,       t5,       -16*8
   3611    dct64_step4_lsx transpose8x8, \shift, 16, 128, 96, 128
   3612 
   3613    addi.d        t5,       t5,       -16*8
   3614    addi.d        t4,       t4,       -16*8
   3615    addi.d        t3,       t3,       128
   3616    dct64_step4_lsx transpose8x8, \shift, 32, 128, 80, 128
   3617 
   3618    addi.d        t5,       t5,       -16*8
   3619    addi.d        t4,       t4,       -16*8
   3620    addi.d        t3,       t3,       128
   3621    dct64_step4_lsx transpose8x8, \shift, 48, 128, 64, 128
   3622 .endm
   3623    la.local      t8,       eob_32x32
   3624    addi.d        t2,       a2,       0
   3625    addi.d        t7,       sp,       64
   3626    addi.d        t7,       t7,       0
   3627    addi.d        a4,       a2,       64
   3628 .DCT_DCT_EOB_64x64:
   3629    ld.h          a5,       t8,       0
   3630    addi.d        t8,       t8,       2
   3631    dct64x64_core1_lsx 2, no_rect2
   3632    addi.d        t2,       t2,       16
   3633    addi.d        t7,       t7,       128*8
   3634    addi.d        a4,       a4,       16
   3635    bge           a3,       a5,       .DCT_DCT_EOB_64x64
   3636 
   3637    la.local      t8,       eob_32x32
   3638    vxor.v        vr31,     vr31,     vr31
   3639 
   3640    ld.h          t7,       t8,       4
   3641    bge           a3,       t7,       .DCT_DCT_EOB_64x64_END
   3642    li.d          t1,       1024*3+64
   3643    add.d         t0,       sp,       t1
   3644 .rept 4
   3645    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   3646            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   3647    addi.d        t0,       t0,       256
   3648 .endr
   3649 
   3650    ld.h          t7,       t8,       2
   3651    bge           a3,       t7,       .DCT_DCT_EOB_64x64_END
   3652    li.d          t1,       1024*2+64
   3653    add.d         t0,       sp,       t1
   3654 .rept 4
   3655    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   3656            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   3657    addi.d        t0,       t0,       256
   3658 .endr
   3659    ld.h          t7,       t8,       0
   3660    bge           a3,       t7,       .DCT_DCT_EOB_64x64_END
   3661 
   3662    li.d          t1,       1024*1+64
   3663    add.d         t0,       sp,       t1
   3664 .rept 4
   3665    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   3666            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   3667    addi.d        t0,       t0,       256
   3668 .endr
   3669 
   3670 .DCT_DCT_EOB_64x64_END:
   3671 
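/*
* Second (row) pass over one 8-column slice of the intermediate buffer at
* sp + 64 + \in0: the same dct64 staging as above, but dct64_step3_lsx /
* dct64_step5_lsx add the results straight to the destination in mirrored
* row pairs (0-7 with 56-63, 8-15 with 48-55, ...), with a0 advanced by
* \in1 pixels per invocation.
*/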
   3672 .macro dct64x64_core2_lsx in0, in1, rect2
   3673    addi.d        t2,       sp,       64+\in0
   3674    addi.d        t7,       sp,       64+\in0
   3675    li.w          t4,       64*32*2+64
   3676    add.d         t3,       sp,       t4
   3677    addi.d        t6,       t3,       512
   3678    add.d         t5,       t6,       zero
   3679 
   3680    addi.d        t2,       t2,       1024
   3681    addi.d        t2,       t2,       1024
   3682    dct_8x32_tx64_new_lsx -2048, 512, 256-2048, 512, \rect2
   3683 
   3684    la.local      t0,       idct64_coeffs
   3685    addi.d        t2,       sp,       64+64*2+\in0
   3686    addi.d        t4,       t2,       256*7
   3687    addi.d        t4,       t4,       256
   3688 
   3689    vld           vr0,      t2,       256*0        // in1
   3690    vld           vr1,      t4,       256*7        // in31
   3691    vld           vr2,      t4,       256*0        // in17
   3692    vld           vr3,      t2,       256*7        // in15
   3693    dct64_step1_lsx
   3694 
   3695    addi.d        t0,       t0,       48
   3696    addi.d        t6,       t6,       128
   3697    vld           vr0,      t2,       256*3        // in7
   3698    vld           vr1,      t4,       256*4        // in25
   3699    vld           vr2,      t4,       256*3        // in23
   3700    vld           vr3,      t2,       256*4        // in9
   3701    dct64_step1_lsx
   3702 
   3703    addi.d        t0,       t0,       48
   3704    addi.d        t6,       t6,       128
   3705    vld           vr0,      t2,       256*2        // in5
   3706    vld           vr1,      t4,       256*5        // in27
   3707    vld           vr2,      t4,       256*2        // in21
   3708    vld           vr3,      t2,       256*5        // in11
   3709    dct64_step1_lsx
   3710 
   3711    addi.d        t0,       t0,       48
   3712    addi.d        t6,       t6,       128
   3713    vld           vr0,      t2,       256*1        // in3
   3714    vld           vr1,      t4,       256*6        // in29
   3715    vld           vr2,      t4,       256*1        // in19
   3716    vld           vr3,      t2,       256*6        // in13
   3717    dct64_step1_lsx
   3718 
   3719    la.local      t0,       idct_coeffs
   3720    addi.d        t4,       t5,       16*7
   3721    // t32a/t39/t40a/t47/t48/t55a/t56/t63a
   3722    dct64_step2_lsx
   3723 
   3724    addi.d        t5,       t5,       16
   3725    addi.d        t4,       t4,       -16
   3726    // t33/t38a/t41/t46a/t49a/t54/t57a/t62
   3727    dct64_step2_lsx
   3728 
   3729    addi.d        t5,       t5,       16
   3730    addi.d        t4,       t4,       -16
   3731    // t34a/t37/t42a/t45/t50/t53a/t58/t61a
   3732    dct64_step2_lsx
   3733 
   3734    addi.d        t5,       t5,       16
   3735    addi.d        t4,       t4,       -16
   3736    // t35/t36a/t43/t44a/t51a/t52/t59a/t60
   3737    dct64_step2_lsx
   3738 
   3739    li.w          t4,       64*32*2+64+512
   3740    add.d         t5,       t4,       sp
   3741    addi.d        t4,       t5,       16*7
   3742    addi.d        a0,       a0,       \in1
   3743    // 0 - 7, 56 -63
   3744    dct64_step3_lsx
   3745    li.w          t8,       0
   3746    mul.w         t0,       t8,       a1
   3747    add.d         t0,       a0,       t0
   3748    alsl.d        t6,       a1,       t0,      1
   3749    addi.d        t1,       t0,       0
   3750    add.d         t2,       t0,       a1
   3751    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
   3752    li.w          t8,       56
   3753    mul.w         t0,       t8,       a1
   3754    add.d         t0,       a0,       t0
   3755    alsl.d        t6,       a1,       t0,      1
   3756    addi.d        t1,       t0,       0
   3757    add.d         t2,       t0,       a1
   3758    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
   3759 
   3760    // 8 - 15, 48 - 55
   3761    addi.d        t3,       t3,       128
   3762    addi.d        t4,       t4,       -16*8
   3763    addi.d        t5,       t5,       -16*8
   3764    dct64_step3_lsx
   3765    li.w          t8,       8
   3766    mul.w         t0,       t8,       a1
   3767    add.d         t0,       t0,       a0
   3768    alsl.d        t6,       a1,       t0,     1
   3769    addi.d        t1,       t0,       0
   3770    add.d         t2,       t0,       a1
   3771    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
   3772    li.w          t8,       48
   3773    mul.w         t0,       t8,       a1
   3774    add.d         t0,       t0,       a0
   3775    alsl.d        t6,       a1,       t0,     1
   3776    addi.d        t1,       t0,       0
   3777    add.d         t2,       t0,       a1
   3778    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
   3779 
   3780    // 16 - 23, 40 - 47
   3781    addi.d        t3,       t3,       128
   3782    addi.d        t4,       t4,       -16*8
   3783    addi.d        t5,       t5,       -16*8
   3784    dct64_step3_lsx
   3785    li.w          t8,       16
   3786    mul.w         t0,       t8,       a1
   3787    add.d         t0,       t0,       a0
   3788    alsl.d        t6,       a1,       t0,     1
   3789    addi.d        t1,       t0,       0
   3790    add.d         t2,       t0,       a1
   3791    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
   3792    li.w          t8,       40
   3793    mul.w         t0,       t8,       a1
   3794    add.d         t0,       t0,       a0
   3795    alsl.d        t6,       a1,       t0,     1
   3796    addi.d        t1,       t0,       0
   3797    add.d         t2,       t0,       a1
   3798    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
   3799 
   3800    // 24 - 31, 32 - 39
   3801    addi.d        t3,       t3,       128
   3802    addi.d        t4,       t4,       -16*8
   3803    addi.d        t5,       t5,       -16*8
   3804    dct64_step3_lsx
   3805    li.w          t8,       24
   3806    mul.w         t0,       t8,       a1
   3807    add.d         t0,       t0,       a0
   3808    alsl.d        t6,       a1,       t0,     1
   3809    addi.d        t1,       t0,       0
   3810    add.d         t2,       t0,       a1
   3811    dct64_step5_lsx vr20, vr22, vr24, vr26, vr28, vr30, vr2, vr1
   3812    li.w          t8,       32
   3813    mul.w         t0,       t8,       a1
   3814    add.d         t0,       t0,       a0
   3815    alsl.d        t6,       a1,       t0,     1
   3816    addi.d        t1,       t0,       0
   3817    add.d         t2,       t0,       a1
   3818    dct64_step5_lsx vr3, vr15, vr31, vr29, vr27, vr25, vr23, vr21
   3819 .endm
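           // The eight invocations below sweep the second pass across the
           // width: \in0 (16*k) appears to select the 8-column slice of the
           // column buffer and \in1 advances dst by 8 pixels after the first
           // slice. Roughly, assuming those operand meanings:
           //   for (k = 0; k < 8; k++) {        // eight 8-column slices
           //       if (k) dst += 8;             // \in1
           //       core2(col_buf + 16*k, dst);  // \in0
           //   }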
   3820    dct64x64_core2_lsx 16*0, 0, no_rect2
   3821    dct64x64_core2_lsx 16*1, 8, no_rect2
   3822    dct64x64_core2_lsx 16*2, 8, no_rect2
   3823    dct64x64_core2_lsx 16*3, 8, no_rect2
   3824    dct64x64_core2_lsx 16*4, 8, no_rect2
   3825    dct64x64_core2_lsx 16*5, 8, no_rect2
   3826    dct64x64_core2_lsx 16*6, 8, no_rect2
   3827    dct64x64_core2_lsx 16*7, 8, no_rect2
   3828 
   3829    free_space 64*32*2+512+512
   3830 .DCT_DCT_64X64_END:
   3831 endfunc
   3832 
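           // Per the dav1d itx calling convention: a0 = dst, a1 = dst
           // stride, a2 = coefficients, a3 = eob. a3 == 0 means only the DC
           // coefficient is present, so the code below takes the idct_dc_w64
           // fast path: the rounded DC value is broadcast (vr20) and added
           // to all 64x32 pixels, one 64-pixel row (four vectors) per step.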
   3833 function inv_txfm_add_dct_dct_64x32_8bpc_lsx
   3834    bnez          a3,       .NO_HAS_DCONLY_64x32
   3835 
   3836    idct_dc_w64 64, 32, 1
   3837 
   3838    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   3839 
   3840    li.w          t3,       31
   3841 .loop31:
   3842    add.d         a0,       a0,       a1
   3843    vld           vr10,     a0,       0
   3844    vld           vr11,     a0,       16
   3845    vld           vr12,     a0,       32
   3846    vld           vr13,     a0,       48
   3847    DST_ADD_W64 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   3848    addi.d        t3,       t3,       -1
   3849    blt           zero,     t3,       .loop31
   3850    b             .DCT_DCT_64X32_END
   3851 .NO_HAS_DCONLY_64x32:
   3852    malloc_space  64*32*2+512+512
   3853 
   3854    la.local      t8,       eob_32x32
   3855    addi.d        t2,       a2,       0
   3856    addi.d        t7,       sp,       64
   3857    addi.d        t7,       t7,       0
   3858    addi.d        a4,       a2,       64
   3859 .DCT_DCT_EOB_64x32:
   3860    ld.h          a5,       t8,       0
   3861    addi.d        t8,       t8,       2
   3862    dct64x64_core1_lsx 1, rect2_lsx
   3863    addi.d        t2,       t2,       16
   3864    addi.d        t7,       t7,       128*8
   3865    addi.d        a4,       a4,       16
   3866    bge           a3,       a5,       .DCT_DCT_EOB_64x32
   3867 
   3868    la.local      t8,       eob_32x32
   3869    vxor.v        vr31,     vr31,     vr31
   3870 
   3871    ld.h          t7,       t8,       4
   3872    bge           a3,       t7,       .DCT_DCT_EOB_64x32_END
   3873    li.d          t1,       1024*3+64
   3874    add.d         t0,       sp,       t1
   3875 .rept 4
   3876    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   3877            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   3878    addi.d t0, t0, 256
   3879 .endr
   3880 
   3881    ld.h          t7,       t8,       2
   3882    bge           a3,       t7,       .DCT_DCT_EOB_64x32_END
   3883    li.d          t1,       1024*2+64
   3884    add.d         t0,       sp,       t1
   3885 .rept 4
   3886    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   3887            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   3888    addi.d        t0,       t0,       256
   3889 .endr
   3890 
   3891    ld.h          t7,       t8,       0
   3892    bge           a3,       t7,       .DCT_DCT_EOB_64x32_END
   3893    li.d          t1,       1024*1+64
   3894    add.d         t0,       sp,       t1
   3895 .rept 4
   3896    vst_x16 t0, 0, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31, \
   3897            vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   3898    addi.d        t0,       t0,       256
   3899 .endr
   3900 
   3901 .DCT_DCT_EOB_64x32_END:
   3902    addi.d        t2,       sp,       64
   3903    li.w          t4,       64*32*2+64
   3904    add.d         t3,       sp,       t4
   3905    addi.d        t5,       sp,       64
   3906    addi.d        t5,       t5,       1024
   3907    addi.d        t5,       t5,       1024
   3908 .rept 8
   3909    vld_x8 t2, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   3910 
   3911    addi.d        t4,       t2,       1024
   3912    addi.d        t4,       t4,       1024
   3913 
   3914    vld_x8 t4, 0, 256, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   3915 
   3916    inv_dct16_lsx .8h
   3917 
   3918    vst_x16 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   3919            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   3920 
   3921    addi.d        t4,       t2,       128
   3922    vld_x8 t4, 0, 256, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   3923 
   3924    addi.d        t4,       t4,       1024
   3925    addi.d        t4,       t4,       1024
   3926 
   3927    vld_x8 t4, 0, 256, vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
   3928 
   3929    dct_8x32_core_lsx t5, t3, 0, 128, 16, -2048, 1024, -1024, 0, 128, , 4
   3930 
   3931    addi.d        t2,       t2,       16
   3932    addi.d        t5,       t5,       16
   3933    addi.d        t1,       t1,       16
   3934 .endr
   3935    addi.d        t2,       sp,       64
   3936    li.w          t3,       32
   3937 .loop32:
   3938    vld           vr10,     a0,       0
   3939    vld           vr11,     a0,       16
   3940    vld           vr12,     a0,       32
   3941    vld           vr13,     a0,       48
   3942    vld_x8 t2, 0, 16, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   3943    DST_ADD_W64 vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   3944    add.d         a0,       a0,       a1
   3945    addi.d        t2,       t2,       128
   3946    addi.d        t3,       t3,       -1
   3947    blt           zero,     t3,       .loop32
   3948 
   3949    free_space  64*32*2+512+512
   3950 .DCT_DCT_64X32_END:
   3951 endfunc
   3952 
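           // VLD_DST_ADD_W8_H32 drains eight rows of an 8-wide block from
           // the scratch buffer at t3: two VLD_DST_ADD_W8 calls of four rows
           // each, stepping dst (a0/t2) by four strides in between, then
           // rewinding t3 to sp+\in0 for the caller's next group.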
   3953 .macro VLD_DST_ADD_W8_H32 in0
   3954    vld           vr4,      t3,       0
   3955    vld           vr5,      t3,       16
   3956    vld           vr6,      t3,       32
   3957    vld           vr7,      t3,       48
   3958    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
   3959    addi.d        t3,       t3,       64
   3960    alsl.d        a0,       a1,       a0,     2
   3961    alsl.d        t2,       a1,       t2,     2
   3962    vld           vr4,      t3,       0
   3963    vld           vr5,      t3,       16
   3964    vld           vr6,      t3,       32
   3965    vld           vr7,      t3,       48
   3966    VLD_DST_ADD_W8 vr4, vr5, vr6, vr7
   3967    addi.d        t3,       sp,       \in0
   3968    alsl.d        a0,       a1,       a0,     2
   3969    alsl.d        t2,       a1,       t2,     2
   3970 .endm
   3971 
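           // 8x32 strategy: the first pass runs an 8-point DCT over 8x8
           // slices, stopping early on the eob_8x32 thresholds and zeroing
           // any slices it never wrote; the second pass builds the 32-point
           // DCT from a 16-point DCT on the even rows (inv_dct16_lsx) plus
           // the odd-row stage inside dct_8x32_core_lsx.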
   3972 function inv_txfm_add_dct_dct_8x32_8bpc_lsx
   3973    bnez          a3,       .NO_HAS_DCONLY_8x32
   3974 
   3975    idct_dc 8, 32, 2
   3976 
   3977    DST_ADD_W8 vr10, vr11, vr12, vr13, vr20, vr20, vr20, vr20
   3978 .rept 7
   3979    alsl.d        a0,       a1,       a0,     2
   3980    alsl.d        t2,       a1,       a0,     1
   3981 
   3982    VLD_DST_ADD_W8 vr20, vr20, vr20, vr20
   3983 .endr
   3984    b             .DCT_DCT_8X32_END
   3985 .NO_HAS_DCONLY_8x32:
   3986    malloc_space 512
   3987 
   3988    la.local      t8,       eob_8x32
   3989    addi.d        t3,       sp,       64
   3990    addi.d        t2,       a2,       0
   3991 .DCT_DCT_EOB_8x32:
   3992    ld.h          t7,       t8,       0
   3993    addi.d        t8,       t8,       2
   3994 
   3995    vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   3996 
   3997    inv_dct8_lsx vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, .8h
   3998 
   3999 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   4000    vsrari.h      \i,       \i,       2
   4001 .endr
   4002 
   4003    vxor.v        vr31,     vr31,     vr31
   4004    vst_x8 a2, 0, 64, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   4005 
   4006    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4007                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4008                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   4009 
   4010    vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   4011 
   4012    addi.d        a2,       a2,       16
   4013    addi.d        t3,       t3,       128
   4014    bge           a3,       t7,       .DCT_DCT_EOB_8x32
   4015 
   4016    la.local      t8,       eob_8x32
   4017    vxor.v        vr31,     vr31,     vr31
   4018    ld.h          t7,       t8,       4
   4019    bge           a3,       t7,       .DCT_DCT_EOB_8x32_END
   4020    vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   4021 
   4022    ld.h          t7,       t8,       2
   4023    bge           a3,       t7,       .DCT_DCT_EOB_8x32_END
   4024    vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   4025 
   4026    ld.h          t7,       t8,       0
   4027    bge           a3,       t7,       .DCT_DCT_EOB_8x32_END
   4028    vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   4029 .DCT_DCT_EOB_8x32_END:
   4030    addi.d        t2,       sp,       64
   4031    addi.d        t3,       sp,       64
   4032 
   4033    vld_x16 t2, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4034            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   4035 
   4036    inv_dct16_lsx .8h
   4037 
   4038    vst_x16 t3, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4039            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   4040 
   4041    vld_x16 t2, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4042            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
   4043 
   4044    dct_8x32_core_lsx t2, t3, 0, 256, 32, 0, 128, 256, 384, 16, , 4
   4045 
   4046    alsl.d        t2,       a1,       a0,     1
   4047    addi.d        t3,       sp,       64
   4048 
   4049    VLD_DST_ADD_W8_H32 320
   4050    VLD_DST_ADD_W8_H32 448
   4051    VLD_DST_ADD_W8_H32 192
   4052    VLD_DST_ADD_W8_H32 0
   4053 
   4054    free_space 512
   4055 .DCT_DCT_8X32_END:
   4056 endfunc
   4057 
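           // identity_identity needs no butterflies: both identity passes
           // are pure scales, which here fold into the two rounding shifts
           // around one 8x8 transpose per block. Per coefficient this is
           // roughly (a sketch of the rounding only):
           //   c = (c + 1) >> 1;   // after the first pass (vsrari.h 1)
           //   c = (c + 2) >> 2;   // before the add to dst (vsrari.h 2)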
   4058 function inv_txfm_add_identity_identity_8x32_8bpc_lsx
   4059    la.local      t7,       eob_8x32
   4060    alsl.d        t2,       a1,       a0,     1
   4061 
   4062 .IDENTITY_IDENTITY_EOB_8x32:
   4063    ld.h          t6,       t7,       0
   4064    addi.d        t7,       t7,       2
   4065    vld_x8 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   4066 
   4067    vxor.v        vr23,     vr23,     vr23
   4068    vst_x8 a2, 0, 64, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23
   4069 
   4070 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   4071    vsrari.h       \i,       \i,       1
   4072 .endr
   4073 
   4074    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4075                   vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23, \
   4076                   vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   4077 
   4078 .irp i, vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   4079    vsrari.h       \i,       \i,       2
   4080 .endr
   4081    VLD_DST_ADD_W8 vr16, vr17, vr18, vr19
   4082    alsl.d        a0,       a1,       a0,     2
   4083    alsl.d        t2,       a1,       a0,     1
   4084 
   4085    VLD_DST_ADD_W8 vr20, vr21, vr22, vr23
   4086    alsl.d        a0,       a1,       a0,     2
   4087    alsl.d        t2,       a1,       a0,     1
   4088 
   4089    addi.d        a2,       a2,       16
   4090    bge           a3,       t6,       .IDENTITY_IDENTITY_EOB_8x32
   4091 endfunc
   4092 
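           // def_fn_16x4_base emits the two shared 16x4 bodies. The fn_16x4
           // wrappers further down load helper entry points before branching
           // here: t7 = the first-pass 16-point routine (skipped for
           // identity, which is inlined above), t8 = the second-pass 4-point
           // routine, invoked once per 8-column half. ra is parked in t6
           // around each jirl because the call itself overwrites ra.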
   4093 .macro def_fn_16x4_base txfm
   4094 functionl inv_txfm_\txfm\()add_16x4_lsx
   4095    vld_x8 a2, 0, 16, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14
   4096 
   4097 .ifc \txfm, identity_
   4098    li.w          t0,       1697
   4099    vreplgr2vr.w  vr20,     t0
   4100 .irp i, vr0, vr2, vr4, vr6, vr8, vr10, vr12, vr14
   4101    inv_identity16_lsx \i, vr20, \i, \i, .8h
   4102 .endr
   4103 
   4104    vilvh.d       vr1,      vr0,      vr0
   4105    vilvh.d       vr3,      vr2,      vr2
   4106    vilvh.d       vr5,      vr4,      vr4
   4107    vilvh.d       vr7,      vr6,      vr6
   4108    vilvh.d       vr9,      vr8,      vr8
   4109    vilvh.d       vr11,     vr10,     vr10
   4110    vilvh.d       vr13,     vr12,     vr12
   4111    vilvh.d       vr15,     vr14,     vr14
   4112 .else
   4113    vilvh.d       vr1,      vr0,      vr0
   4114    vilvh.d       vr3,      vr2,      vr2
   4115    vilvh.d       vr5,      vr4,      vr4
   4116    vilvh.d       vr7,      vr6,      vr6
   4117    vilvh.d       vr9,      vr8,      vr8
   4118    vilvh.d       vr11,     vr10,     vr10
   4119    vilvh.d       vr13,     vr12,     vr12
   4120    vilvh.d       vr15,     vr14,     vr14
   4121 
   4122    move          t6,       ra
   4123    jirl          ra,       t7,       0
   4124    move          ra,       t6
   4125 .endif
   4126 
   4127    vxor.v        vr23,     vr23,     vr23
   4128    vst_x8 a2, 0, 16, vr23, vr23, vr23, vr23, vr23, vr23, vr23, vr23
   4129 
   4130    LSX_TRANSPOSE8x4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr0, vr1, \
   4131                       vr2, vr3, vr16, vr17, vr18, vr19, vr20, vr21
   4132 
   4133    LSX_TRANSPOSE8x4_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, vr4, \
   4134                       vr5, vr6, vr7, vr16, vr17, vr18, vr19, vr20, vr21
   4135 
   4136    vsrari.h      vr0,      vr0,      1
   4137    vsrari.h      vr1,      vr1,      1
   4138    vsrari.h      vr2,      vr2,      1
   4139    vsrari.h      vr3,      vr3,      1
   4140    move          t6,       ra
   4141    jirl          ra,       t8,       0
   4142    move          ra,       t6
   4143 
   4144    vsrari.h      vr8,      vr0,      4
   4145    vsrari.h      vr9,      vr1,      4
   4146    vsrari.h      vr10,     vr2,      4
   4147    vsrari.h      vr11,     vr3,      4
   4148    vsrari.h      vr0,      vr4,      1
   4149    vsrari.h      vr1,      vr5,      1
   4150    vsrari.h      vr2,      vr6,      1
   4151    vsrari.h      vr3,      vr7,      1
   4152 
   4153    move          t6,       ra
   4154    jirl          ra,       t8,       0
   4155    move          ra,       t6
   4156 
   4157    vsrari.h      vr16,     vr0,      4
   4158    vsrari.h      vr17,     vr1,      4
   4159    vsrari.h      vr18,     vr2,      4
   4160    vsrari.h      vr19,     vr3,      4
   4161 
   4162    alsl.d        t2,       a1,       a0,    1
   4163    VLD_DST_ADD_W16 vr8, vr16, vr9, vr17, vr10, vr18, vr11, vr19
   4164 endfuncl
   4165 .endm
   4166 
   4167 def_fn_16x4_base identity_
   4168 def_fn_16x4_base
   4169 
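           // fn_16x4 emits the public entry points: each one loads t7/t8 and
           // tail-branches (b) into the shared body above, so the body's
           // final return goes straight back to the original caller.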
   4170 .macro fn_16x4 txfm1, txfm2
   4171 function inv_txfm_add_\txfm1\()_\txfm2\()_16x4_8bpc_lsx
   4172 .ifc \txfm1\()_\txfm2, dct_dct
   4173    bnez          a3,       .NO_HAS_DCONLY_16x4
   4174 
   4175    idct_dc 16, 4, 1
   4176 
   4177    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
   4178                vr20, vr20, vr20, vr20, vr20
   4179    b             .\txfm1\()_\txfm2\()_16x4_END
   4180 .NO_HAS_DCONLY_16x4:
   4181 .endif
   4182 
   4183 .ifnc \txfm1, identity
   4184    la.local     t7,    inv_\txfm1\()_4h_x16_lsx
   4185 .endif
   4186    la.local     t8,    inv_\txfm2\()_8h_x4_lsx
   4187 
   4188 .ifc \txfm1, identity
   4189    b            inv_txfm_identity_add_16x4_lsx
   4190 .else
   4191    b            inv_txfm_add_16x4_lsx
   4192 .endif
   4193 .\txfm1\()_\txfm2\()_16x4_END:
   4194 endfunc
   4195 .endm
   4196 
   4197 fn_16x4 dct, dct
   4198 fn_16x4 identity, identity
   4199 fn_16x4 adst, dct
   4200 
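           // Same drain pattern as VLD_DST_ADD_W8_H32 but 16 wide: the two
           // row halves sit 512 bytes apart (t3/t5), and the final >> 4 is
           // applied here via vsrari_h_x8 rather than inside the core.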
   4201 .macro VLD_DST_ADD_W16_H32 in0
   4202    vld           vr14,     t3,       0
   4203    vld           vr15,     t3,       16
   4204    vld           vr16,     t3,       32
   4205    vld           vr17,     t3,       48
   4206    vld           vr18,     t5,       0
   4207    vld           vr19,     t5,       16
   4208    vld           vr20,     t5,       32
   4209    vld           vr21,     t5,       48
   4210    vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \
   4211                vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4
   4212    VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21
   4213    alsl.d        a0,       a1,       a0,    2
   4214    alsl.d        t2,       a1,       t2,    2
   4215    addi.d        t3,       t3,       64
   4216    addi.d        t5,       t5,       64
   4217    vld           vr14,     t3,       0
   4218    vld           vr15,     t3,       16
   4219    vld           vr16,     t3,       32
   4220    vld           vr17,     t3,       48
   4221    vld           vr18,     t5,       0
   4222    vld           vr19,     t5,       16
   4223    vld           vr20,     t5,       32
   4224    vld           vr21,     t5,       48
   4225    vsrari_h_x8 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, \
   4226                vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21, 4
   4227    VLD_DST_ADD_W16 vr14, vr18, vr15, vr19, vr16, vr20, vr17, vr21
   4228    alsl.d        a0,       a1,       a0,    2
   4229    alsl.d        t2,       a1,       t2,    2
   4230    addi.d        t3,       sp,       \in0
   4231    addi.d        t5,       sp,       \in0+512
   4232 .endm
   4233 
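           // 16x32 is a rect2 transform (w:h == 1:2), so every input is
           // pre-scaled by 2896 (~ 4096/sqrt(2)) in rect2_lsx before the
           // first-pass 16-point DCT; the second pass reuses the
           // inv_dct16_lsx + dct_8x32_core_lsx pairing from the 8x32 path.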
   4234 function inv_txfm_add_dct_dct_16x32_8bpc_lsx
   4235    bnez          a3,       .NO_HAS_DCONLY_16x32
   4236 
   4237    idct_dc 16, 32, 1
   4238 
   4239    DST_ADD_W16 vr10, vr11, vr12, vr13, vr20, vr20, vr20, \
   4240                    vr20, vr20, vr20, vr20, vr20
   4241 .rept 7
   4242    alsl.d        a0,       a1,       a0,     2
   4243    alsl.d        t2,       a1,       a0,     1
   4244 
   4245    VLD_DST_ADD_W16 vr20, vr20, vr20, vr20, vr20, vr20, vr20, vr20
   4246 .endr
   4247    b             .DCT_DCT_16x32_END
   4248 .NO_HAS_DCONLY_16x32:
   4249    malloc_space 512+512
   4250 
   4251    addi.d        t3,       sp,       64
   4252    la.local      t8,       eob_16x32
   4253 
   4254 .DCT_DCT_EOB_16x32:
   4255    ld.h          t7,       t8,       0
   4256    addi.d        t8,       t8,       2
   4257    vld_x16 a2, 0, 64, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4258            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   4259 
   4260    vxor.v        vr31,     vr31,     vr31
   4261 .irp i, 0, 64, 128, 192, 256, 320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960
   4262    vst           vr31,     a2,       \i
   4263 .endr
   4264 
   4265    li.w          t0,       2896
   4266    vreplgr2vr.w  vr23,     t0
   4267 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4268     vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   4269    rect2_lsx   \i, vr23, \i
   4270 .endr
   4271 
   4272    inv_dct16_lsx .8h
   4273 
   4274    LSX_TRANSPOSE8x8_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4275                       vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4276                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   4277 
   4278    LSX_TRANSPOSE8x8_H vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
   4279                       vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15, \
   4280                       vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
   4281 
   4282 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4283    vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   4284    vsrari.h       \i,       \i,       1
   4285 .endr
   4286 
   4287    vst_x8 t3, 0, 16, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
   4288    vst_x8 t3, 512, 16, vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   4289 
   4290    addi.d        a2,       a2,       16
   4291    addi.d        t3,       t3,       128
   4292    bge           a3,       t7,       .DCT_DCT_EOB_16x32
   4293 
   4294    la.local      t8,       eob_16x32
   4295    vxor.v        vr31,     vr31,     vr31
   4296 
   4297    ld.h          t7,       t8,       4
   4298    bge           a3,       t7,       .DCT_DCT_EOB_16x32_END
   4299    vst_x8 sp, 64+384, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   4300    vst_x8 sp, 64+896, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   4301 
   4302    ld.h          t7,       t8,       2
   4303    bge           a3,       t7,       .DCT_DCT_EOB_16x32_END
   4304    vst_x8 sp, 64+256, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   4305    vst_x8 sp, 64+768, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   4306 
   4307    ld.h          t7,       t8,       0
   4308    bge           a3,       t7,       .DCT_DCT_EOB_16x32_END
   4309    vst_x8 sp, 64+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   4310    vst_x8 sp, 64+512+128, 16, vr31, vr31, vr31, vr31, vr31, vr31, vr31, vr31
   4311 
   4312 .DCT_DCT_EOB_16x32_END:
   4313    addi.d      t7,   sp,    64
   4314 .rept 2
   4315    vld_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4316            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   4317 
   4318    inv_dct16_lsx .8h
   4319 
   4320    vst_x16 t7, 0, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4321            vr8, vr9, vr10, vr11, vr12, vr13, vr14, vr15
   4322 
   4323    vld_x16 t7, 16, 32, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, \
   4324            vr19, vr24, vr25, vr26, vr27, vr28, vr29, vr30
   4325 
   4326    dct_8x32_core_lsx t7, t7, 0, 256, 32, 0, 128, 256, 384, 16, ,
   4327 
   4328    addi.d        t7,       t7,       512
   4329 .endr
   4330    alsl.d        t2,       a1,       a0,    1
   4331    addi.d        t3,       sp,       64
   4332    addi.d        t5,       sp,       512+64
   4333 
   4334    VLD_DST_ADD_W16_H32 320
   4335    VLD_DST_ADD_W16_H32 448
   4336    VLD_DST_ADD_W16_H32 192
   4337    VLD_DST_ADD_W16_H32 0
   4338 
   4339    free_space 512+512
   4340 .DCT_DCT_16x32_END:
   4341 endfunc
   4342 
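           // xvmulev_xvmaddod_lasx is one butterfly rotation kept in 32-bit
           // precision. For 16-bit lanes it computes, per lane pair:
           //   out0 = even(in0)*even(in2) + even(in1)*even(in3)
           //   out1 = odd(in0)*odd(in2)   + odd(in1)*odd(in3)
           // Callers re-interleave the halves with xvilvl/xvilvh and round
           // back to 16 bits via xvssrarni.h.w ..., 12.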
   4343 .macro xvmulev_xvmaddod_lasx in0, in1, in2, in3, out0, out1
   4344    xvmulwev.w.h   \out0,    \in0,     \in2
   4345    xvmulwod.w.h   \out1,    \in0,     \in2
   4346    xvmaddwev.w.h  \out0,    \in1,     \in3
   4347    xvmaddwod.w.h  \out1,    \in1,     \in3
   4348 .endm
   4349 
   4350 .macro xvsrari_h_x16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, \
   4351                     in11, in12, in13, in14, in15, out0, out1, out2, out3, \
   4352                     out4, out5, out6, out7, out8, out9, out10, out11, out12, \
   4353                     out13, out14, out15, shift
   4354    xvsrari.h  \out0,       \in0,     \shift
   4355    xvsrari.h  \out1,       \in1,     \shift
   4356    xvsrari.h  \out2,       \in2,     \shift
   4357    xvsrari.h  \out3,       \in3,     \shift
   4358    xvsrari.h  \out4,       \in4,     \shift
   4359    xvsrari.h  \out5,       \in5,     \shift
   4360    xvsrari.h  \out6,       \in6,     \shift
   4361    xvsrari.h  \out7,       \in7,     \shift
   4362    xvsrari.h  \out8,       \in8,     \shift
   4363    xvsrari.h  \out9,       \in9,     \shift
   4364    xvsrari.h  \out10,      \in10,    \shift
   4365    xvsrari.h  \out11,      \in11,    \shift
   4366    xvsrari.h  \out12,      \in12,    \shift
   4367    xvsrari.h  \out13,      \in13,    \shift
   4368    xvsrari.h  \out14,      \in14,    \shift
   4369    xvsrari.h  \out15,      \in15,    \shift
   4370 .endm
   4371 
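           // xvpermi_q_x2 swaps 128-bit halves between two register pairs,
           // gluing two transposed 8x8 tiles into full 16-wide rows; in0/in1
           // are copied to the tmp registers first because out0/out1 alias
           // them at every call site here.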
   4372 .macro xvpermi_q_x2 in0, in1, in2, in3, out0, out1, out2, out3, tmp0, tmp1
   4373    xvor.v      \tmp0,      \in0,     \in0
   4374    xvor.v      \tmp1,      \in1,     \in1
   4375    xvpermi.q   \out0,      \in2,     0x02
   4376    xvpermi.q   \out1,      \in3,     0x02
   4377    xvpermi.q   \out2,      \tmp0,    0x31
   4378    xvpermi.q   \out3,      \tmp1,    0x31
   4379 .endm
   4380 
   4381 .macro DST_ADD_W16_LASX in0, in1, in2, in3, in4, in5, in6, in7
   4382    vext2xv.hu.bu xr0,      \in0
   4383    vext2xv.hu.bu xr1,      \in1
   4384    vext2xv.hu.bu xr2,      \in2
   4385    vext2xv.hu.bu xr3,      \in3
   4386    xvadd.h       xr0,      xr0,      \in4
   4387    xvadd.h       xr1,      xr1,      \in5
   4388    xvadd.h       xr2,      xr2,      \in6
   4389    xvadd.h       xr3,      xr3,      \in7
   4390    xvssrani.bu.h xr1,      xr0,      0
   4391    xvssrani.bu.h xr3,      xr2,      0
   4392    xvpermi.d     xr0,      xr1,      0b11011000
   4393    xvpermi.d     xr2,      xr3,      0b11011000
   4394    xvpermi.d     xr1,      xr0,      0b00001110
   4395    xvpermi.d     xr3,      xr2,      0b00001110
   4396    vst           vr0,      a0,       0
   4397    vstx          vr1,      a0,       a1
   4398    vst           vr2,      t2,       0
   4399    vstx          vr3,      t2,       a1
   4400 .endm
   4401 
   4402 .macro XVLD_DST_ADD_W16 in0, in1, in2, in3
   4403    vld           vr0,      a0,       0
   4404    vldx          vr1,      a0,       a1
   4405    vld           vr2,      t2,       0
   4406    vldx          vr3,      t2,       a1
   4407    DST_ADD_W16_LASX xr0, xr1, xr2, xr3, \in0, \in1, \in2, \in3
   4408 .endm
   4409 
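           // inv_adst16_lasx: 16-point inverse ADST on 16-bit lanes. The
           // shape is eight coefficient rotations, three butterfly stages,
           // then the 2896-scaled outputs. Where an output needs negating,
           // the code widens (xvsllwil.w.h/xvexth.w.h), negates in 32 bits
           // and re-narrows with xvssrarni, sidestepping the INT16_MIN
           // overflow a plain xvneg.h could hit.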
   4410 .macro inv_adst16_lasx
   4411    la.local      t0,       iadst16_coeffs_h
   4412 
   4413    xvldrepl.h    xr20,     t0,       0        // 4091
   4414    xvldrepl.h    xr21,     t0,       2        // 201
   4415    xvmulev_xvmaddod_lasx xr15, xr0, xr20, xr21, xr16, xr18
   4416    xvneg.h       xr20,     xr20
   4417    xvmulev_xvmaddod_lasx xr15, xr0, xr21, xr20, xr17, xr19
   4418    xvilvl.w      xr15,     xr18,     xr16
   4419    xvilvl.w      xr0,      xr19,     xr17
   4420    xvilvh.w      xr18,     xr18,     xr16
   4421    xvilvh.w      xr19,     xr19,     xr17
   4422    xvssrarni.h.w xr18,     xr15,     12       // t0
   4423    xvssrarni.h.w xr19,     xr0,      12       // t1
   4424 
   4425    xvldrepl.h    xr20,     t0,       4        // 3973
   4426    xvldrepl.h    xr21,     t0,       6        // 995
   4427    xvmulev_xvmaddod_lasx xr13, xr2, xr20, xr21, xr16, xr0
   4428    xvneg.h       xr20,     xr20
   4429    xvmulev_xvmaddod_lasx xr13, xr2, xr21, xr20, xr17, xr15
   4430    xvilvl.w      xr13,     xr0,      xr16
   4431    xvilvl.w      xr2,      xr15,     xr17
   4432    xvilvh.w      xr0,      xr0,      xr16
   4433    xvilvh.w      xr15,     xr15,     xr17
   4434    xvssrarni.h.w xr0,      xr13,     12       // t2
   4435    xvssrarni.h.w xr15,     xr2,      12       // t3
   4436 
   4437    xvldrepl.h    xr20,     t0,        8       // 3703
   4438    xvldrepl.h    xr21,     t0,        10      // 1751
   4439    xvmulev_xvmaddod_lasx xr11, xr4, xr20, xr21, xr16, xr2
   4440    xvneg.h       xr20,     xr20
   4441    xvmulev_xvmaddod_lasx xr11, xr4, xr21, xr20, xr17, xr13
   4442    xvilvl.w      xr11,     xr2,       xr16
   4443    xvilvl.w      xr4,      xr13,      xr17
   4444    xvilvh.w      xr2,      xr2,       xr16
   4445    xvilvh.w      xr13,     xr13,      xr17
   4446    xvssrarni.h.w xr2,      xr11,      12       // t4
   4447    xvssrarni.h.w xr13,     xr4,       12       // t5
   4448 
   4449    xvldrepl.h    xr20,     t0,        12       // 3290 -> 1645
   4450    xvldrepl.h    xr21,     t0,        14       // 2440 -> 1220
   4451    xvmulev_xvmaddod_lasx xr9, xr6, xr20, xr21, xr16, xr4
   4452    xvneg.h       xr20,     xr20
   4453    xvmulev_xvmaddod_lasx xr9, xr6, xr21, xr20, xr17, xr11
   4454    xvilvl.w      xr9,      xr4,       xr16
   4455    xvilvl.w      xr6,      xr11,      xr17
   4456    xvilvh.w      xr4,      xr4,       xr16
   4457    xvilvh.w      xr11,     xr11,      xr17
   4458    xvssrarni.h.w xr4,      xr9,       12       // t6
   4459    xvssrarni.h.w xr11,     xr6,       12       // t7
   4460 
   4461    xvldrepl.h    xr20,     t0,        16       // 2751
   4462    xvldrepl.h    xr21,     t0,        18       // 3035
   4463    xvmulev_xvmaddod_lasx xr7, xr8, xr20, xr21, xr16, xr6
   4464    xvneg.h       xr20,     xr20
   4465    xvmulev_xvmaddod_lasx xr7, xr8, xr21, xr20, xr17, xr9
   4466    xvilvl.w      xr7,      xr6,       xr16
   4467    xvilvl.w      xr8,      xr9,       xr17
   4468    xvilvh.w      xr6,      xr6,       xr16
   4469    xvilvh.w      xr9,      xr9,       xr17
   4470    xvssrarni.h.w xr6,      xr7,       12       // t8
   4471    xvssrarni.h.w xr9,      xr8,       12       // t9
   4472 
   4473    xvldrepl.h    xr20,     t0,        20       // 2106
   4474    xvldrepl.h    xr21,     t0,        22       // 3513
   4475    xvmulev_xvmaddod_lasx xr5, xr10, xr20, xr21, xr16, xr7
   4476    xvneg.h       xr20,     xr20
   4477    xvmulev_xvmaddod_lasx xr5, xr10, xr21, xr20, xr17, xr8
   4478    xvilvl.w      xr5,      xr7,       xr16
   4479    xvilvl.w      xr10,     xr8,       xr17
   4480    xvilvh.w      xr7,      xr7,       xr16
   4481    xvilvh.w      xr8,      xr8,       xr17
   4482    xvssrarni.h.w xr7,      xr5,       12       // t10
   4483    xvssrarni.h.w xr8,      xr10,      12       // t11
   4484 
   4485    xvldrepl.h    xr20,     t0,        24       // 1380
   4486    xvldrepl.h    xr21,     t0,        26       // 3857
   4487    xvmulev_xvmaddod_lasx xr3, xr12, xr20, xr21, xr16, xr5
   4488    xvneg.h       xr20,     xr20
   4489    xvmulev_xvmaddod_lasx xr3, xr12, xr21, xr20, xr17, xr10
   4490    xvilvl.w      xr3,      xr5,       xr16
   4491    xvilvl.w      xr12,     xr10,      xr17
   4492    xvilvh.w      xr5,      xr5,       xr16
   4493    xvilvh.w      xr10,     xr10,      xr17
   4494    xvssrarni.h.w xr5,      xr3,       12       // t12
   4495    xvssrarni.h.w xr10,     xr12,      12       // t13
   4496 
   4497    xvldrepl.h    xr20,     t0,        28       // 601
   4498    xvldrepl.h    xr21,     t0,        30       // 4052
   4499    xvmulev_xvmaddod_lasx xr1, xr14, xr20, xr21, xr16, xr3
   4500    xvneg.h       xr20,     xr20
   4501    xvmulev_xvmaddod_lasx xr1, xr14, xr21, xr20, xr17, xr12
   4502    xvilvl.w      xr1,      xr3,       xr16
   4503    xvilvl.w      xr14,     xr12,      xr17
   4504    xvilvh.w      xr3,      xr3,       xr16
   4505    xvilvh.w      xr12,     xr12,      xr17
   4506    xvssrarni.h.w xr3,      xr1,       12       // t14
   4507    xvssrarni.h.w xr12,     xr14,      12       // t15
   4508 
   4509    xvsadd.h      xr1,      xr18,      xr6      // t0a
   4510    xvssub.h      xr14,     xr18,      xr6      // t8a
   4511    xvsadd.h      xr16,     xr19,      xr9      // t1a
   4512    xvssub.h      xr17,     xr19,      xr9      // t9a
   4513    xvsadd.h      xr6,      xr0,       xr7      // t2a
   4514    xvssub.h      xr18,     xr0,       xr7      // t10a
   4515    xvsadd.h      xr9,      xr15,      xr8      // t3a
   4516    xvssub.h      xr19,     xr15,      xr8      // t11a
   4517    xvsadd.h      xr0,      xr2,       xr5      // t4a
   4518    xvssub.h      xr7,      xr2,       xr5      // t12a
   4519    xvsadd.h      xr8,      xr13,      xr10     // t5a
   4520    xvssub.h      xr15,     xr13,      xr10     // t13a
   4521    xvsadd.h      xr2,      xr4,       xr3      // t6a
   4522    xvssub.h      xr5,      xr4,       xr3      // t14a
   4523    xvsadd.h      xr10,     xr11,      xr12     // t7a
   4524    xvssub.h      xr13,     xr11,      xr12     // t15a
   4525 
   4526    la.local      t0,       idct_coeffs_h
   4527 
   4528    xvldrepl.h    xr20,     t0,        8        // 799
   4529    xvldrepl.h    xr21,     t0,        10       // 4017
   4530    xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr3, xr11
   4531    xvneg.h       xr21,     xr21
   4532    xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr4, xr12
   4533    xvilvl.w      xr14,     xr11,      xr3
   4534    xvilvl.w      xr17,     xr12,      xr4
   4535    xvilvh.w      xr11,     xr11,      xr3
   4536    xvilvh.w      xr12,     xr12,      xr4
   4537    xvssrarni.h.w xr11,     xr14,      12       // t8
   4538    xvssrarni.h.w xr12,     xr17,      12       // t9
   4539 
   4540    xvneg.h       xr21,     xr21
   4541    xvmulev_xvmaddod_lasx xr15, xr7, xr20, xr21, xr3, xr14
   4542    xvneg.h       xr20,     xr20
   4543    xvmulev_xvmaddod_lasx xr15, xr7, xr21, xr20, xr4, xr17
   4544    xvilvl.w      xr15,     xr14,      xr3
   4545    xvilvl.w      xr7,      xr17,      xr4
   4546    xvilvh.w      xr14,     xr14,      xr3
   4547    xvilvh.w      xr17,     xr17,      xr4
   4548    xvssrarni.h.w xr14,     xr15,      12       // t13
   4549    xvssrarni.h.w xr17,     xr7,       12       // t12
   4550 
   4551    xvldrepl.h    xr20,     t0,        12       // 3406
   4552    xvldrepl.h    xr21,     t0,        14       // 2276
   4553    xvmulev_xvmaddod_lasx xr18, xr19, xr21, xr20, xr3, xr7
   4554    xvneg.h       xr21,     xr21
   4555    xvmulev_xvmaddod_lasx xr18, xr19, xr20, xr21, xr4, xr15
   4556    xvilvl.w      xr18,     xr7,       xr3
   4557    xvilvl.w      xr19,     xr15,      xr4
   4558    xvilvh.w      xr7,      xr7,       xr3
   4559    xvilvh.w      xr15,     xr15,      xr4
   4560    xvssrarni.h.w xr7,      xr18,      12       // t10
   4561    xvssrarni.h.w xr15,     xr19,      12       // t11
   4562 
   4563    xvneg.h       xr21,     xr21
   4564    xvmulev_xvmaddod_lasx xr13, xr5, xr20, xr21, xr3, xr18
   4565    xvneg.h       xr20,     xr20
   4566    xvmulev_xvmaddod_lasx xr13, xr5, xr21, xr20, xr4, xr19
   4567    xvilvl.w      xr13,     xr18,      xr3
   4568    xvilvl.w      xr5,      xr19,      xr4
   4569    xvilvh.w      xr18,     xr18,      xr3
   4570    xvilvh.w      xr19,     xr19,      xr4
   4571    xvssrarni.h.w xr18,     xr13,      12       // t15
   4572    xvssrarni.h.w xr19,     xr5,       12       // t14
   4573 
   4574    xvsadd.h      xr5,      xr1,       xr0      // t0
   4575    xvssub.h      xr13,     xr1,       xr0      // t4
   4576    xvsadd.h      xr3,      xr16,      xr8      // t1
   4577    xvssub.h      xr4,      xr16,      xr8      // t5
   4578    xvsadd.h      xr0,      xr6,       xr2      // t2
   4579    xvssub.h      xr1,      xr6,       xr2      // t6
   4580    xvsadd.h      xr8,      xr9,       xr10     // t3
   4581    xvssub.h      xr16,     xr9,       xr10     // t7
   4582    xvsadd.h      xr2,      xr11,      xr17     // t8a
   4583    xvssub.h      xr6,      xr11,      xr17     // t12a
   4584    xvsadd.h      xr9,      xr12,      xr14     // t9a
   4585    xvssub.h      xr10,     xr12,      xr14     // t13a
   4586    xvsadd.h      xr11,     xr7,       xr19     // t10a
   4587    xvssub.h      xr17,     xr7,       xr19     // t14a
   4588    xvsadd.h      xr12,     xr15,      xr18     // t11a
   4589    xvssub.h      xr14,     xr15,      xr18     // t15a
   4590 
   4591    la.local      t0,       idct_coeffs_h
   4592 
   4593    xvldrepl.h    xr20,     t0,        4        // 1567
   4594    xvldrepl.h    xr21,     t0,        6        // 3784
   4595    xvmulev_xvmaddod_lasx xr13, xr4, xr21, xr20, xr7, xr18
   4596    xvneg.h       xr21,     xr21
   4597    xvmulev_xvmaddod_lasx xr13, xr4, xr20, xr21, xr15, xr19
   4598    xvilvl.w      xr13,     xr18,      xr7
   4599    xvilvl.w      xr4,      xr19,      xr15
   4600    xvilvh.w      xr18,     xr18,      xr7
   4601    xvilvh.w      xr19,     xr19,      xr15
   4602    xvssrarni.h.w xr18,     xr13,      12       // t4a
   4603    xvssrarni.h.w xr19,     xr4,       12       // t5a
   4604 
   4605    xvneg.h       xr21,     xr21
   4606    xvmulev_xvmaddod_lasx xr16, xr1, xr20, xr21, xr7, xr4
   4607    xvneg.h       xr20,     xr20
   4608    xvmulev_xvmaddod_lasx xr16, xr1, xr21, xr20, xr15, xr13
   4609    xvilvl.w      xr16,     xr4,       xr7
   4610    xvilvl.w      xr1,      xr13,      xr15
   4611    xvilvh.w      xr4,      xr4,       xr7
   4612    xvilvh.w      xr13,     xr13,      xr15
   4613    xvssrarni.h.w xr4,      xr16,      12       // t7a
   4614    xvssrarni.h.w xr13,     xr1,       12       // t6a
   4615 
   4616    xvneg.h       xr20,     xr20
   4617    xvmulev_xvmaddod_lasx xr6, xr10, xr21, xr20, xr7, xr1
   4618    xvneg.h       xr21,     xr21
   4619    xvmulev_xvmaddod_lasx xr6, xr10, xr20, xr21, xr15, xr16
   4620    xvilvl.w      xr6,      xr1,       xr7
   4621    xvilvl.w      xr10,     xr16,      xr15
   4622    xvilvh.w      xr1,      xr1,       xr7
   4623    xvilvh.w      xr16,     xr16,      xr15
   4624    xvssrarni.h.w xr1,      xr6,       12       // t12
   4625    xvssrarni.h.w xr16,     xr10,      12       // t13
   4626 
   4627    xvneg.h       xr21,     xr21
   4628    xvmulev_xvmaddod_lasx xr14, xr17, xr20, xr21, xr7, xr6
   4629    xvneg.h       xr20,     xr20
   4630    xvmulev_xvmaddod_lasx xr14, xr17, xr21, xr20, xr15, xr10
   4631    xvilvl.w      xr14,     xr6,       xr7
   4632    xvilvl.w      xr17,     xr10,      xr15
   4633    xvilvh.w      xr6,      xr6,       xr7
   4634    xvilvh.w      xr10,     xr10,      xr15
   4635    xvssrarni.h.w xr6,      xr14,      12       // t15
   4636    xvssrarni.h.w xr10,     xr17,      12       // t14
   4637 
   4638    xvsadd.h       xr14,     xr5,       xr0      // out[0]
   4639    xvssub.h       xr17,     xr5,       xr0      // t2a
   4640    xvssub.h       xr7,      xr3,       xr8      // t3a
   4641    xvsadd.h       xr15,     xr3,       xr8      // out[15]
   4642    xvsllwil.w.h   xr22,     xr15,      0
   4643    xvexth.w.h     xr15,     xr15
   4644    xvneg.w        xr22,     xr22
   4645    xvneg.w        xr15,     xr15
   4646    xvssrarni.h.w  xr15,     xr22,      0        // out[15]
   4648 
   4649    xvsadd.h       xr3,      xr19,      xr4      // out[12]
   4650    xvssub.h       xr8,      xr19,      xr4      // t7
   4651    xvssub.h       xr0,      xr18,      xr13     // t6
   4652    xvsadd.h       xr5,      xr18,      xr13     // out[3]
   4653    xvsllwil.w.h   xr22,     xr5,       0
   4654    xvexth.w.h     xr5,      xr5
   4655    xvneg.w        xr22,     xr22
   4656    xvneg.w        xr5,      xr5
   4657    xvssrarni.h.w  xr5,      xr22,      0        // out[3]
   4658 
   4659    xvsadd.h       xr13,     xr9,       xr12     // out[14]
   4660    xvssub.h       xr19,     xr9,       xr12     // t11
   4661    xvssub.h       xr4,      xr2,       xr11     // t10
   4662    xvsadd.h       xr18,     xr2,       xr11     // out[1]
   4663    xvsllwil.w.h   xr22,     xr18,      0
   4664    xvexth.w.h     xr18,     xr18
   4665    xvneg.w        xr22,     xr22
   4666    xvneg.w        xr18,     xr18
   4667    xvssrarni.h.w  xr18,     xr22,      0        // out[1]
   4668 
   4669    xvsadd.h       xr2,      xr1,       xr10     // out[2]
   4670    xvssub.h       xr11,     xr1,       xr10     // t14a
   4671    xvssub.h       xr12,     xr16,      xr6      // t15a
   4672    xvsadd.h       xr9,      xr16,      xr6      // out[13]
   4673    xvsllwil.w.h   xr22,     xr9,       0
   4674    xvexth.w.h     xr9,      xr9
   4675    xvneg.w        xr22,     xr22
   4676    xvneg.w        xr9,      xr9
   4677    xvssrarni.h.w  xr9,      xr22,      0        // out[13]
   4678 
   4679    xvldrepl.h     xr20,     t0,        0        // 2896
   4680    xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr20, xr6, xr10
   4681    xvneg.h        xr21,     xr20
   4682    xvmulev_xvmaddod_lasx xr17, xr7, xr20, xr21, xr16, xr1
   4683    xvilvl.w       xr17,     xr10,      xr6
   4684    xvilvl.w       xr7,      xr1,       xr16
   4685    xvilvh.w       xr10,     xr10,      xr6
   4686    xvilvh.w       xr1,      xr1,       xr16
   4687    xvssrarni.h.w  xr1,      xr7,       12       // out[8]
   4688    xvsrari.w      xr17,     xr17,      12
   4689    xvsrari.w      xr10,     xr10,      12
   4690    xvneg.w        xr17,     xr17
   4691    xvneg.w        xr10,     xr10
   4692    xvssrarni.h.w  xr10,     xr17,      0        // out[7]
   4693 
   4694    xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr21, xr16, xr17
   4695    xvmulev_xvmaddod_lasx xr0, xr8, xr20, xr20, xr6, xr7
   4696    xvilvl.w       xr0,      xr17,      xr16
   4697    xvilvl.w       xr8,      xr7,       xr6
   4698    xvilvh.w       xr17,     xr17,      xr16
   4699    xvilvh.w       xr7,      xr7,       xr6
   4700    xvssrarni.h.w  xr7,      xr8,       12       // out[4]
   4701    xvsrari.w      xr0,      xr0,       12
   4702    xvsrari.w      xr17,     xr17,      12
   4703    xvneg.w        xr0,      xr0
   4704    xvneg.w        xr17,     xr17
   4705    xvssrarni.h.w xr17,      xr0,       0        // out[11]
   4706 
   4707    xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr21, xr16, xr0
   4708    xvmulev_xvmaddod_lasx xr4, xr19, xr20, xr20, xr6, xr8
   4709    xvilvl.w       xr4,      xr0,       xr16
   4710    xvilvl.w       xr19,     xr8,       xr6
   4711    xvilvh.w       xr0,      xr0,       xr16
   4712    xvilvh.w       xr8,      xr8,       xr6
   4713    xvssrarni.h.w  xr8,      xr19,      12       // out[6]
   4714    xvsrari.w      xr4,      xr4,       12
   4715    xvsrari.w      xr0,      xr0,       12
   4716    xvneg.w        xr4,      xr4
   4717    xvneg.w        xr0,      xr0
   4718    xvssrarni.h.w  xr0,      xr4,       0        // out[9]
   4719    xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr20, xr6, xr4
   4720    xvmulev_xvmaddod_lasx xr11, xr12, xr20, xr21, xr16, xr19
   4721    xvilvl.w       xr11,     xr4,       xr6
   4722    xvilvl.w       xr12,     xr19,      xr16
   4723    xvilvh.w       xr4,      xr4,       xr6
   4724    xvilvh.w       xr19,     xr19,      xr16
   4725    xvssrarni.h.w  xr19,     xr12,      12       // out[10]
   4726    xvsrari.w      xr11,     xr11,      12
   4727    xvsrari.w      xr4,      xr4,       12
   4728    xvneg.w        xr11,     xr11
   4729    xvneg.w        xr4,      xr4
   4730    xvssrarni.h.w  xr4,      xr11,      0        // out[5]
   4731 .endm
   4732 
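           // adst_adst 16x16: two inv_adst16_lasx passes around an 8x8-tile
           // transpose, with intermediate rounding >> 2 and final rounding
           // >> 4. PUSH_REG/POP_REG bracket the body because xr24-xr31 are
           // used and their low 64 bits (f24-f31) are callee-saved under
           // the LoongArch64 ABI.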
   4733 function inv_txfm_add_adst_adst_16x16_8bpc_lasx
   4734    PUSH_REG
   4735    xvld_x16 a2, 0, 32, xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \
   4736             xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15
   4737 
   4738    inv_adst16_lasx
   4739 
   4740    LASX_TRANSPOSE8x8_H xr14, xr18, xr2, xr5, xr7, xr4, xr8, xr10, \
   4741                        xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \
   4742                        xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27
   4743 
   4744    LASX_TRANSPOSE8x8_H xr1,  xr0,  xr19, xr17, xr3, xr9, xr13, xr15, \
   4745                        xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \
   4746                        xr20, xr21, xr22, xr23, xr24, xr25, xr26, xr27
   4747 
   4748    xvsrari_h_x16 xr14, xr18, xr2, xr5, xr7, xr28, xr6, xr10, \
   4749                  xr29, xr30, xr11, xr17, xr31, xr19, xr16, xr15, \
   4750                  xr0, xr1, xr2, xr3, xr4, xr5, xr6, xr7, \
   4751                  xr8, xr9, xr10, xr11, xr12, xr13, xr14, xr15, 2
   4752 
   4753    xvpermi_q_x2 xr0, xr1, xr8, xr9, xr0, xr1, xr8, xr9, xr20, xr21
   4754    xvpermi_q_x2 xr2, xr3, xr10, xr11, xr2, xr3, xr10, xr11, xr20, xr21
   4755    xvpermi_q_x2 xr4, xr5, xr12, xr13, xr4, xr5, xr12, xr13, xr20, xr21
   4756    xvpermi_q_x2 xr6, xr7, xr14, xr15, xr6, xr7, xr14, xr15, xr20, xr21
   4757 
   4758    inv_adst16_lasx
   4759 
   4760    xvsrari_h_x16 xr14, xr18, xr2,  xr5,  xr7,  xr4, xr8,  xr10, \
   4761                  xr1,  xr0,  xr19, xr17, xr3,  xr9, xr13, xr15, \
   4762                  xr14, xr18, xr11, xr5,  xr7,  xr4, xr8,  xr10, \
   4763                  xr12, xr16, xr19, xr17, xr20, xr9, xr13, xr15, 4
   4764 
   4765    xvxor.v       xr23,     xr23,     xr23
   4766 .irp i, 0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480
   4767    xvst          xr23,     a2,       \i
   4768 .endr
   4769    alsl.d        t2,       a1,       a0,    1
   4770    XVLD_DST_ADD_W16 xr14, xr18, xr11, xr5
   4771    alsl.d        a0,       a1,       a0,    2
   4772    alsl.d        t2,       a1,       a0,    1
   4773    XVLD_DST_ADD_W16 xr7, xr4, xr8, xr10
   4774    alsl.d        a0,       a1,       a0,    2
   4775    alsl.d        t2,       a1,       a0,    1
   4776    XVLD_DST_ADD_W16 xr12, xr16, xr19, xr17
   4777    alsl.d        a0,       a1,       a0,    2
   4778    alsl.d        t2,       a1,       a0,    1
   4779    XVLD_DST_ADD_W16 xr20, xr9, xr13, xr15
   4780    POP_REG
   4781 endfunc