tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

itx.S (40949B)


      1 /******************************************************************************
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2023, Nathan Egge
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 *****************************************************************************/
     27 
     28 #include "src/riscv/asm.S"
     29 
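         // dav1d itx calling convention: a0 = dst, a1 = dst stride,
         // a2 = coeff buffer (cleared as it is consumed), a3 = eob;
         // a4/a5 hold the first-/second-pass transform helpers installed
         // by the def_fn_* macros further down.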
     30 function inv_txfm_add_4x4_rvv, export=1, ext=v
     31  csrw vxrm, zero
     32 
     33  vsetivli zero, 4, e16, mf2, ta, ma
     34  vle16.v v0, (a2)
     35  addi t0, a2, 8
     36  vle16.v v1, (t0)
     37  addi t0, t0, 8
     38  vle16.v v2, (t0)
     39  addi t0, t0, 8
     40  vle16.v v3, (t0)
     41 
     42  jalr t0, a4
     43 
     44  vmv.v.x v4, zero
     45 
     46  vsseg4e16.v v0, (a2)
     47  vle16.v v0, (a2)
     48  vse16.v v4, (a2)
     49  addi t0, a2, 8
     50  vle16.v v1, (t0)
     51  vse16.v v4, (t0)
     52  addi t0, t0, 8
     53  vle16.v v2, (t0)
     54  vse16.v v4, (t0)
     55  addi t0, t0, 8
     56  vle16.v v3, (t0)
     57  vse16.v v4, (t0)
     58 
     59  jalr t0, a5
     60 
     61  vssra.vi v0, v0, 4
     62  vssra.vi v1, v1, 4
     63  vssra.vi v2, v2, 4
     64  vssra.vi v3, v3, 4
     65 
     66 itx_4x4_end:
     67  vsetvli zero, zero, e8, mf4, ta, ma
     68  vle8.v v4, (a0)
     69  add t0, a0, a1
     70  vle8.v v5, (t0)
     71  add t0, t0, a1
     72  vle8.v v6, (t0)
     73  add t0, t0, a1
     74  vle8.v v7, (t0)
     75 
     76  vwaddu.wv v0, v0, v4
     77  vwaddu.wv v1, v1, v5
     78  vwaddu.wv v2, v2, v6
     79  vwaddu.wv v3, v3, v7
     80 
     81  vsetvli zero, zero, e16, mf2, ta, ma
     82  vmax.vx v0, v0, zero
     83  vmax.vx v1, v1, zero
     84  vmax.vx v2, v2, zero
     85  vmax.vx v3, v3, zero
     86 
     87  vsetvli zero, zero, e8, mf4, ta, ma
     88 
     89  vnclipu.wi v4, v0, 0
     90  vnclipu.wi v5, v1, 0
     91  vnclipu.wi v6, v2, 0
     92  vnclipu.wi v7, v3, 0
     93 
     94  vse8.v v4, (a0)
     95  add a0, a0, a1
     96  vse8.v v5, (a0)
     97  add a0, a0, a1
     98  vse8.v v6, (a0)
     99  add a0, a0, a1
    100  vse8.v v7, (a0)
    101 
    102  ret
    103 endfunc
    104 
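         // x4 identity: out = in + in*(sqrt(2)-1) = sqrt(2)*in;
         // (5793-4096)*8 is the fraction sqrt(2)-1 in the Q15 format
         // that vsmul expects.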
    105 function inv_identity_e16_x4_rvv, export=1, ext=v
    106  li t1, (5793-4096)*8
    107  vsmul.vx v4, v0, t1
    108  vsmul.vx v5, v1, t1
    109  vsmul.vx v6, v2, t1
    110  vsmul.vx v7, v3, t1
    111 
    112  vsadd.vv v0, v0, v4
    113  vsadd.vv v1, v1, v5
    114  vsadd.vv v2, v2, v6
    115  vsadd.vv v3, v3, v7
    116 
    117  jr t0
    118 endfunc
    119 
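         // 4-point inverse Walsh-Hadamard transform, used only by the
         // lossless wht_wht path below.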
    120 .macro iwht_4
    121  vadd.vv v0, v0, v1
    122  vsub.vv v5, v2, v3
    123  vsub.vv v4, v0, v5
    124  vsra.vi v4, v4, 1
    125  vsub.vv v2, v4, v1
    126  vsub.vv v1, v4, v3
    127  vadd.vv v3, v5, v2
    128  vsub.vv v0, v0, v1
    129 .endm
    130 
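         // 4-point inverse DCT: 2896/4096 ~= 1/sqrt(2) and 1567/3784 are
         // 4096*sin/cos(pi/8); products are widened to e32 and narrowed
         // back with a rounding shift of 12 (vnclip).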
    131 .macro idct_4 o0, o1, o2, o3
    132  li t1, 2896
    133  li t2, 1567
    134  li t3, 3784
    135 
    136  vwmul.vx v16, \o0, t1
    137  vwmul.vx v18, \o0, t1
    138  vwmacc.vx v16, t1, \o2
    139  neg t1, t1
    140  vwmacc.vx v18, t1, \o2
    141 
    142  vwmul.vx v20, \o1, t3
    143  neg t3, t3
    144  vwmul.vx v22, \o1, t2
    145  vwmacc.vx v20, t2, \o3
    146  vwmacc.vx v22, t3, \o3
    147 
    148  vnclip.wi v16, v16, 12
    149  vnclip.wi v18, v18, 12
    150  vnclip.wi v20, v20, 12
    151  vnclip.wi v22, v22, 12
    152 
    153  vsadd.vv \o0, v16, v20
    154  vsadd.vv \o1, v18, v22
    155  vssub.vv \o2, v18, v22
    156  vssub.vv \o3, v16, v20
    157 .endm
    158 
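         // 4-point inverse ADST with the AV1 sine-basis constants
         // (1321, 2482, 3344, 3803). \lm2/\lm select the LMUL for the
         // widened e32 stage and the e16 stage: m1/mf2 for plain 4-element
         // rows, m2/m1 for the x4w variants that do two rows per vector.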
    159 .macro iadst_4 o0, o1, o2, o3, lm2, lm
    160  li t1, 1321
    161  li t2, 3803
    162  li t3, 2482
    163 
    164  vwmul.vx v16, v0, t1
    165  vwmul.vx v18, v0, t3
    166  neg t1, t1
    167  vwmacc.vx v16, t2, v2
    168  vwmacc.vx v18, t1, v2
    169  neg t2, t2
    170  vwmacc.vx v16, t3, v3
    171  vwmacc.vx v18, t2, v3
    172 
    173  vwsub.vv v20,  v0, v2
    174  vwadd.wv v20, v20, v3
    175 
    176  li t1, 3344
    177  vwmul.vx v22, v1, t1
    178 
    179  vsetvli zero, zero, e32, \lm2, ta, ma
    180 
    181  vmul.vx v20, v20, t1
    182 
    183  vadd.vv v24, v16, v18
    184  vadd.vv v16, v16, v22
    185  vadd.vv v18, v18, v22
    186  vsub.vv v22, v24, v22
    187 
    188  vsetvli zero, zero, e16, \lm, ta, ma
    189 
    190  vnclip.wi \o0, v16, 12
    191  vnclip.wi \o1, v18, 12
    192  vnclip.wi \o2, v20, 12
    193  vnclip.wi \o3, v22, 12
    194 .endm
    195 
    196 function inv_dct_e16_x4_rvv, export=1, ext=v
    197  idct_4 v0, v1, v2, v3
    198  jr t0
    199 endfunc
    200 
    201 function inv_adst_e16_x4_rvv, export=1, ext=v
    202  iadst_4 v0, v1, v2, v3, m1, mf2
    203  jr t0
    204 endfunc
    205 
    206 function inv_flipadst_e16_x4_rvv, export=1, ext=v
    207  iadst_4 v3, v2, v1, v0, m1, mf2
    208  jr t0
    209 endfunc
    210 
    211 function inv_adst_e16_x4w_rvv, export=1, ext=v
    212  iadst_4 v0, v1, v2, v3, m2, m1
    213  jr t0
    214 endfunc
    215 
    216 function inv_flipadst_e16_x4w_rvv, export=1, ext=v
    217  iadst_4 v3, v2, v1, v0, m2, m1
    218  jr t0
    219 endfunc
    220 
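         // Lossless 4x4 WHT: inputs are pre-shifted down by 2, both passes
         // use iwht_4, and the pixel add/store tail is shared via
         // itx_4x4_end.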
    221 function inv_txfm_add_wht_wht_4x4_8bpc_rvv, export=1, ext=v
    222  csrw vxrm, zero
    223 
    224  vsetivli zero, 4, e16, mf2, ta, ma
    225  vle16.v v0, (a2)
    226  addi t0, a2, 8
    227  vle16.v v1, (t0)
    228  addi t0, t0, 8
    229  vle16.v v2, (t0)
    230  addi t0, t0, 8
    231  vle16.v v3, (t0)
    232 
    233  vsra.vi v0, v0, 2
    234  vsra.vi v1, v1, 2
    235  vsra.vi v2, v2, 2
    236  vsra.vi v3, v3, 2
    237 
    238  iwht_4
    239 
    240  vmv.v.x v4, zero
    241 
    242  vsseg4e16.v v0, (a2)
    243  vle16.v v0, (a2)
    244  vse16.v v4, (a2)
    245  addi t0, a2, 8
    246  vle16.v v1, (t0)
    247  vse16.v v4, (t0)
    248  addi t0, t0, 8
    249  vle16.v v2, (t0)
    250  vse16.v v4, (t0)
    251  addi t0, t0, 8
    252  vle16.v v3, (t0)
    253  vse16.v v4, (t0)
    254 
    255  iwht_4
    256 
    257  j itx_4x4_end
    258 endfunc
    259 
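         // One entry point per transform pair. For dct_dct, eob == 0 means
         // only the DC coefficient is non-zero, so a fast path broadcasts
         // dc*2896>>12 (applied once per pass) instead of running the full
         // transform.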
    260 .macro def_fn_4x4 txfm1, txfm2
    261 function inv_txfm_add_\txfm1\()_\txfm2\()_4x4_8bpc_rvv, export=1, ext=v
    262 .ifc \txfm1\()_\txfm2, dct_dct
    263  beqz a3, 1f
    264 .endif
    265  la a4, inv_\txfm1\()_e16_x4_rvv
    266  la a5, inv_\txfm2\()_e16_x4_rvv
    267  j inv_txfm_add_4x4_rvv
    268 .ifc \txfm1\()_\txfm2, dct_dct
    269 1:
    270  csrw vxrm, zero
    271  vsetivli zero, 4, e16, mf2, ta, ma
    272  ld t2, (a2)
    273  li t1, 2896*8
    274  vmv.v.x v0, t2
    275  vsmul.vx v0, v0, t1
    276  sd x0, (a2)
    277  vsmul.vx v0, v0, t1
    278  vssra.vi v0, v0, 4
    279  vmv.v.v v1, v0
    280  vmv.v.v v2, v0
    281  vmv.v.v v3, v0
    282  j itx_4x4_end
    283 .endif
    284 endfunc
    285 .endm
    286 
    287 def_fn_4x4 dct, dct
    288 def_fn_4x4 identity, identity
    289 def_fn_4x4 dct, adst
    290 def_fn_4x4 dct, flipadst
    291 def_fn_4x4 dct, identity
    292 def_fn_4x4 adst, dct
    293 def_fn_4x4 adst, adst
    294 def_fn_4x4 adst, flipadst
    295 def_fn_4x4 flipadst, dct
    296 def_fn_4x4 flipadst, adst
    297 def_fn_4x4 flipadst, flipadst
    298 def_fn_4x4 identity, dct
    299 def_fn_4x4 adst, identity
    300 def_fn_4x4 flipadst, identity
    301 def_fn_4x4 identity, adst
    302 def_fn_4x4 identity, flipadst
    303 
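         // 8x8: two row passes, with the vsseg8e16/vle16 round trip through
         // the coefficient buffer acting as the transpose between them. The
         // identity_ variant skips the first pass entirely: the x8 identity
         // doubling and the inter-pass >>1 cancel (see the comment below).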
    304 .macro def_fn_8x8_base variant
    305 function inv_txfm_\variant\()add_8x8_rvv, export=1, ext=v
    306  csrw vxrm, zero
    307 
    308  vsetivli zero, 8, e16, m1, ta, ma
    309  vle16.v v0, (a2)
    310  addi t0, a2, 16
    311  vle16.v v1, (t0)
    312  addi t0, t0, 16
    313  vle16.v v2, (t0)
    314  addi t0, t0, 16
    315  vle16.v v3, (t0)
    316  addi t0, t0, 16
    317  vle16.v v4, (t0)
    318  addi t0, t0, 16
    319  vle16.v v5, (t0)
    320  addi t0, t0, 16
    321  vle16.v v6, (t0)
    322  addi t0, t0, 16
    323  vle16.v v7, (t0)
    324 
    325 .ifc \variant, identity_
    326  // The identity vsadd.vv and downshift vssra.vi 1 cancel out
    327 
    328  j L(itx_8x8_epilog)
    329 .else
    330  jalr t0, a4
    331 
    332  vssra.vi v0, v0, 1
    333  vssra.vi v1, v1, 1
    334  vssra.vi v2, v2, 1
    335  vssra.vi v3, v3, 1
    336  vssra.vi v4, v4, 1
    337  vssra.vi v5, v5, 1
    338  vssra.vi v6, v6, 1
    339  vssra.vi v7, v7, 1
    340 
    341 L(itx_8x8_epilog):
    342  vsseg8e16.v v0, (a2)
    343  vle16.v v0, (a2)
    344  addi t0, a2, 16
    345  vle16.v v1, (t0)
    346  addi t0, t0, 16
    347  vle16.v v2, (t0)
    348  addi t0, t0, 16
    349  vle16.v v3, (t0)
    350  addi t0, t0, 16
    351  vle16.v v4, (t0)
    352  addi t0, t0, 16
    353  vle16.v v5, (t0)
    354  addi t0, t0, 16
    355  vle16.v v6, (t0)
    356  addi t0, t0, 16
    357  vle16.v v7, (t0)
    358 
    359  jalr t0, a5
    360 
    361  vssra.vi v0, v0, 4
    362  vssra.vi v1, v1, 4
    363  vssra.vi v2, v2, 4
    364  vssra.vi v3, v3, 4
    365  vssra.vi v4, v4, 4
    366  vssra.vi v5, v5, 4
    367  vssra.vi v6, v6, 4
    368  vssra.vi v7, v7, 4
    369 
    370  li t1, 64
    371  vsetvli zero, t1, e16, m8, ta, ma
    372  vmv.v.x v8, zero
    373  vse16.v v8, (a2)
    374 
    375 itx_8x8_end:
    376  vsetivli zero, 8, e8, mf2, ta, ma
    377  vle8.v v8, (a0)
    378  add t0, a0, a1
    379  vle8.v v9, (t0)
    380  add t0, t0, a1
    381  vle8.v v10, (t0)
    382  add t0, t0, a1
    383  vle8.v v11, (t0)
    384  add t0, t0, a1
    385  vle8.v v12, (t0)
    386  add t0, t0, a1
    387  vle8.v v13, (t0)
    388  add t0, t0, a1
    389  vle8.v v14, (t0)
    390  add t0, t0, a1
    391  vle8.v v15, (t0)
    392 
    393  vwaddu.wv v0, v0, v8
    394  vwaddu.wv v1, v1, v9
    395  vwaddu.wv v2, v2, v10
    396  vwaddu.wv v3, v3, v11
    397  vwaddu.wv v4, v4, v12
    398  vwaddu.wv v5, v5, v13
    399  vwaddu.wv v6, v6, v14
    400  vwaddu.wv v7, v7, v15
    401 
    402  vsetvli zero, zero, e16, m1, ta, ma
    403  vmax.vx v0, v0, zero
    404  vmax.vx v1, v1, zero
    405  vmax.vx v2, v2, zero
    406  vmax.vx v3, v3, zero
    407  vmax.vx v4, v4, zero
    408  vmax.vx v5, v5, zero
    409  vmax.vx v6, v6, zero
    410  vmax.vx v7, v7, zero
    411 
    412  vsetvli zero, zero, e8, mf2, ta, ma
    413 
    414  vnclipu.wi v8, v0, 0
    415  vnclipu.wi v9, v1, 0
    416  vnclipu.wi v10, v2, 0
    417  vnclipu.wi v11, v3, 0
    418  vnclipu.wi v12, v4, 0
    419  vnclipu.wi v13, v5, 0
    420  vnclipu.wi v14, v6, 0
    421  vnclipu.wi v15, v7, 0
    422 
    423  vse8.v v8, (a0)
    424  add a0, a0, a1
    425  vse8.v v9, (a0)
    426  add a0, a0, a1
    427  vse8.v v10, (a0)
    428  add a0, a0, a1
    429  vse8.v v11, (a0)
    430  add a0, a0, a1
    431  vse8.v v12, (a0)
    432  add a0, a0, a1
    433  vse8.v v13, (a0)
    434  add a0, a0, a1
    435  vse8.v v14, (a0)
    436  add a0, a0, a1
    437  vse8.v v15, (a0)
    438 
    439  ret
    440 .endif
    441 endfunc
    442 .endm
    443 
    444 def_fn_8x8_base identity_
    445 def_fn_8x8_base
    446 
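         // x8 identity: out = 2*in, via saturating self-add.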
    447 function inv_identity_e16_x8_rvv, export=1, ext=v
    448  vsadd.vv v0, v0, v0
    449  vsadd.vv v1, v1, v1
    450  vsadd.vv v2, v2, v2
    451  vsadd.vv v3, v3, v3
    452  vsadd.vv v4, v4, v4
    453  vsadd.vv v5, v5, v5
    454  vsadd.vv v6, v6, v6
    455  vsadd.vv v7, v7, v7
    456 
    457  jr t0
    458 endfunc
    459 
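         // 8-point inverse DCT: the even half reuses idct_4; the odd half
         // runs butterflies on 4096*sin/cos(pi/16) = 799/4017 and
         // 4096*sin/cos(3*pi/16) = 2276/3406.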
    460 .macro idct_8 o0, o1, o2, o3, o4, o5, o6, o7
    461  idct_4 \o0, \o2, \o4, \o6
    462 
    463  li t1, 799
    464  li t2, 4017
    465  li t3, 3406
    466  li t4, 2276
    467 
    468  vwmul.vx v22, \o1, t2
    469  neg t2, t2
    470  vwmul.vx v16, \o1, t1
    471  vwmacc.vx v22, t1, \o7
    472  vwmacc.vx v16, t2, \o7
    473 
    474  vwmul.vx v20, \o5, t4
    475  neg t4, t4
    476  vwmul.vx v18, \o5, t3
    477  vwmacc.vx v20, t3, \o3
    478  vwmacc.vx v18, t4, \o3
    479 
    480  vnclip.wi v16, v16, 12
    481  vnclip.wi v18, v18, 12
    482  vnclip.wi v20, v20, 12
    483  vnclip.wi v22, v22, 12
    484 
    485  vssub.vv \o7, v22, v20
    486  vsadd.vv v22, v22, v20
    487  vssub.vv \o1, v16, v18
    488  vsadd.vv v16, v16, v18
    489 
    490  li t2, 2896
    491 
    492  vwmul.vx v18, \o7, t2
    493  vwmul.vx v20, \o7, t2
    494  vwmacc.vx v20, t2, \o1
    495  neg t2, t2
    496  vwmacc.vx v18, t2, \o1
    497 
    498  vnclip.wi v18, v18, 12
    499  vnclip.wi v20, v20, 12
    500 
    501  vssub.vv \o7, \o0, v22
    502  vsadd.vv \o0, \o0, v22
    503  vssub.vv v17, \o2, v20
    504  vsadd.vv \o1, \o2, v20
    505  vssub.vv \o5, \o4, v18
    506  vsadd.vv \o2, \o4, v18
    507  vssub.vv \o4, \o6, v16
    508  vsadd.vv \o3, \o6, v16
    509  vmv.v.v \o6, v17
    510 .endm
    511 
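         // 8-point inverse ADST; the trailing vssub.vv from a zero vector
         // negates the odd-indexed outputs as the transform requires.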
    512 .macro iadst_8 o0, o1, o2, o3, o4, o5, o6, o7
    513  li t1, 4076
    514  li t2, 401
    515  li t3, 3612
    516  li t4, 1931
    517  li t5, 2598
    518  li t6, 3166
    519 
    520  vwmul.vx v16, v7, t1
    521  neg t1, t1
    522  vwmul.vx v18, v7, t2
    523  vwmacc.vx v16, t2, v0
    524  vwmacc.vx v18, t1, v0
    525 
    526  vwmul.vx v20, v5, t3
    527  neg t3, t3
    528  vwmul.vx v22, v5, t4
    529  vwmacc.vx v20, t4, v2
    530  vwmacc.vx v22, t3, v2
    531 
    532  vwmul.vx v24, v3, t5
    533  neg t5, t5
    534  vwmul.vx v26, v3, t6
    535  vwmacc.vx v24, t6, v4
    536  vwmacc.vx v26, t5, v4
    537 
    538  li t2, 1189
    539  li t3, 3920
    540  li t4, 1567
    541  li t5, 3784
    542  li t6, 2896
    543 
    544  vwmul.vx v28, v1, t2
    545  neg t2, t2
    546  vwmul.vx v30, v1, t3
    547  vwmacc.vx v28, t3, v6
    548  vwmacc.vx v30, t2, v6
    549 
    550  vnclip.wi v16, v16, 12
    551  vnclip.wi v18, v18, 12
    552  vnclip.wi v20, v20, 12
    553  vnclip.wi v22, v22, 12
    554  vnclip.wi v24, v24, 12
    555  vnclip.wi v26, v26, 12
    556  vnclip.wi v28, v28, 12
    557  vnclip.wi v30, v30, 12
    558 
    559  vssub.vv  v4, v16, v24
    560  vsadd.vv v16, v16, v24
    561  vsadd.vv  v1, v18, v26
    562  vsadd.vv  v2, v20, v28
    563  vsadd.vv  v3, v22, v30
    564  vssub.vv  v5, v18, v26
    565  vssub.vv  v6, v20, v28
    566  vssub.vv v30, v22, v30
    567 
    568  vsadd.vv \o0, v16, v2
    569  vsadd.vv \o7,  v1, v3
    570  vssub.vv  v2, v16, v2
    571  vssub.vv  v3,  v1, v3
    572 
    573  vwmul.vx v16,  v4, t5
    574  vwmul.vx v18,  v4, t4
    575  vwmul.vx v20, v30, t5
    576  vwmul.vx v22, v30, t4
    577  vwmacc.vx v16, t4, v5
    578  neg t4, t4
    579  vwmacc.vx v22, t5, v6
    580  neg t5, t5
    581  vwmacc.vx v20, t4, v6
    582  vwmacc.vx v18, t5, v5
    583 
    584  vnclip.wi v16, v16, 12
    585  vnclip.wi v18, v18, 12
    586  vnclip.wi v20, v20, 12
    587  vnclip.wi v22, v22, 12
    588 
    589  vsadd.vv \o1, v16, v20
    590  vsadd.vv \o6, v18, v22
    591  vssub.vv v16, v16, v20
    592  vssub.vv v17, v18, v22
    593 
    594  vwmul.vx v18, v2, t6
    595  vwmul.vx v20, v2, t6
    596  vwmul.vx v22, v16, t6
    597  vwmul.vx v24, v16, t6
    598  vwmacc.vx v18, t6, v3
    599  vwmacc.vx v22, t6, v17
    600  neg t6, t6
    601  vwmacc.vx v20, t6, v3
    602  vwmacc.vx v24, t6, v17
    603 
    604  vnclip.wi \o3, v18, 12
    605  vnclip.wi \o4, v20, 12
    606  vnclip.wi \o2, v22, 12
    607  vnclip.wi \o5, v24, 12
    608 
    609  vmv.v.x v16, zero
    610  vssub.vv \o1, v16, \o1
    611  vssub.vv \o3, v16, \o3
    612  vssub.vv \o5, v16, \o5
    613  vssub.vv \o7, v16, \o7
    614 .endm
    615 
    616 function inv_dct_e16_x8_rvv, export=1, ext=v
    617  idct_8 v0, v1, v2, v3, v4, v5, v6, v7
    618  jr t0
    619 endfunc
    620 
    621 function inv_adst_e16_x8_rvv, export=1, ext=v
    622  iadst_8 v0, v1, v2, v3, v4, v5, v6, v7
    623  jr t0
    624 endfunc
    625 
    626 function inv_flipadst_e16_x8_rvv, export=1, ext=v
    627  iadst_8 v7, v6, v5, v4, v3, v2, v1, v0
    628  jr t0
    629 endfunc
    630 
    631 .macro def_fn_8x8 txfm1, txfm2
    632 function inv_txfm_add_\txfm1\()_\txfm2\()_8x8_8bpc_rvv, export=1, ext=v
    633 .ifc \txfm1\()_\txfm2, dct_dct
    634  beqz a3, 1f
    635 .endif
    636  la a5, inv_\txfm2\()_e16_x8_rvv
    637 .ifc \txfm1, identity
    638  j inv_txfm_identity_add_8x8_rvv
    639 .else
    640  la a4, inv_\txfm1\()_e16_x8_rvv
    641  j inv_txfm_add_8x8_rvv
    642 .endif
    643 .ifc \txfm1\()_\txfm2, dct_dct
    644 1:
    645  csrw vxrm, zero
    646  vsetivli zero, 8, e16, m1, ta, ma
    647  ld t2, (a2)
    648  li t1, 2896*8
    649  vmv.v.x v0, t2
    650  vsmul.vx v0, v0, t1
    651  sd x0, (a2)
    652  vssra.vi v0, v0, 1
    653  vsmul.vx v0, v0, t1
    654  vssra.vi v0, v0, 4
    655  vmv.v.v v1, v0
    656  vmv.v.v v2, v0
    657  vmv.v.v v3, v0
    658  vmv.v.v v4, v0
    659  vmv.v.v v5, v0
    660  vmv.v.v v6, v0
    661  vmv.v.v v7, v0
    662  j itx_8x8_end
    663 .endif
    664 endfunc
    665 .endm
    666 
    667 def_fn_8x8 dct, dct
    668 def_fn_8x8 identity, identity
    669 def_fn_8x8 dct, adst
    670 def_fn_8x8 dct, flipadst
    671 def_fn_8x8 dct, identity
    672 def_fn_8x8 adst, dct
    673 def_fn_8x8 adst, adst
    674 def_fn_8x8 adst, flipadst
    675 def_fn_8x8 flipadst, dct
    676 def_fn_8x8 flipadst, adst
    677 def_fn_8x8 flipadst, flipadst
    678 def_fn_8x8 identity, dct
    679 def_fn_8x8 adst, identity
    680 def_fn_8x8 flipadst, identity
    681 def_fn_8x8 identity, adst
    682 def_fn_8x8 identity, flipadst
    683 
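         // Rectangular 4x8: coefficients first get the 2896/4096 ~=
         // 1/sqrt(2) rectangular scale; the 4-point row pass then covers
         // two rows per vector (vsetivli 8), with adst/flipadst using the
         // x4w helpers selected by def_fn_48 below.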
    684 function inv_txfm_add_4x8_rvv, export=1, ext=v
    685  csrw vxrm, zero
    686 
    687  vsetivli zero, 8, e16, m1, ta, ma
    688  vle16.v v0, (a2)
    689  addi t0, a2, 16
    690  vle16.v v1, (t0)
    691  addi t0, t0, 16
    692  vle16.v v2, (t0)
    693  addi t0, t0, 16
    694  vle16.v v3, (t0)
    695 
    696  li t1, 2896*8
    697 .irp i, 0, 1, 2, 3
    698  vsmul.vx v\i, v\i, t1
    699 .endr
    700 
    701  jalr t0, a4
    702 
    703  vsseg4e16.v v0, (a2)
    704 
    705  vsetivli zero, 4, e16, mf2, ta, ma
    706  vmv.v.x v8, zero
    707  vle16.v v0, (a2)
    708  vse16.v v8, (a2)
    709 .irp i, 1, 2, 3, 4, 5, 6, 7
    710  addi a2, a2, 8
    711  vle16.v v\i, (a2)
    712  vse16.v v8, (a2)
    713 .endr
    714 
    715  jalr t0, a5
    716 
    717 .irp i, 0, 1, 2, 3, 4, 5, 6, 7
    718  vssra.vi v\i, v\i, 4
    719 .endr
    720 
    721  vsetvli zero, zero, e8, mf4, ta, ma
    722  vle8.v v8, (a0)
    723  add t0, a0, a1
    724  vle8.v v9, (t0)
    725 .irp i, 10, 11, 12, 13, 14, 15
    726  add t0, t0, a1
    727  vle8.v v\i, (t0)
    728 .endr
    729 
    730  vwaddu.wv v0, v0,  v8
    731  vwaddu.wv v1, v1,  v9
    732  vwaddu.wv v2, v2, v10
    733  vwaddu.wv v3, v3, v11
    734  vwaddu.wv v4, v4, v12
    735  vwaddu.wv v5, v5, v13
    736  vwaddu.wv v6, v6, v14
    737  vwaddu.wv v7, v7, v15
    738 
    739  vsetvli zero, zero, e16, mf2, ta, ma
    740 .irp i, 0, 1, 2, 3, 4, 5, 6, 7
    741  vmax.vx v\i, v\i, zero
    742 .endr
    743 
    744  vsetvli zero, zero, e8, mf4, ta, ma
    745 
    746  vnclipu.wi  v8, v0, 0
    747  vnclipu.wi  v9, v1, 0
    748  vnclipu.wi v10, v2, 0
    749  vnclipu.wi v11, v3, 0
    750  vnclipu.wi v12, v4, 0
    751  vnclipu.wi v13, v5, 0
    752  vnclipu.wi v14, v6, 0
    753  vnclipu.wi v15, v7, 0
    754 
    755  vse8.v v8, (a0)
    756 .irp i, 9, 10, 11, 12, 13, 14, 15
    757  add a0, a0, a1
    758  vse8.v v\i, (a0)
    759 .endr
    760 
    761  ret
    762 endfunc
    763 
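         // 8x4: the same scheme transposed: x8 row pass on 4-element
         // vectors, then an x4 column pass two columns per vector.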
    764 function inv_txfm_add_8x4_rvv, export=1, ext=v
    765  csrw vxrm, zero
    766 
    767  vsetivli zero, 4, e16, mf2, ta, ma
    768  vle16.v v0, (a2)
    769  addi t0, a2, 8
    770  vle16.v v1, (t0)
    771 .irp i, 2, 3, 4, 5, 6, 7
    772  addi t0, t0, 8
    773  vle16.v v\i, (t0)
    774 .endr
    775 
    776  li t1, 2896*8
    777 .irp i, 0, 1, 2, 3, 4, 5, 6, 7
    778  vsmul.vx v\i, v\i, t1
    779 .endr
    780 
    781  jalr t0, a4
    782 
    783  vsseg8e16.v v0, (a2)
    784 
    785  vsetivli zero, 8, e16, m1, ta, ma
    786  vmv.v.x v4, zero
    787  vle16.v v0, (a2)
    788  vse16.v v4, (a2)
    789 .irp i, 1, 2, 3
    790  addi a2, a2, 16
    791  vle16.v v\i, (a2)
    792  vse16.v v4, (a2)
    793 .endr
    794 
    795  jalr t0, a5
    796 
    797  vssra.vi v0, v0, 4
    798  vssra.vi v1, v1, 4
    799  vssra.vi v2, v2, 4
    800  vssra.vi v3, v3, 4
    801 
    802  vsetvli zero, zero, e8, mf2, ta, ma
    803  vle8.v v4, (a0)
    804  add t0, a0, a1
    805  vle8.v v5, (t0)
    806  add t0, t0, a1
    807  vle8.v v6, (t0)
    808  add t0, t0, a1
    809  vle8.v v7, (t0)
    810 
    811  vwaddu.wv v0, v0, v4
    812  vwaddu.wv v1, v1, v5
    813  vwaddu.wv v2, v2, v6
    814  vwaddu.wv v3, v3, v7
    815 
    816  vsetvli zero, zero, e16, m1, ta, ma
    817  vmax.vx v0, v0, zero
    818  vmax.vx v1, v1, zero
    819  vmax.vx v2, v2, zero
    820  vmax.vx v3, v3, zero
    821 
    822  vsetvli zero, zero, e8, mf2, ta, ma
    823 
    824  vnclipu.wi v4, v0, 0
    825  vnclipu.wi v5, v1, 0
    826  vnclipu.wi v6, v2, 0
    827  vnclipu.wi v7, v3, 0
    828 
    829  vse8.v v4, (a0)
    830  add a0, a0, a1
    831  vse8.v v5, (a0)
    832  add a0, a0, a1
    833  vse8.v v6, (a0)
    834  add a0, a0, a1
    835  vse8.v v7, (a0)
    836 
    837  ret
    838 endfunc
    839 
    840 /* Define symbols used in the .if comparisons below */
    841 .equ dct, 1
    842 .equ identity, 2
    843 .equ adst, 3
    844 .equ flipadst, 4
    845 
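         // The .equ values above give the txfm names numeric values so the
         // textual \txfm1 == adst comparisons in .if work; a 4-point
         // adst/flipadst pass that covers two rows per vector gets the x4w
         // helper instead of the plain one.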
    846 .macro def_fn_48 w, h, txfm1, txfm2
    847 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
    848 .if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
    849  la a4, inv_\txfm1\()_e16_x\w\()w_rvv
    850 .else
    851  la a4, inv_\txfm1\()_e16_x\w\()_rvv
    852 .endif
    853 .if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
    854  la a5, inv_\txfm2\()_e16_x\h\()w_rvv
    855 .else
    856  la a5, inv_\txfm2\()_e16_x\h\()_rvv
    857 .endif
    858  j inv_txfm_add_\w\()x\h\()_rvv
    859 endfunc
    860 .endm
    861 
    862 .macro def_fns_48 w, h
    863 def_fn_48 \w, \h, dct, dct
    864 def_fn_48 \w, \h, identity, identity
    865 def_fn_48 \w, \h, dct, adst
    866 def_fn_48 \w, \h, dct, flipadst
    867 def_fn_48 \w, \h, dct, identity
    868 def_fn_48 \w, \h, adst, dct
    869 def_fn_48 \w, \h, adst, adst
    870 def_fn_48 \w, \h, adst, flipadst
    871 def_fn_48 \w, \h, flipadst, dct
    872 def_fn_48 \w, \h, flipadst, adst
    873 def_fn_48 \w, \h, flipadst, flipadst
    874 def_fn_48 \w, \h, identity, dct
    875 def_fn_48 \w, \h, adst, identity
    876 def_fn_48 \w, \h, flipadst, identity
    877 def_fn_48 \w, \h, identity, adst
    878 def_fn_48 \w, \h, identity, flipadst
    879 .endm
    880 
    881 def_fns_48 4, 8
    882 def_fns_48 8, 4
    883 
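         // x16 identity: out = 2*in + in*2*(sqrt(2)-1) = 2*sqrt(2)*in.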
    884 function inv_identity_e16_x16_rvv, export=1, ext=v
    885  li t1, 2*(5793-4096)*8
    886 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    887  vsmul.vx v16, v\i, t1
    888  vsadd.vv v\i, v\i, v\i
    889  vsadd.vv v\i, v\i, v16
    890 .endr
    891  jr t0
    892 endfunc
    893 
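         // 16-point inverse DCT: the even half reuses idct_8; the odd half
         // uses the Q12 sin/cos pairs for odd multiples of pi/32
         // (401/4076, 2598/3166, 1931/3612, 1189/3920) followed by the
         // usual butterfly merges.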
    894 function inv_dct_e16_x16_rvv, export=1, ext=v
    895  idct_8 v0, v2, v4, v6, v8, v10, v12, v14
    896 
    897  li t1, 401
    898  li t2, 4076
    899  li t3, 3166
    900  li t4, 2598
    901 
    902  vwmul.vx v30, v1, t2
    903  neg t2, t2
    904  vwmul.vx v16, v1, t1
    905  vwmacc.vx v30, t1, v15
    906  vwmacc.vx v16, t2, v15
    907 
    908  vwmul.vx v28, v9, t4
    909  neg t4, t4
    910  vwmul.vx v18, v9, t3
    911  vwmacc.vx v28, t3, v7
    912  vwmacc.vx v18, t4, v7
    913 
    914  li t1, 1931
    915  li t2, 3612
    916  li t3, 3920
    917  li t4, 1189
    918 
    919  vwmul.vx v26, v5, t2
    920  neg t2, t2
    921  vwmul.vx v20, v5, t1
    922  vwmacc.vx v26, t1, v11
    923  vwmacc.vx v20, t2, v11
    924 
    925  vwmul.vx v24, v13, t4
    926  neg t4, t4
    927  vwmul.vx v22, v13, t3
    928  vwmacc.vx v24, t3, v3
    929  vwmacc.vx v22, t4, v3
    930 
    931  li t2, 2896
    932  li t3, 1567
    933  li t4, 3784
    934 
    935  vnclip.wi v16, v16, 12
    936  vnclip.wi v18, v18, 12
    937  vnclip.wi v20, v20, 12
    938  vnclip.wi v22, v22, 12
    939  vnclip.wi v24, v24, 12
    940  vnclip.wi v26, v26, 12
    941  vnclip.wi v28, v28, 12
    942  vnclip.wi v30, v30, 12
    943 
    944  vssub.vv  v3, v16, v18
    945  vsadd.vv v16, v16, v18
    946  vssub.vv  v5, v22, v20
    947  vsadd.vv v22, v22, v20
    948  vssub.vv v11, v24, v26
    949  vsadd.vv v24, v24, v26
    950  vssub.vv v13, v30, v28
    951  vsadd.vv v30, v30, v28
    952 
    953  vwmul.vx v28, v13, t4
    954  neg t4, t4
    955  vwmul.vx v18, v13, t3
    956  vwmul.vx v26, v11, t3
    957  vwmacc.vx v28, t3, v3
    958  neg t3, t3
    959  vwmul.vx v20, v11, t4
    960  vwmacc.vx v18, t4, v3
    961  vwmacc.vx v20, t3, v5
    962  vwmacc.vx v26, t4, v5
    963 
    964  vnclip.wi v18, v18, 12
    965  vnclip.wi v20, v20, 12
    966  vnclip.wi v26, v26, 12
    967  vnclip.wi v28, v28, 12
    968 
    969  vssub.vv  v5, v18, v20
    970  vsadd.vv v18, v18, v20
    971  vssub.vv v11, v28, v26
    972  vsadd.vv v28, v28, v26
    973 
    974  vssub.vv  v7, v16, v22
    975  vsadd.vv v16, v16, v22
    976  vssub.vv  v9, v30, v24
    977  vsadd.vv v30, v30, v24
    978 
    979  vwmul.vx v20, v11, t2
    980  vwmul.vx v22,  v9, t2
    981  vwmul.vx v24,  v9, t2
    982  vwmul.vx v26, v11, t2
    983  vwmacc.vx v24, t2, v7
    984  vwmacc.vx v26, t2, v5
    985  neg t2, t2
    986  vwmacc.vx v20, t2, v5
    987  vwmacc.vx v22, t2, v7
    988 
    989  vnclip.wi v20, v20, 12
    990  vnclip.wi v22, v22, 12
    991  vnclip.wi v24, v24, 12
    992  vnclip.wi v26, v26, 12
    993 
    994  vssub.vv v15,  v0, v30
    995  vsadd.vv  v0,  v0, v30
    996  vssub.vv v17,  v2, v28
    997  vsadd.vv  v1,  v2, v28
    998  vssub.vv v13,  v4, v26
    999  vsadd.vv  v2,  v4, v26
   1000  vssub.vv v19,  v6, v24
   1001  vsadd.vv  v3,  v6, v24
   1002  vssub.vv v11,  v8, v22
   1003  vsadd.vv  v4,  v8, v22
   1004  vsadd.vv  v5, v10, v20
   1005  vssub.vv v10, v10, v20
   1006  vssub.vv  v9, v12, v18
   1007  vsadd.vv  v6, v12, v18
   1008  vssub.vv  v8, v14, v16
   1009  vsadd.vv  v7, v14, v16
   1010  vmv.v.v v14, v17
   1011  vmv.v.v v12, v19
   1012 
   1013  jr t0
   1014 endfunc
   1015 
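         // 16-point inverse ADST. The outputs are parameterized so one body
         // serves both adst (v0..v15) and flipadst (v15..v0); the
         // .ifc \o0, v0 branch reorders the final merges so outputs never
         // overwrite operands that are still needed for either mapping.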
   1016 .macro iadst_16 o0, o1, o2, o3, o4, o5, o6, o7, o8, o9, o10, o11, o12, o13, o14, o15
   1017  li t1, 4091
   1018  li t2, 201
   1019  li t3, 3973
   1020  li t4, 995
   1021 
   1022  vwmul.vx v16, v15, t1
   1023  neg t1, t1
   1024  vwmul.vx v18, v15, t2
   1025  vwmacc.vx v16, t2, v0
   1026  vwmacc.vx v18, t1, v0
   1027 
   1028  vwmul.vx v20, v13, t3
   1029  neg t3, t3
   1030  vwmul.vx v22, v13, t4
   1031  vwmacc.vx v20, t4, v2
   1032  vwmacc.vx v22, t3, v2
   1033 
   1034  li t1, 3703
   1035  li t2, 1751
   1036  li t3, 3290
   1037  li t4, 2440
   1038 
   1039  vwmul.vx v24, v11, t1
   1040  neg t1, t1
   1041  vwmul.vx v26, v11, t2
   1042  vwmacc.vx v24, t2, v4
   1043  vwmacc.vx v26, t1, v4
   1044 
   1045  vwmul.vx v28, v9, t3
   1046  neg t3, t3
   1047  vwmul.vx v30, v9, t4
   1048  vwmacc.vx v28, t4, v6
   1049  vwmacc.vx v30, t3, v6
   1050 
   1051  vnclip.wi  v0, v16, 12
   1052  vnclip.wi v18, v18, 12
   1053  vnclip.wi  v2, v20, 12
   1054  vnclip.wi v22, v22, 12
   1055  vnclip.wi  v4, v24, 12
   1056  vnclip.wi v26, v26, 12
   1057  vnclip.wi  v6, v28, 12
   1058  vnclip.wi v30, v30, 12
   1059 
   1060  li t1, 2751
   1061  li t2, 3035
   1062  li t3, 2106
   1063  li t4, 3513
   1064 
   1065  vwmul.vx v16, v7, t1
   1066  neg t1, t1
   1067  vwmul.vx v20, v7, t2
   1068  vwmacc.vx v16, t2, v8
   1069  vwmacc.vx v20, t1, v8
   1070 
   1071  vwmul.vx v24, v5, t3
   1072  neg t3, t3
   1073  vwmul.vx v28, v5, t4
   1074  vwmacc.vx v24, t4, v10
   1075  vwmacc.vx v28, t3, v10
   1076 
   1077  vnclip.wi v16, v16, 12
   1078  vnclip.wi  v9, v20, 12
   1079  vnclip.wi v24, v24, 12
   1080  vnclip.wi v11, v28, 12
   1081 
   1082  vssub.vv  v8,  v0, v16
   1083  vsadd.vv  v0,  v0, v16
   1084  vssub.vv v10,  v2, v24
   1085  vsadd.vv  v2,  v2, v24
   1086 
   1087  li t1, 1380
   1088  li t2, 3857
   1089  li t3, 601
   1090  li t4, 4052
   1091 
   1092  vwmul.vx v16, v3, t1
   1093  neg t1, t1
   1094  vwmul.vx v20, v3, t2
   1095  vwmacc.vx v16, t2, v12
   1096  vwmacc.vx v20, t1, v12
   1097 
   1098  vwmul.vx v24, v1, t3
   1099  neg t3, t3
   1100  vwmul.vx v28, v1, t4
   1101  vwmacc.vx v24, t4, v14
   1102  vwmacc.vx v28, t3, v14
   1103 
   1104  vnclip.wi v16, v16, 12
   1105  vnclip.wi v13, v20, 12
   1106  vnclip.wi v24, v24, 12
   1107  vnclip.wi v15, v28, 12
   1108 
   1109  vssub.vv v12,  v4, v16
   1110  vsadd.vv v16,  v4, v16
   1111  vssub.vv v14,  v6, v24
   1112  vsadd.vv v20,  v6, v24
   1113 
   1114  vsadd.vv  v1, v18,  v9
   1115  vssub.vv  v9, v18,  v9
   1116  vsadd.vv  v3, v22, v11
   1117  vssub.vv v11, v22, v11
   1118  vsadd.vv v18, v26, v13
   1119  vssub.vv v13, v26, v13
   1120  vsadd.vv v22, v30, v15
   1121  vssub.vv v15, v30, v15
   1122 
   1123  vssub.vv v4, v0, v16
   1124  vsadd.vv v0, v0, v16
   1125  vssub.vv v5, v1, v18
   1126  vsadd.vv v1, v1, v18
   1127  vssub.vv v6, v2, v20
   1128  vsadd.vv v2, v2, v20
   1129  vssub.vv v7, v3, v22
   1130  vsadd.vv v3, v3, v22
   1131 
   1132  li t1, 799
   1133  li t2, 4017
   1134  li t3, 3406
   1135  li t4, 2276
   1136 
   1137  vwmul.vx v16,  v8, t2
   1138  vwmul.vx v18,  v8, t1
   1139  vwmul.vx v20, v10, t4
   1140  vwmul.vx v22, v10, t3
   1141  vwmul.vx v24, v13, t2
   1142  vwmul.vx v26, v13, t1
   1143  vwmul.vx v28, v15, t4
   1144  vwmul.vx v30, v15, t3
   1145  vwmacc.vx v16, t1,  v9
   1146  neg t1, t1
   1147  vwmacc.vx v20, t3, v11
   1148  neg t3, t3
   1149  vwmacc.vx v26, t2, v12
   1150  neg t2, t2
   1151  vwmacc.vx v30, t4, v14
   1152  neg t4, t4
   1153  vwmacc.vx v18, t2,  v9
   1154  vwmacc.vx v22, t4, v11
   1155  vwmacc.vx v24, t1, v12
   1156  vwmacc.vx v28, t3, v14
   1157 
   1158  li t2, 2896
   1159  li t3, 1567
   1160  li t4, 3784
   1161 
   1162  vnclip.wi v16, v16, 12
   1163  vnclip.wi v18, v18, 12
   1164  vnclip.wi v20, v20, 12
   1165  vnclip.wi v22, v22, 12
   1166  vnclip.wi v24, v24, 12
   1167  vnclip.wi v26, v26, 12
   1168  vnclip.wi v28, v28, 12
   1169  vnclip.wi v30, v30, 12
   1170 
   1171  vsadd.vv  v8, v16, v24
   1172  vsadd.vv  v9, v18, v26
   1173  vsadd.vv v10, v20, v28
   1174  vsadd.vv v11, v22, v30
   1175  vssub.vv v12, v16, v24
   1176  vssub.vv v13, v18, v26
   1177  vssub.vv v14, v20, v28
   1178  vssub.vv v15, v22, v30
   1179 
   1180  vwmul.vx v16,  v4, t4
   1181  vwmul.vx v18,  v4, t3
   1182  vwmul.vx v20,  v7, t4
   1183  vwmul.vx v22,  v7, t3
   1184  vwmul.vx v24, v12, t4
   1185  vwmul.vx v26, v12, t3
   1186  vwmul.vx v28, v15, t4
   1187  vwmul.vx v30, v15, t3
   1188  vwmacc.vx v16, t3,  v5
   1189  vwmacc.vx v22, t4,  v6
   1190  vwmacc.vx v24, t3, v13
   1191  neg t3, t3
   1192  vwmacc.vx v30, t4, v14
   1193  neg t4, t4
   1194  vwmacc.vx v20, t3,  v6
   1195  vwmacc.vx v28, t3, v14
   1196  vwmacc.vx v18, t4,  v5
   1197  vwmacc.vx v26, t4, v13
   1198 
   1199  vnclip.wi v16, v16, 12
   1200  vnclip.wi v18, v18, 12
   1201  vnclip.wi v20, v20, 12
   1202  vnclip.wi v22, v22, 12
   1203  vnclip.wi v24, v24, 12
   1204  vnclip.wi v26, v26, 12
   1205  vnclip.wi v28, v28, 12
   1206  vnclip.wi v30, v30, 12
   1207 
   1208 .ifc \o0, v0
   1209  vsadd.vv \o14, v9, v11
   1210  vssub.vv  v11, v9, v11
   1211  vssub.vv   v9, v1,  v3
   1212  vsadd.vv \o15, v1,  v3
   1213  vsadd.vv  \o1, v8, v10
   1214  vssub.vv  v10, v8, v10
   1215  vssub.vv   v8, v0,  v2
   1216  vsadd.vv  \o0, v0,  v2
   1217 .else
   1218  vsadd.vv  \o1, v8, v10
   1219  vssub.vv  v10, v8, v10
   1220  vssub.vv   v8, v0,  v2
   1221  vsadd.vv  \o0, v0,  v2
   1222  vsadd.vv   v2, v9, v11
   1223  vssub.vv  v11, v9, v11
   1224  vssub.vv   v9, v1,  v3
   1225  vsadd.vv \o15, v1,  v3
   1226  vmv.v.v  \o14, v2
   1227 .endif
   1228 
   1229  vsadd.vv  \o3, v16, v20
   1230  vssub.vv   v6, v16, v20
   1231  vsadd.vv \o12, v18, v22
   1232  vssub.vv   v7, v18, v22
   1233  vsadd.vv  \o2, v24, v28
   1234  vssub.vv  v24, v24, v28
   1235  vsadd.vv \o13, v26, v30
   1236  vssub.vv  v26, v26, v30
   1237 
   1238  neg t3, t2
   1239 
   1240  vwmul.vx v28, v24, t2
   1241  vwmul.vx v30, v24, t2
   1242  vwmacc.vx v28, t2, v26
   1243  vwmacc.vx v30, t3, v26
   1244 
   1245  vwmul.vx v24, v10, t2
   1246  vwmul.vx v26, v10, t2
   1247  vwmacc.vx v24, t2, v11
   1248  vwmacc.vx v26, t3, v11
   1249 
   1250  vwmul.vx v20, v6, t2
   1251  vwmul.vx v22, v6, t2
   1252  vwmacc.vx v20, t2, v7
   1253  vwmacc.vx v22, t3, v7
   1254 
   1255  vwmul.vx v16, v8, t2
   1256  vwmul.vx v18, v8, t2
   1257  vwmacc.vx v16, t2, v9
   1258  vwmacc.vx v18, t3, v9
   1259 
   1260  vnclip.wi  \o7, v16, 12
   1261  vnclip.wi  \o8, v18, 12
   1262  vnclip.wi  \o4, v20, 12
   1263  vnclip.wi \o11, v22, 12
   1264  vnclip.wi  \o6, v24, 12
   1265  vnclip.wi  \o9, v26, 12
   1266  vnclip.wi  \o5, v28, 12
   1267  vnclip.wi \o10, v30, 12
   1268 
   1269  vmv.v.x v16, zero
   1270  vssub.vv  \o1, v16,  \o1
   1271  vssub.vv  \o3, v16,  \o3
   1272  vssub.vv  \o5, v16,  \o5
   1273  vssub.vv  \o7, v16,  \o7
   1274  vssub.vv  \o9, v16,  \o9
   1275  vssub.vv \o11, v16, \o11
   1276  vssub.vv \o13, v16, \o13
   1277  vssub.vv \o15, v16, \o15
   1278 .endm
   1279 
   1280 function inv_adst_e16_x16_rvv, export=1, ext=v
   1281  iadst_16 v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15
   1282  jr t0
   1283 endfunc
   1284 
   1285 function inv_flipadst_e16_x16_rvv, export=1, ext=v
   1286  iadst_16 v15, v14, v13, v12, v11, v10, v9, v8, v7, v6, v5, v4, v3, v2, v1, v0
   1287  jr t0
   1288 endfunc
   1289 
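         // Horizontal pass for the 16x16 transforms: loads 16 rows from
         // (t4) with stride t6, zeroing them behind itself, runs the x16
         // transform (or the merged identity scale), shifts down by 2, and
         // transposes into the buffer at (t5) via strided stores (vsse16).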
   1290 .macro def_horz_16 variant
   1291 function inv_txfm_horz\variant\()_16x8_rvv, export=1, ext=v
   1292  vmv.v.x v16, zero
   1293  vle16.v v0, (t4)
   1294  vse16.v v16, (t4)
   1295 .irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1296  add t4, t4, t6
   1297  vle16.v v\i, (t4)
   1298  vse16.v v16, (t4)
   1299 .endr
   1300 .ifc \variant, _identity
   1301  li t1, 2*(5793-4096)*8
   1302 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1303  vsmul.vx v16, v\i, t1
   1304  vsra.vi v16, v16, 1
   1305  vaadd.vv v\i, v\i, v16
   1306 .endr
   1307  j L(horz_16x8_epilog)
   1308 .else
   1309  jalr t0, a4
   1310 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1311  vssra.vi v\i, v\i, 2
   1312 .endr
   1313 L(horz_16x8_epilog):
   1314  vsse16.v v0, (t5), t6
   1315 .irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1316  addi t5, t5, 2
   1317  vsse16.v v\i, (t5), t6
   1318 .endr
   1319  jr a7
   1320 .endif
   1321 endfunc
   1322 .endm
   1323 
   1324 def_horz_16 _identity
   1325 def_horz_16
   1326 
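         // Vertical pass shared by the 16x16 and 8x16 paths: reads 16 rows
         // of 8 from the scratch buffer at (t4), runs the second-pass
         // transform from a5, shifts down by 4, then adds to the
         // destination strip at (t5) and clamps to 8 bits.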
   1327 function inv_txfm_add_vert_8x16_rvv, export=1, ext=v
   1328  vsetivli zero, 8, e16, m1, ta, ma
   1329 
   1330  vle16.v v0, (t4)
   1331 .irp i, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1332  add t4, t4, t6
   1333  vle16.v v\i, (t4)
   1334 .endr
   1335 
   1336  jalr t0, a5
   1337 
   1338 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1339  vssra.vi v\i, v\i, 4
   1340 .endr
   1341 
   1342  vsetivli zero, 8, e8, mf2, ta, ma
   1343 
   1344  vle8.v v16, (t5)
   1345  add t0, t5, a1
   1346  vle8.v v17, (t0)
   1347 .irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1348  add t0, t0, a1
   1349  vle8.v v\i, (t0)
   1350 .endr
   1351 
   1352  vwaddu.wv v0, v0, v16
   1353  vwaddu.wv v1, v1, v17
   1354  vwaddu.wv v2, v2, v18
   1355  vwaddu.wv v3, v3, v19
   1356  vwaddu.wv v4, v4, v20
   1357  vwaddu.wv v5, v5, v21
   1358  vwaddu.wv v6, v6, v22
   1359  vwaddu.wv v7, v7, v23
   1360  vwaddu.wv v8, v8, v24
   1361  vwaddu.wv v9, v9, v25
   1362  vwaddu.wv v10, v10, v26
   1363  vwaddu.wv v11, v11, v27
   1364  vwaddu.wv v12, v12, v28
   1365  vwaddu.wv v13, v13, v29
   1366  vwaddu.wv v14, v14, v30
   1367  vwaddu.wv v15, v15, v31
   1368 
   1369  vsetvli zero, zero, e16, m1, ta, ma
   1370 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1371  vmax.vx v\i, v\i, zero
   1372 .endr
   1373 
   1374  vsetvli zero, zero, e8, mf2, ta, ma
   1375  vnclipu.wi v16, v0, 0
   1376  vnclipu.wi v17, v1, 0
   1377  vnclipu.wi v18, v2, 0
   1378  vnclipu.wi v19, v3, 0
   1379  vnclipu.wi v20, v4, 0
   1380  vnclipu.wi v21, v5, 0
   1381  vnclipu.wi v22, v6, 0
   1382  vnclipu.wi v23, v7, 0
   1383  vnclipu.wi v24, v8, 0
   1384  vnclipu.wi v25, v9, 0
   1385  vnclipu.wi v26, v10, 0
   1386  vnclipu.wi v27, v11, 0
   1387  vnclipu.wi v28, v12, 0
   1388  vnclipu.wi v29, v13, 0
   1389  vnclipu.wi v30, v14, 0
   1390  vnclipu.wi v31, v15, 0
   1391 
   1392  vse8.v v16, (t5)
   1393 .irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1394  add t5, t5, a1
   1395  vse8.v v\i, (t5)
   1396 .endr
   1397 
   1398  jr a7
   1399 endfunc
   1400 
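         // 16x16: two 16x8 horizontal passes into a 512-byte stack buffer,
         // then two 8x16 vertical passes. When eob < a7 (the eob_half
         // threshold) the high-frequency half is known to be all zero and
         // is simply cleared instead of transformed.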
   1401 function inv_txfm_add_16x16_rvv, export=1, ext=v
   1402  csrw vxrm, zero
   1403  vsetivli zero, 8, e16, m1, ta, ma
   1404  addi sp, sp, -16*32
   1405 .irp i, 8, 0
   1406  addi t4, a2, \i*2
   1407  addi t5, sp, \i*16*2
   1408 .if \i == 8
   1409  blt a3, a7, 1f
   1410 .endif
   1411  li t6, 16*2
   1412  jalr a7, a6
   1413 .if \i == 8
   1414  j 2f
   1415 1:
   1416  li t1, 64
   1417  vsetvli zero, t1, e16, m8, ta, ma
   1418  vmv.v.x v0, zero
   1419  vse16.v v0, (t5)
   1420  addi t5, t5, 128
   1421  vse16.v v0, (t5)
   1422  vsetivli zero, 8, e16, m1, ta, ma
   1423 2:
   1424 .endif
   1425 .endr
   1426 .irp i, 0, 8
   1427  addi t4, sp, \i*2
   1428  addi t5, a0, \i
   1429  li t6, 16*2
   1430  jal a7, inv_txfm_add_vert_8x16_rvv
   1431 .endr
   1432  addi sp, sp, 16*32
   1433  ret
   1434 endfunc
   1435 
   1436 .macro def_fn_16x16 txfm1, txfm2, eob_half
   1437 function inv_txfm_add_\txfm1\()_\txfm2\()_16x16_8bpc_rvv, export=1, ext=v
   1438 .ifc \txfm1\()_\txfm2, dct_dct
   1439 beqz a3, 1f
   1440 .endif
   1441 .ifc \txfm1, identity
   1442  la a6, inv_txfm_horz_identity_16x8_rvv
   1443 .else
   1444  la a6, inv_txfm_horz_16x8_rvv
   1445  la a4, inv_\txfm1\()_e16_x16_rvv
   1446 .endif
   1447  la a5, inv_\txfm2\()_e16_x16_rvv
   1448  li a7, \eob_half
   1449  j inv_txfm_add_16x16_rvv
   1450 .ifc \txfm1\()_\txfm2, dct_dct
   1451 1:
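         // dc-only path: compute the replicated pixel delta in scalar code
         // (two rounds of *2896>>12 with rounding and clamping, a rounded
         // >>2 between them, then +8 >> 4) and add it across the block two
         // rows per loop iteration.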
   1452  csrw vxrm, zero
   1453  vsetivli zero, 16, e16, m2, ta, ma
   1454  lh t2, (a2)
   1455  li t3, 2896*8
   1456  li t4, 1<<14
   1457  li t5, 0xFFFF
   1458  li t6, -0x10000
   1459 
   1460  sh x0, (a2)
   1461 
   1462  mul t2, t2, t3
   1463  add t2, t2, t4
   1464  srai t2, t2, 15
   1465  ble t2, t5, 3f
   1466  mv t2, t5
   1467 3:
   1468  ble t6, t2, 4f
   1469  mv t2, t6
   1470 4:
   1471  addi t2, t2, 2
   1472  srai t2, t2, 2
   1473  mul t2, t2, t3
   1474  add t2, t2, t4
   1475  srai t2, t2, 15
   1476  ble t2, t5, 5f
   1477  mv t2, t5
   1478 5:
   1479  ble t6, t2, 6f
   1480  mv t2, t6
   1481 6:
   1482  addi t2, t2, 8
   1483  srai t2, t2, 4
   1484  vmv.v.x v24, t2
   1485 
   1486  vsetvli zero, zero, e8, m1, ta, ma
   1487  add t2, a1, a1
   1488  li t3, 16
   1489 2:
   1490  add t0, a0, a1
   1491  vle8.v v16, (a0)
   1492  vle8.v v17, (t0)
   1493 
   1494  vwaddu.wv v0, v24, v16
   1495  vwaddu.wv v2, v24, v17
   1496 
   1497  addi t3, t3, -2 # loop counter: two rows per iteration
   1498 
   1499 
   1500  vsetvli zero, zero, e16, m2, ta, ma
   1501 .irp i, 0, 2
   1502  vmax.vx v\i, v\i, zero
   1503 .endr
   1504 
   1505  vsetvli zero, zero, e8, m1, ta, ma
   1506 
   1507  vnclipu.wi  v16, v0, 0
   1508  vnclipu.wi  v17, v2, 0
   1509 
   1510  add t0, a0, a1
   1511  vse8.v v16, (a0)
   1512  add a0, a0, t2
   1513  vse8.v v17, (t0)
   1514 
   1515  bnez t3, 2b
   1516 
   1517  ret
   1518 .endif
   1519 endfunc
   1520 .endm
   1521 
   1522 def_fn_16x16 dct, dct, 36
   1523 def_fn_16x16 identity, identity, 36
   1524 def_fn_16x16 dct, adst, 36
   1525 def_fn_16x16 dct, flipadst, 36
   1526 def_fn_16x16 dct, identity, 8
   1527 def_fn_16x16 adst, dct, 36
   1528 def_fn_16x16 adst, adst, 36
   1529 def_fn_16x16 adst, flipadst, 36
   1530 def_fn_16x16 flipadst, dct, 36
   1531 def_fn_16x16 flipadst, adst, 36
   1532 def_fn_16x16 flipadst, flipadst, 36
   1533 def_fn_16x16 identity, dct, 8
   1534 
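         // 4x16 / 16x4: the 16-unit dimension is handled as two halves of
         // eight vectors. For 4x16, when eob < a6 (eob_half) the half that
         // would hold the high-frequency rows is zero-filled rather than
         // loaded and transformed.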
   1535 .macro def_fn_416_base variant
   1536 function inv_txfm_\variant\()add_4x16_rvv, export=1, ext=v
   1537  csrw vxrm, zero
   1538 
   1539  vsetivli zero, 8, e16, m1, ta, ma
   1540 
   1541  blt a3, a6, 1f
   1542 
   1543  addi t0, a2, 16
   1544  vle16.v v0, (t0)
   1545  addi t0, t0, 32
   1546  vle16.v v1, (t0)
   1547  addi t0, t0, 32
   1548  vle16.v v2, (t0)
   1549  addi t0, t0, 32
   1550  vle16.v v3, (t0)
   1551 
   1552 .ifc \variant, identity_
   1553  li t1, (5793-4096)*8
   1554  vsmul.vx v8, v0, t1
   1555  vaadd.vv v4, v0, v8
   1556  vsmul.vx v8, v1, t1
   1557  vaadd.vv v5, v1, v8
   1558  vsmul.vx v8, v2, t1
   1559  vaadd.vv v6, v2, v8
   1560  vsmul.vx v8, v3, t1
   1561  vaadd.vv v7, v3, v8
   1562 .else
   1563  jalr t0, a4
   1564 
   1565  vssra.vi v4, v0, 1
   1566  vssra.vi v5, v1, 1
   1567  vssra.vi v6, v2, 1
   1568  vssra.vi v7, v3, 1
   1569 .endif
   1570 
   1571  j 2f
   1572 
   1573 1:
   1574 .irp i, 4, 5, 6, 7
   1575  vmv.v.x v\i, zero
   1576 .endr
   1577 
   1578 2:
   1579  vle16.v v0, (a2)
   1580  addi t0, a2, 32
   1581  vle16.v v1, (t0)
   1582  addi t0, t0, 32
   1583  vle16.v v2, (t0)
   1584  addi t0, t0, 32
   1585  vle16.v v3, (t0)
   1586 
   1587 .ifc \variant, identity_
   1588  li t1, (5793-4096)*8
   1589 .irp i, 0, 1, 2, 3
   1590  vsmul.vx v8, v\i, t1
   1591  vaadd.vv v\i, v\i, v8
   1592 .endr
   1593 
   1594  j L(itx_4x16_epilog)
   1595 .else
   1596  jalr t0, a4
   1597 
   1598  vssra.vi v0, v0, 1
   1599  vssra.vi v1, v1, 1
   1600  vssra.vi v2, v2, 1
   1601  vssra.vi v3, v3, 1
   1602 
   1603 L(itx_4x16_epilog):
   1604  vsseg4e16.v v0, (a2)
   1605  addi t0, a2, 64
   1606  vsseg4e16.v v4, (t0)
   1607 
   1608  vsetivli zero, 4, e16, mf2, ta, ma
   1609 
   1610  vmv.v.x v16, zero
   1611  vle16.v v0, (a2)
   1612  vse16.v v16, (a2)
   1613  addi t0, a2, 8
   1614  vle16.v v1, (t0)
   1615  vse16.v v16, (t0)
   1616 .irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1617  addi t0, t0, 8
   1618  vle16.v v\i, (t0)
   1619  vse16.v v16, (t0)
   1620 .endr
   1621 
   1622  jalr t0, a5
   1623 
   1624 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1625  vssra.vi v\i, v\i, 4
   1626 .endr
   1627 
   1628  vsetvli zero, zero, e8, mf4, ta, ma
   1629 
   1630  vle8.v v16, (a0)
   1631  add t0, a0, a1
   1632  vle8.v v17, (t0)
   1633 .irp i, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1634  add t0, t0, a1
   1635  vle8.v v\i, (t0)
   1636 .endr
   1637 
   1638  vwaddu.wv  v0,  v0, v16
   1639  vwaddu.wv  v1,  v1, v17
   1640  vwaddu.wv  v2,  v2, v18
   1641  vwaddu.wv  v3,  v3, v19
   1642  vwaddu.wv  v4,  v4, v20
   1643  vwaddu.wv  v5,  v5, v21
   1644  vwaddu.wv  v6,  v6, v22
   1645  vwaddu.wv  v7,  v7, v23
   1646  vwaddu.wv  v8,  v8, v24
   1647  vwaddu.wv  v9,  v9, v25
   1648  vwaddu.wv v10, v10, v26
   1649  vwaddu.wv v11, v11, v27
   1650  vwaddu.wv v12, v12, v28
   1651  vwaddu.wv v13, v13, v29
   1652  vwaddu.wv v14, v14, v30
   1653  vwaddu.wv v15, v15, v31
   1654 
   1655  vsetvli zero, zero, e16, mf2, ta, ma
   1656 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1657  vmax.vx v\i, v\i, zero
   1658 .endr
   1659 
   1660  vsetvli zero, zero, e8, mf4, ta, ma
   1661 
   1662  vnclipu.wi v16,  v0, 0
   1663  vnclipu.wi v17,  v1, 0
   1664  vnclipu.wi v18,  v2, 0
   1665  vnclipu.wi v19,  v3, 0
   1666  vnclipu.wi v20,  v4, 0
   1667  vnclipu.wi v21,  v5, 0
   1668  vnclipu.wi v22,  v6, 0
   1669  vnclipu.wi v23,  v7, 0
   1670  vnclipu.wi v24,  v8, 0
   1671  vnclipu.wi v25,  v9, 0
   1672  vnclipu.wi v26, v10, 0
   1673  vnclipu.wi v27, v11, 0
   1674  vnclipu.wi v28, v12, 0
   1675  vnclipu.wi v29, v13, 0
   1676  vnclipu.wi v30, v14, 0
   1677  vnclipu.wi v31, v15, 0
   1678 
   1679  vse8.v v16, (a0)
   1680 .irp i, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
   1681  add a0, a0, a1
   1682  vse8.v v\i, (a0)
   1683 .endr
   1684 
   1685  ret
   1686 .endif
   1687 endfunc
   1688 
   1689 function inv_txfm_\variant\()add_16x4_rvv, export=1, ext=v
   1690  csrw vxrm, zero
   1691 
   1692  vsetivli zero, 4, e16, mf2, ta, ma
   1693  vle16.v v0, (a2)
   1694  addi t0, a2, 8
   1695  vle16.v v1, (t0)
   1696 .irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1697  addi t0, t0, 8
   1698  vle16.v v\i, (t0)
   1699 .endr
   1700 
   1701 .ifc \variant, identity_
   1702  li t1, 2*(5793-4096)*8
   1703 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1704  vsmul.vx v16, v\i, t1
   1705  vssra.vi v16, v16, 1
   1706  vsadd.vv v\i, v\i, v16
   1707 .endr
   1708 
   1709  j L(itx_16x4_epilog)
   1710 .else
   1711  jalr t0, a4
   1712 
   1713 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1714  vssra.vi v\i, v\i, 1
   1715 .endr
   1716 
   1717 L(itx_16x4_epilog):
   1718  li t0, 32
   1719  vssseg8e16.v v0, (a2), t0
   1720  addi t1, a2, 16
   1721  vssseg8e16.v v8, (t1), t0
   1722 
   1723 .irp j, 0, 8
   1724  vsetivli zero, 8, e16, m1, ta, ma
   1725 
   1726  vmv.v.x v4, zero
   1727  addi t0, a2, \j*2
   1728  vle16.v v0, (t0)
   1729  vse16.v v4, (t0)
   1730 .irp i, 1, 2, 3
   1731  addi t0, t0, 32
   1732  vle16.v v\i, (t0)
   1733  vse16.v v4, (t0)
   1734 .endr
   1735 
   1736  jalr t0, a5
   1737 
   1738  vssra.vi v0, v0, 4
   1739  vssra.vi v1, v1, 4
   1740  vssra.vi v2, v2, 4
   1741  vssra.vi v3, v3, 4
   1742 
   1743  vsetvli zero, zero, e8, mf2, ta, ma
   1744  addi t0, a0, \j
   1745  vle8.v v4, (t0)
   1746  add t0, t0, a1
   1747  vle8.v v5, (t0)
   1748  add t0, t0, a1
   1749  vle8.v v6, (t0)
   1750  add t0, t0, a1
   1751  vle8.v v7, (t0)
   1752 
   1753  vwaddu.wv v0, v0, v4
   1754  vwaddu.wv v1, v1, v5
   1755  vwaddu.wv v2, v2, v6
   1756  vwaddu.wv v3, v3, v7
   1757 
   1758  vsetvli zero, zero, e16, m1, ta, ma
   1759  vmax.vx v0, v0, zero
   1760  vmax.vx v1, v1, zero
   1761  vmax.vx v2, v2, zero
   1762  vmax.vx v3, v3, zero
   1763 
   1764  vsetvli zero, zero, e8, mf2, ta, ma
   1765 
   1766  vnclipu.wi v4, v0, 0
   1767  vnclipu.wi v5, v1, 0
   1768  vnclipu.wi v6, v2, 0
   1769  vnclipu.wi v7, v3, 0
   1770 
   1771  addi t0, a0, \j
   1772  vse8.v v4, (t0)
   1773  add t0, t0, a1
   1774  vse8.v v5, (t0)
   1775  add t0, t0, a1
   1776  vse8.v v6, (t0)
   1777  add t0, t0, a1
   1778  vse8.v v7, (t0)
   1779 .endr
   1780 
   1781  ret
   1782 .endif
   1783 endfunc
   1784 .endm
   1785 
   1786 def_fn_416_base identity_
   1787 def_fn_416_base
   1788 
   1789 .macro def_fn_416 w, h, txfm1, txfm2, eob_half
   1790 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
   1791 .if \w == 4 && (\txfm1 == adst || \txfm1 == flipadst)
   1792  la a4, inv_\txfm1\()_e16_x\w\()w_rvv
   1793 .elseif \txfm1 != identity
   1794  la a4, inv_\txfm1\()_e16_x\w\()_rvv
   1795 .endif
   1796 .if \h == 4 && (\txfm2 == adst || \txfm2 == flipadst)
   1797  la a5, inv_\txfm2\()_e16_x\h\()w_rvv
   1798 .else
   1799  la a5, inv_\txfm2\()_e16_x\h\()_rvv
   1800 .endif
   1801 .if \w == 4
   1802  li a6, \eob_half
   1803 .endif
   1804 .ifc \txfm1, identity
   1805  j inv_txfm_identity_add_\w\()x\h\()_rvv
   1806 .else
   1807  j inv_txfm_add_\w\()x\h\()_rvv
   1808 .endif
   1809 endfunc
   1810 .endm
   1811 
   1812 .macro def_fns_416 w, h
   1813 def_fn_416 \w, \h, dct, dct, 29
   1814 def_fn_416 \w, \h, identity, identity, 29
   1815 def_fn_416 \w, \h, dct, adst, 29
   1816 def_fn_416 \w, \h, dct, flipadst, 29
   1817 def_fn_416 \w, \h, dct, identity, 8
   1818 def_fn_416 \w, \h, adst, dct, 29
   1819 def_fn_416 \w, \h, adst, adst, 29
   1820 def_fn_416 \w, \h, adst, flipadst, 29
   1821 def_fn_416 \w, \h, flipadst, dct, 29
   1822 def_fn_416 \w, \h, flipadst, adst, 29
   1823 def_fn_416 \w, \h, flipadst, flipadst, 29
   1824 def_fn_416 \w, \h, identity, dct, 32
   1825 def_fn_416 \w, \h, adst, identity, 8
   1826 def_fn_416 \w, \h, flipadst, identity, 8
   1827 def_fn_416 \w, \h, identity, adst, 32
   1828 def_fn_416 \w, \h, identity, flipadst, 32
   1829 .endm
   1830 
   1831 def_fns_416 4, 16
   1832 def_fns_416 16, 4
   1833 
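         // 8x16 / 16x8: both apply the 1/sqrt(2) rectangular scale before
         // the first pass; 8x16 transposes through a stack buffer and
         // finishes with the shared inv_txfm_add_vert_8x16_rvv epilogue.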
   1834 .macro def_fn_816_base variant
   1835 function inv_txfm_\variant\()add_8x16_rvv, export=1, ext=v
   1836  csrw vxrm, zero
   1837 
   1838  vsetivli zero, 8, e16, m1, ta, ma
   1839 
   1840  blt a3, a6, 1f
   1841 
   1842  vmv.v.x v16, zero
   1843  addi t0, a2, 16
   1844  vle16.v v0, (t0)
   1845  vse16.v v16, (t0)
   1846 .irp i, 1, 2, 3, 4, 5, 6, 7
   1847  addi t0, t0, 32
   1848  vle16.v v\i, (t0)
   1849  vse16.v v16, (t0)
   1850 .endr
   1851 
   1852  li t1, 2896*8
   1853 .ifc \variant, identity_
   1854  vsmul.vx  v8, v0, t1
   1855  vsmul.vx  v9, v1, t1
   1856  vsmul.vx v10, v2, t1
   1857  vsmul.vx v11, v3, t1
   1858  vsmul.vx v12, v4, t1
   1859  vsmul.vx v13, v5, t1
   1860  vsmul.vx v14, v6, t1
   1861  vsmul.vx v15, v7, t1
   1862 .else
   1863 .irp i, 0, 1, 2, 3, 4, 5, 6, 7
   1864  vsmul.vx v\i, v\i, t1
   1865 .endr
   1866 
   1867  jalr t0, a4
   1868 
   1869  vssra.vi  v8, v0, 1
   1870  vssra.vi  v9, v1, 1
   1871  vssra.vi v10, v2, 1
   1872  vssra.vi v11, v3, 1
   1873  vssra.vi v12, v4, 1
   1874  vssra.vi v13, v5, 1
   1875  vssra.vi v14, v6, 1
   1876  vssra.vi v15, v7, 1
   1877 .endif
   1878 
   1879  j 2f
   1880 
   1881 1:
   1882 .irp i, 8, 9, 10, 11, 12, 13, 14, 15
   1883  vmv.v.x v\i, zero
   1884 .endr
   1885 
   1886 2:
   1887  vmv.v.x v16, zero
   1888  vle16.v v0, (a2)
   1889  vse16.v v16, (a2)
   1890  addi t0, a2, 32
   1891  vle16.v v1, (t0)
   1892  vse16.v v16, (t0)
   1893 .irp i, 2, 3, 4, 5, 6, 7
   1894  addi t0, t0, 32
   1895  vle16.v v\i, (t0)
   1896  vse16.v v16, (t0)
   1897 .endr
   1898 
   1899  li t1, 2896*8
   1900 .irp i, 0, 1, 2, 3, 4, 5, 6, 7
   1901  vsmul.vx v\i, v\i, t1
   1902 .endr
   1903 
   1904 .ifc \variant, identity_
   1905  j L(itx_8x16_epilog)
   1906 .else
   1907  jalr t0, a4
   1908 
   1909 .irp i, 0, 1, 2, 3, 4, 5, 6, 7
   1910  vssra.vi v\i, v\i, 1
   1911 .endr
   1912 
   1913 L(itx_8x16_epilog):
   1914  addi t4, sp, -8*32
   1915  vsseg8e16.v v0, (t4)
   1916  addi t0, t4, 8*16
   1917  vsseg8e16.v v8, (t0)
   1918 
   1919  mv t5, a0
   1920  li t6, 16
   1921  jal a7, inv_txfm_add_vert_8x16_rvv
   1922 
   1923  ret
   1924 .endif
   1925 endfunc
   1926 
   1927 function inv_txfm_\variant\()add_16x8_rvv, export=1, ext=v
   1928  csrw vxrm, zero
   1929 
   1930  vsetivli zero, 8, e16, m1, ta, ma
   1931  vle16.v v0, (a2)
   1932  addi t0, a2, 16
   1933  vle16.v v1, (t0)
   1934 .irp i, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1935  addi t0, t0, 16
   1936  vle16.v v\i, (t0)
   1937 .endr
   1938 
   1939  li t1, 2896*8
   1940 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1941  vsmul.vx v\i, v\i, t1
   1942 .endr
   1943 
   1944 .ifc \variant, identity_
   1945  li t1, 2*(5793-4096)*8
   1946 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1947  vsmul.vx v16, v\i, t1
   1948  vssra.vi v16, v16, 1
   1949  vsadd.vv v\i, v\i, v16
   1950 .endr
   1951 
   1952  j L(itx_16x8_epilog)
   1953 .else
   1954  jalr t0, a4
   1955 
   1956 .irp i, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
   1957  vssra.vi v\i, v\i, 1
   1958 .endr
   1959 
   1960 L(itx_16x8_epilog):
   1961  li t0, 32
   1962  vssseg8e16.v v0, (a2), t0
   1963  addi t1, a2, 16
   1964  vssseg8e16.v v8, (t1), t0
   1965 
   1966 .irp j, 0, 8
   1967  vsetivli zero, 8, e16, m1, ta, ma
   1968 
   1969  vmv.v.x v8, zero
   1970  addi t0, a2, \j*2
   1971  vle16.v v0, (t0)
   1972  vse16.v v8, (t0)
   1973 .irp i, 1, 2, 3, 4, 5, 6, 7
   1974  addi t0, t0, 32
   1975  vle16.v v\i, (t0)
   1976  vse16.v v8, (t0)
   1977 .endr
   1978 
   1979  jalr t0, a5
   1980 
   1981 .irp i, 0, 1, 2, 3, 4, 5, 6, 7
   1982  vssra.vi v\i, v\i, 4
   1983 .endr
   1984 
   1985  vsetvli zero, zero, e8, mf2, ta, ma
   1986  addi t0, a0, \j
   1987  vle8.v v8, (t0)
   1988 .irp i, 9, 10, 11, 12, 13, 14, 15
   1989  add t0, t0, a1
   1990  vle8.v v\i, (t0)
   1991 .endr
   1992 
   1993  vwaddu.wv v0, v0, v8
   1994  vwaddu.wv v1, v1, v9
   1995  vwaddu.wv v2, v2, v10
   1996  vwaddu.wv v3, v3, v11
   1997  vwaddu.wv v4, v4, v12
   1998  vwaddu.wv v5, v5, v13
   1999  vwaddu.wv v6, v6, v14
   2000  vwaddu.wv v7, v7, v15
   2001 
   2002  vsetvli zero, zero, e16, m1, ta, ma
   2003 .irp i, 0, 1, 2, 3, 4, 5, 6, 7
   2004  vmax.vx v\i, v\i, zero
   2005 .endr
   2006 
   2007  vsetvli zero, zero, e8, mf2, ta, ma
   2008 
   2009  vnclipu.wi  v8, v0, 0
   2010  vnclipu.wi  v9, v1, 0
   2011  vnclipu.wi v10, v2, 0
   2012  vnclipu.wi v11, v3, 0
   2013  vnclipu.wi v12, v4, 0
   2014  vnclipu.wi v13, v5, 0
   2015  vnclipu.wi v14, v6, 0
   2016  vnclipu.wi v15, v7, 0
   2017 
   2018  addi t0, a0, \j
   2019  vse8.v v8, (t0)
   2020 .irp i, 9, 10, 11, 12, 13, 14, 15
   2021  add t0, t0, a1
   2022  vse8.v v\i, (t0)
   2023 .endr
   2024 .endr
   2025 
   2026  ret
   2027 .endif
   2028 endfunc
   2029 .endm
   2030 
   2031 def_fn_816_base identity_
   2032 def_fn_816_base
   2033 
   2034 .macro def_fn_816 w, h, txfm1, txfm2, eob_half
   2035 function inv_txfm_add_\txfm1\()_\txfm2\()_\w\()x\h\()_8bpc_rvv, export=1
   2036 .ifnc \txfm1, identity
   2037  la a4, inv_\txfm1\()_e16_x\w\()_rvv
   2038 .endif
   2039  la a5, inv_\txfm2\()_e16_x\h\()_rvv
   2040 .if \w == 8
   2041  li a6, \eob_half
   2042 .endif
   2043 .ifc \txfm1, identity
   2044  j inv_txfm_identity_add_\w\()x\h\()_rvv
   2045 .else
   2046  j inv_txfm_add_\w\()x\h\()_rvv
   2047 .endif
   2048 endfunc
   2049 .endm
   2050 
   2051 .macro def_fns_816 w, h
   2052 def_fn_816 \w, \h, dct, dct, 43
   2053 def_fn_816 \w, \h, identity, identity, 43
   2054 def_fn_816 \w, \h, dct, adst, 43
   2055 def_fn_816 \w, \h, dct, flipadst, 43
   2056 def_fn_816 \w, \h, dct, identity, 8
   2057 def_fn_816 \w, \h, adst, dct, 43
   2058 def_fn_816 \w, \h, adst, adst, 43
   2059 def_fn_816 \w, \h, adst, flipadst, 43
   2060 def_fn_816 \w, \h, flipadst, dct, 43
   2061 def_fn_816 \w, \h, flipadst, adst, 43
   2062 def_fn_816 \w, \h, flipadst, flipadst, 43
   2063 def_fn_816 \w, \h, identity, dct, 64
   2064 def_fn_816 \w, \h, adst, identity, 8
   2065 def_fn_816 \w, \h, flipadst, identity, 8
   2066 def_fn_816 \w, \h, identity, adst, 64
   2067 def_fn_816 \w, \h, identity, flipadst, 64
   2068 .endm
   2069 
   2070 def_fns_816 8, 16
   2071 def_fns_816 16, 8