tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ipred.S (128422B)


      1 /*
      2 * Copyright © 2024, VideoLAN and dav1d authors
      3 * Copyright © 2024, Loongson Technology Corporation Limited
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/loongarch/loongson_asm.S"
     29 
     30 .macro ipred_dc_gen topleft, width, height
     31    add.d          t0,      \width,  \height //dc
     32    srai.d         t0,      t0,      1
     33    addi.d         t3,      \topleft,1
     34 
     35    or             t1,      zero,    zero  //data index
     36    srai.d         t2,      \width,  4     //loop param
     37    beqz           t2,      2f
     38 
     39 1:  // width/16
     40    vldx           vr0,     t3,      t1
     41    vhaddw.hu.bu   vr0,     vr0,     vr0
     42    vhaddw.wu.hu   vr0,     vr0,     vr0
     43    vhaddw.du.wu   vr0,     vr0,     vr0
     44    vhaddw.qu.du   vr0,     vr0,     vr0
     45 
     46    vpickve2gr.du  t4,      vr0,     0
     47    add.d          t0,      t0,      t4
     48 
     49    addi.d         t1,      t1,      16
     50    addi.d         t2,      t2,      -1
     51    bnez           t2,      1b
     52    b              4f
     53 
     54 2:  // &8
     55    andi           t2,      \width,  8
     56    beqz           t2,      3f
     57 
     58    vxor.v         vr0,     vr0,     vr0
     59    fldx.d         f0,      t3,      t1
     60 
     61    vhaddw.hu.bu   vr0,     vr0,     vr0
     62    vhaddw.wu.hu   vr0,     vr0,     vr0
     63    vhaddw.du.wu   vr0,     vr0,     vr0
     64 
     65    vpickve2gr.du  t4,      vr0,     0
     66    add.d          t0,      t0,      t4
     67    addi.d         t1,      t1,      8
     68    b              4f
     69 
     70 3:  // &4
     71    andi           t2,      \width,  4
     72    beqz           t2,      4f
     73 
     74    vxor.v         vr0,     vr0,     vr0
     75    fldx.s         f0,      t3,      t1
     76 
     77    vhaddw.hu.bu   vr0,     vr0,     vr0
     78    vhaddw.wu.hu   vr0,     vr0,     vr0
     79 
     80    vpickve2gr.wu  t4,      vr0,     0
     81    add.d          t0,      t0,      t4
     82    addi.d         t1,      t1,      4
     83 
     84 4:
     85    addi.d         t3,      \topleft,0
     86    srai.d         t2,      \height, 4     //loop param
     87    beqz           t2,      8f
     88 
     89 7:  // height/16
     90    addi.d         t3,      t3,      -16
     91    vld            vr0,     t3,      0
     92 
     93    vhaddw.hu.bu   vr0,     vr0,     vr0
     94    vhaddw.wu.hu   vr0,     vr0,     vr0
     95    vhaddw.du.wu   vr0,     vr0,     vr0
     96    vhaddw.qu.du   vr0,     vr0,     vr0
     97 
     98    vpickve2gr.du  t4,      vr0,     0
     99    add.d          t0,      t0,      t4
    100 
    101    addi.d         t2,      t2,      -1
    102    bnez           t2,      7b
    103    b              10f
    104 
    105 8:  // &8
    106    andi           t2,      \height, 8
    107    beqz           t2,      9f
    108 
    109    addi.d         t3,      t3,      -8
    110    vxor.v         vr0,     vr0,     vr0
    111    fld.d          f0,      t3,      0
    112 
    113    vhaddw.hu.bu   vr0,     vr0,     vr0
    114    vhaddw.wu.hu   vr0,     vr0,     vr0
    115    vhaddw.du.wu   vr0,     vr0,     vr0
    116 
    117    vpickve2gr.du  t4,      vr0,     0
    118    add.d          t0,      t0,      t4
    119    b              10f
    120 
    121 9:  // &4
    122    andi           t2,      \height, 4
    123    beqz           t2,      10f
    124 
    125    addi.d         t3,      t3,      -4
    126    vxor.v         vr0,     vr0,     vr0
    127    fld.s          f0,      t3,      0
    128 
    129    vhaddw.hu.bu   vr0,     vr0,     vr0
    130    vhaddw.wu.hu   vr0,     vr0,     vr0
    131 
    132    vpickve2gr.wu  t4,      vr0,     0
    133    add.d          t0,      t0,      t4
    134 
    135 10:
    136    add.d          t1,      \width,  \height
    137    ctz.w          t1,      t1
    138    sra.w          t0,      t0,      t1
    139 
    140    // w != h
    141    beq            \width,  \height, 16f
    142    add.d          t2,      \height, \height
    143    add.d          t3,      \width,  \width
    144    slt            t2,      t2,      \width
    145    slt            t3,      t3,      \height
    146    or             t2,      t2,      t3
    147    li.w           t3,      0x3334
    148    maskeqz        t1,      t3,      t2
    149    li.w           t3,      0x5556
    150    masknez        t2,      t3,      t2
    151    or             t1,      t1,      t2
    152    mul.w          t0,      t0,      t1
    153    srai.w         t0,      t0,      16
    154 
    155 16:
    156 .endm
    157 
    158 .macro ipred_splat_dc dst, stride, width, height, dc
    159    li.w           t1,      4
    160    blt            t1,      \width,  2f
    161 
    162    li.w           t1,      0x01010101
    163    mulw.d.wu      t1,      \dc,     t1
    164    beqz           \height, 7f
    165    or             t2,      \dst,    \dst
    166 1:  // width <= 4
    167    st.w           t1,      t2,      0
    168    add.d          t2,      t2,      \stride
    169    addi.d         \height, \height, -1
    170    bnez           \height, 1b
    171    b              7f
    172 
    173 2:  //width > 4
    174    li.d           t1,      0x0101010101010101
    175    mul.d          t1,      \dc,     t1
    176    vreplgr2vr.d   vr0,     t1
    177    or             t4,      \dst,    \dst
    178    beqz           \height, 7f
    179 
    180 3:
    181    andi           t5,      \width,  64
    182    beqz           t5,      4f
    183    vst            vr0,     t4,      0
    184    vst            vr0,     t4,      16
    185    vst            vr0,     t4,      32
    186    vst            vr0,     t4,      48
    187    b              6f
    188 
    189 4:
    190    andi           t5,      \width,  32
    191    beqz           t5,      41f
    192    vst            vr0,     t4,      0
    193    vst            vr0,     t4,      16
    194    b              6f
    195 
    196 41:
    197    andi           t5,      \width,  16
    198    beqz           t5,      5f
    199    vst            vr0,     t4,      0
    200    b              6f
    201 
    202 5:
    203    fst.d          f0,      t4,      0
    204 
    205 6:
    206    add.d          t4,      t4,      \stride
    207    addi.d         \height, \height, -1
    208    bnez           \height, 3b
    209 
    210 7:
    211 .endm
    212 
    213 .macro ipred_dc_gen_top topleft, width
    214    srai.d         t0,      \width,  1
    215    addi.d         t1,      \topleft,1
    216 
    217    srai.d         t2,      \width,  4
    218    beqz           t2,      2f
    219 1:
    220    vld            vr0,     t1,      0
    221    vhaddw.hu.bu   vr0,     vr0,     vr0
    222    vhaddw.wu.hu   vr0,     vr0,     vr0
    223    vhaddw.du.wu   vr0,     vr0,     vr0
    224    vhaddw.qu.du   vr0,     vr0,     vr0
    225 
    226    vpickve2gr.du  t3,      vr0,     0
    227    add.d          t0,      t0,      t3
    228 
    229    addi.d         t1,      t1,      16
    230    addi.d         t2,      t2,      -1
    231    bnez           t2,      1b
    232    b              4f
    233 
    234 2:  // &8
    235    andi           t2,      \width,  8
    236    beqz           t2,      3f
    237 
    238    vxor.v         vr0,     vr0,     vr0
    239    fld.d          f0,      t1,      0
    240 
    241    vhaddw.hu.bu   vr0,     vr0,     vr0
    242    vhaddw.wu.hu   vr0,     vr0,     vr0
    243    vhaddw.du.wu   vr0,     vr0,     vr0
    244 
    245    vpickve2gr.du  t2,      vr0,     0
    246    add.d          t0,      t0,      t2
    247 
    248    addi.d         t1,      t1,      8
    249    b              4f
    250 
    251 3:  // &4
    252    andi           t2,      \width,  4
    253    beqz           t2,      4f
    254 
    255    vxor.v         vr0,     vr0,     vr0
    256    fld.s          f0,      t1,      0
    257 
    258    vhaddw.hu.bu   vr0,     vr0,     vr0
    259    vhaddw.wu.hu   vr0,     vr0,     vr0
    260 
    261    vpickve2gr.du  t2,      vr0,     0
    262    add.d          t0,      t0,      t2
    263    addi.d         t1,      t1,      4
    264 
    265 4:
    266    ctz.w          t1,      \width
    267    sra.w          t0,      t0,      t1
    268 .endm
    269 
    270 .macro ipred_dc_gen_left topleft, height
    271    srai.d         t0,      \height, 1
    272    srai.d         t2,      \height, 4     //loop param
    273    beqz           t2,      8f
    274 
    275 7:  // height/16
    276    addi.d         \topleft,\topleft,-16
    277    vld            vr0,     \topleft,0
    278 
    279    vhaddw.hu.bu   vr0,     vr0,     vr0
    280    vhaddw.wu.hu   vr0,     vr0,     vr0
    281    vhaddw.du.wu   vr0,     vr0,     vr0
    282    vhaddw.qu.du   vr0,     vr0,     vr0
    283 
    284    vpickve2gr.du  t4,      vr0,     0
    285    add.d          t0,      t0,      t4
    286 
    287    addi.d         t2,      t2,      -1
    288    bnez           t2,      7b
    289    b              10f
    290 
    291 8:  // &8
    292    andi           t2,      \height, 8
    293    beqz           t2,      9f
    294 
    295    addi.d         \topleft,\topleft,-8
    296    vxor.v         vr0,     vr0,     vr0
    297    fld.d          f0,      \topleft,0
    298 
    299    vhaddw.hu.bu   vr0,     vr0,     vr0
    300    vhaddw.wu.hu   vr0,     vr0,     vr0
    301    vhaddw.du.wu   vr0,     vr0,     vr0
    302 
    303    vpickve2gr.du  t4,      vr0,     0
    304    add.d          t0,      t0,      t4
    305    b              10f
    306 
    307 9:  // &4
    308    andi           t2,      \height, 4
    309    beqz           t2,      10f
    310 
    311    addi.d         \topleft,\topleft,-4
    312    vxor.v         vr0,     vr0,     vr0
    313    fld.s          f0,      \topleft,0
    314 
    315    vhaddw.hu.bu   vr0,     vr0,     vr0
    316    vhaddw.wu.hu   vr0,     vr0,     vr0
    317 
    318    vpickve2gr.wu  t4,      vr0,     0
    319    add.d          t0,      t0,      t4
    320 
    321 10:
    322    ctz.w          t1,      \height
    323    sra.w          t0,      t0,      t1
    324 
    325 .endm
    326 
    327 // void ipred_dc_lsx(pixel *dst, const ptrdiff_t stride,
    328 //                   const pixel *const topleft,
    329 //                   const int width, const int height, const int a,
    330 //                   const int max_width, const int max_height
    331 //                   HIGHBD_DECL_SUFFIX)
    332 function ipred_dc_8bpc_lsx
    333    ipred_dc_gen   a2, a3, a4
    334    ipred_splat_dc a0, a1, a3, a4, t0
    335 
    336 endfunc
    337 
    338 // void ipred_dc_128_lsx(pixel *dst, const ptrdiff_t stride,
    339 //                       const pixel *const topleft,
    340 //                       const int width, const int height, const int a,
    341 //                       const int max_width, const int max_height
    342 //                       HIGHBD_DECL_SUFFIX)
    343 function ipred_dc_128_8bpc_lsx
    344    li.w           t0,      128
    345    ipred_splat_dc a0, a1, a3, a4, t0
    346 
    347 endfunc
    348 
    349 // void ipred_dc_top_c(pixel *dst, const ptrdiff_t stride,
    350 //                     const pixel *const topleft,
    351 //                     const int width, const int height, const int a,
    352 //                     const int max_width, const int max_height
    353 //                     HIGHBD_DECL_SUFFIX)
    354 function ipred_dc_top_8bpc_lsx
    355    ipred_dc_gen_top a2, a3
    356    ipred_splat_dc   a0, a1, a3, a4, t0
    357 
    358 endfunc
    359 
    360 // void ipred_dc_left_c(pixel *dst, const ptrdiff_t stride,
    361 //                      const pixel *const topleft,
    362 //                      const int width, const int height, const int a,
    363 //                      const int max_width, const int max_height
    364 //                      HIGHBD_DECL_SUFFIX)
    365 function ipred_dc_left_8bpc_lsx
    366    ipred_dc_gen_left a2, a4
    367    ipred_splat_dc    a0, a1, a3, a4, t0
    368 
    369 endfunc
    370 
    371 .macro pixel_set_8bpc dst_ptr, src_ptr, width
    372    vldrepl.b      vr0,     \src_ptr, 0
    373 1:
    374    andi           a5,      \width,   64
    375    beqz           a5,      2f
    376 
    377    vst            vr0,     \dst_ptr, 0
    378    vst            vr0,     \dst_ptr, 16
    379    vst            vr0,     \dst_ptr, 32
    380    vst            vr0,     \dst_ptr, 48
    381    b              6f
    382 2:
    383    andi           a5,      \width,   32
    384    beqz           a5,      3f
    385 
    386    vst            vr0,     \dst_ptr, 0
    387    vst            vr0,     \dst_ptr, 16
    388    b              6f
    389 3:
    390    andi           a5,      \width,   16
    391    beqz           a5,      4f
    392 
    393    vst            vr0,     \dst_ptr, 0
    394    b              6f
    395 4:
    396    andi           a5,      \width,   8
    397    beqz           a5,      5f
    398 
    399    fst.d          f0,      \dst_ptr, 0
    400    b              6f
    401 5:
    402    andi           a5,      \width,   4
    403    beqz           a5,      6f
    404 
    405    fst.s          f0,      \dst_ptr, 0
    406 6:
    407 .endm
    408 
    409 // void ipred_h_c(pixel *dst, const ptrdiff_t stride,
    410 //                const pixel *const topleft,
    411 //                const int width, const int height, const int a,
    412 //                const int max_width, const int max_height
    413 //                HIGHBD_DECL_SUFFIX)
    414 function ipred_h_8bpc_lsx
    415    beqz           a4,      .IPRED_H_END
    416 .IPRED_H_LOOP:
    417    addi.d         a2,      a2,      -1
    418 
    419    pixel_set_8bpc a0, a2, a3
    420 
    421    add.d          a0,      a0,      a1
    422    addi.d         a4,      a4,      -1
    423    bnez           a4,      .IPRED_H_LOOP
    424 
    425 .IPRED_H_END:
    426 endfunc
    427 
    428 .macro pixel_copy_8bpc dst_ptr, src_ptr, width
    429 1:
    430    andi           a5,      \width,   64
    431    beqz           a5,      2f
    432 
    433    vld            vr0,     \src_ptr, 0
    434    vld            vr1,     \src_ptr, 16
    435    vld            vr2,     \src_ptr, 32
    436    vld            vr3,     \src_ptr, 48
    437 
    438    vst            vr0,     \dst_ptr, 0
    439    vst            vr1,     \dst_ptr, 16
    440    vst            vr2,     \dst_ptr, 32
    441    vst            vr3,     \dst_ptr, 48
    442 
    443    b              6f
    444 2:
    445    andi           a5,      \width,   32
    446    beqz           a5,      3f
    447 
    448    vld            vr0,     \src_ptr, 0
    449    vld            vr1,     \src_ptr, 16
    450 
    451    vst            vr0,     \dst_ptr, 0
    452    vst            vr1,     \dst_ptr, 16
    453 
    454    b              6f
    455 3:
    456    andi           a5,      \width,   16
    457    beqz           a5,      4f
    458 
    459    vld            vr0,     \src_ptr, 0
    460    vst            vr0,     \dst_ptr, 0
    461 
    462    b              6f
    463 4:
    464    andi           a5,      \width,   8
    465    beqz           a5,      5f
    466 
    467    fld.d          f0,      \src_ptr, 0
    468    fst.d          f0,      \dst_ptr, 0
    469 
    470    b              6f
    471 5:
    472    andi           a5,      \width,   4
    473    beqz           a5,      6f
    474 
    475    fld.s          f0,      \src_ptr, 0
    476    fst.s          f0,      \dst_ptr, 0
    477 6:
    478 .endm
    479 
    480 // void ipred_v_lsx(pixel *dst, const ptrdiff_t stride,
    481 //                  const pixel *const topleft,
    482 //                  const int width, const int height, const int a,
    483 //                  const int max_width, const int max_height
    484 //                  HIGHBD_DECL_SUFFIX)
    485 function ipred_v_8bpc_lsx
    486    beqz           a4,      .IPRED_V_END
    487    addi.d         a2,      a2,      1
    488 .IPRED_V_LOOP:
    489    pixel_copy_8bpc  a0, a2, a3
    490 
    491    add.d          a0,      a0,      a1
    492    addi.d         a4,      a4,      -1
    493    bnez           a4,      .IPRED_V_LOOP
    494 
    495 .IPRED_V_END:
    496 endfunc
    497 
    498 // void ipred_paeth_lsx(pixel *dst, const ptrdiff_t stride,
    499 //                      const pixel *const tl_ptr,
    500 //                      const int width, const int height, const int a,
    501 //                      const int max_width, const int max_height
    502 //                      HIGHBD_DECL_SUFFIX)
    503 function ipred_paeth_8bpc_lsx
    504    vldrepl.b      vr0,     a2,      0    //topleft
    505    vsllwil.hu.bu  vr0,     vr0,     0
    506    or             a6,      a2,      a2
    507    addi.d         a7,      a2,      1
    508 
    509 .IPRED_PAETH_H_LOOP:
    510    addi.d         a6,      a6,      -1
    511    vldrepl.b      vr1,     a6,      0   //left
    512    vsllwil.hu.bu  vr1,     vr1,     0
    513 
    514 .IPRED_PAETH_W_LOOP64:
    515    andi           a5,      a3,      64
    516    beqz           a5,      .IPRED_PAETH_W_LOOP32
    517 
    518    vld            vr2,     a7,      0   //top
    519    vpermi.w       vr9,     vr2,     0x0e
    520    vsllwil.hu.bu  vr2,     vr2,     0
    521    vsllwil.hu.bu  vr9,     vr9,     0
    522 
    523    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    524    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    525    vabsd.hu       vr10,    vr0,     vr9
    526 
    527    vadd.h         vr3,     vr0,     vr0
    528    vadd.h         vr6,     vr1,     vr2
    529    vadd.h         vr11,    vr1,     vr9
    530    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    531    vabsd.hu       vr11,    vr3,     vr11 //tldiff
    532 
    533    vsle.hu        vr3,     vr5,     vr6
    534    vbitsel.v      vr7,     vr0,     vr2,    vr3
    535    vsle.hu        vr3,     vr4,     vr5
    536    vsle.hu        vr8,     vr4,     vr6
    537    vand.v         vr3,     vr3,     vr8
    538    vbitsel.v      vr3,     vr7,     vr1,    vr3
    539    vsrlni.b.h     vr3,     vr3,     0
    540 
    541    vsle.hu        vr12,    vr5,     vr11
    542    vbitsel.v      vr7,     vr0,     vr9,    vr12
    543    vsle.hu        vr12,    vr10,    vr5
    544    vsle.hu        vr8,     vr10,    vr11
    545    vand.v         vr12,    vr12,    vr8
    546    vbitsel.v      vr12,    vr7,     vr1,    vr12
    547    vsrlni.b.h     vr12,    vr12,    0
    548 
    549    vpermi.w       vr12,    vr3,     0x44
    550 
    551    vst            vr12,    a0,      0
    552 
    553    vld            vr2,     a7,      16   //top
    554    vpermi.w       vr9,     vr2,     0x0e
    555    vsllwil.hu.bu  vr2,     vr2,     0
    556    vsllwil.hu.bu  vr9,     vr9,     0
    557 
    558    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    559    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    560    vabsd.hu       vr10,    vr0,     vr9
    561 
    562    vadd.h         vr3,     vr0,     vr0
    563    vadd.h         vr6,     vr1,     vr2
    564    vadd.h         vr11,    vr1,     vr9
    565    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    566    vabsd.hu       vr11,    vr3,     vr11 //tldiff
    567 
    568    vsle.hu        vr3,     vr5,     vr6
    569    vbitsel.v      vr7,     vr0,     vr2,    vr3
    570    vsle.hu        vr3,     vr4,     vr5
    571    vsle.hu        vr8,     vr4,     vr6
    572    vand.v         vr3,     vr3,     vr8
    573    vbitsel.v      vr3,     vr7,     vr1,    vr3
    574    vsrlni.b.h     vr3,     vr3,     0
    575 
    576    vsle.hu        vr12,    vr5,     vr11
    577    vbitsel.v      vr7,     vr0,     vr9,    vr12
    578    vsle.hu        vr12,    vr10,    vr5
    579    vsle.hu        vr8,     vr10,    vr11
    580    vand.v         vr12,    vr12,    vr8
    581    vbitsel.v      vr12,    vr7,     vr1,    vr12
    582    vsrlni.b.h     vr12,    vr12,    0
    583 
    584    vpermi.w       vr12,    vr3,     0x44
    585 
    586    vst            vr12,    a0,      16
    587 
    588    vld            vr2,     a7,      32   //top
    589    vpermi.w       vr9,     vr2,     0x0e
    590    vsllwil.hu.bu  vr2,     vr2,     0
    591    vsllwil.hu.bu  vr9,     vr9,     0
    592 
    593    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    594    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    595    vabsd.hu       vr10,    vr0,     vr9
    596 
    597    vadd.h         vr3,     vr0,     vr0
    598    vadd.h         vr6,     vr1,     vr2
    599    vadd.h         vr11,    vr1,     vr9
    600    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    601    vabsd.hu       vr11,    vr3,     vr11 //tldiff
    602 
    603    vsle.hu        vr3,     vr5,     vr6
    604    vbitsel.v      vr7,     vr0,     vr2,    vr3
    605    vsle.hu        vr3,     vr4,     vr5
    606    vsle.hu        vr8,     vr4,     vr6
    607    vand.v         vr3,     vr3,     vr8
    608    vbitsel.v      vr3,     vr7,     vr1,    vr3
    609    vsrlni.b.h     vr3,     vr3,     0
    610 
    611    vsle.hu        vr12,    vr5,     vr11
    612    vbitsel.v      vr7,     vr0,     vr9,    vr12
    613    vsle.hu        vr12,    vr10,    vr5
    614    vsle.hu        vr8,     vr10,    vr11
    615    vand.v         vr12,    vr12,    vr8
    616    vbitsel.v      vr12,    vr7,     vr1,    vr12
    617    vsrlni.b.h     vr12,    vr12,    0
    618 
    619    vpermi.w       vr12,    vr3,     0x44
    620 
    621    vst            vr12,    a0,      32
    622 
    623    vld            vr2,     a7,      48   //top
    624    vpermi.w       vr9,     vr2,     0x0e
    625    vsllwil.hu.bu  vr2,     vr2,     0
    626    vsllwil.hu.bu  vr9,     vr9,     0
    627 
    628    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    629    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    630    vabsd.hu       vr10,    vr0,     vr9
    631 
    632    vadd.h         vr3,     vr0,     vr0
    633    vadd.h         vr6,     vr1,     vr2
    634    vadd.h         vr11,    vr1,     vr9
    635    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    636    vabsd.hu       vr11,    vr3,     vr11 //tldiff
    637 
    638    vsle.hu        vr3,     vr5,     vr6
    639    vbitsel.v      vr7,     vr0,     vr2,    vr3
    640    vsle.hu        vr3,     vr4,     vr5
    641    vsle.hu        vr8,     vr4,     vr6
    642    vand.v         vr3,     vr3,     vr8
    643    vbitsel.v      vr3,     vr7,     vr1,    vr3
    644    vsrlni.b.h     vr3,     vr3,     0
    645 
    646    vsle.hu        vr12,    vr5,     vr11
    647    vbitsel.v      vr7,     vr0,     vr9,    vr12
    648    vsle.hu        vr12,    vr10,    vr5
    649    vsle.hu        vr8,     vr10,    vr11
    650    vand.v         vr12,    vr12,    vr8
    651    vbitsel.v      vr12,    vr7,     vr1,    vr12
    652    vsrlni.b.h     vr12,    vr12,    0
    653 
    654    vpermi.w       vr12,    vr3,     0x44
    655 
    656    vst            vr12,    a0,      48
    657 
    658    b              .IPRED_PAETH_W_LOOPEND
    659 
    660 .IPRED_PAETH_W_LOOP32:
    661    andi           a5,      a3,      32
    662    beqz           a5,      .IPRED_PAETH_W_LOOP16
    663 
    664    vld            vr2,     a7,      0   //top
    665    vpermi.w       vr9,     vr2,     0x0e
    666    vsllwil.hu.bu  vr2,     vr2,     0
    667    vsllwil.hu.bu  vr9,     vr9,     0
    668 
    669    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    670    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    671    vabsd.hu       vr10,    vr0,     vr9
    672 
    673    vadd.h         vr3,     vr0,     vr0
    674    vadd.h         vr6,     vr1,     vr2
    675    vadd.h         vr11,    vr1,     vr9
    676    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    677    vabsd.hu       vr11,    vr3,     vr11 //tldiff
    678 
    679    vsle.hu        vr3,     vr5,     vr6
    680    vbitsel.v      vr7,     vr0,     vr2,    vr3
    681    vsle.hu        vr3,     vr4,     vr5
    682    vsle.hu        vr8,     vr4,     vr6
    683    vand.v         vr3,     vr3,     vr8
    684    vbitsel.v      vr3,     vr7,     vr1,    vr3
    685    vsrlni.b.h     vr3,     vr3,     0
    686 
    687    vsle.hu        vr12,    vr5,     vr11
    688    vbitsel.v      vr7,     vr0,     vr9,    vr12
    689    vsle.hu        vr12,    vr10,    vr5
    690    vsle.hu        vr8,     vr10,    vr11
    691    vand.v         vr12,    vr12,    vr8
    692    vbitsel.v      vr12,    vr7,     vr1,    vr12
    693    vsrlni.b.h     vr12,    vr12,    0
    694 
    695    vpermi.w       vr12,    vr3,     0x44
    696 
    697    vst            vr12,    a0,      0
    698 
    699    vld            vr2,     a7,      16   //top
    700    vpermi.w       vr9,     vr2,     0x0e
    701    vsllwil.hu.bu  vr2,     vr2,     0
    702    vsllwil.hu.bu  vr9,     vr9,     0
    703 
    704    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    705    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    706    vabsd.hu       vr10,    vr0,     vr9
    707 
    708    vadd.h         vr3,     vr0,     vr0
    709    vadd.h         vr6,     vr1,     vr2
    710    vadd.h         vr11,    vr1,     vr9
    711    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    712    vabsd.hu       vr11,    vr3,     vr11 //tldiff
    713 
    714    vsle.hu        vr3,     vr5,     vr6
    715    vbitsel.v      vr7,     vr0,     vr2,    vr3
    716    vsle.hu        vr3,     vr4,     vr5
    717    vsle.hu        vr8,     vr4,     vr6
    718    vand.v         vr3,     vr3,     vr8
    719    vbitsel.v      vr3,     vr7,     vr1,    vr3
    720    vsrlni.b.h     vr3,     vr3,     0
    721 
    722    vsle.hu        vr12,    vr5,     vr11
    723    vbitsel.v      vr7,     vr0,     vr9,    vr12
    724    vsle.hu        vr12,    vr10,    vr5
    725    vsle.hu        vr8,     vr10,    vr11
    726    vand.v         vr12,    vr12,    vr8
    727    vbitsel.v      vr12,    vr7,     vr1,    vr12
    728    vsrlni.b.h     vr12,    vr12,    0
    729 
    730    vpermi.w       vr12,    vr3,     0x44
    731 
    732    vst            vr12,    a0,      16
    733 
    734    b              .IPRED_PAETH_W_LOOPEND
    735 
    736 .IPRED_PAETH_W_LOOP16:
    737    andi           a5,      a3,      16
    738    beqz           a5,      .IPRED_PAETH_W_LOOP8
    739 
    740    vld            vr2,     a7,      0   //top
    741    vpermi.w       vr9,     vr2,     0x0e
    742    vsllwil.hu.bu  vr2,     vr2,     0
    743    vsllwil.hu.bu  vr9,     vr9,     0
    744 
    745    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    746    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    747    vabsd.hu       vr10,    vr0,     vr9
    748 
    749    vadd.h         vr3,     vr0,     vr0
    750    vadd.h         vr6,     vr1,     vr2
    751    vadd.h         vr11,    vr1,     vr9
    752    vabsd.hu       vr6,     vr3,     vr6  //tldiff
    753    vabsd.hu       vr11,    vr3,     vr11 //tldiff
    754 
    755    vsle.hu        vr3,     vr5,     vr6
    756    vbitsel.v      vr7,     vr0,     vr2,    vr3
    757    vsle.hu        vr3,     vr4,     vr5
    758    vsle.hu        vr8,     vr4,     vr6
    759    vand.v         vr3,     vr3,     vr8
    760    vbitsel.v      vr3,     vr7,     vr1,    vr3
    761    vsrlni.b.h     vr3,     vr3,     0
    762 
    763    vsle.hu        vr12,    vr5,     vr11
    764    vbitsel.v      vr7,     vr0,     vr9,    vr12
    765    vsle.hu        vr12,    vr10,    vr5
    766    vsle.hu        vr8,     vr10,    vr11
    767    vand.v         vr12,    vr12,    vr8
    768    vbitsel.v      vr12,    vr7,     vr1,    vr12
    769    vsrlni.b.h     vr12,    vr12,    0
    770 
    771    vpermi.w       vr12,    vr3,     0x44
    772 
    773    vst            vr12,    a0,      0
    774 
    775    b              .IPRED_PAETH_W_LOOPEND
    776 
    777 .IPRED_PAETH_W_LOOP8:
    778    andi           a5,      a3,      8
    779    beqz           a5,      .IPRED_PAETH_W_LOOP4
    780 
    781    fld.d          f2,      a7,      0   //top
    782    vsllwil.hu.bu  vr2,     vr2,     0
    783 
    784    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    785    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    786 
    787    vadd.h         vr3,     vr0,     vr0
    788    vadd.h         vr6,     vr1,     vr2
    789    vabsd.hu       vr6,     vr3,     vr6 //tldiff
    790 
    791    vsle.hu        vr3,     vr5,     vr6
    792    vbitsel.v      vr7,     vr0,     vr2,    vr3
    793    vsle.hu        vr3,     vr4,     vr5
    794    vsle.hu        vr8,     vr4,     vr6
    795    vand.v         vr3,     vr3,     vr8
    796    vbitsel.v      vr3,     vr7,     vr1,    vr3
    797    vsrlni.b.h     vr3,     vr3,     0
    798    fst.d          f3,      a0,      0
    799 
    800    b              .IPRED_PAETH_W_LOOPEND
    801 
    802 .IPRED_PAETH_W_LOOP4:
    803    andi           a5,      a3,      4
    804    beqz           a5,      .IPRED_PAETH_W_LOOPEND
    805 
    806    fld.s          f2,      a7,      0   //top
    807    vsllwil.hu.bu  vr2,     vr2,     0
    808 
    809    vabsd.hu       vr5,     vr0,     vr1  //tdiff
    810    vabsd.hu       vr4,     vr0,     vr2  //ldiff
    811 
    812    vadd.h         vr3,     vr0,     vr0
    813    vadd.h         vr6,     vr1,     vr2
    814    vabsd.hu       vr6,     vr3,     vr6 //tldiff
    815 
    816    vsle.hu        vr3,     vr5,     vr6
    817    vbitsel.v      vr7,     vr0,     vr2,    vr3
    818    vsle.hu        vr3,     vr4,     vr5
    819    vsle.hu        vr8,     vr4,     vr6
    820    vand.v         vr3,     vr3,     vr8
    821    vbitsel.v      vr3,     vr7,     vr1,    vr3
    822    vsrlni.b.h     vr3,     vr3,     0
    823    fst.s          f3,      a0,      0
    824 
    825    b              .IPRED_PAETH_W_LOOPEND
    826 
    827 .IPRED_PAETH_W_LOOPEND:
    828    add.d         a0,       a0,      a1
    829    addi.d        a4,       a4,      -1
    830    bnez          a4,       .IPRED_PAETH_H_LOOP
    831 endfunc
    832 
// Smoothing-weight table used by the SMOOTH intra predictors below
// (ipred_smooth / _v / _h).  For each block size bs, entry i is the
// weight (out of 256) given to the near edge pixel at position i; the
// far (right/bottom) edge pixel receives 256 - weight.
const dav1d_sm_weights
   .byte  0,   0
   // bs = 2
   .byte  255, 128
   // bs = 4
   .byte  255, 149,  85,  64
   // bs = 8
   .byte  255, 197, 146, 105,  73,  50,  37,  32
   // bs = 16
   .byte  255, 225, 196, 170, 145, 123, 102,  84
   .byte  68,  54,  43,  33,  26,  20,  17,  16
   // bs = 32
   .byte  255, 240, 225, 210, 196, 182, 169, 157
   .byte  145, 133, 122, 111, 101,  92,  83,  74
   .byte  66,  59,  52,  45,  39,  34,  29,  25
   .byte  21,  17,  14,  12,  10,   9,   8,   8
   // bs = 64
   .byte  255, 248, 240, 233, 225, 218, 210, 203
   .byte  196, 189, 182, 176, 169, 163, 156, 150
   .byte  144, 138, 133, 127, 121, 116, 111, 106
   .byte  101,  96,  91,  86,  82,  77,  73,  69
   .byte  65,  61,  57,  54,  50,  47,  44,  41
   .byte  38,  35,  32,  29,  27,  25,  22,  20
   .byte  18,  16,  15,  13,  12,  10,   9,   8
   .byte  7,   6,   6,   5,   5,   4,   4,   4
endconst
    859 
// void ipred_smooth_lsx(pixel *dst, const ptrdiff_t stride,
//                       const pixel *const topleft,
//                       const int width, const int height, const int a,
//                       const int max_width, const int max_height
//                       HIGHBD_DECL_SUFFIX)
//
// SMOOTH intra prediction: each pixel is a weighted blend of top[x],
// left[y], the right edge pixel topleft[width] and the bottom edge
// pixel topleft[-height]:
//   pred = (ver[y]*top[x] + (256-ver[y])*bottom
//         + hor[x]*left[y] + (256-hor[x])*right + 256) >> 9
// The inner loop emits 4 pixels per iteration (width>>2 iterations),
// working in u32 lanes to keep the weighted sum exact.
// Register roles:
//   a0=dst  a1=stride  a3=width  a4=height (row counter)
//   a5=ver weight ptr (sm_weights+height, advanced per row)
//   a6=hor weight ptr (sm_weights+width)
//   t0=&topleft[1] (top row)  t3=&topleft[-1-y] (left column, walked down)
//   vr0=right  vr1=bottom  vr6=256  vr2=ver[y]  vr7=256-ver[y]  vr3=left[y]
function ipred_smooth_8bpc_lsx
   la.local       a5,      dav1d_sm_weights
   add.d          a6,      a5,      a3  //hor = &sm_weights[width]
   add.d          a5,      a5,      a4  //ver = &sm_weights[height]

   add.d          a7,      a2,      a3
   sub.d          t0,      a2,      a4

   vldrepl.b      vr0,     a7,      0  //right = topleft[width]
   vldrepl.b      vr1,     t0,      0  //bottom = topleft[-height]

   // widen right/bottom u8 -> u16 -> u32
   vsllwil.hu.bu  vr0,     vr0,     0
   vsllwil.wu.hu  vr0,     vr0,     0
   vsllwil.hu.bu  vr1,     vr1,     0
   vsllwil.wu.hu  vr1,     vr1,     0

   li.w           t0,      256
   vreplgr2vr.w   vr6,     t0

   addi.d         t0,      a2,      1   //ptr topleft[x]
   addi.d         t3,      a2,      -1  //ptr topleft[y]

.IPRED_SMOOTH_H_LOOP:
   vldrepl.b      vr2,     a5,      0  //ver[y]
   vldrepl.b      vr3,     t3,      0  //topleft[y] (left pixel of this row)

   vsllwil.hu.bu  vr2,     vr2,     0
   vsllwil.wu.hu  vr2,     vr2,     0
   vsllwil.hu.bu  vr3,     vr3,     0
   vsllwil.wu.hu  vr3,     vr3,     0

   vsub.w         vr7,     vr6,     vr2  //256-ver[y]

   or             t1,      zero,    zero  //xx (byte offset into the row)
   srai.d         t2,      a3,      2     //loop max: width/4

.IPRED_SMOOTH_W_LOOP:
   fldx.s         f4,      t0,      t1   //topleft[x] (4 top pixels)
   fldx.s         f5,      a6,      t1   //hor[x] (4 weights)

   vsllwil.hu.bu  vr4,     vr4,     0
   vsllwil.wu.hu  vr4,     vr4,     0
   vsllwil.hu.bu  vr5,     vr5,     0
   vsllwil.wu.hu  vr5,     vr5,     0

   vsub.w         vr8,     vr6,     vr5  //256-hor[x]

   vmul.w         vr9,     vr8,     vr0  //(256-hor[x])*right
   vmadd.w        vr9,     vr5,     vr3  //+ hor[x]*left[y]
   vmadd.w        vr9,     vr7,     vr1  //+ (256-ver[y])*bottom
   vmadd.w        vr9,     vr2,     vr4  //pred: + ver[y]*top[x]

   vadd.w         vr9,     vr9,     vr6  //+256 rounding bias
   vsrlni.h.w     vr9,     vr9,     9    //>>9, narrow to u16
   vsrlni.b.h     vr9,     vr9,     0    //narrow to u8

   fstx.s         f9,      a0,      t1

   addi.d         t1,      t1,      4
   addi.d         t2,      t2,      -1
   bnez           t2,      .IPRED_SMOOTH_W_LOOP

.IPRED_SMOOTH_W_LOOP_END:
   addi.d         t3,      t3,      -1  //next left pixel (one row down)
   addi.d         a5,      a5,      1   //next vertical weight
   add.d          a0,      a0,      a1
   addi.d         a4,      a4,      -1
   bnez           a4,      .IPRED_SMOOTH_H_LOOP

endfunc
    935 
// void ipred_smooth_v_lsx(pixel *dst, const ptrdiff_t stride,
//                         const pixel *const topleft,
//                         const int width, const int height, const int a,
//                         const int max_width, const int max_height
//                         HIGHBD_DECL_SUFFIX)
//
// SMOOTH_V intra prediction: vertical-only blend of top[x] and the
// bottom edge pixel topleft[-height]:
//   pred = (ver[y]*top[x] + (256-ver[y])*bottom + 128) >> 8
// Rows of width>=8 take the 8-pixel loop; a width of 4 takes the
// 4-pixel tail path (widths are powers of two, so the two never mix).
// Register roles:
//   a0=dst  a1=stride  a3=width  a4=height (row counter)
//   a5=ver weight ptr  t0=&topleft[1]
//   vr0=bottom  vr2=256  vr3=128 (rounding bias)  vr1=ver[y]  vr5=256-ver[y]
function ipred_smooth_v_8bpc_lsx
   la.local       a5,      dav1d_sm_weights
   add.d          a5,      a5,      a4  //ver = &sm_weights[height]

   sub.d          t0,      a2,      a4
   vldrepl.b      vr0,     t0,      0  //bottom = topleft[-height]
   vsllwil.hu.bu  vr0,     vr0,     0

   li.w           t0,      256
   vreplgr2vr.h   vr2,     t0
   li.w           t0,      128
   vreplgr2vr.h   vr3,     t0  //rounding bias

   addi.d         t0,      a2,      1   //ptr topleft[x]

.IPRED_SMOOTH_V_H_LOOP:
   vldrepl.b      vr1,     a5,      0  //ver[y]
   vsllwil.hu.bu  vr1,     vr1,     0
   vsub.h         vr5,     vr2,     vr1  //256-ver[y]

   or             t1,      zero,    zero  //xx (byte offset into the row)
   srai.d         t2,      a3,      3     //loop max: width/8
   beqz           t2,      .IPRED_SMOOTH_V_W_LOOP4

.IPRED_SMOOTH_V_W_LOOP8:
   fldx.d         f4,      t0,      t1   //topleft[x] (8 top pixels)
   vsllwil.hu.bu  vr4,     vr4,     0

   vmul.h         vr6,     vr5,     vr0  //(256-ver[y])*bottom
   vmadd.h        vr6,     vr1,     vr4  //pred: + ver[y]*top[x]
   vadd.h         vr6,     vr6,     vr3  //+128
   vsrlni.b.h     vr6,     vr6,     8    //>>8, narrow to u8

   fstx.d         f6,      a0,      t1

   addi.d         t1,      t1,      8
   addi.d         t2,      t2,      -1
   bnez           t2,      .IPRED_SMOOTH_V_W_LOOP8
   b              .IPRED_SMOOTH_V_W_LOOP_END

.IPRED_SMOOTH_V_W_LOOP4:
   fldx.s         f4,      t0,      t1   //topleft[x] (4 top pixels)
   vsllwil.hu.bu  vr4,     vr4,     0

   vmul.h         vr6,     vr5,     vr0
   vmadd.h        vr6,     vr1,     vr4  //pred
   vadd.h         vr6,     vr6,     vr3
   vsrai.h        vr6,     vr6,     8
   vsrlni.b.h     vr6,     vr6,     0

   fstx.s         f6,      a0,      t1

   addi.d         t1,      t1,      4

.IPRED_SMOOTH_V_W_LOOP_END:
   addi.d         a5,      a5,      1  //next vertical weight
   add.d          a0,      a0,      a1
   addi.d         a4,      a4,      -1
   bnez           a4,      .IPRED_SMOOTH_V_H_LOOP

endfunc
   1002 
   1003 // void ipred_smooth_h_lsx(pixel *dst, const ptrdiff_t stride,
   1004 //                         const pixel *const topleft,
   1005 //                         const int width, const int height, const int a,
   1006 //                         const int max_width, const int max_height
   1007 //                         HIGHBD_DECL_SUFFIX)
   1008 function ipred_smooth_h_8bpc_lsx
   1009    la.local       a5,      dav1d_sm_weights
   1010    add.d          a6,      a5,      a3  //hor
   1011 
   1012    add.d          a7,      a2,      a3
   1013    vldrepl.b      vr0,     a7,      0  //right
   1014    vsllwil.hu.bu  vr0,     vr0,     0
   1015 
   1016    li.w           t0,      256
   1017    vreplgr2vr.h   vr1,     t0
   1018    li.w           t0,      128
   1019    vreplgr2vr.h   vr2,     t0
   1020 
   1021    addi.d         t3,      a2,      -1  //ptr topleft[y]
   1022 
   1023 .IPRED_SMOOTH_H_H_LOOP:
   1024    vldrepl.b      vr3,     t3,      0  //topleft[y]
   1025    vsllwil.hu.bu  vr3,     vr3,     0
   1026 
   1027    or             t1,      zero,    zero  //xx
   1028    srai.d         t2,      a3,      3     //loop max
   1029    beqz           t2,      .IPRED_SMOOTH_H_W_LOOP4
   1030 
   1031 .IPRED_SMOOTH_H_W_LOOP8:
   1032    fldx.d         f5,      a6,      t1   //hor[x]
   1033    vsllwil.hu.bu  vr5,     vr5,     0
   1034    vsub.h         vr4,     vr1,     vr5  //256-hor[x]
   1035 
   1036    vmul.h         vr6,     vr4,     vr0
   1037    vmadd.h        vr6,     vr5,     vr3  //pred
   1038    vadd.h         vr6,     vr6,     vr2
   1039    vsrlni.b.h     vr6,     vr6,     8
   1040 
   1041    fstx.d         f6,      a0,      t1
   1042 
   1043    addi.d         t1,      t1,      8
   1044    addi.d         t2,      t2,      -1
   1045    bnez           t2,      .IPRED_SMOOTH_H_W_LOOP8
   1046    b              .IPRED_SMOOTH_W_H_LOOP_END
   1047 
   1048 .IPRED_SMOOTH_H_W_LOOP4:
   1049    fldx.s         f5,      a6,      t1   //hor[x]
   1050    vsllwil.hu.bu  vr5,     vr5,     0
   1051    vsub.h         vr4,     vr1,     vr5  //256-hor[x]
   1052 
   1053    vmul.h         vr6,     vr4,     vr0
   1054    vmadd.h        vr6,     vr5,     vr3  //pred
   1055    vadd.h         vr6,     vr6,     vr2
   1056    vsrai.h        vr6,     vr6,     8
   1057    vsrlni.b.h     vr6,     vr6,     0
   1058 
   1059    fstx.s         f6,      a0,      t1
   1060 
   1061    addi.d         t1,      t1,      4
   1062 
   1063 .IPRED_SMOOTH_W_H_LOOP_END:
   1064    addi.d         t3,      t3,      -1
   1065    add.d          a0,      a0,      a1
   1066    addi.d         a4,      a4,      -1
   1067    bnez           a4,      .IPRED_SMOOTH_H_H_LOOP
   1068 
   1069 endfunc
   1070 
// void pal_pred_lsx(pixel *dst, const ptrdiff_t stride,
//                   const pixel *const pal, const uint8_t *idx,
//                   const int w, const int h)
//
// Palette prediction: idx packs two palette indices per byte (low
// nibble = earlier pixel, high nibble = next pixel); each index selects
// one entry of the 8-byte palette loaded from pal, via vshuf.b.
// The code dispatches on w (fixed for the whole call, but re-tested
// each iteration) and writes 4 rows per iteration for w<=32, 1 row per
// iteration for w=64.
// Register roles: a0=dst a1=stride a2=pal a3=idx a4=w
//                 a5=h (row counter for the w=64 path)  a7=h/4 (others)
function pal_pred_8bpc_lsx
   srai.d         a7,      a5,      2

.PAL_PRED_WLOOP4:
   andi           a6,      a4,      4
   beqz           a6,      .PAL_PRED_WLOOP8
   fld.d          f0,      a3,      0   //8 idx bytes = 16 pixels = 4 rows of 4
   vsrli.b        vr1,     vr0,     4   //high nibbles
   // NOTE(review): masks only 3 bits of the low nibble; equivalent to
   // &15 since a palette has at most 8 entries (indices 0..7)
   vandi.b        vr2,     vr0,     7
   vilvl.b        vr0,     vr1,     vr2 //interleave: low nibble first
   fld.d          f1,      a2,      0   //palette
   vshuf.b        vr2,     vr1,     vr1,    vr0  //look up palette entries

   vstelm.w       vr2,     a0,      0,      0
   add.d          a0,      a0,      a1
   vstelm.w       vr2,     a0,      0,      1
   add.d          a0,      a0,      a1
   vstelm.w       vr2,     a0,      0,      2
   add.d          a0,      a0,      a1
   vstelm.w       vr2,     a0,      0,      3
   add.d          a0,      a0,      a1

   addi.d         a3,      a3,      8
   addi.d         a7,      a7,      -1
   bnez           a7,      .PAL_PRED_WLOOP4
   b              .PAL_PRED_END

.PAL_PRED_WLOOP8:
   andi           a6,      a4,      8
   beqz           a6,      .PAL_PRED_WLOOP16

   vld            vr0,     a3,      0   //16 idx bytes = 32 pixels = 4 rows of 8
   vsrli.b        vr1,     vr0,     4
   vandi.b        vr2,     vr0,     7
   vilvl.b        vr0,     vr1,     vr2 //rows 0-1 indices
   vilvh.b        vr3,     vr1,     vr2 //rows 2-3 indices
   fld.d          f1,      a2,      0   //palette
   vshuf.b        vr0,     vr1,     vr1,    vr0
   vshuf.b        vr3,     vr1,     vr1,    vr3

   vstelm.d       vr0,     a0,      0,      0
   add.d          a0,      a0,      a1
   vstelm.d       vr0,     a0,      0,      1
   add.d          a0,      a0,      a1

   vstelm.d       vr3,     a0,      0,      0
   add.d          a0,      a0,      a1
   vstelm.d       vr3,     a0,      0,      1
   add.d          a0,      a0,      a1

   addi.d         a3,      a3,      16
   addi.d         a7,      a7,      -1
   bnez           a7,      .PAL_PRED_WLOOP8
   b              .PAL_PRED_END

.PAL_PRED_WLOOP16:
   andi           a6,      a4,      16
   beqz           a6,      .PAL_PRED_WLOOP32

   vld            vr0,     a3,      0   //32 idx bytes = 64 pixels = 4 rows of 16
   vld            vr1,     a3,      16
   fld.d          f6,      a2,      0   //palette
   vsrli.b        vr2,     vr0,     4
   vandi.b        vr3,     vr0,     7
   vsrli.b        vr4,     vr1,     4
   vandi.b        vr5,     vr1,     7
   vilvl.b        vr0,     vr2,     vr3 //row 0 indices
   vilvh.b        vr1,     vr2,     vr3 //row 1 indices
   vilvl.b        vr2,     vr4,     vr5 //row 2 indices
   vilvh.b        vr3,     vr4,     vr5 //row 3 indices
   vshuf.b        vr0,     vr6,     vr6,    vr0
   vshuf.b        vr1,     vr6,     vr6,    vr1
   vshuf.b        vr2,     vr6,     vr6,    vr2
   vshuf.b        vr3,     vr6,     vr6,    vr3

   vst            vr0,     a0,      0
   add.d          a0,      a0,      a1
   vst            vr1,     a0,      0
   add.d          a0,      a0,      a1
   vst            vr2,     a0,      0
   add.d          a0,      a0,      a1
   vst            vr3,     a0,      0
   add.d          a0,      a0,      a1

   addi.d         a3,      a3,      32
   addi.d         a7,      a7,      -1
   bnez           a7,      .PAL_PRED_WLOOP16
   b              .PAL_PRED_END

.PAL_PRED_WLOOP32:
   andi           a6,      a4,      32
   beqz           a6,      .PAL_PRED_WLOOP64

   vld            vr0,     a3,      0   //64 idx bytes = 128 pixels = 4 rows of 32
   vld            vr1,     a3,      16
   vld            vr2,     a3,      32
   vld            vr3,     a3,      48
   fld.d          f4,      a2,      0   //palette
   vsrli.b        vr5,     vr0,     4
   vandi.b        vr6,     vr0,     7
   vsrli.b        vr7,     vr1,     4
   vandi.b        vr8,     vr1,     7
   vsrli.b        vr9,     vr2,     4
   vandi.b        vr10,    vr2,     7
   vsrli.b        vr11,    vr3,     4
   vandi.b        vr12,    vr3,     7
   vilvl.b        vr0,     vr5,     vr6
   vilvh.b        vr1,     vr5,     vr6
   vilvl.b        vr2,     vr7,     vr8
   vilvh.b        vr3,     vr7,     vr8
   vilvl.b        vr5,     vr9,     vr10
   vilvh.b        vr6,     vr9,     vr10
   vilvl.b        vr7,     vr11,    vr12
   vilvh.b        vr8,     vr11,    vr12
   vshuf.b        vr0,     vr4,     vr4,    vr0
   vshuf.b        vr1,     vr4,     vr4,    vr1
   vshuf.b        vr2,     vr4,     vr4,    vr2
   vshuf.b        vr3,     vr4,     vr4,    vr3
   vshuf.b        vr5,     vr4,     vr4,    vr5
   vshuf.b        vr6,     vr4,     vr4,    vr6
   vshuf.b        vr7,     vr4,     vr4,    vr7
   vshuf.b        vr8,     vr4,     vr4,    vr8

   vst            vr0,     a0,      0
   vst            vr1,     a0,      16
   add.d          a0,      a0,      a1
   vst            vr2,     a0,      0
   vst            vr3,     a0,      16
   add.d          a0,      a0,      a1
   vst            vr5,     a0,      0
   vst            vr6,     a0,      16
   add.d          a0,      a0,      a1
   vst            vr7,     a0,      0
   vst            vr8,     a0,      16
   add.d          a0,      a0,      a1

   addi.d         a3,      a3,      64
   addi.d         a7,      a7,      -1
   bnez           a7,      .PAL_PRED_WLOOP32
   b              .PAL_PRED_END

.PAL_PRED_WLOOP64:
   vld            vr0,     a3,      0   //32 idx bytes = one 64-pixel row
   vld            vr1,     a3,      16
   fld.d          f2,      a2,      0   //palette
   vsrli.b        vr3,     vr0,     4
   vandi.b        vr4,     vr0,     7
   vsrli.b        vr5,     vr1,     4
   vandi.b        vr6,     vr1,     7
   vilvl.b        vr0,     vr3,     vr4
   vilvh.b        vr1,     vr3,     vr4
   vilvl.b        vr3,     vr5,     vr6
   vilvh.b        vr4,     vr5,     vr6
   vshuf.b        vr0,     vr2,     vr2,    vr0
   vshuf.b        vr1,     vr2,     vr2,    vr1
   vshuf.b        vr3,     vr2,     vr2,    vr3
   vshuf.b        vr4,     vr2,     vr2,    vr4

   vst            vr0,     a0,      0
   vst            vr1,     a0,      16
   vst            vr3,     a0,      32
   vst            vr4,     a0,      48

   add.d          a0,      a0,      a1
   addi.d         a3,      a3,      32
   addi.d         a5,      a5,      -1  //one row per iteration here
   bnez           a5,      .PAL_PRED_WLOOP64

.PAL_PRED_END:
endfunc
   1244 
// Per 16-bit lane: out = (s < 0) ? -v : v  (applies the sign of s to
// the magnitude v).  Clobbers s and v; vrzero must hold 0; vrt0 is
// scratch.
.macro apply_sign_vrh v, s, vrzero, vrt0 ,out
   vslt.h         \vrt0,   \s,      \vrzero  //lane mask: s < 0
   vandn.v        \s,      \vrt0,   \v       //v in lanes where s >= 0
   vsigncov.h     \v,      \vrt0,   \v       //-v where s < 0, 0 elsewhere
   vor.v          \out,    \s,      \v       //merge the two halves
.endm
   1251 
// Per 16-bit lane: out = clamp(in0, in1, in2), with in1 the lower and
// in2 the upper bound (used below with in1 = 0, in2 = 255 to clip to
// pixel range).  Clobbers in0; tmp0/tmp1 are scratch.
.macro iclip_pixel_vrh in0, in1, in2, tmp0, tmp1, out
   vmin.h         \tmp0,   \in2,    \in0     //min(in0, hi)
   vslt.h         \in0,    \in0,    \in1     //lane mask: in0 < lo
   vand.v         \tmp1,   \in0,    \in1     //lo where below bound
   vandn.v        \tmp0,   \in0,    \tmp0    //min(in0, hi) elsewhere
   vor.v          \out,    \tmp1,   \tmp0
.endm
   1259 
// Shared CFL prediction loop used by all ipred_cfl_* entry points:
//   dst[x] = iclip(dc + apply_sign((|alpha*ac[x]| + 32) >> 6,
//                                  alpha*ac[x]), 0, 255)
// ac points to int16 coefficients, so the ac row stride is 2*w bytes
// (t4).  Label 2: handles 8 pixels per iteration (w >= 8); label 3:
// handles the w == 4 case.  Clobbers t1-t4 and vr0-vr8.
.macro ipred_cfl_pred dst, stride, w, h, dc, ac, alpha
   vreplgr2vr.h   vr2,     \alpha
   vreplgr2vr.h   vr7,     \dc
   li.w           t1,      32
   vreplgr2vr.h   vr3,     t1   //rounding bias
   vxor.v         vr4,     vr4,     vr4  //zero (low clip bound)
   li.w           t1,      255
   vreplgr2vr.h   vr6,     t1   //pixel max (high clip bound)
   add.d          t4,      \w,      \w   //ac row stride in bytes (int16)

1: // per-row loop
   or             t1,      zero,    zero  //ac byte offset
   or             t2,      zero,    zero  //dst byte offset
   srai.d         t3,      \w,      3     //8-pixel iterations
   beqz           t3,      3f

2: // 8 pixels per iteration
   vldx           vr0,     \ac,     t1
   vmul.h         vr1,     vr2,     vr0   //alpha*ac
   vadda.h        vr0,     vr1,     vr3   //|alpha*ac| + 32
   vsrai.h        vr0,     vr0,     6
   apply_sign_vrh vr0, vr1, vr4, vr5, vr0 //restore the sign of alpha*ac
   vadd.h         vr1,     vr0,     vr7   //+ dc
   iclip_pixel_vrh vr1, vr4, vr6, vr5, vr8, vr0  //clip to [0,255]
   vsrlni.b.h     vr0,     vr0,     0     //narrow to u8
   fstx.d         f0,      \dst,    t2

   addi.d         t1,      t1,      16
   addi.d         t2,      t2,      8
   addi.d         t3,      t3,      -1
   bnez           t3,      2b
   b              4f

3: // w == 4 tail: same computation, 4-byte store
   fld.d          f0,      \ac,     0
   vmul.h         vr1,     vr2,     vr0
   vadda.h        vr0,     vr1,     vr3
   vsrai.h        vr0,     vr0,     6
   apply_sign_vrh vr0, vr1, vr4, vr5, vr0
   vadd.h         vr1,     vr0,     vr7
   iclip_pixel_vrh vr1, vr4, vr6, vr5, vr8, vr0
   vsrlni.b.h     vr0,     vr0,     0
   fst.s          f0,      \dst,    0

4: // advance to the next row
   add.d          \ac,     \ac,     t4
   add.d          \dst,    \dst,    \stride
   addi.d         \h,      \h,      -1
   bnez           \h,      1b
.endm
   1310 
// CFL with DC from both edges: ipred_dc_gen computes the average of
// the top and left edge pixels into t0, which feeds ipred_cfl_pred.
function ipred_cfl_8bpc_lsx
   ipred_dc_gen   a2, a3, a4
   ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
   1315 
// CFL with DC from the top edge only (dc in t0 — presumably produced
// by ipred_dc_gen_top, defined elsewhere in this file).
function ipred_cfl_top_8bpc_lsx
   ipred_dc_gen_top   a2, a3
   ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
   1320 
// CFL with DC from the left edge only (dc in t0 — presumably produced
// by ipred_dc_gen_left, defined elsewhere in this file).
function ipred_cfl_left_8bpc_lsx
   ipred_dc_gen_left   a2, a4
   ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
   1325 
// CFL with a fixed DC of 128 (half of the 8-bit pixel range), used
// when no neighbouring edge pixels are available.
function ipred_cfl_128_8bpc_lsx
   li.w           t0,      128
   ipred_cfl_pred a0, a1, a3, a4, t0, a5, a6
endfunc
   1330 
// Filter-intra tap tables, one 56-byte set per filter mode (arr0-arr4,
// selected by filt_idx*56 in ipred_filter below).  Each set is 7 rows
// of 8 signed bytes: row i holds the weight of input pixel p_i for each
// of the 8 predicted pixels, where (per ipred_filter_load_p) p0 is the
// top-left pixel, p1-p4 the four top pixels and p5-p6 the two left
// pixels of the current 4x2 tile.
const dav1d_filter_intra_taps_lsx
   //arr0  8*7
.byte    -6, -5, -3, -3, -4, -3, -3, -3
.byte    10,  2,  1,  1,  6,  2,  2,  1
.byte    0, 10,  1,  1,  0,  6,  2,  2
.byte    0,  0, 10,  2,  0,  0,  6,  2
.byte    0,  0,  0, 10,  0,  0,  0,  6
.byte    12,  9,  7,  5,  2,  2,  2,  3
.byte    0,  0,  0,  0, 12,  9,  7,  5
   //arr1
.byte    -10,  -6,  -4,  -2, -10,  -6,  -4,  -2
.byte    16,   0,   0,   0,  16,   0,   0,   0
.byte    0,  16,   0,   0,   0,  16,   0,   0
.byte    0,   0,  16,   0,   0,   0,  16,   0
.byte    0,   0,   0,  16,   0,   0,   0,  16
.byte    10,   6,   4,   2,   0,   0,   0,   0
.byte    0,   0,   0,   0,  10,   6,   4,   2
   //arr2
.byte    -8,  -8,  -8,  -8,  -4,  -4,  -4,  -4
.byte    8,   0,   0,   0,   4,   0,   0,   0
.byte    0,   8,   0,   0,   0,   4,   0,   0
.byte    0,   0,   8,   0,   0,   0,   4,   0
.byte    0,   0,   0,   8,   0,   0,   0,   4
.byte    16,  16,  16,  16,   0,   0,   0,   0
.byte    0,   0,   0,   0,  16,  16,  16,  16
   //arr3
.byte    -2,  -1,  -1,   0,  -1,  -1,  -1,  -1
.byte    8,   3,   2,   1,   4,   3,   2,   2
.byte    0,   8,   3,   2,   0,   4,   3,   2
.byte    0,   0,   8,   3,   0,   0,   4,   3
.byte    0,   0,   0,   8,   0,   0,   0,   4
.byte    10,   6,   4,   2,   3,   4,   4,   3
.byte    0,   0,   0,   0,  10,   6,   4,   3
   //arr4
.byte    -12, -10,  -9,  -8, -10,  -9,  -8,  -7
.byte    14,   0,   0,   0,  12,   1,   0,   0
.byte    0,  14,   0,   0,   0,  12,   0,   0
.byte    0,   0,  14,   0,   0,   0,  12,   1
.byte    0,   0,   0,  14,   0,   0,   0,  12
.byte    14,  12,  11,  10,   0,   0,   1,   1
.byte    0,   0,   0,   0,  14,  12,  11,   9
endconst
   1373 
// Broadcast the 7 filter-intra input pixels into vr0-vr6 and widen
// them to u16: vr0 = top-left (*t0), vr1-vr4 = the four top pixels
// (a7[0..3]), vr5 = left[0] (*t1), vr6 = left[1] (t1[-1], i.e. one
// pixel further down the left edge).
.macro ipred_filter_load_p
   vldrepl.b      vr0,     t0,      0
   vldrepl.b      vr1,     a7,      0
   vldrepl.b      vr2,     a7,      1
   vldrepl.b      vr3,     a7,      2
   vldrepl.b      vr4,     a7,      3
   vldrepl.b      vr5,     t1,      0
   vldrepl.b      vr6,     t1,      -1

   vsllwil.hu.bu  vr0,     vr0,     0
   vsllwil.hu.bu  vr1,     vr1,     0
   vsllwil.hu.bu  vr2,     vr2,     0
   vsllwil.hu.bu  vr3,     vr3,     0
   vsllwil.hu.bu  vr4,     vr4,     0
   vsllwil.hu.bu  vr5,     vr5,     0
   vsllwil.hu.bu  vr6,     vr6,     0
.endm
   1391 
// Same as ipred_filter_load_p, but for tiles past the first column:
// t1 points into dst here (see the callers, which set t1 = a0 + x),
// so the second left pixel is fetched from one dst row down (t1 + a1,
// a1 = stride) via a scalar load + broadcast, since vldrepl has no
// register-indexed form.
.macro ipred_filter_loadx_p
   vldrepl.b      vr0,     t0,      0
   vldrepl.b      vr1,     a7,      0
   vldrepl.b      vr2,     a7,      1
   vldrepl.b      vr3,     a7,      2
   vldrepl.b      vr4,     a7,      3
   vldrepl.b      vr5,     t1,      0
   ldx.bu         t3,      t1,      a1
   vreplgr2vr.b   vr6,     t3

   vsllwil.hu.bu  vr0,     vr0,     0
   vsllwil.hu.bu  vr1,     vr1,     0
   vsllwil.hu.bu  vr2,     vr2,     0
   vsllwil.hu.bu  vr3,     vr3,     0
   vsllwil.hu.bu  vr4,     vr4,     0
   vsllwil.hu.bu  vr5,     vr5,     0
   vsllwil.hu.bu  vr6,     vr6,     0
.endm
   1410 
// Load the 7 tap rows (8 signed bytes each) of the current filter
// from a6 (dav1d_filter_intra_taps_lsx + filt_idx*56) and sign-extend
// them to int16 in vr7-vr13.
.macro ipred_filter_load_fltptr
   fld.d          f7,      a6,      0
   fld.d          f8,      a6,      8
   fld.d          f9,      a6,      16
   fld.d          f10,     a6,      24
   fld.d          f11,     a6,      32
   fld.d          f12,     a6,      40
   fld.d          f13,     a6,      48

   vsllwil.h.b    vr7,     vr7,     0
   vsllwil.h.b    vr8,     vr8,     0
   vsllwil.h.b    vr9,     vr9,     0
   vsllwil.h.b    vr10,    vr10,    0
   vsllwil.h.b    vr11,    vr11,    0
   vsllwil.h.b    vr12,    vr12,    0
   vsllwil.h.b    vr13,    vr13,    0
.endm
   1428 
// acc = sum_i(tap_i * p_i) for the 8 pixels of the tile (taps in
// vr7-vr13 from ipred_filter_load_fltptr, pixels in vr0-vr6 from
// ipred_filter_load[x]_p); round with (acc + 8) >> 4, clip to [0,255]
// (vr14 = 0, vr15 = 255, set up by the caller) and pack the 8 results
// into the low 8 bytes of vr8.  Clobbers vr7-vr10.
.macro ipred_filter_calc_acc
   vmul.h         vr7,     vr7,     vr0
   vmadd.h        vr7,     vr8,     vr1
   vmadd.h        vr7,     vr9,     vr2
   vmadd.h        vr7,     vr10,    vr3
   vmadd.h        vr7,     vr11,    vr4
   vmadd.h        vr7,     vr12,    vr5
   vmadd.h        vr7,     vr13,    vr6
   vaddi.hu       vr7,     vr7,     8    //rounding bias
   vsrai.h        vr7,     vr7,     4
   iclip_pixel_vrh vr7, vr14, vr15, vr9, vr10, vr8
   vsrlni.b.h     vr8,     vr8,     0    //narrow to u8
.endm
   1442 
   1443 // void ipred_filter_lsx(pixel *dst, const ptrdiff_t stride,
   1444 //                       const pixel *const topleft_in,
   1445 //                       const int width, const int height, int filt_idx,
   1446 //                       const int max_width, const int max_height
   1447 //                       HIGHBD_DECL_SUFFIX)
   1448 function ipred_filter_8bpc_lsx
   1449    andi           a5,      a5,      511
   1450    la.local       a6,      dav1d_filter_intra_taps_lsx
   1451    li.w           a7,      56
   1452    mul.w          a7,      a7,      a5
   1453    add.d          a6,      a6,      a7   //*filter
   1454    addi.d         a7,      a2,      1    //*top
   1455    or             a5,      zero,    zero //y
   1456    vxor.v         vr14,    vr14,    vr14
   1457    li.w           t0,      255
   1458    vreplgr2vr.h   vr15,    t0
   1459 
   1460 .FILTER_LOOP_H:
   1461    sub.d          t0,      a2,      a5   //*topleft
   1462    addi.d         t1,      t0,      -1   //left
   1463 
   1464    ctz.w          t2,      a3
   1465    addi.d         t3,      t2,      -2
   1466    beqz           t3,      .FILTER_LOOP_W4
   1467    addi.d         t3,      t2,      -3
   1468    beqz           t3,      .FILTER_LOOP_W8
   1469    addi.d         t3,      t2,      -4
   1470    beqz           t3,      .FILTER_LOOP_W16
   1471    addi.d         t3,      t2,      -5
   1472    beqz           t3,      .FILTER_LOOP_W32
   1473 
   1474 .FILTER_LOOP_W4:
   1475    ipred_filter_load_p
   1476 
   1477    or             t3,      a0,      a0  //*ptr
   1478 
   1479    ipred_filter_load_fltptr
   1480    ipred_filter_calc_acc
   1481 
   1482    fst.s          f8,      t3,      0
   1483    add.d          t3,      t3,      a1
   1484    vstelm.w       vr8,     t3,      0,      1
   1485    add.d          t3,      t3,      a1
   1486 
   1487    b              .FILTER_LOOP_W_END
   1488 
   1489 .FILTER_LOOP_W8:
   1490    ipred_filter_load_p
   1491 
   1492    or             t3,      a0,      a0
   1493 
   1494    ipred_filter_load_fltptr
   1495    ipred_filter_calc_acc
   1496 
   1497    fst.s          f8,      t3,      0
   1498    add.d          t3,      t3,      a1
   1499    vstelm.w       vr8,     t3,      0,      1
   1500    add.d          t3,      t3,      a1
   1501 
   1502    addi.d         t1,      a0,      3
   1503    addi.d         a7,      a7,      4
   1504    addi.d         t0,      a7,      -1
   1505 
   1506    ipred_filter_loadx_p
   1507 
   1508    addi.d         t3,      a0,      4
   1509 
   1510    ipred_filter_load_fltptr
   1511    ipred_filter_calc_acc
   1512 
   1513    fst.s          f8,      t3,      0
   1514    add.d          t3,      t3,      a1
   1515    vstelm.w       vr8,     t3,      0,      1
   1516    add.d          t3,      t3,      a1
   1517 
   1518    b              .FILTER_LOOP_W_END
   1519 
   1520 .FILTER_LOOP_W16:
   1521    ipred_filter_load_p
   1522 
   1523    or             t3,      a0,      a0
   1524 
   1525    ipred_filter_load_fltptr
   1526    ipred_filter_calc_acc
   1527 
   1528    fst.s          f8,      t3,      0
   1529    add.d          t3,      t3,      a1
   1530    vstelm.w       vr8,     t3,      0,      1
   1531    add.d          t3,      t3,      a1
   1532 
   1533    addi.d         t1,      a0,      3
   1534    addi.d         a7,      a7,      4
   1535    addi.d         t0,      a7,      -1
   1536 
   1537    ipred_filter_loadx_p
   1538 
   1539    addi.d         t3,      a0,      4
   1540 
   1541    ipred_filter_load_fltptr
   1542    ipred_filter_calc_acc
   1543 
   1544    fst.s          f8,      t3,      0
   1545    add.d          t3,      t3,      a1
   1546    vstelm.w       vr8,     t3,      0,      1
   1547    add.d          t3,      t3,      a1
   1548 
   1549    addi.d         t1,      a0,      7
   1550    addi.d         a7,      a7,      4
   1551    addi.d         t0,      a7,      -1
   1552 
   1553    ipred_filter_loadx_p
   1554 
   1555    addi.d         t3,      a0,      8
   1556 
   1557    ipred_filter_load_fltptr
   1558    ipred_filter_calc_acc
   1559 
   1560    fst.s          f8,      t3,      0
   1561    add.d          t3,      t3,      a1
   1562    vstelm.w       vr8,     t3,      0,      1
   1563    add.d          t3,      t3,      a1
   1564 
   1565    addi.d         t1,      a0,      11
   1566    addi.d         a7,      a7,      4
   1567    addi.d         t0,      a7,      -1
   1568 
   1569    ipred_filter_loadx_p
   1570 
   1571    addi.d         t3,      a0,      12
   1572 
   1573    ipred_filter_load_fltptr
   1574    ipred_filter_calc_acc
   1575 
   1576    fst.s          f8,      t3,      0
   1577    add.d          t3,      t3,      a1
   1578    vstelm.w       vr8,     t3,      0,      1
   1579    add.d          t3,      t3,      a1
   1580 
   1581    b              .FILTER_LOOP_W_END
   1582 
   1583 .FILTER_LOOP_W32:
   1584    ipred_filter_load_p
   1585 
   1586    or             t3,      a0,      a0
   1587 
   1588    ipred_filter_load_fltptr
   1589    ipred_filter_calc_acc
   1590 
   1591    fst.s          f8,      t3,      0
   1592    add.d          t3,      t3,      a1
   1593    vstelm.w       vr8,     t3,      0,      1
   1594    add.d          t3,      t3,      a1
   1595 
   1596    addi.d         t1,      a0,      3
   1597    addi.d         a7,      a7,      4
   1598    addi.d         t0,      a7,      -1
   1599 
   1600    ipred_filter_loadx_p
   1601 
   1602    addi.d         t3,      a0,      4
   1603 
   1604    ipred_filter_load_fltptr
   1605    ipred_filter_calc_acc
   1606 
   1607    fst.s          f8,      t3,      0
   1608    add.d          t3,      t3,      a1
   1609    vstelm.w       vr8,     t3,      0,      1
   1610    add.d          t3,      t3,      a1
   1611 
   1612    addi.d         t1,      a0,      7
   1613    addi.d         a7,      a7,      4
   1614    addi.d         t0,      a7,      -1
   1615 
   1616    ipred_filter_loadx_p
   1617 
   1618    addi.d         t3,      a0,      8
   1619 
   1620    ipred_filter_load_fltptr
   1621    ipred_filter_calc_acc
   1622 
   1623    fst.s          f8,      t3,      0
   1624    add.d          t3,      t3,      a1
   1625    vstelm.w       vr8,     t3,      0,      1
   1626    add.d          t3,      t3,      a1
   1627 
   1628    addi.d         t1,      a0,      11
   1629    addi.d         a7,      a7,      4
   1630    addi.d         t0,      a7,      -1
   1631 
   1632    ipred_filter_loadx_p
   1633 
   1634    addi.d         t3,      a0,      12
   1635 
   1636    ipred_filter_load_fltptr
   1637    ipred_filter_calc_acc
   1638 
   1639    fst.s          f8,      t3,      0
   1640    add.d          t3,      t3,      a1
   1641    vstelm.w       vr8,     t3,      0,      1
   1642    add.d          t3,      t3,      a1
   1643 
   1644    addi.d         t1,      a0,      15
   1645    addi.d         a7,      a7,      4
   1646    addi.d         t0,      a7,      -1
   1647 
   1648    ipred_filter_loadx_p
   1649 
   1650    addi.d         t3,      a0,      16
   1651 
   1652    ipred_filter_load_fltptr
   1653    ipred_filter_calc_acc
   1654 
   1655    fst.s          f8,      t3,      0
   1656    add.d          t3,      t3,      a1
   1657    vstelm.w       vr8,     t3,      0,      1
   1658    add.d          t3,      t3,      a1
   1659 
   1660    addi.d         t1,      a0,      19
   1661    addi.d         a7,      a7,      4
   1662    addi.d         t0,      a7,      -1
   1663 
   1664    ipred_filter_loadx_p
   1665 
   1666    addi.d         t3,      a0,      20
   1667 
   1668    ipred_filter_load_fltptr
   1669    ipred_filter_calc_acc
   1670 
   1671    fst.s          f8,      t3,      0
   1672    add.d          t3,      t3,      a1
   1673    vstelm.w       vr8,     t3,      0,      1
   1674    add.d          t3,      t3,      a1
   1675 
   1676    addi.d         t1,      a0,      23
   1677    addi.d         a7,      a7,      4
   1678    addi.d         t0,      a7,      -1
   1679 
   1680    ipred_filter_loadx_p
   1681 
   1682    addi.d         t3,      a0,      24
   1683 
   1684    ipred_filter_load_fltptr
   1685    ipred_filter_calc_acc
   1686 
   1687    fst.s          f8,      t3,      0
   1688    add.d          t3,      t3,      a1
   1689    vstelm.w       vr8,     t3,      0,      1
   1690    add.d          t3,      t3,      a1
   1691 
   1692    addi.d         t1,      a0,      27
   1693    addi.d         a7,      a7,      4
   1694    addi.d         t0,      a7,      -1
   1695 
   1696    ipred_filter_loadx_p
   1697 
   1698    addi.d         t3,      a0,      28
   1699 
   1700    ipred_filter_load_fltptr
   1701    ipred_filter_calc_acc
   1702 
   1703    fst.s          f8,      t3,      0
   1704    add.d          t3,      t3,      a1
   1705    vstelm.w       vr8,     t3,      0,      1
   1706    add.d          t3,      t3,      a1
   1707 
   1708 .FILTER_LOOP_W_END:
   1709    add.d          a7,      a0,      a1
   1710    add.d          t2,      a1,      a1
   1711    add.d          a0,      a0,      t2
   1712    addi.d         a5,      a5,      2
   1713    blt            a5,      a4,      .FILTER_LOOP_H
   1714 endfunc
   1715 
// Inverse-slope ("derivative") lookup table for the directional (Z1/Z2/Z3)
// intra predictors.  The table is addressed with a halfword byte offset
// derived from the prediction angle (see the `andi a5, 0xFFE` /
// `ldx.hu` pair in ipred_z1_8bpc_lsx below), so each .short entry serves
// the group of three related angles listed in its comment.
const dav1d_dr_intra_derivative
   // Values that are 0 will never be used
   .short  0         // Angles:
   .short  1023, 0   //  3,  93, 183
   .short  547       //  6,  96, 186
   .short  372, 0, 0 //  9,  99, 189
   .short  273       // 14, 104, 194
   .short  215, 0    // 17, 107, 197
   .short  178       // 20, 110, 200
   .short  151, 0    // 23, 113, 203 (113 & 203 are base angles)
   .short  132       // 26, 116, 206
   .short  116, 0    // 29, 119, 209
   .short  102, 0    // 32, 122, 212
   .short  90        // 36, 126, 216
   .short  80, 0     // 39, 129, 219
   .short  71        // 42, 132, 222
   .short  64, 0     // 45, 135, 225 (45 & 135 are base angles)
   .short  57        // 48, 138, 228
   .short  51, 0     // 51, 141, 231
   .short  45, 0     // 54, 144, 234
   .short  40        // 58, 148, 238
   .short  35, 0     // 61, 151, 241
   .short  31        // 64, 154, 244
   .short  27, 0     // 67, 157, 247 (67 & 157 are base angles)
   .short  23        // 70, 160, 250
   .short  19, 0     // 73, 163, 253
   .short  15, 0     // 76, 166, 256
   .short  11, 0     // 81, 171, 261
   .short  7         // 84, 174, 264
   .short  3         // 87, 177, 267
endconst
   1747 
// 4-tap edge upsampling kernel (-1, 9, 9, -1), replicated so one vector
// multiply covers two output samples.  The taps sum to 16; results are
// rounded with a >>4 shift in z1_upsample_edge_calc_loop.
const z1_upsample_edge_kernel
   .short  -1, 9, 9, -1, -1, 9, 9, -1
endconst
   1751 
// Primary 4-tap rows of the intra edge smoothing filters, one row per
// filter strength (each row duplicated to fill a 128-bit vector).
// Together with the matching row of ipred_filter_edge_kernel2 the taps
// of every strength sum to 16 (0+4+8+4, 0+5+6+5, 2+4+4+4+2), matching
// the rounded >>4 normalization used by the edge-filter macros below.
const ipred_filter_edge_kernel1
   .short  0, 4, 8, 4, 0, 4, 8, 4
   .short  0, 5, 6, 5, 0, 5, 6, 5
   .short  2, 4, 4, 4, 2, 4, 4, 4
endconst
   1757 
// Secondary tap of the intra edge smoothing filters, applied via the
// vmadd.h against vr6 in z1_filter_edge_calc_loop2/_calc_other.  It is
// non-zero only for the widest (5-tap) filter strength, where it adds
// the outermost sample with weight 2.
const ipred_filter_edge_kernel2
   .short  0, 0, 0, 0, 0, 0, 0, 0
   .short  0, 0, 0, 0, 0, 0, 0, 0
   .short  2, 2, 2, 2, 2, 2, 2, 2
endconst
   1763 
// Produce one batch of 2x-upsampled edge samples.
// In:  vr7            = source edge bytes s[0..]
//      vr11/vr12/vr13 = vr7 shifted right by 1/2/3 bytes (callers clamp
//                       the past-the-end lanes beforehand where needed)
//      vr0            = z1_upsample_edge_kernel, vr15 = 0, vr16 = 255
// Out: vr13 = interleaved bytes {s[1], f[0], s[2], f[1], ...}: original
//      samples alternating with the filtered half-position samples.
// Clobbers: vr10, vr11, vr12.
.macro z1_upsample_edge_calc_loop
   // widen u8 -> u16 so the negative taps work in 16-bit arithmetic
   vsllwil.hu.bu  vr10,    vr7,     0
   vsllwil.hu.bu  vr11,    vr11,    0
   vsllwil.hu.bu  vr12,    vr12,    0
   vsllwil.hu.bu  vr13,    vr13,    0

   // per-tap products; each register feeds two interleaved outputs
   vmul.h         vr10,    vr10,    vr0
   vmul.h         vr11,    vr11,    vr0
   vmul.h         vr12,    vr12,    vr0
   vmul.h         vr13,    vr13,    vr0

   // two horizontal adds fold each 4-tap product group into one sum
   vhaddw.w.h     vr10,    vr10,    vr10
   vhaddw.w.h     vr11,    vr11,    vr11
   vhaddw.w.h     vr12,    vr12,    vr12
   vhaddw.w.h     vr13,    vr13,    vr13
   vhaddw.d.w     vr10,    vr10,    vr10
   vhaddw.d.w     vr11,    vr11,    vr11
   vhaddw.d.w     vr12,    vr12,    vr12
   vhaddw.d.w     vr13,    vr13,    vr13

   // gather the eight sums into ascending order in one vector
   vpackev.h      vr10,    vr11,    vr10
   vpackev.h      vr11,    vr13,    vr12
   vpackev.w      vr12,    vr11,    vr10  //s:01234567
   vsrari.h       vr12,    vr12,    4  // round to nearest, /16
   iclip_pixel_vrh vr12, vr15, vr16, vr10, vr11, vr12  // clamp to [0, 255]
   vsrlni.b.h     vr12,    vr12,    0  //out: 13579...
   vbsrl.v        vr11,    vr7,     1  //out:02468...
   vilvl.b        vr13,    vr12,    vr11  // interleave originals with filtered
.endm
   1793 
// Build the three shifted source vectors from vr7 and run the upsample
// filter.  No edge clamping: the caller guarantees enough valid bytes
// past the segment (used for interior 8-byte groups).
.macro z1_upsample_edge_data_init1
   vbsrl.v        vr11,    vr7,     1
   vbsrl.v        vr12,    vr7,     2
   vbsrl.v        vr13,    vr7,     3
   z1_upsample_edge_calc_loop
.endm
   1800 
// As z1_upsample_edge_data_init1, but for the final group of an edge:
// the vextrins.b inserts replicate the last valid sample into the lanes
// that the byte shifts moved past the end of the data (right-edge clamp).
.macro z1_upsample_edge_data_init2
   vbsrl.v        vr11,    vr7,     1
   vbsrl.v        vr12,    vr7,     2
   vextrins.b     vr12,    vr12,    0x76  // lane7 <- lane6
   vbsrl.v        vr13,    vr7,     3
   vextrins.b     vr13,    vr13,    0x65  // lane6 <- lane5
   vextrins.b     vr13,    vr13,    0x75  // lane7 <- lane5
   z1_upsample_edge_calc_loop
.endm
   1810 
// Degenerate upsample case: vr7 holds a single edge byte replicated
// across the vector (callers load it with vldrepl.b).  All taps see the
// same value, so one filtered sample is computed and broadcast, then
// interleaved with the source byte to form constant output pairs.
// Out: vr13.  Clobbers: vr10, vr11, vr12.
.macro z1_upsample_edge_calc_other
   vsllwil.hu.bu  vr10,    vr7,     0
   vmul.h         vr10,    vr10,    vr0
   vhaddw.w.h     vr10,    vr10,    vr10
   vhaddw.d.w     vr10,    vr10,    vr10
   vreplvei.h     vr12,    vr10,    0   //s0-s7: broadcast the single sum
   vsrari.h       vr12,    vr12,    4   // round to nearest, /16

   iclip_pixel_vrh vr12, vr15, vr16, vr10, vr11, vr12  // clamp to [0, 255]
   vsrlni.b.h     vr12,    vr12,    0
   vilvl.b        vr13,    vr12,    vr7
.endm
   1823 
// Core of the edge smoothing filter: multiply the four widened
// neighbor vectors (vr10..vr13, prepared by the data_init macros) by
// the primary kernel in vr1 and reduce each 4-tap group to one sum.
// Out: vr10 = eight 16-bit partial sums (still unrounded; the secondary
// tap and rounding are applied in z1_filter_edge_calc_loop2).
.macro z1_filter_edge_calc_loop1
   vmul.h         vr10,    vr10,    vr1
   vmul.h         vr11,    vr11,    vr1
   vmul.h         vr12,    vr12,    vr1
   vmul.h         vr13,    vr13,    vr1

   // two horizontal adds fold each 4-tap product group into one sum
   vhaddw.w.h     vr10,    vr10,    vr10
   vhaddw.w.h     vr11,    vr11,    vr11
   vhaddw.w.h     vr12,    vr12,    vr12
   vhaddw.w.h     vr13,    vr13,    vr13
   vhaddw.d.w     vr10,    vr10,    vr10
   vhaddw.d.w     vr11,    vr11,    vr11
   vhaddw.d.w     vr12,    vr12,    vr12
   vhaddw.d.w     vr13,    vr13,    vr13

   // gather the eight sums into ascending order in one vector
   vpackev.h      vr10,    vr11,    vr10
   vpackev.h      vr11,    vr13,    vr12
   vpackev.w      vr10,    vr11,    vr10  //s:01234567
.endm
   1843 
// Finish the edge smoothing filter: accumulate the secondary tap
// (vr6, from ipred_filter_edge_kernel2) applied to the extra samples in
// vr13 onto the partial sums in vr10, then round (>>4, taps sum to 16)
// and narrow back to bytes.  Out: low 8 bytes of vr12.
.macro z1_filter_edge_calc_loop2
   vsllwil.hu.bu  vr13,    vr13,    0
   vmadd.h        vr10,    vr13,    vr6
   vsrari.h       vr12,    vr10,    4  // round to nearest, /16
   vsrlni.b.h     vr12,    vr12,    0  //out: 0-7
.endm
   1850 
// Edge-filter variant for the boundary group, working from a single
// widened source vector in vr10.  The two vhaddw steps leave one 4-tap
// sum per 64-bit lane; the vreplvei/vextrins pair then assembles the
// output vector (lane 0 from the first sum, remaining lanes from the
// second — presumably the clamped repeat of the last sample; verify
// against the C reference).  The secondary tap (vr6) is applied to the
// replicated sample 1 before rounding and narrowing.
// Out: low 8 bytes of vr12.  Clobbers: vr11, vr13.
.macro z1_filter_edge_calc_other
   vsllwil.hu.bu  vr10,    vr10,    0
   vmul.h         vr11,    vr10,    vr1
   vhaddw.w.h     vr11,    vr11,    vr11
   vhaddw.d.w     vr11,    vr11,    vr11
   vreplvei.h     vr12,    vr11,    4   // broadcast the second 4-tap sum
   vextrins.h     vr12,    vr11,    0x00  // lane0 <- first 4-tap sum

   vreplvei.h     vr13,    vr10,    1
   vmadd.h        vr12,    vr13,    vr6
   vsrari.h       vr12,    vr12,    4  // round to nearest, /16
   vsrlni.b.h     vr12,    vr12,    0  //out: 0-7
.endm
   1864 
// Prepare filter inputs for the FIRST group of an edge: vr10 is vr7
// shifted left one byte with the first sample duplicated into lane 0
// (left-edge clamp for the s[-1] tap); vr12/vr13 are unclamped right
// shifts.  All four are widened to u16, then the primary filter runs.
.macro z1_filter_edge_data_init1
   vbsll.v        vr10,    vr7,     1
   vextrins.b     vr10,    vr10,    0x01  // lane0 <- lane1 (duplicate s[0])
   vbsrl.v        vr12,    vr7,     1
   vbsrl.v        vr13,    vr7,     2
   vsllwil.hu.bu  vr10,    vr10,    0
   vsllwil.hu.bu  vr11,    vr7,     0
   vsllwil.hu.bu  vr12,    vr12,    0
   vsllwil.hu.bu  vr13,    vr13,    0
   z1_filter_edge_calc_loop1
.endm
   1876 
// Prepare filter inputs for an INTERIOR group: plain right shifts by
// 1/2/3 bytes with no edge clamping (the caller guarantees valid bytes
// beyond the group), widened to u16, then the primary filter runs.
.macro z1_filter_edge_data_init2
   vbsrl.v        vr11,    vr7,     1
   vbsrl.v        vr12,    vr7,     2
   vbsrl.v        vr13,    vr7,     3
   vsllwil.hu.bu  vr10,    vr7,     0
   vsllwil.hu.bu  vr11,    vr11,    0
   vsllwil.hu.bu  vr12,    vr12,    0
   vsllwil.hu.bu  vr13,    vr13,    0
   z1_filter_edge_calc_loop1
.endm
   1887 
// Prepare filter inputs for the LAST group of an edge: like
// z1_filter_edge_data_init2, but the farthest shift (vr13) has its
// past-the-end lane clamped to the final valid sample.
.macro z1_filter_edge_data_init3
   vbsrl.v        vr11,    vr7,     1
   vbsrl.v        vr12,    vr7,     2
   vbsrl.v        vr13,    vr7,     3
   vextrins.b     vr13,    vr13,    0x76  // lane7 <- lane6 (right-edge clamp)
   vsllwil.hu.bu  vr10,    vr7,     0
   vsllwil.hu.bu  vr11,    vr11,    0
   vsllwil.hu.bu  vr12,    vr12,    0
   vsllwil.hu.bu  vr13,    vr13,    0
   z1_filter_edge_calc_loop1
.endm
   1899 
// Prepare filter inputs when the edge fits in a SINGLE group: clamp on
// both sides — duplicate the first sample into lane 0 of the left-shifted
// vector, and replicate the last valid sample into the past-the-end lane
// of the farthest right shift.
.macro z1_filter_edge_data_init4
   vbsll.v        vr10,    vr7,     1
   vextrins.b     vr10,    vr10,    0x01  // lane0 <- lane1 (duplicate s[0])
   vbsrl.v        vr12,    vr7,     1
   vbsrl.v        vr13,    vr7,     2
   vextrins.b     vr13,    vr13,    0x76  // lane7 <- lane6 (right-edge clamp)
   vsllwil.hu.bu  vr10,    vr10,    0
   vsllwil.hu.bu  vr11,    vr7,     0
   vsllwil.hu.bu  vr12,    vr12,    0
   vsllwil.hu.bu  vr13,    vr13,    0
   z1_filter_edge_calc_loop1
.endm
   1912 
// memset-style byte fill for 8bpc pixels: broadcast the byte at
// *\src_ptr into \width consecutive bytes starting at \dst_ptr.
// Works for any width by emitting 16-byte vector stores, then optional
// 8-, 4-, 2- and 1-byte tails selected from the width's bits.
// \tmp0 and \tmp1 are scratch; \tmp1 ends up equal to the number of
// bytes written.  Clobbers vr10 (and its f10 view).  Uses numeric
// local labels 1-6, so callers must not rely on labels of those names.
.macro pixel_set_8bpc_allw dst_ptr, src_ptr, width, tmp0, tmp1
   vldrepl.b      vr10,    \src_ptr, 0        // vr10 = 16 copies of *src_ptr
   or             \tmp1,   zero,     zero     // tmp1 = write offset = 0
   srai.d         \tmp0,   \width,   4        // tmp0 = count of 16-byte chunks
   beqz           \tmp0,   2f
1: // 16 bytes per iteration
   vstx           vr10,    \dst_ptr, \tmp1
   addi.d         \tmp1,   \tmp1,    16
   addi.d         \tmp0,   \tmp0,    -1
   bnez           \tmp0,   1b
2: // 8-byte tail
   andi           \tmp0,   \width,   8
   beqz           \tmp0,   3f
   fstx.d         f10,     \dst_ptr, \tmp1
   addi.d         \tmp1,   \tmp1,    8
3: // 4-byte tail
   andi           \tmp0,   \width,   4
   beqz           \tmp0,   4f
   fstx.s         f10,     \dst_ptr, \tmp1
   addi.d         \tmp1,   \tmp1,    4
4: // 2-byte tail: reload the scalar pixel and store it twice
   andi           \tmp0,   \width,   2
   beqz           \tmp0,   5f
   ldx.bu         \tmp0,   \src_ptr, zero
   stx.b          \tmp0,   \dst_ptr, \tmp1
   addi.d         \tmp1,   \tmp1,    1
   stx.b          \tmp0,   \dst_ptr, \tmp1
   addi.d         \tmp1,   \tmp1,    1
5: // final single byte
   andi           \tmp0,   \width,   1
   beqz           \tmp0,   6f
   ldx.bu         \tmp0,   \src_ptr, zero
   stx.b          \tmp0,   \dst_ptr, \tmp1
6:
.endm
   1948 
   1949 // void ipred_z1_lsx(pixel *dst, const ptrdiff_t stride,
   1950 //                   const pixel *const topleft_in,
   1951 //                   const int width, const int height, int angle,
   1952 //                   const int max_width, const int max_height
   1953 //                   HIGHBD_DECL_SUFFIX)
   1954 function ipred_z1_8bpc_lsx
   1955    addi.d         a2,      a2,      1   //&topleft_in[1]
   1956    addi.d         sp,      sp,      -128
   1957    or             t2,      sp,      sp  //top_out
   1958    srai.d         a6,      a5,      9
   1959    andi           a6,      a6,      1   //is_sum
   1960    srai.d         a7,      a5,      10  //enable_intra_edge_filter
   1961    andi           a5,      a5,      511
   1962 
   1963    la.local       t0,      dav1d_dr_intra_derivative
   1964    andi           t1,      a5,      0xFFE
   1965    ldx.hu         t1,      t0,      t1  //dx
   1966 
   1967    beqz           a7,      .IPRED_Z1_NOTUA
   1968    add.d          t3,      a3,      a4
   1969    li.w           t4,      90
   1970    sub.w          t4,      t4,      a5
   1971    // ipred_get_upsample t5:upsample_above
   1972    li.w           t6,      16
   1973    sra.d          t6,      t6,      a6
   1974    bge            t6,      t3,      .Z1_GETUS1
   1975    addi.d         t5,      zero,    0
   1976    b              .Z1_GETUS2
   1977 .Z1_GETUS1:
   1978    addi.d         t5,      zero,    1
   1979 .Z1_GETUS2:
   1980    li.w           t6,      40
   1981    blt            t4,      t6,      .Z1_GETUS3
   1982    addi.d         t6,      zero,    0
   1983    b              .Z1_GETUS4
   1984 .Z1_GETUS3:
   1985    addi.d         t6,      zero,    1
   1986 .Z1_GETUS4:
   1987    and            t5,      t5,      t6
   1988 
   1989    beqz           t5,      .IPRED_Z1_NOTUA
   1990 
   1991    la.local       t0,      z1_upsample_edge_kernel
   1992    vld            vr0,     t0,      0   //kernel
   1993    vxor.v         vr15,    vr15,    vr15
   1994    li.w           t0,      255
   1995    vreplgr2vr.h   vr16,    t0
   1996 
   1997 .Z1_UEDGE_W4:
   1998    andi           t6,      a3,     4
   1999    beqz           t6,      .Z1_UEDGE_W8
   2000 .Z1_UEDGE_W4_H4:
   2001    andi           t6,      a4,     4
   2002    beqz           t6,      .Z1_UEDGE_W4_H8
   2003 
   2004    //0-6
   2005    vld            vr7,     a2,      -1
   2006    vbsrl.v        vr11,    vr7,     1
   2007    vbsrl.v        vr12,    vr7,     2
   2008    vextrins.b     vr12,    vr12,    0x76
   2009    vbsrl.v        vr13,    vr7,     3
   2010    z1_upsample_edge_calc_loop
   2011 
   2012    fst.d          f13,     t2,     0
   2013    vstelm.w       vr13,    t2,     8,    2
   2014    vstelm.h       vr13,    t2,     12,   6
   2015 
   2016    ld.bu          t7,      a2,     7
   2017    st.b           t7,      t2,     14
   2018 
   2019    b              .Z1_UEDGE_END
   2020 
   2021 .Z1_UEDGE_W4_H8:
   2022    andi           t6,      a4,     8
   2023    beqz           t6,      .Z1_UEDGE_W4_H16
   2024 
   2025    //0-7
   2026    vld            vr7,     a2,      -1
   2027    z1_upsample_edge_data_init2
   2028    vst            vr13,    t2,     0
   2029 
   2030    //8-10
   2031    vldrepl.b      vr7,     a2,     7
   2032    z1_upsample_edge_calc_other
   2033 
   2034    vstelm.w       vr13,    t2,     16,   0
   2035    vstelm.h       vr13,    t2,     20,   2
   2036 
   2037    ld.bu          t7,      a2,     7
   2038    st.b           t7,      t2,     22
   2039 
   2040    b              .Z1_UEDGE_END
   2041 
   2042 .Z1_UEDGE_W4_H16:
   2043    andi           t6,      a4,     16
   2044    beqz           t6,      .Z1_UEDGE_W4_H32
   2045 
   2046    //0-7
   2047    vld            vr7,     a2,      -1
   2048    z1_upsample_edge_data_init2
   2049    vst            vr13,    t2,     0
   2050 
   2051    //8-15
   2052    vldrepl.b      vr7,     a2,     7
   2053    z1_upsample_edge_calc_other
   2054    vst            vr13,    t2,     16
   2055 
   2056    //16-18
   2057    vstelm.w       vr13,    t2,     32,   0
   2058    vstelm.h       vr13,    t2,     36,   2
   2059 
   2060    ld.bu          t7,      a2,     7
   2061    st.b           t7,      t2,     38
   2062 
   2063    b              .Z1_UEDGE_END
   2064 
   2065 .Z1_UEDGE_W4_H32:
   2066    andi           t6,      a4,     32
   2067    beqz           t6,      .Z1_UEDGE_W4_H64
   2068 
   2069    //0-7
   2070    vld            vr7,     a2,      -1
   2071    z1_upsample_edge_data_init2
   2072    vst            vr13,    t2,     0
   2073 
   2074    //8-15
   2075    vldrepl.b      vr7,     a2,     7
   2076    z1_upsample_edge_calc_other
   2077    vst            vr13,    t2,     16
   2078 
   2079    vst            vr13,    t2,     32 //16-23
   2080    vst            vr13,    t2,     48 //24-31
   2081 
   2082    //32-34
   2083    vstelm.w       vr13,    t2,     64,   0
   2084    vstelm.h       vr13,    t2,     68,   2
   2085 
   2086    ld.bu          t7,      a2,     7
   2087    st.b           t7,      t2,     70
   2088 
   2089    b              .Z1_UEDGE_END
   2090 
   2091 .Z1_UEDGE_W4_H64:
   2092    //0-7
   2093    vld            vr7,     a2,      -1
   2094    z1_upsample_edge_data_init2
   2095    vst            vr13,    t2,     0
   2096 
   2097    //8-15
   2098    vldrepl.b      vr7,     a2,     7
   2099    z1_upsample_edge_calc_other
   2100    vst            vr13,    t2,     16
   2101 
   2102    vst            vr13,    t2,     32 //16-23
   2103    vst            vr13,    t2,     48 //24-31
   2104    vst            vr13,    t2,     64 //32-39
   2105    vst            vr13,    t2,     80 //40-47
   2106    vst            vr13,    t2,     96 //48-55
   2107    vst            vr13,    t2,     112 //56-63
   2108 
   2109    //64-66
   2110    vstelm.w       vr13,    t2,     128,   0
   2111    vstelm.h       vr13,    t2,     132,   2
   2112 
   2113    ld.bu          t7,      a2,     7
   2114    st.b           t7,      t2,     134
   2115 
   2116    b              .Z1_UEDGE_END
   2117 
   2118 .Z1_UEDGE_W8:
   2119    andi           t6,      a3,     8
   2120    beqz           t6,      .Z1_UEDGE_W16
   2121 .Z1_UEDGE_W8_H4:
   2122    andi           t6,      a4,     4
   2123    beqz           t6,      .Z1_UEDGE_W8_H8
   2124 
   2125    //0-7
   2126    vld            vr7,     a2,      -1
   2127    z1_upsample_edge_data_init1
   2128    vst            vr13,    t2,     0
   2129 
   2130    //8-15
   2131    vld            vr7,     a2,      7
   2132    vbsrl.v        vr11,    vr7,     1
   2133    vbsrl.v        vr12,    vr7,     2
   2134    vextrins.b     vr12,    vr12,    0x32
   2135    vbsrl.v        vr13,    vr7,     3
   2136    vextrins.b     vr13,    vr13,    0x21
   2137    vextrins.b     vr13,    vr13,    0x31
   2138    z1_upsample_edge_calc_loop
   2139    vstelm.w       vr13,    t2,     16,    0
   2140    vstelm.h       vr13,    t2,     20,    2
   2141 
   2142    ld.bu          t7,      a2,     11
   2143    st.b           t7,      t2,     22
   2144    b              .Z1_UEDGE_END
   2145 
   2146 .Z1_UEDGE_W8_H8:
   2147    andi           t6,      a4,     8
   2148    beqz           t6,      .Z1_UEDGE_W8_H16
   2149 
   2150    //0-7
   2151    vld            vr7,     a2,      -1
   2152    z1_upsample_edge_data_init1
   2153    vst            vr13,    t2,     0
   2154 
   2155    //8-14
   2156    vld            vr7,     a2,      7
   2157    vbsrl.v        vr11,    vr7,     1
   2158    vbsrl.v        vr12,    vr7,     2
   2159    vextrins.b     vr12,    vr12,    0x76
   2160    vbsrl.v        vr13,    vr7,     3
   2161    z1_upsample_edge_calc_loop
   2162    fst.d          f13,     t2,     16
   2163    vstelm.w       vr13,    t2,     24,    2
   2164    vstelm.h       vr13,    t2,     28,    6
   2165 
   2166    ld.bu          t7,      a2,     15
   2167    st.b           t7,      t2,     30
   2168    b              .Z1_UEDGE_END
   2169 
   2170 .Z1_UEDGE_W8_H16:
   2171    andi           t6,      a4,     16
   2172    beqz           t6,      .Z1_UEDGE_W8_H32
   2173 
   2174    //0-7
   2175    vld            vr7,     a2,      -1
   2176    z1_upsample_edge_data_init1
   2177    vst            vr13,    t2,     0
   2178 
   2179    //8-15
   2180    vld            vr7,     a2,      7
   2181    z1_upsample_edge_data_init2
   2182    vst            vr13,    t2,     16
   2183 
   2184    //16-22
   2185    vldrepl.b      vr7,     a2,     15
   2186    z1_upsample_edge_calc_other
   2187    fst.d          f13,     t2,     32
   2188    vstelm.w       vr13,    t2,     40,   2
   2189    vstelm.h       vr13,    t2,     44,   6
   2190 
   2191    ld.bu          t7,      a2,     15
   2192    st.b           t7,      t2,     46
   2193    b              .Z1_UEDGE_END
   2194 
   2195 .Z1_UEDGE_W8_H32:
   2196    andi           t6,      a4,     32
   2197    beqz           t6,      .Z1_UEDGE_W8_H64
   2198 
   2199    //0-7
   2200    vld            vr7,     a2,      -1
   2201    z1_upsample_edge_data_init1
   2202    vst            vr13,    t2,     0
   2203 
   2204    //8-15
   2205    vld            vr7,     a2,      7
   2206    z1_upsample_edge_data_init2
   2207    vst            vr13,    t2,     16
   2208 
   2209    //16-23
   2210    vldrepl.b      vr7,     a2,     15
   2211    z1_upsample_edge_calc_other
   2212    vst            vr13,    t2,     32
   2213 
   2214    vst            vr13,    t2,     48 //24-31
   2215 
   2216    //32-38
   2217    fst.d          f13,     t2,     64
   2218    vstelm.w       vr13,    t2,     72,   2
   2219    vstelm.h       vr13,    t2,     76,   6
   2220 
   2221    ld.bu          t7,      a2,     15
   2222    st.b           t7,      t2,     78
   2223    b              .Z1_UEDGE_END
   2224 
   2225 .Z1_UEDGE_W8_H64:
   2226    //0-7
   2227    vld            vr7,     a2,      -1
   2228    z1_upsample_edge_data_init1
   2229    vst            vr13,    t2,     0
   2230 
   2231    //8-15
   2232    vld            vr7,     a2,      7
   2233    z1_upsample_edge_data_init2
   2234    vst            vr13,    t2,     16
   2235 
   2236    //16-23
   2237    vldrepl.b      vr7,     a2,     15
   2238    z1_upsample_edge_calc_other
   2239    vst            vr13,    t2,     32
   2240 
   2241    vst            vr13,    t2,     48 //24-31
   2242    vst            vr13,    t2,     64 //32-39
   2243    vst            vr13,    t2,     80 //40-47
   2244    vst            vr13,    t2,     96 //48-55
   2245    vst            vr13,    t2,     112 //56-63
   2246 
   2247    //64-70
   2248    fst.d          f13,     t2,     128
   2249    vstelm.w       vr13,    t2,     136,   2
   2250    vstelm.h       vr13,    t2,     140,   6
   2251 
   2252    ld.bu          t7,      a2,     15
   2253    st.b           t7,      t2,     142
   2254    b              .Z1_UEDGE_END
   2255 
   2256 .Z1_UEDGE_W16:
   2257    andi           t6,      a3,     16
   2258    beqz           t6,      .Z1_UEDGE_W32
   2259 .Z1_UEDGE_W16_H4:
   2260    andi           t6,      a4,     4
   2261    beqz           t6,      .Z1_UEDGE_W16_H8
   2262 
   2263    //0-7
   2264    vld            vr7,     a2,      -1
   2265    z1_upsample_edge_data_init1
   2266    vst            vr13,    t2,     0
   2267 
   2268    //8-15
   2269    vld            vr7,     a2,      7
   2270    z1_upsample_edge_data_init1
   2271    vst            vr13,    t2,     16
   2272 
   2273    //16-18
   2274    vld            vr7,     a2,      15
   2275    z1_upsample_edge_data_init1
   2276    vstelm.w       vr13,    t2,     32,    0
   2277    vstelm.h       vr13,    t2,     36,    2
   2278 
   2279    ld.bu          t7,      a2,     19
   2280    st.b           t7,      t2,     38
   2281    b              .Z1_UEDGE_END
   2282 
   2283 .Z1_UEDGE_W16_H8:
   2284    andi           t6,      a4,     8
   2285    beqz           t6,      .Z1_UEDGE_W16_H16
   2286 
   2287    //0-7
   2288    vld            vr7,     a2,      -1
   2289    z1_upsample_edge_data_init1
   2290    vst            vr13,    t2,     0
   2291 
   2292    //8-15
   2293    vld            vr7,     a2,      7
   2294    z1_upsample_edge_data_init1
   2295    vst            vr13,    t2,      16
   2296 
   2297    //16-22
   2298    vld            vr7,     a2,      15
   2299    vbsrl.v        vr11,    vr7,     1
   2300    vbsrl.v        vr12,    vr7,     2
   2301    vextrins.b     vr12,    vr12,    0x76
   2302    vbsrl.v        vr13,    vr7,     3
   2303    z1_upsample_edge_calc_loop
   2304    fst.d          f13,     t2,     32
   2305    vstelm.w       vr13,    t2,     40,    2
   2306    vstelm.h       vr13,    t2,     44,    6
   2307 
   2308    ld.bu          t7,      a2,     23
   2309    st.b           t7,      t2,     46
   2310    b              .Z1_UEDGE_END
   2311 
   2312 .Z1_UEDGE_W16_H16:
   2313    andi           t6,      a4,     16
   2314    beqz           t6,      .Z1_UEDGE_W16_H32
   2315 
   2316    //0-7
   2317    vld            vr7,     a2,      -1
   2318    z1_upsample_edge_data_init1
   2319    vst            vr13,    t2,     0
   2320 
   2321    //8-15
   2322    vld            vr7,     a2,      7
   2323    z1_upsample_edge_data_init1
   2324    vst            vr13,    t2,     16
   2325 
   2326    //16-23
   2327    vld            vr7,     a2,      15
   2328    z1_upsample_edge_data_init1
   2329    vst            vr13,    t2,      32
   2330 
   2331    //24-30
   2332    vld            vr7,     a2,      23
   2333    vbsrl.v        vr11,    vr7,     1
   2334    vbsrl.v        vr12,    vr7,     2
   2335    vextrins.b     vr12,    vr12,    0x76
   2336    vbsrl.v        vr13,    vr7,     3
   2337    z1_upsample_edge_calc_loop
   2338    fst.d          f13,     t2,     48
   2339    vstelm.w       vr13,    t2,     56,    2
   2340    vstelm.h       vr13,    t2,     60,    6
   2341 
   2342    ld.bu          t7,      a2,     31
   2343    st.b           t7,      t2,     62
   2344    b              .Z1_UEDGE_END
   2345 
   2346 .Z1_UEDGE_W16_H32:
   2347    andi           t6,      a4,     32
   2348    beqz           t6,      .Z1_UEDGE_W16_H64
   2349 
   2350    //0-7
   2351    vld            vr7,     a2,      -1
   2352    z1_upsample_edge_data_init1
   2353    vst            vr13,    t2,     0
   2354 
   2355    //8-15
   2356    vld            vr7,     a2,      7
   2357    z1_upsample_edge_data_init1
   2358    vst            vr13,    t2,     16
   2359 
   2360    //16-23
   2361    vld            vr7,     a2,      15
   2362    z1_upsample_edge_data_init1
   2363    vst            vr13,    t2,      32
   2364 
   2365    //24-31
   2366    vld            vr7,     a2,      23
   2367    z1_upsample_edge_data_init2
   2368    vst            vr13,    t2,      48
   2369 
   2370    //32-39
   2371    vldrepl.b      vr7,     a2,      31
   2372    z1_upsample_edge_calc_other
   2373    vst            vr13,    t2,      64
   2374 
   2375    //40-46
   2376    fst.d          f13,     t2,     80
   2377    vstelm.w       vr13,    t2,     88,    2
   2378    vstelm.h       vr13,    t2,     92,    6
   2379 
   2380    ld.bu          t7,      a2,     31
   2381    st.b           t7,      t2,     94
   2382    b              .Z1_UEDGE_END
   2383 
   2384 .Z1_UEDGE_W16_H64:
   2385    //0-7
   2386    vld            vr7,     a2,      -1
   2387    z1_upsample_edge_data_init1
   2388    vst            vr13,    t2,     0
   2389 
   2390    //8-15
   2391    vld            vr7,     a2,      7
   2392    z1_upsample_edge_data_init1
   2393    vst            vr13,    t2,     16
   2394 
   2395    //16-23
   2396    vld            vr7,     a2,      15
   2397    z1_upsample_edge_data_init1
   2398    vst            vr13,    t2,      32
   2399 
   2400    //24-31
   2401    vld            vr7,     a2,      23
   2402    z1_upsample_edge_data_init2
   2403    vst            vr13,    t2,      48
   2404 
   2405    //32-39
   2406    vldrepl.b      vr7,     a2,      31
   2407    z1_upsample_edge_calc_other
   2408    vst            vr13,    t2,      64
   2409 
   2410    vst            vr13,    t2,      80  //40-47
   2411    vst            vr13,    t2,      96  //48-55
   2412    vst            vr13,    t2,      112 //56-63
   2413    vst            vr13,    t2,      128 //64-71
   2414 
   2415    //72-78
   2416    fst.d          f13,     t2,     144
   2417    vstelm.w       vr13,    t2,     152,    2
   2418    vstelm.h       vr13,    t2,     156,    6
   2419 
   2420    ld.bu          t7,      a2,     31
   2421    st.b           t7,      t2,     158
   2422    b              .Z1_UEDGE_END
   2423 
   2424 .Z1_UEDGE_W32:
   2425    andi           t6,      a3,     32
   2426    beqz           t6,      .Z1_UEDGE_W64
   2427 .Z1_UEDGE_W32_H8:
   2428    andi           t6,      a4,     8
   2429    beqz           t6,      .Z1_UEDGE_W32_H16
   2430 
   2431    //0-7
   2432    vld            vr7,     a2,      -1
   2433    z1_upsample_edge_data_init1
   2434    vst            vr13,    t2,     0
   2435 
   2436    //8-15
   2437    vld            vr7,     a2,      7
   2438    z1_upsample_edge_data_init1
   2439    vst            vr13,    t2,      16
   2440 
   2441    //16-23
   2442    vld            vr7,     a2,      15
   2443    z1_upsample_edge_data_init1
   2444    vst            vr13,    t2,      32
   2445 
   2446    //24-31
   2447    vld            vr7,     a2,      23
   2448    z1_upsample_edge_data_init1
   2449    vst            vr13,    t2,      48
   2450 
   2451    //32-38
   2452    vld            vr7,     a2,      31
   2453    vbsrl.v        vr11,    vr7,     1
   2454    vbsrl.v        vr12,    vr7,     2
   2455    vextrins.b     vr12,    vr12,    0x76
   2456    vbsrl.v        vr13,    vr7,     3
   2457    z1_upsample_edge_calc_loop
   2458    fst.d          f13,     t2,      64
   2459    vstelm.w       vr13,    t2,      72,    2
   2460    vstelm.h       vr13,    t2,      76,    6
   2461 
   2462    ld.bu          t7,      a2,     39
   2463    st.b           t7,      t2,     78
   2464    b              .Z1_UEDGE_END
   2465 
   2466 .Z1_UEDGE_W32_H16:
        // w==32 upsample_above path, dispatched on block height (a4).
        // a2 = top-left edge pointer, t2 = output edge buffer.
        // For h==16 the written output spans bytes 0..94 of t2, i.e.
        // 2*(w+h)-1 = 95 bytes, matching the AV1 intra-edge 2x upsample.
        // NOTE(review): z1_upsample_edge_data_init1/_init2 and the
        // _calc_loop/_calc_other macros are defined earlier in this file
        // (outside this view); each init1+vst pair presumably expands 8
        // source pixels into 16 interpolated output bytes — confirm at
        // the macro definitions.
   2467    andi           t6,      a4,     16
   2468    beqz           t6,      .Z1_UEDGE_W32_H32
   2469 
   2470    //0-7
   2471    vld            vr7,     a2,      -1
   2472    z1_upsample_edge_data_init1
   2473    vst            vr13,    t2,     0
   2474 
   2475    //8-15
   2476    vld            vr7,     a2,      7
   2477    z1_upsample_edge_data_init1
   2478    vst            vr13,    t2,      16
   2479 
   2480    //16-23
   2481    vld            vr7,     a2,      15
   2482    z1_upsample_edge_data_init1
   2483    vst            vr13,    t2,      32
   2484 
   2485    //24-31
   2486    vld            vr7,     a2,      23
   2487    z1_upsample_edge_data_init1
   2488    vst            vr13,    t2,      48
   2489 
   2490    //32-39
   2491    vld            vr7,     a2,      31
   2492    z1_upsample_edge_data_init1
   2493    vst            vr13,    t2,      64
   2494 
   2495    //40-46
        // Final partial group: shifted copies of the source vector feed the
        // interpolation; vextrins.b replicates the last valid lane so the
        // filter does not read past the edge.  Only 14 bytes are stored
        // (8 + 4 + 2), the last output byte is written separately below.
   2496    vld            vr7,     a2,      39
   2497    vbsrl.v        vr11,    vr7,     1
   2498    vbsrl.v        vr12,    vr7,     2
   2499    vextrins.b     vr12,    vr12,    0x76
   2500    vbsrl.v        vr13,    vr7,     3
   2501    z1_upsample_edge_calc_loop
   2502    fst.d          f13,     t2,      80
   2503    vstelm.w       vr13,    t2,      88,    2
   2504    vstelm.h       vr13,    t2,      92,    6
   2505 
        // Last edge pixel is copied through unfiltered (output byte 94).
   2506    ld.bu          t7,      a2,     47
   2507    st.b           t7,      t2,     94
   2508    b              .Z1_UEDGE_END
   2509 
   2510 .Z1_UEDGE_W32_H32:
        // w==32, h==32: output bytes 0..126 of t2 (2*(w+h)-1 = 127).
   2511    andi           t6,      a4,     32
   2512    beqz           t6,      .Z1_UEDGE_W32_H64
   2513 
   2514    //0-7
   2515    vld            vr7,     a2,      -1
   2516    z1_upsample_edge_data_init1
   2517    vst            vr13,    t2,     0
   2518 
   2519    //8-15
   2520    vld            vr7,     a2,      7
   2521    z1_upsample_edge_data_init1
   2522    vst            vr13,    t2,      16
   2523 
   2524    //16-23
   2525    vld            vr7,     a2,      15
   2526    z1_upsample_edge_data_init1
   2527    vst            vr13,    t2,      32
   2528 
   2529    //24-31
   2530    vld            vr7,     a2,      23
   2531    z1_upsample_edge_data_init1
   2532    vst            vr13,    t2,      48
   2533 
   2534    //32-39
   2535    vld            vr7,     a2,      31
   2536    z1_upsample_edge_data_init1
   2537    vst            vr13,    t2,      64
   2538 
   2539    //40-47
   2540    vld            vr7,     a2,      39
   2541    z1_upsample_edge_data_init1
   2542    vst            vr13,    t2,      80
   2543 
   2544    //48-55
   2545    vld            vr7,     a2,      47
   2546    z1_upsample_edge_data_init1
   2547    vst            vr13,    t2,      96
   2548 
   2549    //56-62
        // Final partial group (same replicate-last-lane trick as above).
   2550    vld            vr7,     a2,      55
   2551    vbsrl.v        vr11,    vr7,     1
   2552    vbsrl.v        vr12,    vr7,     2
   2553    vextrins.b     vr12,    vr12,    0x76
   2554    vbsrl.v        vr13,    vr7,     3
   2555    z1_upsample_edge_calc_loop
   2556    fst.d          f13,     t2,      112
   2557    vstelm.w       vr13,    t2,      120,   2
   2558    vstelm.h       vr13,    t2,      124,   6
   2559 
        // Last edge pixel copied through unfiltered (output byte 126).
   2560    ld.bu          t7,      a2,     63
   2561    st.b           t7,      t2,     126
   2562    b              .Z1_UEDGE_END
   2563 
   2564 .Z1_UEDGE_W32_H64:
        // w==32, h==64: output bytes 0..190 of t2 (2*(w+h)-1 = 191).
        // Only 64 source pixels exist; beyond a2[63] the edge is padded by
        // replicating the last pixel (vldrepl.b below), so groups 72-94
        // simply reuse the replicated result vector.
   2565    //0-7
   2566    vld            vr7,     a2,      -1
   2567    z1_upsample_edge_data_init1
   2568    vst            vr13,    t2,     0
   2569 
   2570    //8-15
   2571    vld            vr7,     a2,      7
   2572    z1_upsample_edge_data_init1
   2573    vst            vr13,    t2,      16
   2574 
   2575    //16-23
   2576    vld            vr7,     a2,      15
   2577    z1_upsample_edge_data_init1
   2578    vst            vr13,    t2,      32
   2579 
   2580    //24-31
   2581    vld            vr7,     a2,      23
   2582    z1_upsample_edge_data_init1
   2583    vst            vr13,    t2,      48
   2584 
   2585    //32-39
   2586    vld            vr7,     a2,      31
   2587    z1_upsample_edge_data_init1
   2588    vst            vr13,    t2,      64
   2589 
   2590    //40-47
   2591    vld            vr7,     a2,      39
   2592    z1_upsample_edge_data_init1
   2593    vst            vr13,    t2,      80
   2594 
   2595    //48-55
   2596    vld            vr7,     a2,      47
   2597    z1_upsample_edge_data_init1
   2598    vst            vr13,    t2,      96
   2599 
   2600    //56-63
   2601    vld            vr7,     a2,      55
   2602    z1_upsample_edge_data_init2
   2603    vst            vr13,    t2,      112
   2604 
   2605    //64-71
        // All lanes = last source pixel a2[63]; the filtered result is
        // constant, so it can be re-stored for every remaining group.
   2606    vldrepl.b      vr7,     a2,      63
   2607    z1_upsample_edge_calc_other
   2608    vst            vr13,    t2,      128
   2609 
   2610    vst            vr13,    t2,      144 //72-79
   2611    vst            vr13,    t2,      160 //80-87
   2612 
   2613    //88-94
   2614    fst.d          f13,     t2,     176
   2615    vstelm.w       vr13,    t2,     184,    2
   2616    vstelm.h       vr13,    t2,     188,    6
   2617 
        // Last edge pixel copied through unfiltered (output byte 190).
   2618    ld.bu          t7,      a2,     63
   2619    st.b           t7,      t2,     190
   2620    b              .Z1_UEDGE_END
   2621 
   2622 .Z1_UEDGE_W64:
   2623 .Z1_UEDGE_W64_H16:
        // w==64 upsample_above path, dispatched on block height (a4).
        // a2 = top-left edge pointer, t2 = output edge buffer.
        // h==16 writes output bytes 0..158 (2*(w+h)-1 = 159).
        // NOTE(review): the z1_upsample_edge_* macros are defined earlier
        // in this file (outside this view); each init1+vst pair presumably
        // expands 8 source pixels into 16 interpolated output bytes.
   2624    andi           t6,      a4,     16
   2625    beqz           t6,      .Z1_UEDGE_W64_H32
   2626 
   2627    //0-7
   2628    vld            vr7,     a2,      -1
   2629    z1_upsample_edge_data_init1
   2630    vst            vr13,    t2,     0
   2631 
   2632    //8-15
   2633    vld            vr7,     a2,      7
   2634    z1_upsample_edge_data_init1
   2635    vst            vr13,    t2,      16
   2636 
   2637    //16-23
   2638    vld            vr7,     a2,      15
   2639    z1_upsample_edge_data_init1
   2640    vst            vr13,    t2,      32
   2641 
   2642    //24-31
   2643    vld            vr7,     a2,      23
   2644    z1_upsample_edge_data_init1
   2645    vst            vr13,    t2,      48
   2646 
   2647    //32-39
   2648    vld            vr7,     a2,      31
   2649    z1_upsample_edge_data_init1
   2650    vst            vr13,    t2,      64
   2651 
   2652    //40-47
   2653    vld            vr7,     a2,      39
   2654    z1_upsample_edge_data_init1
   2655    vst            vr13,    t2,      80
   2656 
   2657    //48-55
   2658    vld            vr7,     a2,      47
   2659    z1_upsample_edge_data_init1
   2660    vst            vr13,    t2,      96
   2661 
   2662    //56-63
   2663    vld            vr7,     a2,      55
   2664    z1_upsample_edge_data_init1
   2665    vst            vr13,    t2,      112
   2666 
   2667    //64-71
   2668    vld            vr7,     a2,      63
   2669    z1_upsample_edge_data_init1
   2670    vst            vr13,    t2,      128
   2671 
   2672    //72-78
        // Final partial group: 14 bytes stored (8 + 4 + 2); the last
        // output byte is written separately below.
   2673    vld            vr7,     a2,      71
   2674    z1_upsample_edge_data_init2
   2675    fst.d          f13,     t2,     144
   2676    vstelm.w       vr13,    t2,     152,    2
   2677    vstelm.h       vr13,    t2,     156,    6
   2678 
        // Last edge pixel copied through unfiltered (output byte 158).
   2679    ld.bu          t7,      a2,     79
   2680    st.b           t7,      t2,     158
   2681    b              .Z1_UEDGE_END
   2682 
   2683 .Z1_UEDGE_W64_H32:
        // w==64, h==32: output bytes 0..190 (2*(w+h)-1 = 191).
   2684    andi           t6,      a4,     32
   2685    beqz           t6,      .Z1_UEDGE_W64_H64
   2686 
   2687    //0-7
   2688    vld            vr7,     a2,      -1
   2689    z1_upsample_edge_data_init1
   2690    vst            vr13,    t2,     0
   2691 
   2692    //8-15
   2693    vld            vr7,     a2,      7
   2694    z1_upsample_edge_data_init1
   2695    vst            vr13,    t2,      16
   2696 
   2697    //16-23
   2698    vld            vr7,     a2,      15
   2699    z1_upsample_edge_data_init1
   2700    vst            vr13,    t2,      32
   2701 
   2702    //24-31
   2703    vld            vr7,     a2,      23
   2704    z1_upsample_edge_data_init1
   2705    vst            vr13,    t2,      48
   2706 
   2707    //32-39
   2708    vld            vr7,     a2,      31
   2709    z1_upsample_edge_data_init1
   2710    vst            vr13,    t2,      64
   2711 
   2712    //40-47
   2713    vld            vr7,     a2,      39
   2714    z1_upsample_edge_data_init1
   2715    vst            vr13,    t2,      80
   2716 
   2717    //48-55
   2718    vld            vr7,     a2,      47
   2719    z1_upsample_edge_data_init1
   2720    vst            vr13,    t2,      96
   2721 
   2722    //56-63
   2723    vld            vr7,     a2,      55
   2724    z1_upsample_edge_data_init1
   2725    vst            vr13,    t2,      112
   2726 
   2727    //64-71
   2728    vld            vr7,     a2,      63
   2729    z1_upsample_edge_data_init1
   2730    vst            vr13,    t2,      128
   2731 
   2732    //72-79
   2733    vld            vr7,     a2,      71
   2734    z1_upsample_edge_data_init1
   2735    vst            vr13,    t2,      144
   2736 
   2737    //80-87
   2738    vld            vr7,     a2,      79
   2739    z1_upsample_edge_data_init1
   2740    vst            vr13,    t2,      160
   2741 
   2742    //88-94
        // Final partial group: 14 bytes stored (8 + 4 + 2).
   2743    vld            vr7,     a2,      87
   2744    z1_upsample_edge_data_init2
   2745    fst.d          f13,     t2,     176
   2746    vstelm.w       vr13,    t2,     184,    2
   2747    vstelm.h       vr13,    t2,     188,    6
   2748 
        // Last edge pixel copied through unfiltered (output byte 190).
   2749    ld.bu          t7,      a2,     95
   2750    st.b           t7,      t2,     190
   2751    b              .Z1_UEDGE_END
   2752 
   2753 .Z1_UEDGE_W64_H64:
        // w==64, h==64: output bytes 0..254 (2*(w+h)-1 = 255).
   2754    //0-7
   2755    vld            vr7,     a2,      -1
   2756    z1_upsample_edge_data_init1
   2757    vst            vr13,    t2,     0
   2758 
   2759    //8-15
   2760    vld            vr7,     a2,      7
   2761    z1_upsample_edge_data_init1
   2762    vst            vr13,    t2,      16
   2763 
   2764    //16-23
   2765    vld            vr7,     a2,      15
   2766    z1_upsample_edge_data_init1
   2767    vst            vr13,    t2,      32
   2768 
   2769    //24-31
   2770    vld            vr7,     a2,      23
   2771    z1_upsample_edge_data_init1
   2772    vst            vr13,    t2,      48
   2773 
   2774    //32-39
   2775    vld            vr7,     a2,      31
   2776    z1_upsample_edge_data_init1
   2777    vst            vr13,    t2,      64
   2778 
   2779    //40-47
   2780    vld            vr7,     a2,      39
   2781    z1_upsample_edge_data_init1
   2782    vst            vr13,    t2,      80
   2783 
   2784    //48-55
   2785    vld            vr7,     a2,      47
   2786    z1_upsample_edge_data_init1
   2787    vst            vr13,    t2,      96
   2788 
   2789    //56-63
   2790    vld            vr7,     a2,      55
   2791    z1_upsample_edge_data_init1
   2792    vst            vr13,    t2,      112
   2793 
   2794    //64-71
   2795    vld            vr7,     a2,      63
   2796    z1_upsample_edge_data_init1
   2797    vst            vr13,    t2,      128
   2798 
   2799    //72-79
   2800    vld            vr7,     a2,      71
   2801    z1_upsample_edge_data_init1
   2802    vst            vr13,    t2,      144
   2803 
   2804    //80-87
   2805    vld            vr7,     a2,      79
   2806    z1_upsample_edge_data_init1
   2807    vst            vr13,    t2,      160
   2808 
   2809    //88-95
   2810    vld            vr7,     a2,      87
   2811    z1_upsample_edge_data_init1
   2812    vst            vr13,    t2,      176
   2813 
   2814    //96-103
   2815    vld            vr7,     a2,      95
   2816    z1_upsample_edge_data_init1
   2817    vst            vr13,    t2,      192
   2818 
   2819    //104-111
   2820    vld            vr7,     a2,      103
   2821    z1_upsample_edge_data_init1
   2822    vst            vr13,    t2,      208
   2823 
   2824    //112-119
   2825    vld            vr7,     a2,      111
   2826    z1_upsample_edge_data_init1
   2827    vst            vr13,    t2,      224
   2828 
   2829    //120-126
        // Final partial group: 14 bytes stored (8 + 4 + 2).
   2830    vld            vr7,     a2,      119
   2831    z1_upsample_edge_data_init2
   2832    fst.d          f13,     t2,      240
   2833    vstelm.w       vr13,    t2,      248,    2
   2834    vstelm.h       vr13,    t2,      252,    6
   2835 
        // Last edge pixel copied through unfiltered (output byte 254).
   2836    ld.bu          t7,      a2,      127
   2837    st.b           t7,      t2,      254
   2838    b              .Z1_UEDGE_END
   2839 
   2840 .Z1_UEDGE_END:
   2841    //upsample_edge end
        // Common tail for all upsample_above paths: point the z1 fill loop
        // at the freshly built edge buffer and set its limits.
   2842 
   2843    or             a7,      t2,      t2   //top
        // max_base_x = 2*(w+h) - 2 : largest valid index into the
        // upsampled edge (which holds 2*(w+h)-1 bytes).
   2844    add.d          t0,      a3,      a4
   2845    slli.d         t0,      t0,      1
   2846    addi.d         t0,      t0,      -2   //max_base_x
        // NOTE(review): t1 is doubled here — presumably t1 holds dx, which
        // the AV1 spec doubles when the edge is upsampled; confirm where
        // t1 is loaded (outside this view).
   2847    slli.d         t1,      t1,      1
   2848    b              .IPRED_Z1_UA_END
   2849 
   2850 .IPRED_Z1_NOTUA:
        // No upsampling: decide whether to smooth the edge instead.
        // Computes filter_strength into a6 from wh = w + h (a7), the
        // angle delta t4 = 90 - angle (a5), and the incoming a6 flag
        // (nonzero selects the first strength table, zero the second —
        // presumably the is_sm/smooth-filter distinction; confirm at the
        // caller).  Mirrors the two lookup tables of the scalar
        // get_filter_strength as a branch ladder.
   2851    or             t5,      zero,    zero  //upsample_above=0
   2852    beqz           a7,      .IPRED_Z1_NOTFS
   2853    add.d          a7,      a3,      a4  //w+h
   2854    li.w           t4,      90
   2855    sub.d          t4,      t4,      a5
   2856    // ipred_get_filter_strength a6:filter_strength
   2857    beqz           a6,      .Z1_GETFS20
   2858 .Z1_GETFS10:  //wh<=8
        // Table 1 (a6 != 0): strength by wh bucket, then by angle delta.
   2859    addi.d         t6,      a7,      -8
   2860    blt            zero,    t6,      .Z1_GETFS11
   2861    addi.d         t6,      t4,      -64
   2862    blt            t6,      zero,    .Z1_GETFS101
   2863    ori            a6,      zero,    2
   2864    b              .Z1_GETFS40
   2865 .Z1_GETFS101:
   2866    addi.d         t6,      t4,      -40
   2867    blt            t6,      zero,    .Z1_GETFS30
   2868    ori            a6,      zero,    1
   2869    b              .Z1_GETFS40
   2870 .Z1_GETFS11:  //wh<=16
   2871    addi.d         t6,      a7,      -16
   2872    blt            zero,    t6,      .Z1_GETFS12
   2873    addi.d         t6,      t4,      -48
   2874    blt            t6,      zero,    .Z1_GETFS111
   2875    ori            a6,      zero,    2
   2876    b              .Z1_GETFS40
   2877 .Z1_GETFS111:
   2878    addi.d         t6,      t4,      -20
   2879    blt            t6,      zero,    .Z1_GETFS30
   2880    ori            a6,      zero,    1
   2881    b              .Z1_GETFS40
   2882 .Z1_GETFS12:  //wh<=24
   2883    addi.d         t6,      a7,      -24
   2884    blt            zero,    t6,      .Z1_GETFS13
   2885    addi.d         t6,      t4,      -4
   2886    blt            t6,      zero,    .Z1_GETFS30
   2887    ori            a6,      zero,    3
   2888    b              .Z1_GETFS40
   2889 .Z1_GETFS13:
        // wh > 24 in table 1: always maximum strength.
   2890    ori            a6,      zero,    3
   2891    b              .Z1_GETFS40
   2892 
   2893 .Z1_GETFS20:  //wh<=8
        // Table 2 (a6 == 0): same structure, different thresholds.
   2894    addi.d         t6,      a7,      -8
   2895    blt            zero,    t6,      .Z1_GETFS21
   2896    addi.d         t6,      t4,      -56
   2897    blt            t6,      zero,    .Z1_GETFS30
   2898    ori            a6,      zero,    1
   2899    b              .Z1_GETFS40
   2900 .Z1_GETFS21:  //wh<=16
   2901    addi.d         t6,      a7,      -16
   2902    blt            zero,    t6,      .Z1_GETFS22
   2903    addi.d         t6,      t4,      -40
   2904    blt            t6,      zero,    .Z1_GETFS30
   2905    ori            a6,      zero,    1
   2906    b              .Z1_GETFS40
   2907 .Z1_GETFS22:  //wh<=24
   2908    addi.d         t6,      a7,      -24
   2909    blt            zero,    t6,      .Z1_GETFS23
   2910    addi.d         t6,      t4,      -32
   2911    blt            t6,      zero,    .Z1_GETFS221
   2912    ori            a6,      zero,    3
   2913    b              .Z1_GETFS40
   2914 .Z1_GETFS221:
   2915    addi.d         t6,      t4,      -16
   2916    blt            t6,      zero,    .Z1_GETFS222
   2917    ori            a6,      zero,    2
   2918    b              .Z1_GETFS40
   2919 .Z1_GETFS222:
   2920    addi.d         t6,      t4,      -8
   2921    blt            t6,      zero,    .Z1_GETFS30
   2922    ori            a6,      zero,    1
   2923    b              .Z1_GETFS40
   2924 .Z1_GETFS23:  //wh<=32
   2925    addi.d         t6,      a7,      -32
   2926    blt            zero,    t6,      .Z1_GETFS24
   2927    addi.d         t6,      t4,      -32
   2928    blt            t6,      zero,    .Z1_GETFS231
   2929    ori            a6,      zero,    3
   2930    b              .Z1_GETFS40
   2931 .Z1_GETFS231:
   2932    addi.d         t6,      t4,      -4
   2933    blt            t6,      zero,    .Z1_GETFS232
   2934    ori            a6,      zero,    2
   2935    b              .Z1_GETFS40
   2936 .Z1_GETFS232:
   2937    ori            a6,      zero,    1
   2938    b              .Z1_GETFS40
   2939 .Z1_GETFS24:
        // wh > 32 in table 2: always maximum strength.
   2940    ori            a6,      zero,    3
   2941    b              .Z1_GETFS40
   2942 .Z1_GETFS30:
        // No filtering for this wh/angle combination.
   2943   or              a6,      zero,    zero
   2944 .Z1_GETFS40:
   2945 
        // strength == 0 means the edge is used as-is.
   2946    beqz           a6,      .IPRED_Z1_NOTFS
   2947 
   2948 .IPRED_Z1_IFFS:
   2949    // filter_edge
        // Load the 5-tap smoothing kernel row for this strength:
        // the tables are laid out as 16-byte rows indexed by
        // (strength - 1) * 16; vr1 holds taps 0-3, vr6 holds tap 4.
   2950    addi.d         a6,      a6,      -1
   2951    slli.d         a6,      a6,      4
   2952    la.local       t0,      ipred_filter_edge_kernel1
   2953    vldx           vr1,     t0,      a6    //kernel[0-3]
   2954 
   2955    la.local       t0,      ipred_filter_edge_kernel2
   2956    vldx           vr6,     t0,      a6    //kernel[4]
   2957 
   2958 .IPRED_Z1_FS_W4:
        // filter_edge for w==4, dispatched on block height (a4).
        // a2 = edge pointer, t2 = filtered-edge output buffer; the
        // filtered edge is w + h bytes long (e.g. 8 for 4x4, 12 for 4x8).
        // NOTE(review): z1_filter_edge_data_init*/calc_* are macros
        // defined earlier in this file (outside this view); they apply
        // the 5-tap kernel loaded in vr1/vr6 and leave results in vr12.
   2959    andi           t0,      a3,      4
   2960    beqz           t0,      .IPRED_Z1_FS_W8
   2961 .IPRED_Z1_FS_W4_H4:
   2962    andi           t0,      a4,      4
   2963    beqz           t0,      .IPRED_Z1_FS_W4_H8
   2964 
   2965    //0-7
        // vextrins.b 0x65/0x75 replicate the last valid lane so the
        // 5-tap window never reads past the edge.
   2966    vld            vr7,     a2,      -1
   2967    z1_filter_edge_data_init4
   2968    vbsrl.v        vr13,    vr7,     3
   2969    vextrins.b     vr13,    vr13,    0x65
   2970    vextrins.b     vr13,    vr13,    0x75
   2971    z1_filter_edge_calc_loop2
   2972    fst.d          f12,     t2,      0
   2973    b              .IPRED_Z1_FS_END
   2974 
   2975 .IPRED_Z1_FS_W4_H8:
   2976    andi           t0,      a4,      8
   2977    beqz           t0,      .IPRED_Z1_FS_W4_H16
   2978 
   2979    //0-7
   2980    vld            vr7,     a2,      -1
   2981    z1_filter_edge_data_init4
   2982    vbsrl.v        vr13,    vr7,     3
   2983    vextrins.b     vr13,    vr13,    0x65
   2984    vextrins.b     vr13,    vr13,    0x75
   2985    z1_filter_edge_calc_loop2
   2986    fst.d          f12,     t2,      0
   2987 
   2988    //8-11
        // Tail: broadcast the last source byte, restore the one real
        // preceding pixel, and filter the padded group.
   2989    vreplvei.b     vr10,    vr7,     8
   2990    vextrins.b     vr10,    vr7,     0x07
   2991    z1_filter_edge_calc_other
   2992    fst.s          f12,     t2,      8
   2993 
   2994    b              .IPRED_Z1_FS_END
   2995 
   2996 .IPRED_Z1_FS_W4_H16:
   2997    andi           t0,      a4,      16
   2998    beqz           t0,      .IPRED_Z1_FS_W4_H32
   2999 
   3000    //0-7
   3001    vld            vr7,     a2,      -1
   3002    z1_filter_edge_data_init4
   3003    vbsrl.v        vr13,    vr7,     3
   3004    vextrins.b     vr13,    vr13,    0x65
   3005    vextrins.b     vr13,    vr13,    0x75
   3006    z1_filter_edge_calc_loop2
   3007    fst.d          f12,     t2,      0
   3008 
   3009    //8-15
   3010    vreplvei.b     vr10,    vr7,     8
   3011    vextrins.b     vr10,    vr7,     0x07
   3012    z1_filter_edge_calc_other
   3013    fst.d          f12,     t2,      8
   3014 
   3015    //16-19
        // Beyond the source: the padded region is constant, so replicate
        // the settled filtered value and store it.
   3016    vreplvei.b     vr12,    vr12,    1
   3017    fst.s          f12,     t2,      16
   3018 
   3019    b              .IPRED_Z1_FS_END
   3020 
   3021 .IPRED_Z1_FS_W4_H32:
   3022    andi           t0,      a4,      32
   3023    beqz           t0,      .IPRED_Z1_FS_W4_H64
   3024 
   3025    //0-7
   3026    vld            vr7,     a2,      -1
   3027    z1_filter_edge_data_init4
   3028    vbsrl.v        vr13,    vr7,     3
   3029    vextrins.b     vr13,    vr13,    0x65
   3030    vextrins.b     vr13,    vr13,    0x75
   3031    z1_filter_edge_calc_loop2
   3032    fst.d          f12,     t2,      0
   3033 
   3034    //8-15
   3035    vreplvei.b     vr10,    vr7,     8
   3036    vextrins.b     vr10,    vr7,     0x07
   3037    z1_filter_edge_calc_other
   3038    fst.d          f12,     t2,      8
   3039 
   3040    //16-23
        // Remaining output is the replicated constant value.
   3041    vreplvei.b     vr12,    vr12,    1
   3042    fst.d          f12,     t2,      16
   3043 
   3044    fst.d          f12,     t2,      24 //24-31
   3045    fst.s          f12,     t2,      32 //32-35
   3046 
   3047    b              .IPRED_Z1_FS_END
   3048 
   3049 .IPRED_Z1_FS_W4_H64:
   3050    //0-7
   3051    vld            vr7,     a2,      -1
   3052    z1_filter_edge_data_init4
   3053    vbsrl.v        vr13,    vr7,     3
   3054    vextrins.b     vr13,    vr13,    0x65
   3055    vextrins.b     vr13,    vr13,    0x75
   3056    z1_filter_edge_calc_loop2
   3057    fst.d          f12,     t2,      0
   3058 
   3059    //8-15
   3060    vreplvei.b     vr10,    vr7,     8
   3061    vextrins.b     vr10,    vr7,     0x07
   3062    z1_filter_edge_calc_other
   3063    fst.d          f12,     t2,      8
   3064 
   3065    //16-23
        // Remaining output is the replicated constant value.
   3066    vreplvei.b     vr12,    vr12,    1
   3067    fst.d          f12,     t2,      16
   3068 
   3069    fst.d          f12,     t2,      24 //24-31
   3070    fst.d          f12,     t2,      32 //32-39
   3071    fst.d          f12,     t2,      40 //40-47
   3072    fst.d          f12,     t2,      48 //48-55
   3073    fst.d          f12,     t2,      56 //56-63
   3074    fst.s          f12,     t2,      64 //64-67
   3075 
   3076    b              .IPRED_Z1_FS_END
   3077 
   3078 .IPRED_Z1_FS_W8:
        // filter_edge for w==8, dispatched on block height (a4).
        // Same structure as the w==4 path: filter w+h edge bytes into t2,
        // padding past the last source pixel by lane replication, then
        // fill any remainder with the constant filtered value.
   3079    andi           t0,      a3,      8
   3080    beqz           t0,      .IPRED_Z1_FS_W16
   3081 .IPRED_Z1_FS_W8_H4:
   3082    andi           t0,      a4,      4
   3083    beqz           t0,      .IPRED_Z1_FS_W8_H8
   3084 
   3085    //0-7
   3086    vld            vr7,     a2,      -1
   3087    z1_filter_edge_data_init1
   3088    vbsrl.v        vr13,    vr7,     3
   3089    z1_filter_edge_calc_loop2
   3090    fst.d          f12,     t2,      0
   3091 
   3092    //8-11
        // Tail group built inline: shifted source copies with the last
        // valid lane replicated (vextrins.b) feed the two filter passes.
   3093    vld            vr7,     a2,      6
   3094    vbsrl.v        vr11,    vr7,     1
   3095    vbsrl.v        vr12,    vr7,     2
   3096    vbsrl.v        vr13,    vr7,     3
   3097    vextrins.b     vr13,    vr13,    0x32
   3098    vsllwil.hu.bu  vr10,    vr7,     0
   3099    vsllwil.hu.bu  vr11,    vr11,    0
   3100    vsllwil.hu.bu  vr12,    vr12,    0
   3101    vsllwil.hu.bu  vr13,    vr13,    0
   3102    z1_filter_edge_calc_loop1
   3103 
   3104    vbsrl.v        vr13,    vr7,     4
   3105    vextrins.b     vr13,    vr13,    0x21
   3106    vextrins.b     vr13,    vr13,    0x31
   3107    z1_filter_edge_calc_loop2
   3108    fst.s          f12,     t2,      8
   3109    b              .IPRED_Z1_FS_END
   3110 
   3111 .IPRED_Z1_FS_W8_H8:
   3112    andi           t0,      a4,      8
   3113    beqz           t0,      .IPRED_Z1_FS_W8_H16
   3114 
   3115    //0-7
   3116    vld            vr7,     a2,      -1
   3117    z1_filter_edge_data_init1
   3118    vbsrl.v        vr13,    vr7,     3
   3119    z1_filter_edge_calc_loop2
   3120    fst.d          f12,     t2,      0
   3121 
   3122    //8-15
   3123    vld            vr7,     a2,      6
   3124    z1_filter_edge_data_init3
   3125    vbsrl.v        vr13,    vr7,     4
   3126    vextrins.b     vr13,    vr13,    0x65
   3127    vextrins.b     vr13,    vr13,    0x75
   3128    z1_filter_edge_calc_loop2
   3129    fst.d          f12,     t2,      8
   3130    b              .IPRED_Z1_FS_END
   3131 
   3132 .IPRED_Z1_FS_W8_H16:
   3133    andi           t0,      a4,      16
   3134    beqz           t0,      .IPRED_Z1_FS_W8_H32
   3135 
   3136    //0-7
   3137    vld            vr7,     a2,      -1
   3138    z1_filter_edge_data_init1
   3139    vbsrl.v        vr13,    vr7,     3
   3140    z1_filter_edge_calc_loop2
   3141    fst.d          f12,     t2,      0
   3142 
   3143    //8-15
   3144    vld            vr7,     a2,      6
   3145    z1_filter_edge_data_init3
   3146    vbsrl.v        vr13,    vr7,     4
   3147    vextrins.b     vr13,    vr13,    0x65
   3148    vextrins.b     vr13,    vr13,    0x75
   3149    z1_filter_edge_calc_loop2
   3150    fst.d          f12,     t2,      8
   3151 
   3152    //16-23
        // Padded tail: broadcast the last source byte, restore the one
        // real preceding pixel, filter the padded group.
   3153    vreplvei.b     vr10,    vr7,     9
   3154    vextrins.b     vr10,    vr7,     0x08
   3155    z1_filter_edge_calc_other
   3156    fst.d          f12,     t2,      16
   3157 
   3158    b              .IPRED_Z1_FS_END
   3159 
   3160 .IPRED_Z1_FS_W8_H32:
   3161    andi           t0,      a4,      32
   3162    beqz           t0,      .IPRED_Z1_FS_W8_H64
   3163 
   3164    //0-7
   3165    vld            vr7,     a2,      -1
   3166    z1_filter_edge_data_init1
   3167    vbsrl.v        vr13,    vr7,     3
   3168    z1_filter_edge_calc_loop2
   3169    fst.d          f12,     t2,      0
   3170 
   3171    //8-15
   3172    vld            vr7,     a2,      6
   3173    z1_filter_edge_data_init3
   3174    vbsrl.v        vr13,    vr7,     4
   3175    vextrins.b     vr13,    vr13,    0x65
   3176    vextrins.b     vr13,    vr13,    0x75
   3177    z1_filter_edge_calc_loop2
   3178    fst.d          f12,     t2,      8
   3179 
   3180    //16-23
   3181    vreplvei.b     vr10,    vr7,     9
   3182    vextrins.b     vr10,    vr7,     0x08
   3183    z1_filter_edge_calc_other
   3184    fst.d          f12,     t2,      16
   3185 
   3186    //24-31
        // Remaining output is the replicated constant value.
   3187    vreplvei.b     vr12,    vr12,    1
   3188    fst.d          f12,     t2,      24
   3189 
   3190    //32-39
   3191    fst.d          f12,     t2,      32
   3192 
   3193    b              .IPRED_Z1_FS_END
   3194 
   3195 .IPRED_Z1_FS_W8_H64:
   3196    //0-7
   3197    vld            vr7,     a2,      -1
   3198    z1_filter_edge_data_init1
   3199    vbsrl.v        vr13,    vr7,     3
   3200    z1_filter_edge_calc_loop2
   3201    fst.d          f12,     t2,      0
   3202 
   3203    //8-15
   3204    vld            vr7,     a2,      6
   3205    z1_filter_edge_data_init3
   3206    vbsrl.v        vr13,    vr7,     4
   3207    vextrins.b     vr13,    vr13,    0x65
   3208    vextrins.b     vr13,    vr13,    0x75
   3209    z1_filter_edge_calc_loop2
   3210    fst.d          f12,     t2,      8
   3211 
   3212    //16-23
   3213    vreplvei.b     vr10,    vr7,     9
   3214    vextrins.b     vr10,    vr7,     0x08
   3215    z1_filter_edge_calc_other
   3216    fst.d          f12,     t2,      16
   3217 
   3218    //24-31
        // Remaining output is the replicated constant value.
   3219    vreplvei.b     vr12,    vr12,    1
   3220    fst.d          f12,     t2,      24
   3221 
   3222    fst.d          f12,     t2,      32  //32-39
   3223    fst.d          f12,     t2,      40  //40-47
   3224    fst.d          f12,     t2,      48  //48-55
   3225    fst.d          f12,     t2,      56  //56-63
   3226    fst.d          f12,     t2,      64  //64-71
   3227 
   3228    b              .IPRED_Z1_FS_END
   3229 
   3230 .IPRED_Z1_FS_W16:
        // filter_edge for w==16, dispatched on block height (a4).
        // Filters w+h edge bytes into t2 in 8-byte groups; past the last
        // source pixel the output is padded with the constant filtered
        // value (see the vreplvei.b tails).
   3231    andi           t0,      a3,      16
   3232    beqz           t0,      .IPRED_Z1_FS_W32
   3233 .IPRED_Z1_FS_W16_H4:
   3234    andi           t0,      a4,      4
   3235    beqz           t0,      .IPRED_Z1_FS_W16_H8
   3236 
   3237    //0-7
   3238    vld            vr7,     a2,      -1
   3239    z1_filter_edge_data_init1
   3240    vbsrl.v        vr13,    vr7,     3
   3241    z1_filter_edge_calc_loop2
   3242    fst.d          f12,     t2,      0
   3243 
   3244    //8-15
   3245    vld            vr7,     a2,      6
   3246    z1_filter_edge_data_init2
   3247    vbsrl.v        vr13,    vr7,     4
   3248    z1_filter_edge_calc_loop2
   3249    fst.d          f12,     t2,      8
   3250 
   3251    //16-19
        // Tail group built inline: shifted source copies with the last
        // valid lane replicated (vextrins.b) feed the two filter passes.
   3252    vld            vr7,     a2,      14
   3253    vbsrl.v        vr11,    vr7,     1
   3254    vbsrl.v        vr12,    vr7,     2
   3255    vbsrl.v        vr13,    vr7,     3
   3256    vextrins.b     vr13,    vr13,    0x32
   3257    vsllwil.hu.bu  vr10,    vr7,     0
   3258    vsllwil.hu.bu  vr11,    vr11,    0
   3259    vsllwil.hu.bu  vr12,    vr12,    0
   3260    vsllwil.hu.bu  vr13,    vr13,    0
   3261    z1_filter_edge_calc_loop1
   3262 
   3263    vbsrl.v        vr13,    vr7,     4
   3264    vextrins.b     vr13,    vr13,    0x21
   3265    vextrins.b     vr13,    vr13,    0x31
   3266    z1_filter_edge_calc_loop2
   3267    fst.s          f12,     t2,      16
   3268    b              .IPRED_Z1_FS_END
   3269 
   3270 .IPRED_Z1_FS_W16_H8:
   3271    andi           t0,      a4,      8
   3272    beqz           t0,      .IPRED_Z1_FS_W16_H16
   3273 
   3274    //0-7
   3275    vld            vr7,     a2,      -1
   3276    z1_filter_edge_data_init1
   3277    vbsrl.v        vr13,    vr7,     3
   3278    z1_filter_edge_calc_loop2
   3279    fst.d          f12,     t2,      0
   3280 
   3281    //8-15
   3282    vld            vr7,     a2,      6
   3283    z1_filter_edge_data_init2
   3284    vbsrl.v        vr13,    vr7,     4
   3285    z1_filter_edge_calc_loop2
   3286    fst.d          f12,     t2,      8
   3287 
   3288    //16-23
   3289    vld            vr7,     a2,      14
   3290    z1_filter_edge_data_init3
   3291    vbsrl.v        vr13,    vr7,     4
   3292    vextrins.b     vr13,    vr13,    0x65
   3293    vextrins.b     vr13,    vr13,    0x75
   3294    z1_filter_edge_calc_loop2
   3295    fst.d          f12,     t2,      16
   3296    b              .IPRED_Z1_FS_END
   3297 
   3298 .IPRED_Z1_FS_W16_H16:
   3299    andi           t0,      a4,      16
   3300    beqz           t0,      .IPRED_Z1_FS_W16_H32
   3301 
   3302    //0-7
   3303    vld            vr7,     a2,      -1
   3304    z1_filter_edge_data_init1
   3305    vbsrl.v        vr13,    vr7,     3
   3306    z1_filter_edge_calc_loop2
   3307    fst.d          f12,     t2,      0
   3308 
   3309    //8-15
   3310    vld            vr7,     a2,      6
   3311    z1_filter_edge_data_init2
   3312    vbsrl.v        vr13,    vr7,     4
   3313    z1_filter_edge_calc_loop2
   3314    fst.d          f12,     t2,      8
   3315 
   3316    //16-23
   3317    vld            vr7,     a2,      14
   3318    z1_filter_edge_data_init2
   3319    vbsrl.v        vr13,    vr7,     4
   3320    z1_filter_edge_calc_loop2
   3321    fst.d          f12,     t2,      16
   3322 
   3323    //24-31
   3324    vld            vr7,     a2,      22
   3325    z1_filter_edge_data_init3
   3326    vbsrl.v        vr13,    vr7,     4
   3327    vextrins.b     vr13,    vr13,    0x65
   3328    vextrins.b     vr13,    vr13,    0x75
   3329    z1_filter_edge_calc_loop2
   3330    fst.d          f12,     t2,      24
   3331    b              .IPRED_Z1_FS_END
   3332 
   3333 .IPRED_Z1_FS_W16_H32:
   3334    andi           t0,      a4,      32
   3335    beqz           t0,      .IPRED_Z1_FS_W16_H64
   3336 
   3337    //0-7
   3338    vld            vr7,     a2,      -1
   3339    z1_filter_edge_data_init1
   3340    vbsrl.v        vr13,    vr7,     3
   3341    z1_filter_edge_calc_loop2
   3342    fst.d          f12,     t2,      0
   3343 
   3344    //8-15
   3345    vld            vr7,     a2,      6
   3346    z1_filter_edge_data_init2
   3347    vbsrl.v        vr13,    vr7,     4
   3348    z1_filter_edge_calc_loop2
   3349    fst.d          f12,     t2,      8
   3350 
   3351    //16-23
   3352    vld            vr7,     a2,      14
   3353    z1_filter_edge_data_init2
   3354    vbsrl.v        vr13,    vr7,     4
   3355    z1_filter_edge_calc_loop2
   3356    fst.d          f12,     t2,      16
   3357 
   3358    //24-31
   3359    vld            vr7,     a2,      22
   3360    z1_filter_edge_data_init3
   3361    vbsrl.v        vr13,    vr7,     4
   3362    vextrins.b     vr13,    vr13,    0x65
   3363    vextrins.b     vr13,    vr13,    0x75
   3364    z1_filter_edge_calc_loop2
   3365    fst.d          f12,     t2,      24
   3366 
   3367    //32-39
        // Padded tail: broadcast the last source byte, restore the one
        // real preceding pixel, filter the padded group.
   3368    vreplvei.b     vr10,    vr7,     9
   3369    vextrins.b     vr10,    vr7,     0x08
   3370    z1_filter_edge_calc_other
   3371    fst.d          f12,     t2,      32
   3372 
   3373    //40-47
   3374    vreplvei.b     vr12,    vr12,    1
   3375    fst.d          f12,     t2,      40
   3376 
   3377    b              .IPRED_Z1_FS_END
   3378 
   3379 .IPRED_Z1_FS_W16_H64:
   3380    //0-7
   3381    vld            vr7,     a2,      -1
   3382    z1_filter_edge_data_init1
   3383    vbsrl.v        vr13,    vr7,     3
   3384    z1_filter_edge_calc_loop2
   3385    fst.d          f12,     t2,      0
   3386 
   3387    //8-15
   3388    vld            vr7,     a2,      6
   3389    z1_filter_edge_data_init2
   3390    vbsrl.v        vr13,    vr7,     4
   3391    z1_filter_edge_calc_loop2
   3392    fst.d          f12,     t2,      8
   3393 
   3394    //16-23
   3395    vld            vr7,     a2,      14
   3396    z1_filter_edge_data_init2
   3397    vbsrl.v        vr13,    vr7,     4
   3398    z1_filter_edge_calc_loop2
   3399    fst.d          f12,     t2,      16
   3400 
   3401    //24-31
   3402    vld            vr7,     a2,      22
   3403    z1_filter_edge_data_init3
   3404    vbsrl.v        vr13,    vr7,     4
   3405    vextrins.b     vr13,    vr13,    0x65
   3406    vextrins.b     vr13,    vr13,    0x75
   3407    z1_filter_edge_calc_loop2
   3408    fst.d          f12,     t2,      24
   3409 
   3410    //32-39
   3411    vreplvei.b     vr10,    vr7,     9
   3412    vextrins.b     vr10,    vr7,     0x08
   3413    z1_filter_edge_calc_other
   3414    fst.d          f12,     t2,      32
   3415 
   3416    //40-47
        // Remaining output is the replicated constant value.
   3417    vreplvei.b     vr12,    vr12,    1
   3418    fst.d          f12,     t2,      40
   3419 
   3420    fst.d          f12,     t2,      48 //48-55
   3421    fst.d          f12,     t2,      56 //56-63
   3422    fst.d          f12,     t2,      64 //64-71
   3423    fst.d          f12,     t2,      72 //72-79
   3424 
   3425    b              .IPRED_Z1_FS_END
   3426 
// Dispatch for w==32 (a3 = width, a4 = height), then the w==32, h==8 case:
// 40 edge bytes (w + imin(w,h)) filtered into t2, 8 per group.
.IPRED_Z1_FS_W32:
   andi           t0,      a3,      32
   beqz           t0,      .IPRED_Z1_FS_W64
.IPRED_Z1_FS_W32_H8:
   andi           t0,      a4,      8
   beqz           t0,      .IPRED_Z1_FS_W32_H16

   //0-7: first group starts one byte before the edge (topleft pixel).
   vld            vr7,     a2,      -1
   z1_filter_edge_data_init1
   vbsrl.v        vr13,    vr7,     3
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      0

   //8-15
   vld            vr7,     a2,      6
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      8

   //16-23
   vld            vr7,     a2,      14
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      16

   //24-31
   vld            vr7,     a2,      22
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      24

   //32-39: final group — init3 + lane clamping keeps reads inside the edge.
   vld            vr7,     a2,      30
   z1_filter_edge_data_init3
   vbsrl.v        vr13,    vr7,     4
   vextrins.b     vr13,    vr13,    0x65
   vextrins.b     vr13,    vr13,    0x75
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      32

   b              .IPRED_Z1_FS_END
   3472 
// w==32, h==16: 48 edge bytes (w + imin(w,h)) filtered into t2, 8 per group.
.IPRED_Z1_FS_W32_H16:
   andi           t0,      a4,      16
   beqz           t0,      .IPRED_Z1_FS_W32_H32

   //0-7: first group starts one byte before the edge (topleft pixel).
   vld            vr7,     a2,      -1
   z1_filter_edge_data_init1
   vbsrl.v        vr13,    vr7,     3
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      0

   //8-15
   vld            vr7,     a2,      6
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      8

   //16-23
   vld            vr7,     a2,      14
   z1_filter_edge_data_init2

   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      16

   //24-31
   vld            vr7,     a2,      22
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      24

   //32-39
   vld            vr7,     a2,      30
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      32

   //40-47: final group — init3 + lane clamping keeps reads inside the edge.
   vld            vr7,     a2,      38
   z1_filter_edge_data_init3
   vbsrl.v        vr13,    vr7,     4
   vextrins.b     vr13,    vr13,    0x65
   vextrins.b     vr13,    vr13,    0x75
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      40

   b              .IPRED_Z1_FS_END
   3523 
// w==32, h==32: 64 edge bytes (w + imin(w,h)) filtered into t2, 8 per group.
.IPRED_Z1_FS_W32_H32:
   andi           t0,      a4,      32
   beqz           t0,      .IPRED_Z1_FS_W32_H64

   //0-7: first group starts one byte before the edge (topleft pixel).
   vld            vr7,     a2,      -1
   z1_filter_edge_data_init1
   vbsrl.v        vr13,    vr7,     3
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      0

   //8-15
   vld            vr7,     a2,      6
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      8

   //16-23
   vld            vr7,     a2,      14
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      16

   //24-31
   vld            vr7,     a2,      22
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      24

   //32-39
   vld            vr7,     a2,      30
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      32

   //40-47
   vld            vr7,     a2,      38
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      40

   //48-55
   vld            vr7,     a2,      46
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      48

   //56-63: final group — init3 + lane clamping keeps reads inside the edge.
   vld            vr7,     a2,      54
   z1_filter_edge_data_init3
   vbsrl.v        vr13,    vr7,     4
   vextrins.b     vr13,    vr13,    0x65
   vextrins.b     vr13,    vr13,    0x75
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      56

   b              .IPRED_Z1_FS_END
   3587 
// w==32, h==64: 96 edge bytes (w + imin(w,h)) into t2 — 72 filtered from the
// source, the rest padded with the last filtered value.
.IPRED_Z1_FS_W32_H64:
   //0-7: first group starts one byte before the edge (topleft pixel).
   vld            vr7,     a2,      -1
   z1_filter_edge_data_init1
   vbsrl.v        vr13,    vr7,     3
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      0

   //8-15
   vld            vr7,     a2,      6
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      8

   //16-23
   vld            vr7,     a2,      14
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      16

   //24-31
   vld            vr7,     a2,      22
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      24

   //32-39
   vld            vr7,     a2,      30
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      32

   //40-47
   vld            vr7,     a2,      38
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      40

   //48-55
   vld            vr7,     a2,      46
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      48

   //56-63: last full load — clamp the top lanes so reads stay inside the edge.
   vld            vr7,     a2,      54
   z1_filter_edge_data_init3
   vbsrl.v        vr13,    vr7,     4
   vextrins.b     vr13,    vr13,    0x65
   vextrins.b     vr13,    vr13,    0x75
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      56

   //64-71: tail group built from bytes 8/9 of the previous load.
   vreplvei.b     vr10,    vr7,     9
   vextrins.b     vr10,    vr7,     0x08
   z1_filter_edge_calc_other
   fst.d          f12,     t2,      64

   //72-79: splat the final filtered byte across vr12 for padding.
   vreplvei.b     vr12,    vr12,    1
   fst.d          f12,     t2,      72

   fst.d          f12,     t2,      80 //80-87
   fst.d          f12,     t2,      88 //88-95

   b              .IPRED_Z1_FS_END
   3661 
// w==64 dispatch, then the w==64, h==16 case: 80 edge bytes (w + imin(w,h))
// filtered into t2, 8 per group.
.IPRED_Z1_FS_W64:
.IPRED_Z1_FS_W64_H16:
   andi           t0,      a4,      16
   beqz           t0,      .IPRED_Z1_FS_W64_H32

   //0-7: first group starts one byte before the edge (topleft pixel).
   vld            vr7,     a2,      -1
   z1_filter_edge_data_init1
   vbsrl.v        vr13,    vr7,     3
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      0

   //8-15
   vld            vr7,     a2,      6
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      8

   //16-23
   vld            vr7,     a2,      14
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      16

   //24-31
   vld            vr7,     a2,      22
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      24

   //32-39
   vld            vr7,     a2,      30
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      32

   //40-47
   vld            vr7,     a2,      38
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      40

   //48-55
   vld            vr7,     a2,      46
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      48

   //56-63
   vld            vr7,     a2,      54
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      56

   //64-71
   vld            vr7,     a2,      62
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      64

   //72-79: final group — init3 + lane clamping keeps reads inside the edge.
   vld            vr7,     a2,      70
   z1_filter_edge_data_init3
   vbsrl.v        vr13,    vr7,     4
   vextrins.b     vr13,    vr13,    0x65
   vextrins.b     vr13,    vr13,    0x75
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      72

   b              .IPRED_Z1_FS_END
   3740 
// w==64, h==32: 96 edge bytes (w + imin(w,h)) filtered into t2, 8 per group.
.IPRED_Z1_FS_W64_H32:
   andi           t0,      a4,      32
   beqz           t0,      .IPRED_Z1_FS_W64_H64

   //0-7: first group starts one byte before the edge (topleft pixel).
   vld            vr7,     a2,      -1
   z1_filter_edge_data_init1
   vbsrl.v        vr13,    vr7,     3
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      0

   //8-15
   vld            vr7,     a2,      6
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      8

   //16-23
   vld            vr7,     a2,      14
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      16

   //24-31
   vld            vr7,     a2,      22
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      24

   //32-39
   vld            vr7,     a2,      30
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      32

   //40-47
   vld            vr7,     a2,      38
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      40

   //48-55
   vld            vr7,     a2,      46
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      48

   //56-63
   vld            vr7,     a2,      54
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      56

   //64-71
   vld            vr7,     a2,      62
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      64

   //72-79
   vld            vr7,     a2,      70
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      72

   //80-87
   vld            vr7,     a2,      78
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      80

   //88-95: final group — init3 + lane clamping keeps reads inside the edge.
   vld            vr7,     a2,      86
   z1_filter_edge_data_init3
   vbsrl.v        vr13,    vr7,     4
   vextrins.b     vr13,    vr13,    0x65
   vextrins.b     vr13,    vr13,    0x75
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      88

   b              .IPRED_Z1_FS_END
   3832 
// w==64, h==64: 128 edge bytes (w + imin(w,h)) filtered into t2, 8 per group;
// falls through to .IPRED_Z1_FS_END.
.IPRED_Z1_FS_W64_H64:
   //0-7: first group starts one byte before the edge (topleft pixel).
   vld            vr7,     a2,      -1
   z1_filter_edge_data_init1
   vbsrl.v        vr13,    vr7,     3
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      0

   //8-15
   vld            vr7,     a2,      6
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      8

   //16-23
   vld            vr7,     a2,      14
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      16

   //24-31
   vld            vr7,     a2,      22
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      24

   //32-39
   vld            vr7,     a2,      30
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      32

   //40-47
   vld            vr7,     a2,      38
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      40

   //48-55
   vld            vr7,     a2,      46
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      48

   //56-63
   vld            vr7,     a2,      54
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      56

   //64-71
   vld            vr7,     a2,      62
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      64

   //72-79
   vld            vr7,     a2,      70
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      72

   //80-87
   vld            vr7,     a2,      78
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      80

   //88-95
   vld            vr7,     a2,      86
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      88

   //96-103
   vld            vr7,     a2,      94
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      96

   //104-111
   vld            vr7,     a2,      102
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      104

   //112-119
   vld            vr7,     a2,      110
   z1_filter_edge_data_init2
   vbsrl.v        vr13,    vr7,     4
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      112

   //120-127: final group — init3 + lane clamping keeps reads inside the edge.
   vld            vr7,     a2,      118
   z1_filter_edge_data_init3
   vbsrl.v        vr13,    vr7,     4
   vextrins.b     vr13,    vr13,    0x65
   vextrins.b     vr13,    vr13,    0x75
   z1_filter_edge_calc_loop2
   fst.d          f12,     t2,      120
   3947 
// Common exit of the filter-strength paths: switch the "top" pointer (a7)
// to the filtered scratch buffer in t2 and set t0 = max_base_x.
// NOTE(review): a7 here holds a pixel count set before this chunk — confirm
// it is the filtered-edge size so that max_base_x = size - 1.
.IPRED_Z1_FS_END:
   addi.d         t0,      a7,      -1   //max_base_x
   or             a7,      t2,      t2   //top
   b              .IPRED_Z1_UA_END
   3952 
// No edge filtering: predict straight from the caller's edge at a2.
// t0 = width + imin(width, height) - 1 = max_base_x.
.IPRED_Z1_NOTFS:
   or             a7,      a2,      a2   //top
   // imin_gr: t0 = imin(width a3, height a4)
   blt            a3,      a4,      .Z1_IMIN1
   or             t0,      a4,      a4
   b              .Z1_IMIN2
.Z1_IMIN1:
   or             t0,      a3,      a3
.Z1_IMIN2:

   add.d          t0,      a3,      t0
   addi.d         t0,      t0,      -1   //max_base_x
   3965 
// Main z1 prediction, upsampled-edge variant (taken when t5 != 0).
// Per output pixel: dst = (top[base]*(64-frac) + top[base+1]*frac + 32) >> 6,
// where base advances by 2 bytes per pixel in the upsampled edge (hence the
// even/odd vpickev/vpickod split and the t6 += 16 step per 8 pixels).
// Registers: t2 = y, t3 = xpos, t4 = frac, t6 = base, a6 = x (dst offset),
// a2 = number of in-range pixels, t0 = max_base_x, a7 = top pointer.
.IPRED_Z1_UA_END:
   //st dst, t1:dx  a2 a6 t6 t7
   beqz           t5,      .Z1_UA0

   li.w           a5,      64
   vreplgr2vr.h   vr0,     a5            // vr0 = 64 (per lane)
   vsrai.h        vr7,     vr0,     1    // vr7 = 32 = rounding bias
   or             t2,      zero,    zero  //y
   or             t3,      t1,      t1    //xpos
.Z1_LOOPY:
   andi           t4,      t3,      0x3e  //frac
   vreplgr2vr.h   vr1,     t4
   vsub.h         vr2,     vr0,     vr1   // vr2 = 64 - frac
   or             a6,      zero,    zero  //x
   or             a2,      zero,    zero  //base_num
   srai.d         t6,      t3,      6     //base

   // Count how many pixels stay below max_base_x (base steps by 2).
   or             t7,      t6,      t6
   bge            t7,      t0,      .Z1_LOOPX
.Z1_BASENUM:
   addi.d         a2,      a2,      1
   addi.d         t7,      t7,      2
   blt            t7,      t0,      .Z1_BASENUM

.Z1_LOOPX:
   blt            a2,      a3,      .Z1_LOOPX_BASEMAX

   // Whole row in range: vectorize by 8, fall back to one 4-wide step.
   srai.d         t8,      a3,      3  //loop param
   beqz           t8,      .Z1_LOOPX_W4
.Z1_LOOPX_W8:
   add.d          t5,      a7,      t6
   vld            vr3,     t5,      0
   vpickev.b      vr5,     vr3,     vr3  //0 2 4 6...
   vpickod.b      vr6,     vr3,     vr3  //1 3 5 7...
   vsllwil.hu.bu  vr5,     vr5,     0
   vsllwil.hu.bu  vr6,     vr6,     0

   vmul.h         vr3,     vr5,     vr2
   vmadd.h        vr3,     vr6,     vr1
   vadd.h         vr3,     vr3,     vr7
   vsrai.h        vr3,     vr3,     6
   vsrlni.b.h     vr3,     vr3,     0
   fstx.d         f3,      a0,      a6

   addi.d         a6,      a6,      8
   addi.d         t6,      t6,      16
   addi.d         t8,      t8,      -1
   bnez           t8,      .Z1_LOOPX_W8
   b              .Z1_LOOPY_END
.Z1_LOOPX_W4:
   vldx           vr3,     a7,      t6
   vsllwil.hu.bu  vr3,     vr3,     0
   vpickev.h      vr5,     vr3,     vr3  //0 2 4 6...
   vpickod.h      vr6,     vr3,     vr3  //1 3 5 7...

   vmul.h         vr3,     vr5,     vr2
   vmadd.h        vr3,     vr6,     vr1
   vadd.h         vr3,     vr3,     vr7
   vsrai.h        vr3,     vr3,     6
   vsrlni.b.h     vr3,     vr3,     0
   fstx.s         f3,      a0,      a6
   b              .Z1_LOOPY_END
.Z1_LOOPX_BASEMAX:
   // Only a2 pixels in range: do 8/4/2/1-wide steps, then memset the rest.
   srai.d         t8,      a2,      3  //loop param
   beqz           t8,      .Z1_LOOPX_BASEMAX4
.Z1_LOOPX_BASEMAX8:
   add.d          t5,      a7,      t6
   vld            vr3,     t5,      0
   vpickev.b      vr5,     vr3,     vr3  //0 2 4 6...
   vpickod.b      vr6,     vr3,     vr3  //1 3 5 7...
   vsllwil.hu.bu  vr5,     vr5,     0
   vsllwil.hu.bu  vr6,     vr6,     0

   vmul.h         vr3,     vr5,     vr2
   vmadd.h        vr3,     vr6,     vr1
   vadd.h         vr3,     vr3,     vr7
   vsrai.h        vr3,     vr3,     6
   vsrlni.b.h     vr3,     vr3,     0
   fstx.d         f3,      a0,      a6

   addi.d         a6,      a6,      8
   addi.d         t6,      t6,      16
   addi.d         t8,      t8,      -1
   bnez           t8,      .Z1_LOOPX_BASEMAX8
.Z1_LOOPX_BASEMAX4:
   andi           t8,      a2,      4
   beqz           t8,      .Z1_LOOPX_BASEMAX2

   vldx           vr3,     a7,      t6
   vsllwil.hu.bu  vr3,     vr3,     0
   vpickev.h      vr5,     vr3,     vr3  //0 2 4 6...
   vpickod.h      vr6,     vr3,     vr3  //1 3 5 7...

   vmul.h         vr3,     vr5,     vr2
   vmadd.h        vr3,     vr6,     vr1
   vadd.h         vr3,     vr3,     vr7
   vsrai.h        vr3,     vr3,     6
   vsrlni.b.h     vr3,     vr3,     0
   fstx.s         f3,      a0,      a6

   addi.d         a6,      a6,      4
   addi.d         t6,      t6,      8
.Z1_LOOPX_BASEMAX2:
   andi           t8,      a2,     2
   beqz           t8,      .Z1_LOOPX_BASEMAX1

   vldx           vr3,     a7,      t6
   vsllwil.hu.bu  vr3,     vr3,     0
   vpickev.h      vr5,     vr3,     vr3  //0 2 4 6...
   vpickod.h      vr6,     vr3,     vr3  //1 3 5 7...

   vmul.h         vr3,     vr5,     vr2
   vmadd.h        vr3,     vr6,     vr1
   vadd.h         vr3,     vr3,     vr7
   vsrai.h        vr3,     vr3,     6
   vsrlni.b.h     vr3,     vr3,     0
   vpickve2gr.bu  t7,      vr3,     0
   vpickve2gr.bu  t8,      vr3,     1
   stx.b          t7,      a0,      a6
   addi.d         a6,      a6,      1
   stx.b          t8,      a0,      a6
   addi.d         a6,      a6,      1
   addi.d         t6,      t6,      4
.Z1_LOOPX_BASEMAX1:
   andi           t8,      a2,     1
   beqz           t8,      .Z1_LOOPX_BASEMAX_MSET

   // Scalar blend of the final in-range pixel:
   // (top[base]*(64-frac) + top[base+1]*frac + 32) >> 6
   add.d          a2,      a7,      t6
   sub.d          t7,      a5,      t4
   ld.bu          t8,      a2,      0
   mul.w          t7,      t7,      t8
   ld.bu          t8,      a2,      1
   mul.w          t8,      t8,      t4
   add.d          t7,      t7,      t8
   addi.d         t7,      t7,      32
   srai.d         t7,      t7,      6
   stx.b          t7,      a0,      a6

   addi.d         a6,      a6,      1
.Z1_LOOPX_BASEMAX_MSET:  //memset: fill the rest of the row with top[max_base_x]
   add.d          t6,      a0,      a6  //dst
   add.d          t7,      a7,      t0  //src
   sub.d          a2,      a3,      a6  //size
   pixel_set_8bpc_allw t6, t7, a2, t8, t4
.Z1_LOOPY_END:
   addi.d         t2,      t2,      1
   add.d          a0,      a0,      a1   // next dst row (a1 = stride)
   add.d          t3,      t3,      t1   // xpos += dx
   blt            t2,      a4,      .Z1_LOOPY
   b              .Z1_END
   4116 
// Main z1 prediction, non-upsampled variant: same blend as above, but base
// advances by 1 byte per pixel, so the two taps are simply unaligned loads at
// offsets 0 and 1 (no even/odd deinterleave needed).
.Z1_UA0:
   li.w           a5,      64
   vreplgr2vr.h   vr0,     a5            // vr0 = 64 (per lane)
   vsrai.h        vr7,     vr0,     1    // vr7 = 32 = rounding bias
   or             t2,      zero,    zero  //y
   or             t3,      t1,      t1    //xpos
.Z1_UA0_LOOPY:
   andi           t4,      t3,      0x3e  //frac
   vreplgr2vr.h   vr1,     t4
   vsub.h         vr2,     vr0,     vr1   // vr2 = 64 - frac
   or             a6,      zero,    zero  //x
   srai.d         t6,      t3,      6     //base

   // base_num = max(0, max_base_x - base); direct subtraction works here
   // because base steps by 1.
   sub.d          a2,      t0,      t6     //a2:base_num
   blt            a2,      zero,    .Z1_UA0_BASENUM
   b              .Z1_UA0_LOOPX
.Z1_UA0_BASENUM:
   or             a2,      zero,    zero

.Z1_UA0_LOOPX:
   blt            a2,      a3,      .Z1_UA0_LOOPX_BASEMAX

   // Whole row in range: vectorize by 8, fall back to one 4-wide step.
   srai.d         t8,      a3,      3  //loop param
   beqz           t8,      .Z1_UA0_LOOPX_W4
.Z1_UA0_LOOPX_W8:
   add.d          t5,      a7,      t6
   vld            vr5,     t5,      0
   vld            vr6,     t5,      1
   vsllwil.hu.bu  vr5,     vr5,     0
   vsllwil.hu.bu  vr6,     vr6,     0

   vmul.h         vr3,     vr5,     vr2
   vmadd.h        vr3,     vr6,     vr1
   vadd.h         vr3,     vr3,     vr7
   vsrai.h        vr3,     vr3,     6
   vsrlni.b.h     vr3,     vr3,     0
   fstx.d         f3,      a0,      a6

   addi.d         a6,      a6,      8
   addi.d         t6,      t6,      8
   addi.d         t8,      t8,      -1
   bnez           t8,      .Z1_UA0_LOOPX_W8
   b              .Z1_UA0_LOOPY_END
.Z1_UA0_LOOPX_W4:
   vldx           vr5,     a7,      t6
   vsllwil.hu.bu  vr5,     vr5,     0
   vbsrl.v        vr6,     vr5,     2     // vr6 = taps shifted one pixel

   vmul.h         vr3,     vr5,     vr2
   vmadd.h        vr3,     vr6,     vr1
   vadd.h         vr3,     vr3,     vr7
   vsrai.h        vr3,     vr3,     6
   vsrlni.b.h     vr3,     vr3,     0
   fstx.s         f3,      a0,      a6
   b              .Z1_UA0_LOOPY_END
.Z1_UA0_LOOPX_BASEMAX:
   // Only a2 pixels in range: do 8/4/2/1-wide steps, then memset the rest.
   srai.d         t8,      a2,      3  //loop param
   beqz           t8,      .Z1_UA0_LOOPX_BASEMAX4
.Z1_UA0_LOOPX_BASEMAX8:
   add.d          t5,      a7,      t6
   vld            vr5,     t5,      0
   vld            vr6,     t5,      1
   vsllwil.hu.bu  vr5,     vr5,     0
   vsllwil.hu.bu  vr6,     vr6,     0

   vmul.h         vr3,     vr5,     vr2
   vmadd.h        vr3,     vr6,     vr1
   vadd.h         vr3,     vr3,     vr7
   vsrai.h        vr3,     vr3,     6
   vsrlni.b.h     vr3,     vr3,     0
   fstx.d         f3,      a0,      a6

   addi.d         a6,      a6,      8
   addi.d         t6,      t6,      8
   addi.d         t8,      t8,      -1
   bnez           t8,      .Z1_UA0_LOOPX_BASEMAX8
.Z1_UA0_LOOPX_BASEMAX4:
   andi           t8,      a2,      4
   beqz           t8,      .Z1_UA0_LOOPX_BASEMAX2

   vldx           vr5,     a7,      t6
   vsllwil.hu.bu  vr5,     vr5,     0
   vbsrl.v        vr6,     vr5,     2

   vmul.h         vr3,     vr5,     vr2
   vmadd.h        vr3,     vr6,     vr1
   vadd.h         vr3,     vr3,     vr7
   vsrai.h        vr3,     vr3,     6
   vsrlni.b.h     vr3,     vr3,     0
   fstx.s         f3,      a0,      a6

   addi.d         a6,      a6,      4
   addi.d         t6,      t6,      4
.Z1_UA0_LOOPX_BASEMAX2:
   andi           t8,      a2,     2
   beqz           t8,      .Z1_UA0_LOOPX_BASEMAX1

   vldx           vr5,     a7,      t6
   vsllwil.hu.bu  vr5,     vr5,     0
   vbsrl.v        vr6,     vr5,     2

   vmul.h         vr3,     vr5,     vr2
   vmadd.h        vr3,     vr6,     vr1
   vadd.h         vr3,     vr3,     vr7
   vsrai.h        vr3,     vr3,     6
   vsrlni.b.h     vr3,     vr3,     0
   vpickve2gr.bu  t7,      vr3,     0
   vpickve2gr.bu  t8,      vr3,     1
   stx.b          t7,      a0,      a6
   addi.d         a6,      a6,      1
   stx.b          t8,      a0,      a6
   addi.d         a6,      a6,      1
   addi.d         t6,      t6,      2
.Z1_UA0_LOOPX_BASEMAX1:
   andi           t8,      a2,     1
   beqz           t8,      .Z1_UA0_LOOPX_BASEMAX_MSET

   // Scalar blend of the final in-range pixel:
   // (top[base]*(64-frac) + top[base+1]*frac + 32) >> 6
   add.d          a2,      a7,      t6
   sub.d          t7,      a5,      t4
   ld.bu          t8,      a2,      0
   mul.w          t7,      t7,      t8
   ld.bu          t8,      a2,      1
   mul.w          t8,      t8,      t4
   add.d          t7,      t7,      t8
   addi.d         t7,      t7,      32
   srai.d         t7,      t7,      6
   stx.b          t7,      a0,      a6

   addi.d         a6,      a6,      1
.Z1_UA0_LOOPX_BASEMAX_MSET:  //memset: fill the rest of the row with top[max_base_x]
   add.d          t6,      a0,      a6  //dst
   add.d          t7,      a7,      t0  //src
   sub.d          a2,      a3,      a6  //size
   pixel_set_8bpc_allw t6, t7, a2, t8, t4
.Z1_UA0_LOOPY_END:
   addi.d         t2,      t2,      1
   add.d          a0,      a0,      a1   // next dst row (a1 = stride)
   add.d          t3,      t3,      t1   // xpos += dx
   blt            t2,      a4,      .Z1_UA0_LOOPY
   4256 
// Function exit: release the 128-byte stack area.
// NOTE(review): the matching `addi.d sp, sp, -128` is above this chunk — confirm.
.Z1_END:
   addi.d         sp,      sp,      128
endfunc