tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef.S (69682B)


      1 /*
      2 * Copyright © 2024, VideoLAN and dav1d authors
      3 * Copyright © 2024, Loongson Technology Corporation Limited
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/loongarch/loongson_asm.S"
     29 
     30 // static int cdef_find_dir_lsx(const pixel *img, const ptrdiff_t stride,
     31 //                            unsigned *const var HIGHBD_DECL_SUFFIX)
     32 // param: img: a0, stride: a1, var: a2
     33 function cdef_find_dir_8bpc_lsx
     34    addi.d         sp,    sp,    -64
     35    fst.d          f24,   sp,    0
     36    fst.d          f25,   sp,    8
     37    fst.d          f26,   sp,    16
     38    fst.d          f27,   sp,    24
     39    fst.d          f28,   sp,    32
     40    fst.d          f29,   sp,    40
     41    fst.d          f30,   sp,    48
     42    fst.d          f31,   sp,    56
     43 
     44    li.d           a3,    128
     45    vreplgr2vr.w   vr31,  a3
     46 
     47    // hv: vr0-vr3  diag: vr4-vr11  alt: vr12-vr23
     48 .irp i, vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, vr9, vr10, \
     49        vr11, vr12, vr13, vr14, vr15, vr16, vr17, vr18, vr19, \
     50        vr20, vr21, vr22, vr23
     51    vxor.v      \i,       \i,       \i
     52 .endr
     53 
     54 .CFDL01:  // 8
     55    // 0
     56    fld.d          f24,   a0,    0  //img
     57    vpermi.w       vr25,  vr24,  0x01
     58 
     59    vsllwil.hu.bu  vr24,  vr24,  0
     60    vsllwil.hu.bu  vr24,  vr24,  0
     61    vsllwil.hu.bu  vr25,  vr25,  0
     62    vsllwil.hu.bu  vr25,  vr25,  0
     63 
     64    vsub.w         vr24,  vr24,  vr31  //px
     65    vsub.w         vr25,  vr25,  vr31
     66 
     67    vadd.w         vr4,   vr4,   vr24  //diag[0][y+x]
     68    vadd.w         vr5,   vr5,   vr25
     69 
     70    vpackev.w      vr26,  vr25,  vr24
     71    vpackod.w      vr27,  vr25,  vr24
     72    vpermi.w       vr26,  vr26,  0xd8 //px0246
     73    vpermi.w       vr27,  vr27,  0xd8 //px1357
     74    vadd.w         vr12,  vr12,  vr26
     75    vadd.w         vr12,  vr12,  vr27  //alt[0][y+(x>>1)]
     76 
     77    vhaddw.d.w     vr28,  vr24,  vr24
     78    vhaddw.q.d     vr28,  vr28,  vr28
     79    vpickve2gr.d   a3,    vr28,  0
     80    vhaddw.d.w     vr28,  vr25,  vr25
     81    vhaddw.q.d     vr28,  vr28,  vr28
     82    vpickve2gr.d   a4,    vr28,  0
     83    add.d          a3,    a3,    a4
     84    vinsgr2vr.w    vr0,   a3,    0    //hv[0][y]
     85 
     86    vadd.w         vr15,  vr15,  vr26
     87    vadd.w         vr15,  vr15,  vr27  //alt[1][3+y-(x>>1)]
     88    vpermi.w       vr15,  vr15,  0x1b
     89 
     90    vadd.w         vr9,   vr9,   vr24
     91    vadd.w         vr8,   vr8,   vr25
     92    vpermi.w       vr8,   vr8,   0x1b
     93    vpermi.w       vr9,   vr9,   0x1b  //diag[1][7+y-x]
     94 
     95    vxor.v         vr28,  vr28,  vr28
     96    vxor.v         vr29,  vr29,  vr29
     97    vadd.w         vr28,  vr28,  vr24
     98    vadd.w         vr29,  vr29,  vr25
     99    vextrins.w     vr18,  vr28,  0x30
    100    vshuf4i.w      vr19,  vr28,  0x39
    101    vextrins.w     vr19,  vr29,  0x30
    102    vshuf4i.w      vr20,  vr29,  0x39  //alt[2][3-(y>>1)+7]
    103    vinsgr2vr.w    vr20,  zero,  3
    104 
    105    vadd.w         vr2,   vr2,   vr24
    106    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
    107 
    108    vadd.w         vr21,  vr21,  vr24
    109    vadd.w         vr22,  vr22,  vr25  //alt[3][(y>>1)+x]
    110 
    111    add.d          a0,    a0,    a1
    112 
    113    // 1
    114    fld.d          f24,   a0,    0  //img
    115    vpermi.w       vr25,  vr24,  0x01
    116 
    117    vsllwil.hu.bu  vr24,  vr24,  0
    118    vsllwil.hu.bu  vr24,  vr24,  0
    119    vsllwil.hu.bu  vr25,  vr25,  0
    120    vsllwil.hu.bu  vr25,  vr25,  0
    121 
    122    vsub.w         vr24,  vr24,  vr31  //px
    123    vsub.w         vr25,  vr25,  vr31
    124 
    125    vbsrl.v        vr28,  vr4,   4  //1-4
    126    vbsrl.v        vr29,  vr5,   4  //5-8
    127    vextrins.w     vr28,  vr5,   0x30
    128    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
    129    vadd.w         vr29,  vr29,  vr25
    130    vbsll.v        vr5,   vr29,  4
    131    vextrins.w     vr5,   vr28,  0x03
    132    vextrins.w     vr6,   vr29,  0x03
    133    vextrins.w     vr28,  vr4,   0x30
    134    vshuf4i.w      vr4,   vr28,  0x93
    135 
    136    vbsrl.v        vr28,  vr12,  4
    137    vextrins.w     vr28,  vr13,  0x30
    138    vpackev.w      vr26,  vr25,  vr24
    139    vpackod.w      vr27,  vr25,  vr24
    140    vpermi.w       vr26,  vr26,  0xd8 //px0246
    141    vpermi.w       vr27,  vr27,  0xd8 //px1357
    142    vadd.w         vr28,  vr28,  vr26
    143    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
    144    vextrins.w     vr13,  vr28,  0x03
    145    vextrins.w     vr28,  vr12,  0x30
    146    vshuf4i.w      vr12,  vr28,  0x93
    147 
    148    vhaddw.d.w     vr28,  vr24,  vr24
    149    vhaddw.q.d     vr28,  vr28,  vr28
    150    vpickve2gr.d   a3,    vr28,  0
    151    vhaddw.d.w     vr28,  vr25,  vr25
    152    vhaddw.q.d     vr28,  vr28,  vr28
    153    vpickve2gr.d   a4,    vr28,  0
    154    add.d          a3,    a3,    a4
    155    vinsgr2vr.w    vr0,   a3,    1    //hv[0][y]
    156 
    157    vbsrl.v        vr28,  vr15,  4
    158    vextrins.w     vr28,  vr16,  0x30
    159    vpermi.w       vr28,  vr28,  0x1b
    160    vadd.w         vr28,  vr28,  vr26
    161    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
    162    vextrins.w     vr16,  vr28,  0x00
    163    vextrins.w     vr28,  vr15,  0x00
    164    vshuf4i.w      vr15,  vr28,  0x6c
    165 
    166    vbsrl.v        vr28,  vr8,   4     //4321
    167    vbsrl.v        vr29,  vr9,   4     //8765
    168    vextrins.w     vr28,  vr9,   0x30
    169    vpermi.w       vr28,  vr28,  0x1b
    170    vpermi.w       vr29,  vr29,  0x1b
    171    vadd.w         vr29,  vr29,  vr24
    172    vadd.w         vr28,  vr28,  vr25  //diag[1][7+y-x]
    173    vextrins.w     vr10,  vr29,  0x00
    174    vextrins.w     vr29,  vr28,  0x00
    175    vshuf4i.w      vr9,   vr29,  0x6c
    176    vextrins.w     vr28,  vr8,   0x00
    177    vshuf4i.w      vr8,   vr28,  0x6c
    178 
    179    vbsll.v        vr28,  vr19,  4
    180    vextrins.w     vr28,  vr18,  0x03
    181    vbsll.v        vr29,  vr20,  4
    182    vextrins.w     vr29,  vr19,  0x03
    183    vadd.w         vr28,  vr28,  vr24
    184    vadd.w         vr29,  vr29,  vr25  //alt[2][3-(y>>1)+7]
    185    vextrins.w     vr18,  vr28,  0x30
    186    vextrins.w     vr28,  vr29,  0x00
    187    vshuf4i.w      vr19,  vr28,  0x39
    188    vbsrl.v        vr20,  vr29,  4
    189 
    190    vadd.w         vr2,   vr2,   vr24
    191    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
    192 
    193    vadd.w         vr21,  vr21,  vr24
    194    vadd.w         vr22,  vr22,  vr25  //alt[3][(y>>1)+x]
    195 
    196    add.d          a0,    a0,    a1
    197 
    198    // 2
    199    fld.d          f24,   a0,    0  //img
    200    vpermi.w       vr25,  vr24,  0x01
    201 
    202    vsllwil.hu.bu  vr24,  vr24,  0
    203    vsllwil.hu.bu  vr24,  vr24,  0
    204    vsllwil.hu.bu  vr25,  vr25,  0
    205    vsllwil.hu.bu  vr25,  vr25,  0
    206 
    207    vsub.w         vr24,  vr24,  vr31  //px
    208    vsub.w         vr25,  vr25,  vr31
    209 
    210    vbsrl.v        vr28,  vr4,   8
    211    vbsrl.v        vr29,  vr5,   8
    212    vextrins.d     vr28,  vr5,   0x10  //2-5
    213    vextrins.d     vr29,  vr6,   0x10  //6-9
    214    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
    215    vadd.w         vr29,  vr29,  vr25
    216    vextrins.d     vr4,   vr28,  0x10
    217    vextrins.d     vr5,   vr28,  0x01
    218    vextrins.d     vr5,   vr29,  0x10
    219    vextrins.d     vr6,   vr29,  0x01
    220 
    221    vbsrl.v        vr28,  vr12,  8
    222    vextrins.d     vr28,  vr13,  0x10
    223    vpackev.w      vr26,  vr25,  vr24
    224    vpackod.w      vr27,  vr25,  vr24
    225    vpermi.w       vr26,  vr26,  0xd8 //px0246
    226    vpermi.w       vr27,  vr27,  0xd8 //px1357
    227    vadd.w         vr28,  vr28,  vr26
    228    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
    229    vextrins.d     vr12,  vr28,  0x10
    230    vextrins.d     vr13,  vr28,  0x01
    231 
    232    vhaddw.d.w     vr28,  vr24,  vr24
    233    vhaddw.q.d     vr28,  vr28,  vr28
    234    vpickve2gr.d   a3,    vr28,  0
    235    vhaddw.d.w     vr28,  vr25,  vr25
    236    vhaddw.q.d     vr28,  vr28,  vr28
    237    vpickve2gr.d   a4,    vr28,  0
    238    add.d          a3,    a3,    a4
    239    vinsgr2vr.w    vr0,   a3,    2    //hv[0][y]
    240 
    241    vbsrl.v        vr28,  vr15,  8
    242    vextrins.d     vr28,  vr16,  0x10
    243    vpermi.w       vr28,  vr28,  0x1b
    244    vadd.w         vr28,  vr28,  vr26
    245    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
    246    vpermi.w       vr28,  vr28,  0x1b
    247    vextrins.d     vr15,  vr28,  0x10
    248    vextrins.d     vr16,  vr28,  0x01
    249 
    250    vbsrl.v        vr28,  vr8,   8
    251    vextrins.d     vr28,  vr9,   0x10
    252    vbsrl.v        vr29,  vr9,   8
    253    vextrins.d     vr29,  vr10,  0x10
    254    vpermi.w       vr28,  vr28,  0x1b  //5432
    255    vpermi.w       vr29,  vr29,  0x1b  //9876
    256    vadd.w         vr29,  vr29,  vr24
    257    vadd.w         vr28,  vr28,  vr25
    258    vpermi.w       vr28,  vr28,  0x1b
    259    vpermi.w       vr29,  vr29,  0x1b
    260    vextrins.d     vr8,   vr28,  0x10
    261    vextrins.d     vr9,   vr28,  0x01
    262    vextrins.d     vr9,   vr29,  0x10
    263    vextrins.d     vr10,  vr29,  0x01  //diag[1][7+y-x]
    264 
    265    vbsrl.v        vr28,  vr18,  8
    266    vextrins.d     vr28,  vr19,  0x10 //2345
    267    vbsrl.v        vr29,  vr19,  8
    268    vextrins.d     vr29,  vr20,  0x10 //6789
    269    vadd.w         vr28,  vr28,  vr24
    270    vadd.w         vr29,  vr29,  vr25
    271    vextrins.d     vr18,  vr28,  0x10
    272    vextrins.d     vr19,  vr28,  0x01
    273    vextrins.d     vr19,  vr29,  0x10
    274    vextrins.d     vr20,  vr29,  0x01   //alt[2][3-(y>>1)+7]
    275 
    276    vadd.w         vr2,   vr2,   vr24
    277    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
    278 
    279    vbsrl.v        vr28,  vr21,  4
    280    vextrins.w     vr28,  vr22,  0x30  //1234
    281    vbsrl.v        vr29,  vr22,  4     //5678
    282    vadd.w         vr28,  vr28,  vr24
    283    vadd.w         vr29,  vr29,  vr25  //alt[3][(y>>1)+x]
    284    vextrins.w     vr23,  vr29,  0x03
    285    vextrins.w     vr29,  vr28,  0x33
    286    vshuf4i.w      vr22,  vr29,  0x93
    287    vextrins.w     vr28,  vr21,  0x30
    288    vshuf4i.w      vr21,  vr28,  0x93
    289 
    290    add.d          a0,    a0,    a1
    291 
    292    // 3
    293    fld.d          f24,   a0,    0  //img
    294    vpermi.w       vr25,  vr24,  0x01
    295 
    296    vsllwil.hu.bu  vr24,  vr24,  0
    297    vsllwil.hu.bu  vr24,  vr24,  0
    298    vsllwil.hu.bu  vr25,  vr25,  0
    299    vsllwil.hu.bu  vr25,  vr25,  0
    300 
    301    vsub.w         vr24,  vr24,  vr31  //px
    302    vsub.w         vr25,  vr25,  vr31
    303 
    304    vbsll.v        vr28,  vr5,   4
    305    vextrins.w     vr28,  vr4,   0x03 //3456
    306    vbsll.v        vr29,  vr6,   4
    307    vextrins.w     vr29,  vr5,   0x03 //78910
    308    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
    309    vadd.w         vr29,  vr29,  vr25
    310    vextrins.w     vr4,   vr28,  0x30
    311    vextrins.w     vr28,  vr29,  0x00
    312    vshuf4i.w      vr5,   vr28,  0x39
    313    vbsrl.v        vr6,   vr29,  4
    314 
    315    vbsll.v        vr28,  vr13,  4
    316    vextrins.w     vr28,  vr12,  0x03
    317    vpackev.w      vr26,  vr25,  vr24
    318    vpackod.w      vr27,  vr25,  vr24
    319    vpermi.w       vr26,  vr26,  0xd8 //px0246
    320    vpermi.w       vr27,  vr27,  0xd8 //px1357
    321    vadd.w         vr28,  vr28,  vr26
    322    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
    323    vextrins.w     vr12,  vr28,  0x30
    324    vbsrl.v        vr13,  vr28,  4
    325 
    326    vhaddw.d.w     vr28,  vr24,  vr24
    327    vhaddw.q.d     vr28,  vr28,  vr28
    328    vpickve2gr.d   a3,    vr28,  0
    329    vhaddw.d.w     vr28,  vr25,  vr25
    330    vhaddw.q.d     vr28,  vr28,  vr28
    331    vpickve2gr.d   a4,    vr28,  0
    332    add.d          a3,    a3,    a4
    333    vinsgr2vr.w    vr0,   a3,    3    //hv[0][y]
    334 
    335    vbsll.v        vr28,  vr16,  4
    336    vextrins.w     vr28,  vr15,  0x03
    337    vpermi.w       vr28,  vr28,  0x1b  //6543
    338    vadd.w         vr28,  vr28,  vr26
    339    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
    340    vextrins.w     vr15,  vr28,  0x33
    341    vshuf4i.w      vr16,  vr28,  0xc6
    342    vinsgr2vr.w    vr16,  zero,  3
    343 
    344    vbsll.v        vr28,  vr9,   4
    345    vextrins.w     vr28,  vr8,   0x03  //3456
    346    vbsll.v        vr29,  vr10,  4
    347    vextrins.w     vr29,  vr9,   0x03  //78910
    348    vpermi.w       vr28,  vr28,  0x1b  //6543
    349    vpermi.w       vr29,  vr29,  0x1b  //10987
    350    vadd.w         vr29,  vr29,  vr24
    351    vadd.w         vr28,  vr28,  vr25  //diag[1][7+y-x]
    352    vextrins.w     vr8,   vr28,  0x33
    353    vextrins.w     vr28,  vr29,  0x33
    354    vshuf4i.w      vr9,   vr28,  0xc6
    355    vshuf4i.w      vr10,  vr29,  0xc6
    356    vinsgr2vr.w    vr10,  zero,  3
    357 
    358    vbsrl.v        vr28,  vr18,  8
    359    vextrins.d     vr28,  vr19,  0x10 //2345
    360    vbsrl.v        vr29,  vr19,  8
    361    vextrins.d     vr29,  vr20,  0x10 //6789
    362    vadd.w         vr28,  vr28,  vr24
    363    vadd.w         vr29,  vr29,  vr25
    364    vextrins.d     vr18,  vr28,  0x10
    365    vextrins.d     vr19,  vr28,  0x01
    366    vextrins.d     vr19,  vr29,  0x10
    367    vextrins.d     vr20,  vr29,  0x01   //alt[2][3-(y>>1)+7]
    368 
    369    vadd.w         vr2,   vr2,   vr24
    370    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
    371 
    372    vbsrl.v        vr28,  vr21,  4
    373    vextrins.w     vr28,  vr22,  0x30  //1234
    374    vbsrl.v        vr29,  vr22,  4     //5678
    375    vextrins.w     vr29,  vr23,  0x30
    376    vadd.w         vr28,  vr28,  vr24
    377    vadd.w         vr29,  vr29,  vr25  //alt[3][(y>>1)+x]
    378    vextrins.w     vr23,  vr29,  0x03
    379    vextrins.w     vr29,  vr28,  0x33
    380    vshuf4i.w      vr22,  vr29,  0x93
    381    vextrins.w     vr28,  vr21,  0x30
    382    vshuf4i.w      vr21,  vr28,  0x93
    383 
    384    add.d          a0,    a0,    a1
    385 
    386    // 4
    387    fld.d          f24,   a0,    0  //img
    388    vpermi.w       vr25,  vr24,  0x01
    389 
    390    vsllwil.hu.bu  vr24,  vr24,  0
    391    vsllwil.hu.bu  vr24,  vr24,  0
    392    vsllwil.hu.bu  vr25,  vr25,  0
    393    vsllwil.hu.bu  vr25,  vr25,  0
    394 
    395    vsub.w         vr24,  vr24,  vr31  //px
    396    vsub.w         vr25,  vr25,  vr31
    397 
    398    vadd.w         vr5,   vr5,   vr24  //diag[0][y+x]
    399    vadd.w         vr6,   vr6,   vr25
    400 
    401    vpackev.w      vr26,  vr25,  vr24
    402    vpackod.w      vr27,  vr25,  vr24
    403    vpermi.w       vr26,  vr26,  0xd8 //px0246
    404    vpermi.w       vr27,  vr27,  0xd8 //px1357
    405    vadd.w         vr13,  vr13,  vr26
    406    vadd.w         vr13,  vr13,  vr27  //alt[0][y+(x>>1)]
    407 
    408    vhaddw.d.w     vr28,  vr24,  vr24
    409    vhaddw.q.d     vr28,  vr28,  vr28
    410    vpickve2gr.d   a3,    vr28,  0
    411    vhaddw.d.w     vr28,  vr25,  vr25
    412    vhaddw.q.d     vr28,  vr28,  vr28
    413    vpickve2gr.d   a4,    vr28,  0
    414    add.d          a3,    a3,    a4
    415    vinsgr2vr.w    vr1,   a3,    0    //hv[0][y]
    416 
    417    vpermi.w       vr16,  vr16,  0x1b
    418    vadd.w         vr16,  vr16,  vr26
    419    vadd.w         vr16,  vr16,  vr27  //alt[1][3+y-(x>>1)]
    420    vpermi.w       vr16,  vr16,  0x1b
    421 
    422    vpermi.w       vr9,   vr9,   0x1b
    423    vpermi.w       vr10,  vr10,  0x1b
    424    vadd.w         vr10,  vr10,  vr24
    425    vadd.w         vr9,   vr9,   vr25
    426    vpermi.w       vr9,   vr9,   0x1b
    427    vpermi.w       vr10,  vr10,  0x1b  //diag[1][7+y-x]
    428 
    429    vbsrl.v        vr28,  vr18,  4
    430    vextrins.w     vr28,  vr19,  0x30  //1234
    431    vbsrl.v        vr29,  vr19,  4
    432    vextrins.w     vr29,  vr20,  0x30  //5678
    433    vadd.w         vr28,  vr28,  vr24
    434    vadd.w         vr29,  vr29,  vr25  //alt[2][3-(y>>1)+7]
    435    vextrins.w     vr20,  vr29,  0x03
    436    vextrins.w     vr29,  vr28,  0x33
    437    vshuf4i.w      vr19,  vr29,  0x93
    438    vbsll.v        vr18,  vr28,  4
    439 
    440    vadd.w         vr2,   vr2,   vr24
    441    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
    442 
    443    vbsrl.v        vr28,  vr21,  8
    444    vextrins.d     vr28,  vr22,  0x10
    445    vbsrl.v        vr29,  vr22,  8
    446    vextrins.d     vr29,  vr23,  0x10
    447    vadd.w         vr28,  vr28,  vr24
    448    vadd.w         vr29,  vr29,  vr25
    449    vextrins.d     vr21,  vr28,  0x10
    450    vextrins.d     vr22,  vr28,  0x01
    451    vextrins.d     vr22,  vr29,  0x10
    452    vextrins.d     vr23,  vr29,  0x01  //alt[3][(y>>1)+x]
    453 
    454    add.d          a0,    a0,    a1
    455 
    456    // 5
    457    fld.d          f24,   a0,    0  //img
    458    vpermi.w       vr25,  vr24,  0x01
    459 
    460    vsllwil.hu.bu  vr24,  vr24,  0
    461    vsllwil.hu.bu  vr24,  vr24,  0
    462    vsllwil.hu.bu  vr25,  vr25,  0
    463    vsllwil.hu.bu  vr25,  vr25,  0
    464 
    465    vsub.w         vr24,  vr24,  vr31  //px
    466    vsub.w         vr25,  vr25,  vr31
    467 
    468    vbsrl.v        vr28,  vr5,   4  //5-8
    469    vbsrl.v        vr29,  vr6,   4  //9-12
    470    vextrins.w     vr28,  vr6,   0x30
    471    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
    472    vadd.w         vr29,  vr29,  vr25
    473    vextrins.w     vr7,   vr29,  0x03
    474    vextrins.w     vr29,  vr28,  0x33
    475    vshuf4i.w      vr6,   vr29,  0x93
    476    vextrins.w     vr28,  vr5,   0x30
    477    vshuf4i.w      vr5,   vr28,  0x93
    478 
    479    vbsrl.v        vr28,  vr13,  4
    480    vextrins.w     vr28,  vr14,  0x30
    481    vpackev.w      vr26,  vr25,  vr24
    482    vpackod.w      vr27,  vr25,  vr24
    483    vpermi.w       vr26,  vr26,  0xd8 //px0246
    484    vpermi.w       vr27,  vr27,  0xd8 //px1357
    485    vadd.w         vr28,  vr28,  vr26
    486    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
    487    vextrins.w     vr14,  vr28,  0x03
    488    vextrins.w     vr28,  vr13,  0x30
    489    vshuf4i.w      vr13,  vr28,  0x93
    490 
    491    vhaddw.d.w     vr28,  vr24,  vr24
    492    vhaddw.q.d     vr28,  vr28,  vr28
    493    vpickve2gr.d   a3,    vr28,  0
    494    vhaddw.d.w     vr28,  vr25,  vr25
    495    vhaddw.q.d     vr28,  vr28,  vr28
    496    vpickve2gr.d   a4,    vr28,  0
    497    add.d          a3,    a3,    a4
    498    vinsgr2vr.w    vr1,   a3,    1    //hv[0][y]
    499 
    500    vbsrl.v        vr28,  vr16,  4
    501    vextrins.w     vr28,  vr17,  0x30
    502    vpermi.w       vr28,  vr28,  0x1b
    503    vadd.w         vr28,  vr28,  vr26
    504    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
    505    vextrins.w     vr17,  vr28,  0x00
    506    vextrins.w     vr28,  vr16,  0x00
    507    vshuf4i.w      vr16,  vr28,  0x6c
    508 
    509    vbsrl.v        vr28,  vr9,   4
    510    vbsrl.v        vr29,  vr10,  4
    511    vextrins.w     vr28,  vr10,  0x30
    512    vpermi.w       vr28,  vr28,  0x1b  //8-5
    513    vpermi.w       vr29,  vr29,  0x1b  //12-9
    514    vadd.w         vr29,  vr29,  vr24
    515    vadd.w         vr28,  vr28,  vr25  //diag[1][7+y-x]
    516    vextrins.w     vr11,  vr29,  0x00
    517    vextrins.w     vr29,  vr28,  0x00
    518    vshuf4i.w      vr10,  vr29,  0x6c
    519    vextrins.w     vr28,  vr9,   0x00
    520    vshuf4i.w      vr9,   vr28,  0x6c
    521 
    522    vbsrl.v        vr28,  vr18,  4
    523    vextrins.w     vr28,  vr19,  0x30  //1234
    524    vbsrl.v        vr29,  vr19,  4
    525    vextrins.w     vr29,  vr20,  0x30  //5678
    526    vadd.w         vr28,  vr28,  vr24
    527    vadd.w         vr29,  vr29,  vr25  //alt[2][3-(y>>1)+7]
    528    vextrins.w     vr20,  vr29,  0x03
    529    vextrins.w     vr29,  vr28,  0x33
    530    vshuf4i.w      vr19,  vr29,  0x93
    531    vbsll.v        vr18,  vr28,  4
    532 
    533    vadd.w         vr2,   vr2,   vr24
    534    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
    535 
    536    vbsrl.v        vr28,  vr21,  8
    537    vextrins.d     vr28,  vr22,  0x10
    538    vbsrl.v        vr29,  vr22,  8
    539    vextrins.d     vr29,  vr23,  0x10
    540    vadd.w         vr28,  vr28,  vr24
    541    vadd.w         vr29,  vr29,  vr25
    542    vextrins.d     vr21,  vr28,  0x10
    543    vextrins.d     vr22,  vr28,  0x01
    544    vextrins.d     vr22,  vr29,  0x10
    545    vextrins.d     vr23,  vr29,  0x01  //alt[3][(y>>1)+x]
    546 
    547    add.d          a0,    a0,    a1
    548 
    549    // 6
    550    fld.d          f24,   a0,    0  //img
    551    vpermi.w       vr25,  vr24,  0x01
    552 
    553    vsllwil.hu.bu  vr24,  vr24,  0
    554    vsllwil.hu.bu  vr24,  vr24,  0
    555    vsllwil.hu.bu  vr25,  vr25,  0
    556    vsllwil.hu.bu  vr25,  vr25,  0
    557 
    558    vsub.w         vr24,  vr24,  vr31  //px
    559    vsub.w         vr25,  vr25,  vr31
    560 
    561    vbsrl.v        vr28,  vr5,   8
    562    vbsrl.v        vr29,  vr6,   8
    563    vextrins.d     vr28,  vr6,   0x10  //6-9
    564    vextrins.d     vr29,  vr7,   0x10  //10-13
    565    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
    566    vadd.w         vr29,  vr29,  vr25
    567    vextrins.d     vr5,   vr28,  0x10
    568    vextrins.d     vr6,   vr28,  0x01
    569    vextrins.d     vr6,   vr29,  0x10
    570    vextrins.d     vr7,   vr29,  0x01
    571 
    572    vbsrl.v        vr28,  vr13,  8
    573    vextrins.d     vr28,  vr14,  0x10
    574    vpackev.w      vr26,  vr25,  vr24
    575    vpackod.w      vr27,  vr25,  vr24
    576    vpermi.w       vr26,  vr26,  0xd8 //px0246
    577    vpermi.w       vr27,  vr27,  0xd8 //px1357
    578    vadd.w         vr28,  vr28,  vr26
    579    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
    580    vextrins.d     vr13,  vr28,  0x10
    581    vextrins.d     vr14,  vr28,  0x01
    582 
    583    vhaddw.d.w     vr28,  vr24,  vr24
    584    vhaddw.q.d     vr28,  vr28,  vr28
    585    vpickve2gr.d   a3,    vr28,  0
    586    vhaddw.d.w     vr28,  vr25,  vr25
    587    vhaddw.q.d     vr28,  vr28,  vr28
    588    vpickve2gr.d   a4,    vr28,  0
    589    add.d          a3,    a3,    a4
    590    vinsgr2vr.w    vr1,   a3,    2    //hv[0][y]
    591 
    592    vbsrl.v        vr28,  vr16,  8
    593    vextrins.d     vr28,  vr17,  0x10
    594    vpermi.w       vr28,  vr28,  0x1b
    595    vadd.w         vr28,  vr28,  vr26
    596    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
    597    vpermi.w       vr28,  vr28,  0x1b
    598    vextrins.d     vr16,  vr28,  0x10
    599    vextrins.d     vr17,  vr28,  0x01
    600 
    601    vbsrl.v        vr28,  vr9,   8
    602    vextrins.d     vr28,  vr10,  0x10
    603    vbsrl.v        vr29,  vr10,  8
    604    vextrins.d     vr29,  vr11,  0x10
    605    vpermi.w       vr28,  vr28,  0x1b  //9876
    606    vpermi.w       vr29,  vr29,  0x1b  //13-10
    607    vadd.w         vr29,  vr29,  vr24
    608    vadd.w         vr28,  vr28,  vr25
    609    vpermi.w       vr28,  vr28,  0x1b
    610    vpermi.w       vr29,  vr29,  0x1b
    611    vextrins.d     vr9,   vr28,  0x10
    612    vextrins.d     vr10,  vr28,  0x01
    613    vextrins.d     vr10,  vr29,  0x10
    614    vextrins.d     vr11,  vr29,  0x01  //diag[1][7+y-x]
    615 
    616    vadd.w         vr18,  vr18,  vr24 //0123
    617    vadd.w         vr19,  vr19,  vr25 //4567 alt[2][3-(y>>1)+7]
    618 
    619    vadd.w         vr2,   vr2,   vr24
    620    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
    621 
    622    vbsll.v        vr28,  vr22,  4
    623    vextrins.w     vr28,  vr21,  0x03  //3456
    624    vbsll.v        vr29,  vr23,  4
    625    vextrins.w     vr29,  vr22,  0x03  //78910
    626    vadd.w         vr28,  vr28,  vr24
    627    vadd.w         vr29,  vr29,  vr25  //alt[3][(y>>1)+x]
    628    vextrins.w     vr21,  vr28,  0x30
    629    vextrins.w     vr28,  vr29,  0x00
    630    vshuf4i.w      vr22,  vr28,  0x39
    631    vbsrl.v        vr23,  vr29,  4
    632 
    633    add.d          a0,    a0,    a1
    634 
    635    // 7
    636    fld.d          f24,   a0,    0  //img
    637    vpermi.w       vr25,  vr24,  0x01
    638 
    639    vsllwil.hu.bu  vr24,  vr24,  0
    640    vsllwil.hu.bu  vr24,  vr24,  0
    641    vsllwil.hu.bu  vr25,  vr25,  0
    642    vsllwil.hu.bu  vr25,  vr25,  0
    643 
    644    vsub.w         vr24,  vr24,  vr31  //px
    645    vsub.w         vr25,  vr25,  vr31
    646 
    647    vbsll.v        vr28,  vr6,   4
    648    vextrins.w     vr28,  vr5,   0x03 //78910
    649    vbsll.v        vr29,  vr7,   4
    650    vextrins.w     vr29,  vr6,   0x03 //11-14
    651    vadd.w         vr28,  vr28,  vr24  //diag[0][y+x]
    652    vadd.w         vr29,  vr29,  vr25
    653    vextrins.w     vr5,   vr28,  0x30
    654    vextrins.w     vr28,  vr29,  0x00
    655    vshuf4i.w      vr6,   vr28,  0x39
    656    vbsrl.v        vr7,   vr29,  4
    657 
    658    vbsll.v        vr28,  vr14,  4
    659    vextrins.w     vr28,  vr13,  0x03
    660    vpackev.w      vr26,  vr25,  vr24
    661    vpackod.w      vr27,  vr25,  vr24
    662    vpermi.w       vr26,  vr26,  0xd8 //px0246
    663    vpermi.w       vr27,  vr27,  0xd8 //px1357
    664    vadd.w         vr28,  vr28,  vr26
    665    vadd.w         vr28,  vr28,  vr27  //alt[0][y+(x>>1)]
    666    vextrins.w     vr13,  vr28,  0x30
    667    vbsrl.v        vr14,  vr28,  4
    668 
    669    vhaddw.d.w     vr28,  vr24,  vr24
    670    vhaddw.q.d     vr28,  vr28,  vr28
    671    vpickve2gr.d   a3,    vr28,  0
    672    vhaddw.d.w     vr28,  vr25,  vr25
    673    vhaddw.q.d     vr28,  vr28,  vr28
    674    vpickve2gr.d   a4,    vr28,  0
    675    add.d          a3,    a3,    a4
    676    vinsgr2vr.w    vr1,   a3,    3    //hv[0][y]
    677 
    678    vbsll.v        vr28,  vr17,  4
    679    vextrins.w     vr28,  vr16,  0x03
    680    vpermi.w       vr28,  vr28,  0x1b  //10987
    681    vadd.w         vr28,  vr28,  vr26
    682    vadd.w         vr28,  vr28,  vr27  //alt[1][3+y-(x>>1)]
    683    vextrins.w     vr16,  vr28,  0x33
    684    vshuf4i.w      vr17,  vr28,  0xc6
    685    vinsgr2vr.w    vr17,  zero,  3
    686 
    687    vbsll.v        vr28,  vr10,  4
    688    vextrins.w     vr28,  vr9,   0x03  //7-10
    689    vbsll.v        vr29,  vr11,  4
    690    vextrins.w     vr29,  vr10,  0x03  //11-14
    691    vpermi.w       vr28,  vr28,  0x1b  //10-7
    692    vpermi.w       vr29,  vr29,  0x1b  //14-11
    693    vadd.w         vr29,  vr29,  vr24
    694    vadd.w         vr28,  vr28,  vr25  //diag[1][7+y-x]
    695    vextrins.w     vr9,   vr28,  0x33
    696    vextrins.w     vr28,  vr29,  0x33
    697    vshuf4i.w      vr10,  vr28,  0xc6
    698    vshuf4i.w      vr11,  vr29,  0xc6
    699    vinsgr2vr.w    vr11,  zero,  3
    700 
    701    vadd.w         vr18,  vr18,  vr24 //0123
    702    vadd.w         vr19,  vr19,  vr25 //4567 alt[2][3-(y>>1)+7]
    703 
    704    vadd.w         vr2,   vr2,   vr24
    705    vadd.w         vr3,   vr3,   vr25  //hv[1][x]
    706 
    707    vbsll.v        vr28,  vr22,  4
    708    vextrins.w     vr28,  vr21,  0x03  //3456
    709    vbsll.v        vr29,  vr23,  4
    710    vextrins.w     vr29,  vr22,  0x03  //78910
    711    vadd.w         vr28,  vr28,  vr24
    712    vadd.w         vr29,  vr29,  vr25  //alt[3][(y>>1)+x]
    713    vextrins.w     vr21,  vr28,  0x30
    714    vextrins.w     vr28,  vr29,  0x00
    715    vshuf4i.w      vr22,  vr28,  0x39
    716    vbsrl.v        vr23,  vr29,  4
    717 
    718    add.d          a0,    a0,    a1
    719 
    720    vxor.v         vr24,  vr24,  vr24  //unsigned cost[8]
    721    vxor.v         vr25,  vr25,  vr25
    722 
    723    vmul.w         vr26,  vr0,   vr0
    724    vmul.w         vr27,  vr1,   vr1
    725    vhaddw.d.w     vr28,  vr26,  vr26
    726    vhaddw.q.d     vr28,  vr28,  vr28
    727    vpickve2gr.d   a3,    vr28,  0
    728    vhaddw.d.w     vr28,  vr27,  vr27
    729    vhaddw.q.d     vr28,  vr28,  vr28
    730    vpickve2gr.d   a4,    vr28,  0
    731    add.d          a3,    a3,    a4
    732 
    733    vmul.w         vr26,  vr2,   vr2
    734    vmul.w         vr27,  vr3,   vr3
    735    vhaddw.d.w     vr28,  vr26,  vr26
    736    vhaddw.q.d     vr28,  vr28,  vr28
    737    vpickve2gr.d   a4,    vr28,  0
    738    vhaddw.d.w     vr28,  vr27,  vr27
    739    vhaddw.q.d     vr28,  vr28,  vr28
    740    vpickve2gr.d   a5,    vr28,  0
    741    add.d          a4,    a4,    a5
    742 
    743    li.d           a6,    105
    744    mul.w          a3,    a3,    a6
    745    mul.w          a4,    a4,    a6
    746    vinsgr2vr.w    vr24,  a3,    2
    747    vinsgr2vr.w    vr25,  a4,    2
    748 
    749    vxor.v         vr30,  vr30,  vr30  //div_table
    750    vxor.v         vr31,  vr31,  vr31
    751    li.d           t0,    840
    752    vinsgr2vr.w    vr30,  t0,    0
    753    li.d           t0,    420
    754    vinsgr2vr.w    vr30,  t0,    1
    755    li.d           t0,    280
    756    vinsgr2vr.w    vr30,  t0,    2
    757    li.d           t0,    210
    758    vinsgr2vr.w    vr30,  t0,    3
    759    li.d           t0,    168
    760    vinsgr2vr.w    vr31,  t0,    0
    761    li.d           t0,    140
    762    vinsgr2vr.w    vr31,  t0,    1
    763    li.d           t0,    120
    764    vinsgr2vr.w    vr31,  t0,    2
    765 
    766    vbsll.v        vr27,  vr7,   4
    767    vextrins.w     vr27,  vr6,   0x03
    768    vpermi.w       vr27,  vr27,  0x1b
    769    vmul.w         vr26,  vr4,   vr4
    770    vmadd.w        vr26,  vr27,  vr27
    771    vmul.w         vr26,  vr26,  vr30
    772    vhaddw.d.w     vr28,  vr26,  vr26
    773    vhaddw.q.d     vr28,  vr28,  vr28
    774    vpickve2gr.d   a3,    vr28,  0
    775    vbsll.v        vr27,  vr6,   4
    776    vpermi.w       vr27,  vr27,  0x1b
    777    vmul.w         vr26,  vr5,   vr5
    778    vmadd.w        vr26,  vr27,  vr27
    779    vmul.w         vr26,  vr26,  vr31
    780    vextrins.w     vr26,  vr31,  0x33
    781    vhaddw.d.w     vr28,  vr26,  vr26
    782    vhaddw.q.d     vr28,  vr28,  vr28
    783    vpickve2gr.d   a4,    vr28,  0
    784    add.d          a3,    a3,    a4   //cost[0]
    785 
    786    vbsll.v        vr27,  vr11,  4
    787    vextrins.w     vr27,  vr10,  0x03
    788    vpermi.w       vr27,  vr27,  0x1b
    789    vmul.w         vr26,  vr8,   vr8
    790    vmadd.w        vr26,  vr27,  vr27
    791    vmul.w         vr26,  vr26,  vr30
    792    vhaddw.d.w     vr28,  vr26,  vr26
    793    vhaddw.q.d     vr28,  vr28,  vr28
    794    vpickve2gr.d   a4,    vr28,  0
    795    vbsll.v        vr27,  vr10,  4
    796    vpermi.w       vr27,  vr27,  0x1b
    797    vmul.w         vr26,  vr9,   vr9
    798    vmadd.w        vr26,  vr27,  vr27
    799    vmul.w         vr26,  vr26,  vr31
    800    vextrins.w     vr26,  vr31,  0x33
    801    vhaddw.d.w     vr28,  vr26,  vr26
    802    vhaddw.q.d     vr28,  vr28,  vr28
    803    vpickve2gr.d   a5,    vr28,  0
    804    add.d          a4,    a4,    a5   //cost[4]
    805 
    806    vpickve2gr.w   a5,    vr5,   3
    807    mul.w          a5,    a5,    a5
    808    mul.w          a5,    a5,    a6
    809    add.w          a3,    a3,    a5
    810    vinsgr2vr.w    vr24,  a3,    0
    811    vpickve2gr.w   a5,    vr9,   3
    812    mul.w          a5,    a5,    a5
    813    mul.w          a5,    a5,    a6
    814    add.w          a4,    a4,    a5
    815    vinsgr2vr.w    vr25,  a4,    0
    816 
    817    //n=0
    818    vpickve2gr.w   a3,    vr24,  1
    819    vmul.w         vr26,  vr13,  vr13
    820    vhaddw.d.w     vr28,  vr26,  vr26
    821    vhaddw.q.d     vr28,  vr28,  vr28
    822    vpickve2gr.d   a4,    vr28,  0
    823    vpickve2gr.w   a5,    vr12,  3
    824    mul.w          a5,    a5,    a5
    825    add.d          a3,    a3,    a4
    826    add.d          a3,    a3,    a5
    827    mul.w          a3,    a3,    a6  //*cost_ptr
    828 
    829    vextrins.w     vr29,  vr30,  0x01
    830    vextrins.w     vr29,  vr30,  0x13
    831    vextrins.w     vr29,  vr31,  0x21
    832    vextrins.w     vr29,  vr31,  0x33
    833    vbsll.v        vr27,  vr14,  4
    834    vpermi.w       vr27,  vr27,  0x1b
    835    vmul.w         vr28,  vr12,  vr12
    836    vextrins.w     vr28,  vr31,  0x33
    837    vmadd.w        vr28,  vr27,  vr27
    838    vmul.w         vr26,  vr28,  vr29
    839    vhaddw.d.w     vr28,  vr26,  vr26
    840    vhaddw.q.d     vr28,  vr28,  vr28
    841    vpickve2gr.d   a4,    vr28,  0
    842    add.d          a3,    a3,    a4
    843    vinsgr2vr.w    vr24,  a3,    1
    844 
    845    //n=1
    846    vpickve2gr.w   a3,    vr24,  3
    847    vmul.w         vr26,  vr16,  vr16
    848    vhaddw.d.w     vr28,  vr26,  vr26
    849    vhaddw.q.d     vr28,  vr28,  vr28
    850    vpickve2gr.d   a4,    vr28,  0
    851    vpickve2gr.w   a5,    vr15,  3
    852    mul.w          a5,    a5,    a5
    853    add.d          a3,    a3,    a4
    854    add.d          a3,    a3,    a5
    855    mul.w          a3,    a3,    a6  //*cost_ptr
    856 
    857    vbsll.v        vr27,  vr17,  4
    858    vpermi.w       vr27,  vr27,  0x1b
    859    vmul.w         vr28,  vr15,  vr15
    860    vextrins.w     vr28,  vr31,  0x33
    861    vmadd.w        vr28,  vr27,  vr27
    862    vmul.w         vr26,  vr28,  vr29
    863    vhaddw.d.w     vr28,  vr26,  vr26
    864    vhaddw.q.d     vr28,  vr28,  vr28
    865    vpickve2gr.d   a4,    vr28,  0
    866    add.d          a3,    a3,    a4
    867    vinsgr2vr.w    vr24,  a3,    3
    868 
    869    //n=2
    870    vpickve2gr.w   a3,    vr25,  1
    871    vmul.w         vr26,  vr19,  vr19
    872    vhaddw.d.w     vr28,  vr26,  vr26
    873    vhaddw.q.d     vr28,  vr28,  vr28
    874    vpickve2gr.d   a4,    vr28,  0
    875    vpickve2gr.w   a5,    vr18,  3
    876    mul.w          a5,    a5,    a5
    877    add.d          a3,    a3,    a4
    878    add.d          a3,    a3,    a5
    879    mul.w          a3,    a3,    a6  //*cost_ptr
    880 
    881    vbsll.v        vr27,  vr20,  4
    882    vpermi.w       vr27,  vr27,  0x1b
    883    vmul.w         vr28,  vr18,  vr18
    884    vextrins.w     vr28,  vr31,  0x33
    885    vmadd.w        vr28,  vr27,  vr27
    886    vmul.w         vr26,  vr28,  vr29
    887    vhaddw.d.w     vr28,  vr26,  vr26
    888    vhaddw.q.d     vr28,  vr28,  vr28
    889    vpickve2gr.d   a4,    vr28,  0
    890    add.d          a3,    a3,    a4
    891    vinsgr2vr.w    vr25,  a3,    1
    892 
    893    //n=3
    894    vpickve2gr.w   a3,    vr25,  3
    895    vmul.w         vr26,  vr22,  vr22
    896    vhaddw.d.w     vr28,  vr26,  vr26
    897    vhaddw.q.d     vr28,  vr28,  vr28
    898    vpickve2gr.d   a4,    vr28,  0
    899    vpickve2gr.w   a5,    vr21,  3
    900    mul.w          a5,    a5,    a5
    901    add.d          a3,    a3,    a4
    902    add.d          a3,    a3,    a5
    903    mul.w          a3,    a3,    a6  //*cost_ptr
    904 
    905    vbsll.v        vr27,  vr23,  4
    906    vpermi.w       vr27,  vr27,  0x1b
    907    vmul.w         vr28,  vr21,  vr21
    908    vextrins.w     vr28,  vr31,  0x33
    909    vmadd.w        vr28,  vr27,  vr27
    910    vmul.w         vr26,  vr28,  vr29
    911    vhaddw.d.w     vr28,  vr26,  vr26
    912    vhaddw.q.d     vr28,  vr28,  vr28
    913    vpickve2gr.d   a4,    vr28,  0
    914    add.d          a3,    a3,    a4
    915    vinsgr2vr.w    vr25,  a3,    3
    916 
    917    xor            a3,    a3,    a3  //best_dir
    918    vpickve2gr.w   a4,    vr24,  0   //best_cost
    919 .BSETDIR01:
    920    vpickve2gr.w   a5,    vr24,  1
    921    bge            a4,    a5,    .BSETDIR02
    922    or             a4,    a5,    a5
    923    ori            a3,    zero,  1
    924 .BSETDIR02:
    925    vpickve2gr.w   a5,    vr24,  2
    926    bge            a4,    a5,    .BSETDIR03
    927    or             a4,    a5,    a5
    928    ori            a3,    zero,  2
    929 .BSETDIR03:
    930    vpickve2gr.w   a5,    vr24,  3
    931    bge            a4,    a5,    .BSETDIR04
    932    or             a4,    a5,    a5
    933    ori            a3,    zero,  3
    934 .BSETDIR04:
    935    vpickve2gr.w   a5,    vr25,  0
    936    bge            a4,    a5,    .BSETDIR05
    937    or             a4,    a5,    a5
    938    ori            a3,    zero,  4
    939 .BSETDIR05:
    940    vpickve2gr.w   a5,    vr25,  1
    941    bge            a4,    a5,    .BSETDIR06
    942    or             a4,    a5,    a5
    943    ori            a3,    zero,  5
    944 .BSETDIR06:
    945    vpickve2gr.w   a5,    vr25,  2
    946    bge            a4,    a5,    .BSETDIR07
    947    or             a4,    a5,    a5
    948    ori            a3,    zero,  6
    949 .BSETDIR07:
    950    vpickve2gr.w   a5,    vr25,  3
    951    bge            a4,    a5,    .BSETDIREND
    952    or             a4,    a5,    a5
    953    ori            a3,    zero,  7
    954 .BSETDIREND:
    955    xori           a5,    a3,    4
    956    li.d           a1,    4
    957    bge            a5,    a1,    .GETCOST01
    958    vreplve.w      vr26,  vr24,  a5
    959    b              .GETCOST02
    960 .GETCOST01:
    961    vreplve.w      vr26,  vr25,  a5
    962 .GETCOST02:
    963    vpickve2gr.w   a5,    vr26,  0
    964    sub.w          a5,    a4,    a5
    965    srai.d         a5,    a5,    10
    966    st.w           a5,    a2,    0
    967    or             a0,    a3,    a3
    968 
    969    fld.d          f24,   sp,    0
    970    fld.d          f25,   sp,    8
    971    fld.d          f26,   sp,    16
    972    fld.d          f27,   sp,    24
    973    fld.d          f28,   sp,    32
    974    fld.d          f29,   sp,    40
    975    fld.d          f30,   sp,    48
    976    fld.d          f31,   sp,    56
    977    addi.d         sp,    sp,    64
    978 
    979 endfunc
    980 
    981 .macro cdef_fill tmp, stride, w, h
    982    beqz          \h,     700f         //h
    983    or            t0,     zero,  zero  //y
    984 100:
    985    or            t1,     zero,  zero  //xx
    986    srai.d        s6,     \w,    3     //x
    987    beqz          s6,     300f
    988 200:
    989    vstx          vr18,   \tmp,    t1
    990    addi.d        t1,     t1,    16
    991    addi.d        s6,     s6,    -1
    992    bnez          s6,     200b
    993 300:
    994    andi          s6,     \w,    4
    995    beqz          s6,     400f
    996    fstx.d        f18,    \tmp,    t1
    997    addi.d        t1,     t1,    8
    998 400:
    999    andi          s6,     \w,    2
   1000    beqz          s6,     500f
   1001    fstx.s        f18,    \tmp,    t1
   1002    addi.d        t1,     t1,    4
   1003 500:
   1004    andi          s6,     \w,    1
   1005    beqz          s6,     600f
   1006    li.w          s6,     -16384
   1007    stx.h         s6,     \tmp,    t1
   1008    addi.d        t1,     t1,    2
   1009 600:
   1010    add.d         \tmp,     \tmp,    \stride
   1011    add.d         \tmp,     \tmp,    \stride
   1012    addi.d        t0,     t0,    1
   1013    blt           t0,     \h,    100b
   1014 700:
   1015 .endm
   1016 
   1017 const dav1d_cdef_directions
   1018 .byte   1 * 12 + 0,  2 * 12 + 0
   1019 .byte   1 * 12 + 0,  2 * 12 - 1
   1020 .byte   -1 * 12 + 1, -2 * 12 + 2
   1021 .byte   0 * 12 + 1, -1 * 12 + 2
   1022 .byte   0 * 12 + 1,  0 * 12 + 2
   1023 .byte   0 * 12 + 1,  1 * 12 + 2
   1024 .byte   1 * 12 + 1,  2 * 12 + 2
   1025 .byte   1 * 12 + 0,  2 * 12 + 1
   1026 .byte   1 * 12 + 0,  2 * 12 + 0
   1027 .byte   1 * 12 + 0,  2 * 12 - 1
   1028 .byte   -1 * 12 + 1, -2 * 12 + 2
   1029 .byte   0 * 12 + 1, -1 * 12 + 2
   1030 endconst
   1031 
   1032 .macro constrain_vrh in0, in1, in2, tmp0, tmp1, out
   1033    vabsd.h        \tmp0, \in0,  vr23   //adiff
   1034    vsra.h         \tmp1, \tmp0, \in2
   1035    vsub.h         \tmp1, \in1,  \tmp1
   1036    vmax.h         \tmp1, vr23,  \tmp1  //imax
   1037    vmin.h         \tmp0, \tmp0, \tmp1  //imin
   1038 
   1039    //apply_sign
   1040    vslt.h         \tmp1, \in0,  vr23
   1041    vandn.v        \in0,  \tmp1, \tmp0
   1042    vsigncov.h     \tmp0, \tmp1, \tmp0
   1043    vor.v          \out,  \in0,  \tmp0
   1044 .endm
   1045 
   1046 .macro iclip_vrh in0, in1, in2, tmp0, tmp1, out
   1047    vmin.h         \tmp0, \in2,  \in0
   1048    vslt.h         \in0,  \in0,  \in1
   1049    vand.v         \tmp1, \in0,  \in1
   1050    vandn.v        \tmp0, \in0,  \tmp0
   1051    vor.v          \out,  \tmp1, \tmp0
   1052 .endm
   1053 
   1054 .macro cdef_padding_data
   1055    //y < 0
   1056    beqz          t7,     90f
   1057 4:
   1058    or            t4,     t5,    t5  //data index xx
   1059    slli.d        t0,     t4,    1
   1060    mul.w         t2,     t7,    s5
   1061    slli.d        t2,     t2,    1
   1062    add.d         t2,     s4,    t2
   1063 
   1064    sub.d         t3,     t6,    t5  //loop param x
   1065    srai.d        t3,     t3,    3
   1066    add.d         t3,     t3,    t5
   1067    beq           t5,     t3,    6f
   1068 5:  // /8
   1069    fldx.d        f18,    a3,    t4
   1070    vsllwil.hu.bu vr18,   vr18,  0
   1071    vstx          vr18,   t2,    t0
   1072    addi.d        t0,     t0,    16
   1073    addi.d        t4,     t4,    8
   1074 
   1075    addi.d        t3,     t3,    -1
   1076    bne           t5,     t3,    5b
   1077 6:  // &4
   1078    sub.d         t1,     t6,    t5
   1079    andi          t1,     t1,    4
   1080    beqz          t1,     7f
   1081 
   1082    fldx.s        f18,    a3,    t4
   1083    vsllwil.hu.bu vr18,   vr18,  0
   1084    fstx.d        f18,    t2,    t0
   1085    addi.d        t0,     t0,    8
   1086    addi.d        t4,     t4,    4
   1087 7:  // &2
   1088    sub.d         t1,     t6,    t5
   1089    andi          t1,     t1,    2
   1090    beqz          t1,     9f
   1091 
   1092    ldx.bu        t1,     a3,    t4
   1093    stx.h         t1,     t2,    t0
   1094    addi.d        t0,     t0,    2
   1095    addi.d        t4,     t4,    1
   1096    ldx.bu        t1,     a3,    t4
   1097    stx.h         t1,     t2,    t0
   1098    addi.d        t0,     t0,    2
   1099    addi.d        t4,     t4,    1
   1100 9:
   1101    add.d         a3,     a3,    a1
   1102    addi.d        t7,     t7,    1
   1103    bnez          t7,     4b
   1104 
   1105 90:
   1106    // y < h
   1107    beqz          s1,     12f
   1108    beqz          t5,     12f
   1109    or            t7,     zero,  zero  //y
   1110 10:
   1111    or            t4,     t5,    t5  //data index x
   1112 11:
   1113    slli.d        t3,     t7,    1
   1114    addi.d        t3,     t3,    2
   1115    add.d         t3,     t3,    t4
   1116    ldx.bu        t1,     a2,    t3
   1117 
   1118    mul.w         t3,     t7,    s5
   1119    add.d         t3,     t3,    t4
   1120    slli.d        t3,     t3,    1
   1121    stx.h         t1,     s4,    t3
   1122 
   1123    addi.d        t4,     t4,    1
   1124    bnez          t4,     11b
   1125 
   1126    addi.d        t7,     t7,    1
   1127    bne           t7,     s1,    10b
   1128 
   1129 12:
   1130    // y = 0 ; y < h
   1131    or            s0,     s4,    s4
   1132    beqz          s1,     20f
   1133    or            s6,     a0,    a0
   1134    or            t7,     zero,  zero  //y
   1135    srai.d        t4,     t6,    3    //loop max
   1136 13:
   1137    or            t0,     zero,  zero //loop param
   1138    or            t3,     t0,    t0   //data index src
   1139    or            t1,     t0,    t0   //data index tmp
   1140    beqz          t4,     16f
   1141 15:  // /8
   1142    fldx.d        f18,    s6,    t3
   1143    vsllwil.hu.bu vr18,   vr18,  0
   1144    vstx          vr18,   s0,    t1
   1145    addi.d        t3,     t3,    8
   1146    addi.d        t1,     t1,    16
   1147 
   1148    addi.d        t0,     t0,    1
   1149    blt           t0,     t4,    15b
   1150 16:  // &4
   1151    andi          t0,     t6,    4
   1152    beqz          t0,     17f
   1153 
   1154    fldx.s        f18,    s6,    t3
   1155    vsllwil.hu.bu vr18,   vr18,  0
   1156    fstx.d        f18,    s0,    t1
   1157    addi.d        t3,     t3,    4
   1158    addi.d        t1,     t1,    8
   1159 17:  // &2
   1160    andi          t0,     t6,    2
   1161    beqz          t0,     19f
   1162 
   1163    ldx.bu        t2,     s6,    t3
   1164    stx.h         t2,     s0,    t1
   1165    addi.d        t3,     t3,    1
   1166    addi.d        t1,     t1,    2
   1167    ldx.bu        t2,     s6,    t3
   1168    stx.h         t2,     s0,    t1
   1169    addi.d        t3,     t3,    1
   1170    addi.d        t1,     t1,    2
   1171 19: // src+ tmp+
   1172    add.d         s6,     s6,    a1
   1173    add.d         s0,     s0,    s5
   1174    add.d         s0,     s0,    s5
   1175 
   1176    addi.d        t7,     t7,    1
   1177    blt           t7,     s1,    13b
   1178 
   1179    // y = h ; y < y_end
   1180 20:
   1181    beq           s1,     t8,    27f
   1182    or            t7,     s1,    s1  //y
   1183    sub.d         t4,     t6,    t5
   1184    srai.d        t4,     t4,    3
   1185    add.d         t4,     t4,    t5   //8 loop max
   1186 21:
   1187    or            t0,     t5,    t5   //xx
   1188    or            t3,     t0,    t0   //data index bottom
   1189    slli.d        t1,     t0,    1    //data index tmp
   1190    beq           t5,     t4,    23f
   1191 22:  // /8
   1192    fldx.d        f18,    a4,    t3
   1193    vsllwil.hu.bu vr18,   vr18,  0
   1194    vstx          vr18,   s0,    t1
   1195    addi.d        t3,     t3,    8
   1196    addi.d        t1,     t1,    16
   1197 
   1198    addi.d        t0,     t0,    1
   1199    blt           t0,     t4,    22b
   1200 23:  // &4
   1201    sub.d         t0,     t6,    t5
   1202    andi          t0,     t0,    4
   1203    beqz          t0,     24f
   1204 
   1205    fldx.s        f18,    a4,    t3
   1206    vsllwil.hu.bu vr18,   vr18,  0
   1207    fstx.d        f18,    s0,    t1
   1208    addi.d        t3,     t3,    4
   1209    addi.d        t1,     t1,    8
   1210 24:  // &2
   1211    sub.d         t0,     t6,    t5
   1212    andi          t0,     t0,    2
   1213    beqz          t0,     26f
   1214 
   1215    ldx.bu        t2,     a4,    t3
   1216    stx.h         t2,     s0,    t1
   1217    addi.d        t3,     t3,    1
   1218    addi.d        t1,     t1,    2
   1219    ldx.bu        t2,     a4,    t3
   1220    stx.h         t2,     s0,    t1
   1221    addi.d        t3,     t3,    1
   1222    addi.d        t1,     t1,    2
   1223 26: // bottom+ tmp+
   1224    add.d         a4,     a4,    a1
   1225    add.d         s0,     s0,    s5
   1226    add.d         s0,     s0,    s5
   1227 
   1228    addi.d        t7,     t7,    1
   1229    blt           t7,     t8,    21b
   1230 27:
   1231    // padding end
   1232 .endm
   1233 
   1234 .macro cdef_pri_sec_init
   1235    clz.w          t3,    a6
   1236    sub.w          t3,    t2,    t3
   1237    sub.w          t3,    s7,    t3  //sec_shift
   1238 
   1239    vreplgr2vr.h   vr4,   t0         //pri_tap_k
   1240    vreplgr2vr.h   vr9,   a5         //pri_strength
   1241    vreplgr2vr.h   vr10,  t1         //pri_shift
   1242    vreplgr2vr.h   vr18,  a6         //sec_strength
   1243    vreplgr2vr.h   vr19,  t3         //sec_shift
   1244 
   1245    or             t2,    s1,    s1  //dowhile loop param
   1246    addi.d         s1,    a7,    2
   1247    slli.d         s1,    s1,    1   //directions dir+2
   1248    addi.d         s2,    a7,    4
   1249    slli.d         s2,    s2,    1   //directions dir+4
   1250    slli.d         s3,    a7,    1   //directions dir+0
   1251 
   1252    la.local       t0,    dav1d_cdef_directions
   1253    add.d          s1,    t0,    s1
   1254    ld.b           a2,    s1,    0  //off01
   1255    ld.b           a3,    s1,    1  //off11
   1256    add.d          s2,    t0,    s2
   1257    ld.b           s1,    s2,    0  //off02
   1258    ld.b           s2,    s2,    1  //off12
   1259    add.d          s3,    t0,    s3
   1260    ld.b           t0,    s3,    0  //off03
   1261    ld.b           s3,    s3,    1  //off13
   1262 
   1263    slli.d         a2,    a2,    1
   1264    slli.d         a3,    a3,    1
   1265    slli.d         s1,    s1,    1
   1266    slli.d         s2,    s2,    1
   1267    slli.d         t0,    t0,    1
   1268    slli.d         s3,    s3,    1
   1269 .endm
   1270 
   1271 .macro cdef_pri_init
   1272    vreplgr2vr.h   vr4,   t0         //pri_tap_k
   1273    vreplgr2vr.h   vr9,   a5         //pri_strength
   1274    vreplgr2vr.h   vr10,  t1         //pri_shift
   1275 
   1276    or             t2,    s1,    s1  //dowhile loop param
   1277    addi.d         s1,    a7,    2
   1278    slli.d         s1,    s1,    1   //directions dir+2
   1279 
   1280    la.local       t0,    dav1d_cdef_directions
   1281    add.d          s1,    t0,    s1
   1282    ld.b           a2,    s1,    0  //off01
   1283    ld.b           a3,    s1,    1  //off11
   1284 
   1285    slli.d         a2,    a2,    1
   1286    slli.d         a3,    a3,    1
   1287 .endm
   1288 
   1289 .macro cdef_sec_init
   1290    clz.w          t3,    a6
   1291    li.w           t2,    31
   1292    sub.w          t3,    t2,    t3
   1293    sub.w          t3,    s7,    t3  //sec_shift
   1294 
   1295    vreplgr2vr.h   vr18,  a6         //sec_strength
   1296    vreplgr2vr.h   vr19,  t3         //sec_shift
   1297 
   1298    or             t2,    s1,    s1  //dowhile loop param
   1299    addi.d         s2,    a7,    4
   1300    slli.d         s2,    s2,    1   //directions dir+4
   1301    slli.d         s3,    a7,    1   //directions dir+0
   1302 
   1303    la.local       t0,    dav1d_cdef_directions
   1304    add.d          s1,    t0,    s1
   1305    add.d          s2,    t0,    s2
   1306    ld.b           s1,    s2,    0  //off02
   1307    ld.b           s2,    s2,    1  //off12
   1308    add.d          s3,    t0,    s3
   1309    ld.b           t0,    s3,    0  //off03
   1310    ld.b           s3,    s3,    1  //off13
   1311 
   1312    slli.d         s1,    s1,    1
   1313    slli.d         s2,    s2,    1
   1314    slli.d         t0,    t0,    1
   1315    slli.d         s3,    s3,    1
   1316 .endm
   1317 
   1318 .macro cdef_process_data_w8 in0, in1
   1319    vsub.h       vr11,   vr5,   vr0
   1320    vsub.h       vr12,   vr6,   vr0
   1321    vsub.h       vr13,   vr7,   vr0
   1322    vsub.h       vr14,   vr8,   vr0
   1323 
   1324    constrain_vrh   vr11,  \in0,   \in1,  vr16,  vr17,  vr11
   1325    constrain_vrh   vr12,  \in0,   \in1,  vr16,  vr17,  vr12
   1326    constrain_vrh   vr13,  \in0,   \in1,  vr16,  vr17,  vr13
   1327    constrain_vrh   vr14,  \in0,   \in1,  vr16,  vr17,  vr14
   1328 .endm
   1329 
   1330 .macro cdef_process_data_w4 in0, in1
   1331    vpermi.w       vr6,  vr5,  0x44
   1332    vpermi.w       vr8,  vr7,  0x44
   1333 
   1334    vsub.h         vr12,  vr6,   vr0
   1335    vsub.h         vr14,  vr8,   vr0
   1336 
   1337    constrain_vrh   vr12,  \in0,   \in1,  vr16,  vr17,  vr12
   1338    constrain_vrh   vr14,  \in0,   \in1,  vr16,  vr17,  vr14
   1339 .endm
   1340 
   1341 .macro cdef_calc_sum_tapchange_w8
   1342    vmul.h         vr1,   vr15,  vr11  //sum
   1343    vmadd.h        vr1,   vr15,  vr12  //sum
   1344    vand.v         vr15,  vr15,  vr21
   1345    vor.v          vr15,  vr15,  vr22
   1346    vmadd.h        vr1,   vr15,  vr13  //sum
   1347    vmadd.h        vr1,   vr15,  vr14  //sum
   1348 .endm
   1349 
   1350 .macro cdef_calc_sum_tapchange_w4
   1351    vmul.h         vr1,   vr15,  vr12  //sum
   1352    vand.v         vr15,  vr15,  vr21
   1353    vor.v          vr15,  vr15,  vr22
   1354    vmadd.h        vr1,   vr15,  vr14  //sum
   1355 .endm
   1356 
   1357 .macro cdef_calc_sum_no_tapchange_w4 in0
   1358    vmadd.h        vr1,   \in0,  vr12
   1359    vmadd.h        vr1,   \in0,  vr14
   1360 .endm
   1361 
   1362 .macro cdef_calc_sum_no_tapchange_w8 in0
   1363    vmadd.h        vr1,   \in0,  vr11  //sum
   1364    vmadd.h        vr1,   \in0,  vr12
   1365    vmadd.h        vr1,   \in0,  vr13
   1366    vmadd.h        vr1,   \in0,  vr14
   1367 .endm
   1368 
   1369 .macro cdef_calc_maxmin_w4
   1370    vmin.hu        vr3,   vr6,   vr3
   1371    vmax.h         vr2,   vr6,   vr2
   1372    vmin.hu        vr3,   vr8,   vr3  //min
   1373    vmax.h         vr2,   vr8,   vr2  //max
   1374 .endm
   1375 
   1376 .macro cdef_calc_maxmin_w8
   1377    vmin.hu        vr3,   vr5,   vr3
   1378    vmax.h         vr2,   vr5,   vr2
   1379    vmin.hu        vr3,   vr6,   vr3
   1380    vmax.h         vr2,   vr6,   vr2
   1381    vmin.hu        vr3,   vr7,   vr3
   1382    vmax.h         vr2,   vr7,   vr2
   1383    vmin.hu        vr3,   vr8,   vr3  //min
   1384    vmax.h         vr2,   vr8,   vr2  //max
   1385 .endm
   1386 
   1387 .macro cdef_calc_dst
   1388    vslti.h        vr5,   vr1,   0
   1389    vand.v         vr5,   vr5,   vr20
   1390    vsub.h         vr5,   vr1,   vr5
   1391    vaddi.hu       vr5,   vr5,   8
   1392    vsrai.h        vr5,   vr5,   4
   1393    vadd.h         vr5,   vr0,   vr5
   1394 .endm
   1395 
   1396 //static NOINLINE void cdef_filter_block_lsx
   1397 //                    (pixel *dst, const ptrdiff_t dst_stride,
   1398 //                     const pixel (*left)[2], const pixel *const top,
   1399 //                     const int pri_strength, const int sec_strength,
   1400 //                     const int dir, const int damping, const int w, int h,
   1401 //                     const enum CdefEdgeFlags edges HIGHBD_DECL_SUFFIX)
   1402 // w=4 h=4
   1403 //param: dst:a0, dst_stride:a1, left:a2, top:a3, bottom:a4, pri_strength:a5
   1404 //sec_strength:a6, dir:a7, damping:s7, w:s0, h:s1, edges:s2
   1405 function cdef_filter_block_4x4_8bpc_lsx
   1406    ld.w           t0,    sp,    0
   1407    ld.w           t1,    sp,    8
   1408    addi.d         sp,    sp,    -(64+288)
   1409    st.d           s0,    sp,    0
   1410    st.d           s1,    sp,    8
   1411    st.d           s2,    sp,    16
   1412    st.d           s3,    sp,    24
   1413    st.d           s4,    sp,    32
   1414    st.d           s5,    sp,    40
   1415    st.d           s6,    sp,    48
   1416    st.d           s7,    sp,    56
   1417 
   1418    li.w           s0,    4         //w
   1419    li.w           s1,    4         //h
   1420    or             s2,    t1,    t1 //edges
   1421    or             s7,    t0,    t0 //damping
   1422 
   1423    li.d           s5,    12         //tmp_stride
   1424    addi.d         s4,    sp,    64
   1425    slli.d         t0,    s5,    1
   1426    addi.d         t0,    t0,    2
   1427    slli.d         t0,    t0,    1
   1428    add.d          s4,    s4,    t0  //ptr tmp
   1429    vxor.v         vr23,  vr23,  vr23
   1430    li.w           t2,    1
   1431    vreplgr2vr.h   vr20,  t2
   1432    vaddi.hu       vr21,  vr20,  2
   1433    vaddi.hu       vr22,  vr20,  1
   1434 
   1435    li.w          t0,     -16384
   1436    vreplgr2vr.h  vr18,   t0
   1437 
   1438    //padding
   1439    li.w          t5,     -2        //x_start
   1440    addi.d        t6,     s0,    2  //x_end
   1441    li.w          t7,     -2        //y_start
   1442    addi.d        t8,     s1,    2  //y_end
   1443    li.w          t2,     2
   1444 
   1445    andi          t4,     s2,    4
   1446    bnez          t4,     1f
   1447 
   1448    //CDEF_HAVE_TOP
   1449    slli.d        t3,     s5,    2
   1450    addi.d        t4,     s4,    -4
   1451    sub.d         t4,     t4,    t3
   1452    addi.d        t3,     s0,    4
   1453 
   1454    cdef_fill     t4,     s5,    t3,     t2
   1455 
   1456    or            t7,     zero,  zero
   1457 
   1458 1:  //CDEF_HAVE_BOTTOM
   1459    andi          t4,     s2,8
   1460    bnez          t4,     2f
   1461 
   1462    mul.w         t3,     s1,    s5
   1463    slli.d        t3,     t3,  1
   1464    add.d         t4,     s4,  t3
   1465    addi.d        t4,     t4,    -4
   1466    li.d          t3,     8
   1467 
   1468    cdef_fill     t4,     s5,    t3,     t2
   1469 
   1470    addi.d        t8,     t8,    -2
   1471 
   1472 2:  //CDEF_HAVE_LEFT
   1473    andi          t4,     s2,1
   1474    bnez          t4,     3f
   1475 
   1476    mul.w         t3,     t7,    s5
   1477    slli.d        t3,     t3,    1
   1478    add.d         t4,     s4,    t3
   1479    addi.d        t4,     t4,    -4
   1480    sub.d         t3,     t8,    t7
   1481 
   1482    cdef_fill     t4,     s5,    t2,     t3
   1483 
   1484    or            t5,     zero,  zero
   1485 
   1486 3:  //CDEF_HAVE_RIGHT
   1487    andi          t4,     s2,2
   1488    bnez          t4,     40f
   1489 
   1490    mul.w         t3,     t7,    s5
   1491    slli.d        t3,     t3,    1
   1492    add.d         t4,     s4,    t3
   1493    addi.d        t4,     t4,    8
   1494    sub.d         t3,     t8,    t7
   1495 
   1496    cdef_fill     t4,     s5,    t2,     t3
   1497 
   1498    addi.d        t6,     t6,    -2
   1499 
   1500 40:
   1501    cdef_padding_data
   1502 
   1503    beqz           a5,    33f
   1504 
   1505 28:  //if (pri_strength)
   1506    li.w           t0,    4
   1507    andi           t1,    a5,    1
   1508    sub.d          t0,    t0,    t1  //pri_tap
   1509 
   1510    clz.w          t1,    a5
   1511    li.d           t2,    31
   1512    sub.w          t1,    t2,    t1
   1513    sub.w          t1,    s7,    t1
   1514 
   1515    blt            t1,    zero,  281f
   1516    or             t1,    t1,    t1
   1517    b              282f
   1518 281:
   1519    or             t1,    zero,  zero   //t1: pri_shift
   1520 282:
   1521 
   1522    beqz           a6,    31f
   1523 
   1524 29:  //if (sec_strength)
   1525    cdef_pri_sec_init
   1526 
   1527 30:
   1528    fld.s          f0,    a0,    0     //px
   1529    vsllwil.hu.bu  vr0,   vr0,   0
   1530    vpermi.w       vr0,   vr0,   0x44
   1531 
   1532    vxor.v         vr1,   vr1,   vr1   //sum
   1533    vor.v          vr2,   vr0,   vr0   //max
   1534    vor.v          vr3,   vr0,   vr0   //min
   1535    vor.v          vr15,  vr4,   vr4   //pri_tap_k
   1536 
   1537    sub.d          t4,    s4,    a2
   1538    sub.d          t5,    s4,    a3
   1539 
   1540    fldx.d         f5,    s4,    a2   //p0_00
   1541    fld.d          f6,    t4,    0    //p0_01
   1542    fldx.d         f7,    s4,    a3   //p0_10
   1543    fld.d          f8,    t5,    0    //p0_11
   1544 
   1545    cdef_process_data_w4 vr9,   vr10
   1546    cdef_calc_sum_tapchange_w4
   1547    cdef_calc_maxmin_w4
   1548 
   1549    sub.d          t4,    s4,    s1  //tmp[-off02]
   1550    sub.d          t5,    s4,    t0  //tmp[-off03]
   1551 
   1552    fldx.d         f5,    s4,    s1   //s0_00
   1553    fld.d          f6,    t4,    0    //s0_01
   1554    fldx.d         f7,    s4,    t0   //s0_02
   1555    fld.d          f8,    t5,    0    //s0_03
   1556 
   1557    cdef_process_data_w4 vr18, vr19
   1558    cdef_calc_sum_no_tapchange_w4 vr22
   1559    cdef_calc_maxmin_w4
   1560 
   1561    sub.d          t4,    s4,    s2  //tmp[-off12]
   1562    sub.d          t5,    s4,    s3  //tmp[-off13]
   1563 
   1564    fldx.d         f5,    s4,    s2   //s0_10
   1565    fld.d          f6,    t4,    0    //s0_11
   1566    fldx.d         f7,    s4,    s3   //s0_12
   1567    fld.d          f8,    t5,    0    //s0_13
   1568 
   1569    cdef_process_data_w4 vr18, vr19
   1570    cdef_calc_sum_no_tapchange_w4 vr20
   1571    cdef_calc_maxmin_w4
   1572 
   1573    vshuf4i.w      vr5,   vr1,   0x0e
   1574    vshuf4i.w      vr6,   vr3,   0x0e
   1575    vshuf4i.w      vr7,   vr2,   0x0e
   1576    vadd.h         vr1,   vr1,   vr5
   1577    vmin.hu        vr3,   vr6,   vr3
   1578    vmax.h         vr2,   vr7,   vr2
   1579 
   1580    cdef_calc_dst
   1581    iclip_vrh       vr5,   vr3,   vr2,  vr16,  vr17,  vr5
   1582 
   1583    vsrlni.b.h     vr5,   vr5,   0
   1584    fst.s          f5,    a0,    0
   1585 
   1586    add.d          a0,    a0,    a1
   1587    add.d          s4,    s4,    s5
   1588    add.d          s4,    s4,    s5
   1589 
   1590    addi.d         t2,    t2,    -1
   1591    blt            zero,  t2,    30b
   1592    b              35f
   1593 
   1594 31:  // pri_strength only
   1595    cdef_pri_init
   1596 
   1597 32:
   1598    fld.s          f0,    a0,    0     //px
   1599    vsllwil.hu.bu  vr0,   vr0,   0
   1600    vpermi.w       vr0,   vr0,   0x44
   1601 
   1602    vxor.v         vr1,   vr1,   vr1   //sum
   1603    vor.v          vr15,  vr4,   vr4   //pri_tap_k
   1604 
   1605    sub.d          t4,    s4,    a2
   1606    sub.d          t5,    s4,    a3
   1607 
   1608    fldx.d         f5,    s4,    a2   //p0_00
   1609    fld.d          f6,    t4,    0    //p0_01
   1610    fldx.d         f7,    s4,    a3   //p0_10
   1611    fld.d          f8,    t5,    0    //p0_11
   1612 
   1613    cdef_process_data_w4 vr9,   vr10
   1614    cdef_calc_sum_tapchange_w4
   1615 
   1616    vshuf4i.w      vr5,   vr1,   0x0e
   1617    vadd.h         vr1,   vr1,   vr5
   1618 
   1619    cdef_calc_dst
   1620 
   1621    vsrlni.b.h     vr5,   vr5,   0
   1622    fst.s          f5,    a0,    0
   1623 
   1624    add.d          a0,    a0,    a1
   1625    add.d          s4,    s4,    s5
   1626    add.d          s4,    s4,    s5
   1627 
   1628    addi.d         t2,    t2,    -1
   1629    blt            zero,  t2,    32b
   1630    b              35f
   1631 
   1632 33:   // sec_strength only
   1633    cdef_sec_init
   1634 
   1635 34:
   1636    fld.s          f0,    a0,    0     //px
   1637    vsllwil.hu.bu  vr0,   vr0,   0
   1638    vpermi.w       vr0,   vr0,   0x44
   1639 
   1640    vxor.v         vr1,   vr1,   vr1   //sum
   1641 
   1642    sub.d          t4,    s4,    s1  //tmp[-off02]
   1643    sub.d          t5,    s4,    t0  //tmp[-off03]
   1644 
   1645    fldx.d         f5,    s4,    s1   //s0_00
   1646    fld.d          f6,    t4,    0    //s0_01
   1647    fldx.d         f7,    s4,    t0   //s0_02
   1648    fld.d          f8,    t5,    0    //s0_03
   1649 
   1650    cdef_process_data_w4 vr18, vr19
   1651    cdef_calc_sum_no_tapchange_w4 vr22
   1652 
   1653    sub.d          t4,    s4,    s2  //tmp[-off12]
   1654    sub.d          t5,    s4,    s3  //tmp[-off13]
   1655 
   1656    fldx.d         f5,    s4,    s2   //s0_10
   1657    fld.d          f6,    t4,    0    //s0_11
   1658    fldx.d         f7,    s4,    s3   //s0_12
   1659    fld.d          f8,    t5,    0    //s0_13
   1660 
   1661    cdef_process_data_w4 vr18, vr19
   1662    cdef_calc_sum_no_tapchange_w4 vr20
   1663 
   1664    vshuf4i.w      vr5,   vr1,   0x0e
   1665    vadd.h         vr1,   vr1,   vr5
   1666 
   1667    cdef_calc_dst
   1668 
   1669    vsrlni.b.h     vr5,   vr5,   0
   1670    fst.s          f5,    a0,    0
   1671 
   1672    add.d          a0,    a0,    a1
   1673    add.d          s4,    s4,    s5
   1674    add.d          s4,    s4,    s5
   1675 
   1676    addi.d         t2,    t2,    -1
   1677    blt            zero,  t2,    34b
   1678 
   1679 35:
   1680    ld.d           s0,    sp,    0
   1681    ld.d           s1,    sp,    8
   1682    ld.d           s2,    sp,    16
   1683    ld.d           s3,    sp,    24
   1684    ld.d           s4,    sp,    32
   1685    ld.d           s5,    sp,    40
   1686    ld.d           s6,    sp,    48
   1687    ld.d           s7,    sp,    56
   1688    addi.d         sp,    sp,    (64+288)
   1689 endfunc
   1690 
   1691 function cdef_filter_block_4x8_8bpc_lsx
   1692    ld.w           t0,    sp,    0
   1693    ld.w           t1,    sp,    8
   1694    addi.d         sp,    sp,    -(64+288)
   1695    st.d           s0,    sp,    0
   1696    st.d           s1,    sp,    8
   1697    st.d           s2,    sp,    16
   1698    st.d           s3,    sp,    24
   1699    st.d           s4,    sp,    32
   1700    st.d           s5,    sp,    40
   1701    st.d           s6,    sp,    48
   1702    st.d           s7,    sp,    56
   1703 
   1704    li.w           s0,    4         //w
   1705    li.w           s1,    8         //h
   1706    or             s2,    t1,    t1 //edges
   1707    or             s7,    t0,    t0 //damping
   1708 
   1709    li.d           s5,    12         //tmp_stride
   1710    addi.d         s4,    sp,    64
   1711    slli.d         t0,    s5,    1
   1712    addi.d         t0,    t0,    2
   1713    slli.d         t0,    t0,    1
   1714    add.d          s4,    s4,    t0  //ptr tmp
   1715    vxor.v         vr23,  vr23,  vr23
   1716    li.w           t2,    1
   1717    vreplgr2vr.h   vr20,  t2
   1718    vaddi.hu       vr21,  vr20,  2
   1719    vaddi.hu       vr22,  vr20,  1
   1720 
   1721    li.w          t0,     -16384
   1722    vreplgr2vr.h  vr18,   t0
   1723 
   1724    //padding
   1725    li.w          t5,     -2        //x_start
   1726    addi.d        t6,     s0,    2  //x_end
   1727    li.w          t7,     -2        //y_start
   1728    addi.d        t8,     s1,    2  //y_end
   1729    li.w          t2,     2
   1730 
   1731    andi          t4,     s2,    4
   1732    bnez          t4,     1f
   1733 
   1734    //CDEF_HAVE_TOP
   1735    slli.d        t3,     s5,    2
   1736    addi.d        t4,     s4,    -4
   1737    sub.d         t4,     t4,    t3
   1738    addi.d        t3,     s0,    4
   1739 
   1740    cdef_fill     t4,     s5,    t3,     t2
   1741 
   1742    or            t7,     zero,  zero
   1743 
   1744 1:  //CDEF_HAVE_BOTTOM
   1745    andi          t4,     s2,8
   1746    bnez          t4,     2f
   1747 
   1748    mul.w         t3,     s1,    s5
   1749    slli.d        t3,     t3,  1
   1750    add.d         t4,     s4,  t3
   1751    addi.d        t4,     t4,    -4
   1752    li.d          t3,     8
   1753 
   1754    cdef_fill     t4,     s5,    t3,     t2
   1755 
   1756    addi.d        t8,     t8,    -2
   1757 
   1758 2:  //CDEF_HAVE_LEFT
   1759    andi          t4,     s2,1
   1760    bnez          t4,     3f
   1761 
   1762    mul.w         t3,     t7,    s5
   1763    slli.d        t3,     t3,    1
   1764    add.d         t4,     s4,    t3
   1765    addi.d        t4,     t4,    -4
   1766    sub.d         t3,     t8,    t7
   1767 
   1768    cdef_fill     t4,     s5,    t2,     t3
   1769 
   1770    or            t5,     zero,  zero
   1771 
   1772 3:  //CDEF_HAVE_RIGHT
   1773    andi          t4,     s2,2
   1774    bnez          t4,     40f
   1775 
   1776    mul.w         t3,     t7,    s5
   1777    slli.d        t3,     t3,    1
   1778    add.d         t4,     s4,    t3
   1779    addi.d        t4,     t4,    8
   1780    sub.d         t3,     t8,    t7
   1781 
   1782    cdef_fill     t4,     s5,    t2,     t3
   1783 
   1784    addi.d        t6,     t6,    -2
   1785 
   1786 40:
   1787    cdef_padding_data
   1788 
   1789    beqz           a5,    33f
   1790 
   1791 28:  //if (pri_strength)
   1792    li.w           t0,    4
   1793    andi           t1,    a5,    1
   1794    sub.d          t0,    t0,    t1  //pri_tap
   1795 
   1796    clz.w          t1,    a5
   1797    li.d           t2,    31
   1798    sub.w          t1,    t2,    t1
   1799    sub.w          t1,    s7,    t1
   1800 
   1801    blt            t1,    zero,  281f
   1802    or             t1,    t1,    t1
   1803    b              282f
   1804 281:
   1805    or             t1,    zero,  zero   //t1: pri_shift
   1806 282:
   1807 
   1808    beqz           a6,    31f
   1809 
   1810 29:  //if (sec_strength)
   1811    cdef_pri_sec_init
   1812 
   1813 30:
   1814    fld.s          f0,    a0,    0     //px
   1815    vsllwil.hu.bu  vr0,   vr0,   0
   1816    vpermi.w       vr0,   vr0,   0x44
   1817 
   1818    vxor.v         vr1,   vr1,   vr1   //sum
   1819    vor.v          vr2,   vr0,   vr0   //max
   1820    vor.v          vr3,   vr0,   vr0   //min
   1821    vor.v          vr15,  vr4,   vr4   //pri_tap_k
   1822 
   1823    sub.d          t4,    s4,    a2
   1824    sub.d          t5,    s4,    a3
   1825 
   1826    fldx.d         f5,    s4,    a2   //p0_00
   1827    fld.d          f6,    t4,    0    //p0_01
   1828    fldx.d         f7,    s4,    a3   //p0_10
   1829    fld.d          f8,    t5,    0    //p0_11
   1830 
   1831    cdef_process_data_w4 vr9,   vr10
   1832    cdef_calc_sum_tapchange_w4
   1833    cdef_calc_maxmin_w4
   1834 
   1835    sub.d          t4,    s4,    s1  //tmp[-off02]
   1836    sub.d          t5,    s4,    t0  //tmp[-off03]
   1837 
   1838    fldx.d         f5,    s4,    s1   //s0_00
   1839    fld.d          f6,    t4,    0    //s0_01
   1840    fldx.d         f7,    s4,    t0   //s0_02
   1841    fld.d          f8,    t5,    0    //s0_03
   1842 
   1843    cdef_process_data_w4 vr18, vr19
   1844    cdef_calc_sum_no_tapchange_w4 vr22
   1845    cdef_calc_maxmin_w4
   1846 
   1847    sub.d          t4,    s4,    s2  //tmp[-off12]
   1848    sub.d          t5,    s4,    s3  //tmp[-off13]
   1849 
   1850    fldx.d         f5,    s4,    s2   //s0_10
   1851    fld.d          f6,    t4,    0    //s0_11
   1852    fldx.d         f7,    s4,    s3   //s0_12
   1853    fld.d          f8,    t5,    0    //s0_13
   1854 
   1855    cdef_process_data_w4 vr18, vr19
   1856    cdef_calc_sum_no_tapchange_w4 vr20
   1857    cdef_calc_maxmin_w4
   1858 
   1859    vshuf4i.w      vr5,   vr1,   0x0e
   1860    vshuf4i.w      vr6,   vr3,   0x0e
   1861    vshuf4i.w      vr7,   vr2,   0x0e
   1862    vadd.h         vr1,   vr1,   vr5
   1863    vmin.hu        vr3,   vr6,   vr3
   1864    vmax.h         vr2,   vr7,   vr2
   1865 
   1866    cdef_calc_dst
   1867    iclip_vrh       vr5,   vr3,   vr2,  vr16,  vr17,  vr5
   1868 
   1869    vsrlni.b.h     vr5,   vr5,   0
   1870    fst.s          f5,    a0,    0
   1871 
   1872    add.d          a0,    a0,    a1
   1873    add.d          s4,    s4,    s5
   1874    add.d          s4,    s4,    s5
   1875 
   1876    addi.d         t2,    t2,    -1
   1877    blt            zero,  t2,    30b
   1878    b              35f
   1879 
   1880 31:  // pri_strength only
   1881    cdef_pri_init
   1882 
   1883 32:
   1884    fld.s          f0,    a0,    0     //px
   1885    vsllwil.hu.bu  vr0,   vr0,   0
   1886    vpermi.w       vr0,   vr0,   0x44
   1887 
   1888    vxor.v         vr1,   vr1,   vr1   //sum
   1889    vor.v          vr15,  vr4,   vr4   //pri_tap_k
   1890 
   1891    sub.d          t4,    s4,    a2
   1892    sub.d          t5,    s4,    a3
   1893 
   1894    fldx.d         f5,    s4,    a2   //p0_00
   1895    fld.d          f6,    t4,    0    //p0_01
   1896    fldx.d         f7,    s4,    a3   //p0_10
   1897    fld.d          f8,    t5,    0    //p0_11
   1898 
   1899    cdef_process_data_w4 vr9,   vr10
   1900    cdef_calc_sum_tapchange_w4
   1901 
   1902    vshuf4i.w      vr5,   vr1,   0x0e
   1903    vadd.h         vr1,   vr1,   vr5
   1904 
   1905    cdef_calc_dst
   1906 
   1907    vsrlni.b.h     vr5,   vr5,   0
   1908    fst.s          f5,    a0,    0
   1909 
   1910    add.d          a0,    a0,    a1
   1911    add.d          s4,    s4,    s5
   1912    add.d          s4,    s4,    s5
   1913 
   1914    addi.d         t2,    t2,    -1
   1915    blt            zero,  t2,    32b
   1916    b              35f
   1917 
   1918 33:   // sec_strength only
   1919    cdef_sec_init
   1920 
   1921 34:
   1922    fld.s          f0,    a0,    0     //px
   1923    vsllwil.hu.bu  vr0,   vr0,   0
   1924    vpermi.w       vr0,   vr0,   0x44
   1925 
   1926    vxor.v         vr1,   vr1,   vr1   //sum
   1927 
   1928    sub.d          t4,    s4,    s1  //tmp[-off02]
   1929    sub.d          t5,    s4,    t0  //tmp[-off03]
   1930 
   1931    fldx.d         f5,    s4,    s1   //s0_00
   1932    fld.d          f6,    t4,    0    //s0_01
   1933    fldx.d         f7,    s4,    t0   //s0_02
   1934    fld.d          f8,    t5,    0    //s0_03
   1935 
   1936    cdef_process_data_w4 vr18, vr19
   1937    cdef_calc_sum_no_tapchange_w4 vr22
   1938 
   1939    sub.d          t4,    s4,    s2  //tmp[-off12]
   1940    sub.d          t5,    s4,    s3  //tmp[-off13]
   1941 
   1942    fldx.d         f5,    s4,    s2   //s0_10
   1943    fld.d          f6,    t4,    0    //s0_11
   1944    fldx.d         f7,    s4,    s3   //s0_12
   1945    fld.d          f8,    t5,    0    //s0_13
   1946 
   1947    cdef_process_data_w4 vr18, vr19
   1948    cdef_calc_sum_no_tapchange_w4 vr20
   1949 
   1950    vshuf4i.w      vr5,   vr1,   0x0e
   1951    vadd.h         vr1,   vr1,   vr5
   1952 
   1953    cdef_calc_dst
   1954 
   1955    vsrlni.b.h     vr5,   vr5,   0
   1956    fst.s          f5,    a0,    0
   1957 
   1958    add.d          a0,    a0,    a1
   1959    add.d          s4,    s4,    s5
   1960    add.d          s4,    s4,    s5
   1961 
   1962    addi.d         t2,    t2,    -1
   1963    blt            zero,  t2,    34b
   1964 
   1965 35:
   1966    ld.d           s0,    sp,    0
   1967    ld.d           s1,    sp,    8
   1968    ld.d           s2,    sp,    16
   1969    ld.d           s3,    sp,    24
   1970    ld.d           s4,    sp,    32
   1971    ld.d           s5,    sp,    40
   1972    ld.d           s6,    sp,    48
   1973    ld.d           s7,    sp,    56
   1974    addi.d         sp,    sp,    (64+288)
   1975 endfunc
   1976 
   1977 function cdef_filter_block_8x8_8bpc_lsx
   1978    ld.w           t0,    sp,    0
   1979    ld.w           t1,    sp,    8
   1980    addi.d         sp,    sp,    -(64+288)
   1981    st.d           s0,    sp,    0
   1982    st.d           s1,    sp,    8
   1983    st.d           s2,    sp,    16
   1984    st.d           s3,    sp,    24
   1985    st.d           s4,    sp,    32
   1986    st.d           s5,    sp,    40
   1987    st.d           s6,    sp,    48
   1988    st.d           s7,    sp,    56
   1989 
   1990    li.w           s0,    8         //w
   1991    li.w           s1,    8         //h
   1992    or             s2,    t1,    t1 //edges
   1993    or             s7,    t0,    t0 //damping
   1994 
   1995    // cdef_filter_block_kernel
   1996    li.d           s5,    12         //tmp_stride
   1997    addi.d         s4,    sp,    64
   1998    slli.d         t0,    s5,    1
   1999    addi.d         t0,    t0,    2
   2000    slli.d         t0,    t0,    1
   2001    add.d          s4,    s4,    t0  //ptr tmp
   2002    vxor.v         vr23,  vr23,  vr23
   2003    li.w           t2,    1
   2004    vreplgr2vr.h   vr20,  t2
   2005    vaddi.hu       vr21,  vr20,  2
   2006    vaddi.hu       vr22,  vr20,  1
   2007 
   2008    li.w          t0,     -16384
   2009    vreplgr2vr.h  vr18,   t0
   2010 
   2011    //padding
   2012    li.w          t5,     -2        //x_start
   2013    addi.d        t6,     s0,    2  //x_end
   2014    li.w          t7,     -2        //y_start
   2015    addi.d        t8,     s1,    2  //y_end
   2016    li.w          t2,     2
   2017 
   2018    andi          t4,     s2,    4
   2019    bnez          t4,     1f
   2020 
   2021    //CDEF_HAVE_TOP
   2022    slli.d        t3,     s5,    2
   2023    addi.d        t4,     s4,    -4
   2024    sub.d         t4,     t4,    t3
   2025    addi.d        t3,     s0,    4
   2026 
   2027    cdef_fill     t4,     s5,    t3,     t2
   2028 
   2029    or            t7,     zero,  zero
   2030 
   2031 1:  //CDEF_HAVE_BOTTOM
   2032    andi          t4,     s2,8
   2033    bnez          t4,     2f
   2034 
   2035    mul.w         t3,     s1,    s5
   2036    slli.d        t3,     t3,  1
   2037    add.d         t4,     s4,  t3
   2038    addi.d        t4,     t4,    -4
   2039    li.d          t3,     12
   2040 
   2041    cdef_fill     t4,     s5,    t3,    t2
   2042 
   2043    addi.d        t8,     t8,    -2
   2044 
   2045 2:  //CDEF_HAVE_LEFT
   2046    andi          t4,     s2,1
   2047    bnez          t4,     3f
   2048 
   2049    mul.w         t3,     t7,    s5
   2050    slli.d        t3,     t3,    1
   2051    add.d         t4,     s4,    t3
   2052    addi.d        t4,     t4,    -4
   2053    sub.d         t3,     t8,    t7
   2054    li.d          t2,     2
   2055 
   2056    cdef_fill     t4,     s5,    t2,    t3
   2057 
   2058    or            t5,     zero,  zero
   2059 
   2060 3:  //CDEF_HAVE_RIGHT
   2061    andi          t4,     s2,2
   2062    bnez          t4,     40f
   2063 
   2064    mul.w         t3,     t7,    s5
   2065    slli.d        t3,     t3,    1
   2066    add.d         t4,     s4,    t3
   2067    addi.d        t4,     t4,    16
   2068    sub.d         t3,     t8,    t7
   2069    li.d          t2,     2
   2070 
   2071    cdef_fill     t4,     s5,    t2,    t3
   2072 
   2073    addi.d        t6,     t6,    -2
   2074 
   2075 40:
   2076    cdef_padding_data
   2077 
   2078    beqz           a5,    33f
   2079 
   2080 28:  //if (pri_strength)
   2081    li.w           t0,    4
   2082    andi           t1,    a5,    1
   2083    sub.d          t0,    t0,    t1  //pri_tap
   2084 
   2085    //edit
   2086    clz.w          t1,    a5
   2087    li.d           t2,    31
   2088    sub.w          t3,    t2,    t1
   2089    sub.w          t3,    s7,    t3
   2090 
   2091    or             t1,    zero,  zero   //t1: pri_shift
   2092    blt            t3,    zero,  281f
   2093    or             t1,    t3,    t3
   2094 281:
   2095 
   2096    beqz           a6,    31f
   2097 
   2098 29:  //if (sec_strength)
   2099    cdef_pri_sec_init
   2100 
   2101 301:
   2102    fld.d          f0,    a0,    0     //px
   2103    vsllwil.hu.bu  vr0,   vr0,   0
   2104 
   2105    vxor.v         vr1,   vr1,   vr1   //sum
   2106    vor.v          vr2,   vr0,   vr0   //max
   2107    vor.v          vr3,   vr0,   vr0   //min
   2108    vor.v          vr15,  vr4,   vr4   //pri_tap_k
   2109 
   2110    sub.d          t4,    s4,    a2
   2111    sub.d          t5,    s4,    a3
   2112 
   2113    vldx           vr5,  s4,    a2
   2114    vld            vr6,  t4,    0
   2115    vldx           vr7,  s4,    a3
   2116    vld            vr8,  t5,    0
   2117 
   2118    cdef_process_data_w8 vr9, vr10
   2119    cdef_calc_sum_tapchange_w8
   2120    cdef_calc_maxmin_w8
   2121 
   2122    //s 00-03
   2123    sub.d          t4,    s4,    s1  //tmp[-off02]
   2124    sub.d          t5,    s4,    t0  //tmp[-off03]
   2125 
   2126    vldx           vr5,  s4,    s1
   2127    vld            vr6,  t4,    0
   2128    vldx           vr7,  s4,    t0
   2129    vld            vr8,  t5,    0
   2130 
   2131    cdef_process_data_w8 vr18, vr19
   2132    cdef_calc_sum_no_tapchange_w8 vr22
   2133    cdef_calc_maxmin_w8
   2134 
   2135    //s 10-13
   2136    sub.d          t4,    s4,    s2  //tmp[-off12]
   2137    sub.d          t5,    s4,    s3  //tmp[-off13]
   2138 
   2139    vldx           vr5,  s4,    s2
   2140    vld            vr6,  t4,    0
   2141    vldx           vr7,  s4,    s3
   2142    vld            vr8,  t5,    0
   2143 
   2144    cdef_process_data_w8 vr18, vr19
   2145    cdef_calc_sum_no_tapchange_w8 vr20
   2146 
   2147    cdef_calc_maxmin_w8
   2148    cdef_calc_dst
   2149 
   2150    iclip_vrh       vr5,   vr3,   vr2,  vr16,  vr17,  vr5
   2151 
   2152    vsrlni.b.h     vr5,   vr5,   0
   2153    fst.d          f5,    a0,    0
   2154 
   2155    add.d          a0,    a0,    a1
   2156    add.d          s4,    s4,    s5
   2157    add.d          s4,    s4,    s5
   2158 
   2159    addi.d         t2,    t2,    -1
   2160    blt            zero,  t2,    301b
   2161    b              35f
   2162 
   2163 31:  // pri_strength only
   2164    cdef_pri_init
   2165 
   2166 32:
   2167    fld.d          f0,    a0,    0     //px
   2168    vsllwil.hu.bu  vr0,   vr0,   0
   2169 
   2170    vxor.v         vr1,   vr1,   vr1   //sum
   2171    vor.v          vr15,  vr4,   vr4   //pri_tap_k
   2172 
   2173    sub.d          t4,    s4,    a2
   2174    sub.d          t5,    s4,    a3
   2175 
   2176    vldx           vr5,  s4,    a2
   2177    vld            vr6,  t4,    0
   2178    vldx           vr7,  s4,    a3
   2179    vld            vr8,  t5,    0
   2180 
   2181    cdef_process_data_w8 vr9, vr10
   2182    cdef_calc_sum_tapchange_w8
   2183    cdef_calc_dst
   2184 
   2185    vsrlni.b.h     vr5,   vr5,   0
   2186    fst.d          f5,    a0,    0
   2187 
   2188    add.d          a0,    a0,    a1
   2189    add.d          s4,    s4,    s5
   2190    add.d          s4,    s4,    s5
   2191 
   2192    addi.d         t2,    t2,    -1
   2193    blt            zero,  t2,    32b
   2194    b              35f
   2195 
   2196 33:   // sec_strength only
   2197    cdef_sec_init
   2198 
   2199 34:
   2200    fld.d          f0,    a0,    0     //px
   2201    vsllwil.hu.bu  vr0,   vr0,   0
   2202 
   2203    vxor.v         vr1,   vr1,   vr1   //sum
   2204 
   2205    sub.d          t4,    s4,    s1  //tmp[-off02]
   2206    sub.d          t5,    s4,    t0  //tmp[-off03]
   2207 
   2208    vldx           vr5,  s4,    s1
   2209    vld            vr6,  t4,    0
   2210    vldx           vr7,  s4,    t0
   2211    vld            vr8,  t5,    0
   2212 
   2213    cdef_process_data_w8 vr18,  vr19
   2214    cdef_calc_sum_no_tapchange_w8 vr22
   2215 
   2216    sub.d          t4,    s4,    s2  //tmp[-off12]
   2217    sub.d          t5,    s4,    s3  //tmp[-off13]
   2218 
   2219    vldx           vr5,  s4,    s2
   2220    vld            vr6,  t4,    0
   2221    vldx           vr7,  s4,    s3
   2222    vld            vr8,  t5,    0
   2223 
   2224    cdef_process_data_w8 vr18,  vr19
   2225    cdef_calc_sum_no_tapchange_w8 vr20
   2226    cdef_calc_dst
   2227 
   2228    vsrlni.b.h     vr5,   vr5,   0
   2229    fst.d          f5,    a0,    0
   2230 
   2231    add.d          a0,    a0,    a1
   2232    add.d          s4,    s4,    s5
   2233    add.d          s4,    s4,    s5
   2234 
   2235    addi.d         t2,    t2,    -1
   2236    blt            zero,  t2,    34b
   2237 
   2238 35:
   2239    ld.d           s0,    sp,    0
   2240    ld.d           s1,    sp,    8
   2241    ld.d           s2,    sp,    16
   2242    ld.d           s3,    sp,    24
   2243    ld.d           s4,    sp,    32
   2244    ld.d           s5,    sp,    40
   2245    ld.d           s6,    sp,    48
   2246    ld.d           s7,    sp,    56
   2247    addi.d         sp,    sp,    (64+288)
   2248 endfunc