tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

looprestoration.S (78371B)


/*
 * Copyright © 2023, VideoLAN and dav1d authors
 * Copyright © 2023, Loongson Technology Corporation Limited
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "src/loongarch/loongson_asm.S"

#define REST_UNIT_STRIDE (400)

// widen the unsigned bytes of \in0 to halfwords, then accumulate
// \in0 * \in1 into \out0 (low 8 lanes) and \out1 (high 8 lanes)
.macro MADD_HU_BU in0, in1, out0, out1
    vsllwil.hu.bu vr12,     \in0,     0
    vexth.hu.bu   vr13,     \in0
    vmadd.h       \out0,    vr12,     \in1
    vmadd.h       \out1,    vr13,     \in1
.endm

const wiener_shuf
.byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
endconst

/*
void wiener_filter_h_lsx(int32_t *hor_ptr,
                         uint8_t *tmp_ptr,
                         const int16_t filterh[8],
                         const int w, const int h)
*/
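/*
A minimal C reference sketch of the per-row computation below (names taken
from the prototype above; iclip() as in dav1d's C code, 8 bpc constants):

    for (int i = 0; i < w; i++) {
        int sum = (1 << 14) + (tmp_ptr[i + 3] << 7);
        for (int k = 0; k < 7; k++)
            sum += tmp_ptr[i + k] * filterh[k];
        hor_ptr[i] = iclip((sum + 4) >> 3, 0, (1 << 13) - 1);
    }

The LSX body vectorizes this 16 pixels per iteration.
*/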
function wiener_filter_h_8bpc_lsx
    addi.d        sp,       sp,       -40
    fst.d         f24,      sp,       0
    fst.d         f25,      sp,       8
    fst.d         f26,      sp,       16
    fst.d         f27,      sp,       24
    fst.d         f28,      sp,       32
    li.w          t7,       1<<14          // clip_limit

    la.local      t1,       wiener_shuf
    vld           vr4,      t1,       0
    vld           vr14,     a2,       0    // filter[0][k]
    vreplvei.h    vr21,     vr14,     0
    vreplvei.h    vr22,     vr14,     1
    vreplvei.h    vr23,     vr14,     2
    vreplvei.h    vr24,     vr14,     3
    vreplvei.h    vr25,     vr14,     4
    vreplvei.h    vr26,     vr14,     5
    vreplvei.h    vr27,     vr14,     6
    vreplgr2vr.w  vr0,      t7

.WIENER_FILTER_H_H:
    addi.w        a4,       a4,       -1    // h
    addi.w        t0,       a3,       0     // w
    addi.d        t1,       a1,       0     // tmp_ptr
    addi.d        t2,       a0,       0     // hor_ptr

.WIENER_FILTER_H_W:
    addi.w        t0,       t0,       -16
    vld           vr5,      t1,       0
    vld           vr13,     t1,       16

    vsubi.bu      vr14,     vr4,      2
    vsubi.bu      vr15,     vr4,      1
    vshuf.b       vr6,      vr13,     vr5,     vr14  // 1 ... 8, 9 ... 16
    vshuf.b       vr7,      vr13,     vr5,     vr15  // 2 ... 9, 10 ... 17
    vshuf.b       vr8,      vr13,     vr5,     vr4   // 3 ... 10, 11 ... 18
    vaddi.bu      vr14,     vr4,      1
    vaddi.bu      vr15,     vr4,      2
    vshuf.b       vr9,      vr13,     vr5,     vr14  // 4 ... 11, 12 ... 19
    vshuf.b       vr10,     vr13,     vr5,     vr15  // 5 ... 12, 13 ... 20
    vaddi.bu      vr14,     vr4,      3
    vshuf.b       vr11,     vr13,     vr5,     vr14  // 6 ... 13, 14 ... 21

    vsllwil.hu.bu vr15,     vr8,      0    //  3  4  5  6  7  8  9 10
    vexth.hu.bu   vr16,     vr8            // 11 12 13 14 15 16 17 18
    vsllwil.wu.hu vr17,     vr15,     7    //  3  4  5  6
    vexth.wu.hu   vr18,     vr15           //  7  8  9 10
    vsllwil.wu.hu vr19,     vr16,     7    // 11 12 13 14
    vexth.wu.hu   vr20,     vr16           // 15 16 17 18
    vslli.w       vr18,     vr18,     7
    vslli.w       vr20,     vr20,     7
    vxor.v        vr15,     vr15,     vr15
    vxor.v        vr14,     vr14,     vr14

    MADD_HU_BU    vr5,   vr21,  vr14,  vr15
    MADD_HU_BU    vr6,   vr22,  vr14,  vr15
    MADD_HU_BU    vr7,   vr23,  vr14,  vr15
    MADD_HU_BU    vr8,   vr24,  vr14,  vr15
    MADD_HU_BU    vr9,   vr25,  vr14,  vr15
    MADD_HU_BU    vr10,  vr26,  vr14,  vr15
    MADD_HU_BU    vr11,  vr27,  vr14,  vr15

    vsllwil.w.h   vr5,      vr14,     0   //  0  1  2  3
    vexth.w.h     vr6,      vr14          //  4  5  6  7
    vsllwil.w.h   vr7,      vr15,     0   //  8  9 10 11
    vexth.w.h     vr8,      vr15          // 12 13 14 15
    vadd.w        vr17,     vr17,     vr5
    vadd.w        vr18,     vr18,     vr6
    vadd.w        vr19,     vr19,     vr7
    vadd.w        vr20,     vr20,     vr8
    vadd.w        vr17,     vr17,     vr0
    vadd.w        vr18,     vr18,     vr0
    vadd.w        vr19,     vr19,     vr0
    vadd.w        vr20,     vr20,     vr0

    vsrli.w       vr1,      vr0,      1
    vsubi.wu      vr1,      vr1,      1
    vxor.v        vr3,      vr3,      vr3
    vsrari.w      vr17,     vr17,     3
    vsrari.w      vr18,     vr18,     3
    vsrari.w      vr19,     vr19,     3
    vsrari.w      vr20,     vr20,     3
    vclip.w       vr17,     vr17,     vr3,     vr1
    vclip.w       vr18,     vr18,     vr3,     vr1
    vclip.w       vr19,     vr19,     vr3,     vr1
    vclip.w       vr20,     vr20,     vr3,     vr1

    vst           vr17,     t2,       0
    vst           vr18,     t2,       16
    vst           vr19,     t2,       32
    vst           vr20,     t2,       48
    addi.d        t1,       t1,       16
    addi.d        t2,       t2,       64
    blt           zero,     t0,       .WIENER_FILTER_H_W

    addi.d        a1,       a1,       REST_UNIT_STRIDE
    addi.d        a0,       a0,       (REST_UNIT_STRIDE << 2)
    bnez          a4,       .WIENER_FILTER_H_H

    fld.d         f24,      sp,       0
    fld.d         f25,      sp,       8
    fld.d         f26,      sp,       16
    fld.d         f27,      sp,       24
    fld.d         f28,      sp,       32
    addi.d        sp,       sp,       40
endfunc

// t7 = \in1 + \in0 * 4; load 16 int32 taps from hor[] at that address and
// accumulate them into vr14-vr17, weighted by \in2
.macro APPLY_FILTER in0, in1, in2
    alsl.d         t7,      \in0,     \in1,    2
    vld            vr10,    t7,       0
    vld            vr11,    t7,       16
    vld            vr12,    t7,       32
    vld            vr13,    t7,       48
    vmadd.w        vr14,    vr10,     \in2
    vmadd.w        vr15,    vr11,     \in2
    vmadd.w        vr16,    vr12,     \in2
    vmadd.w        vr17,    vr13,     \in2
.endm

.macro wiener_filter_v_8bpc_core_lsx
    // seed the four accumulators with the rounding offset held in t6
    vreplgr2vr.w  vr14,     t6
    vreplgr2vr.w  vr15,     t6
    vreplgr2vr.w  vr16,     t6
    vreplgr2vr.w  vr17,     t6

    addi.w        t7,       t2,       0      // j + index k
    mul.w         t7,       t7,       t8     // (j + index) * REST_UNIT_STRIDE
    add.w         t7,       t7,       t4     // (j + index) * REST_UNIT_STRIDE + i

    APPLY_FILTER  t7, a2, vr2
    APPLY_FILTER  t8, t7, vr3
    APPLY_FILTER  t8, t7, vr4
    APPLY_FILTER  t8, t7, vr5
    APPLY_FILTER  t8, t7, vr6
    APPLY_FILTER  t8, t7, vr7
    APPLY_FILTER  t8, t7, vr8
    vssrarni.hu.w vr15,     vr14,     11
    vssrarni.hu.w vr17,     vr16,     11
    vssrlni.bu.h  vr17,     vr15,     0
.endm

/*
void wiener_filter_v_lsx(uint8_t *p,
                         const ptrdiff_t p_stride,
                         const int32_t *hor,
                         const int16_t filterv[8],
                         const int w, const int h)
*/
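/*
A minimal C sketch of the per-pixel computation (8 bpc constants as set up
below: bias -(1 << 18), rounded shift by 11; iclip_u8() assumed):

    for (int i = 0; i < w; i++) {
        int sum = -(1 << 18);
        for (int k = 0; k < 7; k++)
            sum += hor[(j + k) * REST_UNIT_STRIDE + i] * filterv[k];
        p[j * p_stride + i] = iclip_u8((sum + (1 << 10)) >> 11);
    }
*/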
function wiener_filter_v_8bpc_lsx
    li.w          t6,       -(1 << 18)

    li.w          t8,       REST_UNIT_STRIDE
    ld.h          t0,       a3,       0
    ld.h          t1,       a3,       2
    vreplgr2vr.w  vr2,      t0
    vreplgr2vr.w  vr3,      t1
    ld.h          t0,       a3,       4
    ld.h          t1,       a3,       6
    vreplgr2vr.w  vr4,      t0
    vreplgr2vr.w  vr5,      t1
    ld.h          t0,       a3,       8
    ld.h          t1,       a3,       10
    vreplgr2vr.w  vr6,      t0
    vreplgr2vr.w  vr7,      t1
    ld.h          t0,       a3,       12
    vreplgr2vr.w  vr8,      t0

    andi          t1,       a4,       0xf
    sub.w         t0,       a4,       t1    // w-w%16
    or            t2,       zero,     zero  // j
    or            t4,       zero,     zero
    beqz          t0,       .WIENER_FILTER_V_W_LT16

.WIENER_FILTER_V_H:
    andi          t1,       a4,       0xf
    add.d         t3,       zero,     a0     // p
    or            t4,       zero,     zero   // i

.WIENER_FILTER_V_W:

    wiener_filter_v_8bpc_core_lsx

    mul.w         t5,       t2,       a1   // j * stride
    add.w         t5,       t5,       t4   // j * stride + i
    add.d         t3,       a0,       t5
    addi.w        t4,       t4,       16
    vst           vr17,     t3,       0
    bne           t0,       t4,       .WIENER_FILTER_V_W

    beqz          t1,       .WIENER_FILTER_V_W_EQ16

    wiener_filter_v_8bpc_core_lsx

    addi.d        t3,       t3,       16
    andi          t1,       a4,       0xf

.WIENER_FILTER_V_ST_REM:
    vstelm.b      vr17,     t3,       0,    0
    vbsrl.v       vr17,     vr17,     1
    addi.d        t3,       t3,       1
    addi.w        t1,       t1,       -1
    bnez          t1,       .WIENER_FILTER_V_ST_REM
.WIENER_FILTER_V_W_EQ16:
    addi.w        t2,       t2,       1
    blt           t2,       a5,       .WIENER_FILTER_V_H
    b             .WIENER_FILTER_V_END

.WIENER_FILTER_V_W_LT16:
    andi          t1,       a4,       0xf
    add.d         t3,       zero,     a0

    wiener_filter_v_8bpc_core_lsx

    mul.w         t5,       t2,       a1   // j * stride
    add.d         t3,       a0,       t5

.WIENER_FILTER_V_ST_REM_1:
    vstelm.b      vr17,     t3,       0,    0
    vbsrl.v       vr17,     vr17,     1
    addi.d        t3,       t3,       1
    addi.w        t1,       t1,       -1
    bnez          t1,       .WIENER_FILTER_V_ST_REM_1

    addi.w        t2,       t2,       1
    blt           t2,       a5,       .WIENER_FILTER_V_W_LT16

.WIENER_FILTER_V_END:
endfunc

/*
void boxsum3_h(int32_t *sumsq, coef *sum, const pixel *src,
               const int w, const int h)
*/
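/*
C sketch of this pass (vertical 3-tap box sums over the padded source,
mirroring dav1d's C boxsum3; S = REST_UNIT_STRIDE):

    for (int x = 1; x < w - 1; x++)
        for (int y = 2; y < h - 2; y++) {
            const pixel *s = src + y * S + x;
            sum  [y * S + x] = s[-S] + s[0] + s[S];
            sumsq[y * S + x] = s[-S] * s[-S] + s[0] * s[0] + s[S] * s[S];
        }
*/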
function boxsum3_h_8bpc_lsx
    addi.d         a2,      a2,      REST_UNIT_STRIDE
    li.w           t0,      1
    addi.w         a3,      a3,      -2
    addi.w         a4,      a4,      -4

.LBS3_H_H:
    alsl.d         t1,      t0,      a1,    1     // sum_v    *sum_v = sum + x
    alsl.d         t2,      t0,      a0,    2     // sumsq_v  *sumsq_v = sumsq + x
    add.d          t3,      t0,      a2           // s
    addi.w         t5,      a3,      0
.LBS3_H_W:
    vld            vr0,     t3,      0
    vld            vr1,     t3,      REST_UNIT_STRIDE
    vld            vr2,     t3,      (REST_UNIT_STRIDE<<1)

    vilvl.b        vr3,     vr1,     vr0
    vhaddw.hu.bu   vr4,     vr3,     vr3
    vilvh.b        vr5,     vr1,     vr0
    vhaddw.hu.bu   vr6,     vr5,     vr5
    vsllwil.hu.bu  vr7,     vr2,     0
    vexth.hu.bu    vr8,     vr2
    // sum_v
    vadd.h         vr4,     vr4,     vr7
    vadd.h         vr6,     vr6,     vr8
    vst            vr4,     t1,      REST_UNIT_STRIDE<<1
    vst            vr6,     t1,      (REST_UNIT_STRIDE<<1)+16
    addi.d         t1,      t1,      32
    // sumsq
    vmulwev.h.bu   vr9,     vr3,     vr3
    vmulwod.h.bu   vr10,    vr3,     vr3
    vmulwev.h.bu   vr11,    vr5,     vr5
    vmulwod.h.bu   vr12,    vr5,     vr5
    vaddwev.w.hu   vr13,    vr10,    vr9
    vaddwod.w.hu   vr14,    vr10,    vr9
    vaddwev.w.hu   vr15,    vr12,    vr11
    vaddwod.w.hu   vr16,    vr12,    vr11
    vmaddwev.w.hu  vr13,    vr7,     vr7
    vmaddwod.w.hu  vr14,    vr7,     vr7
    vmaddwev.w.hu  vr15,    vr8,     vr8
    vmaddwod.w.hu  vr16,    vr8,     vr8
    vilvl.w        vr9,     vr14,    vr13
    vilvh.w        vr10,    vr14,    vr13
    vilvl.w        vr11,    vr16,    vr15
    vilvh.w        vr12,    vr16,    vr15
    vst            vr9,     t2,      REST_UNIT_STRIDE<<2
    vst            vr10,    t2,      (REST_UNIT_STRIDE<<2)+16
    vst            vr11,    t2,      (REST_UNIT_STRIDE<<2)+32
    vst            vr12,    t2,      (REST_UNIT_STRIDE<<2)+48

    addi.d         t2,      t2,      64
    addi.w         t5,      t5,      -16
    addi.d         t3,      t3,      16
    blt            zero,    t5,      .LBS3_H_W

    addi.d         a0,      a0,      REST_UNIT_STRIDE<<2
    addi.d         a1,      a1,      REST_UNIT_STRIDE<<1
    addi.d         a2,      a2,      REST_UNIT_STRIDE
    addi.d         a4,      a4,      -1
    blt            zero,    a4,      .LBS3_H_H
endfunc

/*
void boxsum3_v(int32_t *sumsq, coef *sum,
               const int w, const int h)
*/
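/*
C sketch of the in-place horizontal 3-tap pass: every row produced by
boxsum3_h is rewritten as sum[x - 1] + sum[x] + sum[x + 1]. Rolling
temporaries keep the pre-update values, as in dav1d's C boxsum3:

    int a = row[1], b = row[2];
    for (int x = 2; x < w - 2; x++) {
        const int c = row[x + 1];
        row[x] = a + b + c;
        a = b;
        b = c;
    }
    // the matching sumsq row is updated the same way with int32_t values
*/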
function boxsum3_v_8bpc_lsx
    addi.d         a0,      a0,      (REST_UNIT_STRIDE<<2)
    addi.d         a1,      a1,      (REST_UNIT_STRIDE<<1)
    addi.w         a3,      a3,      -4
    addi.w         a2,      a2,      -4

.LBS3_V_H:
    sub.w          t3,      a2,      zero
    addi.d         t0,      a0,      4
    addi.d         t1,      a1,      2
    addi.d         t5,      a0,      8
    addi.d         t6,      a1,      4

    vld            vr0,      t1,      0   // a 0 1 2 3 4 5 6 7
    vld            vr1,      t1,      2   // b 1 2 3 4 5 6 7 8
    vld            vr2,      t1,      4   // c 2 3 4 5 6 7 8 9
    vld            vr3,      t0,      0   // a2 0 1 2 3
    vld            vr4,      t0,      4   // b2 1 2 3 4
    vld            vr5,      t0,      8   // c2 2 3 4 5
    vld            vr6,      t0,      16  //    3 4 5 6
    vld            vr7,      t0,      20  //    4 5 6 7
    vld            vr8,      t0,      24  //    5 6 7 8
    vadd.h         vr9,      vr0,     vr1
    vadd.w         vr10,     vr3,     vr4
    vadd.w         vr11,     vr6,     vr7
    vadd.h         vr9,      vr9,     vr2
    vadd.w         vr10,     vr10,    vr5
    vadd.w         vr11,     vr11,    vr8
    vpickve2gr.h   t7,       vr2,     6
    vpickve2gr.w   t8,       vr8,     2
    vst            vr9,      t6,      0
    vst            vr10,     t5,      0
    vst            vr11,     t5,      16

    addi.d         t1,       t1,      16
    addi.d         t0,       t0,      32
    addi.d         t5,       t5,      32
    addi.d         t6,       t6,      16
    addi.d         t3,       t3,      -8
    bge            zero,     t3,      .LBS3_V_H0

.LBS3_V_W8:
    vld            vr0,      t1,      0   // a 0 1 2 3 4 5 6 7
    vld            vr1,      t1,      2   // b 1 2 3 4 5 6 7 8
    vld            vr2,      t1,      4   // c 2 3 4 5 6 7 8 9
    vld            vr3,      t0,      0   // a2 0 1 2 3
    vld            vr4,      t0,      4   // b2 1 2 3 4
    vld            vr5,      t0,      8   // c2 2 3 4 5
    vld            vr6,      t0,      16  //    3 4 5 6
    vld            vr7,      t0,      20  //    4 5 6 7
    vld            vr8,      t0,      24  //    5 6 7 8
    vinsgr2vr.h    vr0,      t7,      0
    vinsgr2vr.w    vr3,      t8,      0
    vpickve2gr.h   t7,       vr2,     6
    vpickve2gr.w   t8,       vr8,     2
    vadd.h         vr9,      vr0,     vr1
    vadd.w         vr10,     vr3,     vr4
    vadd.w         vr11,     vr6,     vr7
    vadd.h         vr9,      vr9,     vr2
    vadd.w         vr10,     vr10,    vr5
    vadd.w         vr11,     vr11,    vr8
    vst            vr9,      t6,      0
    vst            vr10,     t5,      0
    vst            vr11,     t5,      16
    addi.d         t3,       t3,      -8
    addi.d         t1,       t1,      16
    addi.d         t0,       t0,      32
    addi.d         t5,       t5,      32
    addi.d         t6,       t6,      16
    blt            zero,     t3,      .LBS3_V_W8

.LBS3_V_H0:
    addi.d         a1,       a1,      REST_UNIT_STRIDE<<1
    addi.d         a0,       a0,      REST_UNIT_STRIDE<<2
    addi.w         a3,       a3,      -1
    bnez           a3,       .LBS3_V_H
endfunc

/*
void boxsum3_selfguided_filter(int32_t *sumsq, coef *sum,
                               const int w, const int h,
                               const unsigned s)
*/
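/*
Per element this is the SGR "x by x" step; as a C sketch (derived from the
vector code below, with n = 9 and sgr_one_by_x = 455 for the 3x3 box;
imax()/umin() as in dav1d):

    const int a = AA[i];                 // box sum of squares
    const int b = BB[i];                 // box sum
    const unsigned p = imax(a * 9 - b * b, 0);
    const unsigned z = (p * s + (1 << 19)) >> 20;
    const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
    AA[i] = (x * b * 455 + (1 << 11)) >> 12;
    BB[i] = 256 - x;
*/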
function boxsum3_sgf_h_8bpc_lsx
    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
    addi.d        a0,       a0,        12   // AA
    addi.d        a1,       a1,        REST_UNIT_STRIDE<<1
    addi.d        a1,       a1,        6    // BB
    la.local      t8,       dav1d_sgr_x_by_x
    li.w          t6,       455
    vreplgr2vr.w  vr20,     t6
    li.w          t6,       255
    vreplgr2vr.w  vr22,     t6
    vaddi.wu      vr21,     vr22,      1  // 256
    vreplgr2vr.w  vr6,      a4
    vldi          vr19,     0x809
    addi.w        a2,       a2,        2  // w + 2
    addi.w        a3,       a3,        2  // h + 2

.LBS3SGF_H_H:
    addi.w        t2,       a2,        0
    addi.d        t0,       a0,        -4
    addi.d        t1,       a1,        -2

.LBS3SGF_H_W:
    addi.w        t2,       t2,        -8
    vld           vr0,      t0,        0   // AA[i]
    vld           vr1,      t0,        16
    vld           vr2,      t1,        0   // BB[i]

    vmul.w        vr4,      vr0,       vr19 // a * n
    vmul.w        vr5,      vr1,       vr19 // a * n
    vsllwil.w.h   vr9,      vr2,       0
    vexth.w.h     vr10,     vr2
    vmsub.w       vr4,      vr9,       vr9    // p
    vmsub.w       vr5,      vr10,      vr10   // p
    vmaxi.w       vr4,      vr4,       0
    vmaxi.w       vr5,      vr5,       0    // p
    vmul.w        vr4,      vr4,       vr6  // p * s
    vmul.w        vr5,      vr5,       vr6  // p * s
    vsrlri.w      vr4,      vr4,       20
    vsrlri.w      vr5,      vr5,       20   // z
    vmin.w        vr4,      vr4,       vr22
    vmin.w        vr5,      vr5,       vr22

    vpickve2gr.w  t6,       vr4,       0
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        0
    vpickve2gr.w  t6,       vr4,       1
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        1
    vpickve2gr.w  t6,       vr4,       2
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        2
    vpickve2gr.w  t6,       vr4,       3
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        3

    vpickve2gr.w  t6,       vr5,       0
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        0
    vpickve2gr.w  t6,       vr5,       1
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        1
    vpickve2gr.w  t6,       vr5,       2
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        2
    vpickve2gr.w  t6,       vr5,       3
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        3     // x

    vmul.w        vr9,      vr7,       vr9   // x * BB[i]
    vmul.w        vr10,     vr8,       vr10
    vmul.w        vr9,      vr9,       vr20  // x * BB[i] * sgr_one_by_x
    vmul.w        vr10,     vr10,      vr20
    vsrlri.w      vr9,      vr9,       12
    vsrlri.w      vr10,     vr10,      12
    vsub.w        vr7,      vr21,      vr7
    vsub.w        vr8,      vr21,      vr8
    vpickev.h     vr8,      vr8,       vr7

    vst           vr9,      t0,        0
    vst           vr10,     t0,        16
    vst           vr8,      t1,        0
    addi.d        t0,       t0,        32
    addi.d        t1,       t1,        16
    blt           zero,     t2,        .LBS3SGF_H_W

    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
    addi.d        a1,       a1,        REST_UNIT_STRIDE<<1
    addi.w        a3,       a3,        -1
    bnez          a3,       .LBS3SGF_H_H
endfunc

/*
void boxsum3_selfguided_filter(coef *dst, pixel *src,
                               int32_t *sumsq, coef *sum,
                               const int w, const int h)
*/
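/*
A minimal C sketch of the per-pixel output step below (derived from the
vector code; A = inverted sumsq plane, B = inverted sum plane, and the
3x3 neighbourhood weights centre/cross by 4 and diagonals by 3):

    #define EIGHT_NEIGHBORS(P, i) \
        ((P[i] + P[i - 1] + P[i + 1] + \
          P[i - REST_UNIT_STRIDE] + P[i + REST_UNIT_STRIDE]) * 4 + \
         (P[i - 1 - REST_UNIT_STRIDE] + P[i - 1 + REST_UNIT_STRIDE] + \
          P[i + 1 - REST_UNIT_STRIDE] + P[i + 1 + REST_UNIT_STRIDE]) * 3)

    const int a = EIGHT_NEIGHBORS(B, i);
    const int b = EIGHT_NEIGHBORS(A, i);
    dst[i] = (b + a * src[i] + (1 << 8)) >> 9;
*/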
function boxsum3_sgf_v_8bpc_lsx
    addi.d        a1,        a1,      (3*REST_UNIT_STRIDE+3)   // src
    addi.d        a2,        a2,      REST_UNIT_STRIDE<<2
    addi.d        a2,        a2,      (REST_UNIT_STRIDE<<2)+12
    addi.d        a3,        a3,      REST_UNIT_STRIDE<<2
    addi.d        a3,        a3,      6
.LBS3SGF_V_H:
    // A int32_t *sumsq
    addi.d        t0,        a2,      -(REST_UNIT_STRIDE<<2)   // -stride
    addi.d        t1,        a2,      0    // sumsq
    addi.d        t2,        a2,      REST_UNIT_STRIDE<<2      // +stride
    addi.d        t6,        a1,      0
    addi.w        t7,        a4,      0
    addi.d        t8,        a0,      0
    // B coef *sum
    addi.d        t3,        a3,      -(REST_UNIT_STRIDE<<1)   // -stride
    addi.d        t4,        a3,      0
    addi.d        t5,        a3,      REST_UNIT_STRIDE<<1

.LBS3SGF_V_W:
    vld           vr0,       t0,      0   // P[i - REST_UNIT_STRIDE]
    vld           vr1,       t0,      16
    vld           vr2,       t1,      -4  // P[i-1]  -1 0 1 2
    vld           vr3,       t1,      12           // 3 4 5 6
    vld           vr4,       t2,      0   // P[i + REST_UNIT_STRIDE]
    vld           vr5,       t2,      16
    vld           vr6,       t1,      0   // p[i]     0 1 2 3
    vld           vr7,       t1,      16           // 4 5 6 7
    vld           vr8,       t1,      4   // p[i+1]   1 2 3 4
    vld           vr9,       t1,      20           // 5 6 7 8

    vld           vr10,      t0,      -4  // P[i - 1 - REST_UNIT_STRIDE]
    vld           vr11,      t0,      12
    vld           vr12,      t2,      -4  // P[i - 1 + REST_UNIT_STRIDE]
    vld           vr13,      t2,      12
    vld           vr14,      t0,      4   // P[i + 1 - REST_UNIT_STRIDE]
    vld           vr15,      t0,      20
    vld           vr16,      t2,      4   // P[i + 1 + REST_UNIT_STRIDE]
    vld           vr17,      t2,      20

    vadd.w        vr0,       vr2,     vr0
    vadd.w        vr4,       vr6,     vr4
    vadd.w        vr0,       vr0,     vr8
    vadd.w        vr20,      vr0,     vr4
    vslli.w       vr20,      vr20,    2      // 0 1 2 3
    vadd.w        vr0,       vr1,     vr3
    vadd.w        vr4,       vr5,     vr7
    vadd.w        vr0,       vr0,     vr9
    vadd.w        vr21,      vr0,     vr4
    vslli.w       vr21,      vr21,    2      // 4 5 6 7
    vadd.w        vr12,      vr10,    vr12
    vadd.w        vr16,      vr14,    vr16
    vadd.w        vr22,      vr12,    vr16
    vslli.w       vr23,      vr22,    1
    vadd.w        vr22,      vr23,    vr22
    vadd.w        vr11,      vr11,    vr13
    vadd.w        vr15,      vr15,    vr17
    vadd.w        vr0,       vr11,    vr15
    vslli.w       vr23,      vr0,     1
    vadd.w        vr23,      vr23,    vr0
    vadd.w        vr20,      vr20,    vr22   // b
    vadd.w        vr21,      vr21,    vr23

    // B coef *sum
    vld           vr0,       t3,      0   // P[i - REST_UNIT_STRIDE]
    vld           vr1,       t4,      -2  // p[i - 1]
    vld           vr2,       t4,      0   // p[i]
    vld           vr3,       t4,      2   // p[i + 1]
    vld           vr4,       t5,      0   // P[i + REST_UNIT_STRIDE]
    vld           vr5,       t3,      -2  // P[i - 1 - REST_UNIT_STRIDE]
    vld           vr6,       t5,      -2  // P[i - 1 + REST_UNIT_STRIDE]
    vld           vr7,       t3,      2   // P[i + 1 - REST_UNIT_STRIDE]
    vld           vr8,       t5,      2   // P[i + 1 + REST_UNIT_STRIDE]
    vaddwev.w.h   vr9,       vr0,     vr1
    vaddwod.w.h   vr10,      vr0,     vr1
    vaddwev.w.h   vr11,      vr2,     vr3
    vaddwod.w.h   vr12,      vr2,     vr3
    vadd.w        vr9,       vr11,    vr9
    vadd.w        vr10,      vr12,    vr10
    vilvl.w       vr11,      vr10,    vr9    // 0 1 2 3
    vilvh.w       vr12,      vr10,    vr9    // 4 5 6 7
    vsllwil.w.h   vr0,       vr4,     0
    vexth.w.h     vr1,       vr4
    vadd.w        vr0,       vr11,    vr0
    vadd.w        vr1,       vr12,    vr1
    vslli.w       vr0,       vr0,     2
    vslli.w       vr1,       vr1,     2
    vaddwev.w.h   vr9,       vr5,     vr6
    vaddwod.w.h   vr10,      vr5,     vr6
    vaddwev.w.h   vr11,      vr7,     vr8
    vaddwod.w.h   vr12,      vr7,     vr8
    vadd.w        vr9,       vr11,    vr9
    vadd.w        vr10,      vr12,    vr10
    vilvl.w       vr13,      vr10,    vr9
    vilvh.w       vr14,      vr10,    vr9
    vslli.w       vr15,      vr13,    1
    vslli.w       vr16,      vr14,    1
    vadd.w        vr15,      vr13,    vr15   // a
    vadd.w        vr16,      vr14,    vr16
    vadd.w        vr22,      vr0,     vr15
    vadd.w        vr23,      vr1,     vr16
    vld           vr0,       t6,      0      // src
    vsllwil.hu.bu vr0,       vr0,     0
    vsllwil.wu.hu vr1,       vr0,     0
    vexth.wu.hu   vr2,       vr0
    vmadd.w       vr20,      vr22,    vr1
    vmadd.w       vr21,      vr23,    vr2
    vssrlrni.h.w  vr21,      vr20,    9
    vst           vr21,      t8,      0
    addi.d        t8,        t8,      16

    addi.d        t0,        t0,      32
    addi.d        t1,        t1,      32
    addi.d        t2,        t2,      32
    addi.d        t3,        t3,      16
    addi.d        t4,        t4,      16
    addi.d        t5,        t5,      16
    addi.d        t6,        t6,      8
    addi.w        t7,        t7,      -8
    blt           zero,      t7,      .LBS3SGF_V_W

    addi.w        a5,        a5,      -1
    addi.d        a0,        a0,      384*2
    addi.d        a1,        a1,      REST_UNIT_STRIDE
    addi.d        a3,        a3,      REST_UNIT_STRIDE<<1
    addi.d        a2,        a2,      REST_UNIT_STRIDE<<2
    bnez          a5,        .LBS3SGF_V_H
endfunc

function boxsum3_sgf_v_8bpc_lasx
    addi.d        a1,        a1,      (3*REST_UNIT_STRIDE+3)   // src
    addi.d        a2,        a2,      REST_UNIT_STRIDE<<2
    addi.d        a2,        a2,      (REST_UNIT_STRIDE<<2)+12
    addi.d        a3,        a3,      REST_UNIT_STRIDE<<2
    addi.d        a3,        a3,      6
.LBS3SGF_V_H_LASX:
    // A int32_t *sumsq
    addi.d        t0,        a2,      -(REST_UNIT_STRIDE<<2)   // -stride
    addi.d        t1,        a2,      0    // sumsq
    addi.d        t2,        a2,      REST_UNIT_STRIDE<<2      // +stride
    addi.d        t6,        a1,      0
    addi.w        t7,        a4,      0
    addi.d        t8,        a0,      0
    // B coef *sum
    addi.d        t3,        a3,      -(REST_UNIT_STRIDE<<1)   // -stride
    addi.d        t4,        a3,      0
    addi.d        t5,        a3,      REST_UNIT_STRIDE<<1

.LBS3SGF_V_W_LASX:
    xvld           xr0,       t0,      0   // P[i - REST_UNIT_STRIDE]
    xvld           xr1,       t0,      32
    xvld           xr2,       t1,      -4  // P[i-1]  -1 0 1 2
    xvld           xr3,       t1,      28           // 3 4 5 6
    xvld           xr4,       t2,      0   // P[i + REST_UNIT_STRIDE]
    xvld           xr5,       t2,      32
    xvld           xr6,       t1,      0   // p[i]     0 1 2 3
    xvld           xr7,       t1,      32           // 4 5 6 7
    xvld           xr8,       t1,      4   // p[i+1]   1 2 3 4
    xvld           xr9,       t1,      36           // 5 6 7 8

    xvld           xr10,      t0,      -4  // P[i - 1 - REST_UNIT_STRIDE]
    xvld           xr11,      t0,      28
    xvld           xr12,      t2,      -4  // P[i - 1 + REST_UNIT_STRIDE]
    xvld           xr13,      t2,      28
    xvld           xr14,      t0,      4   // P[i + 1 - REST_UNIT_STRIDE]
    xvld           xr15,      t0,      36
    xvld           xr16,      t2,      4   // P[i + 1 + REST_UNIT_STRIDE]
    xvld           xr17,      t2,      36

    xvadd.w        xr0,       xr2,     xr0
    xvadd.w        xr4,       xr6,     xr4
    xvadd.w        xr0,       xr0,     xr8
    xvadd.w        xr20,      xr0,     xr4
    xvslli.w       xr20,      xr20,    2      // 0 1 2 3
    xvadd.w        xr0,       xr1,     xr3
    xvadd.w        xr4,       xr5,     xr7
    xvadd.w        xr0,       xr0,     xr9
    xvadd.w        xr21,      xr0,     xr4
    xvslli.w       xr21,      xr21,    2      // 4 5 6 7
    xvadd.w        xr12,      xr10,    xr12
    xvadd.w        xr16,      xr14,    xr16
    xvadd.w        xr22,      xr12,    xr16
    xvslli.w       xr23,      xr22,    1
    xvadd.w        xr22,      xr23,    xr22
    xvadd.w        xr11,      xr11,    xr13
    xvadd.w        xr15,      xr15,    xr17
    xvadd.w        xr0,       xr11,    xr15
    xvslli.w       xr23,      xr0,     1
    xvadd.w        xr23,      xr23,    xr0
    xvadd.w        xr20,      xr20,    xr22   // b
    xvadd.w        xr21,      xr21,    xr23

    // B coef *sum
    xvld           xr0,       t3,      0   // P[i - REST_UNIT_STRIDE]
    xvld           xr1,       t4,      -2  // p[i - 1]
    xvld           xr2,       t4,      0   // p[i]
    xvld           xr3,       t4,      2   // p[i + 1]
    xvld           xr4,       t5,      0   // P[i + REST_UNIT_STRIDE]
    xvld           xr5,       t3,      -2  // P[i - 1 - REST_UNIT_STRIDE]
    xvld           xr6,       t5,      -2  // P[i - 1 + REST_UNIT_STRIDE]
    xvld           xr7,       t3,      2   // P[i + 1 - REST_UNIT_STRIDE]
    xvld           xr8,       t5,      2   // P[i + 1 + REST_UNIT_STRIDE]

    xvaddwev.w.h   xr9,       xr0,     xr1
    xvaddwod.w.h   xr10,      xr0,     xr1
    xvaddwev.w.h   xr11,      xr2,     xr3
    xvaddwod.w.h   xr12,      xr2,     xr3
    xvadd.w        xr9,       xr11,    xr9   // 0 2 4 6 8 10 12 14
    xvadd.w        xr10,      xr12,    xr10  // 1 3 5 7 9 11 13 15
    xvilvl.w       xr11,      xr10,    xr9   // 0 1 2 3 8 9 10 11
    xvilvh.w       xr12,      xr10,    xr9   // 4 5 6 7 12 13 14 15
    xvsllwil.w.h   xr0,       xr4,     0     // 0 1 2 3 8 9 10 11
    xvexth.w.h     xr1,       xr4            // 4 5 6 7 12 13 14 15

    xvadd.w        xr0,       xr11,    xr0
    xvadd.w        xr1,       xr12,    xr1
    xvslli.w       xr0,       xr0,     2
    xvslli.w       xr1,       xr1,     2

    xvaddwev.w.h   xr9,       xr5,     xr6
    xvaddwod.w.h   xr10,      xr5,     xr6
    xvaddwev.w.h   xr11,      xr7,     xr8
    xvaddwod.w.h   xr12,      xr7,     xr8
    xvadd.w        xr9,       xr11,    xr9
    xvadd.w        xr10,      xr12,    xr10
    xvilvl.w       xr13,      xr10,    xr9   // 0 1 2 3 8 9 10 11
    xvilvh.w       xr14,      xr10,    xr9   // 4 5 6 7 12 13 14 15

    xvslli.w       xr15,      xr13,    1
    xvslli.w       xr16,      xr14,    1
    xvadd.w        xr15,      xr13,    xr15   // a
    xvadd.w        xr16,      xr14,    xr16
    xvadd.w        xr22,      xr0,     xr15   // A B
    xvadd.w        xr23,      xr1,     xr16   // C D

    vld            vr0,       t6,      0      // src
    vilvh.d        vr2,       vr0,     vr0
    vext2xv.wu.bu  xr1,       xr0
    vext2xv.wu.bu  xr2,       xr2
    xvor.v         xr15,      xr22,    xr22   // A B
    xvpermi.q      xr22,      xr23,    0b00000010  // A C
    xvpermi.q      xr23,      xr15,    0b00110001
    xvmadd.w       xr20,      xr22,    xr1
    xvmadd.w       xr21,      xr23,    xr2
    xvssrlrni.h.w  xr21,      xr20,    9
    xvpermi.d      xr22,      xr21,    0b11011000
    xvst           xr22,      t8,      0
    addi.d         t8,        t8,      32

    addi.d        t0,        t0,      64
    addi.d        t1,        t1,      64
    addi.d        t2,        t2,      64
    addi.d        t3,        t3,      32
    addi.d        t4,        t4,      32
    addi.d        t5,        t5,      32
    addi.d        t6,        t6,      16
    addi.w        t7,        t7,      -16
    blt           zero,      t7,      .LBS3SGF_V_W_LASX

    addi.w        a5,        a5,      -1
    addi.d        a0,        a0,      384*2
    addi.d        a1,        a1,      REST_UNIT_STRIDE
    addi.d        a3,        a3,      REST_UNIT_STRIDE<<1
    addi.d        a2,        a2,      REST_UNIT_STRIDE<<2
    bnez          a5,        .LBS3SGF_V_H_LASX
endfunc

#define FILTER_OUT_STRIDE (384)

/*
void sgr_3x3_finish_c(const pixel *p, const ptrdiff_t stride,
                      const int16_t *dst, const int w1,
                      const int w, const int h);
*/
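/*
A minimal C sketch of the blend this performs (w1 is the self-guided
weight; iclip_u8() assumed, 8 bpc constants as used below):

    for (int j = 0; j < h; j++) {
        for (int i = 0; i < w; i++) {
            const int u = p[i] << 4;
            const int v = (u << 7) + w1 * (dst[i] - u);
            p[i] = iclip_u8((v + (1 << 10)) >> 11);
        }
        p += stride;
        dst += FILTER_OUT_STRIDE;
    }
*/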
function sgr_3x3_finish_8bpc_lsx
    vreplgr2vr.w  vr3,     a3            // w1
    andi          t4,      a4,       0x7
    sub.w         t5,      a4,       t4

    beq           zero,    t5,       .LSGR3X3_REM

.LSGR3X3_H:
    addi.d        t0,      a0,       0
    addi.d        t1,      a2,       0
    addi.w        t2,      t5,       0
    andi          t4,      a4,       0x7
.LSGR3X3_W:
    vld           vr0,     t0,       0
    vld           vr1,     t1,       0
    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
    vsllwil.wu.hu vr4,     vr2,      0   // p
    vexth.wu.hu   vr5,     vr2           // p
    vslli.w       vr6,     vr4,      7
    vslli.w       vr7,     vr5,      7
    vsllwil.w.h   vr8,     vr1,      0   // dst
    vexth.w.h     vr9,     vr1           // dst
    vsub.w        vr8,     vr8,      vr4
    vsub.w        vr9,     vr9,      vr5
    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
    vssrarni.hu.w vr7,     vr6,      11
    vssrlni.bu.h  vr7,     vr7,      0
    vstelm.d      vr7,     t0,       0,    0
    addi.d        t0,      t0,       8
    addi.d        t1,      t1,       16
    addi.d        t2,      t2,       -8
    bne           zero,    t2,       .LSGR3X3_W

    beq           t4,      zero,     .LSGR3X3_NOREM

    vld           vr0,     t0,       0
    vld           vr1,     t1,       0
    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
    vsllwil.wu.hu vr4,     vr2,      0   // p
    vexth.wu.hu   vr5,     vr2           // p
    vslli.w       vr6,     vr4,      7
    vslli.w       vr7,     vr5,      7
    vsllwil.w.h   vr8,     vr1,      0   // dst
    vexth.w.h     vr9,     vr1           // dst
    vsub.w        vr8,     vr8,      vr4
    vsub.w        vr9,     vr9,      vr5
    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
    vssrarni.hu.w vr7,     vr6,      11
    vssrlni.bu.h  vr7,     vr7,      0

.LSGR3X3_ST:
    vstelm.b      vr7,     t0,       0,    0
    addi.d        t0,      t0,       1
    vbsrl.v       vr7,     vr7,      1
    addi.w        t4,      t4,       -1
    bnez          t4,      .LSGR3X3_ST

.LSGR3X3_NOREM:
    addi.w        a5,      a5,       -1
    add.d         a0,      a0,       a1
    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
    bnez          a5,      .LSGR3X3_H
    b             .LSGR3X3_END

.LSGR3X3_REM:
    andi          t4,      a4,       0x7
    addi.d        t0,      a0,       0
    vld           vr0,     t0,       0
    vld           vr1,     a2,       0
    vsllwil.hu.bu vr2,     vr0,      4   // u 8 h
    vsllwil.wu.hu vr4,     vr2,      0   // p
    vexth.wu.hu   vr5,     vr2           // p
    vslli.w       vr6,     vr4,      7
    vslli.w       vr7,     vr5,      7
    vsllwil.w.h   vr8,     vr1,      0   // dst
    vexth.w.h     vr9,     vr1           // dst
    vsub.w        vr8,     vr8,      vr4
    vsub.w        vr9,     vr9,      vr5
    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
    vssrarni.hu.w vr7,     vr6,      11
    vssrlni.bu.h  vr7,     vr7,      0

.LSGR3X3_REM_ST:
    vstelm.b      vr7,     t0,       0,    0
    addi.d        t0,      t0,       1
    vbsrl.v       vr7,     vr7,      1
    addi.w        t4,      t4,       -1
    bnez          t4,      .LSGR3X3_REM_ST
    addi.w        a5,      a5,       -1
    add.d         a0,      a0,       a1
    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
    bnez          a5,      .LSGR3X3_REM

.LSGR3X3_END:
endfunc

/*
void boxsum5_h(int32_t *sumsq, coef *sum,
               const pixel *const src,
               const int w, const int h)
*/
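/*
C sketch of the vertical 5-tap pass: each output row is the sum of five
consecutive source rows (and their squares); S = REST_UNIT_STRIDE:

    for (int y = 0; y < h - 4; y++)
        for (int x = 0; x < w; x++) {
            const pixel *s = src + y * S + x;
            const int a = s[0],     b = s[S],     c = s[2 * S],
                      d = s[3 * S], e = s[4 * S];
            sum  [(y + 1) * S + x] = a + b + c + d + e;
            sumsq[(y + 1) * S + x] = a * a + b * b + c * c + d * d + e * e;
        }
*/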
function boxsum5_h_8bpc_lsx
    addi.w        a4,      a4,        -4
    addi.d        a0,      a0,        REST_UNIT_STRIDE<<2
    addi.d        a1,      a1,        REST_UNIT_STRIDE<<1
    li.w          t6,      1
.LBOXSUM5_H_H:
    addi.w        t3,      a3,        0
    addi.d        t2,      a2,        0
    addi.d        t0,      a0,        0
    addi.d        t1,      a1,        0

.LBOXSUM5_H_W:
    vld           vr0,     t2,        0                   // a
    vld           vr1,     t2,        REST_UNIT_STRIDE    // b
    vld           vr2,     t2,        REST_UNIT_STRIDE<<1 // c
    vld           vr3,     t2,        REST_UNIT_STRIDE*3  // d
    vld           vr4,     t2,        REST_UNIT_STRIDE<<2 // e

    vilvl.b       vr5,     vr1,       vr0
    vilvh.b       vr6,     vr1,       vr0
    vilvl.b       vr7,     vr3,       vr2
    vilvh.b       vr8,     vr3,       vr2
    //sum_v
    vhaddw.hu.bu  vr9,     vr5,       vr5  // 0 1  2  3  4  5  6  7
    vhaddw.hu.bu  vr10,    vr6,       vr6  // 8 9 10 11 12 13 14 15  a+b
    vhaddw.hu.bu  vr11,    vr7,       vr7
    vhaddw.hu.bu  vr12,    vr8,       vr8
    vadd.h        vr9,     vr9,       vr11
    vadd.h        vr10,    vr10,      vr12  // a + b + c + d
    vsllwil.hu.bu vr11,    vr4,       0
    vexth.hu.bu   vr12,    vr4
    vadd.h        vr9,     vr9,       vr11
    vadd.h        vr10,    vr10,      vr12
    vst           vr9,     t1,        0
    vst           vr10,    t1,        16
    addi.d        t1,      t1,        32

    // sumsq
    vmulwev.h.bu  vr9,     vr5,       vr5  // a*a 0 1  2  3  4  5  6  7
    vmulwev.h.bu  vr10,    vr6,       vr6  // a*a 8 9 10 11 12 13 14 15
    vmulwod.h.bu  vr13,    vr5,       vr5  // b*b 0 1  2  3  4  5  6  7
    vmulwod.h.bu  vr14,    vr6,       vr6  // b*b 8 9 10 11 12 13 14 15
    vmulwev.h.bu  vr15,    vr7,       vr7  // c*c 0 1  2  3  4  5  6  7
    vmulwev.h.bu  vr16,    vr8,       vr8  // c*c 8 9 10 11 12 13 14 15
    vmulwod.h.bu  vr17,    vr7,       vr7  // d*d 0 1  2  3  4  5  6  7
    vmulwod.h.bu  vr18,    vr8,       vr8  // d*d 8 9 10 11 12 13 14 15
    vaddwev.w.hu  vr5,     vr9,       vr13  // 0 2 4 6
    vaddwod.w.hu  vr6,     vr9,       vr13  // 1 3 5 7
    vaddwev.w.hu  vr7,     vr10,      vr14  // 8 10 12 14
    vaddwod.w.hu  vr8,     vr10,      vr14  // 9 11 13 15   a + b
    vaddwev.w.hu  vr19,    vr15,      vr17  // 0 2 4 6
    vaddwod.w.hu  vr20,    vr15,      vr17  // 1 3 5 7
    vaddwev.w.hu  vr21,    vr16,      vr18  // 8 10 12 14
    vaddwod.w.hu  vr22,    vr16,      vr18  // 9 11 13 15   c + d
    vadd.w        vr5,     vr5,       vr19
    vadd.w        vr6,     vr6,       vr20
    vadd.w        vr7,     vr7,       vr21
    vadd.w        vr8,     vr8,       vr22
    vmaddwev.w.hu vr5,     vr11,      vr11
    vmaddwod.w.hu vr6,     vr11,      vr11
    vmaddwev.w.hu vr7,     vr12,      vr12
    vmaddwod.w.hu vr8,     vr12,      vr12
    vilvl.w       vr19,    vr6,       vr5
    vilvh.w       vr20,    vr6,       vr5
    vilvl.w       vr21,    vr8,       vr7
    vilvh.w       vr22,    vr8,       vr7

    vst           vr19,    t0,        0
    vst           vr20,    t0,        16
    vst           vr21,    t0,        32
    vst           vr22,    t0,        48
    addi.d        t0,      t0,        64
    addi.d        t2,      t2,        16
    addi.w        t3,      t3,        -16
    blt           zero,    t3,        .LBOXSUM5_H_W

    addi.d        a0,      a0,        REST_UNIT_STRIDE<<2
    addi.d        a1,      a1,        REST_UNIT_STRIDE<<1
    addi.d        a2,      a2,        REST_UNIT_STRIDE
    addi.d        a4,      a4,        -1
    bnez          a4,      .LBOXSUM5_H_H
endfunc

/*
void boxsum5_v(int32_t *sumsq, coef *sum,
               const int w, const int h)
*/
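/*
C sketch of the in-place horizontal 5-tap pass: every summed row is
rewritten as sum[x - 2] + sum[x - 1] + sum[x] + sum[x + 1] + sum[x + 2],
with rolling temporaries keeping the pre-update values (as in dav1d's C
boxsum5):

    int a = row[0], b = row[1], c = row[2], d = row[3];
    for (int x = 2; x < w - 2; x++) {
        const int e = row[x + 2];
        row[x] = a + b + c + d + e;
        a = b; b = c; c = d; d = e;
    }
    // the matching sumsq row is updated the same way with int32_t values
*/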
function boxsum5_v_8bpc_lsx
    addi.d         a0,      a0,      (REST_UNIT_STRIDE<<2)
    addi.d         a1,      a1,      (REST_UNIT_STRIDE<<1)
    addi.w         a3,      a3,      -4
    addi.w         a2,      a2,      -4

.LBOXSUM5_V_H:
    addi.w         t3,      a2,      0
    addi.d         t0,      a0,      0
    addi.d         t1,      a1,      0
    addi.d         t2,      a0,      8
    addi.d         t3,      a1,      4
    addi.d         t4,      a2,      0

    vld            vr0,     t1,      0   // a 0 1 2 3 4 5 6 7
    vld            vr1,     t1,      2   // b 1 2 3 4 5 6 7 8
    vld            vr2,     t1,      4   // c 2
    vld            vr3,     t1,      6   // d 3
    vld            vr4,     t1,      8   // e 4 5 6 7 8 9 10 11
    vadd.h         vr5,     vr0,     vr1
    vadd.h         vr6,     vr2,     vr3
    vpickve2gr.w   t5,      vr4,     2
    vadd.h         vr5,     vr5,     vr6
    vadd.h         vr5,     vr5,     vr4
    vst            vr5,     t3,      0

    vld            vr0,     t0,      0  // 0 1 2 3   a
    vld            vr1,     t0,      4  // 1 2 3 4   b
    vld            vr2,     t0,      8  // 2 3 4 5   c
    vld            vr3,     t0,      12 // 3 4 5 6   d
    vld            vr4,     t0,      16 // 4 5 6 7   e  a
    vld            vr5,     t0,      20 // 5 6 7 8      b
    vld            vr6,     t0,      24 // 6 7 8 9      c
    vld            vr7,     t0,      28 // 7 8 9 10     d
    vld            vr8,     t0,      32 // 8 9 10 11    e

    vadd.w         vr9,     vr0,     vr1
    vadd.w         vr10,    vr2,     vr3
    vadd.w         vr9,     vr9,     vr10
    vadd.w         vr9,     vr9,     vr4
    vadd.w         vr10,    vr4,     vr5
    vadd.w         vr11,    vr6,     vr7
    vadd.w         vr10,    vr10,    vr8
    vadd.w         vr10,    vr10,    vr11
    vst            vr9,     t2,      0
    vst            vr10,    t2,      16

    addi.d         t3,      t3,      16
    addi.d         t1,      t1,      16
    addi.d         t0,      t0,      32
    addi.d         t2,      t2,      32
    addi.w         t4,      t4,      -8
    bge            zero,    t4,      .LBOXSUM5_V_H1

.LBOXSUM5_V_W:
    vld            vr0,     t1,      0   // a 0 1 2 3 4 5 6 7
    vld            vr1,     t1,      2   // b 1 2 3 4 5 6 7 8
    vld            vr2,     t1,      4   // c 2
    vld            vr3,     t1,      6   // d 3
    vld            vr4,     t1,      8   // e 4 5 6 7 8 9 10 11
    vinsgr2vr.w    vr0,     t5,      0
    vpickve2gr.w   t5,      vr4,     2
    vextrins.h     vr1,     vr0,     0x01
    vadd.h         vr5,     vr0,     vr1
    vadd.h         vr6,     vr2,     vr3
    vadd.h         vr5,     vr5,     vr6
    vadd.h         vr5,     vr5,     vr4
    vst            vr5,     t3,      0

    vaddi.hu       vr0,     vr8,     0  //  8  9 10 11  a
    vld            vr1,     t0,      4  //  9 10 11 12  b
    vld            vr2,     t0,      8  // 10 11 12 13  c
    vld            vr3,     t0,      12 // 11 12 13 14  d
    vld            vr4,     t0,      16 // 12 13 14 15  e  a
    vld            vr5,     t0,      20 // 13 14 15 16     b
    vld            vr6,     t0,      24 // 14 15 16 17     c
    vld            vr7,     t0,      28 // 15 16 17 18     d
    vld            vr8,     t0,      32 // 16 17 18 19     e
    vextrins.w     vr1,     vr0,     0x01
    vadd.w         vr9,     vr0,     vr1
    vadd.w         vr10,    vr2,     vr3
    vadd.w         vr9,     vr9,     vr10
    vadd.w         vr9,     vr9,     vr4
    vadd.w         vr10,    vr4,     vr5
    vadd.w         vr11,    vr6,     vr7
    vadd.w         vr10,    vr10,    vr8
    vadd.w         vr10,    vr10,    vr11
    vst            vr9,     t2,      0
    vst            vr10,    t2,      16

    addi.d         t3,      t3,      16
    addi.d         t1,      t1,      16
    addi.d         t0,      t0,      32
    addi.d         t2,      t2,      32
    addi.w         t4,      t4,      -8
    blt            zero,    t4,      .LBOXSUM5_V_W

.LBOXSUM5_V_H1:
    addi.d         a1,       a1,      REST_UNIT_STRIDE<<1
    addi.d         a0,       a0,      REST_UNIT_STRIDE<<2
    addi.w         a3,       a3,      -1
    bnez           a3,       .LBOXSUM5_V_H
endfunc

/*
void selfguided_filter(int32_t *sumsq, coef *sum,
                       const int w, const int h,
                       const unsigned s)
*/
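/*
Same "x by x" step as the 3x3 variant above, but with the 5x5 box
constants (n = 25, sgr_one_by_x = 164) and processing every other row;
as a C sketch:

    const int a = AA[i], b = BB[i];
    const unsigned p = imax(a * 25 - b * b, 0);
    const unsigned z = (p * s + (1 << 19)) >> 20;
    const unsigned x = dav1d_sgr_x_by_x[umin(z, 255)];
    AA[i] = (x * b * 164 + (1 << 11)) >> 12;
    BB[i] = 256 - x;
*/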
function boxsum5_sgf_h_8bpc_lsx
    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
    addi.d        a0,       a0,        12   // AA
    addi.d        a1,       a1,        REST_UNIT_STRIDE<<1
    addi.d        a1,       a1,        6    // BB
    la.local      t8,       dav1d_sgr_x_by_x
    li.w          t6,       164
    vreplgr2vr.w  vr20,     t6
    li.w          t6,       255
    vreplgr2vr.w  vr22,     t6
    vaddi.wu      vr21,     vr22,      1  // 256
    vreplgr2vr.w  vr6,      a4
    vldi          vr19,     0x819
    addi.w        a2,       a2,        2  // w + 2
    addi.w        a3,       a3,        2  // h + 2

.LBS5SGF_H_H:
    addi.w        t2,       a2,        0
    addi.d        t0,       a0,        -4
    addi.d        t1,       a1,        -2

.LBS5SGF_H_W:
    vld           vr0,      t0,        0   // AA[i]
    vld           vr1,      t0,        16
    vld           vr2,      t1,        0   // BB[i]

    vmul.w        vr4,      vr0,       vr19 // a * n
    vmul.w        vr5,      vr1,       vr19 // a * n
    vsllwil.w.h   vr9,      vr2,       0
    vexth.w.h     vr10,     vr2
    vmsub.w       vr4,      vr9,       vr9    // p
    vmsub.w       vr5,      vr10,      vr10   // p
    vmaxi.w       vr4,      vr4,       0
    vmaxi.w       vr5,      vr5,       0    // p
    vmul.w        vr4,      vr4,       vr6  // p * s
    vmul.w        vr5,      vr5,       vr6  // p * s
    vsrlri.w      vr4,      vr4,       20
    vsrlri.w      vr5,      vr5,       20   // z
    vmin.w        vr4,      vr4,       vr22
    vmin.w        vr5,      vr5,       vr22

    // load table data
    vpickve2gr.w  t6,       vr4,       0
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        0
    vpickve2gr.w  t6,       vr4,       1
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        1
    vpickve2gr.w  t6,       vr4,       2
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        2
    vpickve2gr.w  t6,       vr4,       3
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr7,      t7,        3

    vpickve2gr.w  t6,       vr5,       0
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        0
    vpickve2gr.w  t6,       vr5,       1
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        1
    vpickve2gr.w  t6,       vr5,       2
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        2
    vpickve2gr.w  t6,       vr5,       3
    ldx.bu        t7,       t8,        t6
    vinsgr2vr.w   vr8,      t7,        3     // x

    vmul.w        vr9,      vr7,       vr9   // x * BB[i]
    vmul.w        vr10,     vr8,       vr10
    vmul.w        vr9,      vr9,       vr20  // x * BB[i] * sgr_one_by_x
    vmul.w        vr10,     vr10,      vr20
    vsrlri.w      vr9,      vr9,       12
    vsrlri.w      vr10,     vr10,      12
    vsub.w        vr7,      vr21,      vr7
    vsub.w        vr8,      vr21,      vr8
    vpickev.h     vr8,      vr8,       vr7
    vst           vr9,      t0,        0
    vst           vr10,     t0,        16
    vst           vr8,      t1,        0
    addi.d        t0,       t0,        32
    addi.d        t1,       t1,        16
    addi.w        t2,       t2,        -8
    blt           zero,     t2,        .LBS5SGF_H_W

    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
    addi.d        a0,       a0,        REST_UNIT_STRIDE<<2
    addi.d        a1,       a1,        REST_UNIT_STRIDE<<2
    addi.w        a3,       a3,        -2
    blt           zero,     a3,        .LBS5SGF_H_H
endfunc

/*
void selfguided_filter(coef *dst, pixel *src,
                       int32_t *sumsq, coef *sum,
                       const int w, const int h)
*/
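/*
C sketch of the 5x5 output step: rows at vertical distance 1 are weighted
6 and the diagonals 5, two output rows are produced per iteration, and the
final blend mirrors the 3x3 path (S = REST_UNIT_STRIDE; the even/odd split
and shifts follow dav1d's C selfguided_filter). For an even row:

    a = (B[i - S] + B[i + S]) * 6 +
        (B[i - 1 - S] + B[i + 1 - S] + B[i - 1 + S] + B[i + 1 + S]) * 5;
    b = the same sums over the A (sumsq) plane;
    dst[i] = (b + a * src[i] + (1 << 8)) >> 9;

and for the in-between (odd) row:

    a = B[i] * 6 + (B[i - 1] + B[i + 1]) * 5;
    b = the same over A;
    dst[i] = (b + a * src[i] + (1 << 7)) >> 8;
*/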
   1199 function boxsum5_sgf_v_8bpc_lsx
   1200    addi.d        a1,        a1,       3*REST_UNIT_STRIDE+3       // src
   1201    addi.d        a2,        a2,       (2*REST_UNIT_STRIDE+3)<<1  // A
   1202    addi.d        a2,        a2,       (2*REST_UNIT_STRIDE+3)<<1
   1203    addi.d        a3,        a3,       (2*REST_UNIT_STRIDE+3)<<1  // B
   1204    addi.w        a5,        a5,       -1
   1205    vldi          vr10,      0x806  // 6 (words)
   1206    vldi          vr11,      0x805  // 5 (words)
   1207    vldi          vr22,      0x406  // 6 (halfwords)
   1208 
   1209 .LBS5SGF_V_H:
   1210    addi.d        t0,        a0,       0
   1211    addi.d        t1,        a1,       0
   1212    addi.d        t2,        a2,       0
   1213    addi.d        t3,        a3,       0
   1214    addi.w        t4,        a4,       0
   1215 
   1216    addi.d        t5,        a0,       384*2
   1217    addi.d        t6,        a1,       REST_UNIT_STRIDE
   1218    addi.d        t7,        a2,       REST_UNIT_STRIDE<<2
   1219    addi.d        t8,        a3,       REST_UNIT_STRIDE<<1   // B
   1220 .LBS5SGF_V_W:
   1221    // a
   1222    vld           vr0,       t3,       -REST_UNIT_STRIDE*2
   1223    vld           vr1,       t3,       REST_UNIT_STRIDE*2
   1224    vld           vr2,       t3,       (-REST_UNIT_STRIDE-1)*2
   1225    vld           vr3,       t3,       (REST_UNIT_STRIDE-1)*2
   1226    vld           vr4,       t3,       (1-REST_UNIT_STRIDE)*2
   1227    vld           vr5,       t3,       (1+REST_UNIT_STRIDE)*2
   1228    vaddwev.w.h   vr6,       vr0,      vr1
   1229    vaddwod.w.h   vr7,       vr0,      vr1
   1230    vmul.w        vr6,       vr6,      vr10
   1231    vmul.w        vr7,       vr7,      vr10
   1232    vaddwev.w.h   vr8,       vr2,      vr3
   1233    vaddwod.w.h   vr9,       vr2,      vr3
   1234    vaddwev.w.h   vr12,      vr4,      vr5
   1235    vaddwod.w.h   vr13,      vr4,      vr5
   1236    vadd.w        vr8,       vr8,      vr12
   1237    vadd.w        vr9,       vr9,      vr13
   1238    vmadd.w       vr6,       vr8,      vr11
   1239    vmadd.w       vr7,       vr9,      vr11
   1240    vilvl.w       vr18,      vr7,      vr6
   1241    vilvh.w       vr19,      vr7,      vr6
   1242    // b
   1243    vld           vr0,       t2,       -REST_UNIT_STRIDE*4
   1244    vld           vr1,       t2,       -REST_UNIT_STRIDE*4+16
   1245    vld           vr2,       t2,       REST_UNIT_STRIDE*4
   1246    vld           vr3,       t2,       REST_UNIT_STRIDE*4+16
   1247    vld           vr4,       t2,       (-REST_UNIT_STRIDE-1)*4
   1248    vld           vr5,       t2,       (-REST_UNIT_STRIDE-1)*4+16
   1249    vld           vr8,       t2,       (REST_UNIT_STRIDE-1)*4
   1250    vld           vr9,       t2,       (REST_UNIT_STRIDE-1)*4+16
   1251    vld           vr12,      t2,       (1-REST_UNIT_STRIDE)*4
   1252    vld           vr13,      t2,       (1-REST_UNIT_STRIDE)*4+16
   1253    vld           vr14,      t2,       (1+REST_UNIT_STRIDE)*4
   1254    vld           vr15,      t2,       (1+REST_UNIT_STRIDE)*4+16
   1255    vadd.w        vr0,       vr0,      vr2  // 0 1 2 3
   1256    vadd.w        vr1,       vr1,      vr3  // 4 5 6 7
   1257    vmul.w        vr20,      vr0,      vr10
   1258    vmul.w        vr21,      vr1,      vr10
   1259    vadd.w        vr4,       vr4,      vr8  // 0 1 2 3
   1260    vadd.w        vr5,       vr5,      vr9  // 4 5 6 7
   1261    vadd.w        vr12,      vr12,     vr14
   1262    vadd.w        vr13,      vr13,     vr15
   1263    vadd.w        vr12,      vr12,     vr4
   1264    vadd.w        vr13,      vr13,     vr5
   1265    vmadd.w       vr20,      vr12,     vr11
   1266    vmadd.w       vr21,      vr13,     vr11
   1267    vld           vr2,       t1,       0
   1268    vsllwil.hu.bu vr2,       vr2,      0
   1269    vsllwil.wu.hu vr3,       vr2,      0
   1270    vexth.wu.hu   vr4,       vr2
   1271    vmadd.w       vr20,      vr18,     vr3
   1272    vmadd.w       vr21,      vr19,     vr4
   1273    vssrlrni.h.w  vr21,      vr20,     9
   1274    vst           vr21,      t0,       0
   1275 
   1276    addi.d        t1,        t1,       8
   1277    addi.d        t2,        t2,       32
   1278    addi.d        t3,        t3,       16
   1279 
   1280    // a
   1281    vld           vr0,       t8,       0
   1282    vld           vr1,       t8,       -2
   1283    vld           vr2,       t8,       2
   1284    vmulwev.w.h   vr3,       vr0,      vr22
   1285    vmulwod.w.h   vr4,       vr0,      vr22
   1286    vaddwev.w.h   vr5,       vr1,      vr2
   1287    vaddwod.w.h   vr6,       vr1,      vr2
   1288    vmadd.w       vr3,       vr5,      vr11
   1289    vmadd.w       vr4,       vr6,      vr11
   1290    vilvl.w       vr19,      vr4,      vr3
   1291    vilvh.w       vr20,      vr4,      vr3
   1292    // b
   1293    vld           vr0,       t7,       0
   1294    vld           vr1,       t7,       -4
   1295    vld           vr2,       t7,       4
   1296    vld           vr5,       t7,       16
   1297    vld           vr6,       t7,       12
   1298    vld           vr7,       t7,       20
   1299    vmul.w        vr8,       vr0,      vr10
   1300    vmul.w        vr9,       vr5,      vr10
   1301    vadd.w        vr12,      vr1,      vr2
   1302    vadd.w        vr13,      vr6,      vr7
   1303    vmadd.w       vr8,       vr12,     vr11
   1304    vmadd.w       vr9,       vr13,     vr11
   1305    vld           vr2,       t6,       0
   1306    vsllwil.hu.bu vr2,       vr2,      0
   1307    vsllwil.wu.hu vr3,       vr2,      0
   1308    vexth.wu.hu   vr4,       vr2
   1309    vmadd.w       vr8,       vr19,     vr3
   1310    vmadd.w       vr9,       vr20,     vr4
   1311    vssrlrni.h.w  vr9,       vr8,      8
   1312    vst           vr9,       t0,       384*2
   1313 
   1314    addi.d        t0,        t0,       16
   1315    addi.d        t8,        t8,       16
   1316    addi.d        t7,        t7,       32
   1317    addi.d        t6,        t6,       8
   1318    addi.w        t4,        t4,       -8
   1319    blt           zero,      t4,       .LBS5SGF_V_W
   1320 
   1321    addi.w        a5,        a5,       -2
   1322    addi.d        a0,        a0,       384*4                // dst
   1323    addi.d        a1,        a1,       REST_UNIT_STRIDE<<1  // src
   1324    addi.d        a2,        a2,       REST_UNIT_STRIDE<<2  // sumsq += 2 rows (int32)
   1325    addi.d        a2,        a2,       REST_UNIT_STRIDE<<2
   1326    addi.d        a3,        a3,       REST_UNIT_STRIDE<<2  // sum += 2 rows (int16)
   1327    blt           zero,      a5,       .LBS5SGF_V_H
   1328    bnez          a5,        .LBS5SGF_END   // a5 < 0: no single trailing row
   1329 .LBS5SGF_V_W1:
   1330    // a
   1331    vld           vr0,       a3,       -REST_UNIT_STRIDE*2
   1332    vld           vr1,       a3,       REST_UNIT_STRIDE*2
   1333    vld           vr2,       a3,       (-REST_UNIT_STRIDE-1)*2
   1334    vld           vr3,       a3,       (REST_UNIT_STRIDE-1)*2
   1335    vld           vr4,       a3,       (1-REST_UNIT_STRIDE)*2
   1336    vld           vr5,       a3,       (1+REST_UNIT_STRIDE)*2
   1337    vaddwev.w.h   vr6,       vr0,      vr1
   1338    vaddwod.w.h   vr7,       vr0,      vr1
   1339    vmul.w        vr6,       vr6,      vr10
   1340    vmul.w        vr7,       vr7,      vr10
   1341    vaddwev.w.h   vr8,       vr2,      vr3
   1342    vaddwod.w.h   vr9,       vr2,      vr3
   1343    vaddwev.w.h   vr12,      vr4,      vr5
   1344    vaddwod.w.h   vr13,      vr4,      vr5
   1345    vadd.w        vr8,       vr8,      vr12
   1346    vadd.w        vr9,       vr9,      vr13
   1347    vmadd.w       vr6,       vr8,      vr11
   1348    vmadd.w       vr7,       vr9,      vr11
   1349    vilvl.w       vr18,      vr7,      vr6
   1350    vilvh.w       vr19,      vr7,      vr6
   1351    // b
   1352    vld           vr0,       a2,       -REST_UNIT_STRIDE*4
   1353    vld           vr1,       a2,       -REST_UNIT_STRIDE*4+16
   1354    vld           vr2,       a2,       REST_UNIT_STRIDE*4
   1355    vld           vr3,       a2,       REST_UNIT_STRIDE*4+16
   1356    vld           vr4,       a2,       (-REST_UNIT_STRIDE-1)*4
   1357    vld           vr5,       a2,       (-REST_UNIT_STRIDE-1)*4+16
   1358    vld           vr8,       a2,       (REST_UNIT_STRIDE-1)*4
   1359    vld           vr9,       a2,       (REST_UNIT_STRIDE-1)*4+16
   1360    vld           vr12,      a2,       (1-REST_UNIT_STRIDE)*4
   1361    vld           vr13,      a2,       (1-REST_UNIT_STRIDE)*4+16
   1362    vld           vr14,      a2,       (1+REST_UNIT_STRIDE)*4
   1363    vld           vr15,      a2,       (1+REST_UNIT_STRIDE)*4+16
   1364    vadd.w        vr0,       vr0,      vr2  // 0 1 2 3
   1365    vadd.w        vr1,       vr1,      vr3  // 4 5 6 7
   1366    vmul.w        vr20,      vr0,      vr10
   1367    vmul.w        vr21,      vr1,      vr10
   1368    vadd.w        vr4,       vr4,      vr8  // 0 1 2 3
   1369    vadd.w        vr5,       vr5,      vr9  // 4 5 6 7
   1370    vadd.w        vr12,      vr12,     vr14
   1371    vadd.w        vr13,      vr13,     vr15
   1372    vadd.w        vr12,      vr12,     vr4
   1373    vadd.w        vr13,      vr13,     vr5
   1374    vmadd.w       vr20,      vr12,     vr11
   1375    vmadd.w       vr21,      vr13,     vr11
   1376    vld           vr2,       a1,       0
   1377    vsllwil.hu.bu vr2,       vr2,      0
   1378    vsllwil.wu.hu vr3,       vr2,      0
   1379    vexth.wu.hu   vr4,       vr2
   1380    vmadd.w       vr20,      vr18,     vr3
   1381    vmadd.w       vr21,      vr19,     vr4
   1382    vssrlrni.h.w  vr21,      vr20,     9
   1383    vst           vr21,      a0,       0
   1384    addi.d        a3,        a3,       16
   1385    addi.d        a2,        a2,       32
   1386    addi.d        a1,        a1,       8
   1387    addi.d        a0,        a0,       16
   1388    addi.w        a4,        a4,       -8
   1389    blt           zero,      a4,       .LBS5SGF_V_W1
   1390 .LBS5SGF_END:
   1391 endfunc
   1392 
   1393 /*
   1394 void dav1d_sgr_mix_finish_lsx(uint8_t *p, const ptrdiff_t stride,
   1395                              const int16_t *dst0, const int16_t *dst1,
   1396                              const int w0, const int w1,
   1397                              const int w, const int h);
   1398 */
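        /*
         * Scalar sketch of the loop below, following dav1d's C code
         * (iclip_u8 clamps to [0, 255]; the vector code fuses the rounding,
         * shift and clamp into vssrarni.hu.w and vssrlni.bu.h):
         *
         *   for (int y = 0; y < h; y++) {
         *       for (int x = 0; x < w; x++) {
         *           const int u = p[x] << 4;
         *           const int v = (u << 7) + w0 * (dst0[x] - u)
         *                                  + w1 * (dst1[x] - u);
         *           p[x] = iclip_u8((v + (1 << 10)) >> 11);
         *       }
         *       p += stride;
         *       dst0 += FILTER_OUT_STRIDE;
         *       dst1 += FILTER_OUT_STRIDE;
         *   }
         */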
   1399 function sgr_mix_finish_8bpc_lsx
   1400    vreplgr2vr.w  vr3,     a4            // w0
   1401    vreplgr2vr.w  vr13,    a5            // w1
   1402    andi          t4,      a6,       0x7
   1403    sub.w         t5,      a6,       t4
   1404 
   1405    beq           zero,    t5,      .LSGRMIX_REM
   1406 
   1407 .LSGRMIX_H:
   1408    addi.d        t0,      a0,       0
   1409    addi.d        t1,      a2,       0   // dst0
   1410    addi.d        t3,      a3,       0   // dst1
   1411    addi.w        t2,      t5,       0
   1412    andi          t4,      a6,       0x7
   1413 .LSGRMIX_W:
   1414    vld           vr0,     t0,       0
   1415    vld           vr1,     t1,       0
   1416    vld           vr10,    t3,       0
   1417    vsllwil.hu.bu vr2,     vr0,      4   // u = p << 4 (8 halfwords)
   1418    vsllwil.wu.hu vr4,     vr2,      0   // u 0 1 2 3
   1419    vexth.wu.hu   vr5,     vr2           // u 4 5 6 7
   1420    vslli.w       vr6,     vr4,      7
   1421    vslli.w       vr7,     vr5,      7
   1422    vsllwil.w.h   vr8,     vr1,      0   // dst0
   1423    vexth.w.h     vr9,     vr1           // dst0
   1424    vsub.w        vr8,     vr8,      vr4
   1425    vsub.w        vr9,     vr9,      vr5
   1426    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
   1427    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
   1428 
   1429    vsllwil.w.h   vr11,    vr10,     0    // dst1
   1430    vexth.w.h     vr12,    vr10           // dst1
   1431    vsub.w        vr11,    vr11,     vr4
   1432    vsub.w        vr12,    vr12,     vr5
   1433    vmadd.w       vr6,     vr11,     vr13
   1434    vmadd.w       vr7,     vr12,     vr13
   1435 
   1436    vssrarni.hu.w vr7,     vr6,      11
   1437    vssrlni.bu.h  vr7,     vr7,      0
   1438    vstelm.d      vr7,     t0,       0,    0
   1439    addi.d        t0,      t0,       8
   1440    addi.d        t1,      t1,       16
   1441    addi.d        t3,      t3,       16
   1442    addi.d        t2,      t2,       -8
   1443    bne           zero,    t2,       .LSGRMIX_W
   1444 
   1445    beq           t4,      zero,     .LSGRMIX_W8
   1446 
   1447    vld           vr0,     t0,       0
   1448    vld           vr1,     t1,       0
   1449    vld           vr10,    t3,       0
   1450    vsllwil.hu.bu vr2,     vr0,      4   // u = p << 4 (8 halfwords)
   1451    vsllwil.wu.hu vr4,     vr2,      0   // u 0 1 2 3
   1452    vexth.wu.hu   vr5,     vr2           // u 4 5 6 7
   1453    vslli.w       vr6,     vr4,      7
   1454    vslli.w       vr7,     vr5,      7
   1455    vsllwil.w.h   vr8,     vr1,      0   // dst0
   1456    vexth.w.h     vr9,     vr1           // dst0
   1457    vsub.w        vr8,     vr8,      vr4
   1458    vsub.w        vr9,     vr9,      vr5
   1459    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
   1460    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
   1461 
   1462    vsllwil.w.h   vr11,    vr10,     0    // dst1
   1463    vexth.w.h     vr12,    vr10           // dst1
   1464    vsub.w        vr11,    vr11,     vr4
   1465    vsub.w        vr12,    vr12,     vr5
   1466    vmadd.w       vr6,     vr11,     vr13
   1467    vmadd.w       vr7,     vr12,     vr13
   1468 
   1469    vssrarni.hu.w vr7,     vr6,      11
   1470    vssrlni.bu.h  vr7,     vr7,      0
   1471 
   1472 .LSGRMIX_ST:
   1473    vstelm.b      vr7,     t0,       0,    0
   1474    addi.d        t0,      t0,       1
   1475    vbsrl.v       vr7,     vr7,      1
   1476    addi.w        t4,      t4,       -1
   1477    bnez          t4,      .LSGRMIX_ST
   1478 
   1479 .LSGRMIX_W8:
   1480    addi.w        a7,      a7,       -1
   1481    add.d         a0,      a0,       a1
   1482    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
   1483    addi.d        a3,      a3,       (FILTER_OUT_STRIDE<<1)
   1484    bnez          a7,      .LSGRMIX_H
   1485    b             .LSGR_MIX_END
   1486 
   1487 .LSGRMIX_REM:
   1488    andi          t4,      a6,       0x7
   1489    vld           vr0,     a0,       0
   1490    vld           vr1,     a2,       0
   1491    vld           vr10,    a3,       0
   1492    vsllwil.hu.bu vr2,     vr0,      4   // u = p << 4 (8 halfwords)
   1493    vsllwil.wu.hu vr4,     vr2,      0   // u 0 1 2 3
   1494    vexth.wu.hu   vr5,     vr2           // u 4 5 6 7
   1495    vslli.w       vr6,     vr4,      7
   1496    vslli.w       vr7,     vr5,      7
   1497    vsllwil.w.h   vr8,     vr1,      0   // dst0
   1498    vexth.w.h     vr9,     vr1           // dst0
   1499    vsub.w        vr8,     vr8,      vr4
   1500    vsub.w        vr9,     vr9,      vr5
   1501    vmadd.w       vr6,     vr8,      vr3  // v 0 - 3
   1502    vmadd.w       vr7,     vr9,      vr3  // v 4 - 7
   1503 
   1504    vsllwil.w.h   vr11,    vr10,     0    // dst1
   1505    vexth.w.h     vr12,    vr10           // dst1
   1506    vsub.w        vr11,    vr11,     vr4
   1507    vsub.w        vr12,    vr12,     vr5
   1508    vmadd.w       vr6,     vr11,     vr13
   1509    vmadd.w       vr7,     vr12,     vr13
   1510 
   1511    vssrarni.hu.w vr7,     vr6,      11
   1512    vssrlni.bu.h  vr7,     vr7,      0
   1513    addi.d        t0,      a0,       0
   1514 .LSGRMIX_REM_ST:
   1515    vstelm.b      vr7,     t0,       0,    0
   1516    addi.d        t0,      t0,       1
   1517    vbsrl.v       vr7,     vr7,      1
   1518    addi.w        t4,      t4,       -1
   1519    bnez          t4,      .LSGRMIX_REM_ST
   1520 
   1521    addi.w        a7,      a7,       -1
   1522    add.d         a0,      a0,       a1
   1523    addi.d        a2,      a2,       (FILTER_OUT_STRIDE<<1)
   1524    addi.d        a3,      a3,       (FILTER_OUT_STRIDE<<1)
   1525    bnez          a7,      .LSGRMIX_REM
   1526 
   1527 .LSGR_MIX_END:
   1528 endfunc
   1529 
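        // LASX variant of MADD_HU_BU: widen the unsigned bytes of \in0 to
        // halfwords (low half via xvsllwil, high half via xvexth) and
        // multiply-accumulate them with the replicated tap \in1 into
        // \out0/\out1.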
   1530 .macro MADD_HU_BU_LASX in0, in1, out0, out1
   1531    xvsllwil.hu.bu xr12,     \in0,     0
   1532    xvexth.hu.bu   xr13,     \in0
   1533    xvmadd.h       \out0,    xr12,     \in1
   1534    xvmadd.h       \out1,    xr13,     \in1
   1535 .endm
   1536 
   1537 const wiener_shuf_lasx
   1538 .byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
   1539 .byte 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18
   1540 endconst
   1541 
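        /*
         * Arguments as used below (mirroring the LSX version): a0 = hor_ptr
         * (int32_t *), a1 = tmp_ptr (uint8_t *), a2 = filterh (int16_t[8]),
         * a3 = w, a4 = h.  Per output element the loop computes, in scalar
         * terms:
         *
         *   int sum = (tmp_ptr[i + 3] << 7) + (1 << 14);
         *   for (int k = 0; k < 7; k++)
         *       sum += filterh[k] * tmp_ptr[i + k];
         *   hor_ptr[i] = iclip((sum + 4) >> 3, 0, (1 << 13) - 1);
         */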
   1542 function wiener_filter_h_8bpc_lasx
   1543    addi.d         sp,       sp,       -40
   1544    fst.d          f24,      sp,       0
   1545    fst.d          f25,      sp,       8
   1546    fst.d          f26,      sp,       16
   1547    fst.d          f27,      sp,       24
   1548    fst.d          f28,      sp,       32
   1549    li.w           t7,       1<<14          // clip_limit
   1550 
   1551    la.local       t1,       wiener_shuf_lasx
   1552    xvld           xr4,      t1,       0
   1553    vld            vr27,     a2,       0    // filter[0][k]
   1554    xvpermi.q      xr14,     xr27,     0b00000000
   1555    xvrepl128vei.h xr21,     xr14,     0
   1556    xvrepl128vei.h xr22,     xr14,     1
   1557    xvrepl128vei.h xr23,     xr14,     2
   1558    xvrepl128vei.h xr24,     xr14,     3
   1559    xvrepl128vei.h xr25,     xr14,     4
   1560    xvrepl128vei.h xr26,     xr14,     5
   1561    xvrepl128vei.h xr27,     xr14,     6
   1562    xvreplgr2vr.w  xr0,      t7
   1563 
   1564 .WIENER_FILTER_H_H_LASX:
   1565    addi.w         a4,       a4,       -1    // h
   1566    addi.w         t0,       a3,       0     // w
   1567    addi.d         t1,       a1,       0     // tmp_ptr
   1568    addi.d         t2,       a0,       0     // hor_ptr
   1569 
   1570 .WIENER_FILTER_H_W_LASX:
   1571    addi.w         t0,       t0,       -32
   1572    xvld           xr5,      t1,       0
   1573    xvld           xr13,     t1,       16
   1574 
   1575    xvsubi.bu      xr14,     xr4,      2
   1576    xvsubi.bu      xr15,     xr4,      1
   1577    xvshuf.b       xr6,      xr13,     xr5,     xr14  // 1 ... 8, 9 ... 16
   1578    xvshuf.b       xr7,      xr13,     xr5,     xr15  // 2 ... 9, 10 ... 17
   1579    xvshuf.b       xr8,      xr13,     xr5,     xr4   // 3 ... 10, 11 ... 18
   1580    xvaddi.bu      xr14,     xr4,      1
   1581    xvaddi.bu      xr15,     xr4,      2
   1582    xvshuf.b       xr9,      xr13,     xr5,     xr14  // 4 ... 11, 12 ... 19
   1583    xvshuf.b       xr10,     xr13,     xr5,     xr15  // 5 ... 12, 13 ... 20
   1584    xvaddi.bu      xr14,     xr4,      3
   1585    xvshuf.b       xr11,     xr13,     xr5,     xr14  // 6 ... 13, 14 ... 21
   1586 
   1587    xvsllwil.hu.bu xr15,     xr8,      0    //  3  4  5  6  7  8  9 10
   1588    xvexth.hu.bu   xr16,     xr8            // 11 12 13 14 15 16 17 18
   1589    xvsllwil.wu.hu xr17,     xr15,     7    //  3  4  5  6
   1590    xvexth.wu.hu   xr18,     xr15           //  7  8  9 10
   1591    xvsllwil.wu.hu xr19,     xr16,     7    // 11 12 13 14
   1592    xvexth.wu.hu   xr20,     xr16           // 15 16 17 18
   1593    xvslli.w       xr18,     xr18,     7
   1594    xvslli.w       xr20,     xr20,     7
   1595    xvxor.v        xr15,     xr15,     xr15
   1596    xvxor.v        xr14,     xr14,     xr14
   1597 
   1598    MADD_HU_BU_LASX xr5,  xr21, xr14, xr15
   1599    MADD_HU_BU_LASX xr6,  xr22, xr14, xr15
   1600    MADD_HU_BU_LASX xr7,  xr23, xr14, xr15
   1601    MADD_HU_BU_LASX xr8,  xr24, xr14, xr15
   1602    MADD_HU_BU_LASX xr9,  xr25, xr14, xr15
   1603    MADD_HU_BU_LASX xr10, xr26, xr14, xr15
   1604    MADD_HU_BU_LASX xr11, xr27, xr14, xr15
   1605 
   1606    xvsllwil.w.h   xr5,      xr14,     0   //  0  1  2  3
   1607    xvexth.w.h     xr6,      xr14          //  4  5  6  7
   1608    xvsllwil.w.h   xr7,      xr15,     0   //  8  9 10 11
   1609    xvexth.w.h     xr8,      xr15          // 12 13 14 15
   1610    xvadd.w        xr17,     xr17,     xr5
   1611    xvadd.w        xr18,     xr18,     xr6
   1612    xvadd.w        xr19,     xr19,     xr7
   1613    xvadd.w        xr20,     xr20,     xr8
   1614    xvadd.w        xr17,     xr17,     xr0
   1615    xvadd.w        xr18,     xr18,     xr0
   1616    xvadd.w        xr19,     xr19,     xr0
   1617    xvadd.w        xr20,     xr20,     xr0
   1618 
   1619    xvsrli.w       xr1,      xr0,      1
   1620    xvsubi.wu      xr1,      xr1,      1
   1621    xvxor.v        xr3,      xr3,      xr3
   1622    xvsrari.w      xr17,     xr17,     3
   1623    xvsrari.w      xr18,     xr18,     3
   1624    xvsrari.w      xr19,     xr19,     3
   1625    xvsrari.w      xr20,     xr20,     3
   1626    xvclip.w       xr17,     xr17,     xr3,     xr1
   1627    xvclip.w       xr18,     xr18,     xr3,     xr1
   1628    xvclip.w       xr19,     xr19,     xr3,     xr1
   1629    xvclip.w       xr20,     xr20,     xr3,     xr1
   1630 
   1631    xvor.v         xr5,      xr17,     xr17
   1632    xvor.v         xr6,      xr19,     xr19
   1633    xvpermi.q      xr17,     xr18,     0b00000010
   1634    xvpermi.q      xr19,     xr20,     0b00000010
   1635 
   1636    xvst           xr17,     t2,       0
   1637    xvst           xr19,     t2,       32
   1638    xvpermi.q      xr18,     xr5,      0b00110001
   1639    xvpermi.q      xr20,     xr6,      0b00110001
   1640    xvst           xr18,     t2,       64
   1641    xvst           xr20,     t2,       96
   1642    addi.d         t1,       t1,       32
   1643    addi.d         t2,       t2,       128
   1644    blt            zero,     t0,       .WIENER_FILTER_H_W_LASX
   1645 
   1646    addi.d         a1,       a1,       REST_UNIT_STRIDE
   1647    addi.d         a0,       a0,       (REST_UNIT_STRIDE << 2)
   1648    bnez           a4,       .WIENER_FILTER_H_H_LASX
   1649 
   1650    fld.d          f24,      sp,       0
   1651    fld.d          f25,      sp,       8
   1652    fld.d          f26,      sp,       16
   1653    fld.d          f27,      sp,       24
   1654    fld.d          f28,      sp,       32
   1655    addi.d         sp,       sp,       40
   1656 endfunc
   1657 
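        // One vertical filter tap: \in1 + (\in0 << 2) addresses the int32
        // hor buffer; 16 values are loaded and multiply-accumulated with
        // the replicated tap \in2 into the xr14/xr16 accumulators.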
   1658 .macro APPLY_FILTER_LASX in0, in1, in2
   1659    alsl.d         t7,       \in0,     \in1,    2
   1660    xvld           xr10,     t7,       0
   1661    xvld           xr12,     t7,       32
   1662    xvmadd.w       xr14,     xr10,     \in2
   1663    xvmadd.w       xr16,     xr12,     \in2
   1664 .endm
   1665 
   1666 .macro wiener_filter_v_8bpc_core_lasx
   1667    xvreplgr2vr.w  xr14,     t6
   1668    xvreplgr2vr.w  xr16,     t6
   1669 
   1670    addi.w         t7,       t2,       0      // j
   1671    mul.w          t7,       t7,       t8     // j * REST_UNIT_STRIDE
   1672    add.w          t7,       t7,       t4     // j * REST_UNIT_STRIDE + i; tap rows added below
   1673 
   1674    APPLY_FILTER_LASX  t7, a2, xr2
   1675    APPLY_FILTER_LASX  t8, t7, xr3
   1676    APPLY_FILTER_LASX  t8, t7, xr4
   1677    APPLY_FILTER_LASX  t8, t7, xr5
   1678    APPLY_FILTER_LASX  t8, t7, xr6
   1679    APPLY_FILTER_LASX  t8, t7, xr7
   1680    APPLY_FILTER_LASX  t8, t7, xr8
   1681    xvssrarni.hu.w xr16,     xr14,      11
   1682    xvpermi.d      xr17,     xr16,      0b11011000
   1683    xvssrlni.bu.h  xr17,     xr17,      0
   1684    xvpermi.d      xr17,     xr17,      0b00001000
   1685 .endm
   1686 
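        /*
         * Arguments as used below: a0 = p (uint8_t *), a1 = stride,
         * a2 = hor (const int32_t *, rows of REST_UNIT_STRIDE words),
         * a3 = filterv (int16_t[8]), a4 = w, a5 = h.  Per output pixel the
         * core macro above computes, in scalar terms:
         *
         *   int sum = -(1 << 18);
         *   for (int k = 0; k < 7; k++)
         *       sum += filterv[k] * hor[(j + k) * REST_UNIT_STRIDE + i];
         *   p[j * stride + i] = iclip_u8((sum + (1 << 10)) >> 11);
         *
         * with the rounding, shift and clamp fused into xvssrarni.hu.w and
         * xvssrlni.bu.h.
         */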
   1687 function wiener_filter_v_8bpc_lasx
   1688    li.w          t6,       -(1 << 18)
   1689 
   1690    li.w          t8,       REST_UNIT_STRIDE
   1691    ld.h          t0,       a3,       0
   1692    ld.h          t1,       a3,       2
   1693    xvreplgr2vr.w xr2,      t0
   1694    xvreplgr2vr.w xr3,      t1
   1695    ld.h          t0,       a3,       4
   1696    ld.h          t1,       a3,       6
   1697    xvreplgr2vr.w xr4,      t0
   1698    xvreplgr2vr.w xr5,      t1
   1699    ld.h          t0,       a3,       8
   1700    ld.h          t1,       a3,       10
   1701    xvreplgr2vr.w xr6,      t0
   1702    xvreplgr2vr.w xr7,      t1
   1703    ld.h          t0,       a3,       12
   1704    xvreplgr2vr.w xr8,      t0
   1705 
   1706    andi          t1,       a4,       0xf
   1707    sub.w         t0,       a4,       t1    // w-w%16
   1708    or            t2,       zero,     zero  // j
   1709    or            t4,       zero,     zero
   1710    beqz          t0,       .WIENER_FILTER_V_W_LT16_LASX
   1711 
   1712 .WIENER_FILTER_V_H_LASX:
   1713    andi          t1,       a4,       0xf
   1714    add.d         t3,       zero,     a0     // p
   1715    or            t4,       zero,     zero   // i
   1716 
   1717 .WIENER_FILTER_V_W_LASX:
   1718 
   1719    wiener_filter_v_8bpc_core_lasx
   1720 
   1721    mul.w         t5,       t2,       a1   // j * stride
   1722    add.w         t5,       t5,       t4   // j * stride + i
   1723    add.d         t3,       a0,       t5
   1724    addi.w        t4,       t4,       16
   1725    vst           vr17,     t3,       0
   1726    bne           t0,       t4,       .WIENER_FILTER_V_W_LASX
   1727 
   1728    beqz          t1,       .WIENER_FILTER_V_W_EQ16_LASX
   1729 
   1730    wiener_filter_v_8bpc_core_lsx
   1731 
   1732    addi.d        t3,       t3,       16
   1733    andi          t1,       a4,       0xf
   1734 
   1735 .WIENER_FILTER_V_ST_REM_LASX:
   1736    vstelm.b      vr17,     t3,       0,    0
   1737    vbsrl.v       vr17,     vr17,     1
   1738    addi.d        t3,       t3,       1
   1739    addi.w        t1,       t1,       -1
   1740    bnez          t1,       .WIENER_FILTER_V_ST_REM_LASX
   1741 .WIENER_FILTER_V_W_EQ16_LASX:
   1742    addi.w        t2,       t2,       1
   1743    blt           t2,       a5,       .WIENER_FILTER_V_H_LASX
   1744    b             .WIENER_FILTER_V_LASX_END
   1745 
   1746 .WIENER_FILTER_V_W_LT16_LASX:
   1747    andi          t1,       a4,       0xf
   1748    add.d         t3,       zero,     a0
   1749 
   1750    wiener_filter_v_8bpc_core_lsx
   1751 
   1752    mul.w         t5,       t2,       a1   // j * stride
   1753    add.d         t3,       a0,       t5
   1754 
   1755 .WIENER_FILTER_V_ST_REM_1_LASX:
   1756    vstelm.b      vr17,     t3,       0,    0
   1757    vbsrl.v       vr17,     vr17,     1
   1758    addi.d        t3,       t3,       1
   1759    addi.w        t1,       t1,       -1
   1760    bnez          t1,       .WIENER_FILTER_V_ST_REM_1_LASX
   1761 
   1762    addi.w        t2,       t2,       1
   1763    blt           t2,       a5,       .WIENER_FILTER_V_W_LT16_LASX
   1764 
   1765 .WIENER_FILTER_V_LASX_END:
   1766 endfunc
   1767 
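        /*
         * Arguments as used below: a0 = sumsq/AA (int32_t *), a1 = sum/BB
         * (int16_t *), a2 = w, a3 = h, a4 = s (the SGR strength).  The
         * per-pixel math matches the scalar sketch after the LSX lookup
         * loop above, with the 3x3 constants n = 9 (xr19) and
         * sgr_one_by_x = 455 (xr20).
         */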
   1768 function boxsum3_sgf_h_8bpc_lasx
   1769    addi.d         a0,       a0,        (REST_UNIT_STRIDE<<2)+12  // AA
   1771    addi.d         a1,       a1,        (REST_UNIT_STRIDE<<1)+6   // BB
   1773    la.local       t8,       dav1d_sgr_x_by_x
   1774    li.w           t6,       455
   1775    xvreplgr2vr.w  xr20,     t6     // sgr_one_by_x
   1776    li.w           t6,       255
   1777    xvreplgr2vr.w  xr22,     t6     // 255
   1778    xvaddi.wu      xr21,     xr22,      1  // 256
   1779    xvreplgr2vr.w  xr6,      a4     // s
   1780    xvldi          xr19,     0x809  // n = 9
   1781    addi.w         a2,       a2,        2  // w + 2
   1782    addi.w         a3,       a3,        2  // h + 2
   1783 
   1784 .LBS3SGF_H_H_LASX:
   1785    addi.w         t2,       a2,        0
   1786    addi.d         t0,       a0,        -4
   1787    addi.d         t1,       a1,        -2
   1788 
   1789 .LBS3SGF_H_W_LASX:
   1790    addi.w         t2,       t2,        -16
   1791    xvld           xr0,      t0,        0   // AA[i]
   1792    xvld           xr1,      t0,        32
   1793    xvld           xr2,      t1,        0   // BB[i]
   1794 
   1795    xvmul.w        xr4,      xr0,       xr19 // a * n
   1796    xvmul.w        xr5,      xr1,       xr19
   1797    vext2xv.w.h    xr9,      xr2
   1798    xvpermi.q      xr10,     xr2,       0b00000001
   1799    vext2xv.w.h    xr10,     xr10
   1800    xvmsub.w       xr4,      xr9,       xr9  // p
   1801    xvmsub.w       xr5,      xr10,      xr10
   1802    xvmaxi.w       xr4,      xr4,       0
   1803    xvmaxi.w       xr5,      xr5,       0
   1804    xvmul.w        xr4,      xr4,       xr6  // p * s
   1805    xvmul.w        xr5,      xr5,       xr6
   1806    xvsrlri.w      xr4,      xr4,       20
   1807    xvsrlri.w      xr5,      xr5,       20
   1808    xvmin.w        xr4,      xr4,       xr22
   1809    xvmin.w        xr5,      xr5,       xr22
   1810 
   1811    vpickve2gr.w   t6,       vr4,       0
   1812    ldx.bu         t7,       t8,        t6
   1813    vinsgr2vr.w    vr7,      t7,        0
   1814    vpickve2gr.w   t6,       vr4,       1
   1815    ldx.bu         t7,       t8,        t6
   1816    vinsgr2vr.w    vr7,      t7,        1
   1817    vpickve2gr.w   t6,       vr4,       2
   1818    ldx.bu         t7,       t8,        t6
   1819    vinsgr2vr.w    vr7,      t7,        2
   1820    vpickve2gr.w   t6,       vr4,       3
   1821    ldx.bu         t7,       t8,        t6
   1822    vinsgr2vr.w    vr7,      t7,        3
   1823 
   1824    xvpickve2gr.w  t6,       xr4,       4
   1825    ldx.bu         t7,       t8,        t6
   1826    xvinsgr2vr.w   xr7,      t7,        4
   1827    xvpickve2gr.w  t6,       xr4,       5
   1828    ldx.bu         t7,       t8,        t6
   1829    xvinsgr2vr.w   xr7,      t7,        5
   1830    xvpickve2gr.w  t6,       xr4,       6
   1831    ldx.bu         t7,       t8,        t6
   1832    xvinsgr2vr.w   xr7,      t7,        6
   1833    xvpickve2gr.w  t6,       xr4,       7
   1834    ldx.bu         t7,       t8,        t6
   1835    xvinsgr2vr.w   xr7,      t7,        7     // x
   1836 
   1837    vpickve2gr.w   t6,       vr5,       0
   1838    ldx.bu         t7,       t8,        t6
   1839    vinsgr2vr.w    vr8,      t7,        0
   1840    vpickve2gr.w   t6,       vr5,       1
   1841    ldx.bu         t7,       t8,        t6
   1842    vinsgr2vr.w    vr8,      t7,        1
   1843    vpickve2gr.w   t6,       vr5,       2
   1844    ldx.bu         t7,       t8,        t6
   1845    vinsgr2vr.w    vr8,      t7,        2
   1846    vpickve2gr.w   t6,       vr5,       3
   1847    ldx.bu         t7,       t8,        t6
   1848    vinsgr2vr.w    vr8,      t7,        3
   1849 
   1850    xvpickve2gr.w  t6,       xr5,       4
   1851    ldx.bu         t7,       t8,        t6
   1852    xvinsgr2vr.w   xr8,      t7,        4
   1853    xvpickve2gr.w  t6,       xr5,       5
   1854    ldx.bu         t7,       t8,        t6
   1855    xvinsgr2vr.w   xr8,      t7,        5
   1856    xvpickve2gr.w  t6,       xr5,       6
   1857    ldx.bu         t7,       t8,        t6
   1858    xvinsgr2vr.w   xr8,      t7,        6
   1859    xvpickve2gr.w  t6,       xr5,       7
   1860    ldx.bu         t7,       t8,        t6
   1861    xvinsgr2vr.w   xr8,      t7,        7     // x
   1862 
   1863    xvmul.w        xr9,      xr7,       xr9   // x * BB[i]
   1864    xvmul.w        xr10,     xr8,       xr10
   1865    xvmul.w        xr9,      xr9,       xr20  // x * BB[i] * sgr_one_by_x
   1866    xvmul.w        xr10,     xr10,      xr20
   1867    xvsrlri.w      xr9,      xr9,       12
   1868    xvsrlri.w      xr10,     xr10,      12
   1869    xvsub.w        xr7,      xr21,      xr7
   1870    xvsub.w        xr8,      xr21,      xr8
   1871    xvpickev.h     xr12,     xr8,       xr7
   1872    xvpermi.d      xr11,     xr12,      0b11011000
   1873 
   1874    xvst           xr9,      t0,        0
   1875    xvst           xr10,     t0,        32
   1876    xvst           xr11,     t1,        0
   1877    addi.d         t0,       t0,        64
   1878    addi.d         t1,       t1,        32
   1879    blt            zero,     t2,        .LBS3SGF_H_W_LASX
   1880 
   1881    addi.d         a0,       a0,        REST_UNIT_STRIDE<<2
   1882    addi.d         a1,       a1,        REST_UNIT_STRIDE<<1
   1883    addi.w         a3,       a3,        -1
   1884    bnez           a3,       .LBS3SGF_H_H_LASX
   1885 endfunc
   1886 
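        /*
         * Arguments as used below: a0 = sumsq (int32_t *), a1 = sum
         * (int16_t *), a2 = src (const uint8_t *), a3 = w, a4 = h.  Each
         * inner iteration reduces three vertically adjacent source rows
         * (S = REST_UNIT_STRIDE), storing one row down in the output
         * buffers (see the offsets in the stores):
         *
         *   sum[x]   = s[x] + s[x + S] + s[x + 2*S];
         *   sumsq[x] = s[x]*s[x] + s[x + S]*s[x + S] + s[x + 2*S]*s[x + 2*S];
         */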
   1887 function boxsum3_h_8bpc_lasx
   1888    addi.d         a2,      a2,      REST_UNIT_STRIDE
   1889    li.w           t0,      1
   1890    addi.w         a3,      a3,      -2
   1891    addi.w         a4,      a4,      -4
   1892 .LBS3_H_H_LASX:
   1893    alsl.d         t1,      t0,      a1,    1     // sum_v    *sum_v = sum + x
   1894    alsl.d         t2,      t0,      a0,    2     // sumsq_v  *sumsq_v = sumsq + x
   1895    add.d          t3,      t0,      a2           // s
   1896    addi.w         t5,      a3,      0
   1897 
   1898 .LBS3_H_W_LASX:
   1899    xvld           xr0,     t3,      0
   1900    xvld           xr1,     t3,      REST_UNIT_STRIDE
   1901    xvld           xr2,     t3,      (REST_UNIT_STRIDE<<1)
   1902 
   1903    xvilvl.b       xr3,     xr1,     xr0
   1904    xvhaddw.hu.bu  xr4,     xr3,     xr3
   1905    xvilvh.b       xr5,     xr1,     xr0
   1906    xvhaddw.hu.bu  xr6,     xr5,     xr5
   1907    xvsllwil.hu.bu xr7,     xr2,     0
   1908    xvexth.hu.bu   xr8,     xr2
   1909    // sum_v
   1910    xvadd.h        xr4,     xr4,     xr7  // 0 2
   1911    xvadd.h        xr6,     xr6,     xr8  // 1 3
   1912    xvor.v         xr9,     xr4,     xr4
   1913    xvpermi.q      xr4,     xr6,     0b00000010
   1914    xvpermi.q      xr6,     xr9,     0b00110001
   1915    xvst           xr4,     t1,      REST_UNIT_STRIDE<<1
   1916    xvst           xr6,     t1,      (REST_UNIT_STRIDE<<1)+32
   1917    addi.d         t1,      t1,      64
   1918    // sumsq
   1919    xvmulwev.h.bu  xr9,     xr3,     xr3
   1920    xvmulwod.h.bu  xr10,    xr3,     xr3
   1921    xvmulwev.h.bu  xr11,    xr5,     xr5
   1922    xvmulwod.h.bu  xr12,    xr5,     xr5
   1923    xvaddwev.w.hu  xr13,    xr10,    xr9
   1924    xvaddwod.w.hu  xr14,    xr10,    xr9
   1925    xvaddwev.w.hu  xr15,    xr12,    xr11
   1926    xvaddwod.w.hu  xr16,    xr12,    xr11
   1927    xvmaddwev.w.hu xr13,    xr7,     xr7
   1928    xvmaddwod.w.hu xr14,    xr7,     xr7
   1929    xvmaddwev.w.hu xr15,    xr8,     xr8
   1930    xvmaddwod.w.hu xr16,    xr8,     xr8
   1931    xvilvl.w       xr9,     xr14,    xr13
   1932    xvilvh.w       xr10,    xr14,    xr13
   1933    xvilvl.w       xr11,    xr16,    xr15
   1934    xvilvh.w       xr12,    xr16,    xr15
   1935    xvor.v         xr7,     xr9,     xr9
   1936    xvor.v         xr8,     xr11,    xr11
   1937    xvpermi.q      xr9,     xr10,    0b00000010
   1938    xvpermi.q      xr10,    xr7,     0b00110001
   1939    xvpermi.q      xr11,    xr12,    0b00000010
   1940    xvpermi.q      xr12,    xr8,     0b00110001
   1941    xvst           xr9,     t2,      REST_UNIT_STRIDE<<2
   1942    xvst           xr11,    t2,      (REST_UNIT_STRIDE<<2)+32
   1943    xvst           xr10,    t2,      (REST_UNIT_STRIDE<<2)+64
   1944    xvst           xr12,    t2,      (REST_UNIT_STRIDE<<2)+96
   1945 
   1946    addi.d         t2,      t2,      128
   1947    addi.w         t5,      t5,      -32
   1948    addi.d         t3,      t3,      32
   1949    blt            zero,    t5,      .LBS3_H_W_LASX
   1950 
   1951    addi.d         a0,      a0,      REST_UNIT_STRIDE<<2
   1952    addi.d         a1,      a1,      REST_UNIT_STRIDE<<1
   1953    addi.d         a2,      a2,      REST_UNIT_STRIDE
   1954    addi.d         a4,      a4,      -1
   1955    blt            zero,    a4,      .LBS3_H_H_LASX
   1956 endfunc