tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

refmvs.S (27696B)


/*
* Copyright © 2023, VideoLAN and dav1d authors
* Copyright © 2023, Loongson Technology Corporation Limited
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
*
* 2. Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

#include "src/loongarch/loongson_asm.S"

/*
static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
                      const int bx4, const int bw4, int bh4)
*/

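/*
* Scalar reference for orientation (a sketch following dav1d's C code, not
* taken verbatim from this repository): each refmvs_block is 12 bytes, and
* the function copies *rmv into bw4 consecutive blocks on each of bh4 rows:
*
*    static void splat_mv_c(refmvs_block **rr, const refmvs_block *const rmv,
*                           const int bx4, const int bw4, int bh4)
*    {
*        do {
*            refmvs_block *const r = *rr++ + bx4;
*            for (int x = 0; x < bw4; x++)
*                r[x] = *rmv;
*        } while (--bh4);
*    }
*
* The LSX version below pre-rotates the 12-byte pattern into three vectors
* (vr1/vr2/vr3) so that 48 bytes (four whole blocks) can be stored per step.
*/
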
function splat_mv_lsx
   vld           vr0,      a1,       0          // 0 1 ... 11 ...
   clz.w         t4,       a3
   vaddi.bu      vr1,      vr0,      0
   addi.w        t4,       t4,       -26
   vextrins.w    vr1,      vr0,      0x30       // 0 1 2 ... 11 0 1 2 3
   la.local      t5,       .SPLAT_LSX_JRTABLE
   vbsrl.v       vr2,      vr1,      4          // 4 5 6 7...11 0 1 2 3 0 0 0 0
   alsl.d        t6,       t4,       t5,     1
   vextrins.w    vr2,      vr0,      0x31       // 4 5 6 7...11 0 1 2 3 4 5 6 7
   ld.h          t7,       t6,       0
   vbsrl.v       vr3,      vr2,      4          // 8 9 10 11 0 1 2 3 4 5 6 7 0 0 0 0
   add.d         t8,       t5,       t7
   alsl.d        a2,       a2,       a2,     1
   vextrins.w    vr3,      vr0,      0x32       // 8 9 10 11 0 1 2 3 4 5 6 7 8 9 10 11
   slli.w        a2,       a2,       2
   jirl          $r0,      t8,       0

.SPLAT_LSX_JRTABLE:
   .hword .SPLAT_W32_LSX - .SPLAT_LSX_JRTABLE
   .hword .SPLAT_W16_LSX - .SPLAT_LSX_JRTABLE
   .hword .SPLAT_W8_LSX  - .SPLAT_LSX_JRTABLE
   .hword .SPLAT_W4_LSX  - .SPLAT_LSX_JRTABLE
   .hword .SPLAT_W2_LSX  - .SPLAT_LSX_JRTABLE
   .hword .SPLAT_W1_LSX  - .SPLAT_LSX_JRTABLE

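/*
* bw4 is a power of two in [1, 32], so clz.w(bw4) - 26 yields table index
* 0 (w32) through 5 (w1); bx4 is scaled by sizeof(refmvs_block) == 12
* (a2 * 3, then << 2) to become a byte offset into each row.
*/
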
.SPLAT_W1_LSX:
   ld.d          t3,       a0,       0
   addi.d        a0,       a0,       8
   addi.d        a4,       a4,       -1
   add.d         t3,       t3,       a2

   fst.d         f1,       t3,       0
   fst.s         f3,       t3,       8
   blt           zero,     a4,       .SPLAT_W1_LSX
   b             .splat_end
.SPLAT_W2_LSX:
   ld.d          t3,       a0,       0
   addi.d        a0,       a0,       8
   addi.d        a4,       a4,       -1
   add.d         t3,       t3,       a2

   vst           vr1,      t3,       0
   fst.d         f2,       t3,       16
   blt           zero,     a4,       .SPLAT_W2_LSX
   b             .splat_end

.SPLAT_W4_LSX:
   ld.d          t3,       a0,       0
   addi.d        a0,       a0,       8
   addi.d        a4,       a4,       -1
   add.d         t3,       t3,       a2

   vst           vr1,      t3,       0
   vst           vr2,      t3,       16
   vst           vr3,      t3,       32
   blt           zero,     a4,       .SPLAT_W4_LSX
   b             .splat_end

.SPLAT_W8_LSX:
   ld.d          t3,       a0,       0
   addi.d        a0,       a0,       8
   addi.d        a4,       a4,       -1
   add.d         t3,       t3,       a2

   vst           vr1,      t3,       0
   vst           vr2,      t3,       16
   vst           vr3,      t3,       32

   vst           vr1,      t3,       48
   vst           vr2,      t3,       64
   vst           vr3,      t3,       80
   blt           zero,     a4,       .SPLAT_W8_LSX
   b             .splat_end

.SPLAT_W16_LSX:
   ld.d          t3,       a0,       0
   addi.d        a0,       a0,       8
   addi.d        a4,       a4,       -1
   add.d         t3,       t3,       a2

.rept 2
   vst           vr1,      t3,       0
   vst           vr2,      t3,       16
   vst           vr3,      t3,       32

   vst           vr1,      t3,       48
   vst           vr2,      t3,       64
   vst           vr3,      t3,       80

   addi.d        t3,       t3,       96
.endr

   blt           zero,     a4,       .SPLAT_W16_LSX
   b             .splat_end

.SPLAT_W32_LSX:
   ld.d          t3,       a0,       0
   addi.d        a0,       a0,       8
   addi.d        a4,       a4,       -1
   add.d         t3,       t3,       a2

.rept 4
   vst           vr1,      t3,       0
   vst           vr2,      t3,       16
   vst           vr3,      t3,       32

   vst           vr1,      t3,       48
   vst           vr2,      t3,       64
   vst           vr3,      t3,       80

   addi.d        t3,       t3,       96
.endr

   blt           zero,     a4,       .SPLAT_W32_LSX

.splat_end:
endfunc

const la_div_mult
.short    0, 16384, 8192, 5461, 4096, 3276, 2730, 2340
.short 2048,  1820, 1638, 1489, 1365, 1260, 1170, 1092
.short 1024,   963,  910,  862,  819,  780,  744,  712
.short  682,   655,  630,  606,  585,  564,  546,  528
endconst
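
/*
* la_div_mult[n] = 16384 / n, truncated: fixed-point reciprocals used to
* divide a motion vector by the frame-distance denominator. The scalar
* projection being vectorized looks roughly like dav1d's mv_projection()
* (a sketch, not verbatim):
*
*    static inline union mv mv_projection(const union mv mv, const int num,
*                                         const int den) {
*        const int frac = num * div_mult[den];
*        const int y = mv.y * frac, x = mv.x * frac;
*        // round to nearest, bias negatives, clip to +/-0x3fff
*        return (union mv) {
*            .y = iclip((y + 8192 + (y >> 31)) >> 14, -(1 << 14) + 1, (1 << 14) - 1),
*            .x = iclip((x + 8192 + (x >> 31)) >> 14, -(1 << 14) + 1, (1 << 14) - 1),
*        };
*    }
*
* In load_tmvs_lsx this shows up as vmul.w + vsrai.w 31 + vadd.w +
* vssrarni.h.w 14, then vclip.h against +/-0x3fff (vr20/vr21).
*/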

/*
*  Fill the mv field (first 4 bytes of each 5-byte refmvs_temporal_block)
*  with INVALID_MV (0x80008000, in t7) across one row of col_w blocks, then
*  step to the next row until t5 reaches a5.
*  temp reg: a6 a7
*/
.macro LOAD_SET_LOOP is_odd
   slli.d          a6,      t6,     2
   add.d           a6,      a6,     t6  // col_w * 5
0:
   addi.d          a7,      zero,   0   // x
.if \is_odd
   stx.w           t7,      t3,     a7
   addi.d          a7,      a7,     5
   bge             a7,      a6,     2f
.endif

1:
   stx.w           t7,      t3,     a7
   addi.d          a7,      a7,     5
   stx.w           t7,      t3,     a7
   addi.d          a7,      a7,     5
   blt             a7,      a6,     1b
2:
   add.d           t3,      t3,     t2
   addi.d          t5,      t5,     1
   blt             t5,      a5,     0b
.endm

/*
* static void load_tmvs_c(const refmvs_frame *const rf, int tile_row_idx,
*                         const int col_start8, const int col_end8,
*                         const int row_start8, int row_end8)
*/
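
/*
* Outline of the reference behaviour (condensed from dav1d's C load_tmvs_c;
* a paraphrase, not the verbatim source):
*
*   1. If rf->n_tile_threads == 1, force tile_row_idx = 0.
*   2. Clamp: col_end8i = min(col_end8 + 8, iw8), col_start8i =
*      max(col_start8 - 8, 0), row_end8 = min(row_end8, ih8).
*   3. Invalidate rp_proj over [row_start8, row_end8) x [col_start8,
*      col_end8) (the LOAD_SET_LOOP macro above).
*   4. For each of rf->n_mfmvs motion-field references n: skip it if
*      mfmv_ref2cur[n] is invalid; otherwise walk the reference's temporal
*      blocks, project each stored mv through mv_projection(mv, ref2cur,
*      ref2ref), and scatter it into rp_proj when the projected position
*      lands inside the current superblock-aligned window.
*/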
function load_tmvs_lsx
   addi.d         sp,      sp,       -80
   st.d           s0,      sp,       0
   st.d           s1,      sp,       8
   st.d           s2,      sp,       16
   st.d           s3,      sp,       24
   st.d           s4,      sp,       32
   st.d           s5,      sp,       40
   st.d           s6,      sp,       48
   st.d           s7,      sp,       56
   st.d           s8,      sp,       64

   vld           vr16,     a0,       16
   vld           vr0,      a0,       48    // rf->mfmv_ref, rf->mfmv_ref2cur
   ld.w          s8,       a0,       80    // rf->n_mfmvs
   vld           vr17,     a0,       96    // [0] rp_ref | [1] rp_proj
   ld.d          t1,       a0,       112   // stride
   ld.w          t0,       a0,       128
   addi.w        t0,       t0,       -1
   bnez          t0,       1f
   addi.w        a1,       zero,     0
1:
   addi.d        t0,       a3,       8
   vinsgr2vr.w   vr1,      t0,       0
   vinsgr2vr.w   vr1,      a5,       1
   vmin.w        vr1,      vr1,      vr16  // [0] col_end8i [1] row_end8
   addi.d        t0,       a2,       -8
   bge           t0,       zero,     2f
   addi.w        t0,       zero,     0     // t0 col_start8i
2:
   vpickve2gr.d  t4,       vr17,     1     // rf->rp_proj
   slli.d        t2,       t1,       2
   add.d         t2,       t2,       t1    // stride * 5
   slli.d        a1,       a1,       4     // tile_row_idx * 16
   andi          t3,       a4,       0xf
   add.d         t3,       t3,       a1    // tile_row_idx * 16 + (row_start8 & 15)
   mul.w         t3,       t3,       t2
   mul.w         t8,       a1,       t2
   vpickve2gr.w  a5,       vr1,      1
   addi.d        t5,       a4,       0
   sub.d         t6,       a3,       a2     // col_end8 - col_start8
   li.w          t7,       0x80008000
   slli.d        a7,       a2,       2
   add.d         t3,       t3,       a2
   add.d         t3,       t3,       a7
   add.d         t3,       t3,       t4     // rp_proj
   andi          a6,       t6,       1
   bnez          a6,       3f
   LOAD_SET_LOOP 0
   b             4f
3:
   LOAD_SET_LOOP 1
4:
   addi.d        a6,       zero,     0      // n
   bge           a6,       s8,       .end_load
   add.d         t3,       t8,       t4     // rp_proj
   mul.w         t6,       a4,       t2
   addi.d        s7,       zero,     40
   vpickve2gr.w  t1,       vr1,      0      // col_end8i
   addi.d        t5,       a0,       58     // rf->mfmv_ref2ref - 1
   la.local      t8,       la_div_mult
   vld           vr6,      t8,       0
   vld           vr7,      t8,       16
   vld           vr8,      t8,       32
   vld           vr9,      t8,       48
   li.w          t8,       0x3fff
   vreplgr2vr.h  vr21,     t8
   vxor.v        vr18,     vr18,     vr18   // zero
   vsub.h        vr20,     vr18,     vr21
   vpickev.b     vr12,     vr7,      vr6
   vpickod.b     vr13,     vr7,      vr6
   vpickev.b     vr14,     vr9,      vr8
   vpickod.b     vr15,     vr9,      vr8
   vpickve2gr.d  s6,       vr17,     0       // rf->rp_ref
5:
   vld           vr10,     t5,       0       // ref2ref [1...7]
   vpickve2gr.b  t8,       vr0,      8       // ref2cur
   vbsrl.v       vr0,      vr0,      1
   addi.w        t4,       t8,       32
   beqz          t4,       8f                // INVALID_REF2CUR

   vreplgr2vr.h  vr23,     t8
   vshuf.b       vr6,      vr14,     vr12,    vr10
   vshuf.b       vr7,      vr15,     vr13,    vr10
   vilvl.b       vr8,      vr7,      vr6
   vmulwev.w.h   vr6,      vr8,      vr23
   vmulwod.w.h   vr7,      vr8,      vr23

   vpickve2gr.b  s0,       vr0,      4       // ref
   slli.d        t8,       s0,       3
   ldx.d         s1,       s6,       t8      // rf->rp_ref[ref]
   addi.d        s0,       s0,       -4      // ref_sign
   vreplgr2vr.h  vr19,     s0
   add.d         s1,       s1,       t6      // &rf->rp_ref[ref][row_start8 * stride]
   addi.d        s2,       a4,       0       // y
   vilvl.w       vr8,      vr7,      vr6
   vilvh.w       vr9,      vr7,      vr6
6:                                            // for (int y = row_start8;
   andi          s3,       s2,       0xff8

   addi.d        s4,       s3,       8
   blt           a4,       s3,       0f
   addi.d        s3,       a4,       0        // y_proj_start
0:
   blt           s4,       a5,       0f
   addi.d        s4,       a5,       0        // y_proj_end
0:
   addi.d        s5,       t0,       0        // x
7:                                             // for (int x = col_start8i;
   slli.d        a7,       s5,       2
   add.d         a7,       a7,       s5
   add.d         a7,       s1,       a7      // rb
   vld           vr3,      a7,       0       // [rb]
   vpickve2gr.b  t4,       vr3,      4       // b_ref
   beqz          t4,       .end_x
   vreplve.b     vr11,     vr10,     t4
   vpickve2gr.b  t7,       vr11,     4       // ref2ref
   beqz          t7,       .end_x
   vsllwil.w.h   vr4,      vr3,      0
   vreplgr2vr.w  vr6,      t4
   vshuf.w       vr6,      vr9,      vr8      // frac
   vmul.w        vr5,      vr6,      vr4
   vsrai.w       vr4,      vr5,      31
   vadd.w        vr4,      vr4,      vr5
   vssrarni.h.w  vr4,      vr4,      14
   vclip.h       vr4,      vr4,      vr20,    vr21  // offset
   vxor.v        vr5,      vr4,      vr19    // offset.x ^ ref_sign
   vori.b        vr5,      vr5,      0x1     // (offset.x ^ ref_sign) | 1
   vabsd.h       vr4,      vr4,      vr18
   vsrli.h       vr4,      vr4,      6       // abs(offset.x) >> 6
   vsigncov.h    vr4,      vr5,      vr4     // apply_sign
   vpickve2gr.h  s0,       vr4,      0
   add.d         s0,       s2,       s0      // pos_y
   blt           s0,       s3,       .n_posy
   bge           s0,       s4,       .n_posy
   andi          s0,       s0,       0xf
   mul.w         s0,       s0,       t2      // pos
   vpickve2gr.h  t7,       vr4,      1
   add.d         t7,       t7,       s5      // pos_x
   add.d         s0,       t3,       s0      // rp_proj + pos

.loop_posx:
   andi          t4,       s5,       0xff8 // x_sb_align

   blt           t7,       a2,       .n_posx
   addi.d        t8,       t4,       -8
   blt           t7,       t8,       .n_posx

   bge           t7,       a3,       .n_posx
   addi.d        t4,       t4,       16
   bge           t7,       t4,       .n_posx

   slli.d        t4,       t7,       2
   add.d         t4,       t4,       t7      // pos_x * 5
   add.d         t4,       s0,       t4      // rp_proj[pos + pos_x]
   vstelm.w      vr3,      t4,       0,   0
   vstelm.b      vr11,     t4,       4,   4

.n_posx:
   addi.d        s5,       s5,       1       // x + 1
   bge           s5,       t1,       .ret_posx
   addi.d        a7,       a7,       5       // rb + 1
   vld           vr4,      a7,       0       // [rb]
   vseq.b        vr5,      vr4,      vr3     // compare next block with current

   vpickve2gr.d  t8,       vr5,      0
   cto.d         t8,       t8                // >= 40 trailing one bits: the next
                                             // 5-byte block is identical, so reuse
                                             // the projected mv instead of redoing it
   blt           t8,       s7,       7b      // different block: full recompute

   addi.d        t7,       t7,       1       // pos_x + 1

   /* Core computing loop expansion (second) */
   andi          t4,       s5,       0xff8 // x_sb_align

   blt           t7,       a2,       .n_posx
   addi.d        t8,       t4,       -8
   blt           t7,       t8,       .n_posx

   bge           t7,       a3,       .n_posx
   addi.d        t4,       t4,       16
   bge           t7,       t4,       .n_posx

   slli.d        t4,       t7,       2
   add.d         t4,       t4,       t7      // pos_x * 5
   add.d         t4,       s0,       t4      // rp_proj[pos + pos_x]
   vstelm.w      vr3,      t4,       0,   0
   vstelm.b      vr11,     t4,       4,   4

   addi.d        s5,       s5,       1       // x + 1
   bge           s5,       t1,       .ret_posx
   addi.d        a7,       a7,       5       // rb + 1
   vld           vr4,      a7,       0       // [rb]
   vseq.b        vr5,      vr4,      vr3

   vpickve2gr.d  t8,       vr5,      0
   cto.d         t8,       t8
   blt           t8,       s7,       7b

   addi.d        t7,       t7,       1       // pos_x + 1

   /* Core computing loop expansion (third) */
   andi          t4,       s5,       0xff8 // x_sb_align

   blt           t7,       a2,       .n_posx
   addi.d        t8,       t4,       -8
   blt           t7,       t8,       .n_posx

   bge           t7,       a3,       .n_posx
   addi.d        t4,       t4,       16
   bge           t7,       t4,       .n_posx

   slli.d        t4,       t7,       2
   add.d         t4,       t4,       t7      // pos_x * 5
   add.d         t4,       s0,       t4      // rp_proj[pos + pos_x]
   vstelm.w      vr3,      t4,       0,   0
   vstelm.b      vr11,     t4,       4,   4

   addi.d        s5,       s5,       1       // x + 1
   bge           s5,       t1,       .ret_posx
   addi.d        a7,       a7,       5       // rb + 1
   vld           vr4,      a7,       0       // [rb]
   vseq.b        vr5,      vr4,      vr3

   vpickve2gr.d  t8,       vr5,      0
   cto.d         t8,       t8
   blt           t8,       s7,       7b

   addi.d        t7,       t7,       1       // pos_x + 1

   b             .loop_posx

.n_posy:
   addi.d        s5,       s5,       1       // x + 1
   bge           s5,       t1,       .ret_posx
   addi.d        a7,       a7,       5       // rb + 1
   vld           vr4,      a7,       0       // [rb]
   vseq.b        vr5,      vr4,      vr3

   vpickve2gr.d  t8,       vr5,      0
   cto.d         t8,       t8
   blt           t8,       s7,       7b

   addi.d        s5,       s5,       1       // x + 1
   bge           s5,       t1,       .ret_posx
   addi.d        a7,       a7,       5       // rb + 1
   vld           vr4,      a7,       0       // [rb]
   vseq.b        vr5,      vr4,      vr3

   vpickve2gr.d  t8,       vr5,      0
   cto.d         t8,       t8
   blt           t8,       s7,       7b

   b             .n_posy

.end_x:
   addi.d        s5,       s5,       1       // x + 1
   blt           s5,       t1,       7b

.ret_posx:
   add.d         s1,       s1,       t2      // r + stride
   addi.d        s2,       s2,       1       // y + 1
   blt           s2,       a5,       6b
8:
   addi.d        a6,       a6,       1       // n + 1
   addi.d        t5,       t5,       7       // mfmv_ref2ref(offset) + 7
   blt           a6,       s8,       5b

.end_load:
   ld.d           s0,      sp,       0
   ld.d           s1,      sp,       8
   ld.d           s2,      sp,       16
   ld.d           s3,      sp,       24
   ld.d           s4,      sp,       32
   ld.d           s5,      sp,       40
   ld.d           s6,      sp,       48
   ld.d           s7,      sp,       56
   ld.d           s8,      sp,       64
   addi.d         sp,      sp,       80
endfunc

const mv_tbls
   .byte           255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255, 255
   .byte           0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0, 1, 2, 3, 8, 0
   .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
   .byte           4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4, 5, 6, 7, 9, 4
endconst
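
/*
* Row n of mv_tbls is the vshuf.b pattern for case n below: 255 produces
* zeros (no valid candidate), {0,1,2,3,8} picks mv[0] plus ref[0], and
* {4,5,6,7,9} picks mv[1] plus ref[1]; case 3 (both valid) prefers mv[1],
* matching the C save_tmvs order. Each 5-byte unit repeats across the vector.
*/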

const mask_mult
   .byte           1, 0, 2, 0, 1, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0
endconst

const mask_mv0
   .byte           1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
endconst

const mask_mv1
   .byte           4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19
endconst
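
/*
* mask_mult weights the two per-candidate validity flags by {1, 2} so their
* sum forms the mv_tbls case index (0 = none, 1 = mv[0], 2/3 = mv[1]).
* mask_mv0 and mask_mv1 are vshuf.b patterns that rebuild the repeating
* 5-byte block at byte phases 1 and 4 (16 mod 5 and 24 mod 5), i.e. the
* continuations needed for the 16-byte stores at offsets 16 and 24 below.
*/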

// void dav1d_save_tmvs_lsx(refmvs_temporal_block *rp, ptrdiff_t stride,
//                          refmvs_block **rr, const uint8_t *ref_sign,
//                          int col_end8, int row_end8,
//                          int col_start8, int row_start8)
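
// Reference behaviour (a condensed sketch after dav1d's C save_tmvs_c, not
// verbatim): for each 8-pixel row, walk the candidate blocks and store the
// preferred mv/ref pair (mv[1] if its ref is flagged in ref_sign and the mv
// is small enough, else mv[0], else an empty block) into bw8 consecutive
// refmvs_temporal_block entries:
//
//    for (int y = row_start8; y < row_end8; y++) {
//        const refmvs_block *const b = rr[(y & 15) * 2];
//        for (int x = col_start8; x < col_end8;) {
//            const refmvs_block *cand_b = &b[x * 2 + 1];
//            // bw8 derived from cand_b->bs; pick mv[1] or mv[0] when
//            // ref > 0, ref_sign[ref - 1] != 0 and |mv.x|, |mv.y| < 4096
//            for (int n = 0; n < bw8; n++, x++)
//                rp[x] = (refmvs_temporal_block) { .mv = ..., .ref = ... };
//        }
//        rp += stride;
//    }
//
// The LSX version processes two candidates per iteration and dispatches on
// bs through .save_tevs_tbl to a branchless store routine (10:/20:/40:/80:/160:).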
function save_tmvs_lsx
   addi.d      sp,         sp,         -0x28
   st.d        s0,         sp,         0x00
   st.d        s1,         sp,         0x08
   st.d        s2,         sp,         0x10
   st.d        s3,         sp,         0x18
   st.d        s4,         sp,         0x20
   move        t0,         ra

   vxor.v      vr10,       vr10,       vr10
   vld         vr11,       a3,         0       // Load ref_sign[0] ... ref_sign[7]
   la.local    t2,         .save_tevs_tbl
   la.local    s1,         mask_mult
   la.local    t7,         mv_tbls
   vld         vr9,        s1,         0       // Load mask_mult
   vslli.d     vr11,       vr11,       8       // 0, ref_sign[0], ... , ref_sign[6]
   la.local    s3,         mask_mv0
   vld         vr8,        s3,         0       // Load mask_mv0
   la.local    s4,         mask_mv1
   vld         vr7,        s4,         0       // Load mask_mv1
   li.d        s0,         5
   li.d        t8,         12 * 2
   mul.d       a1,         a1,         s0      // stride *= 5
   sub.d       a5,         a5,         a7      // h = row_end8 - row_start8
   slli.d      a7,         a7,         1       // row_start8 <<= 1
1:
   li.d        s0,         5
   andi        t3,         a7,         30      // (y & 15) * 2
   slli.d      s4,         t3,         3
   ldx.d       t3,         a2,         s4      // b = rr[(y & 15) * 2]
   addi.d      t3,         t3,         12      // &b[... + 1]
   mul.d       s4,         a4,         t8
   add.d       t4,         s4,         t3      // end_cand_b = &b[col_end8*2 + 1]
   mul.d       s3,         a6,         t8
   add.d       t3,         s3,         t3      // cand_b = &b[x*2 + 1]
   mul.d       s4,         a6,         s0
   add.d       a3,         s4,         a0      // &rp[x]
2:
   /* First cand_b */
   ld.b        t5,         t3,         10      // cand_b->bs
   vld         vr0,        t3,         0       // cand_b->mv and ref
   alsl.d      t5,         t5,         t2,     2   // bt2 index
   ld.h        s3,         t3,         8       // cand_b->ref
   ld.h        t6,         t5,         0       // bt2
   move        s0,         t2
   alsl.d      t3,         t6,         t3,     1   // Next cand_b += bt2 * 2
   vor.v       vr2,        vr0,        vr0
   vinsgr2vr.h vr1,        s3,         0
   move        t1,         t3
   bge         t3,         t4,         3f

   /* Next cand_b */
   ld.b        s0,         t3,         10      // cand_b->bs
   vld         vr4,        t3,         0       // cand_b->mv and ref
   alsl.d      s0,         s0,         t2,     2   // bt2 index
   ld.h        s4,         t3,         8       // cand_b->ref
   ld.h        t6,         s0,         0       // bt2
   alsl.d      t3,         t6,         t3,     1   // Next cand_b += bt2 * 2
   vpackev.d   vr2,        vr4,        vr0     // a0.mv[0] a0.mv[1] a1.mv[0], a1.mv[1]
   vinsgr2vr.h vr1,        s4,         1       // a0.ref[0] a0.ref[1], a1.ref[0], a1.ref[1]
3:
   vabsd.h     vr2,        vr2,        vr10    // abs(mv[].xy)
   vsle.b      vr16,       vr10,       vr1
   vand.v      vr1,        vr16,       vr1
   vshuf.b     vr1,        vr11,       vr11,   vr1     // ref_sign[ref]
   vsrli.h     vr2,        vr2,        12      // abs(mv[].xy) >> 12
   vilvl.b     vr1,        vr1,        vr1
   vmulwev.h.bu    vr1,    vr1,        vr9     // ref_sign[ref] * {1, 2}

   vseqi.w     vr2,        vr2,        0       // abs(mv[].xy) < 4096
   vpickev.h   vr2,        vr2,        vr2     // abs() condition to 16 bit

   vand.v      vr1,        vr2,        vr1     // h[0-3] contains conditions for mv[0-1]
   vhaddw.wu.hu    vr1,    vr1,        vr1     // Combine condition for [1] and [0]
   vpickve2gr.wu   s1,     vr1,        0       // Extract case for first block
   vpickve2gr.wu   s2,     vr1,        1

   ld.hu           t5,     t5,         2       // Fetch jump table entry
   ld.hu           s0,     s0,         2
   alsl.d          s3,     s1,         t7,     4   // Load permutation table based on case
   vld             vr1,    s3,         0
   alsl.d          s4,     s2,         t7,     4
   vld             vr5,    s4,         0
   sub.d           t5,     t2,         t5      // Find jump table target
   sub.d           s0,     t2,         s0

   vshuf.b         vr0,    vr0,        vr0,    vr1 // Permute cand_b to output refmvs_temporal_block
   vshuf.b         vr4,    vr4,        vr4,    vr5
   vsle.b          vr16,   vr10,       vr1
   vand.v          vr0,    vr16,       vr0

   vsle.b          vr17,   vr10,       vr5
   vand.v          vr4,    vr17,       vr4
   // vr1 follows on vr0, with another 3 full repetitions of the pattern.
   vshuf.b         vr1,    vr0,        vr0,    vr8 // 1, 2, 3, ... , 15, 16
   vshuf.b         vr5,    vr4,        vr4,    vr8 // 1, 2, 3, ... , 15, 16
   // vr2 ends with 3 complete repetitions of the pattern.
   vshuf.b         vr2,    vr1,        vr0,    vr7
   vshuf.b         vr6,    vr5,        vr4,    vr7 // 4, 5, 6, 7, ... , 12, 13, 14, 15, 16, 17, 18, 19

   jirl            ra,     t5,         0
   bge             t1,     t4,         4f      // if (cand_b >= end)
   vor.v           vr0,    vr4,        vr4
   vor.v           vr1,    vr5,        vr5
   vor.v           vr2,    vr6,        vr6
   jirl            ra,     s0,         0
   blt             t3,     t4,         2b      // if (cand_b < end)

4:
   addi.d          a5,     a5,         -1      // h--
   addi.d          a7,     a7,         2       // y += 2
   add.d           a0,     a0,         a1      // rp += stride
   blt             zero,   a5,         1b

   ld.d        s0,         sp,         0x00
   ld.d        s1,         sp,         0x08
   ld.d        s2,         sp,         0x10
   ld.d        s3,         sp,         0x18
   ld.d        s4,         sp,         0x20
   addi.d      sp,         sp,         0x28

   move            ra,     t0
   jirl            zero,   ra,         0x00

10:
   addi.d          s1,     a3,         4
   vstelm.w        vr0,    a3,         0,      0   // .mv
   vstelm.b        vr0,    s1,         0,      4   // .ref
   addi.d          a3,     a3,         5
   jirl            zero,   ra,         0x00
20:
   addi.d          s1,     a3,         8
   vstelm.d        vr0,    a3,         0,      0   // .mv
   vstelm.h        vr0,    s1,         0,      4   // .ref
   addi.d          a3,     a3,         2 * 5
   jirl            zero,   ra,         0x00
40:
   vst             vr0,    a3,         0
   vstelm.w        vr1,    a3,         0x10,   0
   addi.d          a3,     a3,         4 * 5
   jirl            zero,   ra,         0x00

80:
   vst             vr0,    a3,         0
   vst             vr1,    a3,         0x10           // This writes 6 full entries plus 2 extra bytes
   vst             vr2,    a3,         5 * 8 - 16     // Write the last few, overlapping with the first write.
   addi.d          a3,     a3,         8 * 5
   jirl            zero,   ra,         0x00
160:
   addi.d          s1,     a3,         6 * 5
   addi.d          s2,     a3,         12 * 5
   vst             vr0,    a3,         0
   vst             vr1,    a3,         0x10          // This writes 6 full entries plus 2 extra bytes
   vst             vr0,    a3,         6 * 5
   vst             vr1,    a3,         6 * 5 + 16    // Write another 6 full entries, slightly overlapping with the first set
   vstelm.d        vr0,    s2,         0,      0     // Write 8 bytes (one full entry) after the first 12
   vst             vr2,    a3,         5 * 16 - 16   // Write the last 3 entries
   addi.d          a3,     a3,         16 * 5
   jirl            zero,   ra,         0x00
    654 
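/*
* Dispatch table indexed by block size (bs): each entry is two halfwords,
* the cand_b advance (doubled when applied, see "Next cand_b += bt2 * 2")
* and the distance from this label back to the matching store routine.
*/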
.save_tevs_tbl:
       .hword 16 * 12   // bt2 * 12, 12 is sizeof(refmvs_block)
       .hword .save_tevs_tbl - 160b
       .hword 16 * 12
       .hword .save_tevs_tbl - 160b
       .hword 8 * 12
       .hword .save_tevs_tbl -  80b
       .hword 8 * 12
       .hword .save_tevs_tbl -  80b
       .hword 8 * 12
       .hword .save_tevs_tbl -  80b
       .hword 8 * 12
       .hword .save_tevs_tbl -  80b
       .hword 4 * 12
       .hword .save_tevs_tbl -  40b
       .hword 4 * 12
       .hword .save_tevs_tbl -  40b
       .hword 4 * 12
       .hword .save_tevs_tbl -  40b
       .hword 4 * 12
       .hword .save_tevs_tbl -  40b
       .hword 2 * 12
       .hword .save_tevs_tbl -  20b
       .hword 2 * 12
       .hword .save_tevs_tbl -  20b
       .hword 2 * 12
       .hword .save_tevs_tbl -  20b
       .hword 2 * 12
       .hword .save_tevs_tbl -  20b
       .hword 2 * 12
       .hword .save_tevs_tbl -  20b
       .hword 1 * 12
       .hword .save_tevs_tbl -  10b
       .hword 1 * 12
       .hword .save_tevs_tbl -  10b
       .hword 1 * 12
       .hword .save_tevs_tbl -  10b
       .hword 1 * 12
       .hword .save_tevs_tbl -  10b
       .hword 1 * 12
       .hword .save_tevs_tbl -  10b
       .hword 1 * 12
       .hword .save_tevs_tbl -  10b
       .hword 1 * 12
       .hword .save_tevs_tbl -  10b
endfunc