tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

msac.S (24502B)


      1 /*
      2 * Copyright © 2023, VideoLAN and dav1d authors
      3 * Copyright © 2023, Loongson Technology Corporation Limited
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "loongson_asm.S"
     29 
     30 const min_prob
        // 16 halfwords; entry k holds 4 * (15 - k), i.e. 60 down to 0 in steps of 4.
     31  .short 4*15, 4*14, 4*13, 4*12, 4*11, 4*10, 4*9, 4*8, 4*7, 4*6, 4*5, 4*4, 4*3, 4*2, 4*1, 4*0
     32 endconst
     33 
     34 const ph_0xff00
        // Eight halfwords of 0xff00 (16 bytes): a mask keeping the high byte of each 16-bit lane.
     35  .short 0xff00, 0xff00, 0xff00, 0xff00
     36  .short 0xff00, 0xff00, 0xff00, 0xff00
     38 endconst
     39 
     40 .macro decode_symbol_adapt w
        // Shared body of the msac_decode_symbol_adapt{4,8,16}_lsx entry points.
        // w selects how many 16-bit cdf lanes are handled: 4 and 8 use a single
        // vector, 16 uses a second vector for lanes 8..15.
        //   in:  a0 = context; fields accessed here by byte offset:
        //             0 buf_pos, 8 buf_end, 16 dif, 24 rng, 28 cnt,
        //             32 allow_update_cdf
        //        a1 = cdf, an array of 16-bit entries; cdf[a2] is the
        //             adaptation counter
        //        a2 = symbol count n
        //   out: a0 = decoded symbol index (also left in a7)
        // Writes t0-t8, a7 and vector registers within vr0-vr21, and uses a
        // 48-byte stack scratch area that is released at the end.
        // No return instruction is emitted here; presumably endfunc supplies
        // it - verify in loongson_asm.S.
     41    addi.d          sp,      sp,     -48
     42    vldrepl.h       vr0,     a0,      24   //rng, replicated into every 16-bit lane
     43    fst.s           f0,      sp,      0    //sp[0] = rng: the "u" bound read back when the result is 0
     44    vld             vr1,     a1,      0    //cdf (always a full 16-byte load)
     45 .if \w == 16
     46    vld             vr11,    a1,      16   //cdf lanes 8..15
     47 .endif
     48    vldrepl.d       vr2,     a0,      16   //dif, replicated into both 64-bit lanes
     49    ld.w            t1,      a0,      32   //allow_update_cdf
        // t2 = min_prob + 30 - 2 * a2, so lane i receives min_prob[15 - a2 + i],
        // which is 4 * (a2 - i) according to the table contents.
     50    la.local        t2,      min_prob
     51    addi.d          t2,      t2,      30
     52    slli.w          t3,      a2,      1
     53    sub.d           t2,      t2,      t3
     54    vld             vr3,     t2,      0    //min_prob
     55 .if \w == 16
     56    vld             vr13,    t2,      16   //min_prob for lanes 8..15
     57 .endif
     58    vsrli.h         vr4,     vr0,     8    //r = s->rng >> 8
     59    vslli.h         vr4,     vr4,     8    //r << 8
     60    vsrli.h         vr5,     vr1,     6    //cdf >> 6
     61    vslli.h         vr5,     vr5,     7    //(cdf >> 6) << 7
     62 .if \w == 16
     63    vsrli.h         vr15,    vr11,    6
     64    vslli.h         vr15,    vr15,    7
     65 .endif
     66    vmuh.hu         vr5,     vr4,     vr5  //high half of the product = ((rng >> 8) * (cdf >> 6)) >> 1
     67    vadd.h          vr5,     vr5,     vr3  //v
     68 .if \w == 16
     69    vmuh.hu         vr15,    vr4,     vr15
     70    vadd.h          vr15,    vr15,    vr13 //v for lanes 8..15
     71 .endif
     72    addi.d          t8,      sp,      2    //v[] lives at sp+2, directly above the rng stored at sp+0
     73    vst             vr5,     t8,      0    //store v
     74 .if \w == 16
     75    vst             vr15,    t8,      16
     76 .endif
     77    vreplvei.h      vr20,    vr2,     3    //c = halfword 3 of dif, i.e. bits 48..63
     78    vsle.hu         vr6,     vr5,     vr20 //lane mask: v <= c (unsigned)
     79 .if \w == 16
     80    vsle.hu         vr16,    vr15,    vr20
     81    vpickev.b       vr21,    vr16,    vr6  //pack both halfword masks into one vector of byte masks
     82 .endif
        // vr10 = one bit per lane, taken from the sign bit of each mask lane.
     83 .if \w <= 8
     84    vmskltz.h       vr10,    vr6
     85 .else
     86    vmskltz.b       vr10,    vr21
     87 .endif
     88    beqz            t1,      .renorm\()\w  //allow_update_cdf == 0: leave the cdf untouched
     89 
     90    // update_cdf
     91    alsl.d          t1,      a2,      a1,   1  //t1 = &cdf[a2]
     92    ld.h            t2,      t1,      0    //count
     93    srli.w          t3,      t2,      4    //count >> 4
     94 .if \w == 16
     95    addi.w          t3,      t3,      5    //rate = (count >> 4) + 5
     96 .else
     97    addi.w          t3,      t3,      4
     98    li.w            t5,      2
     99    sltu            t5,      t5,      a2   //a2 > 2
    100    add.w           t3,      t3,      t5   //rate = (count >> 4) + 4 + (a2 > 2)
    101 .endif
    102    sltui           t5,      t2,      32
    103    add.w           t2,      t2,      t5   //count + (count < 32)
    104    vreplgr2vr.h    vr9,     t3            //rate in every lane
    105    vseq.h          vr7,     vr7,     vr7  //all ones
    106    vavgr.hu        vr5,     vr6,     vr7  //rounded average with 0xffff: mask lanes give -1, others 32768
    107    vsub.h          vr5,     vr5,     vr1  //(-1 or 32768) - cdf
    108    vsub.h          vr8,     vr1,     vr6  //cdf - mask, i.e. cdf + 1 in mask lanes
    109 .if \w == 16
    110    vavgr.hu        vr15,    vr16,    vr7
    111    vsub.h          vr15,    vr15,    vr11
    112    vsub.h          vr18,    vr11,    vr16
    113 .endif
    114    vsra.h          vr5,     vr5,     vr9  //arithmetic shift right by rate
    115    vadd.h          vr8,     vr8,     vr5  //adapted cdf
        // w == 4 writes back 8 bytes; otherwise a full 16-byte vector is written.
    116 .if \w == 4
    117    fst.d           f8,      a1,      0
    118 .else
    119    vst             vr8,     a1,      0
    120 .endif
    121 .if \w == 16
    122    vsra.h          vr15,    vr15,    vr9
    123    vadd.h          vr18,    vr18,    vr15
    124    vst             vr18,    a1,      16
    125 .endif
    126    st.h            t2,      t1,      0    //store the counter last, after the vector stores
    127 
    128 .renorm\()\w:
    129    vpickve2gr.h    t3,      vr10,    0    //lane bitmask
    130    ctz.w           a7,      t3            // ret = index of the first lane with v <= c
    131    alsl.d          t3,      a7,      t8,      1  // &v[ret]
    132    ld.hu           t4,      t3,      0    // v
    133    ld.hu           t5,      t3,      -2   // u = v[ret - 1], or the rng at sp+0 when ret == 0
    134    sub.w           t5,      t5,      t4   // rng = u - v
    135    slli.d          t4,      t4,      48   // v << 48
    136    vpickve2gr.d    t6,      vr2,     0
    137    sub.d           t6,      t6,      t4   // dif -= v << 48
    138    clz.w           t4,      t5            // d
    139    xori            t4,      t4,      16   // d = clz(rng) ^ 16
    140    sll.d           t6,      t6,      t4   // dif <<= d
    141    ld.w            t0,      a0,      28   //cnt
    142    sll.w           t5,      t5,      t4   // rng <<= d
    143    sub.w           t7,      t0,      t4   // cnt-d
    144    st.w            t5,      a0,      24   // store rng
    145    bgeu            t0,      t4,      9f   // cnt >= d (unsigned): no refill needed
    146 
    147    // refill
    148    ld.d            t0,      a0,      0    // buf_pos
    149    ld.d            t1,      a0,      8    // buf_end
    150    addi.d          t2,      t0,      8    // buf_pos + 8
    151    bltu            t1,      t2,      2f   // fewer than 8 bytes left: tail path
    152 
    153    ld.d            t3,      t0,      0    // next_bits
    154    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
    155    nor             t3,      t3,      t3   // invert the loaded bytes
    156    sub.w           t2,      zero,    t1   // 48 - cnt
    157    revb.d          t3,      t3            // next_bits = bswap(next_bits)
    158    srli.w          t2,      t2,      3    // num_bytes_read
    159    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
    160    b               3f
    161 1:
           // buf_pos >= buf_end: no input left, buf_pos and cnt are not advanced
    162    addi.w          t3,      t7,      -48
    163    srl.d           t3,      t3,      t3   // pad with ones
    164    b               4f
    165 2:
    166    bgeu            t0,      t1,      1b   // buf_pos >= buf_end
    167    ld.d            t3,      t1,      -8   // next_bits: the 8 bytes ending at buf_end
    168    sub.w           t2,      t2,      t1   // (buf_pos + 8) - buf_end
    169    sub.w           t1,      t1,      t0   // num_bytes_left
    170    slli.w          t2,      t2,      3    // ... as a bit count
    171    srl.d           t3,      t3,      t2   // shift out the bytes that precede buf_pos
    172    addi.w          t2,      t7,      -48  // shift_bits, as in the 8-byte path
    173    nor             t3,      t3,      t3
    174    sub.w           t4,      zero,    t2
    175    revb.d          t3,      t3
    176    srli.w          t4,      t4,      3    // bytes wanted
    177    srl.d           t3,      t3,      t2
    178    sltu            t2,      t1,      t4   // num_bytes_left < bytes wanted
    179    maskeqz         t1,      t1,      t2   // num_bytes_left if smaller, else 0
    180    masknez         t2,      t4,      t2   // bytes wanted if not larger, else 0
    181    or              t2,      t2,      t1   // num_bytes_read
    182 3:
    183    slli.w          t1,      t2,      3    // num_bits_read
    184    add.d           t0,      t0,      t2   // buf_pos += num_bytes_read
    185    add.w           t7,      t7,      t1   // cnt += num_bits_read
    186    st.d            t0,      a0,      0    // store buf_pos
    187 4:
    188    or              t6,      t6,      t3   // dif |= next_bits
    189 9:
    190    st.w            t7,      a0,      28   // store cnt
    191    st.d            t6,      a0,      16   // store dif
    192    move            a0,      a7            // return value
    193    addi.d          sp,      sp,      48   // release the scratch area
    194 .endm
    195 
    196 function msac_decode_symbol_adapt4_lsx
        // Expands decode_symbol_adapt for 4 cdf lanes: a0 = context, a1 = cdf,
        // a2 = symbol count; the decoded symbol index is left in a0.
    197    decode_symbol_adapt 4
    198 endfunc
    199 
    200 function msac_decode_symbol_adapt8_lsx
        // Expands decode_symbol_adapt for 8 cdf lanes: a0 = context, a1 = cdf,
        // a2 = symbol count; the decoded symbol index is left in a0.
    201    decode_symbol_adapt 8
    202 endfunc
    203 
    204 function msac_decode_symbol_adapt16_lsx
        // Expands decode_symbol_adapt for 16 cdf lanes (two vectors): a0 = context,
        // a1 = cdf, a2 = symbol count; the decoded symbol index is left in a0.
    205    decode_symbol_adapt 16
    206 endfunc
    207 
    208 function msac_decode_bool_lsx
        // Decodes one binary symbol using a caller-supplied probability.
        //   in:  a0 = context (byte offsets: 0 buf_pos, 8 buf_end, 16 dif,
        //             24 rng, 28 cnt)
        //        a1 = probability f
        //   out: a0 = 1 when dif < (v << 48), otherwise 0, where
        //             v = (((rng >> 8) * (f >> 6)) >> 1) + 4
        // Updates dif, rng and cnt in the context, and buf_pos when it refills.
        // Writes t0-t8, a1 and a5.
    209    ld.w            t0,      a0,      24   // rng
    210    srli.w          a1,      a1,      6    // f >> 6
    211    ld.d            t1,      a0,      16   // dif
    212    srli.w          t2,      t0,      8    // r >> 8
    213    mul.w           t2,      t2,      a1   // (r >> 8) * (f >> 6)
    214    ld.w            a5,      a0,      28   // cnt
    215    srli.w          t2,      t2,      1
    216    addi.w          t2,      t2,      4    // v
    217    slli.d          t3,      t2,      48   // vw
    218    sltu            t4,      t1,      t3   // dif < vw
    219    move            t8,      t4            // ret
    220    xori            t4,      t4,      1    // t4 = (dif >= vw)
    221    maskeqz         t6,      t3,      t4   // vw if dif >= vw, else 0
    222    sub.d           t6,      t1,      t6   // dif
    223    slli.w          t5,      t2,      1
    224    sub.w           t5,      t0,      t5   // r - 2v
    225    maskeqz         t7,      t5,      t4   // r - 2v if dif >= vw, else 0
    226    add.w           t5,      t2,      t7   // new rng: r - v if dif >= vw, else v
    227 
    228    // renorm
    229    clz.w           t4,      t5            // d
    230    xori            t4,      t4,      16   // d = clz(rng) ^ 16
    231    sll.d           t6,      t6,      t4   // dif <<= d
    232    sll.w           t5,      t5,      t4   // rng <<= d
    233    sub.w           t7,      a5,      t4   // cnt-d
    234    st.w            t5,      a0,      24   // store rng
    235    bgeu            a5,      t4,      9f   // cnt >= d (unsigned): no refill needed
    236 
    237    // refill
    238    ld.d            t0,      a0,      0    // buf_pos
    239    ld.d            t1,      a0,      8    // buf_end
    240    addi.d          t2,      t0,      8    // buf_pos + 8
    241    bltu            t1,      t2,      2f   // fewer than 8 bytes left: tail path
    242 
    243    ld.d            t3,      t0,      0    // next_bits
    244    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
    245    nor             t3,      t3,      t3   // invert the loaded bytes
    246    sub.w           t2,      zero,    t1   // 48 - cnt
    247    revb.d          t3,      t3            // next_bits = bswap(next_bits)
    248    srli.w          t2,      t2,      3    // num_bytes_read
    249    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
    250    b               3f
    251 1:
           // buf_pos >= buf_end: no input left, buf_pos and cnt are not advanced
    252    addi.w          t3,      t7,      -48
    253    srl.d           t3,      t3,      t3   // pad with ones
    254    b               4f
    255 2:
    256    bgeu            t0,      t1,      1b   // buf_pos >= buf_end
    257    ld.d            t3,      t1,      -8   // next_bits: the 8 bytes ending at buf_end
    258    sub.w           t2,      t2,      t1   // (buf_pos + 8) - buf_end
    259    sub.w           t1,      t1,      t0   // num_bytes_left
    260    slli.w          t2,      t2,      3    // ... as a bit count
    261    srl.d           t3,      t3,      t2   // shift out the bytes that precede buf_pos
    262    addi.w          t2,      t7,      -48  // shift_bits, as in the 8-byte path
    263    nor             t3,      t3,      t3
    264    sub.w           t4,      zero,    t2
    265    revb.d          t3,      t3
    266    srli.w          t4,      t4,      3    // bytes wanted
    267    srl.d           t3,      t3,      t2
    268    sltu            t2,      t1,      t4   // num_bytes_left < bytes wanted
    269    maskeqz         t1,      t1,      t2   // num_bytes_left if smaller, else 0
    270    masknez         t2,      t4,      t2   // bytes wanted if not larger, else 0
    271    or              t2,      t2,      t1   // num_bytes_read
    272 3:
    273    slli.w          t1,      t2,      3    // num_bits_read
    274    add.d           t0,      t0,      t2   // buf_pos += num_bytes_read
    275    add.w           t7,      t7,      t1   // cnt += num_bits_read
    276    st.d            t0,      a0,      0    // store buf_pos
    277 4:
    278    or              t6,      t6,      t3   // dif |= next_bits
    279 9:
    280    st.w            t7,      a0,      28   // store cnt
    281    st.d            t6,      a0,      16   // store dif
    282    move            a0,      t8            // return value
    283 endfunc
    284 
    285 function msac_decode_bool_equi_lsx
        // Decodes one binary symbol without a probability argument: the
        // threshold is derived from rng alone.
        //   in:  a0 = context (byte offsets: 0 buf_pos, 8 buf_end, 16 dif,
        //             24 rng, 28 cnt)
        //   out: a0 = 1 when dif < (v << 48), otherwise 0, where
        //             v = ((rng >> 8) << 7) + 4
        // Updates dif, rng and cnt in the context, and buf_pos when it refills.
        // Writes t0-t8 and a5.
    286    ld.w            t0,      a0,      24   // rng
    287    ld.d            t1,      a0,      16   // dif
    288    ld.w            a5,      a0,      28   // cnt
    289    srli.w          t2,      t0,      8    // r >> 8
    290    slli.w          t2,      t2,      7    // (r >> 8) << 7
    291    addi.w          t2,      t2,      4    // v
    292 
    293    slli.d          t3,      t2,      48   // vw
    294    sltu            t4,      t1,      t3   // dif < vw
    295    move            t8,      t4            // ret
    296    xori            t4,      t4,      1    // t4 = (dif >= vw)
    297    maskeqz         t6,      t3,      t4   // vw if dif >= vw, else 0
    298    sub.d           t6,      t1,      t6   // dif
    299    slli.w          t5,      t2,      1
    300    sub.w           t5,      t0,      t5   // r - 2v
    301    maskeqz         t7,      t5,      t4   // r - 2v if dif >= vw, else 0
    302    add.w           t5,      t2,      t7   // new rng: r - v if dif >= vw, else v
    303 
    304    // renorm
    305    clz.w           t4,      t5            // d
    306    xori            t4,      t4,      16   // d = clz(rng) ^ 16
    307    sll.d           t6,      t6,      t4   // dif <<= d
    308    sll.w           t5,      t5,      t4   // rng <<= d
    309    sub.w           t7,      a5,      t4   // cnt-d
    310    st.w            t5,      a0,      24   // store rng
    311    bgeu            a5,      t4,      9f   // cnt >= d (unsigned): no refill needed
    312 
    313    // refill
    314    ld.d            t0,      a0,      0    // buf_pos
    315    ld.d            t1,      a0,      8    // buf_end
    316    addi.d          t2,      t0,      8    // buf_pos + 8
    317    bltu            t1,      t2,      2f   // fewer than 8 bytes left: tail path
    318 
    319    ld.d            t3,      t0,      0    // next_bits
    320    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
    321    nor             t3,      t3,      t3   // invert the loaded bytes
    322    sub.w           t2,      zero,    t1   // 48 - cnt
    323    revb.d          t3,      t3            // next_bits = bswap(next_bits)
    324    srli.w          t2,      t2,      3    // num_bytes_read
    325    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
    326    b               3f
    327 1:
           // buf_pos >= buf_end: no input left, buf_pos and cnt are not advanced
    328    addi.w          t3,      t7,      -48
    329    srl.d           t3,      t3,      t3   // pad with ones
    330    b               4f
    331 2:
    332    bgeu            t0,      t1,      1b   // buf_pos >= buf_end
    333    ld.d            t3,      t1,      -8   // next_bits: the 8 bytes ending at buf_end
    334    sub.w           t2,      t2,      t1   // (buf_pos + 8) - buf_end
    335    sub.w           t1,      t1,      t0   // num_bytes_left
    336    slli.w          t2,      t2,      3    // ... as a bit count
    337    srl.d           t3,      t3,      t2   // shift out the bytes that precede buf_pos
    338    addi.w          t2,      t7,      -48  // shift_bits, as in the 8-byte path
    339    nor             t3,      t3,      t3
    340    sub.w           t4,      zero,    t2
    341    revb.d          t3,      t3
    342    srli.w          t4,      t4,      3    // bytes wanted
    343    srl.d           t3,      t3,      t2
    344    sltu            t2,      t1,      t4   // num_bytes_left < bytes wanted
    345    maskeqz         t1,      t1,      t2   // num_bytes_left if smaller, else 0
    346    masknez         t2,      t4,      t2   // bytes wanted if not larger, else 0
    347    or              t2,      t2,      t1   // num_bytes_read
    348 3:
    349    slli.w          t1,      t2,      3    // num_bits_read
    350    add.d           t0,      t0,      t2   // buf_pos += num_bytes_read
    351    add.w           t7,      t7,      t1   // cnt += num_bits_read
    352    st.d            t0,      a0,      0    // store buf_pos
    353 4:
    354    or              t6,      t6,      t3   // dif |= next_bits
    355 9:
    356    st.w            t7,      a0,      28   // store cnt
    357    st.d            t6,      a0,      16   // store dif
    358    move            a0,      t8            // return value
    359 endfunc
    360 
    361 function msac_decode_bool_adapt_lsx
        // Decodes one binary symbol whose probability is read from cdf[0], then
        // adapts the cdf when the context allows it.
        //   in:  a0 = context (byte offsets: 0 buf_pos, 8 buf_end, 16 dif,
        //             24 rng, 28 cnt, 32 allow_update_cdf)
        //        a1 = cdf: cdf[0] is the probability, cdf[1] the adaptation counter
        //   out: a0 = bit = 1 when dif < (v << 48), otherwise 0, where
        //             v = (((rng >> 8) * (cdf[0] >> 6)) >> 1) + 4
        // Updates dif, rng and cnt in the context, buf_pos when it refills, and
        // cdf[0]/cdf[1] when allow_update_cdf is non-zero.
        // Writes t0-t8, a3, a4, a5 and a7.
    362    ld.hu           a3,      a1,      0    // cdf[0] /f
    363    ld.w            t0,      a0,      24   // rng
    364    ld.d            t1,      a0,      16   // dif
    365    srli.w          t2,      t0,      8    // r >> 8
    366    srli.w          a7,      a3,      6    // f >> 6
    367    mul.w           t2,      t2,      a7   // (r >> 8) * (f >> 6)
    368    ld.w            a4,      a0,      32   // allow_update_cdf
    369    ld.w            a5,      a0,      28   // cnt
    370    srli.w          t2,      t2,      1
    371    addi.w          t2,      t2,      4    // v
    372    slli.d          t3,      t2,      48   // vw
    373    sltu            t4,      t1,      t3   // dif < vw
    374    move            t8,      t4            // bit
    375    xori            t4,      t4,      1    // t4 = (dif >= vw)
    376    maskeqz         t6,      t3,      t4   // vw if dif >= vw, else 0
    377    sub.d           t6,      t1,      t6   // dif
    378    slli.w          t5,      t2,      1
    379    sub.w           t5,      t0,      t5   // r - 2v
    380    maskeqz         t7,      t5,      t4   // r - 2v if dif >= vw, else 0
    381    add.w           t5,      t2,      t7   // new rng: r - v if dif >= vw, else v
    382    beqz            a4,      .renorm       // allow_update_cdf == 0: leave the cdf untouched
    383 
    384    // update_cdf
           // bit == 0: cdf[0] -= cdf[0] >> rate
           // bit == 1: cdf[0] = (cdf[0] - 1) - ((cdf[0] - 1 - 32768) >> rate), arithmetic shift
    385    ld.hu           t0,      a1,      2    // cdf[1], the counter
    386    srli.w          t1,      t0,      4
    387    addi.w          t1,      t1,      4    // rate = (count >> 4) + 4
    388    sltui           t2,      t0,      32   // count < 32
    389    add.w           t0,      t0,      t2   // count + (count < 32)
    390    sub.w           a3,      a3,      t8   // cdf[0] -= bit
    391    slli.w          t4,      t8,      15   // bit * 32768
    392    sub.w           t7,      a3,      t4   // cdf[0] - bit - 32768 * bit
    393    sra.w           t7,      t7,      t1   // arithmetic shift right by rate
    394    sub.w           t7,      a3,      t7   // cdf[0]
    395    st.h            t7,      a1,      0    // store cdf[0]
    396    st.h            t0,      a1,      2    // store the counter
    397 
    398 .renorm:
    399    clz.w           t4,      t5            // d
    400    xori            t4,      t4,      16   // d = clz(rng) ^ 16
    401    sll.d           t6,      t6,      t4   // dif <<= d
    402    sll.w           t5,      t5,      t4   // rng <<= d
    403    sub.w           t7,      a5,      t4   // cnt-d
    404    st.w            t5,      a0,      24   // store rng
    405    bgeu            a5,      t4,      9f   // cnt >= d (unsigned): no refill needed
    406 
    407    // refill
    408    ld.d            t0,      a0,      0    // buf_pos
    409    ld.d            t1,      a0,      8    // buf_end
    410    addi.d          t2,      t0,      8    // buf_pos + 8
    411    bltu            t1,      t2,      2f   // fewer than 8 bytes left: tail path
    412 
    413    ld.d            t3,      t0,      0    // next_bits
    414    addi.w          t1,      t7,      -48  // shift_bits = cnt + 16 (- 64)
    415    nor             t3,      t3,      t3   // invert the loaded bytes
    416    sub.w           t2,      zero,    t1   // 48 - cnt
    417    revb.d          t3,      t3            // next_bits = bswap(next_bits)
    418    srli.w          t2,      t2,      3    // num_bytes_read
    419    srl.d           t3,      t3,      t1   // next_bits >>= (shift_bits & 63)
    420    b               3f
    421 1:
           // buf_pos >= buf_end: no input left, buf_pos and cnt are not advanced
    422    addi.w          t3,      t7,      -48
    423    srl.d           t3,      t3,      t3   // pad with ones
    424    b               4f
    425 2:
    426    bgeu            t0,      t1,      1b   // buf_pos >= buf_end
    427    ld.d            t3,      t1,      -8   // next_bits: the 8 bytes ending at buf_end
    428    sub.w           t2,      t2,      t1   // (buf_pos + 8) - buf_end
    429    sub.w           t1,      t1,      t0   // num_bytes_left
    430    slli.w          t2,      t2,      3    // ... as a bit count
    431    srl.d           t3,      t3,      t2   // shift out the bytes that precede buf_pos
    432    addi.w          t2,      t7,      -48  // shift_bits, as in the 8-byte path
    433    nor             t3,      t3,      t3
    434    sub.w           t4,      zero,    t2
    435    revb.d          t3,      t3
    436    srli.w          t4,      t4,      3    // bytes wanted
    437    srl.d           t3,      t3,      t2
    438    sltu            t2,      t1,      t4   // num_bytes_left < bytes wanted
    439    maskeqz         t1,      t1,      t2   // num_bytes_left if smaller, else 0
    440    masknez         t2,      t4,      t2   // bytes wanted if not larger, else 0
    441    or              t2,      t2,      t1   // num_bytes_read
    442 3:
    443    slli.w          t1,      t2,      3    // num_bits_read
    444    add.d           t0,      t0,      t2   // buf_pos += num_bytes_read
    445    add.w           t7,      t7,      t1   // cnt += num_bits_read
    446    st.d            t0,      a0,      0    // store buf_pos
    447 4:
    448    or              t6,      t6,      t3   // dif |= next_bits
    449 9:
    450    st.w            t7,      a0,      28   // store cnt
    451    st.d            t6,      a0,      16   // store dif
    452    move            a0,      t8            // return value
    453 endfunc
    454 
    455 .macro HI_TOK allow_update_cdf
        // Decode loop of msac_decode_hi_tok_lsx, expanded once per value of the
        // allow_update_cdf argument (1: adapt the cdf after every step, 0: never).
        // Each pass decodes one symbol val in 0..3 from the 4-entry cdf and
        // renormalises; the loop repeats only while val == 3, at most 4 passes.
        // Expected on entry (set up by msac_decode_hi_tok_lsx):
        //   a0 = context, a1 = cdf, sp = base of a 0x1a-byte scratch area
        //   vr0 = cdf[0..3], vr2 = 8 x rng, vr3 = 8 x (dif >> 48)
        //   vr4 = 8 x 0xff00, vr5 = min_prob halfwords 12..19, vr8 = all ones
        //   t1 = dif, t2 = cnt, t3 = 0xffffffe8 (loop control)
        // On exit: a0 = 3 + sum of the decoded val values; dif, cnt, rng (and
        // buf_pos after a refill) are stored to the context and the scratch
        // area is released. Writes a3, a4, t0-t7 and vr0-vr3, vr7, vr9.
    456 .\allow_update_cdf\()_hi_tok_lsx_start:
    457 .if \allow_update_cdf == 1
    458    ld.hu        a4,    a1,    0x06 // cdf[3], the adaptation counter
    459 .endif
    460    vor.v        vr1,   vr0,   vr0  // copy of cdf
    461    vsrli.h      vr1,   vr1,   0x06 // cdf[val] >> EC_PROB_SHIFT
    462    vstelm.h     vr2,   sp,    0, 0 // sp[0] = rng (original sp - 0x1a)
    463    vand.v       vr2,   vr2,   vr4  // (8 x rng) & 0xff00
    464    vslli.h      vr1,   vr1,   0x07
    465    vmuh.hu      vr1,   vr1,   vr2  // high half of the product
    466    vadd.h       vr1,   vr1,   vr5 // v += EC_MIN_PROB/* 4 */ * ((unsigned)n_symbols/* 3 */ - val);
    467    vst          vr1,   sp,    0x02 // v[] at sp + 2 (original sp - 0x18)
    468    vssub.hu     vr1,   vr1,   vr3 // v - c, saturating at 0
    469    vseqi.h      vr1,   vr1,   0    // lane mask: v <= c
    470 .if \allow_update_cdf == 1
    471    addi.d       t4,    a4,    0x50
    472    srli.d       t4,    t4,    0x04 // rate = (count + 0x50) >> 4 = (count >> 4) + 5
    473    sltui        t7,    a4,    32
    474    add.w        a4,    a4,    t7   // count + (count < 32)
    475 
    476    vreplgr2vr.h vr7,   t4          // rate in every lane
    477    vavgr.hu     vr9,   vr8,   vr1  // rounded average with 0xffff: mask lanes give -1, others 32768
    478    vsub.h       vr9,   vr9,   vr0
    479    vsub.h       vr0,   vr0,   vr1  // cdf - mask
    480    vsra.h       vr9,   vr9,   vr7
    481    vadd.h       vr0,   vr0,   vr9  // adapted cdf, kept in vr0 for the next pass
    482    vstelm.d     vr0,   a1,    0,  0 // write back the low 8 bytes
    483    st.h         a4,    a1,    0x06 // store the counter over lane 3
    484 .endif
    485    vmsknz.b     vr7,   vr1         // one bit per non-zero byte, two per halfword lane
    486    movfr2gr.s   t4,    f7
    487    ctz.w        t4,    t4 // loop_times * 2 (2 * val)
    488    addi.d       t7,    t4,    2
    489    ldx.hu       t6,    sp,    t4  // u = v[val - 1], or rng when val == 0
    490    ldx.hu       t5,    sp,    t7  // v = v[val]
    491    addi.w       t3,    t3,    0x05
    492    addi.w       t4,    t4,   -0x05 // 2 * val - 5: positive only when val == 3
    493    sub.w        t6,    t6,    t5   // u - v , rng for ctx_norm
    494    slli.d       t5,    t5,    0x30 //  (ec_win)v << (EC_WIN_SIZE - 16)
    495    sub.d        t1,    t1,    t5   //  s->dif - ((ec_win)v << (EC_WIN_SIZE - 16))
    496    // Init ctx_norm  param
    497    clz.w        t7,    t6
    498    xori         t7,    t7,    0x1f
    499    xori         t7,    t7,    0x0f //  d = 15 ^ (31 ^ clz(rng));
    500    sll.d        t1,    t1,    t7   //  dif << d
    501    sll.d        t6,    t6,    t7   //  rng << d
    502    // update vr2 8 x rng
    503    vreplgr2vr.h vr2,   t6
    504    vreplvei.h   vr2,   vr2,   0
    505    st.w         t6,    a0,    0x18 // store rng
    506    move         t0,    t2          // keep the old cnt for the compare below
    507    sub.w        t2,    t2,    t7   // cnt - d
    508    bgeu         t0,    t7,    .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end     // if ((unsigned)cnt < (unsigned)d)  goto ctx_norm_end
    509    // Step into ctx_fill
    510    ld.d         t5,    a0,    0x00 // buf_pos
    511    ld.d         t6,    a0,    0x08 // end_pos
    512    addi.d       t7,    t5,    0x08 // buf_pos + 8
    513    sub.d        t7,    t7,    t6   // (buf_pos + 8) - end_pos
    514    blt          zero,  t7,    .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob
    515    // (end_pos - buf_pos) >= 8
    516    ld.d         t6,    t5,    0x00 // load buf_pos[0]~buf_pos[7]
    517    addi.w       t7,    t2,   -0x30 // cnt - 0x30
    518    nor          t6,    t6,    t6   // not buf data
    519    revb.d       t6,    t6          // Byte reversal
    520    srl.d        t6,    t6,    t7   // Replace left shift with right shift
    521    sub.w        t7,    zero,  t7   // neg
    522    srli.w       t7,    t7,    0x03 // Loop times
    523    or           t1,    t1,    t6   // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c
    524    b            .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
    525 .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_eob:
           // Pointer ordering is unsigned, so compare with bgeu as the other
           // refill paths in this file do.
    526    bgeu         t5,    t6,    .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one
    527    // end_pos - buf_pos < 8 && buf_pos < end_pos
    528    ld.d         t0,    t6,   -0x08 // the 8 bytes ending at end_pos
    529    slli.d       t7,    t7,    0x03 // bytes loaded from before buf_pos, as a bit count
    530    srl.d        t6,    t0,    t7   // Retrieve the buf data and remove the excess data
    531    addi.w       t7,    t2,   -0x30 // cnt - 0x30
    532    nor          t6,    t6,    t6   // not
    533    revb.d       t6,    t6          // Byte reversal
    534    srl.d        t6,    t6,    t7   // Replace left shift with right shift
    535    sub.w        t7,    zero,  t7   // neg
    536    or           t1,    t1,    t6   // dif |= (ec_win)(*buf_pos++ ^ 0xff) << c
    537    ld.d         t6,    a0,    0x08 // end_pos
    538    srli.w       t7,    t7,    0x03 // Loop times
    539    sub.d        t6,    t6,    t5   // end_pos - buf_pos
    540    slt          t0,    t6,    t7
    541    maskeqz      a3,    t6,    t0   // min(loop_times, end_pos - buf_pos)
    542    masknez      t0,    t7,    t0
    543    or           t7,    a3,    t0
    544    b            .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end
    545 .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_one:
    546    // buf_pos >= end_pos
    547    addi.w       t7,    t2,   -0x10
    548    andi         t7,    t7,    0xf  // (cnt - d) & 15
    549    nor          t0,    zero,  zero // all ones
    550    srl.d        t0,    t0,    t7
    551    or           t1,    t1,    t0 // dif |= ~(~(ec_win)0xff << c);
    552    b            .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end
    553 .\allow_update_cdf\()_hi_tok_lsx_ctx_refill_end:
    554    add.d        t5,    t5,    t7        // buf_pos + Loop_times
    555    st.d         t5,    a0,    0x00      // Store buf_pos
    556    alsl.w       t2,    t7,    t2,  0x03 // update cnt
    557 .\allow_update_cdf\()_hi_tok_lsx_ctx_norm_end:
    558    srli.d       t7,    t1,    0x30
    559    vreplgr2vr.h vr3,   t7        // broadcast the high 16 bits of dif
    560    add.w        t3,    t4,    t3 // update control parameter: net effect per pass is t3 += 2 * val
    561    beqz         t3,    .\allow_update_cdf\()_hi_tok_lsx_end // control loop for at most 4 times.
    562    blt          zero,  t4,    .\allow_update_cdf\()_hi_tok_lsx_start // tok_br == 3
    563 .\allow_update_cdf\()_hi_tok_lsx_end:
    564    addi.d       t3,    t3,    0x1e
    565    st.d         t1,    a0,    0x10 // store dif
    566    st.w         t2,    a0,    0x1c // store cnt
    567    srli.w       a0,    t3,    0x01 // tok = (t3 + 0x1e) >> 1
    568    addi.d       sp,    sp,    0x1a // release the scratch area
    569 .endm
    570 
    571 /**
    572 * Equivalent C signature: unsigned dav1d_msac_decode_hi_tok_c(MsacContext *const s, uint16_t *const cdf)
        * a0 = s, a1 = cdf; the result is returned in a0.
    573 * * Reg Allocation
    574 * * vr0: cdf;
    575 * * vr1: temp;
    576 * * vr2: rng;
    577 * * vr3: dif;
    578 * * vr4: const 0xff00ff00...ff00ff00;
    579 * * vr5: const 0x0004080c;
    580 * * vr6: const 0;
        * * vr7, vr9: temp;
        * * vr8: const all ones;
    581 * * t0: allow_update_cdf, tmp;
    582 * * t1: dif;
    583 * * t2: cnt;
    584 * * t3: 0xffffffe8, outermost control parameter;
    585 * * t4: 2 * decoded symbol, then loop control
    586 * * t5: v, buf_pos, temp;
    587 * * t6: u, rng, end_pos, buf, temp;
    588 * * t7: temp;
        * * a3: temp; a4: cdf[3] counter (update path only);
        *
        * NOTE(review): the scratch area is 0x1a bytes, which is not a multiple of
        * 16, so sp is not 16-byte aligned while this leaf routine runs - confirm
        * this is acceptable for the target ABI.
    589 */
    590 function msac_decode_hi_tok_lsx
    591    fld.d     f0,    a1,   0    // Load cdf[0]~cdf[3]
    592    vldrepl.h vr2,   a0,   0x18 //  8 x rng, assert(rng <= 65535U), only the lower 16 bits are valid
    593    vldrepl.h vr3,   a0,   0x16 // broadcast the high 16 bits of dif, c = s->dif >> (EC_WIN_SIZE - 16)
    594    ld.w      t0,    a0,   0x20 // allow_update_cdf
    595    la.local  t7,    ph_0xff00
    596    vld       vr4,   t7,   0x00 // 0xff00ff00...ff00ff00
    597    la.local  t7,    min_prob
    598    vld       vr5,   t7,   12 * 2 // 0x0004080c; the 16-byte load extends 8 bytes past the 32-byte min_prob table
    599    vxor.v    vr6,   vr6,  vr6    // const 0
    600    ld.d      t1,    a0,   0x10   // dif
    601    ld.w      t2,    a0,   0x1c   // cnt
    602    orn       t3,    t3,   t3     // t3 | ~t3 = all ones, whatever t3 held
    603    srli.d    t3,    t3,   32     // 0xffffffff
    604    addi.d    t3,    t3,  -0x17 // 0xffffffe8
    605    vseq.h    vr8,   vr8,  vr8    // all ones
    606    addi.d    sp,    sp,  -0x1a // alloc stack
    607    beqz      t0,    .hi_tok_lsx_no_update_cdf
    608    HI_TOK 1
    609    jirl      zero,  ra,   0x0    // return from the cdf-updating variant
    610 .hi_tok_lsx_no_update_cdf:
    611    HI_TOK 0
    612 endfunc