tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

msac.S (24283B)


      1 /*
      2 * Copyright © 2019, VideoLAN and dav1d authors
      3 * Copyright © 2019, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 
        // Byte offsets of the context fields that this file accesses through x0.
     31 #define BUF_POS 0
     32 #define BUF_END 8
     33 #define DIF 16
     34 #define RNG 24
     35 #define CNT 28
     36 #define ALLOW_UPDATE_CDF 32
     37 
        // COEFFS_BASE_OFFSET is the byte offset of the last 16-bit entry in the
        // first row of coeffs. The decoders subtract 2*n_symbols from that address
        // so that the loaded lanes read 4*n_symbols, ..., 8, 4 followed by zeros.
        // MASKS8_OFFSET is the distance from that same base address to the masks8
        // row, which begins 64 bytes into coeffs.
     38 #define COEFFS_BASE_OFFSET 30
     39 #define MASKS8_OFFSET (64-COEFFS_BASE_OFFSET)
     40 
        // Constant table used by the symbol decoders in this file.
        //   row 0: 4*k for k = 15 down to 0
        //   row 1: sixteen zeros, read by vector loads that start inside row 0
        //          and run past its end
        //   row 2: masks8, loaded only by the n == 8 variant of decode_update
     41 const coeffs
     42        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
     43        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
     44        // masks8
     45        .short -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, -0x202, 0xF0E
     46 endconst
     47 
        // ld1_n d0, d1, src, sz, n
        // Vector load from [src]: into d0 alone when n <= 8, into the register
        // pair d0, d1 otherwise. sz is the arrangement suffix appended to each
        // register name.
     48 .macro ld1_n d0, d1, src, sz, n
     49 .if \n <= 8
     50        ld1             {\d0\sz},  [\src]
     51 .else
     52        ld1             {\d0\sz, \d1\sz},  [\src]
     53 .endif
     54 .endm
     55 
        // st1_n s0, s1, dst, sz, n
        // Vector store to [dst]: s0 alone when n <= 8, the register pair s0, s1
        // otherwise. Store counterpart of ld1_n; sz is the arrangement suffix.
     56 .macro st1_n s0, s1, dst, sz, n
     57 .if \n <= 8
     58        st1             {\s0\sz},  [\dst]
     59 .else
     60        st1             {\s0\sz, \s1\sz},  [\dst]
     61 .endif
     62 .endm
     63 
        // ushr_n d0, d1, s0, s1, shift, sz, n
        // Per-lane unsigned right shift by the immediate shift: d0 = s0 >> shift,
        // and additionally d1 = s1 >> shift when n == 16.
     64 .macro ushr_n d0, d1, s0, s1, shift, sz, n
     65        ushr            \d0\sz,  \s0\sz,  \shift
     66 .if \n == 16
     67        ushr            \d1\sz,  \s1\sz,  \shift
     68 .endif
     69 .endm
     70 
        // add_n d0, d1, s0, s1, s2, s3, sz, n
        // Per-lane add: d0 = s0 + s2, and additionally d1 = s1 + s3 when n == 16.
     71 .macro add_n d0, d1, s0, s1, s2, s3, sz, n
     72        add             \d0\sz,  \s0\sz,  \s2\sz
     73 .if \n == 16
     74        add             \d1\sz,  \s1\sz,  \s3\sz
     75 .endif
     76 .endm
     77 
        // sub_n d0, d1, s0, s1, s2, s3, sz, n
        // Per-lane subtract: d0 = s0 - s2, and additionally d1 = s1 - s3 when
        // n == 16.
     78 .macro sub_n d0, d1, s0, s1, s2, s3, sz, n
     79        sub             \d0\sz,  \s0\sz,  \s2\sz
     80 .if \n == 16
     81        sub             \d1\sz,  \s1\sz,  \s3\sz
     82 .endif
     83 .endm
     84 
        // and_n d0, d1, s0, s1, s2, s3, sz, n
        // Bitwise AND: d0 = s0 & s2, and additionally d1 = s1 & s3 when n == 16.
        // The caller in this file passes a byte arrangement for sz.
     85 .macro and_n d0, d1, s0, s1, s2, s3, sz, n
     86        and             \d0\sz,  \s0\sz,  \s2\sz
     87 .if \n == 16
     88        and             \d1\sz,  \s1\sz,  \s3\sz
     89 .endif
     90 .endm
     91 
        // cmhs_n d0, d1, s0, s1, s2, s3, sz, n
        // Per-lane unsigned compare: each lane of d0 becomes all ones where
        // s0 >= s2 and zero elsewhere; d1 is computed from s1, s3 when n == 16.
     92 .macro cmhs_n d0, d1, s0, s1, s2, s3, sz, n
     93        cmhs            \d0\sz,  \s0\sz,  \s2\sz
     94 .if \n == 16
     95        cmhs            \d1\sz,  \s1\sz,  \s3\sz
     96 .endif
     97 .endm
     98 
        // sshl_n d0, d1, s0, s1, s2, s3, sz, n
        // Per-lane signed shift of s0 by the signed count held in s2 (a negative
        // count shifts right), result in d0; d1 is computed from s1, s3 when
        // n == 16. The caller in this file passes a negated rate as the count.
     99 .macro sshl_n d0, d1, s0, s1, s2, s3, sz, n
    100        sshl            \d0\sz,  \s0\sz,  \s2\sz
    101 .if \n == 16
    102        sshl            \d1\sz,  \s1\sz,  \s3\sz
    103 .endif
    104 .endm
    105 
        // sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
        // Per-lane signed saturating doubling multiply returning the high half:
        // d0 from s0, s2, and additionally d1 from s1, s3 when n == 16.
    106 .macro sqdmulh_n d0, d1, s0, s1, s2, s3, sz, n
    107        sqdmulh         \d0\sz,  \s0\sz,  \s2\sz
    108 .if \n == 16
    109        sqdmulh         \d1\sz,  \s1\sz,  \s3\sz
    110 .endif
    111 .endm
    112 
        // str_n idx0, idx1, dstreg, dstoff, n
        // Store register idx0 at [dstreg, dstoff]; when n == 16 also store idx1
        // 16 bytes further on. dstoff is pasted textually in front of "+ 16", so
        // it has to be an immediate expression.
    113 .macro str_n            idx0, idx1, dstreg, dstoff, n
    114        str             \idx0,  [\dstreg, \dstoff]
    115 .if \n == 16
    116        str             \idx1,  [\dstreg, \dstoff + 16]
    117 .endif
    118 .endm
    119 
    120 // unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
    121 //                                               size_t n_symbols);
        //
        // Register interface as used by the code below:
        //   x0 = s, x1 = cdf, x2 = n_symbols; the decoded symbol is returned in w0.
        // When the word at ALLOW_UPDATE_CDF is nonzero, the CDF lanes and the
        // counter stored at cdf[n_symbols] are adapted in place.
        //
        // The function body is one expansion of the decode_update macro defined
        // right after the function directive, followed by L(refill).
    122 
    123 function msac_decode_symbol_adapt4_neon, export=1
        // decode_update sz, szb, n
        //   sz  - halfword arrangement suffix used for lane arithmetic
        //   szb - byte arrangement suffix of the same width, used for bitwise ops
        //   n   - number of 16-bit CDF lanes handled (4, 8 or 16 at the call sites)
        // Every variant ends by either returning directly or branching to
        // L(refill) with cnt in w6, dif in x7 and the symbol in w15.
        // The n == 16 variant reserves 48 bytes of stack: the original rng is
        // stored at [sp, #14] and the sixteen v values follow from [sp, #16], so
        // that v[ret] and the halfword preceding it can be loaded by index.
    124 .macro decode_update sz, szb, n
    125 .if \n == 16
    126        sub             sp,  sp,  #48
    127 .endif
    128        add             x8,  x0,  #RNG
    129        ld1_n           v0,  v1,  x1,  \sz, \n                    // cdf
    130        ld1r            {v29\sz}, [x8]                            // rng
    131        movrel          x9,  coeffs, COEFFS_BASE_OFFSET
    132        movi            v31\sz, #0x7f, lsl #8                     // 0x7f00
    133        sub             x10, x9,  x2, lsl #1                      // table address for this n_symbols
    134        mvni            v30\sz, #0x3f                             // 0xffc0
    135        and             v7\szb, v29\szb, v31\szb                  // rng & 0x7f00
    136 .if \n == 16
    137        str             h29, [sp, #14]                            // store original u = s->rng
    138 .endif
    139        and_n           v2,  v3,  v0,  v1,  v30, v30, \szb, \n    // cdf & 0xffc0
    140 
    141        ld1_n           v4,  v5,  x10, \sz, \n                    // EC_MIN_PROB * (n_symbols - ret)
    142        sqdmulh_n       v6,  v7,  v2,  v3,  v7,  v7,  \sz, \n     // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
    143        ldr             d28, [x0, #DIF]
    144 
    145        add_n           v4,  v5,  v2,  v3,  v4,  v5,  \sz, \n     // v = cdf + EC_MIN_PROB * (n_symbols - ret)
    146        add_n           v4,  v5,  v6,  v7,  v4,  v5,  \sz, \n     // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
    147 
    148        dup             v30\sz, v28.h[3]                          // dif >> (EC_WIN_SIZE - 16)
    149 .if \n == 8
    150        ldur            q31, [x9, #MASKS8_OFFSET]                 // masks8 row of coeffs
    151 .elseif \n == 16
    152        str_n           q4,  q5,  sp, #16, \n                     // store v values to allow indexed access
    153 .endif
    154 
    155        // After the condition starts being true it continues, such that the vector looks like:
    156        //   0, 0, 0 ... -1, -1
    157        cmhs_n          v2,  v3,  v30, v30, v4,  v5,  \sz,  \n    // c >= v
    158 .if \n == 4
    159        ext             v29\szb, v29\szb, v4\szb, #6              // u
    160        umov            x15, v2.d[0]
    161        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
    162        rev             x15, x15
    163        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
    164        // rev + clz = count trailing zeros
    165        clz             x15, x15                                  // 16*ret
    166 .elseif \n == 8
    167        // The final short of the compare is always set.
    168        // Using addv, subtract -0x202*ret from this value to create a lookup table for a short.
    169        //  For n == 8:
    170        // -0x202 + -0x202 + ... + 0xF0E
    171        //                    (0x202*7) | (1 << 8)
    172        //                                    ^-------offset for second byte of the short
    173        and             v31\szb, v31\szb, v2\szb
    174        ext             v29\szb, v29\szb, v4\szb, #14             // u
    175        addv            h31, v31\sz                               // ((2*ret + 1) << 8) | (2*ret)
    176        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
    177        sub             v30\sz, v30\sz, v4\sz                     // (dif >> 48) - v
    178        smov            w15, v31.b[0]                             // 2*ret
    179        sub             v29\sz, v29\sz, v4\sz                     // rng = u-v
    180 .elseif \n == 16
    181        add             v6\sz,  v2\sz,  v3\sz
    182        addv            h31, v6\sz                                // -n + ret
    183        ldr             w4,  [x0, #ALLOW_UPDATE_CDF]
    184        smov            w15, v31.h[0]
    185 .endif
    186 
               // Skip the adaptation when updates are disabled.
    187        cbz             w4,  0f
    188 
    189        // update_cdf
    190        ldrh            w3,  [x1, x2, lsl #1]                     // count = cdf[n_symbols]
    191 .if \n == 16
    192        // 16 case has a lower bound that guarantees n_symbols > 2
    193        mov             w4,  #-5
    194 .elseif \n == 8
    195        mvn             w14, w2
    196        mov             w4,  #-4
    197        cmn             w14, #3                                   // set C if n_symbols <= 2
    198 .else
    199        // if n_symbols < 4 (or < 6 even) then
    200        //   (1 + n_symbols) >> 2 == n_symbols > 2
    201        add             w14, w2,  #17                             // (1 + n_symbols) + (4 << 2)
    202 .endif
    203        sub_n           v16, v17, v0,  v1,  v2,  v3,  \sz, \n     // cdf + (i >= val ? 1 : 0)
               // Compare lanes are 0 or -1; setting bit 15 turns them into 32768 or -1.
    204        orr             v2\sz, #0x80, lsl #8
    205 .if \n == 16
    206        orr             v3\sz, #0x80, lsl #8
    207 .endif
    208 .if \n == 16
    209        sub             w4,  w4,  w3, lsr #4                      // -((count >> 4) + 5)
    210 .elseif \n == 8
    211        lsr             w14, w3,  #4                              // count >> 4
    212        sbc             w4,  w4,  w14                             // -((count >> 4) + (n_symbols > 2) + 4)
    213 .else
    214        neg             w4, w14, lsr #2                           // -((n_symbols > 2) + 4)
    215        sub             w4,  w4,  w3,  lsr #4                     // -((count >> 4) + (n_symbols > 2) + 4)
    216 .endif
    217        sub_n           v2,  v3,  v2,  v3,  v0,  v1,  \sz, \n     // (32768 - cdf[i]) or (-1 - cdf[i])
    218        dup             v6\sz,    w4                              // -rate
    219 
    220        sub             w3,  w3,  w3, lsr #5                      // count - (count == 32)
    221        sshl_n          v2,  v3,  v2,  v3,  v6,  v6,  \sz, \n     // ({32768,-1} - cdf[i]) >> rate
    222        add             w3,  w3,  #1                              // count + (count < 32)
    223        add_n           v0,  v1,  v16, v17, v2,  v3,  \sz, \n     // cdf + (32768 - cdf[i]) >> rate
    224        st1_n           v0,  v1,  x1,  \sz, \n
    225        strh            w3,  [x1, x2, lsl #1]
    226 
    227 0:
    228        // renorm
    229 .if \n == 4
    230        ldr             w6,  [x0, #CNT]
    231        ldr             x7,  [x0, #DIF]
    232        mov             x4,  v29.d[0]          // rng (packed)
    233        mov             x3,  v4.d[0]           // v (packed)
    234 
    235        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
    236        //  garbage in the remaining bits, but we can work around this.
    237        lsr             x4,  x4,  x15          // rng
    238        lsr             x3,  x3,  x15          // v
    239        lsl             w5,  w4,  #16          // rng << 16
    240        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
    241        clz             w5,  w5                // d = clz(rng << 16)
    242        lsl             w4,  w4,  w5           // rng << d
    243        subs            w6,  w6,  w5           // cnt -= d
    244        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
               // Halfword store: only the low 16 bits of w4 hold the new rng.
    245        strh            w4,  [x0, #RNG]
    246        b.lo            1f
    247        str             w6,  [x0, #CNT]
    248        str             x7,  [x0, #DIF]
    249        lsr             w0,  w15, #4
    250        ret
    251 1:
    252        lsr             w15, w15, #4
    253        b L(refill)
    254 .elseif \n == 8
    255        ldr             w6,  [x0, #CNT]
               // v31 holds the byte indices 2*ret and 2*ret + 1, so each tbl moves
               // halfword number ret into lane 0.
    256        tbl             v30.8b, {v30.16b}, v31.8b
    257        tbl             v29.8b, {v29.16b}, v31.8b
    258        ins             v28.h[3], v30.h[0]     // dif - (v << 48)
    259        clz             v0.4h,  v29.4h         // d = clz(rng)
    260        umov            w5,  v0.h[0]
    261        ushl            v29.4h, v29.4h, v0.4h  // rng << d
    262 
    263        // The vec for clz(rng) is filled with garbage after the first short,
    264        //  but ushl/sshl conveniently uses only the first byte for the shift
    265        //  amount.
    266        ushl            d28, d28, d0           // (dif - (v << 48)) << d
    267 
    268        subs            w6,  w6,  w5           // cnt -= d
    269        str             h29, [x0, #RNG]
    270        b.lo            1f
    271        str             w6,  [x0, #CNT]
    272        str             d28, [x0, #DIF]
    273        lsr             w0,  w15, #1           // ret
    274        ret
    275 1:
    276        lsr             w15, w15, #1           // ret
               // L(refill) expects dif in x7.
    277        mov             x7, v28.d[0]
    278        b L(refill)
    279 .elseif \n == 16
               // w15 = ret - 16, so [x8, #48] is the stack slot of v[ret] and
               // [x8, #46] the halfword before it (u itself when ret == 0).
    280        add             x8,  sp,  w15, sxtw #1
    281        ldrh            w3,  [x8, #48]         // v
    282        ldurh           w4,  [x8, #46]         // u
    283        ldr             w6,  [x0, #CNT]
    284        ldr             x7,  [x0, #DIF]
    285        sub             w4,  w4,  w3           // rng = u - v
    286        clz             w5,  w4                // clz(rng)
    287        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
    288        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
    289        lsl             w4,  w4,  w5           // rng << d
    290        subs            w6,  w6,  w5           // cnt -= d
    291        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
    292        str             w4,  [x0, #RNG]
               // Release the scratch area; add leaves the flags from subs intact.
    293        add             sp,  sp,  #48
    294        b.lo            1f
    295        str             w6,  [x0, #CNT]
    296        str             x7,  [x0, #DIF]
    297        add             w0,  w15, #\n          // ret
    298        ret
    299 1:
    300        add             w15, w15, #\n          // ret
    301        b L(refill)
    302 .endif
    303 .endm
    304 
    305        decode_update   .4h, .8b, 4
    306 
        // Shared refill tail. Entered with x0 = s, w6 = cnt after the renorm
        // subtraction, x7 = shifted dif and w15 = the value to return in w0.
        // Reached from the decode_update expansions and from the bool decoders.
    307 L(refill):
    308        // refill
    309        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
    310        add             x5,  x3,  #8
    311        subs            x5,  x5,  x4
    312        b.hi            6f                     // BUF_POS + 8 > BUF_END
    313 
    314        ldr             x8,  [x3]              // next_bits
    315        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
    316        mvn             x8,  x8
    317        neg             w5,  w4
    318        rev             x8,  x8                // next_bits = bswap(next_bits)
    319        lsr             w5,  w5,  #3           // num_bytes_read
    320        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)
    321 
    322 2:      // refill_end
    323        add             x3,  x3,  x5
    324        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
    325        str             x3,  [x0, #BUF_POS]
    326 
    327 3:      // refill_end2
    328        orr             x7,  x7,  x8           // dif |= next_bits
    329 
    330 4:      // end
    331        str             w6,  [x0, #CNT]
    332        str             x7,  [x0, #DIF]
    333 
    334        mov             w0,  w15
    335        ret
    336 
    337 5:      // pad_with_ones
    338        add             w8,  w6,  #-16
    339        ror             x8,  x8,  x8
    340        b               3b
    341 
    342 6:      // refill_eob
    343        cmp             x3,  x4
    344        b.hs            5b                     // BUF_POS >= BUF_END: nothing left to read
    345 
               // x5 = BUF_POS + 8 - BUF_END here. Load the 8 bytes that end at
               // BUF_END and shift out the x5 bytes located before BUF_POS.
    346        ldr             x8,  [x4, #-8]
    347        lsl             w5,  w5,  #3
    348        lsr             x8,  x8,  x5
    349        add             w5,  w6,  #-48
    350        mvn             x8,  x8
    351        sub             w4,  w4,  w3           // num_bytes_left
    352        rev             x8,  x8
    353        lsr             x8,  x8,  x5
    354        neg             w5,  w5
    355        lsr             w5,  w5,  #3
    356        cmp             w5,  w4
    357        csel            w5,  w5,  w4,  lo      // num_bytes_read
    358        b               2b
    359 endfunc
    360 
        // Eight-lane expansion of decode_update (one 8h vector register).
        // Uses the same registers as the adapt4 entry point: x0 = context,
        // x1 = cdf, x2 = n_symbols, result in w0. The slow path branches to
        // L(refill), which is defined inside msac_decode_symbol_adapt4_neon.
    361 function msac_decode_symbol_adapt8_neon, export=1
    362        decode_update   .8h, .16b, 8
    363 endfunc
    364 
        // Sixteen-lane expansion of decode_update (two 8h vector registers).
        // Uses the same registers as the adapt4 entry point: x0 = context,
        // x1 = cdf, x2 = n_symbols, result in w0. This variant temporarily
        // lowers sp by 48 bytes for scratch storage and restores it before
        // returning or branching to L(refill).
    365 function msac_decode_symbol_adapt16_neon, export=1
    366        decode_update   .8h, .16b, 16
    367 endfunc
    368 
        // msac_decode_hi_tok_neon: x0 = context pointer, x1 = cdf; result in w0.
        //
        // Repeats a 4-lane adaptive symbol decode with n_symbols fixed at 3 (the
        // adaptation counter is the halfword at cdf + 6). w13 gains 16*ret per
        // iteration on top of a bias chosen so that the adds at label 5 carries
        // out once a symbol below 3 is decoded or the running total reaches 15.
        // The value returned is 3 plus the sum of the decoded symbols.
        //
        // Live across the loop: v0 = cdf, v17 = cdf & 0xffc0, v3 = rng in every
        // lane, v1 = top 16 bits of dif in every lane, v29 = per-lane minimum
        // terms from coeffs, w9 = count, w10 = allow_update_cdf flag, w6 = cnt,
        // x7 = dif.
        // Labels 3 to 7 repeat the instruction sequence of L(refill), but fall
        // through to the loop tail at label 5 instead of returning.
    369 function msac_decode_hi_tok_neon, export=1
    370        ld1             {v0.4h},  [x1]            // cdf
    371        add             x16, x0,  #RNG
    372        movi            v31.4h, #0x7f, lsl #8     // 0x7f00
    373        movrel          x17, coeffs, COEFFS_BASE_OFFSET-2*3
    374        mvni            v30.4h, #0x3f             // 0xffc0
    375        ldrh            w9,  [x1, #6]             // count = cdf[n_symbols]
    376        ld1r            {v3.4h},  [x16]           // rng
    377        ld1             {v29.4h}, [x17]           // EC_MIN_PROB * (n_symbols - ret)
    378        add             x17, x0,  #DIF + 6
    379        mov             w13, #-24*8
    380        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
    381        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
    382        ld1r            {v1.8h},  [x17]           // dif >> (EC_WIN_SIZE - 16)
    383        ldr             w6,  [x0, #CNT]
    384        ldr             x7,  [x0, #DIF]
    385 1:
    386        and             v7.8b,   v3.8b,   v31.8b  // rng & 0x7f00
    387        sqdmulh         v6.4h,   v17.4h,  v7.4h   // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
    388        add             v4.4h,   v17.4h,  v29.4h  // v = cdf + EC_MIN_PROB * (n_symbols - ret)
    389        add             v4.4h,   v6.4h,   v4.4h   // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
    390        cmhs            v2.4h,   v1.4h,   v4.4h   // c >= v
    391        add             w13, w13, #5*8
    392        ext             v18.8b, v3.8b,  v4.8b, #6 // u
    393        umov            x15, v2.d[0]
    394        rev             x15, x15
    395        sub             v18.4h, v18.4h, v4.4h     // rng = u-v
    396        // rev + clz = count trailing zeros
    397        clz             x15, x15                  // 16*ret
    398 
    399        cbz             w10, 2f
    400        // update_cdf
    401        sub             v5.4h,   v0.4h,   v2.4h   // cdf[i] + (i >= val ? 1 : 0)
    402        mov             w4,  #-5
    403        orr             v2.4h, #0x80, lsl #8      // i >= val ? -1 : 32768
    404        sub             w4,  w4,  w9, lsr #4      // -((count >> 4) + 5)
    405        sub             v2.4h,   v2.4h,   v0.4h   // (32768 - cdf[i]) or (-1 - cdf[i])
    406        dup             v6.4h,    w4              // -rate
    407 
    408        sub             w9,  w9,  w9, lsr #5      // count - (count == 32)
    409        sshl            v2.4h,   v2.4h,   v6.4h   // ({32768,-1} - cdf[i]) >> rate
    410        add             w9,  w9,  #1              // count + (count < 32)
    411        add             v0.4h,   v5.4h,   v2.4h   // cdf[i] + (32768 - cdf[i]) >> rate
    412        st1             {v0.4h},  [x1]
               // Refresh the masked copy so the next iteration uses the updated cdf.
    413        and             v17.8b,  v0.8b,   v30.8b  // cdf & 0xffc0
    414        strh            w9,  [x1, #6]
    415 
    416 2:
    417        mov             x4,  v18.d[0]          // rng (packed)
    418        mov             x3,  v4.d[0]           // v (packed)
    419 
    420        // Shift 'v'/'rng' for ret into the 16 least sig bits. There is
    421        //  garbage in the remaining bits, but we can work around this.
    422        lsr             x4,  x4,  x15          // rng
    423        lsr             x3,  x3,  x15          // v
    424        lsl             w5,  w4,  #16          // rng << 16
    425        sub             x7,  x7,  x3, lsl #48  // dif - (v << 48)
    426        clz             w5,  w5                // d = clz(rng << 16)
    427        lsl             w4,  w4,  w5           // rng << d
    428        subs            w6,  w6,  w5           // cnt -= d
    429        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
    430        strh            w4,  [x0, #RNG]
               // Broadcast the new rng for the next iteration; dup takes the low
               // 16 bits of w4 for each halfword lane.
    431        dup             v3.4h,   w4
    432        b.hs            5f
    433 
    434        // refill
    435        ldp             x3,  x4,  [x0]         // BUF_POS, BUF_END
    436        add             x5,  x3,  #8
    437        subs            x5,  x5,  x4
    438        b.hi            7f
    439 
    440        ldr             x8,  [x3]              // next_bits
    441        add             w4,  w6,  #-48         // shift_bits = cnt + 16 (- 64)
    442        mvn             x8,  x8
    443        neg             w5,  w4
    444        rev             x8,  x8                // next_bits = bswap(next_bits)
    445        lsr             w5,  w5,  #3           // num_bytes_read
    446        lsr             x8,  x8,  x4           // next_bits >>= (shift_bits & 63)
    447 
    448 3:      // refill_end
    449        add             x3,  x3,  x5
    450        add             w6,  w6,  w5, lsl #3   // cnt += num_bits_read
    451        str             x3,  [x0, #BUF_POS]
    452 
    453 4:      // refill_end2
    454        orr             x7,  x7,  x8           // dif |= next_bits
    455 
    456 5:      // end
               // Together with the earlier add of 5*8 this nets w13 += 16*ret.
    457        sub             w15, w15, #5*8
    458        lsr             x12, x7,  #48
    459        adds            w13, w13, w15          // carry = tok_br < 3 || tok == 15
    460        dup             v1.8h,   w12
    461        b.cc            1b                     // loop if !carry
    462        add             w13, w13, #30*8
    463        str             w6,  [x0, #CNT]
    464        str             x7,  [x0, #DIF]
    465        lsr             w0,  w13, #4
    466        ret
    467 
    468 6:      // pad_with_ones
    469        add             w8,  w6,  #-16
    470        ror             x8,  x8,  x8
    471        b               4b
    472 
    473 7:      // refill_eob
    474        cmp             x3,  x4
    475        b.hs            6b
    476 
    477        ldr             x8,  [x4, #-8]
    478        lsl             w5,  w5,  #3
    479        lsr             x8,  x8,  x5
    480        add             w5,  w6,  #-48
    481        mvn             x8,  x8
    482        sub             w4,  w4,  w3           // num_bytes_left
    483        rev             x8,  x8
    484        lsr             x8,  x8,  x5
    485        neg             w5,  w5
    486        lsr             w5,  w5,  #3
    487        cmp             w5,  w4
    488        csel            w5,  w5,  w4,  lo      // num_bytes_read
    489        b               3b
    490 endfunc
    491 
        // msac_decode_bool_equi_neon: x0 = context pointer; decoded bit in w0.
        //
        // Splits the range at v = ((rng >> 8) << 7) + 4 without any probability
        // input. w15, the value returned, is 1 when dif < (v << 48).
        // Note that ret in the two csel comments names the opposite flag
        // (dif >= vw), which is the hs condition those instructions test.
        // Branches to L(refill) when cnt is lower than the shift d.
    492 function msac_decode_bool_equi_neon, export=1
    493        ldp             w5,  w6,  [x0, #RNG]   // + CNT
    494        ldr             x7,  [x0, #DIF]
    495        bic             w4,  w5,  #0xff        // r &= 0xff00
    496        add             w4,  w4,  #8
               // w4 still holds 2*v here, hence the shift by 47 rather than 48.
    497        subs            x8,  x7,  x4, lsl #47  // dif - vw
    498        lsr             w4,  w4,  #1           // v
    499        sub             w5,  w5,  w4           // r - v
    500        cset            w15, lo
    501        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
    502        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
    503 
    504        clz             w5,  w4                // clz(rng)
    505        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
    506        lsl             w4,  w4,  w5           // rng << d
    507        subs            w6,  w6,  w5           // cnt -= d
    508        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
    509        str             w4,  [x0, #RNG]
    510        b.lo            L(refill)
    511 
    512        str             w6,  [x0, #CNT]
    513        str             x7,  [x0, #DIF]
    514        mov             w0,  w15
    515        ret
    516 endfunc
    517 
        // msac_decode_bool_neon: x0 = context pointer, w1 = f; decoded bit in w0.
        //
        // Splits the range at v = (((rng >> 8) * (f & ~63)) >> 7) + 4.
        // w15, the value returned, is 1 when dif < (v << 48).
        // Note that ret in the two csel comments names the opposite flag
        // (dif >= vw), which is the hs condition those instructions test.
        // Branches to L(refill) when cnt is lower than the shift d.
    518 function msac_decode_bool_neon, export=1
    519        ldp             w5,  w6,  [x0, #RNG]   // + CNT
    520        ldr             x7,  [x0, #DIF]
    521        lsr             w4,  w5,  #8           // r >> 8
    522        bic             w1,  w1,  #0x3f        // f &= ~63
    523        mul             w4,  w4,  w1
    524        lsr             w4,  w4,  #7
    525        add             w4,  w4,  #4           // v
    526        subs            x8,  x7,  x4, lsl #48  // dif - vw
    527        sub             w5,  w5,  w4           // r - v
    528        cset            w15, lo
    529        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
    530        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
    531 
    532        clz             w5,  w4                // clz(rng)
    533        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
    534        lsl             w4,  w4,  w5           // rng << d
    535        subs            w6,  w6,  w5           // cnt -= d
    536        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
    537        str             w4,  [x0, #RNG]
    538        b.lo            L(refill)
    539 
    540        str             w6,  [x0, #CNT]
    541        str             x7,  [x0, #DIF]
    542        mov             w0,  w15
    543        ret
    544 endfunc
    545 
        // msac_decode_bool_adapt_neon: x0 = context pointer, x1 = cdf;
        // decoded bit in w0.
        //
        // A single 32-bit load fetches cdf[0] (low halfword, the probability)
        // and cdf[1] (high halfword, the adaptation count). The split point is
        // v = (((rng >> 8) * (cdf[0] & 0xffc0)) >> 7) + 4, and w15, the value
        // returned, is 1 when dif < (v << 48).
        // Note that ret in the two csel comments names the opposite flag
        // (dif >= vw), which is the hs condition those instructions test.
        // When the word at ALLOW_UPDATE_CDF is nonzero, cdf[0] and the count
        // are updated in place before the renormalisation.
        // Branches to L(refill) when cnt is lower than the shift d.
    546 function msac_decode_bool_adapt_neon, export=1
    547        ldr             w9,  [x1]              // cdf[0-1]
    548        ldp             w5,  w6,  [x0, #RNG]   // + CNT
    549        ldr             x7,  [x0, #DIF]
    550        lsr             w4,  w5,  #8           // r >> 8
    551        and             w2,  w9,  #0xffc0      // f &= ~63
    552        mul             w4,  w4,  w2
    553        lsr             w4,  w4,  #7
    554        add             w4,  w4,  #4           // v
    555        subs            x8,  x7,  x4, lsl #48  // dif - vw
    556        sub             w5,  w5,  w4           // r - v
    557        cset            w15, lo
    558        csel            w4,  w5,  w4,  hs      // if (ret) v = r - v;
    559        csel            x7,  x8,  x7,  hs      // if (ret) dif = dif - vw;
    560 
    561        ldr             w10, [x0, #ALLOW_UPDATE_CDF]
    562 
    563        clz             w5,  w4                // clz(rng)
    564        eor             w5,  w5,  #16          // d = clz(rng) ^ 16
    565 
    566        cbz             w10, 1f
    567 
    568        lsr             w2,  w9,  #16          // count = cdf[1]
    569        and             w9,  w9,  #0xffff      // cdf[0]
    570 
    571        sub             w3,  w2,  w2, lsr #5   // count - (count >= 32)
    572        lsr             w2,  w2,  #4           // count >> 4
    573        add             w10, w3,  #1           // count + (count < 32)
    574        add             w2,  w2,  #4           // rate = (count >> 4) | 4
    575 
    576        sub             w9,  w9,  w15          // cdf[0] -= bit
    577        sub             w11, w9,  w15, lsl #15 // {cdf[0], cdf[0] - 32769}
    578        asr             w11, w11, w2           // {cdf[0], cdf[0] - 32769} >> rate
    579        sub             w9,  w9,  w11          // cdf[0]
    580 
    581        strh            w9,  [x1]
    582        strh            w10, [x1, #2]
    583 
    584 1:
    585        lsl             w4,  w4,  w5           // rng << d
    586        subs            w6,  w6,  w5           // cnt -= d
    587        lsl             x7,  x7,  x5           // (dif - (v << 48)) << d
    588        str             w4,  [x0, #RNG]
    589        b.lo            L(refill)
    590 
    591        str             w6,  [x0, #CNT]
    592        str             x7,  [x0, #DIF]
    593        mov             w0,  w15
    594        ret
    595 endfunc