tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

msac.S (22034B)


      1 /*
      2 * Copyright © 2019, VideoLAN and dav1d authors
      3 * Copyright © 2020, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 
     31 #define BUF_POS 0            // byte offsets added to r0 (the MsacContext pointer): read position
     32 #define BUF_END 4            // end-of-buffer pointer; refill compares BUF_POS + 4 against it
     33 #define DIF 8                // 32-bit decoding window ("dif")
     34 #define RNG 12               // current range ("rng"); read as a halfword by the NEON code
     35 #define CNT 16               // bit counter ("cnt"); refill is triggered when cnt -= d borrows
     36 #define ALLOW_UPDATE_CDF 20  // non-zero enables CDF adaptation; presumably mirrors the C struct layout - verify
     37 
     38 const coeffs
        // Halfword table descending from 60 to 0 in steps of 4, followed by 16 zeros.
        // The symbol decoders take the address coeffs + 30 and step back by
        // 2 * n_symbols bytes, then load one value per lane from there; the loaded
        // vector is what the code comments call EC_MIN_PROB * (n_symbols - ret).
     39        .short 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0
     40        .short 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0, 0, 0
     41 endconst
     42 
     43 const bits, align=4
        // One distinct bit per halfword lane (lane i holds 1 << i). ANDing a
        // per-lane all-ones/all-zeros compare result with this table and summing
        // the lanes yields a scalar bitmask with one bit per lane.
     44        .short   0x1,   0x2,   0x4,   0x8,   0x10,   0x20,   0x40,   0x80
     45        .short 0x100, 0x200, 0x400, 0x800, 0x1000, 0x2000, 0x4000, 0x8000
     46 endconst
     47 
     48 .macro vld1_align_n d0, q0, q1, src, n
        // Load 16-bit elements from [src] with an alignment qualifier, picking the
        // operand by n: n == 4 loads the d0 operand (:64), n == 8 loads the q0
        // operand (:128), any other n loads the q0, q1 pair (:128).
        // The caller chooses which register is passed in each slot.
     49 .if \n == 4
     50        vld1.16         {\d0},  [\src, :64]
     51 .elseif \n == 8
     52        vld1.16         {\q0},  [\src, :128]
     53 .else
     54        vld1.16         {\q0, \q1},  [\src, :128]
     55 .endif
     56 .endm
     57 
     58 .macro vld1_n d0, q0, q1, src, n
        // Same operand selection as vld1_align_n, but without an alignment
        // qualifier on the address: n == 4 loads d0, n == 8 loads q0, any other n
        // loads the q0, q1 pair.
     59 .if \n == 4
     60        vld1.16         {\d0},  [\src]
     61 .elseif \n == 8
     62        vld1.16         {\q0},  [\src]
     63 .else
     64        vld1.16         {\q0, \q1},  [\src]
     65 .endif
     66 .endm
     67 
     68 .macro vst1_align_n d0, q0, q1, src, n
        // Store 16-bit elements to [src] (the destination address, despite the
        // parameter name) with an alignment qualifier: n == 4 stores the d0
        // operand (:64), n == 8 stores q0 (:128), any other n stores q0, q1 (:128).
     69 .if \n == 4
     70        vst1.16         {\d0},  [\src, :64]
     71 .elseif \n == 8
     72        vst1.16         {\q0},  [\src, :128]
     73 .else
     74        vst1.16         {\q0, \q1},  [\src, :128]
     75 .endif
     76 .endm
     77 
     78 .macro vst1_n d0, q0, q1, src, n
        // Store variant without an alignment qualifier: n == 4 stores the d0
        // operand, n == 8 stores q0, any other n stores the q0, q1 pair to [src].
     79 .if \n == 4
     80        vst1.16         {\d0},  [\src]
     81 .elseif \n == 8
     82        vst1.16         {\q0},  [\src]
     83 .else
     84        vst1.16         {\q0, \q1},  [\src]
     85 .endif
     86 .endm
     87 
     88 .macro vshr_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
        // Width-dispatched vshr.u16. n == 4 emits the (d0, s0, s3) operand triple;
        // otherwise the (d1, s1, s4) triple, plus (d2, s2, s5) when n == 16.
        // NOTE(review): no invocation of this macro appears in the visible source.
     89 .if \n == 4
     90        vshr.u16        \d0,  \s0,  \s3
     91 .else
     92        vshr.u16        \d1,  \s1,  \s4
     93 .if \n == 16
     94        vshr.u16        \d2,  \s2,  \s5
     95 .endif
     96 .endif
     97 .endm
     98 
     99 .macro vadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
        // Width-dispatched vadd.i16 (lane-wise 16-bit add). n == 4 emits
        // d0 = s0 + s3; otherwise d1 = s1 + s4, plus d2 = s2 + s5 when n == 16.
    100 .if \n == 4
    101        vadd.i16        \d0,  \s0,  \s3
    102 .else
    103        vadd.i16        \d1,  \s1,  \s4
    104 .if \n == 16
    105        vadd.i16        \d2,  \s2,  \s5
    106 .endif
    107 .endif
    108 .endm
    109 
    110 .macro vsub_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
        // Width-dispatched vsub.i16 (lane-wise 16-bit subtract). n == 4 emits
        // d0 = s0 - s3; otherwise d1 = s1 - s4, plus d2 = s2 - s5 when n == 16.
    111 .if \n == 4
    112        vsub.i16        \d0,  \s0,  \s3
    113 .else
    114        vsub.i16        \d1,  \s1,  \s4
    115 .if \n == 16
    116        vsub.i16        \d2,  \s2,  \s5
    117 .endif
    118 .endif
    119 .endm
    120 
    121 .macro vand_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
        // Width-dispatched bitwise AND. n == 4 emits d0 = s0 & s3; otherwise
        // d1 = s1 & s4, plus d2 = s2 & s5 when n == 16.
    122 .if \n == 4
    123        vand            \d0,  \s0,  \s3
    124 .else
    125        vand            \d1,  \s1,  \s4
    126 .if \n == 16
    127        vand            \d2,  \s2,  \s5
    128 .endif
    129 .endif
    130 .endm
    131 
    132 .macro vcge_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
        // Width-dispatched unsigned 16-bit compare (first source >= second source).
        // n == 4 emits the (d0, s0, s3) triple; otherwise (d1, s1, s4), plus
        // (d2, s2, s5) when n == 16.
    133 .if \n == 4
    134        vcge.u16        \d0,  \s0,  \s3
    135 .else
    136        vcge.u16        \d1,  \s1,  \s4
    137 .if \n == 16
    138        vcge.u16        \d2,  \s2,  \s5
    139 .endif
    140 .endif
    141 .endm
    142 
    143 .macro vrhadd_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
        // Width-dispatched vrhadd.u16 (unsigned rounding halving add). n == 4 emits
        // the (d0, s0, s3) triple; otherwise (d1, s1, s4), plus (d2, s2, s5) when
        // n == 16.
    144 .if \n == 4
    145        vrhadd.u16      \d0,  \s0,  \s3
    146 .else
    147        vrhadd.u16      \d1,  \s1,  \s4
    148 .if \n == 16
    149        vrhadd.u16      \d2,  \s2,  \s5
    150 .endif
    151 .endif
    152 .endm
    153 
    154 .macro vshl_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
        // Width-dispatched vshl.s16 with a per-lane register shift count; the
        // callers in this file pass a negative count to obtain a right shift.
        // n == 4 emits the (d0, s0, s3) triple; otherwise (d1, s1, s4), plus
        // (d2, s2, s5) when n == 16.
    155 .if \n == 4
    156        vshl.s16        \d0,  \s0,  \s3
    157 .else
    158        vshl.s16        \d1,  \s1,  \s4
    159 .if \n == 16
    160        vshl.s16        \d2,  \s2,  \s5
    161 .endif
    162 .endif
    163 .endm
    164 
    165 .macro vqdmulh_n d0, d1, d2, s0, s1, s2, s3, s4, s5, n
        // Width-dispatched vqdmulh.s16 (saturating doubling multiply, high half).
        // n == 4 emits the (d0, s0, s3) triple; otherwise (d1, s1, s4), plus
        // (d2, s2, s5) when n == 16.
    166 .if \n == 4
    167        vqdmulh.s16     \d0,  \s0,  \s3
    168 .else
    169        vqdmulh.s16     \d1,  \s1,  \s4
    170 .if \n == 16
    171        vqdmulh.s16     \d2,  \s2,  \s5
    172 .endif
    173 .endif
    174 .endm
    175 
    176 // unsigned dav1d_msac_decode_symbol_adapt4_neon(MsacContext *s, uint16_t *cdf,
    177 //                                               size_t n_symbols);
        //
        // As used below: r0 = s, r1 = cdf, r2 = n_symbols; the decoded symbol index
        // is returned in r0. The front half is emitted by the decode_update macro
        // (instantiated here with 4 lanes and reused by the adapt8/adapt16 entry
        // points). Every decoder in this file finishes in the shared
        // L(renorm)/L(renorm2) tail defined in this function, which is why they all
        // push {r4-r10,lr} and reserve 48 bytes of stack.
        //
        // Stack scratch: the halfword at sp+14 holds the incoming rng and the v
        // values are stored from sp+16 on, so "u" for symbol i is the halfword just
        // below v[i], including for i == 0.
    178 
    179 function msac_decode_symbol_adapt4_neon, export=1
    180 .macro decode_update n
    181        push            {r4-r10,lr}
    182        sub             sp,  sp,  #48
    183        add             r8,  r0,  #RNG
    184 
    185        vld1_align_n    d0,  q0,  q1,  r1,  \n                         // cdf
    186        vld1.16         {d16[]}, [r8, :16]                             // rng
    187        movrel_local    r9,  coeffs, 30
    188        vmov.i16        d30, #0x7f00                                   // 0x7f00
    189        sub             r9,  r9,  r2, lsl #1                           // step back 2 * n_symbols bytes in coeffs
    190        vmvn.i16        q14, #0x3f                                     // 0xffc0
    191        add             r8,  sp,  #14
    192        vand            d22, d16, d30                                  // rng & 0x7f00
    193        vst1.16         {d16[0]}, [r8, :16]                            // store original u = s->rng
    194        vand_n          d4,  q2,  q3,  d0,  q0,  q1, d28, q14, q14, \n // cdf & 0xffc0
    195 .if \n > 4
    196        vmov            d23, d22                                       // q11 = rng & 0x7f00 in all eight lanes
    197 .endif
    198 
    199        vld1_n          d16, q8,  q9,  r9,  \n                          // EC_MIN_PROB * (n_symbols - ret)
    200        vqdmulh_n       d20, q10, q11, d4,  q2,  q3,  d22, q11, q11, \n // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
    201        add             r8,  r0,  #DIF + 2                              // address of bytes 2..3 of s->dif
    202 
    203        vadd_n          d16, q8,  q9,  d4,  q2,  q3,  d16, q8,  q9,  \n // v = cdf + EC_MIN_PROB * (n_symbols - ret)
    204 .if \n == 4
    205        vmov.i16        d17, #0                                         // q8 is stored and compared as a whole below
    206 .endif
    207        vadd_n          d16, q8,  q9,  d20, q10, q11, d16, q8,  q9,  \n // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
    208 
    209        add             r9,  sp,  #16
    210        vld1.16         {d20[]}, [r8, :16]                              // dif >> (EC_WIN_SIZE - 16)
    211        movrel_local    r8,  bits
    212        vst1_n          q8,  q8,  q9,  r9,  \n                          // store v values to allow indexed access
    213 
    214        vmov            d21, d20                                        // q10 = c in all eight lanes
    215        vld1_align_n    q12, q12, q13, r8,  \n                          // per-lane bit constants from the bits table
    216 .if \n == 16
    217        vmov            q11, q10
    218 .endif
    219 
    220        vcge_n          q2,  q2,  q3,  q10, q10, q11, q8,  q8,  q9,  \n // c >= v
    221 
    222        vand_n          q10, q10, q11, q2,  q2,  q3,  q12, q12, q13, \n // One bit per halfword set in the mask
    223 .if \n == 16
    224        vadd.i16        q10, q10, q11
    225 .endif
    226        vadd.i16        d20, d20, d21                                   // Aggregate mask bits
    227        ldr             r4,  [r0, #ALLOW_UPDATE_CDF]
    228        vpadd.i16       d20, d20, d20
    229        lsl             r10, r2,  #1                                    // byte offset of cdf[n_symbols]
    230        vpadd.i16       d20, d20, d20
    231        vmov.u16        r3,  d20[0]
    232        cmp             r4,  #0                                         // flags consumed by the beq below
    233        rbit            r3,  r3                                         // reverse so clz finds the lowest set mask bit
    234        clz             lr,  r3                                         // ret
    235 
    236        beq             L(renorm)                                       // allow_update_cdf == 0: skip adaptation
    237        // update_cdf
    238        ldrh            r3,  [r1, r10]                                  // count = cdf[n_symbols]
    239        vmov.i8         q10, #0xff
    240 .if \n == 16
    241        mov             r4,  #-5
    242 .else
    243        mvn             r12, r2
    244        mov             r4,  #-4
    245        cmn             r12, #3                                         // set C if n_symbols <= 2
    246 .endif
    247        vrhadd_n        d16, q8,  q9,  d20, q10, q10, d4,  q2,  q3,  \n // i >= val ? -1 : 32768
    248 .if \n == 16
    249        sub             r4,  r4,  r3, lsr #4                            // -((count >> 4) + 5)
    250 .else
    251        lsr             r12, r3,  #4                                    // count >> 4
    252        sbc             r4,  r4,  r12                                   // -((count >> 4) + (n_symbols > 2) + 4)
    253 .endif
    254        vsub_n          d16, q8,  q9,  d16, q8,  q9,  d0,  q0,  q1,  \n // (32768 - cdf[i]) or (-1 - cdf[i])
    255 .if \n == 4
    256        vdup.16         d20, r4                                         // -rate
    257 .else
    258        vdup.16         q10, r4                                         // -rate
    259 .endif
    260 
    261        sub             r3,  r3,  r3, lsr #5                            // count - (count == 32)
    262        vsub_n          d0,  q0,  q1,  d0,  q0,  q1,  d4,  q2,  q3,  \n // cdf + (i >= val ? 1 : 0)
    263        vshl_n          d16, q8,  q9,  d16, q8,  q9,  d20, q10, q10, \n // ({32768,-1} - cdf[i]) >> rate
    264        add             r3,  r3,  #1                                    // count + (count < 32)
    265        vadd_n          d0,  q0,  q1,  d0,  q0,  q1,  d16, q8,  q9,  \n // cdf + (32768 - cdf[i]) >> rate
    266        vst1_align_n    d0,  q0,  q1,  r1,  \n
    267        strh            r3,  [r1, r10]
    268 .endm
    269 
    270        decode_update   4
    271 
        // Shared renormalisation tail. On entry: lr = symbol index, r0 = s, and
        // the stack frame set up by decode_update is still in place.
    272 L(renorm):
    273        add             r8,  sp,  #16
    274        add             r8,  r8,  lr, lsl #1
    275        ldrh            r3,  [r8]              // v
    276        ldrh            r4,  [r8, #-2]         // u
    277        ldr             r6,  [r0, #CNT]
    278        ldr             r7,  [r0, #DIF]
    279        sub             r4,  r4,  r3           // rng = u - v
    280        clz             r5,  r4                // clz(rng)
    281        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
    282        sub             r7,  r7,  r3, lsl #16  // dif - (v << 16)
        // Second entry point, used by the bool decoders. On entry: r4 = rng,
        // r5 = d, r6 = cnt, r7 = dif, lr = value to return.
    283 L(renorm2):
    284        lsl             r4,  r4,  r5           // rng << d
    285        subs            r6,  r6,  r5           // cnt -= d
    286        lsl             r7,  r7,  r5           // (dif - (v << 16)) << d
    287        str             r4,  [r0, #RNG]
    288        bhs             4f                     // no borrow from cnt -= d: no refill needed
    289 
    290        // refill
    291        ldr             r3,  [r0, #BUF_POS]    // BUF_POS
    292        ldr             r4,  [r0, #BUF_END]    // BUF_END
    293        add             r5,  r3,  #4
    294        subs            r5,  r5,  r4           // r5 = buf_pos + 4 - buf_end
    295        bhi             6f                     // fewer than 4 bytes left before buf_end
    296 
    297        ldr             r8,  [r3]              // next_bits
    298        rsb             r5,  r6,  #16          // 16 - cnt
    299        add             r4,  r6,  #16          // shift_bits = cnt + 16
    300        mvn             r8,  r8                // the loaded bits are inverted before being merged
    301        lsr             r5,  r5,  #3           // num_bytes_read
    302        rev             r8,  r8                // next_bits = bswap(next_bits)
    303        lsr             r8,  r8,  r4           // next_bits >>= shift_bits
    304 
    305 2:      // refill_end
    306        add             r3,  r3,  r5
    307        add             r6,  r6,  r5, lsl #3   // cnt += num_bits_read
    308        str             r3,  [r0, #BUF_POS]
    309 
    310 3:      // refill_end2
    311        orr             r7,  r7,  r8           // dif |= next_bits
    312 
    313 4:      // end
    314        str             r6,  [r0, #CNT]
    315        str             r7,  [r0, #DIF]
    316        mov             r0,  lr                // return value
    317        add             sp,  sp,  #48          // drop the scratch area reserved on entry
    318        pop             {r4-r10,pc}
    319 
    320 5:      // pad_with_ones
               // No input bytes left. A register-specified shift uses only the low
               // byte of the count register, and (cnt - 240) is congruent to
               // (cnt + 16) modulo 256, so r8 is shifted right by shift_bits.
               // buf_pos and cnt are left unchanged on this path (branch to 3).
    321        add             r8,  r6,  #-240
    322        lsr             r8,  r8,  r8
    323        b               3b
    324 
    325 6:      // refill_eob
               // Entered with r3 = buf_pos, r4 = buf_end, r5 = buf_pos + 4 - buf_end.
    326        cmp             r3,  r4
    327        bhs             5b                     // buf_pos >= buf_end: nothing left to read
    328 
    329        ldr             r8,  [r4, #-4]         // the 4 bytes that end at buf_end
    330        lsl             r5,  r5,  #3           // overshoot in bits
    331        lsr             r8,  r8,  r5           // drop the bytes located before buf_pos
    332        add             r5,  r6,  #16          // shift_bits = cnt + 16
    333        mvn             r8,  r8
    334        sub             r4,  r4,  r3           // num_bytes_left
    335        rev             r8,  r8
    336        lsr             r8,  r8,  r5
    337        rsb             r5,  r6,  #16
    338        lsr             r5,  r5,  #3           // bytes wanted, as in the normal refill path
    339        cmp             r5,  r4
    340        it              hs
    341        movhs           r5,  r4                // clamp to num_bytes_left
    342        b               2b
    343 endfunc
    344 
    345 function msac_decode_symbol_adapt8_neon, export=1
        // 8-lane instantiation of decode_update (one q register of cdf entries).
        // Uses the same register arguments as the adapt4 entry point and finishes
        // in the shared L(renorm) tail, which also performs the return.
    346        decode_update   8
    347        b               L(renorm)
    348 endfunc
    349 
    350 function msac_decode_symbol_adapt16_neon, export=1
        // 16-lane instantiation of decode_update (two q registers of cdf entries;
        // this width uses the fixed -5 rate base in the update path).
        // Finishes in the shared L(renorm) tail, which also performs the return.
    351        decode_update   16
    352        b               L(renorm)
    353 endfunc
    354 
    355 function msac_decode_hi_tok_neon, export=1
        // As used below: r0 = MsacContext pointer, r1 = cdf (4 halfwords, the
        // counter being cdf[3] at byte offset 6). Decodes symbols in a loop with
        // a 3-symbol setup (coeffs offset 30 - 2*3), keeping cnt/dif in r6/r7
        // and rng/c in NEON registers between iterations.
        //
        // r2 starts at -24 and gains 2 * ret per iteration, so the value returned
        // in r0, (r2 + 30) >> 1, equals 3 + the sum of the decoded symbols. The
        // carry test at label 5 ends the loop when a symbol below 3 was decoded or
        // when the running value reaches its maximum (tok == 15).
    356        push            {r4-r10,lr}
    357        vld1.16         {d0},  [r1, :64]       // cdf
    358        add             r4,  r0,  #RNG
    359        vmov.i16        d31, #0x7f00           // 0x7f00
    360        movrel_local    r5,  coeffs, 30-2*3
    361        vmvn.i16        d30, #0x3f             // 0xffc0
    362        ldrh            r9,  [r1, #6]          // count = cdf[n_symbols]
    363        vld1.16         {d1[]},  [r4, :16]     // rng
    364        movrel_local    r4,  bits
    365        vld1.16         {d29}, [r5]            // EC_MIN_PROB * (n_symbols - ret)
    366        add             r5,  r0,  #DIF + 2
    367        vld1.16         {q8}, [r4, :128]
    368        mov             r2,  #-24
    369        vand            d20, d0, d30           // cdf & 0xffc0
    370        ldr             r10, [r0, #ALLOW_UPDATE_CDF]
    371        vld1.16         {d2[]}, [r5, :16]      // dif >> (EC_WIN_SIZE - 16)
    372        sub             sp,  sp,  #48
    373        ldr             r6,  [r0, #CNT]
    374        ldr             r7,  [r0, #DIF]
    375        vmov            d3,  d2                // q1 = c in all eight lanes
    376 1:
               // Loop head. Live here: d0 = cdf, d20 = cdf & 0xffc0, d1 = rng,
               // q1 = c, r6 = cnt, r7 = dif, r9 = count, r2 = running token value.
    377        vand            d23, d1,  d31          // rng & 0x7f00
    378        vqdmulh.s16     d18, d20, d23          // ((cdf >> EC_PROB_SHIFT) * (r - 128)) >> 1
    379        add             r12, sp,  #14
    380        vadd.i16        d6,  d20, d29          // v = cdf + EC_MIN_PROB * (n_symbols - ret)
    381        vadd.i16        d6,  d18, d6           // v = ((cdf >> EC_PROB_SHIFT) * r) >> 1 + EC_MIN_PROB * (n_symbols - ret)
    382        vmov.i16        d7,  #0                // upper half of q3 = 0; q3 is compared and stored whole
    383        vst1.16         {d1[0]}, [r12, :16]    // store original u = s->rng
    384        add             r12, sp,  #16
    385        vcge.u16        q2,  q1,  q3           // c >= v
    386        vst1.16         {q3},  [r12]           // store v values to allow indexed access
    387        vand            q9,  q2,  q8           // One bit per halfword set in the mask
    388 
    389        vadd.i16        d18, d18, d19          // Aggregate mask bits
    390        vpadd.i16       d18, d18, d18
    391        vpadd.i16       d18, d18, d18
    392        vmov.u16        r3,  d18[0]
    393        cmp             r10, #0                // flags consumed by the beq below
    394        add             r2,  r2,  #5
    395        rbit            r3,  r3
    396        add             r8,  sp,  #16
    397        clz             lr,  r3                // ret
    398 
    399        beq             2f                     // allow_update_cdf == 0: skip adaptation
    400        // update_cdf
    401        vmov.i8         d22, #0xff
    402        mov             r4,  #-5
    403        vrhadd.u16      d6,  d22, d4           // i >= val ? -1 : 32768
    404        sub             r4,  r4,  r9, lsr #4   // -((count >> 4) + 5)
    405        vsub.i16        d6,  d6,  d0           // (32768 - cdf[i]) or (-1 - cdf[i])
    406        vdup.16         d18, r4                // -rate
    407 
    408        sub             r9,  r9,  r9, lsr #5   // count - (count == 32)
    409        vsub.i16        d0,  d0,  d4           // cdf + (i >= val ? 1 : 0)
    410        vshl.s16        d6,  d6,  d18          // ({32768,-1} - cdf[i]) >> rate
    411        add             r9,  r9,  #1           // count + (count < 32)
    412        vadd.i16        d0,  d0,  d6           // cdf + (32768 - cdf[i]) >> rate
    413        vst1.16         {d0},  [r1, :64]
    414        vand            d20, d0,  d30          // cdf & 0xffc0
    415        strh            r9,  [r1, #6]
    416 
    417 2:
               // Renormalise (same steps as the shared tail of the symbol decoders).
    418        add             r8,  r8,  lr, lsl #1
    419        ldrh            r3,  [r8]              // v
    420        ldrh            r4,  [r8, #-2]         // u
    421        sub             r4,  r4,  r3           // rng = u - v
    422        clz             r5,  r4                // clz(rng)
    423        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
    424        sub             r7,  r7,  r3, lsl #16  // dif - (v << 16)
    425        lsl             r4,  r4,  r5           // rng << d
    426        subs            r6,  r6,  r5           // cnt -= d
    427        lsl             r7,  r7,  r5           // (dif - (v << 16)) << d
    428        str             r4,  [r0, #RNG]
    429        vdup.16         d1,  r4                // new rng for the next iteration
    430        bhs             5f                     // no borrow from cnt -= d: no refill needed
    431 
    432        // refill
    433        ldr             r3,  [r0, #BUF_POS]    // BUF_POS
    434        ldr             r4,  [r0, #BUF_END]    // BUF_END
    435        add             r5,  r3,  #4
    436        subs            r5,  r5,  r4           // r5 = buf_pos + 4 - buf_end
    437        bhi             7f                     // fewer than 4 bytes left before buf_end
    438 
    439        ldr             r8,  [r3]              // next_bits
    440        rsb             r5,  r6,  #16
    441        add             r4,  r6,  #16          // shift_bits = cnt + 16
    442        mvn             r8,  r8
    443        lsr             r5,  r5,  #3           // num_bytes_read
    444        rev             r8,  r8                // next_bits = bswap(next_bits)
    445        lsr             r8,  r8,  r4           // next_bits >>= shift_bits
    446 
    447 3:      // refill_end
    448        add             r3,  r3,  r5
    449        add             r6,  r6,  r5, lsl #3   // cnt += num_bits_read
    450        str             r3,  [r0, #BUF_POS]
    451 
    452 4:      // refill_end2
    453        orr             r7,  r7,  r8           // dif |= next_bits
    454 
    455 5:      // end
    456        lsl             lr,  lr,  #1
    457        sub             lr,  lr,  #5           // lr = 2 * ret - 5
    458        lsr             r12, r7,  #16
    459        adds            r2,  r2,  lr           // carry = tok_br < 3 || tok == 15
    460        vdup.16         q1,  r12               // new c for the next iteration
    461        bcc             1b                     // loop if !carry
    462        add             r2,  r2,  #30
    463        str             r6,  [r0, #CNT]
    464        add             sp,  sp,  #48
    465        str             r7,  [r0, #DIF]
    466        lsr             r0,  r2,  #1           // return (r2 + 30) >> 1
    467        pop             {r4-r10,pc}
    468 
    469 6:      // pad_with_ones
               // The register shift uses the low byte of r8, and (cnt - 240) is
               // congruent to (cnt + 16) modulo 256, i.e. shift_bits.
    470        add             r8,  r6,  #-240
    471        lsr             r8,  r8,  r8
    472        b               4b
    473 
    474 7:      // refill_eob
    475        cmp             r3,  r4
    476        bhs             6b                     // buf_pos >= buf_end: nothing left to read
    477 
    478        ldr             r8,  [r4, #-4]         // the 4 bytes that end at buf_end
    479        lsl             r5,  r5,  #3
    480        lsr             r8,  r8,  r5
    481        add             r5,  r6,  #16
    482        mvn             r8,  r8
    483        sub             r4,  r4,  r3           // num_bytes_left
    484        rev             r8,  r8
    485        lsr             r8,  r8,  r5
    486        rsb             r5,  r6,  #16
    487        lsr             r5,  r5,  #3
    488        cmp             r5,  r4
    489        it              hs
    490        movhs           r5,  r4                // clamp to num_bytes_left
    491        b               3b
    492 endfunc
    493 
    494 function msac_decode_bool_equi_neon, export=1
        // As used below: r0 = MsacContext pointer. Decodes one bit without a
        // probability argument, then jumps to the shared L(renorm2) tail with
        // r4 = new rng, r5 = d, r6 = cnt, r7 = dif and lr = the value to return.
        // The value returned (r2) is 1 when dif < vw and 0 otherwise; the "ret"
        // in the inline comments is the opposite condition (dif >= vw).
    495        push            {r4-r10,lr}
    496        ldr             r5,  [r0, #RNG]
    497        ldr             r6,  [r0, #CNT]
    498        sub             sp,  sp,  #48          // unused here; matches the add sp, #48 in the shared tail
    499        ldr             r7,  [r0, #DIF]
    500        bic             r4,  r5,  #0xff        // r &= 0xff00
    501        add             r4,  r4,  #8
    502        mov             r2,  #0
    503        subs            r8,  r7,  r4, lsl #15  // dif - vw
    504        lsr             r4,  r4,  #1           // v
    505        sub             r5,  r5,  r4           // r - v
    506        itee            lo
    507        movlo           r2,  #1
    508        movhs           r4,  r5                // if (ret) v = r - v;
    509        movhs           r7,  r8                // if (ret) dif = dif - vw;
    510 
    511        clz             r5,  r4                // clz(rng)
    512        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
    513        mov             lr,  r2
    514        b               L(renorm2)
    515 endfunc
    516 
    517 function msac_decode_bool_neon, export=1
        // As used below: r0 = MsacContext pointer, r1 = f (probability value, its
        // low 6 bits are cleared before use). Computes
        // v = (((r >> 8) * (f & ~63)) >> 7) + 4 and vw = v << 16, then jumps to the
        // shared L(renorm2) tail with r4 = new rng, r5 = d, r6 = cnt, r7 = dif and
        // lr = the value to return. The value returned (r2) is 1 when dif < vw and
        // 0 otherwise; "ret" in the inline comments is the opposite condition.
    518        push            {r4-r10,lr}
    519        ldr             r5,  [r0, #RNG]
    520        ldr             r6,  [r0, #CNT]
    521        sub             sp,  sp,  #48          // unused here; matches the add sp, #48 in the shared tail
    522        ldr             r7,  [r0, #DIF]
    523        lsr             r4,  r5,  #8           // r >> 8
    524        bic             r1,  r1,  #0x3f        // f &= ~63
    525        mul             r4,  r4,  r1
    526        mov             r2,  #0
    527        lsr             r4,  r4,  #7
    528        add             r4,  r4,  #4           // v
    529        subs            r8,  r7,  r4, lsl #16  // dif - vw
    530        sub             r5,  r5,  r4           // r - v
    531        itee            lo
    532        movlo           r2,  #1
    533        movhs           r4,  r5                // if (ret) v = r - v;
    534        movhs           r7,  r8                // if (ret) dif = dif - vw;
    535 
    536        clz             r5,  r4                // clz(rng)
    537        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
    538        mov             lr,  r2
    539        b               L(renorm2)
    540 endfunc
    541 
    542 function msac_decode_bool_adapt_neon, export=1
        // As used below: r0 = MsacContext pointer, r1 = cdf, where cdf[0] is the
        // probability and cdf[1] the adaptation counter (both read with a single
        // 32-bit load). Decodes one bit as the non-adaptive bool decoder does,
        // then, if allow_update_cdf is non-zero, adapts cdf[0] and bumps the
        // counter before entering the shared L(renorm2) tail. The value returned
        // (lr, copied from r2) is 1 when dif < vw and 0 otherwise.
    543        push            {r4-r10,lr}
    544        ldr             r9,  [r1]              // cdf[0-1]
    545        ldr             r5,  [r0, #RNG]
    546        movw            lr,  #0xffc0
    547        ldr             r6,  [r0, #CNT]
    548        sub             sp,  sp,  #48          // unused here; matches the add sp, #48 in the shared tail
    549        ldr             r7,  [r0, #DIF]
    550        lsr             r4,  r5,  #8           // r >> 8
    551        and             r2,  r9,  lr           // f &= ~63
    552        mul             r4,  r4,  r2
    553        mov             r2,  #0
    554        lsr             r4,  r4,  #7
    555        add             r4,  r4,  #4           // v
    556        subs            r8,  r7,  r4, lsl #16  // dif - vw
    557        sub             r5,  r5,  r4           // r - v
    558        ldr             r10, [r0, #ALLOW_UPDATE_CDF]
    559        itee            lo
    560        movlo           r2,  #1
    561        movhs           r4,  r5                // if (ret) v = r - v;
    562        movhs           r7,  r8                // if (ret) dif = dif - vw;
    563 
    564        cmp             r10, #0                // flags consumed by the beq below
    565        clz             r5,  r4                // clz(rng)
    566        eor             r5,  r5,  #16          // d = clz(rng) ^ 16
    567        mov             lr,  r2                // lr = bit (the value to return)
    568 
    569        beq             L(renorm2)             // allow_update_cdf == 0: skip adaptation
    570 
    571        lsr             r2,  r9,  #16          // count = cdf[1]
    572        uxth            r9,  r9                // cdf[0]
    573 
    574        sub             r3,  r2,  r2,  lsr #5  // count - (count >= 32)
    575        lsr             r2,  r2,  #4           // count >> 4
    576        add             r10, r3,  #1           // count + (count < 32)
    577        add             r2,  r2,  #4           // rate = (count >> 4) | 4
    578 
    579        sub             r9,  r9,  lr           // cdf[0] -= bit
    580        sub             r3,  r9,  lr,  lsl #15 // {cdf[0], cdf[0] - 32769}
    581        asr             r3,  r3,  r2           // {cdf[0], cdf[0] - 32769} >> rate
    582        sub             r9,  r9,  r3           // cdf[0]
    583 
    584        strh            r9,  [r1]              // write back adapted cdf[0]
    585        strh            r10, [r1, #2]          // write back updated count
    586 
    587        b               L(renorm2)
    588 endfunc