tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

msac.asm (18676B)


      1 ; Copyright © 2019, VideoLAN and dav1d authors
      2 ; Copyright © 2019, Two Orioles, LLC
      3 ; All rights reserved.
      4 ;
      5 ; Redistribution and use in source and binary forms, with or without
      6 ; modification, are permitted provided that the following conditions are met:
      7 ;
      8 ; 1. Redistributions of source code must retain the above copyright notice, this
      9 ;    list of conditions and the following disclaimer.
     10 ;
     11 ; 2. Redistributions in binary form must reproduce the above copyright notice,
     12 ;    this list of conditions and the following disclaimer in the documentation
     13 ;    and/or other materials provided with the distribution.
     14 ;
     15 ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 ; ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 ; WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 ; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 ; ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 ; (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 ; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 ; ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 ; (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 
     26 %include "config.asm"
     27 %include "ext/x86/x86inc.asm"
     28 
     29 SECTION_RODATA 64 ; avoids cacheline splits
     30 
     31 min_prob:  dw 60, 56, 52, 48, 44, 40, 36, 32, 28, 24, 20, 16, 12, 8, 4, 0 ; per-symbol minimum-probability terms, steps of 4, indexed from the end of the table
     32 pw_0xff00: times 8 dw 0xff00 ; mask keeping the high byte of each word (rng & 0xff00)
     33 pw_32:     times 8 dw 32 ; bias used by decode_symbol_adapt16 via [rax-pw_0xff00+pw_32]
     34 
     35 %if ARCH_X86_64
     36 %define resp   resq ; pointer-sized reserve
     37 %define movp   movq ; pointer-sized GPR<->XMM move
     38 %define c_shuf q3333 ; selects the word holding the top 16 bits of the 64-bit dif
     39 %macro DECODE_SYMBOL_ADAPT_INIT 0-1 ; no-op on 64-bit: args already arrive in registers
     40 %endmacro
     41 %else
     42 %define resp   resd
     43 %define movp   movd
     44 %define c_shuf q1111 ; top 16 bits of the 32-bit dif
     45 %macro DECODE_SYMBOL_ADAPT_INIT 0-1 0 ; hi_tok
   ; x86-32: fetch the stack arguments and carve out aligned scratch space
     46    mov            t0, r0m
     47    mov            t1, r1m
     48 %if %1 == 0
     49    mov            t2, r2m ; n_symbols (not needed by hi_tok)
     50 %endif
     51 %if STACK_ALIGNMENT >= 16
     52    sub           esp, 40-%1*4
     53 %else
     54    mov           eax, esp ; keep the unaligned esp so it can be restored on exit
     55    and           esp, ~15
     56    sub           esp, 40-%1*4
     57    mov         [esp], eax
     58 %endif
     59 %endmacro
     60 %endif
     61 
     62 struc msac ; NOTE(review): must mirror the C-side MsacContext layout -- confirm against the C header
     63    .buf:        resp 1 ; current read position in the input buffer
     64    .end:        resp 1 ; one past the last input byte
     65    .dif:        resp 1
     66    .rng:        resd 1
     67    .cnt:        resd 1
     68    .update_cdf: resd 1
     69 endstruc
     70 
     71 %define m(x, y) mangle(private_prefix %+ _ %+ x %+ y) ; mangled name of another function, for jumps into its shared local labels
     72 
     73 SECTION .text
     74 
     75 %if WIN64
     76 DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 3, 8
     77 %define buf rsp+stack_offset+8 ; shadow space
     78 %elif UNIX64
     79 DECLARE_REG_TMP 0, 1, 2, 3, 4, 5, 7, 0, 8
     80 %define buf rsp-40 ; red zone
     81 %else
     82 DECLARE_REG_TMP 2, 3, 4, 1, 5, 6, 5, 2, 3
     83 %define buf esp+8 ; scratch allocated by DECODE_SYMBOL_ADAPT_INIT
     84 %endif
     85 
     86 INIT_XMM sse2
     87 cglobal msac_decode_symbol_adapt4, 0, 6, 6 ; args: t0=msac ctx, t1=cdf, t2=n_symbols (from r0m-r2m)
     88    DECODE_SYMBOL_ADAPT_INIT
     89    LEA           rax, pw_0xff00
     90    movd           m2, [t0+msac.rng] ; m2 = rng
     91    movq           m1, [t1] ; m1 = cdf[0..3]
     92    movp           m3, [t0+msac.dif] ; m3 = dif
     93    mov           t3d, [t0+msac.update_cdf]
     94    mov           t4d, t2d ; t4 = n_symbols
     95    not            t2     ; -(n_symbols + 1)
     96    pshuflw        m2, m2, q0000 ; broadcast rng to all 4 words
     97    movd     [buf+12], m2 ; u for symbol 0 is rng itself (read back via buf+rax+14 below)
     98    pand           m2, [rax] ; rng & 0xff00 per word
     99    mova           m0, m1 ; keep the original cdf for the update path
    100    psrlw          m1, 6
    101    psllw          m1, 7 ; (cdf >> 6) << 7
    102    pmulhuw        m1, m2 ; ((cdf >> 6) * (rng >> 8)) >> 1
    103    movq           m2, [rax+t2*2] ; min_prob tail: pw_0xff00 - 2*(n_symbols+1)
    104    pshuflw        m3, m3, c_shuf ; broadcast the top 16 bits of dif (c)
    105    paddw          m1, m2 ; v = scaled cdf + min_prob term
    106    mova     [buf+16], m1 ; save the v values for the .renorm lookup
    107    psubusw        m1, m3
    108    pxor           m2, m2
    109    pcmpeqw        m1, m2 ; c >= v
    110    pmovmskb      eax, m1 ; 2 bits per word, so tzcnt yields 2*symbol
    111    test          t3d, t3d
    112    jz .renorm ; !allow_update_cdf
    113 
    114 ; update_cdf:
    115    movzx         t3d, word [t1+t4*2] ; count
    116    pcmpeqw        m2, m2 ; all-ones
    117    mov           t2d, t3d
    118    shr           t3d, 4
    119    cmp           t4d, 3
    120    sbb           t3d, -5 ; (count >> 4) + (n_symbols > 2) + 4
    121    cmp           t2d, 32
    122    adc           t2d, 0  ; count + (count < 32)
    123    movd           m3, t3d ; m3 = rate
    124    pavgw          m2, m1 ; i >= val ? -1 : 32768
    125    psubw          m2, m0 ; for (i = 0; i < val; i++)
    126    psubw          m0, m1 ;     cdf[i] += (32768 - cdf[i]) >> rate;
    127    psraw          m2, m3 ; for (; i < n_symbols; i++)
    128    paddw          m0, m2 ;     cdf[i] += ((  -1 - cdf[i]) >> rate) + 1;
    129    movq         [t1], m0
    130    mov     [t1+t4*2], t2w ; store the updated count
    131 
    132 .renorm:
    133    tzcnt         eax, eax ; eax = 2 * symbol index
    134    mov            t4, [t0+msac.dif]
    135    movzx         t1d, word [buf+rax+16] ; v
    136    movzx         t2d, word [buf+rax+14] ; u
    137    shr           eax, 1 ; return value: decoded symbol index
    138 .renorm2:
    139 %if ARCH_X86_64 == 0
    140 %if STACK_ALIGNMENT >= 16
    141    add           esp, 40 ; free the scratch space
    142 %else
    143    mov           esp, [esp] ; restore the saved unaligned esp
    144 %endif
    145 %endif
    146    sub           t2d, t1d ; rng
    147    shl            t1, gprsize*8-16
    148    sub            t4, t1  ; dif - v
    149 .renorm3:
    150    mov           t1d, [t0+msac.cnt] ; cnt
    151    movifnidn      t7, t0
    152 .renorm4:
    153    bsr           ecx, t2d
    154    xor           ecx, 15  ; d
    155 .renorm5:
    156    shl           t2d, cl ; rng <<= d
    157    shl            t4, cl ; dif <<= d
    158    mov [t7+msac.rng], t2d
    159    sub           t1d, ecx ; cnt -= d
    160    jae .end ; no refill required
    161 
    162 ; refill:
    163 %if ARCH_X86_64 == 0
    164    push           t5
    165 %endif
    166    mov            t2, [t7+msac.buf]
    167    mov            t5, [t7+msac.end]
    168    lea           rcx, [t2+gprsize]
    169    sub           rcx, t5
    170    ja .refill_eob ; fewer than gprsize bytes left in the buffer
    171    mov            t5, [t2] ; load a full register's worth of input
    172    lea           ecx, [t1+16-gprsize*8]
    173    not            t5 ; dif is kept inverted
    174    bswap          t5 ; input bytes are consumed in big-endian order
    175    shr            t5, cl ; drop bits that are already in dif
    176    neg           ecx
    177    shr           ecx, 3 ; num_bytes_read
    178    or             t4, t5 ; merge the new bits into dif
    179 .refill_end:
    180    add            t2, rcx
    181    lea           t1d, [t1+rcx*8] ; cnt += num_bits_read
    182    mov [t7+msac.buf], t2
    183 .refill_end2:
    184 %if ARCH_X86_64 == 0
    185    pop            t5
    186 %endif
    187 .end:
    188    mov [t7+msac.cnt], t1d
    189    mov [t7+msac.dif], t4
    190    RET
    191 .pad_with_ones:
   ; out of data: pad dif with one-bits instead of reading past the end
    192    lea           ecx, [t1-16]
    193 %if ARCH_X86_64
    194    ror           rcx, cl
    195 %else
    196    shr           ecx, cl
    197 %endif
    198    or             t4, rcx
    199    jmp .refill_end2
    200 .refill_eob: ; avoid overreading the input buffer
    201    cmp            t2, t5
    202    jae .pad_with_ones ; eob reached
    203    ; We can safely do a register-sized load of the last bytes of the buffer
    204    ; as this code is only reached if the msac buffer size is >= gprsize.
    205    mov            t5, [t5-gprsize] ; load ending exactly at the buffer end
    206    shl           ecx, 3 ; rcx held bytes past the end; convert to bits
    207    shr            t5, cl ; discard the already-consumed low bytes
    208    lea           ecx, [t1+16-gprsize*8]
    209    not            t5
    210    bswap          t5
    211    shr            t5, cl
    212    neg           ecx
    213    or             t4, t5
    214    mov           t5d, [t7+msac.end]
    215    shr           ecx, 3
    216    sub           t5d, t2d ; num_bytes_left
    217    cmp           ecx, t5d
    218    cmovae        ecx, t5d ; num_bytes_read, clamped to what is actually available
    219    jmp .refill_end
    220 
    221 cglobal msac_decode_symbol_adapt8, 0, 6, 6
   ; 8-symbol variant of decode_symbol_adapt4; jumps into its renorm/refill tail
    222    DECODE_SYMBOL_ADAPT_INIT
    223    LEA           rax, pw_0xff00
    224    movd           m2, [t0+msac.rng]
    225    mova           m1, [t1] ; m1 = cdf[0..7]
    226    movp           m3, [t0+msac.dif]
    227    mov           t3d, [t0+msac.update_cdf]
    228    mov           t4d, t2d
    229    not            t2 ; -(n_symbols + 1)
    230    pshuflw        m2, m2, q0000
    231    movd     [buf+12], m2
    232    punpcklqdq     m2, m2 ; extend the rng broadcast to all 8 words
    233    mova           m0, m1
    234    psrlw          m1, 6
    235    pand           m2, [rax]
    236    psllw          m1, 7
    237    pmulhuw        m1, m2
    238    movu           m2, [rax+t2*2] ; min_prob tail (unaligned)
    239    pshuflw        m3, m3, c_shuf
    240    paddw          m1, m2
    241    punpcklqdq     m3, m3 ; broadcast c to all 8 words
    242    mova     [buf+16], m1
    243    psubusw        m1, m3
    244    pxor           m2, m2
    245    pcmpeqw        m1, m2 ; c >= v
    246    pmovmskb      eax, m1
    247    test          t3d, t3d
    248    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm
    249    movzx         t3d, word [t1+t4*2] ; count
    250    pcmpeqw        m2, m2
    251    mov           t2d, t3d
    252    shr           t3d, 4
    253    cmp           t4d, 3 ; may be called with n_symbols <= 2
    254    sbb           t3d, -5 ; rate = (count >> 4) + (n_symbols > 2) + 4
    255    cmp           t2d, 32
    256    adc           t2d, 0 ; count += (count < 32)
    257    movd           m3, t3d
    258    pavgw          m2, m1
    259    psubw          m2, m0
    260    psubw          m0, m1
    261    psraw          m2, m3
    262    paddw          m0, m2
    263    mova         [t1], m0
    264    mov     [t1+t4*2], t2w
    265    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm
    266 
    267 cglobal msac_decode_symbol_adapt16, 0, 6, 6
   ; 16-symbol variant; processes the cdf in two 8-word halves
    268    DECODE_SYMBOL_ADAPT_INIT
    269    LEA           rax, pw_0xff00
    270    movd           m4, [t0+msac.rng]
    271    mova           m2, [t1] ; cdf[0..7]
    272    mova           m3, [t1+16] ; cdf[8..15]
    273    movp           m5, [t0+msac.dif]
    274    mov           t3d, [t0+msac.update_cdf]
    275    mov           t4d, t2d
    276    not            t2
    277 %if WIN64
    278    sub           rsp, 48 ; need 36 bytes, shadow space is only 32
    279 %endif
    280    pshuflw        m4, m4, q0000
    281    movd      [buf-4], m4 ; u for symbol 0 is rng (read via buf+rax*2-2)
    282    punpcklqdq     m4, m4
    283    mova           m0, m2
    284    psrlw          m2, 6
    285    mova           m1, m3
    286    psrlw          m3, 6
    287    pand           m4, [rax]
    288    psllw          m2, 7
    289    psllw          m3, 7
    290    pmulhuw        m2, m4
    291    pmulhuw        m3, m4
    292    movu           m4, [rax+t2*2] ; min_prob terms for the lower 8 symbols
    293    pshuflw        m5, m5, c_shuf
    294    paddw          m2, m4
    295    psubw          m4, [rax-pw_0xff00+pw_32] ; upper-half min_prob terms = lower terms - 32 (table steps by 4)
    296    punpcklqdq     m5, m5
    297    paddw          m3, m4
    298    mova        [buf], m2 ; v[0..7]
    299    psubusw        m2, m5
    300    mova     [buf+16], m3 ; v[8..15]
    301    psubusw        m3, m5
    302    pxor           m4, m4
    303    pcmpeqw        m2, m4
    304    pcmpeqw        m3, m4
    305    packsswb       m5, m2, m3 ; combine both halves into one byte mask
    306    pmovmskb      eax, m5
    307    test          t3d, t3d
    308    jz .renorm
    309    movzx         t3d, word [t1+t4*2] ; count
    310    pcmpeqw        m4, m4
    311    mova           m5, m4
    312    lea           t2d, [t3+80] ; only support n_symbols > 2
    313    shr           t2d, 4 ; rate = (count >> 4) + 5
    314    cmp           t3d, 32
    315    adc           t3d, 0 ; count += (count < 32)
    316    pavgw          m4, m2
    317    pavgw          m5, m3
    318    psubw          m4, m0
    319    psubw          m0, m2
    320    movd           m2, t2d
    321    psubw          m5, m1
    322    psubw          m1, m3
    323    psraw          m4, m2
    324    psraw          m5, m2
    325    paddw          m0, m4
    326    paddw          m1, m5
    327    mova         [t1], m0
    328    mova      [t1+16], m1
    329    mov     [t1+t4*2], t3w
    330 .renorm:
    331    tzcnt         eax, eax ; byte mask here, so eax is the symbol index directly
    332    mov            t4, [t0+msac.dif]
    333    movzx         t1d, word [buf+rax*2] ; v
    334    movzx         t2d, word [buf+rax*2-2] ; u
    335 %if WIN64
    336    add           rsp, 48
    337 %endif
    338    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm2
    339 
    340 cglobal msac_decode_bool_adapt, 0, 6, 0
    341    movifnidn      t1, r1mp ; t1 = cdf
    342    movifnidn      t0, r0mp ; t0 = msac ctx
    343    movzx         eax, word [t1] ; cdf[0]
    344    movzx         t3d, byte [t0+msac.rng+1] ; rng >> 8
    345    mov            t4, [t0+msac.dif]
    346    mov           t2d, [t0+msac.rng]
    347 %if ARCH_X86_64
    348    mov           t5d, eax ; keep cdf[0] for the update path
    349 %endif
    350    and           eax, ~63
    351    imul          eax, t3d ; (cdf[0] & ~63) * (rng >> 8)
    352 %if UNIX64
    353    mov            t6, t4 ; keep dif for the cmovb below
    354 %endif
    355    shr           eax, 7
    356    add           eax, 4            ; v
    357    mov           t3d, eax
    358    shl           rax, gprsize*8-16 ; vw
    359    sub           t2d, t3d          ; r - v
    360    sub            t4, rax          ; dif - vw
    361    setb           al ; bit = dif < vw
    362    cmovb         t2d, t3d ; rng = bit ? v : r - v
    363    mov           t3d, [t0+msac.update_cdf]
    364 %if UNIX64
    365    cmovb          t4, t6 ; keep the original dif when the bit was set
    366 %else
    367    cmovb          t4, [t0+msac.dif]
    368 %endif
    369 %if ARCH_X86_64 == 0
    370    movzx         eax, al
    371 %endif
    372    test          t3d, t3d
    373    jz m(msac_decode_symbol_adapt4, SUFFIX).renorm3
    374 %if UNIX64 == 0
    375    push           t6
    376 %endif
    377    movzx         t6d, word [t1+2] ; count
    378 %if ARCH_X86_64 == 0
    379    push           t5
    380    movzx         t5d, word [t1] ; reload cdf[0] (no spare reg held it)
    381 %endif
    382    movifnidn      t7, t0
    383    lea           ecx, [t6+64]
    384    cmp           t6d, 32
    385    adc           t6d, 0 ; count += (count < 32)
    386    mov        [t1+2], t6w
    387    imul          t6d, eax, -32769
    388    shr           ecx, 4   ; rate
    389    add           t6d, t5d ; if (bit)
    390    sub           t5d, eax ;     cdf[0] -= ((cdf[0] - 32769) >> rate) + 1;
    391    sar           t6d, cl  ; else
    392    sub           t5d, t6d ;     cdf[0] -= cdf[0] >> rate;
    393    mov          [t1], t5w
    394 %if WIN64
    395    mov           t1d, [t7+msac.cnt]
    396    pop            t6
    397    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm4
    398 %else
    399 %if ARCH_X86_64 == 0
    400    pop            t5
    401    pop            t6
    402 %endif
    403    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
    404 %endif
    405 
    406 cglobal msac_decode_bool_equi, 0, 6, 0
   ; decode one bit with equal probability (p = 1/2)
    407    movifnidn      t0, r0mp
    408    mov           t1d, [t0+msac.rng]
    409    mov            t4, [t0+msac.dif]
    410    mov           t2d, t1d
    411    mov           t1b, 8 ; t1d = (rng & ~0xff) | 8
    412    mov            t3, t4 ; keep dif for the cmovb below
    413    mov           eax, t1d
    414    shr           t1d, 1            ; v
    415    shl           rax, gprsize*8-17 ; vw
    416    sub           t2d, t1d          ; r - v
    417    sub            t4, rax          ; dif - vw
    418    cmovb         t2d, t1d
    419    mov           t1d, [t0+msac.cnt]
    420    cmovb          t4, t3
    421    movifnidn      t7, t0
    422    mov           ecx, 0xbfff
    423    setb           al ; the upper 32 bits contains garbage but that's OK
    424    sub           ecx, t2d
    425    ; In the case of this function, (d =) 16 - clz(v) = 2 - (v >> 14)
    426    ;   i.e. (0 <= d <= 2) and v < (3 << 14)
    427    shr           ecx, 14           ; d
    428 %if ARCH_X86_64 == 0
    429    movzx         eax, al
    430 %endif
    431    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm5
    432 
    433 cglobal msac_decode_bool, 0, 6, 0
    434    movifnidn      t0, r0mp
    435    movifnidn     t1d, r1m ; probability value, used like cdf[0] in decode_bool_adapt
    436    movzx         eax, byte [t0+msac.rng+1] ; r >> 8
    437    mov            t4, [t0+msac.dif]
    438    mov           t2d, [t0+msac.rng]
    439    and           t1d, ~63
    440    imul          eax, t1d
    441    mov            t3, t4 ; keep dif for the cmovb below
    442    shr           eax, 7
    443    add           eax, 4            ; v
    444    mov           t1d, eax
    445    shl           rax, gprsize*8-16 ; vw
    446    sub           t2d, t1d          ; r - v
    447    sub            t4, rax          ; dif - vw
    448    cmovb         t2d, t1d ; rng = bit ? v : r - v
    449    cmovb          t4, t3
    450    setb           al ; bit = dif < vw
    451 %if ARCH_X86_64 == 0
    452    movzx         eax, al
    453 %endif
    454    jmp m(msac_decode_symbol_adapt4, SUFFIX).renorm3
    455 
    456 %macro HI_TOK 1 ; update_cdf
   ; Decode loop for msac_decode_hi_tok: repeatedly decodes from a 3-symbol cdf
   ; (count stored at index 3) until the merged tok/tok_br exit condition sets CF.
    457 %if ARCH_X86_64 == 0
    458    mov           eax, -24 ; token accumulator is tracked in memory on x86-32 (no spare reg)
    459 %endif
    460 %%loop:
    461 %if %1
    462    movzx         t2d, word [t1+3*2] ; count (stored after the 3 cdf entries)
    463 %endif
    464    mova           m1, m0
    465    pshuflw        m2, m2, q0000 ; broadcast rng
    466    psrlw          m1, 6
    467    movd     [buf+12], m2
    468    pand           m2, m4
    469    psllw          m1, 7
    470    pmulhuw        m1, m2 ; ((cdf >> 6) * (rng >> 8)) >> 1
    471 %if ARCH_X86_64 == 0
    472    add           eax, 5
    473    mov       [buf+8], eax ; save the updated token accumulator
    474 %endif
    475    pshuflw        m3, m3, c_shuf ; broadcast the top 16 bits of dif
    476    paddw          m1, m5 ; + min_prob terms
    477    movq     [buf+16], m1 ; save v values for the lookup below
    478    psubusw        m1, m3
    479    pxor           m2, m2
    480    pcmpeqw        m1, m2 ; c >= v
    481    pmovmskb      eax, m1
    482 %if %1
   ; adapt the cdf, same scheme as in decode_symbol_adapt4
    483    lea           ecx, [t2+80]
    484    pcmpeqw        m2, m2
    485    shr           ecx, 4 ; rate = (count >> 4) + 5
    486    cmp           t2d, 32
    487    adc           t2d, 0 ; count += (count < 32)
    488    movd           m3, ecx
    489    pavgw          m2, m1
    490    psubw          m2, m0
    491    psubw          m0, m1
    492    psraw          m2, m3
    493    paddw          m0, m2
    494    movq         [t1], m0
    495    mov      [t1+3*2], t2w
    496 %endif
    497    tzcnt         eax, eax ; 2 * symbol index
    498    movzx         ecx, word [buf+rax+16] ; v
    499    movzx         t2d, word [buf+rax+14] ; u
    500 %if ARCH_X86_64
    501    add           t6d, 5
    502 %endif
    503    sub           eax, 5   ; setup for merging the tok_br and tok branches
    504    sub           t2d, ecx ; rng = u - v
    505    shl           rcx, gprsize*8-16
    506    sub            t4, rcx ; dif -= vw
    507    bsr           ecx, t2d
    508    xor           ecx, 15 ; d
    509    shl           t2d, cl
    510    shl            t4, cl ; renormalize rng and dif
    511    movd           m2, t2d
    512    mov [t7+msac.rng], t2d
    513    sub           t5d, ecx ; cnt -= d
    514    jae %%end ; no refill required
   ; refill, same scheme as decode_symbol_adapt4 but with t5 (cnt) and t8 as scratch
    515 %if UNIX64 == 0
    516    push           t8
    517 %endif
    518    mov            t2, [t7+msac.buf]
    519    mov            t8, [t7+msac.end]
    520    lea           rcx, [t2+gprsize]
    521    sub           rcx, t8
    522    ja %%refill_eob ; fewer than gprsize bytes left in the buffer
    523    mov            t8, [t2]
    524    lea           ecx, [t5+16-gprsize*8]
    525    not            t8 ; dif is kept inverted
    526    bswap          t8 ; input bytes are consumed in big-endian order
    527    shr            t8, cl
    528    neg           ecx
    529    shr           ecx, 3 ; num_bytes_read
    530    or             t4, t8
    531 %%refill_end:
    532    add            t2, rcx
    533    lea           t5d, [t5+rcx*8] ; cnt += num_bits_read
    534    mov [t7+msac.buf], t2
    535 %%refill_end2:
    536 %if UNIX64 == 0
    537    pop            t8
    538 %endif
    539 %%end:
    540    movp           m3, t4 ; feed the renormalized dif back into the next iteration
    541 %if ARCH_X86_64
    542    add           t6d, eax ; CF = tok_br < 3 || tok == 15
    543    jnc %%loop
    544    lea           eax, [t6+30]
    545 %else
    546    add           eax, [buf+8]
    547    jnc %%loop
    548    add           eax, 30
    549 %if STACK_ALIGNMENT >= 16
    550    add           esp, 36 ; free the scratch space
    551 %else
    552    mov           esp, [esp] ; restore the saved unaligned esp
    553 %endif
    554 %endif
    555    mov [t7+msac.dif], t4
    556    shr           eax, 1 ; final token value
    557    mov [t7+msac.cnt], t5d
    558    RET
    559 %%pad_with_ones:
    560    ; ensure that dif is padded with at least 15 bits of ones at the end
    561    lea           ecx, [t5-16]
    562 %if ARCH_X86_64
    563    ror           rcx, cl
    564 %else
    565    shr           ecx, cl
    566 %endif
    567    or             t4, rcx
    568    jmp %%refill_end2
    569 %%refill_eob:
    570    cmp            t2, t8
    571    jae %%pad_with_ones ; eob reached
    572    mov            t8, [t8-gprsize] ; load ending exactly at the buffer end
    573    shl           ecx, 3
    574    shr            t8, cl ; discard the already-consumed low bytes
    575    lea           ecx, [t5+16-gprsize*8]
    576    not            t8
    577    bswap          t8
    578    shr            t8, cl
    579    neg           ecx
    580    or             t4, t8
    581    mov           t8d, [t7+msac.end]
    582    shr           ecx, 3
    583    sub           t8d, t2d ; num_bytes_left
    584    cmp           ecx, t8d
    585    cmovae        ecx, t8d ; num_bytes_read
    586    jmp %%refill_end
    587 %endmacro
    588 
    589 cglobal msac_decode_hi_tok, 0, 7 + ARCH_X86_64, 6
   ; sets up the registers consumed by HI_TOK, then dispatches on update_cdf
    590    DECODE_SYMBOL_ADAPT_INIT 1
    591 %if ARCH_X86_64 == 0 && PIC
    592    LEA            t2, min_prob+12*2
    593    %define base t2-(min_prob+12*2)
    594 %else
    595    %define base 0
    596 %endif
    597    movq           m0, [t1] ; m0 = cdf[0..3]
    598    movd           m2, [t0+msac.rng]
    599    mov           eax, [t0+msac.update_cdf]
    600    movq           m4, [base+pw_0xff00]
    601    movp           m3, [t0+msac.dif]
    602    movq           m5, [base+min_prob+12*2] ; min_prob tail for a 3-symbol cdf
    603    mov            t4, [t0+msac.dif]
    604    mov           t5d, [t0+msac.cnt]
    605 %if ARCH_X86_64
    606    mov           t6d, -24 ; token accumulator start value
    607 %endif
    608    movifnidn      t7, t0
    609    test          eax, eax
    610    jz .no_update_cdf
    611    HI_TOK          1
    612 .no_update_cdf:
    613    HI_TOK          0
    614 
    615 %if ARCH_X86_64
    616 INIT_YMM avx2
    617 cglobal msac_decode_symbol_adapt16, 3, 6, 6
    618    lea           rax, [pw_0xff00]
    619    vpbroadcastw   m2, [t0+msac.rng]
    620    mova           m0, [t1]
    621    vpbroadcastw   m3, [t0+msac.dif+6]
    622    vbroadcasti128 m4, [rax]
    623    mov           t3d, [t0+msac.update_cdf]
    624    mov           t4d, t2d
    625    not            t2
    626    mov            r5, rsp
    627 %if WIN64
    628    and           rsp, ~31
    629    sub           rsp, 40
    630 %else
    631    and            r5, ~31
    632    %define buf r5-32
    633 %endif
    634    psrlw          m1, m0, 6
    635    movd      [buf-4], xm2
    636    pand           m2, m4
    637    psllw          m1, 7
    638    pmulhuw        m1, m2
    639    paddw          m1, [rax+t2*2]
    640    mova        [buf], m1
    641    pmaxuw         m1, m3
    642    pcmpeqw        m1, m3
    643    pmovmskb      eax, m1
    644    test          t3d, t3d
    645    jz .renorm
    646    movzx         t3d, word [t1+t4*2]
    647    pcmpeqw        m2, m2
    648    lea           t2d, [t3+80]
    649    shr           t2d, 4
    650    cmp           t3d, 32
    651    adc           t3d, 0
    652    movd          xm3, t2d
    653    pavgw          m2, m1
    654    psubw          m2, m0
    655    psubw          m0, m1
    656    psraw          m2, xm3
    657    paddw          m0, m2
    658    mova         [t1], m0
    659    mov     [t1+t4*2], t3w
    660 .renorm:
    661    tzcnt         eax, eax
    662    mov            t4, [t0+msac.dif]
    663    movzx         t1d, word [buf+rax-0]
    664    movzx         t2d, word [buf+rax-2]
    665    shr           eax, 1
    666 %if WIN64
    667    mov           rsp, r5
    668 %endif
    669    vzeroupper
    670    jmp m(msac_decode_symbol_adapt4, _sse2).renorm2
    671 %endif