[ tor-browser ].git.dasho

mpi_x86_asm.c (12811B)
      1 /*
      2 *  mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
      3 *
      4 * This Source Code Form is subject to the terms of the Mozilla Public
      5 * License, v. 2.0. If a copy of the MPL was not distributed with this
      6 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      7 
      8 #include "mpi-priv.h"
      9 
     10 static int is_sse = -1;
     11 extern unsigned long s_mpi_is_sse2();
     12 
     13 /*
     14 *   ebp - 36:  caller's esi
     15 *   ebp - 32:  caller's edi
     16 *   ebp - 28:
     17 *   ebp - 24:
     18 *   ebp - 20:
     19 *   ebp - 16:
     20 *   ebp - 12:
     21 *   ebp - 8:
     22 *   ebp - 4:
     23 *   ebp + 0:   caller's ebp
     24 *   ebp + 4:   return address
     25 *   ebp + 8:   a   argument
     26 *   ebp + 12:  a_len   argument
     27 *   ebp + 16:  b   argument
     28 *   ebp + 20:  c   argument
     29 *   registers:
     30 *      eax:
     31 *  ebx:    carry
     32 *  ecx:    a_len
     33 *  edx:
     34 *  esi:    a ptr
     35 *  edi:    c ptr
     36 */
     37 __declspec(naked) void s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
     38 {
     39    __asm {
     40    mov    eax, is_sse
     41    cmp    eax, 0
     42    je     s_mpv_mul_d_x86
     43    jg     s_mpv_mul_d_sse2
     44    call   s_mpi_is_sse2
     45    mov    is_sse, eax
     46    cmp    eax, 0
     47    jg     s_mpv_mul_d_sse2
     48 s_mpv_mul_d_x86:
     49    push   ebp
     50    mov    ebp,esp
     51    sub    esp,28
     52    push   edi
     53    push   esi
     54    push   ebx
     55    mov    ebx,0        ; carry = 0
     56    mov    ecx,[ebp+12]     ; ecx = a_len
     57    mov    edi,[ebp+20]
     58    cmp    ecx,0
     59    je     L_2          ; jmp if a_len == 0
     60    mov    esi,[ebp+8]      ; esi = a
     61    cld
     62 L_1:
     63    lodsd           ; eax = [ds:esi]; esi += 4
     64    mov    edx,[ebp+16]     ; edx = b
     65    mul    edx          ; edx:eax = Phi:Plo = a_i * b
     66 
     67    add    eax,ebx      ; add carry (ebx) to edx:eax
     68    adc    edx,0
     69    mov    ebx,edx      ; high half of product becomes next carry
     70 
     71    stosd           ; [es:edi] = ax; edi += 4;
     72    dec    ecx          ; --a_len
     73    jnz    L_1          ; jmp if a_len != 0
     74 L_2:
     75    mov    [edi],ebx        ; *c = carry
     76    pop    ebx
     77    pop    esi
     78    pop    edi
     79    leave
     80    ret
     81    nop
     82 s_mpv_mul_d_sse2:
     83    push   ebp
     84    mov    ebp, esp
     85    push   edi
     86    push   esi
     87    psubq  mm2, mm2     ; carry = 0
     88    mov    ecx, [ebp+12]    ; ecx = a_len
     89    movd   mm1, [ebp+16]    ; mm1 = b
     90    mov    edi, [ebp+20]
     91    cmp    ecx, 0
     92    je     L_6          ; jmp if a_len == 0
     93    mov    esi, [ebp+8]     ; esi = a
     94    cld
     95 L_5:
     96    movd   mm0, [esi]       ; mm0 = *a++
     97    add    esi, 4
     98    pmuludq mm0, mm1        ; mm0 = b * *a++
     99    paddq  mm2, mm0     ; add the carry
    100    movd   [edi], mm2       ; store the 32bit result
    101    add    edi, 4
    102    psrlq  mm2, 32      ; save the carry
    103    dec    ecx          ; --a_len
    104    jnz    L_5          ; jmp if a_len != 0
    105 L_6:
    106    movd   [edi], mm2       ; *c = carry
    107    emms
    108    pop    esi
    109    pop    edi
    110    leave
    111    ret
    112    nop
    113    }
    114 }
    115 
    116 /*
    117 *   ebp - 36:  caller's esi
    118 *   ebp - 32:  caller's edi
    119 *   ebp - 28:
    120 *   ebp - 24:
    121 *   ebp - 20:
    122 *   ebp - 16:
    123 *   ebp - 12:
    124 *   ebp - 8:
    125 *   ebp - 4:
    126 *   ebp + 0:   caller's ebp
    127 *   ebp + 4:   return address
    128 *   ebp + 8:   a   argument
    129 *   ebp + 12:  a_len   argument
    130 *   ebp + 16:  b   argument
    131 *   ebp + 20:  c   argument
    132 *   registers:
    133 *      eax:
    134 *  ebx:    carry
    135 *  ecx:    a_len
    136 *  edx:
    137 *  esi:    a ptr
    138 *  edi:    c ptr
    139 */
    140 __declspec(naked) void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
    141 {
    142    __asm {
    143    mov    eax, is_sse
    144    cmp    eax, 0
    145    je     s_mpv_mul_d_add_x86
    146    jg     s_mpv_mul_d_add_sse2
    147    call   s_mpi_is_sse2
    148    mov    is_sse, eax
    149    cmp    eax, 0
    150    jg     s_mpv_mul_d_add_sse2
    151 s_mpv_mul_d_add_x86:
    152    push   ebp
    153    mov    ebp,esp
    154    sub    esp,28
    155    push   edi
    156    push   esi
    157    push   ebx
    158    mov    ebx,0        ; carry = 0
    159    mov    ecx,[ebp+12]     ; ecx = a_len
    160    mov    edi,[ebp+20]
    161    cmp    ecx,0
    162    je     L_11         ; jmp if a_len == 0
    163    mov    esi,[ebp+8]      ; esi = a
    164    cld
    165 L_10:
    166    lodsd           ; eax = [ds:esi]; esi += 4
    167    mov    edx,[ebp+16]     ; edx = b
    168    mul    edx          ; edx:eax = Phi:Plo = a_i * b
    169 
    170    add    eax,ebx      ; add carry (ebx) to edx:eax
    171    adc    edx,0
    172    mov    ebx,[edi]        ; add in current word from *c
    173    add    eax,ebx
    174    adc    edx,0
    175    mov    ebx,edx      ; high half of product becomes next carry
    176 
    177    stosd           ; [es:edi] = ax; edi += 4;
    178    dec    ecx          ; --a_len
    179    jnz    L_10         ; jmp if a_len != 0
    180 L_11:
    181    mov    [edi],ebx        ; *c = carry
    182    pop    ebx
    183    pop    esi
    184    pop    edi
    185    leave
    186    ret
    187    nop
    188 s_mpv_mul_d_add_sse2:
    189    push   ebp
    190    mov    ebp, esp
    191    push   edi
    192    push   esi
    193    psubq  mm2, mm2     ; carry = 0
    194    mov    ecx, [ebp+12]    ; ecx = a_len
    195    movd   mm1, [ebp+16]    ; mm1 = b
    196    mov    edi, [ebp+20]
    197    cmp    ecx, 0
    198    je     L_16         ; jmp if a_len == 0
    199    mov    esi, [ebp+8]     ; esi = a
    200    cld
    201 L_15:
    202    movd   mm0, [esi]       ; mm0 = *a++
    203    add    esi, 4
    204    pmuludq mm0, mm1        ; mm0 = b * *a++
    205    paddq  mm2, mm0     ; add the carry
    206    movd   mm0, [edi]
    207    paddq  mm2, mm0     ; add the carry
    208    movd   [edi], mm2       ; store the 32bit result
    209    add    edi, 4
    210    psrlq  mm2, 32      ; save the carry
    211    dec    ecx          ; --a_len
    212    jnz    L_15         ; jmp if a_len != 0
    213 L_16:
    214    movd   [edi], mm2       ; *c = carry
    215    emms
    216    pop    esi
    217    pop    edi
    218    leave
    219    ret
    220    nop
    221    }
    222 }
    223 
    224 /*
    225 *   ebp - 36:  caller's esi
    226 *   ebp - 32:  caller's edi
    227 *   ebp - 28:
    228 *   ebp - 24:
    229 *   ebp - 20:
    230 *   ebp - 16:
    231 *   ebp - 12:
    232 *   ebp - 8:
    233 *   ebp - 4:
    234 *   ebp + 0:   caller's ebp
    235 *   ebp + 4:   return address
    236 *   ebp + 8:   a   argument
    237 *   ebp + 12:  a_len   argument
    238 *   ebp + 16:  b   argument
    239 *   ebp + 20:  c   argument
    240 *   registers:
    241 *      eax:
    242 *  ebx:    carry
    243 *  ecx:    a_len
    244 *  edx:
    245 *  esi:    a ptr
    246 *  edi:    c ptr
    247 */
    248 __declspec(naked) void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
    249 {
    250    __asm {
    251    mov    eax, is_sse
    252    cmp    eax, 0
    253    je     s_mpv_mul_d_add_prop_x86
    254    jg     s_mpv_mul_d_add_prop_sse2
    255    call   s_mpi_is_sse2
    256    mov    is_sse, eax
    257    cmp    eax, 0
    258    jg     s_mpv_mul_d_add_prop_sse2
    259 s_mpv_mul_d_add_prop_x86:
    260    push   ebp
    261    mov    ebp,esp
    262    sub    esp,28
    263    push   edi
    264    push   esi
    265    push   ebx
    266    mov    ebx,0        ; carry = 0
    267    mov    ecx,[ebp+12]     ; ecx = a_len
    268    mov    edi,[ebp+20]
    269    cmp    ecx,0
    270    je     L_21         ; jmp if a_len == 0
    271    cld
    272    mov    esi,[ebp+8]      ; esi = a
    273 L_20:
    274    lodsd           ; eax = [ds:esi]; esi += 4
    275    mov    edx,[ebp+16]     ; edx = b
    276    mul    edx          ; edx:eax = Phi:Plo = a_i * b
    277 
    278    add    eax,ebx      ; add carry (ebx) to edx:eax
    279    adc    edx,0
    280    mov    ebx,[edi]        ; add in current word from *c
    281    add    eax,ebx
    282    adc    edx,0
    283    mov    ebx,edx      ; high half of product becomes next carry
    284 
    285    stosd           ; [es:edi] = ax; edi += 4;
    286    dec    ecx          ; --a_len
    287    jnz    L_20         ; jmp if a_len != 0
    288 L_21:
    289    cmp    ebx,0        ; is carry zero?
    290    jz     L_23
    291    mov    eax,[edi]        ; add in current word from *c
    292    add    eax,ebx
    293    stosd           ; [es:edi] = ax; edi += 4;
    294    jnc    L_23
    295 L_22:
    296    mov    eax,[edi]        ; add in current word from *c
    297    adc    eax,0
    298    stosd           ; [es:edi] = ax; edi += 4;
    299    jc     L_22
    300 L_23:
    301    pop    ebx
    302    pop    esi
    303    pop    edi
    304    leave
    305    ret
    306    nop
    307 s_mpv_mul_d_add_prop_sse2:
    308    push   ebp
    309    mov    ebp, esp
    310    push   edi
    311    push   esi
    312    push   ebx
    313    psubq  mm2, mm2     ; carry = 0
    314    mov    ecx, [ebp+12]    ; ecx = a_len
    315    movd   mm1, [ebp+16]    ; mm1 = b
    316    mov    edi, [ebp+20]
    317    cmp    ecx, 0
    318    je     L_26         ; jmp if a_len == 0
    319    mov    esi, [ebp+8]     ; esi = a
    320    cld
    321 L_25:
    322    movd   mm0, [esi]       ; mm0 = *a++
    323    movd   mm3, [edi]       ; fetch the sum
    324    add    esi, 4
    325    pmuludq mm0, mm1        ; mm0 = b * *a++
    326    paddq  mm2, mm0     ; add the carry
    327    paddq  mm2, mm3     ; add *c++
    328    movd   [edi], mm2       ; store the 32bit result
    329    add    edi, 4
    330    psrlq  mm2, 32      ; save the carry
    331    dec    ecx          ; --a_len
    332    jnz    L_25         ; jmp if a_len != 0
    333 L_26:
    334    movd   ebx, mm2
    335    cmp    ebx, 0       ; is carry zero?
    336    jz     L_28
    337    mov    eax, [edi]
    338    add    eax, ebx
    339    stosd
    340    jnc    L_28
    341 L_27:
    342    mov    eax, [edi]       ; add in current word from *c
    343    adc    eax, 0
    344    stosd           ; [es:edi] = ax; edi += 4;
    345    jc     L_27
    346 L_28:
    347    emms
    348    pop    ebx
    349    pop    esi
    350    pop    edi
    351    leave
    352    ret
    353    nop
    354    }
    355 }
    356 
    357 /*
    358 *   ebp - 20:  caller's esi
    359 *   ebp - 16:  caller's edi
    360 *   ebp - 12:
    361 *   ebp - 8:   carry
    362 *   ebp - 4:   a_len   local
    363 *   ebp + 0:   caller's ebp
    364 *   ebp + 4:   return address
    365 *   ebp + 8:   pa  argument
    366 *   ebp + 12:  a_len   argument
    367 *   ebp + 16:  ps  argument
    368 *   ebp + 20:
    369 *   registers:
    370 *      eax:
    371 *  ebx:    carry
    372 *  ecx:    a_len
    373 *  edx:
    374 *  esi:    a ptr
    375 *  edi:    c ptr
    376 */
    377 __declspec(naked) void s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
    378 {
    379    __asm {
    380     mov    eax, is_sse
    381     cmp    eax, 0
    382     je     s_mpv_sqr_add_prop_x86
    383     jg     s_mpv_sqr_add_prop_sse2
    384     call   s_mpi_is_sse2
    385     mov    is_sse, eax
    386     cmp    eax, 0
    387     jg     s_mpv_sqr_add_prop_sse2
    388 s_mpv_sqr_add_prop_x86:
    389     push   ebp
    390     mov    ebp,esp
    391     sub    esp,12
    392     push   edi
    393     push   esi
    394     push   ebx
    395     mov    ebx,0       ; carry = 0
    396     mov    ecx,[ebp+12]    ; a_len
    397     mov    edi,[ebp+16]    ; edi = ps
    398     cmp    ecx,0
    399     je     L_31        ; jump if a_len == 0
    400     cld
    401     mov    esi,[ebp+8]     ; esi = pa
    402 L_30:
    403     lodsd          ; eax = [ds:si]; si += 4;
    404     mul    eax
    405 
    406     add    eax,ebx     ; add "carry"
    407     adc    edx,0
    408     mov    ebx,[edi]
    409     add    eax,ebx     ; add low word from result
    410     mov    ebx,[edi+4]
    411     stosd          ; [es:di] = eax; di += 4;
    412     adc    edx,ebx     ; add high word from result
    413     mov    ebx,0
    414     mov    eax,edx
    415     adc    ebx,0
    416     stosd          ; [es:di] = eax; di += 4;
    417     dec    ecx         ; --a_len
    418     jnz    L_30        ; jmp if a_len != 0
    419 L_31:
    420    cmp    ebx,0        ; is carry zero?
    421    jz     L_34
    422    mov    eax,[edi]        ; add in current word from *c
    423    add    eax,ebx
    424    stosd           ; [es:edi] = ax; edi += 4;
    425    jnc    L_34
    426 L_32:
    427    mov    eax,[edi]        ; add in current word from *c
    428    adc    eax,0
    429    stosd           ; [es:edi] = ax; edi += 4;
    430    jc     L_32
    431 L_34:
    432    pop    ebx
    433    pop    esi
    434    pop    edi
    435    leave
    436    ret
    437    nop
    438 s_mpv_sqr_add_prop_sse2:
    439    push   ebp
    440    mov    ebp, esp
    441    push   edi
    442    push   esi
    443    push   ebx
    444    psubq  mm2, mm2     ; carry = 0
    445    mov    ecx, [ebp+12]    ; ecx = a_len
    446    mov    edi, [ebp+16]
    447    cmp    ecx, 0
    448    je     L_36     ; jmp if a_len == 0
    449    mov    esi, [ebp+8]     ; esi = a
    450    cld
    451 L_35:
    452    movd   mm0, [esi]       ; mm0 = *a
    453    movd   mm3, [edi]       ; fetch the sum
    454    add    esi, 4
    455    pmuludq mm0, mm0        ; mm0 = sqr(a)
    456    paddq  mm2, mm0     ; add the carry
    457    paddq  mm2, mm3     ; add the low word
    458    movd   mm3, [edi+4]
    459    movd   [edi], mm2       ; store the 32bit result
    460    psrlq  mm2, 32
    461    paddq  mm2, mm3     ; add the high word
    462    movd   [edi+4], mm2     ; store the 32bit result
    463    psrlq  mm2, 32      ; save the carry.
    464    add    edi, 8
    465    dec    ecx          ; --a_len
    466    jnz    L_35         ; jmp if a_len != 0
    467 L_36:
    468    movd   ebx, mm2
    469    cmp    ebx, 0       ; is carry zero?
    470    jz     L_38
    471    mov    eax, [edi]
    472    add    eax, ebx
    473    stosd
    474    jnc    L_38
    475 L_37:
    476    mov    eax, [edi]       ; add in current word from *c
    477    adc    eax, 0
    478    stosd           ; [es:edi] = ax; edi += 4;
    479    jc     L_37
    480 L_38:
    481    emms
    482    pop    ebx
    483    pop    esi
    484    pop    edi
    485    leave
    486    ret
    487    nop
    488    }
    489 }
    490 
    491 /*
    492 *  Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
    493 *  so its high bit is 1.   This code is from NSPR.
    494 *
    495 *  Dump of assembler code for function s_mpv_div_2dx1d:
    496 *
    497 *   esp +  0:   Caller's ebx
    498 *   esp +  4:  return address
    499 *   esp +  8:  Nhi argument
    500 *   esp + 12:  Nlo argument
    501 *   esp + 16:  divisor argument
    502 *   esp + 20:  qp  argument
    503 *   esp + 24:   rp argument
    504 *   registers:
    505 *      eax:
    506 *  ebx:    carry
    507 *  ecx:    a_len
    508 *  edx:
    509 *  esi:    a ptr
    510 *  edi:    c ptr
    511 */
    512 __declspec(naked) mp_err
    513    s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
    514                    mp_digit *qp, mp_digit *rp)
    515 {
    516    __asm {
    517       push   ebx
    518       mov    edx,[esp+8]
    519       mov    eax,[esp+12]
    520       mov    ebx,[esp+16]
    521       div    ebx
    522       mov    ebx,[esp+20]
    523       mov    [ebx],eax
    524       mov    ebx,[esp+24]
    525       mov    [ebx],edx
    526       xor    eax,eax       ; return zero
    527       pop    ebx
    528       ret
    529       nop
    530    }
    531 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE