mpi_x86_asm.c (12811B)
/*
 *  mpi_x86_asm.c - MSVC inline assembly implementation of s_mpv_ functions.
 *
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "mpi-priv.h"

/*
 * Cached result of s_mpi_is_sse2(), shared by every dispatcher below:
 *   < 0  not probed yet (initial value is -1)
 *     0  use the plain x86 code path
 *   > 0  use the SSE2 code path
 * The variable is read and written without any synchronization.
 */
static int is_sse = -1;
extern unsigned long s_mpi_is_sse2();

/*
 * s_mpv_mul_d: c = a * b, where a has a_len digits and b is a single digit.
 *
 * Stores a_len + 1 digits through c: for each a[i] the low word of
 * a[i] * b + carry, followed by the final carry word.  Previous contents
 * of c are overwritten.  With a_len == 0 only c[0] = 0 is stored.
 *
 * The entry code dispatches on is_sse; while is_sse is negative it calls
 * s_mpi_is_sse2() and stores the result in is_sse before choosing a path.
 *
 * Stack frame of the x86 path, after its prologue:
 *  ebp - 40:	caller's ebx
 *  ebp - 36:	caller's esi
 *  ebp - 32:	caller's edi
 *  ebp - 28 .. ebp - 4: reserved by "sub esp,28", never referenced
 *  ebp +  0:	caller's ebp
 *  ebp +  4:	return address
 *  ebp +  8:	a	argument
 *  ebp + 12:	a_len	argument
 *  ebp + 16:	b	argument
 *  ebp + 20:	c	argument
 * The SSE2 path reserves no locals and saves only edi (ebp - 4) and
 * esi (ebp - 8); the argument offsets are the same.
 *
 * registers, x86 path:
 *	eax:	current digit, then low word of the product
 *	ebx:	carry
 *	ecx:	a_len (remaining digits)
 *	edx:	b, then high word of the product
 *	esi:	a ptr
 *	edi:	c ptr
 * registers, SSE2 path:
 *	mm0:	current digit, then 64-bit product
 *	mm1:	b
 *	mm2:	64-bit accumulator; upper half is the carry
 *	ecx, esi, edi: as above
 */
__declspec(naked) void s_mpv_mul_d(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
{
    __asm {
        mov     eax, is_sse
        cmp     eax, 0
        je      s_mpv_mul_d_x86         ; is_sse == 0: plain x86
        jg      s_mpv_mul_d_sse2        ; is_sse > 0: SSE2
        call    s_mpi_is_sse2           ; is_sse < 0: probe now
        mov     is_sse, eax             ; cache the probe result
        cmp     eax, 0
        jg      s_mpv_mul_d_sse2        ; otherwise fall into the x86 path
    s_mpv_mul_d_x86:
        push    ebp
        mov     ebp,esp
        sub     esp,28
        push    edi
        push    esi
        push    ebx
        mov     ebx,0                   ; carry = 0
        mov     ecx,[ebp+12]            ; ecx = a_len
        mov     edi,[ebp+20]            ; edi = c
        cmp     ecx,0
        je      L_2                     ; jmp if a_len == 0
        mov     esi,[ebp+8]             ; esi = a
        cld                             ; lodsd/stosd must count upwards
    L_1:
        lodsd                           ; eax = [ds:esi]; esi += 4
        mov     edx,[ebp+16]            ; edx = b
        mul     edx                     ; edx:eax = Phi:Plo = a_i * b

        add     eax,ebx                 ; add carry (ebx) to edx:eax
        adc     edx,0
        mov     ebx,edx                 ; high half of product becomes next carry

        stosd                           ; [es:edi] = eax; edi += 4
        dec     ecx                     ; --a_len
        jnz     L_1                     ; jmp if a_len != 0
    L_2:
        mov     [edi],ebx               ; *c = carry
        pop     ebx
        pop     esi
        pop     edi
        leave
        ret
        nop
    s_mpv_mul_d_sse2:
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi
        psubq   mm2, mm2                ; carry = 0
        mov     ecx, [ebp+12]           ; ecx = a_len
        movd    mm1, [ebp+16]           ; mm1 = b
        mov     edi, [ebp+20]           ; edi = c
        cmp     ecx, 0
        je      L_6                     ; jmp if a_len == 0
        mov     esi, [ebp+8]            ; esi = a
        cld
    L_5:
        movd    mm0, [esi]              ; mm0 = *a
        add     esi, 4                  ; a++
        pmuludq mm0, mm1                ; mm0 = b * digit (64-bit product)
        paddq   mm2, mm0                ; carry + product
        movd    [edi], mm2              ; store the low 32 bits of the sum
        add     edi, 4
        psrlq   mm2, 32                 ; upper 32 bits become the next carry
        dec     ecx                     ; --a_len
        jnz     L_5                     ; jmp if a_len != 0
    L_6:
        movd    [edi], mm2              ; *c = carry
        emms                            ; leave MMX state before returning
        pop     esi
        pop     edi
        leave
        ret
        nop
    }
}

/*
 * s_mpv_mul_d_add: c += a * b over a_len digits, then c[a_len] = carry.
 *
 * For each i < a_len, c[i] is replaced by the low word of
 * a[i] * b + c[i] + carry.  The final carry is STORED into c[a_len]
 * (the old value of c[a_len] is overwritten, not added to).
 *
 * Dispatch on is_sse works as in s_mpv_mul_d.
 *
 * Stack frame of the x86 path, after its prologue:
 *  ebp - 40:	caller's ebx
 *  ebp - 36:	caller's esi
 *  ebp - 32:	caller's edi
 *  ebp - 28 .. ebp - 4: reserved by "sub esp,28", never referenced
 *  ebp +  0:	caller's ebp
 *  ebp +  4:	return address
 *  ebp +  8:	a	argument
 *  ebp + 12:	a_len	argument
 *  ebp + 16:	b	argument
 *  ebp + 20:	c	argument
 * The SSE2 path reserves no locals and saves only edi (ebp - 4) and
 * esi (ebp - 8).
 *
 * registers, x86 path:
 *	eax:	current digit, then low word of the running sum
 *	ebx:	carry; briefly the current word of c
 *	ecx:	a_len (remaining digits)
 *	edx:	b, then high word of the running sum
 *	esi:	a ptr
 *	edi:	c ptr
 * registers, SSE2 path:
 *	mm0:	current digit / product, then current word of c
 *	mm1:	b
 *	mm2:	64-bit accumulator; upper half is the carry
 */
__declspec(naked) void s_mpv_mul_d_add(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
{
    __asm {
        mov     eax, is_sse
        cmp     eax, 0
        je      s_mpv_mul_d_add_x86     ; is_sse == 0: plain x86
        jg      s_mpv_mul_d_add_sse2    ; is_sse > 0: SSE2
        call    s_mpi_is_sse2           ; is_sse < 0: probe now
        mov     is_sse, eax             ; cache the probe result
        cmp     eax, 0
        jg      s_mpv_mul_d_add_sse2    ; otherwise fall into the x86 path
    s_mpv_mul_d_add_x86:
        push    ebp
        mov     ebp,esp
        sub     esp,28
        push    edi
        push    esi
        push    ebx
        mov     ebx,0                   ; carry = 0
        mov     ecx,[ebp+12]            ; ecx = a_len
        mov     edi,[ebp+20]            ; edi = c
        cmp     ecx,0
        je      L_11                    ; jmp if a_len == 0
        mov     esi,[ebp+8]             ; esi = a
        cld                             ; lodsd/stosd must count upwards
    L_10:
        lodsd                           ; eax = [ds:esi]; esi += 4
        mov     edx,[ebp+16]            ; edx = b
        mul     edx                     ; edx:eax = Phi:Plo = a_i * b

        add     eax,ebx                 ; add carry (ebx) to edx:eax
        adc     edx,0
        mov     ebx,[edi]               ; add in current word from *c
        add     eax,ebx
        adc     edx,0
        mov     ebx,edx                 ; high half of product becomes next carry

        stosd                           ; [es:edi] = eax; edi += 4
        dec     ecx                     ; --a_len
        jnz     L_10                    ; jmp if a_len != 0
    L_11:
        mov     [edi],ebx               ; *c = carry
        pop     ebx
        pop     esi
        pop     edi
        leave
        ret
        nop
    s_mpv_mul_d_add_sse2:
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi
        psubq   mm2, mm2                ; carry = 0
        mov     ecx, [ebp+12]           ; ecx = a_len
        movd    mm1, [ebp+16]           ; mm1 = b
        mov     edi, [ebp+20]           ; edi = c
        cmp     ecx, 0
        je      L_16                    ; jmp if a_len == 0
        mov     esi, [ebp+8]            ; esi = a
        cld
    L_15:
        movd    mm0, [esi]              ; mm0 = *a
        add     esi, 4                  ; a++
        pmuludq mm0, mm1                ; mm0 = b * digit (64-bit product)
        paddq   mm2, mm0                ; carry + product
        movd    mm0, [edi]              ; mm0 = current word from *c
        paddq   mm2, mm0                ; add in current word from *c
        movd    [edi], mm2              ; store the low 32 bits of the sum
        add     edi, 4
        psrlq   mm2, 32                 ; upper 32 bits become the next carry
        dec     ecx                     ; --a_len
        jnz     L_15                    ; jmp if a_len != 0
    L_16:
        movd    [edi], mm2              ; *c = carry
        emms                            ; leave MMX state before returning
        pop     esi
        pop     edi
        leave
        ret
        nop
    }
}

/*
 * s_mpv_mul_d_add_prop: c += a * b, propagating the final carry upwards.
 *
 * The first a_len digits are processed as in s_mpv_mul_d_add.  A non-zero
 * final carry is then ADDED to c[a_len], and while an addition carries
 * out, 1 is added to each following digit of c.  The propagation loop has
 * no length bound of its own; it ends at the first digit that does not
 * overflow.
 *
 * Dispatch on is_sse works as in s_mpv_mul_d.
 *
 * Stack frame of the x86 path, after its prologue:
 *  ebp - 40:	caller's ebx
 *  ebp - 36:	caller's esi
 *  ebp - 32:	caller's edi
 *  ebp - 28 .. ebp - 4: reserved by "sub esp,28", never referenced
 *  ebp +  0:	caller's ebp
 *  ebp +  4:	return address
 *  ebp +  8:	a	argument
 *  ebp + 12:	a_len	argument
 *  ebp + 16:	b	argument
 *  ebp + 20:	c	argument
 * The SSE2 path reserves no locals and saves edi (ebp - 4), esi (ebp - 8)
 * and ebx (ebp - 12).
 *
 * registers, x86 path:
 *	eax:	current digit, then low word of the running sum
 *	ebx:	carry; briefly the current word of c
 *	ecx:	a_len (remaining digits)
 *	edx:	b, then high word of the running sum
 *	esi:	a ptr
 *	edi:	c ptr
 * registers, SSE2 path:
 *	mm0:	current digit / product
 *	mm1:	b
 *	mm2:	64-bit accumulator; upper half is the carry
 *	mm3:	current word of c
 *	eax, ebx: used by the carry propagation tail
 */
__declspec(naked) void s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len, mp_digit b, mp_digit *c)
{
    __asm {
        mov     eax, is_sse
        cmp     eax, 0
        je      s_mpv_mul_d_add_prop_x86    ; is_sse == 0: plain x86
        jg      s_mpv_mul_d_add_prop_sse2   ; is_sse > 0: SSE2
        call    s_mpi_is_sse2               ; is_sse < 0: probe now
        mov     is_sse, eax                 ; cache the probe result
        cmp     eax, 0
        jg      s_mpv_mul_d_add_prop_sse2   ; otherwise fall into the x86 path
    s_mpv_mul_d_add_prop_x86:
        push    ebp
        mov     ebp,esp
        sub     esp,28
        push    edi
        push    esi
        push    ebx
        mov     ebx,0                   ; carry = 0
        mov     ecx,[ebp+12]            ; ecx = a_len
        mov     edi,[ebp+20]            ; edi = c
        cmp     ecx,0
        je      L_21                    ; jmp if a_len == 0
        cld                             ; lodsd/stosd must count upwards
        mov     esi,[ebp+8]             ; esi = a
    L_20:
        lodsd                           ; eax = [ds:esi]; esi += 4
        mov     edx,[ebp+16]            ; edx = b
        mul     edx                     ; edx:eax = Phi:Plo = a_i * b

        add     eax,ebx                 ; add carry (ebx) to edx:eax
        adc     edx,0
        mov     ebx,[edi]               ; add in current word from *c
        add     eax,ebx
        adc     edx,0
        mov     ebx,edx                 ; high half of product becomes next carry

        stosd                           ; [es:edi] = eax; edi += 4
        dec     ecx                     ; --a_len
        jnz     L_20                    ; jmp if a_len != 0
    L_21:
        cmp     ebx,0                   ; is carry zero?
        jz      L_23
        mov     eax,[edi]               ; add in current word from *c
        add     eax,ebx
        stosd                           ; [es:edi] = eax; edi += 4 (flags untouched)
        jnc     L_23                    ; tests the carry of the add above
    L_22:
        mov     eax,[edi]               ; add in current word from *c
        adc     eax,0                   ; CF is set here, so this adds 1
        stosd                           ; [es:edi] = eax; edi += 4 (flags untouched)
        jc      L_22                    ; keep going while the digit overflows
    L_23:
        pop     ebx
        pop     esi
        pop     edi
        leave
        ret
        nop
    s_mpv_mul_d_add_prop_sse2:
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi
        push    ebx
        psubq   mm2, mm2                ; carry = 0
        mov     ecx, [ebp+12]           ; ecx = a_len
        movd    mm1, [ebp+16]           ; mm1 = b
        mov     edi, [ebp+20]           ; edi = c
        cmp     ecx, 0
        je      L_26                    ; jmp if a_len == 0
        mov     esi, [ebp+8]            ; esi = a
        cld                             ; the stosd tail below must count upwards
    L_25:
        movd    mm0, [esi]              ; mm0 = *a
        movd    mm3, [edi]              ; fetch the sum
        add     esi, 4                  ; a++
        pmuludq mm0, mm1                ; mm0 = b * digit (64-bit product)
        paddq   mm2, mm0                ; carry + product
        paddq   mm2, mm3                ; add *c
        movd    [edi], mm2              ; store the low 32 bits of the sum
        add     edi, 4
        psrlq   mm2, 32                 ; upper 32 bits become the next carry
        dec     ecx                     ; --a_len
        jnz     L_25                    ; jmp if a_len != 0
    L_26:
        movd    ebx, mm2                ; ebx = final carry
        cmp     ebx, 0                  ; is carry zero?
        jz      L_28
        mov     eax, [edi]              ; add in current word from *c
        add     eax, ebx
        stosd                           ; [es:edi] = eax; edi += 4 (flags untouched)
        jnc     L_28                    ; tests the carry of the add above
    L_27:
        mov     eax, [edi]              ; add in current word from *c
        adc     eax, 0                  ; CF is set here, so this adds 1
        stosd                           ; [es:edi] = eax; edi += 4 (flags untouched)
        jc      L_27                    ; keep going while the digit overflows
    L_28:
        emms                            ; leave MMX state before returning
        pop     ebx
        pop     esi
        pop     edi
        leave
        ret
        nop
    }
}

/*
 * s_mpv_sqr_add_prop: add the squares of the digits of a into sqrs.
 *
 * For each i < a_len the 64-bit square a[i] * a[i], plus the carry from
 * the previous step, is added into the digit pair sqrs[2*i], sqrs[2*i+1].
 * A non-zero carry left after the last pair is added to the next digit of
 * sqrs, and while an addition carries out, 1 is added to each following
 * digit.  The propagation loop has no length bound of its own.
 *
 * Dispatch on is_sse works as in s_mpv_mul_d.
 *
 * Stack frame of the x86 path, after its prologue:
 *  ebp - 24:	caller's ebx
 *  ebp - 20:	caller's esi
 *  ebp - 16:	caller's edi
 *  ebp - 12 .. ebp - 4: reserved by "sub esp,12", never referenced
 *  ebp +  0:	caller's ebp
 *  ebp +  4:	return address
 *  ebp +  8:	a	argument (pa)
 *  ebp + 12:	a_len	argument
 *  ebp + 16:	sqrs	argument (ps)
 * The SSE2 path reserves no locals and saves edi (ebp - 4), esi (ebp - 8)
 * and ebx (ebp - 12).
 *
 * registers, x86 path:
 *	eax:	current digit, then the word being stored
 *	ebx:	carry (0 or 1 between iterations); briefly words of sqrs
 *	ecx:	a_len (remaining digits)
 *	edx:	high word of the square
 *	esi:	a ptr
 *	edi:	sqrs ptr
 * registers, SSE2 path:
 *	mm0:	current digit, then its 64-bit square
 *	mm2:	64-bit accumulator; upper half is the carry
 *	mm3:	current low / high word of sqrs
 *	eax, ebx: used by the carry propagation tail
 */
__declspec(naked) void s_mpv_sqr_add_prop(const mp_digit *a, mp_size a_len, mp_digit *sqrs)
{
    __asm {
        mov     eax, is_sse
        cmp     eax, 0
        je      s_mpv_sqr_add_prop_x86      ; is_sse == 0: plain x86
        jg      s_mpv_sqr_add_prop_sse2     ; is_sse > 0: SSE2
        call    s_mpi_is_sse2               ; is_sse < 0: probe now
        mov     is_sse, eax                 ; cache the probe result
        cmp     eax, 0
        jg      s_mpv_sqr_add_prop_sse2     ; otherwise fall into the x86 path
    s_mpv_sqr_add_prop_x86:
        push    ebp
        mov     ebp,esp
        sub     esp,12
        push    edi
        push    esi
        push    ebx
        mov     ebx,0                   ; carry = 0
        mov     ecx,[ebp+12]            ; a_len
        mov     edi,[ebp+16]            ; edi = ps
        cmp     ecx,0
        je      L_31                    ; jump if a_len == 0
        cld                             ; lodsd/stosd must count upwards
        mov     esi,[ebp+8]             ; esi = pa
    L_30:
        lodsd                           ; eax = [ds:esi]; esi += 4
        mul     eax                     ; edx:eax = digit * digit

        add     eax,ebx                 ; add carry
        adc     edx,0
        mov     ebx,[edi]
        add     eax,ebx                 ; add low word from result
        mov     ebx,[edi+4]             ; mov keeps CF from the add above
        stosd                           ; [es:edi] = eax; edi += 4 (flags untouched)
        adc     edx,ebx                 ; add high word from result
        mov     ebx,0                   ; mov, not xor: CF must survive
        mov     eax,edx
        adc     ebx,0                   ; ebx = carry out of the high word
        stosd                           ; [es:edi] = eax; edi += 4
        dec     ecx                     ; --a_len
        jnz     L_30                    ; jmp if a_len != 0
    L_31:
        cmp     ebx,0                   ; is carry zero?
        jz      L_34
        mov     eax,[edi]               ; add in current word from *c
        add     eax,ebx
        stosd                           ; [es:edi] = eax; edi += 4 (flags untouched)
        jnc     L_34                    ; tests the carry of the add above
    L_32:
        mov     eax,[edi]               ; add in current word from *c
        adc     eax,0                   ; CF is set here, so this adds 1
        stosd                           ; [es:edi] = eax; edi += 4 (flags untouched)
        jc      L_32                    ; keep going while the digit overflows
    L_34:
        pop     ebx
        pop     esi
        pop     edi
        leave
        ret
        nop
    s_mpv_sqr_add_prop_sse2:
        push    ebp
        mov     ebp, esp
        push    edi
        push    esi
        push    ebx
        psubq   mm2, mm2                ; carry = 0
        mov     ecx, [ebp+12]           ; ecx = a_len
        mov     edi, [ebp+16]           ; edi = sqrs
        cmp     ecx, 0
        je      L_36                    ; jmp if a_len == 0
        mov     esi, [ebp+8]            ; esi = a
        cld                             ; the stosd tail below must count upwards
    L_35:
        movd    mm0, [esi]              ; mm0 = *a
        movd    mm3, [edi]              ; fetch the sum
        add     esi, 4                  ; a++
        pmuludq mm0, mm0                ; mm0 = sqr(a)
        paddq   mm2, mm0                ; carry + square
        paddq   mm2, mm3                ; add the low word
        movd    mm3, [edi+4]            ; fetch the high word
        movd    [edi], mm2              ; store the low 32 bits of the sum
        psrlq   mm2, 32                 ; keep the upper 32 bits
        paddq   mm2, mm3                ; add the high word
        movd    [edi+4], mm2            ; store the low 32 bits of the sum
        psrlq   mm2, 32                 ; save the carry.
        add     edi, 8                  ; two result digits per input digit
        dec     ecx                     ; --a_len
        jnz     L_35                    ; jmp if a_len != 0
    L_36:
        movd    ebx, mm2                ; ebx = final carry
        cmp     ebx, 0                  ; is carry zero?
        jz      L_38
        mov     eax, [edi]              ; add in current word from *c
        add     eax, ebx
        stosd                           ; [es:edi] = eax; edi += 4 (flags untouched)
        jnc     L_38                    ; tests the carry of the add above
    L_37:
        mov     eax, [edi]              ; add in current word from *c
        adc     eax, 0                  ; CF is set here, so this adds 1
        stosd                           ; [es:edi] = eax; edi += 4 (flags untouched)
        jc      L_37                    ; keep going while the digit overflows
    L_38:
        emms                            ; leave MMX state before returning
        pop     ebx
        pop     esi
        pop     edi
        leave
        ret
        nop
    }
}

/*
 * Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
 * so its high bit is 1. This code is from NSPR.
 *
 * Stores the quotient through qp and the remainder through rp, and
 * always returns 0 (presumably MP_OKAY -- confirm against the mp_err
 * definitions).  Neither pointer is checked for NULL.
 *
 * NOTE(review): the x86 "div" instruction raises a divide error when the
 * quotient does not fit in 32 bits, i.e. when Nhi >= divisor (or when
 * divisor is 0).  Nothing here guards against that; callers presumably
 * guarantee Nhi < divisor.
 *
 * Stack layout after "push ebx" (no frame pointer is set up):
 *
 *  esp +  0:   Caller's ebx
 *  esp +  4:	return address
 *  esp +  8:	Nhi	argument
 *  esp + 12:	Nlo	argument
 *  esp + 16:	divisor	argument
 *  esp + 20:	qp	argument
 *  esp + 24:   rp	argument
 * registers:
 *	eax:	Nlo, then the quotient, finally the return value 0
 *	ebx:	divisor, then qp, then rp
 *	edx:	Nhi, then the remainder
 */
__declspec(naked) mp_err
s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
                mp_digit *qp, mp_digit *rp)
{
    __asm {
        push    ebx
        mov     edx,[esp+8]             ; edx = Nhi
        mov     eax,[esp+12]            ; eax = Nlo
        mov     ebx,[esp+16]            ; ebx = divisor
        div     ebx                     ; eax = quotient, edx = remainder
        mov     ebx,[esp+20]            ; ebx = qp
        mov     [ebx],eax               ; *qp = quotient
        mov     ebx,[esp+24]            ; ebx = rp
        mov     [ebx],edx               ; *rp = remainder
        xor     eax,eax                 ; return zero
        pop     ebx
        ret
        nop
    }
}