mpi_sse2.s (7991B)
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# Target: IA-32, AT&T syntax, run through the C preprocessor.
# All arguments are read from the stack (cdecl-style frame offsets below).
# The multiply routines use MMX registers with the SSE2 instructions
# pmuludq and paddq, and execute emms before returning.
# Digits are 32 bits wide (every digit access is a movd or 32-bit mov).

#ifdef DARWIN
#define s_mpv_mul_d _s_mpv_mul_d
#define s_mpv_mul_d_add _s_mpv_mul_d_add
#define s_mpv_mul_d_add_prop _s_mpv_mul_d_add_prop
#define s_mpv_sqr_add_prop _s_mpv_sqr_add_prop
#define s_mpv_div_2dx1d _s_mpv_div_2dx1d
#define TYPE_FUNCTION(x)
#define PRIVATE_EXTERN(x) .private_extern x
#else
#define TYPE_FUNCTION(x) .type x, @function
#define PRIVATE_EXTERN(x) .hidden x
#endif

# Symbol visibility: .private_extern is the Mach-O directive; ELF spells
# the same link-time visibility as .hidden, so the directive is selected
# through PRIVATE_EXTERN instead of being emitted unconditionally.

.text

# s_mpv_mul_d(a, a_len, b, c)
#
# Multiplies the a_len-digit vector a by the single digit b.
# Writes the a_len low result digits to c[0 .. a_len-1] and the final
# carry digit to c[a_len] (c[0] = 0 when a_len == 0).
#
# Stack frame after the prologue:
# ebp - 8: saved esi
# ebp - 4: saved edi
# ebp + 0: saved ebp
# ebp + 4: return address
# ebp + 8: a argument
# ebp + 12: a_len argument
# ebp + 16: b argument
# ebp + 20: c argument
# Registers:
# ecx: a_len (digits remaining)
# esi: a ptr
# edi: c ptr
# mm0: current product, mm1: b, mm2: running carry
.globl s_mpv_mul_d
PRIVATE_EXTERN(s_mpv_mul_d)
TYPE_FUNCTION(s_mpv_mul_d)
s_mpv_mul_d:
        push    %ebp
        mov     %esp, %ebp
        push    %edi
        push    %esi
        pxor    %mm2, %mm2              # carry = 0
        mov     12(%ebp), %ecx          # ecx = a_len
        movd    16(%ebp), %mm1          # mm1 = b
        mov     20(%ebp), %edi          # edi = c
        test    %ecx, %ecx
        jz      2f                      # a_len == 0: only the carry word is stored
        mov     8(%ebp), %esi           # esi = a
1:
        movd    0(%esi), %mm0           # mm0 = *a
        add     $4, %esi                # ++a
        pmuludq %mm1, %mm0              # mm0 = b * digit (64-bit product)
        paddq   %mm0, %mm2              # mm2 = product + carry
        movd    %mm2, 0(%edi)           # *c = low 32 bits
        add     $4, %edi                # ++c
        psrlq   $32, %mm2               # carry = high 32 bits
        dec     %ecx                    # --a_len
        jnz     1b
2:
        movd    %mm2, 0(%edi)           # *c = final carry
        emms                            # leave MMX state clean for x87 users
        pop     %esi
        pop     %edi
        leave
        ret
        nop

# s_mpv_mul_d_add(a, a_len, b, c)
#
# Multiply-accumulate: c[i] = low32(a[i] * b + c[i] + carry) for each of
# the a_len digits; the final carry digit is stored (not added) to c[a_len].
#
# Stack frame after the prologue:
# ebp - 8: saved esi
# ebp - 4: saved edi
# ebp + 0: saved ebp
# ebp + 4: return address
# ebp + 8: a argument
# ebp + 12: a_len argument
# ebp + 16: b argument
# ebp + 20: c argument
# Registers:
# ecx: a_len (digits remaining)
# esi: a ptr
# edi: c ptr
# mm0: product, then the old *c; mm1: b; mm2: running carry
.globl s_mpv_mul_d_add
PRIVATE_EXTERN(s_mpv_mul_d_add)
TYPE_FUNCTION(s_mpv_mul_d_add)
s_mpv_mul_d_add:
        push    %ebp
        mov     %esp, %ebp
        push    %edi
        push    %esi
        pxor    %mm2, %mm2              # carry = 0
        mov     12(%ebp), %ecx          # ecx = a_len
        movd    16(%ebp), %mm1          # mm1 = b
        mov     20(%ebp), %edi          # edi = c
        test    %ecx, %ecx
        jz      2f                      # a_len == 0: only the carry word is stored
        mov     8(%ebp), %esi           # esi = a
1:
        movd    0(%esi), %mm0           # mm0 = *a
        add     $4, %esi                # ++a
        pmuludq %mm1, %mm0              # mm0 = b * digit (64-bit product)
        paddq   %mm0, %mm2              # mm2 = product + carry
        movd    0(%edi), %mm0           # mm0 = old *c
        paddq   %mm0, %mm2              # mm2 += old *c
        movd    %mm2, 0(%edi)           # *c = low 32 bits
        add     $4, %edi                # ++c
        psrlq   $32, %mm2               # carry = high 32 bits
        dec     %ecx                    # --a_len
        jnz     1b
2:
        movd    %mm2, 0(%edi)           # *c = final carry
        emms
        pop     %esi
        pop     %edi
        leave
        ret
        nop

# s_mpv_mul_d_add_prop(a, a_len, b, c)
#
# Multiply-accumulate like s_mpv_mul_d_add, but the final carry is ADDED
# into c[a_len] and any carry out of that addition keeps rippling into
# the following digits of c until it dies out.  Nothing beyond c[a_len-1]
# is touched when the final carry is zero.
# The ripple loop has no length bound: the caller must supply a c buffer
# long enough to absorb the carry.
#
# Stack frame after the prologue:
# ebp - 12: saved ebx
# ebp - 8: saved esi
# ebp - 4: saved edi
# ebp + 0: saved ebp
# ebp + 4: return address
# ebp + 8: a argument
# ebp + 12: a_len argument
# ebp + 16: b argument
# ebp + 20: c argument
# Registers:
# ebx: carry out of the multiply loop
# ecx: a_len (digits remaining)
# esi: a ptr
# edi: c ptr
# mm0: product, mm1: b, mm2: running carry, mm3: old *c
.globl s_mpv_mul_d_add_prop
PRIVATE_EXTERN(s_mpv_mul_d_add_prop)
TYPE_FUNCTION(s_mpv_mul_d_add_prop)
s_mpv_mul_d_add_prop:
        push    %ebp
        mov     %esp, %ebp
        push    %edi
        push    %esi
        push    %ebx
        pxor    %mm2, %mm2              # carry = 0
        mov     12(%ebp), %ecx          # ecx = a_len
        movd    16(%ebp), %mm1          # mm1 = b
        mov     20(%ebp), %edi          # edi = c
        test    %ecx, %ecx
        jz      2f                      # a_len == 0: carry stays 0, falls out at 4
        mov     8(%ebp), %esi           # esi = a
1:
        movd    0(%esi), %mm0           # mm0 = *a
        movd    0(%edi), %mm3           # mm3 = old *c
        add     $4, %esi                # ++a
        pmuludq %mm1, %mm0              # mm0 = b * digit (64-bit product)
        paddq   %mm0, %mm2              # mm2 = product + carry
        paddq   %mm3, %mm2              # mm2 += old *c
        movd    %mm2, 0(%edi)           # *c = low 32 bits
        add     $4, %edi                # ++c
        psrlq   $32, %mm2               # carry = high 32 bits
        dec     %ecx                    # --a_len
        jnz     1b
2:
        movd    %mm2, %ebx              # ebx = final carry
        test    %ebx, %ebx
        jz      4f                      # no carry: leave the rest of c alone
        add     %ebx, 0(%edi)           # *c += carry, CF = carry out
        lea     4(%edi), %edi           # ++c (lea leaves CF intact)
        jnc     4f
3:
        adcl    $0, 0(%edi)             # CF is set on entry: *c += 1
        lea     4(%edi), %edi           # ++c (lea leaves CF intact)
        jc      3b                      # keep rippling while a digit wraps
4:
        emms
        pop     %ebx
        pop     %esi
        pop     %edi
        leave
        ret
        nop

# s_mpv_sqr_add_prop(pa, a_len, ps)
#
# For each of the a_len digits of pa, adds the 64-bit square of the digit
# into the digit pair ps[2i], ps[2i+1], carrying between pairs.  The carry
# left after the last pair is added into ps[2*a_len] and rippled onward
# exactly as in s_mpv_mul_d_add_prop (unbounded: the caller must supply a
# ps buffer long enough to absorb it).
#
# Stack frame after the prologue:
# ebp - 12: saved ebx
# ebp - 8: saved esi
# ebp - 4: saved edi
# ebp + 0: saved ebp
# ebp + 4: return address
# ebp + 8: pa argument
# ebp + 12: a_len argument
# ebp + 16: ps argument
# Registers:
# ebx: carry out of the squaring loop
# ecx: a_len (digits remaining)
# esi: pa ptr
# edi: ps ptr
# mm0: square, mm2: running carry, mm3: old ps digit
.globl s_mpv_sqr_add_prop
PRIVATE_EXTERN(s_mpv_sqr_add_prop)
TYPE_FUNCTION(s_mpv_sqr_add_prop)
s_mpv_sqr_add_prop:
        push    %ebp
        mov     %esp, %ebp
        push    %edi
        push    %esi
        push    %ebx
        pxor    %mm2, %mm2              # carry = 0
        mov     12(%ebp), %ecx          # ecx = a_len
        mov     16(%ebp), %edi          # edi = ps
        test    %ecx, %ecx
        jz      2f                      # a_len == 0: carry stays 0, falls out at 4
        mov     8(%ebp), %esi           # esi = pa
1:
        movd    0(%esi), %mm0           # mm0 = *pa
        movd    0(%edi), %mm3           # mm3 = old low word of the pair
        add     $4, %esi                # ++pa
        pmuludq %mm0, %mm0              # mm0 = digit * digit (64-bit square)
        paddq   %mm0, %mm2              # mm2 = square + carry
        paddq   %mm3, %mm2              # mm2 += old low word
        movd    4(%edi), %mm3           # mm3 = old high word of the pair
        movd    %mm2, 0(%edi)           # ps[0] = low 32 bits
        psrlq   $32, %mm2               # shift down to the high half
        paddq   %mm3, %mm2              # mm2 += old high word
        movd    %mm2, 4(%edi)           # ps[1] = low 32 bits
        psrlq   $32, %mm2               # carry into the next pair
        add     $8, %edi                # ps += 2
        dec     %ecx                    # --a_len
        jnz     1b
2:
        movd    %mm2, %ebx              # ebx = final carry
        test    %ebx, %ebx
        jz      4f                      # no carry: leave the rest of ps alone
        add     %ebx, 0(%edi)           # *ps += carry, CF = carry out
        lea     4(%edi), %edi           # ++ps (lea leaves CF intact)
        jnc     4f
3:
        adcl    $0, 0(%edi)             # CF is set on entry: *ps += 1
        lea     4(%edi), %edi           # ++ps (lea leaves CF intact)
        jc      3b                      # keep rippling while a digit wraps
4:
        emms
        pop     %ebx
        pop     %esi
        pop     %edi
        leave
        ret
        nop

#
# Divide 64-bit (Nhi,Nlo) by 32-bit divisor, which must be normalized
# so its high bit is 1. This code is from NSPR.
#
# mp_err s_mpv_div_2dx1d(mp_digit Nhi, mp_digit Nlo, mp_digit divisor,
#                        mp_digit *qp, mp_digit *rp)
#
# Stores the quotient to *qp and the remainder to *rp; always returns 0.
# Caller must guarantee Nhi < divisor: the hardware div instruction
# raises a divide error when the quotient does not fit in 32 bits
# (or when the divisor is zero), and no check is made here.
#
# Stack layout after the single push (no frame pointer is set up):
# esp + 0: saved ebx
# esp + 4: return address
# esp + 8: Nhi argument
# esp + 12: Nlo argument
# esp + 16: divisor argument
# esp + 20: qp argument
# esp + 24: rp argument
# Registers:
# eax: Nlo on input to div, quotient after; zeroed as the return value
# ebx: divisor, then scratch for the qp and rp pointers
# edx: Nhi on input to div, remainder after
#
.globl s_mpv_div_2dx1d
PRIVATE_EXTERN(s_mpv_div_2dx1d)
TYPE_FUNCTION(s_mpv_div_2dx1d)
s_mpv_div_2dx1d:
        push    %ebx
        mov     8(%esp), %edx           # edx = Nhi
        mov     12(%esp), %eax          # eax = Nlo
        mov     16(%esp), %ebx          # ebx = divisor
        div     %ebx                    # eax = edx:eax / ebx, edx = remainder
        mov     20(%esp), %ebx          # ebx = qp
        mov     %eax, 0(%ebx)           # *qp = quotient
        mov     24(%esp), %ebx          # ebx = rp
        mov     %edx, 0(%ebx)           # *rp = remainder
        xor     %eax, %eax              # return zero
        pop     %ebx
        ret
        nop

#ifndef DARWIN
# Magic indicating no need for an executable stack
.section .note.GNU-stack, "", @progbits
.previous
#endif