# mpi_amd64_common.S
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.


# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_set_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r = a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
# ABI (SysV AMD64):  %rdi = r, %rsi = a, %rdx = len, %rcx = digit
# Out:               %rax = final carry limb
# Clobbers:          %rax, %rdx, %r8, %r9, %r11, flags
# Strategy:          main loop unrolled 8 limbs per iteration with the next
#                    a[i] preloaded into %r11 while mulq completes; a
#                    one-limb-at-a-time tail handles the final len % 8 limbs.

.text; .align 16; .globl s_mpv_mul_set_vec64;

#ifdef DARWIN
#define s_mpv_mul_set_vec64 _s_mpv_mul_set_vec64
.private_extern s_mpv_mul_set_vec64
s_mpv_mul_set_vec64:
#else
.type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
#endif

        xorq    %rax, %rax              # if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L17

        movq    %rdx, %r8               # Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                # cy = 0

        # Each 5-instruction step computes p = a[i] * digit + cy (cannot
        # overflow 128 bits: hi(p) <= 2^64-1 even after adding cy), stores
        # lo(p) into r[i], and carries hi(p) forward in %r9.
.L15:
        cmpq    $8, %r8                 # 8 - len
        jb      .L16                    # < 8 limbs remain -> tail loop
        movq    0(%rsi), %rax           # rax = a[0]
        movq    8(%rsi), %r11           # prefetch a[1]
        mulq    %rcx                    # p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 0(%rdi)           # r[0] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          # prefetch a[2]
        mulq    %rcx                    # p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 8(%rdi)           # r[1] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          # prefetch a[3]
        mulq    %rcx                    # p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 16(%rdi)          # r[2] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          # prefetch a[4]
        mulq    %rcx                    # p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 24(%rdi)          # r[3] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          # prefetch a[5]
        mulq    %rcx                    # p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 32(%rdi)          # r[4] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          # prefetch a[6]
        mulq    %rcx                    # p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 40(%rdi)          # r[5] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          # prefetch a[7]
        mulq    %rcx                    # p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 48(%rdi)          # r[6] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    # p = a[7] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 56(%rdi)          # r[7] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        addq    $64, %rsi               # a += 8 limbs
        addq    $64, %rdi               # r += 8 limbs
        subq    $8, %r8                 # len -= 8

        jz      .L17
        jmp     .L15

        # Tail: 1..7 remaining limbs, processed one at a time.
.L16:
        movq    0(%rsi), %rax
        mulq    %rcx                    # p = a[0] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 0(%rdi)           # r[0] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    8(%rsi), %rax
        mulq    %rcx                    # p = a[1] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 8(%rdi)           # r[1] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    16(%rsi), %rax
        mulq    %rcx                    # p = a[2] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 16(%rdi)          # r[2] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    24(%rsi), %rax
        mulq    %rcx                    # p = a[3] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 24(%rdi)          # r[3] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    32(%rsi), %rax
        mulq    %rcx                    # p = a[4] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 32(%rdi)          # r[4] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    40(%rsi), %rax
        mulq    %rcx                    # p = a[5] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 40(%rdi)          # r[5] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L17

        movq    48(%rsi), %rax
        mulq    %rcx                    # p = a[6] * digit
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 48(%rdi)          # r[6] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8                     # at most 7 limbs entered the tail,
        jz      .L17                    # so r8 is necessarily 0 here


.L17:
        movq    %r9, %rax               # return cy
        ret

#ifndef DARWIN
.size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64
#endif

# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_add_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------

# r += a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
# ABI (SysV AMD64):  %rdi = r, %rsi = a, %rdx = len, %rcx = digit
# Out:               %rax = final carry limb
# Clobbers:          %rax, %rdx, %r8, %r9, %r10, %r11, flags
# Same structure as s_mpv_mul_set_vec64, but additionally accumulates
# the existing r[i] (prefetched into %r10) into each product.

.text; .align 16; .globl s_mpv_mul_add_vec64;

#ifdef DARWIN
#define s_mpv_mul_add_vec64 _s_mpv_mul_add_vec64
.private_extern s_mpv_mul_add_vec64
s_mpv_mul_add_vec64:
#else
.type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
#endif

        xorq    %rax, %rax              # if (len == 0) return (0)
        testq   %rdx, %rdx
        jz      .L27

        movq    %rdx, %r8               # Use r8 for len; %rdx is used by mul
        xorq    %r9, %r9                # cy = 0

        # Each step computes p = a[i] * digit + r[i] + cy (fits in 128 bits),
        # stores lo(p) into r[i], and carries hi(p) forward in %r9.
        # %r11 preloads a[i+1] and %r10 preloads r[i+1] while mulq completes.
.L25:
        cmpq    $8, %r8                 # 8 - len
        jb      .L26                    # < 8 limbs remain -> tail loop
        movq    0(%rsi), %rax           # rax = a[0]
        movq    0(%rdi), %r10           # r10 = r[0]
        movq    8(%rsi), %r11           # prefetch a[1]
        mulq    %rcx                    # p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[0]
        movq    8(%rdi), %r10           # prefetch r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 0(%rdi)           # r[0] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    16(%rsi), %r11          # prefetch a[2]
        mulq    %rcx                    # p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[1]
        movq    16(%rdi), %r10          # prefetch r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 8(%rdi)           # r[1] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    24(%rsi), %r11          # prefetch a[3]
        mulq    %rcx                    # p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[2]
        movq    24(%rdi), %r10          # prefetch r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 16(%rdi)          # r[2] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    32(%rsi), %r11          # prefetch a[4]
        mulq    %rcx                    # p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[3]
        movq    32(%rdi), %r10          # prefetch r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 24(%rdi)          # r[3] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    40(%rsi), %r11          # prefetch a[5]
        mulq    %rcx                    # p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[4]
        movq    40(%rdi), %r10          # prefetch r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 32(%rdi)          # r[4] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    48(%rsi), %r11          # prefetch a[6]
        mulq    %rcx                    # p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[5]
        movq    48(%rdi), %r10          # prefetch r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 40(%rdi)          # r[5] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        movq    56(%rsi), %r11          # prefetch a[7]
        mulq    %rcx                    # p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[6]
        movq    56(%rdi), %r10          # prefetch r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 48(%rdi)          # r[6] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        movq    %r11, %rax
        mulq    %rcx                    # p = a[7] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[7]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 56(%rdi)          # r[7] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)

        addq    $64, %rsi               # a += 8 limbs
        addq    $64, %rdi               # r += 8 limbs
        subq    $8, %r8                 # len -= 8

        jz      .L27
        jmp     .L25

        # Tail: 1..7 remaining limbs, processed one at a time.
.L26:
        movq    0(%rsi), %rax
        movq    0(%rdi), %r10
        mulq    %rcx                    # p = a[0] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[0]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 0(%rdi)           # r[0] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    8(%rsi), %rax
        movq    8(%rdi), %r10
        mulq    %rcx                    # p = a[1] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[1]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 8(%rdi)           # r[1] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    16(%rsi), %rax
        movq    16(%rdi), %r10
        mulq    %rcx                    # p = a[2] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[2]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 16(%rdi)          # r[2] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    24(%rsi), %rax
        movq    24(%rdi), %r10
        mulq    %rcx                    # p = a[3] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[3]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 24(%rdi)          # r[3] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    32(%rsi), %rax
        movq    32(%rdi), %r10
        mulq    %rcx                    # p = a[4] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[4]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 32(%rdi)          # r[4] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    40(%rsi), %rax
        movq    40(%rdi), %r10
        mulq    %rcx                    # p = a[5] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[5]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 40(%rdi)          # r[5] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8
        jz      .L27

        movq    48(%rsi), %rax
        movq    48(%rdi), %r10
        mulq    %rcx                    # p = a[6] * digit
        addq    %r10, %rax
        adcq    $0, %rdx                # p += r[6]
        addq    %r9, %rax
        adcq    $0, %rdx                # p += cy
        movq    %rax, 48(%rdi)          # r[6] = lo(p)
        movq    %rdx, %r9               # cy = hi(p)
        decq    %r8                     # at most 7 limbs entered the tail,
        jz      .L27                    # so r8 is necessarily 0 here


.L27:
        movq    %r9, %rax               # return cy
        ret

#ifndef DARWIN
.size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64

# Magic indicating no need for an executable stack
.section .note.GNU-stack, "", @progbits
.previous
#endif