tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mpi_amd64_common.S (9208B)


      1 # This Source Code Form is subject to the terms of the Mozilla Public
      2 # License, v. 2.0. If a copy of the MPL was not distributed with this
      3 # file, You can obtain one at http://mozilla.org/MPL/2.0/.
      4 
      5 
      6 # ------------------------------------------------------------------------
      7 #
      8 #  Implementation of s_mpv_mul_set_vec which exploits
      9 #  the 64X64->128 bit  unsigned multiply instruction.
     10 #
     11 # ------------------------------------------------------------------------
     12 
     13 # r = a * digit, r and a are vectors of length len
     14 # returns the carry digit
     15 # r and a are 64 bit aligned.
     16 #
     17 # uint64_t
     18 # s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
     19 #
     20 
     21 .text; .align 16; .globl s_mpv_mul_set_vec64;
     22 
     23 #ifdef DARWIN
     24 #define s_mpv_mul_set_vec64		_s_mpv_mul_set_vec64
     25 .private_extern s_mpv_mul_set_vec64
     26 s_mpv_mul_set_vec64:
     27 #else
     28 .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
     29 #endif
     30 
     31 xorq	%rax, %rax		# if (len == 0) return (0)
     32 testq	%rdx, %rdx
     33 jz	.L17
     34 
     35 movq	%rdx, %r8		# Use r8 for len; %rdx is used by mul
     36 xorq	%r9, %r9		# cy = 0
     37 
     38 .L15:
     39 cmpq	$8, %r8			# 8 - len
     40 jb	.L16
     41 movq	0(%rsi), %rax		# rax = a[0]
     42 movq	8(%rsi), %r11		# prefetch a[1]
     43 mulq	%rcx			# p = a[0] * digit
     44 addq	%r9, %rax
     45 adcq	$0, %rdx		# p += cy
     46 movq	%rax, 0(%rdi)		# r[0] = lo(p)
     47 movq	%rdx, %r9		# cy = hi(p)
     48 
     49 movq	%r11, %rax
     50 movq	16(%rsi), %r11		# prefetch a[2]
     51 mulq	%rcx			# p = a[1] * digit
     52 addq	%r9, %rax
     53 adcq	$0, %rdx		# p += cy
     54 movq	%rax, 8(%rdi)		# r[1] = lo(p)
     55 movq	%rdx, %r9		# cy = hi(p)
     56 
     57 movq	%r11, %rax
     58 movq	24(%rsi), %r11		# prefetch a[3]
     59 mulq	%rcx			# p = a[2] * digit
     60 addq	%r9, %rax
     61 adcq	$0, %rdx		# p += cy
     62 movq	%rax, 16(%rdi)		# r[2] = lo(p)
     63 movq	%rdx, %r9		# cy = hi(p)
     64 
     65 movq	%r11, %rax
     66 movq	32(%rsi), %r11		# prefetch a[4]
     67 mulq	%rcx			# p = a[3] * digit
     68 addq	%r9, %rax
     69 adcq	$0, %rdx		# p += cy
     70 movq	%rax, 24(%rdi)		# r[3] = lo(p)
     71 movq	%rdx, %r9		# cy = hi(p)
     72 
     73 movq	%r11, %rax
     74 movq	40(%rsi), %r11		# prefetch a[5]
     75 mulq	%rcx			# p = a[4] * digit
     76 addq	%r9, %rax
     77 adcq	$0, %rdx		# p += cy
     78 movq	%rax, 32(%rdi)		# r[4] = lo(p)
     79 movq	%rdx, %r9		# cy = hi(p)
     80 
     81 movq	%r11, %rax
     82 movq	48(%rsi), %r11		# prefetch a[6]
     83 mulq	%rcx			# p = a[5] * digit
     84 addq	%r9, %rax
     85 adcq	$0, %rdx		# p += cy
     86 movq	%rax, 40(%rdi)		# r[5] = lo(p)
     87 movq	%rdx, %r9		# cy = hi(p)
     88 
     89 movq	%r11, %rax
     90 movq	56(%rsi), %r11		# prefetch a[7]
     91 mulq	%rcx			# p = a[6] * digit
     92 addq	%r9, %rax
     93 adcq	$0, %rdx		# p += cy
     94 movq	%rax, 48(%rdi)		# r[6] = lo(p)
     95 movq	%rdx, %r9		# cy = hi(p)
     96 
     97 movq	%r11, %rax
     98 mulq	%rcx			# p = a[7] * digit
     99 addq	%r9, %rax
    100 adcq	$0, %rdx		# p += cy
    101 movq	%rax, 56(%rdi)		# r[7] = lo(p)
    102 movq	%rdx, %r9		# cy = hi(p)
    103 
    104 addq	$64, %rsi
    105 addq	$64, %rdi
    106 subq	$8, %r8
    107 
    108 jz	.L17
    109 jmp	.L15
    110 
    111 .L16:
    112 movq	0(%rsi), %rax
    113 mulq	%rcx			# p = a[0] * digit
    114 addq	%r9, %rax
    115 adcq	$0, %rdx		# p += cy
    116 movq	%rax, 0(%rdi)		# r[0] = lo(p)
    117 movq	%rdx, %r9		# cy = hi(p)
    118 decq	%r8
    119 jz	.L17
    120 
    121 movq	8(%rsi), %rax
    122 mulq	%rcx			# p = a[1] * digit
    123 addq	%r9, %rax
    124 adcq	$0, %rdx		# p += cy
    125 movq	%rax, 8(%rdi)		# r[1] = lo(p)
    126 movq	%rdx, %r9		# cy = hi(p)
    127 decq	%r8
    128 jz	.L17
    129 
    130 movq	16(%rsi), %rax
    131 mulq	%rcx			# p = a[2] * digit
    132 addq	%r9, %rax
    133 adcq	$0, %rdx		# p += cy
    134 movq	%rax, 16(%rdi)		# r[2] = lo(p)
    135 movq	%rdx, %r9		# cy = hi(p)
    136 decq	%r8
    137 jz	.L17
    138 
    139 movq	24(%rsi), %rax
    140 mulq	%rcx			# p = a[3] * digit
    141 addq	%r9, %rax
    142 adcq	$0, %rdx		# p += cy
    143 movq	%rax, 24(%rdi)		# r[3] = lo(p)
    144 movq	%rdx, %r9		# cy = hi(p)
    145 decq	%r8
    146 jz	.L17
    147 
    148 movq	32(%rsi), %rax
    149 mulq	%rcx			# p = a[4] * digit
    150 addq	%r9, %rax
    151 adcq	$0, %rdx		# p += cy
    152 movq	%rax, 32(%rdi)		# r[4] = lo(p)
    153 movq	%rdx, %r9		# cy = hi(p)
    154 decq	%r8
    155 jz	.L17
    156 
    157 movq	40(%rsi), %rax
    158 mulq	%rcx			# p = a[5] * digit
    159 addq	%r9, %rax
    160 adcq	$0, %rdx		# p += cy
    161 movq	%rax, 40(%rdi)		# r[5] = lo(p)
    162 movq	%rdx, %r9		# cy = hi(p)
    163 decq	%r8
    164 jz	.L17
    165 
    166 movq	48(%rsi), %rax
    167 mulq	%rcx			# p = a[6] * digit
    168 addq	%r9, %rax
    169 adcq	$0, %rdx		# p += cy
    170 movq	%rax, 48(%rdi)		# r[6] = lo(p)
    171 movq	%rdx, %r9		# cy = hi(p)
    172 decq	%r8
    173 jz	.L17
    174 
    175 
    176 .L17:
    177 movq	%r9, %rax
    178 ret
    179 
    180 #ifndef DARWIN
    181 .size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64
    182 #endif
    183 
    184 # ------------------------------------------------------------------------
    185 #
    186 #  Implementation of s_mpv_mul_add_vec which exploits
    187 #  the 64X64->128 bit  unsigned multiply instruction.
    188 #
    189 # ------------------------------------------------------------------------
    190 
    191 # r += a * digit, r and a are vectors of length len
    192 # returns the carry digit
    193 # r and a are 64 bit aligned.
    194 #
    195 # uint64_t
    196 # s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
    197 #
    198 
    199 .text; .align 16; .globl s_mpv_mul_add_vec64;
    200 
    201 #ifdef DARWIN
    202 #define s_mpv_mul_add_vec64      _s_mpv_mul_add_vec64
    203 .private_extern s_mpv_mul_add_vec64
    204 s_mpv_mul_add_vec64:
    205 #else
    206 .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
    207 #endif
    208 
    209 xorq	%rax, %rax		# if (len == 0) return (0)
    210 testq	%rdx, %rdx
    211 jz	.L27
    212 
    213 movq	%rdx, %r8		# Use r8 for len; %rdx is used by mul
    214 xorq	%r9, %r9		# cy = 0
    215 
    216 .L25:
    217 cmpq	$8, %r8			# 8 - len
    218 jb	.L26
    219 movq	0(%rsi), %rax		# rax = a[0]
    220 movq	0(%rdi), %r10		# r10 = r[0]
    221 movq	8(%rsi), %r11		# prefetch a[1]
    222 mulq	%rcx			# p = a[0] * digit
    223 addq	%r10, %rax
    224 adcq	$0, %rdx		# p += r[0]
    225 movq	8(%rdi), %r10		# prefetch r[1]
    226 addq	%r9, %rax
    227 adcq	$0, %rdx		# p += cy
    228 movq	%rax, 0(%rdi)		# r[0] = lo(p)
    229 movq	%rdx, %r9		# cy = hi(p)
    230 
    231 movq	%r11, %rax
    232 movq	16(%rsi), %r11		# prefetch a[2]
    233 mulq	%rcx			# p = a[1] * digit
    234 addq	%r10, %rax
    235 adcq	$0, %rdx		# p += r[1]
    236 movq	16(%rdi), %r10		# prefetch r[2]
    237 addq	%r9, %rax
    238 adcq	$0, %rdx		# p += cy
    239 movq	%rax, 8(%rdi)		# r[1] = lo(p)
    240 movq	%rdx, %r9		# cy = hi(p)
    241 
    242 movq	%r11, %rax
    243 movq	24(%rsi), %r11		# prefetch a[3]
    244 mulq	%rcx			# p = a[2] * digit
    245 addq	%r10, %rax
    246 adcq	$0, %rdx		# p += r[2]
    247 movq	24(%rdi), %r10		# prefetch r[3]
    248 addq	%r9, %rax
    249 adcq	$0, %rdx		# p += cy
    250 movq	%rax, 16(%rdi)		# r[2] = lo(p)
    251 movq	%rdx, %r9		# cy = hi(p)
    252 
    253 movq	%r11, %rax
    254 movq	32(%rsi), %r11		# prefetch a[4]
    255 mulq	%rcx			# p = a[3] * digit
    256 addq	%r10, %rax
    257 adcq	$0, %rdx		# p += r[3]
    258 movq	32(%rdi), %r10		# prefetch r[4]
    259 addq	%r9, %rax
    260 adcq	$0, %rdx		# p += cy
    261 movq	%rax, 24(%rdi)		# r[3] = lo(p)
    262 movq	%rdx, %r9		# cy = hi(p)
    263 
    264 movq	%r11, %rax
    265 movq	40(%rsi), %r11		# prefetch a[5]
    266 mulq	%rcx			# p = a[4] * digit
    267 addq	%r10, %rax
    268 adcq	$0, %rdx		# p += r[4]
    269 movq	40(%rdi), %r10		# prefetch r[5]
    270 addq	%r9, %rax
    271 adcq	$0, %rdx		# p += cy
    272 movq	%rax, 32(%rdi)		# r[4] = lo(p)
    273 movq	%rdx, %r9		# cy = hi(p)
    274 
    275 movq	%r11, %rax
    276 movq	48(%rsi), %r11		# prefetch a[6]
    277 mulq	%rcx			# p = a[5] * digit
    278 addq	%r10, %rax
    279 adcq	$0, %rdx		# p += r[5]
    280 movq	48(%rdi), %r10		# prefetch r[6]
    281 addq	%r9, %rax
    282 adcq	$0, %rdx		# p += cy
    283 movq	%rax, 40(%rdi)		# r[5] = lo(p)
    284 movq	%rdx, %r9		# cy = hi(p)
    285 
    286 movq	%r11, %rax
    287 movq	56(%rsi), %r11		# prefetch a[7]
    288 mulq	%rcx			# p = a[6] * digit
    289 addq	%r10, %rax
    290 adcq	$0, %rdx		# p += r[6]
    291 movq	56(%rdi), %r10		# prefetch r[7]
    292 addq	%r9, %rax
    293 adcq	$0, %rdx		# p += cy
    294 movq	%rax, 48(%rdi)		# r[6] = lo(p)
    295 movq	%rdx, %r9		# cy = hi(p)
    296 
    297 movq	%r11, %rax
    298 mulq	%rcx			# p = a[7] * digit
    299 addq	%r10, %rax
    300 adcq	$0, %rdx		# p += r[7]
    301 addq	%r9, %rax
    302 adcq	$0, %rdx		# p += cy
    303 movq	%rax, 56(%rdi)		# r[7] = lo(p)
    304 movq	%rdx, %r9		# cy = hi(p)
    305 
    306 addq	$64, %rsi
    307 addq	$64, %rdi
    308 subq	$8, %r8
    309 
    310 jz	.L27
    311 jmp	.L25
    312 
    313 .L26:
    314 movq	0(%rsi), %rax
    315 movq	0(%rdi), %r10
    316 mulq	%rcx			# p = a[0] * digit
    317 addq	%r10, %rax
    318 adcq	$0, %rdx		# p += r[0]
    319 addq	%r9, %rax
    320 adcq	$0, %rdx		# p += cy
    321 movq	%rax, 0(%rdi)		# r[0] = lo(p)
    322 movq	%rdx, %r9		# cy = hi(p)
    323 decq	%r8
    324 jz	.L27
    325 
    326 movq	8(%rsi), %rax
    327 movq	8(%rdi), %r10
    328 mulq	%rcx			# p = a[1] * digit
    329 addq	%r10, %rax
    330 adcq	$0, %rdx		# p += r[1]
    331 addq	%r9, %rax
    332 adcq	$0, %rdx		# p += cy
    333 movq	%rax, 8(%rdi)		# r[1] = lo(p)
    334 movq	%rdx, %r9		# cy = hi(p)
    335 decq	%r8
    336 jz	.L27
    337 
    338 movq	16(%rsi), %rax
    339 movq	16(%rdi), %r10
    340 mulq	%rcx			# p = a[2] * digit
    341 addq	%r10, %rax
    342 adcq	$0, %rdx		# p += r[2]
    343 addq	%r9, %rax
    344 adcq	$0, %rdx		# p += cy
    345 movq	%rax, 16(%rdi)		# r[2] = lo(p)
    346 movq	%rdx, %r9		# cy = hi(p)
    347 decq	%r8
    348 jz	.L27
    349 
    350 movq	24(%rsi), %rax
    351 movq	24(%rdi), %r10
    352 mulq	%rcx			# p = a[3] * digit
    353 addq	%r10, %rax
    354 adcq	$0, %rdx		# p += r[3]
    355 addq	%r9, %rax
    356 adcq	$0, %rdx		# p += cy
    357 movq	%rax, 24(%rdi)		# r[3] = lo(p)
    358 movq	%rdx, %r9		# cy = hi(p)
    359 decq	%r8
    360 jz	.L27
    361 
    362 movq	32(%rsi), %rax
    363 movq	32(%rdi), %r10
    364 mulq	%rcx			# p = a[4] * digit
    365 addq	%r10, %rax
    366 adcq	$0, %rdx		# p += r[4]
    367 addq	%r9, %rax
    368 adcq	$0, %rdx		# p += cy
    369 movq	%rax, 32(%rdi)		# r[4] = lo(p)
    370 movq	%rdx, %r9		# cy = hi(p)
    371 decq	%r8
    372 jz	.L27
    373 
    374 movq	40(%rsi), %rax
    375 movq	40(%rdi), %r10
    376 mulq	%rcx			# p = a[5] * digit
    377 addq	%r10, %rax
    378 adcq	$0, %rdx		# p += r[5]
    379 addq	%r9, %rax
    380 adcq	$0, %rdx		# p += cy
    381 movq	%rax, 40(%rdi)		# r[5] = lo(p)
    382 movq	%rdx, %r9		# cy = hi(p)
    383 decq	%r8
    384 jz	.L27
    385 
    386 movq	48(%rsi), %rax
    387 movq	48(%rdi), %r10
    388 mulq	%rcx			# p = a[6] * digit
    389 addq	%r10, %rax
    390 adcq	$0, %rdx		# p += r[6]
    391 addq	%r9, %rax
    392 adcq	$0, %rdx		# p += cy
    393 movq	%rax, 48(%rdi)		# r[6] = lo(p)
    394 movq	%rdx, %r9		# cy = hi(p)
    395 decq	%r8
    396 jz	.L27
    397 
    398 
    399 .L27:
    400 movq	%r9, %rax
    401 ret
    402 
    403 #ifndef DARWIN
    404 .size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64
    405 
    406 # Magic indicating no need for an executable stack
    407 .section .note.GNU-stack, "", @progbits
    408 .previous
    409 #endif