arcfour-amd64-gas.s (2478B)
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

# ** ARCFOUR implementation optimized for AMD64.
# **
# ** The throughput achieved by this code is about 320 MBytes/sec, on
# ** a 1.8 GHz AMD Opteron (rev C0) processor.

# ----------------------------------------------------------------------
# ARCFOUR(key, len, in, out)
#
# Syntax: GNU as, AT&T operand order (src, dst).
#
# Generates len bytes of RC4 keystream from the state in *key, XORs
# them with the bytes at in, and stores the result at out.  The
# updated x and y indices are written back to *key before returning.
#
# In (argument registers, matching the System V AMD64 order):
#	%rdi = key
#	%rsi = len	(byte count)
#	%rdx = in
#	%rcx = out
# NOTE(review): the C prototype is not visible in this file; no return
# value is set up (%rax still holds the prefetched tx at ret), so the
# function is presumably declared void -- confirm against the caller.
#
# Layout of *key as accessed here:
#	key+0	x	read as 64 bits, only the low byte is written back
#	key+8	y	read as 64 bits, only the low byte is written back
#	key+16	d[]	state table, one entry every 8 bytes; entries are
#			read/written 32 bits at a time (the very first
#			load of tx is a 64-bit movq)
#
# Register roles inside the function:
#	%rbp = d (key->data)		%rsi = in	%rdi = out
#	%rcx = x   %rdx = y   %rax = tx (d[x], loaded one round ahead)
#	%rbx = scratch: in+len-8 during setup, then ty / val index
#	%r8  = keystream accumulator	%r9 = end marker (see below)
#	%r11 = per-block byte counter
#
# Clobbers: %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r11, flags.
# %rbx and %rbp are callee-saved and are pushed/popped here.
# Leaf function: makes no calls and uses no stack beyond the two pushes.
# ----------------------------------------------------------------------

	.text
	.align 16
	.globl ARCFOUR
	.type ARCFOUR,@function
ARCFOUR:
	pushq	%rbp			# save callee-saved registers
	pushq	%rbx
	movq	%rdi, %rbp		# key = ARG(key)
	movq	%rsi, %rbx		# rbx = ARG(len)
	movq	%rdx, %rsi		# in = ARG(in)
	movq	%rcx, %rdi		# out = ARG(out)
	movq	(%rbp), %rcx		# x = key->x
	# NOTE(review): y is used as a full 64-bit index below but only
	# %dl is ever updated; this assumes the upper bytes of key->y are
	# zero -- confirm against the key setup code.
	movq	8(%rbp), %rdx		# y = key->y
	addq	$16, %rbp		# d = key->data
	# x is kept one step ahead of the stored value so that tx can be
	# preloaded; .Lfinished undoes this with decq before storing.
	incq	%rcx			# x++
	andq	$255, %rcx		# x &= 0xff
	leaq	-8(%rbx,%rsi), %rbx	# rbx = in+len-8
	# keep the marker in %r9 because %rbx is reused as scratch (ty/val)
	# inside the loops
	movq	%rbx, %r9		# tmp = in+len-8
	movq	0(%rbp,%rcx,8), %rax	# tx = d[x]
	cmpq	%rsi, %rbx		# cmp in with in+len-8
	# signed compare: taken when fewer than 8 bytes are requested, so
	# the 8-byte loop is skipped and only the byte loop runs
	jl	.Lend			# jump if (in+len-8 < in)

# Main loop: one 8-byte block per iteration.  in/out are advanced first,
# so the current block is addressed as -8(%rsi) / -8(%rdi).
.Lstart:
	addq	$8, %rsi		# increment in
	addq	$8, %rdi		# increment out

	# generate the next 8 bytes of the rc4 stream into %r8
	# Each round drops one keystream byte into %r8b and rotates %r8
	# right by 8; after 8 rounds every byte of %r8 has been replaced
	# and the first byte generated sits in bits 0..7 (memory order).
	movq	$8, %r11		# byte counter
1:	addb	%al, %dl		# y += tx
	movl	0(%rbp,%rdx,8), %ebx	# ty = d[y]
	movl	%ebx, 0(%rbp,%rcx,8)	# d[x] = ty
	# 8-bit add: only %bl changes.  %rbx is used as a table index
	# below, and the movl above zeroed bits 32..63.
	# NOTE(review): bits 8..31 of the index come from the loaded ty;
	# assumes table entries never exceed 0xff -- confirm with key setup.
	addb	%al, %bl		# val = ty + tx
	movl	%eax, 0(%rbp,%rdx,8)	# d[y] = tx
	incb	%cl			# x++ (NEXT ROUND)
	movl	0(%rbp,%rcx,8), %eax	# tx = d[x] (NEXT ROUND)
	movb	0(%rbp,%rbx,8), %r8b	# val = d[val]
	decb	%r11b			# sets ZF for the jnz below
	rorq	$8, %r8			# (ror does not change ZF)
	jnz 1b

	# xor 8 bytes
	xorq	-8(%rsi), %r8
	# the compare is issued before the store; movq leaves the flags
	# untouched, so jle still sees the result of this cmpq
	cmpq	%r9, %rsi		# cmp in+len-8 with in
	movq	%r8, -8(%rdi)
	jle	.Lstart			# jump if (in <= in+len-8)

.Lend:
	addq	$8, %r9			# tmp = in+len

	# handle the last bytes, one by one
	# Same round as above, but each keystream byte is XORed and
	# stored immediately instead of being accumulated in %r8.
1:	cmpq	%rsi, %r9		# cmp in with in+len
	jle	.Lfinished		# jump if (in+len <= in)
	addb	%al, %dl		# y += tx
	movl	0(%rbp,%rdx,8), %ebx	# ty = d[y]
	movl	%ebx, 0(%rbp,%rcx,8)	# d[x] = ty
	addb	%al, %bl		# val = ty + tx
	movl	%eax, 0(%rbp,%rdx,8)	# d[y] = tx
	incb	%cl			# x++ (NEXT ROUND)
	movl	0(%rbp,%rcx,8), %eax	# tx = d[x] (NEXT ROUND)
	movb	0(%rbp,%rbx,8), %r8b	# val = d[val]
	xorb	(%rsi), %r8b		# xor 1 byte
	movb	%r8b, (%rdi)
	incq	%rsi			# in++
	incq	%rdi			# out++
	jmp	1b

.Lfinished:
	# undo the look-ahead increment of x; %rbp points at key->data,
	# so key->y is at -8(%rbp) and key->x at -16(%rbp).  Only the low
	# byte of each field is stored.
	decq	%rcx			# x--
	movb	%dl, -8(%rbp)		# key->y = y
	movb	%cl, -16(%rbp)		# key->x = x
	popq	%rbx			# restore callee-saved registers
	popq	%rbp
	ret
.L_ARCFOUR_end:
	.size ARCFOUR,.L_ARCFOUR_end-ARCFOUR

# Magic indicating no need for an executable stack
.section .note.GNU-stack,"",@progbits
.previous