arcfour-amd64-masm.asm (3882B)
; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.

; ** ARCFOUR implementation optimized for AMD64.
; **
; ** The throughput achieved by this code is about 320 MBytes/sec, on
; ** a 1.8 GHz AMD Opteron (rev C0) processor.

.CODE

;-----------------------------------------------------------------------
; extern void ARCFOUR(RC4Context *cx, unsigned long long inputLen,
;                     const unsigned char *input, unsigned char *output);
;
; Syntax: MASM (ml64).   ABI: Microsoft x64.
;
; In:     rcx = cx, rdx = inputLen, r8 = input, r9 = output
; Out:    inputLen bytes of input XORed with the key stream are written
;         to output; the low bytes of cx->x and cx->y are updated.
; Saved:  rbp, rbx, rsi, rdi (pushed in the prolog, popped before ret)
; Clobb:  rax, rcx, rdx, r8, r9, r11, flags
;
; Context layout as accessed by this routine:
;         [cx+0]   x     (read as a qword, written back as a byte)
;         [cx+8]   y     (read as a qword, written back as a byte)
;         [cx+16]  data  (indexed with an 8-byte stride)
;
; Register roles after setup:
;         rbp = d (cx->data)      rcx = x        rdx = y
;         rax = tx                rbx = ty, then the index ty+tx
;         rsi = in                rdi = out
;         r9  = in+len-8 (block loop) / in+len (tail loop)
;         r8  = 8 key-stream bytes being assembled
;         r11 = byte counter for the block loop
;
; x is kept one step ahead ("NEXT ROUND"): it is incremented and d[x] is
; loaded before the byte that uses it is produced, so it is decremented
; again before being stored back.
;
; The pointer comparisons are signed. For inputLen < 8 the value
; in+len-8 lies below in, and this must also hold when in is NULL and
; inputLen is 0, which a signed comparison provides.
;
; The procedure is declared FRAME and every push is described with
; .pushreg so that unwind data is emitted. The Microsoft x64 ABI requires
; this for any function that changes rsp or a nonvolatile register;
; without it a fault raised inside the loops could not be unwound
; through this frame.
;-----------------------------------------------------------------------

ARCFOUR PROC FRAME

        push    rbp
        .pushreg rbp
        push    rbx
        .pushreg rbx
        push    rsi
        .pushreg rsi
        push    rdi
        .pushreg rdi
        .endprolog

        mov     rbp, rcx                ; key = ARG(key)
        mov     rbx, rdx                ; rbx = ARG(len)
        mov     rsi, r8                 ; in = ARG(in)
        mov     rdi, r9                 ; out = ARG(out)
        mov     rcx, [rbp]              ; x = key->x
        mov     rdx, [rbp+8]            ; y = key->y
        add     rbp, 16                 ; d = key->data
        inc     rcx                     ; x++
        and     rcx, 0ffh               ; x &= 0xff
        lea     rbx, [rbx+rsi-8]        ; rbx = in+len-8
        mov     r9, rbx                 ; tmp = in+len-8
        mov     rax, [rbp+rcx*8]        ; tx = d[x]
        cmp     rbx, rsi                ; cmp in with in+len-8
        jl      Lend                    ; fewer than 8 bytes: tail loop only

        ;
        ; main loop: one 8-byte block per iteration
        ;

Lstart:
        add     rsi, 8                  ; increment in
        add     rdi, 8                  ; increment out

        ;
        ; generate the next 8 bytes of the rc4 stream into r8
        ;

        mov     r11, 8                  ; byte counter

@@:
        add     dl, al                  ; y += tx
        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
        mov     [rbp+rcx*8], ebx        ; d[x] = ty
        add     bl, al                  ; val = ty + tx
        mov     [rbp+rdx*8], eax        ; d[y] = tx
        inc     cl                      ; x++         (NEXT ROUND)
        mov     eax, [rbp+rcx*8]        ; tx = d[x]   (NEXT ROUND)
        mov     r8b, [rbp+rbx*8]        ; val = d[val]
        dec     r11b                    ; sets ZF for the jnz below
        ror     r8, 8                   ; (ror does not change ZF)
        jnz     @b

        ;
        ; xor 8 bytes
        ;

        xor     r8, [rsi-8]             ; rsi was already advanced past this block
        cmp     rsi, r9                 ; cmp in+len-8 with in
        mov     [rdi-8], r8             ; mov leaves the flags from cmp intact
        jle     Lstart                  ; another full block remains

Lend:
        add     r9, 8                   ; tmp = in+len

        ;
        ; handle the last bytes, one by one
        ;

@@:
        cmp     r9, rsi                 ; cmp in with in+len
        jle     Lfinished               ; jump if (in+len <= in)
        add     dl, al                  ; y += tx
        mov     ebx, [rbp+rdx*8]        ; ty = d[y]
        mov     [rbp+rcx*8], ebx        ; d[x] = ty
        add     bl, al                  ; val = ty + tx
        mov     [rbp+rdx*8], eax        ; d[y] = tx
        inc     cl                      ; x++         (NEXT ROUND)
        mov     eax, [rbp+rcx*8]        ; tx = d[x]   (NEXT ROUND)
        mov     r8b, [rbp+rbx*8]        ; val = d[val]
        xor     r8b, [rsi]              ; xor 1 byte
        mov     [rdi], r8b
        inc     rsi                     ; in++
        inc     rdi                     ; out++
        jmp     @b

Lfinished:
        dec     rcx                     ; x-- (undo the look-ahead increment)
        mov     [rbp-8], dl             ; key->y = y
        mov     [rbp-16], cl            ; key->x = x

        pop     rdi
        pop     rsi
        pop     rbx
        pop     rbp
        ret

ARCFOUR ENDP

END