sha512p8-ppc.pl (11516B)
#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
# project. The module is dual licensed under OpenSSL and CRYPTOGAMS
# licenses depending on where you obtain it. For further details see
# https://github.com/dot-asm/cryptogams/.
# ====================================================================

# SHA256/512 for PowerISA v2.07.
#
# Accurate performance measurements are problematic, because it's
# always a virtualized setup with a possibly throttled processor.
# Relative comparison is therefore more informative. This module is
# ~60% faster than integer-only sha512-ppc.pl. To anchor to something
# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
# sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
# result is degree of computational resources' utilization. POWER8 is
# "massively multi-threaded chip" and difference between single- and
# maximum multi-process benchmark results tells that utilization is
# a whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
# for sha1-ppc.pl - 73%. 100% means that multi-process result equals
# to single-process one, given that all threads end up on the same
# physical core.
#
######################################################################
# Believed-to-be-accurate results in cycles per processed byte [on
# little-endian system]. Numbers in square brackets are for 64-bit
# build of sha512-ppc.pl, presented for reference.
#
#		POWER8		POWER9
# SHA256	9.7 [15.8]	11.2 [12.5]
# SHA512	6.1 [10.3]	7.0 [7.9]

# Usage: perl <this script> <flavour> <output>
#
# <flavour> must contain "64" or "32" (ABI word size); if it contains
# "le" the little-endian code path is generated. <output> is handed to
# ppc-xlate.pl and also selects the algorithm: a name containing "512"
# produces SHA-512, anything else SHA-256.
$flavour=shift;
$output =shift;

# Word-size dependent stack-frame parameters and load/store mnemonics.
if ($flavour =~ /64/) {
	$SIZE_T=8;
	$LRSAVE=2*$SIZE_T;
	$STU="stdu";
	$POP="ld";
	$PUSH="std";
} elsif ($flavour =~ /32/) {
	$SIZE_T=4;
	$LRSAVE=$SIZE_T;
	$STU="stwu";
	$POP="lwz";
	$PUSH="stw";
} else { die "nonsense $flavour"; }

$LENDIAN=($flavour=~/le/);

# Look for ppc-xlate.pl next to this script first, then in
# ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Everything printed from here on is piped through ppc-xlate.pl.
# Low-precedence "or" is essential: with "||" the die would be attached
# to the (always true) command string and a failed open would go
# unnoticed.
open STDOUT,"| $^X $xlate $flavour $output"
    or die "can't call $xlate: $!";

# Algorithm parameters: element size in bytes ($SZ), the matching
# instruction suffix ($sz) and the number of rounds.
if ($output =~ /512/) {
	$bits=512;
	$SZ=8;
	$sz="d";
	$rounds=80;
} else {
	$bits=256;
	$SZ=4;
	$sz="w";
	$rounds=64;
}

$func="sha${bits}_block_p8";
# The prologue below uses the frame as follows: the area starting at
# 8*$SIZE_T(+15) receives the off-loaded $A-$H, v24-v31 are saved from
# $LOCALS(+15) upwards, and vrsave plus r26-r31 occupy the top
# 6*$SIZE_T+4 bytes.
$LOCALS=8*$SIZE_T+8*16;
$FRAME=$LOCALS+9*16+6*$SIZE_T;

$sp   ="r1";
$toc  ="r2";
$ctx  ="r3";
$inp  ="r4";
$num  ="r5";
$Tbl  ="r6";
$idx  ="r7";
$lrsave="r8";
$offload="r11";
$vrsave="r12";
# Index registers: $x00 is the literal 0, $x10..$x70 are GPRs that the
# prologue loads with the constants 0x10..0x70.
@I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70) = (0,map("r$_",(10,26..31)));

# Working variables, message-schedule window and scratch vectors.
@V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));
@X=map("v$_",(8..19,24..27));
($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));

# ROUND($i,$a,$b,$c,$d,$e,$f,$g,$h)
#
# Appends the instructions for round $i to $code. $a..$h are the names
# of the vector registers playing the roles a..h in this round; the
# caller rotates @V between invocations. Returns nothing of interest.
sub ROUND {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
my $j=($i+1)%16;	# schedule slot that is updated when $i>=15
my $k=($i+2)%8;		# which of the @I offsets addresses the next K[i]

# A 16-byte vector holds 16/$SZ input elements. On the last element of
# a vector, fetch the next 16 input bytes ahead of time.
$code.=<<___	if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));
	lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
	addi		$inp,$inp,16
___
# Other elements of a loaded vector are derived from the previous
# X register by rotating it $SZ bytes.
$code.=<<___	if ($i<16 && ($i%(16/$SZ)));
	vsldoi		@X[$i],@X[$i-1],@X[$i-1],$SZ
___
# Little-endian only: run each freshly loaded vector through $lemask.
$code.=<<___	if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);
	vperm		@X[$i],@X[$i],@X[$i],$lemask
___
# Message-schedule update of X[$j], needed from round 15 onwards.
$code.=<<___	if ($i>=15);
	vshasigma${sz}	$Sigma,@X[($j+1)%16],0,0
	vaddu${sz}m	@X[$j],@X[$j],$Sigma
	vshasigma${sz}	$Sigma,@X[($j+14)%16],0,15
	vaddu${sz}m	@X[$j],@X[$j],$Sigma
	vaddu${sz}m	@X[$j],@X[$j],@X[($j+9)%16]
___
# The round proper. K[i] for the following round is pre-added to $g,
# which becomes that round's h after the caller rotates @V.
$code.=<<___;
	vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
	vsel		$Func,$g,$f,$e		; Ch(e,f,g)
	vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
	vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
	vshasigma${sz}	$Sigma,$e,1,15		; Sigma1(e)
	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma1(e)
	vxor		$Func,$a,$b
	vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
	vaddu${sz}m	$d,$d,$h		; d+=h
	vshasigma${sz}	$Sigma,$a,1,0		; Sigma0(a)
	vaddu${sz}m	$Sigma,$Sigma,$Func	; Sigma0(a)+Maj(a,b,c)
	vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma0(a)+Maj(a,b,c)
	lvx		$Ki,@I[$k],$idx		; load next K[i]
___
# After the 0x70 offset has been used, advance the table pointer by
# eight 16-byte entries.
$code.=<<___	if ($k == 7);
	addi		$idx,$idx,0x80
___
}

# Prologue: allocate the frame, save v24-v31, vrsave, r26-r31 and LR,
# load the 0x10..0x70 index constants and obtain the table address.
$code=<<___;
.machine	"any"
.text

.globl	$func
.align	6
$func:
	$STU		$sp,-$FRAME($sp)
	mflr		$lrsave
	li		r10,`$LOCALS+15`
	li		r11,`$LOCALS+31`
	stvx		v24,r10,$sp		# ABI says so
	addi		r10,r10,32
	mfspr		$vrsave,256
	stvx		v25,r11,$sp
	addi		r11,r11,32
	stvx		v26,r10,$sp
	addi		r10,r10,32
	stvx		v27,r11,$sp
	addi		r11,r11,32
	stvx		v28,r10,$sp
	addi		r10,r10,32
	stvx		v29,r11,$sp
	addi		r11,r11,32
	stvx		v30,r10,$sp
	stvx		v31,r11,$sp
	li		r11,-4096+255		# 0xfffff0ff
	stw		$vrsave,`$FRAME-6*$SIZE_T-4`($sp)	# save vrsave
	li		$x10,0x10
	$PUSH		r26,`$FRAME-6*$SIZE_T`($sp)
	li		$x20,0x20
	$PUSH		r27,`$FRAME-5*$SIZE_T`($sp)
	li		$x30,0x30
	$PUSH		r28,`$FRAME-4*$SIZE_T`($sp)
	li		$x40,0x40
	$PUSH		r29,`$FRAME-3*$SIZE_T`($sp)
	li		$x50,0x50
	$PUSH		r30,`$FRAME-2*$SIZE_T`($sp)
	li		$x60,0x60
	$PUSH		r31,`$FRAME-1*$SIZE_T`($sp)
	li		$x70,0x70
	$PUSH		$lrsave,`$FRAME+$LRSAVE`($sp)
	mtspr		256,r11

	bl		LPICmeup
	addi		$offload,$sp,`8*$SIZE_T+15`
___
# Little-endian only: build $lemask as lvsl(8) xor 0x0f in every byte.
$code.=<<___		if ($LENDIAN);
	li		$idx,8
	lvsl		$lemask,0,$idx
	vspltisb	$Ki,0x0f
	vxor		$lemask,$lemask,$Ki
___
# Load the hash state from $ctx and spread it over $A-$H.
$code.=<<___		if ($SZ==4);
	lvx_4w		$A,$x00,$ctx
	lvx_4w		$E,$x10,$ctx
	vsldoi		$B,$A,$A,4		# unpack
	vsldoi		$C,$A,$A,8
	vsldoi		$D,$A,$A,12
	vsldoi		$F,$E,$E,4
	vsldoi		$G,$E,$E,8
	vsldoi		$H,$E,$E,12
___
$code.=<<___		if ($SZ==8);
	lvx_u		$A,$x00,$ctx
	lvx_u		$C,$x10,$ctx
	lvx_u		$E,$x20,$ctx
	vsldoi		$B,$A,$A,8		# unpack
	lvx_u		$G,$x30,$ctx
	vsldoi		$D,$C,$C,8
	vsldoi		$F,$E,$E,8
	vsldoi		$H,$G,$G,8
___
# Per-block loop head: keep a copy of $A-$H in the offload area so it
# can be added back once all rounds are done.
$code.=<<___;
	li		r0,`($rounds-16)/16`	# inner loop counter
	b		Loop
.align	5
Loop:
	lvx		$Ki,$x00,$Tbl
	lvx_u		@X[0],0,$inp
	addi		$inp,$inp,16
	mr		$idx,$Tbl		# copy $Tbl
	stvx		$A,$x00,$offload	# offload $A-$H
	stvx		$B,$x10,$offload
	stvx		$C,$x20,$offload
	stvx		$D,$x30,$offload
	stvx		$E,$x40,$offload
	stvx		$F,$x50,$offload
	stvx		$G,$x60,$offload
	stvx		$H,$x70,$offload
	vaddu${sz}m	$H,$H,$Ki		# h+K[i]
	lvx		$Ki,$x10,$Tbl
___
# Rounds 0..15 are emitted once; @V is rotated after every round.
for ($i=0;$i<16;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	mtctr		r0
	b		L16_xx
.align	5
L16_xx:
___
# Body for the remaining rounds, 16 per iteration of the counted loop.
for (;$i<32;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }
# Add the off-loaded state back and loop while blocks remain.
$code.=<<___;
	bdnz		L16_xx

	lvx		@X[2],$x00,$offload
	subic.		$num,$num,1
	lvx		@X[3],$x10,$offload
	vaddu${sz}m	$A,$A,@X[2]
	lvx		@X[4],$x20,$offload
	vaddu${sz}m	$B,$B,@X[3]
	lvx		@X[5],$x30,$offload
	vaddu${sz}m	$C,$C,@X[4]
	lvx		@X[6],$x40,$offload
	vaddu${sz}m	$D,$D,@X[5]
	lvx		@X[7],$x50,$offload
	vaddu${sz}m	$E,$E,@X[6]
	lvx		@X[8],$x60,$offload
	vaddu${sz}m	$F,$F,@X[7]
	lvx		@X[9],$x70,$offload
	vaddu${sz}m	$G,$G,@X[8]
	vaddu${sz}m	$H,$H,@X[9]
	bne		Loop
___
# Pack $A-$H back into memory layout and store the state to $ctx.
$code.=<<___		if ($SZ==4);
	lvx		@X[0],$x20,$idx
	vperm		$A,$A,$B,$Ki		# pack the answer
	lvx		@X[1],$x30,$idx
	vperm		$E,$E,$F,$Ki
	vperm		$A,$A,$C,@X[0]
	vperm		$E,$E,$G,@X[0]
	vperm		$A,$A,$D,@X[1]
	vperm		$E,$E,$H,@X[1]
	stvx_4w		$A,$x00,$ctx
	stvx_4w		$E,$x10,$ctx
___
$code.=<<___		if ($SZ==8);
	vperm		$A,$A,$B,$Ki		# pack the answer
	vperm		$C,$C,$D,$Ki
	vperm		$E,$E,$F,$Ki
	vperm		$G,$G,$H,$Ki
	stvx_u		$A,$x00,$ctx
	stvx_u		$C,$x10,$ctx
	stvx_u		$E,$x20,$ctx
	stvx_u		$G,$x30,$ctx
___
# Epilogue: restore LR, vrsave, v24-v31 and r26-r31, release the frame.
$code.=<<___;
	addi		$offload,$sp,`$LOCALS+15`
	mtlr		$lrsave
	mtspr		256,$vrsave
	lvx		v24,$x00,$offload	# ABI says so
	lvx		v25,$x10,$offload
	lvx		v26,$x20,$offload
	lvx		v27,$x30,$offload
	lvx		v28,$x40,$offload
	lvx		v29,$x50,$offload
	lvx		v30,$x60,$offload
	lvx		v31,$x70,$offload
	$POP		r26,`$FRAME-6*$SIZE_T`($sp)
	$POP		r27,`$FRAME-5*$SIZE_T`($sp)
	$POP		r28,`$FRAME-4*$SIZE_T`($sp)
	$POP		r29,`$FRAME-3*$SIZE_T`($sp)
	$POP		r30,`$FRAME-2*$SIZE_T`($sp)
	$POP		r31,`$FRAME-1*$SIZE_T`($sp)
	addi		$sp,$sp,$FRAME
	blr
	.long		0
	.byte		0,12,4,1,0x80,6,3,0
	.long		0
.size	$func,.-$func
___

# Ugly hack here, because PPC assembler syntax seems to vary too
# much from platform to platform...
#
# LPICmeup returns the address of the constant table in $Tbl. The
# routine is 64-byte aligned and padded to 64 bytes (six instructions,
# a 12-byte trailer, then .space), so the table starts 64-8 bytes past
# the address captured by the second mflr.
$code.=<<___;
.align	6
LPICmeup:
	mflr	r0
	bcl	20,31,\$+4
	mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
	addi	$Tbl,$Tbl,`64-8`
	mtlr	r0
	blr
	.long	0
	.byte	0,12,0x14,0,0,0,0,0
	.space	`64-9*4`
___

# Round constants. Every constant is replicated to fill a whole 16-byte
# vector (twice for 64-bit elements, four times for 32-bit ones). The
# list ends with an all-zero entry, followed by the permutation
# mask(s) loaded into the "pack the answer" vperm sequence.
if ($SZ==8) {
    local *table = sub {
	foreach(@_) { $code.=".quad	$_,$_\n"; }
    };
    table(
	"0x428a2f98d728ae22","0x7137449123ef65cd",
	"0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
	"0x3956c25bf348b538","0x59f111f1b605d019",
	"0x923f82a4af194f9b","0xab1c5ed5da6d8118",
	"0xd807aa98a3030242","0x12835b0145706fbe",
	"0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
	"0x72be5d74f27b896f","0x80deb1fe3b1696b1",
	"0x9bdc06a725c71235","0xc19bf174cf692694",
	"0xe49b69c19ef14ad2","0xefbe4786384f25e3",
	"0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
	"0x2de92c6f592b0275","0x4a7484aa6ea6e483",
	"0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
	"0x983e5152ee66dfab","0xa831c66d2db43210",
	"0xb00327c898fb213f","0xbf597fc7beef0ee4",
	"0xc6e00bf33da88fc2","0xd5a79147930aa725",
	"0x06ca6351e003826f","0x142929670a0e6e70",
	"0x27b70a8546d22ffc","0x2e1b21385c26c926",
	"0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
	"0x650a73548baf63de","0x766a0abb3c77b2a8",
	"0x81c2c92e47edaee6","0x92722c851482353b",
	"0xa2bfe8a14cf10364","0xa81a664bbc423001",
	"0xc24b8b70d0f89791","0xc76c51a30654be30",
	"0xd192e819d6ef5218","0xd69906245565a910",
	"0xf40e35855771202a","0x106aa07032bbd1b8",
	"0x19a4c116b8d2d0c8","0x1e376c085141ab53",
	"0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
	"0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
	"0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
	"0x748f82ee5defb2fc","0x78a5636f43172f60",
	"0x84c87814a1f0ab72","0x8cc702081a6439ec",
	"0x90befffa23631e28","0xa4506cebde82bde9",
	"0xbef9a3f7b2c67915","0xc67178f2e372532b",
	"0xca273eceea26619c","0xd186b8c721c0c207",
	"0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
	"0x06f067aa72176fba","0x0a637dc5a2c898a6",
	"0x113f9804bef90dae","0x1b710b35131c471b",
	"0x28db77f523047d84","0x32caab7b40c72493",
	"0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
	"0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
	"0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
$code.=<<___	if (!$LENDIAN);
.quad	0x0001020304050607,0x1011121314151617
___
$code.=<<___	if ($LENDIAN);	# quad-swapped
.quad	0x1011121314151617,0x0001020304050607
___
} else {
    local *table = sub {
	foreach(@_) { $code.=".long	$_,$_,$_,$_\n"; }
    };
    table(
	"0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
	"0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
	"0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
	"0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
	"0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
	"0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
	"0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
	"0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
	"0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
	"0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
	"0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
	"0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
	"0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
	"0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
	"0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
	"0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
$code.=<<___	if (!$LENDIAN);
.long	0x00010203,0x10111213,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x10111213,0x10111213
.long	0x00010203,0x04050607,0x08090a0b,0x10111213
___
$code.=<<___	if ($LENDIAN);	# word-swapped
.long	0x10111213,0x10111213,0x10111213,0x00010203
.long	0x10111213,0x10111213,0x04050607,0x00010203
.long	0x10111213,0x08090a0b,0x04050607,0x00010203
___
}
$code.=<<___;
.asciz	"SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
___

# Evaluate the `...` arithmetic embedded in the templates, then emit.
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
# Closing a piped handle waits for the child; it fails when a write
# error occurred or when ppc-xlate.pl exited non-zero (in which case
# $! is 0 and the wait status is in $?). Do not let that pass silently.
close STDOUT
    or die( $! ? "error closing pipe to $xlate: $!"
               : "$xlate exited with wait status $?" );