tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sha512p8-ppc.pl (11516B)


      1 #!/usr/bin/env perl
      2 
      3 # ====================================================================
      4 # Written by Andy Polyakov, @dot-asm, initially for use in the OpenSSL
      5 # project. The module is dual licensed under OpenSSL and CRYPTOGAMS
      6 # licenses depending on where you obtain it. For further details see
      7 # https://github.com/dot-asm/cryptogams/.
      8 # ====================================================================
      9 
     10 # SHA256/512 for PowerISA v2.07.
     11 #
     12 # Accurate performance measurements are problematic, because it's
     13 # always a virtualized setup with a possibly throttled processor.
     14 # Relative comparison is therefore more informative. This module is
     15 # ~60% faster than integer-only sha512-ppc.pl. To anchor to something
     16 # else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
     17 # hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
     18 # sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
     19 # result is the degree of computational resources' utilization. POWER8
     20 # is a "massively multi-threaded chip" and the difference between single-
     21 # and maximum multi-process benchmark results tells that utilization is
     22 # a whopping 94%. For sha512-ppc.pl we get [not unimpressive] 84% and
     23 # for sha1-ppc.pl - 73%. 100% means that multi-process result equals
     24 # to single-process one, given that all threads end up on the same
     25 # physical core.
     26 #
     27 ######################################################################
     28 # Believed-to-be-accurate results in cycles per processed byte [on
     29 # little-endian system]. Numbers in square brackets are for 64-bit
     30 # build of sha512-ppc.pl, presented for reference.
     31 #
     32 #		POWER8		POWER9
     33 # SHA256	9.7 [15.8]	11.2 [12.5]
     34 # SHA512	6.1 [10.3]	7.0 [7.9]
     35 
     36 $flavour=shift;	# 1st argument: target flavour; tested below for /64/, /32/ and /le/
     37 $output =shift;	# 2nd argument: output name, handed to ppc-xlate.pl and later tested for /512/
     38 
     39 if ($flavour =~ /64/) {	# 64-bit flavour
     40 $SIZE_T=8;	# size in bytes used for GPR save slots and frame arithmetic
     41 $LRSAVE=2*$SIZE_T;	# link register is stored at $FRAME+$LRSAVE from the new stack pointer
     42 $STU="stdu";	# mnemonic used to allocate the stack frame
     43 $POP="ld";	# mnemonic used to reload saved GPRs
     44 $PUSH="std";	# mnemonic used to save GPRs
     45 } elsif ($flavour =~ /32/) {	# 32-bit flavour: same roles, word-sized mnemonics
     46 $SIZE_T=4;
     47 $LRSAVE=$SIZE_T;
     48 $STU="stwu";
     49 $POP="lwz";
     50 $PUSH="stw";
     51 } else { die "nonsense $flavour"; }	# flavour contains neither 64 nor 32
     52 
     53 $LENDIAN=($flavour=~/le/);	# true when the flavour contains "le"; enables the $lemask code paths
     54 
     55 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# directory part of this script's own path
     56 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or	# look next to this script first...
     57 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or	# ...then in ../../perlasm/
     58 die "can't locate ppc-xlate.pl";
     59 # Low-precedence "or" is required below: with "||" the die binds to the (always true) command string and can never fire.
     60 open STDOUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!";	# from here on everything printed is piped through ppc-xlate.pl
     61 
     62 if ($output =~ /512/) {	# output name containing "512" selects SHA-512, anything else SHA-256
     63 $bits=512;
     64 $SZ=8;	# size of one hash word in bytes
     65 $sz="d";	# suffix spliced into vaddu${sz}m / vshasigma${sz}
     66 $rounds=80;
     67 } else {
     68 $bits=256;
     69 $SZ=4;
     70 $sz="w";
     71 $rounds=64;
     72 }
     73 
     74 $func="sha${bits}_block_p8";	# name of the emitted global function
     75 $LOCALS=8*$SIZE_T+8*16;	# offset of the v24-v31 save area; the 8*16 bytes from 8*$SIZE_T+15 below it receive the offloaded $A-$H
     76 $FRAME=$LOCALS+9*16+6*$SIZE_T;	# total frame: vector saves and the vrsave word above $LOCALS, six GPR slots (r26-r31) at the top
     77 
     78 $sp ="r1";
     79 $toc="r2";
     80 $ctx="r3";	# base address the hash state is loaded from and stored back to
     81 $inp="r4";	# input pointer, advanced by 16 per vector load
     82 $num="r5";	# decremented once per pass of Loop; Loop repeats while non-zero
     83 $Tbl="r6";	# address of the constant table, set by LPICmeup
     84 $idx="r7";	# running pointer into the constant table (copy of $Tbl, bumped by 0x80)
     85 $lrsave="r8";	# holds the link register across the function
     86 $offload="r11";	# base of the stack area used to offload $A-$H, later of the v24-v31 save area
     87 $vrsave="r12";	# holds the caller's SPR 256 value
     88 @I = ($x00,$x10,$x20,$x30,$x40,$x50,$x60,$x70) = (0,map("r$_",(10,26..31)));	# index operands: literal 0 plus r10,r26-r31, which the prologue loads with 0x10..0x70
     89 
     90 @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("v$_",(0..7));	# working variables a..h live in v0-v7
     91 @X=map("v$_",(8..19,24..27));	# sixteen vectors holding the message schedule window
     92 ($Ki,$Func,$Sigma,$lemask)=map("v$_",(28..31));	# current K vector, two scratch vectors, and the little-endian permute mask
     93 
     94 sub ROUND {	# append the code for round $i to $code; $a..$h are the register names playing a..h in this round
     95 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
     96 my $j=($i+1)%16;	# @X slot that receives its schedule update in rounds 15 and later
     97 my $k=($i+2)%8;	# selects the @I index operand for the "load next K[i]" ending the round
     98 
     99 $code.=<<___		if ($i<15 && ($i%(16/$SZ))==(16/$SZ-1));	# last word of a 16-byte input vector: fetch the next 16 input bytes
    100 lvx_u		@X[$i+1],0,$inp		; load X[i] in advance
    101 addi		$inp,$inp,16
    102 ___
    103 $code.=<<___		if ($i<16 && ($i%(16/$SZ)));	# other words of an input vector: rotate the previous @X entry by $SZ bytes
    104 vsldoi		@X[$i],@X[$i-1],@X[$i-1],$SZ
    105 ___
    106 $code.=<<___		if ($LENDIAN && $i<16 && ($i%(16/$SZ))==0);	# little-endian only: permute a freshly loaded input vector through $lemask
    107 vperm		@X[$i],@X[$i],@X[$i],$lemask
    108 ___
    109 $code.=<<___		if ($i>=15);	# schedule update: X[j] += vshasigma(X[j+1];0,0) + vshasigma(X[j+14];0,15) + X[j+9] -- presumably sigma0/sigma1, confirm against the ISA
    110 vshasigma${sz}	$Sigma,@X[($j+1)%16],0,0
    111 vaddu${sz}m	@X[$j],@X[$j],$Sigma
    112 vshasigma${sz}	$Sigma,@X[($j+14)%16],0,15
    113 vaddu${sz}m	@X[$j],@X[$j],$Sigma
    114 vaddu${sz}m	@X[$j],@X[$j],@X[($j+9)%16]
    115 ___
    116 $code.=<<___;	# round body proper; K[i] was already added into this round's h by the previous round (or by the Loop head)
    117 vaddu${sz}m	$h,$h,@X[$i%16]		; h+=X[i]
    118 vsel		$Func,$g,$f,$e		; Ch(e,f,g)
    119 vaddu${sz}m	$g,$g,$Ki		; future h+=K[i]
    120 vaddu${sz}m	$h,$h,$Func		; h+=Ch(e,f,g)
    121 vshasigma${sz}	$Sigma,$e,1,15		; Sigma1(e)
    122 vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma1(e)
    123 vxor		$Func,$a,$b
    124 vsel		$Func,$b,$c,$Func	; Maj(a,b,c)
    125 vaddu${sz}m	$d,$d,$h		; d+=h
    126 vshasigma${sz}	$Sigma,$a,1,0		; Sigma0(a)
    127 vaddu${sz}m	$Sigma,$Sigma,$Func	; Sigma0(a)+Maj(a,b,c)
    128 vaddu${sz}m	$h,$h,$Sigma		; h+=Sigma0(a)+Maj(a,b,c)
    129 lvx		$Ki,@I[$k],$idx		; load next K[i]
    130 ___
    131 $code.=<<___		if ($k == 7);	# the 0x70 index operand was just used: advance $idx by eight 16-byte table rows
    132 addi		$idx,$idx,0x80
    133 ___
    134 }
    135 
    136 $code=<<___;	# prologue: allocate the frame, save v24-v31, vrsave, r26-r31 and LR, load 0x10..0x70 into the index registers, call LPICmeup for $Tbl
    137 .machine	"any"
    138 .text
    139 
    140 .globl	$func
    141 .align	6
    142 $func:
    143 $STU		$sp,-$FRAME($sp)
    144 mflr		$lrsave
    145 li		r10,`$LOCALS+15`
    146 li		r11,`$LOCALS+31`
    147 stvx		v24,r10,$sp		# ABI says so
    148 addi		r10,r10,32
    149 mfspr		$vrsave,256
    150 stvx		v25,r11,$sp
    151 addi		r11,r11,32
    152 stvx		v26,r10,$sp
    153 addi		r10,r10,32
    154 stvx		v27,r11,$sp
    155 addi		r11,r11,32
    156 stvx		v28,r10,$sp
    157 addi		r10,r10,32
    158 stvx		v29,r11,$sp
    159 addi		r11,r11,32
    160 stvx		v30,r10,$sp
    161 stvx		v31,r11,$sp
    162 li		r11,-4096+255		# 0xfffff0ff
    163 stw		$vrsave,`$FRAME-6*$SIZE_T-4`($sp)	# save vrsave
    164 li		$x10,0x10
    165 $PUSH		r26,`$FRAME-6*$SIZE_T`($sp)
    166 li		$x20,0x20
    167 $PUSH		r27,`$FRAME-5*$SIZE_T`($sp)
    168 li		$x30,0x30
    169 $PUSH		r28,`$FRAME-4*$SIZE_T`($sp)
    170 li		$x40,0x40
    171 $PUSH		r29,`$FRAME-3*$SIZE_T`($sp)
    172 li		$x50,0x50
    173 $PUSH		r30,`$FRAME-2*$SIZE_T`($sp)
    174 li		$x60,0x60
    175 $PUSH		r31,`$FRAME-1*$SIZE_T`($sp)
    176 li		$x70,0x70
    177 $PUSH		$lrsave,`$FRAME+$LRSAVE`($sp)
    178 mtspr		256,r11
    179 
    180 bl		LPICmeup
    181 addi		$offload,$sp,`8*$SIZE_T+15`
    182 ___
    183 $code.=<<___		if ($LENDIAN);	# little-endian only: build $lemask from an lvsl result xor-ed with a 0x0f byte splat
    184 li		$idx,8
    185 lvsl		$lemask,0,$idx
    186 vspltisb	$Ki,0x0f
    187 vxor		$lemask,$lemask,$Ki
    188 ___
    189 $code.=<<___		if ($SZ==4);	# SHA-256: load the state from $ctx as two vectors and spread the words over $A-$H
    190 lvx_4w		$A,$x00,$ctx
    191 lvx_4w		$E,$x10,$ctx
    192 vsldoi		$B,$A,$A,4		# unpack
    193 vsldoi		$C,$A,$A,8
    194 vsldoi		$D,$A,$A,12
    195 vsldoi		$F,$E,$E,4
    196 vsldoi		$G,$E,$E,8
    197 vsldoi		$H,$E,$E,12
    198 ___
    199 $code.=<<___		if ($SZ==8);	# SHA-512: load the state from $ctx as four vectors and spread the doublewords over $A-$H
    200 lvx_u		$A,$x00,$ctx
    201 lvx_u		$C,$x10,$ctx
    202 lvx_u		$E,$x20,$ctx
    203 vsldoi		$B,$A,$A,8		# unpack
    204 lvx_u		$G,$x30,$ctx
    205 vsldoi		$D,$C,$C,8
    206 vsldoi		$F,$E,$E,8
    207 vsldoi		$H,$G,$G,8
    208 ___
    209 $code.=<<___;	# per-block loop head: load the first K and input vectors, offload $A-$H to the stack, pre-add K[0] into $H
    210 li		r0,`($rounds-16)/16`	# inner loop counter
    211 b		Loop
    212 .align	5
    213 Loop:
    214 lvx		$Ki,$x00,$Tbl
    215 lvx_u		@X[0],0,$inp
    216 addi		$inp,$inp,16
    217 mr		$idx,$Tbl		# copy $Tbl
    218 stvx		$A,$x00,$offload	# offload $A-$H
    219 stvx		$B,$x10,$offload
    220 stvx		$C,$x20,$offload
    221 stvx		$D,$x30,$offload
    222 stvx		$E,$x40,$offload
    223 stvx		$F,$x50,$offload
    224 stvx		$G,$x60,$offload
    225 stvx		$H,$x70,$offload
    226 vaddu${sz}m	$H,$H,$Ki		# h+K[i]
    227 lvx		$Ki,$x10,$Tbl
    228 ___
    229 for ($i=0;$i<16;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }	# rounds 0-15, emitted once; rotating @V renames a..h instead of moving data
    230 $code.=<<___;	# load CTR with the inner loop count kept in r0
    231 mtctr		r0
    232 b		L16_xx
    233 .align	5
    234 L16_xx:
    235 ___
    236 for (;$i<32;$i++)	{ &ROUND($i,@V); unshift(@V,pop(@V)); }	# body of L16_xx: 16 rounds with schedule update, executed ($rounds-16)/16 times via bdnz
    237 $code.=<<___;	# close the inner loop, add the offloaded $A-$H back into the working variables, repeat Loop while $num is non-zero after the decrement
    238 bdnz		L16_xx
    239 
    240 lvx		@X[2],$x00,$offload
    241 subic.		$num,$num,1
    242 lvx		@X[3],$x10,$offload
    243 vaddu${sz}m	$A,$A,@X[2]
    244 lvx		@X[4],$x20,$offload
    245 vaddu${sz}m	$B,$B,@X[3]
    246 lvx		@X[5],$x30,$offload
    247 vaddu${sz}m	$C,$C,@X[4]
    248 lvx		@X[6],$x40,$offload
    249 vaddu${sz}m	$D,$D,@X[5]
    250 lvx		@X[7],$x50,$offload
    251 vaddu${sz}m	$E,$E,@X[6]
    252 lvx		@X[8],$x60,$offload
    253 vaddu${sz}m	$F,$F,@X[7]
    254 lvx		@X[9],$x70,$offload
    255 vaddu${sz}m	$G,$G,@X[8]
    256 vaddu${sz}m	$H,$H,@X[9]
    257 bne		Loop
    258 ___
    259 $code.=<<___		if ($SZ==4);	# SHA-256: merge $A-$D into $A and $E-$H into $E with three vperm steps, then store both to $ctx
    260 lvx		@X[0],$x20,$idx
    261 vperm		$A,$A,$B,$Ki		# pack the answer
    262 lvx		@X[1],$x30,$idx
    263 vperm		$E,$E,$F,$Ki
    264 vperm		$A,$A,$C,@X[0]
    265 vperm		$E,$E,$G,@X[0]
    266 vperm		$A,$A,$D,@X[1]
    267 vperm		$E,$E,$H,@X[1]
    268 stvx_4w		$A,$x00,$ctx
    269 stvx_4w		$E,$x10,$ctx
    270 ___
    271 $code.=<<___		if ($SZ==8);	# SHA-512: merge register pairs with one vperm each, then store the four vectors to $ctx
    272 vperm		$A,$A,$B,$Ki		# pack the answer
    273 vperm		$C,$C,$D,$Ki
    274 vperm		$E,$E,$F,$Ki
    275 vperm		$G,$G,$H,$Ki
    276 stvx_u		$A,$x00,$ctx
    277 stvx_u		$C,$x10,$ctx
    278 stvx_u		$E,$x20,$ctx
    279 stvx_u		$G,$x30,$ctx
    280 ___
    281 $code.=<<___;	# epilogue: restore LR, vrsave, v24-v31 and r26-r31 from the slots the prologue filled, release the frame and return
    282 addi		$offload,$sp,`$LOCALS+15`
    283 mtlr		$lrsave
    284 mtspr		256,$vrsave
    285 lvx		v24,$x00,$offload	# ABI says so
    286 lvx		v25,$x10,$offload
    287 lvx		v26,$x20,$offload
    288 lvx		v27,$x30,$offload
    289 lvx		v28,$x40,$offload
    290 lvx		v29,$x50,$offload
    291 lvx		v30,$x60,$offload
    292 lvx		v31,$x70,$offload
    293 $POP		r26,`$FRAME-6*$SIZE_T`($sp)
    294 $POP		r27,`$FRAME-5*$SIZE_T`($sp)
    295 $POP		r28,`$FRAME-4*$SIZE_T`($sp)
    296 $POP		r29,`$FRAME-3*$SIZE_T`($sp)
    297 $POP		r30,`$FRAME-2*$SIZE_T`($sp)
    298 $POP		r31,`$FRAME-1*$SIZE_T`($sp)
    299 addi		$sp,$sp,$FRAME
    300 blr
    301 .long		0
    302 .byte		0,12,4,1,0x80,6,3,0
    303 .long		0
    304 .size	$func,.-$func
    305 ___
    306 
    307 # Ugly hack here, because PPC assembler syntax seems to vary too
    308 # much from platform to platform...
    309 $code.=<<___;	# LPICmeup: bcl/mflr yields the address of the second mflr (LPICmeup+8); adding 64-8 and padding the routine to 64 bytes makes $Tbl point at the table emitted right after it
    310 .align	6
    311 LPICmeup:
    312 mflr	r0
    313 bcl	20,31,\$+4
    314 mflr	$Tbl	; vvvvvv "distance" between . and 1st data entry
    315 addi	$Tbl,$Tbl,`64-8`
    316 mtlr	r0
    317 blr
    318 .long	0
    319 .byte	0,12,0x14,0,0,0,0,0
    320 .space	`64-9*4`
    321 ___
    322 
    323 if ($SZ==8) {	# SHA-512 round constants
    324    local *table = sub {	# temporary sub "table": emits one 16-byte row per argument, the value duplicated in both doublewords
    325 foreach(@_) { $code.=".quad	$_,$_\n"; }
    326    };
    327    table(
    328 "0x428a2f98d728ae22","0x7137449123ef65cd",
    329 "0xb5c0fbcfec4d3b2f","0xe9b5dba58189dbbc",
    330 "0x3956c25bf348b538","0x59f111f1b605d019",
    331 "0x923f82a4af194f9b","0xab1c5ed5da6d8118",
    332 "0xd807aa98a3030242","0x12835b0145706fbe",
    333 "0x243185be4ee4b28c","0x550c7dc3d5ffb4e2",
    334 "0x72be5d74f27b896f","0x80deb1fe3b1696b1",
    335 "0x9bdc06a725c71235","0xc19bf174cf692694",
    336 "0xe49b69c19ef14ad2","0xefbe4786384f25e3",
    337 "0x0fc19dc68b8cd5b5","0x240ca1cc77ac9c65",
    338 "0x2de92c6f592b0275","0x4a7484aa6ea6e483",
    339 "0x5cb0a9dcbd41fbd4","0x76f988da831153b5",
    340 "0x983e5152ee66dfab","0xa831c66d2db43210",
    341 "0xb00327c898fb213f","0xbf597fc7beef0ee4",
    342 "0xc6e00bf33da88fc2","0xd5a79147930aa725",
    343 "0x06ca6351e003826f","0x142929670a0e6e70",
    344 "0x27b70a8546d22ffc","0x2e1b21385c26c926",
    345 "0x4d2c6dfc5ac42aed","0x53380d139d95b3df",
    346 "0x650a73548baf63de","0x766a0abb3c77b2a8",
    347 "0x81c2c92e47edaee6","0x92722c851482353b",
    348 "0xa2bfe8a14cf10364","0xa81a664bbc423001",
    349 "0xc24b8b70d0f89791","0xc76c51a30654be30",
    350 "0xd192e819d6ef5218","0xd69906245565a910",
    351 "0xf40e35855771202a","0x106aa07032bbd1b8",
    352 "0x19a4c116b8d2d0c8","0x1e376c085141ab53",
    353 "0x2748774cdf8eeb99","0x34b0bcb5e19b48a8",
    354 "0x391c0cb3c5c95a63","0x4ed8aa4ae3418acb",
    355 "0x5b9cca4f7763e373","0x682e6ff3d6b2b8a3",
    356 "0x748f82ee5defb2fc","0x78a5636f43172f60",
    357 "0x84c87814a1f0ab72","0x8cc702081a6439ec",
    358 "0x90befffa23631e28","0xa4506cebde82bde9",
    359 "0xbef9a3f7b2c67915","0xc67178f2e372532b",
    360 "0xca273eceea26619c","0xd186b8c721c0c207",
    361 "0xeada7dd6cde0eb1e","0xf57d4f7fee6ed178",
    362 "0x06f067aa72176fba","0x0a637dc5a2c898a6",
    363 "0x113f9804bef90dae","0x1b710b35131c471b",
    364 "0x28db77f523047d84","0x32caab7b40c72493",
    365 "0x3c9ebe0a15c9bebc","0x431d67c49c100d4c",
    366 "0x4cc5d4becb3e42b6","0x597f299cfc657e2a",
    367 "0x5fcb6fab3ad6faec","0x6c44198c4a475817","0");
    368 $code.=<<___	if (!$LENDIAN);	# big-endian row following the all-zero row; presumably the vperm control used to pack the answer -- confirm
    369 .quad	0x0001020304050607,0x1011121314151617
    370 ___
    371 $code.=<<___	if ($LENDIAN);	# quad-swapped
    372 .quad	0x1011121314151617,0x0001020304050607
    373 ___
    374 } else {	# SHA-256 round constants
    375    local *table = sub {	# temporary sub "table": emits one 16-byte row per argument, the value repeated in all four words
    376 foreach(@_) { $code.=".long	$_,$_,$_,$_\n"; }
    377    };
    378    table(
    379 "0x428a2f98","0x71374491","0xb5c0fbcf","0xe9b5dba5",
    380 "0x3956c25b","0x59f111f1","0x923f82a4","0xab1c5ed5",
    381 "0xd807aa98","0x12835b01","0x243185be","0x550c7dc3",
    382 "0x72be5d74","0x80deb1fe","0x9bdc06a7","0xc19bf174",
    383 "0xe49b69c1","0xefbe4786","0x0fc19dc6","0x240ca1cc",
    384 "0x2de92c6f","0x4a7484aa","0x5cb0a9dc","0x76f988da",
    385 "0x983e5152","0xa831c66d","0xb00327c8","0xbf597fc7",
    386 "0xc6e00bf3","0xd5a79147","0x06ca6351","0x14292967",
    387 "0x27b70a85","0x2e1b2138","0x4d2c6dfc","0x53380d13",
    388 "0x650a7354","0x766a0abb","0x81c2c92e","0x92722c85",
    389 "0xa2bfe8a1","0xa81a664b","0xc24b8b70","0xc76c51a3",
    390 "0xd192e819","0xd6990624","0xf40e3585","0x106aa070",
    391 "0x19a4c116","0x1e376c08","0x2748774c","0x34b0bcb5",
    392 "0x391c0cb3","0x4ed8aa4a","0x5b9cca4f","0x682e6ff3",
    393 "0x748f82ee","0x78a5636f","0x84c87814","0x8cc70208",
    394 "0x90befffa","0xa4506ceb","0xbef9a3f7","0xc67178f2","0");
    395 $code.=<<___	if (!$LENDIAN);	# big-endian rows following the all-zero row; presumably the three vperm controls used to pack the answer -- confirm
    396 .long	0x00010203,0x10111213,0x10111213,0x10111213
    397 .long	0x00010203,0x04050607,0x10111213,0x10111213
    398 .long	0x00010203,0x04050607,0x08090a0b,0x10111213
    399 ___
    400 $code.=<<___	if ($LENDIAN);	# word-swapped
    401 .long	0x10111213,0x10111213,0x10111213,0x00010203
    402 .long	0x10111213,0x10111213,0x04050607,0x00010203
    403 .long	0x10111213,0x08090a0b,0x04050607,0x00010203
    404 ___
    405 }
    406 $code.=<<___;	# identification string and final alignment
    407 .asciz	"SHA${bits} for PowerISA 2.07, CRYPTOGAMS by <appro\@openssl.org>"
    408 .align	2
    409 ___
    410 
    411 $code =~ s/\`([^\`]*)\`/eval $1/gem;	# replace every `expr` in the template with the value of evaluating expr as Perl
    412 print $code;	# STDOUT is the pipe into ppc-xlate.pl at this point
    413 close STDOUT or die "error closing STDOUT: $!";	# closing a piped handle reports write errors and a failing child; do not ignore them