tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ppc-gcm.s (23184B)


      1 # This submission to NSS is to be made available under the terms of the
      2 # Mozilla Public License, v. 2.0. You can obtain one at http://mozilla.org/MPL/2.0/
      3 # Copyright(c) 2021, Niels Möller and Mamone Tarsha
      4 
      5 # Registers:
      6 
      7 .set SP, 1
      8 .set TOCP, 2
      9 
     10 .macro VEC_LOAD_DATA   VR, DATA, GPR
     11    addis        \GPR, 2, \DATA@got@ha
     12    ld           \GPR, \DATA@got@l(\GPR)
     13    lvx          \VR, 0, \GPR
     14 .endm
     15 
     16 .macro VEC_LOAD   VR, GPR, IDX
     17    lxvd2x       \VR+32, \IDX, \GPR
     18    vperm        \VR, \VR, \VR, SWAP_MASK
     19 .endm
     20 
     21 .macro VEC_LOAD_INC   VR, GPR, IDX
     22    lxvd2x       \VR+32, \IDX, \GPR
     23    addi         \IDX,\IDX,16
     24    vperm        \VR, \VR, \VR, SWAP_MASK
     25 .endm
     26 
     27 .macro VEC_STORE   VR, GPR, IDX
     28    vperm        \VR, \VR, \VR, SWAP_MASK
     29    stxvd2x      \VR+32, \IDX, \GPR
     30 .endm
     31 
     32 # 0 < LEN < 16, pad the remaining bytes with zeros
     33 .macro LOAD_LEN  DATA, LEN, VAL1, VAL0, TMP0, TMP1, TMP2
     34    li           \TMP0, 0
     35    li           \VAL1, 0
     36    li           \VAL0, 0
     37    andi.        \TMP1, \LEN, 8
     38    beq          1f
     39    ldbrx        \VAL1, 0, \DATA
     40    li           \TMP0, 8
     41 1:
     42    andi.        \TMP1, \LEN, 7
     43    beq          3f
     44    li           \TMP1, 56
     45 2:
     46    lbzx         \TMP2, \TMP0, \DATA
     47    sld          \TMP2, \TMP2, \TMP1
     48    subi         \TMP1, \TMP1, 8
     49    or           \VAL0, \VAL0, \TMP2
     50    addi         \TMP0, \TMP0, 1
     51    cmpld        \TMP0, \LEN
     52    bne          2b
     53    andi.        \TMP1, \LEN, 8
     54    bne          3f
     55    mr           \VAL1, \VAL0
     56    li           \VAL0, 0
     57 3:
     58 .endm
     59 
     60 # 0 < LEN < 16
     61 .macro STORE_LEN DATA, LEN, VAL1, VAL0, TMP0, TMP1, TMP2
     62    andi.        \TMP1, \LEN, 8
     63    beq          1f
     64    stdbrx       \VAL1, 0, \DATA
     65    li           \TMP0, 8
     66    b            2f
     67 1:
     68    li           \TMP0, 0
     69    mr           \VAL0, \VAL1
     70 2:
     71    andi.        \TMP1, \LEN, 7
     72    beq          4f
     73    li           \TMP1, 56
     74 3:
     75    srd          \TMP2, \VAL0, \TMP1
     76    subi         \TMP1, \TMP1, 8
     77    stbx         \TMP2, \TMP0, \DATA
     78    addi         \TMP0, \TMP0, 1
     79    cmpld        \TMP0, \LEN
     80    bne          3b
     81 4:
     82 .endm
     83 
     84 .text
     85 
     86 ################################################################################
     87 # Generates the H table
     88 # void ppc_aes_gcmINIT(uint8_t Htbl[16*8], uint32_t *KS, int NR);
     89 .globl	ppc_aes_gcmINIT
     90 .type	ppc_aes_gcmINIT,@function
     91 .align	5
     92 ppc_aes_gcmINIT:
     93 addis	TOCP,12,(.TOC.-ppc_aes_gcmINIT)@ha
     94 addi	TOCP,TOCP,(.TOC.-ppc_aes_gcmINIT)@l
     95 .localentry	ppc_aes_gcmINIT, .-ppc_aes_gcmINIT
     96 
     97 .set Htbl, 3
     98 .set KS, 4
     99 .set NR, 5
    100 
    101 .set ZERO, 19
    102 .set MSB, 18
    103 .set ONE, 17
    104 .set SWAP_MASK, 0
    105 .set POLY, 1
    106 .set K, 2
    107 .set H, 3
    108 .set H2, 4
    109 .set H3, 5
    110 .set H4, 6
    111 .set HP, 7
    112 .set HS, 8
    113 .set R, 9
    114 .set F, 10
    115 .set T, 11
    116 .set H1M, 12
    117 .set H1L, 13
    118 .set H2M, 14
    119 .set H2L, 15
    120 .set H3M, 16
    121 .set H3L, 17
    122 .set H4M, 18
    123 .set H4L, 19
    124 
    125    VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 6
    126    VEC_LOAD_DATA POLY, .Lpoly, 6
    127 
    128    li           6, 0
    129    VEC_LOAD_INC H, KS, 6
    130    VEC_LOAD_INC K, KS, 6
    131    vcipher      H, H, K
    132    VEC_LOAD_INC K, KS, 6
    133    vcipher      H, H, K
    134    VEC_LOAD_INC K, KS, 6
    135    vcipher      H, H, K
    136    VEC_LOAD_INC K, KS, 6
    137    vcipher      H, H, K
    138    VEC_LOAD_INC K, KS, 6
    139    vcipher      H, H, K
    140    VEC_LOAD_INC K, KS, 6
    141    vcipher      H, H, K
    142    VEC_LOAD_INC K, KS, 6
    143    vcipher      H, H, K
    144    VEC_LOAD_INC K, KS, 6
    145    vcipher      H, H, K
    146    VEC_LOAD_INC K, KS, 6
    147    vcipher      H, H, K
    148    cmpwi        NR, 10
    149    beq          .LH_done
    150    VEC_LOAD_INC K, KS, 6
    151    vcipher      H, H, K
    152    VEC_LOAD_INC K, KS, 6
    153    vcipher      H, H, K
    154    cmpwi        NR, 12
    155    beq          .LH_done
    156    VEC_LOAD_INC K, KS, 6
    157    vcipher      H, H, K
    158    VEC_LOAD_INC K, KS, 6
    159    vcipher      H, H, K
    160 
    161 .LH_done:
    162    VEC_LOAD     K, KS, 6
    163    vcipherlast  H, H, K
    164 
    165    vupkhsb      MSB, H
    166    vspltisb     ONE, 1
    167    vspltb       MSB, MSB, 0
    168    vsl          H, H, ONE
    169    vand         MSB, MSB, POLY
    170    vxor         ZERO, ZERO, ZERO
    171    vxor         H, H, MSB
    172    vsldoi       POLY, ZERO, POLY, 8
    173 
    174    vpmsumd      HP, H, POLY
    175    vsldoi       HS, H, H, 8
    176    vxor         HP, HP, HS
    177    vsldoi       H1L, HP, HS, 8
    178    vsldoi       H1M, HS, HP, 8
    179    vsldoi       H1L, H1L, H1L, 8
    180 
    181    # calculate H^2
    182 
    183    vpmsumd      F, H, H1L
    184    vpmsumd      R, H, H1M
    185 
    186    vpmsumd      T, F, POLY
    187    vsldoi       H2, F, F, 8
    188    vxor         R, R, T
    189    vxor         H2, H2, R
    190 
    191    vpmsumd      HP, H2, POLY
    192    vsldoi       HS, H2, H2, 8
    193    vxor         HP, HP, HS
    194    vsldoi       H2L, HP, HS, 8
    195    vsldoi       H2M, HS, HP, 8
    196    vsldoi       H2L, H2L, H2L, 8
    197 
    198    # calculate H^3
    199 
    200    vpmsumd      F, H2, H1L
    201    vpmsumd      R, H2, H1M
    202 
    203    vpmsumd      T, F, POLY
    204    vsldoi       H3, F, F, 8
    205    vxor         R, R, T
    206    vxor         H3, H3, R
    207 
    208    vpmsumd      HP, H3, POLY
    209    vsldoi       HS, H3, H3, 8
    210    vxor         HP, HP, HS
    211    vsldoi       H3L, HP, HS, 8
    212    vsldoi       H3M, HS, HP, 8
    213    vsldoi       H3L, H3L, H3L, 8
    214 
    215    # calculate H^4
    216 
    217    vpmsumd      F, H2, H2L
    218    vpmsumd      R, H2, H2M
    219 
    220    vpmsumd      T, F, POLY
    221    vsldoi       H4, F, F, 8
    222    vxor         R, R, T
    223    vxor         H4, H4, R
    224 
    225    vpmsumd      HP, H4, POLY
    226    vsldoi       HS, H4, H4, 8
    227    vxor         HP, HP, HS
    228    vsldoi       H4L, HP, HS, 8
    229    vsldoi       H4M, HS, HP, 8
    230    vsldoi       H4L, H4L, H4L, 8
    231 
    232    li           8, 16*1
    233    li           9, 16*2
    234    li           10, 16*3
    235    stxvd2x      H1L+32, 0, Htbl
    236    stxvd2x      H1M+32, 8, Htbl
    237    stxvd2x      H2L+32, 9, Htbl
    238    stxvd2x      H2M+32, 10, Htbl
    239    li           7, 16*4
    240    li           8, 16*5
    241    li           9, 16*6
    242    li           10, 16*7
    243    stxvd2x      H3L+32, 7, Htbl
    244    stxvd2x      H3M+32, 8, Htbl
    245    stxvd2x      H4L+32, 9, Htbl
    246    stxvd2x      H4M+32, 10, Htbl
    247 
    248    blr
    249 .size ppc_aes_gcmINIT, . - ppc_aes_gcmINIT
    250 
    251 ################################################################################
    252 # Authenticate only
    253 # void ppc_aes_gcmHASH(uint8_t Htbl[16*8], uint8_t *AAD, uint64_t Alen, uint8_t *Tp);
    254 .globl	ppc_aes_gcmHASH
    255 .type	ppc_aes_gcmHASH,@function
    256 .align	5
    257 ppc_aes_gcmHASH:
    258 addis	TOCP,12,(.TOC.-ppc_aes_gcmHASH)@ha
    259 addi	TOCP,TOCP,(.TOC.-ppc_aes_gcmHASH)@l
    260 .localentry	ppc_aes_gcmHASH, .-ppc_aes_gcmHASH
    261 
    262 .set Htbl, 3
    263 .set AAD, 4
    264 .set Alen, 5
    265 .set Tp, 6
    266 
    267 .set SWAP_MASK, 0
    268 .set POLY, 1
    269 .set D, 2
    270 .set C0, 3
    271 .set C1, 4
    272 .set C2, 5
    273 .set C3, 6
    274 .set T, 7
    275 .set R, 8
    276 .set F, 9
    277 .set R2, 10
    278 .set F2, 11
    279 .set R3, 12
    280 .set F3, 13
    281 .set R4, 14
    282 .set F4, 15
    283 .set H1M, 16
    284 .set H1L, 17
    285 .set H2M, 18
    286 .set H2L, 19
    287 .set H3M, 28
    288 .set H3L, 29
    289 .set H4M, 30
    290 .set H4L, 31
    291 
    292    # store non-volatile vector registers
    293    addi         7, SP, -16
    294    stvx         31, 0, 7
    295    addi         7, SP, -32
    296    stvx         30, 0, 7
    297    addi         7, SP, -48
    298    stvx         29, 0, 7
    299    addi         7, SP, -64
    300    stvx         28, 0, 7
    301    
    302    VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 7
    303    VEC_LOAD_DATA POLY, .Lpoly_r, 7
    304 
    305    VEC_LOAD     D, Tp, 0
    306 
    307    # --- process 4 blocks ---
    308 
    309    srdi.        7, Alen, 6               # 4-blocks loop count
    310    beq          .L2x
    311 
    312    mtctr        7                        # set counter register
    313 
    314    # load table elements
    315    li           8, 1*16
    316    li           9, 2*16
    317    li           10, 3*16
    318    lxvd2x       H1L+32, 0, Htbl
    319    lxvd2x       H1M+32, 8, Htbl
    320    lxvd2x       H2L+32, 9, Htbl
    321    lxvd2x       H2M+32, 10, Htbl
    322    li           7, 4*16
    323    li           8, 5*16
    324    li           9, 6*16
    325    li           10, 7*16
    326    lxvd2x       H3L+32, 7, Htbl
    327    lxvd2x       H3M+32, 8, Htbl
    328    lxvd2x       H4L+32, 9, Htbl
    329    lxvd2x       H4M+32, 10, Htbl
    330 
    331    li           8, 0x10
    332    li           9, 0x20
    333    li           10, 0x30
    334 .align 5
    335 .L4x_loop:
    336    # load input
    337    lxvd2x       C0+32, 0, AAD
    338    lxvd2x       C1+32, 8, AAD
    339    lxvd2x       C2+32, 9, AAD
    340    lxvd2x       C3+32, 10, AAD
    341 
    342    vperm        C0, C0, C0, SWAP_MASK
    343    vperm        C1, C1, C1, SWAP_MASK
    344    vperm        C2, C2, C2, SWAP_MASK
    345    vperm        C3, C3, C3, SWAP_MASK
    346 
    347    # digest combining
    348    vxor         C0, C0, D
    349 
    350    # polynomial multiplication
    351    vpmsumd      F2, H3L, C1
    352    vpmsumd      R2, H3M, C1
    353    vpmsumd      F3, H2L, C2
    354    vpmsumd      R3, H2M, C2
    355    vpmsumd      F4, H1L, C3
    356    vpmsumd      R4, H1M, C3
    357    vpmsumd      F, H4L, C0
    358    vpmsumd      R, H4M, C0
    359 
    360    # deferred recombination of partial products
    361    vxor         F3, F3, F4
    362    vxor         R3, R3, R4
    363    vxor         F, F, F2
    364    vxor         R, R, R2
    365    vxor         F, F, F3
    366    vxor         R, R, R3
    367 
    368    # reduction
    369    vpmsumd      T, F, POLY
    370    vsldoi       D, F, F, 8
    371    vxor         R, R, T
    372    vxor         D, R, D
    373 
    374    addi         AAD, AAD, 0x40
    375    bdnz         .L4x_loop
    376 
    377    clrldi       Alen, Alen, 58
    378 .L2x:
    379    # --- process 2 blocks ---
    380 
    381    srdi.        7, Alen, 5
    382    beq          .L1x
    383 
    384    # load table elements
    385    li           8, 1*16
    386    li           9, 2*16
    387    li           10, 3*16
    388    lxvd2x       H1L+32, 0, Htbl
    389    lxvd2x       H1M+32, 8, Htbl
    390    lxvd2x       H2L+32, 9, Htbl
    391    lxvd2x       H2M+32, 10, Htbl
    392 
    393    # load input
    394    li           10, 0x10
    395    lxvd2x       C0+32, 0, AAD
    396    lxvd2x       C1+32, 10, AAD
    397 
    398    vperm        C0, C0, C0, SWAP_MASK
    399    vperm        C1, C1, C1, SWAP_MASK
    400 
    401    # previous digest combining
    402    vxor         C0, C0, D
    403 
    404    # polynomial multiplication
    405    vpmsumd      F2, H1L, C1
    406    vpmsumd      R2, H1M, C1
    407    vpmsumd      F, H2L, C0
    408    vpmsumd      R, H2M, C0
    409 
    410    # deferred recombination of partial products
    411    vxor         F, F, F2
    412    vxor         R, R, R2
    413 
    414    # reduction
    415    vpmsumd      T, F, POLY
    416    vsldoi       D, F, F, 8
    417    vxor         R, R, T
    418    vxor         D, R, D
    419 
    420    addi         AAD, AAD, 0x20
    421    clrldi       Alen, Alen, 59
    422 .L1x:
    423    # --- process 1 block ---
    424 
    425    srdi.        7, Alen, 4
    426    beq          .Ltail
    427 
    428    # load table elements
    429    li           8, 1*16
    430    lxvd2x       H1L+32, 0, Htbl
    431    lxvd2x       H1M+32, 8, Htbl
    432 
    433    # load input
    434    lxvd2x       C0+32, 0, AAD
    435 
    436    vperm        C0, C0, C0, SWAP_MASK
    437 
    438    # previous digest combining
    439    vxor         C0, C0, D
    440 
    441    # polynomial multiplication
    442    vpmsumd      F, H1L, C0
    443    vpmsumd      R, H1M, C0
    444 
    445    # reduction
    446    vpmsumd      T, F, POLY
    447    vsldoi       D, F, F, 8
    448    vxor         R, R, T
    449    vxor         D, R, D
    450 
    451    addi         AAD, AAD, 0x10
    452    clrldi       Alen, Alen, 60
    453 
    454 .Ltail:
    455    cmpldi       Alen, 0
    456    beq          .Lh_done
    457    # --- process the final partial block ---
    458 
    459    # load table elements
    460    li           8, 1*16
    461    lxvd2x       H1L+32, 0, Htbl
    462    lxvd2x       H1M+32, 8, Htbl
    463 
    464    LOAD_LEN     AAD, Alen, 10, 9, 3, 7, 8
    465    mtvrd        C0, 10
    466    mtvrd        C1, 9
    467    xxmrghd      C0+32, C0+32, C1+32
    468 
    469    # previous digest combining
    470    vxor         C0, C0, D
    471 
    472    # polynomial multiplication
    473    vpmsumd      F, H1L, C0
    474    vpmsumd      R, H1M, C0
    475 
    476    # reduction
    477    vpmsumd      T, F, POLY
    478    vsldoi       D, F, F, 8
    479    vxor         R, R, T
    480    vxor         D, R, D
    481 .Lh_done:
    482    VEC_STORE    D, Tp, 0
    483 
    484    # restore non-volatile vector registers
    485    addi         7, SP, -16
    486    lvx          31, 0, 7
    487    addi         7, SP, -32
    488    lvx          30, 0, 7
    489    addi         7, SP, -48
    490    lvx          29, 0, 7
    491    addi         7, SP, -64
    492    lvx          28, 0, 7
    493    blr
    494 .size ppc_aes_gcmHASH, . - ppc_aes_gcmHASH
    495 
    496 ################################################################################
    497 # Generates the final GCM tag
    498 # void ppc_aes_gcmTAG(uint8_t Htbl[16*8], uint8_t *Tp, uint64_t Mlen, uint64_t Alen, uint8_t* X0, uint8_t* TAG);
    499 .globl	ppc_aes_gcmTAG
    500 .type	ppc_aes_gcmTAG,@function
    501 .align	5
    502 ppc_aes_gcmTAG:
    503 addis	TOCP,12,(.TOC.-ppc_aes_gcmTAG)@ha
    504 addi	TOCP,TOCP,(.TOC.-ppc_aes_gcmTAG)@l
    505 .localentry	ppc_aes_gcmTAG, .-ppc_aes_gcmTAG
    506 
    507 .set Htbl, 3
    508 .set Tp, 4
    509 .set Mlen, 5
    510 .set Alen, 6
    511 .set X0, 7
    512 .set TAG, 8
    513 
    514 .set SWAP_MASK, 0
    515 .set POLY, 1
    516 .set D, 2
    517 .set C0, 3
    518 .set C1, 4
    519 .set T, 5
    520 .set R, 6
    521 .set F, 7
    522 .set H1M, 8
    523 .set H1L, 9
    524 .set X, 10
    525 
    526    VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 9
    527    VEC_LOAD_DATA POLY, .Lpoly_r, 9
    528    
    529    VEC_LOAD     D, Tp, 0
    530    
    531    # load table elements
    532    li           9, 1*16
    533    lxvd2x       H1L+32, 0, Htbl
    534    lxvd2x       H1M+32, 9, Htbl
    535 
    536    sldi         Alen, Alen, 3
    537    sldi         Mlen, Mlen, 3
    538    mtvrd        C0, Alen
    539    mtvrd        C1, Mlen
    540    xxmrghd      C0+32, C0+32, C1+32
    541 
    542    # previous digest combining
    543    vxor         C0, C0, D
    544 
    545    # polynomial multiplication
    546    vpmsumd      F, H1L, C0
    547    vpmsumd      R, H1M, C0
    548 
    549    # reduction
    550    vpmsumd      T, F, POLY
    551    vsldoi       D, F, F, 8
    552    vxor         R, R, T
    553    vxor         D, R, D
    554 
    555    lxvd2x       X+32, 0, X0
    556    vperm        D, D, D, SWAP_MASK
    557    vxor         X, X, D
    558    stxvd2x      X+32, 0, TAG
    559 
    560    blr
    561 .size ppc_aes_gcmTAG, . - ppc_aes_gcmTAG
    562 
    563 ################################################################################
    564 # Crypt only
    565 # void ppc_aes_gcmCRYPT(const uint8_t* PT, uint8_t* CT, uint64_t LEN, uint8_t *CTRP, uint32_t *KS, int NR);
    566 .globl	ppc_aes_gcmCRYPT
    567 .type	ppc_aes_gcmCRYPT,@function
    568 .align	5
    569 ppc_aes_gcmCRYPT:
    570 addis	TOCP,12,(.TOC.-ppc_aes_gcmCRYPT)@ha
    571 addi	TOCP,TOCP,(.TOC.-ppc_aes_gcmCRYPT)@l
    572 .localentry	ppc_aes_gcmCRYPT, .-ppc_aes_gcmCRYPT
    573 
    574 .set PT, 3
    575 .set CT, 4
    576 .set LEN, 5
    577 .set CTRP, 6
    578 .set KS, 7
    579 .set NR, 8
    580 
    581 .set SWAP_MASK, 0
    582 .set K, 1
    583 .set CTR, 2
    584 .set CTR0, 3
    585 .set CTR1, 4
    586 .set CTR2, 5
    587 .set CTR3, 6
    588 .set CTR4, 7
    589 .set CTR5, 8
    590 .set CTR6, 9
    591 .set CTR7, 10
    592 .set ZERO, 11
    593 .set I1, 12
    594 .set I2, 13
    595 .set I3, 14
    596 .set I4, 15
    597 .set I5, 16
    598 .set I6, 17
    599 .set I7, 18
    600 .set I8, 19
    601 .set IN0, 24
    602 .set IN1, 25
    603 .set IN2, 26
    604 .set IN3, 27
    605 .set IN4, 28
    606 .set IN5, 29
    607 .set IN6, 30
    608 .set IN7, 31
    609 
    610 .macro ROUND_8
    611    VEC_LOAD_INC K, KS, 10
    612    vcipher      CTR0, CTR0, K
    613    vcipher      CTR1, CTR1, K
    614    vcipher      CTR2, CTR2, K
    615    vcipher      CTR3, CTR3, K
    616    vcipher      CTR4, CTR4, K
    617    vcipher      CTR5, CTR5, K
    618    vcipher      CTR6, CTR6, K
    619    vcipher      CTR7, CTR7, K
    620 .endm
    621 
    622 .macro ROUND_4
    623    VEC_LOAD_INC K, KS, 10
    624    vcipher      CTR0, CTR0, K
    625    vcipher      CTR1, CTR1, K
    626    vcipher      CTR2, CTR2, K
    627    vcipher      CTR3, CTR3, K
    628 .endm
    629 
    630 .macro ROUND_2
    631    VEC_LOAD_INC K, KS, 10
    632    vcipher      CTR0, CTR0, K
    633    vcipher      CTR1, CTR1, K
    634 .endm
    635 
    636 .macro ROUND_1
    637    VEC_LOAD_INC K, KS, 10
    638    vcipher      CTR0, CTR0, K
    639 .endm
    640 
    641    # store non-volatile general registers
    642    std          31,-8(SP);
    643    std          30,-16(SP);
    644    std          29,-24(SP);
    645    std          28,-32(SP);
    646    std          27,-40(SP);
    647    std          26,-48(SP);
    648    std          25,-56(SP);
    649 
    650    # store non-volatile vector registers
    651    addi         9, SP, -80
    652    stvx         31, 0, 9
    653    addi         9, SP, -96
    654    stvx         30, 0, 9
    655    addi         9, SP, -112
    656    stvx         29, 0, 9
    657    addi         9, SP, -128
    658    stvx         28, 0, 9
    659    addi         9, SP, -144
    660    stvx         27, 0, 9
    661    addi         9, SP, -160
    662    stvx         26, 0, 9
    663    addi         9, SP, -176
    664    stvx         25, 0, 9
    665    addi         9, SP, -192
    666    stvx         24, 0, 9
    667 
    668    VEC_LOAD_DATA SWAP_MASK, .Ldb_bswap_mask, 9
    669 
    670    vxor         ZERO, ZERO, ZERO
    671    vspltisb     I1, 1
    672    vspltisb     I2, 2
    673    vspltisb     I3, 3
    674    vspltisb     I4, 4
    675    vspltisb     I5, 5
    676    vspltisb     I6, 6
    677    vspltisb     I7, 7
    678    vspltisb     I8, 8
    679    vsldoi       I1, ZERO, I1, 1
    680    vsldoi       I2, ZERO, I2, 1
    681    vsldoi       I3, ZERO, I3, 1
    682    vsldoi       I4, ZERO, I4, 1
    683    vsldoi       I5, ZERO, I5, 1
    684    vsldoi       I6, ZERO, I6, 1
    685    vsldoi       I7, ZERO, I7, 1
    686    vsldoi       I8, ZERO, I8, 1
    687 
    688    VEC_LOAD     CTR, CTRP, 0
    689 
    690    srdi.        9, LEN, 7
    691    beq          .Lctr_4x
    692 
    693    mtctr        9
    694 
    695    li           25, 0x10
    696    li           26, 0x20
    697    li           27, 0x30
    698    li           28, 0x40
    699    li           29, 0x50
    700    li           30, 0x60
    701    li           31, 0x70
    702 
    703 .align 5
    704 .L8x_loop:
    705    li           10, 0
    706    VEC_LOAD_INC K, KS, 10
    707 
    708    vadduwm      CTR1, CTR, I1
    709    vadduwm      CTR2, CTR, I2
    710    vadduwm      CTR3, CTR, I3
    711    vadduwm      CTR4, CTR, I4
    712    vadduwm      CTR5, CTR, I5
    713    vadduwm      CTR6, CTR, I6
    714    vadduwm      CTR7, CTR, I7
    715 
    716    vxor         CTR0, CTR,  K
    717    vxor         CTR1, CTR1, K
    718    vxor         CTR2, CTR2, K
    719    vxor         CTR3, CTR3, K
    720    vxor         CTR4, CTR4, K
    721    vxor         CTR5, CTR5, K
    722    vxor         CTR6, CTR6, K
    723    vxor         CTR7, CTR7, K
    724 
    725    ROUND_8
    726    ROUND_8
    727    ROUND_8
    728    ROUND_8
    729    ROUND_8
    730    ROUND_8
    731    ROUND_8
    732    ROUND_8
    733    ROUND_8
    734    cmpwi        NR, 10
    735    beq          .Llast_8
    736    ROUND_8
    737    ROUND_8
    738    cmpwi        NR, 12
    739    beq          .Llast_8
    740    ROUND_8
    741    ROUND_8
    742 
    743 .Llast_8:
    744    VEC_LOAD     K, KS, 10
    745    vcipherlast  CTR0, CTR0, K
    746    vcipherlast  CTR1, CTR1, K
    747    vcipherlast  CTR2, CTR2, K
    748    vcipherlast  CTR3, CTR3, K
    749    vcipherlast  CTR4, CTR4, K
    750    vcipherlast  CTR5, CTR5, K
    751    vcipherlast  CTR6, CTR6, K
    752    vcipherlast  CTR7, CTR7, K
    753 
    754    lxvd2x       IN0+32, 0,  PT
    755    lxvd2x       IN1+32, 25, PT
    756    lxvd2x       IN2+32, 26, PT
    757    lxvd2x       IN3+32, 27, PT
    758    lxvd2x       IN4+32, 28, PT
    759    lxvd2x       IN5+32, 29, PT
    760    lxvd2x       IN6+32, 30, PT
    761    lxvd2x       IN7+32, 31, PT
    762 
    763    vperm        CTR0, CTR0, CTR0, SWAP_MASK
    764    vperm        CTR1, CTR1, CTR1, SWAP_MASK
    765    vperm        CTR2, CTR2, CTR2, SWAP_MASK
    766    vperm        CTR3, CTR3, CTR3, SWAP_MASK
    767    vperm        CTR4, CTR4, CTR4, SWAP_MASK
    768    vperm        CTR5, CTR5, CTR5, SWAP_MASK
    769    vperm        CTR6, CTR6, CTR6, SWAP_MASK
    770    vperm        CTR7, CTR7, CTR7, SWAP_MASK
    771 
    772    vxor         IN0, IN0, CTR0
    773    vxor         IN1, IN1, CTR1
    774    vxor         IN2, IN2, CTR2
    775    vxor         IN3, IN3, CTR3
    776    vxor         IN4, IN4, CTR4
    777    vxor         IN5, IN5, CTR5
    778    vxor         IN6, IN6, CTR6
    779    vxor         IN7, IN7, CTR7
    780 
    781    stxvd2x      IN0+32, 0,  CT
    782    stxvd2x      IN1+32, 25, CT
    783    stxvd2x      IN2+32, 26, CT
    784    stxvd2x      IN3+32, 27, CT
    785    stxvd2x      IN4+32, 28, CT
    786    stxvd2x      IN5+32, 29, CT
    787    stxvd2x      IN6+32, 30, CT
    788    stxvd2x      IN7+32, 31, CT
    789 
    790    vadduwm      CTR, CTR, I8
    791    addi         PT, PT, 0x80
    792    addi         CT, CT, 0x80
    793    bdnz         .L8x_loop
    794 
    795    clrldi       LEN, LEN, 57
    796 
    797 .Lctr_4x:
    798    srdi.        9, LEN, 6
    799    beq          .Lctr_2x
    800 
    801    li           10, 0
    802    li           29, 0x10
    803    li           30, 0x20
    804    li           31, 0x30
    805 
    806    VEC_LOAD_INC K, KS, 10
    807 
    808    vadduwm      CTR1, CTR, I1
    809    vadduwm      CTR2, CTR, I2
    810    vadduwm      CTR3, CTR, I3
    811 
    812    vxor         CTR0, CTR,  K
    813    vxor         CTR1, CTR1, K
    814    vxor         CTR2, CTR2, K
    815    vxor         CTR3, CTR3, K
    816 
    817    ROUND_4
    818    ROUND_4
    819    ROUND_4
    820    ROUND_4
    821    ROUND_4
    822    ROUND_4
    823    ROUND_4
    824    ROUND_4
    825    ROUND_4
    826    cmpwi        NR, 10
    827    beq          .Llast_4
    828    ROUND_4
    829    ROUND_4
    830    cmpwi        NR, 12
    831    beq          .Llast_4
    832    ROUND_4
    833    ROUND_4
    834 
    835 .Llast_4:
    836    VEC_LOAD     K, KS, 10
    837    vcipherlast  CTR0, CTR0, K
    838    vcipherlast  CTR1, CTR1, K
    839    vcipherlast  CTR2, CTR2, K
    840    vcipherlast  CTR3, CTR3, K
    841 
    842    lxvd2x       IN0+32, 0,  PT
    843    lxvd2x       IN1+32, 29, PT
    844    lxvd2x       IN2+32, 30, PT
    845    lxvd2x       IN3+32, 31, PT
    846 
    847    vperm        CTR0, CTR0, CTR0, SWAP_MASK
    848    vperm        CTR1, CTR1, CTR1, SWAP_MASK
    849    vperm        CTR2, CTR2, CTR2, SWAP_MASK
    850    vperm        CTR3, CTR3, CTR3, SWAP_MASK
    851 
    852    vxor         IN0, IN0, CTR0
    853    vxor         IN1, IN1, CTR1
    854    vxor         IN2, IN2, CTR2
    855    vxor         IN3, IN3, CTR3
    856 
    857    stxvd2x      IN0+32, 0,  CT
    858    stxvd2x      IN1+32, 29, CT
    859    stxvd2x      IN2+32, 30, CT
    860    stxvd2x      IN3+32, 31, CT
    861 
    862    vadduwm      CTR, CTR, I4
    863    addi         PT, PT, 0x40
    864    addi         CT, CT, 0x40
    865 
    866    clrldi       LEN, LEN, 58
    867 
    868 .Lctr_2x:
    869    srdi.        9, LEN, 5
    870    beq          .Lctr_1x
    871 
    872    li           10, 0
    873    li           31, 0x10
    874 
    875    VEC_LOAD_INC K, KS, 10
    876 
    877    vadduwm      CTR1, CTR, I1
    878 
    879    vxor         CTR0, CTR,  K
    880    vxor         CTR1, CTR1, K
    881 
    882    ROUND_2
    883    ROUND_2
    884    ROUND_2
    885    ROUND_2
    886    ROUND_2
    887    ROUND_2
    888    ROUND_2
    889    ROUND_2
    890    ROUND_2
    891    cmpwi        NR, 10
    892    beq          .Llast_2
    893    ROUND_2
    894    ROUND_2
    895    cmpwi        NR, 12
    896    beq          .Llast_2
    897    ROUND_2
    898    ROUND_2
    899 
    900 .Llast_2:
    901    VEC_LOAD     K, KS, 10
    902    vcipherlast  CTR0, CTR0, K
    903    vcipherlast  CTR1, CTR1, K
    904 
    905    lxvd2x       IN0+32, 0,  PT
    906    lxvd2x       IN1+32, 31, PT
    907 
    908    vperm        CTR0, CTR0, CTR0, SWAP_MASK
    909    vperm        CTR1, CTR1, CTR1, SWAP_MASK
    910 
    911    vxor         IN0, IN0, CTR0
    912    vxor         IN1, IN1, CTR1
    913 
    914    stxvd2x      IN0+32, 0,  CT
    915    stxvd2x      IN1+32, 31, CT
    916 
    917    vadduwm      CTR, CTR, I2
    918    addi         PT, PT, 0x20
    919    addi         CT, CT, 0x20
    920 
    921    clrldi       LEN, LEN, 59
    922 
    923 .Lctr_1x:
    924    srdi.        9, LEN, 4
    925    beq          .Lctr_tail
    926 
    927    li           10, 0
    928 
    929    VEC_LOAD_INC K, KS, 10
    930    vxor         CTR0, CTR,  K
    931 
    932    ROUND_1
    933    ROUND_1
    934    ROUND_1
    935    ROUND_1
    936    ROUND_1
    937    ROUND_1
    938    ROUND_1
    939    ROUND_1
    940    ROUND_1
    941    cmpwi        NR, 10
    942    beq          .Llast_1
    943    ROUND_1
    944    ROUND_1
    945    cmpwi        NR, 12
    946    beq          .Llast_1
    947    ROUND_1
    948    ROUND_1
    949 
    950 .Llast_1:
    951    VEC_LOAD     K, KS, 10
    952    vcipherlast  CTR0, CTR0, K
    953 
    954    lxvd2x       IN0+32, 0, PT
    955 
    956    vperm        CTR0, CTR0, CTR0, SWAP_MASK
    957 
    958    vxor         IN0, IN0, CTR0
    959 
    960    stxvd2x      IN0+32, 0, CT
    961 
    962    vadduwm      CTR, CTR, I1
    963    addi         PT, PT, 0x10
    964    addi         CT, CT, 0x10
    965 
    966    clrldi       LEN, LEN, 60
    967 
    968 .Lctr_tail:
    969    cmpldi       LEN, 0
    970    beq          .Lc_done
    971 
    972    li           10, 0
    973 
    974    VEC_LOAD_INC K, KS, 10
    975    vxor         CTR0, CTR,  K
    976 
    977    ROUND_1
    978    ROUND_1
    979    ROUND_1
    980    ROUND_1
    981    ROUND_1
    982    ROUND_1
    983    ROUND_1
    984    ROUND_1
    985    ROUND_1
    986    cmpwi        NR, 10
    987    beq          .Llast_tail
    988    ROUND_1
    989    ROUND_1
    990    cmpwi        NR, 12
    991    beq          .Llast_tail
    992    ROUND_1
    993    ROUND_1
    994 
    995 .Llast_tail:
    996    VEC_LOAD     K, KS, 10
    997    vcipherlast  CTR0, CTR0, K
    998 
    999    LOAD_LEN     PT, LEN, 10, 9, 29, 30, 31
   1000 
   1001    vsldoi       CTR1, CTR0, CTR0, 8
   1002    mfvrd        31, CTR0
   1003    mfvrd        30, CTR1
   1004 
   1005    xor          10, 10, 31
   1006    xor          9, 9, 30
   1007 
   1008    STORE_LEN    CT, LEN, 10, 9, 29, 30, 31
   1009 
   1010    vadduwm      CTR, CTR, I1
   1011 
   1012 .Lc_done:
   1013    VEC_STORE    CTR, CTRP, 0
   1014 
   1015    # restore non-volatile vector registers
   1016    addi         9, SP, -80
   1017    lvx          31, 0, 9
   1018    addi         9, SP, -96
   1019    lvx          30, 0, 9
   1020    addi         9, SP, -112
   1021    lvx          29, 0, 9
   1022    addi         9, SP, -128
   1023    lvx          28, 0, 9
   1024    addi         9, SP, -144
   1025    lvx          27, 0, 9
   1026    addi         9, SP, -160
   1027    lvx          26, 0, 9
   1028    addi         9, SP, -176
   1029    lvx          25, 0, 9
   1030    addi         9, SP, -192
   1031    lvx          24, 0, 9
   1032    
   1033    # restore non-volatile general registers
   1034    ld           31,-8(SP);
   1035    ld           30,-16(SP);
   1036    ld           29,-24(SP);
   1037    ld           28,-32(SP);
   1038    ld           27,-40(SP);
   1039    ld           26,-48(SP);
   1040    ld           25,-56(SP);
   1041    blr
   1042 .size ppc_aes_gcmCRYPT, . - ppc_aes_gcmCRYPT
   1043 
   1044 .data
   1045 .align	4
   1046 .Lpoly:
   1047 .byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
   1048 .Lpoly_r:
   1049    .byte	0,0,0,0,0,0,0,0xc2,0,0,0,0,0,0,0,0
   1050 .Ldb_bswap_mask:
   1051 .byte	8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7