tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

hppa20.s (28203B)


      1 ; This Source Code Form is subject to the terms of the Mozilla Public
      2 ; License, v. 2.0. If a copy of the MPL was not distributed with this
      3 ; file, You can obtain one at http://mozilla.org/MPL/2.0/.
      4 
      5 #ifdef __LP64__
      6        .LEVEL   2.0W                ; LP64 build: use the 2.0W level
      7 #else
      8 ;       .LEVEL   1.1
      9 ;       .ALLOW   2.0N
     10        .LEVEL   2.0                 ; all other builds: plain 2.0 level
     11 #endif
     12        .SPACE   $TEXT$,SORT=8
     13        .SUBSPA  $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24
     14 
     15 ; ***************************************************************
     16 ;
     17 ;                 maxpy_[little/big]
     18 ;
     19 ; ***************************************************************
     20 
     21 ; NOTE: LITTLE_WORDIAN is forced on by the define below.  To build the big-wordian variant, remove that define and define BIG_WORDIAN instead (never both).
     22 #define LITTLE_WORDIAN 1
     23 ; Little-wordian: positive steps, so pointers walk toward higher addresses.  UN_* undo a step.
     24 #ifdef LITTLE_WORDIAN
     25 #define EIGHT 8
     26 #define SIXTEEN 16
     27 #define THIRTY_TWO 32
     28 #define UN_EIGHT -8
     29 #define UN_SIXTEEN -16
     30 #define UN_TWENTY_FOUR -24
     31 #endif
     32 ; Big-wordian: the same steps with the signs flipped, so pointers walk toward lower addresses.
     33 #ifdef BIG_WORDIAN
     34 #define EIGHT -8
     35 #define SIXTEEN -16
     36 #define THIRTY_TWO -32
     37 #define UN_EIGHT 8
     38 #define UN_SIXTEEN 16
     39 #define UN_TWENTY_FOUR 24
     40 #endif
     41 
     42 ; This performs a multiple-precision integer version of "daxpy",
     43 ; Using the selected addressing direction.  "Little-wordian" means that
     44 ; the least significant word of a number is stored at the lowest address.
     45 ; "Big-wordian" means that the most significant word is at the lowest
     46 ; address.  Either way, the incoming address of the vector is that
     47 ; of the least significant word.  That means that, for little-wordian
     48 ; addressing, we move the address upward as we propagate carries
     49 ; from the least significant word to the most significant.  For
     50 ; big-wordian we move the address downward.
     51 
     52 ; We use the following registers:
     53 ;
     54 ;     r2   return PC, of course
     55 ;     r26 = arg1 =  length
     56 ;     r25 = arg2 =  address of scalar
     57 ;     r24 = arg3 =  multiplicand vector
     58 ;     r23 = arg4 =  result vector
     59 ;
     60 ;     fr9 = scalar loaded once only from r25
     61 
     62 ; The cycle counts shown in the bodies below are simply the result of a
     63 ; scheduling by hand.  The actual PCX-U hardware does it differently.
     64 ; The intention is that the overall speed is the same.
     65 
     66 ; The pipeline startup and shutdown code is constructed in the usual way,
     67 ; by taking the loop bodies and removing unnecessary instructions.
     68 ; We have left the comments describing cycle numbers in the code.
     69 ; These are intended for reference when comparing with the main loop,
     70 ; and have no particular relationship to actual cycle numbers.
     71 
     72 #ifdef LITTLE_WORDIAN
     73 maxpy_little
     74 #else
     75 maxpy_big
     76 #endif
     77        .PROC
     78        .CALLINFO FRAME=120,ENTRY_GR=4
     79        .ENTRY
     80        STW,MA  %r3,128(%sp)        ; save r3, then advance sp by 128
     81        STW     %r4,-124(%sp)       ; save r4 (reloaded at $L0)
     82 ; Frame slots -104(%sp) .. -48(%sp) carry XMPYU products from FP registers to general registers.
     83        ADDIB,< -1,%r26,$L0         ; r26 = N-1.  If N = 0, exit immediately.
     84        FLDD    0(%r25),%fr9        ; fr9 = scalar
     85 
     86 ; First startup
     87 
     88        FLDD    0(%r24),%fr24       ; Cycle 1: first multiplicand doubleword
     89        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
     90        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
     91        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
     92        CMPIB,> 3,%r26,$N_IS_SMALL  ; Pick out cases N = 1, 2, or 3 (r26 = N-1 < 3)
     93        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
     94        FLDD    EIGHT(%r24),%fr28   ; Cycle 8: second multiplicand doubleword
     95        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
     96        FSTD    %fr24,-96(%sp)
     97        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
     98        FSTD    %fr25,-80(%sp)
     99        LDO     SIXTEEN(%r24),%r24  ; Cycle 12: step the multiplicand pointer by two doublewords
    100        FSTD    %fr31,-64(%sp)
    101        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
    102        FSTD    %fr27,-48(%sp)
    103 
    104 ; Second startup
    105 
    106        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
    107        FSTD    %fr30,-56(%sp)
    108        FLDD    0(%r24),%fr24
    109 
    110        FSTD    %fr26,-88(%sp)      ; Cycle 2
    111 
    112        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
    113        FSTD    %fr28,-104(%sp)
    114 
    115        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
    116        LDD     -96(%sp),%r3        ; reload a spilled product into a general register
    117        FSTD    %fr29,-72(%sp)
    118 
    119        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
    120        LDD     -64(%sp),%r19
    121        LDD     -80(%sp),%r21
    122 
    123        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
    124        LDD     -56(%sp),%r20
    125        ADD     %r21,%r3,%r3
    126 
    127        ADD,DC  %r20,%r19,%r19      ; Cycle 7
    128        LDD     -88(%sp),%r4
    129        SHRPD   %r3,%r0,32,%r21
    130        LDD     -48(%sp),%r1
    131 
    132        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
    133        LDD     -104(%sp),%r31
    134        ADD,DC  %r0,%r0,%r20
    135        SHRPD   %r19,%r3,32,%r3
    136 
    137        LDD     -72(%sp),%r29       ; Cycle 9
    138        SHRPD   %r20,%r19,32,%r20
    139        ADD     %r21,%r1,%r1
    140 
    141        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
    142        ADD,DC  %r3,%r4,%r4
    143        FSTD    %fr24,-96(%sp)
    144 
    145        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
    146        ADD,DC  %r0,%r20,%r20
    147        LDD     0(%r23),%r3         ; fetch the current result doubleword
    148        FSTD    %fr25,-80(%sp)
    149 
    150        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
    151        FSTD    %fr31,-64(%sp)
    152 
    153        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
    154        ADD     %r0,%r0,%r0         ; clear the carry bit
    155        ADDIB,<= -4,%r26,$ENDLOOP   ; actually happens in cycle 12
    156        FSTD    %fr27,-48(%sp)
    157 ;        MFCTL   %cr16,%r21         ; for timing
    158 ;        STD     %r21,-112(%sp)
    159 
    160 ; Here is the loop.  Each pass advances r24 and r23 by SIXTEEN and counts r26 down by 2.
    161 
    162 $LOOP   XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
    163        ADD,DC  %r29,%r4,%r4
    164        FSTD    %fr30,-56(%sp)
    165        FLDD    0(%r24),%fr24
    166 
    167        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
    168        ADD,DC  %r0,%r20,%r20
    169        FSTD    %fr26,-88(%sp)
    170 
    171        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
    172        ADD     %r3,%r1,%r1
    173        FSTD    %fr28,-104(%sp)
    174        LDD     UN_EIGHT(%r23),%r21
    175 
    176        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
    177        ADD,DC  %r21,%r4,%r28
    178        FSTD    %fr29,-72(%sp)    
    179        LDD     -96(%sp),%r3
    180 
    181        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
    182        ADD,DC  %r20,%r31,%r22
    183        LDD     -64(%sp),%r19
    184        LDD     -80(%sp),%r21
    185 
    186        XMPYU   %fr9L,%fr24R,%fr24  ; Cycle 6
    187        ADD     %r21,%r3,%r3
    188        LDD     -56(%sp),%r20
    189        STD     %r1,UN_SIXTEEN(%r23)
    190 
    191        ADD,DC  %r20,%r19,%r19      ; Cycle 7
    192        SHRPD   %r3,%r0,32,%r21
    193        LDD     -88(%sp),%r4
    194        LDD     -48(%sp),%r1
    195 
    196        ADD,DC  %r0,%r0,%r20        ; Cycle 8
    197        SHRPD   %r19,%r3,32,%r3
    198        FLDD    EIGHT(%r24),%fr28
    199        LDD     -104(%sp),%r31
    200 
    201        SHRPD   %r20,%r19,32,%r20   ; Cycle 9
    202        ADD     %r21,%r1,%r1
    203        STD     %r28,UN_EIGHT(%r23)
    204        LDD     -72(%sp),%r29
    205 
    206        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
    207        ADD,DC  %r3,%r4,%r4
    208        FSTD    %fr24,-96(%sp)
    209 
    210        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
    211        ADD,DC  %r0,%r20,%r20
    212        FSTD    %fr25,-80(%sp)
    213        LDD     0(%r23),%r3
    214 
    215        LDO     SIXTEEN(%r24),%r24  ; Cycle 12
    216        FSTD    %fr31,-64(%sp)
    217 
    218        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
    219        ADD     %r22,%r1,%r1
    220        ADDIB,> -2,%r26,$LOOP       ; actually happens in cycle 12
    221        FSTD    %fr27,-48(%sp)
    222 
    223 $ENDLOOP
    224 
    225 ; Shutdown code, first stage.
    226 
    227 ;        MFCTL   %cr16,%r21         ; for timing
    228 ;        STD     %r21,UN_SIXTEEN(%r23)
    229 ;        LDD     -112(%sp),%r21
    230 ;        STD     %r21,UN_EIGHT(%r23)
    231 
    232        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
    233        ADD,DC  %r29,%r4,%r4
    234        CMPIB,= 0,%r26,$ONEMORE     ; r26 = 0: wrong parity, run one extra single pass
    235        FSTD    %fr30,-56(%sp)
    236 
    237        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
    238        ADD,DC  %r0,%r20,%r20
    239        FSTD    %fr26,-88(%sp)
    240 
    241        ADD     %r3,%r1,%r1         ; Cycle 3
    242        FSTD    %fr28,-104(%sp)
    243        LDD     UN_EIGHT(%r23),%r21
    244 
    245        ADD,DC  %r21,%r4,%r28       ; Cycle 4
    246        FSTD    %fr29,-72(%sp)    
    247        STD     %r28,UN_EIGHT(%r23) ; moved up from cycle 9
    248        LDD     -96(%sp),%r3
    249 
    250        ADD,DC  %r20,%r31,%r22      ; Cycle 5
    251        STD     %r1,UN_SIXTEEN(%r23)
    252 $JOIN4
    253        LDD     -64(%sp),%r19
    254        LDD     -80(%sp),%r21
    255 
    256        ADD     %r21,%r3,%r3        ; Cycle 6
    257        LDD     -56(%sp),%r20
    258 
    259        ADD,DC  %r20,%r19,%r19      ; Cycle 7
    260        SHRPD   %r3,%r0,32,%r21
    261        LDD     -88(%sp),%r4
    262        LDD     -48(%sp),%r1
    263 
    264        ADD,DC  %r0,%r0,%r20        ; Cycle 8
    265        SHRPD   %r19,%r3,32,%r3
    266        LDD     -104(%sp),%r31
    267 
    268        SHRPD   %r20,%r19,32,%r20   ; Cycle 9
    269        ADD     %r21,%r1,%r1
    270        LDD     -72(%sp),%r29
    271 
    272        ADD,DC  %r3,%r4,%r4         ; Cycle 10
    273 
    274        ADD,DC  %r0,%r20,%r20       ; Cycle 11
    275        LDD     0(%r23),%r3
    276 
    277        ADD     %r22,%r1,%r1        ; Cycle 13
    278 
    279 ; Shutdown code, second stage.
    280 
    281        ADD,DC  %r29,%r4,%r4        ; Cycle 1
    282 
    283        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
    284        ADD,DC  %r0,%r20,%r20
    285 
    286        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
    287        ADD     %r3,%r1,%r1
    288 
    289        ADD,DC  %r21,%r4,%r28       ; Cycle 4
    290 
    291        ADD,DC  %r20,%r31,%r22      ; Cycle 5
    292 
    293        STD     %r1,UN_SIXTEEN(%r23); Cycle 6
    294 
    295        STD     %r28,UN_EIGHT(%r23) ; Cycle 9
    296 
    297        LDD     0(%r23),%r3         ; Cycle 11
    298 
    299 ; Shutdown code, third stage.
    300 
    301        LDO     SIXTEEN(%r23),%r23
    302        ADD     %r3,%r22,%r1
    303 $JOIN1  ADD,DC  %r0,%r0,%r21        ; r21 = carry out of the last add
    304        CMPIB,*= 0,%r21,$L0         ; if no overflow, exit
    305        STD     %r1,UN_SIXTEEN(%r23)
    306 
    307 ; Final carry propagation
    308 
    309 $FINAL1 LDO     EIGHT(%r23),%r23
    310        LDD     UN_SIXTEEN(%r23),%r21
    311        ADDI    1,%r21,%r21
    312        CMPIB,*= 0,%r21,$FINAL1     ; Keep looping if there is a carry.
    313        STD     %r21,UN_SIXTEEN(%r23)
    314        B       $L0
    315        NOP
    316 
    317 ; Here is the code that handles the difficult cases N=1, N=2, and N=3.
    318 ; We do the usual trick -- branch out of the startup code at appropriate
    319 ; points, and branch into the shutdown code.
    320 
    321 $N_IS_SMALL
    322        CMPIB,= 0,%r26,$N_IS_ONE    ; r26 = N-1, so 0 means N = 1
    323        FSTD    %fr24,-96(%sp)      ; Cycle 10
    324        FLDD    EIGHT(%r24),%fr28   ; Cycle 8
    325        XMPYU   %fr9L,%fr28R,%fr31  ; Cycle 10
    326        XMPYU   %fr9R,%fr28L,%fr30  ; Cycle 11
    327        FSTD    %fr25,-80(%sp)
    328        FSTD    %fr31,-64(%sp)      ; Cycle 12
    329        XMPYU   %fr9R,%fr28R,%fr29  ; Cycle 13
    330        FSTD    %fr27,-48(%sp)
    331        XMPYU   %fr9L,%fr28L,%fr28  ; Cycle 1
    332        CMPIB,= 2,%r26,$N_IS_THREE  ; r26 = 2 means N = 3; otherwise fall through for N = 2
    333        FSTD    %fr30,-56(%sp)
    334 
    335 ; N = 2
    336        FSTD    %fr26,-88(%sp)      ; Cycle 2
    337        FSTD    %fr28,-104(%sp)     ; Cycle 3
    338        LDD     -96(%sp),%r3        ; Cycle 4
    339        FSTD    %fr29,-72(%sp)
    340        B       $JOIN4
    341        ADD     %r0,%r0,%r22        ; r22 = 0, and the carry bit is cleared
    342 
    343 $N_IS_THREE
    344        FLDD    SIXTEEN(%r24),%fr24
    345        FSTD    %fr26,-88(%sp)      ; Cycle 2
    346        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
    347        FSTD    %fr28,-104(%sp)
    348        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
    349        LDD     -96(%sp),%r3
    350        FSTD    %fr29,-72(%sp)
    351        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
    352        LDD     -64(%sp),%r19
    353        LDD     -80(%sp),%r21
    354        B       $JOIN3
    355        ADD     %r0,%r0,%r22        ; r22 = 0, and the carry bit is cleared
    356 
    357 $N_IS_ONE
    358        FSTD    %fr25,-80(%sp)
    359        FSTD    %fr27,-48(%sp)
    360        FSTD    %fr26,-88(%sp)      ; Cycle 2
    361        B       $JOIN5
    362        ADD     %r0,%r0,%r22        ; r22 = 0, and the carry bit is cleared
    363 
    364 ; We came out of the unrolled loop with wrong parity.  Do one more
    365 ; single cycle.  This is quite tricky, because of the way the
    366 ; carry chains and SHRPD chains have been chopped up.
    367 
    368 $ONEMORE
    369 
    370        FLDD    0(%r24),%fr24
    371 
    372        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
    373        ADD,DC  %r0,%r20,%r20
    374        FSTD    %fr26,-88(%sp)
    375 
    376        XMPYU   %fr9R,%fr24R,%fr27  ; Cycle 3
    377        FSTD    %fr28,-104(%sp)
    378        LDD     UN_EIGHT(%r23),%r21
    379        ADD     %r3,%r1,%r1
    380 
    381        XMPYU   %fr9R,%fr24L,%fr25  ; Cycle 4
    382        ADD,DC  %r21,%r4,%r28
    383        STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
    384        LDD     -96(%sp),%r3
    385        FSTD    %fr29,-72(%sp)    
    386 
    387        XMPYU   %fr9L,%fr24L,%fr26  ; Cycle 5
    388        ADD,DC  %r20,%r31,%r22
    389        LDD     -64(%sp),%r19
    390        LDD     -80(%sp),%r21
    391 
    392        STD     %r1,UN_SIXTEEN(%r23); Cycle 6
    393 $JOIN3
    394        XMPYU   %fr9L,%fr24R,%fr24
    395        LDD     -56(%sp),%r20
    396        ADD     %r21,%r3,%r3
    397 
    398        ADD,DC  %r20,%r19,%r19      ; Cycle 7
    399        LDD     -88(%sp),%r4
    400        SHRPD   %r3,%r0,32,%r21
    401        LDD     -48(%sp),%r1
    402 
    403        LDD     -104(%sp),%r31      ; Cycle 8
    404        ADD,DC  %r0,%r0,%r20
    405        SHRPD   %r19,%r3,32,%r3
    406 
    407        LDD     -72(%sp),%r29       ; Cycle 9
    408        SHRPD   %r20,%r19,32,%r20
    409        ADD     %r21,%r1,%r1
    410 
    411        ADD,DC  %r3,%r4,%r4         ; Cycle 10
    412        FSTD    %fr24,-96(%sp)
    413 
    414        ADD,DC  %r0,%r20,%r20       ; Cycle 11
    415        LDD     0(%r23),%r3
    416        FSTD    %fr25,-80(%sp)
    417 
    418        ADD     %r22,%r1,%r1        ; Cycle 13
    419        FSTD    %fr27,-48(%sp)
    420 
    421 ; Shutdown code, stage 1-1/2.
    422 
    423        ADD,DC  %r29,%r4,%r4        ; Cycle 1
    424 
    425        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
    426        ADD,DC  %r0,%r20,%r20     
    427        FSTD    %fr26,-88(%sp)
    428 
    429        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
    430        ADD     %r3,%r1,%r1
    431 
    432        ADD,DC  %r21,%r4,%r28       ; Cycle 4
    433        STD     %r28,UN_EIGHT(%r23) ; moved from cycle 9
    434 
    435        ADD,DC  %r20,%r31,%r22      ; Cycle 5
    436        STD     %r1,UN_SIXTEEN(%r23)
    437 $JOIN5
    438        LDD     -96(%sp),%r3        ; moved from cycle 4
    439        LDD     -80(%sp),%r21
    440        ADD     %r21,%r3,%r3        ; Cycle 6
    441        ADD,DC  %r0,%r0,%r19        ; Cycle 7
    442        LDD     -88(%sp),%r4
    443        SHRPD   %r3,%r0,32,%r21
    444        LDD     -48(%sp),%r1
    445        SHRPD   %r19,%r3,32,%r3     ; Cycle 8
    446        ADD     %r21,%r1,%r1        ; Cycle 9
    447        ADD,DC  %r3,%r4,%r4         ; Cycle 10
    448        LDD     0(%r23),%r3         ; Cycle 11
    449        ADD     %r22,%r1,%r1        ; Cycle 13
    450 
    451 ; Shutdown code, stage 2-1/2.
    452 
    453        ADD,DC  %r0,%r4,%r4         ; Cycle 1
    454        LDO     SIXTEEN(%r23),%r23  ; Cycle 2
    455        LDD     UN_EIGHT(%r23),%r21 ; Cycle 3
    456        ADD     %r3,%r1,%r1
    457        STD     %r1,UN_SIXTEEN(%r23)
    458        ADD,DC  %r21,%r4,%r1
    459        B       $JOIN1
    460        LDO     EIGHT(%r23),%r23
    461 
    462 ; exit
    463 
    464 $L0
    465        LDW     -124(%sp),%r4       ; restore r4
    466        BVE     (%r2)               ; return through r2
    467        .EXIT
    468        LDW,MB  -128(%sp),%r3       ; move sp back by 128, then restore r3
    469 
    470        .PROCEND
    471 
    472 ; ***************************************************************
    473 ;
    474 ;                 add_diag_[little/big]
    475 ;
    476 ; ***************************************************************
    477 
    478 ; The arguments are as follows:
    479 ;     r2   return PC, of course
    480 ;     r26 = arg1 =  length
    481 ;     r25 = arg2 =  vector to square
    482 ;     r24 = arg3 =  result vector
    483 
    484 #ifdef LITTLE_WORDIAN
    485 add_diag_little
    486 #else
    487 add_diag_big
    488 #endif
    489        .PROC
    490        .CALLINFO FRAME=120,ENTRY_GR=4
    491        .ENTRY
    492        STW,MA  %r3,128(%sp)        ; save r3, then advance sp by 128
    493        STW     %r4,-124(%sp)       ; save r4 (reloaded at $Z0)
    494 ; Frame slots -104(%sp) .. -64(%sp) carry XMPYU products from FP registers to general registers.
    495        ADDIB,< -1,%r26,$Z0         ; r26 = N-1.  If N=0, exit immediately.
    496        NOP
    497 
    498 ; Startup code
    499 
    500        FLDD    0(%r25),%fr7        ; Cycle 2 (alternate body)
    501        XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
    502        XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
    503        XMPYU   %fr7L,%fr7L,%fr30
    504        LDO     SIXTEEN(%r25),%r25  ; Cycle 6
    505        FSTD    %fr29,-88(%sp)
    506        FSTD    %fr27,-72(%sp)      ; Cycle 7
    507        CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body); r26 = 0 means N = 1
    508        FSTD    %fr30,-96(%sp)
    509        FLDD    UN_EIGHT(%r25),%fr7 ; Cycle 2
    510        LDD     -88(%sp),%r22       ; Cycle 3
    511        LDD     -72(%sp),%r31       ; Cycle 4
    512        XMPYU   %fr7R,%fr7R,%fr28
    513        XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
    514        XMPYU   %fr7L,%fr7L,%fr31
    515        LDD     -96(%sp),%r20       ; Cycle 6
    516        FSTD    %fr28,-80(%sp)
    517        ADD     %r0,%r0,%r0         ; clear the carry bit
    518        ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7
    519        FSTD    %fr24,-64(%sp)
    520 
    521 ; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".
    522 
    523 $DIAGLOOP
    524        SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body); r3 = low 64 bits of (r31 << 33)
    525        LDO     SIXTEEN(%r25),%r25
    526        LDD     0(%r24),%r1
    527        FSTD    %fr31,-104(%sp)
    528        SHRPD   %r0,%r31,31,%r4     ; Cycle 2; r4 = r31 >> 31, the bits shifted out above
    529        ADD,DC  %r22,%r3,%r3
    530        FLDD    UN_SIXTEEN(%r25),%fr7   
    531        ADD,DC  %r0,%r20,%r20       ; Cycle 3
    532        ADD     %r1,%r3,%r3
    533        XMPYU   %fr7R,%fr7R,%fr29   ; Cycle 4
    534        LDD     -80(%sp),%r21
    535        STD     %r3,0(%r24)
    536        XMPYU   %fr7L,%fr7R,%fr27   ; Cycle 5
    537        XMPYU   %fr7L,%fr7L,%fr30
    538        LDD     -64(%sp),%r29       
    539        LDD     EIGHT(%r24),%r1  
    540        ADD,DC  %r4,%r20,%r20       ; Cycle 6
    541        LDD     -104(%sp),%r19
    542        FSTD    %fr29,-88(%sp)
    543        ADD     %r20,%r1,%r1        ; Cycle 7
    544        FSTD    %fr27,-72(%sp)
    545        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
    546        LDO     THIRTY_TWO(%r24),%r24
    547        LDD     UN_SIXTEEN(%r24),%r28
    548        FSTD    %fr30,-96(%sp)
    549        SHRPD   %r0,%r29,31,%r3     ; Cycle 2
    550        ADD,DC  %r21,%r4,%r4
    551        FLDD    UN_EIGHT(%r25),%fr7
    552        STD     %r1,UN_TWENTY_FOUR(%r24)
    553        ADD,DC  %r0,%r19,%r19       ; Cycle 3
    554        ADD     %r28,%r4,%r4
    555        XMPYU   %fr7R,%fr7R,%fr28   ; Cycle 4
    556        LDD     -88(%sp),%r22
    557        STD     %r4,UN_SIXTEEN(%r24)
    558        XMPYU   %fr7L,%fr7R,%fr24   ; Cycle 5
    559        XMPYU   %fr7L,%fr7L,%fr31
    560        LDD     -72(%sp),%r31
    561        LDD     UN_EIGHT(%r24),%r28
    562        ADD,DC  %r3,%r19,%r19       ; Cycle 6
    563        LDD     -96(%sp),%r20
    564        FSTD    %fr28,-80(%sp)
    565        ADD     %r19,%r28,%r28      ; Cycle 7
    566        FSTD    %fr24,-64(%sp)
    567        ADDIB,> -2,%r26,$DIAGLOOP   ; Cycle 8
    568        STD     %r28,UN_EIGHT(%r24)
    569 
    570 $ENDDIAGLOOP
    571 
    572        ADD,DC  %r0,%r22,%r22    
    573        CMPIB,= 0,%r26,$ONEMOREDIAG ; r26 = 0: wrong parity, run one extra alternate body
    574        SHRPD   %r31,%r0,31,%r3
    575 
    576 ; Shutdown code, first stage.
    577 
    578        FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
    579        LDD     0(%r24),%r28
    580        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
    581        ADD     %r3,%r22,%r3
    582        ADD,DC  %r0,%r20,%r20       ; Cycle 3
    583        LDD     -80(%sp),%r21
    584        ADD     %r3,%r28,%r3
    585        LDD     -64(%sp),%r29       ; Cycle 4
    586        STD     %r3,0(%r24)
    587        LDD     EIGHT(%r24),%r1     ; Cycle 5
    588        LDO     SIXTEEN(%r25),%r25  ; Cycle 6
    589        LDD     -104(%sp),%r19
    590        ADD,DC  %r4,%r20,%r20
    591        ADD     %r20,%r1,%r1        ; Cycle 7
    592        ADD,DC  %r0,%r21,%r21       ; Cycle 8
    593        STD     %r1,EIGHT(%r24)
    594 
    595 ; Shutdown code, second stage.
    596 
    597        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
    598        LDO     THIRTY_TWO(%r24),%r24
    599        LDD     UN_SIXTEEN(%r24),%r1
    600        SHRPD   %r0,%r29,31,%r3      ; Cycle 2
    601        ADD     %r4,%r21,%r4
    602        ADD,DC  %r0,%r19,%r19       ; Cycle 3
    603        ADD     %r4,%r1,%r4
    604        STD     %r4,UN_SIXTEEN(%r24); Cycle 4
    605        LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
    606        ADD,DC  %r3,%r19,%r19       ; Cycle 6       
    607        ADD     %r19,%r28,%r28      ; Cycle 7
    608        ADD,DC  %r0,%r0,%r22        ; Cycle 8
    609        CMPIB,*= 0,%r22,$Z0         ; if no overflow, exit
    610        STD     %r28,UN_EIGHT(%r24)
    611 
    612 ; Final carry propagation
    613 
    614 $FDIAG2
    615        LDO     EIGHT(%r24),%r24
    616        LDD     UN_EIGHT(%r24),%r26
    617        ADDI    1,%r26,%r26
    618        CMPIB,*= 0,%r26,$FDIAG2     ; Keep looping if there is a carry.
    619        STD     %r26,UN_EIGHT(%r24)
    620 
    621        B   $Z0
    622        NOP
    623 
    624 ; Here is the code that handles the difficult case N=1.
    625 ; We do the usual trick -- branch out of the startup code at appropriate
    626 ; points, and branch into the shutdown code.
    627 
    628 $DIAG_N_IS_ONE
    629 
    630        LDD     -88(%sp),%r22
    631        LDD     -72(%sp),%r31
    632        B       $JOINDIAG
    633        LDD     -96(%sp),%r20
    634 
    635 ; We came out of the unrolled loop with wrong parity.  Do one more
    636 ; single cycle.  This is the "alternate body".  It will, of course,
    637 ; give us opposite registers from the other case, so we need
    638 ; completely different shutdown code.
    639 
    640 $ONEMOREDIAG
    641        FSTD    %fr31,-104(%sp)     ; Cycle 1 (alternate body)
    642        LDD     0(%r24),%r28
    643        FLDD    0(%r25),%fr7        ; Cycle 2
    644        SHRPD   %r0,%r31,31,%r4
    645        ADD     %r3,%r22,%r3
    646        ADD,DC  %r0,%r20,%r20       ; Cycle 3
    647        LDD     -80(%sp),%r21
    648        ADD     %r3,%r28,%r3
    649        LDD     -64(%sp),%r29       ; Cycle 4
    650        STD     %r3,0(%r24)
    651        XMPYU   %fr7R,%fr7R,%fr29
    652        LDD     EIGHT(%r24),%r1     ; Cycle 5
    653        XMPYU   %fr7L,%fr7R,%fr27
    654        XMPYU   %fr7L,%fr7L,%fr30
    655        LDD     -104(%sp),%r19      ; Cycle 6
    656        FSTD    %fr29,-88(%sp)
    657        ADD,DC  %r4,%r20,%r20
    658        FSTD    %fr27,-72(%sp)      ; Cycle 7
    659        ADD     %r20,%r1,%r1
    660        ADD,DC  %r0,%r21,%r21       ; Cycle 8
    661        STD     %r1,EIGHT(%r24)
    662 
    663 ; Shutdown code, first stage.
    664 
    665        SHRPD   %r29,%r0,31,%r4     ; Cycle 1 (main body)
    666        LDO     THIRTY_TWO(%r24),%r24
    667        FSTD    %fr30,-96(%sp)
    668        LDD     UN_SIXTEEN(%r24),%r1
    669        SHRPD   %r0,%r29,31,%r3     ; Cycle 2
    670        ADD     %r4,%r21,%r4
    671        ADD,DC  %r0,%r19,%r19       ; Cycle 3
    672        LDD     -88(%sp),%r22
    673        ADD     %r4,%r1,%r4
    674        LDD     -72(%sp),%r31       ; Cycle 4
    675        STD     %r4,UN_SIXTEEN(%r24)
    676        LDD     UN_EIGHT(%r24),%r28 ; Cycle 5
    677        LDD     -96(%sp),%r20       ; Cycle 6
    678        ADD,DC  %r3,%r19,%r19
    679        ADD     %r19,%r28,%r28      ; Cycle 7
    680        ADD,DC  %r0,%r22,%r22       ; Cycle 8
    681        STD     %r28,UN_EIGHT(%r24)
    682 
    683 ; Shutdown code, second stage.
    684 
    685 $JOINDIAG
    686        SHRPD   %r31,%r0,31,%r3     ; Cycle 1 (alternate body)
    687        LDD     0(%r24),%r28        
    688        SHRPD   %r0,%r31,31,%r4     ; Cycle 2
    689        ADD     %r3,%r22,%r3
    690        ADD,DC  %r0,%r20,%r20       ; Cycle 3
    691        ADD     %r3,%r28,%r3
    692        STD     %r3,0(%r24)         ; Cycle 4
    693        LDD     EIGHT(%r24),%r1     ; Cycle 5
    694        ADD,DC  %r4,%r20,%r20
    695        ADD     %r20,%r1,%r1        ; Cycle 7
    696        ADD,DC  %r0,%r0,%r21        ; Cycle 8
    697        CMPIB,*= 0,%r21,$Z0         ; if no overflow, exit
    698        STD     %r1,EIGHT(%r24)
    699 
    700 ; Final carry propagation
    701 
    702 $FDIAG1
    703        LDO     EIGHT(%r24),%r24
    704        LDD     EIGHT(%r24),%r26
    705        ADDI    1,%r26,%r26
    706        CMPIB,*= 0,%r26,$FDIAG1    ; Keep looping if there is a carry.
    707        STD     %r26,EIGHT(%r24)
    708 ; No branch is needed after the $FDIAG1 loop: it falls through into the exit code.
    709 $Z0
    710        LDW     -124(%sp),%r4       ; restore r4
    711        BVE     (%r2)               ; return through r2
    712        .EXIT
    713        LDW,MB  -128(%sp),%r3       ; move sp back by 128, then restore r3
    714        .PROCEND
    715 ;	.ALLOW
    716 
    717        .SPACE         $TEXT$
    718        .SUBSPA        $CODE$
    719 #ifdef LITTLE_WORDIAN
    720 #ifdef __GNUC__
    721 ; GNU-as (as of 2.19) does not support LONG_RETURN, so it is omitted here.  NOTE(review): the BIG_WORDIAN exports below have no such guard.
    722        .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
    723        .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
    724 #else
    725        .EXPORT        maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
    726        .EXPORT        add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
    727 #endif
    728 #else
    729        .EXPORT        maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
    730        .EXPORT        add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
    731 #endif
    732        .END
    733 
    734 
    735 ; How to use "maxpy_PA20_little" and "maxpy_PA20_big" (exported by this file as "maxpy_little" and "maxpy_big")
    736 ; 
    737 ; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
    738 ; performs a 64-bit x any-size multiply, and adds the
    739 ; result to an area of memory.  That is, it performs
    740 ; something like
    741 ; 
    742 ;      A B C D
    743 ;    *       Z
    744 ;   __________
    745 ;    P Q R S T
    746 ; 
    747 ; and then adds the "PQRST" vector into an area of memory,
    748 ; handling all carries.
    749 ; 
    750 ; Digression on nomenclature and endian-ness:
    751 ; 
    752 ; Each of the capital letters in the above represents a 64-bit
    753 ; quantity.  That is, you could think of the discussion as
    754 ; being in terms of radix-16-quintillion arithmetic.  The data
    755 ; type being manipulated is "unsigned long long int".  This
    756 ; requires the 64-bit extension of the HP-UX C compiler,
    757 ; available at release 10.  You need these compiler flags to
    758 ; enable these extensions:
    759 ; 
    760 ;       -Aa +e +DA2.0 +DS2.0
    761 ; 
    762 ; (The first specifies ANSI C, the second enables the
    763 ; extensions, which are beyond ANSI C, and the third and
    764 ; fourth tell the compiler to use whatever features of the
    765 ; PA2.0 architecture it wishes, in order to make the code more
    766 ; efficient.  Since the presence of the assembly code will
    767 ; make the program unable to run on anything less than PA2.0,
    768 ; you might as well gain the performance enhancements in the C
    769 ; code as well.)
    770 ; 
    771 ; Questions of "endian-ness" often come up, usually in the
    772 ; context of byte ordering in a word.  These routines have a
    773 ; similar issue, that could be called "wordian-ness".
    774 ; Independent of byte ordering (PA is always big-endian), one
    775 ; can make two choices when representing extremely large
    776 ; numbers as arrays of 64-bit doublewords in memory.
    777 ; 
    778 ; "Little-wordian" layout means that the least significant
    779 ; word of a number is stored at the lowest address.
    780 ; 
    781 ;   MSW     LSW
    782 ;    |       |
    783 ;    V       V
    784 ; 
    785 ;    A B C D E
    786 ; 
    787 ;    ^     ^ ^
    788 ;    |     | |____ address 0
    789 ;    |     |
    790 ;    |     |_______address 8
    791 ;    |
    792 ;    address 32
    793 ; 
    794 ; "Big-wordian" means that the most significant word is at the
    795 ; lowest address.
    796 ; 
    797 ;   MSW     LSW
    798 ;    |       |
    799 ;    V       V
    800 ; 
    801 ;    A B C D E
    802 ; 
    803 ;    ^     ^ ^
    804 ;    |     | |____ address 32
    805 ;    |     |
    806 ;    |     |_______address 24
    807 ;    |
    808 ;    address 0
    809 ; 
    810 ; When you compile the file, you must specify one or the other, with
    811 ; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
    812 ; 
    813 ;     Incidentally, you assemble this file as part of your
    814 ;     project with the same C compiler as the rest of the program.
    815 ;     My "makefile" for a superprecision arithmetic package has
    816 ;     the following stuff:
    817 ; 
    818 ;     # definitions:
    819 ;     CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
    820 ;     CFLAGS = +O3
    821 ;     LDFLAGS = -L /usr/lib -Wl,-aarchive
    822 ; 
    823 ;     # general build rule for ".s" files:
    824 ;     .s.o:
    825 ;             $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
    826 ; 
    827 ;     # Now any bind step that calls for pa20.o will assemble pa20.s
    828 ; 
    829 ; End of digression, back to arithmetic:
    830 ; 
    831 ; The way we multiply two huge numbers is, of course, to multiply
    832 ; the "ABCD" vector by each of the "WXYZ" doublewords, adding
    833 ; the result vectors with increasing offsets, the way we learned
    834 ; in school, back before we all used calculators:
    835 ; 
    836 ;            A B C D
    837 ;          * W X Y Z
    838 ;         __________
    839 ;          P Q R S T
    840 ;        E F G H I
    841 ;      M N O P Q
    842 ;  + R S T U V
    843 ;    _______________
    844 ;    F I N A L S U M
    845 ; 
    846 ; So we call maxpy_PA20_big (in my case; my package is
    847 ; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
    848 ; in turn as the "scalar", and giving the "ABCD" vector each
    849 ; time.  We direct it to add its result into an area of memory
    850 ; that we have cleared at the start.  We skew the exact
    851 ; location into that area with each call.
    852 ; 
    853 ; The prototype for the function is
    854 ; 
    855 ; extern void maxpy_PA20_big(
    856 ;    int length,        /* Number of doublewords in the multiplicand vector. */
    857 ;    const long long int *scalaraddr,    /* Address to fetch the scalar. */
    858 ;    const long long int *multiplicand,  /* The multiplicand vector. */
    859 ;    long long int *result);             /* Where to accumulate the result. */
    860 ; 
    861 ; (You should place a copy of this prototype in an include file
    862 ; or in your C file.)
    863 ; 
    864 ; Now, IN ALL CASES, the given address for the multiplicand or
    865 ; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
    866 ; That word is, of course, the word at which the routine
    867 ; starts processing.  "maxpy_PA20_little" then increases the
    868 ; addresses as it computes.  "maxpy_PA20_big" decreases them.
    869 ; 
    870 ; In our example above, "length" would be 4 in each case.
    871 ; "multiplicand" would be the "ABCD" vector.  Specifically,
    872 ; the address of the element "D".  "scalaraddr" would be the
    873 ; address of "W", "X", "Y", or "Z" on the four calls that we
    874 ; would make.  (The order doesn't matter, of course.)
    875 ; "result" would be the appropriate address in the result
    876 ; area.  When multiplying by "Z", that would be the least
    877 ; significant word.  When multiplying by "Y", it would be the
    878 ; next higher word (8 bytes higher if little-wordian; 8 bytes
    879 ; lower if big-wordian), and so on.  The size of the result
    880 ; area must be the sum of the sizes of the multiplicand
    881 ; and multiplier vectors, and must be initialized to zero
    882 ; before we start.
    883 ; 
    884 ; Whenever the routine adds its partial product into the result
    885 ; vector, it follows carry chains as far as they need to go.
    886 ; 
    887 ; Here is the super-precision multiply routine that I use for
    888 ; my package.  The package is big-wordian.  I have taken out
    889 ; handling of exponents (it's a floating point package):
    890 ; 
    891 ; static void mul_PA20(
    892 ;   int size,
    893 ;   const long long int *arg1,
    894 ;   const long long int *arg2,
    895 ;   long long int *result)
    896 ; {
    897 ;    int i;
    898 ; 
    899 ;    for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
    900 ; 
    901 ;    for (i=0 ; i<size ; i++) {
    902 ;       maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
    903 ;    }
    904 ; }