tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

hpma512.s (26033B)


      1 /* This Source Code Form is subject to the terms of the Mozilla Public
      2 * License, v. 2.0. If a copy of the MPL was not distributed with this
      3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      4 /*
      5 *
      6 *  This PA-RISC 2.0 function computes the product of two unsigned integers,
      7 *  and adds the result to a previously computed integer.  The multiplicand
      8 *  is a 512-bit (64-byte, eight doubleword) unsigned integer, stored in
      9 *  memory in little-double-wordian order.  The multiplier is an unsigned
     10 *  64-bit integer.  The previously computed integer to which the product is
     11 *  added is located in the result ("res") area, and is assumed to be a
     12 *  576-bit (72-byte, nine doubleword) unsigned integer, stored in memory
     13 *  in little-double-wordian order.  This value normally will be the result
     14 *  of a previously computed nine doubleword result.  It is not necessary
     15 *  to pad the multiplicand with an additional 64-bit zero doubleword.
     16 *
     17 *  Multiplicand, multiplier, and addend ideally should be aligned at
     18 *  16-byte boundaries for best performance.  The code will function
     19 *  correctly for alignment at eight-byte boundaries which are not 16-byte
     20 *  boundaries, but the execution may be slightly slower due to even/odd
     21 *  bank conflicts on PA-RISC 8000 processors.
     22 *
     23 *  This function is designed to accept the same calling sequence as Bill
     24 *  Ackerman's "maxpy_little" function.  The carry from the ninth doubleword
     25 *  of the result is added into the tenth doubleword of the result, as is done by
     26 *  Bill Ackerman's function.  The final carry also is returned as an
     27 *  integer, which may be ignored.  The function prototype may be either
     28 *  of the following:
     29 *
     30 *      void multacc512( int l, chunk* m, const chunk* a, chunk* res );
     31 *          or
     32 *      int multacc512( int l, chunk* m, const chunk* a, chunk* res );
     33 *
     34 *  where:  "l" originally denoted vector lengths.  This parameter is
     35 *      ignored.  This function always assumes a multiplicand length of
     36 *      512 bits (eight doublewords), and addend and result lengths of
     37 *      576 bits (nine doublewords).
     38 *
     39 *      "m" is a pointer to the doubleword multiplier, ideally aligned
     40 *      on a 16-byte boundary.
     41 *
     42 *      "a" is a pointer to the eight-doubleword multiplicand, stored
     43 *      in little-double-wordian order, and ideally aligned on a 16-byte
     44 *      boundary.
     45 *
     46 *      "res" is a pointer to the nine doubleword addend, and to the
     47 *      nine-doubleword product computed by this function.  The result
     48 *      also is stored in little-double-wordian order, and ideally is
     49 *      aligned on a 16-byte boundary. It is expected that the alignment
     50 *      of the "res" area may alternate between even/odd doubleword
     51 *      boundaries for successive calls for 512-bit x 512-bit
     52 *      multiplications.
     53 *
     54 *  The code for this function has been scheduled to use the parallelism
     55 *  of the PA-RISC 8000 series microprocessors as well as the author was
     56 *  able.  Comments and/or suggestions for improvement are welcomed.
     57 *
     58 *  The code is "64-bit safe".  This means it may be called in either
     59 *  the 32ILP context or the 64LP context.  All 64-bits of registers are
     60 *  saved and restored.
     61 *
     62 *  This code is self-contained.  It requires no other header files in order
     63 *  to compile and to be linkable on a PA-RISC 2.0 machine.  Symbolic
     64 *  definitions for registers and stack offsets are included within this
     65 *  one source file.
     66 *
     67 *  This is a leaf routine.  As such, minimal use is made of the stack area.
     68 *  Of the 192 bytes allocated, 64 bytes are used for saving/restoring eight
     69 *  general registers, and 128 bytes are used to move intermediate products
     70 *  from the floating-point registers to the general registers.  Stack
     71 *  protocols assure proper alignment of these areas.
     72 *
     73 */
     74 
     75 
     76 /*  ====================================================================*/
     77 /*      symbolic definitions for PA-RISC registers      */
     78 /*      in the MIPS style, avoids lots of case shifts       */
     79 /*      assignments (except t4 = %r31) preserve register number parity  */
     80 /*  ====================================================================*/
     81 
     82 #define zero    %r0         /* permanent zero */
     83 #define t5      %r1         /* temp register, altered by addil */
     84 
     85 #define rp      %r2         /* return pointer */
     86 
     87 #define s1      %r3         /* callee-saved register (kept in SV1) */
     88 #define s0      %r4         /* callee-saved register (kept in SV0) */
     89 #define s3      %r5         /* callee-saved register (kept in SV3) */
     90 #define s2      %r6         /* callee-saved register (kept in SV2) */
     91 #define s5      %r7         /* callee-saved register (kept in SV5) */
     92 #define s4      %r8         /* callee-saved register (kept in SV4) */
     93 #define s7      %r9         /* callee-saved register (kept in SV7) */
     94 #define s6      %r10        /* callee-saved register (kept in SV6) */
     95 
     96 #define t1      %r19        /* caller-saved register */
     97 #define t0      %r20        /* caller-saved register */
     98 #define t3      %r21        /* caller-saved register */
     99 #define t2      %r22        /* caller-saved register */
    100 
    101 #define a3      %r23        /* fourth argument register, high word */
    102 #define a2      %r24        /* third argument register, low word*/
    103 #define a1      %r25        /* second argument register, high word*/
    104 #define a0      %r26        /* first argument register, low word*/
    105 
    106 #define v0      %r28        /* high order return value*/
    107 #define v1      %r29        /* low order return value*/
    108 
    109 #define sp      %r30        /* stack pointer*/
    110 #define t4      %r31        /* temporary register (the parity exception) */
    111 
    112 #define fa0     %fr4        /* first FP argument register */
    113 #define fa1     %fr5        /* second FP argument register */
    114 #define fa2     %fr6        /* third FP argument register */
    115 #define fa3     %fr7        /* fourth FP argument register */
    116 
    117 #define fa0r    %fr4R       /* right (low-order) half of fa0 */
    118 #define fa1r    %fr5R       /* right (low-order) half of fa1 */
    119 #define fa2r    %fr6R       /* right (low-order) half of fa2 */
    120 #define fa3r    %fr7R       /* right (low-order) half of fa3 */
    121 
    122 #define ft0     %fr8        /* caller-saved FP register */
    123 #define ft1     %fr9        /* caller-saved FP register */
    124 #define ft2     %fr10       /* caller-saved FP register */
    125 #define ft3     %fr11       /* caller-saved FP register */
    126 
    127 #define ft0r    %fr8R       /* right (low-order) half of ft0 */
    128 #define ft1r    %fr9R       /* right (low-order) half of ft1 */
    129 #define ft2r    %fr10R      /* right (low-order) half of ft2 */
    130 #define ft3r    %fr11R      /* right (low-order) half of ft3 */
    131 
    132 #define ft4     %fr22       /* caller-saved FP register */
    133 #define ft5     %fr23       /* caller-saved FP register */
    134 #define ft6     %fr24       /* caller-saved FP register */
    135 #define ft7     %fr25       /* caller-saved FP register */
    136 #define ft8     %fr26       /* caller-saved FP register */
    137 #define ft9     %fr27       /* caller-saved FP register */
    138 #define ft10    %fr28       /* caller-saved FP register */
    139 #define ft11    %fr29       /* caller-saved FP register */
    140 #define ft12    %fr30       /* caller-saved FP register */
    141 #define ft13    %fr31       /* caller-saved FP register */
    142 
    143 #define ft4r    %fr22R      /* right (low-order) half of ft4 */
    144 #define ft5r    %fr23R      /* right (low-order) half of ft5 */
    145 #define ft6r    %fr24R      /* right (low-order) half of ft6 */
    146 #define ft7r    %fr25R      /* right (low-order) half of ft7 */
    147 #define ft8r    %fr26R      /* right (low-order) half of ft8 */
    148 #define ft9r    %fr27R      /* right (low-order) half of ft9 */
    149 #define ft10r   %fr28R      /* right (low-order) half of ft10 */
    150 #define ft11r   %fr29R      /* right (low-order) half of ft11 */
    151 #define ft12r   %fr30R      /* right (low-order) half of ft12 */
    152 #define ft13r   %fr31R      /* right (low-order) half of ft13 */
    153 
    154 
    155 
    156 /*  ================================================================== */
    157 /*      functional definitions for PA-RISC registers           */
    158 /*  ================================================================== */
    159 
    160 /*              general registers           */
    161 
    162 #define T1      a0          /* temp, (length parameter ignored)             */
    163 
    164 #define pM      a1          /* -> 64-bit multiplier                         */
    165 #define T2      a1          /* temp, reuses pM (after fetching multiplier)  */
    166 
    167 #define pA      a2          /* -> multiplicand vector (8 64-bit words)      */
    168 #define T3      a2          /* temp, reuses pA (after fetching multiplicand)*/
    169 
    170 #define pR      a3          /* -> addend vector (9 64-bit doublewords),
    171                                  result vector (9 64-bit doublewords)      */
    172 
    173 #define S0      s0          /* callee saves summand registers               */
    174 #define S1      s1
    175 #define S2      s2
    176 #define S3      s3
    177 #define S4      s4
    178 #define S5      s5
    179 #define S6      s6
    180 #define S7      s7
    181 
    182 #define S8      v0          /* caller saves summand registers               */
    183 #define S9      v1
    184 #define S10     t0
    185 #define S11     t1
    186 #define S12     t2
    187 #define S13     t3
    188 #define S14     t4
    189 #define S15     t5
    190 
    191 
    192 
    193 /*              floating-point registers                                    */
    194 
    195 #define M       fa0         /* multiplier double word                       */
    196 #define MR      fa0r        /* low order half of multiplier double word     */
    197 #define ML      fa0         /* high order half of multiplier double word    */
    198 
    199 #define A0      fa2         /* multiplicand double word 0                   */
    200 #define A0R     fa2r        /* low order half of multiplicand double word   */
    201 #define A0L     fa2         /* high order half of multiplicand double word  */
    202 
    203 #define A1      fa3         /* multiplicand double word 1                   */
    204 #define A1R     fa3r        /* low order half of multiplicand double word   */
    205 #define A1L     fa3         /* high order half of multiplicand double word  */
    206 
    207 #define A2      ft0         /* multiplicand double word 2                   */
    208 #define A2R     ft0r        /* low order half of multiplicand double word   */
    209 #define A2L     ft0         /* high order half of multiplicand double word  */
    210 
    211 #define A3      ft1         /* multiplicand double word 3                   */
    212 #define A3R     ft1r        /* low order half of multiplicand double word   */
    213 #define A3L     ft1         /* high order half of multiplicand double word  */
    214 
    215 #define A4      ft2         /* multiplicand double word 4                   */
    216 #define A4R     ft2r        /* low order half of multiplicand double word   */
    217 #define A4L     ft2         /* high order half of multiplicand double word  */
    218 
    219 #define A5      ft3         /* multiplicand double word 5                   */
    220 #define A5R     ft3r        /* low order half of multiplicand double word   */
    221 #define A5L     ft3         /* high order half of multiplicand double word  */
    222 
    223 #define A6      ft4         /* multiplicand double word 6                   */
    224 #define A6R     ft4r        /* low order half of multiplicand double word   */
    225 #define A6L     ft4         /* high order half of multiplicand double word  */
    226 
    227 #define A7      ft5         /* multiplicand double word 7                   */
    228 #define A7R     ft5r        /* low order half of multiplicand double word   */
    229 #define A7L     ft5         /* high order half of multiplicand double word  */
    230 
    231 #define P0      ft6         /* xmpyu product scratch register 0             */
    232 #define P1      ft7         /* xmpyu product scratch register 1             */
    233 #define P2      ft8         /* xmpyu product scratch register 2             */
    234 #define P3      ft9         /* xmpyu product scratch register 3             */
    235 #define P4      ft10        /* xmpyu product scratch register 4             */
    236 #define P5      ft11        /* xmpyu product scratch register 5             */
    237 #define P6      ft12        /* xmpyu product scratch register 6             */
    238 #define P7      ft13        /* xmpyu product scratch register 7             */
    239 
    240 
    241 
    242 
    243 /*  ======================================================================  */
    244 /*      symbolic definitions for HP-UX stack offsets                        */
    245 /*      symbolic definitions for memory NOPs                                */
    246 /*      (all offsets are relative to sp after ST_SZ has been added to it)   */
    247 /*  ======================================================================  */
    248 #define ST_SZ       192         /* stack area total size                    */
    249 
    250 #define SV0         -192(sp)    /* general register save area: s0           */
    251 #define SV1         -184(sp)    /* s1                                       */
    252 #define SV2         -176(sp)    /* s2                                       */
    253 #define SV3         -168(sp)    /* s3                                       */
    254 #define SV4         -160(sp)    /* s4                                       */
    255 #define SV5         -152(sp)    /* s5                                       */
    256 #define SV6         -144(sp)    /* s6                                       */
    257 #define SV7         -136(sp)    /* s7                                       */
    258 
    259 #define XF0         -128(sp)    /* data transfer area                       */
    260 #define XF1         -120(sp)    /* for floating-pt to integer regs          */
    261 #define XF2         -112(sp)
    262 #define XF3         -104(sp)
    263 #define XF4         -96(sp)
    264 #define XF5         -88(sp)
    265 #define XF6         -80(sp)
    266 #define XF7         -72(sp)
    267 #define XF8         -64(sp)     /* not referenced by the code below         */
    268 #define XF9         -56(sp)
    269 #define XF10        -48(sp)     /* not referenced by the code below         */
    270 #define XF11        -40(sp)
    271 #define XF12        -32(sp)     /* not referenced by the code below         */
    272 #define XF13        -24(sp)
    273 #define XF14        -16(sp)     /* not referenced by the code below         */
    274 #define XF15        -8(sp)
    275 
    276 #define mnop    proberi (sp),3,zero     /* memory NOP (not referenced below) */
    277 
    278 
    279 
    280 
    281 /*  ======================================================================  */
    282 /*      assembler formalities                                               */
    283 /*      architecture level depends on __LP64__; code goes in $TEXT$/$CODE$  */
    284 /*  ======================================================================  */
    285 #ifdef __LP64__
    286                .level  2.0W     /* selected when __LP64__ is defined */
    287 #else
    288                .level  2.0      /* selected otherwise */
    289 #endif
    290                .space    $TEXT$
    291                .subspa   $CODE$
    292                .align    16     /* align the entry point that follows */
    293 
    294 /*  ======================================================================  */
    295 /*      here to compute 64-bit x 512-bit product + 576-bit addend           */
    296 /*  ======================================================================  */
        /*
         *  multacc512:  res[0..8] += M * A[0..7]   (64-bit doublewords,
         *               least significant doubleword first)
         *
         *  On entry:   a0 = length argument (ignored; register reused as T1)
         *              a1 = pM, -> multiplier doubleword M
         *              a2 = pA, -> eight-doubleword multiplicand A
         *              a3 = pR, -> nine-doubleword addend / result area
         *
         *  On exit:    v0 = 0 when the add into res[8] produced no carry.
         *              Otherwise 1 is added to res[9] (and to following
         *              doublewords while that add wraps to zero), and v0
         *              holds the last doubleword written by that loop, which
         *              is nonzero as a 64-bit value but is not a 0/1 flag.
         *              a0-a3, v0, v1, t0-t5 and the FP registers named
         *              M, A0-A7 and P0-P7 are overwritten.
         *
         *  Method:     each 64 x 64 product M * Ai is assembled from four
         *              32 x 32 xmpyu products.  The two cross products
         *              (MR * AiL and ML * AiR) are summed, the carry of that
         *              sum is captured, and the sum is split: (sum << 32) is
         *              added to MR * AiR to give the right (low) doubleword,
         *              and (carry : sum >> 32) is added with carry to
         *              ML * AiL to give the left (high) doubleword.  xmpyu
         *              results travel from FP to general registers through
         *              the XF stack slots.
         *
         *  Stack:      ST_SZ bytes are pushed on entry and popped in the
         *              delay slot of the return branch; s0-s7 are saved in
         *              SV0-SV7 and reloaded before returning.
         *
         *  Note:       add,dc consumes the carry left by the preceding
         *              add / add,dc, so the order of those instructions is
         *              part of the algorithm.
         */
    297 
    298 multacc512
    299        .PROC
    300        .CALLINFO
    301        .ENTRY
           ; Prologue: load M and A0..A7 into FP registers, push the frame,
           ; save s0-s7, and start the A0 and A2 cross products.
    302    fldd    0(pM),M                 ; multiplier double word
    303    ldo     ST_SZ(sp),sp            ; push stack
    304 
    305    fldd    0(pA),A0                ; multiplicand double word 0
    306    std     S1,SV1                  ; save s1
    307 
    308    fldd    16(pA),A2               ; multiplicand double word 2
    309    std     S3,SV3                  ; save s3
    310 
    311    fldd    32(pA),A4               ; multiplicand double word 4
    312    std     S5,SV5                  ; save s5
    313 
    314    fldd    48(pA),A6               ; multiplicand double word 6
    315    std     S7,SV7                  ; save s7
    316 
    317 
    318    std     S0,SV0                  ; save s0
    319    fldd    8(pA),A1                ; multiplicand double word 1
    320    xmpyu   MR,A0L,P0               ; A0 cross 32-bit word products
    321    xmpyu   ML,A0R,P2               ; (second A0 cross product)
    322 
    323    std     S2,SV2                  ; save s2
    324    fldd    24(pA),A3               ; multiplicand double word 3
    325    xmpyu   MR,A2L,P4               ; A2 cross 32-bit word products
    326    xmpyu   ML,A2R,P6               ; (second A2 cross product)
    327 
    328    std     S4,SV4                  ; save s4
    329    fldd    40(pA),A5               ; multiplicand double word 5
    330 
    331    std     S6,SV6                  ; save s6
    332    fldd    56(pA),A7               ; multiplicand double word 7
    333 
    334 
           ; Spill cross products to the XF transfer area while issuing
           ; further 32 x 32 products for A0..A3.
    335    fstd    P0,XF0                  ; MR * A0L
    336    xmpyu   MR,A0R,P0               ; A0 right 32-bit word product
    337    xmpyu   MR,A1L,P1               ; A1 cross 32-bit word product
    338 
    339    fstd    P2,XF2                  ; ML * A0R
    340    xmpyu   ML,A0L,P2               ; A0 left 32-bit word product
    341    xmpyu   ML,A1R,P3               ; A1 cross 32-bit word product
    342 
    343    fstd    P4,XF4                  ; MR * A2L
    344    xmpyu   MR,A2R,P4               ; A2 right 32-bit word product
    345    xmpyu   MR,A3L,P5               ; A3 cross 32-bit word product
    346 
    347    fstd    P6,XF6                  ; ML * A2R
    348    xmpyu   ML,A2L,P6               ; A2 left 32-bit word product
    349    xmpyu   ML,A3R,P7               ; A3 cross 32-bit word product
    350 
    351 
    352    ldd     XF0,S0                  ; MR * A0L
    353    fstd    P1,XF1                  ; MR * A1L
    354 
    355    ldd     XF2,S2                  ; ML * A0R
    356    fstd    P3,XF3                  ; ML * A1R
    357 
    358    ldd     XF4,S4                  ; MR * A2L
    359    fstd    P5,XF5                  ; MR * A3L
    360    xmpyu   MR,A1R,P1               ; A1 parallel 32-bit word products
    361    xmpyu   ML,A1L,P3               ; (A1 left product)
    362 
    363    ldd     XF6,S6                  ; ML * A2R
    364    fstd    P7,XF7                  ; ML * A3R
    365    xmpyu   MR,A3R,P5               ; A3 parallel 32-bit word products
    366    xmpyu   ML,A3L,P7               ; (A3 left product)
    367 
    368 
           ; Cross-product handling for A0 and A2: sum, capture the carry,
           ; then split into (sum << 32) and (carry : sum >> 32).
    369    fstd    P0,XF0                  ; MR * A0R
    370    ldd     XF1,S1                  ; MR * A1L
    371    nop                             ; filler; presumably for scheduling (TODO confirm)
    372    add     S0,S2,T1                ; A0 cross product sum
    373 
    374    fstd    P2,XF2                  ; ML * A0L
    375    ldd     XF3,S3                  ; ML * A1R
    376    add,dc  zero,zero,S0            ; A0 cross product sum carry
    377    depd,z  T1,31,32,S2             ; A0 cross product sum << 32
    378 
    379    fstd    P4,XF4                  ; MR * A2R
    380    ldd     XF5,S5                  ; MR * A3L
    381    shrpd   S0,T1,32,S0             ; A0 carry | cross product sum >> 32
    382    add     S4,S6,T3                ; A2 cross product sum
    383 
    384    fstd    P6,XF6                  ; ML * A2L
    385    ldd     XF7,S7                  ; ML * A3R
    386    add,dc  zero,zero,S4            ; A2 cross product sum carry
    387    depd,z  T3,31,32,S6             ; A2 cross product sum << 32
    388 
    389 
    390    ldd     XF0,S8                  ; MR * A0R
    391    fstd    P1,XF1                  ; MR * A1R
    392    xmpyu   MR,A4L,P0               ; A4 cross 32-bit word product
    393    xmpyu   MR,A5L,P1               ; A5 cross 32-bit word product
    394 
    395    ldd     XF2,S10                 ; ML * A0L
    396    fstd    P3,XF3                  ; ML * A1L
    397    xmpyu   ML,A4R,P2               ; A4 cross 32-bit word product
    398    xmpyu   ML,A5R,P3               ; A5 cross 32-bit word product
    399 
    400    ldd     XF4,S12                 ; MR * A2R
    401    fstd    P5,XF5                  ; MR * A3R
    402    xmpyu   MR,A6L,P4               ; A6 cross 32-bit word product
    403    xmpyu   MR,A7L,P5               ; A7 cross 32-bit word product
    404 
    405    ldd     XF6,S14                 ; ML * A2L
    406    fstd    P7,XF7                  ; ML * A3L
    407    xmpyu   ML,A6R,P6               ; A6 cross 32-bit word product
    408    xmpyu   ML,A7R,P7               ; A7 cross 32-bit word product
    409 
    410 
           ; Same cross-product handling for A1 and A3.
    411    fstd    P0,XF0                  ; MR * A4L
    412    ldd     XF1,S9                  ; MR * A1R
    413    shrpd   S4,T3,32,S4             ; A2 carry | cross product sum >> 32
    414    add     S1,S3,T1                ; A1 cross product sum
    415 
    416    fstd    P2,XF2                  ; ML * A4R
    417    ldd     XF3,S11                 ; ML * A1L
    418    add,dc  zero,zero,S1            ; A1 cross product sum carry
    419    depd,z  T1,31,32,S3             ; A1 cross product sum << 32
    420 
    421    fstd    P4,XF4                  ; MR * A6L
    422    ldd     XF5,S13                 ; MR * A3R
    423    shrpd   S1,T1,32,S1             ; A1 carry | cross product sum >> 32
    424    add     S5,S7,T3                ; A3 cross product sum
    425 
    426    fstd    P6,XF6                  ; ML * A6R
    427    ldd     XF7,S15                 ; ML * A3L
    428    add,dc  zero,zero,S5            ; A3 cross product sum carry
    429    depd,z  T3,31,32,S7             ; A3 cross product sum << 32
    430 
    431 
           ; Form the right/left doublewords of M * A0..A3 and chain them
           ; into product doublewords P0..P4, overlapped with the addend
           ; loads and the remaining A4..A7 products.
    432    shrpd   S5,T3,32,S5             ; A3 carry | cross product sum >> 32
    433    add     S2,S8,S8                ; M * A0 right doubleword, P0 doubleword
    434 
    435    add,dc  S0,S10,S10              ; M * A0 left doubleword
    436    add     S3,S9,S9                ; M * A1 right doubleword
    437 
    438    add,dc  S1,S11,S11              ; M * A1 left doubleword
    439    add     S6,S12,S12              ; M * A2 right doubleword
    440 
    441 
    442    ldd     24(pR),S3               ; Addend word 3
    443    fstd    P1,XF1                  ; MR * A5L
    444    add,dc  S4,S14,S14              ; M * A2 left doubleword
    445    xmpyu   MR,A5R,P1               ; A5 right 32-bit word product
    446 
    447    ldd     8(pR),S1                ; Addend word 1
    448    fstd    P3,XF3                  ; ML * A5R
    449    add     S7,S13,S13              ; M * A3 right doubleword
    450    xmpyu   ML,A5L,P3               ; A5 left 32-bit word product
    451 
    452    ldd     0(pR),S7                ; Addend word 0
    453    fstd    P5,XF5                  ; MR * A7L
    454    add,dc  S5,S15,S15              ; M * A3 left doubleword
    455    xmpyu   MR,A7R,P5               ; A7 right 32-bit word product
    456 
    457    ldd     16(pR),S5               ; Addend word 2
    458    fstd    P7,XF7                  ; ML * A7R
    459    add     S10,S9,S9               ; P1 doubleword
    460    xmpyu   ML,A7L,P7               ; A7 left 32-bit word products
    461 
    462 
    463    ldd     XF0,S0                  ; MR * A4L
    464    fstd    P1,XF9                  ; MR * A5R
    465    add,dc  S11,S12,S12             ; P2 doubleword
    466    xmpyu   MR,A4R,P0               ; A4 right 32-bit word product
    467 
    468    ldd     XF2,S2                  ; ML * A4R
    469    fstd    P3,XF11                 ; ML * A5L
    470    add,dc  S14,S13,S13             ; P3 doubleword
    471    xmpyu   ML,A4L,P2               ; A4 left 32-bit word product
    472 
    473    ldd     XF6,S6                  ; ML * A6R
    474    fstd    P5,XF13                 ; MR * A7R
    475    add,dc  zero,S15,T2             ; P4 partial doubleword
    476    xmpyu   MR,A6R,P4               ; A6 right 32-bit word product
    477 
    478    ldd     XF4,S4                  ; MR * A6L
    479    fstd    P7,XF15                 ; ML * A7L
    480    add     S7,S8,S8                ; R0 + P0, new R0 doubleword
    481    xmpyu   ML,A6L,P6               ; A6 left 32-bit word product
    482 
    483 
    484    fstd    P0,XF0                  ; MR * A4R
    485    ldd     XF7,S7                  ; ML * A7R
    486    add,dc  S1,S9,S9                ; c + R1 + P1, new R1 doubleword
    487 
    488    fstd    P2,XF2                  ; ML * A4L
    489    ldd     XF1,S1                  ; MR * A5L
    490    add,dc  S5,S12,S12              ; c + R2 + P2, new R2 doubleword
    491 
    492    fstd    P4,XF4                  ; MR * A6R
    493    ldd     XF5,S5                  ; MR * A7L
    494    add,dc  S3,S13,S13              ; c + R3 + P3, new R3 doubleword
    495 
    496    fstd    P6,XF6                  ; ML * A6L
    497    ldd     XF3,S3                  ; ML * A5R
    498    add,dc  zero,T2,T2              ; c + partial P4
    499    add     S0,S2,T1                ; A4 cross product sum
    500 
    501 
           ; Store R0..R3 and repeat the cross-product handling for A4..A7.
    502    std     S8,0(pR)                ; save R0
    503    add,dc  zero,zero,S0            ; A4 cross product sum carry
    504    depd,z  T1,31,32,S2             ; A4 cross product sum << 32
    505 
    506    std     S9,8(pR)                ; save R1
    507    shrpd   S0,T1,32,S0             ; A4 carry | cross product sum >> 32
    508    add     S4,S6,T3                ; A6 cross product sum
    509 
    510    std     S12,16(pR)              ; save R2
    511    add,dc  zero,zero,S4            ; A6 cross product sum carry
    512    depd,z  T3,31,32,S6             ; A6 cross product sum << 32
    513 
    514 
    515    std     S13,24(pR)              ; save R3
    516    shrpd   S4,T3,32,S4             ; A6 carry | cross product sum >> 32
    517    add     S1,S3,T1                ; A5 cross product sum
    518 
    519    ldd     XF0,S8                  ; MR * A4R
    520    add,dc  zero,zero,S1            ; A5 cross product sum carry
    521    depd,z  T1,31,32,S3             ; A5 cross product sum << 32
    522 
    523    ldd     XF2,S10                 ; ML * A4L
    524    ldd     XF9,S9                  ; MR * A5R
    525    shrpd   S1,T1,32,S1             ; A5 carry | cross product sum >> 32
    526    add     S5,S7,T3                ; A7 cross product sum
    527 
    528    ldd     XF4,S12                 ; MR * A6R
    529    ldd     XF11,S11                ; ML * A5L
    530    add,dc  zero,zero,S5            ; A7 cross product sum carry
    531    depd,z  T3,31,32,S7             ; A7 cross product sum << 32
    532 
    533    ldd     XF6,S14                 ; ML * A6L
    534    ldd     XF13,S13                ; MR * A7R
    535    shrpd   S5,T3,32,S5             ; A7 carry | cross product sum >> 32
    536    add     S2,S8,S8                ; M * A4 right doubleword
    537 
    538 
    539    ldd     XF15,S15                ; ML * A7L
    540    add,dc  S0,S10,S10              ; M * A4 left doubleword
    541    add     S3,S9,S9                ; M * A5 right doubleword
    542 
    543    add,dc  S1,S11,S11              ; M * A5 left doubleword
    544    add     S6,S12,S12              ; M * A6 right doubleword
    545 
           ; Load addend words 4..8, chain P4..P8, reload s0-s7 from the
           ; save area and store R4..R8.
    546    ldd     32(pR),S0               ; Addend word 4
    547    ldd     40(pR),S1               ; Addend word 5
    548    add,dc  S4,S14,S14              ; M * A6 left doubleword
    549    add     S7,S13,S13              ; M * A7 right doubleword
    550 
    551    ldd     48(pR),S2               ; Addend word 6
    552    ldd     56(pR),S3               ; Addend word 7
    553    add,dc  S5,S15,S15              ; M * A7 left doubleword
    554    add     S8,T2,S8                ; P4 doubleword
    555 
    556    ldd     64(pR),S4               ; Addend word 8
    557    ldd     SV5,s5                  ; restore s5
    558    add,dc  S10,S9,S9               ; P5 doubleword
    559    add,dc  S11,S12,S12             ; P6 doubleword
    560 
    561 
    562    ldd     SV6,s6                  ; restore s6
    563    ldd     SV7,s7                  ; restore s7
    564    add,dc  S14,S13,S13             ; P7 doubleword
    565    add,dc  zero,S15,S15            ; P8 doubleword
    566 
    567    add     S0,S8,S8                ; new R4 doubleword
    568 
    569    ldd     SV0,s0                  ; restore s0
    570    std     S8,32(pR)               ; save R4
    571    add,dc  S1,S9,S9                ; new R5 doubleword
    572 
    573    ldd     SV1,s1                  ; restore s1
    574    std     S9,40(pR)               ; save R5
    575    add,dc  S2,S12,S12              ; new R6 doubleword
    576 
    577    ldd     SV2,s2                  ; restore s2
    578    std     S12,48(pR)              ; save R6
    579    add,dc  S3,S13,S13              ; new R7 doubleword
    580 
    581    ldd     SV3,s3                  ; restore s3
    582    std     S13,56(pR)              ; save R7
    583    add,dc  S4,S15,S15              ; new R8 doubleword
    584 
    585    ldd     SV4,s4                  ; restore s4
    586    std     S15,64(pR)              ; save result[8]
    587    add,dc  zero,zero,v0            ; return carry from R8 (v0 aliases S8, already stored)
    588 
           ; Carry out of R8: when set, add 1 to the next doubleword of the
           ; result area and keep going while that add wraps to zero.
    589    CMPIB,*= 0,v0,$L0               ; if no overflow, exit
    590    LDO     8(pR),pR                ; delay slot: pR += 8
    591 
    592 $FINAL1                             ; Final carry propagation
    593    LDD     64(pR),v0               ; next doubleword (result[9] on the first pass)
    594    LDO     8(pR),pR                ; step pR to the following doubleword
    595    ADDI    1,v0,v0                 ; add the carry
    596    CMPIB,*= 0,v0,$FINAL1           ; Keep looping if there is a carry.
    597    STD     v0,56(pR)               ; delay slot: store back to the address just loaded
    598 $L0
    599    bv      zero(rp)                ; -> caller
    600    ldo     -ST_SZ(sp),sp           ; pop stack
    601 
    602 /*  ======================================================================  */
    603 /*      end of module                                                       */
    604 /*  ======================================================================  */
    605 
    606 
    607        bve (rp)                    ; not reached by fall-through: the bv above is unconditional
    608        .EXIT
    609        nop
    610                .PROCEND
    611                .SPACE         $TEXT$
    612                .SUBSPA        $CODE$
    613                .EXPORT        multacc512,ENTRY
    614 
    615        .end