hppa20.s (28203B)
; This Source Code Form is subject to the terms of the Mozilla Public
; License, v. 2.0. If a copy of the MPL was not distributed with this
; file, You can obtain one at http://mozilla.org/MPL/2.0/.

#ifdef __LP64__
        .LEVEL   2.0W
#else
;       .LEVEL   1.1
;       .ALLOW   2.0N
        .LEVEL   2.0
#endif
        .SPACE   $TEXT$,SORT=8
        .SUBSPA  $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24

; ***************************************************************
;
;                 maxpy_[little/big]
;
; ***************************************************************

; Word order selection.
;
; Define exactly one of LITTLE_WORDIAN or BIG_WORDIAN (for example with
; -DBIG_WORDIAN on the command line).  If neither is defined the file
; builds the little-wordian entry points, which is what the previous
; unconditional "#define LITTLE_WORDIAN 1" produced.  The define used to
; be unconditional, so a -DBIG_WORDIAN build ended up with both macros
; set: the offsets below were redefined to their big-wordian values while
; the labels and exports were still named *_little.
#if defined(LITTLE_WORDIAN) && defined(BIG_WORDIAN)
#error Define only one of LITTLE_WORDIAN and BIG_WORDIAN
#endif

#if !defined(LITTLE_WORDIAN) && !defined(BIG_WORDIAN)
#define LITTLE_WORDIAN 1
#endif

; Address steps, in bytes, used to walk the vectors.  The UN_ forms step
; back over doublewords that have already been passed.
#ifdef LITTLE_WORDIAN
#define EIGHT 8
#define SIXTEEN 16
#define THIRTY_TWO 32
#define UN_EIGHT -8
#define UN_SIXTEEN -16
#define UN_TWENTY_FOUR -24
#endif

#ifdef BIG_WORDIAN
#define EIGHT -8
#define SIXTEEN -16
#define THIRTY_TWO -32
#define UN_EIGHT 8
#define UN_SIXTEEN 16
#define UN_TWENTY_FOUR 24
#endif

; This performs a multiple-precision integer version of "daxpy",
; Using the selected addressing direction.  "Little-wordian" means that
; the least significant word of a number is stored at the lowest address.
; "Big-wordian" means that the most significant word is at the lowest
; address.  Either way, the incoming address of the vector is that
; of the least significant word.  That means that, for little-wordian
; addressing, we move the address upward as we propagate carries
; from the least significant word to the most significant.  For
; big-wordian we move the address downward.

; We use the following registers:
;
;     r2   return PC, of course
;     r26 = arg1 =  length
;     r25 = arg2 =  address of scalar
;     r24 = arg3 =  multiplicand vector
;     r23 = arg4 =  result vector
;
;     fr9 = scalar loaded once only from r25

; The cycle counts shown in the bodies below are simply the result of a
; scheduling by hand.
; The actual PCX-U hardware does it differently.
; The intention is that the overall speed is the same.

; The pipeline startup and shutdown code is constructed in the usual way,
; by taking the loop bodies and removing unnecessary instructions.
; We have left the comments describing cycle numbers in the code.
; These are intended for reference when comparing with the main loop,
; and have no particular relationship to actual cycle numbers.

; ---------------------------------------------------------------
; Reader notes for maxpy_little / maxpy_big
;
; Purpose: multiply the vector at r24 (r26 doublewords long) by the
; 64-bit scalar stored at the address in r25, and add the product into
; the vector at r23, propagating carries past the end of the product.
;
; How each 64 x 64 product is formed: XMPYU multiplies two 32-bit halves
; of floating-point registers (suffix L = left half, R = right half), so
; every multiplicand doubleword needs four XMPYU results.  They are
; stored with FSTD into scratch slots in the frame and read back with
; LDD into general registers, where they are combined with ADD / ADD,DC
; and split across doubleword boundaries with SHRPD ...,32.
;
; Scratch slots (offsets from sp after the frame has been allocated):
;     first doubleword of each pair:
;         -48  scalar R x word R        -88  scalar L x word L
;         -96  scalar L x word R        -80  scalar R x word L
;     second doubleword of each pair:
;         -72  scalar R x word R       -104  scalar L x word L
;         -64  scalar L x word R        -56  scalar R x word L
;
; The main loop handles two multiplicand doublewords per iteration.
; No branch in this file uses nullification, so the instruction written
; after each branch (its delay slot) executes whether or not the branch
; is taken.
;
; NOTE(review): r3 and r4 are saved and restored with STW / LDW (32 bits)
; but are loaded with LDD (64 bits) in the body.  Confirm this is
; acceptable for the .LEVEL 2.0W (__LP64__) build.
; ---------------------------------------------------------------

#ifdef LITTLE_WORDIAN
maxpy_little
#else
maxpy_big
#endif
        .PROC
        .CALLINFO FRAME=120,ENTRY_GR=4
        .ENTRY
        STW,MA  %r3,128(%sp)            ; save r3 at 0(sp), then sp += 128
        STW     %r4,-124(%sp)           ; save r4 in the word after it

        ADDIB,< -1,%r26,$L0             ; If N = 0, exit immediately.
        FLDD    0(%r25),%fr9            ; fr9 = scalar

; First startup

        FLDD    0(%r24),%fr24           ; Cycle 1
        XMPYU   %fr9R,%fr24R,%fr27      ; Cycle 3
        XMPYU   %fr9R,%fr24L,%fr25      ; Cycle 4
        XMPYU   %fr9L,%fr24L,%fr26      ; Cycle 5
        CMPIB,> 3,%r26,$N_IS_SMALL      ; Pick out cases N = 1, 2, or 3
        XMPYU   %fr9L,%fr24R,%fr24      ; Cycle 6
        FLDD    EIGHT(%r24),%fr28       ; Cycle 8
        XMPYU   %fr9L,%fr28R,%fr31      ; Cycle 10
        FSTD    %fr24,-96(%sp)
        XMPYU   %fr9R,%fr28L,%fr30      ; Cycle 11
        FSTD    %fr25,-80(%sp)
        LDO     SIXTEEN(%r24),%r24      ; Cycle 12
        FSTD    %fr31,-64(%sp)
        XMPYU   %fr9R,%fr28R,%fr29      ; Cycle 13
        FSTD    %fr27,-48(%sp)

; Second startup

        XMPYU   %fr9L,%fr28L,%fr28      ; Cycle 1
        FSTD    %fr30,-56(%sp)
        FLDD    0(%r24),%fr24

        FSTD    %fr26,-88(%sp)          ; Cycle 2

        XMPYU   %fr9R,%fr24R,%fr27      ; Cycle 3
        FSTD    %fr28,-104(%sp)

        XMPYU   %fr9R,%fr24L,%fr25      ; Cycle 4
        LDD     -96(%sp),%r3
        FSTD    %fr29,-72(%sp)

        XMPYU   %fr9L,%fr24L,%fr26      ; Cycle 5
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21

        XMPYU   %fr9L,%fr24R,%fr24      ; Cycle 6
        LDD     -56(%sp),%r20
        ADD     %r21,%r3,%r3

        ADD,DC  %r20,%r19,%r19          ; Cycle 7
        LDD     -88(%sp),%r4
        SHRPD   %r3,%r0,32,%r21
        LDD     -48(%sp),%r1

        FLDD    EIGHT(%r24),%fr28       ; Cycle 8
        LDD     -104(%sp),%r31
        ADD,DC  %r0,%r0,%r20
        SHRPD   %r19,%r3,32,%r3

        LDD     -72(%sp),%r29           ; Cycle 9
        SHRPD   %r20,%r19,32,%r20
        ADD     %r21,%r1,%r1

        XMPYU   %fr9L,%fr28R,%fr31      ; Cycle 10
        ADD,DC  %r3,%r4,%r4
        FSTD    %fr24,-96(%sp)

        XMPYU   %fr9R,%fr28L,%fr30      ; Cycle 11
        ADD,DC  %r0,%r20,%r20
        LDD     0(%r23),%r3
        FSTD    %fr25,-80(%sp)

        LDO     SIXTEEN(%r24),%r24      ; Cycle 12
        FSTD    %fr31,-64(%sp)

        XMPYU   %fr9R,%fr28R,%fr29      ; Cycle 13
        ADD     %r0,%r0,%r0             ; clear the carry bit
        ADDIB,<= -4,%r26,$ENDLOOP       ; actually happens in cycle 12
        FSTD    %fr27,-48(%sp)
;       MFCTL   %cr16,%r21              ; for timing
;       STD     %r21,-112(%sp)

; Here is the loop.

$LOOP   XMPYU   %fr9L,%fr28L,%fr28      ; Cycle 1
        ADD,DC  %r29,%r4,%r4
        FSTD    %fr30,-56(%sp)
        FLDD    0(%r24),%fr24

        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)

        XMPYU   %fr9R,%fr24R,%fr27      ; Cycle 3
        ADD     %r3,%r1,%r1
        FSTD    %fr28,-104(%sp)
        LDD     UN_EIGHT(%r23),%r21

        XMPYU   %fr9R,%fr24L,%fr25      ; Cycle 4
        ADD,DC  %r21,%r4,%r28
        FSTD    %fr29,-72(%sp)
        LDD     -96(%sp),%r3

        XMPYU   %fr9L,%fr24L,%fr26      ; Cycle 5
        ADD,DC  %r20,%r31,%r22
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21

        XMPYU   %fr9L,%fr24R,%fr24      ; Cycle 6
        ADD     %r21,%r3,%r3
        LDD     -56(%sp),%r20
        STD     %r1,UN_SIXTEEN(%r23)

        ADD,DC  %r20,%r19,%r19          ; Cycle 7
        SHRPD   %r3,%r0,32,%r21
        LDD     -88(%sp),%r4
        LDD     -48(%sp),%r1

        ADD,DC  %r0,%r0,%r20            ; Cycle 8
        SHRPD   %r19,%r3,32,%r3
        FLDD    EIGHT(%r24),%fr28
        LDD     -104(%sp),%r31

        SHRPD   %r20,%r19,32,%r20       ; Cycle 9
        ADD     %r21,%r1,%r1
        STD     %r28,UN_EIGHT(%r23)
        LDD     -72(%sp),%r29

        XMPYU   %fr9L,%fr28R,%fr31      ; Cycle 10
        ADD,DC  %r3,%r4,%r4
        FSTD    %fr24,-96(%sp)

        XMPYU   %fr9R,%fr28L,%fr30      ; Cycle 11
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr25,-80(%sp)
        LDD     0(%r23),%r3

        LDO     SIXTEEN(%r24),%r24      ; Cycle 12
        FSTD    %fr31,-64(%sp)

        XMPYU   %fr9R,%fr28R,%fr29      ; Cycle 13
        ADD     %r22,%r1,%r1
        ADDIB,> -2,%r26,$LOOP           ; actually happens in cycle 12
        FSTD    %fr27,-48(%sp)

$ENDLOOP

; Shutdown code, first stage.

;       MFCTL   %cr16,%r21              ; for timing
;       STD     %r21,UN_SIXTEEN(%r23)
;       LDD     -112(%sp),%r21
;       STD     %r21,UN_EIGHT(%r23)

        XMPYU   %fr9L,%fr28L,%fr28      ; Cycle 1
        ADD,DC  %r29,%r4,%r4
        CMPIB,= 0,%r26,$ONEMORE         ; one multiplicand word still unread
        FSTD    %fr30,-56(%sp)

        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)

        ADD     %r3,%r1,%r1             ; Cycle 3
        FSTD    %fr28,-104(%sp)
        LDD     UN_EIGHT(%r23),%r21

        ADD,DC  %r21,%r4,%r28           ; Cycle 4
        FSTD    %fr29,-72(%sp)
        STD     %r28,UN_EIGHT(%r23)     ; moved up from cycle 9
        LDD     -96(%sp),%r3

        ADD,DC  %r20,%r31,%r22          ; Cycle 5
        STD     %r1,UN_SIXTEEN(%r23)
$JOIN4                                  ; the N = 2 case enters here
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21

        ADD     %r21,%r3,%r3            ; Cycle 6
        LDD     -56(%sp),%r20

        ADD,DC  %r20,%r19,%r19          ; Cycle 7
        SHRPD   %r3,%r0,32,%r21
        LDD     -88(%sp),%r4
        LDD     -48(%sp),%r1

        ADD,DC  %r0,%r0,%r20            ; Cycle 8
        SHRPD   %r19,%r3,32,%r3
        LDD     -104(%sp),%r31

        SHRPD   %r20,%r19,32,%r20       ; Cycle 9
        ADD     %r21,%r1,%r1
        LDD     -72(%sp),%r29

        ADD,DC  %r3,%r4,%r4             ; Cycle 10

        ADD,DC  %r0,%r20,%r20           ; Cycle 11
        LDD     0(%r23),%r3

        ADD     %r22,%r1,%r1            ; Cycle 13

; Shutdown code, second stage.

        ADD,DC  %r29,%r4,%r4            ; Cycle 1

        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        ADD,DC  %r0,%r20,%r20

        LDD     UN_EIGHT(%r23),%r21     ; Cycle 3
        ADD     %r3,%r1,%r1

        ADD,DC  %r21,%r4,%r28           ; Cycle 4

        ADD,DC  %r20,%r31,%r22          ; Cycle 5

        STD     %r1,UN_SIXTEEN(%r23)    ; Cycle 6

        STD     %r28,UN_EIGHT(%r23)     ; Cycle 9

        LDD     0(%r23),%r3             ; Cycle 11

; Shutdown code, third stage.

        LDO     SIXTEEN(%r23),%r23
        ADD     %r3,%r22,%r1
$JOIN1  ADD,DC  %r0,%r0,%r21            ; r21 = carry out of the last add
        CMPIB,*= 0,%r21,$L0             ; if no overflow, exit
        STD     %r1,UN_SIXTEEN(%r23)

; Final carry propagation

$FINAL1 LDO     EIGHT(%r23),%r23
        LDD     UN_SIXTEEN(%r23),%r21
        ADDI    1,%r21,%r21
        CMPIB,*= 0,%r21,$FINAL1         ; Keep looping if there is a carry.
        STD     %r21,UN_SIXTEEN(%r23)
        B       $L0
        NOP

; Here is the code that handles the difficult cases N=1, N=2, and N=3.
; We do the usual trick -- branch out of the startup code at appropriate
; points, and branch into the shutdown code.

$N_IS_SMALL
        CMPIB,= 0,%r26,$N_IS_ONE
        FSTD    %fr24,-96(%sp)          ; Cycle 10
        FLDD    EIGHT(%r24),%fr28       ; Cycle 8
        XMPYU   %fr9L,%fr28R,%fr31      ; Cycle 10
        XMPYU   %fr9R,%fr28L,%fr30      ; Cycle 11
        FSTD    %fr25,-80(%sp)
        FSTD    %fr31,-64(%sp)          ; Cycle 12
        XMPYU   %fr9R,%fr28R,%fr29      ; Cycle 13
        FSTD    %fr27,-48(%sp)
        XMPYU   %fr9L,%fr28L,%fr28      ; Cycle 1
        CMPIB,= 2,%r26,$N_IS_THREE
        FSTD    %fr30,-56(%sp)

; N = 2
        FSTD    %fr26,-88(%sp)          ; Cycle 2
        FSTD    %fr28,-104(%sp)         ; Cycle 3
        LDD     -96(%sp),%r3            ; Cycle 4
        FSTD    %fr29,-72(%sp)
        B       $JOIN4
        ADD     %r0,%r0,%r22            ; r22 = 0; ADD of zeros also clears the carry

$N_IS_THREE
        FLDD    SIXTEEN(%r24),%fr24
        FSTD    %fr26,-88(%sp)          ; Cycle 2
        XMPYU   %fr9R,%fr24R,%fr27      ; Cycle 3
        FSTD    %fr28,-104(%sp)
        XMPYU   %fr9R,%fr24L,%fr25      ; Cycle 4
        LDD     -96(%sp),%r3
        FSTD    %fr29,-72(%sp)
        XMPYU   %fr9L,%fr24L,%fr26      ; Cycle 5
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21
        B       $JOIN3
        ADD     %r0,%r0,%r22            ; r22 = 0; ADD of zeros also clears the carry

$N_IS_ONE
        FSTD    %fr25,-80(%sp)
        FSTD    %fr27,-48(%sp)
        FSTD    %fr26,-88(%sp)          ; Cycle 2
        B       $JOIN5
        ADD     %r0,%r0,%r22            ; r22 = 0; ADD of zeros also clears the carry

; We came out of the unrolled loop with wrong parity.  Do one more
; single cycle.  This is quite tricky, because of the way the
; carry chains and SHRPD chains have been chopped up.

$ONEMORE

        FLDD    0(%r24),%fr24

        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)

        XMPYU   %fr9R,%fr24R,%fr27      ; Cycle 3
        FSTD    %fr28,-104(%sp)
        LDD     UN_EIGHT(%r23),%r21
        ADD     %r3,%r1,%r1

        XMPYU   %fr9R,%fr24L,%fr25      ; Cycle 4
        ADD,DC  %r21,%r4,%r28
        STD     %r28,UN_EIGHT(%r23)     ; moved from cycle 9
        LDD     -96(%sp),%r3
        FSTD    %fr29,-72(%sp)

        XMPYU   %fr9L,%fr24L,%fr26      ; Cycle 5
        ADD,DC  %r20,%r31,%r22
        LDD     -64(%sp),%r19
        LDD     -80(%sp),%r21

        STD     %r1,UN_SIXTEEN(%r23)    ; Cycle 6
$JOIN3                                  ; the N = 3 case enters here
        XMPYU   %fr9L,%fr24R,%fr24
        LDD     -56(%sp),%r20
        ADD     %r21,%r3,%r3

        ADD,DC  %r20,%r19,%r19          ; Cycle 7
        LDD     -88(%sp),%r4
        SHRPD   %r3,%r0,32,%r21
        LDD     -48(%sp),%r1

        LDD     -104(%sp),%r31          ; Cycle 8
        ADD,DC  %r0,%r0,%r20
        SHRPD   %r19,%r3,32,%r3

        LDD     -72(%sp),%r29           ; Cycle 9
        SHRPD   %r20,%r19,32,%r20
        ADD     %r21,%r1,%r1

        ADD,DC  %r3,%r4,%r4             ; Cycle 10
        FSTD    %fr24,-96(%sp)

        ADD,DC  %r0,%r20,%r20           ; Cycle 11
        LDD     0(%r23),%r3
        FSTD    %fr25,-80(%sp)

        ADD     %r22,%r1,%r1            ; Cycle 13
        FSTD    %fr27,-48(%sp)

; Shutdown code, stage 1-1/2.

        ADD,DC  %r29,%r4,%r4            ; Cycle 1

        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        ADD,DC  %r0,%r20,%r20
        FSTD    %fr26,-88(%sp)

        LDD     UN_EIGHT(%r23),%r21     ; Cycle 3
        ADD     %r3,%r1,%r1

        ADD,DC  %r21,%r4,%r28           ; Cycle 4
        STD     %r28,UN_EIGHT(%r23)     ; moved from cycle 9

        ADD,DC  %r20,%r31,%r22          ; Cycle 5
        STD     %r1,UN_SIXTEEN(%r23)
$JOIN5                                  ; the N = 1 case enters here
        LDD     -96(%sp),%r3            ; moved from cycle 4
        LDD     -80(%sp),%r21
        ADD     %r21,%r3,%r3            ; Cycle 6
        ADD,DC  %r0,%r0,%r19            ; Cycle 7
        LDD     -88(%sp),%r4
        SHRPD   %r3,%r0,32,%r21
        LDD     -48(%sp),%r1
        SHRPD   %r19,%r3,32,%r3         ; Cycle 8
        ADD     %r21,%r1,%r1            ; Cycle 9
        ADD,DC  %r3,%r4,%r4             ; Cycle 10
        LDD     0(%r23),%r3             ; Cycle 11
        ADD     %r22,%r1,%r1            ; Cycle 13

; Shutdown code, stage 2-1/2.

        ADD,DC  %r0,%r4,%r4             ; Cycle 1
        LDO     SIXTEEN(%r23),%r23      ; Cycle 2
        LDD     UN_EIGHT(%r23),%r21     ; Cycle 3
        ADD     %r3,%r1,%r1
        STD     %r1,UN_SIXTEEN(%r23)
        ADD,DC  %r21,%r4,%r1
        B       $JOIN1
        LDO     EIGHT(%r23),%r23

; exit

$L0
        LDW     -124(%sp),%r4           ; restore r4
        BVE     (%r2)                   ; return through r2
        .EXIT
        LDW,MB  -128(%sp),%r3           ; delay slot: sp -= 128, restore r3

        .PROCEND

; ***************************************************************
;
;                 add_diag_[little/big]
;
; ***************************************************************

; The arguments are as follows:
;     r2   return PC, of course
;     r26 = arg1 =  length
;     r25 = arg2 =  vector to square
;     r24 = arg3 =  result vector

; ---------------------------------------------------------------
; Reader notes for add_diag_little / add_diag_big
;
; For each doubleword loaded from the vector at r25 the code forms three
; XMPYU products of that doubleword with itself (R x R, L x R, L x L)
; and adds them into two doublewords of the vector at r24.  The L x R
; product is shifted left one extra bit by the SHRPD ...,31 pair, i.e.
; it is counted twice.  The input pointer advances by SIXTEEN and the
; result pointer by THIRTY_TWO for every two input doublewords.
; Carries that run past the last pair are propagated by the $FDIAG
; loops.
;
; Scratch slots (offsets from sp after the frame has been allocated):
;     "alternate body":  -88  R x R     -72  L x R     -96  L x L
;     "main body":       -80  R x R     -64  L x R    -104  L x L
;
; NOTE(review): as in maxpy, r3 and r4 are saved with STW and restored
; with LDW although they are written by 64-bit operations here.
; ---------------------------------------------------------------

#ifdef LITTLE_WORDIAN
add_diag_little
#else
add_diag_big
#endif
        .PROC
        .CALLINFO FRAME=120,ENTRY_GR=4
        .ENTRY
        STW,MA  %r3,128(%sp)            ; save r3 at 0(sp), then sp += 128
        STW     %r4,-124(%sp)           ; save r4 in the word after it

        ADDIB,< -1,%r26,$Z0             ; If N=0, exit immediately.
        NOP

; Startup code

        FLDD    0(%r25),%fr7            ; Cycle 2 (alternate body)
        XMPYU   %fr7R,%fr7R,%fr29       ; Cycle 4
        XMPYU   %fr7L,%fr7R,%fr27       ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr30
        LDO     SIXTEEN(%r25),%r25      ; Cycle 6
        FSTD    %fr29,-88(%sp)
        FSTD    %fr27,-72(%sp)          ; Cycle 7
        CMPIB,= 0,%r26,$DIAG_N_IS_ONE   ; Cycle 1 (main body)
        FSTD    %fr30,-96(%sp)
        FLDD    UN_EIGHT(%r25),%fr7     ; Cycle 2
        LDD     -88(%sp),%r22           ; Cycle 3
        LDD     -72(%sp),%r31           ; Cycle 4
        XMPYU   %fr7R,%fr7R,%fr28
        XMPYU   %fr7L,%fr7R,%fr24       ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr31
        LDD     -96(%sp),%r20           ; Cycle 6
        FSTD    %fr28,-80(%sp)
        ADD     %r0,%r0,%r0             ; clear the carry bit
        ADDIB,<= -2,%r26,$ENDDIAGLOOP   ; Cycle 7
        FSTD    %fr24,-64(%sp)

; Here is the loop.  It is unrolled twice, modelled after the "alternate body" and then the "main body".

$DIAGLOOP
        SHRPD   %r31,%r0,31,%r3         ; Cycle 1 (alternate body)
        LDO     SIXTEEN(%r25),%r25
        LDD     0(%r24),%r1
        FSTD    %fr31,-104(%sp)
        SHRPD   %r0,%r31,31,%r4         ; Cycle 2
        ADD,DC  %r22,%r3,%r3
        FLDD    UN_SIXTEEN(%r25),%fr7
        ADD,DC  %r0,%r20,%r20           ; Cycle 3
        ADD     %r1,%r3,%r3
        XMPYU   %fr7R,%fr7R,%fr29       ; Cycle 4
        LDD     -80(%sp),%r21
        STD     %r3,0(%r24)
        XMPYU   %fr7L,%fr7R,%fr27       ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr30
        LDD     -64(%sp),%r29
        LDD     EIGHT(%r24),%r1
        ADD,DC  %r4,%r20,%r20           ; Cycle 6
        LDD     -104(%sp),%r19
        FSTD    %fr29,-88(%sp)
        ADD     %r20,%r1,%r1            ; Cycle 7
        FSTD    %fr27,-72(%sp)
        SHRPD   %r29,%r0,31,%r4         ; Cycle 1 (main body)
        LDO     THIRTY_TWO(%r24),%r24
        LDD     UN_SIXTEEN(%r24),%r28
        FSTD    %fr30,-96(%sp)
        SHRPD   %r0,%r29,31,%r3         ; Cycle 2
        ADD,DC  %r21,%r4,%r4
        FLDD    UN_EIGHT(%r25),%fr7
        STD     %r1,UN_TWENTY_FOUR(%r24)
        ADD,DC  %r0,%r19,%r19           ; Cycle 3
        ADD     %r28,%r4,%r4
        XMPYU   %fr7R,%fr7R,%fr28       ; Cycle 4
        LDD     -88(%sp),%r22
        STD     %r4,UN_SIXTEEN(%r24)
        XMPYU   %fr7L,%fr7R,%fr24       ; Cycle 5
        XMPYU   %fr7L,%fr7L,%fr31
        LDD     -72(%sp),%r31
        LDD     UN_EIGHT(%r24),%r28
        ADD,DC  %r3,%r19,%r19           ; Cycle 6
        LDD     -96(%sp),%r20
        FSTD    %fr28,-80(%sp)
        ADD     %r19,%r28,%r28          ; Cycle 7
        FSTD    %fr24,-64(%sp)
        ADDIB,> -2,%r26,$DIAGLOOP       ; Cycle 8
        STD     %r28,UN_EIGHT(%r24)

$ENDDIAGLOOP

        ADD,DC  %r0,%r22,%r22
        CMPIB,= 0,%r26,$ONEMOREDIAG     ; one input word still unread
        SHRPD   %r31,%r0,31,%r3

; Shutdown code, first stage.

        FSTD    %fr31,-104(%sp)         ; Cycle 1 (alternate body)
        LDD     0(%r24),%r28
        SHRPD   %r0,%r31,31,%r4         ; Cycle 2
        ADD     %r3,%r22,%r3
        ADD,DC  %r0,%r20,%r20           ; Cycle 3
        LDD     -80(%sp),%r21
        ADD     %r3,%r28,%r3
        LDD     -64(%sp),%r29           ; Cycle 4
        STD     %r3,0(%r24)
        LDD     EIGHT(%r24),%r1         ; Cycle 5
        LDO     SIXTEEN(%r25),%r25      ; Cycle 6
        LDD     -104(%sp),%r19
        ADD,DC  %r4,%r20,%r20
        ADD     %r20,%r1,%r1            ; Cycle 7
        ADD,DC  %r0,%r21,%r21           ; Cycle 8
        STD     %r1,EIGHT(%r24)

; Shutdown code, second stage.

        SHRPD   %r29,%r0,31,%r4         ; Cycle 1 (main body)
        LDO     THIRTY_TWO(%r24),%r24
        LDD     UN_SIXTEEN(%r24),%r1
        SHRPD   %r0,%r29,31,%r3         ; Cycle 2
        ADD     %r4,%r21,%r4
        ADD,DC  %r0,%r19,%r19           ; Cycle 3
        ADD     %r4,%r1,%r4
        STD     %r4,UN_SIXTEEN(%r24)    ; Cycle 4
        LDD     UN_EIGHT(%r24),%r28     ; Cycle 5
        ADD,DC  %r3,%r19,%r19           ; Cycle 6
        ADD     %r19,%r28,%r28          ; Cycle 7
        ADD,DC  %r0,%r0,%r22            ; Cycle 8
        CMPIB,*= 0,%r22,$Z0             ; if no overflow, exit
        STD     %r28,UN_EIGHT(%r24)

; Final carry propagation

$FDIAG2
        LDO     EIGHT(%r24),%r24
        LDD     UN_EIGHT(%r24),%r26
        ADDI    1,%r26,%r26
        CMPIB,*= 0,%r26,$FDIAG2         ; Keep looping if there is a carry.
        STD     %r26,UN_EIGHT(%r24)

        B       $Z0
        NOP

; Here is the code that handles the difficult case N=1.
; We do the usual trick -- branch out of the startup code at appropriate
; points, and branch into the shutdown code.

$DIAG_N_IS_ONE

        LDD     -88(%sp),%r22
        LDD     -72(%sp),%r31
        B       $JOINDIAG
        LDD     -96(%sp),%r20

; We came out of the unrolled loop with wrong parity.  Do one more
; single cycle.  This is the "alternate body".  It will, of course,
; give us opposite registers from the other case, so we need
; completely different shutdown code.

$ONEMOREDIAG
        FSTD    %fr31,-104(%sp)         ; Cycle 1 (alternate body)
        LDD     0(%r24),%r28
        FLDD    0(%r25),%fr7            ; Cycle 2
        SHRPD   %r0,%r31,31,%r4
        ADD     %r3,%r22,%r3
        ADD,DC  %r0,%r20,%r20           ; Cycle 3
        LDD     -80(%sp),%r21
        ADD     %r3,%r28,%r3
        LDD     -64(%sp),%r29           ; Cycle 4
        STD     %r3,0(%r24)
        XMPYU   %fr7R,%fr7R,%fr29
        LDD     EIGHT(%r24),%r1         ; Cycle 5
        XMPYU   %fr7L,%fr7R,%fr27
        XMPYU   %fr7L,%fr7L,%fr30
        LDD     -104(%sp),%r19          ; Cycle 6
        FSTD    %fr29,-88(%sp)
        ADD,DC  %r4,%r20,%r20
        FSTD    %fr27,-72(%sp)          ; Cycle 7
        ADD     %r20,%r1,%r1
        ADD,DC  %r0,%r21,%r21           ; Cycle 8
        STD     %r1,EIGHT(%r24)

; Shutdown code, first stage.

        SHRPD   %r29,%r0,31,%r4         ; Cycle 1 (main body)
        LDO     THIRTY_TWO(%r24),%r24
        FSTD    %fr30,-96(%sp)
        LDD     UN_SIXTEEN(%r24),%r1
        SHRPD   %r0,%r29,31,%r3         ; Cycle 2
        ADD     %r4,%r21,%r4
        ADD,DC  %r0,%r19,%r19           ; Cycle 3
        LDD     -88(%sp),%r22
        ADD     %r4,%r1,%r4
        LDD     -72(%sp),%r31           ; Cycle 4
        STD     %r4,UN_SIXTEEN(%r24)
        LDD     UN_EIGHT(%r24),%r28     ; Cycle 5
        LDD     -96(%sp),%r20           ; Cycle 6
        ADD,DC  %r3,%r19,%r19
        ADD     %r19,%r28,%r28          ; Cycle 7
        ADD,DC  %r0,%r22,%r22           ; Cycle 8
        STD     %r28,UN_EIGHT(%r24)

; Shutdown code, second stage.

$JOINDIAG                               ; the N = 1 case enters here
        SHRPD   %r31,%r0,31,%r3         ; Cycle 1 (alternate body)
        LDD     0(%r24),%r28
        SHRPD   %r0,%r31,31,%r4         ; Cycle 2
        ADD     %r3,%r22,%r3
        ADD,DC  %r0,%r20,%r20           ; Cycle 3
        ADD     %r3,%r28,%r3
        STD     %r3,0(%r24)             ; Cycle 4
        LDD     EIGHT(%r24),%r1         ; Cycle 5
        ADD,DC  %r4,%r20,%r20
        ADD     %r20,%r1,%r1            ; Cycle 7
        ADD,DC  %r0,%r0,%r21            ; Cycle 8
        CMPIB,*= 0,%r21,$Z0             ; if no overflow, exit
        STD     %r1,EIGHT(%r24)

; Final carry propagation

$FDIAG1
        LDO     EIGHT(%r24),%r24
        LDD     EIGHT(%r24),%r26
        ADDI    1,%r26,%r26
        CMPIB,*= 0,%r26,$FDIAG1         ; Keep looping if there is a carry.
        STD     %r26,EIGHT(%r24)

$Z0
        LDW     -124(%sp),%r4           ; restore r4
        BVE     (%r2)                   ; return through r2
        .EXIT
        LDW,MB  -128(%sp),%r3           ; delay slot: sp -= 128, restore r3
        .PROCEND
;       .ALLOW

; Symbol exports.
; NOTE(review): the big-wordian branch below always passes LONG_RETURN,
; which the comment in the little-wordian branch says GNU as does not
; accept.  Confirm whether the big-wordian build is ever assembled
; with GNU as.
        .SPACE  $TEXT$
        .SUBSPA $CODE$
#ifdef LITTLE_WORDIAN
#ifdef __GNUC__
; GNU-as (as of 2.19) does not support LONG_RETURN
        .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
        .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR
#else
        .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
        .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#endif
#else
        .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN
        .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN
#endif
        .END


; How to use "maxpy_PA20_little" and "maxpy_PA20_big"
;
; NOTE(review): the symbols exported above are maxpy_little / maxpy_big
; (and add_diag_little / add_diag_big).  The maxpy_PA20_* names used in
; these notes presumably refer to the same routines under an older
; name -- confirm against the callers.
;
; The routine "maxpy_PA20_little" or "maxpy_PA20_big"
; performs a 64-bit x any-size multiply, and adds the
; result to an area of memory.  That is, it performs
; something like
;
;      A B C D
;    *       Z
;   __________
;    P Q R S T
;
; and then adds the "PQRST" vector into an area of memory,
; handling all carries.
;
; Digression on nomenclature and endian-ness:
;
; Each of the capital letters in the above represents a 64-bit
; quantity.  That is, you could think of the discussion as
; being in terms of radix-16-quintillion arithmetic.  The data
; type being manipulated is "unsigned long long int".  This
; requires the 64-bit extension of the HP-UX C compiler,
; available at release 10.
; You need these compiler flags to
; enable these extensions:
;
;       -Aa +e +DA2.0 +DS2.0
;
; (The first specifies ANSI C, the second enables the
; extensions, which are beyond ANSI C, and the third and
; fourth tell the compiler to use whatever features of the
; PA2.0 architecture it wishes, in order to make the code more
; efficient.  Since the presence of the assembly code will
; make the program unable to run on anything less than PA2.0,
; you might as well gain the performance enhancements in the C
; code as well.)
;
; Questions of "endian-ness" often come up, usually in the
; context of byte ordering in a word.  These routines have a
; similar issue, that could be called "wordian-ness".
; Independent of byte ordering (PA is always big-endian), one
; can make two choices when representing extremely large
; numbers as arrays of 64-bit doublewords in memory.
;
; "Little-wordian" layout means that the least significant
; word of a number is stored at the lowest address.
;
;       MSW                 LSW
;        |                   |
;        V                   V
;
;        A    B    C    D    E
;
;        ^              ^    ^
;        |              |    |____ address 0
;        |              |
;        |              |_______address 8
;        |
;        address 32
;
; "Big-wordian" means that the most significant word is at the
; lowest address.
;
;       MSW                 LSW
;        |                   |
;        V                   V
;
;        A    B    C    D    E
;
;        ^              ^    ^
;        |              |    |____ address 32
;        |              |
;        |              |_______address 24
;        |
;        address 0
;
; When you compile the file, you must specify one or the other, with
; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN".
;
; Incidentally, you assemble this file as part of your
; project with the same C compiler as the rest of the program.
; My "makefile" for a superprecision arithmetic package has
; the following stuff:
;
;       # definitions:
;       CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1
;       CFLAGS = +O3
;       LDFLAGS = -L /usr/lib -Wl,-aarchive
;
;       # general build rule for ".s" files:
;       .s.o:
;               $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN
;
;       # Now any bind step that calls for pa20.o will assemble pa20.s
;
; End of digression, back to arithmetic:
;
; The way we multiply two huge numbers is, of course, to multiply
; the "ABCD" vector by each of the "WXYZ" doublewords, adding
; the result vectors with increasing offsets, the way we learned
; in school, back before we all used calculators:
;
;              A B C D
;            * W X Y Z
;           __________
;            P Q R S T
;          E F G H I
;        M N O P Q
;    + R S T U V
;      _______________
;      F I N A L S U M
;
; So we call maxpy_PA20_big (in my case; my package is
; big-wordian) repeatedly, giving the W, X, Y, and Z arguments
; in turn as the "scalar", and giving the "ABCD" vector each
; time.  We direct it to add its result into an area of memory
; that we have cleared at the start.  We skew the exact
; location into that area with each call.
;
; The prototype for the function is
;
; extern void maxpy_PA20_big(
;    int length,                        /* Number of doublewords in the multiplicand vector. */
;    const long long int *scalaraddr,   /* Address to fetch the scalar. */
;    const long long int *multiplicand, /* The multiplicand vector. */
;    long long int *result);            /* Where to accumulate the result. */
;
; (You should place a copy of this prototype in an include file
; or in your C file.)
;
; Now, IN ALL CASES, the given address for the multiplicand or
; the result is that of the LEAST SIGNIFICANT DOUBLEWORD.
; That word is, of course, the word at which the routine
; starts processing.
; "maxpy_PA20_little" then increases the
; addresses as it computes.  "maxpy_PA20_big" decreases them.
;
; In our example above, "length" would be 4 in each case.
; "multiplicand" would be the "ABCD" vector.  Specifically,
; the address of the element "D".  "scalaraddr" would be the
; address of "W", "X", "Y", or "Z" on the four calls that we
; would make.  (The order doesn't matter, of course.)
; "result" would be the appropriate address in the result
; area.  When multiplying by "Z", that would be the least
; significant word.  When multiplying by "Y", it would be the
; next higher word (8 bytes higher if little-wordian; 8 bytes
; lower if big-wordian), and so on.  The size of the result
; area must be the sum of the sizes of the multiplicand
; and multiplier vectors, and must be initialized to zero
; before we start.
;
; Whenever the routine adds its partial product into the result
; vector, it follows carry chains as far as they need to go.
;
; Here is the super-precision multiply routine that I use for
; my package.  The package is big-wordian.  I have taken out
; handling of exponents (it's a floating point package):
;
; static void mul_PA20(
;   int size,
;   const long long int *arg1,
;   const long long int *arg2,
;   long long int *result)
; {
;    int i;
;
;    for (i=0 ; i<2*size ; i++) result[i] = 0ULL;
;
;    for (i=0 ; i<size ; i++) {
;       maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]);
;    }
; }