loongson_asm.S (22199B)
/*********************************************************************
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
 *                Shiyou Yin(yinshiyou-hf@loongson.cn)
 *
 * Permission to use, copy, modify, and/or distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *********************************************************************/

/*
 * This file is a LoongArch assembly helper file and available under ISC
 * license. It provides a large number of macros and alias to simplify
 * writing assembly code, especially for LSX and LASX optimizations.
 *
 * Any one can modify it or add new features for his/her own purposes.
 * Contributing a patch will be appreciated as it might be useful for
 * others as well. Send patches to loongson contributor mentioned above.
 *
 * MAJOR version: Usage changes, incompatible with previous version.
 * MINOR version: Add new macros/functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */

#define LML_VERSION_MAJOR 0
#define LML_VERSION_MINOR 4
#define LML_VERSION_MICRO 0

/* Default .align value (2^5 = 32-byte alignment) for functions and data. */
#define DEFAULT_ALIGN    5

/* Set prefix as needed. */
#ifndef PRIVATE_PREFIX
#define PRIVATE_PREFIX   dav1d_
#endif

#define PASTE(a,b) a ## b
#define CONCAT(a,b) PASTE(a,b)

/* Some platforms (e.g. Mach-O) require a leading underscore on symbols. */
#ifdef PREFIX
#define ASM_PREF CONCAT(_,PRIVATE_PREFIX)
#else
#define ASM_PREF PRIVATE_PREFIX
#endif

/*
 * Open a function: emits section/alignment/visibility boilerplate and the
 * prefixed label, and defines a matching one-shot `endfunc` that emits the
 * return (jirl to $r1) and the .size directive, then purges itself.
 */
.macro function name, align=DEFAULT_ALIGN
.macro endfunc
    jirl    $r0, $r1, 0x0
    .size ASM_PREF\name, . - ASM_PREF\name
    .purgem endfunc
.endm
.text ;
.align \align ;
.globl ASM_PREF\name ;
.hidden ASM_PREF\name ;
.type  ASM_PREF\name, @function ;
ASM_PREF\name: ;
.endm

/*
 * Open a read-only data object; the matching one-shot `endconst` emits the
 * .size directive and purges itself.
 */
.macro const name, align=DEFAULT_ALIGN
.macro endconst
    .size \name, . - \name
    .purgem endconst
.endm
.section .rodata
.align \align
\name:
.endm

/*
 *============================================================================
 * LoongArch register alias
 *============================================================================
 */

#define a0 $a0
#define a1 $a1
#define a2 $a2
#define a3 $a3
#define a4 $a4
#define a5 $a5
#define a6 $a6
#define a7 $a7

#define t0 $t0
#define t1 $t1
#define t2 $t2
#define t3 $t3
#define t4 $t4
#define t5 $t5
#define t6 $t6
#define t7 $t7
#define t8 $t8

#define s0 $s0
#define s1 $s1
#define s2 $s2
#define s3 $s3
#define s4 $s4
#define s5 $s5
#define s6 $s6
#define s7 $s7
#define s8 $s8

#define zero $zero
#define sp   $sp
#define ra   $ra

#define fa0  $fa0
#define fa1  $fa1
#define fa2  $fa2
#define fa3  $fa3
#define fa4  $fa4
#define fa5  $fa5
#define fa6  $fa6
#define fa7  $fa7
#define ft0  $ft0
#define ft1  $ft1
#define ft2  $ft2
#define ft3  $ft3
#define ft4  $ft4
#define ft5  $ft5
#define ft6  $ft6
#define ft7  $ft7
#define ft8  $ft8
#define ft9  $ft9
#define ft10 $ft10
#define ft11 $ft11
#define ft12 $ft12
#define ft13 $ft13
#define ft14 $ft14
#define ft15 $ft15
#define fs0  $fs0
#define fs1  $fs1
#define fs2  $fs2
#define fs3  $fs3
#define fs4  $fs4
#define fs5  $fs5
#define fs6  $fs6
#define fs7  $fs7

#define f0  $f0
#define f1  $f1
#define f2  $f2
#define f3  $f3
#define f4  $f4
#define f5  $f5
#define f6  $f6
#define f7  $f7
#define f8  $f8
#define f9  $f9
#define f10 $f10
#define f11 $f11
#define f12 $f12
#define f13 $f13
#define f14 $f14
#define f15 $f15
#define f16 $f16
#define f17 $f17
#define f18 $f18
#define f19 $f19
#define f20 $f20
#define f21 $f21
#define f22 $f22
#define f23 $f23
#define f24 $f24
#define f25 $f25
#define f26 $f26
#define f27 $f27
#define f28 $f28
#define f29 $f29
#define f30 $f30
#define f31 $f31

#define vr0  $vr0
#define vr1  $vr1
#define vr2  $vr2
#define vr3  $vr3
#define vr4  $vr4
#define vr5  $vr5
#define vr6  $vr6
#define vr7  $vr7
#define vr8  $vr8
#define vr9  $vr9
#define vr10 $vr10
#define vr11 $vr11
#define vr12 $vr12
#define vr13 $vr13
#define vr14 $vr14
#define vr15 $vr15
#define vr16 $vr16
#define vr17 $vr17
#define vr18 $vr18
#define vr19 $vr19
#define vr20 $vr20
#define vr21 $vr21
#define vr22 $vr22
#define vr23 $vr23
#define vr24 $vr24
#define vr25 $vr25
#define vr26 $vr26
#define vr27 $vr27
#define vr28 $vr28
#define vr29 $vr29
#define vr30 $vr30
#define vr31 $vr31

#define xr0  $xr0
#define xr1  $xr1
#define xr2  $xr2
#define xr3  $xr3
#define xr4  $xr4
#define xr5  $xr5
#define xr6  $xr6
#define xr7  $xr7
#define xr8  $xr8
#define xr9  $xr9
#define xr10 $xr10
#define xr11 $xr11
#define xr12 $xr12
#define xr13 $xr13
#define xr14 $xr14
#define xr15 $xr15
#define xr16 $xr16
#define xr17 $xr17
#define xr18 $xr18
#define xr19 $xr19
#define xr20 $xr20
#define xr21 $xr21
#define xr22 $xr22
#define xr23 $xr23
#define xr24 $xr24
#define xr25 $xr25
#define xr26 $xr26
#define xr27 $xr27
#define xr28 $xr28
#define xr29 $xr29
#define xr30 $xr30
#define xr31 $xr31

/*
 *============================================================================
 * LSX/LASX synthesize instructions
 *============================================================================
 */

/*
 * Description : Dot product of byte vector elements
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 * Return Type - halfword
 * Note        : even/odd multiply + accumulate pair; vd must differ from
 *               vj/vk since it is written by the first instruction.
 */
.macro vdp2.h.bu vd, vj, vk
    vmulwev.h.bu      \vd, \vj, \vk
    vmaddwod.h.bu     \vd, \vj, \vk
.endm

.macro vdp2.h.bu.b vd, vj, vk
    vmulwev.h.bu.b    \vd, \vj, \vk
    vmaddwod.h.bu.b   \vd, \vj, \vk
.endm

.macro vdp2.w.h vd, vj, vk
    vmulwev.w.h       \vd, \vj, \vk
    vmaddwod.w.h      \vd, \vj, \vk
.endm

.macro xvdp2.h.bu xd, xj, xk
    xvmulwev.h.bu     \xd, \xj, \xk
    xvmaddwod.h.bu    \xd, \xj, \xk
.endm

.macro xvdp2.h.bu.b xd, xj, xk
    xvmulwev.h.bu.b   \xd, \xj, \xk
    xvmaddwod.h.bu.b  \xd, \xj, \xk
.endm

.macro xvdp2.w.h xd, xj, xk
    xvmulwev.w.h      \xd, \xj, \xk
    xvmaddwod.w.h     \xd, \xj, \xk
.endm

/*
 * Description : Dot product & addition of vector elements
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd (accumulated into, twice the input element size)
 */
.macro vdp2add.h.bu vd, vj, vk
    vmaddwev.h.bu     \vd, \vj, \vk
    vmaddwod.h.bu     \vd, \vj, \vk
.endm

.macro vdp2add.h.bu.b vd, vj, vk
    vmaddwev.h.bu.b   \vd, \vj, \vk
    vmaddwod.h.bu.b   \vd, \vj, \vk
.endm

.macro vdp2add.w.h vd, vj, vk
    vmaddwev.w.h      \vd, \vj, \vk
    vmaddwod.w.h      \vd, \vj, \vk
.endm

.macro xvdp2add.h.bu.b xd, xj, xk
    xvmaddwev.h.bu.b  \xd, \xj, \xk
    xvmaddwod.h.bu.b  \xd, \xj, \xk
.endm

.macro xvdp2add.w.h xd, xj, xk
    xvmaddwev.w.h     \xd, \xj, \xk
    xvmaddwod.w.h     \xd, \xj, \xk
.endm

/*
 * Description : Clip each element of vj into the range [vk, va]
 *               vd[i] = min(max(vj[i], vk[i]), va[i])
 */
.macro vclip.h vd, vj, vk, va
    vmax.h            \vd, \vj, \vk
    vmin.h            \vd, \vd, \va
.endm

.macro vclip.w vd, vj, vk, va
    vmax.w            \vd, \vj, \vk
    vmin.w            \vd, \vd, \va
.endm

.macro xvclip.h xd, xj, xk, xa
    xvmax.h           \xd, \xj, \xk
    xvmin.h           \xd, \xd, \xa
.endm

.macro xvclip.w xd, xj, xk, xa
    xvmax.w           \xd, \xj, \xk
    xvmin.w           \xd, \xd, \xa
.endm

/*
 * Description : Clip each element of vj into the range 0 ~ 255
 *               vd[i] = min(max(vj[i], 0), 255)
 *               (unsigned saturate with imm 7 bounds the value to 2^(7+1)-1)
 */
.macro vclip255.h vd, vj
    vmaxi.h           \vd, \vj, 0
    vsat.hu           \vd, \vd, 7
.endm

.macro vclip255.w vd, vj
    vmaxi.w           \vd, \vj, 0
    vsat.wu           \vd, \vd, 7
.endm

.macro xvclip255.h xd, xj
    xvmaxi.h          \xd, \xj, 0
    xvsat.hu          \xd, \xd, 7
.endm

.macro xvclip255.w xd, xj
    xvmaxi.w          \xd, \xj, 0
    xvsat.wu          \xd, \xd, 7
.endm

/*
 * Description : Store one element of a vector, post-advancing the pointer
 *               vd : Data vector to be stored
 *               rk : Address of data storage (updated: rk += ra)
 *               ra : Offset of address
 *               si : Index of data in vd
 */
.macro vstelmx.b vd, rk, ra, si
    add.d             \rk, \rk, \ra
    vstelm.b          \vd, \rk, 0, \si
.endm

.macro vstelmx.h vd, rk, ra, si
    add.d             \rk, \rk, \ra
    vstelm.h          \vd, \rk, 0, \si
.endm

.macro vstelmx.w vd, rk, ra, si
    add.d             \rk, \rk, \ra
    vstelm.w          \vd, \rk, 0, \si
.endm

.macro vstelmx.d vd, rk, ra, si
    add.d             \rk, \rk, \ra
    vstelm.d          \vd, \rk, 0, \si
.endm

/* Register-to-register vector move (vor with itself). */
.macro vmov xd, xj
    vor.v             \xd, \xj, \xj
.endm

.macro xmov xd, xj
    xvor.v            \xd, \xj, \xj
.endm

.macro xvstelmx.d xd, rk, ra, si
    add.d             \rk, \rk, \ra
    xvstelm.d         \xd, \rk, 0, \si
.endm

/*
 *============================================================================
 * LSX/LASX custom macros
 *============================================================================
 */

/*
 * Load 4 float, double, V128, V256 elements with stride.
 */
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.s             \out0, \src, 0
    fldx.s            \out1, \src, \stride
    fldx.s            \out2, \src, \stride2
    fldx.s            \out3, \src, \stride3
.endm

.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.d             \out0, \src, 0
    fldx.d            \out1, \src, \stride
    fldx.d            \out2, \src, \stride2
    fldx.d            \out3, \src, \stride3
.endm

.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    vld               \out0, \src, 0
    vldx              \out1, \src, \stride
    vldx              \out2, \src, \stride2
    vldx              \out3, \src, \stride3
.endm

.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    xvld              \out0, \src, 0
    xvldx             \out1, \src, \stride
    xvldx             \out2, \src, \stride2
    xvldx             \out3, \src, \stride3
.endm

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    vilvl.h           \tmp0, \in1, \in0
    vilvl.h           \tmp1, \in3, \in2
    vilvl.w           \out0, \tmp1, \tmp0
    vilvh.w           \out2, \tmp1, \tmp0
    vilvh.d           \out1, \out0, \out0
    vilvh.d           \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 */
.macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    vilvl.w           \tmp0, \in1, \in0
    vilvh.w           \out1, \in1, \in0
    vilvl.w           \tmp1, \in3, \in2
    vilvh.w           \out3, \in3, \in2

    vilvl.d           \out0, \tmp1, \tmp0
    vilvl.d           \out2, \out3, \out1
    vilvh.d           \out3, \out3, \out1
    vilvh.d           \out1, \tmp1, \tmp0
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
                          tmp3, tmp4, tmp5, tmp6, tmp7
    vilvl.h           \tmp0, \in6, \in4
    vilvl.h           \tmp1, \in7, \in5
    vilvl.h           \tmp2, \in2, \in0
    vilvl.h           \tmp3, \in3, \in1

    vilvl.h           \tmp4, \tmp1, \tmp0
    vilvh.h           \tmp5, \tmp1, \tmp0
    vilvl.h           \tmp6, \tmp3, \tmp2
    vilvh.h           \tmp7, \tmp3, \tmp2

    vilvh.h           \tmp0, \in6, \in4
    vilvh.h           \tmp1, \in7, \in5
    vilvh.h           \tmp2, \in2, \in0
    vilvh.h           \tmp3, \in3, \in1

    vpickev.d         \out0, \tmp4, \tmp6
    vpickod.d         \out1, \tmp4, \tmp6
    vpickev.d         \out2, \tmp5, \tmp7
    vpickod.d         \out3, \tmp5, \tmp7

    vilvl.h           \tmp4, \tmp1, \tmp0
    vilvh.h           \tmp5, \tmp1, \tmp0
    vilvl.h           \tmp6, \tmp3, \tmp2
    vilvh.h           \tmp7, \tmp3, \tmp2

    vpickev.d         \out4, \tmp4, \tmp6
    vpickod.d         \out5, \tmp4, \tmp6
    vpickev.d         \out6, \tmp5, \tmp7
    vpickod.d         \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
 *                         in8, in9, in10, in11, in12, in13, in14, in15
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
                            in8, in9, in10, in11, in12, in13, in14, in15, \
                            out0, out1, out2, out3, out4, out5, out6, out7,\
                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    xvilvl.b          \tmp0, \in2, \in0
    xvilvl.b          \tmp1, \in3, \in1
    xvilvl.b          \tmp2, \in6, \in4
    xvilvl.b          \tmp3, \in7, \in5
    xvilvl.b          \tmp4, \in10, \in8
    xvilvl.b          \tmp5, \in11, \in9
    xvilvl.b          \tmp6, \in14, \in12
    xvilvl.b          \tmp7, \in15, \in13
    xvilvl.b          \out0, \tmp1, \tmp0
    xvilvh.b          \out1, \tmp1, \tmp0
    xvilvl.b          \out2, \tmp3, \tmp2
    xvilvh.b          \out3, \tmp3, \tmp2
    xvilvl.b          \out4, \tmp5, \tmp4
    xvilvh.b          \out5, \tmp5, \tmp4
    xvilvl.b          \out6, \tmp7, \tmp6
    xvilvh.b          \out7, \tmp7, \tmp6
    xvilvl.w          \tmp0, \out2, \out0
    xvilvh.w          \tmp2, \out2, \out0
    xvilvl.w          \tmp4, \out3, \out1
    xvilvh.w          \tmp6, \out3, \out1
    xvilvl.w          \tmp1, \out6, \out4
    xvilvh.w          \tmp3, \out6, \out4
    xvilvl.w          \tmp5, \out7, \out5
    xvilvh.w          \tmp7, \out7, \out5
    xvilvl.d          \out0, \tmp1, \tmp0
    xvilvh.d          \out1, \tmp1, \tmp0
    xvilvl.d          \out2, \tmp3, \tmp2
    xvilvh.d          \out3, \tmp3, \tmp2
    xvilvl.d          \out4, \tmp5, \tmp4
    xvilvh.d          \out5, \tmp5, \tmp4
    xvilvl.d          \out6, \tmp7, \tmp6
    xvilvh.d          \out7, \tmp7, \tmp6
.endm

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h          \tmp0, \in1, \in0
    xvilvl.h          \tmp1, \in3, \in2
    xvilvl.w          \out0, \tmp1, \tmp0
    xvilvh.w          \out2, \tmp1, \tmp0
    xvilvh.d          \out1, \out0, \out0
    xvilvh.d          \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h          \tmp0, \in2, \in0
    xvilvl.h          \tmp1, \in3, \in1
    xvilvl.h          \out2, \tmp1, \tmp0
    xvilvh.h          \out3, \tmp1, \tmp0

    xvilvl.d          \out0, \out2, \out2
    xvilvh.d          \out1, \out2, \out2
    xvilvl.d          \out2, \out3, \out3
    xvilvh.d          \out3, \out3, \out3
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
                           out0, out1, out2, out3, out4, out5, out6, out7, \
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    xvilvl.h          \tmp0, \in6, \in4
    xvilvl.h          \tmp1, \in7, \in5
    xvilvl.h          \tmp2, \in2, \in0
    xvilvl.h          \tmp3, \in3, \in1

    xvilvl.h          \tmp4, \tmp1, \tmp0
    xvilvh.h          \tmp5, \tmp1, \tmp0
    xvilvl.h          \tmp6, \tmp3, \tmp2
    xvilvh.h          \tmp7, \tmp3, \tmp2

    xvilvh.h          \tmp0, \in6, \in4
    xvilvh.h          \tmp1, \in7, \in5
    xvilvh.h          \tmp2, \in2, \in0
    xvilvh.h          \tmp3, \in3, \in1

    xvpickev.d        \out0, \tmp4, \tmp6
    xvpickod.d        \out1, \tmp4, \tmp6
    xvpickev.d        \out2, \tmp5, \tmp7
    xvpickod.d        \out3, \tmp5, \tmp7

    xvilvl.h          \tmp4, \tmp1, \tmp0
    xvilvh.h          \tmp5, \tmp1, \tmp0
    xvilvl.h          \tmp6, \tmp3, \tmp2
    xvilvh.h          \tmp7, \tmp3, \tmp2

    xvpickev.d        \out4, \tmp4, \tmp6
    xvpickod.d        \out5, \tmp4, \tmp6
    xvpickev.d        \out6, \tmp5, \tmp7
    xvpickod.d        \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 2x4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                             tmp0, tmp1, tmp2
    xvilvh.h          \tmp1, \in0, \in1
    xvilvl.h          \out1, \in0, \in1
    xvilvh.h          \tmp0, \in2, \in3
    xvilvl.h          \out3, \in2, \in3

    xvilvh.w          \tmp2, \out3, \out1
    xvilvl.w          \out3, \out3, \out1

    xvilvl.w          \out2, \tmp0, \tmp1
    xvilvh.w          \tmp1, \tmp0, \tmp1

    xvilvh.d          \out0, \out2, \out3
    xvilvl.d          \out2, \out2, \out3
    xvilvh.d          \out1, \tmp1, \tmp2
    xvilvl.d          \out3, \tmp1, \tmp2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 *               (transposes both 128-bit lanes independently)
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Example     :
 *               1, 2, 3, 4,  1, 2, 3, 4          1,5, 9,13,  1,5, 9,13
 *               5, 6, 7, 8,  5, 6, 7, 8    to    2,6,10,14,  2,6,10,14
 *               9,10,11,12,  9,10,11,12  =====>  3,7,11,15,  3,7,11,15
 *              13,14,15,16, 13,14,15,16          4,8,12,16,  4,8,12,16
 */
.macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.w          \tmp0, \in1, \in0
    xvilvh.w          \out1, \in1, \in0
    xvilvl.w          \tmp1, \in3, \in2
    xvilvh.w          \out3, \in3, \in2

    xvilvl.d          \out0, \tmp1, \tmp0
    xvilvl.d          \out2, \out3, \out1
    xvilvh.d          \out3, \out3, \out1
    xvilvh.d          \out1, \tmp1, \tmp0
.endm

/*
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 * Example     : LASX_TRANSPOSE8x8_W
 *               in0 : 1,2,3,4,5,6,7,8
 *               in1 : 2,2,3,4,5,6,7,8
 *               in2 : 3,2,3,4,5,6,7,8
 *               in3 : 4,2,3,4,5,6,7,8
 *               in4 : 5,2,3,4,5,6,7,8
 *               in5 : 6,2,3,4,5,6,7,8
 *               in6 : 7,2,3,4,5,6,7,8
 *               in7 : 8,2,3,4,5,6,7,8
 *
 *               out0 : 1,2,3,4,5,6,7,8
 *               out1 : 2,2,2,2,2,2,2,2
 *               out2 : 3,3,3,3,3,3,3,3
 *               out3 : 4,4,4,4,4,4,4,4
 *               out4 : 5,5,5,5,5,5,5,5
 *               out5 : 6,6,6,6,6,6,6,6
 *               out6 : 7,7,7,7,7,7,7,7
 *               out7 : 8,8,8,8,8,8,8,8
 */
.macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
                           out0, out1, out2, out3, out4, out5, out6, out7,\
                           tmp0, tmp1, tmp2, tmp3
    xvilvl.w          \tmp0, \in2, \in0
    xvilvl.w          \tmp1, \in3, \in1
    xvilvh.w          \tmp2, \in2, \in0
    xvilvh.w          \tmp3, \in3, \in1
    xvilvl.w          \out0, \tmp1, \tmp0
    xvilvh.w          \out1, \tmp1, \tmp0
    xvilvl.w          \out2, \tmp3, \tmp2
    xvilvh.w          \out3, \tmp3, \tmp2

    xvilvl.w          \tmp0, \in6, \in4
    xvilvl.w          \tmp1, \in7, \in5
    xvilvh.w          \tmp2, \in6, \in4
    xvilvh.w          \tmp3, \in7, \in5
    xvilvl.w          \out4, \tmp1, \tmp0
    xvilvh.w          \out5, \tmp1, \tmp0
    xvilvl.w          \out6, \tmp3, \tmp2
    xvilvh.w          \out7, \tmp3, \tmp2

    /* Exchange the 128-bit lane halves to finish the cross-lane transpose. */
    xmov              \tmp0, \out0
    xmov              \tmp1, \out1
    xmov              \tmp2, \out2
    xmov              \tmp3, \out3
    xvpermi.q         \out0, \out4, 0x02
    xvpermi.q         \out1, \out5, 0x02
    xvpermi.q         \out2, \out6, 0x02
    xvpermi.q         \out3, \out7, 0x02
    xvpermi.q         \out4, \tmp0, 0x31
    xvpermi.q         \out5, \tmp1, 0x31
    xvpermi.q         \out6, \tmp2, 0x31
    xvpermi.q         \out7, \tmp3, 0x31
.endm

/*
 * Description : Transpose 4x4 block with double-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Example     : LASX_TRANSPOSE4x4_D
 *               in0 : 1,2,3,4
 *               in1 : 1,2,3,4
 *               in2 : 1,2,3,4
 *               in3 : 1,2,3,4
 *
 *               out0 : 1,1,1,1
 *               out1 : 2,2,2,2
 *               out2 : 3,3,3,3
 *               out3 : 4,4,4,4
 */
.macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.d          \tmp0, \in1, \in0
    xvilvh.d          \out1, \in1, \in0
    xvilvh.d          \tmp1, \in3, \in2
    xvilvl.d          \out2, \in3, \in2

    xvor.v            \out0, \tmp0, \tmp0
    xvor.v            \out3, \tmp1, \tmp1

    xvpermi.q         \out0, \out2, 0x02
    xvpermi.q         \out2, \tmp0, 0x31
    xvpermi.q         \out3, \out1, 0x31
    xvpermi.q         \out1, \tmp1, 0x02
.endm