tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

loongson_asm.S (22199B)


      1 /*********************************************************************
      2 * Copyright (c) 2022 Loongson Technology Corporation Limited
      3 * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
      4 *                Shiyou Yin(yinshiyou-hf@loongson.cn)
      5 *
      6 * Permission to use, copy, modify, and/or distribute this software for any
      7 * purpose with or without fee is hereby granted, provided that the above
      8 * copyright notice and this permission notice appear in all copies.
      9 *
     10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
     11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
     12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
     13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
     15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
     16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
     17 *********************************************************************/
     18 
     19 /*
     20 * This file is a LoongArch assembly helper file, available under the ISC
     21 * license. It provides a large number of macros and aliases to simplify
     22 * writing assembly code, especially for LSX and LASX optimizations.
     23 *
     24 * Anyone can modify it or add new features for their own purposes.
     25 * Contributing a patch will be appreciated as it might be useful for
     26 * others as well. Send patches to the Loongson contributors mentioned above.
     27 *
     28 * MAJOR version: Usage changes, incompatible with previous version.
     29 * MINOR version: Add new macros/functions, or bug fixes.
     30 * MICRO version: Comment changes or implementation changes.
     31 */
     32 
     33 #define LML_VERSION_MAJOR 0 /* bumped on usage changes incompatible with the previous version */
     34 #define LML_VERSION_MINOR 4 /* bumped when macros/functions are added or bugs are fixed */
     35 #define LML_VERSION_MICRO 0 /* bumped on comment or implementation changes */
     36 
     37 #define DEFAULT_ALIGN    5 /* default operand handed to .align by the function and const macros */
     38 
     39 /* Set prefix as needed. */
     40 #ifndef PRIVATE_PREFIX
     41 #define PRIVATE_PREFIX dav1d_ /* fallback, used only when the build did not define its own prefix */
     42 #endif
     43 
     44 #define PASTE(a,b) a ## b /* raw token paste; operands of ## are not macro-expanded */
     45 #define CONCAT(a,b) PASTE(a,b) /* extra level so that a and b are expanded before being pasted */
     46 
     47 #ifdef PREFIX
     48 #define ASM_PREF CONCAT(_,PRIVATE_PREFIX) /* PREFIX defined: symbol names get a leading underscore */
     49 #else
     50 #define ASM_PREF PRIVATE_PREFIX /* otherwise the private prefix is used as is */
     51 #endif
     52 
     53 .macro function name, align=DEFAULT_ALIGN
     54 .macro endfunc /* defined once per function so that it can refer to this function's name */
     55    jirl    $r0, $r1, 0x0 /* function return: jump to the address held in $r1, link value goes to $r0 */
     56    .size ASM_PREF\name, . - ASM_PREF\name /* symbol size = bytes emitted since the entry label, return included */
     57    .purgem endfunc /* remove this definition so the next function can define endfunc again */
     58 .endm
     59 .text ; /* function name: emit the entry sequence below; the body follows and is closed by endfunc */
     60 .align \align ;
     61 .globl ASM_PREF\name ;
     62 .hidden ASM_PREF\name ; /* global for linking, but with hidden visibility */
     63 .type  ASM_PREF\name, @function ;
     64 ASM_PREF\name: ; /* entry label: ASM_PREF followed by the name argument */
     65 .endm
     66 
     67 .macro  const name, align=DEFAULT_ALIGN
     68    .macro endconst /* defined once per constant so that it can refer to this constant's name */
     69    .size  \name, . - \name /* object size = bytes emitted since the label */
     70    .purgem endconst /* remove this definition so the next const can define endconst again */
     71    .endm
     72 .section .rodata /* const name: open a read-only data object; the data follows and is closed by endconst */
     73 .align   \align
     74 \name: /* label is the bare name: unlike function, no ASM_PREF and no .globl/.hidden here */
     75 .endm
     76 
     77 /*
     78 *============================================================================
     79 * LoongArch register alias
     80 *============================================================================
     81 */
     82 
     83 #define a0 $a0 /* general-purpose registers: lets code write a0 instead of $a0, and so on below */
     84 #define a1 $a1
     85 #define a2 $a2
     86 #define a3 $a3
     87 #define a4 $a4
     88 #define a5 $a5
     89 #define a6 $a6
     90 #define a7 $a7
     91 
     92 #define t0 $t0
     93 #define t1 $t1
     94 #define t2 $t2
     95 #define t3 $t3
     96 #define t4 $t4
     97 #define t5 $t5
     98 #define t6 $t6
     99 #define t7 $t7
    100 #define t8 $t8
    101 
    102 #define s0 $s0
    103 #define s1 $s1
    104 #define s2 $s2
    105 #define s3 $s3
    106 #define s4 $s4
    107 #define s5 $s5
    108 #define s6 $s6
    109 #define s7 $s7
    110 #define s8 $s8
    111 
    112 #define zero $zero
    113 #define sp   $sp
    114 #define ra   $ra /* NOTE(review): presumably this also rewrites the ra parameter of the vstelmx.* macros further down - confirm the assembler accepts the expanded name */
    115 
    116 #define fa0  $fa0 /* floating-point registers by their fa/ft/fs names, usable without the $ sigil */
    117 #define fa1  $fa1
    118 #define fa2  $fa2
    119 #define fa3  $fa3
    120 #define fa4  $fa4
    121 #define fa5  $fa5
    122 #define fa6  $fa6
    123 #define fa7  $fa7
    124 #define ft0  $ft0
    125 #define ft1  $ft1
    126 #define ft2  $ft2
    127 #define ft3  $ft3
    128 #define ft4  $ft4
    129 #define ft5  $ft5
    130 #define ft6  $ft6
    131 #define ft7  $ft7
    132 #define ft8  $ft8
    133 #define ft9  $ft9
    134 #define ft10 $ft10
    135 #define ft11 $ft11
    136 #define ft12 $ft12
    137 #define ft13 $ft13
    138 #define ft14 $ft14
    139 #define ft15 $ft15
    140 #define fs0  $fs0
    141 #define fs1  $fs1
    142 #define fs2  $fs2
    143 #define fs3  $fs3
    144 #define fs4  $fs4
    145 #define fs5  $fs5
    146 #define fs6  $fs6
    147 #define fs7  $fs7
    148 
    149 #define f0  $f0 /* floating-point registers by number, f0 - f31 */
    150 #define f1  $f1
    151 #define f2  $f2
    152 #define f3  $f3
    153 #define f4  $f4
    154 #define f5  $f5
    155 #define f6  $f6
    156 #define f7  $f7
    157 #define f8  $f8
    158 #define f9  $f9
    159 #define f10 $f10
    160 #define f11 $f11
    161 #define f12 $f12
    162 #define f13 $f13
    163 #define f14 $f14
    164 #define f15 $f15
    165 #define f16 $f16
    166 #define f17 $f17
    167 #define f18 $f18
    168 #define f19 $f19
    169 #define f20 $f20
    170 #define f21 $f21
    171 #define f22 $f22
    172 #define f23 $f23
    173 #define f24 $f24
    174 #define f25 $f25
    175 #define f26 $f26
    176 #define f27 $f27
    177 #define f28 $f28
    178 #define f29 $f29
    179 #define f30 $f30
    180 #define f31 $f31
    181 
    182 #define vr0 $vr0 /* vector registers vr0 - vr31, the operands of the v* (LSX) instructions in this file */
    183 #define vr1 $vr1
    184 #define vr2 $vr2
    185 #define vr3 $vr3
    186 #define vr4 $vr4
    187 #define vr5 $vr5
    188 #define vr6 $vr6
    189 #define vr7 $vr7
    190 #define vr8 $vr8
    191 #define vr9 $vr9
    192 #define vr10 $vr10
    193 #define vr11 $vr11
    194 #define vr12 $vr12
    195 #define vr13 $vr13
    196 #define vr14 $vr14
    197 #define vr15 $vr15
    198 #define vr16 $vr16
    199 #define vr17 $vr17
    200 #define vr18 $vr18
    201 #define vr19 $vr19
    202 #define vr20 $vr20
    203 #define vr21 $vr21
    204 #define vr22 $vr22
    205 #define vr23 $vr23
    206 #define vr24 $vr24
    207 #define vr25 $vr25
    208 #define vr26 $vr26
    209 #define vr27 $vr27
    210 #define vr28 $vr28
    211 #define vr29 $vr29
    212 #define vr30 $vr30
    213 #define vr31 $vr31
    214 
    215 #define xr0 $xr0 /* vector registers xr0 - xr31, the operands of the xv* (LASX) instructions in this file */
    216 #define xr1 $xr1
    217 #define xr2 $xr2
    218 #define xr3 $xr3
    219 #define xr4 $xr4
    220 #define xr5 $xr5
    221 #define xr6 $xr6
    222 #define xr7 $xr7
    223 #define xr8 $xr8
    224 #define xr9 $xr9
    225 #define xr10 $xr10
    226 #define xr11 $xr11
    227 #define xr12 $xr12
    228 #define xr13 $xr13
    229 #define xr14 $xr14
    230 #define xr15 $xr15
    231 #define xr16 $xr16
    232 #define xr17 $xr17
    233 #define xr18 $xr18
    234 #define xr19 $xr19
    235 #define xr20 $xr20
    236 #define xr21 $xr21
    237 #define xr22 $xr22
    238 #define xr23 $xr23
    239 #define xr24 $xr24
    240 #define xr25 $xr25
    241 #define xr26 $xr26
    242 #define xr27 $xr27
    243 #define xr28 $xr28
    244 #define xr29 $xr29
    245 #define xr30 $xr30
    246 #define xr31 $xr31
    247 
    248 /*
    249 *============================================================================
    250 * LSX/LASX synthesize instructions
    251 *============================================================================
    252 */
    253 
    254 /*
    255 * Description : Dot product of adjacent byte/halfword vector element pairs
    256 * Arguments   : Inputs  - vj, vk
    257 *               Outputs - vd
    258 *               Return Type - twice size of input
    259 */
    260 .macro vdp2.h.bu vd, vj, vk /* unsigned byte x unsigned byte, pairwise sums widened to halfwords */
    261    vmulwev.h.bu      \vd,    \vj,    \vk /* vd = products of the even-indexed elements; overwrites vd, so vd must not alias vj or vk */
    262    vmaddwod.h.bu     \vd,    \vj,    \vk /* vd += products of the odd-indexed elements */
    263 .endm
    264 
    265 .macro vdp2.h.bu.b vd, vj, vk /* unsigned byte (vj) x signed byte (vk), pairwise sums widened to halfwords */
    266    vmulwev.h.bu.b    \vd,    \vj,    \vk /* vd = products of the even-indexed elements; overwrites vd, so vd must not alias vj or vk */
    267    vmaddwod.h.bu.b   \vd,    \vj,    \vk /* vd += products of the odd-indexed elements */
    268 .endm
    269 
    270 .macro vdp2.w.h vd, vj, vk /* halfword x halfword, pairwise sums widened to words */
    271    vmulwev.w.h       \vd,    \vj,    \vk /* vd = products of the even-indexed elements; overwrites vd, so vd must not alias vj or vk */
    272    vmaddwod.w.h      \vd,    \vj,    \vk /* vd += products of the odd-indexed elements */
    273 .endm
    274 
    275 .macro xvdp2.h.bu xd, xj, xk /* LASX form of vdp2.h.bu: unsigned byte x unsigned byte to halfwords */
    276    xvmulwev.h.bu    \xd,    \xj,    \xk /* xd = products of the even-indexed elements; overwrites xd, so xd must not alias xj or xk */
    277    xvmaddwod.h.bu   \xd,    \xj,    \xk /* xd += products of the odd-indexed elements */
    278 .endm
    279 
    280 .macro xvdp2.h.bu.b xd, xj, xk /* LASX form of vdp2.h.bu.b: unsigned byte (xj) x signed byte (xk) to halfwords */
    281    xvmulwev.h.bu.b    \xd,  \xj,    \xk /* xd = products of the even-indexed elements; overwrites xd, so xd must not alias xj or xk */
    282    xvmaddwod.h.bu.b   \xd,  \xj,    \xk /* xd += products of the odd-indexed elements */
    283 .endm
    284 
    285 .macro xvdp2.w.h xd, xj, xk /* LASX form of vdp2.w.h: halfword x halfword to words */
    286    xvmulwev.w.h       \xd,  \xj,    \xk /* xd = products of the even-indexed elements; overwrites xd, so xd must not alias xj or xk */
    287    xvmaddwod.w.h      \xd,  \xj,    \xk /* xd += products of the odd-indexed elements */
    288 .endm
    289 
    290 /*
    291 * Description : Dot product of byte/halfword vector elements, accumulated into vd
    292 * Arguments   : Inputs  - vj, vk, vd (accumulator)
    293 *               Outputs - vd
    294 *               Return Type - twice size of input
    295 */
    296 .macro vdp2add.h.bu vd, vj, vk /* vd (halfwords) += pairwise dot product of unsigned bytes; vd is both read and written */
    297    vmaddwev.h.bu     \vd,    \vj,    \vk /* vd += products of the even-indexed elements; vd changes here, so it must not alias vj or vk */
    298    vmaddwod.h.bu     \vd,    \vj,    \vk /* vd += products of the odd-indexed elements */
    299 .endm
    300 
    301 .macro vdp2add.h.bu.b vd, vj, vk /* vd (halfwords) += pairwise dot product of unsigned bytes (vj) and signed bytes (vk) */
    302    vmaddwev.h.bu.b   \vd,    \vj,    \vk /* vd += products of the even-indexed elements; vd changes here, so it must not alias vj or vk */
    303    vmaddwod.h.bu.b   \vd,    \vj,    \vk /* vd += products of the odd-indexed elements */
    304 .endm
    305 
    306 .macro vdp2add.w.h vd, vj, vk /* vd (words) += pairwise dot product of halfwords; vd is both read and written */
    307    vmaddwev.w.h      \vd,    \vj,    \vk /* vd += products of the even-indexed elements; vd changes here, so it must not alias vj or vk */
    308    vmaddwod.w.h      \vd,    \vj,    \vk /* vd += products of the odd-indexed elements */
    309 .endm
    310 
    311 .macro xvdp2add.h.bu.b xd, xj, xk /* LASX form of vdp2add.h.bu.b: xd (halfwords) += unsigned byte (xj) x signed byte (xk) pairs */
    312    xvmaddwev.h.bu.b   \xd,  \xj,    \xk /* xd += products of the even-indexed elements; xd changes here, so it must not alias xj or xk */
    313    xvmaddwod.h.bu.b   \xd,  \xj,    \xk /* xd += products of the odd-indexed elements */
    314 .endm
    315 
    316 .macro xvdp2add.w.h xd, xj, xk /* LASX form of vdp2add.w.h: xd (words) += halfword x halfword pairs */
    317    xvmaddwev.w.h      \xd,  \xj,    \xk /* xd += products of the even-indexed elements; xd changes here, so it must not alias xj or xk */
    318    xvmaddwod.w.h      \xd,  \xj,    \xk /* xd += products of the odd-indexed elements */
    319 .endm
    320 
    321 /*
    322 * Description : Clamp element vj[i] to the range vk[i] ~ va[i]
    323 * clip: vd = min(max(vj, vk), va)
    324 */
    325 .macro vclip.h  vd,  vj, vk, va /* per halfword: vd = min(max(vj, vk), va); vk is the lower bound, va the upper */
    326    vmax.h    \vd,  \vj,   \vk /* apply the lower bound; vd is written before va is read, so vd must not alias va */
    327    vmin.h    \vd,  \vd,   \va /* apply the upper bound */
    328 .endm
    329 
    330 .macro vclip.w  vd,  vj, vk, va /* per word: vd = min(max(vj, vk), va); vk is the lower bound, va the upper */
    331    vmax.w    \vd,  \vj,   \vk /* apply the lower bound; vd is written before va is read, so vd must not alias va */
    332    vmin.w    \vd,  \vd,   \va /* apply the upper bound */
    333 .endm
    334 
    335 .macro xvclip.h  xd,  xj, xk, xa /* LASX, per halfword: xd = min(max(xj, xk), xa) */
    336    xvmax.h    \xd,  \xj,   \xk /* apply the lower bound; xd is written before xa is read, so xd must not alias xa */
    337    xvmin.h    \xd,  \xd,   \xa /* apply the upper bound */
    338 .endm
    339 
    340 .macro xvclip.w  xd,  xj, xk, xa /* LASX, per word: xd = min(max(xj, xk), xa) */
    341    xvmax.w    \xd,  \xj,   \xk /* apply the lower bound; xd is written before xa is read, so xd must not alias xa */
    342    xvmin.w    \xd,  \xd,   \xa /* apply the upper bound */
    343 .endm
    344 
    345 /*
    346 * Description : Clamp element vj[i] to the range 0 ~ 255
    347 * clip255: vd = min(max(vj, 0), 255)
    348 */
    349 .macro vclip255.h  vd, vj /* per halfword: clamp vj to 0 ~ 255 and write the result to vd */
    350    vmaxi.h   \vd,   \vj,  0 /* max with immediate 0: negative elements become 0 */
    351    vsat.hu   \vd,   \vd,  7 /* unsigned saturation with immediate 7: caps at 255, the upper bound stated in the header */
    352 .endm
    353 
    354 .macro vclip255.w  vd, vj /* per word: clamp vj to 0 ~ 255 and write the result to vd */
    355    vmaxi.w   \vd,   \vj,  0 /* max with immediate 0: negative elements become 0 */
    356    vsat.wu   \vd,   \vd,  7 /* unsigned saturation with immediate 7: caps at 255, the upper bound stated in the header */
    357 .endm
    358 
    359 .macro xvclip255.h  xd, xj /* LASX, per halfword: clamp xj to 0 ~ 255 and write the result to xd */
    360    xvmaxi.h   \xd,   \xj,  0 /* max with immediate 0: negative elements become 0 */
    361    xvsat.hu   \xd,   \xd,  7 /* unsigned saturation with immediate 7: caps at 255, the upper bound stated in the header */
    362 .endm
    363 
    364 .macro xvclip255.w  xd, xj /* LASX, per word: clamp xj to 0 ~ 255 and write the result to xd */
    365    xvmaxi.w   \xd,   \xj,  0 /* max with immediate 0: negative elements become 0 */
    366    xvsat.wu   \xd,   \xd,  7 /* unsigned saturation with immediate 7: caps at 255, the upper bound stated in the header */
    367 .endm
    368 
    369 /*
    370 * Description : Store one element of a vector at address rk + ra
    371 * vd : Data vector to be stored
    372 * rk : Address of data storage (updated to rk + ra by the macro)
    373 * ra : Offset of address
    374 * si : Index of data in vd
    375 */
    376 .macro vstelmx.b vd, rk, ra, si /* store byte element si of vd at rk + ra; rk is modified */
    377    add.d      \rk,  \rk,  \ra /* rk = rk + ra: the base register itself is advanced and stays advanced */
    378    vstelm.b   \vd,  \rk,  0, \si /* store element si at the new rk with offset 0 */
    379 .endm
    380 
    381 .macro vstelmx.h vd, rk, ra, si /* store halfword element si of vd at rk + ra; rk is modified */
    382    add.d      \rk,  \rk,  \ra /* rk = rk + ra: the base register itself is advanced and stays advanced */
    383    vstelm.h   \vd,  \rk,  0, \si /* store element si at the new rk with offset 0 */
    384 .endm
    385 
    386 .macro vstelmx.w vd, rk, ra, si /* store word element si of vd at rk + ra; rk is modified */
    387    add.d      \rk,  \rk,  \ra /* rk = rk + ra: the base register itself is advanced and stays advanced */
    388    vstelm.w   \vd,  \rk,  0, \si /* store element si at the new rk with offset 0 */
    389 .endm
    390 
    391 .macro vstelmx.d  vd, rk, ra, si /* store doubleword element si of vd at rk + ra; rk is modified */
    392    add.d      \rk,  \rk,  \ra /* rk = rk + ra: the base register itself is advanced and stays advanced */
    393    vstelm.d   \vd,  \rk,  0, \si /* store element si at the new rk with offset 0 */
    394 .endm
    395 
    396 .macro vmov xd, xj /* register copy for the v* (LSX) instruction set; parameters are named xd/xj here */
    397    vor.v  \xd,  \xj,  \xj /* xj OR xj equals xj, so this copies xj into xd */
    398 .endm
    399 
    400 .macro xmov xd, xj /* register copy for the xv* (LASX) instruction set */
    401    xvor.v  \xd,  \xj,  \xj /* xj OR xj equals xj, so this copies xj into xd */
    402 .endm
    403 
    404 .macro xvstelmx.d  xd, rk, ra, si /* LASX: store doubleword element si of xd at rk + ra; rk is modified */
    405    add.d      \rk, \rk,  \ra /* rk = rk + ra: the base register itself is advanced and stays advanced */
    406    xvstelm.d  \xd, \rk,  0, \si /* store element si at the new rk with offset 0 */
    407 .endm
    408 
    409 /*
    410 *============================================================================
    411 * LSX/LASX custom macros
    412 *============================================================================
    413 */
    414 
    415 /*
    416 * Load 4 float, double, V128 or V256 elements from src, src + stride, src + stride2 and src + stride3.
    417 */
    418 .macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 /* four fld.s/fldx.s loads; src is not modified */
    419    fld.s     \out0,    \src,    0 /* out0 from src + 0 (immediate offset) */
    420    fldx.s    \out1,    \src,    \stride /* out1 from src + stride; the three offsets are passed as operands of the indexed load */
    421    fldx.s    \out2,    \src,    \stride2 /* out2 from src + stride2; the caller supplies stride2, it is not derived from stride */
    422    fldx.s    \out3,    \src,    \stride3 /* out3 from src + stride3 */
    423 .endm
    424 
    425 .macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 /* four fld.d/fldx.d loads; src is not modified */
    426    fld.d     \out0,    \src,    0 /* out0 from src + 0 (immediate offset) */
    427    fldx.d    \out1,    \src,    \stride /* out1 from src + stride; the three offsets are passed as operands of the indexed load */
    428    fldx.d    \out2,    \src,    \stride2 /* out2 from src + stride2; the caller supplies stride2, it is not derived from stride */
    429    fldx.d    \out3,    \src,    \stride3 /* out3 from src + stride3 */
    430 .endm
    431 
    432 .macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 /* four vld/vldx vector loads; src is not modified */
    433    vld     \out0,    \src,    0 /* out0 from src + 0 (immediate offset) */
    434    vldx    \out1,    \src,    \stride /* out1 from src + stride; the three offsets are passed as operands of the indexed load */
    435    vldx    \out2,    \src,    \stride2 /* out2 from src + stride2; the caller supplies stride2, it is not derived from stride */
    436    vldx    \out3,    \src,    \stride3 /* out3 from src + stride3 */
    437 .endm
    438 
    439 .macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3 /* four xvld/xvldx vector loads; src is not modified */
    440    xvld    \out0,    \src,    0 /* out0 from src + 0 (immediate offset) */
    441    xvldx   \out1,    \src,    \stride /* out1 from src + stride; the three offsets are passed as operands of the indexed load */
    442    xvldx   \out2,    \src,    \stride2 /* out2 from src + stride2; the caller supplies stride2, it is not derived from stride */
    443    xvldx   \out3,    \src,    \stride3 /* out3 from src + stride3 */
    444 .endm
    445 
    446 /*
    447 * Description : Transpose 4x4 block with half-word elements in vectors
    448 * Arguments   : Inputs  - in0, in1, in2, in3
    449 *               Outputs - out0, out1, out2, out3
    450 */
    451 .macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
    452                          tmp0, tmp1 /* rows are taken from the low 4 halfwords of in0-in3; tmp0/tmp1 are clobbered */
    453    vilvl.h   \tmp0,  \in1,   \in0 /* interleave rows 0/1; tmp0 is written before in2/in3 are read, so it must not alias them */
    454    vilvl.h   \tmp1,  \in3,   \in2 /* interleave rows 2/3 */
    455    vilvl.w   \out0,  \tmp1,  \tmp0 /* out0 = column 0 in the low 64 bits, column 1 in the high 64 bits */
    456    vilvh.w   \out2,  \tmp1,  \tmp0 /* out2 = column 2 in the low 64 bits, column 3 in the high 64 bits */
    457    vilvh.d   \out1,  \out0,  \out0 /* out1 = column 1, taken from the high half of out0 */
    458    vilvh.d   \out3,  \out0,  \out2 /* out3 low 64 bits = column 3 from the high half of out2; the high 64 bits are not part of the result */
    459 .endm
    460 
    461 /*
    462 * Description : Transpose 4x4 block with word elements in vectors
    463 * Arguments   : Inputs  - in0, in1, in2, in3
    464 *               Outputs - out0, out1, out2, out3
    465 * Details     :
    466 * Example     :
    467 *               1, 2, 3, 4            1, 5, 9,13
    468 *               5, 6, 7, 8    to      2, 6,10,14
    469 *               9,10,11,12  =====>    3, 7,11,15
    470 *              13,14,15,16            4, 8,12,16
    471 */
    472 .macro LSX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
    473                          tmp0, tmp1 /* tmp0/tmp1 are clobbered */
    474 
    475    vilvl.w    \tmp0,   \in1,    \in0 /* rows 0/1 interleaved, low half (columns 0 and 1) */
    476    vilvh.w    \out1,   \in1,    \in0 /* rows 0/1, high half (columns 2 and 3); out1 and tmp0 are written before in2/in3 are read, so they must not alias them */
    477    vilvl.w    \tmp1,   \in3,    \in2 /* rows 2/3, low half */
    478    vilvh.w    \out3,   \in3,    \in2 /* rows 2/3, high half */
    479 
    480    vilvl.d    \out0,   \tmp1,   \tmp0 /* out0 = column 0 */
    481    vilvl.d    \out2,   \out3,   \out1 /* out2 = column 2, built from the partial results parked in out1/out3 */
    482    vilvh.d    \out3,   \out3,   \out1 /* out3 = column 3; must run before out1 is overwritten below */
    483    vilvh.d    \out1,   \tmp1,   \tmp0 /* out1 = column 1 */
    484 .endm
    485 
    486 /*
    487 * Description : Transpose 8x8 block with half-word elements in vectors
    488 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
    489 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
    490 */
    491 .macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1,   \
    492                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
    493                          tmp3, tmp4, tmp5, tmp6, tmp7 /* tmp0-tmp7 are clobbered; they are written while inputs are still needed, so they must not alias in0-in7 */
    494    vilvl.h      \tmp0,    \in6,   \in4 /* low halves (columns 0-3): interleave rows 4/6 */
    495    vilvl.h      \tmp1,    \in7,   \in5 /* rows 5/7 */
    496    vilvl.h      \tmp2,    \in2,   \in0 /* rows 0/2 */
    497    vilvl.h      \tmp3,    \in3,   \in1 /* rows 1/3 */
    498 
    499    vilvl.h      \tmp4,    \tmp1,  \tmp0 /* rows 4-7, columns 0 and 1 */
    500    vilvh.h      \tmp5,    \tmp1,  \tmp0 /* rows 4-7, columns 2 and 3 */
    501    vilvl.h      \tmp6,    \tmp3,  \tmp2 /* rows 0-3, columns 0 and 1 */
    502    vilvh.h      \tmp7,    \tmp3,  \tmp2 /* rows 0-3, columns 2 and 3 */
    503 
    504    vilvh.h      \tmp0,    \in6,   \in4 /* high halves (columns 4-7): last reads of the inputs, so outputs may reuse input registers */
    505    vilvh.h      \tmp1,    \in7,   \in5
    506    vilvh.h      \tmp2,    \in2,   \in0
    507    vilvh.h      \tmp3,    \in3,   \in1
    508 
    509    vpickev.d    \out0,    \tmp4,  \tmp6 /* out0 = column 0: rows 0-3 from tmp6, rows 4-7 from tmp4 */
    510    vpickod.d    \out1,    \tmp4,  \tmp6 /* out1 = column 1 */
    511    vpickev.d    \out2,    \tmp5,  \tmp7 /* out2 = column 2 */
    512    vpickod.d    \out3,    \tmp5,  \tmp7 /* out3 = column 3; out0-out3 are written while tmp0-tmp7 are live, so outputs must not alias temporaries */
    513 
    514    vilvl.h      \tmp4,    \tmp1,  \tmp0 /* same second stage again, now for columns 4-7 */
    515    vilvh.h      \tmp5,    \tmp1,  \tmp0
    516    vilvl.h      \tmp6,    \tmp3,  \tmp2
    517    vilvh.h      \tmp7,    \tmp3,  \tmp2
    518 
    519    vpickev.d    \out4,    \tmp4,  \tmp6 /* out4 = column 4 */
    520    vpickod.d    \out5,    \tmp4,  \tmp6 /* out5 = column 5 */
    521    vpickev.d    \out6,    \tmp5,  \tmp7 /* out6 = column 6 */
    522    vpickod.d    \out7,    \tmp5,  \tmp7 /* out7 = column 7 */
    523 .endm
    524 
    525 /*
    526 * Description : Transpose 16x8 block with byte elements in vectors
    527 * Arguments   : Inputs  - in0, in1, ..., in15 (16 rows)
    528 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
    529 */
    530 .macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
    531                            in8, in9, in10, in11, in12, in13, in14, in15,  \
    532                            out0, out1, out2, out3, out4, out5, out6, out7,\
    533                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 /* 16 rows in, 8 columns out; tmp0-tmp7 are clobbered */
    534    xvilvl.b   \tmp0,    \in2,     \in0 /* stage 1: byte-interleave row pairs 0/2, 1/3, ...; only xvilvl is used, i.e. the low 8 bytes of each 128-bit lane */
    535    xvilvl.b   \tmp1,    \in3,     \in1 /* tmps are written while inputs are still needed, so they must not alias in0-in15 */
    536    xvilvl.b   \tmp2,    \in6,     \in4
    537    xvilvl.b   \tmp3,    \in7,     \in5
    538    xvilvl.b   \tmp4,    \in10,    \in8
    539    xvilvl.b   \tmp5,    \in11,    \in9
    540    xvilvl.b   \tmp6,    \in14,    \in12
    541    xvilvl.b   \tmp7,    \in15,    \in13 /* last read of the inputs: outputs may reuse input registers */
    542    xvilvl.b   \out0,    \tmp1,    \tmp0 /* stage 2: rows 0-3, columns 0-3 (4 bytes per column) */
    543    xvilvh.b   \out1,    \tmp1,    \tmp0 /* rows 0-3, columns 4-7 */
    544    xvilvl.b   \out2,    \tmp3,    \tmp2 /* rows 4-7, columns 0-3 */
    545    xvilvh.b   \out3,    \tmp3,    \tmp2 /* rows 4-7, columns 4-7 */
    546    xvilvl.b   \out4,    \tmp5,    \tmp4 /* rows 8-11, columns 0-3 */
    547    xvilvh.b   \out5,    \tmp5,    \tmp4 /* rows 8-11, columns 4-7 */
    548    xvilvl.b   \out6,    \tmp7,    \tmp6 /* rows 12-15, columns 0-3 */
    549    xvilvh.b   \out7,    \tmp7,    \tmp6 /* rows 12-15, columns 4-7 */
    550    xvilvl.w   \tmp0,    \out2,    \out0 /* stage 3: word-interleave to join rows 0-3 with rows 4-7: columns 0 and 1 */
    551    xvilvh.w   \tmp2,    \out2,    \out0 /* rows 0-7, columns 2 and 3 */
    552    xvilvl.w   \tmp4,    \out3,    \out1 /* rows 0-7, columns 4 and 5 */
    553    xvilvh.w   \tmp6,    \out3,    \out1 /* rows 0-7, columns 6 and 7 */
    554    xvilvl.w   \tmp1,    \out6,    \out4 /* rows 8-15, columns 0 and 1 */
    555    xvilvh.w   \tmp3,    \out6,    \out4 /* rows 8-15, columns 2 and 3 */
    556    xvilvl.w   \tmp5,    \out7,    \out5 /* rows 8-15, columns 4 and 5 */
    557    xvilvh.w   \tmp7,    \out7,    \out5 /* rows 8-15, columns 6 and 7 */
    558    xvilvl.d   \out0,    \tmp1,    \tmp0 /* stage 4: join rows 0-7 with rows 8-15; out0 = column 0 (16 bytes per lane) */
    559    xvilvh.d   \out1,    \tmp1,    \tmp0 /* out1 = column 1 */
    560    xvilvl.d   \out2,    \tmp3,    \tmp2 /* out2 = column 2 */
    561    xvilvh.d   \out3,    \tmp3,    \tmp2 /* out3 = column 3 */
    562    xvilvl.d   \out4,    \tmp5,    \tmp4 /* out4 = column 4 */
    563    xvilvh.d   \out5,    \tmp5,    \tmp4 /* out5 = column 5 */
    564    xvilvl.d   \out6,    \tmp7,    \tmp6 /* out6 = column 6 */
    565    xvilvh.d   \out7,    \tmp7,    \tmp6 /* out7 = column 7 */
    566 .endm
    567 
    568 /*
    569 * Description : Transpose 4x4 block with half-word elements in vectors
    570 * Arguments   : Inputs  - in0, in1, in2, in3
    571 *               Outputs - out0, out1, out2, out3
    572 */
    573 .macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
    574                           tmp0, tmp1 /* same sequence as LSX_TRANSPOSE4x4_H with xv* instructions; tmp0/tmp1 are clobbered */
    575    xvilvl.h   \tmp0,  \in1,   \in0 /* interleave rows 0/1; tmp0 is written before in2/in3 are read, so it must not alias them */
    576    xvilvl.h   \tmp1,  \in3,   \in2 /* interleave rows 2/3 */
    577    xvilvl.w   \out0,  \tmp1,  \tmp0 /* out0 = column 0 in the low 64 bits of a lane, column 1 in the high 64 bits */
    578    xvilvh.w   \out2,  \tmp1,  \tmp0 /* out2 = column 2 in the low 64 bits of a lane, column 3 in the high 64 bits */
    579    xvilvh.d   \out1,  \out0,  \out0 /* out1 = column 1, taken from the high half of out0 */
    580    xvilvh.d   \out3,  \out0,  \out2 /* out3 low 64 bits = column 3 from the high half of out2; the high 64 bits are not part of the result */
    581 .endm
    582 
    583 /*
    584 * Description : Transpose 4x8 block with half-word elements in vectors
    585 * Arguments   : Inputs  - in0, in1, in2, in3
    586 *               Outputs - out0, out1, out2, out3
    587 */
    588 .macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
    589                           tmp0, tmp1 /* tmp0/tmp1 are clobbered */
    590    xvilvl.h      \tmp0,    \in2,   \in0 /* interleave rows 0/2; tmp0 is written before in1/in3 are read, so it must not alias them */
    591    xvilvl.h      \tmp1,    \in3,   \in1 /* interleave rows 1/3 */
    592    xvilvl.h      \out2,    \tmp1,  \tmp0 /* columns 0 and 1 of rows 0-3, parked in out2 */
    593    xvilvh.h      \out3,    \tmp1,  \tmp0 /* columns 2 and 3 of rows 0-3, parked in out3 */
    594 
    595    xvilvl.d      \out0,    \out2,  \out2 /* out0 = column 0, duplicated into both 64-bit halves of each lane */
    596    xvilvh.d      \out1,    \out2,  \out2 /* out1 = column 1, duplicated; last read of the parked out2 */
    597    xvilvl.d      \out2,    \out3,  \out3 /* out2 = column 2, duplicated */
    598    xvilvh.d      \out3,    \out3,  \out3 /* out3 = column 3, duplicated */
    599 .endm
    600 
    601 /*
    602 * Description : Transpose 8x8 block with half-word elements in vectors
    603 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
    604 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
    605 */
    606 .macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7,         \
    607                           out0, out1, out2, out3, out4, out5, out6, out7, \
    608                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7 /* same sequence as LSX_TRANSPOSE8x8_H with xv* instructions; tmp0-tmp7 are clobbered and must not alias in0-in7 */
    609    xvilvl.h     \tmp0,   \in6,     \in4 /* low halves (columns 0-3): interleave rows 4/6 */
    610    xvilvl.h     \tmp1,   \in7,     \in5 /* rows 5/7 */
    611    xvilvl.h     \tmp2,   \in2,     \in0 /* rows 0/2 */
    612    xvilvl.h     \tmp3,   \in3,     \in1 /* rows 1/3 */
    613 
    614    xvilvl.h     \tmp4,   \tmp1,    \tmp0 /* rows 4-7, columns 0 and 1 */
    615    xvilvh.h     \tmp5,   \tmp1,    \tmp0 /* rows 4-7, columns 2 and 3 */
    616    xvilvl.h     \tmp6,   \tmp3,    \tmp2 /* rows 0-3, columns 0 and 1 */
    617    xvilvh.h     \tmp7,   \tmp3,    \tmp2 /* rows 0-3, columns 2 and 3 */
    618 
    619    xvilvh.h     \tmp0,   \in6,     \in4 /* high halves (columns 4-7): last reads of the inputs, so outputs may reuse input registers */
    620    xvilvh.h     \tmp1,   \in7,     \in5
    621    xvilvh.h     \tmp2,   \in2,     \in0
    622    xvilvh.h     \tmp3,   \in3,     \in1
    623 
    624    xvpickev.d   \out0,   \tmp4,    \tmp6 /* out0 = column 0: rows 0-3 from tmp6, rows 4-7 from tmp4 */
    625    xvpickod.d   \out1,   \tmp4,    \tmp6 /* out1 = column 1 */
    626    xvpickev.d   \out2,   \tmp5,    \tmp7 /* out2 = column 2 */
    627    xvpickod.d   \out3,   \tmp5,    \tmp7 /* out3 = column 3; outputs are written while temporaries are live, so they must not alias tmp0-tmp7 */
    628 
    629    xvilvl.h     \tmp4,   \tmp1,    \tmp0 /* same second stage again, now for columns 4-7 */
    630    xvilvh.h     \tmp5,   \tmp1,    \tmp0
    631    xvilvl.h     \tmp6,   \tmp3,    \tmp2
    632    xvilvh.h     \tmp7,   \tmp3,    \tmp2
    633 
    634    xvpickev.d   \out4,   \tmp4,    \tmp6 /* out4 = column 4 */
    635    xvpickod.d   \out5,   \tmp4,    \tmp6 /* out5 = column 5 */
    636    xvpickev.d   \out6,   \tmp5,    \tmp7 /* out6 = column 6 */
    637    xvpickod.d   \out7,   \tmp5,    \tmp7 /* out7 = column 7 */
    638 .endm
    639 
    640 /*
    641 * Description : Transpose 2x4x4 block with half-word elements in vectors
    642 * Arguments   : Inputs  - in0, in1, in2, in3
    643 *               Outputs - out0, out1, out2, out3
    644 */
    645 .macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
    646                             tmp0, tmp1, tmp2 /* tmp0-tmp2 are clobbered */
    647    xvilvh.h   \tmp1,    \in0,     \in1 /* stage 1: halfword interleave of rows 0/1, high half. NOTE(review): operands are in0, in1 here, the reverse of the other transposes in this file, so the element order inside the outputs differs - confirm against callers */
    648    xvilvl.h   \out1,    \in0,     \in1 /* rows 0/1, low half; out1 and tmp1 are written before in2/in3 are read, so they must not alias them */
    649    xvilvh.h   \tmp0,    \in2,     \in3 /* rows 2/3, high half */
    650    xvilvl.h   \out3,    \in2,     \in3 /* rows 2/3, low half */
    651 
    652    xvilvh.w   \tmp2,    \out3,    \out1 /* stage 2: word interleave of the low-half results, upper part */
    653    xvilvl.w   \out3,    \out3,    \out1 /* lower part; last read of the value parked in out1 */
    654 
    655    xvilvl.w   \out2,    \tmp0,    \tmp1 /* word interleave of the high-half results, lower part */
    656    xvilvh.w   \tmp1,    \tmp0,    \tmp1 /* upper part, overwriting tmp1 in place */
    657 
    658    xvilvh.d   \out0,    \out2,    \out3 /* stage 3: doubleword interleave pairs a column from the low-half block with one from the high-half block */
    659    xvilvl.d   \out2,    \out2,    \out3 /* reads out3 before it is overwritten two lines below */
    660    xvilvh.d   \out1,    \tmp1,    \tmp2
    661    xvilvl.d   \out3,    \tmp1,    \tmp2
    662 .endm
    663 
    664 /*
    665 * Description : Transpose 4x4 block with word elements in vectors
    666 * Arguments   : Inputs  - in0, in1, in2, in3
    667 *               Outputs - out0, out1, out2, out3
    668 * Details     :
    669 * Example     :
    670 *               1, 2, 3, 4,  1, 2, 3, 4        1,5, 9,13, 1,5, 9,13
    671 *               5, 6, 7, 8,  5, 6, 7, 8   to   2,6,10,14, 2,6,10,14
    672 *               9,10,11,12,  9,10,11,12 =====> 3,7,11,15, 3,7,11,15
    673 *              13,14,15,16, 13,14,15,16        4,8,12,16, 4,8,12,16
    674 */
    675 .macro LASX_TRANSPOSE4x4_W in0, in1, in2, in3, out0, out1, out2, out3, \
    676                           tmp0, tmp1 /* same sequence as LSX_TRANSPOSE4x4_W with xv* instructions; tmp0/tmp1 are clobbered */
    677 
    678    xvilvl.w    \tmp0,   \in1,    \in0 /* rows 0/1 interleaved, low half (columns 0 and 1) */
    679    xvilvh.w    \out1,   \in1,    \in0 /* rows 0/1, high half (columns 2 and 3); out1 and tmp0 are written before in2/in3 are read, so they must not alias them */
    680    xvilvl.w    \tmp1,   \in3,    \in2 /* rows 2/3, low half */
    681    xvilvh.w    \out3,   \in3,    \in2 /* rows 2/3, high half */
    682 
    683    xvilvl.d    \out0,   \tmp1,   \tmp0 /* out0 = column 0 */
    684    xvilvl.d    \out2,   \out3,   \out1 /* out2 = column 2, built from the partial results parked in out1/out3 */
    685    xvilvh.d    \out3,   \out3,   \out1 /* out3 = column 3; must run before out1 is overwritten below */
    686    xvilvh.d    \out1,   \tmp1,   \tmp0 /* out1 = column 1 */
    687 .endm
    688 
    689 /*
    690 * Description : Transpose 8x8 block with word elements in vectors
    691 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
    692 *               Outputs - out0, out1, out2, out3, out4, out5, out6,
    693 *                         out7
    694 * Example     : LASX_TRANSPOSE8x8_W
    695 *         in0 : 1,2,3,4,5,6,7,8
    696 *         in1 : 2,2,3,4,5,6,7,8
    697 *         in2 : 3,2,3,4,5,6,7,8
    698 *         in3 : 4,2,3,4,5,6,7,8
    699 *         in4 : 5,2,3,4,5,6,7,8
    700 *         in5 : 6,2,3,4,5,6,7,8
    701 *         in6 : 7,2,3,4,5,6,7,8
    702 *         in7 : 8,2,3,4,5,6,7,8
    703 *
    704 *        out0 : 1,2,3,4,5,6,7,8
    705 *        out1 : 2,2,2,2,2,2,2,2
    706 *        out2 : 3,3,3,3,3,3,3,3
    707 *        out3 : 4,4,4,4,4,4,4,4
    708 *        out4 : 5,5,5,5,5,5,5,5
    709 *        out5 : 6,6,6,6,6,6,6,6
    710 *        out6 : 7,7,7,7,7,7,7,7
    711 *        out7 : 8,8,8,8,8,8,8,8
    712 */
    713 .macro LASX_TRANSPOSE8x8_W in0, in1, in2, in3, in4, in5, in6, in7,\
    714                           out0, out1, out2, out3, out4, out5, out6, out7,\
    715                           tmp0, tmp1, tmp2, tmp3 /* tmp0-tmp3 are clobbered; uses the xmov macro defined earlier in this file */
    716    xvilvl.w    \tmp0,   \in2,    \in0 /* rows 0-3: two rounds of word interleave give a 4x4 transpose inside each 128-bit lane */
    717    xvilvl.w    \tmp1,   \in3,    \in1 /* tmps are written while inputs are still needed, so they must not alias in0-in7 */
    718    xvilvh.w    \tmp2,   \in2,    \in0
    719    xvilvh.w    \tmp3,   \in3,    \in1
    720    xvilvl.w    \out0,   \tmp1,   \tmp0 /* rows 0-3 of column 0 (low lane) and column 4 (high lane) */
    721    xvilvh.w    \out1,   \tmp1,   \tmp0 /* rows 0-3 of columns 1 and 5 */
    722    xvilvl.w    \out2,   \tmp3,   \tmp2 /* rows 0-3 of columns 2 and 6 */
    723    xvilvh.w    \out3,   \tmp3,   \tmp2 /* rows 0-3 of columns 3 and 7; out0-out3 are written before in4-in7 are read, so they must not alias them */
    724 
    725    xvilvl.w    \tmp0,   \in6,    \in4 /* rows 4-7: same two rounds */
    726    xvilvl.w    \tmp1,   \in7,    \in5
    727    xvilvh.w    \tmp2,   \in6,    \in4
    728    xvilvh.w    \tmp3,   \in7,    \in5
    729    xvilvl.w    \out4,   \tmp1,   \tmp0 /* rows 4-7 of columns 0 and 4 */
    730    xvilvh.w    \out5,   \tmp1,   \tmp0 /* rows 4-7 of columns 1 and 5 */
    731    xvilvl.w    \out6,   \tmp3,   \tmp2 /* rows 4-7 of columns 2 and 6 */
    732    xvilvh.w    \out7,   \tmp3,   \tmp2 /* rows 4-7 of columns 3 and 7 */
    733 
    734    xmov        \tmp0,   \out0 /* keep copies of out0-out3: their high lanes are still needed after they are overwritten */
    735    xmov        \tmp1,   \out1
    736    xmov        \tmp2,   \out2
    737    xmov        \tmp3,   \out3
    738    xvpermi.q   \out0,   \out4,   0x02 /* 0x02: keep own low lane, high lane = low lane of the second operand; out0 = column 0, rows 0-7 */
    739    xvpermi.q   \out1,   \out5,   0x02 /* out1 = column 1 */
    740    xvpermi.q   \out2,   \out6,   0x02 /* out2 = column 2 */
    741    xvpermi.q   \out3,   \out7,   0x02 /* out3 = column 3 */
    742    xvpermi.q   \out4,   \tmp0,   0x31 /* 0x31: low lane = high lane of the second operand, keep own high lane; out4 = column 4 */
    743    xvpermi.q   \out5,   \tmp1,   0x31 /* out5 = column 5 */
    744    xvpermi.q   \out6,   \tmp2,   0x31 /* out6 = column 6 */
    745    xvpermi.q   \out7,   \tmp3,   0x31 /* out7 = column 7 */
    746 .endm
    747 
    748 /*
    749 * Description : Transpose 4x4 block with double-word elements in vectors
    750 * Arguments   : Inputs  - in0, in1, in2, in3
    751 *               Outputs - out0, out1, out2, out3
    752 * Example     : LASX_TRANSPOSE4x4_D
    753 *         in0 : 1,2,3,4
    754 *         in1 : 1,2,3,4
    755 *         in2 : 1,2,3,4
    756 *         in3 : 1,2,3,4
    757 *
    758 *        out0 : 1,1,1,1
    759 *        out1 : 2,2,2,2
    760 *        out2 : 3,3,3,3
    761 *        out3 : 4,4,4,4
    762 */
    763 .macro LASX_TRANSPOSE4x4_D in0, in1, in2, in3, out0, out1, out2, out3, \
    764                           tmp0, tmp1 /* tmp0/tmp1 are clobbered */
    765    xvilvl.d    \tmp0,   \in1,    \in0 /* rows 0/1: columns 0 (low lane) and 2 (high lane) */
    766    xvilvh.d    \out1,   \in1,    \in0 /* rows 0/1: columns 1 and 3; out1 and tmp0 are written before in2/in3 are read, so they must not alias them */
    767    xvilvh.d    \tmp1,   \in3,    \in2 /* rows 2/3: columns 1 and 3 */
    768    xvilvl.d    \out2,   \in3,    \in2 /* rows 2/3: columns 0 and 2 */
    769 
    770    xvor.v      \out0,   \tmp0,   \tmp0 /* plain copies: xvpermi.q below merges into its destination, so out0/out3 need starting values */
    771    xvor.v      \out3,   \tmp1,   \tmp1
    772 
    773    xvpermi.q   \out0,   \out2,   0x02 /* 0x02: keep own low lane, high lane = low lane of the second operand; out0 = column 0 */
    774    xvpermi.q   \out2,   \tmp0,   0x31 /* 0x31: low lane = high lane of the second operand, keep own high lane; out2 = column 2 */
    775    xvpermi.q   \out3,   \out1,   0x31 /* out3 = column 3; reads out1 before it is overwritten on the next line */
    776    xvpermi.q   \out1,   \tmp1,   0x02 /* out1 = column 1 */
    777 .endm
    777 .endm