tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

loongson-mmintrin.h (23014B)


      1 /*
      2 * Loongson MMI optimizations for libjpeg-turbo
      3 *
      4 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
      5 *                          All Rights Reserved.
      6 * Copyright (C) 2019, D. R. Commander.  All Rights Reserved.
      7 *
      8 * This software is provided 'as-is', without any express or implied
      9 * warranty.  In no event will the authors be held liable for any damages
     10 * arising from the use of this software.
     11 *
     12 * Permission is granted to anyone to use this software for any purpose,
     13 * including commercial applications, and to alter it and redistribute it
     14 * freely, subject to the following restrictions:
     15 *
     16 * 1. The origin of this software must not be misrepresented; you must not
     17 *    claim that you wrote the original software. If you use this software
     18 *    in a product, an acknowledgment in the product documentation would be
     19 *    appreciated but is not required.
     20 * 2. Altered source versions must be plainly marked as such, and must not be
     21 *    misrepresented as being the original software.
     22 * 3. This notice may not be removed or altered from any source distribution.
     23 */
     24 
     25 #ifndef __LOONGSON_MMINTRIN_H__
     26 #define __LOONGSON_MMINTRIN_H__
     27 
     28 #include <stdint.h>
     29 
     30 
     31 #define FUNCTION_ATTRIBS \
     32  __attribute__((__gnu_inline__, __always_inline__, __artificial__))
     33 
     34 
     35 /* Vectors are stored in 64-bit floating-point registers. */
     36 typedef double __m64;
     37 
     38 /* Having a 32-bit datatype allows us to use 32-bit loads in places like
     39   load8888. */
     40 typedef float __m32;
     41 
     42 
     43 /********** Set Operations **********/
     44 
     45 extern __inline __m64 FUNCTION_ATTRIBS
     46 _mm_setzero_si64(void)
     47 {
     48  return 0.0;
     49 }
     50 
     51 extern __inline __m64 FUNCTION_ATTRIBS
     52 _mm_set_pi8(uint8_t __b7, uint8_t __b6, uint8_t __b5, uint8_t __b4,
     53            uint8_t __b3, uint8_t __b2, uint8_t __b1, uint8_t __b0)
     54 {
     55  __m64 ret;
     56  uint32_t lo = ((uint32_t)__b6 << 24) |
     57                ((uint32_t)__b4 << 16) |
     58                ((uint32_t)__b2 << 8) |
     59                (uint32_t)__b0;
     60  uint32_t hi = ((uint32_t)__b7 << 24) |
     61                ((uint32_t)__b5 << 16) |
     62                ((uint32_t)__b3 << 8) |
     63                (uint32_t)__b1;
     64 
     65  asm("mtc1      %1, %0\n\t"
     66      "mtc1      %2, $f0\n\t"
     67      "punpcklbh %0, %0, $f0\n\t"
     68      : "=f" (ret)
     69      : "r" (lo), "r" (hi)
     70      : "$f0"
     71     );
     72 
     73  return ret;
     74 }
     75 
     76 extern __inline __m64 FUNCTION_ATTRIBS
     77 _mm_set_pi16(uint16_t __h3, uint16_t __h2, uint16_t __h1, uint16_t __h0)
     78 {
     79  __m64 ret;
     80  uint32_t lo = ((uint32_t)__h2 << 16) | (uint32_t)__h0;
     81  uint32_t hi = ((uint32_t)__h3 << 16) | (uint32_t)__h1;
     82 
     83  asm("mtc1      %1, %0\n\t"
     84      "mtc1      %2, $f0\n\t"
     85      "punpcklhw %0, %0, $f0\n\t"
     86      : "=f" (ret)
     87      : "r" (lo), "r" (hi)
     88      : "$f0"
     89     );
     90 
     91  return ret;
     92 }
     93 
     94 #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
     95  (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
     96 
     97 extern __inline __m64 FUNCTION_ATTRIBS
     98 _mm_set_pi32(uint32_t __i1, uint32_t __i0)
     99 {
    100  if (__builtin_constant_p(__i1) && __builtin_constant_p(__i0)) {
    101    uint64_t val = ((uint64_t)__i1 << 32) |
    102                   ((uint64_t)__i0 <<  0);
    103 
    104    return *(__m64 *)&val;
    105  } else if (__i1 == __i0) {
    106    uint64_t imm = _MM_SHUFFLE(1, 0, 1, 0);
    107    __m64 ret;
    108 
    109    asm("pshufh %0, %1, %2\n\t"
    110        : "=f" (ret)
    111        : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
    112       );
    113 
    114    return ret;
    115  } else {
    116    uint64_t val = ((uint64_t)__i1 << 32) |
    117                   ((uint64_t)__i0 <<  0);
    118 
    119    return *(__m64 *)&val;
    120  }
    121 }
    122 
    123 extern __inline __m64 FUNCTION_ATTRIBS
    124 _mm_set1_pi8(uint8_t __b0)
    125 {
    126  __m64 ret;
    127 
    128  asm("sll    $8, %1, 8\n\t"
    129      "or     %1, %1, $8\n\t"
    130      "mtc1   %1, %0\n\t"
    131      "mtc1   $0, $f0\n\t"
    132      "pshufh %0, %0, $f0\n\t"
    133      : "=f" (ret)
    134      : "r" (__b0)
    135      : "$8", "$f0"
    136     );
    137 
    138  return ret;
    139 }
    140 
    141 extern __inline __m64 FUNCTION_ATTRIBS
    142 _mm_set1_pi16(uint16_t __h0)
    143 {
    144  __m64 ret;
    145 
    146  asm("mtc1   %1, %0\n\t"
    147      "mtc1   $0, $f0\n\t"
    148      "pshufh %0, %0, $f0\n\t"
    149      : "=f" (ret)
    150      : "r" (__h0)
    151      : "$8", "$f0"
    152     );
    153 
    154  return ret;
    155 }
    156 
    157 extern __inline __m64 FUNCTION_ATTRIBS
    158 _mm_set1_pi32(unsigned __i0)
    159 {
    160  return _mm_set_pi32(__i0, __i0);
    161 }
    162 
    163 extern __inline __m64 FUNCTION_ATTRIBS
    164 _mm_setr_pi8(uint8_t __h0, uint8_t __h1, uint8_t __h2, uint8_t __h3,
    165             uint8_t __h4, uint8_t __h5, uint8_t __h6, uint8_t __h7)
    166 {
    167  return _mm_set_pi8(__h7, __h6, __h5, __h4,
    168                     __h3, __h2, __h1, __h0);
    169 }
    170 
    171 extern __inline __m64 FUNCTION_ATTRIBS
    172 _mm_setr_pi16(uint16_t __w0, uint16_t __w1, uint16_t __w2, uint16_t __w3)
    173 {
    174  return _mm_set_pi16(__w3, __w2, __w1, __w0);
    175 }
    176 
    177 extern __inline __m64 FUNCTION_ATTRIBS
    178 _mm_setr_pi32(uint32_t __i0, uint32_t __i1)
    179 {
    180  return _mm_set_pi32(__i1, __i0);
    181 }
    182 
    183 
    184 /********** Arithmetic Operations **********/
    185 
    186 extern __inline __m64 FUNCTION_ATTRIBS
    187 _mm_add_pi8(__m64 __m1, __m64 __m2)
    188 {
    189  __m64 ret;
    190 
    191  asm("paddb %0, %1, %2\n\t"
    192      : "=f" (ret)
    193      : "f" (__m1), "f" (__m2)
    194     );
    195 
    196  return ret;
    197 }
    198 
    199 extern __inline __m64 FUNCTION_ATTRIBS
    200 _mm_add_pi16(__m64 __m1, __m64 __m2)
    201 {
    202  __m64 ret;
    203 
    204  asm("paddh %0, %1, %2\n\t"
    205      : "=f" (ret)
    206      : "f" (__m1), "f" (__m2)
    207     );
    208 
    209  return ret;
    210 }
    211 
    212 extern __inline __m64 FUNCTION_ATTRIBS
    213 _mm_add_pi32(__m64 __m1, __m64 __m2)
    214 {
    215  __m64 ret;
    216 
    217  asm("paddw %0, %1, %2\n\t"
    218      : "=f" (ret)
    219      : "f" (__m1), "f" (__m2)
    220     );
    221 
    222  return ret;
    223 }
    224 
    225 extern __inline __m64 FUNCTION_ATTRIBS
    226 _mm_add_si64(__m64 __m1, __m64 __m2)
    227 {
    228  __m64 ret;
    229 
    230  asm("paddd %0, %1, %2\n\t"
    231      : "=f" (ret)
    232      : "f" (__m1), "f" (__m2)
    233     );
    234 
    235  return ret;
    236 }
    237 
    238 extern __inline __m64 FUNCTION_ATTRIBS
    239 _mm_adds_pi8(__m64 __m1, __m64 __m2)
    240 {
    241  __m64 ret;
    242 
    243  asm("paddsb %0, %1, %2\n\t"
    244      : "=f" (ret)
    245      : "f" (__m1), "f" (__m2)
    246     );
    247 
    248  return ret;
    249 }
    250 
    251 extern __inline __m64 FUNCTION_ATTRIBS
    252 _mm_adds_pi16(__m64 __m1, __m64 __m2)
    253 {
    254  __m64 ret;
    255 
    256  asm("paddsh %0, %1, %2\n\t"
    257      : "=f" (ret)
    258      : "f" (__m1), "f" (__m2)
    259     );
    260 
    261  return ret;
    262 }
    263 
    264 
    265 extern __inline __m64 FUNCTION_ATTRIBS
    266 _mm_adds_pu8(__m64 __m1, __m64 __m2)
    267 {
    268  __m64 ret;
    269 
    270  asm("paddusb %0, %1, %2\n\t"
    271      : "=f" (ret)
    272      : "f" (__m1), "f" (__m2)
    273     );
    274 
    275  return ret;
    276 }
    277 
    278 extern __inline __m64 FUNCTION_ATTRIBS
    279 _mm_adds_pu16(__m64 __m1, __m64 __m2)
    280 {
    281  __m64 ret;
    282 
    283  asm("paddush %0, %1, %2\n\t"
    284      : "=f" (ret)
    285      : "f" (__m1), "f" (__m2)
    286     );
    287 
    288  return ret;
    289 }
    290 
    291 extern __inline __m64 FUNCTION_ATTRIBS
    292 _mm_avg_pu8(__m64 __m1, __m64 __m2)
    293 {
    294  __m64 ret;
    295 
    296  asm("pavgb %0, %1, %2\n\t"
    297      : "=f" (ret)
    298      : "f" (__m1), "f" (__m2)
    299     );
    300 
    301  return ret;
    302 }
    303 
    304 extern __inline __m64 FUNCTION_ATTRIBS
    305 _mm_avg_pu16(__m64 __m1, __m64 __m2)
    306 {
    307  __m64 ret;
    308 
    309  asm("pavgh %0, %1, %2\n\t"
    310      : "=f" (ret)
    311      : "f" (__m1), "f" (__m2)
    312     );
    313 
    314  return ret;
    315 }
    316 
    317 extern __inline __m64 FUNCTION_ATTRIBS
    318 _mm_madd_pi16(__m64 __m1, __m64 __m2)
    319 {
    320  __m64 ret;
    321 
    322  asm("pmaddhw %0, %1, %2\n\t"
    323      : "=f" (ret)
    324      : "f" (__m1), "f" (__m2)
    325     );
    326 
    327  return ret;
    328 }
    329 
    330 extern __inline __m64 FUNCTION_ATTRIBS
    331 _mm_max_pi16(__m64 __m1, __m64 __m2)
    332 {
    333  __m64 ret;
    334 
    335  asm("pmaxsh %0, %1, %2\n\t"
    336      : "=f" (ret)
    337      : "f" (__m1), "f" (__m2)
    338     );
    339 
    340  return ret;
    341 }
    342 
    343 extern __inline __m64 FUNCTION_ATTRIBS
    344 _mm_max_pu8(__m64 __m1, __m64 __m2)
    345 {
    346  __m64 ret;
    347 
    348  asm("pmaxub %0, %1, %2\n\t"
    349      : "=f" (ret)
    350      : "f" (__m1), "f" (__m2)
    351     );
    352 
    353  return ret;
    354 }
    355 
    356 extern __inline __m64 FUNCTION_ATTRIBS
    357 _mm_min_pi16(__m64 __m1, __m64 __m2)
    358 {
    359  __m64 ret;
    360 
    361  asm("pminsh %0, %1, %2\n\t"
    362      : "=f" (ret)
    363      : "f" (__m1), "f" (__m2)
    364     );
    365 
    366  return ret;
    367 }
    368 
    369 extern __inline __m64 FUNCTION_ATTRIBS
    370 _mm_min_pu8(__m64 __m1, __m64 __m2)
    371 {
    372  __m64 ret;
    373 
    374  asm("pminub %0, %1, %2\n\t"
    375      : "=f" (ret)
    376      : "f" (__m1), "f" (__m2)
    377     );
    378 
    379  return ret;
    380 }
    381 
    382 extern __inline int FUNCTION_ATTRIBS
    383 _mm_movemask_pi8(__m64 __m1)
    384 {
    385  int ret;
    386 
    387  asm("pmovmskb %0, %1\n\t"
    388      : "=r" (ret)
    389      : "y" (__m1)
    390     );
    391 
    392  return ret;
    393 }
    394 
    395 extern __inline __m64 FUNCTION_ATTRIBS
    396 _mm_mulhi_pi16(__m64 __m1, __m64 __m2)
    397 {
    398  __m64 ret;
    399 
    400  asm("pmulhh %0, %1, %2\n\t"
    401      : "=f" (ret)
    402      : "f" (__m1), "f" (__m2)
    403     );
    404 
    405  return ret;
    406 }
    407 
    408 extern __inline __m64 FUNCTION_ATTRIBS
    409 _mm_mulhi_pu16(__m64 __m1, __m64 __m2)
    410 {
    411  __m64 ret;
    412 
    413  asm("pmulhuh %0, %1, %2\n\t"
    414      : "=f" (ret)
    415      : "f" (__m1), "f" (__m2)
    416     );
    417 
    418  return ret;
    419 }
    420 
    421 extern __inline __m64 FUNCTION_ATTRIBS
    422 _mm_mullo_pi16(__m64 __m1, __m64 __m2)
    423 {
    424  __m64 ret;
    425 
    426  asm("pmullh %0, %1, %2\n\t"
    427      : "=f" (ret)
    428      : "f" (__m1), "f" (__m2)
    429     );
    430 
    431  return ret;
    432 }
    433 
    434 extern __inline __m64 FUNCTION_ATTRIBS
    435 _mm_mul_pu32(__m64 __m1, __m64 __m2)
    436 {
    437  __m64 ret;
    438 
    439  asm("pmuluw %0, %1, %2\n\t"
    440      : "=f" (ret)
    441      : "f" (__m1), "f" (__m2)
    442     );
    443 
    444  return ret;
    445 }
    446 
    447 extern __inline __m64 FUNCTION_ATTRIBS
    448 _mm_sad_pu8(__m64 __m1, __m64 __m2)
    449 {
    450  __m64 ret;
    451 
    452  asm("psadbh %0, %1, %2\n\t"
    453      : "=f" (ret)
    454      : "f" (__m1), "f" (__m2)
    455     );
    456 
    457  return ret;
    458 }
    459 
    460 
    461 extern __inline __m64 FUNCTION_ATTRIBS
    462 _mm_asub_pu8(__m64 __m1, __m64 __m2)
    463 {
    464  __m64 ret;
    465 
    466  asm("pasubub %0, %1, %2\n\t"
    467      : "=f" (ret)
    468      : "f" (__m1), "f" (__m2)
    469     );
    470 
    471  return ret;
    472 }
    473 
    474 extern __inline __m64 FUNCTION_ATTRIBS
    475 _mm_biadd_pu8(__m64 __m1, __m64 __m2)
    476 {
    477  __m64 ret;
    478 
    479  asm("biadd %0, %1, %2\n\t"
    480      : "=f" (ret)
    481      : "f" (__m1), "f" (__m2)
    482     );
    483 
    484  return ret;
    485 }
    486 
    487 extern __inline __m64 FUNCTION_ATTRIBS
    488 _mm_sub_pi8(__m64 __m1, __m64 __m2)
    489 {
    490  __m64 ret;
    491 
    492  asm("psubb %0, %1, %2\n\t"
    493      : "=f" (ret)
    494      : "f" (__m1), "f" (__m2)
    495     );
    496 
    497  return ret;
    498 }
    499 
    500 extern __inline __m64 FUNCTION_ATTRIBS
    501 _mm_sub_pi16(__m64 __m1, __m64 __m2)
    502 {
    503  __m64 ret;
    504 
    505  asm("psubh %0, %1, %2\n\t"
    506      : "=f" (ret)
    507      : "f" (__m1), "f" (__m2)
    508     );
    509 
    510  return ret;
    511 }
    512 
    513 extern __inline __m64 FUNCTION_ATTRIBS
    514 _mm_sub_pi32(__m64 __m1, __m64 __m2)
    515 {
    516  __m64 ret;
    517 
    518  asm("psubw %0, %1, %2\n\t"
    519      : "=f" (ret)
    520      : "f" (__m1), "f" (__m2)
    521     );
    522 
    523  return ret;
    524 }
    525 
    526 extern __inline __m64 FUNCTION_ATTRIBS
    527 _mm_sub_si64(__m64 __m1, __m64 __m2)
    528 {
    529  __m64 ret;
    530 
    531  asm("psubd %0, %1, %2\n\t"
    532      : "=f" (ret)
    533      : "f" (__m1), "f" (__m2)
    534     );
    535 
    536  return ret;
    537 }
    538 
    539 extern __inline __m64 FUNCTION_ATTRIBS
    540 _mm_subs_pi8(__m64 __m1, __m64 __m2)
    541 {
    542  __m64 ret;
    543 
    544  asm("psubsb %0, %1, %2\n\t"
    545      : "=f" (ret)
    546      : "f" (__m1), "f" (__m2)
    547     );
    548 
    549  return ret;
    550 }
    551 
    552 extern __inline __m64 FUNCTION_ATTRIBS
    553 _mm_subs_pi16(__m64 __m1, __m64 __m2)
    554 {
    555  __m64 ret;
    556 
    557  asm("psubsh %0, %1, %2\n\t"
    558      : "=f" (ret)
    559      : "f" (__m1), "f" (__m2)
    560     );
    561 
    562  return ret;
    563 }
    564 
    565 
    566 extern __inline __m64 FUNCTION_ATTRIBS
    567 _mm_subs_pu8(__m64 __m1, __m64 __m2)
    568 {
    569  __m64 ret;
    570 
    571  asm("psubusb %0, %1, %2\n\t"
    572      : "=f" (ret)
    573      : "f" (__m1), "f" (__m2)
    574     );
    575 
    576  return ret;
    577 }
    578 
    579 extern __inline __m64 FUNCTION_ATTRIBS
    580 _mm_subs_pu16(__m64 __m1, __m64 __m2)
    581 {
    582  __m64 ret;
    583 
    584  asm("psubush %0, %1, %2\n\t"
    585      : "=f" (ret)
    586      : "f" (__m1), "f" (__m2)
    587     );
    588 
    589  return ret;
    590 }
    591 
    592 
    593 /********** Logical Operations **********/
    594 
    595 extern __inline __m64 FUNCTION_ATTRIBS
    596 _mm_and_si64(__m64 __m1, __m64 __m2)
    597 {
    598  __m64 ret;
    599 
    600  asm("and %0, %1, %2\n\t"
    601      : "=f" (ret)
    602      : "f" (__m1), "f" (__m2)
    603     );
    604 
    605  return ret;
    606 }
    607 
    608 extern __inline __m64 FUNCTION_ATTRIBS
    609 _mm_andnot_si64(__m64 __m1, __m64 __m2)
    610 {
    611  __m64 ret;
    612 
    613  asm("andn %0, %1, %2\n\t"
    614      : "=f" (ret)
    615      : "f" (__m1), "f" (__m2)
    616     );
    617 
    618  return ret;
    619 }
    620 
    621 
    622 extern __inline __m64 FUNCTION_ATTRIBS
    623 _mm_or_si32(__m32 __m1, __m32 __m2)
    624 {
    625  __m32 ret;
    626 
    627  asm("or %0, %1, %2\n\t"
    628      : "=f" (ret)
    629      : "f" (__m1), "f" (__m2)
    630     );
    631 
    632  return ret;
    633 }
    634 
    635 extern __inline __m64 FUNCTION_ATTRIBS
    636 _mm_or_si64(__m64 __m1, __m64 __m2)
    637 {
    638  __m64 ret;
    639 
    640  asm("or %0, %1, %2\n\t"
    641      : "=f" (ret)
    642      : "f" (__m1), "f" (__m2)
    643     );
    644 
    645  return ret;
    646 }
    647 
    648 extern __inline __m64 FUNCTION_ATTRIBS
    649 _mm_xor_si64(__m64 __m1, __m64 __m2)
    650 {
    651  __m64 ret;
    652 
    653  asm("xor %0, %1, %2\n\t"
    654      : "=f" (ret)
    655      : "f" (__m1), "f" (__m2)
    656     );
    657 
    658  return ret;
    659 }
    660 
    661 
    662 /********** Shift Operations **********/
    663 
    664 extern __inline __m64 FUNCTION_ATTRIBS
    665 _mm_slli_pi16(__m64 __m, int64_t __count)
    666 {
    667  __m64 ret;
    668 
    669  asm("psllh  %0, %1, %2\n\t"
    670      : "=f" (ret)
    671      : "f" (__m), "f" (*(__m64 *)&__count)
    672     );
    673 
    674  return ret;
    675 }
    676 
    677 extern __inline __m64 FUNCTION_ATTRIBS
    678 _mm_slli_pi32(__m64 __m, int64_t __count)
    679 {
    680  __m64 ret;
    681 
    682  asm("psllw %0, %1, %2\n\t"
    683      : "=f" (ret)
    684      : "f" (__m), "f" (*(__m64 *)&__count)
    685     );
    686 
    687  return ret;
    688 }
    689 
    690 extern __inline __m64 FUNCTION_ATTRIBS
    691 _mm_slli_si64(__m64 __m, int64_t __count)
    692 {
    693  __m64 ret;
    694 
    695  asm("dsll  %0, %1, %2\n\t"
    696      : "=f" (ret)
    697      : "f" (__m), "f" (*(__m64 *)&__count)
    698     );
    699 
    700  return ret;
    701 }
    702 
    703 extern __inline __m64 FUNCTION_ATTRIBS
    704 _mm_srli_pi16(__m64 __m, int64_t __count)
    705 {
    706  __m64 ret;
    707 
    708  asm("psrlh %0, %1, %2\n\t"
    709      : "=f" (ret)
    710      : "f" (__m), "f" (*(__m64 *)&__count)
    711     );
    712 
    713  return ret;
    714 }
    715 
    716 extern __inline __m64 FUNCTION_ATTRIBS
    717 _mm_srli_pi32(__m64 __m, int64_t __count)
    718 {
    719  __m64 ret;
    720 
    721  asm("psrlw %0, %1, %2\n\t"
    722      : "=f" (ret)
    723      : "f" (__m), "f" (*(__m64 *)&__count)
    724     );
    725 
    726  return ret;
    727 }
    728 
    729 extern __inline __m64 FUNCTION_ATTRIBS
    730 _mm_srli_si64(__m64 __m, int64_t __count)
    731 {
    732  __m64 ret;
    733 
    734  asm("dsrl  %0, %1, %2\n\t"
    735      : "=f" (ret)
    736      : "f" (__m), "f" (*(__m64 *)&__count)
    737     );
    738 
    739  return ret;
    740 }
    741 
    742 extern __inline __m64 FUNCTION_ATTRIBS
    743 _mm_srai_pi16(__m64 __m, int64_t __count)
    744 {
    745  __m64 ret;
    746 
    747  asm("psrah %0, %1, %2\n\t"
    748      : "=f" (ret)
    749      : "f" (__m), "f" (*(__m64 *)&__count)
    750     );
    751 
    752  return ret;
    753 }
    754 
    755 extern __inline __m64 FUNCTION_ATTRIBS
    756 _mm_srai_pi32(__m64 __m, int64_t __count)
    757 {
    758  __m64 ret;
    759 
    760  asm("psraw %0, %1, %2\n\t"
    761      : "=f" (ret)
    762      : "f" (__m), "f" (*(__m64 *)&__count)
    763     );
    764 
    765  return ret;
    766 }
    767 
    768 extern __inline __m64 FUNCTION_ATTRIBS
    769 _mm_srai_si64(__m64 __m, int64_t __count)
    770 {
    771  __m64 ret;
    772 
    773  asm("dsra %0, %1, %2\n\t"
    774      : "=f" (ret)
    775      : "f" (__m), "f" (*(__m64 *)&__count)
    776     );
    777 
    778  return ret;
    779 }
    780 
    781 
    782 /********** Conversion Intrinsics **********/
    783 
    784 extern __inline __m64 FUNCTION_ATTRIBS
    785 to_m64(uint64_t x)
    786 {
    787  return *(__m64 *)&x;
    788 }
    789 
    790 extern __inline uint64_t FUNCTION_ATTRIBS
    791 to_uint64(__m64 x)
    792 {
    793  return *(uint64_t *)&x;
    794 }
    795 
    796 
    797 /********** Comparison Intrinsics **********/
    798 
    799 extern __inline __m64 FUNCTION_ATTRIBS
    800 _mm_cmpeq_pi8(__m64 __m1, __m64 __m2)
    801 {
    802  __m64 ret;
    803 
    804  asm("pcmpeqb %0, %1, %2\n\t"
    805      : "=f" (ret)
    806      : "f" (__m1), "f" (__m2)
    807     );
    808 
    809  return ret;
    810 }
    811 
    812 extern __inline __m64 FUNCTION_ATTRIBS
    813 _mm_cmpeq_pi16(__m64 __m1, __m64 __m2)
    814 {
    815  __m64 ret;
    816 
    817  asm("pcmpeqh %0, %1, %2\n\t"
    818      : "=f" (ret)
    819      : "f" (__m1), "f" (__m2)
    820     );
    821 
    822  return ret;
    823 }
    824 
    825 extern __inline __m64 FUNCTION_ATTRIBS
    826 _mm_cmpeq_pi32(__m64 __m1, __m64 __m2)
    827 {
    828  __m64 ret;
    829 
    830  asm("pcmpeqw %0, %1, %2\n\t"
    831      : "=f" (ret)
    832      : "f" (__m1), "f" (__m2)
    833     );
    834 
    835  return ret;
    836 }
    837 
    838 extern __inline __m64 FUNCTION_ATTRIBS
    839 _mm_cmpgt_pi8(__m64 __m1, __m64 __m2)
    840 {
    841  __m64 ret;
    842 
    843  asm("pcmpgtb %0, %1, %2\n\t"
    844      : "=f" (ret)
    845      : "f" (__m1), "f" (__m2)
    846     );
    847 
    848  return ret;
    849 }
    850 
    851 extern __inline __m64 FUNCTION_ATTRIBS
    852 _mm_cmpgt_pi16(__m64 __m1, __m64 __m2)
    853 {
    854  __m64 ret;
    855 
    856  asm("pcmpgth %0, %1, %2\n\t"
    857      : "=f" (ret)
    858      : "f" (__m1), "f" (__m2)
    859     );
    860 
    861  return ret;
    862 }
    863 
    864 extern __inline __m64 FUNCTION_ATTRIBS
    865 _mm_cmpgt_pi32(__m64 __m1, __m64 __m2)
    866 {
    867  __m64 ret;
    868 
    869  asm("pcmpgtw %0, %1, %2\n\t"
    870      : "=f" (ret)
    871      : "f" (__m1), "f" (__m2)
    872     );
    873 
    874  return ret;
    875 }
    876 
    877 extern __inline __m64 FUNCTION_ATTRIBS
    878 _mm_cmplt_pi8(__m64 __m1, __m64 __m2)
    879 {
    880  __m64 ret;
    881 
    882  asm("pcmpltb %0, %1, %2\n\t"
    883      : "=f" (ret)
    884      : "f" (__m1), "f" (__m2)
    885     );
    886 
    887  return ret;
    888 }
    889 
    890 extern __inline __m64 FUNCTION_ATTRIBS
    891 _mm_cmplt_pi16(__m64 __m1, __m64 __m2)
    892 {
    893  __m64 ret;
    894 
    895  asm("pcmplth %0, %1, %2\n\t"
    896      : "=f" (ret)
    897      : "f" (__m1), "f" (__m2)
    898     );
    899 
    900  return ret;
    901 }
    902 
    903 extern __inline __m64 FUNCTION_ATTRIBS
    904 _mm_cmplt_pi32(__m64 __m1, __m64 __m2)
    905 {
    906  __m64 ret;
    907 
    908  asm("pcmpltw %0, %1, %2\n\t"
    909      : "=f" (ret)
    910      : "f" (__m1), "f" (__m2)
    911     );
    912 
    913  return ret;
    914 }
    915 
    916 
    917 /********** Miscellaneous Operations **********/
    918 
    919 extern __inline __m64 FUNCTION_ATTRIBS
    920 _mm_packs_pi16(__m64 __m1, __m64 __m2)
    921 {
    922  __m64 ret;
    923 
    924  asm("packsshb %0, %1, %2\n\t"
    925      : "=f" (ret)
    926      : "f" (__m1), "f" (__m2)
    927     );
    928 
    929  return ret;
    930 }
    931 
    932 extern __inline __m64 FUNCTION_ATTRIBS
    933 _mm_packs_pi32(__m64 __m1, __m64 __m2)
    934 {
    935  __m64 ret;
    936 
    937  asm("packsswh %0, %1, %2\n\t"
    938      : "=f" (ret)
    939      : "f" (__m1), "f" (__m2)
    940     );
    941 
    942  return ret;
    943 }
    944 
    945 extern __inline __m64 FUNCTION_ATTRIBS
    946 _mm_packs_pi32_f(__m64 __m1, __m64 __m2)
    947 {
    948  __m64 ret;
    949 
    950  asm("packsswh %0, %1, %2\n\t"
    951      : "=f" (ret)
    952      : "f" (__m1), "f" (__m2)
    953     );
    954 
    955  return ret;
    956 }
    957 
    958 extern __inline __m64 FUNCTION_ATTRIBS
    959 _mm_packs_pu16(__m64 __m1, __m64 __m2)
    960 {
    961  __m64 ret;
    962 
    963  asm("packushb %0, %1, %2\n\t"
    964      : "=f" (ret)
    965      : "f" (__m1), "f" (__m2)
    966     );
    967 
    968  return ret;
    969 }
    970 
    971 extern __inline __m64 FUNCTION_ATTRIBS
    972 _mm_extract_pi16(__m64 __m, int64_t __pos)
    973 {
    974  __m64 ret;
    975 
    976  asm("pextrh %0, %1, %2\n\t"
    977      : "=f" (ret)
    978      : "f" (__m), "f" (*(__m64 *)&__pos)
    979     );
    980 
    981  return ret;
    982 }
    983 
    984 extern __inline __m64 FUNCTION_ATTRIBS
    985 _mm_insert_pi16(__m64 __m1, __m64 __m2, int64_t __pos)
    986 {
    987  __m64 ret;
    988 
    989  switch (__pos) {
    990  case 0:
    991 
    992    asm("pinsrh_0 %0, %1, %2\n\t"
    993        : "=f" (ret)
    994        : "f" (__m1), "f" (__m2), "i" (__pos)
    995       );
    996 
    997    break;
    998 
    999  case 1:
   1000 
   1001    asm("pinsrh_1 %0, %1, %2\n\t"
   1002        : "=f" (ret)
   1003        : "f" (__m1), "f" (__m2), "i" (__pos)
   1004       );
   1005 
   1006    break;
   1007  case 2:
   1008 
   1009    asm("pinsrh_2 %0, %1, %2\n\t"
   1010        : "=f" (ret)
   1011        : "f" (__m1), "f" (__m2), "i" (__pos)
   1012       );
   1013 
   1014    break;
   1015 
   1016  case 3:
   1017 
   1018    asm("pinsrh_3 %0, %1, %2\n\t"
   1019        : "=f" (ret)
   1020        : "f" (__m1), "f" (__m2), "i" (__pos)
   1021       );
   1022 
   1023    break;
   1024  }
   1025 
   1026  return ret;
   1027 }
   1028 
   1029 extern __inline __m64 FUNCTION_ATTRIBS
   1030 _mm_shuffle_pi16(__m64 __m, int64_t __n)
   1031 {
   1032  __m64 ret;
   1033 
   1034  asm("pshufh %0, %1, %2\n\t"
   1035      : "=f" (ret)
   1036      : "f" (__m), "f" (*(__m64 *)&__n)
   1037     );
   1038 
   1039  return ret;
   1040 }
   1041 
   1042 extern __inline __m64 FUNCTION_ATTRIBS
   1043 _mm_unpackhi_pi8(__m64 __m1, __m64 __m2)
   1044 {
   1045  __m64 ret;
   1046 
   1047  asm("punpckhbh %0, %1, %2\n\t"
   1048      : "=f" (ret)
   1049      : "f" (__m1), "f" (__m2)
   1050     );
   1051 
   1052  return ret;
   1053 }
   1054 
   1055 extern __inline __m64 FUNCTION_ATTRIBS
   1056 _mm_unpackhi_pi8_f(__m64 __m1, __m64 __m2)
   1057 {
   1058  __m64 ret;
   1059 
   1060  asm("punpckhbh %0, %1, %2\n\t"
   1061      : "=f" (ret)
   1062      : "f" (__m1), "f" (__m2)
   1063     );
   1064 
   1065  return ret;
   1066 }
   1067 
   1068 extern __inline __m64 FUNCTION_ATTRIBS
   1069 _mm_unpackhi_pi16(__m64 __m1, __m64 __m2)
   1070 {
   1071  __m64 ret;
   1072 
   1073  asm("punpckhhw %0, %1, %2\n\t"
   1074      : "=f" (ret)
   1075      : "f" (__m1), "f" (__m2)
   1076     );
   1077 
   1078  return ret;
   1079 }
   1080 
   1081 extern __inline __m64 FUNCTION_ATTRIBS
   1082 _mm_unpackhi_pi16_f(__m64 __m1, __m64 __m2)
   1083 {
   1084  __m64 ret;
   1085 
   1086  asm("punpckhhw %0, %1, %2\n\t"
   1087      : "=f" (ret)
   1088      : "f" (__m1), "f" (__m2)
   1089     );
   1090 
   1091  return ret;
   1092 }
   1093 
   1094 extern __inline __m64 FUNCTION_ATTRIBS
   1095 _mm_unpackhi_pi32(__m64 __m1, __m64 __m2)
   1096 {
   1097  __m64 ret;
   1098 
   1099  asm("punpckhwd %0, %1, %2\n\t"
   1100      : "=f" (ret)
   1101      : "f" (__m1), "f" (__m2)
   1102     );
   1103 
   1104  return ret;
   1105 }
   1106 
   1107 extern __inline __m64 FUNCTION_ATTRIBS
   1108 _mm_unpacklo_pi8(__m64 __m1, __m64 __m2)
   1109 {
   1110  __m64 ret;
   1111 
   1112  asm("punpcklbh %0, %1, %2\n\t"
   1113      : "=f" (ret)
   1114      : "f" (__m1), "f" (__m2)
   1115     );
   1116 
   1117  return ret;
   1118 }
   1119 
   1120 /* Since punpcklbh cares about the high 32-bits, we use the __m64 datatype,
   1121   which preserves the data. */
   1122 
   1123 extern __inline __m64 FUNCTION_ATTRIBS
   1124 _mm_unpacklo_pi8_f64(__m64 __m1, __m64 __m2)
   1125 {
   1126  __m64 ret;
   1127 
   1128  asm("punpcklbh %0, %1, %2\n\t"
   1129      : "=f" (ret)
   1130      : "f" (__m1), "f" (__m2)
   1131     );
   1132 
   1133  return ret;
   1134 }
   1135 
   1136 /* Since punpcklbh doesn't care about the high 32-bits, we use the __m32,
   1137   datatype, which allows load8888 to use 32-bit loads. */
   1138 
   1139 extern __inline __m64 FUNCTION_ATTRIBS
   1140 _mm_unpacklo_pi8_f(__m32 __m1, __m64 __m2)
   1141 {
   1142  __m64 ret;
   1143 
   1144  asm("punpcklbh %0, %1, %2\n\t"
   1145      : "=f" (ret)
   1146      : "f" (__m1), "f" (__m2)
   1147     );
   1148 
   1149  return ret;
   1150 }
   1151 
   1152 extern __inline __m64 FUNCTION_ATTRIBS
   1153 _mm_unpacklo_pi16(__m64 __m1, __m64 __m2)
   1154 {
   1155  __m64 ret;
   1156 
   1157  asm("punpcklhw %0, %1, %2\n\t"
   1158      : "=f" (ret)
   1159      : "f" (__m1), "f" (__m2)
   1160     );
   1161 
   1162  return ret;
   1163 }
   1164 
   1165 extern __inline __m64 FUNCTION_ATTRIBS
   1166 _mm_unpacklo_pi16_f(__m64 __m1, __m64 __m2)
   1167 {
   1168  __m64 ret;
   1169 
   1170  asm("punpcklhw %0, %1, %2\n\t"
   1171      : "=f" (ret)
   1172      : "f" (__m1), "f" (__m2)
   1173     );
   1174 
   1175  return ret;
   1176 }
   1177 
   1178 extern __inline __m64 FUNCTION_ATTRIBS
   1179 _mm_unpacklo_pi32(__m64 __m1, __m64 __m2)
   1180 {
   1181  __m64 ret;
   1182 
   1183  asm("punpcklwd %0, %1, %2\n\t"
   1184      : "=f" (ret)
   1185      : "f" (__m1), "f" (__m2)
   1186     );
   1187 
   1188  return ret;
   1189 }
   1190 
   1191 
   1192 extern __inline __m64 FUNCTION_ATTRIBS
   1193 _mm_unpacklo_pi32_f(__m64 __m1, __m64 __m2)
   1194 {
   1195  __m64 ret;
   1196 
   1197  asm("punpcklwd %0, %1, %2\n\t"
   1198      : "=f" (ret)
   1199      : "f" (__m1), "f" (__m2)
   1200     );
   1201 
   1202  return ret;
   1203 }
   1204 
   1205 extern __inline void FUNCTION_ATTRIBS
   1206 _mm_store_pi32(__m32 *dest, __m64 src)
   1207 {
   1208  src = _mm_packs_pu16(src, _mm_setzero_si64());
   1209 
   1210  asm("swc1 %1, %0\n\t"
   1211      : "=m" (*dest)
   1212      : "f" (src)
   1213      : "memory"
   1214     );
   1215 }
   1216 
   1217 extern __inline void FUNCTION_ATTRIBS
   1218 _mm_store_si64(__m64 *dest, __m64 src)
   1219 {
   1220  asm("sdc1 %1, %0 \n\t"
   1221      : "=m" (*dest)
   1222      : "f" (src)
   1223      : "memory"
   1224     );
   1225 }
   1226 
   1227 extern __inline void FUNCTION_ATTRIBS
   1228 _mm_storeu_si64(__m64 *dest, __m64 src)
   1229 {
   1230  asm("gssdlc1 %1, 7(%0) \n\t"
   1231      "gssdrc1 %1, 0(%0) \n\t"
   1232      :
   1233      : "r" (dest), "f" (src)
   1234      : "memory"
   1235     );
   1236 }
   1237 
   1238 extern __inline __m64 FUNCTION_ATTRIBS
   1239 _mm_load_si32(const __m32 *src)
   1240 {
   1241  __m32 ret;
   1242 
   1243  asm("lwc1 %0, %1\n\t"
   1244      : "=f" (ret)
   1245      : "m" (*src)
   1246     );
   1247 
   1248  return ret;
   1249 }
   1250 
   1251 extern __inline __m64 FUNCTION_ATTRIBS
   1252 _mm_load_si64(const __m64 *src)
   1253 {
   1254  __m64 ret;
   1255 
   1256  asm("ldc1 %0, %1\n\t"
   1257      : "=f" (ret)
   1258      : "m" (*src)
   1259      : "memory"
   1260     );
   1261 
   1262  return ret;
   1263 }
   1264 
   1265 extern __inline __m64 FUNCTION_ATTRIBS
   1266 _mm_loadu_si64(const __m64 *src)
   1267 {
   1268  __m64 ret;
   1269 
   1270  asm("gsldlc1 %0,  7(%1)\n\t"
   1271      "gsldrc1 %0,  0(%1)\n\t"
   1272      : "=f" (ret)
   1273      : "r" (src)
   1274      : "memory"
   1275     );
   1276 
   1277  return ret;
   1278 }
   1279 
   1280 extern __inline __m64 FUNCTION_ATTRIBS
   1281 _mm_loadlo_pi8(const uint32_t *src)
   1282 {
   1283  return _mm_unpacklo_pi8_f(*(__m32 *)src, _mm_setzero_si64());
   1284 }
   1285 
   1286 extern __inline __m64 FUNCTION_ATTRIBS
   1287 _mm_loadlo_pi8_f(__m64 src)
   1288 {
   1289  return _mm_unpacklo_pi8_f64(src, _mm_setzero_si64());
   1290 }
   1291 
   1292 extern __inline __m64 FUNCTION_ATTRIBS
   1293 _mm_loadhi_pi8_f(__m64 src)
   1294 {
   1295  return _mm_unpackhi_pi8_f(src, _mm_setzero_si64());
   1296 }
   1297 
   1298 extern __inline __m64 FUNCTION_ATTRIBS
   1299 _mm_loadlo_pi16(__m64 src)
   1300 {
   1301  return _mm_unpacklo_pi16(src, _mm_setzero_si64());
   1302 }
   1303 
   1304 extern __inline __m64 FUNCTION_ATTRIBS
   1305 _mm_loadlo_pi16_f(__m64 src)
   1306 {
   1307  return _mm_unpacklo_pi16_f(_mm_setzero_si64(), src);
   1308 }
   1309 
   1310 extern __inline __m64 FUNCTION_ATTRIBS
   1311 _mm_loadhi_pi16(__m64 src)
   1312 {
   1313  return _mm_unpackhi_pi16(src, _mm_setzero_si64());
   1314 }
   1315 
   1316 extern __inline __m64 FUNCTION_ATTRIBS
   1317 _mm_loadhi_pi16_f(__m64 src)
   1318 {
   1319  return _mm_unpackhi_pi16_f(_mm_setzero_si64(), src);
   1320 }
   1321 
   1322 extern __inline __m64 FUNCTION_ATTRIBS
   1323 _mm_expand_alpha(__m64 pixel)
   1324 {
   1325  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(3, 3, 3, 3));
   1326 }
   1327 
   1328 extern __inline __m64 FUNCTION_ATTRIBS
   1329 _mm_expand_alpha_rev(__m64 pixel)
   1330 {
   1331  return _mm_shuffle_pi16(pixel, _MM_SHUFFLE(0, 0, 0, 0));
   1332 }
   1333 
   1334 #endif  /* __LOONGSON_MMINTRIN_H__ */