loongson-mmintrin.h
/* The gcc-provided loongson intrinsic functions are way too fucking broken
 * to be of any use, otherwise I'd use them.
 *
 * - The hardware instructions are very similar to MMX or iwMMXt. Certainly
 *   close enough that they could have implemented the _mm_*-style intrinsic
 *   interface and had a ton of optimized code available to them. Instead they
 *   implemented something much, much worse.
 *
 * - pshuf takes a dead first argument, causing extra instructions to be
 *   generated.
 *
 * - There are no 64-bit shift or logical intrinsics, which means you have
 *   to implement them with inline assembly, but this is a nightmare because
 *   gcc doesn't understand that the integer vector datatypes are actually in
 *   floating-point registers, so you end up with braindead code like
 *
 *       punpcklwd $f9,$f9,$f5
 *       dmtc1     v0,$f8
 *       punpcklwd $f19,$f19,$f5
 *       dmfc1     t9,$f9
 *       dmtc1     v0,$f9
 *       dmtc1     t9,$f20
 *       dmfc1     s0,$f19
 *       punpcklbh $f20,$f20,$f2
 *
 * where crap just gets copied back and forth between integer and floating-
 * point registers ad nauseam.
 *
 * Instead of trying to work around the problems from these crap intrinsics, I
 * just implement the _mm_* intrinsics needed for pixman-mmx.c using inline
 * assembly.
 */

#include <stdint.h>

/* vectors are stored in 64-bit floating-point registers */
typedef double __m64;
/* having a 32-bit datatype allows us to use 32-bit loads in places like load8888 */
typedef float __m32;

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_setzero_si64 (void)
{
    return 0.0;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_add_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddush %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_adds_pu8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("paddusb %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_and_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("and %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_cmpeq_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pcmpeqw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_madd_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmaddhw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}
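/* Illustrative sketch (an editorial addition, not part of the original
 * header): the semantics of _mm_madd_pi16 spelled out in scalar C, assuming
 * Loongson's pmaddhw matches MMX pmaddwd -- signed 16x16 multiplies with
 * adjacent products summed into two signed 32-bit lanes.  The _ref name is
 * hypothetical and nothing in this header uses it.
 */
static inline uint64_t
_mm_madd_pi16_ref (uint64_t a, uint64_t b)
{
    /* lane i of a 4x16 vector occupies bits [16*i, 16*i + 15] */
    int64_t lo = (int64_t)(int16_t)(a >>  0) * (int16_t)(b >>  0)
               + (int64_t)(int16_t)(a >> 16) * (int16_t)(b >> 16);
    int64_t hi = (int64_t)(int16_t)(a >> 32) * (int16_t)(b >> 32)
               + (int64_t)(int16_t)(a >> 48) * (int16_t)(b >> 48);
    /* truncate each sum to 32 bits, as the hardware lanes would */
    return ((uint64_t)(uint32_t)hi << 32) | (uint32_t)lo;
}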
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmulhuh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mullo_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("pmullh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_or_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("or %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pu16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("packushb %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_packs_pi32 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("packsswh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi16 (uint16_t __w3, uint16_t __w2, uint16_t __w1, uint16_t __w0)
{
    if (__builtin_constant_p (__w3) &&
        __builtin_constant_p (__w2) &&
        __builtin_constant_p (__w1) &&
        __builtin_constant_p (__w0))
    {
        uint64_t val = ((uint64_t)__w3 << 48)
                     | ((uint64_t)__w2 << 32)
                     | ((uint64_t)__w1 << 16)
                     | ((uint64_t)__w0 <<  0);
        return *(__m64 *)&val;
    }
    else if (__w3 == __w2 && __w2 == __w1 && __w1 == __w0)
    {
        /* TODO: handle other cases */
        uint64_t val = __w3;
        uint64_t imm = _MM_SHUFFLE (0, 0, 0, 0);
        __m64 ret;
        asm("pshufh %0, %1, %2\n\t"
            : "=f" (ret)
            : "f" (*(__m64 *)&val), "f" (*(__m64 *)&imm)
        );
        return ret;
    } else {
        uint64_t val = ((uint64_t)__w3 << 48)
                     | ((uint64_t)__w2 << 32)
                     | ((uint64_t)__w1 << 16)
                     | ((uint64_t)__w0 <<  0);
        return *(__m64 *)&val;
    }
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_set_pi32 (unsigned __i1, unsigned __i0)
{
    if (__builtin_constant_p (__i1) &&
        __builtin_constant_p (__i0))
    {
        uint64_t val = ((uint64_t)__i1 << 32)
                     | ((uint64_t)__i0 <<  0);
        return *(__m64 *)&val;
    }
    else if (__i1 == __i0)
    {
        uint64_t imm = _MM_SHUFFLE (1, 0, 1, 0);
        __m64 ret;
        asm("pshufh %0, %1, %2\n\t"
            : "=f" (ret)
            : "f" (*(__m32 *)&__i1), "f" (*(__m64 *)&imm)
        );
        return ret;
    } else {
        uint64_t val = ((uint64_t)__i1 << 32)
                     | ((uint64_t)__i0 <<  0);
        return *(__m64 *)&val;
    }
}
#undef _MM_SHUFFLE

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __m, int64_t __n)
{
    __m64 ret;
    asm("pshufh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (*(__m64 *)&__n)
    );
    return ret;
}
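/* Illustrative usage sketch (an editorial addition, not from pixman): each
 * 2-bit field of the pshufh selector picks a source lane, so 0xFF -- what
 * _MM_SHUFFLE (3, 3, 3, 3) above evaluates to -- broadcasts the top lane
 * into all four, e.g. to splat the alpha of an expanded a8r8g8b8 pixel.
 * The _example name is hypothetical.
 */
static inline __m64
loongson_splat_hi16_example (__m64 __m)
{
    return _mm_shuffle_pi16 (__m, 0xFF);
}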
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_pi16 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psllh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_slli_si64 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("dsll %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi16 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psrlh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_pi32 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("psrlw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_srli_si64 (__m64 __m, int64_t __count)
{
    __m64 ret;
    asm("dsrl %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (*(__m64 *)&__count)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_sub_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("psubh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpckhbh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpackhi_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpckhhw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklbh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

/* Since punpcklbh doesn't care about the high 32 bits, we use the __m32
 * datatype, which allows load8888 to use 32-bit loads. */
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi8_f (__m32 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklbh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_unpacklo_pi16 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("punpcklhw %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_xor_si64 (__m64 __m1, __m64 __m2)
{
    __m64 ret;
    asm("xor %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2)
    );
    return ret;
}
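/* Illustrative usage sketch (an editorial addition, not from pixman):
 * per-channel multiply of two expanded pixels, dividing by 255 with the
 * exact rounding identity round(x / 255) = (t + (t >> 8)) >> 8 where
 * t = x + 0x80.  This is the kind of building block a pixman-style OVER
 * operator is composed from; the _example name is hypothetical.
 */
static inline __m64
loongson_mul_un8_example (__m64 a, __m64 b)
{
    __m64 t = _mm_mullo_pi16 (a, b);                              /* x = a * b    */
    t = _mm_adds_pu16 (t, _mm_set_pi16 (0x80, 0x80, 0x80, 0x80)); /* t = x + 0x80 */
    t = _mm_adds_pu16 (t, _mm_srli_pi16 (t, 8));                  /* t += t >> 8  */
    return _mm_srli_pi16 (t, 8);                                  /* t >> 8       */
}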
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_extract_pi16 (__m64 __m, int64_t __pos)
{
    __m64 ret;
    asm("pextrh %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m), "f" (*(__m64 *)&__pos)
    );
    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
loongson_insert_pi16 (__m64 __m1, __m64 __m2, int64_t __pos)
{
    __m64 ret;
    asm("pinsrh_%3 %0, %1, %2\n\t"
        : "=f" (ret)
        : "f" (__m1), "f" (__m2), "i" (__pos)
    );
    return ret;
}
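/* Usage note (an editorial addition): pinsrh encodes the lane number in the
 * opcode, which is why __pos goes through the "i" constraint above and must
 * be a compile-time constant.  A hypothetical wrapper -- on my reading of
 * pinsrh, lane 1 of %1 is replaced with the low halfword of %2:
 */
static inline __m64
loongson_set_lane1_example (__m64 __m1, __m64 __m2)
{
    return loongson_insert_pi16 (__m1, __m2, 1);
}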