SIMD.h (34576B)
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef _MOZILLA_GFX_SIMD_H_
#define _MOZILLA_GFX_SIMD_H_

/**
 * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
 * if they want access to the SSE2 functions.
 */

#include <math.h>
#include <stdint.h>

#ifdef SIMD_COMPILE_SSE2
// emmintrin.h declares the SSE2 intrinsics used below (xmmintrin.h only
// covers SSE).
#  include <emmintrin.h>
#endif

namespace mozilla {
namespace gfx {

namespace simd {

template <typename u8x16_t>
u8x16_t Load8(const uint8_t* aSource);

template <typename u8x16_t>
u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f,
              uint8_t g, uint8_t h, uint8_t i, uint8_t j, uint8_t k, uint8_t l,
              uint8_t m, uint8_t n, uint8_t o, uint8_t p);

template <typename u8x16_t>
u8x16_t FromZero8();

template <typename i16x8_t>
i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e,
                int16_t f, int16_t g, int16_t h);

template <typename u16x8_t>
u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e,
                uint16_t f, uint16_t g, uint16_t h);

template <typename i16x8_t>
i16x8_t FromI16(int16_t a);

template <typename u16x8_t>
u16x8_t FromU16(uint16_t a);

template <typename i32x4_t>
i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);

template <typename i32x4_t>
i32x4_t From32(int32_t a);

template <typename f32x4_t>
f32x4_t FromF32(float a, float b, float c, float d);

template <typename f32x4_t>
f32x4_t FromF32(float a);

// All SIMD backends overload these functions for their SIMD types:

#if 0

// Store 16 bytes to a 16-byte aligned address
void Store8(uint8_t* aTarget, u8x16_t aM);

// Fixed shifts
template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM);
template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM);

i16x8_t Add16(i16x8_t aM1, i16x8_t aM2);
i32x4_t Add32(i32x4_t aM1, i32x4_t aM2);
i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2);
i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2);
u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
i32x4_t Min32(i32x4_t aM1, i32x4_t aM2);
i32x4_t Max32(i32x4_t aM1, i32x4_t aM2);

// Truncating i16 -> i16 multiplication
i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2);

// Long multiplication i16 -> i32
// aFactorsA1B1 = (a1[4] b1[4])
// aFactorsA2B2 = (a2[4] b2[4])
// aProductA = a1 * a2, aProductB = b1 * b2
void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2,
                         i32x4_t& aProductA, i32x4_t& aProductB);

// Long multiplication + pairwise addition i16 -> i32
// See the scalar implementation for specifics.
i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB);
i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB);

// Set all four 32-bit components to the value of the component at aIndex.
template<int8_t aIndex>
i32x4_t Splat32(i32x4_t aM);

// Interpret the input as four 32-bit values, apply Splat32<aIndex> on them,
// re-interpret the result as sixteen 8-bit values.
template<int8_t aIndex>
u8x16_t Splat32On8(u8x16_t aM);

template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
i32x4_t Shuffle32(i32x4_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
i16x8_t ShuffleLo16(i16x8_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3>
i16x8_t ShuffleHi16(i16x8_t aM);

u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2);
u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2);
i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2);
i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2);
i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2);

i16x8_t UnpackLo8x8ToI16x8(u8x16_t m);
i16x8_t UnpackHi8x8ToI16x8(u8x16_t m);
u16x8_t UnpackLo8x8ToU16x8(u8x16_t m);
u16x8_t UnpackHi8x8ToU16x8(u8x16_t m);

i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2);
u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2);
u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3,
                             const i32x4_t& m4);

i32x4_t FastDivideBy255(i32x4_t m);
i16x8_t FastDivideBy255_16(i16x8_t m);

#endif
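// For example: with aFactorsA1B1 = (a0 a1 a2 a3 | b0 b1 b2 b3) and
// aFactorsA2B2 = (c0 c1 c2 c3 | d0 d1 d2 d3), Mul16x4x2x2To32x4x2 produces
// aProductA = (a0*c0, a1*c1, a2*c2, a3*c3) and
// aProductB = (b0*d0, b1*d1, b2*d2, b3*d3), each product widened to 32 bits.
// MulAdd16x8x2To32x4 multiplies and sums adjacent lane pairs: lane k of the
// result is aFactorsA[2k] * aFactorsB[2k] + aFactorsA[2k+1] * aFactorsB[2k+1].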
// Scalar

struct Scalaru8x16_t {
  uint8_t u8[16];
};

union Scalari16x8_t {
  int16_t i16[8];
  uint16_t u16[8];
};

typedef Scalari16x8_t Scalaru16x8_t;

struct Scalari32x4_t {
  int32_t i32[4];
};

struct Scalarf32x4_t {
  float f32[4];
};

template <>
inline Scalaru8x16_t Load8<Scalaru8x16_t>(const uint8_t* aSource) {
  return *(Scalaru8x16_t*)aSource;
}

inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM) {
  *(Scalaru8x16_t*)aTarget = aM;
}

template <>
inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c,
                                          uint8_t d, uint8_t e, uint8_t f,
                                          uint8_t g, uint8_t h, uint8_t i,
                                          uint8_t j, uint8_t k, uint8_t l,
                                          uint8_t m, uint8_t n, uint8_t o,
                                          uint8_t p) {
  Scalaru8x16_t _m;
  _m.u8[0] = a;
  _m.u8[1] = b;
  _m.u8[2] = c;
  _m.u8[3] = d;
  _m.u8[4] = e;
  _m.u8[5] = f;
  _m.u8[6] = g;
  _m.u8[7] = h;
  _m.u8[8 + 0] = i;
  _m.u8[8 + 1] = j;
  _m.u8[8 + 2] = k;
  _m.u8[8 + 3] = l;
  _m.u8[8 + 4] = m;
  _m.u8[8 + 5] = n;
  _m.u8[8 + 6] = o;
  _m.u8[8 + 7] = p;
  return _m;
}

template <>
inline Scalaru8x16_t FromZero8<Scalaru8x16_t>() {
  return From8<Scalaru8x16_t>(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}

template <>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c,
                                            int16_t d, int16_t e, int16_t f,
                                            int16_t g, int16_t h) {
  Scalari16x8_t m;
  m.i16[0] = a;
  m.i16[1] = b;
  m.i16[2] = c;
  m.i16[3] = d;
  m.i16[4] = e;
  m.i16[5] = f;
  m.i16[6] = g;
  m.i16[7] = h;
  return m;
}

template <>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c,
                                            uint16_t d, uint16_t e, uint16_t f,
                                            uint16_t g, uint16_t h) {
  Scalaru16x8_t m;
  m.u16[0] = a;
  m.u16[1] = b;
  m.u16[2] = c;
  m.u16[3] = d;
  m.u16[4] = e;
  m.u16[5] = f;
  m.u16[6] = g;
  m.u16[7] = h;
  return m;
}

template <>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a) {
  return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a);
}

template <>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a) {
  return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a);
}

template <>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c,
                                           int32_t d) {
  Scalari32x4_t m;
  m.i32[0] = a;
  m.i32[1] = b;
  m.i32[2] = c;
  m.i32[3] = d;
  return m;
}

template <>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c,
                                            float d) {
  Scalarf32x4_t m;
  m.f32[0] = a;
  m.f32[1] = b;
  m.f32[2] = c;
  m.f32[3] = d;
  return m;
}

template <>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a) {
  return FromF32<Scalarf32x4_t>(a, a, a, a);
}

template <>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a) {
  return From32<Scalari32x4_t>(a, a, a, a);
}

template <int32_t aNumberOfBits>
inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM) {
  return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits,
                                uint16_t(aM.i16[1]) >> aNumberOfBits,
                                uint16_t(aM.i16[2]) >> aNumberOfBits,
                                uint16_t(aM.i16[3]) >> aNumberOfBits,
                                uint16_t(aM.i16[4]) >> aNumberOfBits,
                                uint16_t(aM.i16[5]) >> aNumberOfBits,
                                uint16_t(aM.i16[6]) >> aNumberOfBits,
                                uint16_t(aM.i16[7]) >> aNumberOfBits);
}

template <int32_t aNumberOfBits>
inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM) {
  return From32<Scalari32x4_t>(
      aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits,
      aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits);
}

inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
  return FromU16<Scalaru16x8_t>(
      aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1], aM1.u16[2] + aM2.u16[2],
      aM1.u16[3] + aM2.u16[3], aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5],
      aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]);
}

inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1],
                               aM1.i32[2] + aM2.i32[2],
                               aM1.i32[3] + aM2.i32[3]);
}

inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
  return FromU16<Scalaru16x8_t>(
      aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1], aM1.u16[2] - aM2.u16[2],
      aM1.u16[3] - aM2.u16[3], aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5],
      aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]);
}

inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1],
                               aM1.i32[2] - aM2.i32[2],
                               aM1.i32[3] - aM2.i32[3]);
}

// Branchless min/max (despite the names, these operate on int32_t lane
// values); assumes a - b does not overflow.
inline int32_t umin(int32_t a, int32_t b) { return a - ((a - b) & -(a > b)); }

inline int32_t umax(int32_t a, int32_t b) { return a - ((a - b) & -(a < b)); }
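// Why the branchless form above works, using umin as the example:
// umin(3, 7): a - b == -4, (a > b) == 0, so the mask -(a > b) is all zeroes
//             and the result is a - 0 == 3.
// umin(7, 3): (a > b) == 1, so the mask is all ones and the result is
//             a - (a - b) == b == 3.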
inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) {
  return From8<Scalaru8x16_t>(
      umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]),
      umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]),
      umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]),
      umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]),
      umin(aM1.u8[8 + 0], aM2.u8[8 + 0]), umin(aM1.u8[8 + 1], aM2.u8[8 + 1]),
      umin(aM1.u8[8 + 2], aM2.u8[8 + 2]), umin(aM1.u8[8 + 3], aM2.u8[8 + 3]),
      umin(aM1.u8[8 + 4], aM2.u8[8 + 4]), umin(aM1.u8[8 + 5], aM2.u8[8 + 5]),
      umin(aM1.u8[8 + 6], aM2.u8[8 + 6]), umin(aM1.u8[8 + 7], aM2.u8[8 + 7]));
}

inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) {
  return From8<Scalaru8x16_t>(
      umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]),
      umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]),
      umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]),
      umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]),
      umax(aM1.u8[8 + 0], aM2.u8[8 + 0]), umax(aM1.u8[8 + 1], aM2.u8[8 + 1]),
      umax(aM1.u8[8 + 2], aM2.u8[8 + 2]), umax(aM1.u8[8 + 3], aM2.u8[8 + 3]),
      umax(aM1.u8[8 + 4], aM2.u8[8 + 4]), umax(aM1.u8[8 + 5], aM2.u8[8 + 5]),
      umax(aM1.u8[8 + 6], aM2.u8[8 + 6]), umax(aM1.u8[8 + 7], aM2.u8[8 + 7]));
}

inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(
      umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]),
      umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3]));
}

inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
  return From32<Scalari32x4_t>(
      umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]),
      umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3]));
}

inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
  return FromU16<Scalaru16x8_t>(
      uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])),
      uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])),
      uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])),
      uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])),
      uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])),
      uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])),
      uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])),
      uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7])));
}

inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1,
                                Scalari16x8_t aFactorsA2B2,
                                Scalari32x4_t& aProductA,
                                Scalari32x4_t& aProductB) {
  aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0],
                                    aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1],
                                    aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2],
                                    aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]);
  aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4],
                                    aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5],
                                    aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6],
                                    aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]);
}

inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA,
                                        Scalari16x8_t aFactorsB) {
  return From32<Scalari32x4_t>(
      aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1],
      aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3],
      aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5],
      aFactorsA.i16[6] * aFactorsB.i16[6] +
          aFactorsA.i16[7] * aFactorsB.i16[7]);
}

template <int8_t aIndex>
inline void AssertIndex() {
  static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3,
                "Invalid splat index");
}

template <int8_t aIndex>
inline Scalari32x4_t Splat32(Scalari32x4_t aM) {
  AssertIndex<aIndex>();
  return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex], aM.i32[aIndex],
                               aM.i32[aIndex]);
}

template <int8_t i>
inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM) {
  AssertIndex<i>();
  return From8<Scalaru8x16_t>(
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3],
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3],
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3],
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3]);
}
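// Note on the index convention used by the shuffles below:
// Shuffle32<i0, i1, i2, i3> writes source lane i3 into result lane 0, i2 into
// lane 1, i1 into lane 2 and i0 into lane 3. This matches the argument order
// of the _MM_SHUFFLE macro used by the SSE2 backend at the end of this file.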
template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari32x4_t Shuffle32(Scalari32x4_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari32x4_t m = aM;
  m.i32[0] = aM.i32[i3];
  m.i32[1] = aM.i32[i2];
  m.i32[2] = aM.i32[i1];
  m.i32[3] = aM.i32[i0];
  return m;
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[0] = aM.i16[i3];
  m.i16[1] = aM.i16[i2];
  m.i16[2] = aM.i16[i1];
  m.i16[3] = aM.i16[i0];
  return m;
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[4 + 0] = aM.i16[4 + i3];
  m.i16[4 + 1] = aM.i16[4 + i2];
  m.i16[4 + 2] = aM.i16[4 + i1];
  m.i16[4 + 3] = aM.i16[4 + i0];
  return m;
}

template <int8_t aIndexLo, int8_t aIndexHi>
inline Scalaru16x8_t Splat16(Scalaru16x8_t aM) {
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  Scalaru16x8_t m;
  uint16_t chosenValueLo = aM.u16[aIndexLo];
  m.u16[0] = chosenValueLo;
  m.u16[1] = chosenValueLo;
  m.u16[2] = chosenValueLo;
  m.u16[3] = chosenValueLo;
  uint16_t chosenValueHi = aM.u16[4 + aIndexHi];
  m.u16[4] = chosenValueHi;
  m.u16[5] = chosenValueHi;
  m.u16[6] = chosenValueHi;
  m.u16[7] = chosenValueHi;
  return m;
}

inline Scalaru8x16_t InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2) {
  return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1], m1.u8[2],
                              m2.u8[2], m1.u8[3], m2.u8[3], m1.u8[4], m2.u8[4],
                              m1.u8[5], m2.u8[5], m1.u8[6], m2.u8[6], m1.u8[7],
                              m2.u8[7]);
}

inline Scalaru8x16_t InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2) {
  return From8<Scalaru8x16_t>(
      m1.u8[8 + 0], m2.u8[8 + 0], m1.u8[8 + 1], m2.u8[8 + 1], m1.u8[8 + 2],
      m2.u8[8 + 2], m1.u8[8 + 3], m2.u8[8 + 3], m1.u8[8 + 4], m2.u8[8 + 4],
      m1.u8[8 + 5], m2.u8[8 + 5], m1.u8[8 + 6], m2.u8[8 + 6], m1.u8[8 + 7],
      m2.u8[8 + 7]);
}

inline Scalaru16x8_t InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2) {
  return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1],
                                m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]);
}

inline Scalaru16x8_t InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2) {
  return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5],
                                m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]);
}

inline Scalari32x4_t InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2) {
  return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]);
}

inline Scalari16x8_t UnpackLo8x8ToI16x8(Scalaru8x16_t aM) {
  Scalari16x8_t m;
  m.i16[0] = aM.u8[0];
  m.i16[1] = aM.u8[1];
  m.i16[2] = aM.u8[2];
  m.i16[3] = aM.u8[3];
  m.i16[4] = aM.u8[4];
  m.i16[5] = aM.u8[5];
  m.i16[6] = aM.u8[6];
  m.i16[7] = aM.u8[7];
  return m;
}

inline Scalari16x8_t UnpackHi8x8ToI16x8(Scalaru8x16_t aM) {
  Scalari16x8_t m;
  m.i16[0] = aM.u8[8 + 0];
  m.i16[1] = aM.u8[8 + 1];
  m.i16[2] = aM.u8[8 + 2];
  m.i16[3] = aM.u8[8 + 3];
  m.i16[4] = aM.u8[8 + 4];
  m.i16[5] = aM.u8[8 + 5];
  m.i16[6] = aM.u8[8 + 6];
  m.i16[7] = aM.u8[8 + 7];
  return m;
}

inline Scalaru16x8_t UnpackLo8x8ToU16x8(Scalaru8x16_t aM) {
  return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]),
                                uint16_t(aM.u8[2]), uint16_t(aM.u8[3]),
                                uint16_t(aM.u8[4]), uint16_t(aM.u8[5]),
                                uint16_t(aM.u8[6]), uint16_t(aM.u8[7]));
}

inline Scalaru16x8_t UnpackHi8x8ToU16x8(Scalaru8x16_t aM) {
  return FromU16<Scalaru16x8_t>(aM.u8[8 + 0], aM.u8[8 + 1], aM.u8[8 + 2],
                                aM.u8[8 + 3], aM.u8[8 + 4], aM.u8[8 + 5],
                                aM.u8[8 + 6], aM.u8[8 + 7]);
}
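// Rotate8<aNumBytes> concatenates its two arguments into a 32-byte sequence
// and extracts bytes aNumBytes .. aNumBytes + 15; for example,
// Rotate8<4>(a, b) yields (a4 a5 ... a15 b0 b1 b2 b3).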
template <uint8_t aNumBytes>
inline Scalaru8x16_t Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678) {
  Scalaru8x16_t m;
  for (uint8_t i = 0; i < 16; i++) {
    uint8_t sourceByte = i + aNumBytes;
    m.u8[i] =
        sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
  }
  return m;
}

template <typename T>
inline int16_t SaturateTo16(T a) {
  return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX) : INT16_MIN);
}

inline Scalari16x8_t PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2) {
  Scalari16x8_t m;
  m.i16[0] = SaturateTo16(m1.i32[0]);
  m.i16[1] = SaturateTo16(m1.i32[1]);
  m.i16[2] = SaturateTo16(m1.i32[2]);
  m.i16[3] = SaturateTo16(m1.i32[3]);
  m.i16[4] = SaturateTo16(m2.i32[0]);
  m.i16[5] = SaturateTo16(m2.i32[1]);
  m.i16[6] = SaturateTo16(m2.i32[2]);
  m.i16[7] = SaturateTo16(m2.i32[3]);
  return m;
}

template <typename T>
inline uint16_t SaturateToU16(T a) {
  // a & -(a >= 0) zeroes negative inputs, so this clamps to [0, INT16_MAX].
  return uint16_t(umin(a & -(a >= 0), INT16_MAX));
}

inline Scalaru16x8_t PackAndSaturate32ToU16(Scalari32x4_t m1,
                                            Scalari32x4_t m2) {
  Scalaru16x8_t m;
  m.u16[0] = SaturateToU16(m1.i32[0]);
  m.u16[1] = SaturateToU16(m1.i32[1]);
  m.u16[2] = SaturateToU16(m1.i32[2]);
  m.u16[3] = SaturateToU16(m1.i32[3]);
  m.u16[4] = SaturateToU16(m2.i32[0]);
  m.u16[5] = SaturateToU16(m2.i32[1]);
  m.u16[6] = SaturateToU16(m2.i32[2]);
  m.u16[7] = SaturateToU16(m2.i32[3]);
  return m;
}

template <typename T>
inline uint8_t SaturateTo8(T a) {
  // Clamps to [0, 255], zeroing negative inputs as above.
  return uint8_t(umin(a & -(a >= 0), 255));
}

inline Scalaru8x16_t PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2,
                                          Scalari32x4_t m3,
                                          const Scalari32x4_t& m4) {
  Scalaru8x16_t m;
  m.u8[0] = SaturateTo8(m1.i32[0]);
  m.u8[1] = SaturateTo8(m1.i32[1]);
  m.u8[2] = SaturateTo8(m1.i32[2]);
  m.u8[3] = SaturateTo8(m1.i32[3]);
  m.u8[4] = SaturateTo8(m2.i32[0]);
  m.u8[5] = SaturateTo8(m2.i32[1]);
  m.u8[6] = SaturateTo8(m2.i32[2]);
  m.u8[7] = SaturateTo8(m2.i32[3]);
  m.u8[8] = SaturateTo8(m3.i32[0]);
  m.u8[9] = SaturateTo8(m3.i32[1]);
  m.u8[10] = SaturateTo8(m3.i32[2]);
  m.u8[11] = SaturateTo8(m3.i32[3]);
  m.u8[12] = SaturateTo8(m4.i32[0]);
  m.u8[13] = SaturateTo8(m4.i32[1]);
  m.u8[14] = SaturateTo8(m4.i32[2]);
  m.u8[15] = SaturateTo8(m4.i32[3]);
  return m;
}

inline Scalaru8x16_t PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2) {
  Scalaru8x16_t m;
  m.u8[0] = SaturateTo8(m1.i16[0]);
  m.u8[1] = SaturateTo8(m1.i16[1]);
  m.u8[2] = SaturateTo8(m1.i16[2]);
  m.u8[3] = SaturateTo8(m1.i16[3]);
  m.u8[4] = SaturateTo8(m1.i16[4]);
  m.u8[5] = SaturateTo8(m1.i16[5]);
  m.u8[6] = SaturateTo8(m1.i16[6]);
  m.u8[7] = SaturateTo8(m1.i16[7]);
  m.u8[8] = SaturateTo8(m2.i16[0]);
  m.u8[9] = SaturateTo8(m2.i16[1]);
  m.u8[10] = SaturateTo8(m2.i16[2]);
  m.u8[11] = SaturateTo8(m2.i16[3]);
  m.u8[12] = SaturateTo8(m2.i16[4]);
  m.u8[13] = SaturateTo8(m2.i16[5]);
  m.u8[14] = SaturateTo8(m2.i16[6]);
  m.u8[15] = SaturateTo8(m2.i16[7]);
  return m;
}
// Fast approximate division by 255. It has the property that
// for all 0 <= n <= 255*255, FastDivideBy255(n) == n/255.
// But it only uses two adds and two shifts instead of an
// integer division (which is expensive on many processors).
//
// equivalent to v/255
template <class B, class A>
inline B FastDivideBy255(A v) {
  return ((v << 8) + v + 255) >> 16;
}

inline Scalaru16x8_t FastDivideBy255_16(Scalaru16x8_t m) {
  return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[1])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[2])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[3])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[4])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[5])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[6])),
                                FastDivideBy255<uint16_t>(int32_t(m.u16[7])));
}

inline Scalari32x4_t FastDivideBy255(Scalari32x4_t m) {
  return From32<Scalari32x4_t>(
      FastDivideBy255<int32_t>(m.i32[0]), FastDivideBy255<int32_t>(m.i32[1]),
      FastDivideBy255<int32_t>(m.i32[2]), FastDivideBy255<int32_t>(m.i32[3]));
}
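// A sketch of why the identity above holds: (v << 8) + v == 257 * v, so the
// helper computes (257 * v + 255) >> 16. Writing v = 255 * q + r with
// 0 <= r < 255 gives 257 * v + 255 == 65536 * q + (257 * r + 255 - q), and
// for 0 <= v <= 255 * 255 the parenthesized term stays within [0, 65536), so
// the shift by 16 returns exactly q == v / 255.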
// Bitwise select: each result bit comes from b where the mask bit is set and
// from a where it is clear.
inline Scalaru8x16_t Pick(Scalaru8x16_t mask, Scalaru8x16_t a,
                          Scalaru8x16_t b) {
  return From8<Scalaru8x16_t>(
      (a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]),
      (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]),
      (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]),
      (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]),
      (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]),
      (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]),
      (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]),
      (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]),
      (a.u8[8 + 0] & (~mask.u8[8 + 0])) | (b.u8[8 + 0] & mask.u8[8 + 0]),
      (a.u8[8 + 1] & (~mask.u8[8 + 1])) | (b.u8[8 + 1] & mask.u8[8 + 1]),
      (a.u8[8 + 2] & (~mask.u8[8 + 2])) | (b.u8[8 + 2] & mask.u8[8 + 2]),
      (a.u8[8 + 3] & (~mask.u8[8 + 3])) | (b.u8[8 + 3] & mask.u8[8 + 3]),
      (a.u8[8 + 4] & (~mask.u8[8 + 4])) | (b.u8[8 + 4] & mask.u8[8 + 4]),
      (a.u8[8 + 5] & (~mask.u8[8 + 5])) | (b.u8[8 + 5] & mask.u8[8 + 5]),
      (a.u8[8 + 6] & (~mask.u8[8 + 6])) | (b.u8[8 + 6] & mask.u8[8 + 6]),
      (a.u8[8 + 7] & (~mask.u8[8 + 7])) | (b.u8[8 + 7] & mask.u8[8 + 7]));
}

inline Scalari32x4_t Pick(Scalari32x4_t mask, Scalari32x4_t a,
                          Scalari32x4_t b) {
  return From32<Scalari32x4_t>(
      (a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]),
      (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]),
      (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]),
      (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3]));
}

// Linear interpolation between a and b with weight t.
inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t) {
  return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t,
                                a.f32[1] + (b.f32[1] - a.f32[1]) * t,
                                a.f32[2] + (b.f32[2] - a.f32[2]) * t,
                                a.f32[3] + (b.f32[3] - a.f32[3]) * t);
}

inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa,
                             float wb) {
  return FromF32<Scalarf32x4_t>(
      a.f32[0] * wa + b.f32[0] * wb, a.f32[1] * wa + b.f32[1] * wb,
      a.f32[2] * wa + b.f32[2] * wb, a.f32[3] * wa + b.f32[3] * wb);
}

inline Scalarf32x4_t AbsF32(Scalarf32x4_t a) {
  return FromF32<Scalarf32x4_t>(fabs(a.f32[0]), fabs(a.f32[1]), fabs(a.f32[2]),
                                fabs(a.f32[3]));
}

inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b) {
  return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0], a.f32[1] + b.f32[1],
                                a.f32[2] + b.f32[2], a.f32[3] + b.f32[3]);
}

inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b) {
  return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0], a.f32[1] * b.f32[1],
                                a.f32[2] * b.f32[2], a.f32[3] * b.f32[3]);
}

inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b) {
  return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0], a.f32[1] / b.f32[1],
                                a.f32[2] / b.f32[2], a.f32[3] / b.f32[3]);
}

template <uint8_t aIndex>
inline Scalarf32x4_t SplatF32(Scalarf32x4_t m) {
  AssertIndex<aIndex>();
  return FromF32<Scalarf32x4_t>(m.f32[aIndex], m.f32[aIndex], m.f32[aIndex],
                                m.f32[aIndex]);
}

// Rounds to nearest, with halfway cases rounding up.
inline Scalari32x4_t F32ToI32(Scalarf32x4_t m) {
  return From32<Scalari32x4_t>(
      int32_t(floor(m.f32[0] + 0.5f)), int32_t(floor(m.f32[1] + 0.5f)),
      int32_t(floor(m.f32[2] + 0.5f)), int32_t(floor(m.f32[3] + 0.5f)));
}
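// An illustrative composition of the scalar primitives above. This helper is
// a sketch added for exposition (ExampleAverage8 is a hypothetical name, not
// part of the original interface): it averages two vectors of sixteen 8-bit
// values by widening to 16 bits so the sums cannot overflow, halving with a
// logical shift, and narrowing back with a saturating pack.
inline Scalaru8x16_t ExampleAverage8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) {
  Scalaru16x8_t lo = Add16(UnpackLo8x8ToU16x8(aM1), UnpackLo8x8ToU16x8(aM2));
  Scalaru16x8_t hi = Add16(UnpackHi8x8ToU16x8(aM1), UnpackHi8x8ToU16x8(aM2));
  return PackAndSaturate16To8(ShiftRight16<1>(lo), ShiftRight16<1>(hi));
}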
#ifdef SIMD_COMPILE_SSE2

// SSE2

template <>
inline __m128i Load8<__m128i>(const uint8_t* aSource) {
  return _mm_load_si128((const __m128i*)aSource);
}

inline void Store8(uint8_t* aTarget, __m128i aM) {
  _mm_store_si128((__m128i*)aTarget, aM);
}

template <>
inline __m128i FromZero8<__m128i>() {
  return _mm_setzero_si128();
}

template <>
inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
                              uint8_t e, uint8_t f, uint8_t g, uint8_t h,
                              uint8_t i, uint8_t j, uint8_t k, uint8_t l,
                              uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
  // Each 16-bit lane holds two bytes, low byte first (little endian), so the
  // even-position argument goes into the low byte of its lane.
  return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (f << 8) + e, (h << 8) + g,
                        (j << 8) + i, (l << 8) + k, (n << 8) + m, (p << 8) + o);
}

template <>
inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d,
                                int16_t e, int16_t f, int16_t g, int16_t h) {
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}

template <>
inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
                                uint16_t e, uint16_t f, uint16_t g,
                                uint16_t h) {
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}

template <>
inline __m128i FromI16<__m128i>(int16_t a) {
  return _mm_set1_epi16(a);
}

template <>
inline __m128i FromU16<__m128i>(uint16_t a) {
  return _mm_set1_epi16((int16_t)a);
}

template <>
inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d) {
  return _mm_setr_epi32(a, b, c, d);
}

template <>
inline __m128i From32<__m128i>(int32_t a) {
  return _mm_set1_epi32(a);
}

template <>
inline __m128 FromF32<__m128>(float a, float b, float c, float d) {
  return _mm_setr_ps(a, b, c, d);
}

template <>
inline __m128 FromF32<__m128>(float a) {
  return _mm_set1_ps(a);
}

template <int32_t aNumberOfBits>
inline __m128i ShiftRight16(__m128i aM) {
  return _mm_srli_epi16(aM, aNumberOfBits);
}

template <int32_t aNumberOfBits>
inline __m128i ShiftRight32(__m128i aM) {
  return _mm_srai_epi32(aM, aNumberOfBits);
}

inline __m128i Add16(__m128i aM1, __m128i aM2) {
  return _mm_add_epi16(aM1, aM2);
}

inline __m128i Add32(__m128i aM1, __m128i aM2) {
  return _mm_add_epi32(aM1, aM2);
}

inline __m128i Sub16(__m128i aM1, __m128i aM2) {
  return _mm_sub_epi16(aM1, aM2);
}

inline __m128i Sub32(__m128i aM1, __m128i aM2) {
  return _mm_sub_epi32(aM1, aM2);
}

inline __m128i Min8(__m128i aM1, __m128i aM2) { return _mm_min_epu8(aM1, aM2); }

inline __m128i Max8(__m128i aM1, __m128i aM2) { return _mm_max_epu8(aM1, aM2); }
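// SSE2 has no 32-bit integer min/max instructions (_mm_min_epi32 and
// _mm_max_epi32 arrived with SSE4.1), so the two functions below emulate them
// with a signed compare and a masked subtract.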
inline __m128i Min32(__m128i aM1, __m128i aM2) {
  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
  __m128i m1_greater_than_m2 = _mm_cmpgt_epi32(aM1, aM2);
  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m1_greater_than_m2));
}

inline __m128i Max32(__m128i aM1, __m128i aM2) {
  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
  __m128i m2_greater_than_m1 = _mm_cmpgt_epi32(aM2, aM1);
  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m2_greater_than_m1));
}

inline __m128i Mul16(__m128i aM1, __m128i aM2) {
  return _mm_mullo_epi16(aM1, aM2);
}

inline __m128i MulU16(__m128i aM1, __m128i aM2) {
  return _mm_mullo_epi16(aM1, aM2);
}

inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1, __m128i aFactorsA2B2,
                                __m128i& aProductA, __m128i& aProductB) {
  // mullo/mulhi produce the low and high 16 bits of each 32-bit product;
  // interleaving them reassembles the widened products.
  __m128i prodAB_lo = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
  __m128i prodAB_hi = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
  aProductA = _mm_unpacklo_epi16(prodAB_lo, prodAB_hi);
  aProductB = _mm_unpackhi_epi16(prodAB_lo, prodAB_hi);
}

inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA, __m128i aFactorsB) {
  return _mm_madd_epi16(aFactorsA, aFactorsB);
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i Shuffle32(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleLo16(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleHi16(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}

template <int8_t aIndex>
inline __m128i Splat32(__m128i aM) {
  return Shuffle32<aIndex, aIndex, aIndex, aIndex>(aM);
}

template <int8_t aIndex>
inline __m128i Splat32On8(__m128i aM) {
  return Shuffle32<aIndex, aIndex, aIndex, aIndex>(aM);
}

template <int8_t aIndexLo, int8_t aIndexHi>
inline __m128i Splat16(__m128i aM) {
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  return ShuffleHi16<aIndexHi, aIndexHi, aIndexHi, aIndexHi>(
      ShuffleLo16<aIndexLo, aIndexLo, aIndexLo, aIndexLo>(aM));
}

inline __m128i UnpackLo8x8ToI16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpacklo_epi8(m, zero);
}

inline __m128i UnpackHi8x8ToI16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpackhi_epi8(m, zero);
}

inline __m128i UnpackLo8x8ToU16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpacklo_epi8(m, zero);
}

inline __m128i UnpackHi8x8ToU16x8(__m128i m) {
  __m128i zero = _mm_set1_epi8(0);
  return _mm_unpackhi_epi8(m, zero);
}

inline __m128i InterleaveLo8(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi8(m1, m2);
}

inline __m128i InterleaveHi8(__m128i m1, __m128i m2) {
  return _mm_unpackhi_epi8(m1, m2);
}

inline __m128i InterleaveLo16(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi16(m1, m2);
}

inline __m128i InterleaveHi16(__m128i m1, __m128i m2) {
  return _mm_unpackhi_epi16(m1, m2);
}

inline __m128i InterleaveLo32(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi32(m1, m2);
}

template <uint8_t aNumBytes>
inline __m128i Rotate8(__m128i a1234, __m128i a5678) {
  return _mm_or_si128(_mm_srli_si128(a1234, aNumBytes),
                      _mm_slli_si128(a5678, 16 - aNumBytes));
}

inline __m128i PackAndSaturate32To16(__m128i m1, __m128i m2) {
  return _mm_packs_epi32(m1, m2);
}
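// Note: SSE2 also has no unsigned 32 -> 16 pack (_mm_packus_epi32 is SSE4.1),
// so the function below reuses the signed pack. It agrees with the scalar
// SaturateToU16 only for inputs in [0, INT16_MAX].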
inline __m128i PackAndSaturate32ToU16(__m128i m1, __m128i m2) {
  return _mm_packs_epi32(m1, m2);
}

inline __m128i PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3,
                                    const __m128i& m4) {
  // Pack into 8 16bit signed integers (saturating).
  __m128i m12 = _mm_packs_epi32(m1, m2);
  __m128i m34 = _mm_packs_epi32(m3, m4);

  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m12, m34);
}

inline __m128i PackAndSaturate16To8(__m128i m1, __m128i m2) {
  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m1, m2);
}

inline __m128i FastDivideBy255(__m128i m) {
  // v = m << 8
  __m128i v = _mm_slli_epi32(m, 8);
  // v = v + (m + (255,255,255,255))
  v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255)));
  // v = v >> 16
  return _mm_srai_epi32(v, 16);
}

inline __m128i FastDivideBy255_16(__m128i m) {
  __m128i zero = _mm_set1_epi16(0);
  __m128i lo = _mm_unpacklo_epi16(m, zero);
  __m128i hi = _mm_unpackhi_epi16(m, zero);
  return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi));
}

inline __m128i Pick(__m128i mask, __m128i a, __m128i b) {
  return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
}

inline __m128 MixF32(__m128 a, __m128 b, float t) {
  return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t)));
}

inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb) {
  return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)),
                    _mm_mul_ps(b, _mm_set1_ps(wb)));
}

inline __m128 AbsF32(__m128 a) {
  return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a);
}

inline __m128 AddF32(__m128 a, __m128 b) { return _mm_add_ps(a, b); }

inline __m128 MulF32(__m128 a, __m128 b) { return _mm_mul_ps(a, b); }

inline __m128 DivF32(__m128 a, __m128 b) { return _mm_div_ps(a, b); }

template <uint8_t aIndex>
inline __m128 SplatF32(__m128 m) {
  AssertIndex<aIndex>();
  return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex));
}

// Rounds to nearest under the default MXCSR mode (ties to even), which can
// differ from the scalar F32ToI32 on exact halfway cases.
inline __m128i F32ToI32(__m128 m) { return _mm_cvtps_epi32(m); }

#endif  // SIMD_COMPILE_SSE2

}  // namespace simd

}  // namespace gfx
}  // namespace mozilla

#endif  // _MOZILLA_GFX_SIMD_H_