tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

SIMD.h (34576B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #ifndef _MOZILLA_GFX_SIMD_H_
      8 #define _MOZILLA_GFX_SIMD_H_
      9 
     10 /**
     11 * Consumers of this file need to #define SIMD_COMPILE_SSE2 before including it
     12 * if they want access to the SSE2 functions.
     13 */
     14 
     15 #ifdef SIMD_COMPILE_SSE2
     16 #  include <xmmintrin.h>
     17 #endif
     18 
     19 namespace mozilla {
     20 namespace gfx {
     21 
     22 namespace simd {
     23 
     24 template <typename u8x16_t>
     25 u8x16_t Load8(const uint8_t* aSource);
     26 
     27 template <typename u8x16_t>
     28 u8x16_t From8(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f,
     29              uint8_t g, uint8_t h, uint8_t i, uint8_t j, uint8_t k, uint8_t l,
     30              uint8_t m, uint8_t n, uint8_t o, uint8_t p);
     31 
     32 template <typename u8x16_t>
     33 u8x16_t FromZero8();
     34 
     35 template <typename i16x8_t>
     36 i16x8_t FromI16(int16_t a, int16_t b, int16_t c, int16_t d, int16_t e,
     37                int16_t f, int16_t g, int16_t h);
     38 
     39 template <typename u16x8_t>
     40 u16x8_t FromU16(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e,
     41                uint16_t f, uint16_t g, uint16_t h);
     42 
     43 template <typename i16x8_t>
     44 i16x8_t FromI16(int16_t a);
     45 
     46 template <typename u16x8_t>
     47 u16x8_t FromU16(uint16_t a);
     48 
     49 template <typename i32x4_t>
     50 i32x4_t From32(int32_t a, int32_t b, int32_t c, int32_t d);
     51 
     52 template <typename i32x4_t>
     53 i32x4_t From32(int32_t a);
     54 
     55 template <typename f32x4_t>
     56 f32x4_t FromF32(float a, float b, float c, float d);
     57 
     58 template <typename f32x4_t>
     59 f32x4_t FromF32(float a);
     60 
     61 // All SIMD backends overload these functions for their SIMD types:
     62 
     63 #if 0
     64 
     65 // Store 16 bytes to a 16-byte aligned address
     66 void Store8(uint8_t* aTarget, u8x16_t aM);
     67 
     68 // Fixed shifts
     69 template<int32_t aNumberOfBits> i16x8_t ShiftRight16(i16x8_t aM);
     70 template<int32_t aNumberOfBits> i32x4_t ShiftRight32(i32x4_t aM);
     71 
     72 i16x8_t Add16(i16x8_t aM1, i16x8_t aM2);
     73 i32x4_t Add32(i32x4_t aM1, i32x4_t aM2);
     74 i16x8_t Sub16(i16x8_t aM1, i16x8_t aM2);
     75 i32x4_t Sub32(i32x4_t aM1, i32x4_t aM2);
u8x16_t Min8(u8x16_t aM1, u8x16_t aM2);
u8x16_t Max8(u8x16_t aM1, u8x16_t aM2);
     78 i32x4_t Min32(i32x4_t aM1, i32x4_t aM2);
     79 i32x4_t Max32(i32x4_t aM1, i32x4_t aM2);
     80 
     81 // Truncating i16 -> i16 multiplication
     82 i16x8_t Mul16(i16x8_t aM1, i16x8_t aM2);
     83 
     84 // Long multiplication i16 -> i32
     85 // aFactorsA1B1 = (a1[4] b1[4])
     86 // aFactorsA2B2 = (a2[4] b2[4])
     87 // aProductA = a1 * a2, aProductB = b1 * b2
     88 void Mul16x4x2x2To32x4x2(i16x8_t aFactorsA1B1, i16x8_t aFactorsA2B2,
     89                         i32x4_t& aProductA, i32x4_t& aProductB);
     90 
     91 // Long multiplication + pairwise addition i16 -> i32
     92 // See the scalar implementation for specifics.
     93 i32x4_t MulAdd16x8x2To32x4(i16x8_t aFactorsA, i16x8_t aFactorsB);
     94 i32x4_t MulAdd16x8x2To32x4(u16x8_t aFactorsA, u16x8_t aFactorsB);
     95 
     96 // Set all four 32-bit components to the value of the component at aIndex.
     97 template<int8_t aIndex>
     98 i32x4_t Splat32(i32x4_t aM);
     99 
    100 // Interpret the input as four 32-bit values, apply Splat32<aIndex> on them,
    101 // re-interpret the result as sixteen 8-bit values.
    102 template<int8_t aIndex>
    103 u8x16_t Splat32On8(u8x16_t aM);
    104 
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i32x4_t Shuffle32(i32x4_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleLo16(i16x8_t aM);
template<int8_t i0, int8_t i1, int8_t i2, int8_t i3> i16x8_t ShuffleHi16(i16x8_t aM);
    108 
    109 u8x16_t InterleaveLo8(u8x16_t m1, u8x16_t m2);
    110 u8x16_t InterleaveHi8(u8x16_t m1, u8x16_t m2);
    111 i16x8_t InterleaveLo16(i16x8_t m1, i16x8_t m2);
    112 i16x8_t InterleaveHi16(i16x8_t m1, i16x8_t m2);
    113 i32x4_t InterleaveLo32(i32x4_t m1, i32x4_t m2);
    114 
    115 i16x8_t UnpackLo8x8ToI16x8(u8x16_t m);
    116 i16x8_t UnpackHi8x8ToI16x8(u8x16_t m);
    117 u16x8_t UnpackLo8x8ToU16x8(u8x16_t m);
    118 u16x8_t UnpackHi8x8ToU16x8(u8x16_t m);
    119 
    120 i16x8_t PackAndSaturate32To16(i32x4_t m1, i32x4_t m2);
    121 u8x16_t PackAndSaturate16To8(i16x8_t m1, i16x8_t m2);
    122 u8x16_t PackAndSaturate32To8(i32x4_t m1, i32x4_t m2, i32x4_t m3, const i32x4_t& m4);
    123 
i32x4_t FastDivideBy255(i32x4_t m);
i16x8_t FastDivideBy255_16(i16x8_t m);
    126 
    127 #endif
    128 
    129 // Scalar
    130 
// Scalar fallback "register" types. Each holds 128 bits, mirroring the
// lane layouts the SSE2 backend below maps onto __m128i / __m128:
// sixteen 8-bit lanes, eight 16-bit lanes, four 32-bit lanes, four floats.
struct Scalaru8x16_t {
  uint8_t u8[16];
};

// Union so the same eight 16-bit lanes can be viewed as signed or unsigned
// without casting.
union Scalari16x8_t {
  int16_t i16[8];
  uint16_t u16[8];
};

// Signed and unsigned 16x8 share one representation; only the view differs.
typedef Scalari16x8_t Scalaru16x8_t;

struct Scalari32x4_t {
  int32_t i32[4];
};

struct Scalarf32x4_t {
  float f32[4];
};
    149 
// Load 16 bytes from aSource into a scalar vector.
// NOTE(review): reads through a reinterpreted pointer, like the SSE2
// _mm_load_si128 counterpart below; aSource is presumably suitably
// aligned — confirm at call sites.
template <>
inline Scalaru8x16_t Load8<Scalaru8x16_t>(const uint8_t* aSource) {
  return *(Scalaru8x16_t*)aSource;
}

// Store 16 bytes to aTarget (same aliasing/alignment caveat as Load8).
inline void Store8(uint8_t* aTarget, Scalaru8x16_t aM) {
  *(Scalaru8x16_t*)aTarget = aM;
}

// Build a u8x16 vector from sixteen explicit lane values (a = lane 0,
// p = lane 15).
template <>
inline Scalaru8x16_t From8<Scalaru8x16_t>(uint8_t a, uint8_t b, uint8_t c,
                                          uint8_t d, uint8_t e, uint8_t f,
                                          uint8_t g, uint8_t h, uint8_t i,
                                          uint8_t j, uint8_t k, uint8_t l,
                                          uint8_t m, uint8_t n, uint8_t o,
                                          uint8_t p) {
  Scalaru8x16_t _m;
  _m.u8[0] = a;
  _m.u8[1] = b;
  _m.u8[2] = c;
  _m.u8[3] = d;
  _m.u8[4] = e;
  _m.u8[5] = f;
  _m.u8[6] = g;
  _m.u8[7] = h;
  _m.u8[8 + 0] = i;
  _m.u8[8 + 1] = j;
  _m.u8[8 + 2] = k;
  _m.u8[8 + 3] = l;
  _m.u8[8 + 4] = m;
  _m.u8[8 + 5] = n;
  _m.u8[8 + 6] = o;
  _m.u8[8 + 7] = p;
  return _m;
}

// All-zero u8x16 vector.
template <>
inline Scalaru8x16_t FromZero8<Scalaru8x16_t>() {
  return From8<Scalaru8x16_t>(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
}
    190 
// Build an i16x8 vector from eight explicit lane values (a = lane 0).
template <>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a, int16_t b, int16_t c,
                                            int16_t d, int16_t e, int16_t f,
                                            int16_t g, int16_t h) {
  Scalari16x8_t m;
  m.i16[0] = a;
  m.i16[1] = b;
  m.i16[2] = c;
  m.i16[3] = d;
  m.i16[4] = e;
  m.i16[5] = f;
  m.i16[6] = g;
  m.i16[7] = h;
  return m;
}

// Build a u16x8 vector from eight explicit lane values (a = lane 0).
template <>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a, uint16_t b, uint16_t c,
                                            uint16_t d, uint16_t e, uint16_t f,
                                            uint16_t g, uint16_t h) {
  Scalaru16x8_t m;
  m.u16[0] = a;
  m.u16[1] = b;
  m.u16[2] = c;
  m.u16[3] = d;
  m.u16[4] = e;
  m.u16[5] = f;
  m.u16[6] = g;
  m.u16[7] = h;
  return m;
}

// Broadcast a single signed 16-bit value to all eight lanes.
template <>
inline Scalari16x8_t FromI16<Scalari16x8_t>(int16_t a) {
  return FromI16<Scalari16x8_t>(a, a, a, a, a, a, a, a);
}

// Broadcast a single unsigned 16-bit value to all eight lanes.
template <>
inline Scalaru16x8_t FromU16<Scalaru16x8_t>(uint16_t a) {
  return FromU16<Scalaru16x8_t>(a, a, a, a, a, a, a, a);
}

// Build an i32x4 vector from four explicit lane values (a = lane 0).
template <>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a, int32_t b, int32_t c,
                                           int32_t d) {
  Scalari32x4_t m;
  m.i32[0] = a;
  m.i32[1] = b;
  m.i32[2] = c;
  m.i32[3] = d;
  return m;
}

// Build an f32x4 vector from four explicit lane values (a = lane 0).
template <>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a, float b, float c,
                                            float d) {
  Scalarf32x4_t m;
  m.f32[0] = a;
  m.f32[1] = b;
  m.f32[2] = c;
  m.f32[3] = d;
  return m;
}

// Broadcast a single float to all four lanes.
template <>
inline Scalarf32x4_t FromF32<Scalarf32x4_t>(float a) {
  return FromF32<Scalarf32x4_t>(a, a, a, a);
}

// Broadcast a single 32-bit value to all four lanes.
template <>
inline Scalari32x4_t From32<Scalari32x4_t>(int32_t a) {
  return From32<Scalari32x4_t>(a, a, a, a);
}
    264 
// Lane-wise right shift of eight 16-bit lanes by a compile-time count.
// The cast through uint16_t makes the shift logical (zero-filling) even
// though the lanes are viewed as signed.
template <int32_t aNumberOfBits>
inline Scalari16x8_t ShiftRight16(Scalari16x8_t aM) {
  return FromI16<Scalari16x8_t>(uint16_t(aM.i16[0]) >> aNumberOfBits,
                                uint16_t(aM.i16[1]) >> aNumberOfBits,
                                uint16_t(aM.i16[2]) >> aNumberOfBits,
                                uint16_t(aM.i16[3]) >> aNumberOfBits,
                                uint16_t(aM.i16[4]) >> aNumberOfBits,
                                uint16_t(aM.i16[5]) >> aNumberOfBits,
                                uint16_t(aM.i16[6]) >> aNumberOfBits,
                                uint16_t(aM.i16[7]) >> aNumberOfBits);
}

// Lane-wise right shift of four signed 32-bit lanes.
// NOTE(review): signed >> is sign-extending (arithmetic) on mainstream
// compilers but implementation-defined for negative values before C++20.
template <int32_t aNumberOfBits>
inline Scalari32x4_t ShiftRight32(Scalari32x4_t aM) {
  return From32<Scalari32x4_t>(
      aM.i32[0] >> aNumberOfBits, aM.i32[1] >> aNumberOfBits,
      aM.i32[2] >> aNumberOfBits, aM.i32[3] >> aNumberOfBits);
}
    283 
    284 inline Scalaru16x8_t Add16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
    285  return FromU16<Scalaru16x8_t>(
    286      aM1.u16[0] + aM2.u16[0], aM1.u16[1] + aM2.u16[1], aM1.u16[2] + aM2.u16[2],
    287      aM1.u16[3] + aM2.u16[3], aM1.u16[4] + aM2.u16[4], aM1.u16[5] + aM2.u16[5],
    288      aM1.u16[6] + aM2.u16[6], aM1.u16[7] + aM2.u16[7]);
    289 }
    290 
    291 inline Scalari32x4_t Add32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
    292  return From32<Scalari32x4_t>(aM1.i32[0] + aM2.i32[0], aM1.i32[1] + aM2.i32[1],
    293                               aM1.i32[2] + aM2.i32[2],
    294                               aM1.i32[3] + aM2.i32[3]);
    295 }
    296 
    297 inline Scalaru16x8_t Sub16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
    298  return FromU16<Scalaru16x8_t>(
    299      aM1.u16[0] - aM2.u16[0], aM1.u16[1] - aM2.u16[1], aM1.u16[2] - aM2.u16[2],
    300      aM1.u16[3] - aM2.u16[3], aM1.u16[4] - aM2.u16[4], aM1.u16[5] - aM2.u16[5],
    301      aM1.u16[6] - aM2.u16[6], aM1.u16[7] - aM2.u16[7]);
    302 }
    303 
    304 inline Scalari32x4_t Sub32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
    305  return From32<Scalari32x4_t>(aM1.i32[0] - aM2.i32[0], aM1.i32[1] - aM2.i32[1],
    306                               aM1.i32[2] - aM2.i32[2],
    307                               aM1.i32[3] - aM2.i32[3]);
    308 }
    309 
    310 inline int32_t umin(int32_t a, int32_t b) { return a - ((a - b) & -(a > b)); }
    311 
    312 inline int32_t umax(int32_t a, int32_t b) { return a - ((a - b) & -(a < b)); }
    313 
    314 inline Scalaru8x16_t Min8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) {
    315  return From8<Scalaru8x16_t>(
    316      umin(aM1.u8[0], aM2.u8[0]), umin(aM1.u8[1], aM2.u8[1]),
    317      umin(aM1.u8[2], aM2.u8[2]), umin(aM1.u8[3], aM2.u8[3]),
    318      umin(aM1.u8[4], aM2.u8[4]), umin(aM1.u8[5], aM2.u8[5]),
    319      umin(aM1.u8[6], aM2.u8[6]), umin(aM1.u8[7], aM2.u8[7]),
    320      umin(aM1.u8[8 + 0], aM2.u8[8 + 0]), umin(aM1.u8[8 + 1], aM2.u8[8 + 1]),
    321      umin(aM1.u8[8 + 2], aM2.u8[8 + 2]), umin(aM1.u8[8 + 3], aM2.u8[8 + 3]),
    322      umin(aM1.u8[8 + 4], aM2.u8[8 + 4]), umin(aM1.u8[8 + 5], aM2.u8[8 + 5]),
    323      umin(aM1.u8[8 + 6], aM2.u8[8 + 6]), umin(aM1.u8[8 + 7], aM2.u8[8 + 7]));
    324 }
    325 
    326 inline Scalaru8x16_t Max8(Scalaru8x16_t aM1, Scalaru8x16_t aM2) {
    327  return From8<Scalaru8x16_t>(
    328      umax(aM1.u8[0], aM2.u8[0]), umax(aM1.u8[1], aM2.u8[1]),
    329      umax(aM1.u8[2], aM2.u8[2]), umax(aM1.u8[3], aM2.u8[3]),
    330      umax(aM1.u8[4], aM2.u8[4]), umax(aM1.u8[5], aM2.u8[5]),
    331      umax(aM1.u8[6], aM2.u8[6]), umax(aM1.u8[7], aM2.u8[7]),
    332      umax(aM1.u8[8 + 0], aM2.u8[8 + 0]), umax(aM1.u8[8 + 1], aM2.u8[8 + 1]),
    333      umax(aM1.u8[8 + 2], aM2.u8[8 + 2]), umax(aM1.u8[8 + 3], aM2.u8[8 + 3]),
    334      umax(aM1.u8[8 + 4], aM2.u8[8 + 4]), umax(aM1.u8[8 + 5], aM2.u8[8 + 5]),
    335      umax(aM1.u8[8 + 6], aM2.u8[8 + 6]), umax(aM1.u8[8 + 7], aM2.u8[8 + 7]));
    336 }
    337 
    338 inline Scalari32x4_t Min32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
    339  return From32<Scalari32x4_t>(
    340      umin(aM1.i32[0], aM2.i32[0]), umin(aM1.i32[1], aM2.i32[1]),
    341      umin(aM1.i32[2], aM2.i32[2]), umin(aM1.i32[3], aM2.i32[3]));
    342 }
    343 
    344 inline Scalari32x4_t Max32(Scalari32x4_t aM1, Scalari32x4_t aM2) {
    345  return From32<Scalari32x4_t>(
    346      umax(aM1.i32[0], aM2.i32[0]), umax(aM1.i32[1], aM2.i32[1]),
    347      umax(aM1.i32[2], aM2.i32[2]), umax(aM1.i32[3], aM2.i32[3]));
    348 }
    349 
    350 inline Scalaru16x8_t Mul16(Scalaru16x8_t aM1, Scalaru16x8_t aM2) {
    351  return FromU16<Scalaru16x8_t>(
    352      uint16_t(int32_t(aM1.u16[0]) * int32_t(aM2.u16[0])),
    353      uint16_t(int32_t(aM1.u16[1]) * int32_t(aM2.u16[1])),
    354      uint16_t(int32_t(aM1.u16[2]) * int32_t(aM2.u16[2])),
    355      uint16_t(int32_t(aM1.u16[3]) * int32_t(aM2.u16[3])),
    356      uint16_t(int32_t(aM1.u16[4]) * int32_t(aM2.u16[4])),
    357      uint16_t(int32_t(aM1.u16[5]) * int32_t(aM2.u16[5])),
    358      uint16_t(int32_t(aM1.u16[6]) * int32_t(aM2.u16[6])),
    359      uint16_t(int32_t(aM1.u16[7]) * int32_t(aM2.u16[7])));
    360 }
    361 
// Long multiplication i16 -> i32 of two pairs of 4-lane groups.
// aFactorsA1B1 holds (a1[4] b1[4]); aFactorsA2B2 holds (a2[4] b2[4]).
// aProductA receives the full 32-bit products a1 * a2 (low halves),
// aProductB receives b1 * b2 (high halves).
inline void Mul16x4x2x2To32x4x2(Scalari16x8_t aFactorsA1B1,
                                Scalari16x8_t aFactorsA2B2,
                                Scalari32x4_t& aProductA,
                                Scalari32x4_t& aProductB) {
  aProductA = From32<Scalari32x4_t>(aFactorsA1B1.i16[0] * aFactorsA2B2.i16[0],
                                    aFactorsA1B1.i16[1] * aFactorsA2B2.i16[1],
                                    aFactorsA1B1.i16[2] * aFactorsA2B2.i16[2],
                                    aFactorsA1B1.i16[3] * aFactorsA2B2.i16[3]);
  aProductB = From32<Scalari32x4_t>(aFactorsA1B1.i16[4] * aFactorsA2B2.i16[4],
                                    aFactorsA1B1.i16[5] * aFactorsA2B2.i16[5],
                                    aFactorsA1B1.i16[6] * aFactorsA2B2.i16[6],
                                    aFactorsA1B1.i16[7] * aFactorsA2B2.i16[7]);
}

// Long multiply + pairwise add (i16 -> i32): result lane k =
// A[2k] * B[2k] + A[2k+1] * B[2k+1].
// NOTE(review): the sum can exceed INT32_MAX only for the single input
// pattern 2 * (-32768 * -32768); confirm callers never hit that edge.
inline Scalari32x4_t MulAdd16x8x2To32x4(Scalari16x8_t aFactorsA,
                                        Scalari16x8_t aFactorsB) {
  return From32<Scalari32x4_t>(
      aFactorsA.i16[0] * aFactorsB.i16[0] + aFactorsA.i16[1] * aFactorsB.i16[1],
      aFactorsA.i16[2] * aFactorsB.i16[2] + aFactorsA.i16[3] * aFactorsB.i16[3],
      aFactorsA.i16[4] * aFactorsB.i16[4] + aFactorsA.i16[5] * aFactorsB.i16[5],
      aFactorsA.i16[6] * aFactorsB.i16[6] +
          aFactorsA.i16[7] * aFactorsB.i16[7]);
}
    385 
// Compile-time check that a lane index is in [0, 3].
template <int8_t aIndex>
inline void AssertIndex() {
  static_assert(aIndex == 0 || aIndex == 1 || aIndex == 2 || aIndex == 3,
                "Invalid splat index");
}

// Broadcast the 32-bit lane at aIndex to all four lanes.
template <int8_t aIndex>
inline Scalari32x4_t Splat32(Scalari32x4_t aM) {
  AssertIndex<aIndex>();
  return From32<Scalari32x4_t>(aM.i32[aIndex], aM.i32[aIndex], aM.i32[aIndex],
                               aM.i32[aIndex]);
}

// Broadcast the i-th group of four bytes (i.e. one 32-bit lane) to all
// four 32-bit positions, viewed as sixteen 8-bit lanes.
template <int8_t i>
inline Scalaru8x16_t Splat32On8(Scalaru8x16_t aM) {
  AssertIndex<i>();
  return From8<Scalaru8x16_t>(
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3],
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3],
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3],
      aM.u8[i * 4], aM.u8[i * 4 + 1], aM.u8[i * 4 + 2], aM.u8[i * 4 + 3]);
}
    408 
// Permute the four 32-bit lanes. Template indices are listed from the
// highest result lane down to the lowest (i0 selects result lane 3, i3
// selects result lane 0), matching the SSE _MM_SHUFFLE argument order.
template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari32x4_t Shuffle32(Scalari32x4_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari32x4_t m = aM;
  m.i32[0] = aM.i32[i3];
  m.i32[1] = aM.i32[i2];
  m.i32[2] = aM.i32[i1];
  m.i32[3] = aM.i32[i0];
  return m;
}

// Permute the low four 16-bit lanes (same reversed index convention as
// Shuffle32); the high four lanes are copied through unchanged.
template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleLo16(Scalari16x8_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[0] = aM.i16[i3];
  m.i16[1] = aM.i16[i2];
  m.i16[2] = aM.i16[i1];
  m.i16[3] = aM.i16[i0];
  return m;
}

// Permute the high four 16-bit lanes (indices are relative to lane 4);
// the low four lanes are copied through unchanged.
template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline Scalari16x8_t ShuffleHi16(Scalari16x8_t aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  Scalari16x8_t m = aM;
  m.i16[4 + 0] = aM.i16[4 + i3];
  m.i16[4 + 1] = aM.i16[4 + i2];
  m.i16[4 + 2] = aM.i16[4 + i1];
  m.i16[4 + 3] = aM.i16[4 + i0];
  return m;
}
    450 
// Broadcast one 16-bit lane across each half of the vector: lane aIndexLo
// of the low half fills lanes 0-3, lane (4 + aIndexHi) fills lanes 4-7.
template <int8_t aIndexLo, int8_t aIndexHi>
inline Scalaru16x8_t Splat16(Scalaru16x8_t aM) {
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  Scalaru16x8_t m;
  // The value round-trips through int16_t; bits are preserved mod 2^16.
  int16_t chosenValueLo = aM.u16[aIndexLo];
  m.u16[0] = chosenValueLo;
  m.u16[1] = chosenValueLo;
  m.u16[2] = chosenValueLo;
  m.u16[3] = chosenValueLo;
  int16_t chosenValueHi = aM.u16[4 + aIndexHi];
  m.u16[4] = chosenValueHi;
  m.u16[5] = chosenValueHi;
  m.u16[6] = chosenValueHi;
  m.u16[7] = chosenValueHi;
  return m;
}
    468 
    469 inline Scalaru8x16_t InterleaveLo8(Scalaru8x16_t m1, Scalaru8x16_t m2) {
    470  return From8<Scalaru8x16_t>(m1.u8[0], m2.u8[0], m1.u8[1], m2.u8[1], m1.u8[2],
    471                              m2.u8[2], m1.u8[3], m2.u8[3], m1.u8[4], m2.u8[4],
    472                              m1.u8[5], m2.u8[5], m1.u8[6], m2.u8[6], m1.u8[7],
    473                              m2.u8[7]);
    474 }
    475 
    476 inline Scalaru8x16_t InterleaveHi8(Scalaru8x16_t m1, Scalaru8x16_t m2) {
    477  return From8<Scalaru8x16_t>(
    478      m1.u8[8 + 0], m2.u8[8 + 0], m1.u8[8 + 1], m2.u8[8 + 1], m1.u8[8 + 2],
    479      m2.u8[8 + 2], m1.u8[8 + 3], m2.u8[8 + 3], m1.u8[8 + 4], m2.u8[8 + 4],
    480      m1.u8[8 + 5], m2.u8[8 + 5], m1.u8[8 + 6], m2.u8[8 + 6], m1.u8[8 + 7],
    481      m2.u8[8 + 7]);
    482 }
    483 
    484 inline Scalaru16x8_t InterleaveLo16(Scalaru16x8_t m1, Scalaru16x8_t m2) {
    485  return FromU16<Scalaru16x8_t>(m1.u16[0], m2.u16[0], m1.u16[1], m2.u16[1],
    486                                m1.u16[2], m2.u16[2], m1.u16[3], m2.u16[3]);
    487 }
    488 
    489 inline Scalaru16x8_t InterleaveHi16(Scalaru16x8_t m1, Scalaru16x8_t m2) {
    490  return FromU16<Scalaru16x8_t>(m1.u16[4], m2.u16[4], m1.u16[5], m2.u16[5],
    491                                m1.u16[6], m2.u16[6], m1.u16[7], m2.u16[7]);
    492 }
    493 
    494 inline Scalari32x4_t InterleaveLo32(Scalari32x4_t m1, Scalari32x4_t m2) {
    495  return From32<Scalari32x4_t>(m1.i32[0], m2.i32[0], m1.i32[1], m2.i32[1]);
    496 }
    497 
// Zero-extend the low eight bytes to eight signed 16-bit lanes.
// (Source values are <= 255, so the result is always non-negative and
// identical to the "U16" variant below.)
inline Scalari16x8_t UnpackLo8x8ToI16x8(Scalaru8x16_t aM) {
  Scalari16x8_t m;
  m.i16[0] = aM.u8[0];
  m.i16[1] = aM.u8[1];
  m.i16[2] = aM.u8[2];
  m.i16[3] = aM.u8[3];
  m.i16[4] = aM.u8[4];
  m.i16[5] = aM.u8[5];
  m.i16[6] = aM.u8[6];
  m.i16[7] = aM.u8[7];
  return m;
}

// Zero-extend the high eight bytes to eight signed 16-bit lanes.
inline Scalari16x8_t UnpackHi8x8ToI16x8(Scalaru8x16_t aM) {
  Scalari16x8_t m;
  m.i16[0] = aM.u8[8 + 0];
  m.i16[1] = aM.u8[8 + 1];
  m.i16[2] = aM.u8[8 + 2];
  m.i16[3] = aM.u8[8 + 3];
  m.i16[4] = aM.u8[8 + 4];
  m.i16[5] = aM.u8[8 + 5];
  m.i16[6] = aM.u8[8 + 6];
  m.i16[7] = aM.u8[8 + 7];
  return m;
}

// Zero-extend the low eight bytes to eight unsigned 16-bit lanes.
inline Scalaru16x8_t UnpackLo8x8ToU16x8(Scalaru8x16_t aM) {
  return FromU16<Scalaru16x8_t>(uint16_t(aM.u8[0]), uint16_t(aM.u8[1]),
                                uint16_t(aM.u8[2]), uint16_t(aM.u8[3]),
                                uint16_t(aM.u8[4]), uint16_t(aM.u8[5]),
                                uint16_t(aM.u8[6]), uint16_t(aM.u8[7]));
}

// Zero-extend the high eight bytes to eight unsigned 16-bit lanes.
inline Scalaru16x8_t UnpackHi8x8ToU16x8(Scalaru8x16_t aM) {
  return FromU16<Scalaru16x8_t>(aM.u8[8 + 0], aM.u8[8 + 1], aM.u8[8 + 2],
                                aM.u8[8 + 3], aM.u8[8 + 4], aM.u8[8 + 5],
                                aM.u8[8 + 6], aM.u8[8 + 7]);
}
    536 
// View (a1234 ++ a5678) as a 32-byte buffer and return bytes
// [aNumBytes, aNumBytes + 16) — i.e. the input pair shifted left by
// aNumBytes bytes. Requires aNumBytes <= 16; larger values would index
// past the end of a5678.
template <uint8_t aNumBytes>
inline Scalaru8x16_t Rotate8(Scalaru8x16_t a1234, Scalaru8x16_t a5678) {
  Scalaru8x16_t m;
  for (uint8_t i = 0; i < 16; i++) {
    uint8_t sourceByte = i + aNumBytes;
    // Take from the first vector while in range, then from the second.
    m.u8[i] =
        sourceByte < 16 ? a1234.u8[sourceByte] : a5678.u8[sourceByte - 16];
  }
  return m;
}
    547 
    548 template <typename T>
    549 inline int16_t SaturateTo16(T a) {
    550  return int16_t(a >= INT16_MIN ? (a <= INT16_MAX ? a : INT16_MAX) : INT16_MIN);
    551 }
    552 
    553 inline Scalari16x8_t PackAndSaturate32To16(Scalari32x4_t m1, Scalari32x4_t m2) {
    554  Scalari16x8_t m;
    555  m.i16[0] = SaturateTo16(m1.i32[0]);
    556  m.i16[1] = SaturateTo16(m1.i32[1]);
    557  m.i16[2] = SaturateTo16(m1.i32[2]);
    558  m.i16[3] = SaturateTo16(m1.i32[3]);
    559  m.i16[4] = SaturateTo16(m2.i32[0]);
    560  m.i16[5] = SaturateTo16(m2.i32[1]);
    561  m.i16[6] = SaturateTo16(m2.i32[2]);
    562  m.i16[7] = SaturateTo16(m2.i32[3]);
    563  return m;
    564 }
    565 
    566 template <typename T>
    567 inline uint16_t SaturateToU16(T a) {
    568  return uint16_t(umin(a & -(a >= 0), INT16_MAX));
    569 }
    570 
    571 inline Scalaru16x8_t PackAndSaturate32ToU16(Scalari32x4_t m1,
    572                                            Scalari32x4_t m2) {
    573  Scalaru16x8_t m;
    574  m.u16[0] = SaturateToU16(m1.i32[0]);
    575  m.u16[1] = SaturateToU16(m1.i32[1]);
    576  m.u16[2] = SaturateToU16(m1.i32[2]);
    577  m.u16[3] = SaturateToU16(m1.i32[3]);
    578  m.u16[4] = SaturateToU16(m2.i32[0]);
    579  m.u16[5] = SaturateToU16(m2.i32[1]);
    580  m.u16[6] = SaturateToU16(m2.i32[2]);
    581  m.u16[7] = SaturateToU16(m2.i32[3]);
    582  return m;
    583 }
    584 
    585 template <typename T>
    586 inline uint8_t SaturateTo8(T a) {
    587  return uint8_t(umin(a & -(a >= 0), 255));
    588 }
    589 
    590 inline Scalaru8x16_t PackAndSaturate32To8(Scalari32x4_t m1, Scalari32x4_t m2,
    591                                          Scalari32x4_t m3,
    592                                          const Scalari32x4_t& m4) {
    593  Scalaru8x16_t m;
    594  m.u8[0] = SaturateTo8(m1.i32[0]);
    595  m.u8[1] = SaturateTo8(m1.i32[1]);
    596  m.u8[2] = SaturateTo8(m1.i32[2]);
    597  m.u8[3] = SaturateTo8(m1.i32[3]);
    598  m.u8[4] = SaturateTo8(m2.i32[0]);
    599  m.u8[5] = SaturateTo8(m2.i32[1]);
    600  m.u8[6] = SaturateTo8(m2.i32[2]);
    601  m.u8[7] = SaturateTo8(m2.i32[3]);
    602  m.u8[8] = SaturateTo8(m3.i32[0]);
    603  m.u8[9] = SaturateTo8(m3.i32[1]);
    604  m.u8[10] = SaturateTo8(m3.i32[2]);
    605  m.u8[11] = SaturateTo8(m3.i32[3]);
    606  m.u8[12] = SaturateTo8(m4.i32[0]);
    607  m.u8[13] = SaturateTo8(m4.i32[1]);
    608  m.u8[14] = SaturateTo8(m4.i32[2]);
    609  m.u8[15] = SaturateTo8(m4.i32[3]);
    610  return m;
    611 }
    612 
    613 inline Scalaru8x16_t PackAndSaturate16To8(Scalari16x8_t m1, Scalari16x8_t m2) {
    614  Scalaru8x16_t m;
    615  m.u8[0] = SaturateTo8(m1.i16[0]);
    616  m.u8[1] = SaturateTo8(m1.i16[1]);
    617  m.u8[2] = SaturateTo8(m1.i16[2]);
    618  m.u8[3] = SaturateTo8(m1.i16[3]);
    619  m.u8[4] = SaturateTo8(m1.i16[4]);
    620  m.u8[5] = SaturateTo8(m1.i16[5]);
    621  m.u8[6] = SaturateTo8(m1.i16[6]);
    622  m.u8[7] = SaturateTo8(m1.i16[7]);
    623  m.u8[8] = SaturateTo8(m2.i16[0]);
    624  m.u8[9] = SaturateTo8(m2.i16[1]);
    625  m.u8[10] = SaturateTo8(m2.i16[2]);
    626  m.u8[11] = SaturateTo8(m2.i16[3]);
    627  m.u8[12] = SaturateTo8(m2.i16[4]);
    628  m.u8[13] = SaturateTo8(m2.i16[5]);
    629  m.u8[14] = SaturateTo8(m2.i16[6]);
    630  m.u8[15] = SaturateTo8(m2.i16[7]);
    631  return m;
    632 }
    633 
    634 // Fast approximate division by 255. It has the property that
    635 // for all 0 <= n <= 255*255, FAST_DIVIDE_BY_255(n) == n/255.
    636 // But it only uses two adds and two shifts instead of an
    637 // integer division (which is expensive on many processors).
    638 //
    639 // equivalent to v/255
    640 template <class B, class A>
    641 inline B FastDivideBy255(A v) {
    642  return ((v << 8) + v + 255) >> 16;
    643 }
    644 
    645 inline Scalaru16x8_t FastDivideBy255_16(Scalaru16x8_t m) {
    646  return FromU16<Scalaru16x8_t>(FastDivideBy255<uint16_t>(int32_t(m.u16[0])),
    647                                FastDivideBy255<uint16_t>(int32_t(m.u16[1])),
    648                                FastDivideBy255<uint16_t>(int32_t(m.u16[2])),
    649                                FastDivideBy255<uint16_t>(int32_t(m.u16[3])),
    650                                FastDivideBy255<uint16_t>(int32_t(m.u16[4])),
    651                                FastDivideBy255<uint16_t>(int32_t(m.u16[5])),
    652                                FastDivideBy255<uint16_t>(int32_t(m.u16[6])),
    653                                FastDivideBy255<uint16_t>(int32_t(m.u16[7])));
    654 }
    655 
    656 inline Scalari32x4_t FastDivideBy255(Scalari32x4_t m) {
    657  return From32<Scalari32x4_t>(
    658      FastDivideBy255<int32_t>(m.i32[0]), FastDivideBy255<int32_t>(m.i32[1]),
    659      FastDivideBy255<int32_t>(m.i32[2]), FastDivideBy255<int32_t>(m.i32[3]));
    660 }
    661 
    662 inline Scalaru8x16_t Pick(Scalaru8x16_t mask, Scalaru8x16_t a,
    663                          Scalaru8x16_t b) {
    664  return From8<Scalaru8x16_t>(
    665      (a.u8[0] & (~mask.u8[0])) | (b.u8[0] & mask.u8[0]),
    666      (a.u8[1] & (~mask.u8[1])) | (b.u8[1] & mask.u8[1]),
    667      (a.u8[2] & (~mask.u8[2])) | (b.u8[2] & mask.u8[2]),
    668      (a.u8[3] & (~mask.u8[3])) | (b.u8[3] & mask.u8[3]),
    669      (a.u8[4] & (~mask.u8[4])) | (b.u8[4] & mask.u8[4]),
    670      (a.u8[5] & (~mask.u8[5])) | (b.u8[5] & mask.u8[5]),
    671      (a.u8[6] & (~mask.u8[6])) | (b.u8[6] & mask.u8[6]),
    672      (a.u8[7] & (~mask.u8[7])) | (b.u8[7] & mask.u8[7]),
    673      (a.u8[8 + 0] & (~mask.u8[8 + 0])) | (b.u8[8 + 0] & mask.u8[8 + 0]),
    674      (a.u8[8 + 1] & (~mask.u8[8 + 1])) | (b.u8[8 + 1] & mask.u8[8 + 1]),
    675      (a.u8[8 + 2] & (~mask.u8[8 + 2])) | (b.u8[8 + 2] & mask.u8[8 + 2]),
    676      (a.u8[8 + 3] & (~mask.u8[8 + 3])) | (b.u8[8 + 3] & mask.u8[8 + 3]),
    677      (a.u8[8 + 4] & (~mask.u8[8 + 4])) | (b.u8[8 + 4] & mask.u8[8 + 4]),
    678      (a.u8[8 + 5] & (~mask.u8[8 + 5])) | (b.u8[8 + 5] & mask.u8[8 + 5]),
    679      (a.u8[8 + 6] & (~mask.u8[8 + 6])) | (b.u8[8 + 6] & mask.u8[8 + 6]),
    680      (a.u8[8 + 7] & (~mask.u8[8 + 7])) | (b.u8[8 + 7] & mask.u8[8 + 7]));
    681 }
    682 
    683 inline Scalari32x4_t Pick(Scalari32x4_t mask, Scalari32x4_t a,
    684                          Scalari32x4_t b) {
    685  return From32<Scalari32x4_t>(
    686      (a.i32[0] & (~mask.i32[0])) | (b.i32[0] & mask.i32[0]),
    687      (a.i32[1] & (~mask.i32[1])) | (b.i32[1] & mask.i32[1]),
    688      (a.i32[2] & (~mask.i32[2])) | (b.i32[2] & mask.i32[2]),
    689      (a.i32[3] & (~mask.i32[3])) | (b.i32[3] & mask.i32[3]));
    690 }
    691 
    692 inline Scalarf32x4_t MixF32(Scalarf32x4_t a, Scalarf32x4_t b, float t) {
    693  return FromF32<Scalarf32x4_t>(a.f32[0] + (b.f32[0] - a.f32[0]) * t,
    694                                a.f32[1] + (b.f32[1] - a.f32[1]) * t,
    695                                a.f32[2] + (b.f32[2] - a.f32[2]) * t,
    696                                a.f32[3] + (b.f32[3] - a.f32[3]) * t);
    697 }
    698 
    699 inline Scalarf32x4_t WSumF32(Scalarf32x4_t a, Scalarf32x4_t b, float wa,
    700                             float wb) {
    701  return FromF32<Scalarf32x4_t>(
    702      a.f32[0] * wa + b.f32[0] * wb, a.f32[1] * wa + b.f32[1] * wb,
    703      a.f32[2] * wa + b.f32[2] * wb, a.f32[3] * wa + b.f32[3] * wb);
    704 }
    705 
    706 inline Scalarf32x4_t AbsF32(Scalarf32x4_t a) {
    707  return FromF32<Scalarf32x4_t>(fabs(a.f32[0]), fabs(a.f32[1]), fabs(a.f32[2]),
    708                                fabs(a.f32[3]));
    709 }
    710 
    711 inline Scalarf32x4_t AddF32(Scalarf32x4_t a, Scalarf32x4_t b) {
    712  return FromF32<Scalarf32x4_t>(a.f32[0] + b.f32[0], a.f32[1] + b.f32[1],
    713                                a.f32[2] + b.f32[2], a.f32[3] + b.f32[3]);
    714 }
    715 
    716 inline Scalarf32x4_t MulF32(Scalarf32x4_t a, Scalarf32x4_t b) {
    717  return FromF32<Scalarf32x4_t>(a.f32[0] * b.f32[0], a.f32[1] * b.f32[1],
    718                                a.f32[2] * b.f32[2], a.f32[3] * b.f32[3]);
    719 }
    720 
    721 inline Scalarf32x4_t DivF32(Scalarf32x4_t a, Scalarf32x4_t b) {
    722  return FromF32<Scalarf32x4_t>(a.f32[0] / b.f32[0], a.f32[1] / b.f32[1],
    723                                a.f32[2] / b.f32[2], a.f32[3] / b.f32[3]);
    724 }
    725 
// Broadcast the float lane at aIndex to all four lanes.
template <uint8_t aIndex>
inline Scalarf32x4_t SplatF32(Scalarf32x4_t m) {
  AssertIndex<aIndex>();
  return FromF32<Scalarf32x4_t>(m.f32[aIndex], m.f32[aIndex], m.f32[aIndex],
                                m.f32[aIndex]);
}

// Convert each float lane to int32 with round-half-up (floor(x + 0.5)),
// not round-to-nearest-even.
inline Scalari32x4_t F32ToI32(Scalarf32x4_t m) {
  return From32<Scalari32x4_t>(
      int32_t(floor(m.f32[0] + 0.5f)), int32_t(floor(m.f32[1] + 0.5f)),
      int32_t(floor(m.f32[2] + 0.5f)), int32_t(floor(m.f32[3] + 0.5f)));
}
    738 
    739 #ifdef SIMD_COMPILE_SSE2
    740 
    741 // SSE2
    742 
    743 template <>
    744 inline __m128i Load8<__m128i>(const uint8_t* aSource) {
    745  return _mm_load_si128((const __m128i*)aSource);
    746 }
    747 
    748 inline void Store8(uint8_t* aTarget, __m128i aM) {
    749  _mm_store_si128((__m128i*)aTarget, aM);
    750 }
    751 
// Returns a vector with all 16 bytes set to zero.
template <>
inline __m128i FromZero8<__m128i>() {
  return _mm_setzero_si128();
}
    756 
    757 template <>
    758 inline __m128i From8<__m128i>(uint8_t a, uint8_t b, uint8_t c, uint8_t d,
    759                              uint8_t e, uint8_t f, uint8_t g, uint8_t h,
    760                              uint8_t i, uint8_t j, uint8_t k, uint8_t l,
    761                              uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
    762  return _mm_setr_epi16((b << 8) + a, (d << 8) + c, (e << 8) + f, (h << 8) + g,
    763                        (j << 8) + i, (l << 8) + k, (m << 8) + n, (p << 8) + o);
    764 }
    765 
// Builds a vector of eight signed 16-bit values; |a| lands in lane 0.
template <>
inline __m128i FromI16<__m128i>(int16_t a, int16_t b, int16_t c, int16_t d,
                                int16_t e, int16_t f, int16_t g, int16_t h) {
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}
    771 
// Builds a vector of eight unsigned 16-bit values; |a| lands in lane 0.
// _mm_setr_epi16 takes signed shorts, but the implicit conversion keeps
// the 16-bit pattern, so unsigned values round-trip unchanged.
template <>
inline __m128i FromU16<__m128i>(uint16_t a, uint16_t b, uint16_t c, uint16_t d,
                                uint16_t e, uint16_t f, uint16_t g,
                                uint16_t h) {
  return _mm_setr_epi16(a, b, c, d, e, f, g, h);
}
    778 
// Broadcasts the signed 16-bit value |a| into all eight lanes.
template <>
inline __m128i FromI16<__m128i>(int16_t a) {
  return _mm_set1_epi16(a);
}
    783 
// Broadcasts the unsigned 16-bit value |a| into all eight lanes; the cast
// to int16_t preserves the bit pattern.
template <>
inline __m128i FromU16<__m128i>(uint16_t a) {
  return _mm_set1_epi16((int16_t)a);
}
    788 
// Builds a vector of four 32-bit integers; |a| lands in lane 0.
template <>
inline __m128i From32<__m128i>(int32_t a, int32_t b, int32_t c, int32_t d) {
  return _mm_setr_epi32(a, b, c, d);
}
    793 
// Broadcasts the 32-bit integer |a| into all four lanes.
template <>
inline __m128i From32<__m128i>(int32_t a) {
  return _mm_set1_epi32(a);
}
    798 
// Builds a vector of four floats; |a| lands in lane 0.
template <>
inline __m128 FromF32<__m128>(float a, float b, float c, float d) {
  return _mm_setr_ps(a, b, c, d);
}
    803 
// Broadcasts the float |a| into all four lanes.
template <>
inline __m128 FromF32<__m128>(float a) {
  return _mm_set1_ps(a);
}
    808 
// Logical (zero-filling) right shift of each of the eight 16-bit lanes.
template <int32_t aNumberOfBits>
inline __m128i ShiftRight16(__m128i aM) {
  return _mm_srli_epi16(aM, aNumberOfBits);
}
    813 
// Arithmetic (sign-extending) right shift of each of the four 32-bit
// lanes. NOTE(review): unlike ShiftRight16 this is an arithmetic shift —
// presumably the 32-bit lanes hold signed data; confirm at call sites.
template <int32_t aNumberOfBits>
inline __m128i ShiftRight32(__m128i aM) {
  return _mm_srai_epi32(aM, aNumberOfBits);
}
    818 
    819 inline __m128i Add16(__m128i aM1, __m128i aM2) {
    820  return _mm_add_epi16(aM1, aM2);
    821 }
    822 
// Lane-wise wrapping addition of four 32-bit lanes.
inline __m128i Add32(__m128i aM1, __m128i aM2) {
  return _mm_add_epi32(aM1, aM2);
}
    826 
// Lane-wise wrapping subtraction (aM1 - aM2) of eight 16-bit lanes.
inline __m128i Sub16(__m128i aM1, __m128i aM2) {
  return _mm_sub_epi16(aM1, aM2);
}
    830 
// Lane-wise wrapping subtraction (aM1 - aM2) of four 32-bit lanes.
inline __m128i Sub32(__m128i aM1, __m128i aM2) {
  return _mm_sub_epi32(aM1, aM2);
}
    834 
    835 inline __m128i Min8(__m128i aM1, __m128i aM2) { return _mm_min_epu8(aM1, aM2); }
    836 
    837 inline __m128i Max8(__m128i aM1, __m128i aM2) { return _mm_max_epu8(aM1, aM2); }
    838 
    839 inline __m128i Min32(__m128i aM1, __m128i aM2) {
    840  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
    841  __m128i m1_greater_than_m2 = _mm_cmpgt_epi32(aM1, aM2);
    842  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m1_greater_than_m2));
    843 }
    844 
    845 inline __m128i Max32(__m128i aM1, __m128i aM2) {
    846  __m128i m1_minus_m2 = _mm_sub_epi32(aM1, aM2);
    847  __m128i m2_greater_than_m1 = _mm_cmpgt_epi32(aM2, aM1);
    848  return _mm_sub_epi32(aM1, _mm_and_si128(m1_minus_m2, m2_greater_than_m1));
    849 }
    850 
// Lane-wise 16x16 multiply keeping the low 16 bits of each product.
inline __m128i Mul16(__m128i aM1, __m128i aM2) {
  return _mm_mullo_epi16(aM1, aM2);
}
    854 
// Unsigned variant of Mul16. The low 16 bits of a 16x16 product are
// identical for signed and unsigned operands, so the same intrinsic works.
inline __m128i MulU16(__m128i aM1, __m128i aM2) {
  return _mm_mullo_epi16(aM1, aM2);
}
    858 
    859 inline void Mul16x4x2x2To32x4x2(__m128i aFactorsA1B1, __m128i aFactorsA2B2,
    860                                __m128i& aProductA, __m128i& aProductB) {
    861  __m128i prodAB_lo = _mm_mullo_epi16(aFactorsA1B1, aFactorsA2B2);
    862  __m128i prodAB_hi = _mm_mulhi_epi16(aFactorsA1B1, aFactorsA2B2);
    863  aProductA = _mm_unpacklo_epi16(prodAB_lo, prodAB_hi);
    864  aProductB = _mm_unpackhi_epi16(prodAB_lo, prodAB_hi);
    865 }
    866 
// Multiplies corresponding signed 16-bit lanes and adds each adjacent
// pair of 32-bit products, yielding four 32-bit sums (pmaddwd).
inline __m128i MulAdd16x8x2To32x4(__m128i aFactorsA, __m128i aFactorsB) {
  return _mm_madd_epi16(aFactorsA, aFactorsB);
}
    870 
// Permutes the four 32-bit lanes of |aM|. Per _MM_SHUFFLE's convention,
// |i0| selects the source lane for destination lane 3 and |i3| selects
// the source lane for destination lane 0.
template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i Shuffle32(__m128i aM) {
  // Compile-time lane-range checks (AssertIndex is defined earlier in
  // this file).
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shuffle_epi32(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}
    879 
// Permutes the low four 16-bit lanes of |aM|; the high four lanes pass
// through unchanged. Index convention matches Shuffle32 (_MM_SHUFFLE:
// |i0| feeds the highest of the four affected lanes).
template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleLo16(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflelo_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}
    888 
// Permutes the high four 16-bit lanes of |aM| (indices are relative to
// lane 4); the low four lanes pass through unchanged.
template <int8_t i0, int8_t i1, int8_t i2, int8_t i3>
inline __m128i ShuffleHi16(__m128i aM) {
  AssertIndex<i0>();
  AssertIndex<i1>();
  AssertIndex<i2>();
  AssertIndex<i3>();
  return _mm_shufflehi_epi16(aM, _MM_SHUFFLE(i0, i1, i2, i3));
}
    897 
// Broadcasts 32-bit lane |aIndex| of |aM| into all four lanes.
template <int8_t aIndex>
inline __m128i Splat32(__m128i aM) {
  return Shuffle32<aIndex, aIndex, aIndex, aIndex>(aM);
}
    902 
// Broadcasts the 32-bit group of four bytes at index |aIndex| into all
// four 32-bit positions. Same implementation as Splat32; the separate
// name documents use on byte-typed vectors.
template <int8_t aIndex>
inline __m128i Splat32On8(__m128i aM) {
  return Shuffle32<aIndex, aIndex, aIndex, aIndex>(aM);
}
    907 
// Broadcasts 16-bit lane |aIndexLo| (taken from the low half) across
// lanes 0-3 and lane (4 + |aIndexHi|) (taken from the high half) across
// lanes 4-7.
template <int8_t aIndexLo, int8_t aIndexHi>
inline __m128i Splat16(__m128i aM) {
  AssertIndex<aIndexLo>();
  AssertIndex<aIndexHi>();
  return ShuffleHi16<aIndexHi, aIndexHi, aIndexHi, aIndexHi>(
      ShuffleLo16<aIndexLo, aIndexLo, aIndexLo, aIndexLo>(aM));
}
    915 
    916 inline __m128i UnpackLo8x8ToI16x8(__m128i m) {
    917  __m128i zero = _mm_set1_epi8(0);
    918  return _mm_unpacklo_epi8(m, zero);
    919 }
    920 
    921 inline __m128i UnpackHi8x8ToI16x8(__m128i m) {
    922  __m128i zero = _mm_set1_epi8(0);
    923  return _mm_unpackhi_epi8(m, zero);
    924 }
    925 
    926 inline __m128i UnpackLo8x8ToU16x8(__m128i m) {
    927  __m128i zero = _mm_set1_epi8(0);
    928  return _mm_unpacklo_epi8(m, zero);
    929 }
    930 
    931 inline __m128i UnpackHi8x8ToU16x8(__m128i m) {
    932  __m128i zero = _mm_set1_epi8(0);
    933  return _mm_unpackhi_epi8(m, zero);
    934 }
    935 
// Interleaves the low 8 bytes of |m1| and |m2|: m1[0], m2[0], m1[1], ...
inline __m128i InterleaveLo8(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi8(m1, m2);
}
    939 
// Interleaves the high 8 bytes of |m1| and |m2|: m1[8], m2[8], m1[9], ...
inline __m128i InterleaveHi8(__m128i m1, __m128i m2) {
  return _mm_unpackhi_epi8(m1, m2);
}
    943 
// Interleaves the low four 16-bit lanes of |m1| and |m2|.
inline __m128i InterleaveLo16(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi16(m1, m2);
}
    947 
// Interleaves the high four 16-bit lanes of |m1| and |m2|.
inline __m128i InterleaveHi16(__m128i m1, __m128i m2) {
  return _mm_unpackhi_epi16(m1, m2);
}
    951 
// Interleaves the low two 32-bit lanes of |m1| and |m2|.
inline __m128i InterleaveLo32(__m128i m1, __m128i m2) {
  return _mm_unpacklo_epi32(m1, m2);
}
    955 
// Treats |a1234| followed by |a5678| as one 32-byte sequence and returns
// bytes [aNumBytes, aNumBytes + 16): the first vector is shifted right by
// aNumBytes bytes and the vacated high bytes are filled from the second
// vector.
template <uint8_t aNumBytes>
inline __m128i Rotate8(__m128i a1234, __m128i a5678) {
  return _mm_or_si128(_mm_srli_si128(a1234, aNumBytes),
                      _mm_slli_si128(a5678, 16 - aNumBytes));
}
    961 
// Narrows eight 32-bit lanes (four from each input) to signed 16-bit
// lanes, saturating to [-32768, 32767].
inline __m128i PackAndSaturate32To16(__m128i m1, __m128i m2) {
  return _mm_packs_epi32(m1, m2);
}
    965 
// Narrows eight 32-bit lanes to 16-bit lanes.
// NOTE(review): this uses _mm_packs_epi32, i.e. *signed* saturation —
// inputs above 32767 clamp to 32767, not 65535, and negative inputs stay
// negative. SSE2 has no unsigned 32->16 pack (_mm_packus_epi32 is
// SSE4.1); confirm callers only pass values in [0, 32767].
inline __m128i PackAndSaturate32ToU16(__m128i m1, __m128i m2) {
  return _mm_packs_epi32(m1, m2);
}
    969 
    970 inline __m128i PackAndSaturate32To8(__m128i m1, __m128i m2, __m128i m3,
    971                                    const __m128i& m4) {
    972  // Pack into 8 16bit signed integers (saturating).
    973  __m128i m12 = _mm_packs_epi32(m1, m2);
    974  __m128i m34 = _mm_packs_epi32(m3, m4);
    975 
    976  // Pack into 16 8bit unsigned integers (saturating).
    977  return _mm_packus_epi16(m12, m34);
    978 }
    979 
// Narrows sixteen signed 16-bit lanes (eight from each input) to
// unsigned bytes, saturating to [0, 255].
inline __m128i PackAndSaturate16To8(__m128i m1, __m128i m2) {
  // Pack into 16 8bit unsigned integers (saturating).
  return _mm_packus_epi16(m1, m2);
}
    984 
    985 inline __m128i FastDivideBy255(__m128i m) {
    986  // v = m << 8
    987  __m128i v = _mm_slli_epi32(m, 8);
    988  // v = v + (m + (255,255,255,255))
    989  v = _mm_add_epi32(v, _mm_add_epi32(m, _mm_set1_epi32(255)));
    990  // v = v >> 16
    991  return _mm_srai_epi32(v, 16);
    992 }
    993 
    994 inline __m128i FastDivideBy255_16(__m128i m) {
    995  __m128i zero = _mm_set1_epi16(0);
    996  __m128i lo = _mm_unpacklo_epi16(m, zero);
    997  __m128i hi = _mm_unpackhi_epi16(m, zero);
    998  return _mm_packs_epi32(FastDivideBy255(lo), FastDivideBy255(hi));
    999 }
   1000 
   1001 inline __m128i Pick(__m128i mask, __m128i a, __m128i b) {
   1002  return _mm_or_si128(_mm_andnot_si128(mask, a), _mm_and_si128(mask, b));
   1003 }
   1004 
   1005 inline __m128 MixF32(__m128 a, __m128 b, float t) {
   1006  return _mm_add_ps(a, _mm_mul_ps(_mm_sub_ps(b, a), _mm_set1_ps(t)));
   1007 }
   1008 
   1009 inline __m128 WSumF32(__m128 a, __m128 b, float wa, float wb) {
   1010  return _mm_add_ps(_mm_mul_ps(a, _mm_set1_ps(wa)),
   1011                    _mm_mul_ps(b, _mm_set1_ps(wb)));
   1012 }
   1013 
   1014 inline __m128 AbsF32(__m128 a) {
   1015  return _mm_max_ps(_mm_sub_ps(_mm_setzero_ps(), a), a);
   1016 }
   1017 
// Lane-wise float addition.
inline __m128 AddF32(__m128 a, __m128 b) { return _mm_add_ps(a, b); }
   1019 
// Lane-wise float multiplication.
inline __m128 MulF32(__m128 a, __m128 b) { return _mm_mul_ps(a, b); }
   1021 
// Lane-wise float division (a / b).
inline __m128 DivF32(__m128 a, __m128 b) { return _mm_div_ps(a, b); }
   1023 
// Broadcasts float lane |aIndex| of |m| into all four lanes.
template <uint8_t aIndex>
inline __m128 SplatF32(__m128 m) {
  // Compile-time lane-range check.
  AssertIndex<aIndex>();
  return _mm_shuffle_ps(m, m, _MM_SHUFFLE(aIndex, aIndex, aIndex, aIndex));
}
   1029 
// Converts each float lane to int32 using the current MXCSR rounding mode
// (round-to-nearest-even by default). Note the scalar fallback above
// rounds half-way cases upwards instead, so results can differ by 1 on
// exact .5 inputs.
inline __m128i F32ToI32(__m128 m) { return _mm_cvtps_epi32(m); }
   1031 
   1032 #endif  // SIMD_COMPILE_SSE2
   1033 
   1034 }  // namespace simd
   1035 
   1036 }  // namespace gfx
   1037 }  // namespace mozilla
   1038 
   1039 #endif  // _MOZILLA_GFX_SIMD_H_