tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

FilterProcessingSIMD-inl.h (52215B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "FilterProcessing.h"
      8 
      9 #include "SIMD.h"
     10 #include "SVGTurbulenceRenderer-inl.h"
     11 
     12 namespace mozilla {
     13 namespace gfx {
     14 
     15 template <typename u8x16_t>
     16 inline already_AddRefed<DataSourceSurface> ConvertToB8G8R8A8_SIMD(
     17    SourceSurface* aSurface) {
     18  IntSize size = aSurface->GetSize();
     19  RefPtr<DataSourceSurface> output =
     20      Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
     21  if (!output) {
     22    return nullptr;
     23  }
     24 
     25  RefPtr<DataSourceSurface> input = aSurface->GetDataSurface();
     26  DataSourceSurface::ScopedMap inputMap(input, DataSourceSurface::READ);
     27  DataSourceSurface::ScopedMap outputMap(output, DataSourceSurface::READ_WRITE);
     28  const uint8_t* inputData = inputMap.GetData();
     29  uint8_t* outputData = outputMap.GetData();
     30  int32_t inputStride = inputMap.GetStride();
     31  int32_t outputStride = outputMap.GetStride();
     32  switch (input->GetFormat()) {
     33    case SurfaceFormat::B8G8R8A8:
     34      output = input;
     35      break;
     36    case SurfaceFormat::B8G8R8X8:
     37      for (int32_t y = 0; y < size.height; y++) {
     38        for (int32_t x = 0; x < size.width; x++) {
     39          int32_t inputIndex = y * inputStride + 4 * x;
     40          int32_t outputIndex = y * outputStride + 4 * x;
     41          outputData[outputIndex + 0] = inputData[inputIndex + 0];
     42          outputData[outputIndex + 1] = inputData[inputIndex + 1];
     43          outputData[outputIndex + 2] = inputData[inputIndex + 2];
     44          outputData[outputIndex + 3] = 255;
     45        }
     46      }
     47      break;
     48    case SurfaceFormat::R8G8B8A8:
     49      for (int32_t y = 0; y < size.height; y++) {
     50        for (int32_t x = 0; x < size.width; x++) {
     51          int32_t inputIndex = y * inputStride + 4 * x;
     52          int32_t outputIndex = y * outputStride + 4 * x;
     53          outputData[outputIndex + 2] = inputData[inputIndex + 0];
     54          outputData[outputIndex + 1] = inputData[inputIndex + 1];
     55          outputData[outputIndex + 0] = inputData[inputIndex + 2];
     56          outputData[outputIndex + 3] = inputData[inputIndex + 3];
     57        }
     58      }
     59      break;
     60    case SurfaceFormat::R8G8B8X8:
     61      for (int32_t y = 0; y < size.height; y++) {
     62        for (int32_t x = 0; x < size.width; x++) {
     63          int32_t inputIndex = y * inputStride + 4 * x;
     64          int32_t outputIndex = y * outputStride + 4 * x;
     65          outputData[outputIndex + 2] = inputData[inputIndex + 0];
     66          outputData[outputIndex + 1] = inputData[inputIndex + 1];
     67          outputData[outputIndex + 0] = inputData[inputIndex + 2];
     68          outputData[outputIndex + 3] = 255;
     69        }
     70      }
     71      break;
     72    case SurfaceFormat::A8:
     73      for (int32_t y = 0; y < size.height; y++) {
     74        for (int32_t x = 0; x < size.width; x += 16) {
     75          int32_t inputIndex = y * inputStride + x;
     76          int32_t outputIndex = y * outputStride + 4 * x;
     77          u8x16_t p1To16 = simd::Load8<u8x16_t>(&inputData[inputIndex]);
     78          // Turn AAAAAAAAAAAAAAAA into four chunks of 000A000A000A000A by
     79          // interleaving with 0000000000000000 twice.
     80          u8x16_t zero = simd::FromZero8<u8x16_t>();
     81          u8x16_t p1To8 = simd::InterleaveLo8(zero, p1To16);
     82          u8x16_t p9To16 = simd::InterleaveHi8(zero, p1To16);
     83          u8x16_t p1To4 = simd::InterleaveLo8(zero, p1To8);
     84          u8x16_t p5To8 = simd::InterleaveHi8(zero, p1To8);
     85          u8x16_t p9To12 = simd::InterleaveLo8(zero, p9To16);
     86          u8x16_t p13To16 = simd::InterleaveHi8(zero, p9To16);
     87          simd::Store8(&outputData[outputIndex], p1To4);
     88          if ((x + 4) * 4 < outputStride) {
     89            simd::Store8(&outputData[outputIndex + 4 * 4], p5To8);
     90          }
     91          if ((x + 8) * 4 < outputStride) {
     92            simd::Store8(&outputData[outputIndex + 4 * 8], p9To12);
     93          }
     94          if ((x + 12) * 4 < outputStride) {
     95            simd::Store8(&outputData[outputIndex + 4 * 12], p13To16);
     96          }
     97        }
     98      }
     99      break;
    100    default:
    101      output = nullptr;
    102      break;
    103  }
    104  return output.forget();
    105 }
    106 
    107 template <typename u8x16_t>
    108 inline void ExtractAlpha_SIMD(const IntSize& size, const uint8_t* sourceData,
    109                              int32_t sourceStride, uint8_t* alphaData,
    110                              int32_t alphaStride) {
    111  for (int32_t y = 0; y < size.height; y++) {
    112    for (int32_t x = 0; x < size.width; x += 16) {
    113      // Process 16 pixels at a time.
    114      // Turn up to four chunks of BGRABGRABGRABGRA into one chunk of
    115      // AAAAAAAAAAAAAAAA.
    116      int32_t sourceIndex = y * sourceStride + 4 * x;
    117      int32_t targetIndex = y * alphaStride + x;
    118 
    119      u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
    120      u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
    121      u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
    122 
    123      u8x16_t bgrabgrabgrabgra1 =
    124          simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
    125      if (4 * (x + 4) < sourceStride) {
    126        bgrabgrabgrabgra2 =
    127            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
    128      }
    129      if (4 * (x + 8) < sourceStride) {
    130        bgrabgrabgrabgra3 =
    131            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
    132      }
    133      if (4 * (x + 12) < sourceStride) {
    134        bgrabgrabgrabgra4 =
    135            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
    136      }
    137 
    138      u8x16_t bbggrraabbggrraa1 =
    139          simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
    140      u8x16_t bbggrraabbggrraa2 =
    141          simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
    142      u8x16_t bbggrraabbggrraa3 =
    143          simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
    144      u8x16_t bbggrraabbggrraa4 =
    145          simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
    146      u8x16_t bbbbggggrrrraaaa1 =
    147          simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
    148      u8x16_t bbbbggggrrrraaaa2 =
    149          simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
    150      u8x16_t bbbbggggrrrraaaa3 =
    151          simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
    152      u8x16_t bbbbggggrrrraaaa4 =
    153          simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
    154      u8x16_t rrrrrrrraaaaaaaa1 =
    155          simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
    156      u8x16_t rrrrrrrraaaaaaaa2 =
    157          simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
    158      u8x16_t aaaaaaaaaaaaaaaa =
    159          simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
    160 
    161      simd::Store8(&alphaData[targetIndex], aaaaaaaaaaaaaaaa);
    162    }
    163  }
    164 }
    165 
    166 // This function calculates the result color values for four pixels, but for
    167 // only two color channels - either b & r or g & a. However, the a result will
    168 // not be used.
    169 // source and dest each contain 8 values, either bbbb gggg or rrrr aaaa.
    170 // sourceAlpha and destAlpha are of the form aaaa aaaa, where each aaaa is the
    171 // alpha of all four pixels (and both aaaa's are the same).
    172 // blendendComponent1 and blendedComponent2 are the out parameters.
    173 template <typename i16x8_t, typename i32x4_t, uint32_t aBlendMode>
    174 inline void BlendTwoComponentsOfFourPixels(i16x8_t source, i16x8_t sourceAlpha,
    175                                           i16x8_t dest,
    176                                           const i16x8_t& destAlpha,
    177                                           i32x4_t& blendedComponent1,
    178                                           i32x4_t& blendedComponent2) {
    179  i16x8_t x255 = simd::FromI16<i16x8_t>(255);
    180 
    181  switch (aBlendMode) {
    182    case BLEND_MODE_MULTIPLY: {
    183      // val = ((255 - destAlpha) * source + (255 - sourceAlpha + source) *
    184      // dest);
    185      i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
    186      i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
    187      i16x8_t twoFiftyFiveMinusSourceAlphaPlusSource =
    188          simd::Add16(twoFiftyFiveMinusSourceAlpha, source);
    189 
    190      i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
    191      i16x8_t leftFactor1 = simd::InterleaveLo16(
    192          twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
    193      blendedComponent1 =
    194          simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest1, leftFactor1);
    195      blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
    196 
    197      i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
    198      i16x8_t leftFactor2 = simd::InterleaveHi16(
    199          twoFiftyFiveMinusDestAlpha, twoFiftyFiveMinusSourceAlphaPlusSource);
    200      blendedComponent2 =
    201          simd::MulAdd16x8x2To32x4(sourceInterleavedWithDest2, leftFactor2);
    202      blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
    203 
    204      break;
    205    }
    206 
    207    case BLEND_MODE_SCREEN: {
    208      // val = 255 * (source + dest) + (0 - dest) * source;
    209      i16x8_t sourcePlusDest = simd::Add16(source, dest);
    210      i16x8_t zeroMinusDest = simd::Sub16(simd::FromI16<i16x8_t>(0), dest);
    211 
    212      i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest1 =
    213          simd::InterleaveLo16(x255, zeroMinusDest);
    214      i16x8_t sourcePlusDestInterleavedWithSource1 =
    215          simd::InterleaveLo16(sourcePlusDest, source);
    216      blendedComponent1 =
    217          simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest1,
    218                                   sourcePlusDestInterleavedWithSource1);
    219      blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
    220 
    221      i16x8_t twoFiftyFiveInterleavedWithZeroMinusDest2 =
    222          simd::InterleaveHi16(x255, zeroMinusDest);
    223      i16x8_t sourcePlusDestInterleavedWithSource2 =
    224          simd::InterleaveHi16(sourcePlusDest, source);
    225      blendedComponent2 =
    226          simd::MulAdd16x8x2To32x4(twoFiftyFiveInterleavedWithZeroMinusDest2,
    227                                   sourcePlusDestInterleavedWithSource2);
    228      blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
    229 
    230      break;
    231    }
    232 
    233    case BLEND_MODE_DARKEN:
    234    case BLEND_MODE_LIGHTEN: {
    235      // Darken:
    236      // val = min((255 - destAlpha) * source + 255                 * dest,
    237      //           255               * source + (255 - sourceAlpha) * dest);
    238      //
    239      // Lighten:
    240      // val = max((255 - destAlpha) * source + 255                 * dest,
    241      //           255               * source + (255 - sourceAlpha) * dest);
    242 
    243      i16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
    244      i16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
    245 
    246      i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1 =
    247          simd::InterleaveLo16(twoFiftyFiveMinusDestAlpha, x255);
    248      i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1 =
    249          simd::InterleaveLo16(x255, twoFiftyFiveMinusSourceAlpha);
    250      i16x8_t sourceInterleavedWithDest1 = simd::InterleaveLo16(source, dest);
    251      i32x4_t product1_1 = simd::MulAdd16x8x2To32x4(
    252          twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive1,
    253          sourceInterleavedWithDest1);
    254      i32x4_t product1_2 = simd::MulAdd16x8x2To32x4(
    255          twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha1,
    256          sourceInterleavedWithDest1);
    257      blendedComponent1 = aBlendMode == BLEND_MODE_DARKEN
    258                              ? simd::Min32(product1_1, product1_2)
    259                              : simd::Max32(product1_1, product1_2);
    260      blendedComponent1 = simd::FastDivideBy255(blendedComponent1);
    261 
    262      i16x8_t twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2 =
    263          simd::InterleaveHi16(twoFiftyFiveMinusDestAlpha, x255);
    264      i16x8_t twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2 =
    265          simd::InterleaveHi16(x255, twoFiftyFiveMinusSourceAlpha);
    266      i16x8_t sourceInterleavedWithDest2 = simd::InterleaveHi16(source, dest);
    267      i32x4_t product2_1 = simd::MulAdd16x8x2To32x4(
    268          twoFiftyFiveMinusDestAlphaInterleavedWithTwoFiftyFive2,
    269          sourceInterleavedWithDest2);
    270      i32x4_t product2_2 = simd::MulAdd16x8x2To32x4(
    271          twoFiftyFiveInterleavedWithTwoFiftyFiveMinusSourceAlpha2,
    272          sourceInterleavedWithDest2);
    273      blendedComponent2 = aBlendMode == BLEND_MODE_DARKEN
    274                              ? simd::Min32(product2_1, product2_2)
    275                              : simd::Max32(product2_1, product2_2);
    276      blendedComponent2 = simd::FastDivideBy255(blendedComponent2);
    277 
    278      break;
    279    }
    280  }
    281 }
    282 
    283 // The alpha channel is subject to a different calculation than the RGB
    284 // channels, and this calculation is the same for all blend modes:
    285 // resultAlpha * 255 = 255 * 255 - (255 - sourceAlpha) * (255 - destAlpha)
    286 template <typename i16x8_t, typename i32x4_t>
    287 inline i32x4_t BlendAlphaOfFourPixels(i16x8_t s_rrrraaaa1234,
    288                                      i16x8_t d_rrrraaaa1234) {
    289  // clang-format off
    290  // We're using MulAdd16x8x2To32x4, so we need to interleave our factors
    291  // appropriately. The calculation is rewritten as follows:
    292  // resultAlpha[0] * 255 = 255 * 255 - (255 - sourceAlpha[0]) * (255 - destAlpha[0])
    293  //                      = 255 * 255 + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
    294  //                      = (255 - 0) * (510 - 255) + (255 - sourceAlpha[0]) * (destAlpha[0] - 255)
    295  //                      = MulAdd(255 - IntLv(0, sourceAlpha), IntLv(510, destAlpha) - 255)[0]
    296  // clang-format on
    297  i16x8_t zeroInterleavedWithSourceAlpha =
    298      simd::InterleaveHi16(simd::FromI16<i16x8_t>(0), s_rrrraaaa1234);
    299  i16x8_t fiveTenInterleavedWithDestAlpha =
    300      simd::InterleaveHi16(simd::FromI16<i16x8_t>(510), d_rrrraaaa1234);
    301  i16x8_t f1 =
    302      simd::Sub16(simd::FromI16<i16x8_t>(255), zeroInterleavedWithSourceAlpha);
    303  i16x8_t f2 =
    304      simd::Sub16(fiveTenInterleavedWithDestAlpha, simd::FromI16<i16x8_t>(255));
    305  return simd::FastDivideBy255(simd::MulAdd16x8x2To32x4(f1, f2));
    306 }
    307 
    308 template <typename u8x16_t, typename i16x8_t>
    309 inline void UnpackAndShuffleComponents(u8x16_t bgrabgrabgrabgra1234,
    310                                       i16x8_t& bbbbgggg1234,
    311                                       i16x8_t& rrrraaaa1234) {
    312  // bgrabgrabgrabgra1234 -> bbbbgggg1234, rrrraaaa1234
    313  i16x8_t bgrabgra12 = simd::UnpackLo8x8ToI16x8(bgrabgrabgrabgra1234);
    314  i16x8_t bgrabgra34 = simd::UnpackHi8x8ToI16x8(bgrabgrabgrabgra1234);
    315  i16x8_t bbggrraa13 = simd::InterleaveLo16(bgrabgra12, bgrabgra34);
    316  i16x8_t bbggrraa24 = simd::InterleaveHi16(bgrabgra12, bgrabgra34);
    317  bbbbgggg1234 = simd::InterleaveLo16(bbggrraa13, bbggrraa24);
    318  rrrraaaa1234 = simd::InterleaveHi16(bbggrraa13, bbggrraa24);
    319 }
    320 
    321 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
    322 inline u8x16_t ShuffleAndPackComponents(i32x4_t bbbb1234, i32x4_t gggg1234,
    323                                        i32x4_t rrrr1234,
    324                                        const i32x4_t& aaaa1234) {
    325  // bbbb1234, gggg1234, rrrr1234, aaaa1234 -> bgrabgrabgrabgra1234
    326  i16x8_t bbbbgggg1234 = simd::PackAndSaturate32To16(bbbb1234, gggg1234);
    327  i16x8_t rrrraaaa1234 = simd::PackAndSaturate32To16(rrrr1234, aaaa1234);
    328  i16x8_t brbrbrbr1234 = simd::InterleaveLo16(bbbbgggg1234, rrrraaaa1234);
    329  i16x8_t gagagaga1234 = simd::InterleaveHi16(bbbbgggg1234, rrrraaaa1234);
    330  i16x8_t bgrabgra12 = simd::InterleaveLo16(brbrbrbr1234, gagagaga1234);
    331  i16x8_t bgrabgra34 = simd::InterleaveHi16(brbrbrbr1234, gagagaga1234);
    332  return simd::PackAndSaturate16To8(bgrabgra12, bgrabgra34);
    333 }
    334 
    335 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
    336 inline void ApplyBlending_SIMD(const DataSourceSurface::ScopedMap& aInputMap1,
    337                               const DataSourceSurface::ScopedMap& aInputMap2,
    338                               const DataSourceSurface::ScopedMap& aOutputMap,
    339                               const IntSize& aSize) {
    340  const uint8_t* source1Data = aInputMap1.GetData();
    341  const uint8_t* source2Data = aInputMap2.GetData();
    342  uint8_t* targetData = aOutputMap.GetData();
    343  int32_t targetStride = aOutputMap.GetStride();
    344  int32_t source1Stride = aInputMap1.GetStride();
    345  int32_t source2Stride = aInputMap2.GetStride();
    346 
    347  for (int32_t y = 0; y < aSize.height; y++) {
    348    for (int32_t x = 0; x < aSize.width; x += 4) {
    349      int32_t targetIndex = y * targetStride + 4 * x;
    350      int32_t source1Index = y * source1Stride + 4 * x;
    351      int32_t source2Index = y * source2Stride + 4 * x;
    352 
    353      u8x16_t s1234 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
    354      u8x16_t d1234 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
    355 
    356      // The blending calculation for the RGB channels all need access to the
    357      // alpha channel of their pixel, and the alpha calculation is different,
    358      // so it makes sense to separate by channel.
    359 
    360      i16x8_t s_bbbbgggg1234, s_rrrraaaa1234;
    361      i16x8_t d_bbbbgggg1234, d_rrrraaaa1234;
    362      UnpackAndShuffleComponents(s1234, s_bbbbgggg1234, s_rrrraaaa1234);
    363      UnpackAndShuffleComponents(d1234, d_bbbbgggg1234, d_rrrraaaa1234);
    364      i16x8_t s_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(s_rrrraaaa1234);
    365      i16x8_t d_aaaaaaaa1234 = simd::Shuffle32<3, 2, 3, 2>(d_rrrraaaa1234);
    366 
    367      // We only use blendedB, blendedG and blendedR.
    368      i32x4_t blendedB, blendedG, blendedR, blendedA;
    369      BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
    370          s_bbbbgggg1234, s_aaaaaaaa1234, d_bbbbgggg1234, d_aaaaaaaa1234,
    371          blendedB, blendedG);
    372      BlendTwoComponentsOfFourPixels<i16x8_t, i32x4_t, mode>(
    373          s_rrrraaaa1234, s_aaaaaaaa1234, d_rrrraaaa1234, d_aaaaaaaa1234,
    374          blendedR, blendedA);
    375 
    376      // Throw away blendedA and overwrite it with the correct blended alpha.
    377      blendedA = BlendAlphaOfFourPixels<i16x8_t, i32x4_t>(s_rrrraaaa1234,
    378                                                          d_rrrraaaa1234);
    379 
    380      u8x16_t result1234 = ShuffleAndPackComponents<i32x4_t, i16x8_t, u8x16_t>(
    381          blendedB, blendedG, blendedR, blendedA);
    382      simd::Store8(&targetData[targetIndex], result1234);
    383    }
    384  }
    385 }
    386 
    387 template <typename i32x4_t, typename i16x8_t, typename u8x16_t, BlendMode mode>
    388 inline already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
    389    DataSourceSurface* aInput1, DataSourceSurface* aInput2) {
    390  IntSize size = aInput1->GetSize();
    391  RefPtr<DataSourceSurface> target =
    392      Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
    393  if (!target) {
    394    return nullptr;
    395  }
    396 
    397  DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
    398  DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
    399  if (aInput1->Equals(aInput2)) {
    400    ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap1,
    401                                                        outputMap, size);
    402  } else {
    403    DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
    404    ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, mode>(inputMap1, inputMap2,
    405                                                        outputMap, size);
    406  }
    407 
    408  return target.forget();
    409 }
    410 
    411 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
    412 static already_AddRefed<DataSourceSurface> ApplyBlending_SIMD(
    413    DataSourceSurface* aInput1, DataSourceSurface* aInput2,
    414    BlendMode aBlendMode) {
    415  switch (aBlendMode) {
    416    case BLEND_MODE_MULTIPLY:
    417      return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_MULTIPLY>(
    418          aInput1, aInput2);
    419    case BLEND_MODE_SCREEN:
    420      return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_SCREEN>(
    421          aInput1, aInput2);
    422    case BLEND_MODE_DARKEN:
    423      return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_DARKEN>(
    424          aInput1, aInput2);
    425    case BLEND_MODE_LIGHTEN:
    426      return ApplyBlending_SIMD<i32x4_t, i16x8_t, u8x16_t, BLEND_MODE_LIGHTEN>(
    427          aInput1, aInput2);
    428    default:
    429      return nullptr;
    430  }
    431 }
    432 
    433 template <MorphologyOperator Operator, typename u8x16_t>
    434 static u8x16_t Morph8(u8x16_t a, u8x16_t b) {
    435  return Operator == MORPHOLOGY_OPERATOR_ERODE ? simd::Min8(a, b)
    436                                               : simd::Max8(a, b);
    437 }
    438 
    439 // Set every pixel to the per-component minimum or maximum of the pixels around
    440 // it that are up to aRadius pixels away from it (horizontally).
    441 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
    442 inline void ApplyMorphologyHorizontal_SIMD(
    443    const uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
    444    int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
    445  static_assert(
    446      op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
    447      "unexpected morphology operator");
    448 
    449  int32_t kernelSize = aRadius + 1 + aRadius;
    450  MOZ_ASSERT(kernelSize >= 3, "don't call this with aRadius <= 0");
    451  MOZ_ASSERT(kernelSize % 4 == 1 || kernelSize % 4 == 3);
    452  int32_t completeKernelSizeForFourPixels = kernelSize + 3;
    453  MOZ_ASSERT(completeKernelSizeForFourPixels % 4 == 0 ||
    454             completeKernelSizeForFourPixels % 4 == 2);
    455 
    456  // aSourceData[-aRadius] and aDestData[0] are both aligned to 16 bytes, just
    457  // the way we need them to be.
    458 
    459  IntRect sourceRect = aDestRect;
    460  sourceRect.Inflate(aRadius, 0);
    461 
    462  for (int32_t y = aDestRect.Y(); y < aDestRect.YMost(); y++) {
    463    int32_t kernelStartX = aDestRect.X() - aRadius;
    464    for (int32_t x = aDestRect.X(); x < aDestRect.XMost();
    465         x += 4, kernelStartX += 4) {
    466      // We process four pixels (16 color values) at a time.
    467      // aSourceData[0] points to the pixel located at aDestRect.TopLeft();
    468      // source values can be read beyond that because the source is extended
    469      // by aRadius pixels.
    470 
    471      int32_t sourceIndex = y * aSourceStride + 4 * kernelStartX;
    472      u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
    473      u8x16_t m1234 = p1234;
    474 
    475      for (int32_t i = 4; i < completeKernelSizeForFourPixels; i += 4) {
    476        u8x16_t p5678 =
    477            (kernelStartX + i < sourceRect.XMost())
    478                ? simd::Load8<u8x16_t>(&aSourceData[sourceIndex + 4 * i])
    479                : simd::FromZero8<u8x16_t>();
    480        u8x16_t p2345 = simd::Rotate8<4>(p1234, p5678);
    481        u8x16_t p3456 = simd::Rotate8<8>(p1234, p5678);
    482        m1234 = Morph8<op, u8x16_t>(m1234, p2345);
    483        m1234 = Morph8<op, u8x16_t>(m1234, p3456);
    484        if (i + 2 < completeKernelSizeForFourPixels) {
    485          u8x16_t p4567 = simd::Rotate8<12>(p1234, p5678);
    486          m1234 = Morph8<op, u8x16_t>(m1234, p4567);
    487          m1234 = Morph8<op, u8x16_t>(m1234, p5678);
    488        }
    489        p1234 = p5678;
    490      }
    491 
    492      int32_t destIndex = y * aDestStride + 4 * x;
    493      simd::Store8(&aDestData[destIndex], m1234);
    494    }
    495  }
    496 }
    497 
    498 template <typename i16x8_t, typename u8x16_t>
    499 inline void ApplyMorphologyHorizontal_SIMD(
    500    const uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
    501    int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
    502    MorphologyOperator aOp) {
    503  if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
    504    ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
    505        aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
    506  } else {
    507    ApplyMorphologyHorizontal_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t,
    508                                   u8x16_t>(
    509        aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
    510  }
    511 }
    512 
    513 // Set every pixel to the per-component minimum or maximum of the pixels around
    514 // it that are up to aRadius pixels away from it (vertically).
    515 template <MorphologyOperator op, typename i16x8_t, typename u8x16_t>
    516 static void ApplyMorphologyVertical_SIMD(
    517    const uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
    518    int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius) {
    519  static_assert(
    520      op == MORPHOLOGY_OPERATOR_ERODE || op == MORPHOLOGY_OPERATOR_DILATE,
    521      "unexpected morphology operator");
    522 
    523  int32_t startY = aDestRect.Y() - aRadius;
    524  int32_t endY = aDestRect.Y() + aRadius;
    525  for (int32_t y = aDestRect.Y(); y < aDestRect.YMost();
    526       y++, startY++, endY++) {
    527    for (int32_t x = aDestRect.X(); x < aDestRect.XMost(); x += 4) {
    528      int32_t sourceIndex = startY * aSourceStride + 4 * x;
    529      u8x16_t u = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
    530      sourceIndex += aSourceStride;
    531      for (int32_t iy = startY + 1; iy <= endY;
    532           iy++, sourceIndex += aSourceStride) {
    533        u8x16_t u2 = simd::Load8<u8x16_t>(&aSourceData[sourceIndex]);
    534        u = Morph8<op, u8x16_t>(u, u2);
    535      }
    536 
    537      int32_t destIndex = y * aDestStride + 4 * x;
    538      simd::Store8(&aDestData[destIndex], u);
    539    }
    540  }
    541 }
    542 
    543 template <typename i16x8_t, typename u8x16_t>
    544 inline void ApplyMorphologyVertical_SIMD(
    545    const uint8_t* aSourceData, int32_t aSourceStride, uint8_t* aDestData,
    546    int32_t aDestStride, const IntRect& aDestRect, int32_t aRadius,
    547    MorphologyOperator aOp) {
    548  if (aOp == MORPHOLOGY_OPERATOR_ERODE) {
    549    ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_ERODE, i16x8_t, u8x16_t>(
    550        aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
    551  } else {
    552    ApplyMorphologyVertical_SIMD<MORPHOLOGY_OPERATOR_DILATE, i16x8_t, u8x16_t>(
    553        aSourceData, aSourceStride, aDestData, aDestStride, aDestRect, aRadius);
    554  }
    555 }
    556 
    557 template <typename i32x4_t, typename i16x8_t>
    558 static i32x4_t ColorMatrixMultiply(i16x8_t p, i16x8_t rows_bg, i16x8_t rows_ra,
    559                                   const i32x4_t& bias) {
    560  // int16_t p[8] == { b, g, r, a, b, g, r, a }.
    561  // int16_t rows_bg[8] == { bB, bG, bR, bA, gB, gG, gR, gA }.
    562  // int16_t rows_ra[8] == { rB, rG, rR, rA, aB, aG, aR, aA }.
    563  // int32_t bias[4] == { _B, _G, _R, _A }.
    564 
    565  i32x4_t sum = bias;
    566 
    567  // int16_t bg[8] = { b, g, b, g, b, g, b, g };
    568  i16x8_t bg = simd::ShuffleHi16<1, 0, 1, 0>(simd::ShuffleLo16<1, 0, 1, 0>(p));
    569  // int32_t prodsum_bg[4] =
    570  //   { b * bB + g * gB, b * bG + g * gG, b * bR + g * gR, b * bA + g * gA }
    571  i32x4_t prodsum_bg = simd::MulAdd16x8x2To32x4(bg, rows_bg);
    572  sum = simd::Add32(sum, prodsum_bg);
    573 
    574  // uint16_t ra[8] = { r, a, r, a, r, a, r, a };
    575  i16x8_t ra = simd::ShuffleHi16<3, 2, 3, 2>(simd::ShuffleLo16<3, 2, 3, 2>(p));
    576  // int32_t prodsum_ra[4] =
    577  //   { r * rB + a * aB, r * rG + a * aG, r * rR + a * aR, r * rA + a * aA }
    578  i32x4_t prodsum_ra = simd::MulAdd16x8x2To32x4(ra, rows_ra);
    579  sum = simd::Add32(sum, prodsum_ra);
    580 
    581  // int32_t sum[4] == { b * bB + g * gB + r * rB + a * aB + _B, ... }.
    582  return sum;
    583 }
    584 
    585 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
    586 static already_AddRefed<DataSourceSurface> ApplyColorMatrix_SIMD(
    587    DataSourceSurface* aInput, const Matrix5x4& aMatrix) {
    588  IntSize size = aInput->GetSize();
    589  RefPtr<DataSourceSurface> target =
    590      Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
    591  if (!target) {
    592    return nullptr;
    593  }
    594 
    595  DataSourceSurface::ScopedMap inputMap(aInput, DataSourceSurface::READ);
    596  DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
    597 
    598  const uint8_t* sourceData = inputMap.GetData();
    599  uint8_t* targetData = outputMap.GetData();
    600  int32_t sourceStride = inputMap.GetStride();
    601  int32_t targetStride = outputMap.GetStride();
    602 
    603  const int16_t factor = 128;
    604  const Float floatElementMax = INT16_MAX / factor;  // 255
    605  MOZ_ASSERT((floatElementMax * factor) <= INT16_MAX,
    606             "badly chosen float-to-int scale");
    607 
    608  const Float* floats = &aMatrix._11;
    609 
    610  ptrdiff_t componentOffsets[4] = {
    611      B8G8R8A8_COMPONENT_BYTEOFFSET_R, B8G8R8A8_COMPONENT_BYTEOFFSET_G,
    612      B8G8R8A8_COMPONENT_BYTEOFFSET_B, B8G8R8A8_COMPONENT_BYTEOFFSET_A};
    613 
    614  // We store the color matrix in rows_bgra in the following format:
    615  // { bB, bG, bR, bA, gB, gG, gR, gA }.
    616  // { bB, gB, bG, gG, bR, gR, bA, gA }
    617  // The way this is interleaved allows us to use the intrinsic _mm_madd_epi16
    618  // which works especially well for our use case.
    619  int16_t rows_bgra[2][8];
    620  for (size_t rowIndex = 0; rowIndex < 4; rowIndex++) {
    621    for (size_t colIndex = 0; colIndex < 4; colIndex++) {
    622      const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
    623      Float clampedFloatMatrixElement =
    624          std::clamp(floatMatrixElement, -floatElementMax, floatElementMax);
    625      int16_t scaledIntMatrixElement =
    626          int16_t(floorf(clampedFloatMatrixElement * factor + 0.5));
    627      int8_t bg_or_ra = componentOffsets[rowIndex] / 2;
    628      int8_t g_or_a = componentOffsets[rowIndex] % 2;
    629      int8_t B_or_G_or_R_or_A = componentOffsets[colIndex];
    630      rows_bgra[bg_or_ra][B_or_G_or_R_or_A * 2 + g_or_a] =
    631          scaledIntMatrixElement;
    632    }
    633  }
    634 
    635  int32_t rowBias[4];
    636  Float biasMax =
    637      (INT32_MAX - 4 * 255 * INT16_MAX - (factor / 2)) / (factor * 255);
    638  for (size_t colIndex = 0; colIndex < 4; colIndex++) {
    639    size_t rowIndex = 4;
    640    const Float& floatMatrixElement = floats[rowIndex * 4 + colIndex];
    641    Float clampedFloatMatrixElement =
    642        std::clamp(floatMatrixElement, -biasMax, biasMax);
    643    // Add 0.5 before multiplying by factor so that the later bitshift dividing
    644    // by factor is rounding to nearest
    645    Float scaledFloatMatrixElement =
    646        (clampedFloatMatrixElement * 255 + 0.5) * factor;
    647    int32_t scaledIntMatrixElement =
    648        int32_t(floorf(scaledFloatMatrixElement + 0.5));
    649    rowBias[componentOffsets[colIndex]] = scaledIntMatrixElement;
    650  }
    651 
    652  i16x8_t row_bg_v = simd::FromI16<i16x8_t>(
    653      rows_bgra[0][0], rows_bgra[0][1], rows_bgra[0][2], rows_bgra[0][3],
    654      rows_bgra[0][4], rows_bgra[0][5], rows_bgra[0][6], rows_bgra[0][7]);
    655 
    656  i16x8_t row_ra_v = simd::FromI16<i16x8_t>(
    657      rows_bgra[1][0], rows_bgra[1][1], rows_bgra[1][2], rows_bgra[1][3],
    658      rows_bgra[1][4], rows_bgra[1][5], rows_bgra[1][6], rows_bgra[1][7]);
    659 
    660  i32x4_t rowsBias_v =
    661      simd::From32<i32x4_t>(rowBias[0], rowBias[1], rowBias[2], rowBias[3]);
    662 
    663  for (int32_t y = 0; y < size.height; y++) {
    664    for (int32_t x = 0; x < size.width; x += 4) {
    665      MOZ_ASSERT(sourceStride >= 4 * (x + 4),
    666                 "need to be able to read 4 pixels at this position");
    667      MOZ_ASSERT(targetStride >= 4 * (x + 4),
    668                 "need to be able to write 4 pixels at this position");
    669      int32_t sourceIndex = y * sourceStride + 4 * x;
    670      int32_t targetIndex = y * targetStride + 4 * x;
    671 
    672      // We load 4 pixels, unpack them, process them 1 pixel at a time, and
    673      // finally pack and store the 4 result pixels.
    674 
    675      u8x16_t p1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
    676 
    677      // Splat needed to get each pixel twice into i16x8
    678      i16x8_t p11 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<0>(p1234));
    679      i16x8_t p22 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<1>(p1234));
    680      i16x8_t p33 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<2>(p1234));
    681      i16x8_t p44 = simd::UnpackLo8x8ToI16x8(simd::Splat32On8<3>(p1234));
    682 
    683      i32x4_t result_p1 =
    684          ColorMatrixMultiply(p11, row_bg_v, row_ra_v, rowsBias_v);
    685      i32x4_t result_p2 =
    686          ColorMatrixMultiply(p22, row_bg_v, row_ra_v, rowsBias_v);
    687      i32x4_t result_p3 =
    688          ColorMatrixMultiply(p33, row_bg_v, row_ra_v, rowsBias_v);
    689      i32x4_t result_p4 =
    690          ColorMatrixMultiply(p44, row_bg_v, row_ra_v, rowsBias_v);
    691 
    692      static_assert(factor == 1 << 7,
    693                    "Please adapt the calculation in the lines below for a "
    694                    "different factor.");
    695      u8x16_t result_p1234 = simd::PackAndSaturate32To8(
    696          simd::ShiftRight32<7>(result_p1), simd::ShiftRight32<7>(result_p2),
    697          simd::ShiftRight32<7>(result_p3), simd::ShiftRight32<7>(result_p4));
    698      simd::Store8(&targetData[targetIndex], result_p1234);
    699    }
    700  }
    701 
    702  return target.forget();
    703 }
    704 
    705 // source / dest: bgra bgra
    706 // sourceAlpha / destAlpha: aaaa aaaa
    707 // result: bgra bgra
    708 template <typename i32x4_t, typename u16x8_t, uint32_t aCompositeOperator>
    709 static inline u16x8_t CompositeTwoPixels(u16x8_t source, u16x8_t sourceAlpha,
    710                                         u16x8_t dest,
    711                                         const u16x8_t& destAlpha) {
    712  u16x8_t x255 = simd::FromU16<u16x8_t>(255);
    713 
    714  switch (aCompositeOperator) {
    715    case COMPOSITE_OPERATOR_OVER: {
    716      // val = dest * (255 - sourceAlpha) + source * 255;
    717      u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
    718 
    719      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
    720      u16x8_t rightFactor1 =
    721          simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, x255);
    722      i32x4_t result1 =
    723          simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
    724 
    725      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
    726      u16x8_t rightFactor2 =
    727          simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, x255);
    728      i32x4_t result2 =
    729          simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
    730 
    731      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
    732                                          simd::FastDivideBy255(result2));
    733    }
    734 
    735    case COMPOSITE_OPERATOR_IN: {
    736      // val = source * destAlpha;
    737      return simd::FastDivideBy255_16(simd::Mul16(source, destAlpha));
    738    }
    739 
    740    case COMPOSITE_OPERATOR_OUT: {
    741      // val = source * (255 - destAlpha);
    742      u16x8_t prod = simd::Mul16(source, simd::Sub16(x255, destAlpha));
    743      return simd::FastDivideBy255_16(prod);
    744    }
    745 
    746    case COMPOSITE_OPERATOR_ATOP: {
    747      // val = dest * (255 - sourceAlpha) + source * destAlpha;
    748      u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
    749 
    750      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
    751      u16x8_t rightFactor1 =
    752          simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha, destAlpha);
    753      i32x4_t result1 =
    754          simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
    755 
    756      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
    757      u16x8_t rightFactor2 =
    758          simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha, destAlpha);
    759      i32x4_t result2 =
    760          simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
    761 
    762      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
    763                                          simd::FastDivideBy255(result2));
    764    }
    765 
    766    case COMPOSITE_OPERATOR_XOR: {
    767      // val = dest * (255 - sourceAlpha) + source * (255 - destAlpha);
    768      u16x8_t twoFiftyFiveMinusSourceAlpha = simd::Sub16(x255, sourceAlpha);
    769      u16x8_t twoFiftyFiveMinusDestAlpha = simd::Sub16(x255, destAlpha);
    770 
    771      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
    772      u16x8_t rightFactor1 = simd::InterleaveLo16(twoFiftyFiveMinusSourceAlpha,
    773                                                  twoFiftyFiveMinusDestAlpha);
    774      i32x4_t result1 =
    775          simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
    776 
    777      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
    778      u16x8_t rightFactor2 = simd::InterleaveHi16(twoFiftyFiveMinusSourceAlpha,
    779                                                  twoFiftyFiveMinusDestAlpha);
    780      i32x4_t result2 =
    781          simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
    782 
    783      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
    784                                          simd::FastDivideBy255(result2));
    785    }
    786 
    787    case COMPOSITE_OPERATOR_LIGHTER: {
    788      // val = dest * sourceAlpha + source * destAlpha;
    789      u16x8_t destSourceInterleaved1 = simd::InterleaveLo16(dest, source);
    790      u16x8_t rightFactor1 = simd::InterleaveLo16(sourceAlpha, destAlpha);
    791      i32x4_t result1 =
    792          simd::MulAdd16x8x2To32x4(destSourceInterleaved1, rightFactor1);
    793 
    794      u16x8_t destSourceInterleaved2 = simd::InterleaveHi16(dest, source);
    795      u16x8_t rightFactor2 = simd::InterleaveHi16(sourceAlpha, destAlpha);
    796      i32x4_t result2 =
    797          simd::MulAdd16x8x2To32x4(destSourceInterleaved2, rightFactor2);
    798 
    799      return simd::PackAndSaturate32ToU16(simd::FastDivideBy255(result1),
    800                                          simd::FastDivideBy255(result2));
    801    }
    802 
    803    default:
    804      return simd::FromU16<u16x8_t>(0);
    805  }
    806 }
    807 
    808 template <typename i32x4_t, typename u16x8_t, typename u8x16_t, uint32_t op>
    809 static void ApplyComposition(DataSourceSurface* aSource,
    810                             DataSourceSurface* aDest) {
    811  IntSize size = aDest->GetSize();
    812 
    813  DataSourceSurface::ScopedMap input(aSource, DataSourceSurface::READ);
    814  DataSourceSurface::ScopedMap output(aDest, DataSourceSurface::READ_WRITE);
    815 
    816  const uint8_t* sourceData = input.GetData();
    817  uint8_t* destData = output.GetData();
    818  uint32_t sourceStride = input.GetStride();
    819  uint32_t destStride = output.GetStride();
    820 
    821  for (int32_t y = 0; y < size.height; y++) {
    822    for (int32_t x = 0; x < size.width; x += 4) {
    823      uint32_t sourceIndex = y * sourceStride + 4 * x;
    824      uint32_t destIndex = y * destStride + 4 * x;
    825 
    826      u8x16_t s1234 = simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
    827      u8x16_t d1234 = simd::Load8<u8x16_t>(&destData[destIndex]);
    828 
    829      u16x8_t s12 = simd::UnpackLo8x8ToU16x8(s1234);
    830      u16x8_t d12 = simd::UnpackLo8x8ToU16x8(d1234);
    831      u16x8_t sa12 = simd::Splat16<3, 3>(s12);
    832      u16x8_t da12 = simd::Splat16<3, 3>(d12);
    833      u16x8_t result12 =
    834          CompositeTwoPixels<i32x4_t, u16x8_t, op>(s12, sa12, d12, da12);
    835 
    836      u16x8_t s34 = simd::UnpackHi8x8ToU16x8(s1234);
    837      u16x8_t d34 = simd::UnpackHi8x8ToU16x8(d1234);
    838      u16x8_t sa34 = simd::Splat16<3, 3>(s34);
    839      u16x8_t da34 = simd::Splat16<3, 3>(d34);
    840      u16x8_t result34 =
    841          CompositeTwoPixels<i32x4_t, u16x8_t, op>(s34, sa34, d34, da34);
    842 
    843      u8x16_t result1234 = simd::PackAndSaturate16To8(result12, result34);
    844      simd::Store8(&destData[destIndex], result1234);
    845    }
    846  }
    847 }
    848 
    849 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
    850 static void ApplyComposition_SIMD(DataSourceSurface* aSource,
    851                                  DataSourceSurface* aDest,
    852                                  CompositeOperator aOperator) {
    853  switch (aOperator) {
    854    case COMPOSITE_OPERATOR_OVER:
    855      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OVER>(
    856          aSource, aDest);
    857      break;
    858    case COMPOSITE_OPERATOR_IN:
    859      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_IN>(
    860          aSource, aDest);
    861      break;
    862    case COMPOSITE_OPERATOR_OUT:
    863      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_OUT>(
    864          aSource, aDest);
    865      break;
    866    case COMPOSITE_OPERATOR_ATOP:
    867      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_ATOP>(
    868          aSource, aDest);
    869      break;
    870    case COMPOSITE_OPERATOR_XOR:
    871      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_XOR>(
    872          aSource, aDest);
    873      break;
    874    case COMPOSITE_OPERATOR_LIGHTER:
    875      ApplyComposition<i32x4_t, i16x8_t, u8x16_t, COMPOSITE_OPERATOR_LIGHTER>(
    876          aSource, aDest);
    877      break;
    878    default:
    879      MOZ_CRASH("GFX: Incomplete switch");
    880  }
    881 }
    882 
    883 template <typename u8x16_t>
    884 static void SeparateColorChannels_SIMD(
    885    const IntSize& size, const uint8_t* sourceData, int32_t sourceStride,
    886    uint8_t* channel0Data, uint8_t* channel1Data, uint8_t* channel2Data,
    887    uint8_t* channel3Data, int32_t channelStride) {
    888  for (int32_t y = 0; y < size.height; y++) {
    889    for (int32_t x = 0; x < size.width; x += 16) {
    890      // Process 16 pixels at a time.
    891      int32_t sourceIndex = y * sourceStride + 4 * x;
    892      int32_t targetIndex = y * channelStride + x;
    893 
    894      u8x16_t bgrabgrabgrabgra2 = simd::FromZero8<u8x16_t>();
    895      u8x16_t bgrabgrabgrabgra3 = simd::FromZero8<u8x16_t>();
    896      u8x16_t bgrabgrabgrabgra4 = simd::FromZero8<u8x16_t>();
    897 
    898      u8x16_t bgrabgrabgrabgra1 =
    899          simd::Load8<u8x16_t>(&sourceData[sourceIndex]);
    900      if (4 * (x + 4) < sourceStride) {
    901        bgrabgrabgrabgra2 =
    902            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 4]);
    903      }
    904      if (4 * (x + 8) < sourceStride) {
    905        bgrabgrabgrabgra3 =
    906            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 8]);
    907      }
    908      if (4 * (x + 12) < sourceStride) {
    909        bgrabgrabgrabgra4 =
    910            simd::Load8<u8x16_t>(&sourceData[sourceIndex + 4 * 12]);
    911      }
    912 
    913      u8x16_t bbggrraabbggrraa1 =
    914          simd::InterleaveLo8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
    915      u8x16_t bbggrraabbggrraa2 =
    916          simd::InterleaveHi8(bgrabgrabgrabgra1, bgrabgrabgrabgra3);
    917      u8x16_t bbggrraabbggrraa3 =
    918          simd::InterleaveLo8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
    919      u8x16_t bbggrraabbggrraa4 =
    920          simd::InterleaveHi8(bgrabgrabgrabgra2, bgrabgrabgrabgra4);
    921      u8x16_t bbbbggggrrrraaaa1 =
    922          simd::InterleaveLo8(bbggrraabbggrraa1, bbggrraabbggrraa3);
    923      u8x16_t bbbbggggrrrraaaa2 =
    924          simd::InterleaveHi8(bbggrraabbggrraa1, bbggrraabbggrraa3);
    925      u8x16_t bbbbggggrrrraaaa3 =
    926          simd::InterleaveLo8(bbggrraabbggrraa2, bbggrraabbggrraa4);
    927      u8x16_t bbbbggggrrrraaaa4 =
    928          simd::InterleaveHi8(bbggrraabbggrraa2, bbggrraabbggrraa4);
    929      u8x16_t bbbbbbbbgggggggg1 =
    930          simd::InterleaveLo8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
    931      u8x16_t rrrrrrrraaaaaaaa1 =
    932          simd::InterleaveHi8(bbbbggggrrrraaaa1, bbbbggggrrrraaaa3);
    933      u8x16_t bbbbbbbbgggggggg2 =
    934          simd::InterleaveLo8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
    935      u8x16_t rrrrrrrraaaaaaaa2 =
    936          simd::InterleaveHi8(bbbbggggrrrraaaa2, bbbbggggrrrraaaa4);
    937      u8x16_t bbbbbbbbbbbbbbbb =
    938          simd::InterleaveLo8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
    939      u8x16_t gggggggggggggggg =
    940          simd::InterleaveHi8(bbbbbbbbgggggggg1, bbbbbbbbgggggggg2);
    941      u8x16_t rrrrrrrrrrrrrrrr =
    942          simd::InterleaveLo8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
    943      u8x16_t aaaaaaaaaaaaaaaa =
    944          simd::InterleaveHi8(rrrrrrrraaaaaaaa1, rrrrrrrraaaaaaaa2);
    945 
    946      simd::Store8(&channel0Data[targetIndex], bbbbbbbbbbbbbbbb);
    947      simd::Store8(&channel1Data[targetIndex], gggggggggggggggg);
    948      simd::Store8(&channel2Data[targetIndex], rrrrrrrrrrrrrrrr);
    949      simd::Store8(&channel3Data[targetIndex], aaaaaaaaaaaaaaaa);
    950    }
    951  }
    952 }
    953 
    954 template <typename u8x16_t>
    955 static void CombineColorChannels_SIMD(
    956    const IntSize& size, int32_t resultStride, uint8_t* resultData,
    957    int32_t channelStride, uint8_t* channel0Data, const uint8_t* channel1Data,
    958    const uint8_t* channel2Data, const uint8_t* channel3Data) {
    959  for (int32_t y = 0; y < size.height; y++) {
    960    for (int32_t x = 0; x < size.width; x += 16) {
    961      // Process 16 pixels at a time.
    962      int32_t resultIndex = y * resultStride + 4 * x;
    963      int32_t channelIndex = y * channelStride + x;
    964 
    965      u8x16_t bbbbbbbbbbbbbbbb =
    966          simd::Load8<u8x16_t>(&channel0Data[channelIndex]);
    967      u8x16_t gggggggggggggggg =
    968          simd::Load8<u8x16_t>(&channel1Data[channelIndex]);
    969      u8x16_t rrrrrrrrrrrrrrrr =
    970          simd::Load8<u8x16_t>(&channel2Data[channelIndex]);
    971      u8x16_t aaaaaaaaaaaaaaaa =
    972          simd::Load8<u8x16_t>(&channel3Data[channelIndex]);
    973 
    974      u8x16_t brbrbrbrbrbrbrbr1 =
    975          simd::InterleaveLo8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
    976      u8x16_t brbrbrbrbrbrbrbr2 =
    977          simd::InterleaveHi8(bbbbbbbbbbbbbbbb, rrrrrrrrrrrrrrrr);
    978      u8x16_t gagagagagagagaga1 =
    979          simd::InterleaveLo8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
    980      u8x16_t gagagagagagagaga2 =
    981          simd::InterleaveHi8(gggggggggggggggg, aaaaaaaaaaaaaaaa);
    982 
    983      u8x16_t bgrabgrabgrabgra1 =
    984          simd::InterleaveLo8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
    985      u8x16_t bgrabgrabgrabgra2 =
    986          simd::InterleaveHi8(brbrbrbrbrbrbrbr1, gagagagagagagaga1);
    987      u8x16_t bgrabgrabgrabgra3 =
    988          simd::InterleaveLo8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
    989      u8x16_t bgrabgrabgrabgra4 =
    990          simd::InterleaveHi8(brbrbrbrbrbrbrbr2, gagagagagagagaga2);
    991 
    992      simd::Store8(&resultData[resultIndex], bgrabgrabgrabgra1);
    993      if (4 * (x + 4) < resultStride) {
    994        simd::Store8(&resultData[resultIndex + 4 * 4], bgrabgrabgrabgra2);
    995      }
    996      if (4 * (x + 8) < resultStride) {
    997        simd::Store8(&resultData[resultIndex + 8 * 4], bgrabgrabgrabgra3);
    998      }
    999      if (4 * (x + 12) < resultStride) {
   1000        simd::Store8(&resultData[resultIndex + 12 * 4], bgrabgrabgrabgra4);
   1001      }
   1002    }
   1003  }
   1004 }
   1005 
   1006 template <typename u16x8_t, typename u8x16_t>
   1007 static void DoOpacityCalculation_SIMD(const IntSize& aSize,
   1008                                      uint8_t* aTargetData,
   1009                                      int32_t aTargetStride,
   1010                                      const uint8_t* aSourceData,
   1011                                      int32_t aSourceStride, Float aOpacity) {
   1012  uint8_t alphaValue = uint8_t(roundf(255.f * aOpacity));
   1013  u16x8_t alphaValues =
   1014      simd::FromU16<u16x8_t>(alphaValue, alphaValue, alphaValue, alphaValue,
   1015                             alphaValue, alphaValue, alphaValue, alphaValue);
   1016  for (int32_t y = 0; y < aSize.height; y++) {
   1017    for (int32_t x = 0; x < aSize.width; x += 4) {
   1018      int32_t inputIndex = y * aSourceStride + 4 * x;
   1019      int32_t targetIndex = y * aTargetStride + 4 * x;
   1020 
   1021      u8x16_t p1234 = simd::Load8<u8x16_t>(&aSourceData[inputIndex]);
   1022      u16x8_t p12 = simd::UnpackLo8x8ToU16x8(p1234);
   1023      u16x8_t p34 = simd::UnpackHi8x8ToU16x8(p1234);
   1024 
   1025      // Multiply all components with alpha.
   1026      p12 = simd::Mul16(p12, alphaValues);
   1027      p34 = simd::Mul16(p34, alphaValues);
   1028 
   1029      // Divide by 255 and pack.
   1030      u8x16_t result = simd::PackAndSaturate16To8(simd::ShiftRight16<8>(p12),
   1031                                                  simd::ShiftRight16<8>(p34));
   1032 
   1033      simd::Store8(&aTargetData[targetIndex], result);
   1034    }
   1035  }
   1036 }
   1037 
   1038 template <typename f32x4_t, typename i32x4_t, typename u8x16_t>
   1039 static already_AddRefed<DataSourceSurface> RenderTurbulence_SIMD(
   1040    const IntSize& aSize, const Point& aOffset, const Size& aBaseFrequency,
   1041    int32_t aSeed, int aNumOctaves, TurbulenceType aType, bool aStitch,
   1042    const Rect& aTileRect) {
   1043 #define RETURN_TURBULENCE(Type, Stitch)                                    \
   1044  SVGTurbulenceRenderer<Type, Stitch, f32x4_t, i32x4_t, u8x16_t> renderer( \
   1045      aBaseFrequency, aSeed, aNumOctaves, aTileRect);                      \
   1046  return renderer.Render(aSize, aOffset);
   1047 
   1048  switch (aType) {
   1049    case TURBULENCE_TYPE_TURBULENCE: {
   1050      if (aStitch) {
   1051        RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, true);
   1052      }
   1053      RETURN_TURBULENCE(TURBULENCE_TYPE_TURBULENCE, false);
   1054    }
   1055    case TURBULENCE_TYPE_FRACTAL_NOISE: {
   1056      if (aStitch) {
   1057        RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, true);
   1058      }
   1059      RETURN_TURBULENCE(TURBULENCE_TYPE_FRACTAL_NOISE, false);
   1060    }
   1061  }
   1062  return nullptr;
   1063 #undef RETURN_TURBULENCE
   1064 }
   1065 
   1066 // k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
   1067 template <typename i32x4_t, typename i16x8_t>
   1068 static MOZ_ALWAYS_INLINE i16x8_t ArithmeticCombineTwoPixels(
   1069    i16x8_t in1, i16x8_t in2, const i16x8_t& k1And4, const i16x8_t& k2And3) {
   1070  // Calculate input product: inProd = (in1 * in2) / 255.
   1071  i32x4_t inProd_1, inProd_2;
   1072  simd::Mul16x4x2x2To32x4x2(in1, in2, inProd_1, inProd_2);
   1073  i16x8_t inProd = simd::PackAndSaturate32To16(simd::FastDivideBy255(inProd_1),
   1074                                               simd::FastDivideBy255(inProd_2));
   1075 
   1076  // Calculate k1 * ((in1 * in2) / 255) + (k4/128) * 128
   1077  i16x8_t oneTwentyEight = simd::FromI16<i16x8_t>(128);
   1078  i16x8_t inProd1AndOneTwentyEight =
   1079      simd::InterleaveLo16(inProd, oneTwentyEight);
   1080  i16x8_t inProd2AndOneTwentyEight =
   1081      simd::InterleaveHi16(inProd, oneTwentyEight);
   1082  i32x4_t inProdTimesK1PlusK4_1 =
   1083      simd::MulAdd16x8x2To32x4(k1And4, inProd1AndOneTwentyEight);
   1084  i32x4_t inProdTimesK1PlusK4_2 =
   1085      simd::MulAdd16x8x2To32x4(k1And4, inProd2AndOneTwentyEight);
   1086 
   1087  // Calculate k2 * in1 + k3 * in2
   1088  i16x8_t in12_1 = simd::InterleaveLo16(in1, in2);
   1089  i16x8_t in12_2 = simd::InterleaveHi16(in1, in2);
   1090  i32x4_t inTimesK2K3_1 = simd::MulAdd16x8x2To32x4(k2And3, in12_1);
   1091  i32x4_t inTimesK2K3_2 = simd::MulAdd16x8x2To32x4(k2And3, in12_2);
   1092 
   1093  // Sum everything up and truncate the fractional part.
   1094  i32x4_t result_1 =
   1095      simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_1, inTimesK2K3_1));
   1096  i32x4_t result_2 =
   1097      simd::ShiftRight32<7>(simd::Add32(inProdTimesK1PlusK4_2, inTimesK2K3_2));
   1098  return simd::PackAndSaturate32To16(result_1, result_2);
   1099 }
   1100 
   1101 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
   1102 static void ApplyArithmeticCombine_SIMD(
   1103    const DataSourceSurface::ScopedMap& aInputMap1,
   1104    const DataSourceSurface::ScopedMap& aInputMap2,
   1105    const DataSourceSurface::ScopedMap& aOutputMap, const IntSize& aSize,
   1106    Float aK1, Float aK2, Float aK3, Float aK4) {
   1107  const uint8_t* source1Data = aInputMap1.GetData();
   1108  const uint8_t* source2Data = aInputMap2.GetData();
   1109  uint8_t* targetData = aOutputMap.GetData();
   1110  uint32_t source1Stride = aInputMap1.GetStride();
   1111  uint32_t source2Stride = aInputMap2.GetStride();
   1112  uint32_t targetStride = aOutputMap.GetStride();
   1113 
   1114  // The arithmetic combine filter does the following calculation:
   1115  // result = k1 * in1 * in2 + k2 * in1 + k3 * in2 + k4
   1116  //
   1117  // Or, with in1/2 integers between 0 and 255:
   1118  // result = (k1 * in1 * in2) / 255 + k2 * in1 + k3 * in2 + k4 * 255
   1119  //
   1120  // We want the whole calculation to happen in integer, with 16-bit factors.
   1121  // So we convert our factors to fixed-point with precision 1.8.7.
   1122  // K4 is premultiplied with 255, and it will be multiplied with 128 later
   1123  // during the actual calculation, because premultiplying it with 255 * 128
   1124  // would overflow int16.
   1125 
   1126  i16x8_t k1 = simd::FromI16<i16x8_t>(
   1127      int16_t(floorf(std::clamp(aK1, -255.0f, 255.0f) * 128 + 0.5f)));
   1128  i16x8_t k2 = simd::FromI16<i16x8_t>(
   1129      int16_t(floorf(std::clamp(aK2, -255.0f, 255.0f) * 128 + 0.5f)));
   1130  i16x8_t k3 = simd::FromI16<i16x8_t>(
   1131      int16_t(floorf(std::clamp(aK3, -255.0f, 255.0f) * 128 + 0.5f)));
   1132  i16x8_t k4 = simd::FromI16<i16x8_t>(
   1133      int16_t(floorf(std::clamp(aK4, -128.0f, 128.0f) * 255 + 0.5f)));
   1134 
   1135  i16x8_t k1And4 = simd::InterleaveLo16(k1, k4);
   1136  i16x8_t k2And3 = simd::InterleaveLo16(k2, k3);
   1137 
   1138  for (int32_t y = 0; y < aSize.height; y++) {
   1139    for (int32_t x = 0; x < aSize.width; x += 4) {
   1140      uint32_t source1Index = y * source1Stride + 4 * x;
   1141      uint32_t source2Index = y * source2Stride + 4 * x;
   1142      uint32_t targetIndex = y * targetStride + 4 * x;
   1143 
   1144      // Load and unpack.
   1145      u8x16_t in1 = simd::Load8<u8x16_t>(&source1Data[source1Index]);
   1146      u8x16_t in2 = simd::Load8<u8x16_t>(&source2Data[source2Index]);
   1147      i16x8_t in1_12 = simd::UnpackLo8x8ToI16x8(in1);
   1148      i16x8_t in1_34 = simd::UnpackHi8x8ToI16x8(in1);
   1149      i16x8_t in2_12 = simd::UnpackLo8x8ToI16x8(in2);
   1150      i16x8_t in2_34 = simd::UnpackHi8x8ToI16x8(in2);
   1151 
   1152      // Multiply and add.
   1153      i16x8_t result_12 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
   1154          in1_12, in2_12, k1And4, k2And3);
   1155      i16x8_t result_34 = ArithmeticCombineTwoPixels<i32x4_t, i16x8_t>(
   1156          in1_34, in2_34, k1And4, k2And3);
   1157 
   1158      // Pack and store.
   1159      simd::Store8(&targetData[targetIndex],
   1160                   simd::PackAndSaturate16To8(result_12, result_34));
   1161    }
   1162  }
   1163 }
   1164 
   1165 template <typename i32x4_t, typename i16x8_t, typename u8x16_t>
   1166 static already_AddRefed<DataSourceSurface> ApplyArithmeticCombine_SIMD(
   1167    DataSourceSurface* aInput1, DataSourceSurface* aInput2, Float aK1,
   1168    Float aK2, Float aK3, Float aK4) {
   1169  IntSize size = aInput1->GetSize();
   1170  RefPtr<DataSourceSurface> target =
   1171      Factory::CreateDataSourceSurface(size, SurfaceFormat::B8G8R8A8);
   1172  if (!target) {
   1173    return nullptr;
   1174  }
   1175 
   1176  DataSourceSurface::ScopedMap inputMap1(aInput1, DataSourceSurface::READ);
   1177  DataSourceSurface::ScopedMap outputMap(target, DataSourceSurface::READ_WRITE);
   1178 
   1179  if (aInput1->Equals(aInput2)) {
   1180    ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
   1181        inputMap1, inputMap1, outputMap, size, aK1, aK2, aK3, aK4);
   1182  } else {
   1183    DataSourceSurface::ScopedMap inputMap2(aInput2, DataSourceSurface::READ);
   1184    ApplyArithmeticCombine_SIMD<i32x4_t, i16x8_t, u8x16_t>(
   1185        inputMap1, inputMap2, outputMap, size, aK1, aK2, aK3, aK4);
   1186  }
   1187 
   1188  return target.forget();
   1189 }
   1190 
   1191 }  // namespace gfx
   1192 }  // namespace mozilla