tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

Swizzle.cpp (64516B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
      7 #include "Swizzle.h"
      8 #include "Logging.h"
      9 #include "Orientation.h"
     10 #include "Tools.h"
     11 #include "mozilla/CheckedInt.h"
     12 #include "mozilla/EndianUtils.h"
     13 #include "mozilla/UniquePtr.h"
     14 
     15 #ifdef USE_SSE2
     16 #  include "mozilla/SSE.h"
     17 #endif
     18 
     19 #ifdef USE_NEON
     20 #  include "mozilla/arm.h"
     21 #endif
     22 
     23 #include <new>
     24 
     25 namespace mozilla {
     26 namespace gfx {
     27 
/**
* Convenience macros for dispatching to various format combinations.
*/

// Hash the formats to a relatively dense value to optimize jump table
// generation. The first 6 formats in SurfaceFormat are the 32-bit BGRA variants
// and are the most common formats dispatched here. Room is reserved in the
// lowish bits for up to these 6 destination formats. If a destination format is
// >= 6, the 6th bit is set to avoid collisions.
#define FORMAT_KEY(aSrcFormat, aDstFormat) \
 (int(aSrcFormat) * 6 + int(aDstFormat) + (int(int(aDstFormat) >= 6) << 6))

// Emits a switch case for the format pair that evaluates the given
// expression and then returns true to signal the dispatch succeeded.
#define FORMAT_CASE_EXPR(aSrcFormat, aDstFormat, ...) \
 case FORMAT_KEY(aSrcFormat, aDstFormat):            \
   __VA_ARGS__;                                      \
   return true;

// Like FORMAT_CASE_EXPR, but wraps the callee in FORMAT_CASE_CALL, which
// each dispatcher function #defines locally to supply its own arguments.
#define FORMAT_CASE(aSrcFormat, aDstFormat, ...) \
 FORMAT_CASE_EXPR(aSrcFormat, aDstFormat, FORMAT_CASE_CALL(__VA_ARGS__))

// Emits a switch case that returns the address of the given row function,
// used by the *Row dispatchers that return a SwizzleRowFn.
#define FORMAT_CASE_ROW(aSrcFormat, aDstFormat, ...) \
 case FORMAT_KEY(aSrcFormat, aDstFormat):           \
   return &__VA_ARGS__;
     51 
     52 /**
     53 * Constexpr functions for analyzing format attributes in templates.
     54 */
     55 
// Whether B comes before R in pixel memory layout.
// R5G6B5 only counts as BGR on little-endian targets, where the blue field
// occupies the least-significant (first) bits in memory.
static constexpr bool IsBGRFormat(SurfaceFormat aFormat) {
 return aFormat == SurfaceFormat::B8G8R8A8 ||
#if MOZ_LITTLE_ENDIAN()
        aFormat == SurfaceFormat::R5G6B5_UINT16 ||
#endif
        aFormat == SurfaceFormat::B8G8R8X8 || aFormat == SurfaceFormat::B8G8R8;
}
     64 
     65 // Whether the order of B and R need to be swapped to map from src to dst.
     66 static constexpr bool ShouldSwapRB(SurfaceFormat aSrcFormat,
     67                                   SurfaceFormat aDstFormat) {
     68  return IsBGRFormat(aSrcFormat) != IsBGRFormat(aDstFormat);
     69 }
     70 
     71 // The starting byte of the RGB components in pixel memory.
     72 static constexpr uint32_t RGBByteIndex(SurfaceFormat aFormat) {
     73  return aFormat == SurfaceFormat::A8R8G8B8 ||
     74                 aFormat == SurfaceFormat::X8R8G8B8
     75             ? 1
     76             : 0;
     77 }
     78 
     79 // The byte of the alpha component, which just comes after RGB.
     80 static constexpr uint32_t AlphaByteIndex(SurfaceFormat aFormat) {
     81  return (RGBByteIndex(aFormat) + 3) % 4;
     82 }
     83 
// The endian-dependent bit shift to access RGB of a UINT32 pixel.
static constexpr uint32_t RGBBitShift(SurfaceFormat aFormat) {
#if MOZ_LITTLE_ENDIAN()
 // Little-endian: byte index N lands at bit 8*N of the loaded uint32_t.
 return 8 * RGBByteIndex(aFormat);
#else
 // Big-endian: byte 0 holds the most-significant bits, so an RGB start at
 // byte 0 leaves RGB in the upper 24 bits (shift 8), while a start at
 // byte 1 puts RGB in the lower 24 bits (shift 0).
 return 8 - 8 * RGBByteIndex(aFormat);
#endif
}
     92 
     93 // The endian-dependent bit shift to access alpha of a UINT32 pixel.
     94 static constexpr uint32_t AlphaBitShift(SurfaceFormat aFormat) {
     95  return (RGBBitShift(aFormat) + 24) % 32;
     96 }
     97 
     98 // Whether the pixel format should ignore the value of the alpha channel and
     99 // treat it as opaque.
    100 static constexpr bool IgnoreAlpha(SurfaceFormat aFormat) {
    101  return aFormat == SurfaceFormat::B8G8R8X8 ||
    102         aFormat == SurfaceFormat::R8G8B8X8 ||
    103         aFormat == SurfaceFormat::X8R8G8B8;
    104 }
    105 
    106 // Whether to force alpha to opaque to map from src to dst.
    107 static constexpr bool ShouldForceOpaque(SurfaceFormat aSrcFormat,
    108                                        SurfaceFormat aDstFormat) {
    109  return IgnoreAlpha(aSrcFormat) != IgnoreAlpha(aDstFormat);
    110 }
    111 
#ifdef USE_SSE2
/**
* SSE2 optimizations
*/

// Forward declaration of the SSE2 whole-surface premultiply (defined
// elsewhere).
template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

// Case dispatching to the Premultiply_SSE2 instantiation matching the
// given src/dst format pair.
#  define PREMULTIPLY_SSE2(aSrcFormat, aDstFormat)                     \
   FORMAT_CASE(aSrcFormat, aDstFormat,                                \
               Premultiply_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
                                ShouldForceOpaque(aSrcFormat, aDstFormat)>)

// Forward declaration of the SSE2 single-row premultiply.
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_SSE2(const uint8_t*, uint8_t*, int32_t);

// Case returning a pointer to the matching PremultiplyRow_SSE2
// instantiation.
#  define PREMULTIPLY_ROW_SSE2(aSrcFormat, aDstFormat)            \
   FORMAT_CASE_ROW(                                              \
       aSrcFormat, aDstFormat,                                   \
       PremultiplyRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
                           ShouldForceOpaque(aSrcFormat, aDstFormat)>)

// Forward declaration of the SSE2 whole-surface unpremultiply.
template <bool aSwapRB>
void Unpremultiply_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

#  define UNPREMULTIPLY_SSE2(aSrcFormat, aDstFormat) \
   FORMAT_CASE(aSrcFormat, aDstFormat,              \
               Unpremultiply_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat)>)

// Forward declaration of the SSE2 single-row unpremultiply.
template <bool aSwapRB>
void UnpremultiplyRow_SSE2(const uint8_t*, uint8_t*, int32_t);

#  define UNPREMULTIPLY_ROW_SSE2(aSrcFormat, aDstFormat) \
   FORMAT_CASE_ROW(                                     \
       aSrcFormat, aDstFormat,                          \
       UnpremultiplyRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat)>)

// Forward declaration of the SSE2 whole-surface swizzle.
template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_SSE2(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

#  define SWIZZLE_SSE2(aSrcFormat, aDstFormat)                     \
   FORMAT_CASE(aSrcFormat, aDstFormat,                            \
               Swizzle_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
                            ShouldForceOpaque(aSrcFormat, aDstFormat)>)

// Forward declaration of the SSE2 single-row swizzle.
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_SSE2(const uint8_t*, uint8_t*, int32_t);

#  define SWIZZLE_ROW_SSE2(aSrcFormat, aDstFormat)            \
   FORMAT_CASE_ROW(                                          \
       aSrcFormat, aDstFormat,                               \
       SwizzleRow_SSE2<ShouldSwapRB(aSrcFormat, aDstFormat), \
                       ShouldForceOpaque(aSrcFormat, aDstFormat)>)

// Row unpacker used with SurfaceFormat::R8G8B8 sources (see the macro
// below) — SSSE3 variant.
template <bool aSwapRB>
void UnpackRowRGB24_SSSE3(const uint8_t*, uint8_t*, int32_t);

#  define UNPACK_ROW_RGB_SSSE3(aDstFormat) \
   FORMAT_CASE_ROW(                       \
       SurfaceFormat::R8G8B8, aDstFormat, \
       UnpackRowRGB24_SSSE3<ShouldSwapRB(SurfaceFormat::R8G8B8, aDstFormat)>)

// Row unpacker used with SurfaceFormat::R8G8B8 sources — AVX2 variant.
template <bool aSwapRB>
void UnpackRowRGB24_AVX2(const uint8_t*, uint8_t*, int32_t);

#  define UNPACK_ROW_RGB_AVX2(aDstFormat)  \
   FORMAT_CASE_ROW(                       \
       SurfaceFormat::R8G8B8, aDstFormat, \
       UnpackRowRGB24_AVX2<ShouldSwapRB(SurfaceFormat::R8G8B8, aDstFormat)>)

#endif
    183 
#ifdef USE_NEON
/**
* ARM NEON optimizations
*/

// Forward declaration of the NEON whole-surface premultiply (defined
// elsewhere).
template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

// Case dispatching to the Premultiply_NEON instantiation matching the
// given src/dst format pair.
#  define PREMULTIPLY_NEON(aSrcFormat, aDstFormat)                     \
   FORMAT_CASE(aSrcFormat, aDstFormat,                                \
               Premultiply_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
                                ShouldForceOpaque(aSrcFormat, aDstFormat)>)

// Forward declaration of the NEON single-row premultiply.
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_NEON(const uint8_t*, uint8_t*, int32_t);

// Case returning a pointer to the matching PremultiplyRow_NEON
// instantiation.
#  define PREMULTIPLY_ROW_NEON(aSrcFormat, aDstFormat)            \
   FORMAT_CASE_ROW(                                              \
       aSrcFormat, aDstFormat,                                   \
       PremultiplyRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
                           ShouldForceOpaque(aSrcFormat, aDstFormat)>)

// Forward declaration of the NEON whole-surface unpremultiply.
template <bool aSwapRB>
void Unpremultiply_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

#  define UNPREMULTIPLY_NEON(aSrcFormat, aDstFormat) \
   FORMAT_CASE(aSrcFormat, aDstFormat,              \
               Unpremultiply_NEON<ShouldSwapRB(aSrcFormat, aDstFormat)>)

// Forward declaration of the NEON single-row unpremultiply.
template <bool aSwapRB>
void UnpremultiplyRow_NEON(const uint8_t*, uint8_t*, int32_t);

#  define UNPREMULTIPLY_ROW_NEON(aSrcFormat, aDstFormat) \
   FORMAT_CASE_ROW(                                     \
       aSrcFormat, aDstFormat,                          \
       UnpremultiplyRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat)>)

// Forward declaration of the NEON whole-surface swizzle.
template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_NEON(const uint8_t*, int32_t, uint8_t*, int32_t, IntSize);

#  define SWIZZLE_NEON(aSrcFormat, aDstFormat)                     \
   FORMAT_CASE(aSrcFormat, aDstFormat,                            \
               Swizzle_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
                            ShouldForceOpaque(aSrcFormat, aDstFormat)>)

// Forward declaration of the NEON single-row swizzle.
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_NEON(const uint8_t*, uint8_t*, int32_t);

#  define SWIZZLE_ROW_NEON(aSrcFormat, aDstFormat)            \
   FORMAT_CASE_ROW(                                          \
       aSrcFormat, aDstFormat,                               \
       SwizzleRow_NEON<ShouldSwapRB(aSrcFormat, aDstFormat), \
                       ShouldForceOpaque(aSrcFormat, aDstFormat)>)

// Row unpacker used with SurfaceFormat::R8G8B8 sources (see the macro
// below).
template <bool aSwapRB>
void UnpackRowRGB24_NEON(const uint8_t*, uint8_t*, int32_t);

#  define UNPACK_ROW_RGB_NEON(aDstFormat)  \
   FORMAT_CASE_ROW(                       \
       SurfaceFormat::R8G8B8, aDstFormat, \
       UnpackRowRGB24_NEON<ShouldSwapRB(SurfaceFormat::R8G8B8, aDstFormat)>)
#endif
    246 
    247 /**
    248 * Premultiplying
    249 */
    250 
// Fallback premultiply implementation that uses splayed pixel math to reduce
// the multiplications used. That is, the R and B components are isolated from
// the G and A components, which then can be multiplied as if they were two
// 2-component vectors. Otherwise, an approximation of divide-by-255 is used
// which is faster than an actual division. These optimizations are also used
// for the SSE2 and NEON implementations.
//
// Note: aSrc and aDst are taken by reference and are advanced past the
// processed pixels, so callers can chain chunks. The do-while loop runs at
// least once, so aLength must be >= 1.
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
         uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void PremultiplyChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
                                    int32_t aLength) {
 const uint8_t* end = aSrc + 4 * aLength;
 do {
   // Load and process 1 entire pixel at a time.
   uint32_t color = *reinterpret_cast<const uint32_t*>(aSrc);

   // Extract alpha: it is either the top byte (shift 24) or, when the
   // alpha shift is 0, the bottom byte.
   uint32_t a = aSrcAShift ? color >> aSrcAShift : color & 0xFF;

   // Isolate the R and B components.
   uint32_t rb = (color >> aSrcRGBShift) & 0x00FF00FF;
   // Swap the order of R and B if necessary.
   if (aSwapRB) {
     rb = (rb >> 16) | (rb << 16);
   }
   // Approximate the multiply by alpha and divide by 255 which is
   // essentially:
   // c = c*a + 255; c = (c + (c >> 8)) >> 8;
   // However, we omit the final >> 8 to fold it with the final shift into
   // place depending on desired output format.
   rb = rb * a + 0x00FF00FF;
   rb = (rb + ((rb >> 8) & 0x00FF00FF)) & 0xFF00FF00;

   // Use same approximation as above, but G is shifted 8 bits left.
   // Alpha is left out and handled separately.
   uint32_t g = color & (0xFF00 << aSrcRGBShift);
   g = g * a + (0xFF00 << aSrcRGBShift);
   g = (g + (g >> 8)) & (0xFF0000 << aSrcRGBShift);

   // The above math leaves RGB shifted left by 8 bits.
   // Shift them right if required for the output format.
   // then combine them back together to produce output pixel.
   // Add the alpha back on if the output format is not opaque.
   *reinterpret_cast<uint32_t*>(aDst) =
       (rb >> (8 - aDstRGBShift)) | (g >> (8 + aSrcRGBShift - aDstRGBShift)) |
       (aOpaqueAlpha ? 0xFF << aDstAShift : a << aDstAShift);

   aSrc += 4;
   aDst += 4;
 } while (aSrc < end);
}
    300 
    301 template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
    302          uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
    303 static void PremultiplyRowFallback(const uint8_t* aSrc, uint8_t* aDst,
    304                                   int32_t aLength) {
    305  PremultiplyChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
    306                           aDstRGBShift, aDstAShift>(aSrc, aDst, aLength);
    307 }
    308 
    309 template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
    310          uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
    311 static void PremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
    312                                uint8_t* aDst, int32_t aDstGap, IntSize aSize) {
    313  for (int32_t height = aSize.height; height > 0; height--) {
    314    PremultiplyChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
    315                             aDstRGBShift, aDstAShift>(aSrc, aDst, aSize.width);
    316    aSrc += aSrcGap;
    317    aDst += aDstGap;
    318  }
    319 }
    320 
// Case dispatching to the fallback whole-surface premultiply, with swap,
// opacity, and bit shifts computed from the format pair at compile time.
#define PREMULTIPLY_FALLBACK_CASE(aSrcFormat, aDstFormat)                     \
 FORMAT_CASE(                                                                \
     aSrcFormat, aDstFormat,                                                 \
     PremultiplyFallback<ShouldSwapRB(aSrcFormat, aDstFormat),               \
                         ShouldForceOpaque(aSrcFormat, aDstFormat),          \
                         RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
                         RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)

// Expands to cases covering all supported destination formats for the
// given source format.
#define PREMULTIPLY_FALLBACK(aSrcFormat)                         \
 PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
 PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8X8) \
 PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
 PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8X8) \
 PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \
 PREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8)

// Row-function variant of PREMULTIPLY_FALLBACK_CASE, returning a pointer
// to the matching PremultiplyRowFallback instantiation.
#define PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, aDstFormat)             \
 FORMAT_CASE_ROW(aSrcFormat, aDstFormat,                                 \
                 PremultiplyRowFallback<                                 \
                     ShouldSwapRB(aSrcFormat, aDstFormat),               \
                     ShouldForceOpaque(aSrcFormat, aDstFormat),          \
                     RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
                     RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)

// Row-function cases for all supported destination formats.
#define PREMULTIPLY_ROW_FALLBACK(aSrcFormat)                         \
 PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
 PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8X8) \
 PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
 PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8X8) \
 PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8) \
 PREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::X8R8G8B8)
    352 
    353 // If rows are tightly packed, and the size of the total area will fit within
    354 // the precision range of a single row, then process all the data as if it was
    355 // a single row.
    356 static inline IntSize CollapseSize(const IntSize& aSize, int32_t aSrcStride,
    357                                   int32_t aDstStride) {
    358  if (aSrcStride == aDstStride && (aSrcStride & 3) == 0 &&
    359      aSrcStride / 4 == aSize.width) {
    360    CheckedInt32 area = CheckedInt32(aSize.width) * CheckedInt32(aSize.height);
    361    if (area.isValid()) {
    362      return IntSize(area.value(), 1);
    363    }
    364  }
    365  return aSize;
    366 }
    367 
    368 static inline int32_t GetStrideGap(int32_t aWidth, SurfaceFormat aFormat,
    369                                   int32_t aStride) {
    370  CheckedInt32 used = CheckedInt32(aWidth) * BytesPerPixel(aFormat);
    371  if (!used.isValid() || used.value() < 0) {
    372    return -1;
    373  }
    374  return aStride - used.value();
    375 }
    376 
// Premultiplies a surface of pixels, converting from aSrcFormat to
// aDstFormat, preferring a runtime-detected SIMD implementation and falling
// back to the portable one. Returns true on success, false when the format
// combination is unsupported or the strides are invalid.
bool PremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
                    SurfaceFormat aSrcFormat, uint8_t* aDst,
                    int32_t aDstStride, SurfaceFormat aDstFormat,
                    const IntSize& aSize) {
 if (aSize.IsEmpty()) {
   return true;
 }
 // Collapse tightly-packed rows into one long row when possible.
 IntSize size = CollapseSize(aSize, aSrcStride, aDstStride);
 // Find gap from end of row to the start of the next row.
 int32_t srcGap = GetStrideGap(aSize.width, aSrcFormat, aSrcStride);
 int32_t dstGap = GetStrideGap(aSize.width, aDstFormat, aDstStride);
 MOZ_ASSERT(srcGap >= 0 && dstGap >= 0);
 if (srcGap < 0 || dstGap < 0) {
   return false;
 }

// Supplies this function's arguments to whichever implementation a
// FORMAT_CASE below selects; each matched case returns true directly.
#define FORMAT_CASE_CALL(...) __VA_ARGS__(aSrc, srcGap, aDst, dstGap, size)

#ifdef USE_SSE2
 // Prefer SSE2 when the CPU supports it.
 if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
     PREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
     PREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
     PREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
     PREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
     PREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
     PREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
     PREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
     PREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
     default:
       break;
   }
#endif

#ifdef USE_NEON
 // Prefer NEON when the CPU supports it.
 if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
     PREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
     PREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
     PREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
     PREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
     PREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
     PREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
     PREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
     PREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
     default:
       break;
   }
#endif

 // Portable fallback covering all alpha-capable 32-bit source formats.
 switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
   PREMULTIPLY_FALLBACK(SurfaceFormat::B8G8R8A8)
   PREMULTIPLY_FALLBACK(SurfaceFormat::R8G8B8A8)
   PREMULTIPLY_FALLBACK(SurfaceFormat::A8R8G8B8)
   default:
     break;
 }

#undef FORMAT_CASE_CALL

 MOZ_ASSERT(false, "Unsupported premultiply formats");
 return false;
}
    438 
// Returns a function that premultiplies a single row of pixels, converting
// from aSrcFormat to aDstFormat, preferring a runtime-detected SIMD variant.
// Returns nullptr when the format combination is unsupported.
SwizzleRowFn PremultiplyRow(SurfaceFormat aSrcFormat,
                           SurfaceFormat aDstFormat) {
#ifdef USE_SSE2
 // Prefer SSE2 when the CPU supports it.
 if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
     PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
     PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
     PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
     PREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
     PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
     PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
     PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
     PREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
     default:
       break;
   }
#endif

#ifdef USE_NEON
 // Prefer NEON when the CPU supports it.
 if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
     PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
     PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
     PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
     PREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
     PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
     PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
     PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
     PREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
     default:
       break;
   }
#endif

 // Portable fallback covering all alpha-capable 32-bit source formats.
 switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
   PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::B8G8R8A8)
   PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::R8G8B8A8)
   PREMULTIPLY_ROW_FALLBACK(SurfaceFormat::A8R8G8B8)
   default:
     break;
 }

 MOZ_ASSERT_UNREACHABLE("Unsupported premultiply formats");
 return nullptr;
}
    482 
    483 /**
    484 * Unpremultiplying
    485 */
    486 
// Generate a table of 8.16 fixed-point reciprocals representing 1/alpha.
// 0xFF00FF == 255 * 65537, i.e. (255 << 16) + 255, so UNPREMULQ(a) is an
// 8.16 fixed-point approximation of 255/a biased such that a premultiplied
// component equal to its alpha maps back to exactly 255 after the >> 16 in
// the unpremultiply loops. Entry 0 is explicitly 0 (rather than a division
// by zero), so alpha == 0 zeroes the color components.
#define UNPREMULQ(x) (0xFF00FFU / (x))
#define UNPREMULQ_2(x) UNPREMULQ(x), UNPREMULQ((x) + 1)
#define UNPREMULQ_4(x) UNPREMULQ_2(x), UNPREMULQ_2((x) + 2)
#define UNPREMULQ_8(x) UNPREMULQ_4(x), UNPREMULQ_4((x) + 4)
#define UNPREMULQ_16(x) UNPREMULQ_8(x), UNPREMULQ_8((x) + 8)
#define UNPREMULQ_32(x) UNPREMULQ_16(x), UNPREMULQ_16((x) + 16)
static const uint32_t sUnpremultiplyTable[256] = {0,
                                                 UNPREMULQ(1),
                                                 UNPREMULQ_2(2),
                                                 UNPREMULQ_4(4),
                                                 UNPREMULQ_8(8),
                                                 UNPREMULQ_16(16),
                                                 UNPREMULQ_32(32),
                                                 UNPREMULQ_32(64),
                                                 UNPREMULQ_32(96),
                                                 UNPREMULQ_32(128),
                                                 UNPREMULQ_32(160),
                                                 UNPREMULQ_32(192),
                                                 UNPREMULQ_32(224)};
    507 
// Fallback unpremultiply implementation that uses 8.16 fixed-point reciprocal
// math to eliminate any division by the alpha component. This optimization is
// used for the SSE2 and NEON implementations, with some adaptations. This
// implementation also accesses color components using individual byte accesses
// as this profiles faster than accessing the pixel as a uint32_t and
// shifting/masking to access components.
//
// Note: aSrc and aDst are taken by reference and are advanced past the
// processed pixels. The do-while loop runs at least once, so aLength must
// be >= 1.
template <bool aSwapRB, uint32_t aSrcRGBIndex, uint32_t aSrcAIndex,
         uint32_t aDstRGBIndex, uint32_t aDstAIndex>
static void UnpremultiplyChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
                                      int32_t aLength) {
 const uint8_t* end = aSrc + 4 * aLength;
 do {
   // Read components in source order, swapping R and B if requested so the
   // stores below can write them in destination order.
   uint8_t r = aSrc[aSrcRGBIndex + (aSwapRB ? 2 : 0)];
   uint8_t g = aSrc[aSrcRGBIndex + 1];
   uint8_t b = aSrc[aSrcRGBIndex + (aSwapRB ? 0 : 2)];
   uint8_t a = aSrc[aSrcAIndex];

   // Access the 8.16 reciprocal from the table based on alpha. Multiply by
   // the reciprocal and shift off the fraction bits to approximate the
   // division by alpha. Table entry 0 is 0, so alpha == 0 yields 0.
   uint32_t q = sUnpremultiplyTable[a];
   aDst[aDstRGBIndex + 0] = (r * q) >> 16;
   aDst[aDstRGBIndex + 1] = (g * q) >> 16;
   aDst[aDstRGBIndex + 2] = (b * q) >> 16;
   aDst[aDstAIndex] = a;

   aSrc += 4;
   aDst += 4;
 } while (aSrc < end);
}
    538 
    539 template <bool aSwapRB, uint32_t aSrcRGBIndex, uint32_t aSrcAIndex,
    540          uint32_t aDstRGBIndex, uint32_t aDstAIndex>
    541 static void UnpremultiplyRowFallback(const uint8_t* aSrc, uint8_t* aDst,
    542                                     int32_t aLength) {
    543  UnpremultiplyChunkFallback<aSwapRB, aSrcRGBIndex, aSrcAIndex, aDstRGBIndex,
    544                             aDstAIndex>(aSrc, aDst, aLength);
    545 }
    546 
    547 template <bool aSwapRB, uint32_t aSrcRGBIndex, uint32_t aSrcAIndex,
    548          uint32_t aDstRGBIndex, uint32_t aDstAIndex>
    549 static void UnpremultiplyFallback(const uint8_t* aSrc, int32_t aSrcGap,
    550                                  uint8_t* aDst, int32_t aDstGap,
    551                                  IntSize aSize) {
    552  for (int32_t height = aSize.height; height > 0; height--) {
    553    UnpremultiplyChunkFallback<aSwapRB, aSrcRGBIndex, aSrcAIndex, aDstRGBIndex,
    554                               aDstAIndex>(aSrc, aDst, aSize.width);
    555    aSrc += aSrcGap;
    556    aDst += aDstGap;
    557  }
    558 }
    559 
// Case dispatching to the fallback whole-surface unpremultiply, with swap
// and byte indices computed from the format pair at compile time.
#define UNPREMULTIPLY_FALLBACK_CASE(aSrcFormat, aDstFormat)             \
 FORMAT_CASE(aSrcFormat, aDstFormat,                                   \
             UnpremultiplyFallback<                                    \
                 ShouldSwapRB(aSrcFormat, aDstFormat),                 \
                 RGBByteIndex(aSrcFormat), AlphaByteIndex(aSrcFormat), \
                 RGBByteIndex(aDstFormat), AlphaByteIndex(aDstFormat)>)

// Cases covering all supported destination formats for the given source.
#define UNPREMULTIPLY_FALLBACK(aSrcFormat)                         \
 UNPREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
 UNPREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
 UNPREMULTIPLY_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8)

// Row-function variant, returning a pointer to the matching
// UnpremultiplyRowFallback instantiation.
#define UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, aDstFormat)             \
 FORMAT_CASE_ROW(aSrcFormat, aDstFormat,                                   \
                 UnpremultiplyRowFallback<                                 \
                     ShouldSwapRB(aSrcFormat, aDstFormat),                 \
                     RGBByteIndex(aSrcFormat), AlphaByteIndex(aSrcFormat), \
                     RGBByteIndex(aDstFormat), AlphaByteIndex(aDstFormat)>)

// Row-function cases for all supported destination formats.
#define UNPREMULTIPLY_ROW_FALLBACK(aSrcFormat)                         \
 UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::B8G8R8A8) \
 UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::R8G8B8A8) \
 UNPREMULTIPLY_ROW_FALLBACK_CASE(aSrcFormat, SurfaceFormat::A8R8G8B8)
    583 
// Unpremultiplies a surface of pixels, converting from aSrcFormat to
// aDstFormat, preferring a runtime-detected SIMD implementation and falling
// back to the portable one. Returns true on success, false when the format
// combination is unsupported or the strides are invalid.
bool UnpremultiplyData(const uint8_t* aSrc, int32_t aSrcStride,
                      SurfaceFormat aSrcFormat, uint8_t* aDst,
                      int32_t aDstStride, SurfaceFormat aDstFormat,
                      const IntSize& aSize) {
 if (aSize.IsEmpty()) {
   return true;
 }
 // Collapse tightly-packed rows into one long row when possible.
 IntSize size = CollapseSize(aSize, aSrcStride, aDstStride);
 // Find gap from end of row to the start of the next row.
 int32_t srcGap = GetStrideGap(aSize.width, aSrcFormat, aSrcStride);
 int32_t dstGap = GetStrideGap(aSize.width, aDstFormat, aDstStride);
 MOZ_ASSERT(srcGap >= 0 && dstGap >= 0);
 if (srcGap < 0 || dstGap < 0) {
   return false;
 }

// Supplies this function's arguments to whichever implementation a
// FORMAT_CASE below selects; each matched case returns true directly.
#define FORMAT_CASE_CALL(...) __VA_ARGS__(aSrc, srcGap, aDst, dstGap, size)

#ifdef USE_SSE2
 // Prefer SSE2 when the CPU supports it.
 if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
     UNPREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
     UNPREMULTIPLY_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
     UNPREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
     UNPREMULTIPLY_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
     default:
       break;
   }
#endif

#ifdef USE_NEON
 // Prefer NEON when the CPU supports it.
 if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
     UNPREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
     UNPREMULTIPLY_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
     UNPREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
     UNPREMULTIPLY_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
     default:
       break;
   }
#endif

 // Portable fallback covering all alpha-capable 32-bit source formats.
 switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
   UNPREMULTIPLY_FALLBACK(SurfaceFormat::B8G8R8A8)
   UNPREMULTIPLY_FALLBACK(SurfaceFormat::R8G8B8A8)
   UNPREMULTIPLY_FALLBACK(SurfaceFormat::A8R8G8B8)
   default:
     break;
 }

#undef FORMAT_CASE_CALL

 MOZ_ASSERT(false, "Unsupported unpremultiply formats");
 return false;
}
    637 
    638 SwizzleRowFn UnpremultiplyRow(SurfaceFormat aSrcFormat,
    639                              SurfaceFormat aDstFormat) {
    640 #ifdef USE_SSE2
    641  if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
    642      UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
    643      UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
    644      UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
    645      UNPREMULTIPLY_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
    646      default:
    647        break;
    648    }
    649 #endif
    650 
    651 #ifdef USE_NEON
    652  if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
    653      UNPREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8A8)
    654      UNPREMULTIPLY_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
    655      UNPREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8A8)
    656      UNPREMULTIPLY_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
    657      default:
    658        break;
    659    }
    660 #endif
    661 
    662  switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
    663    UNPREMULTIPLY_ROW_FALLBACK(SurfaceFormat::B8G8R8A8)
    664    UNPREMULTIPLY_ROW_FALLBACK(SurfaceFormat::R8G8B8A8)
    665    UNPREMULTIPLY_ROW_FALLBACK(SurfaceFormat::A8R8G8B8)
    666    default:
    667      break;
    668  }
    669 
    670  MOZ_ASSERT_UNREACHABLE("Unsupported premultiply formats");
    671  return nullptr;
    672 }
    673 
    674 /**
    675 * Swizzling
    676 */
    677 
// Fallback swizzle implementation that uses shifting and masking to reorder
// pixels. Converts aLength 32-bit pixels, advancing aSrc and aDst past the
// processed chunk (both are passed by reference). Note the do-while: at
// least one pixel is always processed, so callers must pass aLength > 0.
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
          uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void SwizzleChunkFallback(const uint8_t*& aSrc, uint8_t*& aDst,
                                 int32_t aLength) {
  const uint8_t* end = aSrc + 4 * aLength;
  do {
    uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);

    if (aSwapRB) {
      // Handle R and B swaps by exchanging words and masking.
      uint32_t rb =
          ((rgba << 16) | (rgba >> 16)) & (0x00FF00FF << aSrcRGBShift);
      uint32_t ga = rgba & ((0xFF << aSrcAShift) | (0xFF00 << aSrcRGBShift));
      rgba = rb | ga;
    }

    // If src and dst shifts differ, rotate left or right to move RGB into
    // place, i.e. ARGB -> RGBA or RGBA -> ARGB.
    if (aDstRGBShift > aSrcRGBShift) {
      rgba = (rgba << 8) | (aOpaqueAlpha ? 0x000000FF : rgba >> 24);
    } else if (aSrcRGBShift > aDstRGBShift) {
      rgba = (rgba >> 8) | (aOpaqueAlpha ? 0xFF000000 : rgba << 24);
    } else if (aOpaqueAlpha) {
      // Same layout on both sides; only force the alpha byte to 0xFF.
      rgba |= 0xFF << aDstAShift;
    }

    *reinterpret_cast<uint32_t*>(aDst) = rgba;

    aSrc += 4;
    aDst += 4;
  } while (aSrc < end);
}
    712 
// Row entry point for the fallback swizzler; converts a single row of
// aLength 32-bit pixels. The chunk helper advances local pointer copies
// only, so the caller's pointers are untouched.
template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
          uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
static void SwizzleRowFallback(const uint8_t* aSrc, uint8_t* aDst,
                               int32_t aLength) {
  SwizzleChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
                       aDstRGBShift, aDstAShift>(aSrc, aDst, aLength);
}
    720 
    721 template <bool aSwapRB, bool aOpaqueAlpha, uint32_t aSrcRGBShift,
    722          uint32_t aSrcAShift, uint32_t aDstRGBShift, uint32_t aDstAShift>
    723 static void SwizzleFallback(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
    724                            int32_t aDstGap, IntSize aSize) {
    725  for (int32_t height = aSize.height; height > 0; height--) {
    726    SwizzleChunkFallback<aSwapRB, aOpaqueAlpha, aSrcRGBShift, aSrcAShift,
    727                         aDstRGBShift, aDstAShift>(aSrc, aDst, aSize.width);
    728    aSrc += aSrcGap;
    729    aDst += aDstGap;
    730  }
    731 }
    732 
// Dispatch case mapping a (src, dst) format pair to the fallback surface
// swizzler, instantiated with the swap/opaque/shift traits of both formats.
#define SWIZZLE_FALLBACK(aSrcFormat, aDstFormat)                          \
  FORMAT_CASE(                                                            \
      aSrcFormat, aDstFormat,                                             \
      SwizzleFallback<ShouldSwapRB(aSrcFormat, aDstFormat),               \
                      ShouldForceOpaque(aSrcFormat, aDstFormat),          \
                      RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
                      RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)

// Row variant of SWIZZLE_FALLBACK used by the SwizzleRow dispatcher.
#define SWIZZLE_ROW_FALLBACK(aSrcFormat, aDstFormat)                         \
  FORMAT_CASE_ROW(                                                           \
      aSrcFormat, aDstFormat,                                                \
      SwizzleRowFallback<ShouldSwapRB(aSrcFormat, aDstFormat),               \
                         ShouldForceOpaque(aSrcFormat, aDstFormat),          \
                         RGBBitShift(aSrcFormat), AlphaBitShift(aSrcFormat), \
                         RGBBitShift(aDstFormat), AlphaBitShift(aDstFormat)>)
    748 
    749 // Fast-path for matching formats.
    750 template <int32_t aBytesPerPixel>
    751 static void SwizzleRowCopy(const uint8_t* aSrc, uint8_t* aDst,
    752                           int32_t aLength) {
    753  if (aSrc != aDst) {
    754    memcpy(aDst, aSrc, aLength * aBytesPerPixel);
    755  }
    756 }
    757 
    758 // Fast-path for matching formats.
    759 static void SwizzleCopy(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
    760                        int32_t aDstGap, IntSize aSize, int32_t aBPP) {
    761  if (aSrc != aDst) {
    762    int32_t rowLength = aBPP * aSize.width;
    763    for (int32_t height = aSize.height; height > 0; height--) {
    764      memcpy(aDst, aSrc, rowLength);
    765      aSrc += rowLength + aSrcGap;
    766      aDst += rowLength + aDstGap;
    767    }
    768  }
    769 }
    770 
// Fast-path for conversions that swap all bytes.
// Reverses the byte order of each 32-bit pixel (both #if branches perform a
// full byte reversal, i.e. BGRA -> ARGB) and can additionally force the
// destination alpha byte to 0xFF. aSrc and aDst are passed by reference and
// advanced past the processed chunk; requires aLength > 0 (do-while).
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
static void SwizzleChunkSwap(const uint8_t*& aSrc, uint8_t*& aDst,
                             int32_t aLength) {
  const uint8_t* end = aSrc + 4 * aLength;
  do {
    // Use an endian swap to move the bytes, i.e. BGRA -> ARGB.
    uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
#if MOZ_LITTLE_ENDIAN()
    rgba = NativeEndian::swapToBigEndian(rgba);
#else
    rgba = NativeEndian::swapToLittleEndian(rgba);
#endif
    if (aOpaqueAlpha) {
      // Force the destination alpha channel to fully opaque.
      rgba |= 0xFF << aDstAShift;
    }
    *reinterpret_cast<uint32_t*>(aDst) = rgba;
    aSrc += 4;
    aDst += 4;
  } while (aSrc < end);
}
    792 
// Row entry point for the byte-swap fast path; processes a single row
// without disturbing the caller's pointers.
template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
static void SwizzleRowSwap(const uint8_t* aSrc, uint8_t* aDst,
                           int32_t aLength) {
  SwizzleChunkSwap<aOpaqueAlpha, aSrcAShift, aDstAShift>(aSrc, aDst, aLength);
}
    798 
    799 template <bool aOpaqueAlpha, uint32_t aSrcAShift, uint32_t aDstAShift>
    800 static void SwizzleSwap(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
    801                        int32_t aDstGap, IntSize aSize) {
    802  for (int32_t height = aSize.height; height > 0; height--) {
    803    SwizzleChunkSwap<aOpaqueAlpha, aSrcAShift, aDstAShift>(aSrc, aDst,
    804                                                           aSize.width);
    805    aSrc += aSrcGap;
    806    aDst += aDstGap;
    807  }
    808 }
    809 
// Dispatch case for full byte-reversal swizzles (e.g. BGRA <-> ARGB),
// optionally forcing the destination alpha channel to opaque.
#define SWIZZLE_SWAP(aSrcFormat, aDstFormat)                 \
  FORMAT_CASE(                                               \
      aSrcFormat, aDstFormat,                                \
      SwizzleSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
                  AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)

// Row variant of SWIZZLE_SWAP used by the SwizzleRow dispatcher.
#define SWIZZLE_ROW_SWAP(aSrcFormat, aDstFormat)                \
  FORMAT_CASE_ROW(                                              \
      aSrcFormat, aDstFormat,                                   \
      SwizzleRowSwap<ShouldForceOpaque(aSrcFormat, aDstFormat), \
                     AlphaBitShift(aSrcFormat), AlphaBitShift(aDstFormat)>)
    821 
    822 static void SwizzleChunkSwapRGB24(const uint8_t*& aSrc, uint8_t*& aDst,
    823                                  int32_t aLength) {
    824  const uint8_t* end = aSrc + 3 * aLength;
    825  do {
    826    uint8_t r = aSrc[0];
    827    uint8_t g = aSrc[1];
    828    uint8_t b = aSrc[2];
    829    aDst[0] = b;
    830    aDst[1] = g;
    831    aDst[2] = r;
    832    aSrc += 3;
    833    aDst += 3;
    834  } while (aSrc < end);
    835 }
    836 
// Row entry point for the 24-bit R/B swap; processes a single row without
// disturbing the caller's pointers.
static void SwizzleRowSwapRGB24(const uint8_t* aSrc, uint8_t* aDst,
                                int32_t aLength) {
  SwizzleChunkSwapRGB24(aSrc, aDst, aLength);
}
    841 
    842 static void SwizzleSwapRGB24(const uint8_t* aSrc, int32_t aSrcGap,
    843                             uint8_t* aDst, int32_t aDstGap, IntSize aSize) {
    844  for (int32_t height = aSize.height; height > 0; height--) {
    845    SwizzleChunkSwapRGB24(aSrc, aDst, aSize.width);
    846    aSrc += aSrcGap;
    847    aDst += aDstGap;
    848  }
    849 }
    850 
// Dispatch cases for 24-bit R <-> B channel swaps (no template parameters
// needed since both directions use the same byte reversal).
#define SWIZZLE_SWAP_RGB24(aSrcFormat, aDstFormat) \
  FORMAT_CASE(aSrcFormat, aDstFormat, SwizzleSwapRGB24)

#define SWIZZLE_ROW_SWAP_RGB24(aSrcFormat, aDstFormat) \
  FORMAT_CASE_ROW(aSrcFormat, aDstFormat, SwizzleRowSwapRGB24)
    856 
    857 // Fast-path for conversions that force alpha to opaque.
    858 template <uint32_t aDstAShift>
    859 static void SwizzleChunkOpaqueUpdate(uint8_t*& aBuffer, int32_t aLength) {
    860  const uint8_t* end = aBuffer + 4 * aLength;
    861  do {
    862    uint32_t rgba = *reinterpret_cast<const uint32_t*>(aBuffer);
    863    // Just add on the alpha bits to the source.
    864    rgba |= 0xFF << aDstAShift;
    865    *reinterpret_cast<uint32_t*>(aBuffer) = rgba;
    866    aBuffer += 4;
    867  } while (aBuffer < end);
    868 }
    869 
    870 template <uint32_t aDstAShift>
    871 static void SwizzleChunkOpaqueCopy(const uint8_t*& aSrc, uint8_t* aDst,
    872                                   int32_t aLength) {
    873  const uint8_t* end = aSrc + 4 * aLength;
    874  do {
    875    uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);
    876    // Just add on the alpha bits to the source.
    877    rgba |= 0xFF << aDstAShift;
    878    *reinterpret_cast<uint32_t*>(aDst) = rgba;
    879    aSrc += 4;
    880    aDst += 4;
    881  } while (aSrc < end);
    882 }
    883 
    884 template <uint32_t aDstAShift>
    885 static void SwizzleRowOpaque(const uint8_t* aSrc, uint8_t* aDst,
    886                             int32_t aLength) {
    887  if (aSrc == aDst) {
    888    SwizzleChunkOpaqueUpdate<aDstAShift>(aDst, aLength);
    889  } else {
    890    SwizzleChunkOpaqueCopy<aDstAShift>(aSrc, aDst, aLength);
    891  }
    892 }
    893 
    894 template <uint32_t aDstAShift>
    895 static void SwizzleOpaque(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
    896                          int32_t aDstGap, IntSize aSize) {
    897  if (aSrc == aDst) {
    898    // Modifying in-place, so just write out the alpha.
    899    for (int32_t height = aSize.height; height > 0; height--) {
    900      SwizzleChunkOpaqueUpdate<aDstAShift>(aDst, aSize.width);
    901      aDst += aDstGap;
    902    }
    903  } else {
    904    for (int32_t height = aSize.height; height > 0; height--) {
    905      SwizzleChunkOpaqueCopy<aDstAShift>(aSrc, aDst, aSize.width);
    906      aSrc += aSrcGap;
    907      aDst += aDstGap;
    908    }
    909  }
    910 }
    911 
// Dispatch cases for conversions that only force the alpha channel opaque
// (e.g. B8G8R8A8 -> B8G8R8X8); parameterized on the destination alpha shift.
#define SWIZZLE_OPAQUE(aSrcFormat, aDstFormat) \
  FORMAT_CASE(aSrcFormat, aDstFormat, SwizzleOpaque<AlphaBitShift(aDstFormat)>)

#define SWIZZLE_ROW_OPAQUE(aSrcFormat, aDstFormat) \
  FORMAT_CASE_ROW(aSrcFormat, aDstFormat,          \
                  SwizzleRowOpaque<AlphaBitShift(aDstFormat)>)
    918 
// Packing of 32-bit formats to RGB565.
// Each 32-bit pixel is reduced to 16 bits by keeping the top 5 bits of the
// low color byte, the top 6 bits of the middle byte, and the top 5 bits of
// the high byte. aSrcRGBShift positions the masks over the RGB bytes of the
// source word; aSwapRB selects which outer channel lands in the high bits.
// aSrcRGBIndex is unused here but kept so PACK_RGB_CASE can instantiate
// either pack function with the same template arguments.
template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
static void PackToRGB565(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                         int32_t aDstGap, IntSize aSize) {
  for (int32_t height = aSize.height; height > 0; height--) {
    const uint8_t* end = aSrc + 4 * aSize.width;
    do {
      uint32_t rgba = *reinterpret_cast<const uint32_t*>(aSrc);

      // Isolate the R, G, and B components and shift to final endian-dependent
      // locations.
      uint16_t rgb565;
      if (aSwapRB) {
        rgb565 = ((rgba & (0xF8 << aSrcRGBShift)) << (8 - aSrcRGBShift)) |
                 ((rgba & (0xFC00 << aSrcRGBShift)) >> (5 + aSrcRGBShift)) |
                 ((rgba & (0xF80000 << aSrcRGBShift)) >> (19 + aSrcRGBShift));
      } else {
        rgb565 = ((rgba & (0xF8 << aSrcRGBShift)) >> (3 + aSrcRGBShift)) |
                 ((rgba & (0xFC00 << aSrcRGBShift)) >> (5 + aSrcRGBShift)) |
                 ((rgba & (0xF80000 << aSrcRGBShift)) >> (8 + aSrcRGBShift));
      }

      *reinterpret_cast<uint16_t*>(aDst) = rgb565;

      // 4 bytes in, 2 bytes out per pixel.
      aSrc += 4;
      aDst += 2;
    } while (aSrc < end);

    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}
    951 
    952 // Packing of 32-bit formats to 24-bit formats.
    953 template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
    954 static void PackChunkToRGB24(const uint8_t*& aSrc, uint8_t*& aDst,
    955                             int32_t aLength) {
    956  const uint8_t* end = aSrc + 4 * aLength;
    957  do {
    958    uint8_t r = aSrc[aSrcRGBIndex + (aSwapRB ? 2 : 0)];
    959    uint8_t g = aSrc[aSrcRGBIndex + 1];
    960    uint8_t b = aSrc[aSrcRGBIndex + (aSwapRB ? 0 : 2)];
    961 
    962    aDst[0] = r;
    963    aDst[1] = g;
    964    aDst[2] = b;
    965 
    966    aSrc += 4;
    967    aDst += 3;
    968  } while (aSrc < end);
    969 }
    970 
// Row entry point for 32-bit -> 24-bit packing; processes a single row
// without disturbing the caller's pointers.
template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
static void PackRowToRGB24(const uint8_t* aSrc, uint8_t* aDst,
                           int32_t aLength) {
  PackChunkToRGB24<aSwapRB, aSrcRGBShift, aSrcRGBIndex>(aSrc, aDst, aLength);
}
    976 
    977 template <bool aSwapRB, uint32_t aSrcRGBShift, uint32_t aSrcRGBIndex>
    978 static void PackToRGB24(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
    979                        int32_t aDstGap, IntSize aSize) {
    980  for (int32_t height = aSize.height; height > 0; height--) {
    981    PackChunkToRGB24<aSwapRB, aSrcRGBShift, aSrcRGBIndex>(aSrc, aDst,
    982                                                          aSize.width);
    983    aSrc += aSrcGap;
    984    aDst += aDstGap;
    985  }
    986 }
    987 
// Dispatch case for packing one 32-bit source format down to a smaller
// destination, parameterized by the source's swap/shift/byte-index traits.
#define PACK_RGB_CASE(aSrcFormat, aDstFormat, aPackFunc)      \
  FORMAT_CASE(aSrcFormat, aDstFormat,                         \
              aPackFunc<ShouldSwapRB(aSrcFormat, aDstFormat), \
                        RGBBitShift(aSrcFormat), RGBByteIndex(aSrcFormat)>)

// Expand PACK_RGB_CASE for every supported 32-bit source format.
#define PACK_RGB(aDstFormat, aPackFunc)                         \
  PACK_RGB_CASE(SurfaceFormat::B8G8R8A8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::B8G8R8X8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::R8G8B8A8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::R8G8B8X8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::A8R8G8B8, aDstFormat, aPackFunc) \
  PACK_RGB_CASE(SurfaceFormat::X8R8G8B8, aDstFormat, aPackFunc)

// Row variant of PACK_RGB_CASE used by the SwizzleRow dispatcher.
#define PACK_ROW_RGB_CASE(aSrcFormat, aDstFormat, aPackFunc)                   \
  FORMAT_CASE_ROW(                                                             \
      aSrcFormat, aDstFormat,                                                  \
      aPackFunc<ShouldSwapRB(aSrcFormat, aDstFormat), RGBBitShift(aSrcFormat), \
                RGBByteIndex(aSrcFormat)>)

// Expand PACK_ROW_RGB_CASE for every supported 32-bit source format.
#define PACK_ROW_RGB(aDstFormat, aPackFunc)                         \
  PACK_ROW_RGB_CASE(SurfaceFormat::B8G8R8A8, aDstFormat, aPackFunc) \
  PACK_ROW_RGB_CASE(SurfaceFormat::B8G8R8X8, aDstFormat, aPackFunc) \
  PACK_ROW_RGB_CASE(SurfaceFormat::R8G8B8A8, aDstFormat, aPackFunc) \
  PACK_ROW_RGB_CASE(SurfaceFormat::R8G8B8X8, aDstFormat, aPackFunc) \
  PACK_ROW_RGB_CASE(SurfaceFormat::A8R8G8B8, aDstFormat, aPackFunc) \
  PACK_ROW_RGB_CASE(SurfaceFormat::X8R8G8B8, aDstFormat, aPackFunc)
   1014 
   1015 // Packing of 32-bit formats to A8.
   1016 template <uint32_t aSrcAIndex>
   1017 static void PackToA8(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
   1018                     int32_t aDstGap, IntSize aSize) {
   1019  for (int32_t height = aSize.height; height > 0; height--) {
   1020    const uint8_t* end = aSrc + 4 * aSize.width;
   1021    do {
   1022      *aDst++ = aSrc[aSrcAIndex];
   1023      aSrc += 4;
   1024    } while (aSrc < end);
   1025    aSrc += aSrcGap;
   1026    aDst += aDstGap;
   1027  }
   1028 }
   1029 
// Dispatch case for extracting the alpha channel of a 32-bit format into A8.
#define PACK_ALPHA_CASE(aSrcFormat, aDstFormat, aPackFunc) \
  FORMAT_CASE(aSrcFormat, aDstFormat, aPackFunc<AlphaByteIndex(aSrcFormat)>)

// Expand PACK_ALPHA_CASE for every 32-bit format that carries alpha.
#define PACK_ALPHA(aDstFormat, aPackFunc)                         \
  PACK_ALPHA_CASE(SurfaceFormat::B8G8R8A8, aDstFormat, aPackFunc) \
  PACK_ALPHA_CASE(SurfaceFormat::R8G8B8A8, aDstFormat, aPackFunc) \
  PACK_ALPHA_CASE(SurfaceFormat::A8R8G8B8, aDstFormat, aPackFunc)
   1037 
// Expands a row of 24-bit RGB pixels to 32 bits with opaque alpha. Both
// #if branches produce the memory byte order r,g,b,0xFF (with r and b
// exchanged when aSwapRB is set).
template <bool aSwapRB>
void UnpackRowRGB24(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  // Because we are expanding, we can only process the data back to front in
  // case we are performing this in place.
  const uint8_t* src = aSrc + 3 * (aLength - 1);
  uint32_t* dst = reinterpret_cast<uint32_t*>(aDst + 4 * aLength);
  while (src >= aSrc) {
    uint8_t r = src[aSwapRB ? 2 : 0];
    uint8_t g = src[1];
    uint8_t b = src[aSwapRB ? 0 : 2];
#if MOZ_LITTLE_ENDIAN()
    *--dst = 0xFF000000 | (b << 16) | (g << 8) | r;
#else
    *--dst = 0x000000FF | (b << 8) | (g << 16) | (r << 24);
#endif
    src -= 3;
  }
}
   1056 
// Force instantiation of swizzle variants here so other translation units
// can reference them.
template void UnpackRowRGB24<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpackRowRGB24<true>(const uint8_t*, uint8_t*, int32_t);

// Dispatch case that expands 24-bit RGB rows into the given 32-bit format,
// swapping R and B when the destination requires it.
#define UNPACK_ROW_RGB(aDstFormat)       \
  FORMAT_CASE_ROW(                       \
      SurfaceFormat::R8G8B8, aDstFormat, \
      UnpackRowRGB24<ShouldSwapRB(SurfaceFormat::R8G8B8, aDstFormat)>)
   1065 
   1066 static void UnpackRowRGB24_To_ARGB(const uint8_t* aSrc, uint8_t* aDst,
   1067                                   int32_t aLength) {
   1068  // Because we are expanding, we can only process the data back to front in
   1069  // case we are performing this in place.
   1070  const uint8_t* src = aSrc + 3 * (aLength - 1);
   1071  uint32_t* dst = reinterpret_cast<uint32_t*>(aDst + 4 * aLength);
   1072  while (src >= aSrc) {
   1073    uint8_t r = src[0];
   1074    uint8_t g = src[1];
   1075    uint8_t b = src[2];
   1076 #if MOZ_LITTLE_ENDIAN()
   1077    *--dst = 0x000000FF | (r << 8) | (g << 16) | (b << 24);
   1078 #else
   1079    *--dst = 0xFF000000 | (r << 24) | (g << 16) | b;
   1080 #endif
   1081    src -= 3;
   1082  }
   1083 }
   1084 
// Dispatch case that expands 24-bit RGB rows into the A8R8G8B8 layout.
#define UNPACK_ROW_RGB_TO_ARGB(aDstFormat) \
  FORMAT_CASE_ROW(SurfaceFormat::R8G8B8, aDstFormat, UnpackRowRGB24_To_ARGB)
   1087 
// Converts an entire surface from aSrcFormat to aDstFormat, dispatching on
// the format pair to SIMD, fallback, swap, opaque, or packing variants.
// Returns false (and asserts in debug builds) for unsupported pairs or
// invalid strides.
bool SwizzleData(const uint8_t* aSrc, int32_t aSrcStride,
                 SurfaceFormat aSrcFormat, uint8_t* aDst, int32_t aDstStride,
                 SurfaceFormat aDstFormat, const IntSize& aSize) {
  // Nothing to convert for an empty surface.
  if (aSize.IsEmpty()) {
    return true;
  }
  IntSize size = CollapseSize(aSize, aSrcStride, aDstStride);
  // Find gap from end of row to the start of the next row.
  int32_t srcGap = GetStrideGap(aSize.width, aSrcFormat, aSrcStride);
  int32_t dstGap = GetStrideGap(aSize.width, aDstFormat, aDstStride);
  MOZ_ASSERT(srcGap >= 0 && dstGap >= 0);
  if (srcGap < 0 || dstGap < 0) {
    return false;
  }

// Each matched FORMAT_CASE below invokes its converter over the whole
// (collapsed) surface and returns true.
#define FORMAT_CASE_CALL(...) __VA_ARGS__(aSrc, srcGap, aDst, dstGap, size)

#ifdef USE_SSE2
  if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      SWIZZLE_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
      SWIZZLE_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
      default:
        break;
    }
#endif

#ifdef USE_NEON
  if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
      SWIZZLE_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
      SWIZZLE_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
      SWIZZLE_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
      SWIZZLE_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
      SWIZZLE_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
      default:
        break;
    }
#endif

  // Scalar fallbacks for pairs not handled (or not supported) above.
  switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
    SWIZZLE_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
    SWIZZLE_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)

    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::A8R8G8B8)
    SWIZZLE_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::X8R8G8B8)

    SWIZZLE_FALLBACK(SurfaceFormat::A8R8G8B8, SurfaceFormat::R8G8B8A8)
    SWIZZLE_FALLBACK(SurfaceFormat::X8R8G8B8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::A8R8G8B8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_FALLBACK(SurfaceFormat::X8R8G8B8, SurfaceFormat::R8G8B8A8)

    SWIZZLE_SWAP(SurfaceFormat::B8G8R8A8, SurfaceFormat::A8R8G8B8)
    SWIZZLE_SWAP(SurfaceFormat::B8G8R8A8, SurfaceFormat::X8R8G8B8)
    SWIZZLE_SWAP(SurfaceFormat::B8G8R8X8, SurfaceFormat::X8R8G8B8)
    SWIZZLE_SWAP(SurfaceFormat::B8G8R8X8, SurfaceFormat::A8R8G8B8)
    SWIZZLE_SWAP(SurfaceFormat::A8R8G8B8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_SWAP(SurfaceFormat::A8R8G8B8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8A8)

    SWIZZLE_SWAP_RGB24(SurfaceFormat::R8G8B8, SurfaceFormat::B8G8R8)
    SWIZZLE_SWAP_RGB24(SurfaceFormat::B8G8R8, SurfaceFormat::R8G8B8)

    SWIZZLE_OPAQUE(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
    SWIZZLE_OPAQUE(SurfaceFormat::B8G8R8X8, SurfaceFormat::B8G8R8A8)
    SWIZZLE_OPAQUE(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
    SWIZZLE_OPAQUE(SurfaceFormat::R8G8B8X8, SurfaceFormat::R8G8B8A8)
    SWIZZLE_OPAQUE(SurfaceFormat::A8R8G8B8, SurfaceFormat::X8R8G8B8)
    SWIZZLE_OPAQUE(SurfaceFormat::X8R8G8B8, SurfaceFormat::A8R8G8B8)

    PACK_RGB(SurfaceFormat::R5G6B5_UINT16, PackToRGB565)
    PACK_RGB(SurfaceFormat::B8G8R8, PackToRGB24)
    PACK_RGB(SurfaceFormat::R8G8B8, PackToRGB24)
    PACK_ALPHA(SurfaceFormat::A8, PackToA8)

    default:
      break;
  }

  if (aSrcFormat == aDstFormat) {
    // If the formats match, just do a generic copy.
    SwizzleCopy(aSrc, srcGap, aDst, dstGap, size, BytesPerPixel(aSrcFormat));
    return true;
  }

#undef FORMAT_CASE_CALL

  MOZ_ASSERT(false, "Unsupported swizzle formats");
  return false;
}
   1192 
// Shared implementation for SwizzleYFlipData/PremultiplyYFlipData: applies
// aSwizzleFn to every row while mirroring the image vertically. Supports
// converting in place when aSrc == aDst and the strides match. Returns
// false if aSwizzleFn is null, a stride is invalid, or an in-place flip
// cannot be performed.
static bool SwizzleYFlipDataInternal(const uint8_t* aSrc, int32_t aSrcStride,
                                     SurfaceFormat aSrcFormat, uint8_t* aDst,
                                     int32_t aDstStride,
                                     SurfaceFormat aDstFormat,
                                     const IntSize& aSize,
                                     SwizzleRowFn aSwizzleFn) {
  // No row function means the format pair is unsupported.
  if (!aSwizzleFn) {
    return false;
  }

  // Guarantee our width and height are both greater than zero.
  if (aSize.IsEmpty()) {
    return true;
  }

  // Unlike SwizzleData/PremultiplyData, we don't use the stride gaps directly,
  // but we can use it to verify that the stride is valid for our width and
  // format.
  int32_t srcGap = GetStrideGap(aSize.width, aSrcFormat, aSrcStride);
  int32_t dstGap = GetStrideGap(aSize.width, aDstFormat, aDstStride);
  MOZ_ASSERT(srcGap >= 0 && dstGap >= 0);
  if (srcGap < 0 || dstGap < 0) {
    return false;
  }

  // Swapping/swizzling to a new buffer is trivial.
  if (aSrc != aDst) {
    // Walk the source top-down while writing the destination bottom-up.
    const uint8_t* src = aSrc;
    const uint8_t* srcEnd = aSrc + aSize.height * aSrcStride;
    uint8_t* dst = aDst + (aSize.height - 1) * aDstStride;
    while (src < srcEnd) {
      aSwizzleFn(src, dst, aSize.width);
      src += aSrcStride;
      dst -= aDstStride;
    }
    return true;
  }

  // In-place flipping requires matching strides so rows pair up exactly.
  if (aSrcStride != aDstStride) {
    return false;
  }

  // If we are swizzling in place, then we need a temporary row buffer.
  UniquePtr<uint8_t[]> rowBuffer(new (std::nothrow) uint8_t[aDstStride]);
  if (!rowBuffer) {
    return false;
  }

  // Swizzle and swap the top and bottom rows until we meet in the middle.
  int32_t middleRow = aSize.height / 2;
  uint8_t* top = aDst;
  uint8_t* bottom = aDst + (aSize.height - 1) * aDstStride;
  for (int32_t row = 0; row < middleRow; ++row) {
    // Save the bottom row before it is overwritten by the swizzled top row.
    memcpy(rowBuffer.get(), bottom, aDstStride);
    aSwizzleFn(top, bottom, aSize.width);
    aSwizzleFn(rowBuffer.get(), top, aSize.width);
    top += aDstStride;
    bottom -= aDstStride;
  }

  // If there is an odd numbered row, we haven't swizzled it yet.
  if (aSize.height % 2 == 1) {
    top = aDst + middleRow * aDstStride;
    aSwizzleFn(top, top, aSize.width);
  }
  return true;
}
   1260 
   1261 bool SwizzleYFlipData(const uint8_t* aSrc, int32_t aSrcStride,
   1262                      SurfaceFormat aSrcFormat, uint8_t* aDst,
   1263                      int32_t aDstStride, SurfaceFormat aDstFormat,
   1264                      const IntSize& aSize) {
   1265  return SwizzleYFlipDataInternal(aSrc, aSrcStride, aSrcFormat, aDst,
   1266                                  aDstStride, aDstFormat, aSize,
   1267                                  SwizzleRow(aSrcFormat, aDstFormat));
   1268 }
   1269 
   1270 bool PremultiplyYFlipData(const uint8_t* aSrc, int32_t aSrcStride,
   1271                          SurfaceFormat aSrcFormat, uint8_t* aDst,
   1272                          int32_t aDstStride, SurfaceFormat aDstFormat,
   1273                          const IntSize& aSize) {
   1274  return SwizzleYFlipDataInternal(aSrc, aSrcStride, aSrcFormat, aDst,
   1275                                  aDstStride, aDstFormat, aSize,
   1276                                  PremultiplyRow(aSrcFormat, aDstFormat));
   1277 }
   1278 
   1279 SwizzleRowFn SwizzleRow(SurfaceFormat aSrcFormat, SurfaceFormat aDstFormat) {
   1280 #ifdef USE_SSE2
   1281  if (mozilla::supports_avx2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
   1282      UNPACK_ROW_RGB_AVX2(SurfaceFormat::R8G8B8X8)
   1283      UNPACK_ROW_RGB_AVX2(SurfaceFormat::R8G8B8A8)
   1284      UNPACK_ROW_RGB_AVX2(SurfaceFormat::B8G8R8X8)
   1285      UNPACK_ROW_RGB_AVX2(SurfaceFormat::B8G8R8A8)
   1286      default:
   1287        break;
   1288    }
   1289 
   1290  if (mozilla::supports_ssse3()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
   1291      UNPACK_ROW_RGB_SSSE3(SurfaceFormat::R8G8B8X8)
   1292      UNPACK_ROW_RGB_SSSE3(SurfaceFormat::R8G8B8A8)
   1293      UNPACK_ROW_RGB_SSSE3(SurfaceFormat::B8G8R8X8)
   1294      UNPACK_ROW_RGB_SSSE3(SurfaceFormat::B8G8R8A8)
   1295      default:
   1296        break;
   1297    }
   1298 
   1299  if (mozilla::supports_sse2()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
   1300      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
   1301      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
   1302      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
   1303      SWIZZLE_ROW_SSE2(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
   1304      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
   1305      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
   1306      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
   1307      SWIZZLE_ROW_SSE2(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
   1308      default:
   1309        break;
   1310    }
   1311 #endif
   1312 
   1313 #ifdef USE_NEON
   1314  if (mozilla::supports_neon()) switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
   1315      UNPACK_ROW_RGB_NEON(SurfaceFormat::R8G8B8X8)
   1316      UNPACK_ROW_RGB_NEON(SurfaceFormat::R8G8B8A8)
   1317      UNPACK_ROW_RGB_NEON(SurfaceFormat::B8G8R8X8)
   1318      UNPACK_ROW_RGB_NEON(SurfaceFormat::B8G8R8A8)
   1319      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
   1320      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
   1321      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
   1322      SWIZZLE_ROW_NEON(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
   1323      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
   1324      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
   1325      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
   1326      SWIZZLE_ROW_NEON(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
   1327      default:
   1328        break;
   1329    }
   1330 #endif
   1331 
   1332  switch (FORMAT_KEY(aSrcFormat, aDstFormat)) {
   1333    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8A8)
   1334    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8X8)
   1335    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8A8, SurfaceFormat::R8G8B8X8)
   1336    SWIZZLE_ROW_FALLBACK(SurfaceFormat::B8G8R8X8, SurfaceFormat::R8G8B8A8)
   1337 
   1338    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8A8)
   1339    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8X8)
   1340    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::B8G8R8X8)
   1341    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::B8G8R8A8)
   1342    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8A8, SurfaceFormat::A8R8G8B8)
   1343    SWIZZLE_ROW_FALLBACK(SurfaceFormat::R8G8B8X8, SurfaceFormat::X8R8G8B8)
   1344 
   1345    SWIZZLE_ROW_FALLBACK(SurfaceFormat::A8R8G8B8, SurfaceFormat::R8G8B8A8)
   1346    SWIZZLE_ROW_FALLBACK(SurfaceFormat::X8R8G8B8, SurfaceFormat::R8G8B8X8)
   1347    SWIZZLE_ROW_FALLBACK(SurfaceFormat::A8R8G8B8, SurfaceFormat::R8G8B8X8)
   1348    SWIZZLE_ROW_FALLBACK(SurfaceFormat::X8R8G8B8, SurfaceFormat::R8G8B8A8)
   1349 
   1350    SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8A8, SurfaceFormat::B8G8R8X8)
   1351    SWIZZLE_ROW_OPAQUE(SurfaceFormat::B8G8R8X8, SurfaceFormat::B8G8R8A8)
   1352    SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8A8, SurfaceFormat::R8G8B8X8)
   1353    SWIZZLE_ROW_OPAQUE(SurfaceFormat::R8G8B8X8, SurfaceFormat::R8G8B8A8)
   1354    SWIZZLE_ROW_OPAQUE(SurfaceFormat::A8R8G8B8, SurfaceFormat::X8R8G8B8)
   1355    SWIZZLE_ROW_OPAQUE(SurfaceFormat::X8R8G8B8, SurfaceFormat::A8R8G8B8)
   1356 
   1357    SWIZZLE_ROW_SWAP(SurfaceFormat::B8G8R8A8, SurfaceFormat::A8R8G8B8)
   1358    SWIZZLE_ROW_SWAP(SurfaceFormat::B8G8R8A8, SurfaceFormat::X8R8G8B8)
   1359    SWIZZLE_ROW_SWAP(SurfaceFormat::B8G8R8X8, SurfaceFormat::X8R8G8B8)
   1360    SWIZZLE_ROW_SWAP(SurfaceFormat::B8G8R8X8, SurfaceFormat::A8R8G8B8)
   1361    SWIZZLE_ROW_SWAP(SurfaceFormat::A8R8G8B8, SurfaceFormat::B8G8R8A8)
   1362    SWIZZLE_ROW_SWAP(SurfaceFormat::A8R8G8B8, SurfaceFormat::B8G8R8X8)
   1363    SWIZZLE_ROW_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8X8)
   1364    SWIZZLE_ROW_SWAP(SurfaceFormat::X8R8G8B8, SurfaceFormat::B8G8R8A8)
   1365 
   1366    SWIZZLE_ROW_SWAP_RGB24(SurfaceFormat::R8G8B8, SurfaceFormat::B8G8R8)
   1367    SWIZZLE_ROW_SWAP_RGB24(SurfaceFormat::B8G8R8, SurfaceFormat::R8G8B8)
   1368 
   1369    UNPACK_ROW_RGB(SurfaceFormat::R8G8B8X8)
   1370    UNPACK_ROW_RGB(SurfaceFormat::R8G8B8A8)
   1371    UNPACK_ROW_RGB(SurfaceFormat::B8G8R8X8)
   1372    UNPACK_ROW_RGB(SurfaceFormat::B8G8R8A8)
   1373    UNPACK_ROW_RGB_TO_ARGB(SurfaceFormat::A8R8G8B8)
   1374    UNPACK_ROW_RGB_TO_ARGB(SurfaceFormat::X8R8G8B8)
   1375 
   1376    PACK_ROW_RGB(SurfaceFormat::R8G8B8, PackRowToRGB24)
   1377    PACK_ROW_RGB(SurfaceFormat::B8G8R8, PackRowToRGB24)
   1378 
   1379    default:
   1380      break;
   1381  }
   1382 
   1383  if (aSrcFormat == aDstFormat) {
   1384    switch (BytesPerPixel(aSrcFormat)) {
   1385      case 4:
   1386        return &SwizzleRowCopy<4>;
   1387      case 3:
   1388        return &SwizzleRowCopy<3>;
   1389      default:
   1390        break;
   1391    }
   1392  }
   1393 
   1394  MOZ_ASSERT_UNREACHABLE("Unsupported swizzle formats");
   1395  return nullptr;
   1396 }
   1397 
   1398 static IntRect ReorientRowRotate0FlipFallback(const uint8_t* aSrc,
   1399                                              int32_t aSrcRow, uint8_t* aDst,
   1400                                              const IntSize& aDstSize,
   1401                                              int32_t aDstStride) {
   1402  // Reverse order of pixels in the row.
   1403  const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
   1404  const uint32_t* end = src + aDstSize.width;
   1405  uint32_t* dst = reinterpret_cast<uint32_t*>(aDst + aSrcRow * aDstStride) +
   1406                  aDstSize.width - 1;
   1407  do {
   1408    *dst-- = *src++;
   1409  } while (src < end);
   1410 
   1411  return IntRect(0, aSrcRow, aDstSize.width, 1);
   1412 }
   1413 
   1414 static IntRect ReorientRowRotate90FlipFallback(const uint8_t* aSrc,
   1415                                               int32_t aSrcRow, uint8_t* aDst,
   1416                                               const IntSize& aDstSize,
   1417                                               int32_t aDstStride) {
   1418  // Copy row of pixels from top to bottom, into left to right columns.
   1419  const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
   1420  const uint32_t* end = src + aDstSize.height;
   1421  uint32_t* dst = reinterpret_cast<uint32_t*>(aDst) + aSrcRow;
   1422  int32_t stride = aDstStride / sizeof(uint32_t);
   1423  do {
   1424    *dst = *src++;
   1425    dst += stride;
   1426  } while (src < end);
   1427 
   1428  return IntRect(aSrcRow, 0, 1, aDstSize.height);
   1429 }
   1430 
   1431 static IntRect ReorientRowRotate180FlipFallback(const uint8_t* aSrc,
   1432                                                int32_t aSrcRow, uint8_t* aDst,
   1433                                                const IntSize& aDstSize,
   1434                                                int32_t aDstStride) {
   1435  // Copy row of pixels from top to bottom, into bottom to top rows.
   1436  uint8_t* dst = aDst + (aDstSize.height - aSrcRow - 1) * aDstStride;
   1437  memcpy(dst, aSrc, aDstSize.width * sizeof(uint32_t));
   1438  return IntRect(0, aDstSize.height - aSrcRow - 1, aDstSize.width, 1);
   1439 }
   1440 
   1441 static IntRect ReorientRowRotate270FlipFallback(const uint8_t* aSrc,
   1442                                                int32_t aSrcRow, uint8_t* aDst,
   1443                                                const IntSize& aDstSize,
   1444                                                int32_t aDstStride) {
   1445  // Copy row of pixels in reverse order from top to bottom, into right to left
   1446  // columns.
   1447  const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
   1448  const uint32_t* end = src + aDstSize.height;
   1449  uint32_t* dst =
   1450      reinterpret_cast<uint32_t*>(aDst + (aDstSize.height - 1) * aDstStride) +
   1451      aDstSize.width - aSrcRow - 1;
   1452  int32_t stride = aDstStride / sizeof(uint32_t);
   1453  do {
   1454    *dst = *src++;
   1455    dst -= stride;
   1456  } while (src < end);
   1457 
   1458  return IntRect(aDstSize.width - aSrcRow - 1, 0, 1, aDstSize.height);
   1459 }
   1460 
   1461 static IntRect ReorientRowRotate0Fallback(const uint8_t* aSrc, int32_t aSrcRow,
   1462                                          uint8_t* aDst,
   1463                                          const IntSize& aDstSize,
   1464                                          int32_t aDstStride) {
   1465  // Copy row of pixels into the destination.
   1466  uint8_t* dst = aDst + aSrcRow * aDstStride;
   1467  memcpy(dst, aSrc, aDstSize.width * sizeof(uint32_t));
   1468  return IntRect(0, aSrcRow, aDstSize.width, 1);
   1469 }
   1470 
   1471 static IntRect ReorientRowRotate90Fallback(const uint8_t* aSrc, int32_t aSrcRow,
   1472                                           uint8_t* aDst,
   1473                                           const IntSize& aDstSize,
   1474                                           int32_t aDstStride) {
   1475  // Copy row of pixels from top to bottom, into right to left columns.
   1476  const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
   1477  const uint32_t* end = src + aDstSize.height;
   1478  uint32_t* dst =
   1479      reinterpret_cast<uint32_t*>(aDst) + aDstSize.width - aSrcRow - 1;
   1480  int32_t stride = aDstStride / sizeof(uint32_t);
   1481  do {
   1482    *dst = *src++;
   1483    dst += stride;
   1484  } while (src < end);
   1485 
   1486  return IntRect(aDstSize.width - aSrcRow - 1, 0, 1, aDstSize.height);
   1487 }
   1488 
   1489 static IntRect ReorientRowRotate180Fallback(const uint8_t* aSrc,
   1490                                            int32_t aSrcRow, uint8_t* aDst,
   1491                                            const IntSize& aDstSize,
   1492                                            int32_t aDstStride) {
   1493  // Copy row of pixels in reverse order from top to bottom, into bottom to top
   1494  // rows.
   1495  const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
   1496  const uint32_t* end = src + aDstSize.width;
   1497  uint32_t* dst = reinterpret_cast<uint32_t*>(
   1498                      aDst + (aDstSize.height - aSrcRow - 1) * aDstStride) +
   1499                  aDstSize.width - 1;
   1500  do {
   1501    *dst-- = *src++;
   1502  } while (src < end);
   1503 
   1504  return IntRect(0, aDstSize.height - aSrcRow - 1, aDstSize.width, 1);
   1505 }
   1506 
   1507 static IntRect ReorientRowRotate270Fallback(const uint8_t* aSrc,
   1508                                            int32_t aSrcRow, uint8_t* aDst,
   1509                                            const IntSize& aDstSize,
   1510                                            int32_t aDstStride) {
   1511  // Copy row of pixels in reverse order from top to bottom, into left to right
   1512  // column.
   1513  const uint32_t* src = reinterpret_cast<const uint32_t*>(aSrc);
   1514  const uint32_t* end = src + aDstSize.height;
   1515  uint32_t* dst =
   1516      reinterpret_cast<uint32_t*>(aDst + (aDstSize.height - 1) * aDstStride) +
   1517      aSrcRow;
   1518  int32_t stride = aDstStride / sizeof(uint32_t);
   1519  do {
   1520    *dst = *src++;
   1521    dst -= stride;
   1522  } while (src < end);
   1523 
   1524  return IntRect(aSrcRow, 0, 1, aDstSize.height);
   1525 }
   1526 
   1527 ReorientRowFn ReorientRow(const struct image::Orientation& aOrientation) {
   1528  switch (aOrientation.flip) {
   1529    case image::Flip::Unflipped:
   1530      switch (aOrientation.rotation) {
   1531        case image::Angle::D0:
   1532          return &ReorientRowRotate0Fallback;
   1533        case image::Angle::D90:
   1534          return &ReorientRowRotate90Fallback;
   1535        case image::Angle::D180:
   1536          return &ReorientRowRotate180Fallback;
   1537        case image::Angle::D270:
   1538          return &ReorientRowRotate270Fallback;
   1539        default:
   1540          break;
   1541      }
   1542      break;
   1543    case image::Flip::Horizontal:
   1544      switch (aOrientation.rotation) {
   1545        case image::Angle::D0:
   1546          return &ReorientRowRotate0FlipFallback;
   1547        case image::Angle::D90:
   1548          if (aOrientation.flipFirst) {
   1549            return &ReorientRowRotate270FlipFallback;
   1550          } else {
   1551            return &ReorientRowRotate90FlipFallback;
   1552          }
   1553        case image::Angle::D180:
   1554          return &ReorientRowRotate180FlipFallback;
   1555        case image::Angle::D270:
   1556          if (aOrientation.flipFirst) {
   1557            return &ReorientRowRotate90FlipFallback;
   1558          } else {
   1559            return &ReorientRowRotate270FlipFallback;
   1560          }
   1561        default:
   1562          break;
   1563      }
   1564      break;
   1565    default:
   1566      break;
   1567  }
   1568 
   1569  MOZ_ASSERT_UNREACHABLE("Unhandled orientation!");
   1570  return nullptr;
   1571 }
   1572 
   1573 }  // namespace gfx
   1574 }  // namespace mozilla