tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

SwizzleNEON.cpp (18433B)


/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Swizzle.h"

#include <arm_neon.h>

namespace mozilla {
namespace gfx {

// Load 1-3 pixels into a 4 pixel vector.
static MOZ_ALWAYS_INLINE uint16x8_t LoadRemainder_NEON(const uint8_t* aSrc,
                                                       size_t aLength) {
  const uint32_t* src32 = reinterpret_cast<const uint32_t*>(aSrc);
  uint32x4_t dst32;
  if (aLength >= 2) {
    // Load first 2 pixels
    dst32 = vcombine_u32(vld1_u32(src32), vdup_n_u32(0));
    // Load third pixel
    if (aLength >= 3) {
      dst32 = vld1q_lane_u32(src32 + 2, dst32, 2);
    }
  } else {
    // Load single pixel
    dst32 = vld1q_lane_u32(src32, vdupq_n_u32(0), 0);
  }
  return vreinterpretq_u16_u32(dst32);
}

// Store 1-3 pixels from a vector into memory without overwriting.
static MOZ_ALWAYS_INLINE void StoreRemainder_NEON(uint8_t* aDst, size_t aLength,
                                                  const uint16x8_t& aSrc) {
  uint32_t* dst32 = reinterpret_cast<uint32_t*>(aDst);
  uint32x4_t src32 = vreinterpretq_u32_u16(aSrc);
  if (aLength >= 2) {
    // Store first 2 pixels
    vst1_u32(dst32, vget_low_u32(src32));
    // Store third pixel
    if (aLength >= 3) {
      vst1q_lane_u32(dst32 + 2, src32, 2);
    }
  } else {
    // Store single pixel
    vst1q_lane_u32(dst32, src32, 0);
  }
}
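
// Note: both helpers treat each 32-bit lane as one whole 8888 pixel, so a
// remainder of 1-3 pixels maps onto 1-3 lane accesses and never reads or
// writes bytes outside the row.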

// Premultiply vector of 4 pixels using splayed math.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE uint16x8_t
PremultiplyVector_NEON(const uint16x8_t& aSrc) {
  // Isolate R and B with mask.
  const uint16x8_t mask = vdupq_n_u16(0x00FF);
  uint16x8_t rb = vandq_u16(aSrc, mask);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = vrev32q_u16(rb);
  }
  // Isolate G and A by shifting down to bottom of word.
  uint16x8_t ga = vshrq_n_u16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  uint16x8_t alphas = vtrnq_u16(ga, ga).val[1];

  // rb = rb*a + 255; rb += rb >> 8;
  rb = vmlaq_u16(mask, rb, alphas);
  rb = vsraq_n_u16(rb, rb, 8);
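  // This is the usual fixed-point division by 255: with t = v*a + 255,
  // (t + (t >> 8)) >> 8 equals ceil(v*a / 255) for v, a in [0, 255]. E.g. for
  // v = a = 255, t = 65280 and t + (t >> 8) = 65535, so the final >> 8 in the
  // combine below yields 255.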

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0x00FF0000)));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = vmlaq_u16(mask, ga, alphas);
  ga = vsraq_n_u16(ga, ga, 8);
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  return vsriq_n_u16(ga, rb, 8);
}
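
// For reference, a minimal scalar sketch of what PremultiplyVector_NEON
// computes per pixel (not built; the byte positions assumed here follow the
// vector layout above, with aSwapRB exchanging bytes 0 and 2).
#if 0
static uint32_t PremultiplyPixel_Scalar(uint32_t aPx, bool aSwapRB,
                                        bool aOpaqueAlpha) {
  uint32_t c0 = aPx & 0xFF, c1 = (aPx >> 8) & 0xFF, c2 = (aPx >> 16) & 0xFF;
  uint32_t a = aPx >> 24;
  if (aSwapRB) {
    uint32_t t = c0;
    c0 = c2;
    c2 = t;
  }
  // Same ceil(v*a/255) approximation as the vector code.
  auto mul = [&](uint32_t v) {
    uint32_t t = v * a + 255;
    return (t + (t >> 8)) >> 8;
  };
  uint32_t outA = aOpaqueAlpha ? 255 : a;
  return mul(c0) | (mul(c1) << 8) | (mul(c2) << 16) | (outA << 24);
}
#endif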

template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void PremultiplyChunk_NEON(const uint8_t*& aSrc,
                                                    uint8_t*& aDst,
                                                    int32_t aAlignedRow,
                                                    int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
    px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
    px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_NEON(aDst, aRemainder, px);
  }
}

template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
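  // alignedRow is a byte count (4 bytes per pixel); remainder is in pixels.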
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                               remainder);
}

template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                      int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                                 remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of premultiply variants here.
template void PremultiplyRow_NEON<false, false>(const uint8_t*, uint8_t*,
                                                int32_t);
template void PremultiplyRow_NEON<false, true>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_NEON<true, false>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_NEON<true, true>(const uint8_t*, uint8_t*,
                                              int32_t);
template void Premultiply_NEON<false, false>(const uint8_t*, int32_t, uint8_t*,
                                             int32_t, IntSize);
template void Premultiply_NEON<false, true>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
                                           int32_t, IntSize);

// This generates a table of fixed-point reciprocals representing 1/alpha
// similar to the fallback implementation. However, the reciprocal must
// ultimately be multiplied as an unsigned 9 bit upper part and a signed
// 15 bit lower part to cheaply multiply. Thus, the lower 15 bits of the
// reciprocal are masked off and stored in the low word. The upper 9 bits
// are masked and shifted to fit into the high word. These then get
// independently multiplied with the color component and recombined to
// provide the full reciprocal multiply.
#define UNPREMULQ_NEON(x) \
  ((((0xFF00FFU / (x)) & 0xFF8000U) << 1) | ((0xFF00FFU / (x)) & 0x7FFFU))
#define UNPREMULQ_NEON_2(x) UNPREMULQ_NEON(x), UNPREMULQ_NEON((x) + 1)
#define UNPREMULQ_NEON_4(x) UNPREMULQ_NEON_2(x), UNPREMULQ_NEON_2((x) + 2)
#define UNPREMULQ_NEON_8(x) UNPREMULQ_NEON_4(x), UNPREMULQ_NEON_4((x) + 4)
#define UNPREMULQ_NEON_16(x) UNPREMULQ_NEON_8(x), UNPREMULQ_NEON_8((x) + 8)
#define UNPREMULQ_NEON_32(x) UNPREMULQ_NEON_16(x), UNPREMULQ_NEON_16((x) + 16)
static const uint32_t sUnpremultiplyTable_NEON[256] = {0,
                                                       UNPREMULQ_NEON(1),
                                                       UNPREMULQ_NEON_2(2),
                                                       UNPREMULQ_NEON_4(4),
                                                       UNPREMULQ_NEON_8(8),
                                                       UNPREMULQ_NEON_16(16),
                                                       UNPREMULQ_NEON_32(32),
                                                       UNPREMULQ_NEON_32(64),
                                                       UNPREMULQ_NEON_32(96),
                                                       UNPREMULQ_NEON_32(128),
                                                       UNPREMULQ_NEON_32(160),
                                                       UNPREMULQ_NEON_32(192),
                                                       UNPREMULQ_NEON_32(224)};
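
// Worked example: 0xFF00FF = 255 * 0x10001, so 0xFF00FF / a is roughly
// (255 << 16) / a, i.e. 255/a in 16.16 fixed point. For a = 128 the quotient
// is 0x1FE01; the macro stores lo = 0x7E01 in the low word and the upper 9
// bits, pre-doubled by the shift, as hi = 3 in the high word. For a
// premultiplied component c = 64, the splayed multiply below computes
// ((64 * 3) + ((64 * 0x7E01 * 2) >> 16)) / 2 = (192 + 63) / 2 = 127,
// matching 64 * 255 / 128 = 127.5 rounded down.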

// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division.
template <bool aSwapRB>
static MOZ_ALWAYS_INLINE uint16x8_t
UnpremultiplyVector_NEON(const uint16x8_t& aSrc) {
  // Isolate R and B with mask.
  uint16x8_t rb = vandq_u16(aSrc, vdupq_n_u16(0x00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = vrev32q_u16(rb);
  }

  // Isolate G and A by shifting down to bottom of word.
  uint16x8_t ga = vshrq_n_u16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = vgetq_lane_u16(ga, 1);
  int a2 = vgetq_lane_u16(ga, 3);
  int a3 = vgetq_lane_u16(ga, 5);
  int a4 = vgetq_lane_u16(ga, 7);

  // First load all of the interleaved low and high portions of the reciprocals
  // and combine them in a single vector as lo1 hi1 lo2 hi2 lo3 hi3 lo4 hi4
  uint16x8_t q1234 = vreinterpretq_u16_u32(vld1q_lane_u32(
      &sUnpremultiplyTable_NEON[a4],
      vld1q_lane_u32(
          &sUnpremultiplyTable_NEON[a3],
          vld1q_lane_u32(
              &sUnpremultiplyTable_NEON[a2],
              vld1q_lane_u32(&sUnpremultiplyTable_NEON[a1], vdupq_n_u32(0), 0),
              1),
          2),
      3));
  // Transpose the interleaved low/high portions so that we produce two
  // separate duplicated vectors for the low and high portions respectively:
  // lo1 lo1 lo2 lo2 lo3 lo3 lo4 lo4 and hi1 hi1 hi2 hi2 hi3 hi3 hi4 hi4
  uint16x8x2_t q1234lohi = vtrnq_u16(q1234, q1234);

  // VQDMULH is a signed multiply that doubles (*2) the result, then takes the
  // high word. To work around the signedness and the doubling, the low
  // portion of the reciprocal only stores the lower 15 bits, which fits in a
  // signed 16 bit integer. The high 9 bit portion is effectively also doubled
  // by 2 as a side-effect of being shifted for storage. Thus the output scale
  // of doing a normal multiply by the high portion and the VQDMULH by the low
  // portion are both doubled and can be safely added together. The resulting
  // sum just needs to be halved (via VHADD) to thus cancel out the doubling.
  // All this combines to produce a reciprocal multiply of the form:
  // rb = ((rb * hi) + ((rb * lo * 2) >> 16)) / 2
  rb = vhaddq_u16(
      vmulq_u16(rb, q1234lohi.val[1]),
      vreinterpretq_u16_s16(vqdmulhq_s16(
          vreinterpretq_s16_u16(rb), vreinterpretq_s16_u16(q1234lohi.val[0]))));

  // ga = ((ga * hi) + ((ga * lo * 2) >> 16)) / 2
  ga = vhaddq_u16(
      vmulq_u16(ga, q1234lohi.val[1]),
      vreinterpretq_u16_s16(vqdmulhq_s16(
          vreinterpretq_s16_u16(ga), vreinterpretq_s16_u16(q1234lohi.val[0]))));

  // Combine to the final pixel with ((rb | (ga << 8)) & ~0xFF000000) | (aSrc &
  // 0xFF000000), which inserts back in the original alpha value unchanged.
  return vbslq_u16(vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)), aSrc,
                   vsliq_n_u16(rb, ga, 8));
}
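
// Net effect per color component: approximately c * 255 / a via the splayed
// reciprocal multiply, with the original alpha byte passed through unchanged.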

template <bool aSwapRB>
static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_NEON(const uint8_t*& aSrc,
                                                      uint8_t*& aDst,
                                                      int32_t aAlignedRow,
                                                      int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
    px = UnpremultiplyVector_NEON<aSwapRB>(px);
    vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
    px = UnpremultiplyVector_NEON<aSwapRB>(px);
    StoreRemainder_NEON(aDst, aRemainder, px);
  }
}

template <bool aSwapRB>
void UnpremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst,
                           int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
}

template <bool aSwapRB>
void Unpremultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                        int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of unpremultiply variants here.
template void UnpremultiplyRow_NEON<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpremultiplyRow_NEON<true>(const uint8_t*, uint8_t*, int32_t);
template void Unpremultiply_NEON<false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);

// Swizzle a vector of 4 pixels providing swaps and opaquifying.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
  // Swap R and B, then add to G and A (forced to 255):
  // (((src >> 16) | (src << 16)) & 0x00FF00FF) |
  //   ((src | 0xFF000000) & ~0x00FF00FF)
  return vbslq_u16(
      vdupq_n_u16(0x00FF), vrev32q_u16(aSrc),
      aOpaqueAlpha
          ? vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)))
          : aSrc);
}
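
// vbslq_u16(mask, a, b) is a bit select, (mask & a) | (~mask & b): the swapped
// R/B bytes are kept where the 0x00FF mask is set, and the (optionally
// opaquified) G/A bytes are kept elsewhere.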

#if 0
// These specializations currently do not profile faster than the generic
// versions, so disable them for now.

// Optimized implementations for when there is no R and B swap.
template<>
static MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
{
  // Force alpha to 255.
  return vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
}

template<>
static MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, false>(const uint16x8_t& aSrc)
{
  return aSrc;
}
#endif

template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void SwizzleChunk_NEON(const uint8_t*& aSrc,
                                                uint8_t*& aDst,
                                                int32_t aAlignedRow,
                                                int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
    px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
    px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_NEON(aDst, aRemainder, px);
  }
}

template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
}

template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                  int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of swizzle variants here.
template void SwizzleRow_NEON<true, false>(const uint8_t*, uint8_t*, int32_t);
template void SwizzleRow_NEON<true, true>(const uint8_t*, uint8_t*, int32_t);
template void Swizzle_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Swizzle_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);

template <bool aSwapRB>
void UnpackRowRGB24(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength);

template <bool aSwapRB>
void UnpackRowRGB24_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  // Because this implementation will read an additional 4 bytes of data that
  // is ignored and masked over, we cannot use the accelerated version for the
  // last 1-5 pixels (3-15 bytes remaining) to guarantee we don't access memory
  // outside the buffer (we read in 16 byte chunks).
  if (aLength < 6) {
    UnpackRowRGB24<aSwapRB>(aSrc, aDst, aLength);
    return;
  }

  // Because we are expanding, we can only process the data back to front in
  // case we are performing this in place.
  int32_t alignedRow = (aLength - 2) & ~3;
  int32_t remainder = aLength - alignedRow;
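  // With aLength >= 6, alignedRow >= 4 and remainder is 2-5 pixels, so the 4
  // extra bytes read by the final 16-byte vector load land inside the
  // remainder pixels' bytes rather than past the end of the row.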

  const uint8_t* src = aSrc + alignedRow * 3;
  uint8_t* dst = aDst + alignedRow * 4;

  // Handle 2-5 remaining pixels.
  UnpackRowRGB24<aSwapRB>(src, dst, remainder);

  uint8x8_t masklo;
  uint8x8_t maskhi;
  if (aSwapRB) {
    static const uint8_t masklo_data[] = {2, 1, 0, 0, 5, 4, 3, 0};
    static const uint8_t maskhi_data[] = {4, 3, 2, 0, 7, 6, 5, 0};
    masklo = vld1_u8(masklo_data);
    maskhi = vld1_u8(maskhi_data);
  } else {
    static const uint8_t masklo_data[] = {0, 1, 2, 0, 3, 4, 5, 0};
    static const uint8_t maskhi_data[] = {2, 3, 4, 0, 5, 6, 7, 0};
    masklo = vld1_u8(masklo_data);
    maskhi = vld1_u8(maskhi_data);
  }
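
  // Each mask is a vtbl1 byte-index table: output byte i is taken from input
  // byte mask[i], expanding two 3-byte pixels into two 4-byte slots. The
  // placeholder 0 indices land in the alpha byte, which the OR with
  // 0xFF000000 below forces to 0xFF regardless.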

  uint8x16_t alpha = vreinterpretq_u8_u32(vdupq_n_u32(0xFF000000));

  // Process all 4-pixel chunks as one vector.
  src -= 4 * 3;
  dst -= 4 * 4;
  while (src >= aSrc) {
    uint8x16_t px = vld1q_u8(src);
    // G2R2B1G1 R1B0G0R0 -> X1R1G1B1 X0R0G0B0
    uint8x8_t pxlo = vtbl1_u8(vget_low_u8(px), masklo);
    // B3G3R3B2 G2R2B1G1 -> X3R3G3B3 X2R2G2B2
    uint8x8_t pxhi =
        vtbl1_u8(vext_u8(vget_low_u8(px), vget_high_u8(px), 4), maskhi);
    px = vcombine_u8(pxlo, pxhi);
    px = vorrq_u8(px, alpha);
    vst1q_u8(dst, px);
    src -= 4 * 3;
    dst -= 4 * 4;
  }
}

// Force instantiation of unpack variants here.
template void UnpackRowRGB24_NEON<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpackRowRGB24_NEON<true>(const uint8_t*, uint8_t*, int32_t);

}  // namespace gfx
}  // namespace mozilla