[ tor-browser ].git.dasho

SwizzleSSE2.cpp (16370B)
      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 /* This Source Code Form is subject to the terms of the Mozilla Public
      4 * License, v. 2.0. If a copy of the MPL was not distributed with this
      5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      6 
#include "Swizzle.h"

#include <emmintrin.h>

#include <cstring>
     10 
     11 namespace mozilla::gfx {
     12 
     13 // Load 1-3 pixels into a 4 pixel vector.
     14 static MOZ_ALWAYS_INLINE __m128i LoadRemainder_SSE2(const uint8_t* aSrc,
     15                                                    size_t aLength) {
     16  __m128i px;
     17  if (aLength >= 2) {
     18    // Load first 2 pixels
     19    px = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(aSrc));
     20    // Load third pixel
     21    if (aLength >= 3) {
     22      px = _mm_unpacklo_epi64(
     23          px,
     24          _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc + 2 * 4)));
     25    }
     26  } else {
     27    // Load single pixel
     28    px = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc));
     29  }
     30  return px;
     31 }
     32 
     33 // Store 1-3 pixels from a vector into memory without overwriting.
     34 static MOZ_ALWAYS_INLINE void StoreRemainder_SSE2(uint8_t* aDst, size_t aLength,
     35                                                  const __m128i& aSrc) {
     36  if (aLength >= 2) {
     37    // Store first 2 pixels
     38    _mm_storel_epi64(reinterpret_cast<__m128i*>(aDst), aSrc);
     39    // Store third pixel
     40    if (aLength >= 3) {
     41      *reinterpret_cast<uint32_t*>(aDst + 2 * 4) =
     42          _mm_cvtsi128_si32(_mm_srli_si128(aSrc, 2 * 4));
     43    }
     44  } else {
     45    // Store single pixel
     46    *reinterpret_cast<uint32_t*>(aDst) = _mm_cvtsi128_si32(aSrc);
     47  }
     48 }
     49 
     50 // Premultiply vector of 4 pixels using splayed math.
     51 template <bool aSwapRB, bool aOpaqueAlpha>
     52 static MOZ_ALWAYS_INLINE __m128i PremultiplyVector_SSE2(const __m128i& aSrc) {
     53  // Isolate R and B with mask.
     54  const __m128i mask = _mm_set1_epi32(0x00FF00FF);
     55  __m128i rb = _mm_and_si128(mask, aSrc);
     56  // Swap R and B if necessary.
     57  if (aSwapRB) {
     58    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
     59    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
     60  }
     61  // Isolate G and A by shifting down to bottom of word.
     62  __m128i ga = _mm_srli_epi16(aSrc, 8);
     63 
     64  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
     65  __m128i alphas = _mm_shufflelo_epi16(ga, _MM_SHUFFLE(3, 3, 1, 1));
     66  alphas = _mm_shufflehi_epi16(alphas, _MM_SHUFFLE(3, 3, 1, 1));
     67 
     68  // rb = rb*a + 255; rb += rb >> 8;
     69  rb = _mm_add_epi16(_mm_mullo_epi16(rb, alphas), mask);
     70  rb = _mm_add_epi16(rb, _mm_srli_epi16(rb, 8));
     71 
     72  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
     73  if (!aOpaqueAlpha) {
     74    ga = _mm_or_si128(ga, _mm_set1_epi32(0x00FF0000));
     75  }
     76  // ga = ga*a + 255; ga += ga >> 8;
     77  ga = _mm_add_epi16(_mm_mullo_epi16(ga, alphas), mask);
     78  ga = _mm_add_epi16(ga, _mm_srli_epi16(ga, 8));
     79  // If format is opaque, force output A to be 255.
     80  if (aOpaqueAlpha) {
     81    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
     82  }
     83 
     84  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
     85  rb = _mm_srli_epi16(rb, 8);
     86  ga = _mm_andnot_si128(mask, ga);
     87  return _mm_or_si128(rb, ga);
     88 }
     89 
     90 // Premultiply vector of aAlignedRow + aRemainder pixels.
     91 template <bool aSwapRB, bool aOpaqueAlpha>
     92 static MOZ_ALWAYS_INLINE void PremultiplyChunk_SSE2(const uint8_t*& aSrc,
     93                                                    uint8_t*& aDst,
     94                                                    int32_t aAlignedRow,
     95                                                    int32_t aRemainder) {
     96  // Process all 4-pixel chunks as one vector.
     97  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
     98    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
     99    px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    100    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    101    aSrc += 4 * 4;
    102    aDst += 4 * 4;
    103  }
    104 
    105  // Handle any 1-3 remaining pixels.
    106  if (aRemainder) {
    107    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    108    px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    109    StoreRemainder_SSE2(aDst, aRemainder, px);
    110  }
    111 }
    112 
    113 // Premultiply vector of aLength pixels.
    114 template <bool aSwapRB, bool aOpaqueAlpha>
    115 void PremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
    116  int32_t alignedRow = 4 * (aLength & ~3);
    117  int32_t remainder = aLength & 3;
    118  PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
    119                                               remainder);
    120 }
    121 
    122 template <bool aSwapRB, bool aOpaqueAlpha>
    123 void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
    124                      int32_t aDstGap, IntSize aSize) {
    125  int32_t alignedRow = 4 * (aSize.width & ~3);
    126  int32_t remainder = aSize.width & 3;
    127  // Fold remainder into stride gap.
    128  aSrcGap += 4 * remainder;
    129  aDstGap += 4 * remainder;
    130 
    131  for (int32_t height = aSize.height; height > 0; height--) {
    132    PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
    133                                                 remainder);
    134    aSrc += aSrcGap;
    135    aDst += aDstGap;
    136  }
    137 }
    138 
    139 // Force instantiation of premultiply variants here.
    140 template void PremultiplyRow_SSE2<false, false>(const uint8_t*, uint8_t*,
    141                                                int32_t);
    142 template void PremultiplyRow_SSE2<false, true>(const uint8_t*, uint8_t*,
    143                                               int32_t);
    144 template void PremultiplyRow_SSE2<true, false>(const uint8_t*, uint8_t*,
    145                                               int32_t);
    146 template void PremultiplyRow_SSE2<true, true>(const uint8_t*, uint8_t*,
    147                                              int32_t);
    148 template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*,
    149                                             int32_t, IntSize);
    150 template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*,
    151                                            int32_t, IntSize);
    152 template void Premultiply_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
    153                                            int32_t, IntSize);
    154 template void Premultiply_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,
    155                                           int32_t, IntSize);
    156 
    157 // This generates a table of fixed-point reciprocals representing 1/alpha
    158 // similar to the fallback implementation. However, the reciprocal must fit
    159 // in 16 bits to multiply cheaply. Observe that reciprocals of smaller alphas
    160 // require more bits than for larger alphas. We take advantage of this by
    161 // shifting the reciprocal down by either 3 or 8 bits depending on whether
    162 // the alpha value is less than 0x20. This is easy to then undo by multiplying
    163 // the color component to be unpremultiplying by either 8 or 0x100,
    164 // respectively. The 16 bit reciprocal is duplicated into both words of a
    165 // uint32_t here to reduce unpacking overhead.
    166 #define UNPREMULQ_SSE2(x) \
    167  (0x10001U * (0xFF0220U / ((x) * ((x) < 0x20 ? 0x100 : 8))))
    168 #define UNPREMULQ_SSE2_2(x) UNPREMULQ_SSE2(x), UNPREMULQ_SSE2((x) + 1)
    169 #define UNPREMULQ_SSE2_4(x) UNPREMULQ_SSE2_2(x), UNPREMULQ_SSE2_2((x) + 2)
    170 #define UNPREMULQ_SSE2_8(x) UNPREMULQ_SSE2_4(x), UNPREMULQ_SSE2_4((x) + 4)
    171 #define UNPREMULQ_SSE2_16(x) UNPREMULQ_SSE2_8(x), UNPREMULQ_SSE2_8((x) + 8)
    172 #define UNPREMULQ_SSE2_32(x) UNPREMULQ_SSE2_16(x), UNPREMULQ_SSE2_16((x) + 16)
    173 static const uint32_t sUnpremultiplyTable_SSE2[256] = {0,
    174                                                       UNPREMULQ_SSE2(1),
    175                                                       UNPREMULQ_SSE2_2(2),
    176                                                       UNPREMULQ_SSE2_4(4),
    177                                                       UNPREMULQ_SSE2_8(8),
    178                                                       UNPREMULQ_SSE2_16(16),
    179                                                       UNPREMULQ_SSE2_32(32),
    180                                                       UNPREMULQ_SSE2_32(64),
    181                                                       UNPREMULQ_SSE2_32(96),
    182                                                       UNPREMULQ_SSE2_32(128),
    183                                                       UNPREMULQ_SSE2_32(160),
    184                                                       UNPREMULQ_SSE2_32(192),
    185                                                       UNPREMULQ_SSE2_32(224)};
    186 
    187 // Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
    188 // that avoids doing any actual division.
    189 template <bool aSwapRB>
    190 static MOZ_ALWAYS_INLINE __m128i UnpremultiplyVector_SSE2(const __m128i& aSrc) {
    191  // Isolate R and B with mask.
    192  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
    193  // Swap R and B if necessary.
    194  if (aSwapRB) {
    195    rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    196    rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    197  }
    198 
    199  // Isolate G and A by shifting down to bottom of word.
    200  __m128i ga = _mm_srli_epi16(aSrc, 8);
    201  // Extract the alphas for the 4 pixels from the now isolated words.
    202  int a1 = _mm_extract_epi16(ga, 1);
    203  int a2 = _mm_extract_epi16(ga, 3);
    204  int a3 = _mm_extract_epi16(ga, 5);
    205  int a4 = _mm_extract_epi16(ga, 7);
    206 
    207  // Load the 16 bit reciprocals from the table for each alpha.
    208  // The reciprocals are doubled in each uint32_t entry.
    209  // Unpack them to a final vector of duplicated reciprocals of
    210  // the form Q1 Q1 Q2 Q2 Q3 Q3 Q4 Q4.
    211  __m128i q12 =
    212      _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a1]),
    213                         _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a2]));
    214  __m128i q34 =
    215      _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a3]),
    216                         _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a4]));
    217  __m128i q1234 = _mm_unpacklo_epi64(q12, q34);
    218 
    219  // Check if the alphas are less than 0x20, so that we can undo
    220  // scaling of the reciprocals as appropriate.
    221  __m128i scale = _mm_cmplt_epi32(ga, _mm_set1_epi32(0x00200000));
    222  // Produce scale factors by ((a < 0x20) ^ 8) & 0x108,
    223  // such that scale is 0x100 if < 0x20, and 8 otherwise.
    224  scale = _mm_xor_si128(scale, _mm_set1_epi16(8));
    225  scale = _mm_and_si128(scale, _mm_set1_epi16(0x108));
    226  // Isolate G now so that we don't accidentally unpremultiply A.
    227  ga = _mm_and_si128(ga, _mm_set1_epi32(0x000000FF));
    228 
    229  // Scale R, B, and G as required depending on reciprocal precision.
    230  rb = _mm_mullo_epi16(rb, scale);
    231  ga = _mm_mullo_epi16(ga, scale);
    232 
    233  // Multiply R, B, and G by the reciprocal, only taking the high word
    234  // too effectively shift right by 16.
    235  rb = _mm_mulhi_epu16(rb, q1234);
    236  ga = _mm_mulhi_epu16(ga, q1234);
    237 
    238  // Combine back to final pixel with rb | (ga << 8) | (aSrc & 0xFF000000),
    239  // which will add back on the original alpha value unchanged.
    240  ga = _mm_slli_si128(ga, 1);
    241  ga = _mm_or_si128(ga, _mm_and_si128(aSrc, _mm_set1_epi32(0xFF000000)));
    242  return _mm_or_si128(rb, ga);
    243 }
    244 
    245 template <bool aSwapRB>
    246 static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_SSE2(const uint8_t*& aSrc,
    247                                                      uint8_t*& aDst,
    248                                                      int32_t aAlignedRow,
    249                                                      int32_t aRemainder) {
    250  // Process all 4-pixel chunks as one vector.
    251  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    252    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    253    px = UnpremultiplyVector_SSE2<aSwapRB>(px);
    254    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    255    aSrc += 4 * 4;
    256    aDst += 4 * 4;
    257  }
    258 
    259  // Handle any 1-3 remaining pixels.
    260  if (aRemainder) {
    261    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    262    px = UnpremultiplyVector_SSE2<aSwapRB>(px);
    263    StoreRemainder_SSE2(aDst, aRemainder, px);
    264  }
    265 }
    266 
    267 template <bool aSwapRB>
    268 void UnpremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst,
    269                           int32_t aLength) {
    270  int32_t alignedRow = 4 * (aLength & ~3);
    271  int32_t remainder = aLength & 3;
    272  UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder);
    273 }
    274 
    275 template <bool aSwapRB>
    276 void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
    277                        int32_t aDstGap, IntSize aSize) {
    278  int32_t alignedRow = 4 * (aSize.width & ~3);
    279  int32_t remainder = aSize.width & 3;
    280  // Fold remainder into stride gap.
    281  aSrcGap += 4 * remainder;
    282  aDstGap += 4 * remainder;
    283 
    284  for (int32_t height = aSize.height; height > 0; height--) {
    285    UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder);
    286    aSrc += aSrcGap;
    287    aDst += aDstGap;
    288  }
    289 }
    290 
    291 // Force instantiation of unpremultiply variants here.
    292 template void UnpremultiplyRow_SSE2<false>(const uint8_t*, uint8_t*, int32_t);
    293 template void UnpremultiplyRow_SSE2<true>(const uint8_t*, uint8_t*, int32_t);
    294 template void Unpremultiply_SSE2<false>(const uint8_t*, int32_t, uint8_t*,
    295                                        int32_t, IntSize);
    296 template void Unpremultiply_SSE2<true>(const uint8_t*, int32_t, uint8_t*,
    297                                       int32_t, IntSize);
    298 
    299 // Swizzle a vector of 4 pixels providing swaps and opaquifying.
    300 template <bool aSwapRB, bool aOpaqueAlpha>
    301 static MOZ_ALWAYS_INLINE __m128i SwizzleVector_SSE2(const __m128i& aSrc) {
    302  // Isolate R and B.
    303  __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF));
    304  // Swap R and B.
    305  rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    306  rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1));
    307  // Isolate G and A.
    308  __m128i ga = _mm_and_si128(aSrc, _mm_set1_epi32(0xFF00FF00));
    309  // Force alpha to 255 if necessary.
    310  if (aOpaqueAlpha) {
    311    ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000));
    312  }
    313  // Combine everything back together.
    314  return _mm_or_si128(rb, ga);
    315 }
    316 
    317 #if 0
    318 // These specializations currently do not profile faster than the generic versions,
    319 // so disable them for now.
    320 
    321 // Optimized implementations for when there is no R and B swap.
    322 template<>
    323 MOZ_ALWAYS_INLINE __m128i
    324 SwizzleVector_SSE2<false, true>(const __m128i& aSrc)
    325 {
    326  // Force alpha to 255.
    327  return _mm_or_si128(aSrc, _mm_set1_epi32(0xFF000000));
    328 }
    329 
    330 template<>
    331 MOZ_ALWAYS_INLINE __m128i
    332 SwizzleVector_SSE2<false, false>(const __m128i& aSrc)
    333 {
    334  return aSrc;
    335 }
    336 #endif
    337 
    338 template <bool aSwapRB, bool aOpaqueAlpha>
    339 static MOZ_ALWAYS_INLINE void SwizzleChunk_SSE2(const uint8_t*& aSrc,
    340                                                uint8_t*& aDst,
    341                                                int32_t aAlignedRow,
    342                                                int32_t aRemainder) {
    343  // Process all 4-pixel chunks as one vector.
    344  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    345    __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc));
    346    px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    347    _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px);
    348    aSrc += 4 * 4;
    349    aDst += 4 * 4;
    350  }
    351 
    352  // Handle any 1-3 remaining pixels.
    353  if (aRemainder) {
    354    __m128i px = LoadRemainder_SSE2(aSrc, aRemainder);
    355    px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px);
    356    StoreRemainder_SSE2(aDst, aRemainder, px);
    357  }
    358 }
    359 
    360 template <bool aSwapRB, bool aOpaqueAlpha>
    361 void SwizzleRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
    362  int32_t alignedRow = 4 * (aLength & ~3);
    363  int32_t remainder = aLength & 3;
    364  SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
    365 }
    366 
    367 template <bool aSwapRB, bool aOpaqueAlpha>
    368 void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
    369                  int32_t aDstGap, IntSize aSize) {
    370  int32_t alignedRow = 4 * (aSize.width & ~3);
    371  int32_t remainder = aSize.width & 3;
    372  // Fold remainder into stride gap.
    373  aSrcGap += 4 * remainder;
    374  aDstGap += 4 * remainder;
    375 
    376  for (int32_t height = aSize.height; height > 0; height--) {
    377    SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
    378    aSrc += aSrcGap;
    379    aDst += aDstGap;
    380  }
    381 }
    382 
    383 // Force instantiation of swizzle variants here.
    384 template void SwizzleRow_SSE2<true, false>(const uint8_t*, uint8_t*, int32_t);
    385 template void SwizzleRow_SSE2<true, true>(const uint8_t*, uint8_t*, int32_t);
    386 template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*,
    387                                        int32_t, IntSize);
    388 template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*,
    389                                       int32_t, IntSize);
    390 
    391 }  // namespace mozilla::gfx
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE