tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

ImageScalingSSE2.cpp (12890B)


/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "ImageScaling.h"
#include "mozilla/Attributes.h"

#include "SSEHelpers.h"

/* The functions below use the following system for averaging 4 pixels:
 *
 * The first observation is that a half-adder is implemented as follows:
 * R = S + 2C, or, in the case of a and b: (a ^ b) + ((a & b) << 1);
 *
 * This can be trivially extended to three pixels by observing that when
 * doing (a ^ b ^ c) as the sum, the carry is simply the bitwise OR of the
 * pairwise carries, since the sum of 3 bits can only ever have a carry of
 * one.
 *
 * We then observe that the average is ((carry << 1) + sum) >> 1, or,
 * provided overflows and underflows are eliminated, carry + (sum >> 1).
 *
 * We now average our existing sum with the fourth number, so we get:
 * sum2 = (sum + d) >> 1 or (sum >> 1) + (d >> 1).
 *
 * We now observe that our sum has been moved into place relative to the
 * carry, so we can now average with the carry to get the final 4-input
 * average: avg = (sum2 + carry) >> 1;
 *
 * Or to reverse the proof:
 * avg = ((sum >> 1) + carry + (d >> 1)) >> 1
 * avg = (((a + b + c) >> 1) + (d >> 1)) >> 1
 * avg = (a + b + c + d) >> 2
 *
 * An additional fact used in the SSE versions is that we can trivially
 * convert a rounded average to a truncated average:
 *
 * We have:
 * f(a, b) = (a + b + 1) >> 1
 *
 * And want:
 * g(a, b) = (a + b) >> 1
 *
 * Observe:
 * ~f(~a, ~b) == ~((~a + ~b + 1) >> 1)
 *            == ~((-a - 1 + -b - 1 + 1) >> 1)
 *            == ~((-a - 1 + -b) >> 1)
 *            == ~((-(a + b) - 1) >> 1)
 *            == ~((~(a + b)) >> 1)
 *            == (a + b) >> 1
 *            == g(a, b)
 */
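
// Illustrative compile-time check of the derivation above for one byte lane
// (illustration only, not part of the upstream logic): with a = 1, b = 2,
// c = 3, d = 6 we get sum = 1 ^ 2 ^ 3 = 0 and
// carry = (1 & 2) | (1 & 3) | (2 & 3) = 3, so the two averaging steps give
// (((0 + 6) >> 1) + 3) >> 1 = 3 == (1 + 2 + 3 + 6) >> 2.
static_assert(((((1 ^ 2 ^ 3) + 6) >> 1) + ((1 & 2) | (1 & 3) | (2 & 3))) >> 1 ==
                  (1 + 2 + 3 + 6) >> 2,
              "sum/carry construction matches the plain four-input average");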

MOZ_ALWAYS_INLINE __m128i _mm_not_si128(__m128i arg) {
  __m128i minusone = _mm_set1_epi32(0xffffffff);
  return _mm_xor_si128(arg, minusone);
}
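
// Illustrative compile-time check (illustration only): _mm_avg_epu8 computes
// the rounded f(a, b) = (a + b + 1) >> 1 per byte, and complementing the
// inputs and the output yields the truncated g(a, b) = (a + b) >> 1. For
// example, with a = 3 and b = 4 in an 8-bit lane:
static_assert((0xffu & ~(((0xffu & ~3u) + (0xffu & ~4u) + 1u) >> 1)) ==
                  ((3u + 4u) >> 1),
              "complementing converts a rounded average into a truncated one");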

/* We have to pass pointers here; MSVC does not allow passing more than 3
 * __m128i arguments on the stack, and it does not allow 16-byte aligned
 * stack variables. This inlines properly on MSVC 2010; it does -not- inline
 * with just the inline directive.
 */
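// Averages four 2x2 pixel blocks in one go: a and b hold eight pixels of the
// upper source row, c and d the matching eight pixels of the lower row. The
// shuffles deinterleave even- and odd-indexed pixels, after which the
// sum/carry scheme described above produces the four averaged output pixels.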
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x2(__m128i* a, __m128i* b, __m128i* c,
                                       __m128i* d) {
#define shuf1 _MM_SHUFFLE(2, 0, 2, 0)
#define shuf2 _MM_SHUFFLE(3, 1, 3, 1)

// This cannot be an inline function as the __Imm argument to _mm_shuffle_ps
// needs to be a compile time constant.
#define shuffle_si128(arga, argb, imm)                      \
  _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps((arga)), \
                                  _mm_castsi128_ps((argb)), (imm)))

  __m128i t = shuffle_si128(*a, *b, shuf1);
  *b = shuffle_si128(*a, *b, shuf2);
  *a = t;
  t = shuffle_si128(*c, *d, shuf1);
  *d = shuffle_si128(*c, *d, shuf2);
  *c = t;

#undef shuf1
#undef shuf2
#undef shuffle_si128

  __m128i sum = _mm_xor_si128(*a, _mm_xor_si128(*b, *c));

  __m128i carry =
      _mm_or_si128(_mm_and_si128(*a, *b),
                   _mm_or_si128(_mm_and_si128(*a, *c), _mm_and_si128(*b, *c)));

  sum = _mm_avg_epu8(_mm_not_si128(sum), _mm_not_si128(*d));

  return _mm_not_si128(_mm_avg_epu8(sum, _mm_not_si128(carry)));
}
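
// Truncated per-byte average of two whole registers, i.e. four vertically
// adjacent pixel pairs at once. _mm_avg_epu8 rounds up, so the inputs and the
// result are complemented as described above.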
MOZ_ALWAYS_INLINE __m128i avg_sse2_4x2_4x1(__m128i a, __m128i b) {
  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}
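
// Horizontal averaging of the eight pixels in a and b into four output
// pixels: the shuffles split them into odd- and even-indexed pixels, which
// are then averaged per byte with the same complement trick.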
MOZ_ALWAYS_INLINE __m128i avg_sse2_8x1_4x1(__m128i a, __m128i b) {
  __m128i t = _mm_castps_si128(_mm_shuffle_ps(
      _mm_castsi128_ps(a), _mm_castsi128_ps(b), _MM_SHUFFLE(3, 1, 3, 1)));
  b = _mm_castps_si128(_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b),
                                      _MM_SHUFFLE(2, 0, 2, 0)));
  a = t;

  return _mm_not_si128(_mm_avg_epu8(_mm_not_si128(a), _mm_not_si128(b)));
}
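
// Scalar counterpart of the SSE averaging for a single 2x2 block of packed
// 8-bit components, used below for the trailing pixels the vectorized loops
// do not cover.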
MOZ_ALWAYS_INLINE uint32_t Avg2x2(uint32_t a, uint32_t b, uint32_t c,
                                  uint32_t d) {
  uint32_t sum = a ^ b ^ c;
  uint32_t carry = (a & b) | (a & c) | (b & c);

  uint32_t mask = 0xfefefefe;

  // Without a byte-based average instruction we have to mask before shifting,
  // so the low bit of each byte cannot shift into the byte below it.
  sum = (((sum ^ d) & mask) >> 1) + (sum & d);

  return (((sum ^ carry) & mask) >> 1) + (sum & carry);
}

// Simple 2-pixel average version of the function above.
MOZ_ALWAYS_INLINE uint32_t Avg2(uint32_t a, uint32_t b) {
  uint32_t sum = a ^ b;
  uint32_t carry = (a & b);

  uint32_t mask = 0xfefefefe;

  return ((sum & mask) >> 1) + carry;
}
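
// Illustrative compile-time check (illustration only, not part of the
// upstream logic): with a = 0x00000200 and b = 0x00000300, masking keeps the
// low bit of each byte from shifting into the byte below; the unmasked shift
// of sum = 0x00000100 would smear 0x80 into the bottom byte instead.
static_assert(((((0x00000200u ^ 0x00000300u) & 0xfefefefeu) >> 1) +
               (0x00000200u & 0x00000300u)) == 0x00000200u,
              "masked shift keeps byte lanes independent: (2 + 3) >> 1 == 2");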

namespace mozilla::gfx {
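
// Halves the source image in both dimensions: each destination pixel is the
// average of a 2x2 block of source pixels. For each pair of rows a loop
// variant is chosen based on which of the two rows is 16-byte aligned, so
// the bulk of the work can use aligned loads.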
void ImageHalfScaler::HalfImage2D_SSE2(uint8_t* aSource, int32_t aSourceStride,
                                       const IntSize& aSourceSize,
                                       uint8_t* aDest, uint32_t aDestStride) {
  const int Bpp = 4;

  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i* lowerRow =
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i* lowerRow =
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = _mm_load_si128(upperRow);
        __m128i b = _mm_load_si128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i* lowerRow =
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = _mm_load_si128(lowerRow);
        __m128i d = _mm_load_si128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* upperRow = (__m128i*)(aSource + (y * aSourceStride + x * Bpp));
        __m128i* lowerRow =
            (__m128i*)(aSource + ((y + 1) * aSourceStride + x * Bpp));

        __m128i a = loadUnaligned128(upperRow);
        __m128i b = loadUnaligned128(upperRow + 1);
        __m128i c = loadUnaligned128(lowerRow);
        __m128i d = loadUnaligned128(lowerRow + 1);

        *storage++ = avg_sse2_8x2(&a, &b, &c, &d);
      }
    }

    uint32_t* unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle. We use a 2x2 'simd' implementation for this.
    //
    // Potentially we only have to do this in the last row, since overflowing
    // by 8 pixels in an earlier row appears to be harmless: it doesn't touch
    // invalid memory, even when reading and writing to the same surface. In
    // practice we only do this when doing an additional downscale pass, and
    // in that situation we have unused stride to write into harmlessly.
    // I do not believe the additional code complexity would be worth it
    // though.
    for (; x < aSourceSize.width; x += 2) {
      uint8_t* upperRow = aSource + (y * aSourceStride + x * Bpp);
      uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * Bpp);

      *unalignedStorage++ =
          Avg2x2(*(uint32_t*)upperRow, *((uint32_t*)upperRow + 1),
                 *(uint32_t*)lowerRow, *((uint32_t*)lowerRow + 1));
    }
  }
}
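
// Halves the source image vertically: each destination pixel is the average
// of two vertically adjacent source pixels.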
void ImageHalfScaler::HalfImageVertical_SSE2(uint8_t* aSource,
                                             int32_t aSourceStride,
                                             const IntSize& aSourceSize,
                                             uint8_t* aDest,
                                             uint32_t aDestStride) {
  for (int y = 0; y < aSourceSize.height; y += 2) {
    __m128i* storage = (__m128i*)(aDest + (y / 2) * aDestStride);
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16) &&
        !(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      // Only the upper row is 16-byte aligned here.
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = _mm_load_si128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else if (!(uintptr_t(aSource + ((y + 1) * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = _mm_load_si128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 3); x += 4) {
        uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
        uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

        __m128i a = loadUnaligned128((__m128i*)upperRow);
        __m128i b = loadUnaligned128((__m128i*)lowerRow);

        *storage++ = avg_sse2_4x2_4x1(a, b);
      }
    }

    uint32_t* unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations apply as in the previous function.
    for (; x < aSourceSize.width; x++) {
      uint8_t* upperRow = aSource + (y * aSourceStride + x * 4);
      uint8_t* lowerRow = aSource + ((y + 1) * aSourceStride + x * 4);

      *unalignedStorage++ = Avg2(*(uint32_t*)upperRow, *(uint32_t*)lowerRow);
    }
  }
}
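
// Halves the source image horizontally: each destination pixel is the
// average of two horizontally adjacent source pixels.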
void ImageHalfScaler::HalfImageHorizontal_SSE2(uint8_t* aSource,
                                               int32_t aSourceStride,
                                               const IntSize& aSourceSize,
                                               uint8_t* aDest,
                                               uint32_t aDestStride) {
  for (int y = 0; y < aSourceSize.height; y++) {
    __m128i* storage = (__m128i*)(aDest + (y * aDestStride));
    int x = 0;
    // Run a loop depending on alignment.
    if (!(uintptr_t(aSource + (y * aSourceStride)) % 16)) {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = _mm_load_si128(pixels);
        __m128i b = _mm_load_si128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    } else {
      for (; x < (aSourceSize.width - 7); x += 8) {
        __m128i* pixels = (__m128i*)(aSource + (y * aSourceStride + x * 4));

        __m128i a = loadUnaligned128(pixels);
        __m128i b = loadUnaligned128(pixels + 1);

        *storage++ = avg_sse2_8x1_4x1(a, b);
      }
    }

    uint32_t* unalignedStorage = (uint32_t*)storage;
    // Take care of the final pixels; we know there's an even number of pixels
    // in the source rectangle.
    //
    // Similar overflow considerations apply as in HalfImage2D_SSE2 above.
    for (; x < aSourceSize.width; x += 2) {
      uint32_t* pixels = (uint32_t*)(aSource + (y * aSourceStride + x * 4));

      *unalignedStorage++ = Avg2(*pixels, *(pixels + 1));
    }
  }
}

}  // namespace mozilla::gfx