tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ConvolutionFilterSSE2.cpp (12908B)


      1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */
      3 // Copyright (c) 2011-2016 Google Inc.
      4 // Use of this source code is governed by a BSD-style license that can be
      5 // found in the gfx/skia/LICENSE file.
      6 
      7 #include "SkConvolver.h"
      8 #include "mozilla/Attributes.h"
      9 #include <immintrin.h>
     10 
     11 namespace skia {
     12 
     13 static MOZ_ALWAYS_INLINE void AccumRemainder(
     14    const unsigned char* pixelsLeft,
     15    const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i& accum,
     16    int r) {
     17  int remainder[4] = {0};
     18  for (int i = 0; i < r; i++) {
     19    SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i];
     20    remainder[0] += coeff * pixelsLeft[i * 4 + 0];
     21    remainder[1] += coeff * pixelsLeft[i * 4 + 1];
     22    remainder[2] += coeff * pixelsLeft[i * 4 + 2];
     23    remainder[3] += coeff * pixelsLeft[i * 4 + 3];
     24  }
     25  __m128i t =
     26      _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]);
     27  accum = _mm_add_epi32(accum, t);
     28 }
     29 
     30 // Convolves horizontally along a single row. The row data is given in
     31 // |srcData| and continues for the numValues() of the filter.
     32 void convolve_horizontally_sse2(const unsigned char* srcData,
     33                                const SkConvolutionFilter1D& filter,
     34                                unsigned char* outRow, bool /*hasAlpha*/) {
     35  // Output one pixel each iteration, calculating all channels (RGBA) together.
     36  int numValues = filter.numValues();
     37  for (int outX = 0; outX < numValues; outX++) {
     38    // Get the filter that determines the current output pixel.
     39    int filterOffset, filterLength;
     40    const SkConvolutionFilter1D::ConvolutionFixed* filterValues =
     41        filter.FilterForValue(outX, &filterOffset, &filterLength);
     42 
     43    // Compute the first pixel in this row that the filter affects. It will
     44    // touch |filterLength| pixels (4 bytes each) after this.
     45    const unsigned char* rowToFilter = &srcData[filterOffset * 4];
     46 
     47    __m128i zero = _mm_setzero_si128();
     48    __m128i accum = _mm_setzero_si128();
     49 
     50    // We will load and accumulate with four coefficients per iteration.
     51    for (int filterX = 0; filterX < filterLength >> 2; filterX++) {
     52      // Load 4 coefficients => duplicate 1st and 2nd of them for all channels.
     53      __m128i coeff, coeff16;
     54      // [16] xx xx xx xx c3 c2 c1 c0
     55      coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterValues));
     56      // [16] xx xx xx xx c1 c1 c0 c0
     57      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0));
     58      // [16] c1 c1 c1 c1 c0 c0 c0 c0
     59      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
     60 
     61      // Load four pixels => unpack the first two pixels to 16 bits =>
     62      // multiply with coefficients => accumulate the convolution result.
     63      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
     64      __m128i src8 =
     65          _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowToFilter));
     66      // [16] a1 b1 g1 r1 a0 b0 g0 r0
     67      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
     68      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
     69      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
     70      // [32]  a0*c0 b0*c0 g0*c0 r0*c0
     71      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
     72      accum = _mm_add_epi32(accum, t);
     73      // [32]  a1*c1 b1*c1 g1*c1 r1*c1
     74      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
     75      accum = _mm_add_epi32(accum, t);
     76 
     77      // Duplicate 3rd and 4th coefficients for all channels =>
     78      // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients
     79      // => accumulate the convolution results.
     80      // [16] xx xx xx xx c3 c3 c2 c2
     81      coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2));
     82      // [16] c3 c3 c3 c3 c2 c2 c2 c2
     83      coeff16 = _mm_unpacklo_epi16(coeff16, coeff16);
     84      // [16] a3 g3 b3 r3 a2 g2 b2 r2
     85      src16 = _mm_unpackhi_epi8(src8, zero);
     86      mul_hi = _mm_mulhi_epi16(src16, coeff16);
     87      mul_lo = _mm_mullo_epi16(src16, coeff16);
     88      // [32]  a2*c2 b2*c2 g2*c2 r2*c2
     89      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
     90      accum = _mm_add_epi32(accum, t);
     91      // [32]  a3*c3 b3*c3 g3*c3 r3*c3
     92      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
     93      accum = _mm_add_epi32(accum, t);
     94 
     95      // Advance the pixel and coefficients pointers.
     96      rowToFilter += 16;
     97      filterValues += 4;
     98    }
     99 
    100    // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3
    101    // coefficients one at a time.
    102    int r = filterLength & 3;
    103    if (r) {
    104      int remainderOffset = (filterOffset + filterLength - r) * 4;
    105      AccumRemainder(srcData + remainderOffset, filterValues, accum, r);
    106    }
    107 
    108    // Shift right for fixed point implementation.
    109    accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits);
    110 
    111    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    112    accum = _mm_packs_epi32(accum, zero);
    113    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    114    accum = _mm_packus_epi16(accum, zero);
    115 
    116    // Store the pixel value of 32 bits.
    117    *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum);
    118    outRow += 4;
    119  }
    120 }
    121 
    122 // Does vertical convolution to produce one output row. The filter values and
    123 // length are given in the first two parameters. These are applied to each
    124 // of the rows pointed to in the |sourceDataRows| array, with each row
    125 // being |pixelWidth| wide.
    126 //
    127 // The output must have room for |pixelWidth * 4| bytes.
    128 template <bool hasAlpha>
    129 static void ConvolveVertically(
    130    const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
    131    int filterLength, unsigned char* const* sourceDataRows, int pixelWidth,
    132    unsigned char* outRow) {
    133  // Output four pixels per iteration (16 bytes).
    134  int width = pixelWidth & ~3;
    135  __m128i zero = _mm_setzero_si128();
    136  for (int outX = 0; outX < width; outX += 4) {
    137    // Accumulated result for each pixel. 32 bits per RGBA channel.
    138    __m128i accum0 = _mm_setzero_si128();
    139    __m128i accum1 = _mm_setzero_si128();
    140    __m128i accum2 = _mm_setzero_si128();
    141    __m128i accum3 = _mm_setzero_si128();
    142 
    143    // Convolve with one filter coefficient per iteration.
    144    for (int filterY = 0; filterY < filterLength; filterY++) {
    145      // Duplicate the filter coefficient 8 times.
    146      // [16] cj cj cj cj cj cj cj cj
    147      __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
    148 
    149      // Load four pixels (16 bytes) together.
    150      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    151      const __m128i* src =
    152          reinterpret_cast<const __m128i*>(&sourceDataRows[filterY][outX << 2]);
    153      __m128i src8 = _mm_loadu_si128(src);
    154 
    155      // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels =>
    156      // multiply with current coefficient => accumulate the result.
    157      // [16] a1 b1 g1 r1 a0 b0 g0 r0
    158      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    159      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    160      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    161      // [32] a0 b0 g0 r0
    162      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    163      accum0 = _mm_add_epi32(accum0, t);
    164      // [32] a1 b1 g1 r1
    165      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    166      accum1 = _mm_add_epi32(accum1, t);
    167 
    168      // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels =>
    169      // multiply with current coefficient => accumulate the result.
    170      // [16] a3 b3 g3 r3 a2 b2 g2 r2
    171      src16 = _mm_unpackhi_epi8(src8, zero);
    172      mul_hi = _mm_mulhi_epi16(src16, coeff16);
    173      mul_lo = _mm_mullo_epi16(src16, coeff16);
    174      // [32] a2 b2 g2 r2
    175      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    176      accum2 = _mm_add_epi32(accum2, t);
    177      // [32] a3 b3 g3 r3
    178      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    179      accum3 = _mm_add_epi32(accum3, t);
    180    }
    181 
    182    // Shift right for fixed point implementation.
    183    accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    184    accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    185    accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    186    accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits);
    187 
    188    // Packing 32 bits |accum| to 16 bits per channel (signed saturation).
    189    // [16] a1 b1 g1 r1 a0 b0 g0 r0
    190    accum0 = _mm_packs_epi32(accum0, accum1);
    191    // [16] a3 b3 g3 r3 a2 b2 g2 r2
    192    accum2 = _mm_packs_epi32(accum2, accum3);
    193 
    194    // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation).
    195    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    196    accum0 = _mm_packus_epi16(accum0, accum2);
    197 
    198    if (hasAlpha) {
    199      // Compute the max(ri, gi, bi) for each pixel.
    200      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    201      __m128i a = _mm_srli_epi32(accum0, 8);
    202      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    203      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
    204      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    205      a = _mm_srli_epi32(accum0, 16);
    206      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    207      b = _mm_max_epu8(a, b);  // Max of r and g and b.
    208      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    209      b = _mm_slli_epi32(b, 24);
    210 
    211      // Make sure the value of alpha channel is always larger than maximum
    212      // value of color channels.
    213      accum0 = _mm_max_epu8(b, accum0);
    214    } else {
    215      // Set value of alpha channels to 0xFF.
    216      __m128i mask = _mm_set1_epi32(0xff000000);
    217      accum0 = _mm_or_si128(accum0, mask);
    218    }
    219 
    220    // Store the convolution result (16 bytes) and advance the pixel pointers.
    221    _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0);
    222    outRow += 16;
    223  }
    224 
    225  // When the width of the output is not divisible by 4, We need to save one
    226  // pixel (4 bytes) each time. And also the fourth pixel is always absent.
    227  int r = pixelWidth & 3;
    228  if (r) {
    229    __m128i accum0 = _mm_setzero_si128();
    230    __m128i accum1 = _mm_setzero_si128();
    231    __m128i accum2 = _mm_setzero_si128();
    232    for (int filterY = 0; filterY < filterLength; ++filterY) {
    233      __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]);
    234      // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    235      const __m128i* src = reinterpret_cast<const __m128i*>(
    236          &sourceDataRows[filterY][width << 2]);
    237      __m128i src8 = _mm_loadu_si128(src);
    238      // [16] a1 b1 g1 r1 a0 b0 g0 r0
    239      __m128i src16 = _mm_unpacklo_epi8(src8, zero);
    240      __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16);
    241      __m128i mul_lo = _mm_mullo_epi16(src16, coeff16);
    242      // [32] a0 b0 g0 r0
    243      __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    244      accum0 = _mm_add_epi32(accum0, t);
    245      // [32] a1 b1 g1 r1
    246      t = _mm_unpackhi_epi16(mul_lo, mul_hi);
    247      accum1 = _mm_add_epi32(accum1, t);
    248      // [16] a3 b3 g3 r3 a2 b2 g2 r2
    249      src16 = _mm_unpackhi_epi8(src8, zero);
    250      mul_hi = _mm_mulhi_epi16(src16, coeff16);
    251      mul_lo = _mm_mullo_epi16(src16, coeff16);
    252      // [32] a2 b2 g2 r2
    253      t = _mm_unpacklo_epi16(mul_lo, mul_hi);
    254      accum2 = _mm_add_epi32(accum2, t);
    255    }
    256 
    257    accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits);
    258    accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits);
    259    accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits);
    260    // [16] a1 b1 g1 r1 a0 b0 g0 r0
    261    accum0 = _mm_packs_epi32(accum0, accum1);
    262    // [16] a3 b3 g3 r3 a2 b2 g2 r2
    263    accum2 = _mm_packs_epi32(accum2, zero);
    264    // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0
    265    accum0 = _mm_packus_epi16(accum0, accum2);
    266    if (hasAlpha) {
    267      // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0
    268      __m128i a = _mm_srli_epi32(accum0, 8);
    269      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    270      __m128i b = _mm_max_epu8(a, accum0);  // Max of r and g.
    271      // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0
    272      a = _mm_srli_epi32(accum0, 16);
    273      // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0
    274      b = _mm_max_epu8(a, b);  // Max of r and g and b.
    275      // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00
    276      b = _mm_slli_epi32(b, 24);
    277      accum0 = _mm_max_epu8(b, accum0);
    278    } else {
    279      __m128i mask = _mm_set1_epi32(0xff000000);
    280      accum0 = _mm_or_si128(accum0, mask);
    281    }
    282 
    283    for (int i = 0; i < r; i++) {
    284      *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0);
    285      accum0 = _mm_srli_si128(accum0, 4);
    286      outRow += 4;
    287    }
    288  }
    289 }
    290 
    291 void convolve_vertically_sse2(
    292    const SkConvolutionFilter1D::ConvolutionFixed* filterValues,
    293    int filterLength, unsigned char* const* sourceDataRows, int pixelWidth,
    294    unsigned char* outRow, bool hasAlpha) {
    295  if (hasAlpha) {
    296    ConvolveVertically<true>(filterValues, filterLength, sourceDataRows,
    297                             pixelWidth, outRow);
    298  } else {
    299    ConvolveVertically<false>(filterValues, filterLength, sourceDataRows,
    300                              pixelWidth, outRow);
    301  }
    302 }
    303 
    304 }  // namespace skia