ConvolutionFilterSSE2.cpp (12908B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 // Copyright (c) 2011-2016 Google Inc. 4 // Use of this source code is governed by a BSD-style license that can be 5 // found in the gfx/skia/LICENSE file. 6 7 #include "SkConvolver.h" 8 #include "mozilla/Attributes.h" 9 #include <immintrin.h> 10 11 namespace skia { 12 13 static MOZ_ALWAYS_INLINE void AccumRemainder( 14 const unsigned char* pixelsLeft, 15 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, __m128i& accum, 16 int r) { 17 int remainder[4] = {0}; 18 for (int i = 0; i < r; i++) { 19 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; 20 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; 21 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; 22 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; 23 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; 24 } 25 __m128i t = 26 _mm_setr_epi32(remainder[0], remainder[1], remainder[2], remainder[3]); 27 accum = _mm_add_epi32(accum, t); 28 } 29 30 // Convolves horizontally along a single row. The row data is given in 31 // |srcData| and continues for the numValues() of the filter. 32 void convolve_horizontally_sse2(const unsigned char* srcData, 33 const SkConvolutionFilter1D& filter, 34 unsigned char* outRow, bool /*hasAlpha*/) { 35 // Output one pixel each iteration, calculating all channels (RGBA) together. 36 int numValues = filter.numValues(); 37 for (int outX = 0; outX < numValues; outX++) { 38 // Get the filter that determines the current output pixel. 39 int filterOffset, filterLength; 40 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = 41 filter.FilterForValue(outX, &filterOffset, &filterLength); 42 43 // Compute the first pixel in this row that the filter affects. It will 44 // touch |filterLength| pixels (4 bytes each) after this. 45 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; 46 47 __m128i zero = _mm_setzero_si128(); 48 __m128i accum = _mm_setzero_si128(); 49 50 // We will load and accumulate with four coefficients per iteration. 51 for (int filterX = 0; filterX < filterLength >> 2; filterX++) { 52 // Load 4 coefficients => duplicate 1st and 2nd of them for all channels. 53 __m128i coeff, coeff16; 54 // [16] xx xx xx xx c3 c2 c1 c0 55 coeff = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(filterValues)); 56 // [16] xx xx xx xx c1 c1 c0 c0 57 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(1, 1, 0, 0)); 58 // [16] c1 c1 c1 c1 c0 c0 c0 c0 59 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); 60 61 // Load four pixels => unpack the first two pixels to 16 bits => 62 // multiply with coefficients => accumulate the convolution result. 63 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 64 __m128i src8 = 65 _mm_loadu_si128(reinterpret_cast<const __m128i*>(rowToFilter)); 66 // [16] a1 b1 g1 r1 a0 b0 g0 r0 67 __m128i src16 = _mm_unpacklo_epi8(src8, zero); 68 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); 69 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); 70 // [32] a0*c0 b0*c0 g0*c0 r0*c0 71 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); 72 accum = _mm_add_epi32(accum, t); 73 // [32] a1*c1 b1*c1 g1*c1 r1*c1 74 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 75 accum = _mm_add_epi32(accum, t); 76 77 // Duplicate 3rd and 4th coefficients for all channels => 78 // unpack the 3rd and 4th pixels to 16 bits => multiply with coefficients 79 // => accumulate the convolution results. 80 // [16] xx xx xx xx c3 c3 c2 c2 81 coeff16 = _mm_shufflelo_epi16(coeff, _MM_SHUFFLE(3, 3, 2, 2)); 82 // [16] c3 c3 c3 c3 c2 c2 c2 c2 83 coeff16 = _mm_unpacklo_epi16(coeff16, coeff16); 84 // [16] a3 g3 b3 r3 a2 g2 b2 r2 85 src16 = _mm_unpackhi_epi8(src8, zero); 86 mul_hi = _mm_mulhi_epi16(src16, coeff16); 87 mul_lo = _mm_mullo_epi16(src16, coeff16); 88 // [32] a2*c2 b2*c2 g2*c2 r2*c2 89 t = _mm_unpacklo_epi16(mul_lo, mul_hi); 90 accum = _mm_add_epi32(accum, t); 91 // [32] a3*c3 b3*c3 g3*c3 r3*c3 92 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 93 accum = _mm_add_epi32(accum, t); 94 95 // Advance the pixel and coefficients pointers. 96 rowToFilter += 16; 97 filterValues += 4; 98 } 99 100 // When |filterLength| is not divisible by 4, we accumulate the last 1 - 3 101 // coefficients one at a time. 102 int r = filterLength & 3; 103 if (r) { 104 int remainderOffset = (filterOffset + filterLength - r) * 4; 105 AccumRemainder(srcData + remainderOffset, filterValues, accum, r); 106 } 107 108 // Shift right for fixed point implementation. 109 accum = _mm_srai_epi32(accum, SkConvolutionFilter1D::kShiftBits); 110 111 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). 112 accum = _mm_packs_epi32(accum, zero); 113 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). 114 accum = _mm_packus_epi16(accum, zero); 115 116 // Store the pixel value of 32 bits. 117 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum); 118 outRow += 4; 119 } 120 } 121 122 // Does vertical convolution to produce one output row. The filter values and 123 // length are given in the first two parameters. These are applied to each 124 // of the rows pointed to in the |sourceDataRows| array, with each row 125 // being |pixelWidth| wide. 126 // 127 // The output must have room for |pixelWidth * 4| bytes. 128 template <bool hasAlpha> 129 static void ConvolveVertically( 130 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, 131 int filterLength, unsigned char* const* sourceDataRows, int pixelWidth, 132 unsigned char* outRow) { 133 // Output four pixels per iteration (16 bytes). 134 int width = pixelWidth & ~3; 135 __m128i zero = _mm_setzero_si128(); 136 for (int outX = 0; outX < width; outX += 4) { 137 // Accumulated result for each pixel. 32 bits per RGBA channel. 138 __m128i accum0 = _mm_setzero_si128(); 139 __m128i accum1 = _mm_setzero_si128(); 140 __m128i accum2 = _mm_setzero_si128(); 141 __m128i accum3 = _mm_setzero_si128(); 142 143 // Convolve with one filter coefficient per iteration. 144 for (int filterY = 0; filterY < filterLength; filterY++) { 145 // Duplicate the filter coefficient 8 times. 146 // [16] cj cj cj cj cj cj cj cj 147 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); 148 149 // Load four pixels (16 bytes) together. 150 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 151 const __m128i* src = 152 reinterpret_cast<const __m128i*>(&sourceDataRows[filterY][outX << 2]); 153 __m128i src8 = _mm_loadu_si128(src); 154 155 // Unpack 1st and 2nd pixels from 8 bits to 16 bits for each channels => 156 // multiply with current coefficient => accumulate the result. 157 // [16] a1 b1 g1 r1 a0 b0 g0 r0 158 __m128i src16 = _mm_unpacklo_epi8(src8, zero); 159 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); 160 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); 161 // [32] a0 b0 g0 r0 162 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); 163 accum0 = _mm_add_epi32(accum0, t); 164 // [32] a1 b1 g1 r1 165 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 166 accum1 = _mm_add_epi32(accum1, t); 167 168 // Unpack 3rd and 4th pixels from 8 bits to 16 bits for each channels => 169 // multiply with current coefficient => accumulate the result. 170 // [16] a3 b3 g3 r3 a2 b2 g2 r2 171 src16 = _mm_unpackhi_epi8(src8, zero); 172 mul_hi = _mm_mulhi_epi16(src16, coeff16); 173 mul_lo = _mm_mullo_epi16(src16, coeff16); 174 // [32] a2 b2 g2 r2 175 t = _mm_unpacklo_epi16(mul_lo, mul_hi); 176 accum2 = _mm_add_epi32(accum2, t); 177 // [32] a3 b3 g3 r3 178 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 179 accum3 = _mm_add_epi32(accum3, t); 180 } 181 182 // Shift right for fixed point implementation. 183 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); 184 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); 185 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); 186 accum3 = _mm_srai_epi32(accum3, SkConvolutionFilter1D::kShiftBits); 187 188 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). 189 // [16] a1 b1 g1 r1 a0 b0 g0 r0 190 accum0 = _mm_packs_epi32(accum0, accum1); 191 // [16] a3 b3 g3 r3 a2 b2 g2 r2 192 accum2 = _mm_packs_epi32(accum2, accum3); 193 194 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). 195 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 196 accum0 = _mm_packus_epi16(accum0, accum2); 197 198 if (hasAlpha) { 199 // Compute the max(ri, gi, bi) for each pixel. 200 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 201 __m128i a = _mm_srli_epi32(accum0, 8); 202 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 203 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. 204 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 205 a = _mm_srli_epi32(accum0, 16); 206 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 207 b = _mm_max_epu8(a, b); // Max of r and g and b. 208 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 209 b = _mm_slli_epi32(b, 24); 210 211 // Make sure the value of alpha channel is always larger than maximum 212 // value of color channels. 213 accum0 = _mm_max_epu8(b, accum0); 214 } else { 215 // Set value of alpha channels to 0xFF. 216 __m128i mask = _mm_set1_epi32(0xff000000); 217 accum0 = _mm_or_si128(accum0, mask); 218 } 219 220 // Store the convolution result (16 bytes) and advance the pixel pointers. 221 _mm_storeu_si128(reinterpret_cast<__m128i*>(outRow), accum0); 222 outRow += 16; 223 } 224 225 // When the width of the output is not divisible by 4, We need to save one 226 // pixel (4 bytes) each time. And also the fourth pixel is always absent. 227 int r = pixelWidth & 3; 228 if (r) { 229 __m128i accum0 = _mm_setzero_si128(); 230 __m128i accum1 = _mm_setzero_si128(); 231 __m128i accum2 = _mm_setzero_si128(); 232 for (int filterY = 0; filterY < filterLength; ++filterY) { 233 __m128i coeff16 = _mm_set1_epi16(filterValues[filterY]); 234 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 235 const __m128i* src = reinterpret_cast<const __m128i*>( 236 &sourceDataRows[filterY][width << 2]); 237 __m128i src8 = _mm_loadu_si128(src); 238 // [16] a1 b1 g1 r1 a0 b0 g0 r0 239 __m128i src16 = _mm_unpacklo_epi8(src8, zero); 240 __m128i mul_hi = _mm_mulhi_epi16(src16, coeff16); 241 __m128i mul_lo = _mm_mullo_epi16(src16, coeff16); 242 // [32] a0 b0 g0 r0 243 __m128i t = _mm_unpacklo_epi16(mul_lo, mul_hi); 244 accum0 = _mm_add_epi32(accum0, t); 245 // [32] a1 b1 g1 r1 246 t = _mm_unpackhi_epi16(mul_lo, mul_hi); 247 accum1 = _mm_add_epi32(accum1, t); 248 // [16] a3 b3 g3 r3 a2 b2 g2 r2 249 src16 = _mm_unpackhi_epi8(src8, zero); 250 mul_hi = _mm_mulhi_epi16(src16, coeff16); 251 mul_lo = _mm_mullo_epi16(src16, coeff16); 252 // [32] a2 b2 g2 r2 253 t = _mm_unpacklo_epi16(mul_lo, mul_hi); 254 accum2 = _mm_add_epi32(accum2, t); 255 } 256 257 accum0 = _mm_srai_epi32(accum0, SkConvolutionFilter1D::kShiftBits); 258 accum1 = _mm_srai_epi32(accum1, SkConvolutionFilter1D::kShiftBits); 259 accum2 = _mm_srai_epi32(accum2, SkConvolutionFilter1D::kShiftBits); 260 // [16] a1 b1 g1 r1 a0 b0 g0 r0 261 accum0 = _mm_packs_epi32(accum0, accum1); 262 // [16] a3 b3 g3 r3 a2 b2 g2 r2 263 accum2 = _mm_packs_epi32(accum2, zero); 264 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 265 accum0 = _mm_packus_epi16(accum0, accum2); 266 if (hasAlpha) { 267 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 268 __m128i a = _mm_srli_epi32(accum0, 8); 269 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 270 __m128i b = _mm_max_epu8(a, accum0); // Max of r and g. 271 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 272 a = _mm_srli_epi32(accum0, 16); 273 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 274 b = _mm_max_epu8(a, b); // Max of r and g and b. 275 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 276 b = _mm_slli_epi32(b, 24); 277 accum0 = _mm_max_epu8(b, accum0); 278 } else { 279 __m128i mask = _mm_set1_epi32(0xff000000); 280 accum0 = _mm_or_si128(accum0, mask); 281 } 282 283 for (int i = 0; i < r; i++) { 284 *(reinterpret_cast<int*>(outRow)) = _mm_cvtsi128_si32(accum0); 285 accum0 = _mm_srli_si128(accum0, 4); 286 outRow += 4; 287 } 288 } 289 } 290 291 void convolve_vertically_sse2( 292 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, 293 int filterLength, unsigned char* const* sourceDataRows, int pixelWidth, 294 unsigned char* outRow, bool hasAlpha) { 295 if (hasAlpha) { 296 ConvolveVertically<true>(filterValues, filterLength, sourceDataRows, 297 pixelWidth, outRow); 298 } else { 299 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows, 300 pixelWidth, outRow); 301 } 302 } 303 304 } // namespace skia