ConvolutionFilterNEON.cpp (11953B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 // Copyright (c) 2011-2016 Google Inc. 4 // Use of this source code is governed by a BSD-style license that can be 5 // found in the gfx/skia/LICENSE file. 6 7 #include "SkConvolver.h" 8 #include "mozilla/Attributes.h" 9 #include <arm_neon.h> 10 11 namespace skia { 12 13 static MOZ_ALWAYS_INLINE void AccumRemainder( 14 const unsigned char* pixelsLeft, 15 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, 16 int32x4_t& accum, int r) { 17 int remainder[4] = {0}; 18 for (int i = 0; i < r; i++) { 19 SkConvolutionFilter1D::ConvolutionFixed coeff = filterValues[i]; 20 remainder[0] += coeff * pixelsLeft[i * 4 + 0]; 21 remainder[1] += coeff * pixelsLeft[i * 4 + 1]; 22 remainder[2] += coeff * pixelsLeft[i * 4 + 2]; 23 remainder[3] += coeff * pixelsLeft[i * 4 + 3]; 24 } 25 int32x4_t t = {remainder[0], remainder[1], remainder[2], remainder[3]}; 26 accum += t; 27 } 28 29 // Convolves horizontally along a single row. The row data is given in 30 // |srcData| and continues for the numValues() of the filter. 31 void convolve_horizontally_neon(const unsigned char* srcData, 32 const SkConvolutionFilter1D& filter, 33 unsigned char* outRow, bool /*hasAlpha*/) { 34 // Loop over each pixel on this row in the output image. 35 int numValues = filter.numValues(); 36 for (int outX = 0; outX < numValues; outX++) { 37 uint8x8_t coeff_mask0 = vcreate_u8(0x0100010001000100); 38 uint8x8_t coeff_mask1 = vcreate_u8(0x0302030203020302); 39 uint8x8_t coeff_mask2 = vcreate_u8(0x0504050405040504); 40 uint8x8_t coeff_mask3 = vcreate_u8(0x0706070607060706); 41 // Get the filter that determines the current output pixel. 42 int filterOffset, filterLength; 43 const SkConvolutionFilter1D::ConvolutionFixed* filterValues = 44 filter.FilterForValue(outX, &filterOffset, &filterLength); 45 46 // Compute the first pixel in this row that the filter affects. It will 47 // touch |filterLength| pixels (4 bytes each) after this. 48 const unsigned char* rowToFilter = &srcData[filterOffset * 4]; 49 50 // Apply the filter to the row to get the destination pixel in |accum|. 51 int32x4_t accum = vdupq_n_s32(0); 52 for (int filterX = 0; filterX < filterLength >> 2; filterX++) { 53 // Load 4 coefficients 54 int16x4_t coeffs, coeff0, coeff1, coeff2, coeff3; 55 coeffs = vld1_s16(filterValues); 56 coeff0 = vreinterpret_s16_u8( 57 vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask0)); 58 coeff1 = vreinterpret_s16_u8( 59 vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask1)); 60 coeff2 = vreinterpret_s16_u8( 61 vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask2)); 62 coeff3 = vreinterpret_s16_u8( 63 vtbl1_u8(vreinterpret_u8_s16(coeffs), coeff_mask3)); 64 65 // Load pixels and calc 66 uint8x16_t pixels = vld1q_u8(rowToFilter); 67 int16x8_t p01_16 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pixels))); 68 int16x8_t p23_16 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pixels))); 69 70 int16x4_t p0_src = vget_low_s16(p01_16); 71 int16x4_t p1_src = vget_high_s16(p01_16); 72 int16x4_t p2_src = vget_low_s16(p23_16); 73 int16x4_t p3_src = vget_high_s16(p23_16); 74 75 int32x4_t p0 = vmull_s16(p0_src, coeff0); 76 int32x4_t p1 = vmull_s16(p1_src, coeff1); 77 int32x4_t p2 = vmull_s16(p2_src, coeff2); 78 int32x4_t p3 = vmull_s16(p3_src, coeff3); 79 80 accum += p0; 81 accum += p1; 82 accum += p2; 83 accum += p3; 84 85 // Advance the pointers 86 rowToFilter += 16; 87 filterValues += 4; 88 } 89 90 int r = filterLength & 3; 91 if (r) { 92 int remainder_offset = (filterOffset + filterLength - r) * 4; 93 AccumRemainder(srcData + remainder_offset, filterValues, accum, r); 94 } 95 96 // Bring this value back in range. All of the filter scaling factors 97 // are in fixed point with kShiftBits bits of fractional part. 98 accum = vshrq_n_s32(accum, SkConvolutionFilter1D::kShiftBits); 99 100 // Pack and store the new pixel. 101 int16x4_t accum16 = vqmovn_s32(accum); 102 uint8x8_t accum8 = vqmovun_s16(vcombine_s16(accum16, accum16)); 103 vst1_lane_u32(reinterpret_cast<uint32_t*>(outRow), 104 vreinterpret_u32_u8(accum8), 0); 105 outRow += 4; 106 } 107 } 108 109 // Does vertical convolution to produce one output row. The filter values and 110 // length are given in the first two parameters. These are applied to each 111 // of the rows pointed to in the |sourceDataRows| array, with each row 112 // being |pixelWidth| wide. 113 // 114 // The output must have room for |pixelWidth * 4| bytes. 115 template <bool hasAlpha> 116 static void ConvolveVertically( 117 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, 118 int filterLength, unsigned char* const* sourceDataRows, int pixelWidth, 119 unsigned char* outRow) { 120 int width = pixelWidth & ~3; 121 122 // Output four pixels per iteration (16 bytes). 123 for (int outX = 0; outX < width; outX += 4) { 124 // Accumulated result for each pixel. 32 bits per RGBA channel. 125 int32x4_t accum0 = vdupq_n_s32(0); 126 int32x4_t accum1 = vdupq_n_s32(0); 127 int32x4_t accum2 = vdupq_n_s32(0); 128 int32x4_t accum3 = vdupq_n_s32(0); 129 130 // Convolve with one filter coefficient per iteration. 131 for (int filterY = 0; filterY < filterLength; filterY++) { 132 // Duplicate the filter coefficient 4 times. 133 // [16] cj cj cj cj 134 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); 135 136 // Load four pixels (16 bytes) together. 137 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 138 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][outX << 2]); 139 140 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8))); 141 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8))); 142 int16x4_t src16_0 = vget_low_s16(src16_01); 143 int16x4_t src16_1 = vget_high_s16(src16_01); 144 int16x4_t src16_2 = vget_low_s16(src16_23); 145 int16x4_t src16_3 = vget_high_s16(src16_23); 146 147 accum0 += vmull_s16(src16_0, coeff16); 148 accum1 += vmull_s16(src16_1, coeff16); 149 accum2 += vmull_s16(src16_2, coeff16); 150 accum3 += vmull_s16(src16_3, coeff16); 151 } 152 153 // Shift right for fixed point implementation. 154 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); 155 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); 156 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); 157 accum3 = vshrq_n_s32(accum3, SkConvolutionFilter1D::kShiftBits); 158 159 // Packing 32 bits |accum| to 16 bits per channel (signed saturation). 160 // [16] a1 b1 g1 r1 a0 b0 g0 r0 161 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1)); 162 // [16] a3 b3 g3 r3 a2 b2 g2 r2 163 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum3)); 164 165 // Packing 16 bits |accum| to 8 bits per channel (unsigned saturation). 166 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 167 uint8x16_t accum8 = 168 vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1)); 169 170 if (hasAlpha) { 171 // Compute the max(ri, gi, bi) for each pixel. 172 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 173 uint8x16_t a = 174 vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8)); 175 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 176 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g 177 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 178 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16)); 179 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 180 b = vmaxq_u8(a, b); // Max of r and g and b. 181 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 182 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24)); 183 184 // Make sure the value of alpha channel is always larger than maximum 185 // value of color channels. 186 accum8 = vmaxq_u8(b, accum8); 187 } else { 188 // Set value of alpha channels to 0xFF. 189 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | 190 vdupq_n_u32(0xFF000000)); 191 } 192 193 // Store the convolution result (16 bytes) and advance the pixel pointers. 194 vst1q_u8(outRow, accum8); 195 outRow += 16; 196 } 197 198 // Process the leftovers when the width of the output is not divisible 199 // by 4, that is at most 3 pixels. 200 int r = pixelWidth & 3; 201 if (r) { 202 int32x4_t accum0 = vdupq_n_s32(0); 203 int32x4_t accum1 = vdupq_n_s32(0); 204 int32x4_t accum2 = vdupq_n_s32(0); 205 206 for (int filterY = 0; filterY < filterLength; ++filterY) { 207 int16x4_t coeff16 = vdup_n_s16(filterValues[filterY]); 208 209 // [8] a3 b3 g3 r3 a2 b2 g2 r2 a1 b1 g1 r1 a0 b0 g0 r0 210 uint8x16_t src8 = vld1q_u8(&sourceDataRows[filterY][width << 2]); 211 212 int16x8_t src16_01 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(src8))); 213 int16x8_t src16_23 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(src8))); 214 int16x4_t src16_0 = vget_low_s16(src16_01); 215 int16x4_t src16_1 = vget_high_s16(src16_01); 216 int16x4_t src16_2 = vget_low_s16(src16_23); 217 218 accum0 += vmull_s16(src16_0, coeff16); 219 accum1 += vmull_s16(src16_1, coeff16); 220 accum2 += vmull_s16(src16_2, coeff16); 221 } 222 223 accum0 = vshrq_n_s32(accum0, SkConvolutionFilter1D::kShiftBits); 224 accum1 = vshrq_n_s32(accum1, SkConvolutionFilter1D::kShiftBits); 225 accum2 = vshrq_n_s32(accum2, SkConvolutionFilter1D::kShiftBits); 226 227 int16x8_t accum16_0 = vcombine_s16(vqmovn_s32(accum0), vqmovn_s32(accum1)); 228 int16x8_t accum16_1 = vcombine_s16(vqmovn_s32(accum2), vqmovn_s32(accum2)); 229 230 uint8x16_t accum8 = 231 vcombine_u8(vqmovun_s16(accum16_0), vqmovun_s16(accum16_1)); 232 233 if (hasAlpha) { 234 // Compute the max(ri, gi, bi) for each pixel. 235 // [8] xx a3 b3 g3 xx a2 b2 g2 xx a1 b1 g1 xx a0 b0 g0 236 uint8x16_t a = 237 vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 8)); 238 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 239 uint8x16_t b = vmaxq_u8(a, accum8); // Max of r and g 240 // [8] xx xx a3 b3 xx xx a2 b2 xx xx a1 b1 xx xx a0 b0 241 a = vreinterpretq_u8_u32(vshrq_n_u32(vreinterpretq_u32_u8(accum8), 16)); 242 // [8] xx xx xx max3 xx xx xx max2 xx xx xx max1 xx xx xx max0 243 b = vmaxq_u8(a, b); // Max of r and g and b. 244 // [8] max3 00 00 00 max2 00 00 00 max1 00 00 00 max0 00 00 00 245 b = vreinterpretq_u8_u32(vshlq_n_u32(vreinterpretq_u32_u8(b), 24)); 246 // Make sure the value of alpha channel is always larger than maximum 247 // value of color channels. 248 accum8 = vmaxq_u8(b, accum8); 249 } else { 250 // Set value of alpha channels to 0xFF. 251 accum8 = vreinterpretq_u8_u32(vreinterpretq_u32_u8(accum8) | 252 vdupq_n_u32(0xFF000000)); 253 } 254 255 switch (r) { 256 case 1: 257 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow), 258 vreinterpretq_u32_u8(accum8), 0); 259 break; 260 case 2: 261 vst1_u32(reinterpret_cast<uint32_t*>(outRow), 262 vreinterpret_u32_u8(vget_low_u8(accum8))); 263 break; 264 case 3: 265 vst1_u32(reinterpret_cast<uint32_t*>(outRow), 266 vreinterpret_u32_u8(vget_low_u8(accum8))); 267 vst1q_lane_u32(reinterpret_cast<uint32_t*>(outRow + 8), 268 vreinterpretq_u32_u8(accum8), 2); 269 break; 270 } 271 } 272 } 273 274 void convolve_vertically_neon( 275 const SkConvolutionFilter1D::ConvolutionFixed* filterValues, 276 int filterLength, unsigned char* const* sourceDataRows, int pixelWidth, 277 unsigned char* outRow, bool hasAlpha) { 278 if (hasAlpha) { 279 ConvolveVertically<true>(filterValues, filterLength, sourceDataRows, 280 pixelWidth, outRow); 281 } else { 282 ConvolveVertically<false>(filterValues, filterLength, sourceDataRows, 283 pixelWidth, outRow); 284 } 285 } 286 287 } // namespace skia