SwizzleNEON.cpp (18433B)
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* vim: set ts=8 sts=2 et sw=2 tw=80: */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "Swizzle.h"

#include <arm_neon.h>

namespace mozilla {
namespace gfx {

// Load 1-3 pixels into a 4 pixel vector. Unused lanes are zero-filled, and
// lane loads are used so that no bytes past aSrc[4 * aLength - 1] are read.
static MOZ_ALWAYS_INLINE uint16x8_t LoadRemainder_NEON(const uint8_t* aSrc,
                                                       size_t aLength) {
  const uint32_t* src32 = reinterpret_cast<const uint32_t*>(aSrc);
  uint32x4_t dst32;
  if (aLength >= 2) {
    // Load first 2 pixels as a 64-bit pair into the low half.
    dst32 = vcombine_u32(vld1_u32(src32), vdup_n_u32(0));
    // Load third pixel into lane 2, if present.
    if (aLength >= 3) {
      dst32 = vld1q_lane_u32(src32 + 2, dst32, 2);
    }
  } else {
    // Load single pixel into lane 0 of a zeroed vector.
    dst32 = vld1q_lane_u32(src32, vdupq_n_u32(0), 0);
  }
  return vreinterpretq_u16_u32(dst32);
}

// Store 1-3 pixels from a vector into memory without overwriting, i.e. no
// bytes past aDst[4 * aLength - 1] are written.
static MOZ_ALWAYS_INLINE void StoreRemainder_NEON(uint8_t* aDst, size_t aLength,
                                                  const uint16x8_t& aSrc) {
  uint32_t* dst32 = reinterpret_cast<uint32_t*>(aDst);
  uint32x4_t src32 = vreinterpretq_u32_u16(aSrc);
  if (aLength >= 2) {
    // Store first 2 pixels from the low half.
    vst1_u32(dst32, vget_low_u32(src32));
    // Store third pixel from lane 2, if present.
    if (aLength >= 3) {
      vst1q_lane_u32(dst32 + 2, src32, 2);
    }
  } else {
    // Store single pixel from lane 0.
    vst1q_lane_u32(dst32, src32, 0);
  }
}

// Premultiply vector of 4 pixels using splayed math, i.e. with the even and
// odd 8-bit components of each pixel separated into 16-bit lanes so that the
// multiplies have headroom. aSwapRB additionally swaps the R and B channels;
// aOpaqueAlpha forces the output alpha to 255.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE uint16x8_t
PremultiplyVector_NEON(const uint16x8_t& aSrc) {
  // Isolate R and B with mask.
  const uint16x8_t mask = vdupq_n_u16(0x00FF);
  uint16x8_t rb = vandq_u16(aSrc, mask);
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = vrev32q_u16(rb);
  }
  // Isolate G and A by shifting down to bottom of word.
  uint16x8_t ga = vshrq_n_u16(aSrc, 8);

  // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4
  uint16x8_t alphas = vtrnq_u16(ga, ga).val[1];

  // rb = rb*a + 255; rb += rb >> 8;
  // This is the usual divide-by-255 approximation; the final >> 8 is folded
  // into the recombining shift-insert below.
  rb = vmlaq_u16(mask, rb, alphas);
  rb = vsraq_n_u16(rb, rb, 8);

  // If format is not opaque, force A to 255 so that A*alpha/255 = alpha
  if (!aOpaqueAlpha) {
    ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0x00FF0000)));
  }
  // ga = ga*a + 255; ga += ga >> 8;
  ga = vmlaq_u16(mask, ga, alphas);
  ga = vsraq_n_u16(ga, ga, 8);
  // If format is opaque, force output A to be 255.
  if (aOpaqueAlpha) {
    ga = vorrq_u16(ga, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
  }

  // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00)
  return vsriq_n_u16(ga, rb, 8);
}

// Premultiply one row chunk: aAlignedRow bytes of whole 4-pixel vectors,
// then aRemainder (0-3) trailing pixels. aSrc/aDst are advanced past the
// aligned portion only; the remainder pixels do not advance the pointers.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void PremultiplyChunk_NEON(const uint8_t*& aSrc,
                                                    uint8_t*& aDst,
                                                    int32_t aAlignedRow,
                                                    int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
    px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
    px = PremultiplyVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_NEON(aDst, aRemainder, px);
  }
}

// Premultiply a single row of aLength 32-bit pixels from aSrc into aDst.
template <bool aSwapRB, bool aOpaqueAlpha>
void PremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                               remainder);
}

// Premultiply an aSize surface. aSrcGap/aDstGap are the byte gaps from the
// end of one row's pixel data to the start of the next row.
template <bool aSwapRB, bool aOpaqueAlpha>
void Premultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                      int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap, since PremultiplyChunk_NEON only advances
  // the pointers past the aligned 4-pixel chunks.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    PremultiplyChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow,
                                                 remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of premultiply variants here.
template void PremultiplyRow_NEON<false, false>(const uint8_t*, uint8_t*,
                                                int32_t);
template void PremultiplyRow_NEON<false, true>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_NEON<true, false>(const uint8_t*, uint8_t*,
                                               int32_t);
template void PremultiplyRow_NEON<true, true>(const uint8_t*, uint8_t*,
                                              int32_t);
template void Premultiply_NEON<false, false>(const uint8_t*, int32_t, uint8_t*,
                                             int32_t, IntSize);
template void Premultiply_NEON<false, true>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
                                            int32_t, IntSize);
template void Premultiply_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
                                           int32_t, IntSize);

// This generates a table of fixed-point reciprocals representing 1/alpha
// similar to the fallback implementation. However, the reciprocal must
// ultimately be multiplied as an unsigned 9 bit upper part and a signed
// 15 bit lower part to cheaply multiply. Thus, the lower 15 bits of the
// reciprocal are masked off and stored in the low word. The upper 9 bits
// are masked and shifted to fit into the high word. These then get
// independently multiplied with the color component and recombined to
// provide the full reciprocal multiply.
#define UNPREMULQ_NEON(x) \
  ((((0xFF00FFU / (x)) & 0xFF8000U) << 1) | ((0xFF00FFU / (x)) & 0x7FFFU))
#define UNPREMULQ_NEON_2(x) UNPREMULQ_NEON(x), UNPREMULQ_NEON((x) + 1)
#define UNPREMULQ_NEON_4(x) UNPREMULQ_NEON_2(x), UNPREMULQ_NEON_2((x) + 2)
#define UNPREMULQ_NEON_8(x) UNPREMULQ_NEON_4(x), UNPREMULQ_NEON_4((x) + 4)
#define UNPREMULQ_NEON_16(x) UNPREMULQ_NEON_8(x), UNPREMULQ_NEON_8((x) + 8)
#define UNPREMULQ_NEON_32(x) UNPREMULQ_NEON_16(x), UNPREMULQ_NEON_16((x) + 16)
// Entry 0 (alpha of 0) is defined as 0 so that zero-alpha pixels multiply
// out to 0 instead of dividing by zero.
static const uint32_t sUnpremultiplyTable_NEON[256] = {0,
                                                       UNPREMULQ_NEON(1),
                                                       UNPREMULQ_NEON_2(2),
                                                       UNPREMULQ_NEON_4(4),
                                                       UNPREMULQ_NEON_8(8),
                                                       UNPREMULQ_NEON_16(16),
                                                       UNPREMULQ_NEON_32(32),
                                                       UNPREMULQ_NEON_32(64),
                                                       UNPREMULQ_NEON_32(96),
                                                       UNPREMULQ_NEON_32(128),
                                                       UNPREMULQ_NEON_32(160),
                                                       UNPREMULQ_NEON_32(192),
                                                       UNPREMULQ_NEON_32(224)};

// Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table
// that avoids doing any actual division. The original alpha channel is passed
// through unchanged.
template <bool aSwapRB>
static MOZ_ALWAYS_INLINE uint16x8_t
UnpremultiplyVector_NEON(const uint16x8_t& aSrc) {
  // Isolate R and B with mask.
  uint16x8_t rb = vandq_u16(aSrc, vdupq_n_u16(0x00FF));
  // Swap R and B if necessary.
  if (aSwapRB) {
    rb = vrev32q_u16(rb);
  }

  // Isolate G and A by shifting down to bottom of word.
  uint16x8_t ga = vshrq_n_u16(aSrc, 8);
  // Extract the alphas for the 4 pixels from the now isolated words.
  int a1 = vgetq_lane_u16(ga, 1);
  int a2 = vgetq_lane_u16(ga, 3);
  int a3 = vgetq_lane_u16(ga, 5);
  int a4 = vgetq_lane_u16(ga, 7);

  // First load all of the interleaved low and high portions of the reciprocals
  // and combine them into a single vector as lo1 hi1 lo2 hi2 lo3 hi3 lo4 hi4
  uint16x8_t q1234 = vreinterpretq_u16_u32(vld1q_lane_u32(
      &sUnpremultiplyTable_NEON[a4],
      vld1q_lane_u32(
          &sUnpremultiplyTable_NEON[a3],
          vld1q_lane_u32(
              &sUnpremultiplyTable_NEON[a2],
              vld1q_lane_u32(&sUnpremultiplyTable_NEON[a1], vdupq_n_u32(0), 0),
              1),
          2),
      3));
  // Transpose the interleaved low/high portions so that we produce
  // two separate duplicated vectors for the low and high portions respectively:
  // lo1 lo1 lo2 lo2 lo3 lo3 lo4 lo4 and hi1 hi1 hi2 hi2 hi3 hi3 hi4 hi4
  uint16x8x2_t q1234lohi = vtrnq_u16(q1234, q1234);

  // VQDMULH is a signed multiply that doubles (*2) the result, then takes the
  // high word. To work around the signedness and the doubling, the low
  // portion of the reciprocal only stores the lower 15 bits, which fits in a
  // signed 16 bit integer. The high 9 bit portion is effectively also doubled
  // by 2 as a side-effect of being shifted for storage. Thus the output scale
  // of doing a normal multiply by the high portion and the VQDMULH by the low
  // portion are both doubled and can be safely added together. The resulting
  // sum just needs to be halved (via VHADD) to thus cancel out the doubling.
  // All this combines to produce a reciprocal multiply of the form:
  // rb = ((rb * hi) + ((rb * lo * 2) >> 16)) / 2
  rb = vhaddq_u16(
      vmulq_u16(rb, q1234lohi.val[1]),
      vreinterpretq_u16_s16(vqdmulhq_s16(
          vreinterpretq_s16_u16(rb), vreinterpretq_s16_u16(q1234lohi.val[0]))));

  // ga = ((ga * hi) + ((ga * lo * 2) >> 16)) / 2
  ga = vhaddq_u16(
      vmulq_u16(ga, q1234lohi.val[1]),
      vreinterpretq_u16_s16(vqdmulhq_s16(
          vreinterpretq_s16_u16(ga), vreinterpretq_s16_u16(q1234lohi.val[0]))));

  // Combine to the final pixel with ((rb | (ga << 8)) & ~0xFF000000) | (aSrc &
  // 0xFF000000), which inserts back in the original alpha value unchanged.
  return vbslq_u16(vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)), aSrc,
                   vsliq_n_u16(rb, ga, 8));
}

// Unpremultiply one row chunk: aAlignedRow bytes of whole 4-pixel vectors,
// then aRemainder (0-3) trailing pixels. aSrc/aDst are advanced past the
// aligned portion only.
template <bool aSwapRB>
static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_NEON(const uint8_t*& aSrc,
                                                      uint8_t*& aDst,
                                                      int32_t aAlignedRow,
                                                      int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
    px = UnpremultiplyVector_NEON<aSwapRB>(px);
    vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
    px = UnpremultiplyVector_NEON<aSwapRB>(px);
    StoreRemainder_NEON(aDst, aRemainder, px);
  }
}

// Unpremultiply a single row of aLength 32-bit pixels from aSrc into aDst.
template <bool aSwapRB>
void UnpremultiplyRow_NEON(const uint8_t* aSrc, uint8_t* aDst,
                           int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
}

// Unpremultiply an aSize surface. aSrcGap/aDstGap are the byte gaps from the
// end of one row's pixel data to the start of the next row.
template <bool aSwapRB>
void Unpremultiply_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                        int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap, since UnpremultiplyChunk_NEON only
  // advances the pointers past the aligned 4-pixel chunks.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    UnpremultiplyChunk_NEON<aSwapRB>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of unpremultiply variants here.
template void UnpremultiplyRow_NEON<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpremultiplyRow_NEON<true>(const uint8_t*, uint8_t*, int32_t);
template void Unpremultiply_NEON<false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Unpremultiply_NEON<true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);

// Swizzle a vector of 4 pixels providing swaps and opaquifying.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE uint16x8_t SwizzleVector_NEON(const uint16x8_t& aSrc) {
  // Swap R and B, then add to G and A (forced to 255):
  // (((src>>16) | (src << 16)) & 0x00FF00FF) |
  // ((src | 0xFF000000) & ~0x00FF00FF)
  return vbslq_u16(
      vdupq_n_u16(0x00FF), vrev32q_u16(aSrc),
      aOpaqueAlpha
          ? vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)))
          : aSrc);
}

#if 0
// These specializations currently do not profile faster than the generic versions,
// so disable them for now.

// Optimized implementations for when there is no R and B swap.
template<>
static MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, true>(const uint16x8_t& aSrc)
{
  // Force alpha to 255.
  return vorrq_u16(aSrc, vreinterpretq_u16_u32(vdupq_n_u32(0xFF000000)));
}

template<>
static MOZ_ALWAYS_INLINE uint16x8_t
SwizzleVector_NEON<false, false>(const uint16x8_t& aSrc)
{
  return aSrc;
}
#endif

// Swizzle one row chunk: aAlignedRow bytes of whole 4-pixel vectors, then
// aRemainder (0-3) trailing pixels. aSrc/aDst are advanced past the aligned
// portion only.
template <bool aSwapRB, bool aOpaqueAlpha>
static MOZ_ALWAYS_INLINE void SwizzleChunk_NEON(const uint8_t*& aSrc,
                                                uint8_t*& aDst,
                                                int32_t aAlignedRow,
                                                int32_t aRemainder) {
  // Process all 4-pixel chunks as one vector.
  for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) {
    uint16x8_t px = vld1q_u16(reinterpret_cast<const uint16_t*>(aSrc));
    px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    vst1q_u16(reinterpret_cast<uint16_t*>(aDst), px);
    aSrc += 4 * 4;
    aDst += 4 * 4;
  }

  // Handle any 1-3 remaining pixels.
  if (aRemainder) {
    uint16x8_t px = LoadRemainder_NEON(aSrc, aRemainder);
    px = SwizzleVector_NEON<aSwapRB, aOpaqueAlpha>(px);
    StoreRemainder_NEON(aDst, aRemainder, px);
  }
}

// Swizzle a single row of aLength 32-bit pixels from aSrc into aDst.
template <bool aSwapRB, bool aOpaqueAlpha>
void SwizzleRow_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  int32_t alignedRow = 4 * (aLength & ~3);
  int32_t remainder = aLength & 3;
  SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
}

// Swizzle an aSize surface. aSrcGap/aDstGap are the byte gaps from the end of
// one row's pixel data to the start of the next row.
template <bool aSwapRB, bool aOpaqueAlpha>
void Swizzle_NEON(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst,
                  int32_t aDstGap, IntSize aSize) {
  int32_t alignedRow = 4 * (aSize.width & ~3);
  int32_t remainder = aSize.width & 3;
  // Fold remainder into stride gap, since SwizzleChunk_NEON only advances the
  // pointers past the aligned 4-pixel chunks.
  aSrcGap += 4 * remainder;
  aDstGap += 4 * remainder;

  for (int32_t height = aSize.height; height > 0; height--) {
    SwizzleChunk_NEON<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder);
    aSrc += aSrcGap;
    aDst += aDstGap;
  }
}

// Force instantiation of swizzle variants here.
template void SwizzleRow_NEON<true, false>(const uint8_t*, uint8_t*, int32_t);
template void SwizzleRow_NEON<true, true>(const uint8_t*, uint8_t*, int32_t);
template void Swizzle_NEON<true, false>(const uint8_t*, int32_t, uint8_t*,
                                        int32_t, IntSize);
template void Swizzle_NEON<true, true>(const uint8_t*, int32_t, uint8_t*,
                                       int32_t, IntSize);

// Scalar fallback, defined elsewhere, used below for the tail pixels that the
// vector path cannot safely touch.
template <bool aSwapRB>
void UnpackRowRGB24(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength);

// Expand a row of packed 24-bit RGB into 32-bit XRGB (alpha forced to 255),
// optionally swapping R and B. Safe to run in place (aSrc == aDst).
template <bool aSwapRB>
void UnpackRowRGB24_NEON(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) {
  // Because this implementation will read an additional 4 bytes of data that
  // is ignored and masked over, we cannot use the accelerated version for the
  // last 1-5 pixels (3-15 bytes remaining) to guarantee we don't access memory
  // outside the buffer (we read in 16 byte chunks).
  if (aLength < 6) {
    UnpackRowRGB24<aSwapRB>(aSrc, aDst, aLength);
    return;
  }

  // Because we are expanding, we can only process the data back to front in
  // case we are performing this in place.
  int32_t alignedRow = (aLength - 2) & ~3;
  int32_t remainder = aLength - alignedRow;

  const uint8_t* src = aSrc + alignedRow * 3;
  uint8_t* dst = aDst + alignedRow * 4;

  // Handle 2-5 remaining pixels with the scalar fallback first, before the
  // vector loop walks backward over the aligned portion.
  UnpackRowRGB24<aSwapRB>(src, dst, remainder);

  // Byte-shuffle tables for VTBL: each output lane selects a source byte;
  // the zeroed fourth byte of each pixel is overwritten with alpha below.
  uint8x8_t masklo;
  uint8x8_t maskhi;
  if (aSwapRB) {
    static const uint8_t masklo_data[] = {2, 1, 0, 0, 5, 4, 3, 0};
    static const uint8_t maskhi_data[] = {4, 3, 2, 0, 7, 6, 5, 0};
    masklo = vld1_u8(masklo_data);
    maskhi = vld1_u8(maskhi_data);
  } else {
    static const uint8_t masklo_data[] = {0, 1, 2, 0, 3, 4, 5, 0};
    static const uint8_t maskhi_data[] = {2, 3, 4, 0, 5, 6, 7, 0};
    masklo = vld1_u8(masklo_data);
    maskhi = vld1_u8(maskhi_data);
  }

  uint8x16_t alpha = vreinterpretq_u8_u32(vdupq_n_u32(0xFF000000));

  // Process all 4-pixel chunks as one vector, back to front.
  src -= 4 * 3;
  dst -= 4 * 4;
  while (src >= aSrc) {
    uint8x16_t px = vld1q_u8(src);
    // G2R2B1G1 R1B0G0R0 -> X1R1G1B1 X0R0G0B0
    uint8x8_t pxlo = vtbl1_u8(vget_low_u8(px), masklo);
    // B3G3R3B2 G2R2B1G1 -> X3R3G3B3 X2R2G2B2
    uint8x8_t pxhi =
        vtbl1_u8(vext_u8(vget_low_u8(px), vget_high_u8(px), 4), maskhi);
    px = vcombine_u8(pxlo, pxhi);
    px = vorrq_u8(px, alpha);
    vst1q_u8(dst, px);
    src -= 4 * 3;
    dst -= 4 * 4;
  }
}

// Force instantiation of unpack variants here.
template void UnpackRowRGB24_NEON<false>(const uint8_t*, uint8_t*, int32_t);
template void UnpackRowRGB24_NEON<true>(const uint8_t*, uint8_t*, int32_t);

}  // namespace gfx
}  // namespace mozilla