SwizzleSSE2.cpp (16370B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #include "Swizzle.h" 8 9 #include <emmintrin.h> 10 11 namespace mozilla::gfx { 12 13 // Load 1-3 pixels into a 4 pixel vector. 14 static MOZ_ALWAYS_INLINE __m128i LoadRemainder_SSE2(const uint8_t* aSrc, 15 size_t aLength) { 16 __m128i px; 17 if (aLength >= 2) { 18 // Load first 2 pixels 19 px = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(aSrc)); 20 // Load third pixel 21 if (aLength >= 3) { 22 px = _mm_unpacklo_epi64( 23 px, 24 _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc + 2 * 4))); 25 } 26 } else { 27 // Load single pixel 28 px = _mm_cvtsi32_si128(*reinterpret_cast<const uint32_t*>(aSrc)); 29 } 30 return px; 31 } 32 33 // Store 1-3 pixels from a vector into memory without overwriting. 34 static MOZ_ALWAYS_INLINE void StoreRemainder_SSE2(uint8_t* aDst, size_t aLength, 35 const __m128i& aSrc) { 36 if (aLength >= 2) { 37 // Store first 2 pixels 38 _mm_storel_epi64(reinterpret_cast<__m128i*>(aDst), aSrc); 39 // Store third pixel 40 if (aLength >= 3) { 41 *reinterpret_cast<uint32_t*>(aDst + 2 * 4) = 42 _mm_cvtsi128_si32(_mm_srli_si128(aSrc, 2 * 4)); 43 } 44 } else { 45 // Store single pixel 46 *reinterpret_cast<uint32_t*>(aDst) = _mm_cvtsi128_si32(aSrc); 47 } 48 } 49 50 // Premultiply vector of 4 pixels using splayed math. 51 template <bool aSwapRB, bool aOpaqueAlpha> 52 static MOZ_ALWAYS_INLINE __m128i PremultiplyVector_SSE2(const __m128i& aSrc) { 53 // Isolate R and B with mask. 54 const __m128i mask = _mm_set1_epi32(0x00FF00FF); 55 __m128i rb = _mm_and_si128(mask, aSrc); 56 // Swap R and B if necessary. 57 if (aSwapRB) { 58 rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)); 59 rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)); 60 } 61 // Isolate G and A by shifting down to bottom of word. 62 __m128i ga = _mm_srli_epi16(aSrc, 8); 63 64 // Duplicate alphas to get vector of A1 A1 A2 A2 A3 A3 A4 A4 65 __m128i alphas = _mm_shufflelo_epi16(ga, _MM_SHUFFLE(3, 3, 1, 1)); 66 alphas = _mm_shufflehi_epi16(alphas, _MM_SHUFFLE(3, 3, 1, 1)); 67 68 // rb = rb*a + 255; rb += rb >> 8; 69 rb = _mm_add_epi16(_mm_mullo_epi16(rb, alphas), mask); 70 rb = _mm_add_epi16(rb, _mm_srli_epi16(rb, 8)); 71 72 // If format is not opaque, force A to 255 so that A*alpha/255 = alpha 73 if (!aOpaqueAlpha) { 74 ga = _mm_or_si128(ga, _mm_set1_epi32(0x00FF0000)); 75 } 76 // ga = ga*a + 255; ga += ga >> 8; 77 ga = _mm_add_epi16(_mm_mullo_epi16(ga, alphas), mask); 78 ga = _mm_add_epi16(ga, _mm_srli_epi16(ga, 8)); 79 // If format is opaque, force output A to be 255. 80 if (aOpaqueAlpha) { 81 ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000)); 82 } 83 84 // Combine back to final pixel with (rb >> 8) | (ga & 0xFF00FF00) 85 rb = _mm_srli_epi16(rb, 8); 86 ga = _mm_andnot_si128(mask, ga); 87 return _mm_or_si128(rb, ga); 88 } 89 90 // Premultiply vector of aAlignedRow + aRemainder pixels. 91 template <bool aSwapRB, bool aOpaqueAlpha> 92 static MOZ_ALWAYS_INLINE void PremultiplyChunk_SSE2(const uint8_t*& aSrc, 93 uint8_t*& aDst, 94 int32_t aAlignedRow, 95 int32_t aRemainder) { 96 // Process all 4-pixel chunks as one vector. 97 for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) { 98 __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc)); 99 px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px); 100 _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px); 101 aSrc += 4 * 4; 102 aDst += 4 * 4; 103 } 104 105 // Handle any 1-3 remaining pixels. 106 if (aRemainder) { 107 __m128i px = LoadRemainder_SSE2(aSrc, aRemainder); 108 px = PremultiplyVector_SSE2<aSwapRB, aOpaqueAlpha>(px); 109 StoreRemainder_SSE2(aDst, aRemainder, px); 110 } 111 } 112 113 // Premultiply vector of aLength pixels. 114 template <bool aSwapRB, bool aOpaqueAlpha> 115 void PremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) { 116 int32_t alignedRow = 4 * (aLength & ~3); 117 int32_t remainder = aLength & 3; 118 PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, 119 remainder); 120 } 121 122 template <bool aSwapRB, bool aOpaqueAlpha> 123 void Premultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, 124 int32_t aDstGap, IntSize aSize) { 125 int32_t alignedRow = 4 * (aSize.width & ~3); 126 int32_t remainder = aSize.width & 3; 127 // Fold remainder into stride gap. 128 aSrcGap += 4 * remainder; 129 aDstGap += 4 * remainder; 130 131 for (int32_t height = aSize.height; height > 0; height--) { 132 PremultiplyChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, 133 remainder); 134 aSrc += aSrcGap; 135 aDst += aDstGap; 136 } 137 } 138 139 // Force instantiation of premultiply variants here. 140 template void PremultiplyRow_SSE2<false, false>(const uint8_t*, uint8_t*, 141 int32_t); 142 template void PremultiplyRow_SSE2<false, true>(const uint8_t*, uint8_t*, 143 int32_t); 144 template void PremultiplyRow_SSE2<true, false>(const uint8_t*, uint8_t*, 145 int32_t); 146 template void PremultiplyRow_SSE2<true, true>(const uint8_t*, uint8_t*, 147 int32_t); 148 template void Premultiply_SSE2<false, false>(const uint8_t*, int32_t, uint8_t*, 149 int32_t, IntSize); 150 template void Premultiply_SSE2<false, true>(const uint8_t*, int32_t, uint8_t*, 151 int32_t, IntSize); 152 template void Premultiply_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*, 153 int32_t, IntSize); 154 template void Premultiply_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*, 155 int32_t, IntSize); 156 157 // This generates a table of fixed-point reciprocals representing 1/alpha 158 // similar to the fallback implementation. However, the reciprocal must fit 159 // in 16 bits to multiply cheaply. Observe that reciprocals of smaller alphas 160 // require more bits than for larger alphas. We take advantage of this by 161 // shifting the reciprocal down by either 3 or 8 bits depending on whether 162 // the alpha value is less than 0x20. This is easy to then undo by multiplying 163 // the color component to be unpremultiplying by either 8 or 0x100, 164 // respectively. The 16 bit reciprocal is duplicated into both words of a 165 // uint32_t here to reduce unpacking overhead. 166 #define UNPREMULQ_SSE2(x) \ 167 (0x10001U * (0xFF0220U / ((x) * ((x) < 0x20 ? 0x100 : 8)))) 168 #define UNPREMULQ_SSE2_2(x) UNPREMULQ_SSE2(x), UNPREMULQ_SSE2((x) + 1) 169 #define UNPREMULQ_SSE2_4(x) UNPREMULQ_SSE2_2(x), UNPREMULQ_SSE2_2((x) + 2) 170 #define UNPREMULQ_SSE2_8(x) UNPREMULQ_SSE2_4(x), UNPREMULQ_SSE2_4((x) + 4) 171 #define UNPREMULQ_SSE2_16(x) UNPREMULQ_SSE2_8(x), UNPREMULQ_SSE2_8((x) + 8) 172 #define UNPREMULQ_SSE2_32(x) UNPREMULQ_SSE2_16(x), UNPREMULQ_SSE2_16((x) + 16) 173 static const uint32_t sUnpremultiplyTable_SSE2[256] = {0, 174 UNPREMULQ_SSE2(1), 175 UNPREMULQ_SSE2_2(2), 176 UNPREMULQ_SSE2_4(4), 177 UNPREMULQ_SSE2_8(8), 178 UNPREMULQ_SSE2_16(16), 179 UNPREMULQ_SSE2_32(32), 180 UNPREMULQ_SSE2_32(64), 181 UNPREMULQ_SSE2_32(96), 182 UNPREMULQ_SSE2_32(128), 183 UNPREMULQ_SSE2_32(160), 184 UNPREMULQ_SSE2_32(192), 185 UNPREMULQ_SSE2_32(224)}; 186 187 // Unpremultiply a vector of 4 pixels using splayed math and a reciprocal table 188 // that avoids doing any actual division. 189 template <bool aSwapRB> 190 static MOZ_ALWAYS_INLINE __m128i UnpremultiplyVector_SSE2(const __m128i& aSrc) { 191 // Isolate R and B with mask. 192 __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF)); 193 // Swap R and B if necessary. 194 if (aSwapRB) { 195 rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)); 196 rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)); 197 } 198 199 // Isolate G and A by shifting down to bottom of word. 200 __m128i ga = _mm_srli_epi16(aSrc, 8); 201 // Extract the alphas for the 4 pixels from the now isolated words. 202 int a1 = _mm_extract_epi16(ga, 1); 203 int a2 = _mm_extract_epi16(ga, 3); 204 int a3 = _mm_extract_epi16(ga, 5); 205 int a4 = _mm_extract_epi16(ga, 7); 206 207 // Load the 16 bit reciprocals from the table for each alpha. 208 // The reciprocals are doubled in each uint32_t entry. 209 // Unpack them to a final vector of duplicated reciprocals of 210 // the form Q1 Q1 Q2 Q2 Q3 Q3 Q4 Q4. 211 __m128i q12 = 212 _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a1]), 213 _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a2])); 214 __m128i q34 = 215 _mm_unpacklo_epi32(_mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a3]), 216 _mm_cvtsi32_si128(sUnpremultiplyTable_SSE2[a4])); 217 __m128i q1234 = _mm_unpacklo_epi64(q12, q34); 218 219 // Check if the alphas are less than 0x20, so that we can undo 220 // scaling of the reciprocals as appropriate. 221 __m128i scale = _mm_cmplt_epi32(ga, _mm_set1_epi32(0x00200000)); 222 // Produce scale factors by ((a < 0x20) ^ 8) & 0x108, 223 // such that scale is 0x100 if < 0x20, and 8 otherwise. 224 scale = _mm_xor_si128(scale, _mm_set1_epi16(8)); 225 scale = _mm_and_si128(scale, _mm_set1_epi16(0x108)); 226 // Isolate G now so that we don't accidentally unpremultiply A. 227 ga = _mm_and_si128(ga, _mm_set1_epi32(0x000000FF)); 228 229 // Scale R, B, and G as required depending on reciprocal precision. 230 rb = _mm_mullo_epi16(rb, scale); 231 ga = _mm_mullo_epi16(ga, scale); 232 233 // Multiply R, B, and G by the reciprocal, only taking the high word 234 // too effectively shift right by 16. 235 rb = _mm_mulhi_epu16(rb, q1234); 236 ga = _mm_mulhi_epu16(ga, q1234); 237 238 // Combine back to final pixel with rb | (ga << 8) | (aSrc & 0xFF000000), 239 // which will add back on the original alpha value unchanged. 240 ga = _mm_slli_si128(ga, 1); 241 ga = _mm_or_si128(ga, _mm_and_si128(aSrc, _mm_set1_epi32(0xFF000000))); 242 return _mm_or_si128(rb, ga); 243 } 244 245 template <bool aSwapRB> 246 static MOZ_ALWAYS_INLINE void UnpremultiplyChunk_SSE2(const uint8_t*& aSrc, 247 uint8_t*& aDst, 248 int32_t aAlignedRow, 249 int32_t aRemainder) { 250 // Process all 4-pixel chunks as one vector. 251 for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) { 252 __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc)); 253 px = UnpremultiplyVector_SSE2<aSwapRB>(px); 254 _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px); 255 aSrc += 4 * 4; 256 aDst += 4 * 4; 257 } 258 259 // Handle any 1-3 remaining pixels. 260 if (aRemainder) { 261 __m128i px = LoadRemainder_SSE2(aSrc, aRemainder); 262 px = UnpremultiplyVector_SSE2<aSwapRB>(px); 263 StoreRemainder_SSE2(aDst, aRemainder, px); 264 } 265 } 266 267 template <bool aSwapRB> 268 void UnpremultiplyRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, 269 int32_t aLength) { 270 int32_t alignedRow = 4 * (aLength & ~3); 271 int32_t remainder = aLength & 3; 272 UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder); 273 } 274 275 template <bool aSwapRB> 276 void Unpremultiply_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, 277 int32_t aDstGap, IntSize aSize) { 278 int32_t alignedRow = 4 * (aSize.width & ~3); 279 int32_t remainder = aSize.width & 3; 280 // Fold remainder into stride gap. 281 aSrcGap += 4 * remainder; 282 aDstGap += 4 * remainder; 283 284 for (int32_t height = aSize.height; height > 0; height--) { 285 UnpremultiplyChunk_SSE2<aSwapRB>(aSrc, aDst, alignedRow, remainder); 286 aSrc += aSrcGap; 287 aDst += aDstGap; 288 } 289 } 290 291 // Force instantiation of unpremultiply variants here. 292 template void UnpremultiplyRow_SSE2<false>(const uint8_t*, uint8_t*, int32_t); 293 template void UnpremultiplyRow_SSE2<true>(const uint8_t*, uint8_t*, int32_t); 294 template void Unpremultiply_SSE2<false>(const uint8_t*, int32_t, uint8_t*, 295 int32_t, IntSize); 296 template void Unpremultiply_SSE2<true>(const uint8_t*, int32_t, uint8_t*, 297 int32_t, IntSize); 298 299 // Swizzle a vector of 4 pixels providing swaps and opaquifying. 300 template <bool aSwapRB, bool aOpaqueAlpha> 301 static MOZ_ALWAYS_INLINE __m128i SwizzleVector_SSE2(const __m128i& aSrc) { 302 // Isolate R and B. 303 __m128i rb = _mm_and_si128(aSrc, _mm_set1_epi32(0x00FF00FF)); 304 // Swap R and B. 305 rb = _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)); 306 rb = _mm_shufflehi_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)); 307 // Isolate G and A. 308 __m128i ga = _mm_and_si128(aSrc, _mm_set1_epi32(0xFF00FF00)); 309 // Force alpha to 255 if necessary. 310 if (aOpaqueAlpha) { 311 ga = _mm_or_si128(ga, _mm_set1_epi32(0xFF000000)); 312 } 313 // Combine everything back together. 314 return _mm_or_si128(rb, ga); 315 } 316 317 #if 0 318 // These specializations currently do not profile faster than the generic versions, 319 // so disable them for now. 320 321 // Optimized implementations for when there is no R and B swap. 322 template<> 323 MOZ_ALWAYS_INLINE __m128i 324 SwizzleVector_SSE2<false, true>(const __m128i& aSrc) 325 { 326 // Force alpha to 255. 327 return _mm_or_si128(aSrc, _mm_set1_epi32(0xFF000000)); 328 } 329 330 template<> 331 MOZ_ALWAYS_INLINE __m128i 332 SwizzleVector_SSE2<false, false>(const __m128i& aSrc) 333 { 334 return aSrc; 335 } 336 #endif 337 338 template <bool aSwapRB, bool aOpaqueAlpha> 339 static MOZ_ALWAYS_INLINE void SwizzleChunk_SSE2(const uint8_t*& aSrc, 340 uint8_t*& aDst, 341 int32_t aAlignedRow, 342 int32_t aRemainder) { 343 // Process all 4-pixel chunks as one vector. 344 for (const uint8_t* end = aSrc + aAlignedRow; aSrc < end;) { 345 __m128i px = _mm_loadu_si128(reinterpret_cast<const __m128i*>(aSrc)); 346 px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px); 347 _mm_storeu_si128(reinterpret_cast<__m128i*>(aDst), px); 348 aSrc += 4 * 4; 349 aDst += 4 * 4; 350 } 351 352 // Handle any 1-3 remaining pixels. 353 if (aRemainder) { 354 __m128i px = LoadRemainder_SSE2(aSrc, aRemainder); 355 px = SwizzleVector_SSE2<aSwapRB, aOpaqueAlpha>(px); 356 StoreRemainder_SSE2(aDst, aRemainder, px); 357 } 358 } 359 360 template <bool aSwapRB, bool aOpaqueAlpha> 361 void SwizzleRow_SSE2(const uint8_t* aSrc, uint8_t* aDst, int32_t aLength) { 362 int32_t alignedRow = 4 * (aLength & ~3); 363 int32_t remainder = aLength & 3; 364 SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder); 365 } 366 367 template <bool aSwapRB, bool aOpaqueAlpha> 368 void Swizzle_SSE2(const uint8_t* aSrc, int32_t aSrcGap, uint8_t* aDst, 369 int32_t aDstGap, IntSize aSize) { 370 int32_t alignedRow = 4 * (aSize.width & ~3); 371 int32_t remainder = aSize.width & 3; 372 // Fold remainder into stride gap. 373 aSrcGap += 4 * remainder; 374 aDstGap += 4 * remainder; 375 376 for (int32_t height = aSize.height; height > 0; height--) { 377 SwizzleChunk_SSE2<aSwapRB, aOpaqueAlpha>(aSrc, aDst, alignedRow, remainder); 378 aSrc += aSrcGap; 379 aDst += aDstGap; 380 } 381 } 382 383 // Force instantiation of swizzle variants here. 384 template void SwizzleRow_SSE2<true, false>(const uint8_t*, uint8_t*, int32_t); 385 template void SwizzleRow_SSE2<true, true>(const uint8_t*, uint8_t*, int32_t); 386 template void Swizzle_SSE2<true, false>(const uint8_t*, int32_t, uint8_t*, 387 int32_t, IntSize); 388 template void Swizzle_SSE2<true, true>(const uint8_t*, int32_t, uint8_t*, 389 int32_t, IntSize); 390 391 } // namespace mozilla::gfx