alpha_processing_sse2.c (17558B)
1 // Copyright 2014 Google Inc. All Rights Reserved. 2 // 3 // Use of this source code is governed by a BSD-style license 4 // that can be found in the COPYING file in the root of the source 5 // tree. An additional intellectual property rights grant can be found 6 // in the file PATENTS. All contributing project authors may 7 // be found in the AUTHORS file in the root of the source tree. 8 // ----------------------------------------------------------------------------- 9 // 10 // Utilities for processing transparent channel. 11 // 12 // Author: Skal (pascal.massimino@gmail.com) 13 14 #include "src/dsp/dsp.h" 15 16 #if defined(WEBP_USE_SSE2) 17 #include <emmintrin.h> 18 19 #include "src/webp/types.h" 20 #include "src/dsp/cpu.h" 21 22 //------------------------------------------------------------------------------ 23 24 static int DispatchAlpha_SSE2(const uint8_t* WEBP_RESTRICT alpha, 25 int alpha_stride, int width, int height, 26 uint8_t* WEBP_RESTRICT dst, int dst_stride) { 27 // alpha_and stores an 'and' operation of all the alpha[] values. The final 28 // value is not 0xff if any of the alpha[] is not equal to 0xff. 29 uint32_t alpha_and = 0xff; 30 int i, j; 31 const __m128i zero = _mm_setzero_si128(); 32 const __m128i alpha_mask = _mm_set1_epi32((int)0xff); // to preserve A 33 const __m128i all_0xff = _mm_set1_epi8((char)0xff); 34 __m128i all_alphas16 = all_0xff; 35 __m128i all_alphas8 = all_0xff; 36 37 // We must be able to access 3 extra bytes after the last written byte 38 // 'dst[4 * width - 4]', because we don't know if alpha is the first or the 39 // last byte of the quadruplet. 40 for (j = 0; j < height; ++j) { 41 char* ptr = (char*)dst; 42 for (i = 0; i + 16 <= width - 1; i += 16) { 43 // load 16 alpha bytes 44 const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]); 45 const __m128i a1_lo = _mm_unpacklo_epi8(a0, zero); 46 const __m128i a1_hi = _mm_unpackhi_epi8(a0, zero); 47 const __m128i a2_lo_lo = _mm_unpacklo_epi16(a1_lo, zero); 48 const __m128i a2_lo_hi = _mm_unpackhi_epi16(a1_lo, zero); 49 const __m128i a2_hi_lo = _mm_unpacklo_epi16(a1_hi, zero); 50 const __m128i a2_hi_hi = _mm_unpackhi_epi16(a1_hi, zero); 51 _mm_maskmoveu_si128(a2_lo_lo, alpha_mask, ptr + 0); 52 _mm_maskmoveu_si128(a2_lo_hi, alpha_mask, ptr + 16); 53 _mm_maskmoveu_si128(a2_hi_lo, alpha_mask, ptr + 32); 54 _mm_maskmoveu_si128(a2_hi_hi, alpha_mask, ptr + 48); 55 // accumulate 16 alpha 'and' in parallel 56 all_alphas16 = _mm_and_si128(all_alphas16, a0); 57 ptr += 64; 58 } 59 if (i + 8 <= width - 1) { 60 // load 8 alpha bytes 61 const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]); 62 const __m128i a1 = _mm_unpacklo_epi8(a0, zero); 63 const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero); 64 const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero); 65 _mm_maskmoveu_si128(a2_lo, alpha_mask, ptr); 66 _mm_maskmoveu_si128(a2_hi, alpha_mask, ptr + 16); 67 // accumulate 8 alpha 'and' in parallel 68 all_alphas8 = _mm_and_si128(all_alphas8, a0); 69 i += 8; 70 } 71 for (; i < width; ++i) { 72 const uint32_t alpha_value = alpha[i]; 73 dst[4 * i] = alpha_value; 74 alpha_and &= alpha_value; 75 } 76 alpha += alpha_stride; 77 dst += dst_stride; 78 } 79 // Combine the eight alpha 'and' into a 8-bit mask. 80 alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas8, all_0xff)) & 0xff; 81 return (alpha_and != 0xff || 82 _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas16, all_0xff)) != 0xffff); 83 } 84 85 static void DispatchAlphaToGreen_SSE2(const uint8_t* WEBP_RESTRICT alpha, 86 int alpha_stride, int width, int height, 87 uint32_t* WEBP_RESTRICT dst, 88 int dst_stride) { 89 int i, j; 90 const __m128i zero = _mm_setzero_si128(); 91 const int limit = width & ~15; 92 for (j = 0; j < height; ++j) { 93 for (i = 0; i < limit; i += 16) { // process 16 alpha bytes 94 const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]); 95 const __m128i a1 = _mm_unpacklo_epi8(zero, a0); // note the 'zero' first! 96 const __m128i b1 = _mm_unpackhi_epi8(zero, a0); 97 const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero); 98 const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero); 99 const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero); 100 const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero); 101 _mm_storeu_si128((__m128i*)&dst[i + 0], a2_lo); 102 _mm_storeu_si128((__m128i*)&dst[i + 4], a2_hi); 103 _mm_storeu_si128((__m128i*)&dst[i + 8], b2_lo); 104 _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi); 105 } 106 for (; i < width; ++i) dst[i] = alpha[i] << 8; 107 alpha += alpha_stride; 108 dst += dst_stride; 109 } 110 } 111 112 static int ExtractAlpha_SSE2(const uint8_t* WEBP_RESTRICT argb, int argb_stride, 113 int width, int height, 114 uint8_t* WEBP_RESTRICT alpha, int alpha_stride) { 115 // alpha_and stores an 'and' operation of all the alpha[] values. The final 116 // value is not 0xff if any of the alpha[] is not equal to 0xff. 117 uint32_t alpha_and = 0xff; 118 int i, j; 119 const __m128i a_mask = _mm_set1_epi32(0xff); // to preserve alpha 120 const __m128i all_0xff = _mm_set_epi32(0, 0, ~0, ~0); 121 __m128i all_alphas = all_0xff; 122 123 // We must be able to access 3 extra bytes after the last written byte 124 // 'src[4 * width - 4]', because we don't know if alpha is the first or the 125 // last byte of the quadruplet. 126 const int limit = (width - 1) & ~7; 127 128 for (j = 0; j < height; ++j) { 129 const __m128i* src = (const __m128i*)argb; 130 for (i = 0; i < limit; i += 8) { 131 // load 32 argb bytes 132 const __m128i a0 = _mm_loadu_si128(src + 0); 133 const __m128i a1 = _mm_loadu_si128(src + 1); 134 const __m128i b0 = _mm_and_si128(a0, a_mask); 135 const __m128i b1 = _mm_and_si128(a1, a_mask); 136 const __m128i c0 = _mm_packs_epi32(b0, b1); 137 const __m128i d0 = _mm_packus_epi16(c0, c0); 138 // store 139 _mm_storel_epi64((__m128i*)&alpha[i], d0); 140 // accumulate eight alpha 'and' in parallel 141 all_alphas = _mm_and_si128(all_alphas, d0); 142 src += 2; 143 } 144 for (; i < width; ++i) { 145 const uint32_t alpha_value = argb[4 * i]; 146 alpha[i] = alpha_value; 147 alpha_and &= alpha_value; 148 } 149 argb += argb_stride; 150 alpha += alpha_stride; 151 } 152 // Combine the eight alpha 'and' into a 8-bit mask. 153 alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff)); 154 return (alpha_and == 0xff); 155 } 156 157 static void ExtractGreen_SSE2(const uint32_t* WEBP_RESTRICT argb, 158 uint8_t* WEBP_RESTRICT alpha, int size) { 159 int i; 160 const __m128i mask = _mm_set1_epi32(0xff); 161 const __m128i* src = (const __m128i*)argb; 162 163 for (i = 0; i + 16 <= size; i += 16, src += 4) { 164 const __m128i a0 = _mm_loadu_si128(src + 0); 165 const __m128i a1 = _mm_loadu_si128(src + 1); 166 const __m128i a2 = _mm_loadu_si128(src + 2); 167 const __m128i a3 = _mm_loadu_si128(src + 3); 168 const __m128i b0 = _mm_srli_epi32(a0, 8); 169 const __m128i b1 = _mm_srli_epi32(a1, 8); 170 const __m128i b2 = _mm_srli_epi32(a2, 8); 171 const __m128i b3 = _mm_srli_epi32(a3, 8); 172 const __m128i c0 = _mm_and_si128(b0, mask); 173 const __m128i c1 = _mm_and_si128(b1, mask); 174 const __m128i c2 = _mm_and_si128(b2, mask); 175 const __m128i c3 = _mm_and_si128(b3, mask); 176 const __m128i d0 = _mm_packs_epi32(c0, c1); 177 const __m128i d1 = _mm_packs_epi32(c2, c3); 178 const __m128i e = _mm_packus_epi16(d0, d1); 179 // store 180 _mm_storeu_si128((__m128i*)&alpha[i], e); 181 } 182 if (i + 8 <= size) { 183 const __m128i a0 = _mm_loadu_si128(src + 0); 184 const __m128i a1 = _mm_loadu_si128(src + 1); 185 const __m128i b0 = _mm_srli_epi32(a0, 8); 186 const __m128i b1 = _mm_srli_epi32(a1, 8); 187 const __m128i c0 = _mm_and_si128(b0, mask); 188 const __m128i c1 = _mm_and_si128(b1, mask); 189 const __m128i d = _mm_packs_epi32(c0, c1); 190 const __m128i e = _mm_packus_epi16(d, d); 191 _mm_storel_epi64((__m128i*)&alpha[i], e); 192 i += 8; 193 } 194 for (; i < size; ++i) alpha[i] = argb[i] >> 8; 195 } 196 197 //------------------------------------------------------------------------------ 198 // Non-dither premultiplied modes 199 200 #define MULTIPLIER(a) ((a) * 0x8081) 201 #define PREMULTIPLY(x, m) (((x) * (m)) >> 23) 202 203 // We can't use a 'const int' for the SHUFFLE value, because it has to be an 204 // immediate in the _mm_shufflexx_epi16() instruction. We really need a macro. 205 // We use: v / 255 = (v * 0x8081) >> 23, where v = alpha * {r,g,b} is a 16bit 206 // value. 207 #define APPLY_ALPHA(RGBX, SHUFFLE) do { \ 208 const __m128i argb0 = _mm_loadu_si128((const __m128i*)&(RGBX)); \ 209 const __m128i argb1_lo = _mm_unpacklo_epi8(argb0, zero); \ 210 const __m128i argb1_hi = _mm_unpackhi_epi8(argb0, zero); \ 211 const __m128i alpha0_lo = _mm_or_si128(argb1_lo, kMask); \ 212 const __m128i alpha0_hi = _mm_or_si128(argb1_hi, kMask); \ 213 const __m128i alpha1_lo = _mm_shufflelo_epi16(alpha0_lo, SHUFFLE); \ 214 const __m128i alpha1_hi = _mm_shufflelo_epi16(alpha0_hi, SHUFFLE); \ 215 const __m128i alpha2_lo = _mm_shufflehi_epi16(alpha1_lo, SHUFFLE); \ 216 const __m128i alpha2_hi = _mm_shufflehi_epi16(alpha1_hi, SHUFFLE); \ 217 /* alpha2 = [ff a0 a0 a0][ff a1 a1 a1] */ \ 218 const __m128i A0_lo = _mm_mullo_epi16(alpha2_lo, argb1_lo); \ 219 const __m128i A0_hi = _mm_mullo_epi16(alpha2_hi, argb1_hi); \ 220 const __m128i A1_lo = _mm_mulhi_epu16(A0_lo, kMult); \ 221 const __m128i A1_hi = _mm_mulhi_epu16(A0_hi, kMult); \ 222 const __m128i A2_lo = _mm_srli_epi16(A1_lo, 7); \ 223 const __m128i A2_hi = _mm_srli_epi16(A1_hi, 7); \ 224 const __m128i A3 = _mm_packus_epi16(A2_lo, A2_hi); \ 225 _mm_storeu_si128((__m128i*)&(RGBX), A3); \ 226 } while (0) 227 228 static void ApplyAlphaMultiply_SSE2(uint8_t* rgba, int alpha_first, 229 int w, int h, int stride) { 230 const __m128i zero = _mm_setzero_si128(); 231 const __m128i kMult = _mm_set1_epi16((short)0x8081); 232 const __m128i kMask = _mm_set_epi16(0, 0xff, 0xff, 0, 0, 0xff, 0xff, 0); 233 const int kSpan = 4; 234 while (h-- > 0) { 235 uint32_t* const rgbx = (uint32_t*)rgba; 236 int i; 237 if (!alpha_first) { 238 for (i = 0; i + kSpan <= w; i += kSpan) { 239 APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(2, 3, 3, 3)); 240 } 241 } else { 242 for (i = 0; i + kSpan <= w; i += kSpan) { 243 APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 1)); 244 } 245 } 246 // Finish with left-overs. 247 for (; i < w; ++i) { 248 uint8_t* const rgb = rgba + (alpha_first ? 1 : 0); 249 const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3); 250 const uint32_t a = alpha[4 * i]; 251 if (a != 0xff) { 252 const uint32_t mult = MULTIPLIER(a); 253 rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult); 254 rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult); 255 rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult); 256 } 257 } 258 rgba += stride; 259 } 260 } 261 #undef MULTIPLIER 262 #undef PREMULTIPLY 263 264 //------------------------------------------------------------------------------ 265 // Alpha detection 266 267 static int HasAlpha8b_SSE2(const uint8_t* src, int length) { 268 const __m128i all_0xff = _mm_set1_epi8((char)0xff); 269 int i = 0; 270 for (; i + 16 <= length; i += 16) { 271 const __m128i v = _mm_loadu_si128((const __m128i*)(src + i)); 272 const __m128i bits = _mm_cmpeq_epi8(v, all_0xff); 273 const int mask = _mm_movemask_epi8(bits); 274 if (mask != 0xffff) return 1; 275 } 276 for (; i < length; ++i) if (src[i] != 0xff) return 1; 277 return 0; 278 } 279 280 static int HasAlpha32b_SSE2(const uint8_t* src, int length) { 281 const __m128i alpha_mask = _mm_set1_epi32(0xff); 282 const __m128i all_0xff = _mm_set1_epi8((char)0xff); 283 int i = 0; 284 // We don't know if we can access the last 3 bytes after the last alpha 285 // value 'src[4 * length - 4]' (because we don't know if alpha is the first 286 // or the last byte of the quadruplet). Hence the '-3' protection below. 287 length = length * 4 - 3; // size in bytes 288 for (; i + 64 <= length; i += 64) { 289 const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0)); 290 const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16)); 291 const __m128i a2 = _mm_loadu_si128((const __m128i*)(src + i + 32)); 292 const __m128i a3 = _mm_loadu_si128((const __m128i*)(src + i + 48)); 293 const __m128i b0 = _mm_and_si128(a0, alpha_mask); 294 const __m128i b1 = _mm_and_si128(a1, alpha_mask); 295 const __m128i b2 = _mm_and_si128(a2, alpha_mask); 296 const __m128i b3 = _mm_and_si128(a3, alpha_mask); 297 const __m128i c0 = _mm_packs_epi32(b0, b1); 298 const __m128i c1 = _mm_packs_epi32(b2, b3); 299 const __m128i d = _mm_packus_epi16(c0, c1); 300 const __m128i bits = _mm_cmpeq_epi8(d, all_0xff); 301 const int mask = _mm_movemask_epi8(bits); 302 if (mask != 0xffff) return 1; 303 } 304 for (; i + 32 <= length; i += 32) { 305 const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0)); 306 const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 16)); 307 const __m128i b0 = _mm_and_si128(a0, alpha_mask); 308 const __m128i b1 = _mm_and_si128(a1, alpha_mask); 309 const __m128i c = _mm_packs_epi32(b0, b1); 310 const __m128i d = _mm_packus_epi16(c, c); 311 const __m128i bits = _mm_cmpeq_epi8(d, all_0xff); 312 const int mask = _mm_movemask_epi8(bits); 313 if (mask != 0xffff) return 1; 314 } 315 for (; i <= length; i += 4) if (src[i] != 0xff) return 1; 316 return 0; 317 } 318 319 static void AlphaReplace_SSE2(uint32_t* src, int length, uint32_t color) { 320 const __m128i m_color = _mm_set1_epi32((int)color); 321 const __m128i zero = _mm_setzero_si128(); 322 int i = 0; 323 for (; i + 8 <= length; i += 8) { 324 const __m128i a0 = _mm_loadu_si128((const __m128i*)(src + i + 0)); 325 const __m128i a1 = _mm_loadu_si128((const __m128i*)(src + i + 4)); 326 const __m128i b0 = _mm_srai_epi32(a0, 24); 327 const __m128i b1 = _mm_srai_epi32(a1, 24); 328 const __m128i c0 = _mm_cmpeq_epi32(b0, zero); 329 const __m128i c1 = _mm_cmpeq_epi32(b1, zero); 330 const __m128i d0 = _mm_and_si128(c0, m_color); 331 const __m128i d1 = _mm_and_si128(c1, m_color); 332 const __m128i e0 = _mm_andnot_si128(c0, a0); 333 const __m128i e1 = _mm_andnot_si128(c1, a1); 334 _mm_storeu_si128((__m128i*)(src + i + 0), _mm_or_si128(d0, e0)); 335 _mm_storeu_si128((__m128i*)(src + i + 4), _mm_or_si128(d1, e1)); 336 } 337 for (; i < length; ++i) if ((src[i] >> 24) == 0) src[i] = color; 338 } 339 340 // ----------------------------------------------------------------------------- 341 // Apply alpha value to rows 342 343 static void MultARGBRow_SSE2(uint32_t* const ptr, int width, int inverse) { 344 int x = 0; 345 if (!inverse) { 346 const int kSpan = 2; 347 const __m128i zero = _mm_setzero_si128(); 348 const __m128i k128 = _mm_set1_epi16(128); 349 const __m128i kMult = _mm_set1_epi16(0x0101); 350 const __m128i kMask = _mm_set_epi16(0, 0xff, 0, 0, 0, 0xff, 0, 0); 351 for (x = 0; x + kSpan <= width; x += kSpan) { 352 // To compute 'result = (int)(a * x / 255. + .5)', we use: 353 // tmp = a * v + 128, result = (tmp * 0x0101u) >> 16 354 const __m128i A0 = _mm_loadl_epi64((const __m128i*)&ptr[x]); 355 const __m128i A1 = _mm_unpacklo_epi8(A0, zero); 356 const __m128i A2 = _mm_or_si128(A1, kMask); 357 const __m128i A3 = _mm_shufflelo_epi16(A2, _MM_SHUFFLE(2, 3, 3, 3)); 358 const __m128i A4 = _mm_shufflehi_epi16(A3, _MM_SHUFFLE(2, 3, 3, 3)); 359 // here, A4 = [ff a0 a0 a0][ff a1 a1 a1] 360 const __m128i A5 = _mm_mullo_epi16(A4, A1); 361 const __m128i A6 = _mm_add_epi16(A5, k128); 362 const __m128i A7 = _mm_mulhi_epu16(A6, kMult); 363 const __m128i A10 = _mm_packus_epi16(A7, zero); 364 _mm_storel_epi64((__m128i*)&ptr[x], A10); 365 } 366 } 367 width -= x; 368 if (width > 0) WebPMultARGBRow_C(ptr + x, width, inverse); 369 } 370 371 static void MultRow_SSE2(uint8_t* WEBP_RESTRICT const ptr, 372 const uint8_t* WEBP_RESTRICT const alpha, 373 int width, int inverse) { 374 int x = 0; 375 if (!inverse) { 376 const __m128i zero = _mm_setzero_si128(); 377 const __m128i k128 = _mm_set1_epi16(128); 378 const __m128i kMult = _mm_set1_epi16(0x0101); 379 for (x = 0; x + 8 <= width; x += 8) { 380 const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]); 381 const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[x]); 382 const __m128i v1 = _mm_unpacklo_epi8(v0, zero); 383 const __m128i a1 = _mm_unpacklo_epi8(a0, zero); 384 const __m128i v2 = _mm_mullo_epi16(v1, a1); 385 const __m128i v3 = _mm_add_epi16(v2, k128); 386 const __m128i v4 = _mm_mulhi_epu16(v3, kMult); 387 const __m128i v5 = _mm_packus_epi16(v4, zero); 388 _mm_storel_epi64((__m128i*)&ptr[x], v5); 389 } 390 } 391 width -= x; 392 if (width > 0) WebPMultRow_C(ptr + x, alpha + x, width, inverse); 393 } 394 395 //------------------------------------------------------------------------------ 396 // Entry point 397 398 extern void WebPInitAlphaProcessingSSE2(void); 399 400 WEBP_TSAN_IGNORE_FUNCTION void WebPInitAlphaProcessingSSE2(void) { 401 WebPMultARGBRow = MultARGBRow_SSE2; 402 WebPMultRow = MultRow_SSE2; 403 WebPApplyAlphaMultiply = ApplyAlphaMultiply_SSE2; 404 WebPDispatchAlpha = DispatchAlpha_SSE2; 405 WebPDispatchAlphaToGreen = DispatchAlphaToGreen_SSE2; 406 WebPExtractAlpha = ExtractAlpha_SSE2; 407 WebPExtractGreen = ExtractGreen_SSE2; 408 409 WebPHasAlpha8b = HasAlpha8b_SSE2; 410 WebPHasAlpha32b = HasAlpha32b_SSE2; 411 WebPAlphaReplace = AlphaReplace_SSE2; 412 } 413 414 #else // !WEBP_USE_SSE2 415 416 WEBP_DSP_INIT_STUB(WebPInitAlphaProcessingSSE2) 417 418 #endif // WEBP_USE_SSE2