yuv_sse2.c (30298B)
// Copyright 2014 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// YUV->RGB conversion functions
//
// Author: Skal (pascal.massimino@gmail.com)

#include "src/dsp/yuv.h"

#if defined(WEBP_USE_SSE2)
#include <emmintrin.h>

#include <stdlib.h>

#include "src/dsp/common_sse2.h"
#include "src/dsp/cpu.h"
#include "src/dsp/dsp.h"
#include "src/utils/utils.h"
#include "src/webp/decode.h"
#include "src/webp/types.h"

//-----------------------------------------------------------------------------
// Convert spans of 32 pixels to various RGB formats for the fancy upsampler.

// These constants are 14b fixed-point version of ITU-R BT.601 constants.
//   R = (19077 * y             + 26149 * v - 14234) >> 6
//   G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
//   B = (19077 * y + 33050 * u             - 17685) >> 6
//
// The inputs are expected with the 8b sample stored in the upper byte of each
// 16b lane, so that _mm_mulhi_epu16() leaves 6 fractional bits in the
// products. The outputs are 16b lanes which are not clamped to [0, 255] here.
static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
                                    const __m128i* const U0,
                                    const __m128i* const V0,
                                    __m128i* const R,
                                    __m128i* const G,
                                    __m128i* const B) {
  // Luma term, common to the three channels.
  const __m128i luma = _mm_mulhi_epu16(*Y0, _mm_set1_epi16(19077));

  // Red: (luma - 14234) + 26149 * v
  const __m128i v_to_red = _mm_mulhi_epu16(*V0, _mm_set1_epi16(26149));
  const __m128i red_base = _mm_sub_epi16(luma, _mm_set1_epi16(14234));
  const __m128i red = _mm_add_epi16(red_base, v_to_red);

  // Green: (luma + 8708) - (6419 * u + 13320 * v)
  const __m128i u_to_green = _mm_mulhi_epu16(*U0, _mm_set1_epi16(6419));
  const __m128i v_to_green = _mm_mulhi_epu16(*V0, _mm_set1_epi16(13320));
  const __m128i green_base = _mm_add_epi16(luma, _mm_set1_epi16(8708));
  const __m128i green_chroma = _mm_add_epi16(u_to_green, v_to_green);
  const __m128i green = _mm_sub_epi16(green_base, green_chroma);

  // Blue: 33050 doesn't fit in a signed short, so the constant is only used
  // with unsigned arithmetic. Be careful with the saturated *unsigned*
  // additions and subtractions here!
  const __m128i u_to_blue =
      _mm_mulhi_epu16(*U0, _mm_set1_epi16((short)33050));
  const __m128i blue_sum = _mm_adds_epu16(u_to_blue, luma);
  const __m128i blue = _mm_subs_epu16(blue_sum, _mm_set1_epi16(17685));

  // Arithmetic shift for the signed red and green values, logical shift for
  // blue, which can be larger than 32767.
  *R = _mm_srai_epi16(red, 6);    // range: [-14234, 30815]
  *G = _mm_srai_epi16(green, 6);  // range: [-10953, 27710]
  *B = _mm_srli_epi16(blue, 6);   // range: [0, 34238]
}

// Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
75 static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) { 76 const __m128i zero = _mm_setzero_si128(); 77 return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src)); 78 } 79 80 // Load and replicate the U/V samples 81 static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) { 82 const __m128i zero = _mm_setzero_si128(); 83 const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src)); 84 const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0); 85 return _mm_unpacklo_epi16(tmp1, tmp1); // replicate samples 86 } 87 88 // Convert 32 samples of YUV444 to R/G/B 89 static void YUV444ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y, 90 const uint8_t* WEBP_RESTRICT const u, 91 const uint8_t* WEBP_RESTRICT const v, 92 __m128i* const R, __m128i* const G, 93 __m128i* const B) { 94 const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u), 95 V0 = Load_HI_16_SSE2(v); 96 ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B); 97 } 98 99 // Convert 32 samples of YUV420 to R/G/B 100 static void YUV420ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y, 101 const uint8_t* WEBP_RESTRICT const u, 102 const uint8_t* WEBP_RESTRICT const v, 103 __m128i* const R, __m128i* const G, 104 __m128i* const B) { 105 const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u), 106 V0 = Load_UV_HI_8_SSE2(v); 107 ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B); 108 } 109 110 // Pack R/G/B/A results into 32b output. 
111 static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R, 112 const __m128i* const G, 113 const __m128i* const B, 114 const __m128i* const A, 115 uint8_t* WEBP_RESTRICT const dst) { 116 const __m128i rb = _mm_packus_epi16(*R, *B); 117 const __m128i ga = _mm_packus_epi16(*G, *A); 118 const __m128i rg = _mm_unpacklo_epi8(rb, ga); 119 const __m128i ba = _mm_unpackhi_epi8(rb, ga); 120 const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba); 121 const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba); 122 _mm_storeu_si128((__m128i*)(dst + 0), RGBA_lo); 123 _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi); 124 } 125 126 // Pack R/G/B/A results into 16b output. 127 static WEBP_INLINE void PackAndStore4444_SSE2( 128 const __m128i* const R, const __m128i* const G, const __m128i* const B, 129 const __m128i* const A, uint8_t* WEBP_RESTRICT const dst) { 130 #if (WEBP_SWAP_16BIT_CSP == 0) 131 const __m128i rg0 = _mm_packus_epi16(*R, *G); 132 const __m128i ba0 = _mm_packus_epi16(*B, *A); 133 #else 134 const __m128i rg0 = _mm_packus_epi16(*B, *A); 135 const __m128i ba0 = _mm_packus_epi16(*R, *G); 136 #endif 137 const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0); 138 const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0); // rbrbrbrbrb... 139 const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0); // gagagagaga... 140 const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0); 141 const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4); 142 const __m128i rgba4444 = _mm_or_si128(rb2, ga2); 143 _mm_storeu_si128((__m128i*)dst, rgba4444); 144 } 145 146 // Pack R/G/B results into 16b output. 
147 static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R, 148 const __m128i* const G, 149 const __m128i* const B, 150 uint8_t* WEBP_RESTRICT const dst) { 151 const __m128i r0 = _mm_packus_epi16(*R, *R); 152 const __m128i g0 = _mm_packus_epi16(*G, *G); 153 const __m128i b0 = _mm_packus_epi16(*B, *B); 154 const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8((char)0xf8)); 155 const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f)); 156 const __m128i g1 = 157 _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8((char)0xe0)), 5); 158 const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3); 159 const __m128i rg = _mm_or_si128(r1, g1); 160 const __m128i gb = _mm_or_si128(g2, b1); 161 #if (WEBP_SWAP_16BIT_CSP == 0) 162 const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb); 163 #else 164 const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg); 165 #endif 166 _mm_storeu_si128((__m128i*)dst, rgb565); 167 } 168 169 // Pack the planar buffers 170 // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... 171 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ... 172 static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1, 173 __m128i* const in2, __m128i* const in3, 174 __m128i* const in4, __m128i* const in5, 175 uint8_t* WEBP_RESTRICT const rgb) { 176 // The input is 6 registers of sixteen 8b but for the sake of explanation, 177 // let's take 6 registers of four 8b values. 
178 // To pack, we will keep taking one every two 8b integer and move it 179 // around as follows: 180 // Input: 181 // r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7 182 // Split the 6 registers in two sets of 3 registers: the first set as the even 183 // 8b bytes, the second the odd ones: 184 // r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7 185 // Repeat the same permutations twice more: 186 // r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7 187 // r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7 188 VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5); 189 190 _mm_storeu_si128((__m128i*)(rgb + 0), *in0); 191 _mm_storeu_si128((__m128i*)(rgb + 16), *in1); 192 _mm_storeu_si128((__m128i*)(rgb + 32), *in2); 193 _mm_storeu_si128((__m128i*)(rgb + 48), *in3); 194 _mm_storeu_si128((__m128i*)(rgb + 64), *in4); 195 _mm_storeu_si128((__m128i*)(rgb + 80), *in5); 196 } 197 198 void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y, 199 const uint8_t* WEBP_RESTRICT u, 200 const uint8_t* WEBP_RESTRICT v, 201 uint8_t* WEBP_RESTRICT dst) { 202 const __m128i kAlpha = _mm_set1_epi16(255); 203 int n; 204 for (n = 0; n < 32; n += 8, dst += 32) { 205 __m128i R, G, B; 206 YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); 207 PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst); 208 } 209 } 210 211 void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y, 212 const uint8_t* WEBP_RESTRICT u, 213 const uint8_t* WEBP_RESTRICT v, 214 uint8_t* WEBP_RESTRICT dst) { 215 const __m128i kAlpha = _mm_set1_epi16(255); 216 int n; 217 for (n = 0; n < 32; n += 8, dst += 32) { 218 __m128i R, G, B; 219 YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); 220 PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst); 221 } 222 } 223 224 void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y, 225 const uint8_t* WEBP_RESTRICT u, 226 const uint8_t* WEBP_RESTRICT v, 227 uint8_t* WEBP_RESTRICT dst) { 228 const __m128i kAlpha = _mm_set1_epi16(255); 229 int n; 230 for 
(n = 0; n < 32; n += 8, dst += 32) { 231 __m128i R, G, B; 232 YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); 233 PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst); 234 } 235 } 236 237 void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y, 238 const uint8_t* WEBP_RESTRICT u, 239 const uint8_t* WEBP_RESTRICT v, 240 uint8_t* WEBP_RESTRICT dst) { 241 const __m128i kAlpha = _mm_set1_epi16(255); 242 int n; 243 for (n = 0; n < 32; n += 8, dst += 16) { 244 __m128i R, G, B; 245 YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); 246 PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst); 247 } 248 } 249 250 void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y, 251 const uint8_t* WEBP_RESTRICT u, 252 const uint8_t* WEBP_RESTRICT v, 253 uint8_t* WEBP_RESTRICT dst) { 254 int n; 255 for (n = 0; n < 32; n += 8, dst += 16) { 256 __m128i R, G, B; 257 YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B); 258 PackAndStore565_SSE2(&R, &G, &B, dst); 259 } 260 } 261 262 void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y, 263 const uint8_t* WEBP_RESTRICT u, 264 const uint8_t* WEBP_RESTRICT v, 265 uint8_t* WEBP_RESTRICT dst) { 266 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; 267 __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; 268 269 YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); 270 YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1); 271 YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2); 272 YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3); 273 274 // Cast to 8b and store as RRRRGGGGBBBB. 275 rgb0 = _mm_packus_epi16(R0, R1); 276 rgb1 = _mm_packus_epi16(R2, R3); 277 rgb2 = _mm_packus_epi16(G0, G1); 278 rgb3 = _mm_packus_epi16(G2, G3); 279 rgb4 = _mm_packus_epi16(B0, B1); 280 rgb5 = _mm_packus_epi16(B2, B3); 281 282 // Pack as RGBRGBRGBRGB. 
283 PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); 284 } 285 286 void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y, 287 const uint8_t* WEBP_RESTRICT u, 288 const uint8_t* WEBP_RESTRICT v, 289 uint8_t* WEBP_RESTRICT dst) { 290 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; 291 __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; 292 293 YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); 294 YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1); 295 YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2); 296 YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3); 297 298 // Cast to 8b and store as BBBBGGGGRRRR. 299 bgr0 = _mm_packus_epi16(B0, B1); 300 bgr1 = _mm_packus_epi16(B2, B3); 301 bgr2 = _mm_packus_epi16(G0, G1); 302 bgr3 = _mm_packus_epi16(G2, G3); 303 bgr4 = _mm_packus_epi16(R0, R1); 304 bgr5= _mm_packus_epi16(R2, R3); 305 306 // Pack as BGRBGRBGRBGR. 307 PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); 308 } 309 310 //----------------------------------------------------------------------------- 311 // Arbitrary-length row conversion functions 312 313 static void YuvToRgbaRow_SSE2(const uint8_t* WEBP_RESTRICT y, 314 const uint8_t* WEBP_RESTRICT u, 315 const uint8_t* WEBP_RESTRICT v, 316 uint8_t* WEBP_RESTRICT dst, int len) { 317 const __m128i kAlpha = _mm_set1_epi16(255); 318 int n; 319 for (n = 0; n + 8 <= len; n += 8, dst += 32) { 320 __m128i R, G, B; 321 YUV420ToRGB_SSE2(y, u, v, &R, &G, &B); 322 PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst); 323 y += 8; 324 u += 4; 325 v += 4; 326 } 327 for (; n < len; ++n) { // Finish off 328 VP8YuvToRgba(y[0], u[0], v[0], dst); 329 dst += 4; 330 y += 1; 331 u += (n & 1); 332 v += (n & 1); 333 } 334 } 335 336 static void YuvToBgraRow_SSE2(const uint8_t* WEBP_RESTRICT y, 337 const uint8_t* WEBP_RESTRICT u, 338 const uint8_t* WEBP_RESTRICT v, 339 uint8_t* WEBP_RESTRICT dst, int len) { 340 const __m128i kAlpha = _mm_set1_epi16(255); 341 int n; 342 for (n = 0; n + 8 <= len; n += 8, 
dst += 32) { 343 __m128i R, G, B; 344 YUV420ToRGB_SSE2(y, u, v, &R, &G, &B); 345 PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst); 346 y += 8; 347 u += 4; 348 v += 4; 349 } 350 for (; n < len; ++n) { // Finish off 351 VP8YuvToBgra(y[0], u[0], v[0], dst); 352 dst += 4; 353 y += 1; 354 u += (n & 1); 355 v += (n & 1); 356 } 357 } 358 359 static void YuvToArgbRow_SSE2(const uint8_t* WEBP_RESTRICT y, 360 const uint8_t* WEBP_RESTRICT u, 361 const uint8_t* WEBP_RESTRICT v, 362 uint8_t* WEBP_RESTRICT dst, int len) { 363 const __m128i kAlpha = _mm_set1_epi16(255); 364 int n; 365 for (n = 0; n + 8 <= len; n += 8, dst += 32) { 366 __m128i R, G, B; 367 YUV420ToRGB_SSE2(y, u, v, &R, &G, &B); 368 PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst); 369 y += 8; 370 u += 4; 371 v += 4; 372 } 373 for (; n < len; ++n) { // Finish off 374 VP8YuvToArgb(y[0], u[0], v[0], dst); 375 dst += 4; 376 y += 1; 377 u += (n & 1); 378 v += (n & 1); 379 } 380 } 381 382 static void YuvToRgbRow_SSE2(const uint8_t* WEBP_RESTRICT y, 383 const uint8_t* WEBP_RESTRICT u, 384 const uint8_t* WEBP_RESTRICT v, 385 uint8_t* WEBP_RESTRICT dst, int len) { 386 int n; 387 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { 388 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; 389 __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5; 390 391 YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); 392 YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1); 393 YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2); 394 YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3); 395 396 // Cast to 8b and store as RRRRGGGGBBBB. 397 rgb0 = _mm_packus_epi16(R0, R1); 398 rgb1 = _mm_packus_epi16(R2, R3); 399 rgb2 = _mm_packus_epi16(G0, G1); 400 rgb3 = _mm_packus_epi16(G2, G3); 401 rgb4 = _mm_packus_epi16(B0, B1); 402 rgb5 = _mm_packus_epi16(B2, B3); 403 404 // Pack as RGBRGBRGBRGB. 
405 PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst); 406 407 y += 32; 408 u += 16; 409 v += 16; 410 } 411 for (; n < len; ++n) { // Finish off 412 VP8YuvToRgb(y[0], u[0], v[0], dst); 413 dst += 3; 414 y += 1; 415 u += (n & 1); 416 v += (n & 1); 417 } 418 } 419 420 static void YuvToBgrRow_SSE2(const uint8_t* WEBP_RESTRICT y, 421 const uint8_t* WEBP_RESTRICT u, 422 const uint8_t* WEBP_RESTRICT v, 423 uint8_t* WEBP_RESTRICT dst, int len) { 424 int n; 425 for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) { 426 __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3; 427 __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5; 428 429 YUV420ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0); 430 YUV420ToRGB_SSE2(y + 8, u + 4, v + 4, &R1, &G1, &B1); 431 YUV420ToRGB_SSE2(y + 16, u + 8, v + 8, &R2, &G2, &B2); 432 YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3); 433 434 // Cast to 8b and store as BBBBGGGGRRRR. 435 bgr0 = _mm_packus_epi16(B0, B1); 436 bgr1 = _mm_packus_epi16(B2, B3); 437 bgr2 = _mm_packus_epi16(G0, G1); 438 bgr3 = _mm_packus_epi16(G2, G3); 439 bgr4 = _mm_packus_epi16(R0, R1); 440 bgr5 = _mm_packus_epi16(R2, R3); 441 442 // Pack as BGRBGRBGRBGR. 
443 PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst); 444 445 y += 32; 446 u += 16; 447 v += 16; 448 } 449 for (; n < len; ++n) { // Finish off 450 VP8YuvToBgr(y[0], u[0], v[0], dst); 451 dst += 3; 452 y += 1; 453 u += (n & 1); 454 v += (n & 1); 455 } 456 } 457 458 //------------------------------------------------------------------------------ 459 // Entry point 460 461 extern void WebPInitSamplersSSE2(void); 462 463 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) { 464 WebPSamplers[MODE_RGB] = YuvToRgbRow_SSE2; 465 WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2; 466 WebPSamplers[MODE_BGR] = YuvToBgrRow_SSE2; 467 WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2; 468 WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2; 469 } 470 471 //------------------------------------------------------------------------------ 472 // RGB24/32 -> YUV converters 473 474 // Load eight 16b-words from *src. 475 #define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src)) 476 // Store either 16b-words into *dst 477 #define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V)) 478 479 // Function that inserts a value of the second half of the in buffer in between 480 // every two char of the first half. 481 static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2( 482 const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) { 483 out[0] = _mm_unpacklo_epi8(in[0], in[3]); 484 out[1] = _mm_unpackhi_epi8(in[0], in[3]); 485 out[2] = _mm_unpacklo_epi8(in[1], in[4]); 486 out[3] = _mm_unpackhi_epi8(in[1], in[4]); 487 out[4] = _mm_unpacklo_epi8(in[2], in[5]); 488 out[5] = _mm_unpackhi_epi8(in[2], in[5]); 489 } 490 491 // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers: 492 // rrrr... rrrr... gggg... gggg... bbbb... bbbb.... 493 // Similar to PlanarTo24bHelper(), but in reverse order. 
494 static WEBP_INLINE void RGB24PackedToPlanar_SSE2( 495 const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) { 496 __m128i tmp[6]; 497 tmp[0] = _mm_loadu_si128((const __m128i*)(rgb + 0)); 498 tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16)); 499 tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32)); 500 tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48)); 501 tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64)); 502 tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80)); 503 504 RGB24PackedToPlanarHelper_SSE2(tmp, out); 505 RGB24PackedToPlanarHelper_SSE2(out, tmp); 506 RGB24PackedToPlanarHelper_SSE2(tmp, out); 507 RGB24PackedToPlanarHelper_SSE2(out, tmp); 508 RGB24PackedToPlanarHelper_SSE2(tmp, out); 509 } 510 511 // Convert 8 packed ARGB to r[], g[], b[] 512 static WEBP_INLINE void RGB32PackedToPlanar_SSE2( 513 const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) { 514 const __m128i zero = _mm_setzero_si128(); 515 __m128i a0 = LOAD_16(argb + 0); 516 __m128i a1 = LOAD_16(argb + 4); 517 __m128i a2 = LOAD_16(argb + 8); 518 __m128i a3 = LOAD_16(argb + 12); 519 VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3); 520 rgb[0] = _mm_unpacklo_epi8(a1, zero); 521 rgb[1] = _mm_unpackhi_epi8(a1, zero); 522 rgb[2] = _mm_unpacklo_epi8(a2, zero); 523 rgb[3] = _mm_unpackhi_epi8(a2, zero); 524 rgb[4] = _mm_unpacklo_epi8(a3, zero); 525 rgb[5] = _mm_unpackhi_epi8(a3, zero); 526 } 527 528 // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX 529 // It's a macro and not a function because we need to use immediate values with 530 // srai_epi32, e.g. 
// The sums are computed on 32b values, then packed back (with signed
// saturation) to eight 16b values stored in OUT. The *_LO / *_HI arguments
// are the lower and upper halves of the interleaved 16b inputs.
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
                  ROUNDER, DESCALE_FIX, OUT) do {               \
  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG);         \
  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG);         \
  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB);         \
  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB);         \
  const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo);            \
  const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi);            \
  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER);          \
  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER);          \
  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX);     \
  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX);     \
  (OUT) = _mm_packs_epi32(V5_lo, V5_hi);                        \
} while (0)

// Builds a register of alternating 16b constants: A in the even lanes (lane 0
// being the lowest one) and B in the odd lanes, matching the layout produced
// by _mm_unpack{lo,hi}_epi16() for _mm_madd_epi16().
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))

// Computes eight luma values from eight 16b R, G and B samples:
//   Y = (16839 * R + 33059 * G + 6420 * B + rounder) >> YUV_FIX
// where 'rounder' is (16 << YUV_FIX) + YUV_HALF.
static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
                                           const __m128i* const G,
                                           const __m128i* const B,
                                           __m128i* const Y) {
  // The green coefficient 33059 does not fit in a signed 16b value: it is
  // split as (33059 - 16384) in the R/G pair and 16384 in the G/B pair.
  const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
  const __m128i kGB_y = MK_CST_16(16384, 6420);
  const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);

  // Interleave the samples as r|g and g|b pairs for _mm_madd_epi16().
  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}

// Computes eight U and eight V values from eight 16b R, G and B samples:
//   U = (-9719 * R - 19081 * G + 28800 * B + rounder) >> (YUV_FIX + 2)
//   V = (28800 * R - 24116 * G -  4684 * B + rounder) >> (YUV_FIX + 2)
// where 'rounder' is ((128 << YUV_FIX) + YUV_HALF) << 2.
// NOTE(review): the two extra bits of descaling suggest that the inputs are
// expected to be scaled by 4 (e.g. a sum of four samples, or the output of
// HorizontalAddPack_SSE2()) -- confirm against the callers.
static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
                                            const __m128i* const G,
                                            const __m128i* const B,
                                            __m128i* const U,
                                            __m128i* const V) {
  const __m128i kRG_u = MK_CST_16(-9719, -19081);
  const __m128i kGB_u = MK_CST_16(0, 28800);
  const __m128i kRG_v = MK_CST_16(28800, 0);
  const __m128i kGB_v = MK_CST_16(-24116, -4684);
  const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);

  // Interleave the samples as r|g and g|b pairs for _mm_madd_epi16().
  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
            kHALF_UV, YUV_FIX + 2, *U);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
            kHALF_UV, YUV_FIX + 2, *V);
}

#undef MK_CST_16
#undef TRANSFORM

// Converts 'width' packed RGB pixels (3 bytes each) to 8b luma values in y[].
// Batches of 32 pixels go through the SSE2 path, the remaining ones are
// converted with VP8RGBToY().
static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb,
                                 uint8_t* WEBP_RESTRICT y, int width) {
  const int max_width = width & ~31;
  int i;
  // 'i' is advanced by the inner loop (2 x 16 pixels per outer iteration).
  for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
    __m128i rgb_plane[6];
    int j;

    // rgb_plane[0..1] = R, rgb_plane[2..3] = G, rgb_plane[4..5] = B
    RGB24PackedToPlanar_SSE2(rgb, rgb_plane);

    for (j = 0; j < 2; ++j, i += 16) {
      const __m128i zero = _mm_setzero_si128();
      __m128i r, g, b, Y0, Y1;

      // Convert to 16-bit Y (lower 8 pixels of the register).
      r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
      g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
      b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);

      // Convert to 16-bit Y (upper 8 pixels of the register).
      r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
      g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
      b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);

      // Cast to 8-bit and store.
      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    }
  }
  for (; i < width; ++i, rgb += 3) {  // left-over
    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
  }
}

// Same as ConvertRGB24ToY_SSE2(), for pixels stored in B, G, R byte order.
static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr,
                                 uint8_t* WEBP_RESTRICT y, int width) {
  const int max_width = width & ~31;
  int i;
  // 'i' is advanced by the inner loop (2 x 16 pixels per outer iteration).
  for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
    __m128i bgr_plane[6];
    int j;

    // bgr_plane[0..1] = B, bgr_plane[2..3] = G, bgr_plane[4..5] = R
    RGB24PackedToPlanar_SSE2(bgr, bgr_plane);

    for (j = 0; j < 2; ++j, i += 16) {
      const __m128i zero = _mm_setzero_si128();
      __m128i r, g, b, Y0, Y1;

      // Convert to 16-bit Y (lower 8 pixels of the register).
      b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
      g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
      r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);

      // Convert to 16-bit Y (upper 8 pixels of the register).
      b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
      g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
      r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);

      // Cast to 8-bit and store.
      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    }
  }
  for (; i < width; ++i, bgr += 3) {  // left-over
    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
  }
}

// Converts 'width' 32b ARGB pixels to 8b luma values in y[]. Batches of 16
// pixels go through the SSE2 path, the remaining ones are converted with
// VP8RGBToY().
static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb,
                                uint8_t* WEBP_RESTRICT y, int width) {
  const int max_width = width & ~15;
  int i;
  for (i = 0; i < max_width; i += 16) {
    __m128i Y0, Y1, rgb[6];
    RGB32PackedToPlanar_SSE2(&argb[i], rgb);
    // Even entries hold the first 8 pixels of a channel, odd ones the last 8.
    ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
    ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
    STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
  }
  for (; i < width; ++i) {  // left-over
    const uint32_t p = argb[i];
    // R is taken from bits 16..23, G from bits 8..15 and B from bits 0..7.
    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
                     YUV_HALF);
  }
}

// Horizontal add (doubled) of two 16b values, result is 16b.
// in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
676 static void HorizontalAddPack_SSE2(const __m128i* const A, 677 const __m128i* const B, 678 __m128i* const out) { 679 const __m128i k2 = _mm_set1_epi16(2); 680 const __m128i C = _mm_madd_epi16(*A, k2); 681 const __m128i D = _mm_madd_epi16(*B, k2); 682 *out = _mm_packs_epi32(C, D); 683 } 684 685 static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb, 686 uint8_t* WEBP_RESTRICT u, 687 uint8_t* WEBP_RESTRICT v, 688 int src_width, int do_store) { 689 const int max_width = src_width & ~31; 690 int i; 691 for (i = 0; i < max_width; i += 32, u += 16, v += 16) { 692 __m128i rgb[6], U0, V0, U1, V1; 693 RGB32PackedToPlanar_SSE2(&argb[i], rgb); 694 HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]); 695 HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]); 696 HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]); 697 ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0); 698 699 RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb); 700 HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]); 701 HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]); 702 HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]); 703 ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1); 704 705 U0 = _mm_packus_epi16(U0, U1); 706 V0 = _mm_packus_epi16(V0, V1); 707 if (!do_store) { 708 const __m128i prev_u = LOAD_16(u); 709 const __m128i prev_v = LOAD_16(v); 710 U0 = _mm_avg_epu8(U0, prev_u); 711 V0 = _mm_avg_epu8(V0, prev_v); 712 } 713 STORE_16(U0, u); 714 STORE_16(V0, v); 715 } 716 if (i < src_width) { // left-over 717 WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store); 718 } 719 } 720 721 // Convert 16 packed ARGB 16b-values to r[], g[], b[] 722 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2( 723 const uint16_t* WEBP_RESTRICT const rgbx, 724 __m128i* const r, __m128i* const g, __m128i* const b) { 725 const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x 726 const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x 727 const __m128i in2 = 
LOAD_16(rgbx + 16); // r4 | ... 728 const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ... 729 // column-wise transpose 730 const __m128i A0 = _mm_unpacklo_epi16(in0, in1); 731 const __m128i A1 = _mm_unpackhi_epi16(in0, in1); 732 const __m128i A2 = _mm_unpacklo_epi16(in2, in3); 733 const __m128i A3 = _mm_unpackhi_epi16(in2, in3); 734 const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // r0 r1 r2 r3 | g0 g1 .. 735 const __m128i B1 = _mm_unpackhi_epi16(A0, A1); // b0 b1 b2 b3 | x x x x 736 const __m128i B2 = _mm_unpacklo_epi16(A2, A3); // r4 r5 r6 r7 | g4 g5 .. 737 const __m128i B3 = _mm_unpackhi_epi16(A2, A3); // b4 b5 b6 b7 | x x x x 738 *r = _mm_unpacklo_epi64(B0, B2); 739 *g = _mm_unpackhi_epi64(B0, B2); 740 *b = _mm_unpacklo_epi64(B1, B3); 741 } 742 743 static void ConvertRGBA32ToUV_SSE2(const uint16_t* WEBP_RESTRICT rgb, 744 uint8_t* WEBP_RESTRICT u, 745 uint8_t* WEBP_RESTRICT v, int width) { 746 const int max_width = width & ~15; 747 const uint16_t* const last_rgb = rgb + 4 * max_width; 748 while (rgb < last_rgb) { 749 __m128i r, g, b, U0, V0, U1, V1; 750 RGBA32PackedToPlanar_16b_SSE2(rgb + 0, &r, &g, &b); 751 ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0); 752 RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b); 753 ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1); 754 STORE_16(_mm_packus_epi16(U0, U1), u); 755 STORE_16(_mm_packus_epi16(V0, V1), v); 756 u += 16; 757 v += 16; 758 rgb += 2 * 32; 759 } 760 if (max_width < width) { // left-over 761 WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width); 762 } 763 } 764 765 //------------------------------------------------------------------------------ 766 767 extern void WebPInitConvertARGBToYUVSSE2(void); 768 769 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) { 770 WebPConvertARGBToY = ConvertARGBToY_SSE2; 771 WebPConvertARGBToUV = ConvertARGBToUV_SSE2; 772 773 WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2; 774 WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2; 775 776 WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2; 777 } 

#else  // !WEBP_USE_SSE2

// SSE2 support is not compiled in: the two entry points of this file are still
// provided, through WEBP_DSP_INIT_STUB.
// NOTE(review): presumably the macro expands to an empty function definition
// so that callers link without SSE2 -- confirm against its definition.
WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)

#endif  // WEBP_USE_SSE2