tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

yuv_sse2.c (30298B)


      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // YUV->RGB conversion functions
     11 //
     12 // Author: Skal (pascal.massimino@gmail.com)
     13 
     14 #include "src/dsp/yuv.h"
     15 
     16 #if defined(WEBP_USE_SSE2)
     17 #include <emmintrin.h>
     18 
     19 #include <stdlib.h>
     20 
     21 #include "src/dsp/common_sse2.h"
     22 #include "src/dsp/cpu.h"
     23 #include "src/dsp/dsp.h"
     24 #include "src/utils/utils.h"
     25 #include "src/webp/decode.h"
     26 #include "src/webp/types.h"
     27 
     28 //-----------------------------------------------------------------------------
     29 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
     30 
     // Fixed-point YUV -> RGB conversion of eight pixels at once.
     // The multipliers are 14b fixed-point versions of the ITU-R BT.601 constants:
     //   R = (19077 * y             + 26149 * v - 14234) >> 6
     //   G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
     //   B = (19077 * y + 33050 * u             - 17685) >> 6
     // Y0/U0/V0 are read as eight unsigned 16b lanes each; the loaders in this file
     // place every sample in the upper byte of its lane (sample << 8), so that
     // _mm_mulhi_epu16() effectively computes (sample * coeff) >> 8.
     // R/G/B receive eight 16b values each. They are NOT clamped to [0, 255] here.
     static void ConvertYUV444ToRGB_SSE2(const __m128i* const Y0,
                                         const __m128i* const U0,
                                         const __m128i* const V0,
                                         __m128i* const R,
                                         __m128i* const G,
                                         __m128i* const B) {
       // Shared luma contribution.
       const __m128i luma = _mm_mulhi_epu16(*Y0, _mm_set1_epi16(19077));

       // Red: (luma - 14234) + 26149 * v, in wrapping 16b arithmetic.
       const __m128i red_v = _mm_mulhi_epu16(*V0, _mm_set1_epi16(26149));
       const __m128i red_base = _mm_sub_epi16(luma, _mm_set1_epi16(14234));
       const __m128i red = _mm_add_epi16(red_base, red_v);

       // Green: (luma + 8708) - (6419 * u + 13320 * v).
       const __m128i green_u = _mm_mulhi_epu16(*U0, _mm_set1_epi16(6419));
       const __m128i green_v = _mm_mulhi_epu16(*V0, _mm_set1_epi16(13320));
       const __m128i green_base = _mm_add_epi16(luma, _mm_set1_epi16(8708));
       const __m128i green_uv = _mm_add_epi16(green_u, green_v);
       const __m128i green = _mm_sub_epi16(green_base, green_uv);

       // Blue: 33050 does not fit in a signed short, so the constant is only ever
       // used with unsigned operations, and the add/subtract are the *saturating
       // unsigned* variants.
       const __m128i blue_u = _mm_mulhi_epu16(*U0, _mm_set1_epi16((short)33050));
       const __m128i blue_sum = _mm_adds_epu16(blue_u, luma);
       const __m128i blue = _mm_subs_epu16(blue_sum, _mm_set1_epi16(17685));

       // Red and green are signed quantities (arithmetic shift). Blue can exceed
       // 32767 and therefore needs a logical shift.
       *R = _mm_srai_epi16(red, 6);     // range: [-14234, 30815]
       *G = _mm_srai_epi16(green, 6);   // range: [-10953, 27710]
       *B = _mm_srli_epi16(blue, 6);    // range: [0, 34238]
     }
     73 
     74 // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
     75 static WEBP_INLINE __m128i Load_HI_16_SSE2(const uint8_t* src) {
     76  const __m128i zero = _mm_setzero_si128();
     77  return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
     78 }
     79 
     80 // Load and replicate the U/V samples
     81 static WEBP_INLINE __m128i Load_UV_HI_8_SSE2(const uint8_t* src) {
     82  const __m128i zero = _mm_setzero_si128();
     83  const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src));
     84  const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
     85  return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples
     86 }
     87 
     88 // Convert 32 samples of YUV444 to R/G/B
     89 static void YUV444ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
     90                             const uint8_t* WEBP_RESTRICT const u,
     91                             const uint8_t* WEBP_RESTRICT const v,
     92                             __m128i* const R, __m128i* const G,
     93                             __m128i* const B) {
     94  const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_HI_16_SSE2(u),
     95                V0 = Load_HI_16_SSE2(v);
     96  ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
     97 }
     98 
     99 // Convert 32 samples of YUV420 to R/G/B
    100 static void YUV420ToRGB_SSE2(const uint8_t* WEBP_RESTRICT const y,
    101                             const uint8_t* WEBP_RESTRICT const u,
    102                             const uint8_t* WEBP_RESTRICT const v,
    103                             __m128i* const R, __m128i* const G,
    104                             __m128i* const B) {
    105  const __m128i Y0 = Load_HI_16_SSE2(y), U0 = Load_UV_HI_8_SSE2(u),
    106                V0 = Load_UV_HI_8_SSE2(v);
    107  ConvertYUV444ToRGB_SSE2(&Y0, &U0, &V0, R, G, B);
    108 }
    109 
    110 // Pack R/G/B/A results into 32b output.
    111 static WEBP_INLINE void PackAndStore4_SSE2(const __m128i* const R,
    112                                           const __m128i* const G,
    113                                           const __m128i* const B,
    114                                           const __m128i* const A,
    115                                           uint8_t* WEBP_RESTRICT const dst) {
    116  const __m128i rb = _mm_packus_epi16(*R, *B);
    117  const __m128i ga = _mm_packus_epi16(*G, *A);
    118  const __m128i rg = _mm_unpacklo_epi8(rb, ga);
    119  const __m128i ba = _mm_unpackhi_epi8(rb, ga);
    120  const __m128i RGBA_lo = _mm_unpacklo_epi16(rg, ba);
    121  const __m128i RGBA_hi = _mm_unpackhi_epi16(rg, ba);
    122  _mm_storeu_si128((__m128i*)(dst +  0), RGBA_lo);
    123  _mm_storeu_si128((__m128i*)(dst + 16), RGBA_hi);
    124 }
    125 
    126 // Pack R/G/B/A results into 16b output.
    127 static WEBP_INLINE void PackAndStore4444_SSE2(
    128     const __m128i* const R, const __m128i* const G, const __m128i* const B,
    129     const __m128i* const A, uint8_t* WEBP_RESTRICT const dst) {
    130 #if (WEBP_SWAP_16BIT_CSP == 0)
    131  const __m128i rg0 = _mm_packus_epi16(*R, *G);
    132  const __m128i ba0 = _mm_packus_epi16(*B, *A);
    133 #else
    134  const __m128i rg0 = _mm_packus_epi16(*B, *A);
    135  const __m128i ba0 = _mm_packus_epi16(*R, *G);
    136 #endif
    137  const __m128i mask_0xf0 = _mm_set1_epi8((char)0xf0);
    138  const __m128i rb1 = _mm_unpacklo_epi8(rg0, ba0);  // rbrbrbrbrb...
    139  const __m128i ga1 = _mm_unpackhi_epi8(rg0, ba0);  // gagagagaga...
    140  const __m128i rb2 = _mm_and_si128(rb1, mask_0xf0);
    141  const __m128i ga2 = _mm_srli_epi16(_mm_and_si128(ga1, mask_0xf0), 4);
    142  const __m128i rgba4444 = _mm_or_si128(rb2, ga2);
    143  _mm_storeu_si128((__m128i*)dst, rgba4444);
    144 }
    145 
    146 // Pack R/G/B results into 16b output.
    147 static WEBP_INLINE void PackAndStore565_SSE2(const __m128i* const R,
    148                                             const __m128i* const G,
    149                                             const __m128i* const B,
    150                                             uint8_t* WEBP_RESTRICT const dst) {
    151  const __m128i r0 = _mm_packus_epi16(*R, *R);
    152  const __m128i g0 = _mm_packus_epi16(*G, *G);
    153  const __m128i b0 = _mm_packus_epi16(*B, *B);
    154  const __m128i r1 = _mm_and_si128(r0, _mm_set1_epi8((char)0xf8));
    155  const __m128i b1 = _mm_and_si128(_mm_srli_epi16(b0, 3), _mm_set1_epi8(0x1f));
    156  const __m128i g1 =
    157      _mm_srli_epi16(_mm_and_si128(g0, _mm_set1_epi8((char)0xe0)), 5);
    158  const __m128i g2 = _mm_slli_epi16(_mm_and_si128(g0, _mm_set1_epi8(0x1c)), 3);
    159  const __m128i rg = _mm_or_si128(r1, g1);
    160  const __m128i gb = _mm_or_si128(g2, b1);
    161 #if (WEBP_SWAP_16BIT_CSP == 0)
    162  const __m128i rgb565 = _mm_unpacklo_epi8(rg, gb);
    163 #else
    164  const __m128i rgb565 = _mm_unpacklo_epi8(gb, rg);
    165 #endif
    166  _mm_storeu_si128((__m128i*)dst, rgb565);
    167 }
    168 
    169 // Pack the planar buffers
    170 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
    171 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
    172 static WEBP_INLINE void PlanarTo24b_SSE2(__m128i* const in0, __m128i* const in1,
    173                                         __m128i* const in2, __m128i* const in3,
    174                                         __m128i* const in4, __m128i* const in5,
    175                                         uint8_t* WEBP_RESTRICT const rgb) {
    176  // The input is 6 registers of sixteen 8b but for the sake of explanation,
    177  // let's take 6 registers of four 8b values.
    178  // To pack, we will keep taking one every two 8b integer and move it
    179  // around as follows:
    180  // Input:
    181  //   r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
    182  // Split the 6 registers in two sets of 3 registers: the first set as the even
    183  // 8b bytes, the second the odd ones:
    184  //   r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
    185  // Repeat the same permutations twice more:
    186  //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
    187  //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
    188  VP8PlanarTo24b_SSE2(in0, in1, in2, in3, in4, in5);
    189 
    190  _mm_storeu_si128((__m128i*)(rgb +  0), *in0);
    191  _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
    192  _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
    193  _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
    194  _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
    195  _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
    196 }
    197 
    198 void VP8YuvToRgba32_SSE2(const uint8_t* WEBP_RESTRICT y,
    199                         const uint8_t* WEBP_RESTRICT u,
    200                         const uint8_t* WEBP_RESTRICT v,
    201                         uint8_t* WEBP_RESTRICT dst) {
    202  const __m128i kAlpha = _mm_set1_epi16(255);
    203  int n;
    204  for (n = 0; n < 32; n += 8, dst += 32) {
    205    __m128i R, G, B;
    206    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    207    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
    208  }
    209 }
    210 
    211 void VP8YuvToBgra32_SSE2(const uint8_t* WEBP_RESTRICT y,
    212                         const uint8_t* WEBP_RESTRICT u,
    213                         const uint8_t* WEBP_RESTRICT v,
    214                         uint8_t* WEBP_RESTRICT dst) {
    215  const __m128i kAlpha = _mm_set1_epi16(255);
    216  int n;
    217  for (n = 0; n < 32; n += 8, dst += 32) {
    218    __m128i R, G, B;
    219    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    220    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
    221  }
    222 }
    223 
    224 void VP8YuvToArgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
    225                         const uint8_t* WEBP_RESTRICT u,
    226                         const uint8_t* WEBP_RESTRICT v,
    227                         uint8_t* WEBP_RESTRICT dst) {
    228  const __m128i kAlpha = _mm_set1_epi16(255);
    229  int n;
    230  for (n = 0; n < 32; n += 8, dst += 32) {
    231    __m128i R, G, B;
    232    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    233    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
    234  }
    235 }
    236 
    237 void VP8YuvToRgba444432_SSE2(const uint8_t* WEBP_RESTRICT y,
    238                             const uint8_t* WEBP_RESTRICT u,
    239                             const uint8_t* WEBP_RESTRICT v,
    240                             uint8_t* WEBP_RESTRICT dst) {
    241  const __m128i kAlpha = _mm_set1_epi16(255);
    242  int n;
    243  for (n = 0; n < 32; n += 8, dst += 16) {
    244    __m128i R, G, B;
    245    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    246    PackAndStore4444_SSE2(&R, &G, &B, &kAlpha, dst);
    247  }
    248 }
    249 
    250 void VP8YuvToRgb56532_SSE2(const uint8_t* WEBP_RESTRICT y,
    251                           const uint8_t* WEBP_RESTRICT u,
    252                           const uint8_t* WEBP_RESTRICT v,
    253                           uint8_t* WEBP_RESTRICT dst) {
    254  int n;
    255  for (n = 0; n < 32; n += 8, dst += 16) {
    256    __m128i R, G, B;
    257    YUV444ToRGB_SSE2(y + n, u + n, v + n, &R, &G, &B);
    258    PackAndStore565_SSE2(&R, &G, &B, dst);
    259  }
    260 }
    261 
    262 void VP8YuvToRgb32_SSE2(const uint8_t* WEBP_RESTRICT y,
    263                        const uint8_t* WEBP_RESTRICT u,
    264                        const uint8_t* WEBP_RESTRICT v,
    265                        uint8_t* WEBP_RESTRICT dst) {
    266  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    267  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
    268 
    269  YUV444ToRGB_SSE2(y + 0, u + 0, v + 0, &R0, &G0, &B0);
    270  YUV444ToRGB_SSE2(y + 8, u + 8, v + 8, &R1, &G1, &B1);
    271  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
    272  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
    273 
    274  // Cast to 8b and store as RRRRGGGGBBBB.
    275  rgb0 = _mm_packus_epi16(R0, R1);
    276  rgb1 = _mm_packus_epi16(R2, R3);
    277  rgb2 = _mm_packus_epi16(G0, G1);
    278  rgb3 = _mm_packus_epi16(G2, G3);
    279  rgb4 = _mm_packus_epi16(B0, B1);
    280  rgb5 = _mm_packus_epi16(B2, B3);
    281 
    282  // Pack as RGBRGBRGBRGB.
    283  PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
    284 }
    285 
    286 void VP8YuvToBgr32_SSE2(const uint8_t* WEBP_RESTRICT y,
    287                        const uint8_t* WEBP_RESTRICT u,
    288                        const uint8_t* WEBP_RESTRICT v,
    289                        uint8_t* WEBP_RESTRICT dst) {
    290  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    291  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
    292 
    293  YUV444ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    294  YUV444ToRGB_SSE2(y +  8, u +  8, v +  8, &R1, &G1, &B1);
    295  YUV444ToRGB_SSE2(y + 16, u + 16, v + 16, &R2, &G2, &B2);
    296  YUV444ToRGB_SSE2(y + 24, u + 24, v + 24, &R3, &G3, &B3);
    297 
    298  // Cast to 8b and store as BBBBGGGGRRRR.
    299  bgr0 = _mm_packus_epi16(B0, B1);
    300  bgr1 = _mm_packus_epi16(B2, B3);
    301  bgr2 = _mm_packus_epi16(G0, G1);
    302  bgr3 = _mm_packus_epi16(G2, G3);
    303  bgr4 = _mm_packus_epi16(R0, R1);
    304  bgr5= _mm_packus_epi16(R2, R3);
    305 
    306  // Pack as BGRBGRBGRBGR.
    307  PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
    308 }
    309 
    310 //-----------------------------------------------------------------------------
    311 // Arbitrary-length row conversion functions
    312 
    313 static void YuvToRgbaRow_SSE2(const uint8_t* WEBP_RESTRICT y,
    314                              const uint8_t* WEBP_RESTRICT u,
    315                              const uint8_t* WEBP_RESTRICT v,
    316                              uint8_t* WEBP_RESTRICT dst, int len) {
    317  const __m128i kAlpha = _mm_set1_epi16(255);
    318  int n;
    319  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    320    __m128i R, G, B;
    321    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    322    PackAndStore4_SSE2(&R, &G, &B, &kAlpha, dst);
    323    y += 8;
    324    u += 4;
    325    v += 4;
    326  }
    327  for (; n < len; ++n) {   // Finish off
    328    VP8YuvToRgba(y[0], u[0], v[0], dst);
    329    dst += 4;
    330    y += 1;
    331    u += (n & 1);
    332    v += (n & 1);
    333  }
    334 }
    335 
    336 static void YuvToBgraRow_SSE2(const uint8_t* WEBP_RESTRICT y,
    337                              const uint8_t* WEBP_RESTRICT u,
    338                              const uint8_t* WEBP_RESTRICT v,
    339                              uint8_t* WEBP_RESTRICT dst, int len) {
    340  const __m128i kAlpha = _mm_set1_epi16(255);
    341  int n;
    342  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    343    __m128i R, G, B;
    344    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    345    PackAndStore4_SSE2(&B, &G, &R, &kAlpha, dst);
    346    y += 8;
    347    u += 4;
    348    v += 4;
    349  }
    350  for (; n < len; ++n) {   // Finish off
    351    VP8YuvToBgra(y[0], u[0], v[0], dst);
    352    dst += 4;
    353    y += 1;
    354    u += (n & 1);
    355    v += (n & 1);
    356  }
    357 }
    358 
    359 static void YuvToArgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
    360                              const uint8_t* WEBP_RESTRICT u,
    361                              const uint8_t* WEBP_RESTRICT v,
    362                              uint8_t* WEBP_RESTRICT dst, int len) {
    363  const __m128i kAlpha = _mm_set1_epi16(255);
    364  int n;
    365  for (n = 0; n + 8 <= len; n += 8, dst += 32) {
    366    __m128i R, G, B;
    367    YUV420ToRGB_SSE2(y, u, v, &R, &G, &B);
    368    PackAndStore4_SSE2(&kAlpha, &R, &G, &B, dst);
    369    y += 8;
    370    u += 4;
    371    v += 4;
    372  }
    373  for (; n < len; ++n) {   // Finish off
    374    VP8YuvToArgb(y[0], u[0], v[0], dst);
    375    dst += 4;
    376    y += 1;
    377    u += (n & 1);
    378    v += (n & 1);
    379  }
    380 }
    381 
    382 static void YuvToRgbRow_SSE2(const uint8_t* WEBP_RESTRICT y,
    383                             const uint8_t* WEBP_RESTRICT u,
    384                             const uint8_t* WEBP_RESTRICT v,
    385                             uint8_t* WEBP_RESTRICT dst, int len) {
    386  int n;
    387  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    388    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    389    __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
    390 
    391    YUV420ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    392    YUV420ToRGB_SSE2(y +  8, u +  4, v +  4, &R1, &G1, &B1);
    393    YUV420ToRGB_SSE2(y + 16, u +  8, v +  8, &R2, &G2, &B2);
    394    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
    395 
    396    // Cast to 8b and store as RRRRGGGGBBBB.
    397    rgb0 = _mm_packus_epi16(R0, R1);
    398    rgb1 = _mm_packus_epi16(R2, R3);
    399    rgb2 = _mm_packus_epi16(G0, G1);
    400    rgb3 = _mm_packus_epi16(G2, G3);
    401    rgb4 = _mm_packus_epi16(B0, B1);
    402    rgb5 = _mm_packus_epi16(B2, B3);
    403 
    404    // Pack as RGBRGBRGBRGB.
    405    PlanarTo24b_SSE2(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
    406 
    407    y += 32;
    408    u += 16;
    409    v += 16;
    410  }
    411  for (; n < len; ++n) {   // Finish off
    412    VP8YuvToRgb(y[0], u[0], v[0], dst);
    413    dst += 3;
    414    y += 1;
    415    u += (n & 1);
    416    v += (n & 1);
    417  }
    418 }
    419 
    420 static void YuvToBgrRow_SSE2(const uint8_t* WEBP_RESTRICT y,
    421                             const uint8_t* WEBP_RESTRICT u,
    422                             const uint8_t* WEBP_RESTRICT v,
    423                             uint8_t* WEBP_RESTRICT dst, int len) {
    424  int n;
    425  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    426    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    427    __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
    428 
    429    YUV420ToRGB_SSE2(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    430    YUV420ToRGB_SSE2(y +  8, u +  4, v +  4, &R1, &G1, &B1);
    431    YUV420ToRGB_SSE2(y + 16, u +  8, v +  8, &R2, &G2, &B2);
    432    YUV420ToRGB_SSE2(y + 24, u + 12, v + 12, &R3, &G3, &B3);
    433 
    434    // Cast to 8b and store as BBBBGGGGRRRR.
    435    bgr0 = _mm_packus_epi16(B0, B1);
    436    bgr1 = _mm_packus_epi16(B2, B3);
    437    bgr2 = _mm_packus_epi16(G0, G1);
    438    bgr3 = _mm_packus_epi16(G2, G3);
    439    bgr4 = _mm_packus_epi16(R0, R1);
    440    bgr5 = _mm_packus_epi16(R2, R3);
    441 
    442    // Pack as BGRBGRBGRBGR.
    443    PlanarTo24b_SSE2(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
    444 
    445    y += 32;
    446    u += 16;
    447    v += 16;
    448  }
    449  for (; n < len; ++n) {   // Finish off
    450    VP8YuvToBgr(y[0], u[0], v[0], dst);
    451    dst += 3;
    452    y += 1;
    453    u += (n & 1);
    454    v += (n & 1);
    455  }
    456 }
    457 
    458 //------------------------------------------------------------------------------
    459 // Entry point
    460 
    461 extern void WebPInitSamplersSSE2(void);
    462 
    463 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE2(void) {
    464  WebPSamplers[MODE_RGB]  = YuvToRgbRow_SSE2;
    465  WebPSamplers[MODE_RGBA] = YuvToRgbaRow_SSE2;
    466  WebPSamplers[MODE_BGR]  = YuvToBgrRow_SSE2;
    467  WebPSamplers[MODE_BGRA] = YuvToBgraRow_SSE2;
    468  WebPSamplers[MODE_ARGB] = YuvToArgbRow_SSE2;
    469 }
    470 
    471 //------------------------------------------------------------------------------
    472 // RGB24/32 -> YUV converters
    473 
    // Unaligned load of one 128b register (eight 16b-words) from *PTR.
    #define LOAD_16(PTR) _mm_loadu_si128((const __m128i*)(PTR))
    // Unaligned store of one 128b register (eight 16b-words) REG into *PTR.
    #define STORE_16(REG, PTR) _mm_storeu_si128((__m128i*)(PTR), (REG))
    478 
    479 // Function that inserts a value of the second half of the in buffer in between
    480 // every two char of the first half.
    481 static WEBP_INLINE void RGB24PackedToPlanarHelper_SSE2(
    482    const __m128i* const in /*in[6]*/, __m128i* const out /*out[6]*/) {
    483  out[0] = _mm_unpacklo_epi8(in[0], in[3]);
    484  out[1] = _mm_unpackhi_epi8(in[0], in[3]);
    485  out[2] = _mm_unpacklo_epi8(in[1], in[4]);
    486  out[3] = _mm_unpackhi_epi8(in[1], in[4]);
    487  out[4] = _mm_unpacklo_epi8(in[2], in[5]);
    488  out[5] = _mm_unpackhi_epi8(in[2], in[5]);
    489 }
    490 
    491 // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
    492 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
    493 // Similar to PlanarTo24bHelper(), but in reverse order.
    494 static WEBP_INLINE void RGB24PackedToPlanar_SSE2(
    495    const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
    496  __m128i tmp[6];
    497  tmp[0] = _mm_loadu_si128((const __m128i*)(rgb +  0));
    498  tmp[1] = _mm_loadu_si128((const __m128i*)(rgb + 16));
    499  tmp[2] = _mm_loadu_si128((const __m128i*)(rgb + 32));
    500  tmp[3] = _mm_loadu_si128((const __m128i*)(rgb + 48));
    501  tmp[4] = _mm_loadu_si128((const __m128i*)(rgb + 64));
    502  tmp[5] = _mm_loadu_si128((const __m128i*)(rgb + 80));
    503 
    504  RGB24PackedToPlanarHelper_SSE2(tmp, out);
    505  RGB24PackedToPlanarHelper_SSE2(out, tmp);
    506  RGB24PackedToPlanarHelper_SSE2(tmp, out);
    507  RGB24PackedToPlanarHelper_SSE2(out, tmp);
    508  RGB24PackedToPlanarHelper_SSE2(tmp, out);
    509 }
    510 
    511 // Convert 8 packed ARGB to r[], g[], b[]
    512 static WEBP_INLINE void RGB32PackedToPlanar_SSE2(
    513    const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
    514  const __m128i zero = _mm_setzero_si128();
    515  __m128i a0 = LOAD_16(argb + 0);
    516  __m128i a1 = LOAD_16(argb + 4);
    517  __m128i a2 = LOAD_16(argb + 8);
    518  __m128i a3 = LOAD_16(argb + 12);
    519  VP8L32bToPlanar_SSE2(&a0, &a1, &a2, &a3);
    520  rgb[0] = _mm_unpacklo_epi8(a1, zero);
    521  rgb[1] = _mm_unpackhi_epi8(a1, zero);
    522  rgb[2] = _mm_unpacklo_epi8(a2, zero);
    523  rgb[3] = _mm_unpackhi_epi8(a2, zero);
    524  rgb[4] = _mm_unpacklo_epi8(a3, zero);
    525  rgb[5] = _mm_unpackhi_epi8(a3, zero);
    526 }
    527 
    // Computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX for the
    // 16b pairs held in the *_LO / *_HI registers, then packs the 32b results
    // back to eight 16b values (signed saturation) in OUT.
    // This is a macro and not a function because _mm_srai_epi32() has to be
    // given an immediate value as shift count.
    #define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB,        \
                      ROUNDER, DESCALE_FIX, OUT) do {                      \
      const __m128i madd_rg_lo = _mm_madd_epi16(RG_LO, MULT_RG);           \
      const __m128i madd_rg_hi = _mm_madd_epi16(RG_HI, MULT_RG);           \
      const __m128i madd_gb_lo = _mm_madd_epi16(GB_LO, MULT_GB);           \
      const __m128i madd_gb_hi = _mm_madd_epi16(GB_HI, MULT_GB);           \
      const __m128i acc_lo =                                               \
          _mm_add_epi32(_mm_add_epi32(madd_rg_lo, madd_gb_lo), ROUNDER);   \
      const __m128i acc_hi =                                               \
          _mm_add_epi32(_mm_add_epi32(madd_rg_hi, madd_gb_hi), ROUNDER);   \
      const __m128i res_lo = _mm_srai_epi32(acc_lo, DESCALE_FIX);          \
      const __m128i res_hi = _mm_srai_epi32(acc_hi, DESCALE_FIX);          \
      (OUT) = _mm_packs_epi32(res_lo, res_hi);                             \
    } while (0)

    // Builds a register of eight 16b lanes alternating EVEN (lanes 0, 2, 4, 6)
    // and ODD (lanes 1, 3, 5, 7), matching the pair layout _mm_madd_epi16() uses.
    #define MK_CST_16(EVEN, ODD) \
      _mm_set_epi16((ODD), (EVEN), (ODD), (EVEN), (ODD), (EVEN), (ODD), (EVEN))
    547 static WEBP_INLINE void ConvertRGBToY_SSE2(const __m128i* const R,
    548                                           const __m128i* const G,
    549                                           const __m128i* const B,
    550                                           __m128i* const Y) {
    551  const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
    552  const __m128i kGB_y = MK_CST_16(16384, 6420);
    553  const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
    554 
    555  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
    556  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
    557  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
    558  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
    559  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
    560 }
    561 
    562 static WEBP_INLINE void ConvertRGBToUV_SSE2(const __m128i* const R,
    563                                            const __m128i* const G,
    564                                            const __m128i* const B,
    565                                            __m128i* const U,
    566                                            __m128i* const V) {
    567  const __m128i kRG_u = MK_CST_16(-9719, -19081);
    568  const __m128i kGB_u = MK_CST_16(0, 28800);
    569  const __m128i kRG_v = MK_CST_16(28800, 0);
    570  const __m128i kGB_v = MK_CST_16(-24116, -4684);
    571  const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
    572 
    573  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
    574  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
    575  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
    576  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
    577  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
    578            kHALF_UV, YUV_FIX + 2, *U);
    579  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
    580            kHALF_UV, YUV_FIX + 2, *V);
    581 }
    582 
    583 #undef MK_CST_16
    584 #undef TRANSFORM
    585 
    586 static void ConvertRGB24ToY_SSE2(const uint8_t* WEBP_RESTRICT rgb,
    587                                 uint8_t* WEBP_RESTRICT y, int width) {
    588  const int max_width = width & ~31;
    589  int i;
    590  for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
    591    __m128i rgb_plane[6];
    592    int j;
    593 
    594    RGB24PackedToPlanar_SSE2(rgb, rgb_plane);
    595 
    596    for (j = 0; j < 2; ++j, i += 16) {
    597      const __m128i zero = _mm_setzero_si128();
    598      __m128i r, g, b, Y0, Y1;
    599 
    600      // Convert to 16-bit Y.
    601      r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
    602      g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
    603      b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
    604      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
    605 
    606      // Convert to 16-bit Y.
    607      r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
    608      g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
    609      b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
    610      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
    611 
    612      // Cast to 8-bit and store.
    613      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    614    }
    615  }
    616  for (; i < width; ++i, rgb += 3) {   // left-over
    617    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
    618  }
    619 }
    620 
    621 static void ConvertBGR24ToY_SSE2(const uint8_t* WEBP_RESTRICT bgr,
    622                                 uint8_t* WEBP_RESTRICT y, int width) {
    623  const int max_width = width & ~31;
    624  int i;
    625  for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
    626    __m128i bgr_plane[6];
    627    int j;
    628 
    629    RGB24PackedToPlanar_SSE2(bgr, bgr_plane);
    630 
    631    for (j = 0; j < 2; ++j, i += 16) {
    632      const __m128i zero = _mm_setzero_si128();
    633      __m128i r, g, b, Y0, Y1;
    634 
    635      // Convert to 16-bit Y.
    636      b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
    637      g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
    638      r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
    639      ConvertRGBToY_SSE2(&r, &g, &b, &Y0);
    640 
    641      // Convert to 16-bit Y.
    642      b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
    643      g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
    644      r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
    645      ConvertRGBToY_SSE2(&r, &g, &b, &Y1);
    646 
    647      // Cast to 8-bit and store.
    648      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    649    }
    650  }
    651  for (; i < width; ++i, bgr += 3) {  // left-over
    652    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
    653  }
    654 }
    655 
    656 static void ConvertARGBToY_SSE2(const uint32_t* WEBP_RESTRICT argb,
    657                                uint8_t* WEBP_RESTRICT y, int width) {
    658  const int max_width = width & ~15;
    659  int i;
    660  for (i = 0; i < max_width; i += 16) {
    661    __m128i Y0, Y1, rgb[6];
    662    RGB32PackedToPlanar_SSE2(&argb[i], rgb);
    663    ConvertRGBToY_SSE2(&rgb[0], &rgb[2], &rgb[4], &Y0);
    664    ConvertRGBToY_SSE2(&rgb[1], &rgb[3], &rgb[5], &Y1);
    665    STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    666  }
    667  for (; i < width; ++i) {   // left-over
    668    const uint32_t p = argb[i];
    669    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
    670                     YUV_HALF);
    671  }
    672 }
    673 
    // Doubled horizontal add of adjacent 16b pairs, with 16b results:
    //   in: a0 | a1 | a2 | a3 | ...  ->  out: 2*(a0+a1) | 2*(a2+a3) | ...
    // The four sums from *A fill the lower half of *out and the four sums from
    // *B the upper half. Both inputs are read before *out is written, so 'out'
    // is allowed to alias 'A' or 'B'.
    static void HorizontalAddPack_SSE2(const __m128i* const A,
                                       const __m128i* const B,
                                       __m128i* const out) {
      const __m128i twos = _mm_set1_epi16(2);
      const __m128i sums_a = _mm_madd_epi16(*A, twos);   // 32b: 2*x0 + 2*x1
      const __m128i sums_b = _mm_madd_epi16(*B, twos);
      *out = _mm_packs_epi32(sums_a, sums_b);
    }
    684 
    685 static void ConvertARGBToUV_SSE2(const uint32_t* WEBP_RESTRICT argb,
    686                                 uint8_t* WEBP_RESTRICT u,
    687                                 uint8_t* WEBP_RESTRICT v,
    688                                 int src_width, int do_store) {
    689  const int max_width = src_width & ~31;
    690  int i;
    691  for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
    692    __m128i rgb[6], U0, V0, U1, V1;
    693    RGB32PackedToPlanar_SSE2(&argb[i], rgb);
    694    HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
    695    HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
    696    HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
    697    ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
    698 
    699    RGB32PackedToPlanar_SSE2(&argb[i + 16], rgb);
    700    HorizontalAddPack_SSE2(&rgb[0], &rgb[1], &rgb[0]);
    701    HorizontalAddPack_SSE2(&rgb[2], &rgb[3], &rgb[2]);
    702    HorizontalAddPack_SSE2(&rgb[4], &rgb[5], &rgb[4]);
    703    ConvertRGBToUV_SSE2(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
    704 
    705    U0 = _mm_packus_epi16(U0, U1);
    706    V0 = _mm_packus_epi16(V0, V1);
    707    if (!do_store) {
    708      const __m128i prev_u = LOAD_16(u);
    709      const __m128i prev_v = LOAD_16(v);
    710      U0 = _mm_avg_epu8(U0, prev_u);
    711      V0 = _mm_avg_epu8(V0, prev_v);
    712    }
    713    STORE_16(U0, u);
    714    STORE_16(V0, v);
    715  }
    716  if (i < src_width) {  // left-over
    717    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
    718  }
    719 }
    720 
    721 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
    722 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE2(
    723    const uint16_t* WEBP_RESTRICT const rgbx,
    724    __m128i* const r, __m128i* const g, __m128i* const b) {
    725  const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
    726  const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
    727  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
    728  const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
    729  // column-wise transpose
    730  const __m128i A0 = _mm_unpacklo_epi16(in0, in1);
    731  const __m128i A1 = _mm_unpackhi_epi16(in0, in1);
    732  const __m128i A2 = _mm_unpacklo_epi16(in2, in3);
    733  const __m128i A3 = _mm_unpackhi_epi16(in2, in3);
    734  const __m128i B0 = _mm_unpacklo_epi16(A0, A1);  // r0 r1 r2 r3 | g0 g1 ..
    735  const __m128i B1 = _mm_unpackhi_epi16(A0, A1);  // b0 b1 b2 b3 | x x x x
    736  const __m128i B2 = _mm_unpacklo_epi16(A2, A3);  // r4 r5 r6 r7 | g4 g5 ..
    737  const __m128i B3 = _mm_unpackhi_epi16(A2, A3);  // b4 b5 b6 b7 | x x x x
    738  *r = _mm_unpacklo_epi64(B0, B2);
    739  *g = _mm_unpackhi_epi64(B0, B2);
    740  *b = _mm_unpacklo_epi64(B1, B3);
    741 }
    742 
    743 static void ConvertRGBA32ToUV_SSE2(const uint16_t* WEBP_RESTRICT rgb,
    744                                   uint8_t* WEBP_RESTRICT u,
    745                                   uint8_t* WEBP_RESTRICT v, int width) {
    746  const int max_width = width & ~15;
    747  const uint16_t* const last_rgb = rgb + 4 * max_width;
    748  while (rgb < last_rgb) {
    749    __m128i r, g, b, U0, V0, U1, V1;
    750    RGBA32PackedToPlanar_16b_SSE2(rgb +  0, &r, &g, &b);
    751    ConvertRGBToUV_SSE2(&r, &g, &b, &U0, &V0);
    752    RGBA32PackedToPlanar_16b_SSE2(rgb + 32, &r, &g, &b);
    753    ConvertRGBToUV_SSE2(&r, &g, &b, &U1, &V1);
    754    STORE_16(_mm_packus_epi16(U0, U1), u);
    755    STORE_16(_mm_packus_epi16(V0, V1), v);
    756    u += 16;
    757    v += 16;
    758    rgb += 2 * 32;
    759  }
    760  if (max_width < width) {  // left-over
    761    WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
    762  }
    763 }
    764 
    765 //------------------------------------------------------------------------------
    766 
    767 extern void WebPInitConvertARGBToYUVSSE2(void);
    768 
    769 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE2(void) {
    770  WebPConvertARGBToY = ConvertARGBToY_SSE2;
    771  WebPConvertARGBToUV = ConvertARGBToUV_SSE2;
    772 
    773  WebPConvertRGB24ToY = ConvertRGB24ToY_SSE2;
    774  WebPConvertBGR24ToY = ConvertBGR24ToY_SSE2;
    775 
    776  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE2;
    777 }
    778 
    779 #else  // !WEBP_USE_SSE2
    780 
        // Fallback for builds without WEBP_USE_SSE2: the two init entry points are
        // emitted through WEBP_DSP_INIT_STUB instead of the SSE2 implementations.
        // NOTE(review): the macro is defined outside this file; presumably it
        // expands to an empty function of the given name -- confirm in dsp.h.
    781 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE2)
    782 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE2)
    783 
    784 #endif  // WEBP_USE_SSE2