tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

yuv_sse41.c (24716B)


      1 // Copyright 2014 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // YUV->RGB conversion functions
     11 //
     12 // Author: Skal (pascal.massimino@gmail.com)
     13 
     14 #include "src/dsp/yuv.h"
     15 
     16 #if defined(WEBP_USE_SSE41)
     17 #include <emmintrin.h>
     18 #include <smmintrin.h>
     19 
     20 #include <stdlib.h>
     21 
     22 #include "src/dsp/common_sse41.h"
     23 #include "src/dsp/cpu.h"
     24 #include "src/dsp/dsp.h"
     25 #include "src/utils/utils.h"
     26 #include "src/webp/decode.h"
     27 #include "src/webp/types.h"
     28 
     29 //-----------------------------------------------------------------------------
     30 // Convert spans of 32 pixels to various RGB formats for the fancy upsampler.
     31 
     32 // These constants are 14b fixed-point version of ITU-R BT.601 constants.
     33 // R = (19077 * y             + 26149 * v - 14234) >> 6
     34 // G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
     35 // B = (19077 * y + 33050 * u             - 17685) >> 6
     36 static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
     37                                     const __m128i* const U0,
     38                                     const __m128i* const V0,
     39                                     __m128i* const R,
     40                                     __m128i* const G,
     41                                     __m128i* const B) {
     42  const __m128i k19077 = _mm_set1_epi16(19077);
     43  const __m128i k26149 = _mm_set1_epi16(26149);
     44  const __m128i k14234 = _mm_set1_epi16(14234);
     45  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
     46  const __m128i k33050 = _mm_set1_epi16((short)33050);
     47  const __m128i k17685 = _mm_set1_epi16(17685);
     48  const __m128i k6419  = _mm_set1_epi16(6419);
     49  const __m128i k13320 = _mm_set1_epi16(13320);
     50  const __m128i k8708  = _mm_set1_epi16(8708);
     51 
     52  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);
     53 
     54  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
     55  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
     56  const __m128i R2 = _mm_add_epi16(R1, R0);
     57 
     58  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
     59  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
     60  const __m128i G2 = _mm_add_epi16(Y1, k8708);
     61  const __m128i G3 = _mm_add_epi16(G0, G1);
     62  const __m128i G4 = _mm_sub_epi16(G2, G3);
     63 
     64  // be careful with the saturated *unsigned* arithmetic here!
     65  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
     66  const __m128i B1 = _mm_adds_epu16(B0, Y1);
     67  const __m128i B2 = _mm_subs_epu16(B1, k17685);
     68 
     69  // use logical shift for B2, which can be larger than 32767
     70  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
     71  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
     72  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
     73 }
     74 
     75 // Load the bytes into the *upper* part of 16b words. That's "<< 8", basically.
     76 static WEBP_INLINE __m128i Load_HI_16_SSE41(const uint8_t* src) {
     77  const __m128i zero = _mm_setzero_si128();
     78  return _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i*)src));
     79 }
     80 
     81 // Load and replicate the U/V samples
     82 static WEBP_INLINE __m128i Load_UV_HI_8_SSE41(const uint8_t* src) {
     83  const __m128i zero = _mm_setzero_si128();
     84  const __m128i tmp0 = _mm_cvtsi32_si128(WebPMemToInt32(src));
     85  const __m128i tmp1 = _mm_unpacklo_epi8(zero, tmp0);
     86  return _mm_unpacklo_epi16(tmp1, tmp1);   // replicate samples
     87 }
     88 
     89 // Convert 32 samples of YUV444 to R/G/B
     90 static void YUV444ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
     91                              const uint8_t* WEBP_RESTRICT const u,
     92                              const uint8_t* WEBP_RESTRICT const v,
     93                              __m128i* const R, __m128i* const G,
     94                              __m128i* const B) {
     95  const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_HI_16_SSE41(u),
     96                V0 = Load_HI_16_SSE41(v);
     97  ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
     98 }
     99 
    100 // Convert 32 samples of YUV420 to R/G/B
    101 static void YUV420ToRGB_SSE41(const uint8_t* WEBP_RESTRICT const y,
    102                              const uint8_t* WEBP_RESTRICT const u,
    103                              const uint8_t* WEBP_RESTRICT const v,
    104                              __m128i* const R, __m128i* const G,
    105                              __m128i* const B) {
    106  const __m128i Y0 = Load_HI_16_SSE41(y), U0 = Load_UV_HI_8_SSE41(u),
    107                V0 = Load_UV_HI_8_SSE41(v);
    108  ConvertYUV444ToRGB_SSE41(&Y0, &U0, &V0, R, G, B);
    109 }
    110 
    111 // Pack the planar buffers
    112 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
    113 // triplet by triplet in the output buffer rgb as rgbrgbrgbrgb ...
    114 static WEBP_INLINE void PlanarTo24b_SSE41(
    115    __m128i* const in0, __m128i* const in1, __m128i* const in2,
    116    __m128i* const in3, __m128i* const in4, __m128i* const in5,
    117    uint8_t* WEBP_RESTRICT const rgb) {
    118  // The input is 6 registers of sixteen 8b but for the sake of explanation,
    119  // let's take 6 registers of four 8b values.
    120  // To pack, we will keep taking one every two 8b integer and move it
    121  // around as follows:
    122  // Input:
    123  //   r0r1r2r3 | r4r5r6r7 | g0g1g2g3 | g4g5g6g7 | b0b1b2b3 | b4b5b6b7
    124  // Split the 6 registers in two sets of 3 registers: the first set as the even
    125  // 8b bytes, the second the odd ones:
    126  //   r0r2r4r6 | g0g2g4g6 | b0b2b4b6 | r1r3r5r7 | g1g3g5g7 | b1b3b5b7
    127  // Repeat the same permutations twice more:
    128  //   r0r4g0g4 | b0b4r1r5 | g1g5b1b5 | r2r6g2g6 | b2b6r3r7 | g3g7b3b7
    129  //   r0g0b0r1 | g1b1r2g2 | b2r3g3b3 | r4g4b4r5 | g5b5r6g6 | b6r7g7b7
    130  VP8PlanarTo24b_SSE41(in0, in1, in2, in3, in4, in5);
    131 
    132  _mm_storeu_si128((__m128i*)(rgb +  0), *in0);
    133  _mm_storeu_si128((__m128i*)(rgb + 16), *in1);
    134  _mm_storeu_si128((__m128i*)(rgb + 32), *in2);
    135  _mm_storeu_si128((__m128i*)(rgb + 48), *in3);
    136  _mm_storeu_si128((__m128i*)(rgb + 64), *in4);
    137  _mm_storeu_si128((__m128i*)(rgb + 80), *in5);
    138 }
    139 
    140 void VP8YuvToRgb32_SSE41(const uint8_t* WEBP_RESTRICT y,
    141                         const uint8_t* WEBP_RESTRICT u,
    142                         const uint8_t* WEBP_RESTRICT v,
    143                         uint8_t* WEBP_RESTRICT dst) {
    144  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    145  __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
    146 
    147  YUV444ToRGB_SSE41(y + 0, u + 0, v + 0, &R0, &G0, &B0);
    148  YUV444ToRGB_SSE41(y + 8, u + 8, v + 8, &R1, &G1, &B1);
    149  YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
    150  YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
    151 
    152  // Cast to 8b and store as RRRRGGGGBBBB.
    153  rgb0 = _mm_packus_epi16(R0, R1);
    154  rgb1 = _mm_packus_epi16(R2, R3);
    155  rgb2 = _mm_packus_epi16(G0, G1);
    156  rgb3 = _mm_packus_epi16(G2, G3);
    157  rgb4 = _mm_packus_epi16(B0, B1);
    158  rgb5 = _mm_packus_epi16(B2, B3);
    159 
    160  // Pack as RGBRGBRGBRGB.
    161  PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
    162 }
    163 
    164 void VP8YuvToBgr32_SSE41(const uint8_t* WEBP_RESTRICT y,
    165                         const uint8_t* WEBP_RESTRICT u,
    166                         const uint8_t* WEBP_RESTRICT v,
    167                         uint8_t* WEBP_RESTRICT dst) {
    168  __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    169  __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
    170 
    171  YUV444ToRGB_SSE41(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    172  YUV444ToRGB_SSE41(y +  8, u +  8, v +  8, &R1, &G1, &B1);
    173  YUV444ToRGB_SSE41(y + 16, u + 16, v + 16, &R2, &G2, &B2);
    174  YUV444ToRGB_SSE41(y + 24, u + 24, v + 24, &R3, &G3, &B3);
    175 
    176  // Cast to 8b and store as BBBBGGGGRRRR.
    177  bgr0 = _mm_packus_epi16(B0, B1);
    178  bgr1 = _mm_packus_epi16(B2, B3);
    179  bgr2 = _mm_packus_epi16(G0, G1);
    180  bgr3 = _mm_packus_epi16(G2, G3);
    181  bgr4 = _mm_packus_epi16(R0, R1);
    182  bgr5= _mm_packus_epi16(R2, R3);
    183 
    184  // Pack as BGRBGRBGRBGR.
    185  PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
    186 }
    187 
    188 //-----------------------------------------------------------------------------
    189 // Arbitrary-length row conversion functions
    190 
    191 static void YuvToRgbRow_SSE41(const uint8_t* WEBP_RESTRICT y,
    192                              const uint8_t* WEBP_RESTRICT u,
    193                              const uint8_t* WEBP_RESTRICT v,
    194                              uint8_t* WEBP_RESTRICT dst, int len) {
    195  int n;
    196  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    197    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    198    __m128i rgb0, rgb1, rgb2, rgb3, rgb4, rgb5;
    199 
    200    YUV420ToRGB_SSE41(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    201    YUV420ToRGB_SSE41(y +  8, u +  4, v +  4, &R1, &G1, &B1);
    202    YUV420ToRGB_SSE41(y + 16, u +  8, v +  8, &R2, &G2, &B2);
    203    YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
    204 
    205    // Cast to 8b and store as RRRRGGGGBBBB.
    206    rgb0 = _mm_packus_epi16(R0, R1);
    207    rgb1 = _mm_packus_epi16(R2, R3);
    208    rgb2 = _mm_packus_epi16(G0, G1);
    209    rgb3 = _mm_packus_epi16(G2, G3);
    210    rgb4 = _mm_packus_epi16(B0, B1);
    211    rgb5 = _mm_packus_epi16(B2, B3);
    212 
    213    // Pack as RGBRGBRGBRGB.
    214    PlanarTo24b_SSE41(&rgb0, &rgb1, &rgb2, &rgb3, &rgb4, &rgb5, dst);
    215 
    216    y += 32;
    217    u += 16;
    218    v += 16;
    219  }
    220  for (; n < len; ++n) {   // Finish off
    221    VP8YuvToRgb(y[0], u[0], v[0], dst);
    222    dst += 3;
    223    y += 1;
    224    u += (n & 1);
    225    v += (n & 1);
    226  }
    227 }
    228 
    229 static void YuvToBgrRow_SSE41(const uint8_t* WEBP_RESTRICT y,
    230                              const uint8_t* WEBP_RESTRICT u,
    231                              const uint8_t* WEBP_RESTRICT v,
    232                              uint8_t* WEBP_RESTRICT dst, int len) {
    233  int n;
    234  for (n = 0; n + 32 <= len; n += 32, dst += 32 * 3) {
    235    __m128i R0, R1, R2, R3, G0, G1, G2, G3, B0, B1, B2, B3;
    236    __m128i bgr0, bgr1, bgr2, bgr3, bgr4, bgr5;
    237 
    238    YUV420ToRGB_SSE41(y +  0, u +  0, v +  0, &R0, &G0, &B0);
    239    YUV420ToRGB_SSE41(y +  8, u +  4, v +  4, &R1, &G1, &B1);
    240    YUV420ToRGB_SSE41(y + 16, u +  8, v +  8, &R2, &G2, &B2);
    241    YUV420ToRGB_SSE41(y + 24, u + 12, v + 12, &R3, &G3, &B3);
    242 
    243    // Cast to 8b and store as BBBBGGGGRRRR.
    244    bgr0 = _mm_packus_epi16(B0, B1);
    245    bgr1 = _mm_packus_epi16(B2, B3);
    246    bgr2 = _mm_packus_epi16(G0, G1);
    247    bgr3 = _mm_packus_epi16(G2, G3);
    248    bgr4 = _mm_packus_epi16(R0, R1);
    249    bgr5 = _mm_packus_epi16(R2, R3);
    250 
    251    // Pack as BGRBGRBGRBGR.
    252    PlanarTo24b_SSE41(&bgr0, &bgr1, &bgr2, &bgr3, &bgr4, &bgr5, dst);
    253 
    254    y += 32;
    255    u += 16;
    256    v += 16;
    257  }
    258  for (; n < len; ++n) {   // Finish off
    259    VP8YuvToBgr(y[0], u[0], v[0], dst);
    260    dst += 3;
    261    y += 1;
    262    u += (n & 1);
    263    v += (n & 1);
    264  }
    265 }
    266 
    267 //------------------------------------------------------------------------------
    268 // Entry point
    269 
    270 extern void WebPInitSamplersSSE41(void);
    271 
    272 WEBP_TSAN_IGNORE_FUNCTION void WebPInitSamplersSSE41(void) {
    273  WebPSamplers[MODE_RGB]  = YuvToRgbRow_SSE41;
    274  WebPSamplers[MODE_BGR]  = YuvToBgrRow_SSE41;
    275 }
    276 
    277 //------------------------------------------------------------------------------
    278 // RGB24/32 -> YUV converters
    279 
    280 // Load eight 16b-words from *src.
    281 #define LOAD_16(src) _mm_loadu_si128((const __m128i*)(src))
     282 // Store eight 16b-words into *dst
    283 #define STORE_16(V, dst) _mm_storeu_si128((__m128i*)(dst), (V))
    284 
    285 #define WEBP_SSE41_SHUFF(OUT)  do {                  \
    286  const __m128i tmp0 = _mm_shuffle_epi8(A0, shuff0); \
    287  const __m128i tmp1 = _mm_shuffle_epi8(A1, shuff1); \
    288  const __m128i tmp2 = _mm_shuffle_epi8(A2, shuff2); \
    289  const __m128i tmp3 = _mm_shuffle_epi8(A3, shuff0); \
    290  const __m128i tmp4 = _mm_shuffle_epi8(A4, shuff1); \
    291  const __m128i tmp5 = _mm_shuffle_epi8(A5, shuff2); \
    292                                                     \
    293  /* OR everything to get one channel */             \
    294  const __m128i tmp6 = _mm_or_si128(tmp0, tmp1);     \
    295  const __m128i tmp7 = _mm_or_si128(tmp3, tmp4);     \
    296  out[OUT + 0] = _mm_or_si128(tmp6, tmp2);           \
    297  out[OUT + 1] = _mm_or_si128(tmp7, tmp5);           \
    298 } while (0);
    299 
    300 // Unpack the 8b input rgbrgbrgbrgb ... as contiguous registers:
    301 // rrrr... rrrr... gggg... gggg... bbbb... bbbb....
    302 // Similar to PlanarTo24bHelper(), but in reverse order.
    303 static WEBP_INLINE void RGB24PackedToPlanar_SSE41(
    304    const uint8_t* WEBP_RESTRICT const rgb, __m128i* const out /*out[6]*/) {
    305  const __m128i A0 = _mm_loadu_si128((const __m128i*)(rgb +  0));
    306  const __m128i A1 = _mm_loadu_si128((const __m128i*)(rgb + 16));
    307  const __m128i A2 = _mm_loadu_si128((const __m128i*)(rgb + 32));
    308  const __m128i A3 = _mm_loadu_si128((const __m128i*)(rgb + 48));
    309  const __m128i A4 = _mm_loadu_si128((const __m128i*)(rgb + 64));
    310  const __m128i A5 = _mm_loadu_si128((const __m128i*)(rgb + 80));
    311 
    312  // Compute RR.
    313  {
    314    const __m128i shuff0 = _mm_set_epi8(
    315        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0);
    316    const __m128i shuff1 = _mm_set_epi8(
    317        -1, -1, -1, -1, -1, 14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1);
    318    const __m128i shuff2 = _mm_set_epi8(
    319        13, 10, 7, 4, 1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    320    WEBP_SSE41_SHUFF(0)
    321  }
    322  // Compute GG.
    323  {
    324    const __m128i shuff0 = _mm_set_epi8(
    325        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1);
    326    const __m128i shuff1 = _mm_set_epi8(
    327        -1, -1, -1, -1, -1, 15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1);
    328    const __m128i shuff2 = _mm_set_epi8(
    329        14, 11, 8, 5, 2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    330    WEBP_SSE41_SHUFF(2)
    331  }
    332  // Compute BB.
    333  {
    334    const __m128i shuff0 = _mm_set_epi8(
    335        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 14, 11, 8, 5, 2);
    336    const __m128i shuff1 = _mm_set_epi8(
    337        -1, -1, -1, -1, -1, -1, 13, 10, 7, 4, 1, -1, -1, -1, -1, -1);
    338    const __m128i shuff2 = _mm_set_epi8(
    339        15, 12, 9, 6, 3, 0, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1);
    340    WEBP_SSE41_SHUFF(4)
    341  }
    342 }
    343 
    344 #undef WEBP_SSE41_SHUFF
    345 
    346 // Convert 8 packed ARGB to r[], g[], b[]
    347 static WEBP_INLINE void RGB32PackedToPlanar_SSE41(
    348    const uint32_t* WEBP_RESTRICT const argb, __m128i* const rgb /*in[6]*/) {
    349  const __m128i zero = _mm_setzero_si128();
    350  __m128i a0 = LOAD_16(argb + 0);
    351  __m128i a1 = LOAD_16(argb + 4);
    352  __m128i a2 = LOAD_16(argb + 8);
    353  __m128i a3 = LOAD_16(argb + 12);
    354  VP8L32bToPlanar_SSE41(&a0, &a1, &a2, &a3);
    355  rgb[0] = _mm_unpacklo_epi8(a1, zero);
    356  rgb[1] = _mm_unpackhi_epi8(a1, zero);
    357  rgb[2] = _mm_unpacklo_epi8(a2, zero);
    358  rgb[3] = _mm_unpackhi_epi8(a2, zero);
    359  rgb[4] = _mm_unpacklo_epi8(a3, zero);
    360  rgb[5] = _mm_unpackhi_epi8(a3, zero);
    361 }
    362 
    363 // This macro computes (RG * MULT_RG + GB * MULT_GB + ROUNDER) >> DESCALE_FIX
    364 // It's a macro and not a function because we need to use immediate values with
    365 // srai_epi32, e.g.
    366 #define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, \
    367                  ROUNDER, DESCALE_FIX, OUT) do {               \
    368  const __m128i V0_lo = _mm_madd_epi16(RG_LO, MULT_RG);         \
    369  const __m128i V0_hi = _mm_madd_epi16(RG_HI, MULT_RG);         \
    370  const __m128i V1_lo = _mm_madd_epi16(GB_LO, MULT_GB);         \
    371  const __m128i V1_hi = _mm_madd_epi16(GB_HI, MULT_GB);         \
    372  const __m128i V2_lo = _mm_add_epi32(V0_lo, V1_lo);            \
    373  const __m128i V2_hi = _mm_add_epi32(V0_hi, V1_hi);            \
    374  const __m128i V3_lo = _mm_add_epi32(V2_lo, ROUNDER);          \
    375  const __m128i V3_hi = _mm_add_epi32(V2_hi, ROUNDER);          \
    376  const __m128i V5_lo = _mm_srai_epi32(V3_lo, DESCALE_FIX);     \
    377  const __m128i V5_hi = _mm_srai_epi32(V3_hi, DESCALE_FIX);     \
    378  (OUT) = _mm_packs_epi32(V5_lo, V5_hi);                        \
    379 } while (0)
    380 
    381 #define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
    382 static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
    383                                            const __m128i* const G,
    384                                            const __m128i* const B,
    385                                            __m128i* const Y) {
    386  const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
    387  const __m128i kGB_y = MK_CST_16(16384, 6420);
    388  const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);
    389 
    390  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
    391  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
    392  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
    393  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
    394  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
    395 }
    396 
    397 static WEBP_INLINE void ConvertRGBToUV_SSE41(const __m128i* const R,
    398                                             const __m128i* const G,
    399                                             const __m128i* const B,
    400                                             __m128i* const U,
    401                                             __m128i* const V) {
    402  const __m128i kRG_u = MK_CST_16(-9719, -19081);
    403  const __m128i kGB_u = MK_CST_16(0, 28800);
    404  const __m128i kRG_v = MK_CST_16(28800, 0);
    405  const __m128i kGB_v = MK_CST_16(-24116, -4684);
    406  const __m128i kHALF_UV = _mm_set1_epi32(((128 << YUV_FIX) + YUV_HALF) << 2);
    407 
    408  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
    409  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
    410  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
    411  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
    412  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_u, kGB_u,
    413            kHALF_UV, YUV_FIX + 2, *U);
    414  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_v, kGB_v,
    415            kHALF_UV, YUV_FIX + 2, *V);
    416 }
    417 
    418 #undef MK_CST_16
    419 #undef TRANSFORM
    420 
    421 static void ConvertRGB24ToY_SSE41(const uint8_t* WEBP_RESTRICT rgb,
    422                                  uint8_t* WEBP_RESTRICT y, int width) {
    423  const int max_width = width & ~31;
    424  int i;
    425  for (i = 0; i < max_width; rgb += 3 * 16 * 2) {
    426    __m128i rgb_plane[6];
    427    int j;
    428 
    429    RGB24PackedToPlanar_SSE41(rgb, rgb_plane);
    430 
    431    for (j = 0; j < 2; ++j, i += 16) {
    432      const __m128i zero = _mm_setzero_si128();
    433      __m128i r, g, b, Y0, Y1;
    434 
    435      // Convert to 16-bit Y.
    436      r = _mm_unpacklo_epi8(rgb_plane[0 + j], zero);
    437      g = _mm_unpacklo_epi8(rgb_plane[2 + j], zero);
    438      b = _mm_unpacklo_epi8(rgb_plane[4 + j], zero);
    439      ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
    440 
    441      // Convert to 16-bit Y.
    442      r = _mm_unpackhi_epi8(rgb_plane[0 + j], zero);
    443      g = _mm_unpackhi_epi8(rgb_plane[2 + j], zero);
    444      b = _mm_unpackhi_epi8(rgb_plane[4 + j], zero);
    445      ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
    446 
    447      // Cast to 8-bit and store.
    448      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    449    }
    450  }
    451  for (; i < width; ++i, rgb += 3) {   // left-over
    452    y[i] = VP8RGBToY(rgb[0], rgb[1], rgb[2], YUV_HALF);
    453  }
    454 }
    455 
    456 static void ConvertBGR24ToY_SSE41(const uint8_t* WEBP_RESTRICT bgr,
    457                                  uint8_t* WEBP_RESTRICT y, int width) {
    458  const int max_width = width & ~31;
    459  int i;
    460  for (i = 0; i < max_width; bgr += 3 * 16 * 2) {
    461    __m128i bgr_plane[6];
    462    int j;
    463 
    464    RGB24PackedToPlanar_SSE41(bgr, bgr_plane);
    465 
    466    for (j = 0; j < 2; ++j, i += 16) {
    467      const __m128i zero = _mm_setzero_si128();
    468      __m128i r, g, b, Y0, Y1;
    469 
    470      // Convert to 16-bit Y.
    471      b = _mm_unpacklo_epi8(bgr_plane[0 + j], zero);
    472      g = _mm_unpacklo_epi8(bgr_plane[2 + j], zero);
    473      r = _mm_unpacklo_epi8(bgr_plane[4 + j], zero);
    474      ConvertRGBToY_SSE41(&r, &g, &b, &Y0);
    475 
    476      // Convert to 16-bit Y.
    477      b = _mm_unpackhi_epi8(bgr_plane[0 + j], zero);
    478      g = _mm_unpackhi_epi8(bgr_plane[2 + j], zero);
    479      r = _mm_unpackhi_epi8(bgr_plane[4 + j], zero);
    480      ConvertRGBToY_SSE41(&r, &g, &b, &Y1);
    481 
    482      // Cast to 8-bit and store.
    483      STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    484    }
    485  }
    486  for (; i < width; ++i, bgr += 3) {  // left-over
    487    y[i] = VP8RGBToY(bgr[2], bgr[1], bgr[0], YUV_HALF);
    488  }
    489 }
    490 
    491 static void ConvertARGBToY_SSE41(const uint32_t* WEBP_RESTRICT argb,
    492                                 uint8_t* WEBP_RESTRICT y, int width) {
    493  const int max_width = width & ~15;
    494  int i;
    495  for (i = 0; i < max_width; i += 16) {
    496    __m128i Y0, Y1, rgb[6];
    497    RGB32PackedToPlanar_SSE41(&argb[i], rgb);
    498    ConvertRGBToY_SSE41(&rgb[0], &rgb[2], &rgb[4], &Y0);
    499    ConvertRGBToY_SSE41(&rgb[1], &rgb[3], &rgb[5], &Y1);
    500    STORE_16(_mm_packus_epi16(Y0, Y1), y + i);
    501  }
    502  for (; i < width; ++i) {   // left-over
    503    const uint32_t p = argb[i];
    504    y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >>  0) & 0xff,
    505                     YUV_HALF);
    506  }
    507 }
    508 
    509 // Horizontal add (doubled) of two 16b values, result is 16b.
    510 // in: A | B | C | D | ... -> out: 2*(A+B) | 2*(C+D) | ...
    511 static void HorizontalAddPack_SSE41(const __m128i* const A,
    512                                    const __m128i* const B,
    513                                    __m128i* const out) {
    514  const __m128i k2 = _mm_set1_epi16(2);
    515  const __m128i C = _mm_madd_epi16(*A, k2);
    516  const __m128i D = _mm_madd_epi16(*B, k2);
    517  *out = _mm_packs_epi32(C, D);
    518 }
    519 
    520 static void ConvertARGBToUV_SSE41(const uint32_t* WEBP_RESTRICT argb,
    521                                  uint8_t* WEBP_RESTRICT u,
    522                                  uint8_t* WEBP_RESTRICT v,
    523                                  int src_width, int do_store) {
    524  const int max_width = src_width & ~31;
    525  int i;
    526  for (i = 0; i < max_width; i += 32, u += 16, v += 16) {
    527    __m128i rgb[6], U0, V0, U1, V1;
    528    RGB32PackedToPlanar_SSE41(&argb[i], rgb);
    529    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
    530    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
    531    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
    532    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U0, &V0);
    533 
    534    RGB32PackedToPlanar_SSE41(&argb[i + 16], rgb);
    535    HorizontalAddPack_SSE41(&rgb[0], &rgb[1], &rgb[0]);
    536    HorizontalAddPack_SSE41(&rgb[2], &rgb[3], &rgb[2]);
    537    HorizontalAddPack_SSE41(&rgb[4], &rgb[5], &rgb[4]);
    538    ConvertRGBToUV_SSE41(&rgb[0], &rgb[2], &rgb[4], &U1, &V1);
    539 
    540    U0 = _mm_packus_epi16(U0, U1);
    541    V0 = _mm_packus_epi16(V0, V1);
    542    if (!do_store) {
    543      const __m128i prev_u = LOAD_16(u);
    544      const __m128i prev_v = LOAD_16(v);
    545      U0 = _mm_avg_epu8(U0, prev_u);
    546      V0 = _mm_avg_epu8(V0, prev_v);
    547    }
    548    STORE_16(U0, u);
    549    STORE_16(V0, v);
    550  }
    551  if (i < src_width) {  // left-over
    552    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
    553  }
    554 }
    555 
    556 // Convert 16 packed ARGB 16b-values to r[], g[], b[]
    557 static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
    558    const uint16_t* WEBP_RESTRICT const rgbx,
    559    __m128i* const r, __m128i* const g, __m128i* const b) {
    560  const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
    561  const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
    562  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
    563  const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
    564  // aarrggbb as 16-bit.
    565  const __m128i shuff0 =
    566      _mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
    567  const __m128i shuff1 =
    568      _mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0);
    569  const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
    570  const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
    571  const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
    572  const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
    573  // R0R1G0G1
    574  // B0B1****
    575  // R2R3G2G3
    576  // B2B3****
    577  // (OR is used to free port 5 for the unpack)
    578  const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
    579  const __m128i B1 = _mm_or_si128(A0, A1);
    580  const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
    581  const __m128i B3 = _mm_or_si128(A2, A3);
    582  // Gather the channels.
    583  *r = _mm_unpacklo_epi64(B0, B2);
    584  *g = _mm_unpackhi_epi64(B0, B2);
    585  *b = _mm_unpackhi_epi64(B1, B3);
    586 }
    587 
    588 static void ConvertRGBA32ToUV_SSE41(const uint16_t* WEBP_RESTRICT rgb,
    589                                    uint8_t* WEBP_RESTRICT u,
    590                                    uint8_t* WEBP_RESTRICT v, int width) {
    591  const int max_width = width & ~15;
    592  const uint16_t* const last_rgb = rgb + 4 * max_width;
    593  while (rgb < last_rgb) {
    594    __m128i r, g, b, U0, V0, U1, V1;
    595    RGBA32PackedToPlanar_16b_SSE41(rgb +  0, &r, &g, &b);
    596    ConvertRGBToUV_SSE41(&r, &g, &b, &U0, &V0);
    597    RGBA32PackedToPlanar_16b_SSE41(rgb + 32, &r, &g, &b);
    598    ConvertRGBToUV_SSE41(&r, &g, &b, &U1, &V1);
    599    STORE_16(_mm_packus_epi16(U0, U1), u);
    600    STORE_16(_mm_packus_epi16(V0, V1), v);
    601    u += 16;
    602    v += 16;
    603    rgb += 2 * 32;
    604  }
    605  if (max_width < width) {  // left-over
    606    WebPConvertRGBA32ToUV_C(rgb, u, v, width - max_width);
    607  }
    608 }
    609 
    610 //------------------------------------------------------------------------------
    611 
    612 extern void WebPInitConvertARGBToYUVSSE41(void);
    613 
    614 WEBP_TSAN_IGNORE_FUNCTION void WebPInitConvertARGBToYUVSSE41(void) {
    615  WebPConvertARGBToY = ConvertARGBToY_SSE41;
    616  WebPConvertARGBToUV = ConvertARGBToUV_SSE41;
    617 
    618  WebPConvertRGB24ToY = ConvertRGB24ToY_SSE41;
    619  WebPConvertBGR24ToY = ConvertBGR24ToY_SSE41;
    620 
    621  WebPConvertRGBA32ToUV = ConvertRGBA32ToUV_SSE41;
    622 }
    623 
    624 //------------------------------------------------------------------------------
    625 
    626 #else  // !WEBP_USE_SSE41
    627 
    628 WEBP_DSP_INIT_STUB(WebPInitSamplersSSE41)
    629 WEBP_DSP_INIT_STUB(WebPInitConvertARGBToYUVSSE41)
    630 
    631 #endif  // WEBP_USE_SSE41