tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

lossless_sse41.c (5561B)


      1 // Copyright 2021 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // SSE41 variant of methods for lossless decoder
     11 
     12 #include "src/dsp/dsp.h"
     13 
     14 #if defined(WEBP_USE_SSE41)
     15 #include <emmintrin.h>
     16 #include <smmintrin.h>
     17 
     18 #include "src/webp/types.h"
     19 #include "src/dsp/cpu.h"
     20 #include "src/dsp/lossless.h"
     21 
     22 //------------------------------------------------------------------------------
     23 // Color-space conversion functions
     24 
     25 static void TransformColorInverse_SSE41(const VP8LMultipliers* const m,
     26                                        const uint32_t* const src,
     27                                        int num_pixels, uint32_t* dst) {
     28 // sign-extended multiplying constants, pre-shifted by 5.
     29 #define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
     30  const __m128i mults_rb =
     31      _mm_set1_epi32((int)((uint32_t)CST(green_to_red) << 16 |
     32                           (CST(green_to_blue) & 0xffff)));
     33  const __m128i mults_b2 = _mm_set1_epi32(CST(red_to_blue));
     34 #undef CST
     35  const __m128i mask_ag = _mm_set1_epi32((int)0xff00ff00);
     36  const __m128i perm1 = _mm_setr_epi8(-1, 1, -1, 1, -1, 5, -1, 5,
     37                                      -1, 9, -1, 9, -1, 13, -1, 13);
     38  const __m128i perm2 = _mm_setr_epi8(-1, 2, -1, -1, -1, 6, -1, -1,
     39                                      -1, 10, -1, -1, -1, 14, -1, -1);
     40  int i;
     41  for (i = 0; i + 4 <= num_pixels; i += 4) {
     42    const __m128i A = _mm_loadu_si128((const __m128i*)(src + i));
     43    const __m128i B = _mm_shuffle_epi8(A, perm1); // argb -> g0g0
     44    const __m128i C = _mm_mulhi_epi16(B, mults_rb);
     45    const __m128i D = _mm_add_epi8(A, C);
     46    const __m128i E = _mm_shuffle_epi8(D, perm2);
     47    const __m128i F = _mm_mulhi_epi16(E, mults_b2);
     48    const __m128i G = _mm_add_epi8(D, F);
     49    const __m128i out = _mm_blendv_epi8(G, A, mask_ag);
     50    _mm_storeu_si128((__m128i*)&dst[i], out);
     51  }
     52  // Fall-back to C-version for left-overs.
     53  if (i != num_pixels) {
     54    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
     55  }
     56 }
     57 
     58 //------------------------------------------------------------------------------
     59 
     60 #define ARGB_TO_RGB_SSE41 do {                        \
     61  while (num_pixels >= 16) {                          \
     62    const __m128i in0 = _mm_loadu_si128(in + 0);      \
     63    const __m128i in1 = _mm_loadu_si128(in + 1);      \
     64    const __m128i in2 = _mm_loadu_si128(in + 2);      \
     65    const __m128i in3 = _mm_loadu_si128(in + 3);      \
     66    const __m128i a0 = _mm_shuffle_epi8(in0, perm0);  \
     67    const __m128i a1 = _mm_shuffle_epi8(in1, perm1);  \
     68    const __m128i a2 = _mm_shuffle_epi8(in2, perm2);  \
     69    const __m128i a3 = _mm_shuffle_epi8(in3, perm3);  \
     70    const __m128i b0 = _mm_blend_epi16(a0, a1, 0xc0); \
     71    const __m128i b1 = _mm_blend_epi16(a1, a2, 0xf0); \
     72    const __m128i b2 = _mm_blend_epi16(a2, a3, 0xfc); \
     73    _mm_storeu_si128(out + 0, b0);                    \
     74    _mm_storeu_si128(out + 1, b1);                    \
     75    _mm_storeu_si128(out + 2, b2);                    \
     76    in += 4;                                          \
     77    out += 3;                                         \
     78    num_pixels -= 16;                                 \
     79  }                                                   \
     80 } while (0)
     81 
     82 static void ConvertBGRAToRGB_SSE41(const uint32_t* WEBP_RESTRICT src,
     83                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
     84  const __m128i* in = (const __m128i*)src;
     85  __m128i* out = (__m128i*)dst;
     86  const __m128i perm0 = _mm_setr_epi8(2, 1, 0, 6, 5, 4, 10, 9,
     87                                      8, 14, 13, 12, -1, -1, -1, -1);
     88  const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
     89  const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
     90  const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
     91 
     92  ARGB_TO_RGB_SSE41;
     93 
     94  // left-overs
     95  if (num_pixels > 0) {
     96    VP8LConvertBGRAToRGB_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
     97  }
     98 }
     99 
    100 static void ConvertBGRAToBGR_SSE41(const uint32_t* WEBP_RESTRICT src,
    101                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
    102  const __m128i* in = (const __m128i*)src;
    103  __m128i* out = (__m128i*)dst;
    104  const __m128i perm0 = _mm_setr_epi8(0, 1, 2, 4, 5, 6, 8, 9, 10,
    105                                      12, 13, 14, -1, -1, -1, -1);
    106  const __m128i perm1 = _mm_shuffle_epi32(perm0, 0x39);
    107  const __m128i perm2 = _mm_shuffle_epi32(perm0, 0x4e);
    108  const __m128i perm3 = _mm_shuffle_epi32(perm0, 0x93);
    109 
    110  ARGB_TO_RGB_SSE41;
    111 
    112  // left-overs
    113  if (num_pixels > 0) {
    114    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
    115  }
    116 }
    117 
    118 #undef ARGB_TO_RGB_SSE41
    119 
    120 //------------------------------------------------------------------------------
    121 // Entry point
    122 
    123 extern void VP8LDspInitSSE41(void);
    124 
    125 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitSSE41(void) {
    126  VP8LTransformColorInverse = TransformColorInverse_SSE41;
    127  VP8LConvertBGRAToRGB = ConvertBGRAToRGB_SSE41;
    128  VP8LConvertBGRAToBGR = ConvertBGRAToBGR_SSE41;
    129 
    130  // SSE exports for AVX and above.
    131  VP8LTransformColorInverse_SSE = TransformColorInverse_SSE41;
    132  VP8LConvertBGRAToRGB_SSE = ConvertBGRAToRGB_SSE41;
    133 }
    134 
    135 #else  // !WEBP_USE_SSE41
    136 
// WEBP_USE_SSE41 is not defined: VP8LDspInitSSE41 is provided through
// WEBP_DSP_INIT_STUB instead of the implementation above.
// NOTE(review): the macro is defined outside this file; presumably it emits
// an empty function of that name -- confirm in src/dsp/dsp.h.
WEBP_DSP_INIT_STUB(VP8LDspInitSSE41)
    138 
    139 #endif  // WEBP_USE_SSE41