tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

yuv_row_win64.cpp (8047B)


      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
#include "yuv_row.h"

#include <string.h>
      6 
      7 extern "C" {
      8 
// The x64 compiler supports neither MMX nor inline assembler, so this file
// uses SSE2 intrinsics instead.
     10 
     11 #define kCoefficientsRgbU (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 2048)
     12 #define kCoefficientsRgbV (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 4096)
     13 
     14 #include <emmintrin.h>
     15 
     16 static void FastConvertYUVToRGB32Row_SSE2(const uint8_t* y_buf,
     17                                          const uint8_t* u_buf,
     18                                          const uint8_t* v_buf,
     19                                          uint8_t* rgb_buf,
     20                                          int width) {
     21  __m128i xmm0, xmmY1, xmmY2;
     22  __m128  xmmY;
     23 
     24  while (width >= 2) {
     25    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf++)),
     26                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf++)));
     27 
     28    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++));
     29    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
     30 
     31    xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++));
     32    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
     33 
     34    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
     35                          0x44);
     36    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
     37    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
     38 
     39    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
     40    rgb_buf += 8;
     41    width -= 2;
     42  }
     43 
     44  if (width) {
     45    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf)),
     46                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf)));
     47    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf));
     48    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
     49    xmmY1 = _mm_srai_epi16(xmmY1, 6);
     50    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
     51    *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
     52  }
     53 }
     54 
     55 static void ScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf,
     56                                    const uint8_t* u_buf,
     57                                    const uint8_t* v_buf,
     58                                    uint8_t* rgb_buf,
     59                                    int width,
     60                                    int source_dx) {
     61  __m128i xmm0, xmmY1, xmmY2;
     62  __m128  xmmY;
     63  uint8_t u, v, y;
     64  int x = 0;
     65 
     66  while (width >= 2) {
     67    u = u_buf[x >> 17];
     68    v = v_buf[x >> 17];
     69    y = y_buf[x >> 16];
     70    x += source_dx;
     71 
     72    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
     73                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
     74    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
     75    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
     76 
     77    y = y_buf[x >> 16];
     78    x += source_dx;
     79 
     80    xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
     81    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
     82 
     83    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
     84                          0x44);
     85    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
     86    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
     87 
     88    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
     89    rgb_buf += 8;
     90    width -= 2;
     91  }
     92 
     93  if (width) {
     94    u = u_buf[x >> 17];
     95    v = v_buf[x >> 17];
     96    y = y_buf[x >> 16];
     97 
     98    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
     99                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
    100    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
    101    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    102    xmmY1 = _mm_srai_epi16(xmmY1, 6);
    103    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    104    *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
    105  }
    106 }
    107 
    108 static void LinearScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf,
    109                                          const uint8_t* u_buf,
    110                                          const uint8_t* v_buf,
    111                                          uint8_t* rgb_buf,
    112                                          int width,
    113                                          int source_dx) {
    114  __m128i xmm0, xmmY1, xmmY2;
    115  __m128  xmmY;
    116  uint8_t u0, u1, v0, v1, y0, y1;
    117  uint32_t uv_frac, y_frac, u, v, y;
    118  int x = 0;
    119 
    120  if (source_dx >= 0x20000) {
    121    x = 32768;
    122  }
    123 
    124  while(width >= 2) {
    125    u0 = u_buf[x >> 17];
    126    u1 = u_buf[(x >> 17) + 1];
    127    v0 = v_buf[x >> 17];
    128    v1 = v_buf[(x >> 17) + 1];
    129    y0 = y_buf[x >> 16];
    130    y1 = y_buf[(x >> 16) + 1];
    131    uv_frac = (x & 0x1fffe);
    132    y_frac = (x & 0xffff);
    133    u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
    134    v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
    135    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
    136    x += source_dx;
    137 
    138    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
    139                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
    140    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
    141    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    142 
    143    y0 = y_buf[x >> 16];
    144    y1 = y_buf[(x >> 16) + 1];
    145    y_frac = (x & 0xffff);
    146    y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
    147    x += source_dx;
    148 
    149    xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
    150    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);
    151 
    152    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
    153                          0x44);
    154    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
    155    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    156 
    157    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
    158    rgb_buf += 8;
    159    width -= 2;
    160  }
    161 
    162  if (width) {
    163    u = u_buf[x >> 17];
    164    v = v_buf[x >> 17];
    165    y = y_buf[x >> 16];
    166 
    167    xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)),
    168                          _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v)));
    169    xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y));
    170 
    171    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    172    xmmY1 = _mm_srai_epi16(xmmY1, 6);
    173    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    174    *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
    175  }
    176 }
    177 
    178 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
    179                              const uint8_t* u_buf,
    180                              const uint8_t* v_buf,
    181                              uint8_t* rgb_buf,
    182                              int width) {
    183  FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width);
    184 }
    185 
    186 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
    187                        const uint8_t* u_buf,
    188                        const uint8_t* v_buf,
    189                        uint8_t* rgb_buf,
    190                        int width,
    191                        int source_dx) {
    192  ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
    193 }
    194 
    195 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
    196                              const uint8_t* u_buf,
    197                              const uint8_t* v_buf,
    198                              uint8_t* rgb_buf,
    199                              int width,
    200                              int source_dx) {
    201  LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width,
    202                                source_dx);
    203 }
    204 
    205 } // extern "C"