yuv_row_win64.cpp (8047B)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "yuv_row.h" 6 7 extern "C" { 8 9 // x64 compiler doesn't support MMX and inline assembler. Use SSE2 intrinsics. 10 11 #define kCoefficientsRgbU (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 2048) 12 #define kCoefficientsRgbV (reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 4096) 13 14 #include <emmintrin.h> 15 16 static void FastConvertYUVToRGB32Row_SSE2(const uint8_t* y_buf, 17 const uint8_t* u_buf, 18 const uint8_t* v_buf, 19 uint8_t* rgb_buf, 20 int width) { 21 __m128i xmm0, xmmY1, xmmY2; 22 __m128 xmmY; 23 24 while (width >= 2) { 25 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf++)), 26 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf++))); 27 28 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++)); 29 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 30 31 xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf++)); 32 xmmY2 = _mm_adds_epi16(xmmY2, xmm0); 33 34 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 35 0x44); 36 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); 37 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 38 39 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); 40 rgb_buf += 8; 41 width -= 2; 42 } 43 44 if (width) { 45 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * *u_buf)), 46 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * *v_buf))); 47 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * *y_buf)); 48 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 49 xmmY1 = _mm_srai_epi16(xmmY1, 6); 50 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 51 *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); 52 } 53 } 54 55 static void ScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf, 56 const uint8_t* u_buf, 57 const uint8_t* v_buf, 58 uint8_t* rgb_buf, 59 int width, 60 int source_dx) { 61 __m128i xmm0, xmmY1, xmmY2; 62 __m128 xmmY; 63 uint8_t u, v, y; 64 int x = 0; 65 66 while (width >= 2) { 67 u = u_buf[x >> 17]; 68 v = v_buf[x >> 17]; 69 y = y_buf[x >> 16]; 70 x += source_dx; 71 72 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), 73 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); 74 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); 75 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 76 77 y = y_buf[x >> 16]; 78 x += source_dx; 79 80 xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); 81 xmmY2 = _mm_adds_epi16(xmmY2, xmm0); 82 83 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 84 0x44); 85 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); 86 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 87 88 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); 89 rgb_buf += 8; 90 width -= 2; 91 } 92 93 if (width) { 94 u = u_buf[x >> 17]; 95 v = v_buf[x >> 17]; 96 y = y_buf[x >> 16]; 97 98 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), 99 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); 100 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); 101 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 102 xmmY1 = _mm_srai_epi16(xmmY1, 6); 103 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 104 *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); 105 } 106 } 107 108 static void LinearScaleYUVToRGB32Row_SSE2(const uint8_t* y_buf, 109 const uint8_t* u_buf, 110 const uint8_t* v_buf, 111 uint8_t* rgb_buf, 112 int width, 113 int source_dx) { 114 __m128i xmm0, xmmY1, xmmY2; 115 __m128 xmmY; 116 uint8_t u0, u1, v0, v1, y0, y1; 117 uint32_t uv_frac, y_frac, u, v, y; 118 int x = 0; 119 120 if (source_dx >= 0x20000) { 121 x = 32768; 122 } 123 124 while(width >= 2) { 125 u0 = u_buf[x >> 17]; 126 u1 = u_buf[(x >> 17) + 1]; 127 v0 = v_buf[x >> 17]; 128 v1 = v_buf[(x >> 17) + 1]; 129 y0 = y_buf[x >> 16]; 130 y1 = y_buf[(x >> 16) + 1]; 131 uv_frac = (x & 0x1fffe); 132 y_frac = (x & 0xffff); 133 u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; 134 v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; 135 y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; 136 x += source_dx; 137 138 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), 139 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); 140 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); 141 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 142 143 y0 = y_buf[x >> 16]; 144 y1 = y_buf[(x >> 16) + 1]; 145 y_frac = (x & 0xffff); 146 y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; 147 x += source_dx; 148 149 xmmY2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); 150 xmmY2 = _mm_adds_epi16(xmmY2, xmm0); 151 152 xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 153 0x44); 154 xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); 155 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 156 157 _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); 158 rgb_buf += 8; 159 width -= 2; 160 } 161 162 if (width) { 163 u = u_buf[x >> 17]; 164 v = v_buf[x >> 17]; 165 y = y_buf[x >> 16]; 166 167 xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbU + 8 * u)), 168 _mm_loadl_epi64(reinterpret_cast<const __m128i*>(kCoefficientsRgbV + 8 * v))); 169 xmmY1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(reinterpret_cast<const uint8_t*>(kCoefficientsRgbY) + 8 * y)); 170 171 xmmY1 = _mm_adds_epi16(xmmY1, xmm0); 172 xmmY1 = _mm_srai_epi16(xmmY1, 6); 173 xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); 174 *reinterpret_cast<uint32_t*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); 175 } 176 } 177 178 void FastConvertYUVToRGB32Row(const uint8_t* y_buf, 179 const uint8_t* u_buf, 180 const uint8_t* v_buf, 181 uint8_t* rgb_buf, 182 int width) { 183 FastConvertYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width); 184 } 185 186 void ScaleYUVToRGB32Row(const uint8_t* y_buf, 187 const uint8_t* u_buf, 188 const uint8_t* v_buf, 189 uint8_t* rgb_buf, 190 int width, 191 int source_dx) { 192 ScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 193 } 194 195 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, 196 const uint8_t* u_buf, 197 const uint8_t* v_buf, 198 uint8_t* rgb_buf, 199 int width, 200 int source_dx) { 201 LinearScaleYUVToRGB32Row_SSE2(y_buf, u_buf, v_buf, rgb_buf, width, 202 source_dx); 203 } 204 205 } // extern "C"