yuv_row_win.cpp (14435B)
// Copyright (c) 2010 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

// Row-level YUV -> 32-bit RGB conversion routines written as MSVC-style
// inline assembly for 32-bit x86, plus C-linkage wrappers that choose
// between the assembly version and a *_C fallback at run time.
//
// All assembly routines share one scheme: a U, a V and a Y sample are each
// used as an index into a table of 8-byte entries (kCoefficientsRgbU,
// kCoefficientsRgbV, kCoefficientsRgbY); the three entries are summed with
// signed saturation (paddsw), shifted right arithmetically by 6 (psraw) and
// packed to unsigned bytes with saturation (packuswb), giving 4 output bytes
// per pixel.
//
// NOTE(review): the routines use the MMX registers mm0-mm2 and none of them
// issues `emms`; presumably the caller is responsible for that - confirm.

#include "yuv_row.h"
#include "mozilla/SSE.h"

// The U and V tables are addressed relative to the Y table. The assembly
// below indexes each table with a byte-sized sample scaled by 8, and the
// tables are placed 2048 (= 256 * 8) apart.
// NOTE(review): these macros are unparenthesized and are only used inside
// the address expressions of the inline assembly in this file.
#define kCoefficientsRgbU kCoefficientsRgbY + 2048
#define kCoefficientsRgbV kCoefficientsRgbY + 4096

extern "C" {

#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
#if defined(__clang__)
// clang-cl has a bug where it doesn't mangle names in inline asm
// so let's do the mangling in the preprocessor (ugh)
// (but we still need to declare a dummy extern for the parser)
extern void* _kCoefficientsRgbY;
#define kCoefficientsRgbY _kCoefficientsRgbY
#endif

// Converts one row, two pixels per loop iteration.
//
// Each iteration reads one U byte, one V byte and two consecutive Y bytes,
// and writes 8 bytes (two pixels) to rgb_buf with movntq. If width is odd,
// one extra pixel (4 bytes, movd) is written after the loop from the next
// U, V and Y bytes.
//
// The function is naked: it saves all general-purpose registers with pushad
// (32 bytes) and reads its arguments directly from the stack, so the first
// argument is at [esp + 32 + 4].
//
//   y_buf, u_buf, v_buf  source sample rows
//   rgb_buf              destination, 4 bytes written per pixel
//   width                number of pixels to produce
__declspec(naked)
void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
                                  const uint8_t* u_buf,
                                  const uint8_t* v_buf,
                                  uint8_t* rgb_buf,
                                  int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // Y
    mov       edi, [esp + 32 + 8]   // U
    mov       esi, [esp + 32 + 12]  // V
    mov       ebp, [esp + 32 + 16]  // rgb
    mov       ecx, [esp + 32 + 20]  // width
    jmp       convertend            // check the count before the first pass

 convertloop :
    movzx     eax, byte ptr [edi]   // eax = U sample
    add       edi, 1
    movzx     ebx, byte ptr [esi]   // ebx = V sample
    add       esi, 1
    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    movzx     eax, byte ptr [edx]   // eax = first Y sample
    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U entry + V entry
    movzx     ebx, byte ptr [edx + 1]  // ebx = second Y sample
    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    add       edx, 2
    movq      mm2, [kCoefficientsRgbY + 8 * ebx]
    paddsw    mm1, mm0              // both pixels share the chroma sum
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2              // low 4 bytes: pixel 0, high 4: pixel 1
    movntq    [ebp], mm1
    add       ebp, 8
 convertend :
    sub       ecx, 2
    jns       convertloop           // loop while at least 2 pixels remained

    and       ecx, 1  // odd number of pixels?
    jz        convertdone

    // Single trailing pixel.
    movzx     eax, byte ptr [edi]
    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    movzx     eax, byte ptr [esi]
    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    movzx     eax, byte ptr [edx]
    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    paddsw    mm1, mm0
    psraw     mm1, 6
    packuswb  mm1, mm1
    movd      [ebp], mm1            // store 4 bytes only
 convertdone :

    popad
    ret
  }
}

// Same conversion as FastConvertYUVToRGB32Row_SSE, but the source pointers
// advance by `step` instead of by one.
//
// Per loop iteration (two output pixels) the U and V pointers each advance
// by `step` once, and the Y pointer advances by `step` twice (once per
// pixel). An odd trailing pixel is handled after the loop.
//
// Not referenced by the dispatch wrappers at the end of this file.
__declspec(naked)
void ConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
                              const uint8_t* u_buf,
                              const uint8_t* v_buf,
                              uint8_t* rgb_buf,
                              int width,
                              int step) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // Y
    mov       edi, [esp + 32 + 8]   // U
    mov       esi, [esp + 32 + 12]  // V
    mov       ebp, [esp + 32 + 16]  // rgb
    mov       ecx, [esp + 32 + 20]  // width
    mov       ebx, [esp + 32 + 24]  // step
    jmp       wend

 wloop :
    movzx     eax, byte ptr [edi]   // U sample
    add       edi, ebx
    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    movzx     eax, byte ptr [esi]   // V sample
    add       esi, ebx
    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    movzx     eax, byte ptr [edx]   // first Y sample
    add       edx, ebx
    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    movzx     eax, byte ptr [edx]   // second Y sample, `step` further on
    add       edx, ebx
    movq      mm2, [kCoefficientsRgbY + 8 * eax]
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2
    movntq    [ebp], mm1
    add       ebp, 8
 wend :
    sub       ecx, 2
    jns       wloop

    and       ecx, 1  // odd number of pixels?
    jz        wdone

    // Single trailing pixel.
    movzx     eax, byte ptr [edi]
    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    movzx     eax, byte ptr [esi]
    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    movzx     eax, byte ptr [edx]
    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    paddsw    mm1, mm0
    psraw     mm1, 6
    packuswb  mm1, mm1
    movd      [ebp], mm1
 wdone :

    popad
    ret
  }
}

// Like ConvertYUVToRGB32Row_SSE, but with separate strides: the Y pointer
// advances by `ystep` per pixel and the U/V pointers advance by `uvstep`
// per pair of pixels.
//
// ebx is reused for both strides, so each is reloaded from its stack slot
// inside the loop ([esp + 32 + 24] = ystep, [esp + 32 + 28] = uvstep).
//
// Not referenced by the dispatch wrappers at the end of this file.
__declspec(naked)
void RotateConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
                                    const uint8_t* u_buf,
                                    const uint8_t* v_buf,
                                    uint8_t* rgb_buf,
                                    int width,
                                    int ystep,
                                    int uvstep) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // Y
    mov       edi, [esp + 32 + 8]   // U
    mov       esi, [esp + 32 + 12]  // V
    mov       ebp, [esp + 32 + 16]  // rgb
    mov       ecx, [esp + 32 + 20]  // width
    jmp       wend

 wloop :
    movzx     eax, byte ptr [edi]   // U sample
    mov       ebx, [esp + 32 + 28]  // uvstep
    add       edi, ebx
    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    movzx     eax, byte ptr [esi]   // V sample
    add       esi, ebx
    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    movzx     eax, byte ptr [edx]   // first Y sample
    mov       ebx, [esp + 32 + 24]  // ystep
    add       edx, ebx
    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    movzx     eax, byte ptr [edx]   // second Y sample
    add       edx, ebx
    movq      mm2, [kCoefficientsRgbY + 8 * eax]
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2
    movntq    [ebp], mm1
    add       ebp, 8
 wend :
    sub       ecx, 2
    jns       wloop

    and       ecx, 1  // odd number of pixels?
    jz        wdone

    // Single trailing pixel.
    movzx     eax, byte ptr [edi]
    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    movzx     eax, byte ptr [esi]
    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    movzx     eax, byte ptr [edx]
    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    paddsw    mm1, mm0
    psraw     mm1, 6
    packuswb  mm1, mm1
    movd      [ebp], mm1
 wdone :

    popad
    ret
  }
}

// Converts a row while writing every source pixel twice.
//
// Each loop iteration reads one U, one V and two Y bytes and writes 16
// bytes: the first converted pixel duplicated (punpckldq) into 8 bytes,
// then the second converted pixel duplicated into the next 8 bytes. The
// counter drops by 4 per iteration, so `width` counts output pixels.
//
// After the loop, if 1-3 output pixels remain, a single pixel is converted
// from the current U, V and Y bytes and stored that many times.
//
// Not referenced by the dispatch wrappers at the end of this file.
__declspec(naked)
void DoubleYUVToRGB32Row_SSE(const uint8_t* y_buf,
                             const uint8_t* u_buf,
                             const uint8_t* v_buf,
                             uint8_t* rgb_buf,
                             int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // Y
    mov       edi, [esp + 32 + 8]   // U
    mov       esi, [esp + 32 + 12]  // V
    mov       ebp, [esp + 32 + 16]  // rgb
    mov       ecx, [esp + 32 + 20]  // width
    jmp       wend

 wloop :
    movzx     eax, byte ptr [edi]   // U sample
    add       edi, 1
    movzx     ebx, byte ptr [esi]   // V sample
    add       esi, 1
    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    movzx     eax, byte ptr [edx]   // first Y sample
    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]
    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    paddsw    mm1, mm0
    psraw     mm1, 6
    packuswb  mm1, mm1
    punpckldq mm1, mm1              // duplicate the pixel into both halves
    movntq    [ebp], mm1

    movzx     ebx, byte ptr [edx + 1]  // second Y sample
    add       edx, 2
    paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]  // chroma sum is consumed here
    psraw     mm0, 6
    packuswb  mm0, mm0
    punpckldq mm0, mm0
    movntq    [ebp+8], mm0
    add       ebp, 16
 wend :
    sub       ecx, 4
    jns       wloop

    add       ecx, 4                // undo the last subtraction: 0..3 left
    jz        wdone

    // Convert one more pixel; it is replicated by the store loop below.
    movzx     eax, byte ptr [edi]
    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    movzx     eax, byte ptr [esi]
    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    movzx     eax, byte ptr [edx]
    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    paddsw    mm1, mm0
    psraw     mm1, 6
    packuswb  mm1, mm1
    jmp       wend1

 wloop1 :
    movd      [ebp], mm1
    add       ebp, 4
 wend1 :
    sub       ecx, 1
    jns       wloop1
 wdone :
    popad
    ret
  }
}

// This version does general purpose scaling by any amount, up or down.
// The only thing it cannot do is rotation by 90 or 270.
// For performance the chroma is under-sampled, reducing cost of a 3x
// 1080p scale from 8.4 ms to 5.4 ms.
//
// The source position x is kept in ebx, starts at 0 and grows by source_dx
// for every output pixel. The Y sample is read at index x >> 16 and the
// U/V samples at index x >> 17, so x carries 16 fractional bits and the
// chroma index is half the luma index. Chroma is looked up once per pair
// of output pixels, using x at the first pixel of the pair. No
// interpolation is performed.
__declspec(naked)
void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
                            const uint8_t* u_buf,
                            const uint8_t* v_buf,
                            uint8_t* rgb_buf,
                            int width,
                            int source_dx) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // Y
    mov       edi, [esp + 32 + 8]   // U
    mov       esi, [esp + 32 + 12]  // V
    mov       ebp, [esp + 32 + 16]  // rgb
    mov       ecx, [esp + 32 + 20]  // width
    xor       ebx, ebx              // x
    jmp       scaleend

 scaleloop :
    mov       eax, ebx
    sar       eax, 17               // chroma index
    movzx     eax, byte ptr [edi + eax]
    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    mov       eax, ebx
    sar       eax, 17
    movzx     eax, byte ptr [esi + eax]
    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    mov       eax, ebx
    add       ebx, [esp + 32 + 24]  // x += source_dx
    sar       eax, 16               // luma index for the first pixel
    movzx     eax, byte ptr [edx + eax]
    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    mov       eax, ebx
    add       ebx, [esp + 32 + 24]  // x += source_dx
    sar       eax, 16               // luma index for the second pixel
    movzx     eax, byte ptr [edx + eax]
    movq      mm2, [kCoefficientsRgbY + 8 * eax]
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2
    movntq    [ebp], mm1
    add       ebp, 8
 scaleend :
    sub       ecx, 2
    jns       scaleloop

    and       ecx, 1  // odd number of pixels?
    jz        scaledone

    // Single trailing pixel at the current x.
    mov       eax, ebx
    sar       eax, 17
    movzx     eax, byte ptr [edi + eax]
    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    mov       eax, ebx
    sar       eax, 17
    movzx     eax, byte ptr [esi + eax]
    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    mov       eax, ebx
    sar       eax, 16
    movzx     eax, byte ptr [edx + eax]
    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    paddsw    mm1, mm0
    psraw     mm1, 6
    packuswb  mm1, mm1
    movd      [ebp], mm1

 scaledone :
    popad
    ret
  }
}

// Scaling conversion that linearly interpolates between neighbouring
// source samples instead of picking the nearest one.
//
// Setup:
//  - width * source_dx is computed and written back over the `width` stack
//    slot; the loop runs while x is below that value.
//  - x starts at 0, or at 0x8000 when source_dx >= 0x20000.
//
// Per sample, with i the integer index and f the fractional part of x:
//  - U and V: i = x >> 17, f = x & 0x1fffe,
//    value = (s[i] * (f ^ 0x1fffe) + s[i + 1] * f) >> 17
//  - Y:       i = x >> 16, f = x & 0xffff,
//    value = (s[i] * (f ^ 0xffff) + s[i + 1] * f) >> 16
// Each interpolation therefore reads the sample at i + 1 as well as i.
//
// Pixels are produced in pairs sharing one interpolated chroma value. If x
// reaches the end value after the first pixel of a pair, that pixel alone
// is stored (lscalelastpixel) and the function returns.
//
// esi is used as scratch, so the V pointer is reloaded from its stack slot
// on every iteration.
__declspec(naked)
void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
                                  const uint8_t* u_buf,
                                  const uint8_t* v_buf,
                                  uint8_t* rgb_buf,
                                  int width,
                                  int source_dx) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]  // Y
    mov       edi, [esp + 32 + 8]  // U
                // [esp + 32 + 12] // V
    mov       ebp, [esp + 32 + 16] // rgb
    mov       ecx, [esp + 32 + 20] // width
    imul      ecx, [esp + 32 + 24] // source_dx
    mov       [esp + 32 + 20], ecx // source_width = width * source_dx
    mov       ecx, [esp + 32 + 24] // source_dx
    xor       ebx, ebx             // x = 0
    cmp       ecx, 0x20000
    jl        lscaleend
    mov       ebx, 0x8000          // x = 0.5 for 1/2 or less
    jmp       lscaleend
lscaleloop:
    // Interpolated U -> mm0.
    mov       eax, ebx
    sar       eax, 0x11

    movzx     ecx, byte ptr [edi + eax]      // u[i]
    movzx     esi, byte ptr [edi + eax + 1]  // u[i + 1]
    mov       eax, ebx
    and       eax, 0x1fffe                   // f
    imul      esi, eax                       // u[i + 1] * f
    xor       eax, 0x1fffe                   // 0x1fffe - f
    imul      ecx, eax                       // u[i] * (0x1fffe - f)
    add       ecx, esi
    shr       ecx, 17
    movq      mm0, [kCoefficientsRgbU + 8 * ecx]

    // Interpolated V, added into mm0.
    mov       esi, [esp + 32 + 12]           // reload V pointer
    mov       eax, ebx
    sar       eax, 0x11

    movzx     ecx, byte ptr [esi + eax]
    movzx     esi, byte ptr [esi + eax + 1]
    mov       eax, ebx
    and       eax, 0x1fffe
    imul      esi, eax
    xor       eax, 0x1fffe
    imul      ecx, eax
    add       ecx, esi
    shr       ecx, 17
    paddsw    mm0, [kCoefficientsRgbV + 8 * ecx]

    // Interpolated Y for the first pixel -> mm1.
    mov       eax, ebx
    sar       eax, 0x10
    movzx     ecx, byte ptr [edx + eax]
    movzx     esi, byte ptr [1 + edx + eax]
    mov       eax, ebx
    add       ebx, [esp + 32 + 24]           // x += source_dx
    and       eax, 0xffff
    imul      esi, eax
    xor       eax, 0xffff
    imul      ecx, eax
    add       ecx, esi
    shr       ecx, 16
    movq      mm1, [kCoefficientsRgbY + 8 * ecx]

    // Stop after one pixel if x has reached the end value.
    cmp       ebx, [esp + 32 + 20]
    jge       lscalelastpixel

    // Interpolated Y for the second pixel -> mm2.
    mov       eax, ebx
    sar       eax, 0x10
    movzx     ecx, byte ptr [edx + eax]
    movzx     esi, byte ptr [edx + eax + 1]
    mov       eax, ebx
    add       ebx, [esp + 32 + 24]           // x += source_dx
    and       eax, 0xffff
    imul      esi, eax
    xor       eax, 0xffff
    imul      ecx, eax
    add       ecx, esi
    shr       ecx, 16
    movq      mm2, [kCoefficientsRgbY + 8 * ecx]

    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 0x6
    psraw     mm2, 0x6
    packuswb  mm1, mm2
    movntq    [ebp], mm1
    add       ebp, 0x8

lscaleend:
    cmp       ebx, [esp + 32 + 20]           // x < source_width ?
    jl        lscaleloop
    popad
    ret

lscalelastpixel:
    paddsw    mm1, mm0
    psraw     mm1, 6
    packuswb  mm1, mm1
    movd      [ebp], mm1
    popad
    ret
  };
}
#endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)

// Converts one row of YUV samples to 32-bit RGB.
//
// When the build may support SSE and targets 32-bit x86 (_M_IX86), and
// mozilla::supports_sse() reports true at run time, the assembly version is
// used; otherwise the call is forwarded to FastConvertYUVToRGB32Row_C.
// NOTE(review): the trailing `1` passed to the _C version is an extra
// argument whose meaning is declared outside this file - confirm in
// yuv_row.h.
void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
                              const uint8_t* u_buf,
                              const uint8_t* v_buf,
                              uint8_t* rgb_buf,
                              int width) {
#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  if (mozilla::supports_sse()) {
    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
    return;
  }
#endif

  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
}

// Scales and converts one row without interpolation.
//
// Uses ScaleYUVToRGB32Row_SSE under the same compile-time and run-time
// conditions as FastConvertYUVToRGB32Row, and ScaleYUVToRGB32Row_C
// otherwise. Arguments are forwarded unchanged.
void ScaleYUVToRGB32Row(const uint8_t* y_buf,
                        const uint8_t* u_buf,
                        const uint8_t* v_buf,
                        uint8_t* rgb_buf,
                        int width,
                        int source_dx) {

#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  if (mozilla::supports_sse()) {
    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
    return;
  }
#endif

  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
}

// Scales and converts one row with linear interpolation.
//
// Uses LinearScaleYUVToRGB32Row_SSE under the same compile-time and
// run-time conditions as FastConvertYUVToRGB32Row, and
// LinearScaleYUVToRGB32Row_C otherwise. Arguments are forwarded unchanged.
void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
                              const uint8_t* u_buf,
                              const uint8_t* v_buf,
                              uint8_t* rgb_buf,
                              int width,
                              int source_dx) {
#if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
  if (mozilla::supports_sse()) {
    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
                                 source_dx);
    return;
  }
#endif

  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
}

} // extern "C"