yuv_row_posix.cpp (24655B)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #include "yuv_row.h" 6 #include "mozilla/SSE.h" 7 8 #define DCHECK(a) 9 10 extern "C" { 11 12 #if defined(ARCH_CPU_X86_64) 13 14 // We don't need CPUID guards here, since x86-64 implies SSE2. 15 16 // AMD64 ABI uses register paremters. 17 void FastConvertYUVToRGB32Row(const uint8_t* y_buf, // rdi 18 const uint8_t* u_buf, // rsi 19 const uint8_t* v_buf, // rdx 20 uint8_t* rgb_buf, // rcx 21 int width) { // r8 22 asm volatile( 23 "jmp 1f\n" 24 "0:" 25 "movzb (%[u_buf]),%%r10\n" 26 "add $0x1,%[u_buf]\n" 27 "movzb (%[v_buf]),%%r11\n" 28 "add $0x1,%[v_buf]\n" 29 "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n" 30 "movzb (%[y_buf]),%%r10\n" 31 "movq 4096(%[kCoefficientsRgbY],%%r11,8),%%xmm1\n" 32 "movzb 0x1(%[y_buf]),%%r11\n" 33 "paddsw %%xmm1,%%xmm0\n" 34 "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm2\n" 35 "add $0x2,%[y_buf]\n" 36 "movq (%[kCoefficientsRgbY],%%r11,8),%%xmm3\n" 37 "paddsw %%xmm0,%%xmm2\n" 38 "paddsw %%xmm0,%%xmm3\n" 39 "shufps $0x44,%%xmm3,%%xmm2\n" 40 "psraw $0x6,%%xmm2\n" 41 "packuswb %%xmm2,%%xmm2\n" 42 "movq %%xmm2,0x0(%[rgb_buf])\n" 43 "add $0x8,%[rgb_buf]\n" 44 "1:" 45 "sub $0x2,%[width]\n" 46 "jns 0b\n" 47 48 "2:" 49 "add $0x1,%[width]\n" 50 "js 3f\n" 51 52 "movzb (%[u_buf]),%%r10\n" 53 "movq 2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n" 54 "movzb (%[v_buf]),%%r10\n" 55 "movq 4096(%[kCoefficientsRgbY],%%r10,8),%%xmm1\n" 56 "paddsw %%xmm1,%%xmm0\n" 57 "movzb (%[y_buf]),%%r10\n" 58 "movq (%[kCoefficientsRgbY],%%r10,8),%%xmm1\n" 59 "paddsw %%xmm0,%%xmm1\n" 60 "psraw $0x6,%%xmm1\n" 61 "packuswb %%xmm1,%%xmm1\n" 62 "movd %%xmm1,0x0(%[rgb_buf])\n" 63 "3:" 64 : [y_buf] "+r"(y_buf), 65 [u_buf] "+r"(u_buf), 66 [v_buf] "+r"(v_buf), 67 [rgb_buf] "+r"(rgb_buf), 68 [width] "+r"(width) 69 : [kCoefficientsRgbY] "r" (kCoefficientsRgbY) 70 : "cc", "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3" 71 ); 72 } 73 74 void ScaleYUVToRGB32Row(const uint8_t* y_buf, // rdi 75 const uint8_t* u_buf, // rsi 76 const uint8_t* v_buf, // rdx 77 uint8_t* rgb_buf, // rcx 78 int width, // r8 79 int source_dx) { // r9 80 asm volatile( 81 "xor %%r11,%%r11\n" 82 "sub $0x2,%[width]\n" 83 "js 1f\n" 84 85 "0:" 86 "mov %%r11,%%r10\n" 87 "sar $0x11,%%r10\n" 88 "movzb (%[u_buf],%%r10,1),%%rax\n" 89 "movq 2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n" 90 "movzb (%[v_buf],%%r10,1),%%rax\n" 91 "movq 4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" 92 "lea (%%r11,%[source_dx]),%%r10\n" 93 "sar $0x10,%%r11\n" 94 "movzb (%[y_buf],%%r11,1),%%rax\n" 95 "paddsw %%xmm1,%%xmm0\n" 96 "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" 97 "lea (%%r10,%[source_dx]),%%r11\n" 98 "sar $0x10,%%r10\n" 99 "movzb (%[y_buf],%%r10,1),%%rax\n" 100 "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm2\n" 101 "paddsw %%xmm0,%%xmm1\n" 102 "paddsw %%xmm0,%%xmm2\n" 103 "shufps $0x44,%%xmm2,%%xmm1\n" 104 "psraw $0x6,%%xmm1\n" 105 "packuswb %%xmm1,%%xmm1\n" 106 "movq %%xmm1,0x0(%[rgb_buf])\n" 107 "add $0x8,%[rgb_buf]\n" 108 "sub $0x2,%[width]\n" 109 "jns 0b\n" 110 111 "1:" 112 "add $0x1,%[width]\n" 113 "js 2f\n" 114 115 "mov %%r11,%%r10\n" 116 "sar $0x11,%%r10\n" 117 "movzb (%[u_buf],%%r10,1),%%rax\n" 118 "movq 2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n" 119 "movzb (%[v_buf],%%r10,1),%%rax\n" 120 "movq 4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" 121 "paddsw %%xmm1,%%xmm0\n" 122 "sar $0x10,%%r11\n" 123 "movzb (%[y_buf],%%r11,1),%%rax\n" 124 "movq (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n" 125 "paddsw %%xmm0,%%xmm1\n" 126 "psraw $0x6,%%xmm1\n" 127 "packuswb %%xmm1,%%xmm1\n" 128 "movd %%xmm1,0x0(%[rgb_buf])\n" 129 130 "2:" 131 : [rgb_buf] "+r"(rgb_buf), 132 [width] "+r"(width) 133 : [y_buf] "r"(y_buf), 134 [u_buf] "r"(u_buf), 135 [v_buf] "r"(v_buf), 136 [kCoefficientsRgbY] "r" (kCoefficientsRgbY), 137 [source_dx] "r"(static_cast<long>(source_dx)) 138 : "cc", "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2" 139 ); 140 } 141 142 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, 143 const uint8_t* u_buf, 144 const uint8_t* v_buf, 145 uint8_t* rgb_buf, 146 int width, 147 int source_dx) { 148 asm volatile( 149 "xor %%r11,%%r11\n" // x = 0 150 "sub $0x2,%[width]\n" 151 "js 2f\n" 152 "cmp $0x20000,%[source_dx]\n" // if source_dx >= 2.0 153 "jl 0f\n" 154 "mov $0x8000,%%r11\n" // x = 0.5 for 1/2 or less 155 "0:" 156 157 "1:" 158 "mov %%r11,%%r10\n" 159 "sar $0x11,%%r10\n" 160 161 "movzb (%[u_buf], %%r10, 1), %%r13 \n" 162 "movzb 1(%[u_buf], %%r10, 1), %%r14 \n" 163 "mov %%r11, %%rax \n" 164 "and $0x1fffe, %%rax \n" 165 "imul %%rax, %%r14 \n" 166 "xor $0x1fffe, %%rax \n" 167 "imul %%rax, %%r13 \n" 168 "add %%r14, %%r13 \n" 169 "shr $17, %%r13 \n" 170 "movq 2048(%[kCoefficientsRgbY],%%r13,8), %%xmm0\n" 171 172 "movzb (%[v_buf], %%r10, 1), %%r13 \n" 173 "movzb 1(%[v_buf], %%r10, 1), %%r14 \n" 174 "mov %%r11, %%rax \n" 175 "and $0x1fffe, %%rax \n" 176 "imul %%rax, %%r14 \n" 177 "xor $0x1fffe, %%rax \n" 178 "imul %%rax, %%r13 \n" 179 "add %%r14, %%r13 \n" 180 "shr $17, %%r13 \n" 181 "movq 4096(%[kCoefficientsRgbY],%%r13,8), %%xmm1\n" 182 183 "mov %%r11, %%rax \n" 184 "lea (%%r11,%[source_dx]),%%r10\n" 185 "sar $0x10,%%r11\n" 186 "paddsw %%xmm1,%%xmm0\n" 187 188 "movzb (%[y_buf], %%r11, 1), %%r13 \n" 189 "movzb 1(%[y_buf], %%r11, 1), %%r14 \n" 190 "and $0xffff, %%rax \n" 191 "imul %%rax, %%r14 \n" 192 "xor $0xffff, %%rax \n" 193 "imul %%rax, %%r13 \n" 194 "add %%r14, %%r13 \n" 195 "shr $16, %%r13 \n" 196 "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n" 197 198 "mov %%r10, %%rax \n" 199 "lea (%%r10,%[source_dx]),%%r11\n" 200 "sar $0x10,%%r10\n" 201 202 "movzb (%[y_buf],%%r10,1), %%r13 \n" 203 "movzb 1(%[y_buf],%%r10,1), %%r14 \n" 204 "and $0xffff, %%rax \n" 205 "imul %%rax, %%r14 \n" 206 "xor $0xffff, %%rax \n" 207 "imul %%rax, %%r13 \n" 208 "add %%r14, %%r13 \n" 209 "shr $16, %%r13 \n" 210 "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm2\n" 211 212 "paddsw %%xmm0,%%xmm1\n" 213 "paddsw %%xmm0,%%xmm2\n" 214 "shufps $0x44,%%xmm2,%%xmm1\n" 215 "psraw $0x6,%%xmm1\n" 216 "packuswb %%xmm1,%%xmm1\n" 217 "movq %%xmm1,0x0(%[rgb_buf])\n" 218 "add $0x8,%[rgb_buf]\n" 219 "sub $0x2,%[width]\n" 220 "jns 1b\n" 221 222 "2:" 223 "add $0x1,%[width]\n" 224 "js 3f\n" 225 226 "mov %%r11,%%r10\n" 227 "sar $0x11,%%r10\n" 228 229 "movzb (%[u_buf],%%r10,1), %%r13 \n" 230 "movq 2048(%[kCoefficientsRgbY],%%r13,8),%%xmm0\n" 231 232 "movzb (%[v_buf],%%r10,1), %%r13 \n" 233 "movq 4096(%[kCoefficientsRgbY],%%r13,8),%%xmm1\n" 234 235 "paddsw %%xmm1,%%xmm0\n" 236 "sar $0x10,%%r11\n" 237 238 "movzb (%[y_buf],%%r11,1), %%r13 \n" 239 "movq (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n" 240 241 "paddsw %%xmm0,%%xmm1\n" 242 "psraw $0x6,%%xmm1\n" 243 "packuswb %%xmm1,%%xmm1\n" 244 "movd %%xmm1,0x0(%[rgb_buf])\n" 245 246 "3:" 247 : [rgb_buf] "+r"(rgb_buf), 248 [width] "+r"(width) 249 : [y_buf] "r"(y_buf), 250 [u_buf] "r"(u_buf), 251 [v_buf] "r"(v_buf), 252 [kCoefficientsRgbY] "r" (kCoefficientsRgbY), 253 [source_dx] "r"(static_cast<long>(source_dx)) 254 : "cc", "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2" 255 ); 256 } 257 258 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__) 259 260 // PIC version is slower because less registers are available, so 261 // non-PIC is used on platforms where it is possible. 262 void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf, 263 const uint8_t* u_buf, 264 const uint8_t* v_buf, 265 uint8_t* rgb_buf, 266 int width); 267 asm( 268 ".text\n" 269 ".global FastConvertYUVToRGB32Row_SSE\n" 270 ".type FastConvertYUVToRGB32Row_SSE, @function\n" 271 "FastConvertYUVToRGB32Row_SSE:\n" 272 "pusha\n" 273 "mov 0x24(%esp),%edx\n" 274 "mov 0x28(%esp),%edi\n" 275 "mov 0x2c(%esp),%esi\n" 276 "mov 0x30(%esp),%ebp\n" 277 "mov 0x34(%esp),%ecx\n" 278 "jmp 1f\n" 279 280 "0:" 281 "movzbl (%edi),%eax\n" 282 "add $0x1,%edi\n" 283 "movzbl (%esi),%ebx\n" 284 "add $0x1,%esi\n" 285 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 286 "movzbl (%edx),%eax\n" 287 "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n" 288 "movzbl 0x1(%edx),%ebx\n" 289 "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 290 "add $0x2,%edx\n" 291 "movq kCoefficientsRgbY(,%ebx,8),%mm2\n" 292 "paddsw %mm0,%mm1\n" 293 "paddsw %mm0,%mm2\n" 294 "psraw $0x6,%mm1\n" 295 "psraw $0x6,%mm2\n" 296 "packuswb %mm2,%mm1\n" 297 "movntq %mm1,0x0(%ebp)\n" 298 "add $0x8,%ebp\n" 299 "1:" 300 "sub $0x2,%ecx\n" 301 "jns 0b\n" 302 303 "and $0x1,%ecx\n" 304 "je 2f\n" 305 306 "movzbl (%edi),%eax\n" 307 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 308 "movzbl (%esi),%eax\n" 309 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 310 "movzbl (%edx),%eax\n" 311 "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 312 "paddsw %mm0,%mm1\n" 313 "psraw $0x6,%mm1\n" 314 "packuswb %mm1,%mm1\n" 315 "movd %mm1,0x0(%ebp)\n" 316 "2:" 317 "popa\n" 318 "ret\n" 319 #if !defined(XP_MACOSX) 320 ".previous\n" 321 #endif 322 ); 323 324 void FastConvertYUVToRGB32Row(const uint8_t* y_buf, 325 const uint8_t* u_buf, 326 const uint8_t* v_buf, 327 uint8_t* rgb_buf, 328 int width) 329 { 330 if (mozilla::supports_sse()) { 331 FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width); 332 return; 333 } 334 335 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 336 } 337 338 339 void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf, 340 const uint8_t* u_buf, 341 const uint8_t* v_buf, 342 uint8_t* rgb_buf, 343 int width, 344 int source_dx); 345 asm( 346 ".text\n" 347 ".global ScaleYUVToRGB32Row_SSE\n" 348 ".type ScaleYUVToRGB32Row_SSE, @function\n" 349 "ScaleYUVToRGB32Row_SSE:\n" 350 "pusha\n" 351 "mov 0x24(%esp),%edx\n" 352 "mov 0x28(%esp),%edi\n" 353 "mov 0x2c(%esp),%esi\n" 354 "mov 0x30(%esp),%ebp\n" 355 "mov 0x34(%esp),%ecx\n" 356 "xor %ebx,%ebx\n" 357 "jmp 1f\n" 358 359 "0:" 360 "mov %ebx,%eax\n" 361 "sar $0x11,%eax\n" 362 "movzbl (%edi,%eax,1),%eax\n" 363 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 364 "mov %ebx,%eax\n" 365 "sar $0x11,%eax\n" 366 "movzbl (%esi,%eax,1),%eax\n" 367 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 368 "mov %ebx,%eax\n" 369 "add 0x38(%esp),%ebx\n" 370 "sar $0x10,%eax\n" 371 "movzbl (%edx,%eax,1),%eax\n" 372 "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 373 "mov %ebx,%eax\n" 374 "add 0x38(%esp),%ebx\n" 375 "sar $0x10,%eax\n" 376 "movzbl (%edx,%eax,1),%eax\n" 377 "movq kCoefficientsRgbY(,%eax,8),%mm2\n" 378 "paddsw %mm0,%mm1\n" 379 "paddsw %mm0,%mm2\n" 380 "psraw $0x6,%mm1\n" 381 "psraw $0x6,%mm2\n" 382 "packuswb %mm2,%mm1\n" 383 "movntq %mm1,0x0(%ebp)\n" 384 "add $0x8,%ebp\n" 385 "1:" 386 "sub $0x2,%ecx\n" 387 "jns 0b\n" 388 389 "and $0x1,%ecx\n" 390 "je 2f\n" 391 392 "mov %ebx,%eax\n" 393 "sar $0x11,%eax\n" 394 "movzbl (%edi,%eax,1),%eax\n" 395 "movq kCoefficientsRgbY+2048(,%eax,8),%mm0\n" 396 "mov %ebx,%eax\n" 397 "sar $0x11,%eax\n" 398 "movzbl (%esi,%eax,1),%eax\n" 399 "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n" 400 "mov %ebx,%eax\n" 401 "sar $0x10,%eax\n" 402 "movzbl (%edx,%eax,1),%eax\n" 403 "movq kCoefficientsRgbY(,%eax,8),%mm1\n" 404 "paddsw %mm0,%mm1\n" 405 "psraw $0x6,%mm1\n" 406 "packuswb %mm1,%mm1\n" 407 "movd %mm1,0x0(%ebp)\n" 408 409 "2:" 410 "popa\n" 411 "ret\n" 412 #if !defined(XP_MACOSX) 413 ".previous\n" 414 #endif 415 ); 416 417 void ScaleYUVToRGB32Row(const uint8_t* y_buf, 418 const uint8_t* u_buf, 419 const uint8_t* v_buf, 420 uint8_t* rgb_buf, 421 int width, 422 int source_dx) 423 { 424 if (mozilla::supports_sse()) { 425 ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, 426 width, source_dx); 427 return; 428 } 429 430 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, 431 width, source_dx); 432 } 433 434 void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf, 435 const uint8_t* u_buf, 436 const uint8_t* v_buf, 437 uint8_t* rgb_buf, 438 int width, 439 int source_dx); 440 asm( 441 ".text\n" 442 ".global LinearScaleYUVToRGB32Row_SSE\n" 443 ".type LinearScaleYUVToRGB32Row_SSE, @function\n" 444 "LinearScaleYUVToRGB32Row_SSE:\n" 445 "pusha\n" 446 "mov 0x24(%esp),%edx\n" 447 "mov 0x28(%esp),%edi\n" 448 "mov 0x30(%esp),%ebp\n" 449 450 // source_width = width * source_dx + ebx 451 "mov 0x34(%esp), %ecx\n" 452 "imull 0x38(%esp), %ecx\n" 453 "mov %ecx, 0x34(%esp)\n" 454 455 "mov 0x38(%esp), %ecx\n" 456 "xor %ebx,%ebx\n" // x = 0 457 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 458 "jl 1f\n" 459 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less 460 "jmp 1f\n" 461 462 "0:" 463 "mov %ebx,%eax\n" 464 "sar $0x11,%eax\n" 465 466 "movzbl (%edi,%eax,1),%ecx\n" 467 "movzbl 1(%edi,%eax,1),%esi\n" 468 "mov %ebx,%eax\n" 469 "andl $0x1fffe, %eax \n" 470 "imul %eax, %esi \n" 471 "xorl $0x1fffe, %eax \n" 472 "imul %eax, %ecx \n" 473 "addl %esi, %ecx \n" 474 "shrl $17, %ecx \n" 475 "movq kCoefficientsRgbY+2048(,%ecx,8),%mm0\n" 476 477 "mov 0x2c(%esp),%esi\n" 478 "mov %ebx,%eax\n" 479 "sar $0x11,%eax\n" 480 481 "movzbl (%esi,%eax,1),%ecx\n" 482 "movzbl 1(%esi,%eax,1),%esi\n" 483 "mov %ebx,%eax\n" 484 "andl $0x1fffe, %eax \n" 485 "imul %eax, %esi \n" 486 "xorl $0x1fffe, %eax \n" 487 "imul %eax, %ecx \n" 488 "addl %esi, %ecx \n" 489 "shrl $17, %ecx \n" 490 "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n" 491 492 "mov %ebx,%eax\n" 493 "sar $0x10,%eax\n" 494 "movzbl (%edx,%eax,1),%ecx\n" 495 "movzbl 1(%edx,%eax,1),%esi\n" 496 "mov %ebx,%eax\n" 497 "add 0x38(%esp),%ebx\n" 498 "andl $0xffff, %eax \n" 499 "imul %eax, %esi \n" 500 "xorl $0xffff, %eax \n" 501 "imul %eax, %ecx \n" 502 "addl %esi, %ecx \n" 503 "shrl $16, %ecx \n" 504 "movq kCoefficientsRgbY(,%ecx,8),%mm1\n" 505 506 "cmp 0x34(%esp), %ebx\n" 507 "jge 2f\n" 508 509 "mov %ebx,%eax\n" 510 "sar $0x10,%eax\n" 511 "movzbl (%edx,%eax,1),%ecx\n" 512 "movzbl 1(%edx,%eax,1),%esi\n" 513 "mov %ebx,%eax\n" 514 "add 0x38(%esp),%ebx\n" 515 "andl $0xffff, %eax \n" 516 "imul %eax, %esi \n" 517 "xorl $0xffff, %eax \n" 518 "imul %eax, %ecx \n" 519 "addl %esi, %ecx \n" 520 "shrl $16, %ecx \n" 521 "movq kCoefficientsRgbY(,%ecx,8),%mm2\n" 522 523 "paddsw %mm0,%mm1\n" 524 "paddsw %mm0,%mm2\n" 525 "psraw $0x6,%mm1\n" 526 "psraw $0x6,%mm2\n" 527 "packuswb %mm2,%mm1\n" 528 "movntq %mm1,0x0(%ebp)\n" 529 "add $0x8,%ebp\n" 530 531 "1:" 532 "cmp 0x34(%esp), %ebx\n" 533 "jl 0b\n" 534 "popa\n" 535 "ret\n" 536 537 "2:" 538 "paddsw %mm0, %mm1\n" 539 "psraw $6, %mm1\n" 540 "packuswb %mm1, %mm1\n" 541 "movd %mm1, (%ebp)\n" 542 "popa\n" 543 "ret\n" 544 #if !defined(XP_MACOSX) 545 ".previous\n" 546 #endif 547 ); 548 549 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, 550 const uint8_t* u_buf, 551 const uint8_t* v_buf, 552 uint8_t* rgb_buf, 553 int width, 554 int source_dx) 555 { 556 if (mozilla::supports_sse()) { 557 LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, 558 width, source_dx); 559 return; 560 } 561 562 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, 563 width, source_dx); 564 } 565 566 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__) 567 568 void PICConvertYUVToRGB32Row_SSE(const uint8_t* y_buf, 569 const uint8_t* u_buf, 570 const uint8_t* v_buf, 571 uint8_t* rgb_buf, 572 int width, 573 const int16_t *kCoefficientsRgbY); 574 575 asm( 576 ".text\n" 577 #if defined(XP_MACOSX) 578 "_PICConvertYUVToRGB32Row_SSE:\n" 579 #else 580 "PICConvertYUVToRGB32Row_SSE:\n" 581 #endif 582 "pusha\n" 583 "mov 0x24(%esp),%edx\n" 584 "mov 0x28(%esp),%edi\n" 585 "mov 0x2c(%esp),%esi\n" 586 "mov 0x30(%esp),%ebp\n" 587 "mov 0x38(%esp),%ecx\n" 588 589 "jmp 1f\n" 590 591 "0:" 592 "movzbl (%edi),%eax\n" 593 "add $0x1,%edi\n" 594 "movzbl (%esi),%ebx\n" 595 "add $0x1,%esi\n" 596 "movq 2048(%ecx,%eax,8),%mm0\n" 597 "movzbl (%edx),%eax\n" 598 "paddsw 4096(%ecx,%ebx,8),%mm0\n" 599 "movzbl 0x1(%edx),%ebx\n" 600 "movq 0(%ecx,%eax,8),%mm1\n" 601 "add $0x2,%edx\n" 602 "movq 0(%ecx,%ebx,8),%mm2\n" 603 "paddsw %mm0,%mm1\n" 604 "paddsw %mm0,%mm2\n" 605 "psraw $0x6,%mm1\n" 606 "psraw $0x6,%mm2\n" 607 "packuswb %mm2,%mm1\n" 608 "movntq %mm1,0x0(%ebp)\n" 609 "add $0x8,%ebp\n" 610 "1:" 611 "subl $0x2,0x34(%esp)\n" 612 "jns 0b\n" 613 614 "andl $0x1,0x34(%esp)\n" 615 "je 2f\n" 616 617 "movzbl (%edi),%eax\n" 618 "movq 2048(%ecx,%eax,8),%mm0\n" 619 "movzbl (%esi),%eax\n" 620 "paddsw 4096(%ecx,%eax,8),%mm0\n" 621 "movzbl (%edx),%eax\n" 622 "movq 0(%ecx,%eax,8),%mm1\n" 623 "paddsw %mm0,%mm1\n" 624 "psraw $0x6,%mm1\n" 625 "packuswb %mm1,%mm1\n" 626 "movd %mm1,0x0(%ebp)\n" 627 "2:" 628 "popa\n" 629 "ret\n" 630 #if !defined(XP_MACOSX) 631 ".previous\n" 632 #endif 633 ); 634 635 void FastConvertYUVToRGB32Row(const uint8_t* y_buf, 636 const uint8_t* u_buf, 637 const uint8_t* v_buf, 638 uint8_t* rgb_buf, 639 int width) 640 { 641 if (mozilla::supports_sse()) { 642 PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, 643 &kCoefficientsRgbY[0][0]); 644 return; 645 } 646 647 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 648 } 649 650 void PICScaleYUVToRGB32Row_SSE(const uint8_t* y_buf, 651 const uint8_t* u_buf, 652 const uint8_t* v_buf, 653 uint8_t* rgb_buf, 654 int width, 655 int source_dx, 656 const int16_t *kCoefficientsRgbY); 657 658 asm( 659 ".text\n" 660 #if defined(XP_MACOSX) 661 "_PICScaleYUVToRGB32Row_SSE:\n" 662 #else 663 "PICScaleYUVToRGB32Row_SSE:\n" 664 #endif 665 "pusha\n" 666 "mov 0x24(%esp),%edx\n" 667 "mov 0x28(%esp),%edi\n" 668 "mov 0x2c(%esp),%esi\n" 669 "mov 0x30(%esp),%ebp\n" 670 "mov 0x3c(%esp),%ecx\n" 671 "xor %ebx,%ebx\n" 672 "jmp 1f\n" 673 674 "0:" 675 "mov %ebx,%eax\n" 676 "sar $0x11,%eax\n" 677 "movzbl (%edi,%eax,1),%eax\n" 678 "movq 2048(%ecx,%eax,8),%mm0\n" 679 "mov %ebx,%eax\n" 680 "sar $0x11,%eax\n" 681 "movzbl (%esi,%eax,1),%eax\n" 682 "paddsw 4096(%ecx,%eax,8),%mm0\n" 683 "mov %ebx,%eax\n" 684 "add 0x38(%esp),%ebx\n" 685 "sar $0x10,%eax\n" 686 "movzbl (%edx,%eax,1),%eax\n" 687 "movq 0(%ecx,%eax,8),%mm1\n" 688 "mov %ebx,%eax\n" 689 "add 0x38(%esp),%ebx\n" 690 "sar $0x10,%eax\n" 691 "movzbl (%edx,%eax,1),%eax\n" 692 "movq 0(%ecx,%eax,8),%mm2\n" 693 "paddsw %mm0,%mm1\n" 694 "paddsw %mm0,%mm2\n" 695 "psraw $0x6,%mm1\n" 696 "psraw $0x6,%mm2\n" 697 "packuswb %mm2,%mm1\n" 698 "movntq %mm1,0x0(%ebp)\n" 699 "add $0x8,%ebp\n" 700 "1:" 701 "subl $0x2,0x34(%esp)\n" 702 "jns 0b\n" 703 704 "andl $0x1,0x34(%esp)\n" 705 "je 2f\n" 706 707 "mov %ebx,%eax\n" 708 "sar $0x11,%eax\n" 709 "movzbl (%edi,%eax,1),%eax\n" 710 "movq 2048(%ecx,%eax,8),%mm0\n" 711 "mov %ebx,%eax\n" 712 "sar $0x11,%eax\n" 713 "movzbl (%esi,%eax,1),%eax\n" 714 "paddsw 4096(%ecx,%eax,8),%mm0\n" 715 "mov %ebx,%eax\n" 716 "sar $0x10,%eax\n" 717 "movzbl (%edx,%eax,1),%eax\n" 718 "movq 0(%ecx,%eax,8),%mm1\n" 719 "paddsw %mm0,%mm1\n" 720 "psraw $0x6,%mm1\n" 721 "packuswb %mm1,%mm1\n" 722 "movd %mm1,0x0(%ebp)\n" 723 724 "2:" 725 "popa\n" 726 "ret\n" 727 #if !defined(XP_MACOSX) 728 ".previous\n" 729 #endif 730 ); 731 732 void ScaleYUVToRGB32Row(const uint8_t* y_buf, 733 const uint8_t* u_buf, 734 const uint8_t* v_buf, 735 uint8_t* rgb_buf, 736 int width, 737 int source_dx) 738 { 739 if (mozilla::supports_sse()) { 740 PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx, 741 &kCoefficientsRgbY[0][0]); 742 return; 743 } 744 745 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 746 } 747 748 void PICLinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf, 749 const uint8_t* u_buf, 750 const uint8_t* v_buf, 751 uint8_t* rgb_buf, 752 int width, 753 int source_dx, 754 const int16_t *kCoefficientsRgbY); 755 756 asm( 757 ".text\n" 758 #if defined(XP_MACOSX) 759 "_PICLinearScaleYUVToRGB32Row_SSE:\n" 760 #else 761 "PICLinearScaleYUVToRGB32Row_SSE:\n" 762 #endif 763 "pusha\n" 764 "mov 0x24(%esp),%edx\n" 765 "mov 0x30(%esp),%ebp\n" 766 "mov 0x34(%esp),%ecx\n" 767 "mov 0x3c(%esp),%edi\n" 768 "xor %ebx,%ebx\n" 769 770 // source_width = width * source_dx + ebx 771 "mov 0x34(%esp), %ecx\n" 772 "imull 0x38(%esp), %ecx\n" 773 "mov %ecx, 0x34(%esp)\n" 774 775 "mov 0x38(%esp), %ecx\n" 776 "xor %ebx,%ebx\n" // x = 0 777 "cmp $0x20000,%ecx\n" // if source_dx >= 2.0 778 "jl 1f\n" 779 "mov $0x8000,%ebx\n" // x = 0.5 for 1/2 or less 780 "jmp 1f\n" 781 782 "0:" 783 "mov 0x28(%esp),%esi\n" 784 "mov %ebx,%eax\n" 785 "sar $0x11,%eax\n" 786 787 "movzbl (%esi,%eax,1),%ecx\n" 788 "movzbl 1(%esi,%eax,1),%esi\n" 789 "mov %ebx,%eax\n" 790 "andl $0x1fffe, %eax \n" 791 "imul %eax, %esi \n" 792 "xorl $0x1fffe, %eax \n" 793 "imul %eax, %ecx \n" 794 "addl %esi, %ecx \n" 795 "shrl $17, %ecx \n" 796 "movq 2048(%edi,%ecx,8),%mm0\n" 797 798 "mov 0x2c(%esp),%esi\n" 799 "mov %ebx,%eax\n" 800 "sar $0x11,%eax\n" 801 802 "movzbl (%esi,%eax,1),%ecx\n" 803 "movzbl 1(%esi,%eax,1),%esi\n" 804 "mov %ebx,%eax\n" 805 "andl $0x1fffe, %eax \n" 806 "imul %eax, %esi \n" 807 "xorl $0x1fffe, %eax \n" 808 "imul %eax, %ecx \n" 809 "addl %esi, %ecx \n" 810 "shrl $17, %ecx \n" 811 "paddsw 4096(%edi,%ecx,8),%mm0\n" 812 813 "mov %ebx,%eax\n" 814 "sar $0x10,%eax\n" 815 "movzbl (%edx,%eax,1),%ecx\n" 816 "movzbl 1(%edx,%eax,1),%esi\n" 817 "mov %ebx,%eax\n" 818 "add 0x38(%esp),%ebx\n" 819 "andl $0xffff, %eax \n" 820 "imul %eax, %esi \n" 821 "xorl $0xffff, %eax \n" 822 "imul %eax, %ecx \n" 823 "addl %esi, %ecx \n" 824 "shrl $16, %ecx \n" 825 "movq (%edi,%ecx,8),%mm1\n" 826 827 "cmp 0x34(%esp), %ebx\n" 828 "jge 2f\n" 829 830 "mov %ebx,%eax\n" 831 "sar $0x10,%eax\n" 832 "movzbl (%edx,%eax,1),%ecx\n" 833 "movzbl 1(%edx,%eax,1),%esi\n" 834 "mov %ebx,%eax\n" 835 "add 0x38(%esp),%ebx\n" 836 "andl $0xffff, %eax \n" 837 "imul %eax, %esi \n" 838 "xorl $0xffff, %eax \n" 839 "imul %eax, %ecx \n" 840 "addl %esi, %ecx \n" 841 "shrl $16, %ecx \n" 842 "movq (%edi,%ecx,8),%mm2\n" 843 844 "paddsw %mm0,%mm1\n" 845 "paddsw %mm0,%mm2\n" 846 "psraw $0x6,%mm1\n" 847 "psraw $0x6,%mm2\n" 848 "packuswb %mm2,%mm1\n" 849 "movntq %mm1,0x0(%ebp)\n" 850 "add $0x8,%ebp\n" 851 852 "1:" 853 "cmp %ebx, 0x34(%esp)\n" 854 "jg 0b\n" 855 "popa\n" 856 "ret\n" 857 858 "2:" 859 "paddsw %mm0, %mm1\n" 860 "psraw $6, %mm1\n" 861 "packuswb %mm1, %mm1\n" 862 "movd %mm1, (%ebp)\n" 863 "popa\n" 864 "ret\n" 865 #if !defined(XP_MACOSX) 866 ".previous\n" 867 #endif 868 ); 869 870 871 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, 872 const uint8_t* u_buf, 873 const uint8_t* v_buf, 874 uint8_t* rgb_buf, 875 int width, 876 int source_dx) 877 { 878 if (mozilla::supports_sse()) { 879 PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, 880 source_dx, &kCoefficientsRgbY[0][0]); 881 return; 882 } 883 884 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 885 } 886 #else 887 void FastConvertYUVToRGB32Row(const uint8_t* y_buf, 888 const uint8_t* u_buf, 889 const uint8_t* v_buf, 890 uint8_t* rgb_buf, 891 int width) { 892 FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1); 893 } 894 895 void ScaleYUVToRGB32Row(const uint8_t* y_buf, 896 const uint8_t* u_buf, 897 const uint8_t* v_buf, 898 uint8_t* rgb_buf, 899 int width, 900 int source_dx) { 901 ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 902 } 903 904 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf, 905 const uint8_t* u_buf, 906 const uint8_t* v_buf, 907 uint8_t* rgb_buf, 908 int width, 909 int source_dx) { 910 LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx); 911 } 912 #endif 913 914 }