tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

yuv_row_posix.cpp (24655B)


      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "yuv_row.h"
      6 #include "mozilla/SSE.h"
      7 
      8 #define DCHECK(a)
      9 
     10 extern "C" {
     11 
     12 #if defined(ARCH_CPU_X86_64)
     13 
     14 // We don't need CPUID guards here, since x86-64 implies SSE2.
     15 
     16 // AMD64 ABI uses register parameters.
     // Converts one row of planar YUV to 32-bit pixels with no horizontal scaling
     // (x86-64 inline-assembly version).
     //
     // Each pass of the main loop reads two bytes from y_buf and one byte each
     // from u_buf and v_buf, and stores 8 bytes to rgb_buf. A trailing odd pixel
     // is handled after the loop and stores 4 bytes. kCoefficientsRgbY is indexed
     // with 8-byte entries: by Y at byte offset 0, by U at byte offset 2048 and by
     // V at byte offset 4096.
     //
     // y_buf, u_buf, v_buf: source rows. rgb_buf: destination row.
     // width: number of output pixels.
     17 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,  // rdi
     18                              const uint8_t* u_buf,  // rsi
     19                              const uint8_t* v_buf,  // rdx
     20                              uint8_t* rgb_buf,      // rcx
     21                              int width) {         // r8
     22  asm volatile(
     23  "jmp    1f\n"  // enter at the loop test
     24 "0:"  // main loop: two pixels per pass
     25  "movzb  (%[u_buf]),%%r10\n"
     26  "add    $0x1,%[u_buf]\n"
     27  "movzb  (%[v_buf]),%%r11\n"
     28  "add    $0x1,%[v_buf]\n"
     29  "movq   2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"
     30  "movzb  (%[y_buf]),%%r10\n"
     31  "movq   4096(%[kCoefficientsRgbY],%%r11,8),%%xmm1\n"
     32  "movzb  0x1(%[y_buf]),%%r11\n"
     33  "paddsw %%xmm1,%%xmm0\n"  // xmm0 = U entry + V entry (saturating 16-bit add)
     34  "movq   (%[kCoefficientsRgbY],%%r10,8),%%xmm2\n"
     35  "add    $0x2,%[y_buf]\n"
     36  "movq   (%[kCoefficientsRgbY],%%r11,8),%%xmm3\n"
     37  "paddsw %%xmm0,%%xmm2\n"
     38  "paddsw %%xmm0,%%xmm3\n"
     39  "shufps $0x44,%%xmm3,%%xmm2\n"  // xmm2 = {low 64 bits of xmm2, low 64 bits of xmm3}
     40  "psraw  $0x6,%%xmm2\n"  // arithmetic shift of every 16-bit lane by 6
     41  "packuswb %%xmm2,%%xmm2\n"  // saturate the 16-bit lanes to unsigned bytes
     42  "movq   %%xmm2,0x0(%[rgb_buf])\n"  // store two pixels (8 bytes)
     43  "add    $0x8,%[rgb_buf]\n"
     44 "1:"
     45  "sub    $0x2,%[width]\n"
     46  "jns    0b\n"  // loop while width - 2 is non-negative
     47 
     48 "2:"
     49  "add    $0x1,%[width]\n"
     50  "js     3f\n"  // negative here means no odd pixel is left
     51 
     52  "movzb  (%[u_buf]),%%r10\n"
     53  "movq   2048(%[kCoefficientsRgbY],%%r10,8),%%xmm0\n"
     54  "movzb  (%[v_buf]),%%r10\n"
     55  "movq   4096(%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
     56  "paddsw %%xmm1,%%xmm0\n"
     57  "movzb  (%[y_buf]),%%r10\n"
     58  "movq   (%[kCoefficientsRgbY],%%r10,8),%%xmm1\n"
     59  "paddsw %%xmm0,%%xmm1\n"
     60  "psraw  $0x6,%%xmm1\n"
     61  "packuswb %%xmm1,%%xmm1\n"
     62  "movd   %%xmm1,0x0(%[rgb_buf])\n"  // store the last pixel (4 bytes)
     63 "3:"
     64  : [y_buf] "+r"(y_buf),
     65    [u_buf] "+r"(u_buf),
     66    [v_buf] "+r"(v_buf),
     67    [rgb_buf] "+r"(rgb_buf),
     68    [width] "+r"(width)
     69  : [kCoefficientsRgbY] "r" (kCoefficientsRgbY)
     70  : "cc", "memory", "r10", "r11", "xmm0", "xmm1", "xmm2", "xmm3"
     71 );
     72 }
     73 
     // Converts one row of planar YUV to 32-bit pixels while stepping through the
     // source by source_dx per output pixel (x86-64 inline-assembly version; each
     // output pixel uses a single source sample, no blending).
     //
     // The source position is kept in r11 as a fixed-point value: the Y index is
     // position >> 16 and the U/V index is position >> 17. Two pixels are produced
     // per loop pass, both using the U/V pair selected at the start of the pass;
     // a trailing odd pixel is handled after the loop.
     //
     // y_buf, u_buf, v_buf: source rows. rgb_buf: destination row.
     // width: number of output pixels. source_dx: fixed-point source step.
     74 void ScaleYUVToRGB32Row(const uint8_t* y_buf,  // rdi
     75                        const uint8_t* u_buf,  // rsi
     76                        const uint8_t* v_buf,  // rdx
     77                        uint8_t* rgb_buf,      // rcx
     78                        int width,           // r8
     79                        int source_dx) {     // r9
     80  asm volatile(
     81  "xor    %%r11,%%r11\n"  // source position = 0
     82  "sub    $0x2,%[width]\n"
     83  "js     1f\n"  // fewer than two pixels: skip the paired loop
     84 
     85 "0:"
     86  "mov    %%r11,%%r10\n"
     87  "sar    $0x11,%%r10\n"  // U/V index = position >> 17
     88  "movzb  (%[u_buf],%%r10,1),%%rax\n"
     89  "movq   2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n"
     90  "movzb  (%[v_buf],%%r10,1),%%rax\n"
     91  "movq   4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
     92  "lea    (%%r11,%[source_dx]),%%r10\n"  // r10 = position of the second pixel
     93  "sar    $0x10,%%r11\n"  // Y index of the first pixel = position >> 16
     94  "movzb  (%[y_buf],%%r11,1),%%rax\n"
     95  "paddsw %%xmm1,%%xmm0\n"
     96  "movq   (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
     97  "lea    (%%r10,%[source_dx]),%%r11\n"  // r11 = position for the next pass
     98  "sar    $0x10,%%r10\n"
     99  "movzb  (%[y_buf],%%r10,1),%%rax\n"
    100  "movq   (%[kCoefficientsRgbY],%%rax,8),%%xmm2\n"
    101  "paddsw %%xmm0,%%xmm1\n"
    102  "paddsw %%xmm0,%%xmm2\n"
    103  "shufps $0x44,%%xmm2,%%xmm1\n"  // xmm1 = {low 64 bits of xmm1, low 64 bits of xmm2}
    104  "psraw  $0x6,%%xmm1\n"
    105  "packuswb %%xmm1,%%xmm1\n"
    106  "movq   %%xmm1,0x0(%[rgb_buf])\n"  // store two pixels (8 bytes)
    107  "add    $0x8,%[rgb_buf]\n"
    108  "sub    $0x2,%[width]\n"
    109  "jns    0b\n"
    110 
    111 "1:"
    112  "add    $0x1,%[width]\n"
    113  "js     2f\n"  // negative here means no odd pixel is left
    114 
    115  "mov    %%r11,%%r10\n"
    116  "sar    $0x11,%%r10\n"
    117  "movzb  (%[u_buf],%%r10,1),%%rax\n"
    118  "movq   2048(%[kCoefficientsRgbY],%%rax,8),%%xmm0\n"
    119  "movzb  (%[v_buf],%%r10,1),%%rax\n"
    120  "movq   4096(%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
    121  "paddsw %%xmm1,%%xmm0\n"
    122  "sar    $0x10,%%r11\n"
    123  "movzb  (%[y_buf],%%r11,1),%%rax\n"
    124  "movq   (%[kCoefficientsRgbY],%%rax,8),%%xmm1\n"
    125  "paddsw %%xmm0,%%xmm1\n"
    126  "psraw  $0x6,%%xmm1\n"
    127  "packuswb %%xmm1,%%xmm1\n"
    128  "movd   %%xmm1,0x0(%[rgb_buf])\n"  // store the last pixel (4 bytes)
    129 
    130 "2:"
    131  : [rgb_buf] "+r"(rgb_buf),
    132    [width] "+r"(width)
    133  : [y_buf] "r"(y_buf),
    134    [u_buf] "r"(u_buf),
    135    [v_buf] "r"(v_buf),
    136    [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
    137    [source_dx] "r"(static_cast<long>(source_dx))  // widened so it can be used in the 64-bit lea above
    138  : "cc", "memory", "r10", "r11", "rax", "xmm0", "xmm1", "xmm2"
    139 );
    140 }
    141 
    // Converts one row of planar YUV to 32-bit pixels with horizontal scaling,
    // blending each source sample with its right-hand neighbour (x86-64
    // inline-assembly version).
    //
    // The source position in r11 is fixed point with 16 fractional bits. For Y,
    // the low 16 bits of the position weight byte [i + 1] and their complement
    // (xor 0xffff) weights byte [i], with i = position >> 16. U and V are blended
    // the same way using i = position >> 17 and the mask 0x1fffe. If at least two
    // pixels are requested and source_dx >= 0x20000, the position starts at 0x8000
    // instead of 0. Two pixels are produced per loop pass, sharing one blended
    // U/V pair; a trailing odd pixel is taken from single samples, without
    // blending.
    //
    // NOTE(review): the blend always loads byte [i + 1], so each source row
    // presumably needs one readable byte past the last sample used - confirm
    // against callers.
    142 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
    143                              const uint8_t* u_buf,
    144                              const uint8_t* v_buf,
    145                              uint8_t* rgb_buf,
    146                              int width,
    147                              int source_dx) {
    148  asm volatile(
    149  "xor    %%r11,%%r11\n"   // x = 0
    150  "sub    $0x2,%[width]\n"
    151  "js     2f\n"  // fewer than two pixels: go straight to the odd-pixel tail
    152  "cmp    $0x20000,%[source_dx]\n"   // if source_dx >= 2.0
    153  "jl     0f\n"
    154  "mov    $0x8000,%%r11\n" // x = 0.5 for 1/2 or less
    155 "0:"
    156 
    157 "1:"
    158  "mov    %%r11,%%r10\n"
    159  "sar    $0x11,%%r10\n"  // U/V index = x >> 17
    160 
    161  "movzb  (%[u_buf], %%r10, 1), %%r13 \n"
    162  "movzb  1(%[u_buf], %%r10, 1), %%r14 \n"
    163  "mov    %%r11, %%rax \n"
    164  "and    $0x1fffe, %%rax \n"  // weight of the right-hand U sample
    165  "imul   %%rax, %%r14 \n"
    166  "xor    $0x1fffe, %%rax \n"  // complementary weight for the left-hand sample
    167  "imul   %%rax, %%r13 \n"
    168  "add    %%r14, %%r13 \n"
    169  "shr    $17, %%r13 \n"  // blended U value used as table index
    170  "movq   2048(%[kCoefficientsRgbY],%%r13,8), %%xmm0\n"
    171 
    172  "movzb  (%[v_buf], %%r10, 1), %%r13 \n"
    173  "movzb  1(%[v_buf], %%r10, 1), %%r14 \n"
    174  "mov    %%r11, %%rax \n"
    175  "and    $0x1fffe, %%rax \n"
    176  "imul   %%rax, %%r14 \n"
    177  "xor    $0x1fffe, %%rax \n"
    178  "imul   %%rax, %%r13 \n"
    179  "add    %%r14, %%r13 \n"
    180  "shr    $17, %%r13 \n"  // blended V value used as table index
    181  "movq   4096(%[kCoefficientsRgbY],%%r13,8), %%xmm1\n"
    182 
    183  "mov    %%r11, %%rax \n"
    184  "lea    (%%r11,%[source_dx]),%%r10\n"  // r10 = x of the second pixel
    185  "sar    $0x10,%%r11\n"  // Y index of the first pixel = x >> 16
    186  "paddsw %%xmm1,%%xmm0\n"
    187 
    188  "movzb  (%[y_buf], %%r11, 1), %%r13 \n"
    189  "movzb  1(%[y_buf], %%r11, 1), %%r14 \n"
    190  "and    $0xffff, %%rax \n"  // weight of the right-hand Y sample
    191  "imul   %%rax, %%r14 \n"
    192  "xor    $0xffff, %%rax \n"  // complementary weight for the left-hand sample
    193  "imul   %%rax, %%r13 \n"
    194  "add    %%r14, %%r13 \n"
    195  "shr    $16, %%r13 \n"
    196  "movq   (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
    197 
    198  "mov    %%r10, %%rax \n"
    199  "lea    (%%r10,%[source_dx]),%%r11\n"  // r11 = x for the next pass
    200  "sar    $0x10,%%r10\n"
    201 
    202  "movzb  (%[y_buf],%%r10,1), %%r13 \n"
    203  "movzb  1(%[y_buf],%%r10,1), %%r14 \n"
    204  "and    $0xffff, %%rax \n"
    205  "imul   %%rax, %%r14 \n"
    206  "xor    $0xffff, %%rax \n"
    207  "imul   %%rax, %%r13 \n"
    208  "add    %%r14, %%r13 \n"
    209  "shr    $16, %%r13 \n"
    210  "movq   (%[kCoefficientsRgbY],%%r13,8),%%xmm2\n"
    211 
    212  "paddsw %%xmm0,%%xmm1\n"
    213  "paddsw %%xmm0,%%xmm2\n"
    214  "shufps $0x44,%%xmm2,%%xmm1\n"  // xmm1 = {low 64 bits of xmm1, low 64 bits of xmm2}
    215  "psraw  $0x6,%%xmm1\n"
    216  "packuswb %%xmm1,%%xmm1\n"
    217  "movq   %%xmm1,0x0(%[rgb_buf])\n"  // store two pixels (8 bytes)
    218  "add    $0x8,%[rgb_buf]\n"
    219  "sub    $0x2,%[width]\n"
    220  "jns    1b\n"
    221 
    222 "2:"
    223  "add    $0x1,%[width]\n"
    224  "js     3f\n"  // negative here means no odd pixel is left
    225 
    226  "mov    %%r11,%%r10\n"
    227  "sar    $0x11,%%r10\n"
    228 
    229  "movzb  (%[u_buf],%%r10,1), %%r13 \n"  // tail pixel: single samples, no blending
    230  "movq   2048(%[kCoefficientsRgbY],%%r13,8),%%xmm0\n"
    231 
    232  "movzb  (%[v_buf],%%r10,1), %%r13 \n"
    233  "movq   4096(%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
    234 
    235  "paddsw %%xmm1,%%xmm0\n"
    236  "sar    $0x10,%%r11\n"
    237 
    238  "movzb  (%[y_buf],%%r11,1), %%r13 \n"
    239  "movq   (%[kCoefficientsRgbY],%%r13,8),%%xmm1\n"
    240 
    241  "paddsw %%xmm0,%%xmm1\n"
    242  "psraw  $0x6,%%xmm1\n"
    243  "packuswb %%xmm1,%%xmm1\n"
    244  "movd   %%xmm1,0x0(%[rgb_buf])\n"  // store the last pixel (4 bytes)
    245 
    246 "3:"
    247  : [rgb_buf] "+r"(rgb_buf),
    248    [width] "+r"(width)
    249  : [y_buf] "r"(y_buf),
    250    [u_buf] "r"(u_buf),
    251    [v_buf] "r"(v_buf),
    252    [kCoefficientsRgbY] "r" (kCoefficientsRgbY),
    253    [source_dx] "r"(static_cast<long>(source_dx))  // widened so it can be used in the 64-bit lea/cmp above
    254  : "cc", "memory", "r10", "r11", "r13", "r14", "rax", "xmm0", "xmm1", "xmm2"
    255 );
    256 }
    257 
    258 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && !defined(__PIC__)
    259 
    260 // PIC version is slower because fewer registers are available, so
    261 // non-PIC is used on platforms where it is possible.
    // Unscaled YUV -> 32-bit pixel row conversion for 32-bit x86, non-PIC builds.
    // The function body is the file-scope asm block that follows the declaration;
    // it addresses kCoefficientsRgbY through its absolute symbol and uses the MMX
    // registers mm0-mm2.
    //
    // After pusha the five stack arguments are read from 0x24(%esp) onwards
    // (declaration order, assuming the cdecl stack layout): y_buf -> edx,
    // u_buf -> edi, v_buf -> esi, rgb_buf -> ebp, width -> ecx. Pixel pairs are
    // written with movntq (8 bytes); a trailing odd pixel is written with movd
    // (4 bytes).
    //
    // NOTE(review): no emms is executed before ret; presumably the caller is
    // expected to issue it - confirm.
    262 void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
    263                                  const uint8_t* u_buf,
    264                                  const uint8_t* v_buf,
    265                                  uint8_t* rgb_buf,
    266                                  int width);
    267  asm(
    268  ".text\n"
    269  ".global FastConvertYUVToRGB32Row_SSE\n"
    270  ".type FastConvertYUVToRGB32Row_SSE, @function\n"
    271 "FastConvertYUVToRGB32Row_SSE:\n"
    272  "pusha\n"  // saves all general registers; arguments now start at 0x24(%esp)
    273  "mov    0x24(%esp),%edx\n"  // y_buf
    274  "mov    0x28(%esp),%edi\n"  // u_buf
    275  "mov    0x2c(%esp),%esi\n"  // v_buf
    276  "mov    0x30(%esp),%ebp\n"  // rgb_buf
    277  "mov    0x34(%esp),%ecx\n"  // width
    278  "jmp    1f\n"  // enter at the loop test
    279 
    280 "0:"
    281  "movzbl (%edi),%eax\n"
    282  "add    $0x1,%edi\n"
    283  "movzbl (%esi),%ebx\n"
    284  "add    $0x1,%esi\n"
    285  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
    286  "movzbl (%edx),%eax\n"
    287  "paddsw kCoefficientsRgbY+4096(,%ebx,8),%mm0\n"  // mm0 = U entry + V entry
    288  "movzbl 0x1(%edx),%ebx\n"
    289  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
    290  "add    $0x2,%edx\n"
    291  "movq   kCoefficientsRgbY(,%ebx,8),%mm2\n"
    292  "paddsw %mm0,%mm1\n"
    293  "paddsw %mm0,%mm2\n"
    294  "psraw  $0x6,%mm1\n"
    295  "psraw  $0x6,%mm2\n"
    296  "packuswb %mm2,%mm1\n"  // mm1 = both pixels packed to unsigned bytes
    297  "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two pixels (8 bytes)
    298  "add    $0x8,%ebp\n"
    299 "1:"
    300  "sub    $0x2,%ecx\n"
    301  "jns    0b\n"
    302 
    303  "and    $0x1,%ecx\n"  // low bit of the remaining count selects the odd-pixel tail
    304  "je     2f\n"
    305 
    306  "movzbl (%edi),%eax\n"
    307  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
    308  "movzbl (%esi),%eax\n"
    309  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
    310  "movzbl (%edx),%eax\n"
    311  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
    312  "paddsw %mm0,%mm1\n"
    313  "psraw  $0x6,%mm1\n"
    314  "packuswb %mm1,%mm1\n"
    315  "movd   %mm1,0x0(%ebp)\n"  // store the last pixel (4 bytes)
    316 "2:"
    317  "popa\n"
    318  "ret\n"
    319 #if !defined(XP_MACOSX)
    320  ".previous\n"
    321 #endif
    322 );
    323 
    // Entry point for unscaled row conversion (32-bit x86, non-PIC build).
    // Calls FastConvertYUVToRGB32Row_SSE when mozilla::supports_sse() returns
    // true; otherwise calls FastConvertYUVToRGB32Row_C.
    324 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
    325                              const uint8_t* u_buf,
    326                              const uint8_t* v_buf,
    327                              uint8_t* rgb_buf,
    328                              int width)
    329 {
    330  if (mozilla::supports_sse()) {
    331    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
    332    return;
    333  }
    334 
    // NOTE(review): the meaning of the trailing 1 is defined by the declaration
    // of FastConvertYUVToRGB32Row_C, which is not visible in this file.
    335  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
    336 }
    337 
    338 
    // Scaled (single-sample, no blending) YUV -> 32-bit pixel row conversion for
    // 32-bit x86, non-PIC builds. The function body is the file-scope asm block
    // that follows the declaration; it addresses kCoefficientsRgbY through its
    // absolute symbol and uses the MMX registers mm0-mm2.
    //
    // After pusha the stack arguments are read from 0x24(%esp) onwards
    // (declaration order, assuming the cdecl stack layout): y_buf -> edx,
    // u_buf -> edi, v_buf -> esi, rgb_buf -> ebp, width -> ecx; source_dx stays
    // in memory at 0x38(%esp). ebx holds the fixed-point source position: the Y
    // index is ebx >> 16 and the U/V index is ebx >> 17.
    //
    // NOTE(review): no emms is executed before ret; presumably the caller is
    // expected to issue it - confirm.
    339 void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
    340                            const uint8_t* u_buf,
    341                            const uint8_t* v_buf,
    342                            uint8_t* rgb_buf,
    343                            int width,
    344                            int source_dx);
    345  asm(
    346  ".text\n"
    347  ".global ScaleYUVToRGB32Row_SSE\n"
    348  ".type ScaleYUVToRGB32Row_SSE, @function\n"
    349 "ScaleYUVToRGB32Row_SSE:\n"
    350  "pusha\n"  // saves all general registers; arguments now start at 0x24(%esp)
    351  "mov    0x24(%esp),%edx\n"  // y_buf
    352  "mov    0x28(%esp),%edi\n"  // u_buf
    353  "mov    0x2c(%esp),%esi\n"  // v_buf
    354  "mov    0x30(%esp),%ebp\n"  // rgb_buf
    355  "mov    0x34(%esp),%ecx\n"  // width
    356  "xor    %ebx,%ebx\n"  // source position = 0
    357  "jmp    1f\n"
    358 
    359 "0:"
    360  "mov    %ebx,%eax\n"
    361  "sar    $0x11,%eax\n"  // U/V index = position >> 17
    362  "movzbl (%edi,%eax,1),%eax\n"
    363  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
    364  "mov    %ebx,%eax\n"
    365  "sar    $0x11,%eax\n"
    366  "movzbl (%esi,%eax,1),%eax\n"
    367  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
    368  "mov    %ebx,%eax\n"
    369  "add    0x38(%esp),%ebx\n"  // position += source_dx
    370  "sar    $0x10,%eax\n"  // Y index = position >> 16
    371  "movzbl (%edx,%eax,1),%eax\n"
    372  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
    373  "mov    %ebx,%eax\n"
    374  "add    0x38(%esp),%ebx\n"
    375  "sar    $0x10,%eax\n"
    376  "movzbl (%edx,%eax,1),%eax\n"
    377  "movq   kCoefficientsRgbY(,%eax,8),%mm2\n"
    378  "paddsw %mm0,%mm1\n"
    379  "paddsw %mm0,%mm2\n"
    380  "psraw  $0x6,%mm1\n"
    381  "psraw  $0x6,%mm2\n"
    382  "packuswb %mm2,%mm1\n"
    383  "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two pixels (8 bytes)
    384  "add    $0x8,%ebp\n"
    385 "1:"
    386  "sub    $0x2,%ecx\n"
    387  "jns    0b\n"
    388 
    389  "and    $0x1,%ecx\n"  // low bit of the remaining count selects the odd-pixel tail
    390  "je     2f\n"
    391 
    392  "mov    %ebx,%eax\n"
    393  "sar    $0x11,%eax\n"
    394  "movzbl (%edi,%eax,1),%eax\n"
    395  "movq   kCoefficientsRgbY+2048(,%eax,8),%mm0\n"
    396  "mov    %ebx,%eax\n"
    397  "sar    $0x11,%eax\n"
    398  "movzbl (%esi,%eax,1),%eax\n"
    399  "paddsw kCoefficientsRgbY+4096(,%eax,8),%mm0\n"
    400  "mov    %ebx,%eax\n"
    401  "sar    $0x10,%eax\n"
    402  "movzbl (%edx,%eax,1),%eax\n"
    403  "movq   kCoefficientsRgbY(,%eax,8),%mm1\n"
    404  "paddsw %mm0,%mm1\n"
    405  "psraw  $0x6,%mm1\n"
    406  "packuswb %mm1,%mm1\n"
    407  "movd   %mm1,0x0(%ebp)\n"  // store the last pixel (4 bytes)
    408 
    409 "2:"
    410  "popa\n"
    411  "ret\n"
    412 #if !defined(XP_MACOSX)
    413  ".previous\n"
    414 #endif
    415 );
    416 
    // Entry point for scaled row conversion (32-bit x86, non-PIC build).
    // Calls ScaleYUVToRGB32Row_SSE when mozilla::supports_sse() returns true;
    // otherwise calls ScaleYUVToRGB32Row_C with the same arguments.
    417 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
    418                        const uint8_t* u_buf,
    419                        const uint8_t* v_buf,
    420                        uint8_t* rgb_buf,
    421                        int width,
    422                        int source_dx)
    423 {
    424  if (mozilla::supports_sse()) {
    425    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
    426                           width, source_dx);
    427    return;
    428  }
    429 
    430  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
    431                       width, source_dx);
    432 }
    433 
    // Scaled YUV -> 32-bit pixel row conversion that blends each source sample
    // with its right-hand neighbour, for 32-bit x86, non-PIC builds. The function
    // body is the file-scope asm block that follows the declaration; it addresses
    // kCoefficientsRgbY through its absolute symbol and uses the MMX registers
    // mm0-mm2.
    //
    // After pusha the stack arguments start at 0x24(%esp) (declaration order,
    // assuming the cdecl stack layout): y_buf -> edx, u_buf -> edi,
    // rgb_buf -> ebp. esi doubles as scratch, so v_buf is reloaded from
    // 0x2c(%esp) on every pass. The width slot at 0x34(%esp) is overwritten with
    // width * source_dx and then serves as the end bound for the fixed-point
    // source position held in ebx. The loop emits two pixels per pass; when the
    // bound is reached after the first pixel of a pass, that single pixel is
    // written at label 2.
    //
    // NOTE(review): no emms is executed before ret; presumably the caller is
    // expected to issue it - confirm. The blend loads byte [i + 1], so the source
    // rows presumably need one readable byte past the last sample - confirm.
    434 void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
    435                                  const uint8_t* u_buf,
    436                                  const uint8_t* v_buf,
    437                                  uint8_t* rgb_buf,
    438                                  int width,
    439                                  int source_dx);
    440  asm(
    441  ".text\n"
    442  ".global LinearScaleYUVToRGB32Row_SSE\n"
    443  ".type LinearScaleYUVToRGB32Row_SSE, @function\n"
    444 "LinearScaleYUVToRGB32Row_SSE:\n"
    445  "pusha\n"  // saves all general registers; arguments now start at 0x24(%esp)
    446  "mov    0x24(%esp),%edx\n"  // y_buf
    447  "mov    0x28(%esp),%edi\n"  // u_buf
    448  "mov    0x30(%esp),%ebp\n"  // rgb_buf
    449 
    450  // source_width = width * source_dx, written back over the width stack slot (the initial x in ebx is not added here)
    451  "mov    0x34(%esp), %ecx\n"
    452  "imull  0x38(%esp), %ecx\n"
    453  "mov    %ecx, 0x34(%esp)\n"
    454 
    455  "mov    0x38(%esp), %ecx\n"
    456  "xor    %ebx,%ebx\n"     // x = 0
    457  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
    458  "jl     1f\n"
    459  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
    460  "jmp    1f\n"
    461 
    462 "0:"
    463  "mov    %ebx,%eax\n"
    464  "sar    $0x11,%eax\n"  // U/V index = x >> 17
    465 
    466  "movzbl (%edi,%eax,1),%ecx\n"
    467  "movzbl 1(%edi,%eax,1),%esi\n"
    468  "mov    %ebx,%eax\n"
    469  "andl   $0x1fffe, %eax \n"  // weight of the right-hand U sample
    470  "imul   %eax, %esi \n"
    471  "xorl   $0x1fffe, %eax \n"  // complementary weight for the left-hand sample
    472  "imul   %eax, %ecx \n"
    473  "addl   %esi, %ecx \n"
    474  "shrl   $17, %ecx \n"
    475  "movq   kCoefficientsRgbY+2048(,%ecx,8),%mm0\n"
    476 
    477  "mov    0x2c(%esp),%esi\n"  // reload v_buf (esi was used as scratch above)
    478  "mov    %ebx,%eax\n"
    479  "sar    $0x11,%eax\n"
    480 
    481  "movzbl (%esi,%eax,1),%ecx\n"
    482  "movzbl 1(%esi,%eax,1),%esi\n"
    483  "mov    %ebx,%eax\n"
    484  "andl   $0x1fffe, %eax \n"
    485  "imul   %eax, %esi \n"
    486  "xorl   $0x1fffe, %eax \n"
    487  "imul   %eax, %ecx \n"
    488  "addl   %esi, %ecx \n"
    489  "shrl   $17, %ecx \n"
    490  "paddsw kCoefficientsRgbY+4096(,%ecx,8),%mm0\n"  // mm0 = U entry + V entry
    491 
    492  "mov    %ebx,%eax\n"
    493  "sar    $0x10,%eax\n"  // Y index = x >> 16
    494  "movzbl (%edx,%eax,1),%ecx\n"
    495  "movzbl 1(%edx,%eax,1),%esi\n"
    496  "mov    %ebx,%eax\n"
    497  "add    0x38(%esp),%ebx\n"  // x += source_dx
    498  "andl   $0xffff, %eax \n"  // weight of the right-hand Y sample
    499  "imul   %eax, %esi \n"
    500  "xorl   $0xffff, %eax \n"  // complementary weight for the left-hand sample
    501  "imul   %eax, %ecx \n"
    502  "addl   %esi, %ecx \n"
    503  "shrl   $16, %ecx \n"
    504  "movq   kCoefficientsRgbY(,%ecx,8),%mm1\n"
    505 
    506  "cmp    0x34(%esp), %ebx\n"
    507  "jge    2f\n"  // x reached source_width: finish this single pixel at 2
    508 
    509  "mov    %ebx,%eax\n"
    510  "sar    $0x10,%eax\n"
    511  "movzbl (%edx,%eax,1),%ecx\n"
    512  "movzbl 1(%edx,%eax,1),%esi\n"
    513  "mov    %ebx,%eax\n"
    514  "add    0x38(%esp),%ebx\n"
    515  "andl   $0xffff, %eax \n"
    516  "imul   %eax, %esi \n"
    517  "xorl   $0xffff, %eax \n"
    518  "imul   %eax, %ecx \n"
    519  "addl   %esi, %ecx \n"
    520  "shrl   $16, %ecx \n"
    521  "movq   kCoefficientsRgbY(,%ecx,8),%mm2\n"
    522 
    523  "paddsw %mm0,%mm1\n"
    524  "paddsw %mm0,%mm2\n"
    525  "psraw  $0x6,%mm1\n"
    526  "psraw  $0x6,%mm2\n"
    527  "packuswb %mm2,%mm1\n"
    528  "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two pixels (8 bytes)
    529  "add    $0x8,%ebp\n"
    530 
    531 "1:"
    532  "cmp    0x34(%esp), %ebx\n"
    533  "jl     0b\n"  // continue while x < source_width
    534  "popa\n"
    535  "ret\n"
    536 
    537 "2:"
    538  "paddsw %mm0, %mm1\n"
    539  "psraw $6, %mm1\n"
    540  "packuswb %mm1, %mm1\n"
    541  "movd %mm1, (%ebp)\n"  // store the last pixel (4 bytes)
    542  "popa\n"
    543  "ret\n"
    544 #if !defined(XP_MACOSX)
    545  ".previous\n"
    546 #endif
    547 );
    548 
    // Entry point for blended (linear) scaled row conversion (32-bit x86,
    // non-PIC build). Calls LinearScaleYUVToRGB32Row_SSE when
    // mozilla::supports_sse() returns true; otherwise calls
    // LinearScaleYUVToRGB32Row_C with the same arguments.
    549 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
    550                              const uint8_t* u_buf,
    551                              const uint8_t* v_buf,
    552                              uint8_t* rgb_buf,
    553                              int width,
    554                              int source_dx)
    555 {
    556  if (mozilla::supports_sse()) {
    557    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf,
    558                                 width, source_dx);
    559    return;
    560  }
    561 
    562  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf,
    563                             width, source_dx);
    564 }
    565 
    566 #elif defined(MOZILLA_MAY_SUPPORT_SSE) && defined(ARCH_CPU_X86_32) && defined(__PIC__)
    567 
    // Unscaled YUV -> 32-bit pixel row conversion for 32-bit x86 PIC builds. The
    // coefficient table is passed in as the last argument instead of being
    // addressed through its symbol. The function body is the file-scope asm block
    // that follows the declaration; no .global directive is emitted for its
    // label, and the label gets a leading underscore when XP_MACOSX is defined.
    //
    // After pusha the stack arguments start at 0x24(%esp) (declaration order,
    // assuming the cdecl stack layout): y_buf -> edx, u_buf -> edi,
    // v_buf -> esi, rgb_buf -> ebp, table pointer (0x38) -> ecx. width is not
    // loaded into a register; it is counted down in place at 0x34(%esp).
    //
    // NOTE(review): MMX registers are used and no emms is executed before ret;
    // presumably the caller is expected to issue it - confirm.
    568 void PICConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
    569                                 const uint8_t* u_buf,
    570                                 const uint8_t* v_buf,
    571                                 uint8_t* rgb_buf,
    572                                 int width,
    573                                 const int16_t *kCoefficientsRgbY);
    574 
    575  asm(
    576  ".text\n"
    577 #if defined(XP_MACOSX)
    578 "_PICConvertYUVToRGB32Row_SSE:\n"
    579 #else
    580 "PICConvertYUVToRGB32Row_SSE:\n"
    581 #endif
    582  "pusha\n"  // saves all general registers; arguments now start at 0x24(%esp)
    583  "mov    0x24(%esp),%edx\n"  // y_buf
    584  "mov    0x28(%esp),%edi\n"  // u_buf
    585  "mov    0x2c(%esp),%esi\n"  // v_buf
    586  "mov    0x30(%esp),%ebp\n"  // rgb_buf
    587  "mov    0x38(%esp),%ecx\n"  // coefficient table base
    588 
    589  "jmp    1f\n"  // enter at the loop test
    590 
    591 "0:"
    592  "movzbl (%edi),%eax\n"
    593  "add    $0x1,%edi\n"
    594  "movzbl (%esi),%ebx\n"
    595  "add    $0x1,%esi\n"
    596  "movq   2048(%ecx,%eax,8),%mm0\n"
    597  "movzbl (%edx),%eax\n"
    598  "paddsw 4096(%ecx,%ebx,8),%mm0\n"  // mm0 = U entry + V entry
    599  "movzbl 0x1(%edx),%ebx\n"
    600  "movq   0(%ecx,%eax,8),%mm1\n"
    601  "add    $0x2,%edx\n"
    602  "movq   0(%ecx,%ebx,8),%mm2\n"
    603  "paddsw %mm0,%mm1\n"
    604  "paddsw %mm0,%mm2\n"
    605  "psraw  $0x6,%mm1\n"
    606  "psraw  $0x6,%mm2\n"
    607  "packuswb %mm2,%mm1\n"
    608  "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two pixels (8 bytes)
    609  "add    $0x8,%ebp\n"
    610 "1:"
    611  "subl   $0x2,0x34(%esp)\n"  // width is decremented directly in its stack slot
    612  "jns    0b\n"
    613 
    614  "andl   $0x1,0x34(%esp)\n"  // low bit of the remaining count selects the odd-pixel tail
    615  "je     2f\n"
    616 
    617  "movzbl (%edi),%eax\n"
    618  "movq   2048(%ecx,%eax,8),%mm0\n"
    619  "movzbl (%esi),%eax\n"
    620  "paddsw 4096(%ecx,%eax,8),%mm0\n"
    621  "movzbl (%edx),%eax\n"
    622  "movq   0(%ecx,%eax,8),%mm1\n"
    623  "paddsw %mm0,%mm1\n"
    624  "psraw  $0x6,%mm1\n"
    625  "packuswb %mm1,%mm1\n"
    626  "movd   %mm1,0x0(%ebp)\n"  // store the last pixel (4 bytes)
    627 "2:"
    628  "popa\n"
    629  "ret\n"
    630 #if !defined(XP_MACOSX)
    631  ".previous\n"
    632 #endif
    633 );
    634 
    // Entry point for unscaled row conversion (32-bit x86, PIC build).
    // Calls PICConvertYUVToRGB32Row_SSE, passing the address of the first element
    // of kCoefficientsRgbY, when mozilla::supports_sse() returns true; otherwise
    // calls FastConvertYUVToRGB32Row_C.
    635 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
    636                              const uint8_t* u_buf,
    637                              const uint8_t* v_buf,
    638                              uint8_t* rgb_buf,
    639                              int width)
    640 {
    641  if (mozilla::supports_sse()) {
    642    PICConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
    643                                &kCoefficientsRgbY[0][0]);
    644    return;
    645  }
    646 
    // NOTE(review): the meaning of the trailing 1 is defined by the declaration
    // of FastConvertYUVToRGB32Row_C, which is not visible in this file.
    647  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
    648 }
    649 
    // Scaled (single-sample, no blending) YUV -> 32-bit pixel row conversion for
    // 32-bit x86 PIC builds. The coefficient table is passed in as the last
    // argument. The function body is the file-scope asm block that follows the
    // declaration; no .global directive is emitted for its label, and the label
    // gets a leading underscore when XP_MACOSX is defined.
    //
    // After pusha the stack arguments start at 0x24(%esp) (declaration order,
    // assuming the cdecl stack layout): y_buf -> edx, u_buf -> edi,
    // v_buf -> esi, rgb_buf -> ebp, table pointer (0x3c) -> ecx. width is counted
    // down in place at 0x34(%esp) and source_dx is read from 0x38(%esp). ebx
    // holds the fixed-point source position: the Y index is ebx >> 16 and the
    // U/V index is ebx >> 17.
    //
    // NOTE(review): MMX registers are used and no emms is executed before ret;
    // presumably the caller is expected to issue it - confirm.
    650 void PICScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
    651                               const uint8_t* u_buf,
    652                               const uint8_t* v_buf,
    653                               uint8_t* rgb_buf,
    654                               int width,
    655                               int source_dx,
    656                               const int16_t *kCoefficientsRgbY);
    657 
    658  asm(
    659  ".text\n"
    660 #if defined(XP_MACOSX)
    661 "_PICScaleYUVToRGB32Row_SSE:\n"
    662 #else
    663 "PICScaleYUVToRGB32Row_SSE:\n"
    664 #endif
    665  "pusha\n"  // saves all general registers; arguments now start at 0x24(%esp)
    666  "mov    0x24(%esp),%edx\n"  // y_buf
    667  "mov    0x28(%esp),%edi\n"  // u_buf
    668  "mov    0x2c(%esp),%esi\n"  // v_buf
    669  "mov    0x30(%esp),%ebp\n"  // rgb_buf
    670  "mov    0x3c(%esp),%ecx\n"  // coefficient table base
    671  "xor    %ebx,%ebx\n"  // source position = 0
    672  "jmp    1f\n"
    673 
    674 "0:"
    675  "mov    %ebx,%eax\n"
    676  "sar    $0x11,%eax\n"  // U/V index = position >> 17
    677  "movzbl (%edi,%eax,1),%eax\n"
    678  "movq   2048(%ecx,%eax,8),%mm0\n"
    679  "mov    %ebx,%eax\n"
    680  "sar    $0x11,%eax\n"
    681  "movzbl (%esi,%eax,1),%eax\n"
    682  "paddsw 4096(%ecx,%eax,8),%mm0\n"
    683  "mov    %ebx,%eax\n"
    684  "add    0x38(%esp),%ebx\n"  // position += source_dx
    685  "sar    $0x10,%eax\n"  // Y index = position >> 16
    686  "movzbl (%edx,%eax,1),%eax\n"
    687  "movq   0(%ecx,%eax,8),%mm1\n"
    688  "mov    %ebx,%eax\n"
    689  "add    0x38(%esp),%ebx\n"
    690  "sar    $0x10,%eax\n"
    691  "movzbl (%edx,%eax,1),%eax\n"
    692  "movq   0(%ecx,%eax,8),%mm2\n"
    693  "paddsw %mm0,%mm1\n"
    694  "paddsw %mm0,%mm2\n"
    695  "psraw  $0x6,%mm1\n"
    696  "psraw  $0x6,%mm2\n"
    697  "packuswb %mm2,%mm1\n"
    698  "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two pixels (8 bytes)
    699  "add    $0x8,%ebp\n"
    700 "1:"
    701  "subl   $0x2,0x34(%esp)\n"  // width is decremented directly in its stack slot
    702  "jns    0b\n"
    703 
    704  "andl   $0x1,0x34(%esp)\n"  // low bit of the remaining count selects the odd-pixel tail
    705  "je     2f\n"
    706 
    707  "mov    %ebx,%eax\n"
    708  "sar    $0x11,%eax\n"
    709  "movzbl (%edi,%eax,1),%eax\n"
    710  "movq   2048(%ecx,%eax,8),%mm0\n"
    711  "mov    %ebx,%eax\n"
    712  "sar    $0x11,%eax\n"
    713  "movzbl (%esi,%eax,1),%eax\n"
    714  "paddsw 4096(%ecx,%eax,8),%mm0\n"
    715  "mov    %ebx,%eax\n"
    716  "sar    $0x10,%eax\n"
    717  "movzbl (%edx,%eax,1),%eax\n"
    718  "movq   0(%ecx,%eax,8),%mm1\n"
    719  "paddsw %mm0,%mm1\n"
    720  "psraw  $0x6,%mm1\n"
    721  "packuswb %mm1,%mm1\n"
    722  "movd   %mm1,0x0(%ebp)\n"  // store the last pixel (4 bytes)
    723 
    724 "2:"
    725  "popa\n"
    726  "ret\n"
    727 #if !defined(XP_MACOSX)
    728  ".previous\n"
    729 #endif
    730 );
    731 
    // Entry point for scaled row conversion (32-bit x86, PIC build).
    // Calls PICScaleYUVToRGB32Row_SSE, passing the address of the first element
    // of kCoefficientsRgbY, when mozilla::supports_sse() returns true; otherwise
    // calls ScaleYUVToRGB32Row_C.
    732 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
    733                        const uint8_t* u_buf,
    734                        const uint8_t* v_buf,
    735                        uint8_t* rgb_buf,
    736                        int width,
    737                        int source_dx)
    738 {
    739  if (mozilla::supports_sse()) {
    740    PICScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx,
    741                              &kCoefficientsRgbY[0][0]);
    742    return;
    743  }
    744 
    745  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
    746 }
    747 
    // Scaled YUV -> 32-bit pixel row conversion that blends each source sample
    // with its right-hand neighbour, for 32-bit x86 PIC builds. The coefficient
    // table is passed in as the last argument. The function body is the
    // file-scope asm block that follows the declaration; no .global directive is
    // emitted for its label, and the label gets a leading underscore when
    // XP_MACOSX is defined.
    //
    // After pusha the stack arguments start at 0x24(%esp) (declaration order,
    // assuming the cdecl stack layout): y_buf -> edx, rgb_buf -> ebp, table
    // pointer (0x3c) -> edi. esi is scratch, so u_buf and v_buf are reloaded from
    // 0x28(%esp) and 0x2c(%esp) on every pass. The width slot at 0x34(%esp) is
    // overwritten with width * source_dx and then serves as the end bound for the
    // fixed-point source position held in ebx. The loop emits two pixels per
    // pass; when the bound is reached after the first pixel of a pass, that
    // single pixel is written at label 2.
    //
    // NOTE(review): MMX registers are used and no emms is executed before ret;
    // presumably the caller is expected to issue it - confirm. The blend loads
    // byte [i + 1], so the source rows presumably need one readable byte past the
    // last sample - confirm.
    748 void PICLinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
    749                                     const uint8_t* u_buf,
    750                                     const uint8_t* v_buf,
    751                                     uint8_t* rgb_buf,
    752                                     int width,
    753                                     int source_dx,
    754                                     const int16_t *kCoefficientsRgbY);
    755 
    756  asm(
    757  ".text\n"
    758 #if defined(XP_MACOSX)
    759 "_PICLinearScaleYUVToRGB32Row_SSE:\n"
    760 #else
    761 "PICLinearScaleYUVToRGB32Row_SSE:\n"
    762 #endif
    763  "pusha\n"  // saves all general registers; arguments now start at 0x24(%esp)
    764  "mov    0x24(%esp),%edx\n"  // y_buf
    765  "mov    0x30(%esp),%ebp\n"  // rgb_buf
    766  "mov    0x34(%esp),%ecx\n"  // width (ecx is loaded again below before use)
    767  "mov    0x3c(%esp),%edi\n"  // coefficient table base
    768  "xor    %ebx,%ebx\n"  // ebx is cleared again below
    769 
    770  // source_width = width * source_dx, written back over the width stack slot (the initial x in ebx is not added here)
    771  "mov    0x34(%esp), %ecx\n"
    772  "imull  0x38(%esp), %ecx\n"
    773  "mov    %ecx, 0x34(%esp)\n"
    774 
    775  "mov    0x38(%esp), %ecx\n"
    776  "xor    %ebx,%ebx\n"     // x = 0
    777  "cmp    $0x20000,%ecx\n" // if source_dx >= 2.0
    778  "jl     1f\n"
    779  "mov    $0x8000,%ebx\n"  // x = 0.5 for 1/2 or less
    780  "jmp    1f\n"
    781 
    782 "0:"
    783  "mov    0x28(%esp),%esi\n"  // reload u_buf (esi is used as scratch)
    784  "mov    %ebx,%eax\n"
    785  "sar    $0x11,%eax\n"  // U/V index = x >> 17
    786 
    787  "movzbl (%esi,%eax,1),%ecx\n"
    788  "movzbl 1(%esi,%eax,1),%esi\n"
    789  "mov    %ebx,%eax\n"
    790  "andl   $0x1fffe, %eax \n"  // weight of the right-hand U sample
    791  "imul   %eax, %esi \n"
    792  "xorl   $0x1fffe, %eax \n"  // complementary weight for the left-hand sample
    793  "imul   %eax, %ecx \n"
    794  "addl   %esi, %ecx \n"
    795  "shrl   $17, %ecx \n"
    796  "movq   2048(%edi,%ecx,8),%mm0\n"
    797 
    798  "mov    0x2c(%esp),%esi\n"  // reload v_buf
    799  "mov    %ebx,%eax\n"
    800  "sar    $0x11,%eax\n"
    801 
    802  "movzbl (%esi,%eax,1),%ecx\n"
    803  "movzbl 1(%esi,%eax,1),%esi\n"
    804  "mov    %ebx,%eax\n"
    805  "andl   $0x1fffe, %eax \n"
    806  "imul   %eax, %esi \n"
    807  "xorl   $0x1fffe, %eax \n"
    808  "imul   %eax, %ecx \n"
    809  "addl   %esi, %ecx \n"
    810  "shrl   $17, %ecx \n"
    811  "paddsw 4096(%edi,%ecx,8),%mm0\n"  // mm0 = U entry + V entry
    812 
    813  "mov    %ebx,%eax\n"
    814  "sar    $0x10,%eax\n"  // Y index = x >> 16
    815  "movzbl (%edx,%eax,1),%ecx\n"
    816  "movzbl 1(%edx,%eax,1),%esi\n"
    817  "mov    %ebx,%eax\n"
    818  "add    0x38(%esp),%ebx\n"  // x += source_dx
    819  "andl   $0xffff, %eax \n"  // weight of the right-hand Y sample
    820  "imul   %eax, %esi \n"
    821  "xorl   $0xffff, %eax \n"  // complementary weight for the left-hand sample
    822  "imul   %eax, %ecx \n"
    823  "addl   %esi, %ecx \n"
    824  "shrl   $16, %ecx \n"
    825  "movq   (%edi,%ecx,8),%mm1\n"
    826 
    827  "cmp    0x34(%esp), %ebx\n"
    828  "jge    2f\n"  // x reached source_width: finish this single pixel at 2
    829 
    830  "mov    %ebx,%eax\n"
    831  "sar    $0x10,%eax\n"
    832  "movzbl (%edx,%eax,1),%ecx\n"
    833  "movzbl 1(%edx,%eax,1),%esi\n"
    834  "mov    %ebx,%eax\n"
    835  "add    0x38(%esp),%ebx\n"
    836  "andl   $0xffff, %eax \n"
    837  "imul   %eax, %esi \n"
    838  "xorl   $0xffff, %eax \n"
    839  "imul   %eax, %ecx \n"
    840  "addl   %esi, %ecx \n"
    841  "shrl   $16, %ecx \n"
    842  "movq   (%edi,%ecx,8),%mm2\n"
    843 
    844  "paddsw %mm0,%mm1\n"
    845  "paddsw %mm0,%mm2\n"
    846  "psraw  $0x6,%mm1\n"
    847  "psraw  $0x6,%mm2\n"
    848  "packuswb %mm2,%mm1\n"
    849  "movntq %mm1,0x0(%ebp)\n"  // non-temporal store of two pixels (8 bytes)
    850  "add    $0x8,%ebp\n"
    851 
    852 "1:"
    853  "cmp    %ebx, 0x34(%esp)\n"
    854  "jg     0b\n"  // continue while source_width > x
    855  "popa\n"
    856  "ret\n"
    857 
    858 "2:"
    859  "paddsw %mm0, %mm1\n"
    860  "psraw $6, %mm1\n"
    861  "packuswb %mm1, %mm1\n"
    862  "movd %mm1, (%ebp)\n"  // store the last pixel (4 bytes)
    863  "popa\n"
    864  "ret\n"
    865 #if !defined(XP_MACOSX)
    866  ".previous\n"
    867 #endif
    868 );
    869 
    870 
    // Entry point for blended (linear) scaled row conversion (32-bit x86, PIC
    // build). Calls PICLinearScaleYUVToRGB32Row_SSE, passing the address of the
    // first element of kCoefficientsRgbY, when mozilla::supports_sse() returns
    // true; otherwise calls LinearScaleYUVToRGB32Row_C.
    871 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
    872                              const uint8_t* u_buf,
    873                              const uint8_t* v_buf,
    874                              uint8_t* rgb_buf,
    875                              int width,
    876                              int source_dx)
    877 {
    878  if (mozilla::supports_sse()) {
    879    PICLinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
    880                                    source_dx, &kCoefficientsRgbY[0][0]);
    881    return;
    882  }
    883 
    884  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
    885 }
    886 #else
    // Entry point for unscaled row conversion when none of the assembly
    // configurations above is selected: forwards to FastConvertYUVToRGB32Row_C.
    // NOTE(review): the meaning of the trailing 1 is defined by the declaration
    // of FastConvertYUVToRGB32Row_C, which is not visible in this file.
    887 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
    888                              const uint8_t* u_buf,
    889                              const uint8_t* v_buf,
    890                              uint8_t* rgb_buf,
    891                              int width) {
    892  FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
    893 }
    894 
    // Entry point for scaled row conversion when none of the assembly
    // configurations above is selected: forwards all arguments unchanged to
    // ScaleYUVToRGB32Row_C.
    895 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
    896                        const uint8_t* u_buf,
    897                        const uint8_t* v_buf,
    898                        uint8_t* rgb_buf,
    899                        int width,
    900                        int source_dx) {
    901  ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
    902 }
    903 
    // Entry point for blended (linear) scaled row conversion when none of the
    // assembly configurations above is selected: forwards all arguments unchanged
    // to LinearScaleYUVToRGB32Row_C.
    904 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
    905                              const uint8_t* u_buf,
    906                              const uint8_t* v_buf,
    907                              uint8_t* rgb_buf,
    908                              int width,
    909                              int source_dx) {
    910  LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
    911 }
    912 #endif
    913 
    914 }