tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

yuv_row_win.cpp (14435B)


      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 #include "yuv_row.h"
      6 #include "mozilla/SSE.h"
      7 
      8 #define kCoefficientsRgbU kCoefficientsRgbY + 2048
      9 #define kCoefficientsRgbV kCoefficientsRgbY + 4096
        // The two macros above address the U and V lookup tables as byte offsets
        // (2048 and 4096) from kCoefficientsRgbY. The assembly in this file indexes
        // every table with a zero-extended sample byte scaled by 8, so one table
        // spans 256 * 8 = 2048 bytes.
        // NOTE(review): the expansions are unparenthesized; in this file they are
        // only used inside asm address expressions such as
        // [kCoefficientsRgbU + 8 * eax], where that is harmless.
     10 
     11 extern "C" {
     12 
     13 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
     14 #if defined(__clang__)
     15 // clang-cl has a bug where it does not mangle names referenced from inline
     16 // asm, so the leading underscore is added here by the preprocessor (ugh).
     17 // The extern below is only a dummy declaration to satisfy the parser.
     18 extern void* _kCoefficientsRgbY;
     19 #define kCoefficientsRgbY _kCoefficientsRgbY
     20 #endif
     21 
     22 __declspec(naked)
        // Converts one row of YUV samples to 32-bit pixels, two pixels per loop
        // iteration. U and V advance one byte per pixel pair, Y one byte per pixel.
        //
        // Each sample byte selects an 8-byte entry in the kCoefficientsRgb{Y,U,V}
        // tables. The entries are added as four saturating signed 16-bit lanes
        // (paddsw), shifted right arithmetically by 6 and packed to unsigned bytes
        // with saturation (packuswb).
        //
        //   y_buf   - Y samples; one byte is read per output pixel.
        //   u_buf   - U samples; one byte is read per pair of output pixels.
        //   v_buf   - V samples; one byte is read per pair of output pixels.
        //   rgb_buf - destination; 4 bytes are written per output pixel.
        //   width   - number of output pixels; an odd final pixel is converted
        //             after the loop.
        //
        // The function is naked: it saves and restores the registers itself
        // (pushad/popad) and reads its arguments directly from the stack.
        // NOTE(review): mm0-mm2 are used and no emms is executed here; presumably
        // the caller is responsible for clearing MMX state - confirm.
     23 void FastConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
     24                                  const uint8_t* u_buf,
     25                                  const uint8_t* v_buf,
     26                                  uint8_t* rgb_buf,
     27                                  int width) {
     28  __asm {
     29    pushad                          // saves 8 registers = 32 bytes of stack
           // Arguments: +32 skips the pushad frame, +4 skips the return address.
     30    mov       edx, [esp + 32 + 4]   // Y
     31    mov       edi, [esp + 32 + 8]   // U
     32    mov       esi, [esp + 32 + 12]  // V
     33    mov       ebp, [esp + 32 + 16]  // rgb
     34    mov       ecx, [esp + 32 + 20]  // width
     35    jmp       convertend            // test the count before the first pass
     36 
     37 convertloop :
     38    movzx     eax, byte ptr [edi]   // eax = U sample
     39    add       edi, 1
     40    movzx     ebx, byte ptr [esi]   // ebx = V sample
     41    add       esi, 1
     42    movq      mm0, [kCoefficientsRgbU + 8 * eax]
     43    movzx     eax, byte ptr [edx]   // eax = first Y sample of the pair
     44    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U + V terms
     45    movzx     ebx, byte ptr [edx + 1]  // ebx = second Y sample of the pair
     46    movq      mm1, [kCoefficientsRgbY + 8 * eax]
     47    add       edx, 2
     48    movq      mm2, [kCoefficientsRgbY + 8 * ebx]
     49    paddsw    mm1, mm0              // first pixel  = Y0 + (U + V)
     50    paddsw    mm2, mm0              // second pixel = Y1 + (U + V)
     51    psraw     mm1, 6
     52    psraw     mm2, 6
     53    packuswb  mm1, mm2              // both pixels as 8 saturated bytes
     54    movntq    [ebp], mm1            // non-temporal 8-byte store
     55    add       ebp, 8
     56 convertend :
     57    sub       ecx, 2
     58    jns       convertloop           // loop while a full pair was available
     59 
           // For a non-negative width ecx is now -2 (even) or -1 (odd).
     60    and       ecx, 1  // odd number of pixels?
     61    jz        convertdone
     62 
           // Tail: convert the single remaining pixel and store 4 bytes.
     63    movzx     eax, byte ptr [edi]
     64    movq      mm0, [kCoefficientsRgbU + 8 * eax]
     65    movzx     eax, byte ptr [esi]
     66    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
     67    movzx     eax, byte ptr [edx]
     68    movq      mm1, [kCoefficientsRgbY + 8 * eax]
     69    paddsw    mm1, mm0
     70    psraw     mm1, 6
     71    packuswb  mm1, mm1
     72    movd      [ebp], mm1
     73 convertdone :
     74 
     75    popad
     76    ret
     77  }
     78 }
     79 
     80 __declspec(naked)
        // Converts one row of YUV samples to 32-bit pixels like the unit-stride
        // converter, but advances the source pointers by `step` bytes instead of 1.
        //
        // Per loop iteration (two output pixels): one U and one V byte are read and
        // each of those pointers advances by `step`; two Y bytes are read and the Y
        // pointer advances by `step` after each of them.
        //
        //   y_buf   - Y samples.
        //   u_buf   - U samples.
        //   v_buf   - V samples.
        //   rgb_buf - destination; 4 bytes are written per output pixel.
        //   width   - number of output pixels; an odd final pixel is converted
        //             after the loop.
        //   step    - byte increment applied to the Y, U and V pointers.
        //
        // Naked function: registers are saved/restored with pushad/popad and the
        // arguments are read directly from the stack.
        // NOTE(review): MMX registers are used and no emms is executed here;
        // presumably the caller clears MMX state - confirm.
     81 void ConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
     82                              const uint8_t* u_buf,
     83                              const uint8_t* v_buf,
     84                              uint8_t* rgb_buf,
     85                              int width,
     86                              int step) {
     87  __asm {
     88    pushad                          // saves 8 registers = 32 bytes of stack
     89    mov       edx, [esp + 32 + 4]   // Y
     90    mov       edi, [esp + 32 + 8]   // U
     91    mov       esi, [esp + 32 + 12]  // V
     92    mov       ebp, [esp + 32 + 16]  // rgb
     93    mov       ecx, [esp + 32 + 20]  // width
     94    mov       ebx, [esp + 32 + 24]  // step
     95    jmp       wend                  // test the count before the first pass
     96 
     97 wloop :
     98    movzx     eax, byte ptr [edi]   // U sample
     99    add       edi, ebx
    100    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    101    movzx     eax, byte ptr [esi]   // V sample
    102    add       esi, ebx
    103    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V terms
    104    movzx     eax, byte ptr [edx]   // first Y sample
    105    add       edx, ebx
    106    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    107    movzx     eax, byte ptr [edx]   // second Y sample
    108    add       edx, ebx
    109    movq      mm2, [kCoefficientsRgbY + 8 * eax]
    110    paddsw    mm1, mm0
    111    paddsw    mm2, mm0
    112    psraw     mm1, 6
    113    psraw     mm2, 6
    114    packuswb  mm1, mm2              // both pixels as 8 saturated bytes
    115    movntq    [ebp], mm1            // non-temporal 8-byte store
    116    add       ebp, 8
    117 wend :
    118    sub       ecx, 2
    119    jns       wloop                 // loop while a full pair was available
    120 
    121    and       ecx, 1  // odd number of pixels?
    122    jz        wdone
    123 
           // Tail: convert the single remaining pixel and store 4 bytes.
    124    movzx     eax, byte ptr [edi]
    125    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    126    movzx     eax, byte ptr [esi]
    127    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    128    movzx     eax, byte ptr [edx]
    129    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    130    paddsw    mm1, mm0
    131    psraw     mm1, 6
    132    packuswb  mm1, mm1
    133    movd      [ebp], mm1
    134 wdone :
    135 
    136    popad
    137    ret
    138  }
    139 }
    140 
    141 __declspec(naked)
        // Converts one row of YUV samples to 32-bit pixels using separate byte
        // strides for the luma and chroma pointers.
        //
        // Per loop iteration (two output pixels): one U and one V byte are read and
        // both pointers advance by `uvstep`; two Y bytes are read and the Y pointer
        // advances by `ystep` after each of them.
        //
        //   y_buf   - Y samples.
        //   u_buf   - U samples.
        //   v_buf   - V samples.
        //   rgb_buf - destination; 4 bytes are written per output pixel.
        //   width   - number of output pixels; an odd final pixel is converted
        //             after the loop.
        //   ystep   - byte increment applied to the Y pointer.
        //   uvstep  - byte increment applied to the U and V pointers.
        //
        // Naked function: registers are saved/restored with pushad/popad and the
        // arguments are read directly from the stack.
        // NOTE(review): MMX registers are used and no emms is executed here;
        // presumably the caller clears MMX state - confirm.
    142 void RotateConvertYUVToRGB32Row_SSE(const uint8_t* y_buf,
    143                                    const uint8_t* u_buf,
    144                                    const uint8_t* v_buf,
    145                                    uint8_t* rgb_buf,
    146                                    int width,
    147                                    int ystep,
    148                                    int uvstep) {
    149  __asm {
    150    pushad                          // saves 8 registers = 32 bytes of stack
    151    mov       edx, [esp + 32 + 4]   // Y
    152    mov       edi, [esp + 32 + 8]   // U
    153    mov       esi, [esp + 32 + 12]  // V
    154    mov       ebp, [esp + 32 + 16]  // rgb
    155    mov       ecx, [esp + 32 + 20]  // width
    156    jmp       wend                  // test the count before the first pass
    157 
    158 wloop :
           // ebx holds whichever stride is currently needed; both strides are
           // reloaded from the stack on every iteration.
    159    movzx     eax, byte ptr [edi]   // U sample
    160    mov       ebx, [esp + 32 + 28]  // uvstep
    161    add       edi, ebx
    162    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    163    movzx     eax, byte ptr [esi]   // V sample
    164    add       esi, ebx
    165    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V terms
    166    movzx     eax, byte ptr [edx]   // first Y sample
    167    mov       ebx, [esp + 32 + 24]  // ystep
    168    add       edx, ebx
    169    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    170    movzx     eax, byte ptr [edx]   // second Y sample
    171    add       edx, ebx
    172    movq      mm2, [kCoefficientsRgbY + 8 * eax]
    173    paddsw    mm1, mm0
    174    paddsw    mm2, mm0
    175    psraw     mm1, 6
    176    psraw     mm2, 6
    177    packuswb  mm1, mm2              // both pixels as 8 saturated bytes
    178    movntq    [ebp], mm1            // non-temporal 8-byte store
    179    add       ebp, 8
    180 wend :
    181    sub       ecx, 2
    182    jns       wloop                 // loop while a full pair was available
    183 
    184    and       ecx, 1  // odd number of pixels?
    185    jz        wdone
    186 
           // Tail: convert the single remaining pixel and store 4 bytes.
    187    movzx     eax, byte ptr [edi]
    188    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    189    movzx     eax, byte ptr [esi]
    190    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    191    movzx     eax, byte ptr [edx]
    192    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    193    paddsw    mm1, mm0
    194    psraw     mm1, 6
    195    packuswb  mm1, mm1
    196    movd      [ebp], mm1
    197 wdone :
    198 
    199    popad
    200    ret
    201  }
    202 }
    203 
    204 __declspec(naked)
        // Converts one row of YUV samples to 32-bit pixels, writing every converted
        // pixel twice in a row (horizontal doubling).
        //
        // Per loop iteration: one U byte, one V byte and two Y bytes are read, and
        // 16 bytes (four output pixels) are written; `width` is decremented by 4,
        // so it counts output pixels.
        //
        //   y_buf   - Y samples; one byte is read per two output pixels.
        //   u_buf   - U samples; one byte is read per four output pixels.
        //   v_buf   - V samples; one byte is read per four output pixels.
        //   rgb_buf - destination; 4 bytes are written per output pixel.
        //   width   - number of output pixels. When it is not a multiple of 4, one
        //             more pixel is converted from the current Y/U/V position and
        //             that same pixel is stored for each remaining output position.
        //
        // Naked function: registers are saved/restored with pushad/popad and the
        // arguments are read directly from the stack.
        // NOTE(review): MMX registers are used and no emms is executed here;
        // presumably the caller clears MMX state - confirm.
    205 void DoubleYUVToRGB32Row_SSE(const uint8_t* y_buf,
    206                             const uint8_t* u_buf,
    207                             const uint8_t* v_buf,
    208                             uint8_t* rgb_buf,
    209                             int width) {
    210  __asm {
    211    pushad                          // saves 8 registers = 32 bytes of stack
    212    mov       edx, [esp + 32 + 4]   // Y
    213    mov       edi, [esp + 32 + 8]   // U
    214    mov       esi, [esp + 32 + 12]  // V
    215    mov       ebp, [esp + 32 + 16]  // rgb
    216    mov       ecx, [esp + 32 + 20]  // width
    217    jmp       wend                  // test the count before the first pass
    218 
    219 wloop :
    220    movzx     eax, byte ptr [edi]   // U sample
    221    add       edi, 1
    222    movzx     ebx, byte ptr [esi]   // V sample
    223    add       esi, 1
    224    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    225    movzx     eax, byte ptr [edx]   // first Y sample
    226    paddsw    mm0, [kCoefficientsRgbV + 8 * ebx]  // mm0 = U + V terms
    227    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    228    paddsw    mm1, mm0
    229    psraw     mm1, 6
    230    packuswb  mm1, mm1
    231    punpckldq mm1, mm1              // same 4-byte pixel in both halves
    232    movntq    [ebp], mm1            // first source pixel, written twice
    233 
    234    movzx     ebx, byte ptr [edx + 1]  // second Y sample
    235    add       edx, 2
    236    paddsw    mm0, [kCoefficientsRgbY + 8 * ebx]  // reuse mm0: U + V + Y1
    237    psraw     mm0, 6
    238    packuswb  mm0, mm0
    239    punpckldq mm0, mm0              // same 4-byte pixel in both halves
    240    movntq    [ebp+8], mm0          // second source pixel, written twice
    241    add       ebp, 16
    242 wend :
    243    sub       ecx, 4
    244    jns       wloop                 // loop while four output pixels remained
    245 
    246    add       ecx, 4                // ecx = output pixels still to write
    247    jz        wdone
    248 
           // Tail: convert one pixel, then store it ecx times.
    249    movzx     eax, byte ptr [edi]
    250    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    251    movzx     eax, byte ptr [esi]
    252    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    253    movzx     eax, byte ptr [edx]
    254    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    255    paddsw    mm1, mm0
    256    psraw     mm1, 6
    257    packuswb  mm1, mm1
    258    jmp       wend1
    259 
    260 wloop1 :
    261    movd      [ebp], mm1            // store one 4-byte copy of the pixel
    262    add       ebp, 4
    263 wend1 :
    264    sub       ecx, 1
    265    jns       wloop1
    266 wdone :
    267    popad
    268    ret
    269  }
    270 }
    271 
    272 // This version does general purpose scaling by any amount, up or down.
    273 // The only thing it cannot do is rotation by 90 or 270.
    274 // For performance the chroma is under-sampled, reducing cost of a 3x
    275 // 1080p scale from 8.4 ms to 5.4 ms.
        //
        // The source position x starts at 0 and grows by source_dx per output
        // pixel. The Y sample is read at index (x >> 16), so x is a fixed-point
        // value with 16 fractional bits; no interpolation is done. U and V are read
        // once per pair of output pixels at index (x >> 17), using the x of the
        // first pixel of the pair.
        //
        //   y_buf     - Y samples.
        //   u_buf     - U samples.
        //   v_buf     - V samples.
        //   rgb_buf   - destination; 4 bytes are written per output pixel.
        //   width     - number of output pixels; an odd final pixel is converted
        //               after the loop.
        //   source_dx - per-pixel increment of x (16 fractional bits).
        //
        // Naked function: registers are saved/restored with pushad/popad and the
        // arguments are read directly from the stack.
        // NOTE(review): MMX registers are used and no emms is executed here;
        // presumably the caller clears MMX state - confirm.
    276 __declspec(naked)
    277 void ScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
    278                            const uint8_t* u_buf,
    279                            const uint8_t* v_buf,
    280                            uint8_t* rgb_buf,
    281                            int width,
    282                            int source_dx) {
    283  __asm {
    284    pushad                          // saves 8 registers = 32 bytes of stack
    285    mov       edx, [esp + 32 + 4]   // Y
    286    mov       edi, [esp + 32 + 8]   // U
    287    mov       esi, [esp + 32 + 12]  // V
    288    mov       ebp, [esp + 32 + 16]  // rgb
    289    mov       ecx, [esp + 32 + 20]  // width
    290    xor       ebx, ebx              // x
    291    jmp       scaleend              // test the count before the first pass
    292 
    293 scaleloop :
    294    mov       eax, ebx
    295    sar       eax, 17               // chroma index = x >> 17
    296    movzx     eax, byte ptr [edi + eax]
    297    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    298    mov       eax, ebx
    299    sar       eax, 17
    300    movzx     eax, byte ptr [esi + eax]
    301    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]  // mm0 = U + V terms
    302    mov       eax, ebx
    303    add       ebx, [esp + 32 + 24]  // x += source_dx
    304    sar       eax, 16               // luma index = x >> 16
    305    movzx     eax, byte ptr [edx + eax]
    306    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    307    mov       eax, ebx
    308    add       ebx, [esp + 32 + 24]  // x += source_dx
    309    sar       eax, 16
    310    movzx     eax, byte ptr [edx + eax]
    311    movq      mm2, [kCoefficientsRgbY + 8 * eax]
    312    paddsw    mm1, mm0
    313    paddsw    mm2, mm0
    314    psraw     mm1, 6
    315    psraw     mm2, 6
    316    packuswb  mm1, mm2              // both pixels as 8 saturated bytes
    317    movntq    [ebp], mm1            // non-temporal 8-byte store
    318    add       ebp, 8
    319 scaleend :
    320    sub       ecx, 2
    321    jns       scaleloop             // loop while a full pair was available
    322 
    323    and       ecx, 1  // odd number of pixels?
    324    jz        scaledone
    325 
           // Tail: convert the single remaining pixel at the current x.
    326    mov       eax, ebx
    327    sar       eax, 17
    328    movzx     eax, byte ptr [edi + eax]
    329    movq      mm0, [kCoefficientsRgbU + 8 * eax]
    330    mov       eax, ebx
    331    sar       eax, 17
    332    movzx     eax, byte ptr [esi + eax]
    333    paddsw    mm0, [kCoefficientsRgbV + 8 * eax]
    334    mov       eax, ebx
    335    sar       eax, 16
    336    movzx     eax, byte ptr [edx + eax]
    337    movq      mm1, [kCoefficientsRgbY + 8 * eax]
    338    paddsw    mm1, mm0
    339    psraw     mm1, 6
    340    packuswb  mm1, mm1
    341    movd      [ebp], mm1
    342 
    343 scaledone :
    344    popad
    345    ret
    346  }
    347 }
    348 
    349 __declspec(naked)
        // Scales one row of YUV samples to 32-bit pixels with linear interpolation
        // between neighbouring source samples.
        //
        // The source position x is a fixed-point value with 16 fractional bits that
        // grows by source_dx per output pixel. The loop runs while
        // x < width * source_dx; that product is written back over the `width`
        // argument slot on the stack and used as the end marker.
        //
        // For each plane the sample at the integer index and the sample at
        // index + 1 are blended: the fraction weights the index + 1 sample and
        // (fraction XOR mask) weights the index sample. Y uses index x >> 16 with
        // a 16-bit fraction; U and V use index x >> 17 with fraction x & 0x1fffe,
        // and are computed once per pair of output pixels.
        //
        //   y_buf     - Y samples.
        //   u_buf     - U samples.
        //   v_buf     - V samples (reloaded from the stack inside the loop).
        //   rgb_buf   - destination; 4 bytes are written per output pixel.
        //   width     - number of output pixels.
        //   source_dx - per-pixel increment of x (16 fractional bits).
        //
        // NOTE(review): every blend reads the byte at index + 1, so each plane must
        // be readable one sample past the last index used - confirm against callers.
        // NOTE(review): MMX registers are used and no emms is executed here;
        // presumably the caller clears MMX state - confirm.
    350 void LinearScaleYUVToRGB32Row_SSE(const uint8_t* y_buf,
    351                                  const uint8_t* u_buf,
    352                                  const uint8_t* v_buf,
    353                                  uint8_t* rgb_buf,
    354                                  int width,
    355                                  int source_dx) {
    356  __asm {
    357    pushad                         // saves 8 registers = 32 bytes of stack
    358    mov       edx, [esp + 32 + 4]  // Y
    359    mov       edi, [esp + 32 + 8]  // U
    360                // [esp + 32 + 12] // V (loaded into esi inside the loop)
    361    mov       ebp, [esp + 32 + 16] // rgb
    362    mov       ecx, [esp + 32 + 20] // width
    363    imul      ecx, [esp + 32 + 24] // source_dx
    364    mov       [esp + 32 + 20], ecx // source_width = width * source_dx
    365    mov       ecx, [esp + 32 + 24] // source_dx
    366    xor       ebx, ebx             // x = 0
    367    cmp       ecx, 0x20000         // source_dx < 2.0 keeps x = 0
    368    jl        lscaleend
    369    mov       ebx, 0x8000          // x = 0.5 for 1/2 or less
    370    jmp       lscaleend
    371 lscaleloop:
           // --- U: blend u_buf[x >> 17] with u_buf[(x >> 17) + 1] ---
    372    mov       eax, ebx
    373    sar       eax, 0x11
    374 
    375    movzx     ecx, byte ptr [edi + eax]      // sample at index
    376    movzx     esi, byte ptr [edi + eax + 1]  // sample at index + 1
    377    mov       eax, ebx
    378    and       eax, 0x1fffe         // fraction
    379    imul      esi, eax             // next sample * fraction
    380    xor       eax, 0x1fffe         // complementary weight
    381    imul      ecx, eax             // this sample * complementary weight
    382    add       ecx, esi
    383    shr       ecx, 17              // back to an 8-bit table index
    384    movq      mm0, [kCoefficientsRgbU + 8 * ecx]
    385 
           // --- V: same blend; esi was used as scratch, so reload the V pointer ---
    386    mov       esi, [esp + 32 + 12]
    387    mov       eax, ebx
    388    sar       eax, 0x11
    389 
    390    movzx     ecx, byte ptr [esi + eax]
    391    movzx     esi, byte ptr [esi + eax + 1]
    392    mov       eax, ebx
    393    and       eax, 0x1fffe
    394    imul      esi, eax
    395    xor       eax, 0x1fffe
    396    imul      ecx, eax
    397    add       ecx, esi
    398    shr       ecx, 17
    399    paddsw    mm0, [kCoefficientsRgbV + 8 * ecx]  // mm0 = U + V terms
    400 
           // --- Y for the first pixel of the pair: index x >> 16 ---
    401    mov       eax, ebx
    402    sar       eax, 0x10
    403    movzx     ecx, byte ptr [edx + eax]
    404    movzx     esi, byte ptr [1 + edx + eax]
    405    mov       eax, ebx
    406    add       ebx, [esp + 32 + 24] // x += source_dx
    407    and       eax, 0xffff          // fraction
    408    imul      esi, eax
    409    xor       eax, 0xffff          // complementary weight
    410    imul      ecx, eax
    411    add       ecx, esi
    412    shr       ecx, 16
    413    movq      mm1, [kCoefficientsRgbY + 8 * ecx]
    414 
           // If x has reached source_width, only this one pixel is left.
    415    cmp       ebx, [esp + 32 + 20]
    416    jge       lscalelastpixel
    417 
           // --- Y for the second pixel of the pair ---
    418    mov       eax, ebx
    419    sar       eax, 0x10
    420    movzx     ecx, byte ptr [edx + eax]
    421    movzx     esi, byte ptr [edx + eax + 1]
    422    mov       eax, ebx
    423    add       ebx, [esp + 32 + 24] // x += source_dx
    424    and       eax, 0xffff
    425    imul      esi, eax
    426    xor       eax, 0xffff
    427    imul      ecx, eax
    428    add       ecx, esi
    429    shr       ecx, 16
    430    movq      mm2, [kCoefficientsRgbY + 8 * ecx]
    431 
    432    paddsw    mm1, mm0
    433    paddsw    mm2, mm0
    434    psraw     mm1, 0x6
    435    psraw     mm2, 0x6
    436    packuswb  mm1, mm2             // both pixels as 8 saturated bytes
    437    movntq    [ebp], mm1           // non-temporal 8-byte store
    438    add       ebp, 0x8
    439 
    440 lscaleend:
    441    cmp       ebx, [esp + 32 + 20] // continue while x < source_width
    442    jl        lscaleloop
    443    popad
    444    ret
    445 
    446 lscalelastpixel:
           // Finish and store the single remaining pixel (4 bytes), then return.
    447    paddsw    mm1, mm0
    448    psraw     mm1, 6
    449    packuswb  mm1, mm1
    450    movd      [ebp], mm1
    451    popad
    452    ret
    453  };
    454 }
    455 #endif // if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
    456 
    457 void FastConvertYUVToRGB32Row(const uint8_t* y_buf,
    458                              const uint8_t* u_buf,
    459                              const uint8_t* v_buf,
    460                              uint8_t* rgb_buf,
    461                              int width) {
         // Row-conversion entry point. On 32-bit x86 builds that may support SSE
         // the assembly routine is used when mozilla::supports_sse() reports
         // support; in every other case the C routine is called (with a trailing
         // argument of 1). The `else` deliberately straddles the #endif so the C
         // call is written only once.
    462 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
    463  if (mozilla::supports_sse()) {
    464    FastConvertYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width);
    465  } else
    466 #endif
    467  {
    468    FastConvertYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, 1);
    469  }
    470 }
    471 
    472 void ScaleYUVToRGB32Row(const uint8_t* y_buf,
    473                        const uint8_t* u_buf,
    474                        const uint8_t* v_buf,
    475                        uint8_t* rgb_buf,
    476                        int width,
    477                        int source_dx) {
    478 
         // Scaling entry point. On 32-bit x86 builds that may support SSE the
         // assembly routine is used when mozilla::supports_sse() reports support;
         // in every other case the C routine is called with the same arguments.
         // The `else` deliberately straddles the #endif so the C call is written
         // only once.
    479 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
    480  if (mozilla::supports_sse()) {
    481    ScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
    482  } else
    483 #endif
    484  {
    485    ScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
    486  }
    487 }
    488 
    489 void LinearScaleYUVToRGB32Row(const uint8_t* y_buf,
    490                              const uint8_t* u_buf,
    491                              const uint8_t* v_buf,
    492                              uint8_t* rgb_buf,
    493                              int width,
    494                              int source_dx) {
         // Linear-interpolating scaling entry point. On 32-bit x86 builds that may
         // support SSE the assembly routine is used when mozilla::supports_sse()
         // reports support; in every other case the C routine is called with the
         // same arguments. The `else` deliberately straddles the #endif so the C
         // call is written only once.
    495 #if defined(MOZILLA_MAY_SUPPORT_SSE) && defined(_M_IX86)
    496  if (mozilla::supports_sse()) {
    497    LinearScaleYUVToRGB32Row_SSE(y_buf, u_buf, v_buf, rgb_buf, width,
    498                                 source_dx);
    499  } else
    500 #endif
    501  {
    502    LinearScaleYUVToRGB32Row_C(y_buf, u_buf, v_buf, rgb_buf, width, source_dx);
    503  }
    504 }
    505 
    506 } // extern "C"