tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jccolext-mmi.c (14239B)


      1 /*
      2 * Loongson MMI optimizations for libjpeg-turbo
      3 *
      4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
      6 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
      7 *                          All Rights Reserved.
      8 * Authors:  ZhuChen     <zhuchen@loongson.cn>
      9 *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
     10 *           CaiWanwei   <caiwanwei@loongson.cn>
     11 *           ZhangLixia  <zhanglixia-hf@loongson.cn>
     12 *
     13 * Based on the x86 SIMD extension for IJG JPEG library
     14 * Copyright (C) 1999-2006, MIYASAKA Masaru.
     15 *
     16 * This software is provided 'as-is', without any express or implied
     17 * warranty.  In no event will the authors be held liable for any damages
     18 * arising from the use of this software.
     19 *
     20 * Permission is granted to anyone to use this software for any purpose,
     21 * including commercial applications, and to alter it and redistribute it
     22 * freely, subject to the following restrictions:
     23 *
     24 * 1. The origin of this software must not be misrepresented; you must not
     25 *    claim that you wrote the original software. If you use this software
     26 *    in a product, an acknowledgment in the product documentation would be
     27 *    appreciated but is not required.
     28 * 2. Altered source versions must be plainly marked as such, and must not be
     29 *    misrepresented as being the original software.
     30 * 3. This notice may not be removed or altered from any source distribution.
     31 */
     32 
     33 /* This file is included by jccolor-mmi.c */
     34 /* mmA..mmH alias the __m64 working variables by component index: A/B = index 0, C/D = 1, E/F = 2,
     35  * G/H = 3.  RGB_RED/RGB_GREEN/RGB_BLUE come from outside this file (presumably byte offsets in a pixel). */
     36 #if RGB_RED == 0
     37 #define mmA  re
     38 #define mmB  ro
     39 #elif RGB_GREEN == 0
     40 #define mmA  ge
     41 #define mmB  go
     42 #elif RGB_BLUE == 0
     43 #define mmA  be
     44 #define mmB  bo
     45 #else  /* no colour component has index 0 */
     46 #define mmA  xe
     47 #define mmB  xo
     48 #endif
     49 /* Index 1 -> mmC (even-pixel variable) / mmD (odd-pixel variable). */
     50 #if RGB_RED == 1
     51 #define mmC  re
     52 #define mmD  ro
     53 #elif RGB_GREEN == 1
     54 #define mmC  ge
     55 #define mmD  go
     56 #elif RGB_BLUE == 1
     57 #define mmC  be
     58 #define mmD  bo
     59 #else  /* no colour component has index 1 */
     60 #define mmC  xe
     61 #define mmD  xo
     62 #endif
     63 /* Index 2 -> mmE (even-pixel variable) / mmF (odd-pixel variable). */
     64 #if RGB_RED == 2
     65 #define mmE  re
     66 #define mmF  ro
     67 #elif RGB_GREEN == 2
     68 #define mmE  ge
     69 #define mmF  go
     70 #elif RGB_BLUE == 2
     71 #define mmE  be
     72 #define mmF  bo
     73 #else  /* no colour component has index 2 */
     74 #define mmE  xe
     75 #define mmF  xo
     76 #endif
     77 /* Index 3 -> mmG / mmH; note that xo is only declared by the function when RGB_PIXELSIZE == 4. */
     78 #if RGB_RED == 3
     79 #define mmG  re
     80 #define mmH  ro
     81 #elif RGB_GREEN == 3
     82 #define mmG  ge
     83 #define mmH  go
     84 #elif RGB_BLUE == 3
     85 #define mmG  be
     86 #define mmH  bo
     87 #else  /* no colour component has index 3 */
     88 #define mmG  xe
     89 #define mmH  xo
     90 #endif
     91 /* Converts num_rows rows of interleaved RGB samples (one row pointer taken from input_buf per
     92  * row) into Y, Cb and Cr rows at output_buf[0..2][output_row...], 8 pixels per inner pass. */
     93 void jsimd_rgb_ycc_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
     94                               JSAMPIMAGE output_buf, JDIMENSION output_row,
     95                               int num_rows)
     96 {
     97  JSAMPROW inptr, outptr0, outptr1, outptr2;
     98  int num_cols, col;  /* num_cols: pixels left in the row; col: byte/pixel count handed to the tail asm */
     99  __m64 re, ro, ge, go, be, bo, xe;  /* "e" = even pixels (0,2,4,6), "o" = odd pixels (1,3,5,7) */
    100 #if RGB_PIXELSIZE == 4
    101  __m64 xo;
    102 #endif
    103  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
    104  __m64 ble, halfble, bhe, halfbhe, blo, halfblo, bho, halfbho;
    105  __m64 rle, halfrle, rhe, halfrhe, rlo, halfrlo, rho, halfrho;
    106  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
    107  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
    108  __m64 cble, cbhe, cbe, cblo, cbho, cbo, cb;
    109  __m64 crle, crhe, cre, crlo, crho, cro, cr;
    110 
    111  while (--num_rows >= 0) {  /* one pass per input row */
    112    inptr = *input_buf++;
    113    outptr0 = output_buf[0][output_row];
    114    outptr1 = output_buf[1][output_row];
    115    outptr2 = output_buf[2][output_row];
    116    output_row++;
    117    /* Each pass of the inner loop consumes up to 8 pixels and advances every output pointer by 8. */
    118    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
    119         outptr0 += 8, outptr1 += 8, outptr2 += 8) {
    120 
    121 #if RGB_PIXELSIZE == 3
    122      /* 3-byte pixels: the input bytes for this pass are gathered into mmA, mmG and mmF. */
    123      if (num_cols < 8) {  /* partial final group: gather only the bytes that exist */
    124        col = num_cols * 3;  /* number of input bytes left in this row */
    125        asm(".set noreorder\r\n"  /* delay slots are filled by hand: the nop after each branch */
    126            /* Bit 0 of the byte count ($9): load the last remaining byte into $12. */
    127            "li       $8, 1\r\n"
    128            "move     $9, %3\r\n"
    129            "and      $10, $9, $8\r\n"
    130            "beqz     $10, 1f\r\n"
    131            "nop      \r\n"
    132            "subu     $9, $9, 1\r\n"
    133            "xor      $12, $12, $12\r\n"  /* NOTE(review): $12 is zeroed only on this path; when bit 0 is clear its prior value is used below — looks confined to bytes past the row end, confirm */
    134            "move     $13, %5\r\n"
    135            PTR_ADDU  "$13, $13, $9\r\n"
    136            "lbu      $12, 0($13)\r\n"
    137            /* Bit 1: load the preceding halfword; $12 = ($12 << 16) | halfword. */
    138            "1:       \r\n"
    139            "li       $8, 2\r\n"
    140            "and      $10, $9, $8\r\n"
    141            "beqz     $10, 2f\r\n"
    142            "nop      \r\n"
    143            "subu     $9, $9, 2\r\n"
    144            "xor      $11, $11, $11\r\n"
    145            "move     $13, %5\r\n"
    146            PTR_ADDU  "$13, $13, $9\r\n"
    147            "lhu      $11, 0($13)\r\n"
    148            "sll      $12, $12, 16\r\n"
    149            "or       $12, $12, $11\r\n"
    150            /* Bit 2: load the preceding word; $12 = ($12 << 32) | word, copied into %0. */
    151            "2:       \r\n"
    152            "dmtc1    $12, %0\r\n"  /* %0 (mmA) = bytes gathered so far */
    153            "li       $8, 4\r\n"
    154            "and      $10, $9, $8\r\n"
    155            "beqz     $10, 3f\r\n"
    156            "nop      \r\n"
    157            "subu     $9, $9, 4\r\n"
    158            "move     $13, %5\r\n"
    159            PTR_ADDU  "$13, $13, $9\r\n"
    160            "lwu      $14, 0($13)\r\n"
    161            "dmtc1    $14, %1\r\n"
    162            "dsll32   $12, $12, 0\r\n"
    163            "or       $12, $12, $14\r\n"
    164            "dmtc1    $12, %0\r\n"
    165            /* Bit 3: the gathered tail moves from %0 to %1, %0 takes the first 8 bytes of the row, then skip to the end. */
    166            "3:       \r\n"
    167            "li       $8, 8\r\n"
    168            "and      $10, $9, $8\r\n"
    169            "beqz     $10, 4f\r\n"
    170            "nop      \r\n"
    171            "mov.s    %1, %0\r\n"
    172            "ldc1     %0, 0(%5)\r\n"
    173            "li       $9, 8\r\n"
    174            "j        5f\r\n"
    175            "nop      \r\n"
    176            /* Bit 4: the gathered tail moves from %0 to %2, %0 and %1 take the first 16 bytes of the row. */
    177            "4:       \r\n"
    178            "li       $8, 16\r\n"
    179            "and      $10, $9, $8\r\n"
    180            "beqz     $10, 5f\r\n"
    181            "nop      \r\n"
    182            "mov.s    %2, %0\r\n"
    183            "ldc1     %0, 0(%5)\r\n"
    184            "ldc1     %1, 8(%5)\r\n"
    185 
    186            "5:       \r\n"
    187            "nop      \r\n"
    188            ".set reorder\r\n"
    189            /* Operands: %0..%2 = outputs, %3 = col, %5 = inptr; %4 (num_rows) is not referenced by the template. */
    190            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
    191            : "r" (col), "r" (num_rows), "r" (inptr)
    192            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
    193              "$14", "memory"
    194           );
    195      } else {  /* full group of 8 pixels: three 8-byte loads */
    196        if (!(((long)inptr) & 7)) {  /* inptr is 8-byte aligned */
    197          mmA = _mm_load_si64((__m64 *)&inptr[0]);
    198          mmG = _mm_load_si64((__m64 *)&inptr[8]);
    199          mmF = _mm_load_si64((__m64 *)&inptr[16]);
    200        } else {
    201          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
    202          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
    203          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
    204        }
    205        inptr += RGB_PIXELSIZE * 8;  /* only the full-group path advances inptr */
    206      }
    207      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
    208      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
    209      /* These shift + byte-unpack steps regroup the packed bytes by component (net result: see the lane comment further down). */
    210      mmA = _mm_unpackhi_pi8(mmA, mmG);
    211      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
    212 
    213      mmD = _mm_unpacklo_pi8(mmD, mmF);
    214      mmG = _mm_unpackhi_pi8(mmG, mmF);
    215 
    216      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
    217      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
    218 
    219      mmA = _mm_unpackhi_pi8(mmA, mmD);
    220      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
    221 
    222      mmE = _mm_unpacklo_pi8(mmE, mmG);
    223      mmD = _mm_unpackhi_pi8(mmD, mmG);
    224      mmC = _mm_loadhi_pi8_f(mmA);  /* presumably widens four bytes to 16-bit lanes — confirm in the helper's definition */
    225      mmA = _mm_loadlo_pi8_f(mmA);
    226 
    227      mmB = _mm_loadhi_pi8_f(mmE);
    228      mmE = _mm_loadlo_pi8_f(mmE);
    229 
    230      mmF = _mm_loadhi_pi8_f(mmD);
    231      mmD = _mm_loadlo_pi8_f(mmD);
    232 
    233 #else  /* RGB_PIXELSIZE == 4 */
    234      /* 4-byte pixels: the input bytes for this pass are gathered into mmA, mmF, mmD and mmC. */
    235      if (num_cols < 8) {  /* partial final group: gather only the pixels that exist */
    236        col = num_cols;  /* number of pixels (4-byte units) left in this row */
    237        asm(".set noreorder\r\n"  /* delay slots are filled by hand: the nop after each branch */
    238            /* Bit 0 of the pixel count ($9): load the last pixel (4 bytes) into %0. */
    239            "li       $8, 1\r\n"
    240            "move     $9, %4\r\n"
    241            "and      $10, $9, $8\r\n"
    242            "beqz     $10, 1f\r\n"
    243            "nop      \r\n"
    244            "subu     $9, $9, 1\r\n"
    245            PTR_SLL   "$11, $9, 2\r\n"
    246            "move     $13, %5\r\n"
    247            PTR_ADDU  "$13, $13, $11\r\n"
    248            "lwc1     %0, 0($13)\r\n"
    249            /* Bit 1: move %0 to %1 and load the preceding two pixels (8 bytes) into %0. */
    250            "1:       \r\n"
    251            "li       $8, 2\r\n"
    252            "and      $10, $9, $8\r\n"
    253            "beqz     $10, 2f\r\n"
    254            "nop      \r\n"
    255            "subu     $9, $9, 2\r\n"
    256            PTR_SLL   "$11, $9, 2\r\n"
    257            "move     $13, %5\r\n"
    258            PTR_ADDU  "$13, $13, $11\r\n"
    259            "mov.s    %1, %0\r\n"
    260            "ldc1     %0, 0($13)\r\n"
    261            /* Bit 2: move %0/%1 to %2/%3 and load the first four pixels (16 bytes) into %0/%1. */
    262            "2:       \r\n"
    263            "li       $8, 4\r\n"
    264            "and      $10, $9, $8\r\n"
    265            "beqz     $10, 3f\r\n"
    266            "nop      \r\n"
    267            "mov.s    %2, %0\r\n"
    268            "mov.s    %3, %1\r\n"
    269            "ldc1     %0, 0(%5)\r\n"
    270            "ldc1     %1, 8(%5)\r\n"
    271 
    272            "3:       \r\n"
    273            "nop      \r\n"
    274            ".set reorder\r\n"
    275            /* Operands: %0..%3 = outputs, %4 = col, %5 = inptr. */
    276            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
    277            : "r" (col), "r" (inptr)
    278            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
    279           );
    280      } else {  /* full group of 8 pixels: four 8-byte loads */
    281        if (!(((long)inptr) & 7)) {  /* inptr is 8-byte aligned */
    282          mmA = _mm_load_si64((__m64 *)&inptr[0]);
    283          mmF = _mm_load_si64((__m64 *)&inptr[8]);
    284          mmD = _mm_load_si64((__m64 *)&inptr[16]);
    285          mmC = _mm_load_si64((__m64 *)&inptr[24]);
    286        } else {
    287          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
    288          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
    289          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
    290          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
    291        }
    292        inptr += RGB_PIXELSIZE * 8;  /* only the full-group path advances inptr */
    293      }
    294      mmB = _mm_unpackhi_pi8(mmA, mmF);  /* byte unpacks, then 16-bit unpacks, regroup the packed bytes by component */
    295      mmA = _mm_unpacklo_pi8(mmA, mmF);
    296 
    297      mmG = _mm_unpackhi_pi8(mmD, mmC);
    298      mmD = _mm_unpacklo_pi8(mmD, mmC);
    299 
    300      mmE = _mm_unpackhi_pi16(mmA, mmD);
    301      mmA = _mm_unpacklo_pi16(mmA, mmD);
    302 
    303      mmH = _mm_unpackhi_pi16(mmB, mmG);
    304      mmB = _mm_unpacklo_pi16(mmB, mmG);
    305      /* Widen to 16-bit lanes (presumably what the _mm_load{lo,hi}_pi8_f helpers do — confirm in their definition). */
    306      mmC = _mm_loadhi_pi8_f(mmA);
    307      mmA = _mm_loadlo_pi8_f(mmA);
    308 
    309      mmD = _mm_loadhi_pi8_f(mmB);
    310      mmB = _mm_loadlo_pi8_f(mmB);
    311 
    312      mmG = _mm_loadhi_pi8_f(mmE);
    313      mmE = _mm_loadlo_pi8_f(mmE);
    314      /* mmH is split into mmF/mmH by duplicating each byte into a 16-bit lane and shifting right by BYTE_BIT. */
    315      mmF = _mm_unpacklo_pi8(mmH, mmH);
    316      mmH = _mm_unpackhi_pi8(mmH, mmH);
    317      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
    318      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
    319 
    320 #endif
    321 
    322      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
    323       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
    324       *
    325       * (Original)
    326       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    327       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    328       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    329       *
    330       * (This implementation)
    331       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    332       * Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + CENTERJSAMPLE
    333       * Cr =  0.50000 * R - 0.41869 * G - 0.08131 * B + CENTERJSAMPLE
    334       */
    335      /* Odd pixels: R/G pairs multiplied for Y (0.299R + 0.337G) and Cb (-0.16874R - 0.33126G), going by the constant names. */
    336      rglo = _mm_unpacklo_pi16(ro, go);
    337      rgho = _mm_unpackhi_pi16(ro, go);
    338      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
    339      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
    340      cblo = _mm_madd_pi16(rglo, PW_MF016_MF033);
    341      cbho = _mm_madd_pi16(rgho, PW_MF016_MF033);
    342      /* Odd pixels: 0.5 * B term of Cb; blo/bho presumably hold B already in the fixed-point scale, so >> 1 halves it — confirm in _mm_loadlo_pi16_f. */
    343      blo = _mm_loadlo_pi16_f(bo);
    344      bho = _mm_loadhi_pi16_f(bo);
    345      halfblo = _mm_srli_pi32(blo, 1);
    346      halfbho = _mm_srli_pi32(bho, 1);
    347      /* Odd pixels: add the 0.5*B term and PD_ONEHALFM1_CJ (presumably rounding + CENTERJSAMPLE), shift down by SCALEBITS, pack to 16-bit lanes. */
    348      cblo = _mm_add_pi32(cblo, halfblo);
    349      cbho = _mm_add_pi32(cbho, halfbho);
    350      cblo = _mm_add_pi32(cblo, PD_ONEHALFM1_CJ);
    351      cbho = _mm_add_pi32(cbho, PD_ONEHALFM1_CJ);
    352      cblo = _mm_srli_pi32(cblo, SCALEBITS);
    353      cbho = _mm_srli_pi32(cbho, SCALEBITS);
    354      cbo = _mm_packs_pi32(cblo, cbho);
    355      /* Even pixels: the same R/G products. */
    356      rgle = _mm_unpacklo_pi16(re, ge);
    357      rghe = _mm_unpackhi_pi16(re, ge);
    358      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
    359      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
    360      cble = _mm_madd_pi16(rgle, PW_MF016_MF033);
    361      cbhe = _mm_madd_pi16(rghe, PW_MF016_MF033);
    362      /* Even pixels: 0.5 * B term of Cb. */
    363      ble = _mm_loadlo_pi16_f(be);
    364      bhe = _mm_loadhi_pi16_f(be);
    365      halfble = _mm_srli_pi32(ble, 1);
    366      halfbhe = _mm_srli_pi32(bhe, 1);
    367      /* Even pixels: finish Cb the same way as for the odd pixels. */
    368      cble = _mm_add_pi32(cble, halfble);
    369      cbhe = _mm_add_pi32(cbhe, halfbhe);
    370      cble = _mm_add_pi32(cble, PD_ONEHALFM1_CJ);
    371      cbhe = _mm_add_pi32(cbhe, PD_ONEHALFM1_CJ);
    372      cble = _mm_srli_pi32(cble, SCALEBITS);
    373      cbhe = _mm_srli_pi32(cbhe, SCALEBITS);
    374      cbe = _mm_packs_pi32(cble, cbhe);
    375      /* Merge: odd-pixel Cb goes to the high byte and even-pixel Cb stays in the low byte of each 16-bit lane. */
    376      cbo = _mm_slli_pi16(cbo, BYTE_BIT);
    377      cb = _mm_or_si64(cbe, cbo);
    378      /* Odd pixels: B/G pairs multiplied for Y (0.114B + 0.250G) and Cr (-0.08131B - 0.41869G), going by the constant names. */
    379      bglo = _mm_unpacklo_pi16(bo, go);
    380      bgho = _mm_unpackhi_pi16(bo, go);
    381      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
    382      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
    383      crlo = _mm_madd_pi16(bglo, PW_MF008_MF041);
    384      crho = _mm_madd_pi16(bgho, PW_MF008_MF041);
    385      /* Odd pixels: sum both Y partial products, add PD_ONEHALF (presumably the rounding term), shift down by SCALEBITS, pack. */
    386      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
    387      yho = _mm_add_pi32(yho_bg, yho_rg);
    388      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
    389      yho = _mm_add_pi32(yho, PD_ONEHALF);
    390      ylo = _mm_srli_pi32(ylo, SCALEBITS);
    391      yho = _mm_srli_pi32(yho, SCALEBITS);
    392      yo = _mm_packs_pi32(ylo, yho);
    393      /* Odd pixels: 0.5 * R term of Cr, formed the same way as the 0.5 * B term of Cb. */
    394      rlo = _mm_loadlo_pi16_f(ro);
    395      rho = _mm_loadhi_pi16_f(ro);
    396      halfrlo = _mm_srli_pi32(rlo, 1);
    397      halfrho = _mm_srli_pi32(rho, 1);
    398      /* Odd pixels: finish Cr (add 0.5*R and the bias constant, shift down, pack). */
    399      crlo = _mm_add_pi32(crlo, halfrlo);
    400      crho = _mm_add_pi32(crho, halfrho);
    401      crlo = _mm_add_pi32(crlo, PD_ONEHALFM1_CJ);
    402      crho = _mm_add_pi32(crho, PD_ONEHALFM1_CJ);
    403      crlo = _mm_srli_pi32(crlo, SCALEBITS);
    404      crho = _mm_srli_pi32(crho, SCALEBITS);
    405      cro = _mm_packs_pi32(crlo, crho);
    406      /* Even pixels: B/G products for Y and Cr. */
    407      bgle = _mm_unpacklo_pi16(be, ge);
    408      bghe = _mm_unpackhi_pi16(be, ge);
    409      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
    410      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
    411      crle = _mm_madd_pi16(bgle, PW_MF008_MF041);
    412      crhe = _mm_madd_pi16(bghe, PW_MF008_MF041);
    413      /* Even pixels: finish Y. */
    414      yle = _mm_add_pi32(yle_bg, yle_rg);
    415      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
    416      yle = _mm_add_pi32(yle, PD_ONEHALF);
    417      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
    418      yle = _mm_srli_pi32(yle, SCALEBITS);
    419      yhe = _mm_srli_pi32(yhe, SCALEBITS);
    420      ye = _mm_packs_pi32(yle, yhe);
    421      /* Merge: odd-pixel Y into the high byte, even-pixel Y in the low byte of each 16-bit lane. */
    422      yo = _mm_slli_pi16(yo, BYTE_BIT);
    423      y = _mm_or_si64(ye, yo);
    424      /* Even pixels: 0.5 * R term of Cr. */
    425      rle = _mm_loadlo_pi16_f(re);
    426      rhe = _mm_loadhi_pi16_f(re);
    427      halfrle = _mm_srli_pi32(rle, 1);
    428      halfrhe = _mm_srli_pi32(rhe, 1);
    429      /* Even pixels: finish Cr. */
    430      crle = _mm_add_pi32(crle, halfrle);
    431      crhe = _mm_add_pi32(crhe, halfrhe);
    432      crle = _mm_add_pi32(crle, PD_ONEHALFM1_CJ);
    433      crhe = _mm_add_pi32(crhe, PD_ONEHALFM1_CJ);
    434      crle = _mm_srli_pi32(crle, SCALEBITS);
    435      crhe = _mm_srli_pi32(crhe, SCALEBITS);
    436      cre = _mm_packs_pi32(crle, crhe);
    437      /* Merge: odd-pixel Cr into the high byte, even-pixel Cr in the low byte of each 16-bit lane. */
    438      cro = _mm_slli_pi16(cro, BYTE_BIT);
    439      cr = _mm_or_si64(cre, cro);
    440      /* A full __m64 is stored per component even for a partial final group — assumes the output rows have room for that; confirm with the caller. */
    441      _mm_store_si64((__m64 *)&outptr0[0], y);
    442      _mm_store_si64((__m64 *)&outptr1[0], cb);
    443      _mm_store_si64((__m64 *)&outptr2[0], cr);
    444    }
    445  }
    446 }
    447 /* Remove the mmA..mmH aliases defined at the top of this file (presumably so it can be re-included with another pixel layout — confirm in jccolor-mmi.c). */
    448 #undef mmA
    449 #undef mmB
    450 #undef mmC
    451 #undef mmD
    452 #undef mmE
    453 #undef mmF
    454 #undef mmG
    455 #undef mmH