tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcgryext-mmi.c (10948B)


      1 /*
      2 * Loongson MMI optimizations for libjpeg-turbo
      3 *
      4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 * Copyright (C) 2014-2015, 2019, D. R. Commander.  All Rights Reserved.
      6 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
      7 *                          All Rights Reserved.
      8 * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
      9 *
     10 * Based on the x86 SIMD extension for IJG JPEG library
     11 * Copyright (C) 1999-2006, MIYASAKA Masaru.
     12 *
     13 * This software is provided 'as-is', without any express or implied
     14 * warranty.  In no event will the authors be held liable for any damages
     15 * arising from the use of this software.
     16 *
     17 * Permission is granted to anyone to use this software for any purpose,
     18 * including commercial applications, and to alter it and redistribute it
     19 * freely, subject to the following restrictions:
     20 *
     21 * 1. The origin of this software must not be misrepresented; you must not
     22 *    claim that you wrote the original software. If you use this software
     23 *    in a product, an acknowledgment in the product documentation would be
     24 *    appreciated but is not required.
     25 * 2. Altered source versions must be plainly marked as such, and must not be
     26 *    misrepresented as being the original software.
     27 * 3. This notice may not be removed or altered from any source distribution.
     28 */
     29 
     30 /* This file is included by jcgray-mmi.c */
     31 
     32 
     /*
      * mmA..mmH are aliases for the __m64 variables declared in
      * jsimd_rgb_gray_convert_mmi().  Each pair is bound to the "e"/"o"
      * variables (re/ro, ge/go or be/bo) of whichever of RGB_RED, RGB_GREEN
      * and RGB_BLUE equals the index tested by that chain:
      *   index 0 -> mmA/mmB, index 1 -> mmC/mmD,
      *   index 2 -> mmE/mmF, index 3 -> mmG/mmH.
      * When none of the three macros equals the index, the pair falls back
      * to xe/xo.
      * NOTE(review): RGB_RED/RGB_GREEN/RGB_BLUE are not defined in this file;
      * presumably they are the byte offsets of the components inside one
      * pixel, and xe/xo then cover a filler byte -- confirm against the
      * including file.
      */
     33 #if RGB_RED == 0
     34 #define mmA  re
     35 #define mmB  ro
     36 #elif RGB_GREEN == 0
     37 #define mmA  ge
     38 #define mmB  go
     39 #elif RGB_BLUE == 0
     40 #define mmA  be
     41 #define mmB  bo
     42 #else
     43 #define mmA  xe
     44 #define mmB  xo
     45 #endif
     46 
        /* Component at index 1. */
     47 #if RGB_RED == 1
     48 #define mmC  re
     49 #define mmD  ro
     50 #elif RGB_GREEN == 1
     51 #define mmC  ge
     52 #define mmD  go
     53 #elif RGB_BLUE == 1
     54 #define mmC  be
     55 #define mmD  bo
     56 #else
     57 #define mmC  xe
     58 #define mmD  xo
     59 #endif
     60 
        /* Component at index 2. */
     61 #if RGB_RED == 2
     62 #define mmE  re
     63 #define mmF  ro
     64 #elif RGB_GREEN == 2
     65 #define mmE  ge
     66 #define mmF  go
     67 #elif RGB_BLUE == 2
     68 #define mmE  be
     69 #define mmF  bo
     70 #else
     71 #define mmE  xe
     72 #define mmF  xo
     73 #endif
     74 
        /* Component at index 3. */
     75 #if RGB_RED == 3
     76 #define mmG  re
     77 #define mmH  ro
     78 #elif RGB_GREEN == 3
     79 #define mmG  ge
     80 #define mmH  go
     81 #elif RGB_BLUE == 3
     82 #define mmG  be
     83 #define mmH  bo
     84 #else
     85 #define mmG  xe
     86 #define mmH  xo
     87 #endif
     88 
     89 
     /*
      * Convert rows of packed colour pixels to a single luminance (Y) plane
      * using Loongson MMI intrinsics and inline MIPS assembly.
      *
      * image_width  number of pixels per row; the column loop counts it down
      *              in steps of 8.
      * input_buf    array of input row pointers, consumed one per row.
      * output_buf   only plane 0 is written (output_buf[0][row]).
      * output_row   index of the first output row; incremented once per
      *              converted row.
      * num_rows     number of rows to convert.
      *
      * The pixel layout is selected at compile time through RGB_PIXELSIZE
      * (3- or 4-byte path) and the mmA..mmH aliases; none of those macros is
      * defined in this file.
      */
     90 void jsimd_rgb_gray_convert_mmi(JDIMENSION image_width, JSAMPARRAY input_buf,
     91                                JSAMPIMAGE output_buf, JDIMENSION output_row,
     92                                int num_rows)
     93 {
     94  JSAMPROW inptr, outptr;
     95  int num_cols, col;
         /* "e" variables hold even-numbered pixels, "o" variables odd-numbered
          * ones (see the layout comment before the luminance computation). */
     96  __m64 re, ro, ge, go, be, bo, xe;
     97 #if RGB_PIXELSIZE == 4
     98  __m64 xo;
     99 #endif
    100  __m64 rgle, rghe, rglo, rgho, bgle, bghe, bglo, bgho;
    101  __m64 yle_rg, yhe_rg, yle_bg, yhe_bg, yle, yhe, ye;
    102  __m64 ylo_rg, yho_rg, ylo_bg, yho_bg, ylo, yho, yo, y;
    103 
         /* One iteration per row. */
    104  while (--num_rows >= 0) {
    105    inptr = *input_buf++;
    106    outptr = output_buf[0][output_row];
    107    output_row++;
    108 
           /* One iteration per group of up to 8 pixels; 8 output bytes are
            * produced per iteration. */
    109    for (num_cols = image_width; num_cols > 0; num_cols -= 8,
    110         outptr += 8) {
    111 
    112 #if RGB_PIXELSIZE == 3
    113 
             /* Partial group (fewer than 8 pixels left): col is the number of
              * input bytes remaining in the row.  The asm tests bits 1, 2, 4,
              * 8 and 16 of col and loads a byte, halfword, word or
              * doubleword(s) accordingly, leaving the data in mmA/mmG/mmF.
              * inptr is not advanced on this path; the loop ends after this
              * iteration because num_cols - 8 is then <= 0.
              * NOTE(review): presumably this piecewise load exists so that no
              * bytes beyond the end of the row are read -- confirm.
              * NOTE(review): operand %4 (num_rows) is listed as an input but
              * is never referenced by the template. */
    114      if (num_cols < 8) {
    115        col = num_cols * 3;
    116        asm(".set noreorder\r\n"
    117 
                   /* Bit 0 of col: load the last remaining byte into $12. */
    118            "li       $8, 1\r\n"
    119            "move     $9, %3\r\n"
    120            "and      $10, $9, $8\r\n"
    121            "beqz     $10, 1f\r\n"
    122            "nop      \r\n"
    123            "subu     $9, $9, 1\r\n"
    124            "xor      $12, $12, $12\r\n"
    125            "move     $13, %5\r\n"
    126            PTR_ADDU  "$13, $13, $9\r\n"
    127            "lbu      $12, 0($13)\r\n"
    128 
                   /* Bit 1: load a halfword at the reduced offset and merge it
                    * below the contents of $12 (which are shifted up 16). */
    129            "1:       \r\n"
    130            "li       $8, 2\r\n"
    131            "and      $10, $9, $8\r\n"
    132            "beqz     $10, 2f\r\n"
    133            "nop      \r\n"
    134            "subu     $9, $9, 2\r\n"
    135            "xor      $11, $11, $11\r\n"
    136            "move     $13, %5\r\n"
    137            PTR_ADDU  "$13, $13, $9\r\n"
    138            "lhu      $11, 0($13)\r\n"
    139            "sll      $12, $12, 16\r\n"
    140            "or       $12, $12, $11\r\n"
    141 
                   /* Copy $12 into %0.  Bit 2: load a word at the reduced
                    * offset, put it in %1, and merge it below $12 (shifted up
                    * 32) before copying $12 into %0 again. */
    142            "2:       \r\n"
    143            "dmtc1    $12, %0\r\n"
    144            "li       $8, 4\r\n"
    145            "and      $10, $9, $8\r\n"
    146            "beqz     $10, 3f\r\n"
    147            "nop      \r\n"
    148            "subu     $9, $9, 4\r\n"
    149            "move     $13, %5\r\n"
    150            PTR_ADDU  "$13, $13, $9\r\n"
    151            "lwu      $14, 0($13)\r\n"
    152            "dmtc1    $14, %1\r\n"
    153            "dsll32   $12, $12, 0\r\n"
    154            "or       $12, $12, $14\r\n"
    155            "dmtc1    $12, %0\r\n"
    156 
                   /* Bit 3: move the gathered data from %0 to %1, load the
                    * first 8 bytes at inptr into %0, then skip the bit-4
                    * case. */
    157            "3:       \r\n"
    158            "li       $8, 8\r\n"
    159            "and      $10, $9, $8\r\n"
    160            "beqz     $10, 4f\r\n"
    161            "nop      \r\n"
    162            "mov.s    %1, %0\r\n"
    163            "ldc1     %0, 0(%5)\r\n"
    164            "li       $9, 8\r\n"
    165            "j        5f\r\n"
    166            "nop      \r\n"
    167 
                   /* Bit 4: move the gathered data from %0 to %2 and load the
                    * first 16 bytes at inptr into %0 and %1. */
    168            "4:       \r\n"
    169            "li       $8, 16\r\n"
    170            "and      $10, $9, $8\r\n"
    171            "beqz     $10, 5f\r\n"
    172            "nop      \r\n"
    173            "mov.s    %2, %0\r\n"
    174            "ldc1     %0, 0(%5)\r\n"
    175            "ldc1     %1, 8(%5)\r\n"
    176 
    177            "5:       \r\n"
    178            "nop      \r\n"
    179            ".set reorder\r\n"
    180 
    181            : "=f" (mmA), "=f" (mmG), "=f" (mmF)
    182            : "r" (col), "r" (num_rows), "r" (inptr)
    183            : "$f0", "$f2", "$f4", "$8", "$9", "$10", "$11", "$12", "$13",
    184              "$14", "memory"
    185           );
    186      } else {
               /* Full group: load 24 bytes (8 pixels x 3 bytes).  The
                * _mm_load_si64 variant is used when inptr is a multiple of 8,
                * _mm_loadu_si64 otherwise. */
    187        if (!(((long)inptr) & 7)) {
    188          mmA = _mm_load_si64((__m64 *)&inptr[0]);
    189          mmG = _mm_load_si64((__m64 *)&inptr[8]);
    190          mmF = _mm_load_si64((__m64 *)&inptr[16]);
    191        } else {
    192          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
    193          mmG = _mm_loadu_si64((__m64 *)&inptr[8]);
    194          mmF = _mm_loadu_si64((__m64 *)&inptr[16]);
    195        }
    196        inptr += RGB_PIXELSIZE * 8;
    197      }
             /* Shuffle the three loaded words into per-component vectors using
              * 32-bit shifts and byte unpacks.  Statement order matters: the
              * aliases are overwritten in place as the shuffle proceeds. */
    198      mmD = _mm_srli_si64(mmA, 4 * BYTE_BIT);
    199      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
    200 
    201      mmA = _mm_unpackhi_pi8(mmA, mmG);
    202      mmG = _mm_slli_si64(mmG, 4 * BYTE_BIT);
    203 
    204      mmD = _mm_unpacklo_pi8(mmD, mmF);
    205      mmG = _mm_unpackhi_pi8(mmG, mmF);
    206 
    207      mmE = _mm_srli_si64(mmA, 4 * BYTE_BIT);
    208      mmA = _mm_slli_si64(mmA, 4 * BYTE_BIT);
    209 
    210      mmA = _mm_unpackhi_pi8(mmA, mmD);
    211      mmD = _mm_slli_si64(mmD, 4 * BYTE_BIT);
    212 
    213      mmE = _mm_unpacklo_pi8(mmE, mmG);
    214      mmD = _mm_unpackhi_pi8(mmD, mmG);
             /* NOTE(review): _mm_loadlo_pi8_f/_mm_loadhi_pi8_f are defined
              * elsewhere; presumably they widen the low/high four bytes to
              * 16-bit lanes -- confirm. */
    215      mmC = _mm_loadhi_pi8_f(mmA);
    216      mmA = _mm_loadlo_pi8_f(mmA);
    217 
    218      mmB = _mm_loadhi_pi8_f(mmE);
    219      mmE = _mm_loadlo_pi8_f(mmE);
    220 
    221      mmF = _mm_loadhi_pi8_f(mmD);
    222      mmD = _mm_loadlo_pi8_f(mmD);
    223 
    224 #else  /* RGB_PIXELSIZE == 4 */
    225 
             /* Partial group (fewer than 8 pixels left): col is the number of
              * pixels remaining.  The asm tests bits 1, 2 and 4 of col and
              * loads one, two or four 4-byte pixels accordingly, leaving the
              * data in mmA/mmF/mmD/mmC.  inptr is not advanced on this
              * path. */
    226      if (num_cols < 8) {
    227        col = num_cols;
    228        asm(".set noreorder\r\n"
    229 
                   /* Bit 0: load the last remaining pixel (4 bytes) into %0. */
    230            "li       $8, 1\r\n"
    231            "move     $9, %4\r\n"
    232            "and      $10, $9, $8\r\n"
    233            "beqz     $10, 1f\r\n"
    234            "nop      \r\n"
    235            "subu     $9, $9, 1\r\n"
    236            PTR_SLL   "$11, $9, 2\r\n"
    237            "move     $13, %5\r\n"
    238            PTR_ADDU  "$13, $13, $11\r\n"
    239            "lwc1     %0, 0($13)\r\n"
    240 
                   /* Bit 1: move %0 to %1, then load 8 bytes (two pixels) at
                    * the reduced pixel index into %0. */
    241            "1:       \r\n"
    242            "li       $8, 2\r\n"
    243            "and      $10, $9, $8\r\n"
    244            "beqz     $10, 2f\r\n"
    245            "nop      \r\n"
    246            "subu     $9, $9, 2\r\n"
    247            PTR_SLL   "$11, $9, 2\r\n"
    248            "move     $13, %5\r\n"
    249            PTR_ADDU  "$13, $13, $11\r\n"
    250            "mov.s    %1, %0\r\n"
    251            "ldc1     %0, 0($13)\r\n"
    252 
                   /* Bit 2: move %0/%1 to %2/%3, then load the first 16 bytes
                    * (four pixels) at inptr into %0 and %1. */
    253            "2:       \r\n"
    254            "li       $8, 4\r\n"
    255            "and      $10, $9, $8\r\n"
    256            "beqz     $10, 3f\r\n"
    257            "nop      \r\n"
    258            "mov.s    %2, %0\r\n"
    259            "mov.s    %3, %1\r\n"
    260            "ldc1     %0, 0(%5)\r\n"
    261            "ldc1     %1, 8(%5)\r\n"
    262 
    263            "3:       \r\n"
    264            "nop      \r\n"
    265            ".set reorder\r\n"
    266 
    267            : "=f" (mmA), "=f" (mmF), "=f" (mmD), "=f" (mmC)
    268            : "r" (col), "r" (inptr)
    269            : "$f0", "$f2", "$8", "$9", "$10", "$11", "$13", "memory"
    270           );
    271      } else {
               /* Full group: load 32 bytes (8 pixels x 4 bytes).  The
                * _mm_load_si64 variant is used when inptr is a multiple of 8,
                * _mm_loadu_si64 otherwise. */
    272        if (!(((long)inptr) & 7)) {
    273          mmA = _mm_load_si64((__m64 *)&inptr[0]);
    274          mmF = _mm_load_si64((__m64 *)&inptr[8]);
    275          mmD = _mm_load_si64((__m64 *)&inptr[16]);
    276          mmC = _mm_load_si64((__m64 *)&inptr[24]);
    277        } else {
    278          mmA = _mm_loadu_si64((__m64 *)&inptr[0]);
    279          mmF = _mm_loadu_si64((__m64 *)&inptr[8]);
    280          mmD = _mm_loadu_si64((__m64 *)&inptr[16]);
    281          mmC = _mm_loadu_si64((__m64 *)&inptr[24]);
    282        }
    283        inptr += RGB_PIXELSIZE * 8;
    284      }
             /* Shuffle the four loaded words into per-component vectors with
              * byte and 16-bit unpacks.  Statement order matters: the aliases
              * are overwritten in place as the shuffle proceeds. */
    285      mmB = _mm_unpackhi_pi8(mmA, mmF);
    286      mmA = _mm_unpacklo_pi8(mmA, mmF);
    287 
    288      mmG = _mm_unpackhi_pi8(mmD, mmC);
    289      mmD = _mm_unpacklo_pi8(mmD, mmC);
    290 
    291      mmE = _mm_unpackhi_pi16(mmA, mmD);
    292      mmA = _mm_unpacklo_pi16(mmA, mmD);
    293 
    294      mmH = _mm_unpackhi_pi16(mmB, mmG);
    295      mmB = _mm_unpacklo_pi16(mmB, mmG);
    296 
             /* NOTE(review): _mm_loadlo_pi8_f/_mm_loadhi_pi8_f are defined
              * elsewhere; presumably they widen the low/high four bytes to
              * 16-bit lanes -- confirm. */
    297      mmC = _mm_loadhi_pi8_f(mmA);
    298      mmA = _mm_loadlo_pi8_f(mmA);
    299 
    300      mmD = _mm_loadhi_pi8_f(mmB);
    301      mmB = _mm_loadlo_pi8_f(mmB);
    302 
    303      mmG = _mm_loadhi_pi8_f(mmE);
    304      mmE = _mm_loadlo_pi8_f(mmE);
    305 
             /* mmF/mmH are produced differently: each byte of mmH is
              * duplicated by unpacking mmH with itself, then every 16-bit lane
              * is shifted right by BYTE_BIT. */
    306      mmF = _mm_unpacklo_pi8(mmH, mmH);
    307      mmH = _mm_unpackhi_pi8(mmH, mmH);
    308      mmF = _mm_srli_pi16(mmF, BYTE_BIT);
    309      mmH = _mm_srli_pi16(mmH, BYTE_BIT);
    310 
    311 #endif
    312 
    313      /* re=(R0 R2 R4 R6), ge=(G0 G2 G4 G6), be=(B0 B2 B4 B6)
    314       * ro=(R1 R3 R5 R7), go=(G1 G3 G5 G7), bo=(B1 B3 B5 B7)
    315       *
    316       * (Original)
    317       * Y  =  0.29900 * R + 0.58700 * G + 0.11400 * B
    318       *
    319       * (This implementation)
    320       * Y  =  0.29900 * R + 0.33700 * G + 0.11400 * B + 0.25000 * G
    321       */
    322 
             /* Odd pixels, R/G part: interleave R and G lanes and
              * multiply-accumulate with PW_F0299_F0337. */
    323      rglo = _mm_unpacklo_pi16(ro, go);
    324      rgho = _mm_unpackhi_pi16(ro, go);
    325      ylo_rg = _mm_madd_pi16(rglo, PW_F0299_F0337);
    326      yho_rg = _mm_madd_pi16(rgho, PW_F0299_F0337);
    327 
             /* Even pixels, R/G part. */
    328      rgle = _mm_unpacklo_pi16(re, ge);
    329      rghe = _mm_unpackhi_pi16(re, ge);
    330      yle_rg = _mm_madd_pi16(rgle, PW_F0299_F0337);
    331      yhe_rg = _mm_madd_pi16(rghe, PW_F0299_F0337);
    332 
             /* Odd pixels, B/G part: interleave B and G lanes and
              * multiply-accumulate with PW_F0114_F0250. */
    333      bglo = _mm_unpacklo_pi16(bo, go);
    334      bgho = _mm_unpackhi_pi16(bo, go);
    335      ylo_bg = _mm_madd_pi16(bglo, PW_F0114_F0250);
    336      yho_bg = _mm_madd_pi16(bgho, PW_F0114_F0250);
    337 
             /* Odd pixels: sum both parts, add PD_ONEHALF (presumably the
              * rounding term), shift right by SCALEBITS, and pack the 32-bit
              * results to 16-bit lanes. */
    338      ylo = _mm_add_pi32(ylo_bg, ylo_rg);
    339      yho = _mm_add_pi32(yho_bg, yho_rg);
    340      ylo = _mm_add_pi32(ylo, PD_ONEHALF);
    341      yho = _mm_add_pi32(yho, PD_ONEHALF);
    342      ylo = _mm_srli_pi32(ylo, SCALEBITS);
    343      yho = _mm_srli_pi32(yho, SCALEBITS);
    344      yo = _mm_packs_pi32(ylo, yho);
    345 
             /* Even pixels, B/G part. */
    346      bgle = _mm_unpacklo_pi16(be, ge);
    347      bghe = _mm_unpackhi_pi16(be, ge);
    348      yle_bg = _mm_madd_pi16(bgle, PW_F0114_F0250);
    349      yhe_bg = _mm_madd_pi16(bghe, PW_F0114_F0250);
    350 
             /* Even pixels: same sum / add / shift / pack sequence as for the
              * odd pixels. */
    351      yle = _mm_add_pi32(yle_bg, yle_rg);
    352      yhe = _mm_add_pi32(yhe_bg, yhe_rg);
    353      yle = _mm_add_pi32(yle, PD_ONEHALF);
    354      yhe = _mm_add_pi32(yhe, PD_ONEHALF);
    355      yle = _mm_srli_pi32(yle, SCALEBITS);
    356      yhe = _mm_srli_pi32(yhe, SCALEBITS);
    357      ye = _mm_packs_pi32(yle, yhe);
    358 
             /* Shift the odd results left by BYTE_BIT within each 16-bit lane
              * and OR them with the even results, so every lane holds one
              * even and one odd Y value. */
    359      yo = _mm_slli_pi16(yo, BYTE_BIT);
    360      y = _mm_or_si64(ye, yo);
    361 
             /* NOTE(review): the 8-byte store is unconditional, including
              * after the partial-group path; this assumes the output row has
              * room for a full group -- confirm. */
    362      _mm_store_si64((__m64 *)&outptr[0], y);
    363    }
    364  }
    365 }
    366 
    /* Remove the mmA..mmH aliases defined at the top of this file.
     * NOTE(review): presumably this lets the including file include this
     * fragment again with different RGB_* settings -- confirm. */
    367 #undef mmA
    368 #undef mmB
    369 #undef mmC
    370 #undef mmD
    371 #undef mmE
    372 #undef mmF
    373 #undef mmG
    374 #undef mmH