tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jdcolext-mmi.c (15201B)


      1 /*
      2 * Loongson MMI optimizations for libjpeg-turbo
      3 *
      4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
      6 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
      7 *                          All Rights Reserved.
      8 * Authors:  ZhuChen     <zhuchen@loongson.cn>
      9 *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
     10 *           CaiWanwei   <caiwanwei@loongson.cn>
     11 *
     12 * Based on the x86 SIMD extension for IJG JPEG library
     13 * Copyright (C) 1999-2006, MIYASAKA Masaru.
     14 *
     15 * This software is provided 'as-is', without any express or implied
     16 * warranty.  In no event will the authors be held liable for any damages
     17 * arising from the use of this software.
     18 *
     19 * Permission is granted to anyone to use this software for any purpose,
     20 * including commercial applications, and to alter it and redistribute it
     21 * freely, subject to the following restrictions:
     22 *
     23 * 1. The origin of this software must not be misrepresented; you must not
     24 *    claim that you wrote the original software. If you use this software
     25 *    in a product, an acknowledgment in the product documentation would be
     26 *    appreciated but is not required.
     27 * 2. Altered source versions must be plainly marked as such, and must not be
     28 *    misrepresented as being the original software.
     29 * 3. This notice may not be removed or altered from any source distribution.
     30 */
     31 
     32 /* This file is included by jdcolor-mmi.c */
     33 
     34 
     35 #if RGB_RED == 0  /* Component index 0 (first byte of each output pixel):
                           * mmA aliases the even-column register and mmB the
                           * odd-column register of whichever component (red,
                           * green, blue, or the filler xe/xo) has that index. */
     36 #define mmA  re
     37 #define mmB  ro
     38 #elif RGB_GREEN == 0
     39 #define mmA  ge
     40 #define mmB  go
     41 #elif RGB_BLUE == 0
     42 #define mmA  be
     43 #define mmB  bo
     44 #else
     45 #define mmA  xe
     46 #define mmB  xo
     47 #endif
     48 
     49 #if RGB_RED == 1  /* Component index 1: mmC = even columns, mmD = odd columns. */
     50 #define mmC  re
     51 #define mmD  ro
     52 #elif RGB_GREEN == 1
     53 #define mmC  ge
     54 #define mmD  go
     55 #elif RGB_BLUE == 1
     56 #define mmC  be
     57 #define mmD  bo
     58 #else
     59 #define mmC  xe
     60 #define mmD  xo
     61 #endif
     62 
     63 #if RGB_RED == 2  /* Component index 2: mmE = even columns, mmF = odd columns. */
     64 #define mmE  re
     65 #define mmF  ro
     66 #elif RGB_GREEN == 2
     67 #define mmE  ge
     68 #define mmF  go
     69 #elif RGB_BLUE == 2
     70 #define mmE  be
     71 #define mmF  bo
     72 #else
     73 #define mmE  xe
     74 #define mmF  xo
     75 #endif
     76 
     77 #if RGB_RED == 3  /* Component index 3: mmG = even columns, mmH = odd columns.
                           * In the RGB_PIXELSIZE == 3 path of the function below
                           * these two names are reassigned and used as scratch. */
     78 #define mmG  re
     79 #define mmH  ro
     80 #elif RGB_GREEN == 3
     81 #define mmG  ge
     82 #define mmH  go
     83 #elif RGB_BLUE == 3
     84 #define mmG  be
     85 #define mmH  bo
     86 #else
     87 #define mmG  xe
     88 #define mmH  xo
     89 #endif
     90 
     91 
     92 void jsimd_ycc_rgb_convert_mmi(JDIMENSION out_width, JSAMPIMAGE input_buf,
     93                               JDIMENSION input_row, JSAMPARRAY output_buf,
     94                               int num_rows)
     95 {
         /*
          * Convert num_rows rows of planar Y/Cb/Cr samples into interleaved
          * output pixels, eight columns per inner-loop iteration.  The byte
          * order inside a pixel follows the mmA..mmH aliases, and the pixel
          * size (3 or 4 bytes) follows RGB_PIXELSIZE.
          *
          * out_width  - number of pixels to produce in each output row
          * input_buf  - three sample planes; plane 0 is loaded as Y, plane 1
          *              as Cb and plane 2 as Cr
          * input_row  - index of the first row to read in each plane; it is
          *              incremented once per converted row
          * output_buf - array of output row pointers, one consumed per row
          * num_rows   - number of rows to convert
          *
          * Every iteration loads a full 8 bytes from each plane, even when
          * fewer than 8 columns remain; only the stores are trimmed to the
          * remaining width.
          * NOTE(review): this presumably relies on the caller padding input
          * rows to a multiple of 8 samples - confirm against the caller.
          */
     96  JSAMPROW outptr, inptr0, inptr1, inptr2;
     97  int num_cols, col;
     98  __m64 ye, yo, y, cbe, cbe2, cbo, cbo2, cb, cre, cre2, cro, cro2, cr;
     99  __m64 re, ro, gle, ghe, ge, glo, gho, go, be, bo, xe = 0.0, xo = 0.0;
    100  __m64 decenter, mask;
    101 
    102  while (--num_rows >= 0) {
    103    inptr0 = input_buf[0][input_row];
    104    inptr1 = input_buf[1][input_row];
    105    inptr2 = input_buf[2][input_row];
    106    input_row++;
    107    outptr = *output_buf++;
    108 
           /* num_cols counts the columns still to be written in this row. */
    109    for (num_cols = out_width; num_cols > 0; num_cols -= 8,
    110         inptr0 += 8, inptr1 += 8, inptr2 += 8) {
    111 
    112      cb = _mm_load_si64((__m64 *)inptr1);
    113      cr = _mm_load_si64((__m64 *)inptr2);
    114      y = _mm_load_si64((__m64 *)inptr0);
    115 
             /* Both constants are built in registers: comparing a value with
              * itself yields all ones, which is then shifted into shape. */
    116      mask = decenter = 0.0;
    117      mask = _mm_cmpeq_pi16(mask, mask);
    118      decenter = _mm_cmpeq_pi16(decenter, decenter);
    119      mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
    120      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
    121 
             /* Split the 8 chroma bytes into even and odd columns, widened to
              * one 16-bit word per sample. */
    122      cbe = _mm_and_si64(mask, cb);           /* Cb(0246) */
    123      cbo = _mm_srli_pi16(cb, BYTE_BIT);      /* Cb(1357) */
    124      cre = _mm_and_si64(mask, cr);           /* Cr(0246) */
    125      cro = _mm_srli_pi16(cr, BYTE_BIT);      /* Cr(1357) */
             /* 0xFF80 is -128 as a signed word, so these adds recenter the
              * chroma samples around zero. */
    126      cbe = _mm_add_pi16(cbe, decenter);
    127      cbo = _mm_add_pi16(cbo, decenter);
    128      cre = _mm_add_pi16(cre, decenter);
    129      cro = _mm_add_pi16(cro, decenter);
    130 
    131      /* (Original)
    132       * R = Y                + 1.40200 * Cr
    133       * G = Y - 0.34414 * Cb - 0.71414 * Cr
    134       * B = Y + 1.77200 * Cb
    135       *
    136       * (This implementation)
    137       * R = Y                + 0.40200 * Cr + Cr
    138       * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    139       * B = Y - 0.22800 * Cb + Cb + Cb
    140       */
    141 
             /* The chroma values are doubled before the high-half multiply so
              * the product carries one extra bit; the add-PW_ONE and shift
              * right by 1 further down consume that bit (a rounding step,
              * assuming PW_ONE holds 1 in every word - it is defined outside
              * this file). */
    142      cbe2 = _mm_add_pi16(cbe, cbe);          /* 2*CbE */
    143      cbo2 = _mm_add_pi16(cbo, cbo);          /* 2*CbO */
    144      cre2 = _mm_add_pi16(cre, cre);          /* 2*CrE */
    145      cro2 = _mm_add_pi16(cro, cro);          /* 2*CrO */
    146 
    147      be = _mm_mulhi_pi16(cbe2, PW_MF0228);   /* (2*CbE * -FIX(0.22800) */
    148      bo = _mm_mulhi_pi16(cbo2, PW_MF0228);   /* (2*CbO * -FIX(0.22800) */
    149      re = _mm_mulhi_pi16(cre2, PW_F0402);    /* (2*CrE * FIX(0.40200)) */
    150      ro = _mm_mulhi_pi16(cro2, PW_F0402);    /* (2*CrO * FIX(0.40200)) */
    151 
    152      be = _mm_add_pi16(be, PW_ONE);
    153      bo = _mm_add_pi16(bo, PW_ONE);
    154      be = _mm_srai_pi16(be, 1);              /* (CbE * -FIX(0.22800)) */
    155      bo = _mm_srai_pi16(bo, 1);              /* (CbO * -FIX(0.22800)) */
    156      re = _mm_add_pi16(re, PW_ONE);
    157      ro = _mm_add_pi16(ro, PW_ONE);
    158      re = _mm_srai_pi16(re, 1);              /* (CrE * FIX(0.40200)) */
    159      ro = _mm_srai_pi16(ro, 1);              /* (CrO * FIX(0.40200)) */
    160 
             /* Add the whole-number parts of the coefficients: Cb twice for
              * blue, Cr once for red (see the formulas above). */
    161      be = _mm_add_pi16(be, cbe);
    162      bo = _mm_add_pi16(bo, cbo);
    163      be = _mm_add_pi16(be, cbe);             /* (CbE * FIX(1.77200))=(B-Y)E */
    164      bo = _mm_add_pi16(bo, cbo);             /* (CbO * FIX(1.77200))=(B-Y)O */
    165      re = _mm_add_pi16(re, cre);             /* (CrE * FIX(1.40200))=(R-Y)E */
    166      ro = _mm_add_pi16(ro, cro);             /* (CrO * FIX(1.40200))=(R-Y)O */
    167 
             /* Green needs both chroma terms: Cb and Cr words are interleaved
              * so that each multiply-add yields one 32-bit sum per column. */
    168      gle = _mm_unpacklo_pi16(cbe, cre);
    169      ghe = _mm_unpackhi_pi16(cbe, cre);
    170      gle = _mm_madd_pi16(gle, PW_MF0344_F0285);
    171      ghe = _mm_madd_pi16(ghe, PW_MF0344_F0285);
    172      glo = _mm_unpacklo_pi16(cbo, cro);
    173      gho = _mm_unpackhi_pi16(cbo, cro);
    174      glo = _mm_madd_pi16(glo, PW_MF0344_F0285);
    175      gho = _mm_madd_pi16(gho, PW_MF0344_F0285);
    176 
             /* Add PD_ONEHALF and shift out the SCALEBITS fraction bits before
              * narrowing the sums back to 16-bit words. */
    177      gle = _mm_add_pi32(gle, PD_ONEHALF);
    178      ghe = _mm_add_pi32(ghe, PD_ONEHALF);
    179      gle = _mm_srai_pi32(gle, SCALEBITS);
    180      ghe = _mm_srai_pi32(ghe, SCALEBITS);
    181      glo = _mm_add_pi32(glo, PD_ONEHALF);
    182      gho = _mm_add_pi32(gho, PD_ONEHALF);
    183      glo = _mm_srai_pi32(glo, SCALEBITS);
    184      gho = _mm_srai_pi32(gho, SCALEBITS);
    185 
    186      ge = _mm_packs_pi32(gle, ghe);       /* CbE*-FIX(0.344)+CrE*FIX(0.285) */
    187      go = _mm_packs_pi32(glo, gho);       /* CbO*-FIX(0.344)+CrO*FIX(0.285) */
    188      ge = _mm_sub_pi16(ge, cre);  /* CbE*-FIX(0.344)+CrE*-FIX(0.714)=(G-Y)E */
    189      go = _mm_sub_pi16(go, cro);  /* CbO*-FIX(0.344)+CrO*-FIX(0.714)=(G-Y)O */
    190 
    191      ye = _mm_and_si64(mask, y);             /* Y(0246) */
    192      yo = _mm_srli_pi16(y, BYTE_BIT);        /* Y(1357) */
    193 
             /* Add luma to each colour difference, then narrow the words to
              * bytes.  Only the low four bytes of each packed result are
              * meaningful (the ** entries in the comments below). */
    194      re = _mm_add_pi16(re, ye);              /* ((R-Y)E+YE)=(R0 R2 R4 R6) */
    195      ro = _mm_add_pi16(ro, yo);              /* ((R-Y)O+YO)=(R1 R3 R5 R7) */
    196      re = _mm_packs_pu16(re, re);            /* (R0 R2 R4 R6 ** ** ** **) */
    197      ro = _mm_packs_pu16(ro, ro);            /* (R1 R3 R5 R7 ** ** ** **) */
    198 
    199      ge = _mm_add_pi16(ge, ye);              /* ((G-Y)E+YE)=(G0 G2 G4 G6) */
    200      go = _mm_add_pi16(go, yo);              /* ((G-Y)O+YO)=(G1 G3 G5 G7) */
    201      ge = _mm_packs_pu16(ge, ge);            /* (G0 G2 G4 G6 ** ** ** **) */
    202      go = _mm_packs_pu16(go, go);            /* (G1 G3 G5 G7 ** ** ** **) */
    203 
    204      be = _mm_add_pi16(be, ye);              /* (YE+(B-Y)E)=(B0 B2 B4 B6) */
    205      bo = _mm_add_pi16(bo, yo);              /* (YO+(B-Y)O)=(B1 B3 B5 B7) */
    206      be = _mm_packs_pu16(be, be);            /* (B0 B2 B4 B6 ** ** ** **) */
    207      bo = _mm_packs_pu16(bo, bo);            /* (B1 B3 B5 B7 ** ** ** **) */
    208 
    209 #if RGB_PIXELSIZE == 3
    210 
             /* In the register comments below, each two-digit entry is
              * <component index><column index>, e.g. 21 is component 2 of
              * column 1.  mmG and mmH serve as temporaries in this path. */
    211      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
    212      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
    213      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
    214      mmE = _mm_unpacklo_pi8(mmE, mmB);       /* (20 01 22 03 24 05 26 07) */
    215      mmD = _mm_unpacklo_pi8(mmD, mmF);       /* (11 21 13 23 15 25 17 27) */
    216 
    217      mmH = _mm_srli_si64(mmA, 2 * BYTE_BIT);
    218 
    219      mmG = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 05 06 16 26 07) */
    220      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 01 02 12 22 03) */
    221 
    222      mmE = _mm_srli_si64(mmE, 2 * BYTE_BIT);
    223      mmB = _mm_srli_si64(mmD, 2 * BYTE_BIT);  /* (13 23 15 25 17 27 -- --) */
    224 
    225      mmC = _mm_unpackhi_pi16(mmD, mmH);      /* (15 25 06 16 17 27 -- --) */
    226      mmD = _mm_unpacklo_pi16(mmD, mmH);      /* (11 21 02 12 13 23 04 14) */
    227 
    228      mmF = _mm_unpackhi_pi16(mmE, mmB);      /* (26 07 17 27 -- -- -- --) */
    229      mmE = _mm_unpacklo_pi16(mmE, mmB);      /* (22 03 13 23 24 05 15 25) */
    230 
             /* After these three merges mmA, mmE and mmC hold output bytes
              * 0-7, 8-15 and 16-23 of the eight 3-byte pixels. */
    231      mmA = _mm_unpacklo_pi32(mmA, mmD);      /* (00 10 20 01 11 21 02 12) */
    232      mmE = _mm_unpacklo_pi32(mmE, mmG);      /* (22 03 13 23 04 14 24 05) */
    233      mmC = _mm_unpacklo_pi32(mmC, mmF);      /* (15 25 06 16 26 07 17 27) */
    234 
    235      if (num_cols >= 8) {
               /* Full group of 8 pixels: aligned stores when outptr is a
                * multiple of 8, the unaligned variant otherwise. */
    236        if (!(((long)outptr) & 7)) {
    237          _mm_store_si64((__m64 *)outptr, mmA);
    238          _mm_store_si64((__m64 *)(outptr + 8), mmE);
    239          _mm_store_si64((__m64 *)(outptr + 16), mmC);
    240        } else {
    241          _mm_storeu_si64((__m64 *)outptr, mmA);
    242          _mm_storeu_si64((__m64 *)(outptr + 8), mmE);
    243          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
    244        }
    245        outptr += RGB_PIXELSIZE * 8;
    246      } else {
               /* Last, partial group of the row (1..7 columns): col is the
                * number of bytes still to write, at most 21.  The asm writes
                * them in 16/8/4/2/1-byte steps so nothing past the row end
                * is touched.  outptr is not advanced; the loop ends after
                * this iteration. */
    247        col = num_cols * 3;
    248        asm(".set noreorder\r\n"
    249 
                   /* Setup: $9 = bytes left, $10 = write pointer,
                    * $f4 = mmA (bytes 0-7), $f6 = mmE (bytes 8-15). */
    250            "li       $8, 16\r\n"
    251            "move     $9, %4\r\n"
    252            "mov.s    $f4, %1\r\n"
    253            "mov.s    $f6, %3\r\n"
    254            "move     $10, %5\r\n"
    255            "bltu     $9, $8, 1f\r\n"
    256            "nop      \r\n"
                   /* At least 16 bytes left: write mmA and mmE with left/right
                    * store pairs covering 0..7 and 8..15, load mmC into $f4,
                    * then jump to the 4-byte step (at most 5 bytes remain, so
                    * the 8-byte step cannot apply). */
    257            "gssdlc1  $f4, 7($10)\r\n"
    258            "gssdrc1  $f4, 0($10)\r\n"
    259            "gssdlc1  $f6, 7+8($10)\r\n"
    260            "gssdrc1  $f6, 8($10)\r\n"
    261            "mov.s    $f4, %2\r\n"
    262            "subu     $9, $9, 16\r\n"
    263            PTR_ADDU  "$10, $10, 16\r\n"
    264            "b        2f\r\n"
    265            "nop      \r\n"
    266 
                   /* Fewer than 16 bytes: if at least 8 remain, write mmA
                    * and continue with mmE in $f4. */
    267            "1:       \r\n"
    268            "li       $8, 8\r\n"              /* st8 */
    269            "bltu     $9, $8, 2f\r\n"
    270            "nop      \r\n"
    271            "gssdlc1  $f4, 7($10)\r\n"
    272            "gssdrc1  $f4, 0($10)\r\n"
    273            "mov.s    $f4, %3\r\n"
    274            "subu     $9, $9, 8\r\n"
    275            PTR_ADDU  "$10, $10, 8\r\n"
    276 
                   /* $11 receives the low word of $f4.  If at least 4 bytes
                    * remain, write it with swl/swr, then shift $f4 right by
                    * 32 bits and reload $11 for the smaller steps. */
    277            "2:       \r\n"
    278            "li       $8, 4\r\n"              /* st4 */
    279            "mfc1     $11, $f4\r\n"
    280            "bltu     $9, $8, 3f\r\n"
    281            "nop      \r\n"
    282            "swl      $11, 3($10)\r\n"
    283            "swr      $11, 0($10)\r\n"
    284            "li       $8, 32\r\n"
    285            "mtc1     $8, $f6\r\n"
    286            "dsrl     $f4, $f4, $f6\r\n"
    287            "mfc1     $11, $f4\r\n"
    288            "subu     $9, $9, 4\r\n"
    289            PTR_ADDU  "$10, $10, 4\r\n"
    290 
                   /* If at least 2 bytes remain, write the low halfword of
                    * $11 and shift the next bytes down. */
    291            "3:       \r\n"
    292            "li       $8, 2\r\n"              /* st2 */
    293            "bltu     $9, $8, 4f\r\n"
    294            "nop      \r\n"
    295            "ush      $11, 0($10)\r\n"
    296            "srl      $11, 16\r\n"
    297            "subu     $9, $9, 2\r\n"
    298            PTR_ADDU  "$10, $10, 2\r\n"
    299 
                   /* If one byte remains, write the low byte of $11. */
    300            "4:       \r\n"
    301            "li       $8, 1\r\n"              /* st1 */
    302            "bltu     $9, $8, 5f\r\n"
    303            "nop      \r\n"
    304            "sb       $11, 0($10)\r\n"
    305 
    306            "5:       \r\n"
    307            "nop      \r\n"                   /* end */
                   /* Operands: %0 = *outptr (output), %1 = mmA, %2 = mmC,
                    * %3 = mmE, %4 = col, %5 = outptr.
                    * NOTE(review): .set noreorder is never paired with a
                    * .set reorder (or .set push/.set pop) in this statement -
                    * confirm that leaving the assembler in noreorder mode is
                    * intended. */
    308            : "=m" (*outptr)
    309            : "f" (mmA), "f" (mmC), "f" (mmE), "r" (col), "r" (outptr)
    310            : "$f4", "$f6", "$8", "$9", "$10", "$11", "memory"
    311           );
    312      }
    313 
    314 #else  /* RGB_PIXELSIZE == 4 */
    315 
             /* Filler component: every byte is 0xFF when RGBX_FILLER_0XFF is
              * defined (a value compared with itself), otherwise zero (a value
              * XORed with itself). */
    316 #ifdef RGBX_FILLER_0XFF
    317      xe = _mm_cmpeq_pi8(xe, xe);
    318      xo = _mm_cmpeq_pi8(xo, xo);
    319 #else
    320      xe = _mm_xor_si64(xe, xe);
    321      xo = _mm_xor_si64(xo, xo);
    322 #endif
             /* In the register comments below, each two-digit entry is
              * <component index><column index>. */
    323      /* mmA=(00 02 04 06 ** ** ** **), mmB=(01 03 05 07 ** ** ** **) */
    324      /* mmC=(10 12 14 16 ** ** ** **), mmD=(11 13 15 17 ** ** ** **) */
    325      /* mmE=(20 22 24 26 ** ** ** **), mmF=(21 23 25 27 ** ** ** **) */
    326      /* mmG=(30 32 34 36 ** ** ** **), mmH=(31 33 35 37 ** ** ** **) */
    327 
    328      mmA = _mm_unpacklo_pi8(mmA, mmC);       /* (00 10 02 12 04 14 06 16) */
    329      mmE = _mm_unpacklo_pi8(mmE, mmG);       /* (20 30 22 32 24 34 26 36) */
    330      mmB = _mm_unpacklo_pi8(mmB, mmD);       /* (01 11 03 13 05 15 07 17) */
    331      mmF = _mm_unpacklo_pi8(mmF, mmH);       /* (21 31 23 33 25 35 27 37) */
    332 
    333      mmC = _mm_unpackhi_pi16(mmA, mmE);      /* (04 14 24 34 06 16 26 36) */
    334      mmA = _mm_unpacklo_pi16(mmA, mmE);      /* (00 10 20 30 02 12 22 32) */
    335      mmG = _mm_unpackhi_pi16(mmB, mmF);      /* (05 15 25 35 07 17 27 37) */
    336      mmB = _mm_unpacklo_pi16(mmB, mmF);      /* (01 11 21 31 03 13 23 33) */
    337 
             /* After these merges mmA, mmD, mmC and mmH each hold two whole
              * pixels: columns 0-1, 2-3, 4-5 and 6-7 respectively. */
    338      mmD = _mm_unpackhi_pi32(mmA, mmB);      /* (02 12 22 32 03 13 23 33) */
    339      mmA = _mm_unpacklo_pi32(mmA, mmB);      /* (00 10 20 30 01 11 21 31) */
    340      mmH = _mm_unpackhi_pi32(mmC, mmG);      /* (06 16 26 36 07 17 27 37) */
    341      mmC = _mm_unpacklo_pi32(mmC, mmG);      /* (04 14 24 34 05 15 25 35) */
    342 
    343      if (num_cols >= 8) {
               /* Full group of 8 pixels: aligned stores when outptr is a
                * multiple of 8, the unaligned variant otherwise. */
    344        if (!(((long)outptr) & 7)) {
    345          _mm_store_si64((__m64 *)outptr, mmA);
    346          _mm_store_si64((__m64 *)(outptr + 8), mmD);
    347          _mm_store_si64((__m64 *)(outptr + 16), mmC);
    348          _mm_store_si64((__m64 *)(outptr + 24), mmH);
    349        } else {
    350          _mm_storeu_si64((__m64 *)outptr, mmA);
    351          _mm_storeu_si64((__m64 *)(outptr + 8), mmD);
    352          _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
    353          _mm_storeu_si64((__m64 *)(outptr + 24), mmH);
    354        }
    355        outptr += RGB_PIXELSIZE * 8;
    356      } else {
               /* Last, partial group of the row (1..7 columns): here col
                * counts whole pixels, written in steps of 4, 2 and 1 pixel.
                * outptr is not advanced; the loop ends after this
                * iteration. */
    357        col = num_cols;
    358        asm(".set noreorder\r\n"              /* st16 */
    359 
                   /* Setup: $9 = pixels left, $10 = write pointer,
                    * $f4 = mmA (columns 0-1), $f6 = mmD (columns 2-3). */
    360            "li       $8, 4\r\n"
    361            "move     $9, %6\r\n"
    362            "move     $10, %7\r\n"
    363            "mov.s    $f4, %2\r\n"
    364            "mov.s    $f6, %4\r\n"
    365            "bltu     $9, $8, 1f\r\n"
    366            "nop      \r\n"
                   /* At least 4 pixels left: write columns 0-3 (16 bytes),
                    * then load mmC (columns 4-5) and mmH (columns 6-7). */
    367            "gssdlc1  $f4, 7($10)\r\n"
    368            "gssdrc1  $f4, 0($10)\r\n"
    369            "gssdlc1  $f6, 7+8($10)\r\n"
    370            "gssdrc1  $f6, 8($10)\r\n"
    371            "mov.s    $f4, %3\r\n"
    372            "mov.s    $f6, %5\r\n"
    373            "subu     $9, $9, 4\r\n"
    374            PTR_ADDU  "$10, $10, 16\r\n"
    375 
                   /* At least 2 pixels left: write the pair held in $f4 and
                    * move the next pair from $f6 into $f4. */
    376            "1:       \r\n"
    377            "li       $8, 2\r\n"              /* st8 */
    378            "bltu     $9, $8, 2f\r\n"
    379            "nop      \r\n"
    380            "gssdlc1  $f4, 7($10)\r\n"
    381            "gssdrc1  $f4, 0($10)\r\n"
    382            "mov.s    $f4, $f6\r\n"
    383            "subu     $9, $9, 2\r\n"
    384            PTR_ADDU  "$10, $10, 8\r\n"
    385 
                   /* One pixel left: write the low 4 bytes of $f4. */
    386            "2:       \r\n"
    387            "li       $8, 1\r\n"              /* st4 */
    388            "bltu     $9, $8, 3f\r\n"
    389            "nop      \r\n"
    390            "gsswlc1  $f4, 3($10)\r\n"
    391            "gsswrc1  $f4, 0($10)\r\n"
    392 
                   /* col is also an output operand (%1); it is set to 0 here. */
    393            "3:       \r\n"
    394            "li       %1, 0\r\n"              /* end */
                   /* Operands: %0 = *outptr (output), %1 = col (output),
                    * %2 = mmA, %3 = mmC, %4 = mmD, %5 = mmH, %6 = col (input),
                    * %7 = outptr.
                    * NOTE(review): .set noreorder is never paired with a
                    * .set reorder (or .set push/.set pop) in this statement -
                    * confirm that leaving the assembler in noreorder mode is
                    * intended. */
    395            : "=m" (*outptr), "=r" (col)
    396            : "f" (mmA), "f" (mmC), "f" (mmD), "f" (mmH), "r" (col),
    397              "r" (outptr)
    398            : "$f4", "$f6", "$8", "$9", "$10", "memory"
    399           );
    400      }
    401 
    402 #endif
    403 
    404    }
    405  }
    406 }
    407 
    408 #undef mmA  /* Remove the eight component aliases defined at the top of
                     * this file.  NOTE(review): presumably this lets the
                     * including file include it again under a different
                     * RGB_* layout - confirm in jdcolor-mmi.c. */
    409 #undef mmB
    410 #undef mmC
    411 #undef mmD
    412 #undef mmE
    413 #undef mmF
    414 #undef mmG
    415 #undef mmH