tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jdmrgext-mmi.c (23377B)


      1 /*
      2 * Loongson MMI optimizations for libjpeg-turbo
      3 *
      4 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 * Copyright (C) 2015, 2019, D. R. Commander.  All Rights Reserved.
      6 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
      7 *                          All Rights Reserved.
      8 * Authors:  ZhangLixia <zhanglixia-hf@loongson.cn>
      9 *
     10 * Based on the x86 SIMD extension for IJG JPEG library
     11 * Copyright (C) 1999-2006, MIYASAKA Masaru.
     12 *
     13 * This software is provided 'as-is', without any express or implied
     14 * warranty.  In no event will the authors be held liable for any damages
     15 * arising from the use of this software.
     16 *
     17 * Permission is granted to anyone to use this software for any purpose,
     18 * including commercial applications, and to alter it and redistribute it
     19 * freely, subject to the following restrictions:
     20 *
     21 * 1. The origin of this software must not be misrepresented; you must not
     22 *    claim that you wrote the original software. If you use this software
     23 *    in a product, an acknowledgment in the product documentation would be
     24 *    appreciated but is not required.
     25 * 2. Altered source versions must be plainly marked as such, and must not be
     26 *    misrepresented as being the original software.
     27 * 3. This notice may not be removed or altered from any source distribution.
     28 */
     29 
     30 /* This file is included by jdmerge-mmi.c */
     31 
     32 
     33 #if RGB_RED == 0  /* mmA/mmB: even-/odd-pixel vectors of the component stored at byte offset 0 */
     34 #define mmA  re
     35 #define mmB  ro
     36 #elif RGB_GREEN == 0
     37 #define mmA  ge
     38 #define mmB  go
     39 #elif RGB_BLUE == 0
     40 #define mmA  be
     41 #define mmB  bo
     42 #else  /* no color component at this offset: alias the filler vectors xe/xo */
     43 #define mmA  xe
     44 #define mmB  xo
     45 #endif
     46 /* mmC/mmD: even-/odd-pixel vectors of the component stored at byte offset 1 */
     47 #if RGB_RED == 1
     48 #define mmC  re
     49 #define mmD  ro
     50 #elif RGB_GREEN == 1
     51 #define mmC  ge
     52 #define mmD  go
     53 #elif RGB_BLUE == 1
     54 #define mmC  be
     55 #define mmD  bo
     56 #else
     57 #define mmC  xe
     58 #define mmD  xo
     59 #endif
     60 /* mmE/mmF: even-/odd-pixel vectors of the component stored at byte offset 2 */
     61 #if RGB_RED == 2
     62 #define mmE  re
     63 #define mmF  ro
     64 #elif RGB_GREEN == 2
     65 #define mmE  ge
     66 #define mmF  go
     67 #elif RGB_BLUE == 2
     68 #define mmE  be
     69 #define mmF  bo
     70 #else
     71 #define mmE  xe
     72 #define mmF  xo
     73 #endif
     74 /* mmG/mmH: byte offset 3 (4-byte pixels); the 3-byte code path reuses them as scratch */
     75 #if RGB_RED == 3
     76 #define mmG  re
     77 #define mmH  ro
     78 #elif RGB_GREEN == 3
     79 #define mmG  ge
     80 #define mmH  go
     81 #elif RGB_BLUE == 3
     82 #define mmG  be
     83 #define mmH  bo
     84 #else
     85 #define mmG  xe
     86 #define mmH  xo
     87 #endif
     88 /* Merged h2v1 upsampling + YCbCr->RGB conversion: each Cb/Cr sample is shared by two */
     89 /* horizontally adjacent Y samples, and output_width packed pixels go to output_buf[0]. */
     90 void jsimd_h2v1_merged_upsample_mmi(JDIMENSION output_width,
     91                                    JSAMPIMAGE input_buf,
     92                                    JDIMENSION in_row_group_ctr,
     93                                    JSAMPARRAY output_buf)
     94 {
     95  JSAMPROW outptr, inptr0, inptr1, inptr2;
     96  int num_cols, col;
     97  __m64 ythise, ythiso, ythis, ynexte, ynexto, ynext, yl, y;
     98  __m64 cbl, cbl2, cbh, cbh2, cb, crl, crl2, crh, crh2, cr;
     99  __m64 rle, rlo, rl, rhe, rho, rh, re, ro;
    100  __m64 ga, gb, gle, glo, gl, gc, gd, ghe, gho, gh, ge, go;
    101  __m64 ble, blo, bl, bhe, bho, bh, be, bo, xe = 0.0, xo = 0.0;
    102  __m64 decenter, mask, zero = 0.0;
    103 #if RGB_PIXELSIZE == 4
    104  __m64 mm8, mm9;
    105 #endif
    106 /* Plane 0 = Y, 1 = Cb, 2 = Cr (row in_row_group_ctr of each); output goes to output_buf[0]. */
    107  inptr0 = input_buf[0][in_row_group_ctr];
    108  inptr1 = input_buf[1][in_row_group_ctr];
    109  inptr2 = input_buf[2][in_row_group_ctr];
    110  outptr = output_buf[0];
    111 /* num_cols counts chroma columns; one pass consumes 8 Cb, 8 Cr and 16 Y bytes. */
    112  for (num_cols = output_width >> 1; num_cols > 0; num_cols -= 8,
    113       inptr0 += 16, inptr1 += 8, inptr2 += 8) {
    114 /* NOTE(review): full-width loads occur even on a short final pass; assumes padded rows -- confirm. */
    115    cb = _mm_load_si64((__m64 *)inptr1);
    116    cr = _mm_load_si64((__m64 *)inptr2);
    117    ythis = _mm_load_si64((__m64 *)inptr0);
    118    ynext = _mm_load_si64((__m64 *)inptr0 + 1);
    119 /* Build per-16-bit-lane constants from all-ones: a low-byte mask and 0xFF80 (-128). */
    120    mask = decenter = 0.0;
    121    mask = _mm_cmpeq_pi16(mask, mask);
    122    decenter = _mm_cmpeq_pi16(decenter, decenter);
    123    mask = _mm_srli_pi16(mask, BYTE_BIT);   /* {0xFF 0x00 0xFF 0x00 ..} */
    124    decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
    125 /* Widen chroma bytes to 16-bit lanes and add decenter (i.e. subtract 128). */
    126    cbl = _mm_unpacklo_pi8(cb, zero);         /* Cb(0123) */
    127    cbh = _mm_unpackhi_pi8(cb, zero);         /* Cb(4567) */
    128    crl = _mm_unpacklo_pi8(cr, zero);         /* Cr(0123) */
    129    crh = _mm_unpackhi_pi8(cr, zero);         /* Cr(4567) */
    130    cbl = _mm_add_pi16(cbl, decenter);
    131    cbh = _mm_add_pi16(cbh, decenter);
    132    crl = _mm_add_pi16(crl, decenter);
    133    crh = _mm_add_pi16(crh, decenter);
    134 
    135    /* (Original)
    136     * R = Y                + 1.40200 * Cr
    137     * G = Y - 0.34414 * Cb - 0.71414 * Cr
    138     * B = Y + 1.77200 * Cb
    139     *
    140     * (This implementation)
    141     * R = Y                + 0.40200 * Cr + Cr
    142     * G = Y - 0.34414 * Cb + 0.28586 * Cr - Cr
    143     * B = Y - 0.22800 * Cb + Cb + Cb
    144     */
    145 /* Multiply 2*C so that the (+PW_ONE, >>1) step further down halves the mulhi result again. */
    146    cbl2 = _mm_add_pi16(cbl, cbl);            /* 2*CbL */
    147    cbh2 = _mm_add_pi16(cbh, cbh);            /* 2*CbH */
    148    crl2 = _mm_add_pi16(crl, crl);            /* 2*CrL */
    149    crh2 = _mm_add_pi16(crh, crh);            /* 2*CrH */
    150 
    151    bl = _mm_mulhi_pi16(cbl2, PW_MF0228);     /* (2*CbL * -FIX(0.22800)) */
    152    bh = _mm_mulhi_pi16(cbh2, PW_MF0228);     /* (2*CbH * -FIX(0.22800)) */
    153    rl = _mm_mulhi_pi16(crl2, PW_F0402);      /* (2*CrL * FIX(0.40200)) */
    154    rh = _mm_mulhi_pi16(crh2, PW_F0402);      /* (2*CrH * FIX(0.40200)) */
    155 
    156    bl = _mm_add_pi16(bl, PW_ONE);
    157    bh = _mm_add_pi16(bh, PW_ONE);
    158    bl = _mm_srai_pi16(bl, 1);                /* (CbL * -FIX(0.22800)) */
    159    bh = _mm_srai_pi16(bh, 1);                /* (CbH * -FIX(0.22800)) */
    160    rl = _mm_add_pi16(rl, PW_ONE);
    161    rh = _mm_add_pi16(rh, PW_ONE);
    162    rl = _mm_srai_pi16(rl, 1);                /* (CrL * FIX(0.40200)) */
    163    rh = _mm_srai_pi16(rh, 1);                /* (CrH * FIX(0.40200)) */
    164 
    165    bl = _mm_add_pi16(bl, cbl);
    166    bh = _mm_add_pi16(bh, cbh);
    167    bl = _mm_add_pi16(bl, cbl);               /* (CbL * FIX(1.77200))=(B-Y)L */
    168    bh = _mm_add_pi16(bh, cbh);               /* (CbH * FIX(1.77200))=(B-Y)H */
    169    rl = _mm_add_pi16(rl, crl);               /* (CrL * FIX(1.40200))=(R-Y)L */
    170    rh = _mm_add_pi16(rh, crh);               /* (CrH * FIX(1.40200))=(R-Y)H */
    171 /* Green: interleave (Cb, Cr) words so one madd yields Cb*-0.344 + Cr*0.285 per column. */
    172    ga = _mm_unpacklo_pi16(cbl, crl);
    173    gb = _mm_unpackhi_pi16(cbl, crl);
    174    ga = _mm_madd_pi16(ga, PW_MF0344_F0285);
    175    gb = _mm_madd_pi16(gb, PW_MF0344_F0285);
    176    gc = _mm_unpacklo_pi16(cbh, crh);
    177    gd = _mm_unpackhi_pi16(cbh, crh);
    178    gc = _mm_madd_pi16(gc, PW_MF0344_F0285);
    179    gd = _mm_madd_pi16(gd, PW_MF0344_F0285);
    180 
    181    ga = _mm_add_pi32(ga, PD_ONEHALF);
    182    gb = _mm_add_pi32(gb, PD_ONEHALF);
    183    ga = _mm_srai_pi32(ga, SCALEBITS);
    184    gb = _mm_srai_pi32(gb, SCALEBITS);
    185    gc = _mm_add_pi32(gc, PD_ONEHALF);
    186    gd = _mm_add_pi32(gd, PD_ONEHALF);
    187    gc = _mm_srai_pi32(gc, SCALEBITS);
    188    gd = _mm_srai_pi32(gd, SCALEBITS);
    189 
    190    gl = _mm_packs_pi32(ga, gb);           /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
    191    gh = _mm_packs_pi32(gc, gd);           /* CbH*-FIX(0.344)+CrH*FIX(0.285) */
    192    gl = _mm_sub_pi16(gl, crl);    /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
    193    gh = _mm_sub_pi16(gh, crh);    /* CbH*-FIX(0.344)+CrH*-FIX(0.714)=(G-Y)H */
    194 /* Split luma bytes into even- and odd-pixel words so each chroma word serves both pixels. */
    195    ythise = _mm_and_si64(mask, ythis);       /* Y(0246) */
    196    ythiso = _mm_srli_pi16(ythis, BYTE_BIT);  /* Y(1357) */
    197    ynexte = _mm_and_si64(mask, ynext);       /* Y(8ACE) */
    198    ynexto = _mm_srli_pi16(ynext, BYTE_BIT);  /* Y(9BDF) */
    199 
    200    rle = _mm_add_pi16(rl, ythise);           /* (R0 R2 R4 R6) */
    201    rlo = _mm_add_pi16(rl, ythiso);           /* (R1 R3 R5 R7) */
    202    rhe = _mm_add_pi16(rh, ynexte);           /* (R8 RA RC RE) */
    203    rho = _mm_add_pi16(rh, ynexto);           /* (R9 RB RD RF) */
    204    re = _mm_packs_pu16(rle, rhe);            /* (R0 R2 R4 R6 R8 RA RC RE) */
    205    ro = _mm_packs_pu16(rlo, rho);            /* (R1 R3 R5 R7 R9 RB RD RF) */
    206 
    207    gle = _mm_add_pi16(gl, ythise);           /* (G0 G2 G4 G6) */
    208    glo = _mm_add_pi16(gl, ythiso);           /* (G1 G3 G5 G7) */
    209    ghe = _mm_add_pi16(gh, ynexte);           /* (G8 GA GC GE) */
    210    gho = _mm_add_pi16(gh, ynexto);           /* (G9 GB GD GF) */
    211    ge = _mm_packs_pu16(gle, ghe);            /* (G0 G2 G4 G6 G8 GA GC GE) */
    212    go = _mm_packs_pu16(glo, gho);            /* (G1 G3 G5 G7 G9 GB GD GF) */
    213 
    214    ble = _mm_add_pi16(bl, ythise);           /* (B0 B2 B4 B6) */
    215    blo = _mm_add_pi16(bl, ythiso);           /* (B1 B3 B5 B7) */
    216    bhe = _mm_add_pi16(bh, ynexte);           /* (B8 BA BC BE) */
    217    bho = _mm_add_pi16(bh, ynexto);           /* (B9 BB BD BF) */
    218    be = _mm_packs_pu16(ble, bhe);            /* (B0 B2 B4 B6 B8 BA BC BE) */
    219    bo = _mm_packs_pu16(blo, bho);            /* (B1 B3 B5 B7 B9 BB BD BF) */
    220 
    221 #if RGB_PIXELSIZE == 3
    222 
    223    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
    224    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
    225    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
    226    mmG = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
    227    mmA = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
    228    mmH = _mm_unpacklo_pi8(mmE, mmB);         /* (20 01 22 03 24 05 26 07) */
    229    mmE = _mm_unpackhi_pi8(mmE, mmB);         /* (28 09 2A 0B 2C 0D 2E 0F) */
    230    mmC = _mm_unpacklo_pi8(mmD, mmF);         /* (11 21 13 23 15 25 17 27) */
    231    mmD = _mm_unpackhi_pi8(mmD, mmF);         /* (19 29 1B 2B 1D 2D 1F 2F) */
    232 
    233    mmB = _mm_unpacklo_pi16(mmG, mmA);        /* (00 10 08 18 02 12 0A 1A) */
    234    mmA = _mm_unpackhi_pi16(mmG, mmA);        /* (04 14 0C 1C 06 16 0E 1E) */
    235    mmF = _mm_unpacklo_pi16(mmH, mmE);        /* (20 01 28 09 22 03 2A 0B) */
    236    mmE = _mm_unpackhi_pi16(mmH, mmE);        /* (24 05 2C 0D 26 07 2E 0F) */
    237    mmH = _mm_unpacklo_pi16(mmC, mmD);        /* (11 21 19 29 13 23 1B 2B) */
    238    mmG = _mm_unpackhi_pi16(mmC, mmD);        /* (15 25 1D 2D 17 27 1F 2F) */
    239 
    240    mmC = _mm_unpacklo_pi16(mmB, mmF);        /* (00 10 20 01 08 18 28 09) */
    241    mmB = _mm_srli_si64(mmB, 4 * BYTE_BIT);
    242    mmB = _mm_unpacklo_pi16(mmH, mmB);        /* (11 21 02 12 19 29 0A 1A) */
    243    mmD = _mm_unpackhi_pi16(mmF, mmH);        /* (22 03 13 23 2A 0B 1B 2B) */
    244    mmF = _mm_unpacklo_pi16(mmA, mmE);        /* (04 14 24 05 0C 1C 2C 0D) */
    245    mmA = _mm_srli_si64(mmA, 4 * BYTE_BIT);
    246    mmH = _mm_unpacklo_pi16(mmG, mmA);        /* (15 25 06 16 1D 2D 0E 1E) */
    247    mmG = _mm_unpackhi_pi16(mmE, mmG);        /* (26 07 17 27 2E 0F 1F 2F) */
    248 
    249    mmA = _mm_unpacklo_pi32(mmC, mmB);        /* (00 10 20 01 11 21 02 12) */
    250    mmE = _mm_unpackhi_pi32(mmC, mmB);        /* (08 18 28 09 19 29 0A 1A) */
    251    mmB = _mm_unpacklo_pi32(mmD, mmF);        /* (22 03 13 23 04 14 24 05) */
    252    mmF = _mm_unpackhi_pi32(mmD, mmF);        /* (2A 0B 1B 2B 0C 1C 2C 0D) */
    253    mmC = _mm_unpacklo_pi32(mmH, mmG);        /* (15 25 06 16 26 07 17 27) */
    254    mmG = _mm_unpackhi_pi32(mmH, mmG);        /* (1D 2D 0E 1E 2E 0F 1F 2F) */
    255 /* A full pass stores all 48 bytes (16 pixels); a short final pass stores only col bytes. */
    256    if (num_cols >= 8) {
    257      if (!(((long)outptr) & 7)) {
    258        _mm_store_si64((__m64 *)outptr, mmA);
    259        _mm_store_si64((__m64 *)(outptr + 8), mmB);
    260        _mm_store_si64((__m64 *)(outptr + 16), mmC);
    261        _mm_store_si64((__m64 *)(outptr + 24), mmE);
    262        _mm_store_si64((__m64 *)(outptr + 32), mmF);
    263        _mm_store_si64((__m64 *)(outptr + 40), mmG);
    264      } else {
    265        _mm_storeu_si64((__m64 *)outptr, mmA);
    266        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
    267        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
    268        _mm_storeu_si64((__m64 *)(outptr + 24), mmE);
    269        _mm_storeu_si64((__m64 *)(outptr + 32), mmF);
    270        _mm_storeu_si64((__m64 *)(outptr + 40), mmG);
    271      }
    272      outptr += RGB_PIXELSIZE * 16;
    273    } else {
    274      if (output_width & 1)
    275        col = num_cols * 6 + 3;
    276      else
    277        col = num_cols * 6;
    278 /* col = bytes to emit: 6 per remaining chroma column, plus 3 when output_width is odd. */
    279      asm(".set noreorder\r\n"                /* st24 */
    280 /* $9 = bytes left, $10 = destination; the stages below store 24/16/8/4/2/1 bytes in turn. */
    281          "li       $8, 24\r\n"
    282          "move     $9, %7\r\n"
    283          "mov.s    $f4, %1\r\n"
    284          "mov.s    $f6, %2\r\n"
    285          "mov.s    $f8, %3\r\n"
    286          "move     $10, %8\r\n"
    287          "bltu     $9, $8, 1f\r\n"
    288          "nop      \r\n"
    289          "gssdlc1  $f4, 7($10)\r\n"
    290          "gssdrc1  $f4, 0($10)\r\n"
    291          "gssdlc1  $f6, 7+8($10)\r\n"
    292          "gssdrc1  $f6, 8($10)\r\n"
    293          "gssdlc1  $f8, 7+16($10)\r\n"
    294          "gssdrc1  $f8, 16($10)\r\n"
    295          "mov.s    $f4, %4\r\n"
    296          "mov.s    $f6, %5\r\n"
    297          "mov.s    $f8, %6\r\n"
    298          "subu     $9, $9, 24\r\n"
    299          PTR_ADDU  "$10, $10, 24\r\n"
    300 
    301          "1:       \r\n"
    302          "li       $8, 16\r\n"               /* st16 */
    303          "bltu     $9, $8, 2f\r\n"
    304          "nop      \r\n"
    305          "gssdlc1  $f4, 7($10)\r\n"
    306          "gssdrc1  $f4, 0($10)\r\n"
    307          "gssdlc1  $f6, 7+8($10)\r\n"
    308          "gssdrc1  $f6, 8($10)\r\n"
    309          "mov.s    $f4, $f8\r\n"
    310          "subu     $9, $9, 16\r\n"
    311          PTR_ADDU  "$10, $10, 16\r\n"
    312 
    313          "2:       \r\n"
    314          "li       $8,  8\r\n"               /* st8 */
    315          "bltu     $9, $8, 3f\r\n"
    316          "nop      \r\n"
    317          "gssdlc1  $f4, 7($10)\r\n"
    318          "gssdrc1  $f4, 0($10)\r\n"
    319          "mov.s    $f4, $f6\r\n"
    320          "subu     $9, $9, 8\r\n"
    321          PTR_ADDU  "$10, $10, 8\r\n"
    322 
    323          "3:       \r\n"
    324          "li       $8,  4\r\n"               /* st4 */
    325          "mfc1     $11, $f4\r\n"
    326          "bltu     $9, $8, 4f\r\n"
    327          "nop      \r\n"
    328          "swl      $11, 3($10)\r\n"
    329          "swr      $11, 0($10)\r\n"
    330          "li       $8, 32\r\n"
    331          "mtc1     $8, $f6\r\n"
    332          "dsrl     $f4, $f4, $f6\r\n"
    333          "mfc1     $11, $f4\r\n"
    334          "subu     $9, $9, 4\r\n"
    335          PTR_ADDU  "$10, $10, 4\r\n"
    336 
    337          "4:       \r\n"
    338          "li       $8, 2\r\n"                /* st2 */
    339          "bltu     $9, $8, 5f\r\n"
    340          "nop      \r\n"
    341          "ush      $11, 0($10)\r\n"
    342          "srl      $11, 16\r\n"
    343          "subu     $9, $9, 2\r\n"
    344          PTR_ADDU  "$10, $10, 2\r\n"
    345 
    346          "5:       \r\n"
    347          "li       $8, 1\r\n"                /* st1 */
    348          "bltu     $9, $8, 6f\r\n"
    349          "nop      \r\n"
    350          "sb       $11, 0($10)\r\n"
    351 
    352          "6:       \r\n"
    353          "nop      \r\n"                     /* end */
    354          : "=m" (*outptr)
    355          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmE), "f" (mmF),
    356            "f" (mmG), "r" (col), "r" (outptr)
    357          : "$f4", "$f6", "$f8", "$8", "$9", "$10", "$11", "memory"
    358         );
    359    }
    360 
    361 #else  /* RGB_PIXELSIZE == 4 */
    362 
    363 #ifdef RGBX_FILLER_0XFF
    364    xe = _mm_cmpeq_pi8(xe, xe);
    365    xo = _mm_cmpeq_pi8(xo, xo);
    366 #else
    367    xe = _mm_xor_si64(xe, xe);
    368    xo = _mm_xor_si64(xo, xo);
    369 #endif
    370    /* mmA=(00 02 04 06 08 0A 0C 0E), mmB=(01 03 05 07 09 0B 0D 0F) */
    371    /* mmC=(10 12 14 16 18 1A 1C 1E), mmD=(11 13 15 17 19 1B 1D 1F) */
    372    /* mmE=(20 22 24 26 28 2A 2C 2E), mmF=(21 23 25 27 29 2B 2D 2F) */
    373    /* mmG=(30 32 34 36 38 3A 3C 3E), mmH=(31 33 35 37 39 3B 3D 3F) */
    374 
    375    mm8 = _mm_unpacklo_pi8(mmA, mmC);         /* (00 10 02 12 04 14 06 16) */
    376    mm9 = _mm_unpackhi_pi8(mmA, mmC);         /* (08 18 0A 1A 0C 1C 0E 1E) */
    377    mmA = _mm_unpacklo_pi8(mmE, mmG);         /* (20 30 22 32 24 34 26 36) */
    378    mmE = _mm_unpackhi_pi8(mmE, mmG);         /* (28 38 2A 3A 2C 3C 2E 3E) */
    379 
    380    mmG = _mm_unpacklo_pi8(mmB, mmD);         /* (01 11 03 13 05 15 07 17) */
    381    mmB = _mm_unpackhi_pi8(mmB, mmD);         /* (09 19 0B 1B 0D 1D 0F 1F) */
    382    mmD = _mm_unpacklo_pi8(mmF, mmH);         /* (21 31 23 33 25 35 27 37) */
    383    mmF = _mm_unpackhi_pi8(mmF, mmH);         /* (29 39 2B 3B 2D 3D 2F 3F) */
    384 
    385    mmH = _mm_unpacklo_pi16(mm8, mmA);        /* (00 10 20 30 02 12 22 32) */
    386    mm8 = _mm_unpackhi_pi16(mm8, mmA);        /* (04 14 24 34 06 16 26 36) */
    387    mmA = _mm_unpacklo_pi16(mmG, mmD);        /* (01 11 21 31 03 13 23 33) */
    388    mmD = _mm_unpackhi_pi16(mmG, mmD);        /* (05 15 25 35 07 17 27 37) */
    389 
    390    mmG = _mm_unpackhi_pi16(mm9, mmE);        /* (0C 1C 2C 3C 0E 1E 2E 3E) */
    391    mm9 = _mm_unpacklo_pi16(mm9, mmE);        /* (08 18 28 38 0A 1A 2A 3A) */
    392    mmE = _mm_unpacklo_pi16(mmB, mmF);        /* (09 19 29 39 0B 1B 2B 3B) */
    393    mmF = _mm_unpackhi_pi16(mmB, mmF);        /* (0D 1D 2D 3D 0F 1F 2F 3F) */
    394 
    395    mmB = _mm_unpackhi_pi32(mmH, mmA);        /* (02 12 22 32 03 13 23 33) */
    396    mmA = _mm_unpacklo_pi32(mmH, mmA);        /* (00 10 20 30 01 11 21 31) */
    397    mmC = _mm_unpacklo_pi32(mm8, mmD);        /* (04 14 24 34 05 15 25 35) */
    398    mmD = _mm_unpackhi_pi32(mm8, mmD);        /* (06 16 26 36 07 17 27 37) */
    399 
    400    mmH = _mm_unpackhi_pi32(mmG, mmF);        /* (0E 1E 2E 3E 0F 1F 2F 3F) */
    401    mmG = _mm_unpacklo_pi32(mmG, mmF);        /* (0C 1C 2C 3C 0D 1D 2D 3D) */
    402    mmF = _mm_unpackhi_pi32(mm9, mmE);        /* (0A 1A 2A 3A 0B 1B 2B 3B) */
    403    mmE = _mm_unpacklo_pi32(mm9, mmE);        /* (08 18 28 38 09 19 29 39) */
    404 /* A full pass stores all 64 bytes (16 pixels); a short final pass stores only col pixels. */
    405    if (num_cols >= 8) {
    406      if (!(((long)outptr) & 7)) {
    407        _mm_store_si64((__m64 *)outptr, mmA);
    408        _mm_store_si64((__m64 *)(outptr + 8), mmB);
    409        _mm_store_si64((__m64 *)(outptr + 16), mmC);
    410        _mm_store_si64((__m64 *)(outptr + 24), mmD);
    411        _mm_store_si64((__m64 *)(outptr + 32), mmE);
    412        _mm_store_si64((__m64 *)(outptr + 40), mmF);
    413        _mm_store_si64((__m64 *)(outptr + 48), mmG);
    414        _mm_store_si64((__m64 *)(outptr + 56), mmH);
    415      } else {
    416        _mm_storeu_si64((__m64 *)outptr, mmA);
    417        _mm_storeu_si64((__m64 *)(outptr + 8), mmB);
    418        _mm_storeu_si64((__m64 *)(outptr + 16), mmC);
    419        _mm_storeu_si64((__m64 *)(outptr + 24), mmD);
    420        _mm_storeu_si64((__m64 *)(outptr + 32), mmE);
    421        _mm_storeu_si64((__m64 *)(outptr + 40), mmF);
    422        _mm_storeu_si64((__m64 *)(outptr + 48), mmG);
    423        _mm_storeu_si64((__m64 *)(outptr + 56), mmH);
    424      }
    425      outptr += RGB_PIXELSIZE * 16;
    426    } else {
    427      if (output_width & 1)
    428        col = num_cols * 2 + 1;
    429      else
    430        col = num_cols * 2;
    431      asm(".set noreorder\r\n"                /* st32 */
    432 /* $9 = pixels left (4 bytes each), $10 = destination; the stages store 8/4/2/1 pixels in turn. */
    433          "li       $8, 8\r\n"
    434          "move     $9, %10\r\n"
    435          "move     $10, %11\r\n"
    436          "mov.s    $f4, %2\r\n"
    437          "mov.s    $f6, %3\r\n"
    438          "mov.s    $f8, %4\r\n"
    439          "mov.s    $f10, %5\r\n"
    440          "bltu     $9, $8, 1f\r\n"
    441          "nop      \r\n"
    442          "gssdlc1  $f4, 7($10)\r\n"
    443          "gssdrc1  $f4, 0($10)\r\n"
    444          "gssdlc1  $f6, 7+8($10)\r\n"
    445          "gssdrc1  $f6, 8($10)\r\n"
    446          "gssdlc1  $f8, 7+16($10)\r\n"
    447          "gssdrc1  $f8, 16($10)\r\n"
    448          "gssdlc1  $f10, 7+24($10)\r\n"
    449          "gssdrc1  $f10, 24($10)\r\n"
    450          "mov.s    $f4, %6\r\n"
    451          "mov.s    $f6, %7\r\n"
    452          "mov.s    $f8, %8\r\n"
    453          "mov.s    $f10, %9\r\n"
    454          "subu     $9, $9, 8\r\n"
    455          PTR_ADDU  "$10, $10, 32\r\n"
    456 
    457          "1:       \r\n"
    458          "li       $8, 4\r\n"                /* st16 */
    459          "bltu     $9, $8, 2f\r\n"
    460          "nop      \r\n"
    461          "gssdlc1  $f4, 7($10)\r\n"
    462          "gssdrc1  $f4, 0($10)\r\n"
    463          "gssdlc1  $f6, 7+8($10)\r\n"
    464          "gssdrc1  $f6, 8($10)\r\n"
    465          "mov.s    $f4, $f8\r\n"
    466          "mov.s    $f6, $f10\r\n"
    467          "subu     $9, $9, 4\r\n"
    468          PTR_ADDU  "$10, $10, 16\r\n"
    469 
    470          "2:       \r\n"
    471          "li       $8, 2\r\n"                /* st8 */
    472          "bltu     $9, $8, 3f\r\n"
    473          "nop      \r\n"
    474          "gssdlc1  $f4, 7($10)\r\n"
    475          "gssdrc1  $f4, 0($10)\r\n"
    476          "mov.s    $f4, $f6\r\n"
    477          "subu     $9, $9, 2\r\n"
    478          PTR_ADDU  "$10, $10, 8\r\n"
    479 
    480          "3:       \r\n"
    481          "li       $8, 1\r\n"                /* st4 */
    482          "bltu     $9, $8, 4f\r\n"
    483          "nop      \r\n"
    484          "gsswlc1  $f4, 3($10)\r\n"
    485          "gsswrc1  $f4, 0($10)\r\n"
    486 
    487          "4:       \r\n"
    488          "li       %1, 0\r\n"                /* end */
    489          : "=m" (*outptr), "=r" (col)
    490          : "f" (mmA), "f" (mmB), "f" (mmC), "f" (mmD), "f" (mmE), "f" (mmF),
    491            "f" (mmG), "f" (mmH), "r" (col), "r" (outptr)
    492          : "$f4", "$f6", "$f8", "$f10", "$8", "$9", "$10", "memory"
    493         );
    494    }
    495 
    496 #endif
    497 
    498  }
    499 /* If the chroma column count is a multiple of 8 the loop only made full passes, so an odd last pixel is still pending. */
    500  if (!((output_width >> 1) & 7)) {
    501    if (output_width & 1) {
    502      cb = _mm_load_si64((__m64 *)inptr1);
    503      cr = _mm_load_si64((__m64 *)inptr2);
    504      y = _mm_load_si64((__m64 *)inptr0);
    505 /* Same conversion as the loop body on the low halves only; just the first pixel is stored below. */
    506      decenter = 0.0;
    507      decenter = _mm_cmpeq_pi16(decenter, decenter);
    508      decenter = _mm_slli_pi16(decenter, 7);  /* {0xFF80 0xFF80 0xFF80 0xFF80} */
    509 
    510      cbl = _mm_unpacklo_pi8(cb, zero);       /* Cb(0123) */
    511      crl = _mm_unpacklo_pi8(cr, zero);       /* Cr(0123) */
    512      cbl = _mm_add_pi16(cbl, decenter);
    513      crl = _mm_add_pi16(crl, decenter);
    514 
    515      cbl2 = _mm_add_pi16(cbl, cbl);          /* 2*CbL */
    516      crl2 = _mm_add_pi16(crl, crl);          /* 2*CrL */
    517      bl = _mm_mulhi_pi16(cbl2, PW_MF0228);   /* (2*CbL * -FIX(0.22800)) */
    518      rl = _mm_mulhi_pi16(crl2, PW_F0402);    /* (2*CrL * FIX(0.40200)) */
    519 
    520      bl = _mm_add_pi16(bl, PW_ONE);
    521      bl = _mm_srai_pi16(bl, 1);              /* (CbL * -FIX(0.22800)) */
    522      rl = _mm_add_pi16(rl, PW_ONE);
    523      rl = _mm_srai_pi16(rl, 1);              /* (CrL * FIX(0.40200)) */
    524 
    525      bl = _mm_add_pi16(bl, cbl);
    526      bl = _mm_add_pi16(bl, cbl);             /* (CbL * FIX(1.77200))=(B-Y)L */
    527      rl = _mm_add_pi16(rl, crl);             /* (CrL * FIX(1.40200))=(R-Y)L */
    528 
    529      gl = _mm_unpacklo_pi16(cbl, crl);
    530      gl = _mm_madd_pi16(gl, PW_MF0344_F0285);
    531      gl = _mm_add_pi32(gl, PD_ONEHALF);
    532      gl = _mm_srai_pi32(gl, SCALEBITS);
    533      gl = _mm_packs_pi32(gl, zero);       /* CbL*-FIX(0.344)+CrL*FIX(0.285) */
    534      gl = _mm_sub_pi16(gl, crl);  /* CbL*-FIX(0.344)+CrL*-FIX(0.714)=(G-Y)L */
    535 
    536      yl = _mm_unpacklo_pi8(y, zero);         /* Y(0123) */
    537      rl = _mm_add_pi16(rl, yl);              /* (R0 R1 R2 R3) */
    538      gl = _mm_add_pi16(gl, yl);              /* (G0 G1 G2 G3) */
    539      bl = _mm_add_pi16(bl, yl);              /* (B0 B1 B2 B3) */
    540      re = _mm_packs_pu16(rl, rl);
    541      ge = _mm_packs_pu16(gl, gl);
    542      be = _mm_packs_pu16(bl, bl);
    543 #if RGB_PIXELSIZE == 3
    544      mmA = _mm_unpacklo_pi8(mmA, mmC);
    545      mmA = _mm_unpacklo_pi16(mmA, mmE);
    546      asm(".set noreorder\r\n"
    547 /* Write one 3-byte pixel: a halfword at offset 0, then the byte at offset 2. */
    548          "move    $8, %2\r\n"
    549          "mov.s   $f4, %1\r\n"
    550          "mfc1    $9, $f4\r\n"
    551          "ush     $9, 0($8)\r\n"
    552          "srl     $9, 16\r\n"
    553          "sb      $9, 2($8)\r\n"
    554          : "=m" (*outptr)
    555          : "f" (mmA), "r" (outptr)
    556          : "$f4", "$8", "$9", "memory"
    557         );
    558 #else  /* RGB_PIXELSIZE == 4 */
    559 
    560 #ifdef RGBX_FILLER_0XFF
    561      xe = _mm_cmpeq_pi8(xe, xe);
    562 #else
    563      xe = _mm_xor_si64(xe, xe);
    564 #endif
    565      mmA = _mm_unpacklo_pi8(mmA, mmC);
    566      mmE = _mm_unpacklo_pi8(mmE, mmG);
    567      mmA = _mm_unpacklo_pi16(mmA, mmE);
    568      asm(".set noreorder\r\n"
    569 /* Write one 4-byte pixel at outptr via the gsswlc1/gsswrc1 pair (offsets 3 and 0). */
    570          "move    $8, %2\r\n"
    571          "mov.s   $f4, %1\r\n"
    572          "gsswlc1 $f4, 3($8)\r\n"
    573          "gsswrc1 $f4, 0($8)\r\n"
    574          : "=m" (*outptr)
    575          : "f" (mmA), "r" (outptr)
    576          : "$f4", "$8", "memory"
    577         );
    578 #endif
    579    }
    580  }
    581 }
    582 /* Merged h2v2 upsampling + color conversion: calls the h2v1 routine twice, once for each of */
    583 /* luma rows 2*in_row_group_ctr and 2*in_row_group_ctr+1, writing output_buf[0] and [1]. */
    584 void jsimd_h2v2_merged_upsample_mmi(JDIMENSION output_width,
    585                                    JSAMPIMAGE input_buf,
    586                                    JDIMENSION in_row_group_ctr,
    587                                    JSAMPARRAY output_buf)
    588 {
    589  JSAMPROW inptr, outptr;
    590 /* Save the two array slots that are temporarily overwritten below. */
    591  inptr = input_buf[0][in_row_group_ctr];
    592  outptr = output_buf[0];
    593 /* Pass 1: expose luma row 2*ctr through the slot the h2v1 routine reads; output row 0. */
    594  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2];
    595  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
    596                                 output_buf);
    597 /* Pass 2: luma row 2*ctr+1; output_buf[1] is aliased into slot 0 so it receives this row. */
    598  input_buf[0][in_row_group_ctr] = input_buf[0][in_row_group_ctr * 2 + 1];
    599  output_buf[0] = output_buf[1];
    600  jsimd_h2v1_merged_upsample_mmi(output_width, input_buf, in_row_group_ctr,
    601                                 output_buf);
    602 /* Put the caller's original pointers back. */
    603  input_buf[0][in_row_group_ctr] = inptr;
    604  output_buf[0] = outptr;
    605 }
    606 
    607 /* Drop the mmA..mmH aliases defined at the top of this file (presumably so it can be re-included with another RGB_* layout -- confirm in jdmerge-mmi.c). */
    608 #undef mmA
    609 #undef mmB
    610 #undef mmC
    611 #undef mmD
    612 #undef mmE
    613 #undef mmF
    614 #undef mmG
    615 #undef mmH