tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jidctint.c (106340B)


      1 /*
      2 * jidctint.c
      3 *
      4 * This file was part of the Independent JPEG Group's software:
      5 * Copyright (C) 1991-1998, Thomas G. Lane.
      6 * Modification developed 2002-2018 by Guido Vollbeding.
      7 * libjpeg-turbo Modifications:
      8 * Copyright (C) 2015, 2020, 2022, D. R. Commander.
      9 * For conditions of distribution and use, see the accompanying README.ijg
     10 * file.
     11 *
     12 * This file contains a slower but more accurate integer implementation of the
     13 * inverse DCT (Discrete Cosine Transform).  In the IJG code, this routine
     14 * must also perform dequantization of the input coefficients.
     15 *
     16 * A 2-D IDCT can be done by 1-D IDCT on each column followed by 1-D IDCT
     17 * on each row (or vice versa, but it's more convenient to emit a row at
     18 * a time).  Direct algorithms are also available, but they are much more
     19 * complex and seem not to be any faster when reduced to code.
     20 *
     21 * This implementation is based on an algorithm described in
     22 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
     23 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
     24 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
     25 * The primary algorithm described there uses 11 multiplies and 29 adds.
     26 * We use their alternate method with 12 multiplies and 32 adds.
     27 * The advantage of this method is that no data path contains more than one
     28 * multiplication; this allows a very simple and accurate implementation in
     29 * scaled fixed-point arithmetic, with a minimal number of shifts.
     30 *
     31 * We also provide IDCT routines with various output sample block sizes for
     32 * direct resolution reduction or enlargement without additional resampling:
     33 * NxN (N=1...16) pixels for one 8x8 input DCT block.
     34 *
     35 * For N<8 we simply take the corresponding low-frequency coefficients of
     36 * the 8x8 input DCT block and apply an NxN point IDCT on the sub-block
     37 * to yield the downscaled outputs.
     38 * This can be seen as direct low-pass downsampling from the DCT domain
     39 * point of view rather than the usual spatial domain point of view,
     40 * yielding significant computational savings and results at least
     41 * as good as common bilinear (averaging) spatial downsampling.
     42 *
     43 * For N>8 we apply a partial NxN IDCT on the 8 input coefficients as
     44 * lower frequencies and higher frequencies assumed to be zero.
     45 * It turns out that the computational effort is similar to the 8x8 IDCT
     46 * regarding the output size.
     47 * Furthermore, the scaling and descaling is the same for all IDCT sizes.
     48 *
     49 * CAUTION: We rely on the FIX() macro except for the N=1,2,4,8 cases
     50 * since there would be too many additional constants to pre-calculate.
     51 */
     52 
     53 #define JPEG_INTERNALS
     54 #include "jinclude.h"
     55 #include "jpeglib.h"
     56 #include "jdct.h"               /* Private declarations for DCT subsystem */
     57 
     58 #ifdef DCT_ISLOW_SUPPORTED
     59 
     60 
     61 /*
     62 * This module is specialized to the case DCTSIZE = 8.
     63 */
     64 
     65 #if DCTSIZE != 8
     66  Sorry, this code only copes with 8x8 DCT blocks. /* deliberate syntax err */
     67 #endif
     68 
     69 
     70 /*
     71 * The poop on this scaling stuff is as follows:
     72 *
     73 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
     74 * larger than the true IDCT outputs.  The final outputs are therefore
     75 * a factor of N larger than desired; since N=8 this can be cured by
     76 * a simple right shift at the end of the algorithm.  The advantage of
     77 * this arrangement is that we save two multiplications per 1-D IDCT,
     78 * because the y0 and y4 inputs need not be divided by sqrt(N).
     79 *
     80 * We have to do addition and subtraction of the integer inputs, which
     81 * is no problem, and multiplication by fractional constants, which is
     82 * a problem to do in integer arithmetic.  We multiply all the constants
     83 * by CONST_SCALE and convert them to integer constants (thus retaining
     84 * CONST_BITS bits of precision in the constants).  After doing a
     85 * multiplication we have to divide the product by CONST_SCALE, with proper
     86 * rounding, to produce the correct output.  This division can be done
     87 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
     88 * as long as possible so that partial sums can be added together with
     89 * full fractional precision.
     90 *
     91 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
     92 * they are represented to better-than-integral precision.  These outputs
     93 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
     94 * with the recommended scaling.  (To scale up 12-bit sample data further, an
     95 * intermediate JLONG array would be needed.)
     96 *
     97 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
     98 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
     99 * shows that the values given below are the most effective.
    100 */
    101 
    102 #if BITS_IN_JSAMPLE == 8
    103 #define CONST_BITS  13
    104 #define PASS1_BITS  2
    105 #else
    106 #define CONST_BITS  13
    107 #define PASS1_BITS  1           /* lose a little precision to avoid overflow */
    108 #endif
    109 
    110 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
    111 * causing a lot of useless floating-point operations at run time.
    112 * To get around this we use the following pre-calculated constants.
    113 * If you change CONST_BITS you may want to add appropriate values.
    114 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
    115 */
    116 
    117 #if CONST_BITS == 13
    118 #define FIX_0_298631336  ((JLONG)2446)          /* FIX(0.298631336) */
    119 #define FIX_0_390180644  ((JLONG)3196)          /* FIX(0.390180644) */
    120 #define FIX_0_541196100  ((JLONG)4433)          /* FIX(0.541196100) */
    121 #define FIX_0_765366865  ((JLONG)6270)          /* FIX(0.765366865) */
    122 #define FIX_0_899976223  ((JLONG)7373)          /* FIX(0.899976223) */
    123 #define FIX_1_175875602  ((JLONG)9633)          /* FIX(1.175875602) */
    124 #define FIX_1_501321110  ((JLONG)12299)         /* FIX(1.501321110) */
    125 #define FIX_1_847759065  ((JLONG)15137)         /* FIX(1.847759065) */
    126 #define FIX_1_961570560  ((JLONG)16069)         /* FIX(1.961570560) */
    127 #define FIX_2_053119869  ((JLONG)16819)         /* FIX(2.053119869) */
    128 #define FIX_2_562915447  ((JLONG)20995)         /* FIX(2.562915447) */
    129 #define FIX_3_072711026  ((JLONG)25172)         /* FIX(3.072711026) */
    130 #else
    131 #define FIX_0_298631336  FIX(0.298631336)
    132 #define FIX_0_390180644  FIX(0.390180644)
    133 #define FIX_0_541196100  FIX(0.541196100)
    134 #define FIX_0_765366865  FIX(0.765366865)
    135 #define FIX_0_899976223  FIX(0.899976223)
    136 #define FIX_1_175875602  FIX(1.175875602)
    137 #define FIX_1_501321110  FIX(1.501321110)
    138 #define FIX_1_847759065  FIX(1.847759065)
    139 #define FIX_1_961570560  FIX(1.961570560)
    140 #define FIX_2_053119869  FIX(2.053119869)
    141 #define FIX_2_562915447  FIX(2.562915447)
    142 #define FIX_3_072711026  FIX(3.072711026)
    143 #endif
    144 
    145 
    146 /* Multiply an JLONG variable by an JLONG constant to yield an JLONG result.
    147 * For 8-bit samples with the recommended scaling, all the variable
    148 * and constant values involved are no more than 16 bits wide, so a
    149 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
    150 * For 12-bit samples, a full 32-bit multiplication will be needed.
    151 */
    152 
    153 #if BITS_IN_JSAMPLE == 8
    154 #define MULTIPLY(var, const)  MULTIPLY16C16(var, const)
    155 #else
    156 #define MULTIPLY(var, const)  ((var) * (const))
    157 #endif
    158 
    159 
    160 /* Dequantize a coefficient by multiplying it by the multiplier-table
    161 * entry; produce an int result.  In this module, both inputs and result
    162 * are 16 bits or less, so either int or short multiply will work.
    163 */
    164 
    165 #define DEQUANTIZE(coef, quantval)  (((ISLOW_MULT_TYPE)(coef)) * (quantval))
    166 
    167 
    168 /*
    169 * Perform dequantization and inverse DCT on one block of coefficients.
    170 */
    171 
    172 GLOBAL(void)
    173 _jpeg_idct_islow(j_decompress_ptr cinfo, jpeg_component_info *compptr,
    174                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
    175                 JDIMENSION output_col)
    176 {
    177  JLONG tmp0, tmp1, tmp2, tmp3;
    178  JLONG tmp10, tmp11, tmp12, tmp13;
    179  JLONG z1, z2, z3, z4, z5;
    180  JCOEFPTR inptr;
    181  ISLOW_MULT_TYPE *quantptr;
    182  int *wsptr;
    183  _JSAMPROW outptr;
    184  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    185  int ctr;
    186  int workspace[DCTSIZE2];      /* buffers data between passes */
    187  SHIFT_TEMPS
    188 
    189  /* Pass 1: process columns from input, store into work array. */
    190  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
    191  /* furthermore, we scale the results by 2**PASS1_BITS. */
    192 
    193  inptr = coef_block;
    194  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
    195  wsptr = workspace;
    196  for (ctr = DCTSIZE; ctr > 0; ctr--) {
    197    /* Due to quantization, we will usually find that many of the input
    198     * coefficients are zero, especially the AC terms.  We can exploit this
    199     * by short-circuiting the IDCT calculation for any column in which all
    200     * the AC terms are zero.  In that case each output is equal to the
    201     * DC coefficient (with scale factor as needed).
    202     * With typical images and quantization tables, half or more of the
    203     * column DCT calculations can be simplified this way.
    204     */
    205 
    206    if (inptr[DCTSIZE * 1] == 0 && inptr[DCTSIZE * 2] == 0 &&
    207        inptr[DCTSIZE * 3] == 0 && inptr[DCTSIZE * 4] == 0 &&
    208        inptr[DCTSIZE * 5] == 0 && inptr[DCTSIZE * 6] == 0 &&
    209        inptr[DCTSIZE * 7] == 0) {
    210      /* AC terms all zero */
    211      int dcval = LEFT_SHIFT(DEQUANTIZE(inptr[DCTSIZE * 0],
    212                             quantptr[DCTSIZE * 0]), PASS1_BITS);
    213 
    214      wsptr[DCTSIZE * 0] = dcval;
    215      wsptr[DCTSIZE * 1] = dcval;
    216      wsptr[DCTSIZE * 2] = dcval;
    217      wsptr[DCTSIZE * 3] = dcval;
    218      wsptr[DCTSIZE * 4] = dcval;
    219      wsptr[DCTSIZE * 5] = dcval;
    220      wsptr[DCTSIZE * 6] = dcval;
    221      wsptr[DCTSIZE * 7] = dcval;
    222 
    223      inptr++;                  /* advance pointers to next column */
    224      quantptr++;
    225      wsptr++;
    226      continue;
    227    }
    228 
    229    /* Even part: reverse the even part of the forward DCT. */
    230    /* The rotator is sqrt(2)*c(-6). */
    231 
    232    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
    233    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
    234 
    235    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
    236    tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
    237    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
    238 
    239    z2 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
    240    z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
    241 
    242    tmp0 = LEFT_SHIFT(z2 + z3, CONST_BITS);
    243    tmp1 = LEFT_SHIFT(z2 - z3, CONST_BITS);
    244 
    245    tmp10 = tmp0 + tmp3;
    246    tmp13 = tmp0 - tmp3;
    247    tmp11 = tmp1 + tmp2;
    248    tmp12 = tmp1 - tmp2;
    249 
    250    /* Odd part per figure 8; the matrix is unitary and hence its
    251     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    252     */
    253 
    254    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
    255    tmp1 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
    256    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
    257    tmp3 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
    258 
    259    z1 = tmp0 + tmp3;
    260    z2 = tmp1 + tmp2;
    261    z3 = tmp0 + tmp2;
    262    z4 = tmp1 + tmp3;
    263    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
    264 
    265    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    266    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    267    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    268    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    269    z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
    270    z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    271    z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    272    z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
    273 
    274    z3 += z5;
    275    z4 += z5;
    276 
    277    tmp0 += z1 + z3;
    278    tmp1 += z2 + z4;
    279    tmp2 += z2 + z3;
    280    tmp3 += z1 + z4;
    281 
    282    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    283 
    284    wsptr[DCTSIZE * 0] = (int)DESCALE(tmp10 + tmp3, CONST_BITS - PASS1_BITS);
    285    wsptr[DCTSIZE * 7] = (int)DESCALE(tmp10 - tmp3, CONST_BITS - PASS1_BITS);
    286    wsptr[DCTSIZE * 1] = (int)DESCALE(tmp11 + tmp2, CONST_BITS - PASS1_BITS);
    287    wsptr[DCTSIZE * 6] = (int)DESCALE(tmp11 - tmp2, CONST_BITS - PASS1_BITS);
    288    wsptr[DCTSIZE * 2] = (int)DESCALE(tmp12 + tmp1, CONST_BITS - PASS1_BITS);
    289    wsptr[DCTSIZE * 5] = (int)DESCALE(tmp12 - tmp1, CONST_BITS - PASS1_BITS);
    290    wsptr[DCTSIZE * 3] = (int)DESCALE(tmp13 + tmp0, CONST_BITS - PASS1_BITS);
    291    wsptr[DCTSIZE * 4] = (int)DESCALE(tmp13 - tmp0, CONST_BITS - PASS1_BITS);
    292 
    293    inptr++;                    /* advance pointers to next column */
    294    quantptr++;
    295    wsptr++;
    296  }
    297 
    298  /* Pass 2: process rows from work array, store into output array. */
    299  /* Note that we must descale the results by a factor of 8 == 2**3, */
    300  /* and also undo the PASS1_BITS scaling. */
    301 
    302  wsptr = workspace;
    303  for (ctr = 0; ctr < DCTSIZE; ctr++) {
    304    outptr = output_buf[ctr] + output_col;
    305    /* Rows of zeroes can be exploited in the same way as we did with columns.
    306     * However, the column calculation has created many nonzero AC terms, so
    307     * the simplification applies less often (typically 5% to 10% of the time).
    308     * On machines with very fast multiplication, it's possible that the
    309     * test takes more time than it's worth.  In that case this section
    310     * may be commented out.
    311     */
    312 
    313 #ifndef NO_ZERO_ROW_TEST
    314    if (wsptr[1] == 0 && wsptr[2] == 0 && wsptr[3] == 0 && wsptr[4] == 0 &&
    315        wsptr[5] == 0 && wsptr[6] == 0 && wsptr[7] == 0) {
    316      /* AC terms all zero */
    317      _JSAMPLE dcval = range_limit[(int)DESCALE((JLONG)wsptr[0],
    318                                                PASS1_BITS + 3) & RANGE_MASK];
    319 
    320      outptr[0] = dcval;
    321      outptr[1] = dcval;
    322      outptr[2] = dcval;
    323      outptr[3] = dcval;
    324      outptr[4] = dcval;
    325      outptr[5] = dcval;
    326      outptr[6] = dcval;
    327      outptr[7] = dcval;
    328 
    329      wsptr += DCTSIZE;         /* advance pointer to next row */
    330      continue;
    331    }
    332 #endif
    333 
    334    /* Even part: reverse the even part of the forward DCT. */
    335    /* The rotator is sqrt(2)*c(-6). */
    336 
    337    z2 = (JLONG)wsptr[2];
    338    z3 = (JLONG)wsptr[6];
    339 
    340    z1 = MULTIPLY(z2 + z3, FIX_0_541196100);
    341    tmp2 = z1 + MULTIPLY(z3, -FIX_1_847759065);
    342    tmp3 = z1 + MULTIPLY(z2, FIX_0_765366865);
    343 
    344    tmp0 = LEFT_SHIFT((JLONG)wsptr[0] + (JLONG)wsptr[4], CONST_BITS);
    345    tmp1 = LEFT_SHIFT((JLONG)wsptr[0] - (JLONG)wsptr[4], CONST_BITS);
    346 
    347    tmp10 = tmp0 + tmp3;
    348    tmp13 = tmp0 - tmp3;
    349    tmp11 = tmp1 + tmp2;
    350    tmp12 = tmp1 - tmp2;
    351 
    352    /* Odd part per figure 8; the matrix is unitary and hence its
    353     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    354     */
    355 
    356    tmp0 = (JLONG)wsptr[7];
    357    tmp1 = (JLONG)wsptr[5];
    358    tmp2 = (JLONG)wsptr[3];
    359    tmp3 = (JLONG)wsptr[1];
    360 
    361    z1 = tmp0 + tmp3;
    362    z2 = tmp1 + tmp2;
    363    z3 = tmp0 + tmp2;
    364    z4 = tmp1 + tmp3;
    365    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */
    366 
    367    tmp0 = MULTIPLY(tmp0, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    368    tmp1 = MULTIPLY(tmp1, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    369    tmp2 = MULTIPLY(tmp2, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    370    tmp3 = MULTIPLY(tmp3, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    371    z1 = MULTIPLY(z1, -FIX_0_899976223); /* sqrt(2) * ( c7-c3) */
    372    z2 = MULTIPLY(z2, -FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    373    z3 = MULTIPLY(z3, -FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    374    z4 = MULTIPLY(z4, -FIX_0_390180644); /* sqrt(2) * ( c5-c3) */
    375 
    376    z3 += z5;
    377    z4 += z5;
    378 
    379    tmp0 += z1 + z3;
    380    tmp1 += z2 + z4;
    381    tmp2 += z2 + z3;
    382    tmp3 += z1 + z4;
    383 
    384    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    385 
    386    outptr[0] = range_limit[(int)DESCALE(tmp10 + tmp3,
    387                                         CONST_BITS + PASS1_BITS + 3) &
    388                            RANGE_MASK];
    389    outptr[7] = range_limit[(int)DESCALE(tmp10 - tmp3,
    390                                         CONST_BITS + PASS1_BITS + 3) &
    391                            RANGE_MASK];
    392    outptr[1] = range_limit[(int)DESCALE(tmp11 + tmp2,
    393                                         CONST_BITS + PASS1_BITS + 3) &
    394                            RANGE_MASK];
    395    outptr[6] = range_limit[(int)DESCALE(tmp11 - tmp2,
    396                                         CONST_BITS + PASS1_BITS + 3) &
    397                            RANGE_MASK];
    398    outptr[2] = range_limit[(int)DESCALE(tmp12 + tmp1,
    399                                         CONST_BITS + PASS1_BITS + 3) &
    400                            RANGE_MASK];
    401    outptr[5] = range_limit[(int)DESCALE(tmp12 - tmp1,
    402                                         CONST_BITS + PASS1_BITS + 3) &
    403                            RANGE_MASK];
    404    outptr[3] = range_limit[(int)DESCALE(tmp13 + tmp0,
    405                                         CONST_BITS + PASS1_BITS + 3) &
    406                            RANGE_MASK];
    407    outptr[4] = range_limit[(int)DESCALE(tmp13 - tmp0,
    408                                         CONST_BITS + PASS1_BITS + 3) &
    409                            RANGE_MASK];
    410 
    411    wsptr += DCTSIZE;           /* advance pointer to next row */
    412  }
    413 }
    414 
    415 #ifdef IDCT_SCALING_SUPPORTED
    416 
    417 
    418 /*
    419 * Perform dequantization and inverse DCT on one block of coefficients,
    420 * producing a reduced-size 7x7 output block.
    421 *
    422 * Optimized algorithm with 12 multiplications in the 1-D kernel.
    423 * cK represents sqrt(2) * cos(K*pi/14).
    424 */
    425 
    426 GLOBAL(void)
    427 _jpeg_idct_7x7(j_decompress_ptr cinfo, jpeg_component_info *compptr,
    428               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
    429               JDIMENSION output_col)
    430 {
    431  JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12, tmp13;
    432  JLONG z1, z2, z3;
    433  JCOEFPTR inptr;
    434  ISLOW_MULT_TYPE *quantptr;
    435  int *wsptr;
    436  _JSAMPROW outptr;
    437  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    438  int ctr;
    439  int workspace[7 * 7];         /* buffers data between passes */
    440  SHIFT_TEMPS
    441 
    442  /* Pass 1: process columns from input, store into work array. */
    443 
    444  inptr = coef_block;
    445  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
    446  wsptr = workspace;
    447  for (ctr = 0; ctr < 7; ctr++, inptr++, quantptr++, wsptr++) {
    448    /* Even part */
    449 
    450    tmp13 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
    451    tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
    452    /* Add fudge factor here for final descale. */
    453    tmp13 += ONE << (CONST_BITS - PASS1_BITS - 1);
    454 
    455    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
    456    z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
    457    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
    458 
    459    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
    460    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
    461    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
    462    tmp0 = z1 + z3;
    463    z2 -= tmp0;
    464    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
    465    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
    466    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
    467    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
    468 
    469    /* Odd part */
    470 
    471    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
    472    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
    473    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
    474 
    475    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
    476    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
    477    tmp0 = tmp1 - tmp2;
    478    tmp1 += tmp2;
    479    tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276));     /* -c1 */
    480    tmp1 += tmp2;
    481    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
    482    tmp0 += z2;
    483    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
    484 
    485    /* Final output stage */
    486 
    487    wsptr[7 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
    488    wsptr[7 * 6] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
    489    wsptr[7 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
    490    wsptr[7 * 5] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
    491    wsptr[7 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
    492    wsptr[7 * 4] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
    493    wsptr[7 * 3] = (int)RIGHT_SHIFT(tmp13, CONST_BITS - PASS1_BITS);
    494  }
    495 
    496  /* Pass 2: process 7 rows from work array, store into output array. */
    497 
    498  wsptr = workspace;
    499  for (ctr = 0; ctr < 7; ctr++) {
    500    outptr = output_buf[ctr] + output_col;
    501 
    502    /* Even part */
    503 
    504    /* Add fudge factor here for final descale. */
    505    tmp13 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
    506    tmp13 = LEFT_SHIFT(tmp13, CONST_BITS);
    507 
    508    z1 = (JLONG)wsptr[2];
    509    z2 = (JLONG)wsptr[4];
    510    z3 = (JLONG)wsptr[6];
    511 
    512    tmp10 = MULTIPLY(z2 - z3, FIX(0.881747734));     /* c4 */
    513    tmp12 = MULTIPLY(z1 - z2, FIX(0.314692123));     /* c6 */
    514    tmp11 = tmp10 + tmp12 + tmp13 - MULTIPLY(z2, FIX(1.841218003)); /* c2+c4-c6 */
    515    tmp0 = z1 + z3;
    516    z2 -= tmp0;
    517    tmp0 = MULTIPLY(tmp0, FIX(1.274162392)) + tmp13; /* c2 */
    518    tmp10 += tmp0 - MULTIPLY(z3, FIX(0.077722536));  /* c2-c4-c6 */
    519    tmp12 += tmp0 - MULTIPLY(z1, FIX(2.470602249));  /* c2+c4+c6 */
    520    tmp13 += MULTIPLY(z2, FIX(1.414213562));         /* c0 */
    521 
    522    /* Odd part */
    523 
    524    z1 = (JLONG)wsptr[1];
    525    z2 = (JLONG)wsptr[3];
    526    z3 = (JLONG)wsptr[5];
    527 
    528    tmp1 = MULTIPLY(z1 + z2, FIX(0.935414347));      /* (c3+c1-c5)/2 */
    529    tmp2 = MULTIPLY(z1 - z2, FIX(0.170262339));      /* (c3+c5-c1)/2 */
    530    tmp0 = tmp1 - tmp2;
    531    tmp1 += tmp2;
    532    tmp2 = MULTIPLY(z2 + z3, -FIX(1.378756276));     /* -c1 */
    533    tmp1 += tmp2;
    534    z2 = MULTIPLY(z1 + z3, FIX(0.613604268));        /* c5 */
    535    tmp0 += z2;
    536    tmp2 += z2 + MULTIPLY(z3, FIX(1.870828693));     /* c3+c1-c5 */
    537 
    538    /* Final output stage */
    539 
    540    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
    541                                             CONST_BITS + PASS1_BITS + 3) &
    542                            RANGE_MASK];
    543    outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
    544                                             CONST_BITS + PASS1_BITS + 3) &
    545                            RANGE_MASK];
    546    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
    547                                             CONST_BITS + PASS1_BITS + 3) &
    548                            RANGE_MASK];
    549    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
    550                                             CONST_BITS + PASS1_BITS + 3) &
    551                            RANGE_MASK];
    552    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
    553                                             CONST_BITS + PASS1_BITS + 3) &
    554                            RANGE_MASK];
    555    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
    556                                             CONST_BITS + PASS1_BITS + 3) &
    557                            RANGE_MASK];
    558    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13,
    559                                             CONST_BITS + PASS1_BITS + 3) &
    560                            RANGE_MASK];
    561 
    562    wsptr += 7;         /* advance pointer to next row */
    563  }
    564 }
    565 
    566 
    567 /*
    568 * Perform dequantization and inverse DCT on one block of coefficients,
    569 * producing a reduced-size 6x6 output block.
    570 *
    571 * Optimized algorithm with 3 multiplications in the 1-D kernel.
    572 * cK represents sqrt(2) * cos(K*pi/12).
    573 */
    574 
    575 GLOBAL(void)
    576 _jpeg_idct_6x6(j_decompress_ptr cinfo, jpeg_component_info *compptr,
    577               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
    578               JDIMENSION output_col)
    579 {
    580  JLONG tmp0, tmp1, tmp2, tmp10, tmp11, tmp12;
    581  JLONG z1, z2, z3;
    582  JCOEFPTR inptr;
    583  ISLOW_MULT_TYPE *quantptr;
    584  int *wsptr;
    585  _JSAMPROW outptr;
    586  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    587  int ctr;
    588  int workspace[6 * 6];         /* buffers data between passes */
    589  SHIFT_TEMPS
    590 
    591  /* Pass 1: process columns from input, store into work array. */
    592 
    593  inptr = coef_block;
    594  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
    595  wsptr = workspace;
    596  for (ctr = 0; ctr < 6; ctr++, inptr++, quantptr++, wsptr++) {
    597    /* Even part */
    598 
    599    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
    600    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    601    /* Add fudge factor here for final descale. */
    602    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
    603    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
    604    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
    605    tmp1 = tmp0 + tmp10;
    606    tmp11 = RIGHT_SHIFT(tmp0 - tmp10 - tmp10, CONST_BITS - PASS1_BITS);
    607    tmp10 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
    608    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
    609    tmp10 = tmp1 + tmp0;
    610    tmp12 = tmp1 - tmp0;
    611 
    612    /* Odd part */
    613 
    614    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
    615    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
    616    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
    617    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
    618    tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
    619    tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
    620    tmp1 = LEFT_SHIFT(z1 - z2 - z3, PASS1_BITS);
    621 
    622    /* Final output stage */
    623 
    624    wsptr[6 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
    625    wsptr[6 * 5] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
    626    wsptr[6 * 1] = (int)(tmp11 + tmp1);
    627    wsptr[6 * 4] = (int)(tmp11 - tmp1);
    628    wsptr[6 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
    629    wsptr[6 * 3] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
    630  }
    631 
    632  /* Pass 2: process 6 rows from work array, store into output array. */
    633 
    634  wsptr = workspace;
    635  for (ctr = 0; ctr < 6; ctr++) {
    636    outptr = output_buf[ctr] + output_col;
    637 
    638    /* Even part */
    639 
    640    /* Add fudge factor here for final descale. */
    641    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
    642    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    643    tmp2 = (JLONG)wsptr[4];
    644    tmp10 = MULTIPLY(tmp2, FIX(0.707106781));   /* c4 */
    645    tmp1 = tmp0 + tmp10;
    646    tmp11 = tmp0 - tmp10 - tmp10;
    647    tmp10 = (JLONG)wsptr[2];
    648    tmp0 = MULTIPLY(tmp10, FIX(1.224744871));   /* c2 */
    649    tmp10 = tmp1 + tmp0;
    650    tmp12 = tmp1 - tmp0;
    651 
    652    /* Odd part */
    653 
    654    z1 = (JLONG)wsptr[1];
    655    z2 = (JLONG)wsptr[3];
    656    z3 = (JLONG)wsptr[5];
    657    tmp1 = MULTIPLY(z1 + z3, FIX(0.366025404)); /* c5 */
    658    tmp0 = tmp1 + LEFT_SHIFT(z1 + z2, CONST_BITS);
    659    tmp2 = tmp1 + LEFT_SHIFT(z3 - z2, CONST_BITS);
    660    tmp1 = LEFT_SHIFT(z1 - z2 - z3, CONST_BITS);
    661 
    662    /* Final output stage */
    663 
    664    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
    665                                             CONST_BITS + PASS1_BITS + 3) &
    666                            RANGE_MASK];
    667    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
    668                                             CONST_BITS + PASS1_BITS + 3) &
    669                            RANGE_MASK];
    670    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
    671                                             CONST_BITS + PASS1_BITS + 3) &
    672                            RANGE_MASK];
    673    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
    674                                             CONST_BITS + PASS1_BITS + 3) &
    675                            RANGE_MASK];
    676    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
    677                                             CONST_BITS + PASS1_BITS + 3) &
    678                            RANGE_MASK];
    679    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
    680                                             CONST_BITS + PASS1_BITS + 3) &
    681                            RANGE_MASK];
    682 
    683    wsptr += 6;         /* advance pointer to next row */
    684  }
    685 }
    686 
    687 
    688 /*
    689 * Perform dequantization and inverse DCT on one block of coefficients,
    690 * producing a reduced-size 5x5 output block.
    691 *
    692 * Optimized algorithm with 5 multiplications in the 1-D kernel.
    693 * cK represents sqrt(2) * cos(K*pi/10).
        *
        * Only the 5x5 low-frequency corner of coef_block is read: columns 0-4
        * (the pass-1 loop counter) and rows DCTSIZE * 0 .. DCTSIZE * 4.  Each
        * coefficient is dequantized against the matching entry of
        * compptr->dct_table through DEQUANTIZE().
        *
        * Pass 1 transforms the 5 columns into the local workspace; pass 2
        * transforms the 5 workspace rows and stores 5 samples into each of
        * output_buf[0] .. output_buf[4], starting at column output_col.
        * Nothing is returned; the output rows are the only result.
    694 */
    695 
    696 GLOBAL(void)
    697 _jpeg_idct_5x5(j_decompress_ptr cinfo, jpeg_component_info *compptr,
    698               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
    699               JDIMENSION output_col)
    700 {
    701  JLONG tmp0, tmp1, tmp10, tmp11, tmp12;
    702  JLONG z1, z2, z3;
    703  JCOEFPTR inptr;
    704  ISLOW_MULT_TYPE *quantptr;
    705  int *wsptr;
    706  _JSAMPROW outptr;
    707  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    708  int ctr;
    709  int workspace[5 * 5];         /* buffers data between passes */
    710  SHIFT_TEMPS
    711 
    712  /* Pass 1: process columns from input, store into work array. */
    713 
    714  inptr = coef_block;
    715  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
    716  wsptr = workspace;
         /* One iteration per input column: inptr and quantptr step along the top
          * row while the DCTSIZE * k index walks down rows 0-4.  wsptr steps along
          * the first workspace row and the 5 * k index selects the workspace row.
          */
    717  for (ctr = 0; ctr < 5; ctr++, inptr++, quantptr++, wsptr++) {
    718    /* Even part */
    719 
           /* DC term, shifted up by CONST_BITS so it can be combined with the
            * MULTIPLY() products below.  NOTE(review): this assumes FIX()
            * constants carry CONST_BITS fractional bits; the macros are defined
            * outside this view.
            */
    720    tmp12 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
    721    tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
    722    /* Add fudge factor here for final descale. */
           /* The fudge factor is half of 2^(CONST_BITS - PASS1_BITS), i.e. half
            * of the divisor applied by the RIGHT_SHIFT in the output stage, so
            * that shift rounds instead of truncating.  It is added to the DC term
            * because that term is part of every output of this column.
            */
    723    tmp12 += ONE << (CONST_BITS - PASS1_BITS - 1);
    724    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
    725    tmp1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
    726    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
    727    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
    728    z3 = tmp12 + z2;
    729    tmp10 = z3 + z1;
    730    tmp11 = z3 - z1;
           /* Even term of the middle output: DC - 4 * z2.  z3 above was formed
            * from the unmodified tmp12, so this update has to come after it.
            */
    731    tmp12 -= LEFT_SHIFT(z2, 2);
    732 
    733    /* Odd part */
    734 
           /* z2 and z3 are reused here for the dequantized row-1 and row-3
            * coefficients; tmp0 and tmp1 are reused for the two odd terms.
            */
    735    z2 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
    736    z3 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
    737 
    738    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
    739    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
    740    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
    741 
    742    /* Final output stage */
    743 
           /* Rows k and 4-k share one even term and differ only in the sign of
            * the odd term; the middle row (2) has no odd contribution.  The shift
            * by CONST_BITS - PASS1_BITS leaves PASS1_BITS extra fractional bits
            * in the workspace values for pass 2.
            */
    744    wsptr[5 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
    745    wsptr[5 * 4] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
    746    wsptr[5 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
    747    wsptr[5 * 3] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
    748    wsptr[5 * 2] = (int)RIGHT_SHIFT(tmp12, CONST_BITS - PASS1_BITS);
    749  }
    750 
    751  /* Pass 2: process 5 rows from work array, store into output array. */
    752 
    753  wsptr = workspace;
    754  for (ctr = 0; ctr < 5; ctr++) {
    755    outptr = output_buf[ctr] + output_col;
    756 
    757    /* Even part */
    758 
    759    /* Add fudge factor here for final descale. */
           /* ONE << (PASS1_BITS + 2), once shifted left by CONST_BITS on the next
            * line, equals half of 2^(CONST_BITS + PASS1_BITS + 3), the divisor
            * of the final RIGHT_SHIFT.
            */
    760    tmp12 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
    761    tmp12 = LEFT_SHIFT(tmp12, CONST_BITS);
    762    tmp0 = (JLONG)wsptr[2];
    763    tmp1 = (JLONG)wsptr[4];
    764    z1 = MULTIPLY(tmp0 + tmp1, FIX(0.790569415)); /* (c2+c4)/2 */
    765    z2 = MULTIPLY(tmp0 - tmp1, FIX(0.353553391)); /* (c2-c4)/2 */
    766    z3 = tmp12 + z2;
    767    tmp10 = z3 + z1;
    768    tmp11 = z3 - z1;
           /* Same ordering constraint as in pass 1: z3 must be taken first. */
    769    tmp12 -= LEFT_SHIFT(z2, 2);
    770 
    771    /* Odd part */
    772 
    773    z2 = (JLONG)wsptr[1];
    774    z3 = (JLONG)wsptr[3];
    775 
    776    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));     /* c3 */
    777    tmp0 = z1 + MULTIPLY(z2, FIX(0.513743148));   /* c1-c3 */
    778    tmp1 = z1 - MULTIPLY(z3, FIX(2.176250899));   /* c1+c3 */
    779 
    780    /* Final output stage */
    781 
           /* Each value is descaled by CONST_BITS + PASS1_BITS + 3 bits, masked
            * with RANGE_MASK and used as an index into range_limit[].
            * NOTE(review): presumably the table maps the masked value onto the
            * valid sample range; its construction is outside this view.
            */
    782    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
    783                                             CONST_BITS + PASS1_BITS + 3) &
    784                            RANGE_MASK];
    785    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
    786                                             CONST_BITS + PASS1_BITS + 3) &
    787                            RANGE_MASK];
    788    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
    789                                             CONST_BITS + PASS1_BITS + 3) &
    790                            RANGE_MASK];
    791    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
    792                                             CONST_BITS + PASS1_BITS + 3) &
    793                            RANGE_MASK];
    794    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12,
    795                                             CONST_BITS + PASS1_BITS + 3) &
    796                            RANGE_MASK];
    797 
    798    wsptr += 5;         /* advance pointer to next row */
    799  }
    800 }
    801 
    802 
    803 /*
    804 * Perform dequantization and inverse DCT on one block of coefficients,
    805 * producing a reduced-size 3x3 output block.
    806 *
    807 * Optimized algorithm with 2 multiplications in the 1-D kernel.
    808 * cK represents sqrt(2) * cos(K*pi/6).
        *
        * Only the 3x3 low-frequency corner of coef_block is read: columns 0-2
        * (the pass-1 loop counter) and rows DCTSIZE * 0 .. DCTSIZE * 2.  Each
        * coefficient is dequantized against the matching entry of
        * compptr->dct_table through DEQUANTIZE().
        *
        * Pass 1 transforms the 3 columns into the local workspace; pass 2
        * transforms the 3 workspace rows and stores 3 samples into each of
        * output_buf[0] .. output_buf[2], starting at column output_col.
    809 */
    810 
    811 GLOBAL(void)
    812 _jpeg_idct_3x3(j_decompress_ptr cinfo, jpeg_component_info *compptr,
    813               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
    814               JDIMENSION output_col)
    815 {
    816  JLONG tmp0, tmp2, tmp10, tmp12;
    817  JCOEFPTR inptr;
    818  ISLOW_MULT_TYPE *quantptr;
    819  int *wsptr;
    820  _JSAMPROW outptr;
    821  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    822  int ctr;
    823  int workspace[3 * 3];         /* buffers data between passes */
    824  SHIFT_TEMPS
    825 
    826  /* Pass 1: process columns from input, store into work array. */
    827 
    828  inptr = coef_block;
    829  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
    830  wsptr = workspace;
         /* One iteration per input column; the 3 * k index below selects the
          * workspace row while wsptr itself steps along the columns.
          */
    831  for (ctr = 0; ctr < 3; ctr++, inptr++, quantptr++, wsptr++) {
    832    /* Even part */
    833 
           /* DC term, shifted up by CONST_BITS so it can be combined with the
            * MULTIPLY() products.  NOTE(review): assumes FIX() constants carry
            * CONST_BITS fractional bits; the macros are defined outside this view.
            */
    834    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
    835    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    836    /* Add fudge factor here for final descale. */
           /* Half of 2^(CONST_BITS - PASS1_BITS), the divisor of the RIGHT_SHIFT
            * in the output stage; the DC term is part of all three outputs.
            */
    837    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
    838    tmp2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
    839    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
    840    tmp10 = tmp0 + tmp12;
           /* Middle output: DC - 2 * tmp12.  This overwrites the dequantized
            * row-2 coefficient, which is not needed after the multiply above.
            */
    841    tmp2 = tmp0 - tmp12 - tmp12;
    842 
    843    /* Odd part */
    844 
           /* tmp12 and tmp0 are reused: raw row-1 coefficient, then its product. */
    845    tmp12 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
    846    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
    847 
    848    /* Final output stage */
    849 
           /* Rows 0 and 2 are even term +/- odd term; row 1 has no odd
            * contribution.  The shift leaves PASS1_BITS extra fractional bits in
            * the workspace values for pass 2.
            */
    850    wsptr[3 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
    851    wsptr[3 * 2] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
    852    wsptr[3 * 1] = (int)RIGHT_SHIFT(tmp2, CONST_BITS - PASS1_BITS);
    853  }
    854 
    855  /* Pass 2: process 3 rows from work array, store into output array. */
    856 
    857  wsptr = workspace;
    858  for (ctr = 0; ctr < 3; ctr++) {
    859    outptr = output_buf[ctr] + output_col;
    860 
    861    /* Even part */
    862 
    863    /* Add fudge factor here for final descale. */
           /* ONE << (PASS1_BITS + 2), once shifted left by CONST_BITS on the next
            * line, equals half of 2^(CONST_BITS + PASS1_BITS + 3), the divisor
            * of the final RIGHT_SHIFT.
            */
    864    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
    865    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    866    tmp2 = (JLONG)wsptr[2];
    867    tmp12 = MULTIPLY(tmp2, FIX(0.707106781)); /* c2 */
    868    tmp10 = tmp0 + tmp12;
    869    tmp2 = tmp0 - tmp12 - tmp12;
    870 
    871    /* Odd part */
    872 
    873    tmp12 = (JLONG)wsptr[1];
    874    tmp0 = MULTIPLY(tmp12, FIX(1.224744871)); /* c1 */
    875 
    876    /* Final output stage */
    877 
           /* Each value is descaled by CONST_BITS + PASS1_BITS + 3 bits, masked
            * with RANGE_MASK and used as an index into range_limit[].
            * NOTE(review): presumably the table maps the masked value onto the
            * valid sample range; its construction is outside this view.
            */
    878    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
    879                                             CONST_BITS + PASS1_BITS + 3) &
    880                            RANGE_MASK];
    881    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
    882                                             CONST_BITS + PASS1_BITS + 3) &
    883                            RANGE_MASK];
    884    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp2,
    885                                             CONST_BITS + PASS1_BITS + 3) &
    886                            RANGE_MASK];
    887 
    888    wsptr += 3;         /* advance pointer to next row */
    889  }
    890 }
    891 
    892 
    893 /*
    894 * Perform dequantization and inverse DCT on one block of coefficients,
    895 * producing a 9x9 output block.
    896 *
    897 * Optimized algorithm with 10 multiplications in the 1-D kernel.
    898 * cK represents sqrt(2) * cos(K*pi/18).
        *
        * All 8 input columns and rows DCTSIZE * 0 .. DCTSIZE * 7 of coef_block
        * are read; each coefficient is dequantized against the matching entry
        * of compptr->dct_table through DEQUANTIZE().
        *
        * Pass 1 turns each 8-coefficient column into 9 values, so the local
        * workspace holds 9 rows of 8 ints.  Pass 2 turns each 8-value workspace
        * row into 9 samples and stores them into output_buf[0] ..
        * output_buf[8], starting at column output_col.
    899 */
    900 
    901 GLOBAL(void)
    902 _jpeg_idct_9x9(j_decompress_ptr cinfo, jpeg_component_info *compptr,
    903               JCOEFPTR coef_block, _JSAMPARRAY output_buf,
    904               JDIMENSION output_col)
    905 {
    906  JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13, tmp14;
    907  JLONG z1, z2, z3, z4;
    908  JCOEFPTR inptr;
    909  ISLOW_MULT_TYPE *quantptr;
    910  int *wsptr;
    911  _JSAMPROW outptr;
    912  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
    913  int ctr;
    914  int workspace[8 * 9];         /* buffers data between passes */
    915  SHIFT_TEMPS
    916 
    917  /* Pass 1: process columns from input, store into work array. */
    918 
    919  inptr = coef_block;
    920  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
    921  wsptr = workspace;
         /* One iteration per input column (8 of them); the 8 * k index below
          * selects one of the 9 workspace rows.
          */
    922  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
    923    /* Even part */
    924 
           /* DC term, shifted up by CONST_BITS so it can be combined with the
            * MULTIPLY() products.  NOTE(review): assumes FIX() constants carry
            * CONST_BITS fractional bits; the macros are defined outside this view.
            */
    925    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
    926    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    927    /* Add fudge factor here for final descale. */
           /* Half of 2^(CONST_BITS - PASS1_BITS), the divisor of the RIGHT_SHIFT
            * in the output stage; the DC term is part of every output.
            */
    928    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
    929 
    930    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
    931    z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
    932    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
    933 
    934    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
    935    tmp1 = tmp0 + tmp3;
    936    tmp2 = tmp0 - tmp3 - tmp3;
    937 
           /* tmp0 no longer holds the DC term from here on; it is reused for
            * intermediate products.  tmp14 is the even term of the middle row.
            */
    938    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
    939    tmp11 = tmp2 + tmp0;
    940    tmp14 = tmp2 - tmp0 - tmp0;
    941 
    942    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
    943    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
    944    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
    945 
    946    tmp10 = tmp1 + tmp0 - tmp3;
    947    tmp12 = tmp1 - tmp0 + tmp2;
    948    tmp13 = tmp1 - tmp2 + tmp3;
    949 
    950    /* Odd part */
    951 
    952    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
    953    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
    954    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
    955    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
    956 
           /* z2 now holds the row-3 coefficient already multiplied by -c3, so
            * "- z2" below adds c3 times that coefficient and "+ z2" subtracts it.
            */
    957    z2 = MULTIPLY(z2, -FIX(1.224744871));            /* -c3 */
    958 
    959    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
    960    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
    961    tmp0 = tmp2 + tmp3 - z2;
    962    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
    963    tmp2 += z2 - tmp1;
    964    tmp3 += z2 + tmp1;
           /* tmp1 is overwritten only after tmp2 and tmp3 have consumed it. */
    965    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
    966 
    967    /* Final output stage */
    968 
           /* Rows k and 8-k share one even term and differ only in the sign of
            * the odd term; the middle row (4) has no odd contribution.  The shift
            * leaves PASS1_BITS extra fractional bits in the workspace values.
            */
    969    wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp10 + tmp0, CONST_BITS - PASS1_BITS);
    970    wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp10 - tmp0, CONST_BITS - PASS1_BITS);
    971    wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp11 + tmp1, CONST_BITS - PASS1_BITS);
    972    wsptr[8 * 7] = (int)RIGHT_SHIFT(tmp11 - tmp1, CONST_BITS - PASS1_BITS);
    973    wsptr[8 * 2] = (int)RIGHT_SHIFT(tmp12 + tmp2, CONST_BITS - PASS1_BITS);
    974    wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp12 - tmp2, CONST_BITS - PASS1_BITS);
    975    wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp13 + tmp3, CONST_BITS - PASS1_BITS);
    976    wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp13 - tmp3, CONST_BITS - PASS1_BITS);
    977    wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp14, CONST_BITS - PASS1_BITS);
    978  }
    979 
    980  /* Pass 2: process 9 rows from work array, store into output array. */
    981 
    982  wsptr = workspace;
    983  for (ctr = 0; ctr < 9; ctr++) {
    984    outptr = output_buf[ctr] + output_col;
    985 
    986    /* Even part */
    987 
    988    /* Add fudge factor here for final descale. */
           /* ONE << (PASS1_BITS + 2), once shifted left by CONST_BITS on the next
            * line, equals half of 2^(CONST_BITS + PASS1_BITS + 3), the divisor
            * of the final RIGHT_SHIFT.
            */
    989    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
    990    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
    991 
    992    z1 = (JLONG)wsptr[2];
    993    z2 = (JLONG)wsptr[4];
    994    z3 = (JLONG)wsptr[6];
    995 
    996    tmp3 = MULTIPLY(z3, FIX(0.707106781));      /* c6 */
    997    tmp1 = tmp0 + tmp3;
    998    tmp2 = tmp0 - tmp3 - tmp3;
    999 
   1000    tmp0 = MULTIPLY(z1 - z2, FIX(0.707106781)); /* c6 */
   1001    tmp11 = tmp2 + tmp0;
   1002    tmp14 = tmp2 - tmp0 - tmp0;
   1003 
   1004    tmp0 = MULTIPLY(z1 + z2, FIX(1.328926049)); /* c2 */
   1005    tmp2 = MULTIPLY(z1, FIX(1.083350441));      /* c4 */
   1006    tmp3 = MULTIPLY(z2, FIX(0.245575608));      /* c8 */
   1007 
   1008    tmp10 = tmp1 + tmp0 - tmp3;
   1009    tmp12 = tmp1 - tmp0 + tmp2;
   1010    tmp13 = tmp1 - tmp2 + tmp3;
   1011 
   1012    /* Odd part */
   1013 
   1014    z1 = (JLONG)wsptr[1];
   1015    z2 = (JLONG)wsptr[3];
   1016    z3 = (JLONG)wsptr[5];
   1017    z4 = (JLONG)wsptr[7];
   1018 
           /* As in pass 1, z2 carries the -c3 factor from here on. */
   1019    z2 = MULTIPLY(z2, -FIX(1.224744871));            /* -c3 */
   1020 
   1021    tmp2 = MULTIPLY(z1 + z3, FIX(0.909038955));      /* c5 */
   1022    tmp3 = MULTIPLY(z1 + z4, FIX(0.483689525));      /* c7 */
   1023    tmp0 = tmp2 + tmp3 - z2;
   1024    tmp1 = MULTIPLY(z3 - z4, FIX(1.392728481));      /* c1 */
   1025    tmp2 += z2 - tmp1;
   1026    tmp3 += z2 + tmp1;
   1027    tmp1 = MULTIPLY(z1 - z3 - z4, FIX(1.224744871)); /* c3 */
   1028 
   1029    /* Final output stage */
   1030 
           /* Each value is descaled by CONST_BITS + PASS1_BITS + 3 bits, masked
            * with RANGE_MASK and used as an index into range_limit[].
            * NOTE(review): presumably the table maps the masked value onto the
            * valid sample range; its construction is outside this view.
            */
   1031    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp10 + tmp0,
   1032                                             CONST_BITS + PASS1_BITS + 3) &
   1033                            RANGE_MASK];
   1034    outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp10 - tmp0,
   1035                                             CONST_BITS + PASS1_BITS + 3) &
   1036                            RANGE_MASK];
   1037    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp11 + tmp1,
   1038                                             CONST_BITS + PASS1_BITS + 3) &
   1039                            RANGE_MASK];
   1040    outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp11 - tmp1,
   1041                                             CONST_BITS + PASS1_BITS + 3) &
   1042                            RANGE_MASK];
   1043    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp12 + tmp2,
   1044                                             CONST_BITS + PASS1_BITS + 3) &
   1045                            RANGE_MASK];
   1046    outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp12 - tmp2,
   1047                                             CONST_BITS + PASS1_BITS + 3) &
   1048                            RANGE_MASK];
   1049    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp13 + tmp3,
   1050                                             CONST_BITS + PASS1_BITS + 3) &
   1051                            RANGE_MASK];
   1052    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp13 - tmp3,
   1053                                             CONST_BITS + PASS1_BITS + 3) &
   1054                            RANGE_MASK];
   1055    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp14,
   1056                                             CONST_BITS + PASS1_BITS + 3) &
   1057                            RANGE_MASK];
   1058 
   1059    wsptr += 8;         /* advance pointer to next row */
   1060  }
   1061 }
   1062 
   1063 
   1064 /*
   1065 * Perform dequantization and inverse DCT on one block of coefficients,
   1066 * producing a 10x10 output block.
   1067 *
   1068 * Optimized algorithm with 12 multiplications in the 1-D kernel.
   1069 * cK represents sqrt(2) * cos(K*pi/20).
        *
        * All 8 input columns and rows DCTSIZE * 0 .. DCTSIZE * 7 of coef_block
        * are read; each coefficient is dequantized against the matching entry
        * of compptr->dct_table through DEQUANTIZE().
        *
        * Pass 1 turns each 8-coefficient column into 10 values, so the local
        * workspace holds 10 rows of 8 ints.  Pass 2 turns each 8-value
        * workspace row into 10 samples and stores them into output_buf[0] ..
        * output_buf[9], starting at column output_col.
        *
        * Outputs 2 and 7 of the 1-D kernel need no multiplication in their odd
        * term.  Pass 1 therefore descales them early and stores them without a
        * final shift; pass 2 keeps them at full scale and uses the common
        * final shift.
   1070 */
   1071 
   1072 GLOBAL(void)
   1073 _jpeg_idct_10x10(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   1074                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
   1075                 JDIMENSION output_col)
   1076 {
   1077  JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
   1078  JLONG tmp20, tmp21, tmp22, tmp23, tmp24;
   1079  JLONG z1, z2, z3, z4, z5;
   1080  JCOEFPTR inptr;
   1081  ISLOW_MULT_TYPE *quantptr;
   1082  int *wsptr;
   1083  _JSAMPROW outptr;
   1084  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1085  int ctr;
   1086  int workspace[8 * 10];        /* buffers data between passes */
   1087  SHIFT_TEMPS
   1088 
   1089  /* Pass 1: process columns from input, store into work array. */
   1090 
   1091  inptr = coef_block;
   1092  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   1093  wsptr = workspace;
         /* One iteration per input column (8 of them); the 8 * k index below
          * selects one of the 10 workspace rows.
          */
   1094  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1095    /* Even part */
   1096 
           /* DC term, shifted up by CONST_BITS so it can be combined with the
            * MULTIPLY() products.  NOTE(review): assumes FIX() constants carry
            * CONST_BITS fractional bits; the macros are defined outside this view.
            */
   1097    z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
   1098    z3 = LEFT_SHIFT(z3, CONST_BITS);
   1099    /* Add fudge factor here for final descale. */
           /* Half of 2^(CONST_BITS - PASS1_BITS), the divisor used by every
            * RIGHT_SHIFT in this pass; the DC term is part of every output.
            */
   1100    z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
   1101    z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
   1102    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
   1103    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
   1104    tmp10 = z3 + z1;
   1105    tmp11 = z3 - z2;
   1106 
           /* Unlike the other even terms, tmp22 is descaled immediately, so it
            * is already at workspace scale when rows 2 and 7 are stored below.
            */
   1107    tmp22 = RIGHT_SHIFT(z3 - LEFT_SHIFT(z1 - z2, 1),
   1108                        CONST_BITS - PASS1_BITS); /* c0 = (c4-c8)*2 */
   1109 
           /* z2 and z3 are reused for the row-2 and row-6 coefficients; the DC
            * term is not needed past this point.
            */
   1110    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
   1111    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
   1112 
   1113    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
   1114    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
   1115    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
   1116 
   1117    tmp20 = tmp10 + tmp12;
   1118    tmp24 = tmp10 - tmp12;
   1119    tmp21 = tmp11 + tmp13;
   1120    tmp23 = tmp11 - tmp13;
   1121 
   1122    /* Odd part */
   1123 
   1124    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
   1125    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
   1126    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
   1127    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
   1128 
           /* tmp10..tmp14 are reused for the odd part; the even results live in
            * tmp20..tmp24.
            */
   1129    tmp11 = z2 + z4;
   1130    tmp13 = z2 - z4;
   1131 
   1132    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
           /* z5 is the row-5 coefficient at CONST_BITS scale; the unshifted z3
            * is still needed for tmp12 further down.
            */
   1133    z5 = LEFT_SHIFT(z3, CONST_BITS);
   1134 
   1135    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
   1136    z4 = z5 + tmp12;
   1137 
   1138    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
   1139    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
   1140 
   1141    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
   1142    z4 = z5 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1);
   1143 
           /* Multiplier-free odd term for rows 2 and 7, built directly at
            * workspace scale (PASS1_BITS) to match the pre-descaled tmp22.
            */
   1144    tmp12 = LEFT_SHIFT(z1 - tmp13 - z3, PASS1_BITS);
   1145 
   1146    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
   1147    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
   1148 
   1149    /* Final output stage */
   1150 
           /* Rows k and 9-k share one even term and differ only in the sign of
            * the odd term.  Rows 2 and 7 are stored without a shift because both
            * of their terms were brought to workspace scale above.
            */
   1151    wsptr[8 * 0] = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
   1152    wsptr[8 * 9] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
   1153    wsptr[8 * 1] = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
   1154    wsptr[8 * 8] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
   1155    wsptr[8 * 2] = (int)(tmp22 + tmp12);
   1156    wsptr[8 * 7] = (int)(tmp22 - tmp12);
   1157    wsptr[8 * 3] = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
   1158    wsptr[8 * 6] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
   1159    wsptr[8 * 4] = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
   1160    wsptr[8 * 5] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
   1161  }
   1162 
   1163  /* Pass 2: process 10 rows from work array, store into output array. */
   1164 
   1165  wsptr = workspace;
   1166  for (ctr = 0; ctr < 10; ctr++) {
   1167    outptr = output_buf[ctr] + output_col;
   1168 
   1169    /* Even part */
   1170 
   1171    /* Add fudge factor here for final descale. */
           /* ONE << (PASS1_BITS + 2), once shifted left by CONST_BITS on the next
            * line, equals half of 2^(CONST_BITS + PASS1_BITS + 3), the divisor
            * of the final RIGHT_SHIFT.
            */
   1172    z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
   1173    z3 = LEFT_SHIFT(z3, CONST_BITS);
   1174    z4 = (JLONG)wsptr[4];
   1175    z1 = MULTIPLY(z4, FIX(1.144122806));         /* c4 */
   1176    z2 = MULTIPLY(z4, FIX(0.437016024));         /* c8 */
   1177    tmp10 = z3 + z1;
   1178    tmp11 = z3 - z2;
   1179 
           /* In this pass tmp22 stays at full scale (no early RIGHT_SHIFT) and is
            * descaled together with the other outputs in the final stage.
            */
   1180    tmp22 = z3 - LEFT_SHIFT(z1 - z2, 1);         /* c0 = (c4-c8)*2 */
   1181 
   1182    z2 = (JLONG)wsptr[2];
   1183    z3 = (JLONG)wsptr[6];
   1184 
   1185    z1 = MULTIPLY(z2 + z3, FIX(0.831253876));    /* c6 */
   1186    tmp12 = z1 + MULTIPLY(z2, FIX(0.513743148)); /* c2-c6 */
   1187    tmp13 = z1 - MULTIPLY(z3, FIX(2.176250899)); /* c2+c6 */
   1188 
   1189    tmp20 = tmp10 + tmp12;
   1190    tmp24 = tmp10 - tmp12;
   1191    tmp21 = tmp11 + tmp13;
   1192    tmp23 = tmp11 - tmp13;
   1193 
   1194    /* Odd part */
   1195 
   1196    z1 = (JLONG)wsptr[1];
   1197    z2 = (JLONG)wsptr[3];
   1198    z3 = (JLONG)wsptr[5];
           /* wsptr[5] enters the odd part without a multiply, so it is shifted to
            * CONST_BITS scale once here (pass 1 keeps the shifted copy in z5).
            */
   1199    z3 = LEFT_SHIFT(z3, CONST_BITS);
   1200    z4 = (JLONG)wsptr[7];
   1201 
   1202    tmp11 = z2 + z4;
   1203    tmp13 = z2 - z4;
   1204 
   1205    tmp12 = MULTIPLY(tmp13, FIX(0.309016994));        /* (c3-c7)/2 */
   1206 
   1207    z2 = MULTIPLY(tmp11, FIX(0.951056516));           /* (c3+c7)/2 */
   1208    z4 = z3 + tmp12;
   1209 
   1210    tmp10 = MULTIPLY(z1, FIX(1.396802247)) + z2 + z4; /* c1 */
   1211    tmp14 = MULTIPLY(z1, FIX(0.221231742)) - z2 + z4; /* c9 */
   1212 
   1213    z2 = MULTIPLY(tmp11, FIX(0.587785252));           /* (c1-c9)/2 */
   1214    z4 = z3 - tmp12 - LEFT_SHIFT(tmp13, CONST_BITS - 1);
   1215 
           /* z3 is already at CONST_BITS scale, so only z1 - tmp13 is shifted. */
   1216    tmp12 = LEFT_SHIFT(z1 - tmp13, CONST_BITS) - z3;
   1217 
   1218    tmp11 = MULTIPLY(z1, FIX(1.260073511)) - z2 - z4; /* c3 */
   1219    tmp13 = MULTIPLY(z1, FIX(0.642039522)) - z2 + z4; /* c7 */
   1220 
   1221    /* Final output stage */
   1222 
           /* Each value is descaled by CONST_BITS + PASS1_BITS + 3 bits, masked
            * with RANGE_MASK and used as an index into range_limit[].
            * NOTE(review): presumably the table maps the masked value onto the
            * valid sample range; its construction is outside this view.
            */
   1223    outptr[0] = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
   1224                                             CONST_BITS + PASS1_BITS + 3) &
   1225                            RANGE_MASK];
   1226    outptr[9] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
   1227                                             CONST_BITS + PASS1_BITS + 3) &
   1228                            RANGE_MASK];
   1229    outptr[1] = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
   1230                                             CONST_BITS + PASS1_BITS + 3) &
   1231                            RANGE_MASK];
   1232    outptr[8] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
   1233                                             CONST_BITS + PASS1_BITS + 3) &
   1234                            RANGE_MASK];
   1235    outptr[2] = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
   1236                                             CONST_BITS + PASS1_BITS + 3) &
   1237                            RANGE_MASK];
   1238    outptr[7] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
   1239                                             CONST_BITS + PASS1_BITS + 3) &
   1240                            RANGE_MASK];
   1241    outptr[3] = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
   1242                                             CONST_BITS + PASS1_BITS + 3) &
   1243                            RANGE_MASK];
   1244    outptr[6] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
   1245                                             CONST_BITS + PASS1_BITS + 3) &
   1246                            RANGE_MASK];
   1247    outptr[4] = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
   1248                                             CONST_BITS + PASS1_BITS + 3) &
   1249                            RANGE_MASK];
   1250    outptr[5] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
   1251                                             CONST_BITS + PASS1_BITS + 3) &
   1252                            RANGE_MASK];
   1253 
   1254    wsptr += 8;         /* advance pointer to next row */
   1255  }
   1256 }
   1257 
   1258 
   1259 /*
   1260 * Perform dequantization and inverse DCT on one block of coefficients,
   1261 * producing an 11x11 output block.
   1262 *
   1263 * Optimized algorithm with 24 multiplications in the 1-D kernel.
   1264 * cK represents sqrt(2) * cos(K*pi/22).
   1265 */
   1266 
   1267 GLOBAL(void)
   1268 _jpeg_idct_11x11(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   1269                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
   1270                 JDIMENSION output_col)
   1271 {
   1272  JLONG tmp10, tmp11, tmp12, tmp13, tmp14;
   1273  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
   1274  JLONG z1, z2, z3, z4;
   1275  JCOEFPTR inptr;
   1276  ISLOW_MULT_TYPE *quantptr;
   1277  int *wsptr;
   1278  _JSAMPROW outptr;
   1279  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1280  int ctr;
   1281  int workspace[8 * 11];        /* buffers data between passes */
   1282  SHIFT_TEMPS
   1283 
   1284  /* Pass 1: process columns from input, store into work array. */
   1285 
   1286  inptr = coef_block;
   1287  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   1288  wsptr = workspace;
   1289  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1290    /* Even part */
   1291 
   1292    tmp10 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
   1293    tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
   1294    /* Add fudge factor here for final descale. */
   1295    tmp10 += ONE << (CONST_BITS - PASS1_BITS - 1);
   1296 
   1297    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
   1298    z2 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
   1299    z3 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
   1300 
   1301    tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
   1302    tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
   1303    z4 = z1 + z3;
   1304    tmp24 = MULTIPLY(z4, -FIX(1.155664402));         /* -(c2-c10) */
   1305    z4 -= z2;
   1306    tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
   1307    tmp21 = tmp20 + tmp23 + tmp25 -
   1308            MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
   1309    tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
   1310    tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
   1311    tmp24 += tmp25;
   1312    tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
   1313    tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
   1314             MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
   1315    tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
   1316 
   1317    /* Odd part */
   1318 
   1319    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
   1320    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
   1321    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
   1322    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
   1323 
   1324    tmp11 = z1 + z2;
   1325    tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
   1326    tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
   1327    tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
   1328    tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
   1329    tmp10 = tmp11 + tmp12 + tmp13 -
   1330            MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
   1331    z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
   1332    tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
   1333    tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
   1334    z1    = MULTIPLY(z2 + z4, -FIX(1.798248910));        /* -(c1+c9) */
   1335    tmp11 += z1;
   1336    tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
   1337    tmp14 += MULTIPLY(z2, -FIX(1.467221301)) +           /* -(c5+c9) */
   1338             MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
   1339             MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
   1340 
   1341    /* Final output stage */
   1342 
   1343    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
   1344    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
   1345    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
   1346    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
   1347    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
   1348    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
   1349    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
   1350    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
   1351    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
   1352    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
   1353    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25, CONST_BITS - PASS1_BITS);
   1354  }
   1355 
   1356  /* Pass 2: process 11 rows from work array, store into output array. */
   1357 
   1358  wsptr = workspace;
   1359  for (ctr = 0; ctr < 11; ctr++) {
   1360    outptr = output_buf[ctr] + output_col;
   1361 
   1362    /* Even part */
   1363 
   1364    /* Add fudge factor here for final descale. */
   1365    tmp10 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
   1366    tmp10 = LEFT_SHIFT(tmp10, CONST_BITS);
   1367 
   1368    z1 = (JLONG)wsptr[2];
   1369    z2 = (JLONG)wsptr[4];
   1370    z3 = (JLONG)wsptr[6];
   1371 
   1372    tmp20 = MULTIPLY(z2 - z3, FIX(2.546640132));     /* c2+c4 */
   1373    tmp23 = MULTIPLY(z2 - z1, FIX(0.430815045));     /* c2-c6 */
   1374    z4 = z1 + z3;
   1375    tmp24 = MULTIPLY(z4, -FIX(1.155664402));         /* -(c2-c10) */
   1376    z4 -= z2;
   1377    tmp25 = tmp10 + MULTIPLY(z4, FIX(1.356927976));  /* c2 */
   1378    tmp21 = tmp20 + tmp23 + tmp25 -
   1379            MULTIPLY(z2, FIX(1.821790775));          /* c2+c4+c10-c6 */
   1380    tmp20 += tmp25 + MULTIPLY(z3, FIX(2.115825087)); /* c4+c6 */
   1381    tmp23 += tmp25 - MULTIPLY(z1, FIX(1.513598477)); /* c6+c8 */
   1382    tmp24 += tmp25;
   1383    tmp22 = tmp24 - MULTIPLY(z3, FIX(0.788749120));  /* c8+c10 */
   1384    tmp24 += MULTIPLY(z2, FIX(1.944413522)) -        /* c2+c8 */
   1385             MULTIPLY(z1, FIX(1.390975730));         /* c4+c10 */
   1386    tmp25 = tmp10 - MULTIPLY(z4, FIX(1.414213562));  /* c0 */
   1387 
   1388    /* Odd part */
   1389 
   1390    z1 = (JLONG)wsptr[1];
   1391    z2 = (JLONG)wsptr[3];
   1392    z3 = (JLONG)wsptr[5];
   1393    z4 = (JLONG)wsptr[7];
   1394 
   1395    tmp11 = z1 + z2;
   1396    tmp14 = MULTIPLY(tmp11 + z3 + z4, FIX(0.398430003)); /* c9 */
   1397    tmp11 = MULTIPLY(tmp11, FIX(0.887983902));           /* c3-c9 */
   1398    tmp12 = MULTIPLY(z1 + z3, FIX(0.670361295));         /* c5-c9 */
   1399    tmp13 = tmp14 + MULTIPLY(z1 + z4, FIX(0.366151574)); /* c7-c9 */
   1400    tmp10 = tmp11 + tmp12 + tmp13 -
   1401            MULTIPLY(z1, FIX(0.923107866));              /* c7+c5+c3-c1-2*c9 */
   1402    z1    = tmp14 - MULTIPLY(z2 + z3, FIX(1.163011579)); /* c7+c9 */
   1403    tmp11 += z1 + MULTIPLY(z2, FIX(2.073276588));        /* c1+c7+3*c9-c3 */
   1404    tmp12 += z1 - MULTIPLY(z3, FIX(1.192193623));        /* c3+c5-c7-c9 */
   1405    z1    = MULTIPLY(z2 + z4, -FIX(1.798248910));        /* -(c1+c9) */
   1406    tmp11 += z1;
   1407    tmp13 += z1 + MULTIPLY(z4, FIX(2.102458632));        /* c1+c5+c9-c7 */
   1408    tmp14 += MULTIPLY(z2, -FIX(1.467221301)) +           /* -(c5+c9) */
   1409             MULTIPLY(z3, FIX(1.001388905)) -            /* c1-c9 */
   1410             MULTIPLY(z4, FIX(1.684843907));             /* c3+c9 */
   1411 
   1412    /* Final output stage */
   1413 
   1414    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
   1415                                              CONST_BITS + PASS1_BITS + 3) &
   1416                             RANGE_MASK];
   1417    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
   1418                                              CONST_BITS + PASS1_BITS + 3) &
   1419                             RANGE_MASK];
   1420    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
   1421                                              CONST_BITS + PASS1_BITS + 3) &
   1422                             RANGE_MASK];
   1423    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
   1424                                              CONST_BITS + PASS1_BITS + 3) &
   1425                             RANGE_MASK];
   1426    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
   1427                                              CONST_BITS + PASS1_BITS + 3) &
   1428                             RANGE_MASK];
   1429    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
   1430                                              CONST_BITS + PASS1_BITS + 3) &
   1431                             RANGE_MASK];
   1432    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
   1433                                              CONST_BITS + PASS1_BITS + 3) &
   1434                             RANGE_MASK];
   1435    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
   1436                                              CONST_BITS + PASS1_BITS + 3) &
   1437                             RANGE_MASK];
   1438    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
   1439                                              CONST_BITS + PASS1_BITS + 3) &
   1440                             RANGE_MASK];
   1441    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
   1442                                              CONST_BITS + PASS1_BITS + 3) &
   1443                             RANGE_MASK];
   1444    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25,
   1445                                              CONST_BITS + PASS1_BITS + 3) &
   1446                             RANGE_MASK];
   1447 
   1448    wsptr += 8;         /* advance pointer to next row */
   1449  }
   1450 }
   1451 
   1452 
   1453 /*
   1454 * Perform dequantization and inverse DCT on one block of coefficients,
   1455 * producing a 12x12 output block.
   1456 *
   1457 * Optimized algorithm with 15 multiplications in the 1-D kernel.
   1458 * cK represents sqrt(2) * cos(K*pi/24).
   1459 */
   1460 
   1461 GLOBAL(void)
   1462 _jpeg_idct_12x12(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   1463                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
   1464                 JDIMENSION output_col)
   1465 {
   1466  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   1467  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25;
   1468  JLONG z1, z2, z3, z4;
   1469  JCOEFPTR inptr;
   1470  ISLOW_MULT_TYPE *quantptr;
   1471  int *wsptr;
   1472  _JSAMPROW outptr;
   1473  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1474  int ctr;
   1475  int workspace[8 * 12];        /* buffers data between passes */
   1476  SHIFT_TEMPS
   1477 
   1478  /* Pass 1: process columns from input, store into work array. */
   1479 
   1480  inptr = coef_block;
   1481  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   1482  wsptr = workspace;
   1483  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1484    /* Even part */
   1485 
   1486    z3 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
   1487    z3 = LEFT_SHIFT(z3, CONST_BITS);
   1488    /* Add fudge factor here for final descale. */
   1489    z3 += ONE << (CONST_BITS - PASS1_BITS - 1);
   1490 
   1491    z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
   1492    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
   1493 
   1494    tmp10 = z3 + z4;
   1495    tmp11 = z3 - z4;
   1496 
   1497    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
   1498    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
   1499    z1 = LEFT_SHIFT(z1, CONST_BITS);
   1500    z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
   1501    z2 = LEFT_SHIFT(z2, CONST_BITS);
   1502 
   1503    tmp12 = z1 - z2;
   1504 
   1505    tmp21 = z3 + tmp12;
   1506    tmp24 = z3 - tmp12;
   1507 
   1508    tmp12 = z4 + z2;
   1509 
   1510    tmp20 = tmp10 + tmp12;
   1511    tmp25 = tmp10 - tmp12;
   1512 
   1513    tmp12 = z4 - z1 - z2;
   1514 
   1515    tmp22 = tmp11 + tmp12;
   1516    tmp23 = tmp11 - tmp12;
   1517 
   1518    /* Odd part */
   1519 
   1520    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
   1521    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
   1522    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
   1523    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
   1524 
   1525    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
   1526    tmp14 = MULTIPLY(z2, -FIX_0_541196100);                  /* -c9 */
   1527 
   1528    tmp10 = z1 + z3;
   1529    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
   1530    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
   1531    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
   1532    tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580));            /* -(c7+c11) */
   1533    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
   1534    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
   1535    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
   1536             MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
   1537 
   1538    z1 -= z4;
   1539    z2 -= z3;
   1540    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
   1541    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
   1542    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
   1543 
   1544    /* Final output stage */
   1545 
   1546    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
   1547    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
   1548    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
   1549    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
   1550    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
   1551    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
   1552    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
   1553    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
   1554    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
   1555    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
   1556    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
   1557    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
   1558  }
   1559 
   1560  /* Pass 2: process 12 rows from work array, store into output array. */
   1561 
   1562  wsptr = workspace;
   1563  for (ctr = 0; ctr < 12; ctr++) {
   1564    outptr = output_buf[ctr] + output_col;
   1565 
   1566    /* Even part */
   1567 
   1568    /* Add fudge factor here for final descale. */
   1569    z3 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
   1570    z3 = LEFT_SHIFT(z3, CONST_BITS);
   1571 
   1572    z4 = (JLONG)wsptr[4];
   1573    z4 = MULTIPLY(z4, FIX(1.224744871)); /* c4 */
   1574 
   1575    tmp10 = z3 + z4;
   1576    tmp11 = z3 - z4;
   1577 
   1578    z1 = (JLONG)wsptr[2];
   1579    z4 = MULTIPLY(z1, FIX(1.366025404)); /* c2 */
   1580    z1 = LEFT_SHIFT(z1, CONST_BITS);
   1581    z2 = (JLONG)wsptr[6];
   1582    z2 = LEFT_SHIFT(z2, CONST_BITS);
   1583 
   1584    tmp12 = z1 - z2;
   1585 
   1586    tmp21 = z3 + tmp12;
   1587    tmp24 = z3 - tmp12;
   1588 
   1589    tmp12 = z4 + z2;
   1590 
   1591    tmp20 = tmp10 + tmp12;
   1592    tmp25 = tmp10 - tmp12;
   1593 
   1594    tmp12 = z4 - z1 - z2;
   1595 
   1596    tmp22 = tmp11 + tmp12;
   1597    tmp23 = tmp11 - tmp12;
   1598 
   1599    /* Odd part */
   1600 
   1601    z1 = (JLONG)wsptr[1];
   1602    z2 = (JLONG)wsptr[3];
   1603    z3 = (JLONG)wsptr[5];
   1604    z4 = (JLONG)wsptr[7];
   1605 
   1606    tmp11 = MULTIPLY(z2, FIX(1.306562965));                  /* c3 */
   1607    tmp14 = MULTIPLY(z2, -FIX_0_541196100);                  /* -c9 */
   1608 
   1609    tmp10 = z1 + z3;
   1610    tmp15 = MULTIPLY(tmp10 + z4, FIX(0.860918669));          /* c7 */
   1611    tmp12 = tmp15 + MULTIPLY(tmp10, FIX(0.261052384));       /* c5-c7 */
   1612    tmp10 = tmp12 + tmp11 + MULTIPLY(z1, FIX(0.280143716));  /* c1-c5 */
   1613    tmp13 = MULTIPLY(z3 + z4, -FIX(1.045510580));            /* -(c7+c11) */
   1614    tmp12 += tmp13 + tmp14 - MULTIPLY(z3, FIX(1.478575242)); /* c1+c5-c7-c11 */
   1615    tmp13 += tmp15 - tmp11 + MULTIPLY(z4, FIX(1.586706681)); /* c1+c11 */
   1616    tmp15 += tmp14 - MULTIPLY(z1, FIX(0.676326758)) -        /* c7-c11 */
   1617             MULTIPLY(z4, FIX(1.982889723));                 /* c5+c7 */
   1618 
   1619    z1 -= z4;
   1620    z2 -= z3;
   1621    z3 = MULTIPLY(z1 + z2, FIX_0_541196100);                 /* c9 */
   1622    tmp11 = z3 + MULTIPLY(z1, FIX_0_765366865);              /* c3-c9 */
   1623    tmp14 = z3 - MULTIPLY(z2, FIX_1_847759065);              /* c3+c9 */
   1624 
   1625    /* Final output stage */
   1626 
   1627    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
   1628                                              CONST_BITS + PASS1_BITS + 3) &
   1629                             RANGE_MASK];
   1630    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
   1631                                              CONST_BITS + PASS1_BITS + 3) &
   1632                             RANGE_MASK];
   1633    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
   1634                                              CONST_BITS + PASS1_BITS + 3) &
   1635                             RANGE_MASK];
   1636    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
   1637                                              CONST_BITS + PASS1_BITS + 3) &
   1638                             RANGE_MASK];
   1639    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
   1640                                              CONST_BITS + PASS1_BITS + 3) &
   1641                             RANGE_MASK];
   1642    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
   1643                                              CONST_BITS + PASS1_BITS + 3) &
   1644                             RANGE_MASK];
   1645    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
   1646                                              CONST_BITS + PASS1_BITS + 3) &
   1647                             RANGE_MASK];
   1648    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
   1649                                              CONST_BITS + PASS1_BITS + 3) &
   1650                             RANGE_MASK];
   1651    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
   1652                                              CONST_BITS + PASS1_BITS + 3) &
   1653                             RANGE_MASK];
   1654    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
   1655                                              CONST_BITS + PASS1_BITS + 3) &
   1656                             RANGE_MASK];
   1657    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
   1658                                              CONST_BITS + PASS1_BITS + 3) &
   1659                             RANGE_MASK];
   1660    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
   1661                                              CONST_BITS + PASS1_BITS + 3) &
   1662                             RANGE_MASK];
   1663 
   1664    wsptr += 8;         /* advance pointer to next row */
   1665  }
   1666 }
   1667 
   1668 
   1669 /*
   1670 * Perform dequantization and inverse DCT on one block of coefficients,
   1671 * producing a 13x13 output block.
   1672 *
   1673 * Optimized algorithm with 29 multiplications in the 1-D kernel.
   1674 * cK represents sqrt(2) * cos(K*pi/26).
   1675 */
   1676 
   1677 GLOBAL(void)
   1678 _jpeg_idct_13x13(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   1679                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
   1680                 JDIMENSION output_col)
   1681 {
   1682  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
   1683  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
   1684  JLONG z1, z2, z3, z4;
   1685  JCOEFPTR inptr;
   1686  ISLOW_MULT_TYPE *quantptr;
   1687  int *wsptr;
   1688  _JSAMPROW outptr;
   1689  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1690  int ctr;
   1691  int workspace[8 * 13];        /* buffers data between passes */
   1692  SHIFT_TEMPS
   1693 
   1694  /* Pass 1: process columns from input, store into work array. */
   1695 
   1696  inptr = coef_block;
   1697  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   1698  wsptr = workspace;
   1699  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1700    /* Even part */
   1701 
   1702    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
   1703    z1 = LEFT_SHIFT(z1, CONST_BITS);
   1704    /* Add fudge factor here for final descale. */
   1705    z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
   1706 
   1707    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
   1708    z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
   1709    z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
   1710 
   1711    tmp10 = z3 + z4;
   1712    tmp11 = z3 - z4;
   1713 
   1714    tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
   1715    tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
   1716 
   1717    tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
   1718    tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
   1719 
   1720    tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
   1721    tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
   1722 
   1723    tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
   1724    tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13;  /* c4 */
   1725 
   1726    tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
   1727    tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
   1728 
   1729    tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13;  /* c12 */
   1730    tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13;  /* c8 */
   1731 
   1732    tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
   1733 
   1734    /* Odd part */
   1735 
   1736    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
   1737    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
   1738    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
   1739    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
   1740 
   1741    tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
   1742    tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
   1743    tmp15 = z1 + z4;
   1744    tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
   1745    tmp10 = tmp11 + tmp12 + tmp13 -
   1746            MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
   1747    tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458));    /* -c11 */
   1748    tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
   1749    tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
   1750    tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945));    /* -c5 */
   1751    tmp11 += tmp14;
   1752    tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
   1753    tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813));    /* -c9 */
   1754    tmp12 += tmp14;
   1755    tmp13 += tmp14;
   1756    tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
   1757    tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
   1758            MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
   1759    z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
   1760    tmp14 += z1;
   1761    tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
   1762             MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
   1763 
   1764    /* Final output stage */
   1765 
   1766    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
   1767    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
   1768    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
   1769    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
   1770    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
   1771    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
   1772    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
   1773    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
   1774    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
   1775    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
   1776    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
   1777    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
   1778    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26, CONST_BITS - PASS1_BITS);
   1779  }
   1780 
   1781  /* Pass 2: process 13 rows from work array, store into output array. */
   1782 
   1783  wsptr = workspace;
   1784  for (ctr = 0; ctr < 13; ctr++) {
   1785    outptr = output_buf[ctr] + output_col;
   1786 
   1787    /* Even part */
   1788 
   1789    /* Add fudge factor here for final descale. */
   1790    z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
   1791    z1 = LEFT_SHIFT(z1, CONST_BITS);
   1792 
   1793    z2 = (JLONG)wsptr[2];
   1794    z3 = (JLONG)wsptr[4];
   1795    z4 = (JLONG)wsptr[6];
   1796 
   1797    tmp10 = z3 + z4;
   1798    tmp11 = z3 - z4;
   1799 
   1800    tmp12 = MULTIPLY(tmp10, FIX(1.155388986));                /* (c4+c6)/2 */
   1801    tmp13 = MULTIPLY(tmp11, FIX(0.096834934)) + z1;           /* (c4-c6)/2 */
   1802 
   1803    tmp20 = MULTIPLY(z2, FIX(1.373119086)) + tmp12 + tmp13;   /* c2 */
   1804    tmp22 = MULTIPLY(z2, FIX(0.501487041)) - tmp12 + tmp13;   /* c10 */
   1805 
   1806    tmp12 = MULTIPLY(tmp10, FIX(0.316450131));                /* (c8-c12)/2 */
   1807    tmp13 = MULTIPLY(tmp11, FIX(0.486914739)) + z1;           /* (c8+c12)/2 */
   1808 
   1809    tmp21 = MULTIPLY(z2, FIX(1.058554052)) - tmp12 + tmp13;   /* c6 */
   1810    tmp25 = MULTIPLY(z2, -FIX(1.252223920)) + tmp12 + tmp13;  /* c4 */
   1811 
   1812    tmp12 = MULTIPLY(tmp10, FIX(0.435816023));                /* (c2-c10)/2 */
   1813    tmp13 = MULTIPLY(tmp11, FIX(0.937303064)) - z1;           /* (c2+c10)/2 */
   1814 
   1815    tmp23 = MULTIPLY(z2, -FIX(0.170464608)) - tmp12 - tmp13;  /* c12 */
   1816    tmp24 = MULTIPLY(z2, -FIX(0.803364869)) + tmp12 - tmp13;  /* c8 */
   1817 
   1818    tmp26 = MULTIPLY(tmp11 - z2, FIX(1.414213562)) + z1;      /* c0 */
   1819 
   1820    /* Odd part */
   1821 
   1822    z1 = (JLONG)wsptr[1];
   1823    z2 = (JLONG)wsptr[3];
   1824    z3 = (JLONG)wsptr[5];
   1825    z4 = (JLONG)wsptr[7];
   1826 
   1827    tmp11 = MULTIPLY(z1 + z2, FIX(1.322312651));     /* c3 */
   1828    tmp12 = MULTIPLY(z1 + z3, FIX(1.163874945));     /* c5 */
   1829    tmp15 = z1 + z4;
   1830    tmp13 = MULTIPLY(tmp15, FIX(0.937797057));       /* c7 */
   1831    tmp10 = tmp11 + tmp12 + tmp13 -
   1832            MULTIPLY(z1, FIX(2.020082300));          /* c7+c5+c3-c1 */
   1833    tmp14 = MULTIPLY(z2 + z3, -FIX(0.338443458));    /* -c11 */
   1834    tmp11 += tmp14 + MULTIPLY(z2, FIX(0.837223564)); /* c5+c9+c11-c3 */
   1835    tmp12 += tmp14 - MULTIPLY(z3, FIX(1.572116027)); /* c1+c5-c9-c11 */
   1836    tmp14 = MULTIPLY(z2 + z4, -FIX(1.163874945));    /* -c5 */
   1837    tmp11 += tmp14;
   1838    tmp13 += tmp14 + MULTIPLY(z4, FIX(2.205608352)); /* c3+c5+c9-c7 */
   1839    tmp14 = MULTIPLY(z3 + z4, -FIX(0.657217813));    /* -c9 */
   1840    tmp12 += tmp14;
   1841    tmp13 += tmp14;
   1842    tmp15 = MULTIPLY(tmp15, FIX(0.338443458));       /* c11 */
   1843    tmp14 = tmp15 + MULTIPLY(z1, FIX(0.318774355)) - /* c9-c11 */
   1844            MULTIPLY(z2, FIX(0.466105296));          /* c1-c7 */
   1845    z1    = MULTIPLY(z3 - z2, FIX(0.937797057));     /* c7 */
   1846    tmp14 += z1;
   1847    tmp15 += z1 + MULTIPLY(z3, FIX(0.384515595)) -   /* c3-c7 */
   1848             MULTIPLY(z4, FIX(1.742345811));         /* c1+c11 */
   1849 
   1850    /* Final output stage */
   1851 
   1852    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
   1853                                              CONST_BITS + PASS1_BITS + 3) &
   1854                             RANGE_MASK];
   1855    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
   1856                                              CONST_BITS + PASS1_BITS + 3) &
   1857                             RANGE_MASK];
   1858    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
   1859                                              CONST_BITS + PASS1_BITS + 3) &
   1860                             RANGE_MASK];
   1861    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
   1862                                              CONST_BITS + PASS1_BITS + 3) &
   1863                             RANGE_MASK];
   1864    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
   1865                                              CONST_BITS + PASS1_BITS + 3) &
   1866                             RANGE_MASK];
   1867    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
   1868                                              CONST_BITS + PASS1_BITS + 3) &
   1869                             RANGE_MASK];
   1870    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
   1871                                              CONST_BITS + PASS1_BITS + 3) &
   1872                             RANGE_MASK];
   1873    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
   1874                                              CONST_BITS + PASS1_BITS + 3) &
   1875                             RANGE_MASK];
   1876    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
   1877                                              CONST_BITS + PASS1_BITS + 3) &
   1878                             RANGE_MASK];
   1879    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
   1880                                              CONST_BITS + PASS1_BITS + 3) &
   1881                             RANGE_MASK];
   1882    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
   1883                                              CONST_BITS + PASS1_BITS + 3) &
   1884                             RANGE_MASK];
   1885    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
   1886                                              CONST_BITS + PASS1_BITS + 3) &
   1887                             RANGE_MASK];
   1888    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26,
   1889                                              CONST_BITS + PASS1_BITS + 3) &
   1890                             RANGE_MASK];
   1891 
   1892    wsptr += 8;         /* advance pointer to next row */
   1893  }
   1894 }
   1895 
   1896 
   1897 /*
   1898 * Perform dequantization and inverse DCT on one block of coefficients,
   1899 * producing a 14x14 output block.
   1900 *
   1901 * Optimized algorithm with 20 multiplications in the 1-D kernel.
   1902 * cK represents sqrt(2) * cos(K*pi/28).
   1903 */
   1904 
   1905 GLOBAL(void)
   1906 _jpeg_idct_14x14(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   1907                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
   1908                 JDIMENSION output_col)
   1909 {
   1910  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   1911  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26;
   1912  JLONG z1, z2, z3, z4;
   1913  JCOEFPTR inptr;
   1914  ISLOW_MULT_TYPE *quantptr;
   1915  int *wsptr;
   1916  _JSAMPROW outptr;
   1917  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   1918  int ctr;
   1919  int workspace[8 * 14];        /* buffers data between passes */
   1920  SHIFT_TEMPS
   1921 
   1922  /* Pass 1: process columns from input, store into work array. */
   1923 
   1924  inptr = coef_block;
   1925  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   1926  wsptr = workspace;
   1927  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   1928    /* Even part */
   1929 
   1930    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
   1931    z1 = LEFT_SHIFT(z1, CONST_BITS);
   1932    /* Add fudge factor here for final descale. */
   1933    z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
   1934    z4 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
   1935    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
   1936    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
   1937    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
   1938 
   1939    tmp10 = z1 + z2;
   1940    tmp11 = z1 + z3;
   1941    tmp12 = z1 - z4;
   1942 
   1943    tmp23 = RIGHT_SHIFT(z1 - LEFT_SHIFT(z2 + z3 - z4, 1),
   1944                        CONST_BITS - PASS1_BITS); /* c0 = (c4+c12-c8)*2 */
   1945 
   1946    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
   1947    z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
   1948 
   1949    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
   1950 
   1951    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
   1952    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
   1953    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
   1954            MULTIPLY(z2, FIX(1.378756276));      /* c2 */
   1955 
   1956    tmp20 = tmp10 + tmp13;
   1957    tmp26 = tmp10 - tmp13;
   1958    tmp21 = tmp11 + tmp14;
   1959    tmp25 = tmp11 - tmp14;
   1960    tmp22 = tmp12 + tmp15;
   1961    tmp24 = tmp12 - tmp15;
   1962 
   1963    /* Odd part */
   1964 
   1965    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
   1966    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
   1967    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
   1968    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
   1969    tmp13 = LEFT_SHIFT(z4, CONST_BITS);
   1970 
   1971    tmp14 = z1 + z3;
   1972    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
   1973    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
   1974    tmp10 = tmp11 + tmp12 + tmp13 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
   1975    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
   1976    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
   1977    z1    -= z2;
   1978    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - tmp13;        /* c11 */
   1979    tmp16 += tmp15;
   1980    z1    += z4;
   1981    z4    = MULTIPLY(z2 + z3, -FIX(0.158341681)) - tmp13;  /* -c13 */
   1982    tmp11 += z4 - MULTIPLY(z2, FIX(0.424103948));          /* c3-c9-c13 */
   1983    tmp12 += z4 - MULTIPLY(z3, FIX(2.373959773));          /* c3+c5-c13 */
   1984    z4    = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
   1985    tmp14 += z4 + tmp13 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
   1986    tmp15 += z4 + MULTIPLY(z2, FIX(0.674957567));          /* c1+c11-c5 */
   1987 
   1988    tmp13 = LEFT_SHIFT(z1 - z3, PASS1_BITS);
   1989 
   1990    /* Final output stage */
   1991 
   1992    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
   1993    wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
   1994    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
   1995    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
   1996    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
   1997    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
   1998    wsptr[8 * 3]  = (int)(tmp23 + tmp13);
   1999    wsptr[8 * 10] = (int)(tmp23 - tmp13);
   2000    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
   2001    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
   2002    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
   2003    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
   2004    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
   2005    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
   2006  }
   2007 
   2008  /* Pass 2: process 14 rows from work array, store into output array. */
   2009 
   2010  wsptr = workspace;
   2011  for (ctr = 0; ctr < 14; ctr++) {
   2012    outptr = output_buf[ctr] + output_col;
   2013 
   2014    /* Even part */
   2015 
   2016    /* Add fudge factor here for final descale. */
   2017    z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
   2018    z1 = LEFT_SHIFT(z1, CONST_BITS);
   2019    z4 = (JLONG)wsptr[4];
   2020    z2 = MULTIPLY(z4, FIX(1.274162392));         /* c4 */
   2021    z3 = MULTIPLY(z4, FIX(0.314692123));         /* c12 */
   2022    z4 = MULTIPLY(z4, FIX(0.881747734));         /* c8 */
   2023 
   2024    tmp10 = z1 + z2;
   2025    tmp11 = z1 + z3;
   2026    tmp12 = z1 - z4;
   2027 
   2028    tmp23 = z1 - LEFT_SHIFT(z2 + z3 - z4, 1);    /* c0 = (c4+c12-c8)*2 */
   2029 
   2030    z1 = (JLONG)wsptr[2];
   2031    z2 = (JLONG)wsptr[6];
   2032 
   2033    z3 = MULTIPLY(z1 + z2, FIX(1.105676686));    /* c6 */
   2034 
   2035    tmp13 = z3 + MULTIPLY(z1, FIX(0.273079590)); /* c2-c6 */
   2036    tmp14 = z3 - MULTIPLY(z2, FIX(1.719280954)); /* c6+c10 */
   2037    tmp15 = MULTIPLY(z1, FIX(0.613604268)) -     /* c10 */
   2038            MULTIPLY(z2, FIX(1.378756276));      /* c2 */
   2039 
   2040    tmp20 = tmp10 + tmp13;
   2041    tmp26 = tmp10 - tmp13;
   2042    tmp21 = tmp11 + tmp14;
   2043    tmp25 = tmp11 - tmp14;
   2044    tmp22 = tmp12 + tmp15;
   2045    tmp24 = tmp12 - tmp15;
   2046 
   2047    /* Odd part */
   2048 
   2049    z1 = (JLONG)wsptr[1];
   2050    z2 = (JLONG)wsptr[3];
   2051    z3 = (JLONG)wsptr[5];
   2052    z4 = (JLONG)wsptr[7];
   2053    z4 = LEFT_SHIFT(z4, CONST_BITS);
   2054 
   2055    tmp14 = z1 + z3;
   2056    tmp11 = MULTIPLY(z1 + z2, FIX(1.334852607));           /* c3 */
   2057    tmp12 = MULTIPLY(tmp14, FIX(1.197448846));             /* c5 */
   2058    tmp10 = tmp11 + tmp12 + z4 - MULTIPLY(z1, FIX(1.126980169)); /* c3+c5-c1 */
   2059    tmp14 = MULTIPLY(tmp14, FIX(0.752406978));             /* c9 */
   2060    tmp16 = tmp14 - MULTIPLY(z1, FIX(1.061150426));        /* c9+c11-c13 */
   2061    z1    -= z2;
   2062    tmp15 = MULTIPLY(z1, FIX(0.467085129)) - z4;           /* c11 */
   2063    tmp16 += tmp15;
   2064    tmp13 = MULTIPLY(z2 + z3, -FIX(0.158341681)) - z4;     /* -c13 */
   2065    tmp11 += tmp13 - MULTIPLY(z2, FIX(0.424103948));       /* c3-c9-c13 */
   2066    tmp12 += tmp13 - MULTIPLY(z3, FIX(2.373959773));       /* c3+c5-c13 */
   2067    tmp13 = MULTIPLY(z3 - z2, FIX(1.405321284));           /* c1 */
   2068    tmp14 += tmp13 + z4 - MULTIPLY(z3, FIX(1.6906431334)); /* c1+c9-c11 */
   2069    tmp15 += tmp13 + MULTIPLY(z2, FIX(0.674957567));       /* c1+c11-c5 */
   2070 
   2071    tmp13 = LEFT_SHIFT(z1 - z3, CONST_BITS) + z4;
   2072 
   2073    /* Final output stage */
   2074 
   2075    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
   2076                                              CONST_BITS + PASS1_BITS + 3) &
   2077                             RANGE_MASK];
   2078    outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
   2079                                              CONST_BITS + PASS1_BITS + 3) &
   2080                             RANGE_MASK];
   2081    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
   2082                                              CONST_BITS + PASS1_BITS + 3) &
   2083                             RANGE_MASK];
   2084    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
   2085                                              CONST_BITS + PASS1_BITS + 3) &
   2086                             RANGE_MASK];
   2087    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
   2088                                              CONST_BITS + PASS1_BITS + 3) &
   2089                             RANGE_MASK];
   2090    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
   2091                                              CONST_BITS + PASS1_BITS + 3) &
   2092                             RANGE_MASK];
   2093    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
   2094                                              CONST_BITS + PASS1_BITS + 3) &
   2095                             RANGE_MASK];
   2096    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
   2097                                              CONST_BITS + PASS1_BITS + 3) &
   2098                             RANGE_MASK];
   2099    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
   2100                                              CONST_BITS + PASS1_BITS + 3) &
   2101                             RANGE_MASK];
   2102    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
   2103                                              CONST_BITS + PASS1_BITS + 3) &
   2104                             RANGE_MASK];
   2105    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
   2106                                              CONST_BITS + PASS1_BITS + 3) &
   2107                             RANGE_MASK];
   2108    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
   2109                                              CONST_BITS + PASS1_BITS + 3) &
   2110                             RANGE_MASK];
   2111    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
   2112                                              CONST_BITS + PASS1_BITS + 3) &
   2113                             RANGE_MASK];
   2114    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
   2115                                              CONST_BITS + PASS1_BITS + 3) &
   2116                             RANGE_MASK];
   2117 
   2118    wsptr += 8;         /* advance pointer to next row */
   2119  }
   2120 }
   2121 
   2122 
   2123 /*
   2124 * Perform dequantization and inverse DCT on one block of coefficients,
   2125 * producing a 15x15 output block.
   2126 *
   2127 * Optimized algorithm with 22 multiplications in the 1-D kernel.
   2128 * cK represents sqrt(2) * cos(K*pi/30).
        *
        * The transform runs as two passes over an 8x15 int workspace:
        *   Pass 1 walks the 8 input columns.  For each column it dequantizes
        *   the coefficients at row offsets DCTSIZE*0 .. DCTSIZE*7 and stores 15
        *   intermediate values into the workspace (row stride 8).
        *   Pass 2 walks the 15 workspace rows.  For each row it reads 8 values
        *   and stores 15 range-limited samples into the matching output row.
        *
        * cinfo       only passed to IDCT_range_limit() to get the lookup table.
        * compptr     its dct_table is read as an ISLOW_MULT_TYPE array of
        *             multipliers, indexed in step with coef_block.
        * coef_block  input coefficients; only read here.
        * output_buf  sample rows; rows 0..14 are written.
        * output_col  offset of the first of the 15 samples written in each row.
   2129 */
   2130 
   2131 GLOBAL(void)
   2132 _jpeg_idct_15x15(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   2133                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
   2134                 JDIMENSION output_col)
   2135 {
         /* tmp20..tmp27 receive the even-part results and tmp10..tmp16 the
          * odd-part results of the current 1-D transform.  z1..z4 and
          * tmp10..tmp13 are also reused as scratch while the even part is being
          * built, so the statement order below is significant.
          */
   2136  JLONG tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16;
   2137  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
   2138  JLONG z1, z2, z3, z4;
   2139  JCOEFPTR inptr;
   2140  ISLOW_MULT_TYPE *quantptr;
   2141  int *wsptr;
   2142  _JSAMPROW outptr;
   2143  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   2144  int ctr;
   2145  int workspace[8 * 15];        /* buffers data between passes */
   2146  SHIFT_TEMPS
   2147 
   2148  /* Pass 1: process columns from input, store into work array. */
   2149 
   2150  inptr = coef_block;
   2151  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   2152  wsptr = workspace;
         /* inptr, quantptr and wsptr all advance by one element per iteration,
          * so each pass through the loop handles one column.
          */
   2153  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   2154    /* Even part */
   2155 
   2156    z1 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
   2157    z1 = LEFT_SHIFT(z1, CONST_BITS);
   2158    /* Add fudge factor here for final descale. */
           /* The constant is half of 2^(CONST_BITS - PASS1_BITS), the divisor
            * applied by the RIGHT_SHIFT at the end of this pass.  z1 feeds each
            * of tmp20..tmp27 exactly once, so every workspace value carries the
            * rounding term once.
            */
   2159    z1 += ONE << (CONST_BITS - PASS1_BITS - 1);
   2160 
   2161    z2 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
   2162    z3 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
   2163    z4 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
   2164 
   2165    tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
   2166    tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
   2167 
   2168    tmp12 = z1 - tmp10;
   2169    tmp13 = z1 + tmp11;
   2170    z1 -= LEFT_SHIFT(tmp11 - tmp10, 1);     /* c0 = (c6-c12)*2 */
   2171 
           /* From here on z4 is the difference and z3 the sum of the row-2 and
            * row-4 terms; z2 still holds the row-2 term until it is scaled.
            */
   2172    z4 = z2 - z3;
   2173    z3 += z2;
   2174    tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
   2175    tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
   2176    z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
   2177 
   2178    tmp20 = tmp13 + tmp10 + tmp11;
   2179    tmp23 = tmp12 - tmp10 + tmp11 + z2;
   2180 
   2181    tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
   2182    tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
   2183 
   2184    tmp25 = tmp13 - tmp10 - tmp11;
   2185    tmp26 = tmp12 + tmp10 - tmp11 - z2;
   2186 
   2187    tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
   2188    tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
   2189 
   2190    tmp21 = tmp12 + tmp10 + tmp11;
   2191    tmp24 = tmp13 - tmp10 + tmp11;
           /* Double the (c6-c12)/2 product; tmp22 adds it once and tmp27
            * subtracts it twice.
            */
   2192    tmp11 += tmp11;
   2193    tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
   2194    tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
   2195 
   2196    /* Odd part */
   2197 
           /* z4 is loaded twice: first with the row-5 term, whose c5 product is
            * kept in z3, then with the row-7 term.
            */
   2198    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
   2199    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
   2200    z4 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
   2201    z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
   2202    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
   2203 
   2204    tmp13 = z2 - z4;
   2205    tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
   2206    tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
   2207    tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
   2208 
           /* tmp13 and tmp15 were scratch above; they are restarted here from
            * the row-3 term before z2 is overwritten.
            */
   2209    tmp13 = MULTIPLY(z2, -FIX(0.831253876));                /* -c9 */
   2210    tmp15 = MULTIPLY(z2, -FIX(1.344997024));                /* -c3 */
   2211    z2 = z1 - z4;
   2212    tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
   2213 
   2214    tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
   2215    tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
   2216    tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
   2217    z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
   2218    tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
   2219    tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
   2220 
   2221    /* Final output stage */
           /* Workspace rows k and 14-k share one even-part value and take the
            * odd-part value with opposite signs.  Row 7, the middle one, has no
            * odd-part term.  The factor 8 is the workspace row stride.
            */
   2222 
   2223    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp10, CONST_BITS - PASS1_BITS);
   2224    wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp20 - tmp10, CONST_BITS - PASS1_BITS);
   2225    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp11, CONST_BITS - PASS1_BITS);
   2226    wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp21 - tmp11, CONST_BITS - PASS1_BITS);
   2227    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp12, CONST_BITS - PASS1_BITS);
   2228    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp22 - tmp12, CONST_BITS - PASS1_BITS);
   2229    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp13, CONST_BITS - PASS1_BITS);
   2230    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp23 - tmp13, CONST_BITS - PASS1_BITS);
   2231    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp14, CONST_BITS - PASS1_BITS);
   2232    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp24 - tmp14, CONST_BITS - PASS1_BITS);
   2233    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp15, CONST_BITS - PASS1_BITS);
   2234    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp25 - tmp15, CONST_BITS - PASS1_BITS);
   2235    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp16, CONST_BITS - PASS1_BITS);
   2236    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp26 - tmp16, CONST_BITS - PASS1_BITS);
   2237    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp27, CONST_BITS - PASS1_BITS);
   2238  }
   2239 
   2240  /* Pass 2: process 15 rows from work array, store into output array. */
         /* The arithmetic is the same kernel as pass 1 with the same constants;
          * only the input source (workspace instead of dequantized
          * coefficients), the rounding term and the final shift differ.
          */
   2241 
   2242  wsptr = workspace;
   2243  for (ctr = 0; ctr < 15; ctr++) {
   2244    outptr = output_buf[ctr] + output_col;
   2245 
   2246    /* Even part */
   2247 
   2248    /* Add fudge factor here for final descale. */
           /* ONE << (PASS1_BITS + 2), once shifted left by CONST_BITS on the
            * next line, is half of 2^(CONST_BITS + PASS1_BITS + 3), the divisor
            * applied by the RIGHT_SHIFT in the output stage of this pass.
            */
   2249    z1 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
   2250    z1 = LEFT_SHIFT(z1, CONST_BITS);
   2251 
   2252    z2 = (JLONG)wsptr[2];
   2253    z3 = (JLONG)wsptr[4];
   2254    z4 = (JLONG)wsptr[6];
   2255 
   2256    tmp10 = MULTIPLY(z4, FIX(0.437016024)); /* c12 */
   2257    tmp11 = MULTIPLY(z4, FIX(1.144122806)); /* c6 */
   2258 
   2259    tmp12 = z1 - tmp10;
   2260    tmp13 = z1 + tmp11;
   2261    z1 -= LEFT_SHIFT(tmp11 - tmp10, 1);     /* c0 = (c6-c12)*2 */
   2262 
   2263    z4 = z2 - z3;
   2264    z3 += z2;
   2265    tmp10 = MULTIPLY(z3, FIX(1.337628990)); /* (c2+c4)/2 */
   2266    tmp11 = MULTIPLY(z4, FIX(0.045680613)); /* (c2-c4)/2 */
   2267    z2 = MULTIPLY(z2, FIX(1.439773946));    /* c4+c14 */
   2268 
   2269    tmp20 = tmp13 + tmp10 + tmp11;
   2270    tmp23 = tmp12 - tmp10 + tmp11 + z2;
   2271 
   2272    tmp10 = MULTIPLY(z3, FIX(0.547059574)); /* (c8+c14)/2 */
   2273    tmp11 = MULTIPLY(z4, FIX(0.399234004)); /* (c8-c14)/2 */
   2274 
   2275    tmp25 = tmp13 - tmp10 - tmp11;
   2276    tmp26 = tmp12 + tmp10 - tmp11 - z2;
   2277 
   2278    tmp10 = MULTIPLY(z3, FIX(0.790569415)); /* (c6+c12)/2 */
   2279    tmp11 = MULTIPLY(z4, FIX(0.353553391)); /* (c6-c12)/2 */
   2280 
   2281    tmp21 = tmp12 + tmp10 + tmp11;
   2282    tmp24 = tmp13 - tmp10 + tmp11;
   2283    tmp11 += tmp11;
   2284    tmp22 = z1 + tmp11;                     /* c10 = c6-c12 */
   2285    tmp27 = z1 - tmp11 - tmp11;             /* c0 = (c6-c12)*2 */
   2286 
   2287    /* Odd part */
   2288 
   2289    z1 = (JLONG)wsptr[1];
   2290    z2 = (JLONG)wsptr[3];
   2291    z4 = (JLONG)wsptr[5];
   2292    z3 = MULTIPLY(z4, FIX(1.224744871));                    /* c5 */
   2293    z4 = (JLONG)wsptr[7];
   2294 
   2295    tmp13 = z2 - z4;
   2296    tmp15 = MULTIPLY(z1 + tmp13, FIX(0.831253876));         /* c9 */
   2297    tmp11 = tmp15 + MULTIPLY(z1, FIX(0.513743148));         /* c3-c9 */
   2298    tmp14 = tmp15 - MULTIPLY(tmp13, FIX(2.176250899));      /* c3+c9 */
   2299 
   2300    tmp13 = MULTIPLY(z2, -FIX(0.831253876));                /* -c9 */
   2301    tmp15 = MULTIPLY(z2, -FIX(1.344997024));                /* -c3 */
   2302    z2 = z1 - z4;
   2303    tmp12 = z3 + MULTIPLY(z2, FIX(1.406466353));            /* c1 */
   2304 
   2305    tmp10 = tmp12 + MULTIPLY(z4, FIX(2.457431844)) - tmp15; /* c1+c7 */
   2306    tmp16 = tmp12 - MULTIPLY(z1, FIX(1.112434820)) + tmp13; /* c1-c13 */
   2307    tmp12 = MULTIPLY(z2, FIX(1.224744871)) - z3;            /* c5 */
   2308    z2 = MULTIPLY(z1 + z4, FIX(0.575212477));               /* c11 */
   2309    tmp13 += z2 + MULTIPLY(z1, FIX(0.475753014)) - z3;      /* c7-c11 */
   2310    tmp15 += z2 - MULTIPLY(z4, FIX(0.869244010)) + z3;      /* c11+c13 */
   2311 
   2312    /* Final output stage */
           /* Samples k and 14-k are the sum and difference of one even/odd
            * pair; sample 7 uses tmp27 alone.  Each descaled value is masked
            * with RANGE_MASK before it indexes range_limit.
            * NOTE(review): range_limit is presumably a clamping table set up
            * elsewhere; its contents are not visible in this function.
            */
   2313 
   2314    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp10,
   2315                                              CONST_BITS + PASS1_BITS + 3) &
   2316                             RANGE_MASK];
   2317    outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp10,
   2318                                              CONST_BITS + PASS1_BITS + 3) &
   2319                             RANGE_MASK];
   2320    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp11,
   2321                                              CONST_BITS + PASS1_BITS + 3) &
   2322                             RANGE_MASK];
   2323    outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp11,
   2324                                              CONST_BITS + PASS1_BITS + 3) &
   2325                             RANGE_MASK];
   2326    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp12,
   2327                                              CONST_BITS + PASS1_BITS + 3) &
   2328                             RANGE_MASK];
   2329    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp12,
   2330                                              CONST_BITS + PASS1_BITS + 3) &
   2331                             RANGE_MASK];
   2332    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp13,
   2333                                              CONST_BITS + PASS1_BITS + 3) &
   2334                             RANGE_MASK];
   2335    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp13,
   2336                                              CONST_BITS + PASS1_BITS + 3) &
   2337                             RANGE_MASK];
   2338    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp14,
   2339                                              CONST_BITS + PASS1_BITS + 3) &
   2340                             RANGE_MASK];
   2341    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp14,
   2342                                              CONST_BITS + PASS1_BITS + 3) &
   2343                             RANGE_MASK];
   2344    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp15,
   2345                                              CONST_BITS + PASS1_BITS + 3) &
   2346                             RANGE_MASK];
   2347    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp15,
   2348                                              CONST_BITS + PASS1_BITS + 3) &
   2349                             RANGE_MASK];
   2350    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp16,
   2351                                              CONST_BITS + PASS1_BITS + 3) &
   2352                             RANGE_MASK];
   2353    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp16,
   2354                                              CONST_BITS + PASS1_BITS + 3) &
   2355                             RANGE_MASK];
   2356    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp27,
   2357                                              CONST_BITS + PASS1_BITS + 3) &
   2358                             RANGE_MASK];
   2359 
   2360    wsptr += 8;         /* advance pointer to next row */
   2361  }
   2362 }
   2363 
   2364 
   2365 /*
   2366 * Perform dequantization and inverse DCT on one block of coefficients,
   2367 * producing a 16x16 output block.
   2368 *
   2369 * Optimized algorithm with 28 multiplications in the 1-D kernel.
   2370 * cK represents sqrt(2) * cos(K*pi/32).
        *
        * In the comments below, "cK[16] = cJ[8]" means the 16-point constant cK
        * has the same value as the constant cJ of an 8-point transform.  The
        * FIX_x_xxxxxxxxx names used for some of these are defined outside this
        * function.
        *
        * The transform runs as two passes over an 8x16 int workspace:
        *   Pass 1 walks the 8 input columns.  For each column it dequantizes
        *   the coefficients at row offsets DCTSIZE*0 .. DCTSIZE*7 and stores 16
        *   intermediate values into the workspace (row stride 8).
        *   Pass 2 walks the 16 workspace rows.  For each row it reads 8 values
        *   and stores 16 range-limited samples into the matching output row.
        *
        * cinfo       only passed to IDCT_range_limit() to get the lookup table.
        * compptr     its dct_table is read as an ISLOW_MULT_TYPE array of
        *             multipliers, indexed in step with coef_block.
        * coef_block  input coefficients; only read here.
        * output_buf  sample rows; rows 0..15 are written.
        * output_col  offset of the first of the 16 samples written in each row.
   2371 */
   2372 
   2373 GLOBAL(void)
   2374 _jpeg_idct_16x16(j_decompress_ptr cinfo, jpeg_component_info *compptr,
   2375                 JCOEFPTR coef_block, _JSAMPARRAY output_buf,
   2376                 JDIMENSION output_col)
   2377 {
         /* tmp20..tmp27 receive the even-part results.  tmp0..tmp3 and
          * tmp10..tmp13 are scratch during the even part and are then reused to
          * hold the eight odd-part results, so the statement order below is
          * significant.
          */
   2378  JLONG tmp0, tmp1, tmp2, tmp3, tmp10, tmp11, tmp12, tmp13;
   2379  JLONG tmp20, tmp21, tmp22, tmp23, tmp24, tmp25, tmp26, tmp27;
   2380  JLONG z1, z2, z3, z4;
   2381  JCOEFPTR inptr;
   2382  ISLOW_MULT_TYPE *quantptr;
   2383  int *wsptr;
   2384  _JSAMPROW outptr;
   2385  _JSAMPLE *range_limit = IDCT_range_limit(cinfo);
   2386  int ctr;
   2387  int workspace[8 * 16];        /* buffers data between passes */
   2388  SHIFT_TEMPS
   2389 
   2390  /* Pass 1: process columns from input, store into work array. */
   2391 
   2392  inptr = coef_block;
   2393  quantptr = (ISLOW_MULT_TYPE *)compptr->dct_table;
   2394  wsptr = workspace;
         /* inptr, quantptr and wsptr all advance by one element per iteration,
          * so each pass through the loop handles one column.
          */
   2395  for (ctr = 0; ctr < 8; ctr++, inptr++, quantptr++, wsptr++) {
   2396    /* Even part */
   2397 
   2398    tmp0 = DEQUANTIZE(inptr[DCTSIZE * 0], quantptr[DCTSIZE * 0]);
   2399    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
   2400    /* Add fudge factor here for final descale. */
           /* The constant is half of 2^(CONST_BITS - PASS1_BITS), the divisor
            * applied by the RIGHT_SHIFT at the end of this pass.  tmp0 feeds
            * tmp10..tmp13, and through them each of tmp20..tmp27 exactly once.
            */
   2401    tmp0 += ONE << (CONST_BITS - PASS1_BITS - 1);
   2402 
   2403    z1 = DEQUANTIZE(inptr[DCTSIZE * 4], quantptr[DCTSIZE * 4]);
   2404    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
   2405    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
   2406 
   2407    tmp10 = tmp0 + tmp1;
   2408    tmp11 = tmp0 - tmp1;
   2409    tmp12 = tmp0 + tmp2;
   2410    tmp13 = tmp0 - tmp2;
   2411 
   2412    z1 = DEQUANTIZE(inptr[DCTSIZE * 2], quantptr[DCTSIZE * 2]);
   2413    z2 = DEQUANTIZE(inptr[DCTSIZE * 6], quantptr[DCTSIZE * 6]);
   2414    z3 = z1 - z2;
           /* z4 must be computed before z3 is overwritten with its own product. */
   2415    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
   2416    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
   2417 
           /* tmp0..tmp3 are recycled here; their earlier values were already
            * folded into tmp10..tmp13.
            */
   2418    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
   2419    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
   2420    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
   2421    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
   2422 
   2423    tmp20 = tmp10 + tmp0;
   2424    tmp27 = tmp10 - tmp0;
   2425    tmp21 = tmp12 + tmp1;
   2426    tmp26 = tmp12 - tmp1;
   2427    tmp22 = tmp13 + tmp2;
   2428    tmp25 = tmp13 - tmp2;
   2429    tmp23 = tmp11 + tmp3;
   2430    tmp24 = tmp11 - tmp3;
   2431 
   2432    /* Odd part */
           /* From here tmp0..tmp3 and tmp10..tmp13 accumulate the odd-part
            * results that are paired with tmp20..tmp27 in the output stage.
            */
   2433 
   2434    z1 = DEQUANTIZE(inptr[DCTSIZE * 1], quantptr[DCTSIZE * 1]);
   2435    z2 = DEQUANTIZE(inptr[DCTSIZE * 3], quantptr[DCTSIZE * 3]);
   2436    z3 = DEQUANTIZE(inptr[DCTSIZE * 5], quantptr[DCTSIZE * 5]);
   2437    z4 = DEQUANTIZE(inptr[DCTSIZE * 7], quantptr[DCTSIZE * 7]);
   2438 
           /* tmp11 temporarily holds z1 + z3.  It is consumed by the c5 product
            * (tmp2) and then overwritten with its own c11 product.
            */
   2439    tmp11 = z1 + z3;
   2440 
   2441    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
   2442    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
   2443    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
   2444    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
   2445    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
   2446    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
   2447    tmp0  = tmp1 + tmp2 + tmp3 -
   2448            MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
   2449    tmp13 = tmp10 + tmp11 + tmp12 -
   2450            MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
           /* The row-1 term in z1 is no longer needed; z1 becomes scratch for
            * products that are shared between two results.
            */
   2451    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
   2452    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
   2453    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
   2454    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
   2455    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
   2456    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
           /* z2 becomes the sum of the row-3 and row-7 terms and is then itself
            * overwritten by shared products.
            */
   2457    z2    += z4;
   2458    z1    = MULTIPLY(z2, -FIX(0.666655658));       /* -c11 */
   2459    tmp1  += z1;
   2460    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
   2461    z2    = MULTIPLY(z2, -FIX(1.247225013));       /* -c5 */
   2462    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
   2463    tmp12 += z2;
   2464    z2    = MULTIPLY(z3 + z4, -FIX(1.353318001));  /* -c3 */
   2465    tmp2  += z2;
   2466    tmp3  += z2;
   2467    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
   2468    tmp10 += z2;
   2469    tmp11 += z2;
   2470 
   2471    /* Final output stage */
           /* Workspace rows k and 15-k share one even-part value and take the
            * odd-part value with opposite signs.  The factor 8 is the workspace
            * row stride.
            */
   2472 
   2473    wsptr[8 * 0]  = (int)RIGHT_SHIFT(tmp20 + tmp0,  CONST_BITS - PASS1_BITS);
   2474    wsptr[8 * 15] = (int)RIGHT_SHIFT(tmp20 - tmp0,  CONST_BITS - PASS1_BITS);
   2475    wsptr[8 * 1]  = (int)RIGHT_SHIFT(tmp21 + tmp1,  CONST_BITS - PASS1_BITS);
   2476    wsptr[8 * 14] = (int)RIGHT_SHIFT(tmp21 - tmp1,  CONST_BITS - PASS1_BITS);
   2477    wsptr[8 * 2]  = (int)RIGHT_SHIFT(tmp22 + tmp2,  CONST_BITS - PASS1_BITS);
   2478    wsptr[8 * 13] = (int)RIGHT_SHIFT(tmp22 - tmp2,  CONST_BITS - PASS1_BITS);
   2479    wsptr[8 * 3]  = (int)RIGHT_SHIFT(tmp23 + tmp3,  CONST_BITS - PASS1_BITS);
   2480    wsptr[8 * 12] = (int)RIGHT_SHIFT(tmp23 - tmp3,  CONST_BITS - PASS1_BITS);
   2481    wsptr[8 * 4]  = (int)RIGHT_SHIFT(tmp24 + tmp10, CONST_BITS - PASS1_BITS);
   2482    wsptr[8 * 11] = (int)RIGHT_SHIFT(tmp24 - tmp10, CONST_BITS - PASS1_BITS);
   2483    wsptr[8 * 5]  = (int)RIGHT_SHIFT(tmp25 + tmp11, CONST_BITS - PASS1_BITS);
   2484    wsptr[8 * 10] = (int)RIGHT_SHIFT(tmp25 - tmp11, CONST_BITS - PASS1_BITS);
   2485    wsptr[8 * 6]  = (int)RIGHT_SHIFT(tmp26 + tmp12, CONST_BITS - PASS1_BITS);
   2486    wsptr[8 * 9]  = (int)RIGHT_SHIFT(tmp26 - tmp12, CONST_BITS - PASS1_BITS);
   2487    wsptr[8 * 7]  = (int)RIGHT_SHIFT(tmp27 + tmp13, CONST_BITS - PASS1_BITS);
   2488    wsptr[8 * 8]  = (int)RIGHT_SHIFT(tmp27 - tmp13, CONST_BITS - PASS1_BITS);
   2489  }
   2490 
   2491  /* Pass 2: process 16 rows from work array, store into output array. */
         /* The arithmetic is the same kernel as pass 1 with the same constants;
          * only the input source (workspace instead of dequantized
          * coefficients), the rounding term and the final shift differ.
          */
   2492 
   2493  wsptr = workspace;
   2494  for (ctr = 0; ctr < 16; ctr++) {
   2495    outptr = output_buf[ctr] + output_col;
   2496 
   2497    /* Even part */
   2498 
   2499    /* Add fudge factor here for final descale. */
           /* ONE << (PASS1_BITS + 2), once shifted left by CONST_BITS on the
            * next line, is half of 2^(CONST_BITS + PASS1_BITS + 3), the divisor
            * applied by the RIGHT_SHIFT in the output stage of this pass.
            */
   2500    tmp0 = (JLONG)wsptr[0] + (ONE << (PASS1_BITS + 2));
   2501    tmp0 = LEFT_SHIFT(tmp0, CONST_BITS);
   2502 
   2503    z1 = (JLONG)wsptr[4];
   2504    tmp1 = MULTIPLY(z1, FIX(1.306562965));      /* c4[16] = c2[8] */
   2505    tmp2 = MULTIPLY(z1, FIX_0_541196100);       /* c12[16] = c6[8] */
   2506 
   2507    tmp10 = tmp0 + tmp1;
   2508    tmp11 = tmp0 - tmp1;
   2509    tmp12 = tmp0 + tmp2;
   2510    tmp13 = tmp0 - tmp2;
   2511 
   2512    z1 = (JLONG)wsptr[2];
   2513    z2 = (JLONG)wsptr[6];
   2514    z3 = z1 - z2;
   2515    z4 = MULTIPLY(z3, FIX(0.275899379));        /* c14[16] = c7[8] */
   2516    z3 = MULTIPLY(z3, FIX(1.387039845));        /* c2[16] = c1[8] */
   2517 
   2518    tmp0 = z3 + MULTIPLY(z2, FIX_2_562915447);  /* (c6+c2)[16] = (c3+c1)[8] */
   2519    tmp1 = z4 + MULTIPLY(z1, FIX_0_899976223);  /* (c6-c14)[16] = (c3-c7)[8] */
   2520    tmp2 = z3 - MULTIPLY(z1, FIX(0.601344887)); /* (c2-c10)[16] = (c1-c5)[8] */
   2521    tmp3 = z4 - MULTIPLY(z2, FIX(0.509795579)); /* (c10-c14)[16] = (c5-c7)[8] */
   2522 
   2523    tmp20 = tmp10 + tmp0;
   2524    tmp27 = tmp10 - tmp0;
   2525    tmp21 = tmp12 + tmp1;
   2526    tmp26 = tmp12 - tmp1;
   2527    tmp22 = tmp13 + tmp2;
   2528    tmp25 = tmp13 - tmp2;
   2529    tmp23 = tmp11 + tmp3;
   2530    tmp24 = tmp11 - tmp3;
   2531 
   2532    /* Odd part */
   2533 
   2534    z1 = (JLONG)wsptr[1];
   2535    z2 = (JLONG)wsptr[3];
   2536    z3 = (JLONG)wsptr[5];
   2537    z4 = (JLONG)wsptr[7];
   2538 
   2539    tmp11 = z1 + z3;
   2540 
   2541    tmp1  = MULTIPLY(z1 + z2, FIX(1.353318001));   /* c3 */
   2542    tmp2  = MULTIPLY(tmp11,   FIX(1.247225013));   /* c5 */
   2543    tmp3  = MULTIPLY(z1 + z4, FIX(1.093201867));   /* c7 */
   2544    tmp10 = MULTIPLY(z1 - z4, FIX(0.897167586));   /* c9 */
   2545    tmp11 = MULTIPLY(tmp11,   FIX(0.666655658));   /* c11 */
   2546    tmp12 = MULTIPLY(z1 - z2, FIX(0.410524528));   /* c13 */
   2547    tmp0  = tmp1 + tmp2 + tmp3 -
   2548            MULTIPLY(z1, FIX(2.286341144));        /* c7+c5+c3-c1 */
   2549    tmp13 = tmp10 + tmp11 + tmp12 -
   2550            MULTIPLY(z1, FIX(1.835730603));        /* c9+c11+c13-c15 */
   2551    z1    = MULTIPLY(z2 + z3, FIX(0.138617169));   /* c15 */
   2552    tmp1  += z1 + MULTIPLY(z2, FIX(0.071888074));  /* c9+c11-c3-c15 */
   2553    tmp2  += z1 - MULTIPLY(z3, FIX(1.125726048));  /* c5+c7+c15-c3 */
   2554    z1    = MULTIPLY(z3 - z2, FIX(1.407403738));   /* c1 */
   2555    tmp11 += z1 - MULTIPLY(z3, FIX(0.766367282));  /* c1+c11-c9-c13 */
   2556    tmp12 += z1 + MULTIPLY(z2, FIX(1.971951411));  /* c1+c5+c13-c7 */
   2557    z2    += z4;
   2558    z1    = MULTIPLY(z2, -FIX(0.666655658));       /* -c11 */
   2559    tmp1  += z1;
   2560    tmp3  += z1 + MULTIPLY(z4, FIX(1.065388962));  /* c3+c11+c15-c7 */
   2561    z2    = MULTIPLY(z2, -FIX(1.247225013));       /* -c5 */
   2562    tmp10 += z2 + MULTIPLY(z4, FIX(3.141271809));  /* c1+c5+c9-c13 */
   2563    tmp12 += z2;
   2564    z2    = MULTIPLY(z3 + z4, -FIX(1.353318001));  /* -c3 */
   2565    tmp2  += z2;
   2566    tmp3  += z2;
   2567    z2    = MULTIPLY(z4 - z3, FIX(0.410524528));   /* c13 */
   2568    tmp10 += z2;
   2569    tmp11 += z2;
   2570 
   2571    /* Final output stage */
           /* Samples k and 15-k are the sum and difference of one even/odd
            * pair.  Each descaled value is masked with RANGE_MASK before it
            * indexes range_limit.
            * NOTE(review): range_limit is presumably a clamping table set up
            * elsewhere; its contents are not visible in this function.
            */
   2572 
   2573    outptr[0]  = range_limit[(int)RIGHT_SHIFT(tmp20 + tmp0,
   2574                                              CONST_BITS + PASS1_BITS + 3) &
   2575                             RANGE_MASK];
   2576    outptr[15] = range_limit[(int)RIGHT_SHIFT(tmp20 - tmp0,
   2577                                              CONST_BITS + PASS1_BITS + 3) &
   2578                             RANGE_MASK];
   2579    outptr[1]  = range_limit[(int)RIGHT_SHIFT(tmp21 + tmp1,
   2580                                              CONST_BITS + PASS1_BITS + 3) &
   2581                             RANGE_MASK];
   2582    outptr[14] = range_limit[(int)RIGHT_SHIFT(tmp21 - tmp1,
   2583                                              CONST_BITS + PASS1_BITS + 3) &
   2584                             RANGE_MASK];
   2585    outptr[2]  = range_limit[(int)RIGHT_SHIFT(tmp22 + tmp2,
   2586                                              CONST_BITS + PASS1_BITS + 3) &
   2587                             RANGE_MASK];
   2588    outptr[13] = range_limit[(int)RIGHT_SHIFT(tmp22 - tmp2,
   2589                                              CONST_BITS + PASS1_BITS + 3) &
   2590                             RANGE_MASK];
   2591    outptr[3]  = range_limit[(int)RIGHT_SHIFT(tmp23 + tmp3,
   2592                                              CONST_BITS + PASS1_BITS + 3) &
   2593                             RANGE_MASK];
   2594    outptr[12] = range_limit[(int)RIGHT_SHIFT(tmp23 - tmp3,
   2595                                              CONST_BITS + PASS1_BITS + 3) &
   2596                             RANGE_MASK];
   2597    outptr[4]  = range_limit[(int)RIGHT_SHIFT(tmp24 + tmp10,
   2598                                              CONST_BITS + PASS1_BITS + 3) &
   2599                             RANGE_MASK];
   2600    outptr[11] = range_limit[(int)RIGHT_SHIFT(tmp24 - tmp10,
   2601                                              CONST_BITS + PASS1_BITS + 3) &
   2602                             RANGE_MASK];
   2603    outptr[5]  = range_limit[(int)RIGHT_SHIFT(tmp25 + tmp11,
   2604                                              CONST_BITS + PASS1_BITS + 3) &
   2605                             RANGE_MASK];
   2606    outptr[10] = range_limit[(int)RIGHT_SHIFT(tmp25 - tmp11,
   2607                                              CONST_BITS + PASS1_BITS + 3) &
   2608                             RANGE_MASK];
   2609    outptr[6]  = range_limit[(int)RIGHT_SHIFT(tmp26 + tmp12,
   2610                                              CONST_BITS + PASS1_BITS + 3) &
   2611                             RANGE_MASK];
   2612    outptr[9]  = range_limit[(int)RIGHT_SHIFT(tmp26 - tmp12,
   2613                                              CONST_BITS + PASS1_BITS + 3) &
   2614                             RANGE_MASK];
   2615    outptr[7]  = range_limit[(int)RIGHT_SHIFT(tmp27 + tmp13,
   2616                                              CONST_BITS + PASS1_BITS + 3) &
   2617                             RANGE_MASK];
   2618    outptr[8]  = range_limit[(int)RIGHT_SHIFT(tmp27 - tmp13,
   2619                                              CONST_BITS + PASS1_BITS + 3) &
   2620                             RANGE_MASK];
   2621 
   2622    wsptr += 8;         /* advance pointer to next row */
   2623  }
   2624 }
   2625 
   2626 #endif /* IDCT_SCALING_SUPPORTED */
   2627 #endif /* DCT_ISLOW_SUPPORTED */