tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jrevdct.c (44393B)


      1 /*
      2 * This file is part of the Independent JPEG Group's software.
      3 *
      4 * The authors make NO WARRANTY or representation, either express or implied,
      5 * with respect to this software, its quality, accuracy, merchantability, or
      6 * fitness for a particular purpose.  This software is provided "AS IS", and
      7 * you, its user, assume the entire risk as to its quality and accuracy.
      8 *
      9 * This software is copyright (C) 1991, 1992, Thomas G. Lane.
     10 * All Rights Reserved except as specified below.
     11 *
     12 * Permission is hereby granted to use, copy, modify, and distribute this
     13 * software (or portions thereof) for any purpose, without fee, subject to
     14 * these conditions:
     15 * (1) If any part of the source code for this software is distributed, then
     16 * this README file must be included, with this copyright and no-warranty
     17 * notice unaltered; and any additions, deletions, or changes to the original
     18 * files must be clearly indicated in accompanying documentation.
     19 * (2) If only executable code is distributed, then the accompanying
     20 * documentation must state that "this software is based in part on the work
     21 * of the Independent JPEG Group".
     22 * (3) Permission for use of this software is granted only if the user accepts
     23 * full responsibility for any undesirable consequences; the authors accept
     24 * NO LIABILITY for damages of any kind.
     25 *
     26 * These conditions apply to any software derived from or based on the IJG
     27 * code, not just to the unmodified library.  If you use our work, you ought
     28 * to acknowledge us.
     29 *
     30 * Permission is NOT granted for the use of any IJG author's name or company
     31 * name in advertising or publicity relating to this software or products
     32 * derived from it.  This software may be referred to only as "the Independent
     33 * JPEG Group's software".
     34 *
     35 * We specifically permit and encourage the use of this software as the basis
     36 * of commercial products, provided that all warranty or liability claims are
     37 * assumed by the product vendor.
     38 *
     39 * This file contains the basic inverse-DCT transformation subroutine.
     40 *
     41 * This implementation is based on an algorithm described in
     42 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
     43 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
     44 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
     45 * The primary algorithm described there uses 11 multiplies and 29 adds.
     46 * We use their alternate method with 12 multiplies and 32 adds.
     47 * The advantage of this method is that no data path contains more than one
     48 * multiplication; this allows a very simple and accurate implementation in
     49 * scaled fixed-point arithmetic, with a minimal number of shifts.
     50 *
     51 * I've made lots of modifications to attempt to take advantage of the
     52 * sparse nature of the DCT matrices we're getting.  Although the logic
     53 * is cumbersome, it's straightforward and the resulting code is much
     54 * faster.
     55 *
     56 * A better way to do this would be to pass in the DCT block as a sparse
     57 * matrix, perhaps with the different cases encoded.
     58 */
     59 
     60 /**
     61 * @file
     62 * Independent JPEG Group's LLM (Loeffler-Ligtenberg-Moschytz) IDCT.
     63 */
     64 
     65 #include <stddef.h>
     66 #include <stdint.h>
     67 
     68 #include "libavutil/intreadwrite.h"
     69 
     70 #include "dct.h"
     71 #include "idctdsp.h"
     72 
     73 #define EIGHT_BIT_SAMPLES /* selects PASS1_BITS == 2 and allows the 16-bit MULTIPLY variants below */
     74 
     75 #define DCTSIZE 8 /* the transform works on DCTSIZE x DCTSIZE blocks; only 8 is supported */
     76 #define DCTSIZE2 64 /* DCTSIZE squared: number of coefficients in one block */
     77 
     78 #define GLOBAL /* expands to nothing; not referenced in the visible part of this file */
     79 
     80 #define RIGHT_SHIFT(x, n) ((x) >> (n)) /* plain >>; DESCALE assumes this rounds toward minus infinity */
     81 
     82 typedef int16_t DCTBLOCK[DCTSIZE2]; /* one block of DCTSIZE2 16-bit coefficients */
     83 
     84 #define CONST_BITS 13 /* fractional bits kept in the fixed-point constants (CONST_SCALE == 1 << CONST_BITS) */
     85 
     86 /*
     87 * This routine is specialized to the case DCTSIZE = 8.
     88 */
     89 
     90 #if DCTSIZE != 8
     91  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
     92 #endif
     93 
     94 
     95 /*
     96 * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
     97 * on each column.  Direct algorithms are also available, but they are
     98 * much more complex and seem not to be any faster when reduced to code.
     99 *
    100 * The details of the scaling are as follows:
    101 *
    102 * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
    103 * larger than the true IDCT outputs.  The final outputs are therefore
    104 * a factor of N larger than desired; since N=8 this can be cured by
    105 * a simple right shift at the end of the algorithm.  The advantage of
    106 * this arrangement is that we save two multiplications per 1-D IDCT,
    107 * because the y0 and y4 inputs need not be divided by sqrt(N).
    108 *
    109 * We have to do addition and subtraction of the integer inputs, which
    110 * is no problem, and multiplication by fractional constants, which is
    111 * a problem to do in integer arithmetic.  We multiply all the constants
    112 * by CONST_SCALE and convert them to integer constants (thus retaining
    113 * CONST_BITS bits of precision in the constants).  After doing a
    114 * multiplication we have to divide the product by CONST_SCALE, with proper
    115 * rounding, to produce the correct output.  This division can be done
    116 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
    117 * as long as possible so that partial sums can be added together with
    118 * full fractional precision.
    119 *
    120 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
    121 * they are represented to better-than-integral precision.  These outputs
    122 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
    123 * with the recommended scaling.  (To scale up 12-bit sample data further, an
    124 * intermediate int32 array would be needed.)
    125 *
    126 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
    127 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
    128 * shows that the values given below are the most effective.
    129 */
    130 
    131 #ifdef EIGHT_BIT_SAMPLES
    132 #define PASS1_BITS  2
    133 #else
    134 #define PASS1_BITS  1   /* lose a little precision to avoid overflow */
    135 #endif
    136 
    137 #define ONE         ((int32_t) 1)
    138 
    139 #define CONST_SCALE (ONE << CONST_BITS)
    140 
    141 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
    142 * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
    143 * you will pay a significant penalty in run time.  In that case, figure
    144 * the correct integer constant values and insert them by hand.
    145 */
    146 
    147 /* NOTE: FIX is no longer used; all FIX_* constants below are precomputed. */
    148 #define FIX(x)  ((int32_t) ((x) * CONST_SCALE + 0.5))
    149 
    150 /* Descale and correctly round an int32_t value that's scaled by N bits.
    151 * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
    152 * the fudge factor is correct for either sign of X.
    153 */
    154 
    155 #define DESCALE(x,n)  RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
    156 
    157 /* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
    158 * For 8-bit samples with the recommended scaling, all the variable
    159 * and constant values involved are no more than 16 bits wide, so a
    160 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
    161 * this provides a useful speedup on many machines.
    162 * There is no way to specify a 16x16->32 multiply in portable C, but
    163 * some C compilers will do the right thing if you provide the correct
    164 * combination of casts.
    165 * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
    166 */
    167 
    168 #ifdef EIGHT_BIT_SAMPLES
    169 #ifdef SHORTxSHORT_32           /* may work if 'int' is 32 bits */
    170 #define MULTIPLY(var,const)  (((int16_t) (var)) * ((int16_t) (const)))
    171 #endif
    172 #ifdef SHORTxLCONST_32          /* known to work with Microsoft C 6.0 */
    173 #define MULTIPLY(var,const)  (((int16_t) (var)) * ((int32_t) (const)))
    174 #endif
    175 #endif
    176 
    177 #ifndef MULTIPLY                /* default definition */
    178 #define MULTIPLY(var,const)  ((var) * (const))
    179 #endif
    180 
    181 
    182 /*
    183  Unlike our decoder, where we approximate the FIX constants, we need to use
    184 exact ones here, or successive P-frames will drift too much with reference-frame coding.
    185 */
    186 #define FIX_0_211164243 1730
    187 #define FIX_0_275899380 2260
    188 #define FIX_0_298631336 2446
    189 #define FIX_0_390180644 3196
    190 #define FIX_0_509795579 4176
    191 #define FIX_0_541196100 4433
    192 #define FIX_0_601344887 4926
    193 #define FIX_0_765366865 6270
    194 #define FIX_0_785694958 6436
    195 #define FIX_0_899976223 7373
    196 #define FIX_1_061594337 8697
    197 #define FIX_1_111140466 9102
    198 #define FIX_1_175875602 9633
    199 #define FIX_1_306562965 10703
    200 #define FIX_1_387039845 11363
    201 #define FIX_1_451774981 11893
    202 #define FIX_1_501321110 12299
    203 #define FIX_1_662939225 13623
    204 #define FIX_1_847759065 15137
    205 #define FIX_1_961570560 16069
    206 #define FIX_2_053119869 16819
    207 #define FIX_2_172734803 17799
    208 #define FIX_2_562915447 20995
    209 #define FIX_3_072711026 25172
    210 
    211 /*
    212 * Perform the inverse DCT on one block of coefficients.
    213 */
    214 
    215 void ff_j_rev_dct(DCTBLOCK data)
    216 {
    217  int32_t tmp0, tmp1, tmp2, tmp3;
    218  int32_t tmp10, tmp11, tmp12, tmp13;
    219  int32_t z1, z2, z3, z4, z5;
    220  int32_t d0, d1, d2, d3, d4, d5, d6, d7;
    221  register int16_t *dataptr;
    222  int rowctr;
    223 
    224  /* Pass 1: process rows. */
    225  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
    226  /* furthermore, we scale the results by 2**PASS1_BITS. */
    227 
    228  dataptr = data;
    229 
    230  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
    231    /* Due to quantization, we will usually find that many of the input
    232     * coefficients are zero, especially the AC terms.  We can exploit this
    233     * by short-circuiting the IDCT calculation for any row in which all
    234     * the AC terms are zero.  In that case each output is equal to the
    235     * DC coefficient (with scale factor as needed).
    236     * With typical images and quantization tables, half or more of the
    237     * row DCT calculations can be simplified this way.
    238     */
    239 
    240    register uint8_t *idataptr = (uint8_t*)dataptr;
    241 
    242    /* WARNING: we do the same permutation as MMX idct to simplify the
    243       video core */
    244    d0 = dataptr[0];
    245    d2 = dataptr[1];
    246    d4 = dataptr[2];
    247    d6 = dataptr[3];
    248    d1 = dataptr[4];
    249    d3 = dataptr[5];
    250    d5 = dataptr[6];
    251    d7 = dataptr[7];
    252 
    253    if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
    254      /* AC terms all zero */
    255      if (d0) {
    256          /* Compute a 32 bit value to assign. */
    257          int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS));
    258          register unsigned v = (dcval & 0xffff) | ((uint32_t)dcval << 16);
    259 
    260          AV_WN32A(&idataptr[ 0], v);
    261          AV_WN32A(&idataptr[ 4], v);
    262          AV_WN32A(&idataptr[ 8], v);
    263          AV_WN32A(&idataptr[12], v);
    264      }
    265 
    266      dataptr += DCTSIZE;       /* advance pointer to next row */
    267      continue;
    268    }
    269 
    270    /* Even part: reverse the even part of the forward DCT. */
    271    /* The rotator is sqrt(2)*c(-6). */
    272 {
    273    if (d6) {
    274            if (d2) {
    275                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
    276                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
    277                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
    278                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
    279 
    280                    tmp0 = (d0 + d4) * CONST_SCALE;
    281                    tmp1 = (d0 - d4) * CONST_SCALE;
    282 
    283                    tmp10 = tmp0 + tmp3;
    284                    tmp13 = tmp0 - tmp3;
    285                    tmp11 = tmp1 + tmp2;
    286                    tmp12 = tmp1 - tmp2;
    287            } else {
    288                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
    289                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
    290                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
    291 
    292                    tmp0 = (d0 + d4) * CONST_SCALE;
    293                    tmp1 = (d0 - d4) * CONST_SCALE;
    294 
    295                    tmp10 = tmp0 + tmp3;
    296                    tmp13 = tmp0 - tmp3;
    297                    tmp11 = tmp1 + tmp2;
    298                    tmp12 = tmp1 - tmp2;
    299            }
    300    } else {
    301            if (d2) {
    302                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
    303                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
    304                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
    305 
    306                    tmp0 = (d0 + d4) * CONST_SCALE;
    307                    tmp1 = (d0 - d4) * CONST_SCALE;
    308 
    309                    tmp10 = tmp0 + tmp3;
    310                    tmp13 = tmp0 - tmp3;
    311                    tmp11 = tmp1 + tmp2;
    312                    tmp12 = tmp1 - tmp2;
    313            } else {
    314                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
    315                    tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
    316                    tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
    317            }
    318      }
    319 
    320    /* Odd part per figure 8; the matrix is unitary and hence its
    321     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    322     */
    323 
    324    if (d7) {
    325        if (d5) {
    326            if (d3) {
    327                if (d1) {
    328                    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
    329                    z1 = d7 + d1;
    330                    z2 = d5 + d3;
    331                    z3 = d7 + d3;
    332                    z4 = d5 + d1;
    333                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
    334 
    335                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
    336                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
    337                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
    338                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
    339                    z1 = MULTIPLY(-z1, FIX_0_899976223);
    340                    z2 = MULTIPLY(-z2, FIX_2_562915447);
    341                    z3 = MULTIPLY(-z3, FIX_1_961570560);
    342                    z4 = MULTIPLY(-z4, FIX_0_390180644);
    343 
    344                    z3 += z5;
    345                    z4 += z5;
    346 
    347                    tmp0 += z1 + z3;
    348                    tmp1 += z2 + z4;
    349                    tmp2 += z2 + z3;
    350                    tmp3 += z1 + z4;
    351                } else {
    352                    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
    353                    z2 = d5 + d3;
    354                    z3 = d7 + d3;
    355                    z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
    356 
    357                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
    358                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
    359                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
    360                    z1 = MULTIPLY(-d7, FIX_0_899976223);
    361                    z2 = MULTIPLY(-z2, FIX_2_562915447);
    362                    z3 = MULTIPLY(-z3, FIX_1_961570560);
    363                    z4 = MULTIPLY(-d5, FIX_0_390180644);
    364 
    365                    z3 += z5;
    366                    z4 += z5;
    367 
    368                    tmp0 += z1 + z3;
    369                    tmp1 += z2 + z4;
    370                    tmp2 += z2 + z3;
    371                    tmp3 = z1 + z4;
    372                }
    373            } else {
    374                if (d1) {
    375                    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
    376                    z1 = d7 + d1;
    377                    z4 = d5 + d1;
    378                    z5 = MULTIPLY(d7 + z4, FIX_1_175875602);
    379 
    380                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
    381                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
    382                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
    383                    z1 = MULTIPLY(-z1, FIX_0_899976223);
    384                    z2 = MULTIPLY(-d5, FIX_2_562915447);
    385                    z3 = MULTIPLY(-d7, FIX_1_961570560);
    386                    z4 = MULTIPLY(-z4, FIX_0_390180644);
    387 
    388                    z3 += z5;
    389                    z4 += z5;
    390 
    391                    tmp0 += z1 + z3;
    392                    tmp1 += z2 + z4;
    393                    tmp2 = z2 + z3;
    394                    tmp3 += z1 + z4;
    395                } else {
    396                    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
    397                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
    398                    z1 = MULTIPLY(-d7, FIX_0_899976223);
    399                    z3 = MULTIPLY(-d7, FIX_1_961570560);
    400                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
    401                    z2 = MULTIPLY(-d5, FIX_2_562915447);
    402                    z4 = MULTIPLY(-d5, FIX_0_390180644);
    403                    z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
    404 
    405                    z3 += z5;
    406                    z4 += z5;
    407 
    408                    tmp0 += z3;
    409                    tmp1 += z4;
    410                    tmp2 = z2 + z3;
    411                    tmp3 = z1 + z4;
    412                }
    413            }
    414        } else {
    415            if (d3) {
    416                if (d1) {
    417                    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
    418                    z1 = d7 + d1;
    419                    z3 = d7 + d3;
    420                    z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
    421 
    422                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
    423                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
    424                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
    425                    z1 = MULTIPLY(-z1, FIX_0_899976223);
    426                    z2 = MULTIPLY(-d3, FIX_2_562915447);
    427                    z3 = MULTIPLY(-z3, FIX_1_961570560);
    428                    z4 = MULTIPLY(-d1, FIX_0_390180644);
    429 
    430                    z3 += z5;
    431                    z4 += z5;
    432 
    433                    tmp0 += z1 + z3;
    434                    tmp1 = z2 + z4;
    435                    tmp2 += z2 + z3;
    436                    tmp3 += z1 + z4;
    437                } else {
    438                    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
    439                    z3 = d7 + d3;
    440 
    441                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
    442                    z1 = MULTIPLY(-d7, FIX_0_899976223);
    443                    tmp2 = MULTIPLY(d3, FIX_0_509795579);
    444                    z2 = MULTIPLY(-d3, FIX_2_562915447);
    445                    z5 = MULTIPLY(z3, FIX_1_175875602);
    446                    z3 = MULTIPLY(-z3, FIX_0_785694958);
    447 
    448                    tmp0 += z3;
    449                    tmp1 = z2 + z5;
    450                    tmp2 += z3;
    451                    tmp3 = z1 + z5;
    452                }
    453            } else {
    454                if (d1) {
    455                    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
    456                    z1 = d7 + d1;
    457                    z5 = MULTIPLY(z1, FIX_1_175875602);
    458 
    459                    z1 = MULTIPLY(z1, FIX_0_275899380);
    460                    z3 = MULTIPLY(-d7, FIX_1_961570560);
    461                    tmp0 = MULTIPLY(-d7, FIX_1_662939225);
    462                    z4 = MULTIPLY(-d1, FIX_0_390180644);
    463                    tmp3 = MULTIPLY(d1, FIX_1_111140466);
    464 
    465                    tmp0 += z1;
    466                    tmp1 = z4 + z5;
    467                    tmp2 = z3 + z5;
    468                    tmp3 += z1;
    469                } else {
    470                    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
    471                    tmp0 = MULTIPLY(-d7, FIX_1_387039845);
    472                    tmp1 = MULTIPLY(d7, FIX_1_175875602);
    473                    tmp2 = MULTIPLY(-d7, FIX_0_785694958);
    474                    tmp3 = MULTIPLY(d7, FIX_0_275899380);
    475                }
    476            }
    477        }
    478    } else {
    479        if (d5) {
    480            if (d3) {
    481                if (d1) {
    482                    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
    483                    z2 = d5 + d3;
    484                    z4 = d5 + d1;
    485                    z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
    486 
    487                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
    488                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
    489                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
    490                    z1 = MULTIPLY(-d1, FIX_0_899976223);
    491                    z2 = MULTIPLY(-z2, FIX_2_562915447);
    492                    z3 = MULTIPLY(-d3, FIX_1_961570560);
    493                    z4 = MULTIPLY(-z4, FIX_0_390180644);
    494 
    495                    z3 += z5;
    496                    z4 += z5;
    497 
    498                    tmp0 = z1 + z3;
    499                    tmp1 += z2 + z4;
    500                    tmp2 += z2 + z3;
    501                    tmp3 += z1 + z4;
    502                } else {
    503                    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
    504                    z2 = d5 + d3;
    505 
    506                    z5 = MULTIPLY(z2, FIX_1_175875602);
    507                    tmp1 = MULTIPLY(d5, FIX_1_662939225);
    508                    z4 = MULTIPLY(-d5, FIX_0_390180644);
    509                    z2 = MULTIPLY(-z2, FIX_1_387039845);
    510                    tmp2 = MULTIPLY(d3, FIX_1_111140466);
    511                    z3 = MULTIPLY(-d3, FIX_1_961570560);
    512 
    513                    tmp0 = z3 + z5;
    514                    tmp1 += z2;
    515                    tmp2 += z2;
    516                    tmp3 = z4 + z5;
    517                }
    518            } else {
    519                if (d1) {
    520                    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
    521                    z4 = d5 + d1;
    522 
    523                    z5 = MULTIPLY(z4, FIX_1_175875602);
    524                    z1 = MULTIPLY(-d1, FIX_0_899976223);
    525                    tmp3 = MULTIPLY(d1, FIX_0_601344887);
    526                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
    527                    z2 = MULTIPLY(-d5, FIX_2_562915447);
    528                    z4 = MULTIPLY(z4, FIX_0_785694958);
    529 
    530                    tmp0 = z1 + z5;
    531                    tmp1 += z4;
    532                    tmp2 = z2 + z5;
    533                    tmp3 += z4;
    534                } else {
    535                    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
    536                    tmp0 = MULTIPLY(d5, FIX_1_175875602);
    537                    tmp1 = MULTIPLY(d5, FIX_0_275899380);
    538                    tmp2 = MULTIPLY(-d5, FIX_1_387039845);
    539                    tmp3 = MULTIPLY(d5, FIX_0_785694958);
    540                }
    541            }
    542        } else {
    543            if (d3) {
    544                if (d1) {
    545                    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
    546                    z5 = d1 + d3;
    547                    tmp3 = MULTIPLY(d1, FIX_0_211164243);
    548                    tmp2 = MULTIPLY(-d3, FIX_1_451774981);
    549                    z1 = MULTIPLY(d1, FIX_1_061594337);
    550                    z2 = MULTIPLY(-d3, FIX_2_172734803);
    551                    z4 = MULTIPLY(z5, FIX_0_785694958);
    552                    z5 = MULTIPLY(z5, FIX_1_175875602);
    553 
    554                    tmp0 = z1 - z4;
    555                    tmp1 = z2 + z4;
    556                    tmp2 += z5;
    557                    tmp3 += z5;
    558                } else {
    559                    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
    560                    tmp0 = MULTIPLY(-d3, FIX_0_785694958);
    561                    tmp1 = MULTIPLY(-d3, FIX_1_387039845);
    562                    tmp2 = MULTIPLY(-d3, FIX_0_275899380);
    563                    tmp3 = MULTIPLY(d3, FIX_1_175875602);
    564                }
    565            } else {
    566                if (d1) {
    567                    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
    568                    tmp0 = MULTIPLY(d1, FIX_0_275899380);
    569                    tmp1 = MULTIPLY(d1, FIX_0_785694958);
    570                    tmp2 = MULTIPLY(d1, FIX_1_175875602);
    571                    tmp3 = MULTIPLY(d1, FIX_1_387039845);
    572                } else {
    573                    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
    574                    tmp0 = tmp1 = tmp2 = tmp3 = 0;
    575                }
    576            }
    577        }
    578    }
    579 }
    580    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    581 
    582    dataptr[0] = (int16_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
    583    dataptr[7] = (int16_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
    584    dataptr[1] = (int16_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
    585    dataptr[6] = (int16_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
    586    dataptr[2] = (int16_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
    587    dataptr[5] = (int16_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
    588    dataptr[3] = (int16_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
    589    dataptr[4] = (int16_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
    590 
    591    dataptr += DCTSIZE;         /* advance pointer to next row */
    592  }
    593 
    594  /* Pass 2: process columns. */
    595  /* Note that we must descale the results by a factor of 8 == 2**3, */
    596  /* and also undo the PASS1_BITS scaling. */
    597 
    598  dataptr = data;
    599  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
    600    /* Columns of zeroes can be exploited in the same way as we did with rows.
    601     * However, the row calculation has created many nonzero AC terms, so the
    602     * simplification applies less often (typically 5% to 10% of the time).
    603     * On machines with very fast multiplication, it's possible that the
    604     * test takes more time than it's worth.  In that case this section
    605     * may be commented out.
    606     */
    607 
    608    d0 = dataptr[DCTSIZE*0];
    609    d1 = dataptr[DCTSIZE*1];
    610    d2 = dataptr[DCTSIZE*2];
    611    d3 = dataptr[DCTSIZE*3];
    612    d4 = dataptr[DCTSIZE*4];
    613    d5 = dataptr[DCTSIZE*5];
    614    d6 = dataptr[DCTSIZE*6];
    615    d7 = dataptr[DCTSIZE*7];
    616 
    617    /* Even part: reverse the even part of the forward DCT. */
    618    /* The rotator is sqrt(2)*c(-6). */
    619    if (d6) {
    620            if (d2) {
    621                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
    622                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
    623                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
    624                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
    625 
    626                    tmp0 = (d0 + d4) * CONST_SCALE;
    627                    tmp1 = (d0 - d4) * CONST_SCALE;
    628 
    629                    tmp10 = tmp0 + tmp3;
    630                    tmp13 = tmp0 - tmp3;
    631                    tmp11 = tmp1 + tmp2;
    632                    tmp12 = tmp1 - tmp2;
    633            } else {
    634                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
    635                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
    636                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
    637 
    638                    tmp0 = (d0 + d4) * CONST_SCALE;
    639                    tmp1 = (d0 - d4) * CONST_SCALE;
    640 
    641                    tmp10 = tmp0 + tmp3;
    642                    tmp13 = tmp0 - tmp3;
    643                    tmp11 = tmp1 + tmp2;
    644                    tmp12 = tmp1 - tmp2;
    645            }
    646    } else {
    647            if (d2) {
    648                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
    649                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
    650                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
    651 
    652                    tmp0 = (d0 + d4) * CONST_SCALE;
    653                    tmp1 = (d0 - d4) * CONST_SCALE;
    654 
    655                    tmp10 = tmp0 + tmp3;
    656                    tmp13 = tmp0 - tmp3;
    657                    tmp11 = tmp1 + tmp2;
    658                    tmp12 = tmp1 - tmp2;
    659            } else {
    660                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
    661                    tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
    662                    tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
    663            }
    664    }
    665 
    666    /* Odd part per figure 8; the matrix is unitary and hence its
    667     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
    668     */
    669    if (d7) {
    670        if (d5) {
    671            if (d3) {
    672                if (d1) {
    673                    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
    674                    z1 = d7 + d1;
    675                    z2 = d5 + d3;
    676                    z3 = d7 + d3;
    677                    z4 = d5 + d1;
    678                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
    679 
    680                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
    681                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
    682                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
    683                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
    684                    z1 = MULTIPLY(-z1, FIX_0_899976223);
    685                    z2 = MULTIPLY(-z2, FIX_2_562915447);
    686                    z3 = MULTIPLY(-z3, FIX_1_961570560);
    687                    z4 = MULTIPLY(-z4, FIX_0_390180644);
    688 
    689                    z3 += z5;
    690                    z4 += z5;
    691 
    692                    tmp0 += z1 + z3;
    693                    tmp1 += z2 + z4;
    694                    tmp2 += z2 + z3;
    695                    tmp3 += z1 + z4;
    696                } else {
    697                    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
    698                    z2 = d5 + d3;
    699                    z3 = d7 + d3;
    700                    z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
    701 
    702                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
    703                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
    704                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
    705                    z1 = MULTIPLY(-d7, FIX_0_899976223);
    706                    z2 = MULTIPLY(-z2, FIX_2_562915447);
    707                    z3 = MULTIPLY(-z3, FIX_1_961570560);
    708                    z4 = MULTIPLY(-d5, FIX_0_390180644);
    709 
    710                    z3 += z5;
    711                    z4 += z5;
    712 
    713                    tmp0 += z1 + z3;
    714                    tmp1 += z2 + z4;
    715                    tmp2 += z2 + z3;
    716                    tmp3 = z1 + z4;
    717                }
    718            } else {
    719                if (d1) {
    720                    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
    721                    z1 = d7 + d1;
    722                    z3 = d7;
    723                    z4 = d5 + d1;
    724                    z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
    725 
    726                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
    727                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
    728                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
    729                    z1 = MULTIPLY(-z1, FIX_0_899976223);
    730                    z2 = MULTIPLY(-d5, FIX_2_562915447);
    731                    z3 = MULTIPLY(-d7, FIX_1_961570560);
    732                    z4 = MULTIPLY(-z4, FIX_0_390180644);
    733 
    734                    z3 += z5;
    735                    z4 += z5;
    736 
    737                    tmp0 += z1 + z3;
    738                    tmp1 += z2 + z4;
    739                    tmp2 = z2 + z3;
    740                    tmp3 += z1 + z4;
    741                } else {
    742                    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
    743                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
    744                    z1 = MULTIPLY(-d7, FIX_0_899976223);
    745                    z3 = MULTIPLY(-d7, FIX_1_961570560);
    746                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
    747                    z2 = MULTIPLY(-d5, FIX_2_562915447);
    748                    z4 = MULTIPLY(-d5, FIX_0_390180644);
    749                    z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
    750 
    751                    z3 += z5;
    752                    z4 += z5;
    753 
    754                    tmp0 += z3;
    755                    tmp1 += z4;
    756                    tmp2 = z2 + z3;
    757                    tmp3 = z1 + z4;
    758                }
    759            }
    760        } else {
    761            if (d3) {
    762                if (d1) {
    763                    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
    764                    z1 = d7 + d1;
    765                    z3 = d7 + d3;
    766                    z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
    767 
    768                    tmp0 = MULTIPLY(d7, FIX_0_298631336);
    769                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
    770                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
    771                    z1 = MULTIPLY(-z1, FIX_0_899976223);
    772                    z2 = MULTIPLY(-d3, FIX_2_562915447);
    773                    z3 = MULTIPLY(-z3, FIX_1_961570560);
    774                    z4 = MULTIPLY(-d1, FIX_0_390180644);
    775 
    776                    z3 += z5;
    777                    z4 += z5;
    778 
    779                    tmp0 += z1 + z3;
    780                    tmp1 = z2 + z4;
    781                    tmp2 += z2 + z3;
    782                    tmp3 += z1 + z4;
    783                } else {
    784                    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
    785                    z3 = d7 + d3;
    786 
    787                    tmp0 = MULTIPLY(-d7, FIX_0_601344887);
    788                    z1 = MULTIPLY(-d7, FIX_0_899976223);
    789                    tmp2 = MULTIPLY(d3, FIX_0_509795579);
    790                    z2 = MULTIPLY(-d3, FIX_2_562915447);
    791                    z5 = MULTIPLY(z3, FIX_1_175875602);
    792                    z3 = MULTIPLY(-z3, FIX_0_785694958);
    793 
    794                    tmp0 += z3;
    795                    tmp1 = z2 + z5;
    796                    tmp2 += z3;
    797                    tmp3 = z1 + z5;
    798                }
    799            } else {
    800                if (d1) {
    801                    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
    802                    z1 = d7 + d1;
    803                    z5 = MULTIPLY(z1, FIX_1_175875602);
    804 
    805                    z1 = MULTIPLY(z1, FIX_0_275899380);
    806                    z3 = MULTIPLY(-d7, FIX_1_961570560);
    807                    tmp0 = MULTIPLY(-d7, FIX_1_662939225);
    808                    z4 = MULTIPLY(-d1, FIX_0_390180644);
    809                    tmp3 = MULTIPLY(d1, FIX_1_111140466);
    810 
    811                    tmp0 += z1;
    812                    tmp1 = z4 + z5;
    813                    tmp2 = z3 + z5;
    814                    tmp3 += z1;
    815                } else {
    816                    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
    817                    tmp0 = MULTIPLY(-d7, FIX_1_387039845);
    818                    tmp1 = MULTIPLY(d7, FIX_1_175875602);
    819                    tmp2 = MULTIPLY(-d7, FIX_0_785694958);
    820                    tmp3 = MULTIPLY(d7, FIX_0_275899380);
    821                }
    822            }
    823        }
    824    } else {
    825        if (d5) {
    826            if (d3) {
    827                if (d1) {
    828                    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
    829                    z2 = d5 + d3;
    830                    z4 = d5 + d1;
    831                    z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
    832 
    833                    tmp1 = MULTIPLY(d5, FIX_2_053119869);
    834                    tmp2 = MULTIPLY(d3, FIX_3_072711026);
    835                    tmp3 = MULTIPLY(d1, FIX_1_501321110);
    836                    z1 = MULTIPLY(-d1, FIX_0_899976223);
    837                    z2 = MULTIPLY(-z2, FIX_2_562915447);
    838                    z3 = MULTIPLY(-d3, FIX_1_961570560);
    839                    z4 = MULTIPLY(-z4, FIX_0_390180644);
    840 
    841                    z3 += z5;
    842                    z4 += z5;
    843 
    844                    tmp0 = z1 + z3;
    845                    tmp1 += z2 + z4;
    846                    tmp2 += z2 + z3;
    847                    tmp3 += z1 + z4;
    848                } else {
    849                    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
    850                    z2 = d5 + d3;
    851 
    852                    z5 = MULTIPLY(z2, FIX_1_175875602);
    853                    tmp1 = MULTIPLY(d5, FIX_1_662939225);
    854                    z4 = MULTIPLY(-d5, FIX_0_390180644);
    855                    z2 = MULTIPLY(-z2, FIX_1_387039845);
    856                    tmp2 = MULTIPLY(d3, FIX_1_111140466);
    857                    z3 = MULTIPLY(-d3, FIX_1_961570560);
    858 
    859                    tmp0 = z3 + z5;
    860                    tmp1 += z2;
    861                    tmp2 += z2;
    862                    tmp3 = z4 + z5;
    863                }
    864            } else {
    865                if (d1) {
    866                    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
    867                    z4 = d5 + d1;
    868 
    869                    z5 = MULTIPLY(z4, FIX_1_175875602);
    870                    z1 = MULTIPLY(-d1, FIX_0_899976223);
    871                    tmp3 = MULTIPLY(d1, FIX_0_601344887);
    872                    tmp1 = MULTIPLY(-d5, FIX_0_509795579);
    873                    z2 = MULTIPLY(-d5, FIX_2_562915447);
    874                    z4 = MULTIPLY(z4, FIX_0_785694958);
    875 
    876                    tmp0 = z1 + z5;
    877                    tmp1 += z4;
    878                    tmp2 = z2 + z5;
    879                    tmp3 += z4;
    880                } else {
    881                    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
    882                    tmp0 = MULTIPLY(d5, FIX_1_175875602);
    883                    tmp1 = MULTIPLY(d5, FIX_0_275899380);
    884                    tmp2 = MULTIPLY(-d5, FIX_1_387039845);
    885                    tmp3 = MULTIPLY(d5, FIX_0_785694958);
    886                }
    887            }
    888        } else {
    889            if (d3) {
    890                if (d1) {
    891                    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
    892                    z5 = d1 + d3;
    893                    tmp3 = MULTIPLY(d1, FIX_0_211164243);
    894                    tmp2 = MULTIPLY(-d3, FIX_1_451774981);
    895                    z1 = MULTIPLY(d1, FIX_1_061594337);
    896                    z2 = MULTIPLY(-d3, FIX_2_172734803);
    897                    z4 = MULTIPLY(z5, FIX_0_785694958);
    898                    z5 = MULTIPLY(z5, FIX_1_175875602);
    899 
    900                    tmp0 = z1 - z4;
    901                    tmp1 = z2 + z4;
    902                    tmp2 += z5;
    903                    tmp3 += z5;
    904                } else {
    905                    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
    906                    tmp0 = MULTIPLY(-d3, FIX_0_785694958);
    907                    tmp1 = MULTIPLY(-d3, FIX_1_387039845);
    908                    tmp2 = MULTIPLY(-d3, FIX_0_275899380);
    909                    tmp3 = MULTIPLY(d3, FIX_1_175875602);
    910                }
    911            } else {
    912                if (d1) {
    913                    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
    914                    tmp0 = MULTIPLY(d1, FIX_0_275899380);
    915                    tmp1 = MULTIPLY(d1, FIX_0_785694958);
    916                    tmp2 = MULTIPLY(d1, FIX_1_175875602);
    917                    tmp3 = MULTIPLY(d1, FIX_1_387039845);
    918                } else {
    919                    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
    920                    tmp0 = tmp1 = tmp2 = tmp3 = 0;
    921                }
    922            }
    923        }
    924    }
    925 
    926    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    927 
    928    dataptr[DCTSIZE*0] = (int16_t) DESCALE(tmp10 + tmp3,
    929                                           CONST_BITS+PASS1_BITS+3);
    930    dataptr[DCTSIZE*7] = (int16_t) DESCALE(tmp10 - tmp3,
    931                                           CONST_BITS+PASS1_BITS+3);
    932    dataptr[DCTSIZE*1] = (int16_t) DESCALE(tmp11 + tmp2,
    933                                           CONST_BITS+PASS1_BITS+3);
    934    dataptr[DCTSIZE*6] = (int16_t) DESCALE(tmp11 - tmp2,
    935                                           CONST_BITS+PASS1_BITS+3);
    936    dataptr[DCTSIZE*2] = (int16_t) DESCALE(tmp12 + tmp1,
    937                                           CONST_BITS+PASS1_BITS+3);
    938    dataptr[DCTSIZE*5] = (int16_t) DESCALE(tmp12 - tmp1,
    939                                           CONST_BITS+PASS1_BITS+3);
    940    dataptr[DCTSIZE*3] = (int16_t) DESCALE(tmp13 + tmp0,
    941                                           CONST_BITS+PASS1_BITS+3);
    942    dataptr[DCTSIZE*4] = (int16_t) DESCALE(tmp13 - tmp0,
    943                                           CONST_BITS+PASS1_BITS+3);
    944 
    945    dataptr++;                  /* advance pointer to next column */
    946  }
    947 }
    948 
    949 #undef DCTSIZE
    950 #define DCTSIZE 4
    951 #define DCTSTRIDE 8
    952 
    953 void ff_j_rev_dct4(DCTBLOCK data)
    954 {
    955  int32_t tmp0, tmp1, tmp2, tmp3;
    956  int32_t tmp10, tmp11, tmp12, tmp13;
    957  int32_t z1;
    958  int32_t d0, d2, d4, d6;
    959  register int16_t *dataptr;
    960  int rowctr;
    961 
    962  /* Pass 1: process rows. */
    963  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
    964  /* furthermore, we scale the results by 2**PASS1_BITS. */
    965 
    966  data[0] += 4;
    967 
    968  dataptr = data;
    969 
    970  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
    971    /* Due to quantization, we will usually find that many of the input
    972     * coefficients are zero, especially the AC terms.  We can exploit this
    973     * by short-circuiting the IDCT calculation for any row in which all
    974     * the AC terms are zero.  In that case each output is equal to the
    975     * DC coefficient (with scale factor as needed).
    976     * With typical images and quantization tables, half or more of the
    977     * row DCT calculations can be simplified this way.
    978     */
    979 
    980    register uint8_t *idataptr = (uint8_t*)dataptr;
    981 
    982    d0 = dataptr[0];
    983    d2 = dataptr[1];
    984    d4 = dataptr[2];
    985    d6 = dataptr[3];
    986 
    987    if ((d2 | d4 | d6) == 0) {
    988      /* AC terms all zero */
    989      if (d0) {
    990          /* Compute a 32 bit value to assign. */
    991          int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS));
    992          register unsigned v = (dcval & 0xffff) | ((uint32_t)dcval << 16);
    993 
    994          AV_WN32A(&idataptr[0], v);
    995          AV_WN32A(&idataptr[4], v);
    996      }
    997 
    998      dataptr += DCTSTRIDE;     /* advance pointer to next row */
    999      continue;
   1000    }
   1001 
   1002    /* Even part: reverse the even part of the forward DCT. */
   1003    /* The rotator is sqrt(2)*c(-6). */
   1004    if (d6) {
   1005            if (d2) {
   1006                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
   1007                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
   1008                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
   1009                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
   1010 
   1011                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
   1012                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
   1013 
   1014                    tmp10 = tmp0 + tmp3;
   1015                    tmp13 = tmp0 - tmp3;
   1016                    tmp11 = tmp1 + tmp2;
   1017                    tmp12 = tmp1 - tmp2;
   1018            } else {
   1019                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
   1020                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
   1021                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
   1022 
   1023                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
   1024                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
   1025 
   1026                    tmp10 = tmp0 + tmp3;
   1027                    tmp13 = tmp0 - tmp3;
   1028                    tmp11 = tmp1 + tmp2;
   1029                    tmp12 = tmp1 - tmp2;
   1030            }
   1031    } else {
   1032            if (d2) {
   1033                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
   1034                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
   1035                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
   1036 
   1037                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
   1038                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
   1039 
   1040                    tmp10 = tmp0 + tmp3;
   1041                    tmp13 = tmp0 - tmp3;
   1042                    tmp11 = tmp1 + tmp2;
   1043                    tmp12 = tmp1 - tmp2;
   1044            } else {
   1045                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
   1046                    tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS);
   1047                    tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS);
   1048            }
   1049      }
   1050 
   1051    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
   1052 
   1053    dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
   1054    dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
   1055    dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
   1056    dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
   1057 
   1058    dataptr += DCTSTRIDE;       /* advance pointer to next row */
   1059  }
   1060 
   1061  /* Pass 2: process columns. */
   1062  /* Note that we must descale the results by a factor of 8 == 2**3, */
   1063  /* and also undo the PASS1_BITS scaling. */
   1064 
   1065  dataptr = data;
   1066  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
   1067    /* Columns of zeroes can be exploited in the same way as we did with rows.
   1068     * However, the row calculation has created many nonzero AC terms, so the
   1069     * simplification applies less often (typically 5% to 10% of the time).
   1070     * On machines with very fast multiplication, it's possible that the
   1071     * test takes more time than it's worth.  In that case this section
   1072     * may be commented out.
   1073     */
   1074 
   1075    d0 = dataptr[DCTSTRIDE*0];
   1076    d2 = dataptr[DCTSTRIDE*1];
   1077    d4 = dataptr[DCTSTRIDE*2];
   1078    d6 = dataptr[DCTSTRIDE*3];
   1079 
   1080    /* Even part: reverse the even part of the forward DCT. */
   1081    /* The rotator is sqrt(2)*c(-6). */
   1082    if (d6) {
   1083            if (d2) {
   1084                    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
   1085                    z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
   1086                    tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
   1087                    tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
   1088 
   1089                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
   1090                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
   1091 
   1092                    tmp10 = tmp0 + tmp3;
   1093                    tmp13 = tmp0 - tmp3;
   1094                    tmp11 = tmp1 + tmp2;
   1095                    tmp12 = tmp1 - tmp2;
   1096            } else {
   1097                    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
   1098                    tmp2 = MULTIPLY(-d6, FIX_1_306562965);
   1099                    tmp3 = MULTIPLY(d6, FIX_0_541196100);
   1100 
   1101                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
   1102                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
   1103 
   1104                    tmp10 = tmp0 + tmp3;
   1105                    tmp13 = tmp0 - tmp3;
   1106                    tmp11 = tmp1 + tmp2;
   1107                    tmp12 = tmp1 - tmp2;
   1108            }
   1109    } else {
   1110            if (d2) {
   1111                    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
   1112                    tmp2 = MULTIPLY(d2, FIX_0_541196100);
   1113                    tmp3 = MULTIPLY(d2, FIX_1_306562965);
   1114 
   1115                    tmp0 = (d0 + d4) * (1 << CONST_BITS);
   1116                    tmp1 = (d0 - d4) * (1 << CONST_BITS);
   1117 
   1118                    tmp10 = tmp0 + tmp3;
   1119                    tmp13 = tmp0 - tmp3;
   1120                    tmp11 = tmp1 + tmp2;
   1121                    tmp12 = tmp1 - tmp2;
   1122            } else {
   1123                    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
   1124                    tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS);
   1125                    tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS);
   1126            }
   1127    }
   1128 
   1129    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
   1130 
   1131    dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
   1132    dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
   1133    dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
   1134    dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
   1135 
   1136    dataptr++;                  /* advance pointer to next column */
   1137  }
   1138 }
   1139 
   1140 void ff_j_rev_dct2(DCTBLOCK data){
   1141  int d00, d01, d10, d11;
   1142 
   1143  data[0] += 4;
   1144  d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE];
   1145  d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE];
   1146  d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE];
   1147  d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE];
   1148 
   1149  data[0+0*DCTSTRIDE]= (d00 + d10)>>3;
   1150  data[1+0*DCTSTRIDE]= (d01 + d11)>>3;
   1151  data[0+1*DCTSTRIDE]= (d00 - d10)>>3;
   1152  data[1+1*DCTSTRIDE]= (d01 - d11)>>3;
   1153 }
   1154 
   1155 void ff_j_rev_dct1(DCTBLOCK data){
   1156  data[0] = (data[0] + 4)>>3;
   1157 }
   1158 
   1159 #undef FIX
   1160 #undef CONST_BITS
   1161 
   1162 void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
   1163 {
   1164    ff_j_rev_dct(block);
   1165    ff_put_pixels_clamped_c(block, dest, line_size);
   1166 }
   1167 
   1168 void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
   1169 {
   1170    ff_j_rev_dct(block);
   1171    ff_add_pixels_clamped_c(block, dest, line_size);
   1172 }