tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcdctmgr.c (23029B)


      1 /*
      2 * jcdctmgr.c
      3 *
      4 * This file was part of the Independent JPEG Group's software:
      5 * Copyright (C) 1994-1996, Thomas G. Lane.
      6 * libjpeg-turbo Modifications:
      7 * Copyright (C) 1999-2006, MIYASAKA Masaru.
      8 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      9 * Copyright (C) 2011, 2014-2015, 2022, 2024, D. R. Commander.
     10 * For conditions of distribution and use, see the accompanying README.ijg
     11 * file.
     12 *
     13 * This file contains the forward-DCT management logic.
     14 * This code selects a particular DCT implementation to be used,
     15 * and it performs related housekeeping chores including coefficient
     16 * quantization.
     17 */
     18 
     19 #define JPEG_INTERNALS
     20 #include "jinclude.h"
     21 #include "jpeglib.h"
     22 #include "jdct.h"               /* Private declarations for DCT subsystem */
     23 #include "jsimddct.h"
     24 
     25 
     26 /* Private subobject for this module */
     27 
     28 typedef void (*forward_DCT_method_ptr) (DCTELEM *data);
     29 typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data);
     30 
     31 typedef void (*convsamp_method_ptr) (_JSAMPARRAY sample_data,
     32                                     JDIMENSION start_col,
     33                                     DCTELEM *workspace);
     34 typedef void (*float_convsamp_method_ptr) (_JSAMPARRAY sample_data,
     35                                           JDIMENSION start_col,
     36                                           FAST_FLOAT *workspace);
     37 
     38 typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM *divisors,
     39                                     DCTELEM *workspace);
     40 typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block,
     41                                           FAST_FLOAT *divisors,
     42                                           FAST_FLOAT *workspace);
     43 
     44 METHODDEF(void) quantize(JCOEFPTR, DCTELEM *, DCTELEM *);
     45 
     46 typedef struct {
     47  struct jpeg_forward_dct pub;  /* public fields */
     48 
     49  /* Pointer to the DCT routine actually in use */
     50  forward_DCT_method_ptr dct;
     51  convsamp_method_ptr convsamp;
     52  quantize_method_ptr quantize;
     53 
     54  /* The actual post-DCT divisors --- not identical to the quant table
     55   * entries, because of scaling (especially for an unnormalized DCT).
     56   * Each table is given in normal array order.
     57   */
     58  DCTELEM *divisors[NUM_QUANT_TBLS];
     59 
     60  /* work area for FDCT subroutine */
     61  DCTELEM *workspace;
     62 
     63 #ifdef DCT_FLOAT_SUPPORTED
     64  /* Same as above for the floating-point case. */
     65  float_DCT_method_ptr float_dct;
     66  float_convsamp_method_ptr float_convsamp;
     67  float_quantize_method_ptr float_quantize;
     68  FAST_FLOAT *float_divisors[NUM_QUANT_TBLS];
     69  FAST_FLOAT *float_workspace;
     70 #endif
     71 } my_fdct_controller;
     72 
     73 typedef my_fdct_controller *my_fdct_ptr;
     74 
     75 
     76 #if BITS_IN_JSAMPLE == 8
     77 
     78 /*
     79 * Find the highest bit in an integer through binary search.
     80 */
     81 
     82 LOCAL(int)
     83 flss(UINT16 val)
     84 {
     85  int bit;
     86 
     87  bit = 16;
     88 
     89  if (!val)
     90    return 0;
     91 
     92  if (!(val & 0xff00)) {
     93    bit -= 8;
     94    val <<= 8;
     95  }
     96  if (!(val & 0xf000)) {
     97    bit -= 4;
     98    val <<= 4;
     99  }
    100  if (!(val & 0xc000)) {
    101    bit -= 2;
    102    val <<= 2;
    103  }
    104  if (!(val & 0x8000)) {
    105    bit -= 1;
    106    val <<= 1;
    107  }
    108 
    109  return bit;
    110 }
    111 
    112 
    113 /*
    114 * Compute values to do a division using reciprocal.
    115 *
    116 * This implementation is based on an algorithm described in
    117 *   "Optimizing subroutines in assembly language:
    118 *   An optimization guide for x86 platforms" (https://agner.org/optimize).
    119 * More information about the basic algorithm can be found in
    120 * the paper "Integer Division Using Reciprocals" by Robert Alverson.
    121 *
    122 * The basic idea is to replace x/d by x * d^-1. In order to store
    123 * d^-1 with enough precision we shift it left a few places. It turns
    124 * out that this algoright gives just enough precision, and also fits
    125 * into DCTELEM:
    126 *
    127 *   b = (the number of significant bits in divisor) - 1
    128 *   r = (word size) + b
    129 *   f = 2^r / divisor
    130 *
    131 * f will not be an integer for most cases, so we need to compensate
    132 * for the rounding error introduced:
    133 *
    134 *   no fractional part:
    135 *
    136 *       result = input >> r
    137 *
    138 *   fractional part of f < 0.5:
    139 *
    140 *       round f down to nearest integer
    141 *       result = ((input + 1) * f) >> r
    142 *
    143 *   fractional part of f > 0.5:
    144 *
    145 *       round f up to nearest integer
    146 *       result = (input * f) >> r
    147 *
    148 * This is the original algorithm that gives truncated results. But we
    149 * want properly rounded results, so we replace "input" with
    150 * "input + divisor/2".
    151 *
    152 * In order to allow SIMD implementations we also tweak the values to
    153 * allow the same calculation to be made at all times:
    154 *
    155 *   dctbl[0] = f rounded to nearest integer
    156 *   dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5)
    157 *   dctbl[2] = 1 << ((word size) * 2 - r)
    158 *   dctbl[3] = r - (word size)
    159 *
    160 * dctbl[2] is for stupid instruction sets where the shift operation
    161 * isn't member wise (e.g. MMX).
    162 *
    163 * The reason dctbl[2] and dctbl[3] reduce the shift with (word size)
    164 * is that most SIMD implementations have a "multiply and store top
    165 * half" operation.
    166 *
    167 * Lastly, we store each of the values in their own table instead
    168 * of in a consecutive manner, yet again in order to allow SIMD
    169 * routines.
    170 */
    171 
    172 LOCAL(int)
    173 compute_reciprocal(UINT16 divisor, DCTELEM *dtbl)
    174 {
    175  UDCTELEM2 fq, fr;
    176  UDCTELEM c;
    177  int b, r;
    178 
    179  if (divisor == 1) {
    180    /* divisor == 1 means unquantized, so these reciprocal/correction/shift
    181     * values will cause the C quantization algorithm to act like the
    182     * identity function.  Since only the C quantization algorithm is used in
    183     * these cases, the scale value is irrelevant.
    184     */
    185    dtbl[DCTSIZE2 * 0] = (DCTELEM)1;                        /* reciprocal */
    186    dtbl[DCTSIZE2 * 1] = (DCTELEM)0;                        /* correction */
    187    dtbl[DCTSIZE2 * 2] = (DCTELEM)1;                        /* scale */
    188    dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8);   /* shift */
    189    return 0;
    190  }
    191 
    192  b = flss(divisor) - 1;
    193  r  = sizeof(DCTELEM) * 8 + b;
    194 
    195  fq = ((UDCTELEM2)1 << r) / divisor;
    196  fr = ((UDCTELEM2)1 << r) % divisor;
    197 
    198  c = divisor / 2;                      /* for rounding */
    199 
    200  if (fr == 0) {                        /* divisor is power of two */
    201    /* fq will be one bit too large to fit in DCTELEM, so adjust */
    202    fq >>= 1;
    203    r--;
    204  } else if (fr <= (divisor / 2U)) {    /* fractional part is < 0.5 */
    205    c++;
    206  } else {                              /* fractional part is > 0.5 */
    207    fq++;
    208  }
    209 
    210  dtbl[DCTSIZE2 * 0] = (DCTELEM)fq;     /* reciprocal */
    211  dtbl[DCTSIZE2 * 1] = (DCTELEM)c;      /* correction + roundfactor */
    212 #ifdef WITH_SIMD
    213  dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */
    214 #else
    215  dtbl[DCTSIZE2 * 2] = 1;
    216 #endif
    217  dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */
    218 
    219  if (r <= 16) return 0;
    220  else return 1;
    221 }
    222 
    223 #endif
    224 
    225 
    226 /*
    227 * Initialize for a processing pass.
    228 * Verify that all referenced Q-tables are present, and set up
    229 * the divisor table for each one.
    230 * In the current implementation, DCT of all components is done during
    231 * the first pass, even if only some components will be output in the
    232 * first scan.  Hence all components should be examined here.
    233 */
    234 
    235 METHODDEF(void)
    236 start_pass_fdctmgr(j_compress_ptr cinfo)
    237 {
    238  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
    239  int ci, qtblno, i;
    240  jpeg_component_info *compptr;
    241  JQUANT_TBL *qtbl;
    242  DCTELEM *dtbl;
    243 
    244  for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components;
    245       ci++, compptr++) {
    246    qtblno = compptr->quant_tbl_no;
    247    /* Make sure specified quantization table is present */
    248    if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS ||
    249        cinfo->quant_tbl_ptrs[qtblno] == NULL)
    250      ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno);
    251    qtbl = cinfo->quant_tbl_ptrs[qtblno];
    252    /* Compute divisors for this quant table */
    253    /* We may do this more than once for same table, but it's not a big deal */
    254    switch (cinfo->dct_method) {
    255 #ifdef DCT_ISLOW_SUPPORTED
    256    case JDCT_ISLOW:
    257      /* For LL&M IDCT method, divisors are equal to raw quantization
    258       * coefficients multiplied by 8 (to counteract scaling).
    259       */
    260      if (fdct->divisors[qtblno] == NULL) {
    261        fdct->divisors[qtblno] = (DCTELEM *)
    262          (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
    263                                      (DCTSIZE2 * 4) * sizeof(DCTELEM));
    264      }
    265      dtbl = fdct->divisors[qtblno];
    266      for (i = 0; i < DCTSIZE2; i++) {
    267 #if BITS_IN_JSAMPLE == 8
    268 #ifdef WITH_SIMD
    269        if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) &&
    270            fdct->quantize == jsimd_quantize)
    271          fdct->quantize = quantize;
    272 #else
    273        compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]);
    274 #endif
    275 #else
    276        dtbl[i] = ((DCTELEM)qtbl->quantval[i]) << 3;
    277 #endif
    278      }
    279      break;
    280 #endif
    281 #ifdef DCT_IFAST_SUPPORTED
    282    case JDCT_IFAST:
    283      {
    284        /* For AA&N IDCT method, divisors are equal to quantization
    285         * coefficients scaled by scalefactor[row]*scalefactor[col], where
    286         *   scalefactor[0] = 1
    287         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
    288         * We apply a further scale factor of 8.
    289         */
    290 #define CONST_BITS  14
    291        static const INT16 aanscales[DCTSIZE2] = {
    292          /* precomputed values scaled up by 14 bits */
    293          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
    294          22725, 31521, 29692, 26722, 22725, 17855, 12299,  6270,
    295          21407, 29692, 27969, 25172, 21407, 16819, 11585,  5906,
    296          19266, 26722, 25172, 22654, 19266, 15137, 10426,  5315,
    297          16384, 22725, 21407, 19266, 16384, 12873,  8867,  4520,
    298          12873, 17855, 16819, 15137, 12873, 10114,  6967,  3552,
    299           8867, 12299, 11585, 10426,  8867,  6967,  4799,  2446,
    300           4520,  6270,  5906,  5315,  4520,  3552,  2446,  1247
    301        };
    302        SHIFT_TEMPS
    303 
    304        if (fdct->divisors[qtblno] == NULL) {
    305          fdct->divisors[qtblno] = (DCTELEM *)
    306            (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
    307                                        (DCTSIZE2 * 4) * sizeof(DCTELEM));
    308        }
    309        dtbl = fdct->divisors[qtblno];
    310        for (i = 0; i < DCTSIZE2; i++) {
    311 #if BITS_IN_JSAMPLE == 8
    312 #ifdef WITH_SIMD
    313          if (!compute_reciprocal(
    314                DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
    315                                      (JLONG)aanscales[i]),
    316                        CONST_BITS - 3), &dtbl[i]) &&
    317              fdct->quantize == jsimd_quantize)
    318            fdct->quantize = quantize;
    319 #else
    320          compute_reciprocal(
    321            DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
    322                                  (JLONG)aanscales[i]),
    323                    CONST_BITS-3), &dtbl[i]);
    324 #endif
    325 #else
    326          dtbl[i] = (DCTELEM)
    327            DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i],
    328                                  (JLONG)aanscales[i]),
    329                    CONST_BITS - 3);
    330 #endif
    331        }
    332      }
    333      break;
    334 #endif
    335 #ifdef DCT_FLOAT_SUPPORTED
    336    case JDCT_FLOAT:
    337      {
    338        /* For float AA&N IDCT method, divisors are equal to quantization
    339         * coefficients scaled by scalefactor[row]*scalefactor[col], where
    340         *   scalefactor[0] = 1
    341         *   scalefactor[k] = cos(k*PI/16) * sqrt(2)    for k=1..7
    342         * We apply a further scale factor of 8.
    343         * What's actually stored is 1/divisor so that the inner loop can
    344         * use a multiplication rather than a division.
    345         */
    346        FAST_FLOAT *fdtbl;
    347        int row, col;
    348        static const double aanscalefactor[DCTSIZE] = {
    349          1.0, 1.387039845, 1.306562965, 1.175875602,
    350          1.0, 0.785694958, 0.541196100, 0.275899379
    351        };
    352 
    353        if (fdct->float_divisors[qtblno] == NULL) {
    354          fdct->float_divisors[qtblno] = (FAST_FLOAT *)
    355            (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
    356                                        DCTSIZE2 * sizeof(FAST_FLOAT));
    357        }
    358        fdtbl = fdct->float_divisors[qtblno];
    359        i = 0;
    360        for (row = 0; row < DCTSIZE; row++) {
    361          for (col = 0; col < DCTSIZE; col++) {
    362            fdtbl[i] = (FAST_FLOAT)
    363              (1.0 / (((double)qtbl->quantval[i] *
    364                       aanscalefactor[row] * aanscalefactor[col] * 8.0)));
    365            i++;
    366          }
    367        }
    368      }
    369      break;
    370 #endif
    371    default:
    372      ERREXIT(cinfo, JERR_NOT_COMPILED);
    373      break;
    374    }
    375  }
    376 }
    377 
    378 
    379 /*
    380 * Load data into workspace, applying unsigned->signed conversion.
    381 */
    382 
    383 METHODDEF(void)
    384 convsamp(_JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace)
    385 {
    386  register DCTELEM *workspaceptr;
    387  register _JSAMPROW elemptr;
    388  register int elemr;
    389 
    390  workspaceptr = workspace;
    391  for (elemr = 0; elemr < DCTSIZE; elemr++) {
    392    elemptr = sample_data[elemr] + start_col;
    393 
    394 #if DCTSIZE == 8                /* unroll the inner loop */
    395    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
    396    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
    397    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
    398    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
    399    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
    400    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
    401    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
    402    *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
    403 #else
    404    {
    405      register int elemc;
    406      for (elemc = DCTSIZE; elemc > 0; elemc--)
    407        *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE;
    408    }
    409 #endif
    410  }
    411 }
    412 
    413 
    414 /*
    415 * Quantize/descale the coefficients, and store into coef_blocks[].
    416 */
    417 
    418 METHODDEF(void)
    419 quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace)
    420 {
    421  int i;
    422  DCTELEM temp;
    423  JCOEFPTR output_ptr = coef_block;
    424 
    425 #if BITS_IN_JSAMPLE == 8
    426 
    427  UDCTELEM recip, corr;
    428  int shift;
    429  UDCTELEM2 product;
    430 
    431  for (i = 0; i < DCTSIZE2; i++) {
    432    temp = workspace[i];
    433    recip = divisors[i + DCTSIZE2 * 0];
    434    corr =  divisors[i + DCTSIZE2 * 1];
    435    shift = divisors[i + DCTSIZE2 * 3];
    436 
    437    if (temp < 0) {
    438      temp = -temp;
    439      product = (UDCTELEM2)(temp + corr) * recip;
    440      product >>= shift + sizeof(DCTELEM) * 8;
    441      temp = (DCTELEM)product;
    442      temp = -temp;
    443    } else {
    444      product = (UDCTELEM2)(temp + corr) * recip;
    445      product >>= shift + sizeof(DCTELEM) * 8;
    446      temp = (DCTELEM)product;
    447    }
    448    output_ptr[i] = (JCOEF)temp;
    449  }
    450 
    451 #else
    452 
    453  register DCTELEM qval;
    454 
    455  for (i = 0; i < DCTSIZE2; i++) {
    456    qval = divisors[i];
    457    temp = workspace[i];
    458    /* Divide the coefficient value by qval, ensuring proper rounding.
    459     * Since C does not specify the direction of rounding for negative
    460     * quotients, we have to force the dividend positive for portability.
    461     *
    462     * In most files, at least half of the output values will be zero
    463     * (at default quantization settings, more like three-quarters...)
    464     * so we should ensure that this case is fast.  On many machines,
    465     * a comparison is enough cheaper than a divide to make a special test
    466     * a win.  Since both inputs will be nonnegative, we need only test
    467     * for a < b to discover whether a/b is 0.
    468     * If your machine's division is fast enough, define FAST_DIVIDE.
    469     */
    470 #ifdef FAST_DIVIDE
    471 #define DIVIDE_BY(a, b)  a /= b
    472 #else
    473 #define DIVIDE_BY(a, b)  if (a >= b) a /= b;  else a = 0
    474 #endif
    475    if (temp < 0) {
    476      temp = -temp;
    477      temp += qval >> 1;        /* for rounding */
    478      DIVIDE_BY(temp, qval);
    479      temp = -temp;
    480    } else {
    481      temp += qval >> 1;        /* for rounding */
    482      DIVIDE_BY(temp, qval);
    483    }
    484    output_ptr[i] = (JCOEF)temp;
    485  }
    486 
    487 #endif
    488 
    489 }
    490 
    491 
    492 /*
    493 * Perform forward DCT on one or more blocks of a component.
    494 *
    495 * The input samples are taken from the sample_data[] array starting at
    496 * position start_row/start_col, and moving to the right for any additional
    497 * blocks. The quantized coefficients are returned in coef_blocks[].
    498 */
    499 
    500 METHODDEF(void)
    501 forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr,
    502            _JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
    503            JDIMENSION start_row, JDIMENSION start_col, JDIMENSION num_blocks)
    504 /* This version is used for integer DCT implementations. */
    505 {
    506  /* This routine is heavily used, so it's worth coding it tightly. */
    507  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
    508  DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no];
    509  DCTELEM *workspace;
    510  JDIMENSION bi;
    511 
    512  /* Make sure the compiler doesn't look up these every pass */
    513  forward_DCT_method_ptr do_dct = fdct->dct;
    514  convsamp_method_ptr do_convsamp = fdct->convsamp;
    515  quantize_method_ptr do_quantize = fdct->quantize;
    516  workspace = fdct->workspace;
    517 
    518  sample_data += start_row;     /* fold in the vertical offset once */
    519 
    520  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
    521    /* Load data into workspace, applying unsigned->signed conversion */
    522    (*do_convsamp) (sample_data, start_col, workspace);
    523 
    524    /* Perform the DCT */
    525    (*do_dct) (workspace);
    526 
    527    /* Quantize/descale the coefficients, and store into coef_blocks[] */
    528    (*do_quantize) (coef_blocks[bi], divisors, workspace);
    529  }
    530 }
    531 
    532 
    533 #ifdef DCT_FLOAT_SUPPORTED
    534 
    535 METHODDEF(void)
    536 convsamp_float(_JSAMPARRAY sample_data, JDIMENSION start_col,
    537               FAST_FLOAT *workspace)
    538 {
    539  register FAST_FLOAT *workspaceptr;
    540  register _JSAMPROW elemptr;
    541  register int elemr;
    542 
    543  workspaceptr = workspace;
    544  for (elemr = 0; elemr < DCTSIZE; elemr++) {
    545    elemptr = sample_data[elemr] + start_col;
    546 #if DCTSIZE == 8                /* unroll the inner loop */
    547    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
    548    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
    549    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
    550    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
    551    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
    552    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
    553    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
    554    *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
    555 #else
    556    {
    557      register int elemc;
    558      for (elemc = DCTSIZE; elemc > 0; elemc--)
    559        *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE);
    560    }
    561 #endif
    562  }
    563 }
    564 
    565 
    566 METHODDEF(void)
    567 quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors,
    568               FAST_FLOAT *workspace)
    569 {
    570  register FAST_FLOAT temp;
    571  register int i;
    572  register JCOEFPTR output_ptr = coef_block;
    573 
    574  for (i = 0; i < DCTSIZE2; i++) {
    575    /* Apply the quantization and scaling factor */
    576    temp = workspace[i] * divisors[i];
    577 
    578    /* Round to nearest integer.
    579     * Since C does not specify the direction of rounding for negative
    580     * quotients, we have to force the dividend positive for portability.
    581     * The maximum coefficient size is +-16K (for 12-bit data), so this
    582     * code should work for either 16-bit or 32-bit ints.
    583     */
    584    output_ptr[i] = (JCOEF)((int)(temp + (FAST_FLOAT)16384.5) - 16384);
    585  }
    586 }
    587 
    588 
    589 METHODDEF(void)
    590 forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr,
    591                  _JSAMPARRAY sample_data, JBLOCKROW coef_blocks,
    592                  JDIMENSION start_row, JDIMENSION start_col,
    593                  JDIMENSION num_blocks)
    594 /* This version is used for floating-point DCT implementations. */
    595 {
    596  /* This routine is heavily used, so it's worth coding it tightly. */
    597  my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct;
    598  FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no];
    599  FAST_FLOAT *workspace;
    600  JDIMENSION bi;
    601 
    602 
    603  /* Make sure the compiler doesn't look up these every pass */
    604  float_DCT_method_ptr do_dct = fdct->float_dct;
    605  float_convsamp_method_ptr do_convsamp = fdct->float_convsamp;
    606  float_quantize_method_ptr do_quantize = fdct->float_quantize;
    607  workspace = fdct->float_workspace;
    608 
    609  sample_data += start_row;     /* fold in the vertical offset once */
    610 
    611  for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) {
    612    /* Load data into workspace, applying unsigned->signed conversion */
    613    (*do_convsamp) (sample_data, start_col, workspace);
    614 
    615    /* Perform the DCT */
    616    (*do_dct) (workspace);
    617 
    618    /* Quantize/descale the coefficients, and store into coef_blocks[] */
    619    (*do_quantize) (coef_blocks[bi], divisors, workspace);
    620  }
    621 }
    622 
    623 #endif /* DCT_FLOAT_SUPPORTED */
    624 
    625 
    626 /*
    627 * Initialize FDCT manager.
    628 */
    629 
    630 GLOBAL(void)
    631 _jinit_forward_dct(j_compress_ptr cinfo)
    632 {
    633  my_fdct_ptr fdct;
    634  int i;
    635 
    636  if (cinfo->data_precision != BITS_IN_JSAMPLE)
    637    ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision);
    638 
    639  fdct = (my_fdct_ptr)
    640    (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
    641                                sizeof(my_fdct_controller));
    642  cinfo->fdct = (struct jpeg_forward_dct *)fdct;
    643  fdct->pub.start_pass = start_pass_fdctmgr;
    644 
    645  /* First determine the DCT... */
    646  switch (cinfo->dct_method) {
    647 #ifdef DCT_ISLOW_SUPPORTED
    648  case JDCT_ISLOW:
    649    fdct->pub._forward_DCT = forward_DCT;
    650 #ifdef WITH_SIMD
    651    if (jsimd_can_fdct_islow())
    652      fdct->dct = jsimd_fdct_islow;
    653    else
    654 #endif
    655      fdct->dct = _jpeg_fdct_islow;
    656    break;
    657 #endif
    658 #ifdef DCT_IFAST_SUPPORTED
    659  case JDCT_IFAST:
    660    fdct->pub._forward_DCT = forward_DCT;
    661 #ifdef WITH_SIMD
    662    if (jsimd_can_fdct_ifast())
    663      fdct->dct = jsimd_fdct_ifast;
    664    else
    665 #endif
    666      fdct->dct = _jpeg_fdct_ifast;
    667    break;
    668 #endif
    669 #ifdef DCT_FLOAT_SUPPORTED
    670  case JDCT_FLOAT:
    671    fdct->pub._forward_DCT = forward_DCT_float;
    672 #ifdef WITH_SIMD
    673    if (jsimd_can_fdct_float())
    674      fdct->float_dct = jsimd_fdct_float;
    675    else
    676 #endif
    677      fdct->float_dct = jpeg_fdct_float;
    678    break;
    679 #endif
    680  default:
    681    ERREXIT(cinfo, JERR_NOT_COMPILED);
    682    break;
    683  }
    684 
    685  /* ...then the supporting stages. */
    686  switch (cinfo->dct_method) {
    687 #ifdef DCT_ISLOW_SUPPORTED
    688  case JDCT_ISLOW:
    689 #endif
    690 #ifdef DCT_IFAST_SUPPORTED
    691  case JDCT_IFAST:
    692 #endif
    693 #if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED)
    694 #ifdef WITH_SIMD
    695    if (jsimd_can_convsamp())
    696      fdct->convsamp = jsimd_convsamp;
    697    else
    698 #endif
    699      fdct->convsamp = convsamp;
    700 #ifdef WITH_SIMD
    701    if (jsimd_can_quantize())
    702      fdct->quantize = jsimd_quantize;
    703    else
    704 #endif
    705      fdct->quantize = quantize;
    706    break;
    707 #endif
    708 #ifdef DCT_FLOAT_SUPPORTED
    709  case JDCT_FLOAT:
    710 #ifdef WITH_SIMD
    711    if (jsimd_can_convsamp_float())
    712      fdct->float_convsamp = jsimd_convsamp_float;
    713    else
    714 #endif
    715      fdct->float_convsamp = convsamp_float;
    716 #ifdef WITH_SIMD
    717    if (jsimd_can_quantize_float())
    718      fdct->float_quantize = jsimd_quantize_float;
    719    else
    720 #endif
    721      fdct->float_quantize = quantize_float;
    722    break;
    723 #endif
    724  default:
    725    ERREXIT(cinfo, JERR_NOT_COMPILED);
    726    break;
    727  }
    728 
    729  /* Allocate workspace memory */
    730 #ifdef DCT_FLOAT_SUPPORTED
    731  if (cinfo->dct_method == JDCT_FLOAT)
    732    fdct->float_workspace = (FAST_FLOAT *)
    733      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
    734                                  sizeof(FAST_FLOAT) * DCTSIZE2);
    735  else
    736 #endif
    737    fdct->workspace = (DCTELEM *)
    738      (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
    739                                  sizeof(DCTELEM) * DCTSIZE2);
    740 
    741  /* Mark divisor tables unallocated */
    742  for (i = 0; i < NUM_QUANT_TBLS; i++) {
    743    fdct->divisors[i] = NULL;
    744 #ifdef DCT_FLOAT_SUPPORTED
    745    fdct->float_divisors[i] = NULL;
    746 #endif
    747  }
    748 }