jcdctmgr.c (23029B)
1 /* 2 * jcdctmgr.c 3 * 4 * This file was part of the Independent JPEG Group's software: 5 * Copyright (C) 1994-1996, Thomas G. Lane. 6 * libjpeg-turbo Modifications: 7 * Copyright (C) 1999-2006, MIYASAKA Masaru. 8 * Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB 9 * Copyright (C) 2011, 2014-2015, 2022, 2024, D. R. Commander. 10 * For conditions of distribution and use, see the accompanying README.ijg 11 * file. 12 * 13 * This file contains the forward-DCT management logic. 14 * This code selects a particular DCT implementation to be used, 15 * and it performs related housekeeping chores including coefficient 16 * quantization. 17 */ 18 19 #define JPEG_INTERNALS 20 #include "jinclude.h" 21 #include "jpeglib.h" 22 #include "jdct.h" /* Private declarations for DCT subsystem */ 23 #include "jsimddct.h" 24 25 26 /* Private subobject for this module */ 27 28 typedef void (*forward_DCT_method_ptr) (DCTELEM *data); 29 typedef void (*float_DCT_method_ptr) (FAST_FLOAT *data); 30 31 typedef void (*convsamp_method_ptr) (_JSAMPARRAY sample_data, 32 JDIMENSION start_col, 33 DCTELEM *workspace); 34 typedef void (*float_convsamp_method_ptr) (_JSAMPARRAY sample_data, 35 JDIMENSION start_col, 36 FAST_FLOAT *workspace); 37 38 typedef void (*quantize_method_ptr) (JCOEFPTR coef_block, DCTELEM *divisors, 39 DCTELEM *workspace); 40 typedef void (*float_quantize_method_ptr) (JCOEFPTR coef_block, 41 FAST_FLOAT *divisors, 42 FAST_FLOAT *workspace); 43 44 METHODDEF(void) quantize(JCOEFPTR, DCTELEM *, DCTELEM *); 45 46 typedef struct { 47 struct jpeg_forward_dct pub; /* public fields */ 48 49 /* Pointer to the DCT routine actually in use */ 50 forward_DCT_method_ptr dct; 51 convsamp_method_ptr convsamp; 52 quantize_method_ptr quantize; 53 54 /* The actual post-DCT divisors --- not identical to the quant table 55 * entries, because of scaling (especially for an unnormalized DCT). 56 * Each table is given in normal array order. 57 */ 58 DCTELEM *divisors[NUM_QUANT_TBLS]; 59 60 /* work area for FDCT subroutine */ 61 DCTELEM *workspace; 62 63 #ifdef DCT_FLOAT_SUPPORTED 64 /* Same as above for the floating-point case. */ 65 float_DCT_method_ptr float_dct; 66 float_convsamp_method_ptr float_convsamp; 67 float_quantize_method_ptr float_quantize; 68 FAST_FLOAT *float_divisors[NUM_QUANT_TBLS]; 69 FAST_FLOAT *float_workspace; 70 #endif 71 } my_fdct_controller; 72 73 typedef my_fdct_controller *my_fdct_ptr; 74 75 76 #if BITS_IN_JSAMPLE == 8 77 78 /* 79 * Find the highest bit in an integer through binary search. 80 */ 81 82 LOCAL(int) 83 flss(UINT16 val) 84 { 85 int bit; 86 87 bit = 16; 88 89 if (!val) 90 return 0; 91 92 if (!(val & 0xff00)) { 93 bit -= 8; 94 val <<= 8; 95 } 96 if (!(val & 0xf000)) { 97 bit -= 4; 98 val <<= 4; 99 } 100 if (!(val & 0xc000)) { 101 bit -= 2; 102 val <<= 2; 103 } 104 if (!(val & 0x8000)) { 105 bit -= 1; 106 val <<= 1; 107 } 108 109 return bit; 110 } 111 112 113 /* 114 * Compute values to do a division using reciprocal. 115 * 116 * This implementation is based on an algorithm described in 117 * "Optimizing subroutines in assembly language: 118 * An optimization guide for x86 platforms" (https://agner.org/optimize). 119 * More information about the basic algorithm can be found in 120 * the paper "Integer Division Using Reciprocals" by Robert Alverson. 121 * 122 * The basic idea is to replace x/d by x * d^-1. In order to store 123 * d^-1 with enough precision we shift it left a few places. It turns 124 * out that this algoright gives just enough precision, and also fits 125 * into DCTELEM: 126 * 127 * b = (the number of significant bits in divisor) - 1 128 * r = (word size) + b 129 * f = 2^r / divisor 130 * 131 * f will not be an integer for most cases, so we need to compensate 132 * for the rounding error introduced: 133 * 134 * no fractional part: 135 * 136 * result = input >> r 137 * 138 * fractional part of f < 0.5: 139 * 140 * round f down to nearest integer 141 * result = ((input + 1) * f) >> r 142 * 143 * fractional part of f > 0.5: 144 * 145 * round f up to nearest integer 146 * result = (input * f) >> r 147 * 148 * This is the original algorithm that gives truncated results. But we 149 * want properly rounded results, so we replace "input" with 150 * "input + divisor/2". 151 * 152 * In order to allow SIMD implementations we also tweak the values to 153 * allow the same calculation to be made at all times: 154 * 155 * dctbl[0] = f rounded to nearest integer 156 * dctbl[1] = divisor / 2 (+ 1 if fractional part of f < 0.5) 157 * dctbl[2] = 1 << ((word size) * 2 - r) 158 * dctbl[3] = r - (word size) 159 * 160 * dctbl[2] is for stupid instruction sets where the shift operation 161 * isn't member wise (e.g. MMX). 162 * 163 * The reason dctbl[2] and dctbl[3] reduce the shift with (word size) 164 * is that most SIMD implementations have a "multiply and store top 165 * half" operation. 166 * 167 * Lastly, we store each of the values in their own table instead 168 * of in a consecutive manner, yet again in order to allow SIMD 169 * routines. 170 */ 171 172 LOCAL(int) 173 compute_reciprocal(UINT16 divisor, DCTELEM *dtbl) 174 { 175 UDCTELEM2 fq, fr; 176 UDCTELEM c; 177 int b, r; 178 179 if (divisor == 1) { 180 /* divisor == 1 means unquantized, so these reciprocal/correction/shift 181 * values will cause the C quantization algorithm to act like the 182 * identity function. Since only the C quantization algorithm is used in 183 * these cases, the scale value is irrelevant. 184 */ 185 dtbl[DCTSIZE2 * 0] = (DCTELEM)1; /* reciprocal */ 186 dtbl[DCTSIZE2 * 1] = (DCTELEM)0; /* correction */ 187 dtbl[DCTSIZE2 * 2] = (DCTELEM)1; /* scale */ 188 dtbl[DCTSIZE2 * 3] = -(DCTELEM)(sizeof(DCTELEM) * 8); /* shift */ 189 return 0; 190 } 191 192 b = flss(divisor) - 1; 193 r = sizeof(DCTELEM) * 8 + b; 194 195 fq = ((UDCTELEM2)1 << r) / divisor; 196 fr = ((UDCTELEM2)1 << r) % divisor; 197 198 c = divisor / 2; /* for rounding */ 199 200 if (fr == 0) { /* divisor is power of two */ 201 /* fq will be one bit too large to fit in DCTELEM, so adjust */ 202 fq >>= 1; 203 r--; 204 } else if (fr <= (divisor / 2U)) { /* fractional part is < 0.5 */ 205 c++; 206 } else { /* fractional part is > 0.5 */ 207 fq++; 208 } 209 210 dtbl[DCTSIZE2 * 0] = (DCTELEM)fq; /* reciprocal */ 211 dtbl[DCTSIZE2 * 1] = (DCTELEM)c; /* correction + roundfactor */ 212 #ifdef WITH_SIMD 213 dtbl[DCTSIZE2 * 2] = (DCTELEM)(1 << (sizeof(DCTELEM) * 8 * 2 - r)); /* scale */ 214 #else 215 dtbl[DCTSIZE2 * 2] = 1; 216 #endif 217 dtbl[DCTSIZE2 * 3] = (DCTELEM)r - sizeof(DCTELEM) * 8; /* shift */ 218 219 if (r <= 16) return 0; 220 else return 1; 221 } 222 223 #endif 224 225 226 /* 227 * Initialize for a processing pass. 228 * Verify that all referenced Q-tables are present, and set up 229 * the divisor table for each one. 230 * In the current implementation, DCT of all components is done during 231 * the first pass, even if only some components will be output in the 232 * first scan. Hence all components should be examined here. 233 */ 234 235 METHODDEF(void) 236 start_pass_fdctmgr(j_compress_ptr cinfo) 237 { 238 my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct; 239 int ci, qtblno, i; 240 jpeg_component_info *compptr; 241 JQUANT_TBL *qtbl; 242 DCTELEM *dtbl; 243 244 for (ci = 0, compptr = cinfo->comp_info; ci < cinfo->num_components; 245 ci++, compptr++) { 246 qtblno = compptr->quant_tbl_no; 247 /* Make sure specified quantization table is present */ 248 if (qtblno < 0 || qtblno >= NUM_QUANT_TBLS || 249 cinfo->quant_tbl_ptrs[qtblno] == NULL) 250 ERREXIT1(cinfo, JERR_NO_QUANT_TABLE, qtblno); 251 qtbl = cinfo->quant_tbl_ptrs[qtblno]; 252 /* Compute divisors for this quant table */ 253 /* We may do this more than once for same table, but it's not a big deal */ 254 switch (cinfo->dct_method) { 255 #ifdef DCT_ISLOW_SUPPORTED 256 case JDCT_ISLOW: 257 /* For LL&M IDCT method, divisors are equal to raw quantization 258 * coefficients multiplied by 8 (to counteract scaling). 259 */ 260 if (fdct->divisors[qtblno] == NULL) { 261 fdct->divisors[qtblno] = (DCTELEM *) 262 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, 263 (DCTSIZE2 * 4) * sizeof(DCTELEM)); 264 } 265 dtbl = fdct->divisors[qtblno]; 266 for (i = 0; i < DCTSIZE2; i++) { 267 #if BITS_IN_JSAMPLE == 8 268 #ifdef WITH_SIMD 269 if (!compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]) && 270 fdct->quantize == jsimd_quantize) 271 fdct->quantize = quantize; 272 #else 273 compute_reciprocal(qtbl->quantval[i] << 3, &dtbl[i]); 274 #endif 275 #else 276 dtbl[i] = ((DCTELEM)qtbl->quantval[i]) << 3; 277 #endif 278 } 279 break; 280 #endif 281 #ifdef DCT_IFAST_SUPPORTED 282 case JDCT_IFAST: 283 { 284 /* For AA&N IDCT method, divisors are equal to quantization 285 * coefficients scaled by scalefactor[row]*scalefactor[col], where 286 * scalefactor[0] = 1 287 * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 288 * We apply a further scale factor of 8. 289 */ 290 #define CONST_BITS 14 291 static const INT16 aanscales[DCTSIZE2] = { 292 /* precomputed values scaled up by 14 bits */ 293 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, 294 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270, 295 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906, 296 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315, 297 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520, 298 12873, 17855, 16819, 15137, 12873, 10114, 6967, 3552, 299 8867, 12299, 11585, 10426, 8867, 6967, 4799, 2446, 300 4520, 6270, 5906, 5315, 4520, 3552, 2446, 1247 301 }; 302 SHIFT_TEMPS 303 304 if (fdct->divisors[qtblno] == NULL) { 305 fdct->divisors[qtblno] = (DCTELEM *) 306 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, 307 (DCTSIZE2 * 4) * sizeof(DCTELEM)); 308 } 309 dtbl = fdct->divisors[qtblno]; 310 for (i = 0; i < DCTSIZE2; i++) { 311 #if BITS_IN_JSAMPLE == 8 312 #ifdef WITH_SIMD 313 if (!compute_reciprocal( 314 DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i], 315 (JLONG)aanscales[i]), 316 CONST_BITS - 3), &dtbl[i]) && 317 fdct->quantize == jsimd_quantize) 318 fdct->quantize = quantize; 319 #else 320 compute_reciprocal( 321 DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i], 322 (JLONG)aanscales[i]), 323 CONST_BITS-3), &dtbl[i]); 324 #endif 325 #else 326 dtbl[i] = (DCTELEM) 327 DESCALE(MULTIPLY16V16((JLONG)qtbl->quantval[i], 328 (JLONG)aanscales[i]), 329 CONST_BITS - 3); 330 #endif 331 } 332 } 333 break; 334 #endif 335 #ifdef DCT_FLOAT_SUPPORTED 336 case JDCT_FLOAT: 337 { 338 /* For float AA&N IDCT method, divisors are equal to quantization 339 * coefficients scaled by scalefactor[row]*scalefactor[col], where 340 * scalefactor[0] = 1 341 * scalefactor[k] = cos(k*PI/16) * sqrt(2) for k=1..7 342 * We apply a further scale factor of 8. 343 * What's actually stored is 1/divisor so that the inner loop can 344 * use a multiplication rather than a division. 345 */ 346 FAST_FLOAT *fdtbl; 347 int row, col; 348 static const double aanscalefactor[DCTSIZE] = { 349 1.0, 1.387039845, 1.306562965, 1.175875602, 350 1.0, 0.785694958, 0.541196100, 0.275899379 351 }; 352 353 if (fdct->float_divisors[qtblno] == NULL) { 354 fdct->float_divisors[qtblno] = (FAST_FLOAT *) 355 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, 356 DCTSIZE2 * sizeof(FAST_FLOAT)); 357 } 358 fdtbl = fdct->float_divisors[qtblno]; 359 i = 0; 360 for (row = 0; row < DCTSIZE; row++) { 361 for (col = 0; col < DCTSIZE; col++) { 362 fdtbl[i] = (FAST_FLOAT) 363 (1.0 / (((double)qtbl->quantval[i] * 364 aanscalefactor[row] * aanscalefactor[col] * 8.0))); 365 i++; 366 } 367 } 368 } 369 break; 370 #endif 371 default: 372 ERREXIT(cinfo, JERR_NOT_COMPILED); 373 break; 374 } 375 } 376 } 377 378 379 /* 380 * Load data into workspace, applying unsigned->signed conversion. 381 */ 382 383 METHODDEF(void) 384 convsamp(_JSAMPARRAY sample_data, JDIMENSION start_col, DCTELEM *workspace) 385 { 386 register DCTELEM *workspaceptr; 387 register _JSAMPROW elemptr; 388 register int elemr; 389 390 workspaceptr = workspace; 391 for (elemr = 0; elemr < DCTSIZE; elemr++) { 392 elemptr = sample_data[elemr] + start_col; 393 394 #if DCTSIZE == 8 /* unroll the inner loop */ 395 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE; 396 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE; 397 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE; 398 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE; 399 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE; 400 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE; 401 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE; 402 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE; 403 #else 404 { 405 register int elemc; 406 for (elemc = DCTSIZE; elemc > 0; elemc--) 407 *workspaceptr++ = (*elemptr++) - _CENTERJSAMPLE; 408 } 409 #endif 410 } 411 } 412 413 414 /* 415 * Quantize/descale the coefficients, and store into coef_blocks[]. 416 */ 417 418 METHODDEF(void) 419 quantize(JCOEFPTR coef_block, DCTELEM *divisors, DCTELEM *workspace) 420 { 421 int i; 422 DCTELEM temp; 423 JCOEFPTR output_ptr = coef_block; 424 425 #if BITS_IN_JSAMPLE == 8 426 427 UDCTELEM recip, corr; 428 int shift; 429 UDCTELEM2 product; 430 431 for (i = 0; i < DCTSIZE2; i++) { 432 temp = workspace[i]; 433 recip = divisors[i + DCTSIZE2 * 0]; 434 corr = divisors[i + DCTSIZE2 * 1]; 435 shift = divisors[i + DCTSIZE2 * 3]; 436 437 if (temp < 0) { 438 temp = -temp; 439 product = (UDCTELEM2)(temp + corr) * recip; 440 product >>= shift + sizeof(DCTELEM) * 8; 441 temp = (DCTELEM)product; 442 temp = -temp; 443 } else { 444 product = (UDCTELEM2)(temp + corr) * recip; 445 product >>= shift + sizeof(DCTELEM) * 8; 446 temp = (DCTELEM)product; 447 } 448 output_ptr[i] = (JCOEF)temp; 449 } 450 451 #else 452 453 register DCTELEM qval; 454 455 for (i = 0; i < DCTSIZE2; i++) { 456 qval = divisors[i]; 457 temp = workspace[i]; 458 /* Divide the coefficient value by qval, ensuring proper rounding. 459 * Since C does not specify the direction of rounding for negative 460 * quotients, we have to force the dividend positive for portability. 461 * 462 * In most files, at least half of the output values will be zero 463 * (at default quantization settings, more like three-quarters...) 464 * so we should ensure that this case is fast. On many machines, 465 * a comparison is enough cheaper than a divide to make a special test 466 * a win. Since both inputs will be nonnegative, we need only test 467 * for a < b to discover whether a/b is 0. 468 * If your machine's division is fast enough, define FAST_DIVIDE. 469 */ 470 #ifdef FAST_DIVIDE 471 #define DIVIDE_BY(a, b) a /= b 472 #else 473 #define DIVIDE_BY(a, b) if (a >= b) a /= b; else a = 0 474 #endif 475 if (temp < 0) { 476 temp = -temp; 477 temp += qval >> 1; /* for rounding */ 478 DIVIDE_BY(temp, qval); 479 temp = -temp; 480 } else { 481 temp += qval >> 1; /* for rounding */ 482 DIVIDE_BY(temp, qval); 483 } 484 output_ptr[i] = (JCOEF)temp; 485 } 486 487 #endif 488 489 } 490 491 492 /* 493 * Perform forward DCT on one or more blocks of a component. 494 * 495 * The input samples are taken from the sample_data[] array starting at 496 * position start_row/start_col, and moving to the right for any additional 497 * blocks. The quantized coefficients are returned in coef_blocks[]. 498 */ 499 500 METHODDEF(void) 501 forward_DCT(j_compress_ptr cinfo, jpeg_component_info *compptr, 502 _JSAMPARRAY sample_data, JBLOCKROW coef_blocks, 503 JDIMENSION start_row, JDIMENSION start_col, JDIMENSION num_blocks) 504 /* This version is used for integer DCT implementations. */ 505 { 506 /* This routine is heavily used, so it's worth coding it tightly. */ 507 my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct; 508 DCTELEM *divisors = fdct->divisors[compptr->quant_tbl_no]; 509 DCTELEM *workspace; 510 JDIMENSION bi; 511 512 /* Make sure the compiler doesn't look up these every pass */ 513 forward_DCT_method_ptr do_dct = fdct->dct; 514 convsamp_method_ptr do_convsamp = fdct->convsamp; 515 quantize_method_ptr do_quantize = fdct->quantize; 516 workspace = fdct->workspace; 517 518 sample_data += start_row; /* fold in the vertical offset once */ 519 520 for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { 521 /* Load data into workspace, applying unsigned->signed conversion */ 522 (*do_convsamp) (sample_data, start_col, workspace); 523 524 /* Perform the DCT */ 525 (*do_dct) (workspace); 526 527 /* Quantize/descale the coefficients, and store into coef_blocks[] */ 528 (*do_quantize) (coef_blocks[bi], divisors, workspace); 529 } 530 } 531 532 533 #ifdef DCT_FLOAT_SUPPORTED 534 535 METHODDEF(void) 536 convsamp_float(_JSAMPARRAY sample_data, JDIMENSION start_col, 537 FAST_FLOAT *workspace) 538 { 539 register FAST_FLOAT *workspaceptr; 540 register _JSAMPROW elemptr; 541 register int elemr; 542 543 workspaceptr = workspace; 544 for (elemr = 0; elemr < DCTSIZE; elemr++) { 545 elemptr = sample_data[elemr] + start_col; 546 #if DCTSIZE == 8 /* unroll the inner loop */ 547 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE); 548 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE); 549 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE); 550 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE); 551 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE); 552 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE); 553 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE); 554 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE); 555 #else 556 { 557 register int elemc; 558 for (elemc = DCTSIZE; elemc > 0; elemc--) 559 *workspaceptr++ = (FAST_FLOAT)((*elemptr++) - _CENTERJSAMPLE); 560 } 561 #endif 562 } 563 } 564 565 566 METHODDEF(void) 567 quantize_float(JCOEFPTR coef_block, FAST_FLOAT *divisors, 568 FAST_FLOAT *workspace) 569 { 570 register FAST_FLOAT temp; 571 register int i; 572 register JCOEFPTR output_ptr = coef_block; 573 574 for (i = 0; i < DCTSIZE2; i++) { 575 /* Apply the quantization and scaling factor */ 576 temp = workspace[i] * divisors[i]; 577 578 /* Round to nearest integer. 579 * Since C does not specify the direction of rounding for negative 580 * quotients, we have to force the dividend positive for portability. 581 * The maximum coefficient size is +-16K (for 12-bit data), so this 582 * code should work for either 16-bit or 32-bit ints. 583 */ 584 output_ptr[i] = (JCOEF)((int)(temp + (FAST_FLOAT)16384.5) - 16384); 585 } 586 } 587 588 589 METHODDEF(void) 590 forward_DCT_float(j_compress_ptr cinfo, jpeg_component_info *compptr, 591 _JSAMPARRAY sample_data, JBLOCKROW coef_blocks, 592 JDIMENSION start_row, JDIMENSION start_col, 593 JDIMENSION num_blocks) 594 /* This version is used for floating-point DCT implementations. */ 595 { 596 /* This routine is heavily used, so it's worth coding it tightly. */ 597 my_fdct_ptr fdct = (my_fdct_ptr)cinfo->fdct; 598 FAST_FLOAT *divisors = fdct->float_divisors[compptr->quant_tbl_no]; 599 FAST_FLOAT *workspace; 600 JDIMENSION bi; 601 602 603 /* Make sure the compiler doesn't look up these every pass */ 604 float_DCT_method_ptr do_dct = fdct->float_dct; 605 float_convsamp_method_ptr do_convsamp = fdct->float_convsamp; 606 float_quantize_method_ptr do_quantize = fdct->float_quantize; 607 workspace = fdct->float_workspace; 608 609 sample_data += start_row; /* fold in the vertical offset once */ 610 611 for (bi = 0; bi < num_blocks; bi++, start_col += DCTSIZE) { 612 /* Load data into workspace, applying unsigned->signed conversion */ 613 (*do_convsamp) (sample_data, start_col, workspace); 614 615 /* Perform the DCT */ 616 (*do_dct) (workspace); 617 618 /* Quantize/descale the coefficients, and store into coef_blocks[] */ 619 (*do_quantize) (coef_blocks[bi], divisors, workspace); 620 } 621 } 622 623 #endif /* DCT_FLOAT_SUPPORTED */ 624 625 626 /* 627 * Initialize FDCT manager. 628 */ 629 630 GLOBAL(void) 631 _jinit_forward_dct(j_compress_ptr cinfo) 632 { 633 my_fdct_ptr fdct; 634 int i; 635 636 if (cinfo->data_precision != BITS_IN_JSAMPLE) 637 ERREXIT1(cinfo, JERR_BAD_PRECISION, cinfo->data_precision); 638 639 fdct = (my_fdct_ptr) 640 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, 641 sizeof(my_fdct_controller)); 642 cinfo->fdct = (struct jpeg_forward_dct *)fdct; 643 fdct->pub.start_pass = start_pass_fdctmgr; 644 645 /* First determine the DCT... */ 646 switch (cinfo->dct_method) { 647 #ifdef DCT_ISLOW_SUPPORTED 648 case JDCT_ISLOW: 649 fdct->pub._forward_DCT = forward_DCT; 650 #ifdef WITH_SIMD 651 if (jsimd_can_fdct_islow()) 652 fdct->dct = jsimd_fdct_islow; 653 else 654 #endif 655 fdct->dct = _jpeg_fdct_islow; 656 break; 657 #endif 658 #ifdef DCT_IFAST_SUPPORTED 659 case JDCT_IFAST: 660 fdct->pub._forward_DCT = forward_DCT; 661 #ifdef WITH_SIMD 662 if (jsimd_can_fdct_ifast()) 663 fdct->dct = jsimd_fdct_ifast; 664 else 665 #endif 666 fdct->dct = _jpeg_fdct_ifast; 667 break; 668 #endif 669 #ifdef DCT_FLOAT_SUPPORTED 670 case JDCT_FLOAT: 671 fdct->pub._forward_DCT = forward_DCT_float; 672 #ifdef WITH_SIMD 673 if (jsimd_can_fdct_float()) 674 fdct->float_dct = jsimd_fdct_float; 675 else 676 #endif 677 fdct->float_dct = jpeg_fdct_float; 678 break; 679 #endif 680 default: 681 ERREXIT(cinfo, JERR_NOT_COMPILED); 682 break; 683 } 684 685 /* ...then the supporting stages. */ 686 switch (cinfo->dct_method) { 687 #ifdef DCT_ISLOW_SUPPORTED 688 case JDCT_ISLOW: 689 #endif 690 #ifdef DCT_IFAST_SUPPORTED 691 case JDCT_IFAST: 692 #endif 693 #if defined(DCT_ISLOW_SUPPORTED) || defined(DCT_IFAST_SUPPORTED) 694 #ifdef WITH_SIMD 695 if (jsimd_can_convsamp()) 696 fdct->convsamp = jsimd_convsamp; 697 else 698 #endif 699 fdct->convsamp = convsamp; 700 #ifdef WITH_SIMD 701 if (jsimd_can_quantize()) 702 fdct->quantize = jsimd_quantize; 703 else 704 #endif 705 fdct->quantize = quantize; 706 break; 707 #endif 708 #ifdef DCT_FLOAT_SUPPORTED 709 case JDCT_FLOAT: 710 #ifdef WITH_SIMD 711 if (jsimd_can_convsamp_float()) 712 fdct->float_convsamp = jsimd_convsamp_float; 713 else 714 #endif 715 fdct->float_convsamp = convsamp_float; 716 #ifdef WITH_SIMD 717 if (jsimd_can_quantize_float()) 718 fdct->float_quantize = jsimd_quantize_float; 719 else 720 #endif 721 fdct->float_quantize = quantize_float; 722 break; 723 #endif 724 default: 725 ERREXIT(cinfo, JERR_NOT_COMPILED); 726 break; 727 } 728 729 /* Allocate workspace memory */ 730 #ifdef DCT_FLOAT_SUPPORTED 731 if (cinfo->dct_method == JDCT_FLOAT) 732 fdct->float_workspace = (FAST_FLOAT *) 733 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, 734 sizeof(FAST_FLOAT) * DCTSIZE2); 735 else 736 #endif 737 fdct->workspace = (DCTELEM *) 738 (*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE, 739 sizeof(DCTELEM) * DCTSIZE2); 740 741 /* Mark divisor tables unallocated */ 742 for (i = 0; i < NUM_QUANT_TBLS; i++) { 743 fdct->divisors[i] = NULL; 744 #ifdef DCT_FLOAT_SUPPORTED 745 fdct->float_divisors[i] = NULL; 746 #endif 747 } 748 }