tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jidctint-neon.c (35406B)


      1 /*
      2 * jidctint-neon.c - accurate integer IDCT (Arm Neon)
      3 *
      4 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
      5 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
      6 *
      7 * This software is provided 'as-is', without any express or implied
      8 * warranty.  In no event will the authors be held liable for any damages
      9 * arising from the use of this software.
     10 *
     11 * Permission is granted to anyone to use this software for any purpose,
     12 * including commercial applications, and to alter it and redistribute it
     13 * freely, subject to the following restrictions:
     14 *
     15 * 1. The origin of this software must not be misrepresented; you must not
     16 *    claim that you wrote the original software. If you use this software
     17 *    in a product, an acknowledgment in the product documentation would be
     18 *    appreciated but is not required.
     19 * 2. Altered source versions must be plainly marked as such, and must not be
     20 *    misrepresented as being the original software.
     21 * 3. This notice may not be removed or altered from any source distribution.
     22 */
     23 
     24 #define JPEG_INTERNALS
     25 #include "../../jinclude.h"
     26 #include "../../jpeglib.h"
     27 #include "../../jsimd.h"
     28 #include "../../jdct.h"
     29 #include "../../jsimddct.h"
     30 #include "../jsimd.h"
     31 #include "align.h"
     32 #include "neon-compat.h"
     33 
     34 #include <arm_neon.h>
     35 
     36 
     37 #define CONST_BITS  13
     38 #define PASS1_BITS  2
     39 
     40 #define DESCALE_P1  (CONST_BITS - PASS1_BITS)
     41 #define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
     42 
     43 /* The computation of the inverse DCT requires the use of constants known at
     44 * compile time.  Scaled integer constants are used to avoid floating-point
     45 * arithmetic:
     46 *    0.298631336 =  2446 * 2^-13
     47 *    0.390180644 =  3196 * 2^-13
     48 *    0.541196100 =  4433 * 2^-13
     49 *    0.765366865 =  6270 * 2^-13
     50 *    0.899976223 =  7373 * 2^-13
     51 *    1.175875602 =  9633 * 2^-13
     52 *    1.501321110 = 12299 * 2^-13
     53 *    1.847759065 = 15137 * 2^-13
     54 *    1.961570560 = 16069 * 2^-13
     55 *    2.053119869 = 16819 * 2^-13
     56 *    2.562915447 = 20995 * 2^-13
     57 *    3.072711026 = 25172 * 2^-13
     58 */
     59 
     60 #define F_0_298  2446
     61 #define F_0_390  3196
     62 #define F_0_541  4433
     63 #define F_0_765  6270
     64 #define F_0_899  7373
     65 #define F_1_175  9633
     66 #define F_1_501  12299
     67 #define F_1_847  15137
     68 #define F_1_961  16069
     69 #define F_2_053  16819
     70 #define F_2_562  20995
     71 #define F_3_072  25172
     72 
     73 #define F_1_175_MINUS_1_961  (F_1_175 - F_1_961)
     74 #define F_1_175_MINUS_0_390  (F_1_175 - F_0_390)
     75 #define F_0_541_MINUS_1_847  (F_0_541 - F_1_847)
     76 #define F_3_072_MINUS_2_562  (F_3_072 - F_2_562)
     77 #define F_0_298_MINUS_0_899  (F_0_298 - F_0_899)
     78 #define F_1_501_MINUS_0_899  (F_1_501 - F_0_899)
     79 #define F_2_053_MINUS_2_562  (F_2_053 - F_2_562)
     80 #define F_0_541_PLUS_0_765   (F_0_541 + F_0_765)
     81 
     82 
     83 ALIGN(16) static const int16_t jsimd_idct_islow_neon_consts[] = {
     84  F_0_899,             F_0_541,
     85  F_2_562,             F_0_298_MINUS_0_899,
     86  F_1_501_MINUS_0_899, F_2_053_MINUS_2_562,
     87  F_0_541_PLUS_0_765,  F_1_175,
     88  F_1_175_MINUS_0_390, F_0_541_MINUS_1_847,
     89  F_3_072_MINUS_2_562, F_1_175_MINUS_1_961,
     90  0, 0, 0, 0
     91 };
     92 
     93 
     94 /* Forward declaration of regular and sparse IDCT helper functions */
     95 
     96 static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
     97                                                  int16x4_t row1,
     98                                                  int16x4_t row2,
     99                                                  int16x4_t row3,
    100                                                  int16x4_t row4,
    101                                                  int16x4_t row5,
    102                                                  int16x4_t row6,
    103                                                  int16x4_t row7,
    104                                                  int16x4_t quant_row0,
    105                                                  int16x4_t quant_row1,
    106                                                  int16x4_t quant_row2,
    107                                                  int16x4_t quant_row3,
    108                                                  int16x4_t quant_row4,
    109                                                  int16x4_t quant_row5,
    110                                                  int16x4_t quant_row6,
    111                                                  int16x4_t quant_row7,
    112                                                  int16_t *workspace_1,
    113                                                  int16_t *workspace_2);
    114 
    115 static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
    116                                                 int16x4_t row1,
    117                                                 int16x4_t row2,
    118                                                 int16x4_t row3,
    119                                                 int16x4_t quant_row0,
    120                                                 int16x4_t quant_row1,
    121                                                 int16x4_t quant_row2,
    122                                                 int16x4_t quant_row3,
    123                                                 int16_t *workspace_1,
    124                                                 int16_t *workspace_2);
    125 
    126 static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
    127                                                  JSAMPARRAY output_buf,
    128                                                  JDIMENSION output_col,
    129                                                  unsigned buf_offset);
    130 
    131 static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
    132                                                 JSAMPARRAY output_buf,
    133                                                 JDIMENSION output_col,
    134                                                 unsigned buf_offset);
    135 
    136 
    137 /* Perform dequantization and inverse DCT on one block of coefficients.  For
    138 * reference, the C implementation (jpeg_idct_slow()) can be found in
    139 * jidctint.c.
    140 *
    141 * Optimization techniques used for fast data access:
    142 *
    143 * In each pass, the inverse DCT is computed for the left and right 4x8 halves
    144 * of the DCT block.  This avoids spilling due to register pressure, and the
    145 * increased granularity allows for an optimized calculation depending on the
    146 * values of the DCT coefficients.  Between passes, intermediate data is stored
    147 * in 4x8 workspace buffers.
    148 *
    149 * Transposing the 8x8 DCT block after each pass can be achieved by transposing
    150 * each of the four 4x4 quadrants and swapping quadrants 1 and 2 (refer to the
    151 * diagram below.)  Swapping quadrants is cheap, since the second pass can just
    152 * swap the workspace buffer pointers.
    153 *
    154 *      +-------+-------+                   +-------+-------+
    155 *      |       |       |                   |       |       |
    156 *      |   0   |   1   |                   |   0   |   2   |
    157 *      |       |       |    transpose      |       |       |
    158 *      +-------+-------+     ------>       +-------+-------+
    159 *      |       |       |                   |       |       |
    160 *      |   2   |   3   |                   |   1   |   3   |
    161 *      |       |       |                   |       |       |
    162 *      +-------+-------+                   +-------+-------+
    163 *
    164 * Optimization techniques used to accelerate the inverse DCT calculation:
    165 *
    166 * In a DCT coefficient block, the coefficients are increasingly likely to be 0
    167 * as you move diagonally from top left to bottom right.  If whole rows of
    168 * coefficients are 0, then the inverse DCT calculation can be simplified.  On
    169 * the first pass of the inverse DCT, we test for three special cases before
    170 * defaulting to a full "regular" inverse DCT:
    171 *
    172 * 1) Coefficients in rows 4-7 are all zero.  In this case, we perform a
    173 *    "sparse" simplified inverse DCT on rows 0-3.
    174 * 2) AC coefficients (rows 1-7) are all zero.  In this case, the inverse DCT
    175 *    result is equal to the dequantized DC coefficients.
    176 * 3) AC and DC coefficients are all zero.  In this case, the inverse DCT
    177 *    result is all zero.  For the left 4x8 half, this is handled identically
    178 *    to Case 2 above.  For the right 4x8 half, we do no work and signal that
    179 *    the "sparse" algorithm is required for the second pass.
    180 *
    181 * In the second pass, only a single special case is tested: whether the AC and
    182 * DC coefficients were all zero in the right 4x8 block during the first pass
    183 * (refer to Case 3 above.)  If this is the case, then a "sparse" variant of
    184 * the second pass is performed for both the left and right halves of the DCT
    185 * block.  (The transposition after the first pass means that the right 4x8
    186 * block during the first pass becomes rows 4-7 during the second pass.)
    187 */
    188 
    189 void jsimd_idct_islow_neon(void *dct_table, JCOEFPTR coef_block,
    190                           JSAMPARRAY output_buf, JDIMENSION output_col)
    191 {
    192  ISLOW_MULT_TYPE *quantptr = dct_table;
    193 
    194  int16_t workspace_l[8 * DCTSIZE / 2];
    195  int16_t workspace_r[8 * DCTSIZE / 2];
    196 
    197  /* Compute IDCT first pass on left 4x8 coefficient block. */
    198 
    199  /* Load DCT coefficients in left 4x8 block. */
    200  int16x4_t row0 = vld1_s16(coef_block + 0 * DCTSIZE);
    201  int16x4_t row1 = vld1_s16(coef_block + 1 * DCTSIZE);
    202  int16x4_t row2 = vld1_s16(coef_block + 2 * DCTSIZE);
    203  int16x4_t row3 = vld1_s16(coef_block + 3 * DCTSIZE);
    204  int16x4_t row4 = vld1_s16(coef_block + 4 * DCTSIZE);
    205  int16x4_t row5 = vld1_s16(coef_block + 5 * DCTSIZE);
    206  int16x4_t row6 = vld1_s16(coef_block + 6 * DCTSIZE);
    207  int16x4_t row7 = vld1_s16(coef_block + 7 * DCTSIZE);
    208 
    209  /* Load quantization table for left 4x8 block. */
    210  int16x4_t quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE);
    211  int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
    212  int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
    213  int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
    214  int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE);
    215  int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
    216  int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
    217  int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);
    218 
    219  /* Construct bitmap to test if DCT coefficients in left 4x8 block are 0. */
    220  int16x4_t bitmap = vorr_s16(row7, row6);
    221  bitmap = vorr_s16(bitmap, row5);
    222  bitmap = vorr_s16(bitmap, row4);
    223  int64_t bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
    224 
    225  if (bitmap_rows_4567 == 0) {
    226    bitmap = vorr_s16(bitmap, row3);
    227    bitmap = vorr_s16(bitmap, row2);
    228    bitmap = vorr_s16(bitmap, row1);
    229    int64_t left_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
    230 
    231    if (left_ac_bitmap == 0) {
    232      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
    233      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
    234      /* Store 4x4 blocks to workspace, transposing in the process. */
    235      vst4_s16(workspace_l, quadrant);
    236      vst4_s16(workspace_r, quadrant);
    237    } else {
    238      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
    239                                    quant_row1, quant_row2, quant_row3,
    240                                    workspace_l, workspace_r);
    241    }
    242  } else {
    243    jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
    244                                   row6, row7, quant_row0, quant_row1,
    245                                   quant_row2, quant_row3, quant_row4,
    246                                   quant_row5, quant_row6, quant_row7,
    247                                   workspace_l, workspace_r);
    248  }
    249 
    250  /* Compute IDCT first pass on right 4x8 coefficient block. */
    251 
    252  /* Load DCT coefficients in right 4x8 block. */
    253  row0 = vld1_s16(coef_block + 0 * DCTSIZE + 4);
    254  row1 = vld1_s16(coef_block + 1 * DCTSIZE + 4);
    255  row2 = vld1_s16(coef_block + 2 * DCTSIZE + 4);
    256  row3 = vld1_s16(coef_block + 3 * DCTSIZE + 4);
    257  row4 = vld1_s16(coef_block + 4 * DCTSIZE + 4);
    258  row5 = vld1_s16(coef_block + 5 * DCTSIZE + 4);
    259  row6 = vld1_s16(coef_block + 6 * DCTSIZE + 4);
    260  row7 = vld1_s16(coef_block + 7 * DCTSIZE + 4);
    261 
    262  /* Load quantization table for right 4x8 block. */
    263  quant_row0 = vld1_s16(quantptr + 0 * DCTSIZE + 4);
    264  quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
    265  quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
    266  quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
    267  quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4);
    268  quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
    269  quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
    270  quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);
    271 
    272  /* Construct bitmap to test if DCT coefficients in right 4x8 block are 0. */
    273  bitmap = vorr_s16(row7, row6);
    274  bitmap = vorr_s16(bitmap, row5);
    275  bitmap = vorr_s16(bitmap, row4);
    276  bitmap_rows_4567 = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
    277  bitmap = vorr_s16(bitmap, row3);
    278  bitmap = vorr_s16(bitmap, row2);
    279  bitmap = vorr_s16(bitmap, row1);
    280  int64_t right_ac_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
    281 
    282  /* If this remains non-zero, a "regular" second pass will be performed. */
    283  int64_t right_ac_dc_bitmap = 1;
    284 
    285  if (right_ac_bitmap == 0) {
    286    bitmap = vorr_s16(bitmap, row0);
    287    right_ac_dc_bitmap = vget_lane_s64(vreinterpret_s64_s16(bitmap), 0);
    288 
    289    if (right_ac_dc_bitmap != 0) {
    290      int16x4_t dcval = vshl_n_s16(vmul_s16(row0, quant_row0), PASS1_BITS);
    291      int16x4x4_t quadrant = { { dcval, dcval, dcval, dcval } };
    292      /* Store 4x4 blocks to workspace, transposing in the process. */
    293      vst4_s16(workspace_l + 4 * DCTSIZE / 2, quadrant);
    294      vst4_s16(workspace_r + 4 * DCTSIZE / 2, quadrant);
    295    }
    296  } else {
    297    if (bitmap_rows_4567 == 0) {
    298      jsimd_idct_islow_pass1_sparse(row0, row1, row2, row3, quant_row0,
    299                                    quant_row1, quant_row2, quant_row3,
    300                                    workspace_l + 4 * DCTSIZE / 2,
    301                                    workspace_r + 4 * DCTSIZE / 2);
    302    } else {
    303      jsimd_idct_islow_pass1_regular(row0, row1, row2, row3, row4, row5,
    304                                     row6, row7, quant_row0, quant_row1,
    305                                     quant_row2, quant_row3, quant_row4,
    306                                     quant_row5, quant_row6, quant_row7,
    307                                     workspace_l + 4 * DCTSIZE / 2,
    308                                     workspace_r + 4 * DCTSIZE / 2);
    309    }
    310  }
    311 
    312  /* Second pass: compute IDCT on rows in workspace. */
    313 
    314  /* If all coefficients in right 4x8 block are 0, use "sparse" second pass. */
    315  if (right_ac_dc_bitmap == 0) {
    316    jsimd_idct_islow_pass2_sparse(workspace_l, output_buf, output_col, 0);
    317    jsimd_idct_islow_pass2_sparse(workspace_r, output_buf, output_col, 4);
    318  } else {
    319    jsimd_idct_islow_pass2_regular(workspace_l, output_buf, output_col, 0);
    320    jsimd_idct_islow_pass2_regular(workspace_r, output_buf, output_col, 4);
    321  }
    322 }
    323 
    324 
    325 /* Perform dequantization and the first pass of the accurate inverse DCT on a
    326 * 4x8 block of coefficients.  (To process the full 8x8 DCT block, this
    327 * function-- or some other optimized variant-- needs to be called for both the
    328 * left and right 4x8 blocks.)
    329 *
    330 * This "regular" version assumes that no optimization can be made to the IDCT
    331 * calculation, since no useful set of AC coefficients is all 0.
    332 *
    333 * The original C implementation of the accurate IDCT (jpeg_idct_slow()) can be
    334 * found in jidctint.c.  Algorithmic changes made here are documented inline.
    335 */
    336 
    337 static INLINE void jsimd_idct_islow_pass1_regular(int16x4_t row0,
    338                                                  int16x4_t row1,
    339                                                  int16x4_t row2,
    340                                                  int16x4_t row3,
    341                                                  int16x4_t row4,
    342                                                  int16x4_t row5,
    343                                                  int16x4_t row6,
    344                                                  int16x4_t row7,
    345                                                  int16x4_t quant_row0,
    346                                                  int16x4_t quant_row1,
    347                                                  int16x4_t quant_row2,
    348                                                  int16x4_t quant_row3,
    349                                                  int16x4_t quant_row4,
    350                                                  int16x4_t quant_row5,
    351                                                  int16x4_t quant_row6,
    352                                                  int16x4_t quant_row7,
    353                                                  int16_t *workspace_1,
    354                                                  int16_t *workspace_2)
    355 {
    356  /* Load constants for IDCT computation. */
    357 #ifdef HAVE_VLD1_S16_X3
    358  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
    359 #else
    360  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
    361  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
    362  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
    363  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
    364 #endif
    365 
    366  /* Even part */
    367  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
    368  int16x4_t z3_s16 = vmul_s16(row6, quant_row6);
    369 
    370  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
    371  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
    372  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
    373  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
    374 
    375  z2_s16 = vmul_s16(row0, quant_row0);
    376  z3_s16 = vmul_s16(row4, quant_row4);
    377 
    378  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
    379  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
    380 
    381  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
    382  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
    383  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
    384  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
    385 
    386  /* Odd part */
    387  int16x4_t tmp0_s16 = vmul_s16(row7, quant_row7);
    388  int16x4_t tmp1_s16 = vmul_s16(row5, quant_row5);
    389  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
    390  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
    391 
    392  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
    393  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
    394 
    395  /* Implementation as per jpeg_idct_islow() in jidctint.c:
    396   *   z5 = (z3 + z4) * 1.175875602;
    397   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    398   *   z3 += z5;  z4 += z5;
    399   *
    400   * This implementation:
    401   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    402   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
    403   */
    404 
    405  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
    406  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
    407  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
    408  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
    409 
    410  /* Implementation as per jpeg_idct_islow() in jidctint.c:
    411   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
    412   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
    413   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
    414   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    415   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
    416   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
    417   *
    418   * This implementation:
    419   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
    420   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
    421   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
    422   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
    423   *   tmp0 += z3;  tmp1 += z4;
    424   *   tmp2 += z3;  tmp3 += z4;
    425   */
    426 
    427  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
    428  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
    429  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
    430  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
    431 
    432  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
    433  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
    434  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
    435  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
    436 
    437  tmp0 = vaddq_s32(tmp0, z3);
    438  tmp1 = vaddq_s32(tmp1, z4);
    439  tmp2 = vaddq_s32(tmp2, z3);
    440  tmp3 = vaddq_s32(tmp3, z4);
    441 
    442  /* Final output stage: descale and narrow to 16-bit. */
    443  int16x4x4_t rows_0123 = { {
    444    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
    445    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
    446    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
    447    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
    448  } };
    449  int16x4x4_t rows_4567 = { {
    450    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
    451    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
    452    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
    453    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
    454  } };
    455 
    456  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
    457   * (VST4 transposes the blocks.  We need to operate on rows in the next
    458   * pass.)
    459   */
    460  vst4_s16(workspace_1, rows_0123);
    461  vst4_s16(workspace_2, rows_4567);
    462 }
    463 
    464 
    465 /* Perform dequantization and the first pass of the accurate inverse DCT on a
    466 * 4x8 block of coefficients.
    467 *
    468 * This "sparse" version assumes that the AC coefficients in rows 4-7 are all
    469 * 0.  This simplifies the IDCT calculation, accelerating overall performance.
    470 */
    471 
    472 static INLINE void jsimd_idct_islow_pass1_sparse(int16x4_t row0,
    473                                                 int16x4_t row1,
    474                                                 int16x4_t row2,
    475                                                 int16x4_t row3,
    476                                                 int16x4_t quant_row0,
    477                                                 int16x4_t quant_row1,
    478                                                 int16x4_t quant_row2,
    479                                                 int16x4_t quant_row3,
    480                                                 int16_t *workspace_1,
    481                                                 int16_t *workspace_2)
    482 {
    483  /* Load constants for IDCT computation. */
    484 #ifdef HAVE_VLD1_S16_X3
    485  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
    486 #else
    487  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
    488  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
    489  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
    490  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
    491 #endif
    492 
    493  /* Even part (z3 is all 0) */
    494  int16x4_t z2_s16 = vmul_s16(row2, quant_row2);
    495 
    496  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
    497  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
    498 
    499  z2_s16 = vmul_s16(row0, quant_row0);
    500  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
    501  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
    502 
    503  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
    504  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
    505  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
    506  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
    507 
    508  /* Odd part (tmp0 and tmp1 are both all 0) */
    509  int16x4_t tmp2_s16 = vmul_s16(row3, quant_row3);
    510  int16x4_t tmp3_s16 = vmul_s16(row1, quant_row1);
    511 
    512  int16x4_t z3_s16 = tmp2_s16;
    513  int16x4_t z4_s16 = tmp3_s16;
    514 
    515  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
    516  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
    517  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
    518  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
    519 
    520  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
    521  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
    522  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
    523  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
    524 
    525  /* Final output stage: descale and narrow to 16-bit. */
    526  int16x4x4_t rows_0123 = { {
    527    vrshrn_n_s32(vaddq_s32(tmp10, tmp3), DESCALE_P1),
    528    vrshrn_n_s32(vaddq_s32(tmp11, tmp2), DESCALE_P1),
    529    vrshrn_n_s32(vaddq_s32(tmp12, tmp1), DESCALE_P1),
    530    vrshrn_n_s32(vaddq_s32(tmp13, tmp0), DESCALE_P1)
    531  } };
    532  int16x4x4_t rows_4567 = { {
    533    vrshrn_n_s32(vsubq_s32(tmp13, tmp0), DESCALE_P1),
    534    vrshrn_n_s32(vsubq_s32(tmp12, tmp1), DESCALE_P1),
    535    vrshrn_n_s32(vsubq_s32(tmp11, tmp2), DESCALE_P1),
    536    vrshrn_n_s32(vsubq_s32(tmp10, tmp3), DESCALE_P1)
    537  } };
    538 
    539  /* Store 4x4 blocks to the intermediate workspace, ready for the second pass.
    540   * (VST4 transposes the blocks.  We need to operate on rows in the next
    541   * pass.)
    542   */
    543  vst4_s16(workspace_1, rows_0123);
    544  vst4_s16(workspace_2, rows_4567);
    545 }
    546 
    547 
    548 /* Perform the second pass of the accurate inverse DCT on a 4x8 block of
    549 * coefficients.  (To process the full 8x8 DCT block, this function-- or some
    550 * other optimized variant-- needs to be called for both the right and left 4x8
    551 * blocks.)
    552 *
    553 * This "regular" version assumes that no optimization can be made to the IDCT
    554 * calculation, since no useful set of coefficient values are all 0 after the
    555 * first pass.
    556 *
    557 * Again, the original C implementation of the accurate IDCT (jpeg_idct_slow())
    558 * can be found in jidctint.c.  Algorithmic changes made here are documented
    559 * inline.
    560 */
    561 
    562 static INLINE void jsimd_idct_islow_pass2_regular(int16_t *workspace,
    563                                                  JSAMPARRAY output_buf,
    564                                                  JDIMENSION output_col,
    565                                                  unsigned buf_offset)
    566 {
    567  /* Load constants for IDCT computation. */
    568 #ifdef HAVE_VLD1_S16_X3
    569  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
    570 #else
    571  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
    572  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
    573  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
    574  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
    575 #endif
    576 
    577  /* Even part */
    578  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
    579  int16x4_t z3_s16 = vld1_s16(workspace + 6 * DCTSIZE / 2);
    580 
    581  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
    582  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
    583  tmp2 = vmlal_lane_s16(tmp2, z3_s16, consts.val[2], 1);
    584  tmp3 = vmlal_lane_s16(tmp3, z3_s16, consts.val[0], 1);
    585 
    586  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
    587  z3_s16 = vld1_s16(workspace + 4 * DCTSIZE / 2);
    588 
    589  int32x4_t tmp0 = vshll_n_s16(vadd_s16(z2_s16, z3_s16), CONST_BITS);
    590  int32x4_t tmp1 = vshll_n_s16(vsub_s16(z2_s16, z3_s16), CONST_BITS);
    591 
    592  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
    593  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
    594  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
    595  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
    596 
    597  /* Odd part */
    598  int16x4_t tmp0_s16 = vld1_s16(workspace + 7 * DCTSIZE / 2);
    599  int16x4_t tmp1_s16 = vld1_s16(workspace + 5 * DCTSIZE / 2);
    600  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
    601  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
    602 
    603  z3_s16 = vadd_s16(tmp0_s16, tmp2_s16);
    604  int16x4_t z4_s16 = vadd_s16(tmp1_s16, tmp3_s16);
    605 
    606  /* Implementation as per jpeg_idct_islow() in jidctint.c:
    607   *   z5 = (z3 + z4) * 1.175875602;
    608   *   z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    609   *   z3 += z5;  z4 += z5;
    610   *
    611   * This implementation:
    612   *   z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    613   *   z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
    614   */
    615 
    616  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
    617  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
    618  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
    619  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
    620 
    621  /* Implementation as per jpeg_idct_islow() in jidctint.c:
    622   *   z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
    623   *   tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
    624   *   tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
    625   *   z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    626   *   tmp0 += z1 + z3;  tmp1 += z2 + z4;
    627   *   tmp2 += z2 + z3;  tmp3 += z1 + z4;
    628   *
    629   * This implementation:
    630   *   tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
    631   *   tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
    632   *   tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
    633   *   tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
    634   *   tmp0 += z3;  tmp1 += z4;
    635   *   tmp2 += z3;  tmp3 += z4;
    636   */
    637 
    638  tmp0 = vmull_lane_s16(tmp0_s16, consts.val[0], 3);
    639  tmp1 = vmull_lane_s16(tmp1_s16, consts.val[1], 1);
    640  tmp2 = vmull_lane_s16(tmp2_s16, consts.val[2], 2);
    641  tmp3 = vmull_lane_s16(tmp3_s16, consts.val[1], 0);
    642 
    643  tmp0 = vmlsl_lane_s16(tmp0, tmp3_s16, consts.val[0], 0);
    644  tmp1 = vmlsl_lane_s16(tmp1, tmp2_s16, consts.val[0], 2);
    645  tmp2 = vmlsl_lane_s16(tmp2, tmp1_s16, consts.val[0], 2);
    646  tmp3 = vmlsl_lane_s16(tmp3, tmp0_s16, consts.val[0], 0);
    647 
    648  tmp0 = vaddq_s32(tmp0, z3);
    649  tmp1 = vaddq_s32(tmp1, z4);
    650  tmp2 = vaddq_s32(tmp2, z3);
    651  tmp3 = vaddq_s32(tmp3, z4);
    652 
    653  /* Final output stage: descale and narrow to 16-bit. */
    654  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
    655                                       vaddhn_s32(tmp12, tmp1));
    656  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
    657                                       vaddhn_s32(tmp13, tmp0));
    658  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
    659                                       vsubhn_s32(tmp11, tmp2));
    660  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
    661                                       vsubhn_s32(tmp10, tmp3));
    662  /* Descale and narrow to 8-bit. */
    663  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
    664  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
    665  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
    666  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
    667  /* Clamp to range [0-255]. */
    668  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
    669                                 vdup_n_u8(CENTERJSAMPLE));
    670  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
    671                                 vdup_n_u8(CENTERJSAMPLE));
    672  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
    673                                 vdup_n_u8(CENTERJSAMPLE));
    674  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
    675                                 vdup_n_u8(CENTERJSAMPLE));
    676 
    677  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
    678   * together allows us to store 16-bit elements.)
    679   */
    680  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
    681  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
    682  uint16x4x4_t cols_01_23_45_67 = { {
    683    vreinterpret_u16_u8(cols_01_23.val[0]),
    684    vreinterpret_u16_u8(cols_01_23.val[1]),
    685    vreinterpret_u16_u8(cols_45_67.val[0]),
    686    vreinterpret_u16_u8(cols_45_67.val[1])
    687  } };
    688 
    689  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
    690  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
    691  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
    692  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
    693  /* VST4 of 16-bit elements completes the transpose. */
    694  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
    695  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
    696  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
    697  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
    698 }
    699 
    700 
    701 /* Performs the second pass of the accurate inverse DCT on a 4x8 block
    702 * of coefficients.
    703 *
    704 * This "sparse" version assumes that the coefficient values (after the first
    705 * pass) in rows 4-7 are all 0.  This simplifies the IDCT calculation,
    706 * accelerating overall performance.
    707 */
    708 
    709 static INLINE void jsimd_idct_islow_pass2_sparse(int16_t *workspace,
    710                                                 JSAMPARRAY output_buf,
    711                                                 JDIMENSION output_col,
    712                                                 unsigned buf_offset)
    713 {
    714  /* Load constants for IDCT computation. */
    715 #ifdef HAVE_VLD1_S16_X3
    716  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_islow_neon_consts);
    717 #else
    718  const int16x4_t consts1 = vld1_s16(jsimd_idct_islow_neon_consts);
    719  const int16x4_t consts2 = vld1_s16(jsimd_idct_islow_neon_consts + 4);
    720  const int16x4_t consts3 = vld1_s16(jsimd_idct_islow_neon_consts + 8);
    721  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
    722 #endif
    723 
    724  /* Even part (z3 is all 0) */
    725  int16x4_t z2_s16 = vld1_s16(workspace + 2 * DCTSIZE / 2);
    726 
    727  int32x4_t tmp2 = vmull_lane_s16(z2_s16, consts.val[0], 1);
    728  int32x4_t tmp3 = vmull_lane_s16(z2_s16, consts.val[1], 2);
    729 
    730  z2_s16 = vld1_s16(workspace + 0 * DCTSIZE / 2);
    731  int32x4_t tmp0 = vshll_n_s16(z2_s16, CONST_BITS);
    732  int32x4_t tmp1 = vshll_n_s16(z2_s16, CONST_BITS);
    733 
    734  int32x4_t tmp10 = vaddq_s32(tmp0, tmp3);
    735  int32x4_t tmp13 = vsubq_s32(tmp0, tmp3);
    736  int32x4_t tmp11 = vaddq_s32(tmp1, tmp2);
    737  int32x4_t tmp12 = vsubq_s32(tmp1, tmp2);
    738 
    739  /* Odd part (tmp0 and tmp1 are both all 0) */
    740  int16x4_t tmp2_s16 = vld1_s16(workspace + 3 * DCTSIZE / 2);
    741  int16x4_t tmp3_s16 = vld1_s16(workspace + 1 * DCTSIZE / 2);
    742 
    743  int16x4_t z3_s16 = tmp2_s16;
    744  int16x4_t z4_s16 = tmp3_s16;
    745 
    746  int32x4_t z3 = vmull_lane_s16(z3_s16, consts.val[2], 3);
    747  z3 = vmlal_lane_s16(z3, z4_s16, consts.val[1], 3);
    748  int32x4_t z4 = vmull_lane_s16(z3_s16, consts.val[1], 3);
    749  z4 = vmlal_lane_s16(z4, z4_s16, consts.val[2], 0);
    750 
    751  tmp0 = vmlsl_lane_s16(z3, tmp3_s16, consts.val[0], 0);
    752  tmp1 = vmlsl_lane_s16(z4, tmp2_s16, consts.val[0], 2);
    753  tmp2 = vmlal_lane_s16(z3, tmp2_s16, consts.val[2], 2);
    754  tmp3 = vmlal_lane_s16(z4, tmp3_s16, consts.val[1], 0);
    755 
    756  /* Final output stage: descale and narrow to 16-bit. */
    757  int16x8_t cols_02_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp3),
    758                                       vaddhn_s32(tmp12, tmp1));
    759  int16x8_t cols_13_s16 = vcombine_s16(vaddhn_s32(tmp11, tmp2),
    760                                       vaddhn_s32(tmp13, tmp0));
    761  int16x8_t cols_46_s16 = vcombine_s16(vsubhn_s32(tmp13, tmp0),
    762                                       vsubhn_s32(tmp11, tmp2));
    763  int16x8_t cols_57_s16 = vcombine_s16(vsubhn_s32(tmp12, tmp1),
    764                                       vsubhn_s32(tmp10, tmp3));
    765  /* Descale and narrow to 8-bit. */
    766  int8x8_t cols_02_s8 = vqrshrn_n_s16(cols_02_s16, DESCALE_P2 - 16);
    767  int8x8_t cols_13_s8 = vqrshrn_n_s16(cols_13_s16, DESCALE_P2 - 16);
    768  int8x8_t cols_46_s8 = vqrshrn_n_s16(cols_46_s16, DESCALE_P2 - 16);
    769  int8x8_t cols_57_s8 = vqrshrn_n_s16(cols_57_s16, DESCALE_P2 - 16);
    770  /* Clamp to range [0-255]. */
    771  uint8x8_t cols_02_u8 = vadd_u8(vreinterpret_u8_s8(cols_02_s8),
    772                                 vdup_n_u8(CENTERJSAMPLE));
    773  uint8x8_t cols_13_u8 = vadd_u8(vreinterpret_u8_s8(cols_13_s8),
    774                                 vdup_n_u8(CENTERJSAMPLE));
    775  uint8x8_t cols_46_u8 = vadd_u8(vreinterpret_u8_s8(cols_46_s8),
    776                                 vdup_n_u8(CENTERJSAMPLE));
    777  uint8x8_t cols_57_u8 = vadd_u8(vreinterpret_u8_s8(cols_57_s8),
    778                                 vdup_n_u8(CENTERJSAMPLE));
    779 
    780  /* Transpose 4x8 block and store to memory.  (Zipping adjacent columns
    781   * together allows us to store 16-bit elements.)
    782   */
    783  uint8x8x2_t cols_01_23 = vzip_u8(cols_02_u8, cols_13_u8);
    784  uint8x8x2_t cols_45_67 = vzip_u8(cols_46_u8, cols_57_u8);
    785  uint16x4x4_t cols_01_23_45_67 = { {
    786    vreinterpret_u16_u8(cols_01_23.val[0]),
    787    vreinterpret_u16_u8(cols_01_23.val[1]),
    788    vreinterpret_u16_u8(cols_45_67.val[0]),
    789    vreinterpret_u16_u8(cols_45_67.val[1])
    790  } };
    791 
    792  JSAMPROW outptr0 = output_buf[buf_offset + 0] + output_col;
    793  JSAMPROW outptr1 = output_buf[buf_offset + 1] + output_col;
    794  JSAMPROW outptr2 = output_buf[buf_offset + 2] + output_col;
    795  JSAMPROW outptr3 = output_buf[buf_offset + 3] + output_col;
    796  /* VST4 of 16-bit elements completes the transpose. */
    797  vst4_lane_u16((uint16_t *)outptr0, cols_01_23_45_67, 0);
    798  vst4_lane_u16((uint16_t *)outptr1, cols_01_23_45_67, 1);
    799  vst4_lane_u16((uint16_t *)outptr2, cols_01_23_45_67, 2);
    800  vst4_lane_u16((uint16_t *)outptr3, cols_01_23_45_67, 3);
    801 }