tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jidctred-neon.c (21455B)


      1 /*
      2 * jidctred-neon.c - reduced-size IDCT (Arm Neon)
      3 *
      4 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
      5 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
      6 *
      7 * This software is provided 'as-is', without any express or implied
      8 * warranty.  In no event will the authors be held liable for any damages
      9 * arising from the use of this software.
     10 *
     11 * Permission is granted to anyone to use this software for any purpose,
     12 * including commercial applications, and to alter it and redistribute it
     13 * freely, subject to the following restrictions:
     14 *
     15 * 1. The origin of this software must not be misrepresented; you must not
     16 *    claim that you wrote the original software. If you use this software
     17 *    in a product, an acknowledgment in the product documentation would be
     18 *    appreciated but is not required.
     19 * 2. Altered source versions must be plainly marked as such, and must not be
     20 *    misrepresented as being the original software.
     21 * 3. This notice may not be removed or altered from any source distribution.
     22 */
     23 
     24 #define JPEG_INTERNALS
     25 #include "../../jinclude.h"
     26 #include "../../jpeglib.h"
     27 #include "../../jsimd.h"
     28 #include "../../jdct.h"
     29 #include "../../jsimddct.h"
     30 #include "../jsimd.h"
     31 #include "align.h"
     32 #include "neon-compat.h"
     33 
     34 #include <arm_neon.h>
     35 
     36 
#define CONST_BITS  13
#define PASS1_BITS  2

/* 13-bit (CONST_BITS) fixed-point multipliers: each F_a_bcd represents the
 * value a.bcd... scaled by 2^13.  The exact decimal values represented are
 * listed in the comment blocks above jsimd_idct_2x2_neon() and
 * jsimd_idct_4x4_neon() below.
 */
#define F_0_211  1730
#define F_0_509  4176
#define F_0_601  4926
#define F_0_720  5906
#define F_0_765  6270
#define F_0_850  6967
#define F_0_899  7373
#define F_1_061  8697
#define F_1_272  10426
#define F_1_451  11893
#define F_1_847  15137
#define F_2_172  17799
#define F_2_562  20995
#define F_3_624  29692
     55 
     56 /* jsimd_idct_2x2_neon() is an inverse DCT function that produces reduced-size
     57 * 2x2 output from an 8x8 DCT block.  It uses the same calculations and
     58 * produces exactly the same output as IJG's original jpeg_idct_2x2() function
     59 * from jpeg-6b, which can be found in jidctred.c.
     60 *
     61 * Scaled integer constants are used to avoid floating-point arithmetic:
     62 *    0.720959822 =  5906 * 2^-13
     63 *    0.850430095 =  6967 * 2^-13
     64 *    1.272758580 = 10426 * 2^-13
     65 *    3.624509785 = 29692 * 2^-13
     66 *
     67 * See jidctred.c for further details of the 2x2 IDCT algorithm.  Where
     68 * possible, the variable names and comments here in jsimd_idct_2x2_neon()
     69 * match up with those in jpeg_idct_2x2().
     70 */
     71 
/* Multipliers for the 2x2 IDCT odd part, ordered so that each constant can be
 * selected by lane index with vmlal_lane_s16():
 *   lane 0: -0.720959822 (applied to row/col 7)
 *   lane 1:  0.850430095 (applied to row/col 5)
 *   lane 2: -1.272758580 (applied to row/col 3)
 *   lane 3:  3.624509785 (applied to row/col 1)
 */
ALIGN(16) static const int16_t jsimd_idct_2x2_neon_consts[] = {
  -F_0_720, F_0_850, -F_1_272, F_3_624
};
     75 
/*
 * Dequantize one 8x8 block of DCT coefficients and apply the reduced-size
 * 2x2 inverse DCT, storing a 2x2 block of samples.
 *
 * dct_table   pointer to the component's 64-entry dequantization multiplier
 *             table (ISLOW_MULT_TYPE, i.e. 16-bit entries)
 * coef_block  the 64 quantized DCT coefficients for this block
 * output_buf  array of output row pointers
 * output_col  starting column within each output row
 */
void jsimd_idct_2x2_neon(void *dct_table, JCOEFPTR coef_block,
                         JSAMPARRAY output_buf, JDIMENSION output_col)
{
  ISLOW_MULT_TYPE *quantptr = dct_table;

  /* Load DCT coefficients.  The 2x2 IDCT only uses coefficient rows 0, 1, 3,
   * 5, and 7; rows 2, 4, and 6 are neither loaded nor dequantized.
   */
  int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE);
  int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE);
  int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE);
  int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE);
  int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE);

  /* Load quantization table values (only the rows actually used). */
  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
  int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
  int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
  int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
  int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);

  /* Dequantize DCT coefficients. */
  row0 = vmulq_s16(row0, quant_row0);
  row1 = vmulq_s16(row1, quant_row1);
  row3 = vmulq_s16(row3, quant_row3);
  row5 = vmulq_s16(row5, quant_row5);
  row7 = vmulq_s16(row7, quant_row7);

  /* Load IDCT conversion constants. */
  const int16x4_t consts = vld1_s16(jsimd_idct_2x2_neon_consts);

  /* Pass 1: process columns from input, put results in vectors row0 and
   * row1.
   */

  /* Even part: only the DC term contributes; scale it up to match the
   * CONST_BITS fixed-point precision of the odd part (plus 2 guard bits).
   */
  int32x4_t tmp10_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 2);
  int32x4_t tmp10_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 2);

  /* Odd part: tmp0 = 3.624*row1 - 1.272*row3 + 0.850*row5 - 0.720*row7,
   * selecting each constant by lane from the consts vector.
   */
  int32x4_t tmp0_l = vmull_lane_s16(vget_low_s16(row1), consts, 3);
  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row3), consts, 2);
  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row5), consts, 1);
  tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(row7), consts, 0);
  int32x4_t tmp0_h = vmull_lane_s16(vget_high_s16(row1), consts, 3);
  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row3), consts, 2);
  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row5), consts, 1);
  tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(row7), consts, 0);

  /* Final output stage: descale and narrow to 16-bit. */
  row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp0_l), CONST_BITS),
                      vrshrn_n_s32(vaddq_s32(tmp10_h, tmp0_h), CONST_BITS));
  row1 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp0_l), CONST_BITS),
                      vrshrn_n_s32(vsubq_s32(tmp10_h, tmp0_h), CONST_BITS));

  /* Transpose two rows, ready for second pass. */
  int16x8x2_t cols_0246_1357 = vtrnq_s16(row0, row1);
  int16x8_t cols_0246 = cols_0246_1357.val[0];
  int16x8_t cols_1357 = cols_0246_1357.val[1];
  /* Duplicate columns such that each is accessible in its own vector:
   * transposing cols_1357 against itself yields {c1 c1 c5 c5} and
   * {c3 c3 c7 c7} pairs (each column appears for both rows).
   */
  int32x4x2_t cols_1155_3377 = vtrnq_s32(vreinterpretq_s32_s16(cols_1357),
                                         vreinterpretq_s32_s16(cols_1357));
  int16x8_t cols_1155 = vreinterpretq_s16_s32(cols_1155_3377.val[0]);
  int16x8_t cols_3377 = vreinterpretq_s16_s32(cols_1155_3377.val[1]);

  /* Pass 2: process two rows, store to output array. */

  /* Even part: we're only interested in col0; the top half of tmp10 is "don't
   * care."
   */
  int32x4_t tmp10 = vshll_n_s16(vget_low_s16(cols_0246), CONST_BITS + 2);

  /* Odd part: we're only interested in the bottom half of tmp0. */
  int32x4_t tmp0 = vmull_lane_s16(vget_low_s16(cols_1155), consts, 3);
  tmp0 = vmlal_lane_s16(tmp0, vget_low_s16(cols_3377), consts, 2);
  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_1155), consts, 1);
  tmp0 = vmlal_lane_s16(tmp0, vget_high_s16(cols_3377), consts, 0);

  /* Final output stage: descale and clamp to range [0-255].  vaddhn/vsubhn
   * narrow by taking the high halves, i.e. an implicit >>16; the remaining
   * descale (note the "- 16" below) is applied by vrsraq_n_s16, which also
   * folds in the +CENTERJSAMPLE level shift.
   */
  int16x8_t output_s16 = vcombine_s16(vaddhn_s32(tmp10, tmp0),
                                      vsubhn_s32(tmp10, tmp0));
  output_s16 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_s16,
                            CONST_BITS + PASS1_BITS + 3 + 2 - 16);
  /* Narrow to 8-bit and convert to unsigned. */
  uint8x8_t output_u8 = vqmovun_s16(output_s16);

  /* Store 2x2 block to memory.  After the combine above, lane 0 holds
   * (row 0, col 0), lane 1 (row 1, col 0), lane 4 (row 0, col 1), and
   * lane 5 (row 1, col 1); the remaining lanes are "don't care."
   */
  vst1_lane_u8(output_buf[0] + output_col, output_u8, 0);
  vst1_lane_u8(output_buf[1] + output_col, output_u8, 1);
  vst1_lane_u8(output_buf[0] + output_col + 1, output_u8, 4);
  vst1_lane_u8(output_buf[1] + output_col + 1, output_u8, 5);
}
    166 
    167 
    168 /* jsimd_idct_4x4_neon() is an inverse DCT function that produces reduced-size
    169 * 4x4 output from an 8x8 DCT block.  It uses the same calculations and
    170 * produces exactly the same output as IJG's original jpeg_idct_4x4() function
    171 * from jpeg-6b, which can be found in jidctred.c.
    172 *
    173 * Scaled integer constants are used to avoid floating-point arithmetic:
    174 *    0.211164243 =  1730 * 2^-13
    175 *    0.509795579 =  4176 * 2^-13
    176 *    0.601344887 =  4926 * 2^-13
    177 *    0.765366865 =  6270 * 2^-13
    178 *    0.899976223 =  7373 * 2^-13
    179 *    1.061594337 =  8697 * 2^-13
    180 *    1.451774981 = 11893 * 2^-13
    181 *    1.847759065 = 15137 * 2^-13
    182 *    2.172734803 = 17799 * 2^-13
    183 *    2.562915447 = 20995 * 2^-13
    184 *
    185 * See jidctred.c for further details of the 4x4 IDCT algorithm.  Where
    186 * possible, the variable names and comments here in jsimd_idct_4x4_neon()
    187 * match up with those in jpeg_idct_4x4().
    188 */
    189 
/* Multipliers for the 4x4 IDCT, grouped into three 4-lane vectors so that
 * each constant can be selected by lane index with vmlal_lane_s16().  The
 * last two lanes are zero padding.
 */
ALIGN(16) static const int16_t jsimd_idct_4x4_neon_consts[] = {
  F_1_847, -F_0_765, -F_0_211,  F_1_451,
 -F_2_172,  F_1_061, -F_0_509, -F_0_601,
  F_0_899,  F_2_562,        0,        0
};
    195 
/*
 * Dequantize one 8x8 block of DCT coefficients and apply the reduced-size
 * 4x4 inverse DCT, storing a 4x4 block of samples.
 *
 * dct_table   pointer to the component's 64-entry dequantization multiplier
 *             table (ISLOW_MULT_TYPE, i.e. 16-bit entries)
 * coef_block  the 64 quantized DCT coefficients for this block
 * output_buf  array of output row pointers
 * output_col  starting column within each output row
 */
void jsimd_idct_4x4_neon(void *dct_table, JCOEFPTR coef_block,
                         JSAMPARRAY output_buf, JDIMENSION output_col)
{
  ISLOW_MULT_TYPE *quantptr = dct_table;

  /* Load DCT coefficients.  The 4x4 IDCT does not use coefficient row 4, so
   * it is neither loaded nor dequantized.
   */
  int16x8_t row0  = vld1q_s16(coef_block + 0 * DCTSIZE);
  int16x8_t row1  = vld1q_s16(coef_block + 1 * DCTSIZE);
  int16x8_t row2  = vld1q_s16(coef_block + 2 * DCTSIZE);
  int16x8_t row3  = vld1q_s16(coef_block + 3 * DCTSIZE);
  int16x8_t row5  = vld1q_s16(coef_block + 5 * DCTSIZE);
  int16x8_t row6  = vld1q_s16(coef_block + 6 * DCTSIZE);
  int16x8_t row7  = vld1q_s16(coef_block + 7 * DCTSIZE);

  /* Load quantization table values for DC coefficients. */
  int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE);
  /* Dequantize DC coefficients. */
  row0 = vmulq_s16(row0, quant_row0);

  /* Construct bitmap (by ORing all AC rows together) to test whether all AC
   * coefficients are 0.  A zero half means every AC coefficient in the
   * corresponding four columns is zero, enabling a DC-only shortcut below.
   */
  int16x8_t bitmap = vorrq_s16(row1, row2);
  bitmap = vorrq_s16(bitmap, row3);
  bitmap = vorrq_s16(bitmap, row5);
  bitmap = vorrq_s16(bitmap, row6);
  bitmap = vorrq_s16(bitmap, row7);

  int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0);
  int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1);

  /* Load constants for IDCT computation. */
#ifdef HAVE_VLD1_S16_X3
  const int16x4x3_t consts = vld1_s16_x3(jsimd_idct_4x4_neon_consts);
#else
  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
  const int16x4_t consts1 = vld1_s16(jsimd_idct_4x4_neon_consts);
  const int16x4_t consts2 = vld1_s16(jsimd_idct_4x4_neon_consts + 4);
  const int16x4_t consts3 = vld1_s16(jsimd_idct_4x4_neon_consts + 8);
  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif

  if (left_ac_bitmap == 0 && right_ac_bitmap == 0) {
    /* All AC coefficients are zero.
     * Compute DC values and duplicate into row vectors 0, 1, 2, and 3.
     */
    int16x8_t dcval = vshlq_n_s16(row0, PASS1_BITS);
    row0 = dcval;
    row1 = dcval;
    row2 = dcval;
    row3 = dcval;
  } else if (left_ac_bitmap == 0) {
    /* AC coefficients are zero for columns 0, 1, 2, and 3.
     * Compute DC values for these columns.
     */
    int16x4_t dcval = vshl_n_s16(vget_low_s16(row0), PASS1_BITS);

    /* Commence regular IDCT computation for columns 4, 5, 6, and 7. */

    /* Load quantization table (right halves of the rows only). */
    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4);
    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4);
    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4);
    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4);
    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4);
    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4);

    /* Even part */
    int32x4_t tmp0 = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);

    int16x4_t z2 = vmul_s16(vget_high_s16(row2), quant_row2);
    int16x4_t z3 = vmul_s16(vget_high_s16(row6), quant_row6);

    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);

    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);

    /* Odd part */
    int16x4_t z1 = vmul_s16(vget_high_s16(row7), quant_row7);
    z2 = vmul_s16(vget_high_s16(row5), quant_row5);
    z3 = vmul_s16(vget_high_s16(row3), quant_row3);
    int16x4_t z4 = vmul_s16(vget_high_s16(row1), quant_row1);

    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);

    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);

    /* Final output stage: descale and narrow to 16-bit; dcval fills the
     * (all-zero-AC) left half of each row.
     */
    row0 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
                                            CONST_BITS - PASS1_BITS + 1));
    row3 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
                                            CONST_BITS - PASS1_BITS + 1));
    row1 = vcombine_s16(dcval, vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
                                            CONST_BITS - PASS1_BITS + 1));
    row2 = vcombine_s16(dcval, vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
                                            CONST_BITS - PASS1_BITS + 1));
  } else if (right_ac_bitmap == 0) {
    /* AC coefficients are zero for columns 4, 5, 6, and 7.
     * Compute DC values for these columns.
     */
    int16x4_t dcval = vshl_n_s16(vget_high_s16(row0), PASS1_BITS);

    /* Commence regular IDCT computation for columns 0, 1, 2, and 3. */

    /* Load quantization table (left halves of the rows only). */
    int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE);
    int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE);
    int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE);
    int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE);
    int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE);
    int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE);

    /* Even part */
    int32x4_t tmp0 = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);

    int16x4_t z2 = vmul_s16(vget_low_s16(row2), quant_row2);
    int16x4_t z3 = vmul_s16(vget_low_s16(row6), quant_row6);

    int32x4_t tmp2 = vmull_lane_s16(z2, consts.val[0], 0);
    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[0], 1);

    int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
    int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);

    /* Odd part */
    int16x4_t z1 = vmul_s16(vget_low_s16(row7), quant_row7);
    z2 = vmul_s16(vget_low_s16(row5), quant_row5);
    z3 = vmul_s16(vget_low_s16(row3), quant_row3);
    int16x4_t z4 = vmul_s16(vget_low_s16(row1), quant_row1);

    tmp0 = vmull_lane_s16(z1, consts.val[0], 2);
    tmp0 = vmlal_lane_s16(tmp0, z2, consts.val[0], 3);
    tmp0 = vmlal_lane_s16(tmp0, z3, consts.val[1], 0);
    tmp0 = vmlal_lane_s16(tmp0, z4, consts.val[1], 1);

    tmp2 = vmull_lane_s16(z1, consts.val[1], 2);
    tmp2 = vmlal_lane_s16(tmp2, z2, consts.val[1], 3);
    tmp2 = vmlal_lane_s16(tmp2, z3, consts.val[2], 0);
    tmp2 = vmlal_lane_s16(tmp2, z4, consts.val[2], 1);

    /* Final output stage: descale and narrow to 16-bit; dcval fills the
     * (all-zero-AC) right half of each row.
     */
    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10, tmp2),
                                     CONST_BITS - PASS1_BITS + 1), dcval);
    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10, tmp2),
                                     CONST_BITS - PASS1_BITS + 1), dcval);
    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12, tmp0),
                                     CONST_BITS - PASS1_BITS + 1), dcval);
    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12, tmp0),
                                     CONST_BITS - PASS1_BITS + 1), dcval);
  } else {
    /* All AC coefficients are non-zero; full IDCT calculation required. */
    int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE);
    int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE);
    int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE);
    int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE);
    int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE);
    int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE);

    /* Even part */
    int32x4_t tmp0_l = vshll_n_s16(vget_low_s16(row0), CONST_BITS + 1);
    int32x4_t tmp0_h = vshll_n_s16(vget_high_s16(row0), CONST_BITS + 1);

    int16x8_t z2 = vmulq_s16(row2, quant_row2);
    int16x8_t z3 = vmulq_s16(row6, quant_row6);

    int32x4_t tmp2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[0], 0);
    int32x4_t tmp2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[0], 0);
    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[0], 1);
    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[0], 1);

    int32x4_t tmp10_l = vaddq_s32(tmp0_l, tmp2_l);
    int32x4_t tmp10_h = vaddq_s32(tmp0_h, tmp2_h);
    int32x4_t tmp12_l = vsubq_s32(tmp0_l, tmp2_l);
    int32x4_t tmp12_h = vsubq_s32(tmp0_h, tmp2_h);

    /* Odd part */
    int16x8_t z1 = vmulq_s16(row7, quant_row7);
    z2 = vmulq_s16(row5, quant_row5);
    z3 = vmulq_s16(row3, quant_row3);
    int16x8_t z4 = vmulq_s16(row1, quant_row1);

    tmp0_l = vmull_lane_s16(vget_low_s16(z1), consts.val[0], 2);
    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z2), consts.val[0], 3);
    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z3), consts.val[1], 0);
    tmp0_l = vmlal_lane_s16(tmp0_l, vget_low_s16(z4), consts.val[1], 1);
    tmp0_h = vmull_lane_s16(vget_high_s16(z1), consts.val[0], 2);
    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z2), consts.val[0], 3);
    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z3), consts.val[1], 0);
    tmp0_h = vmlal_lane_s16(tmp0_h, vget_high_s16(z4), consts.val[1], 1);

    tmp2_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 2);
    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z2), consts.val[1], 3);
    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z3), consts.val[2], 0);
    tmp2_l = vmlal_lane_s16(tmp2_l, vget_low_s16(z4), consts.val[2], 1);
    tmp2_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 2);
    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z2), consts.val[1], 3);
    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z3), consts.val[2], 0);
    tmp2_h = vmlal_lane_s16(tmp2_h, vget_high_s16(z4), consts.val[2], 1);

    /* Final output stage: descale and narrow to 16-bit. */
    row0 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp10_l, tmp2_l),
                                     CONST_BITS - PASS1_BITS + 1),
                        vrshrn_n_s32(vaddq_s32(tmp10_h, tmp2_h),
                                     CONST_BITS - PASS1_BITS + 1));
    row3 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp10_l, tmp2_l),
                                     CONST_BITS - PASS1_BITS + 1),
                        vrshrn_n_s32(vsubq_s32(tmp10_h, tmp2_h),
                                     CONST_BITS - PASS1_BITS + 1));
    row1 = vcombine_s16(vrshrn_n_s32(vaddq_s32(tmp12_l, tmp0_l),
                                     CONST_BITS - PASS1_BITS + 1),
                        vrshrn_n_s32(vaddq_s32(tmp12_h, tmp0_h),
                                     CONST_BITS - PASS1_BITS + 1));
    row2 = vcombine_s16(vrshrn_n_s32(vsubq_s32(tmp12_l, tmp0_l),
                                     CONST_BITS - PASS1_BITS + 1),
                        vrshrn_n_s32(vsubq_s32(tmp12_h, tmp0_h),
                                     CONST_BITS - PASS1_BITS + 1));
  }

  /* Transpose 8x4 block to perform IDCT on rows in second pass. */
  int16x8x2_t row_01 = vtrnq_s16(row0, row1);
  int16x8x2_t row_23 = vtrnq_s16(row2, row3);

  int32x4x2_t cols_0426 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[0]),
                                    vreinterpretq_s32_s16(row_23.val[0]));
  int32x4x2_t cols_1537 = vtrnq_s32(vreinterpretq_s32_s16(row_01.val[1]),
                                    vreinterpretq_s32_s16(row_23.val[1]));

  /* Column 4 is not extracted: the 4x4 row IDCT below does not use it. */
  int16x4_t col0 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[0]));
  int16x4_t col1 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[0]));
  int16x4_t col2 = vreinterpret_s16_s32(vget_low_s32(cols_0426.val[1]));
  int16x4_t col3 = vreinterpret_s16_s32(vget_low_s32(cols_1537.val[1]));
  int16x4_t col5 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[0]));
  int16x4_t col6 = vreinterpret_s16_s32(vget_high_s32(cols_0426.val[1]));
  int16x4_t col7 = vreinterpret_s16_s32(vget_high_s32(cols_1537.val[1]));

  /* Commence second pass of IDCT. */

  /* Even part */
  int32x4_t tmp0 = vshll_n_s16(col0, CONST_BITS + 1);
  int32x4_t tmp2 = vmull_lane_s16(col2, consts.val[0], 0);
  tmp2 = vmlal_lane_s16(tmp2, col6, consts.val[0], 1);

  int32x4_t tmp10 = vaddq_s32(tmp0, tmp2);
  int32x4_t tmp12 = vsubq_s32(tmp0, tmp2);

  /* Odd part */
  tmp0 = vmull_lane_s16(col7, consts.val[0], 2);
  tmp0 = vmlal_lane_s16(tmp0, col5, consts.val[0], 3);
  tmp0 = vmlal_lane_s16(tmp0, col3, consts.val[1], 0);
  tmp0 = vmlal_lane_s16(tmp0, col1, consts.val[1], 1);

  tmp2 = vmull_lane_s16(col7, consts.val[1], 2);
  tmp2 = vmlal_lane_s16(tmp2, col5, consts.val[1], 3);
  tmp2 = vmlal_lane_s16(tmp2, col3, consts.val[2], 0);
  tmp2 = vmlal_lane_s16(tmp2, col1, consts.val[2], 1);

  /* Final output stage: descale and clamp to range [0-255].  vaddhn/vsubhn
   * narrow by taking the high halves, i.e. an implicit >>16; the remaining
   * descale (note the "- 16" below) is applied by vrsraq_n_s16, which also
   * folds in the +CENTERJSAMPLE level shift.
   */
  int16x8_t output_cols_02 = vcombine_s16(vaddhn_s32(tmp10, tmp2),
                                          vsubhn_s32(tmp12, tmp0));
  int16x8_t output_cols_13 = vcombine_s16(vaddhn_s32(tmp12, tmp0),
                                          vsubhn_s32(tmp10, tmp2));
  output_cols_02 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_02,
                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
  output_cols_13 = vrsraq_n_s16(vdupq_n_s16(CENTERJSAMPLE), output_cols_13,
                                CONST_BITS + PASS1_BITS + 3 + 1 - 16);
  /* Narrow to 8-bit and convert to unsigned while zipping 8-bit elements.
   * An interleaving store completes the transpose.
   */
  uint8x8x2_t output_0123 = vzip_u8(vqmovun_s16(output_cols_02),
                                    vqmovun_s16(output_cols_13));
  uint16x4x2_t output_01_23 = { {
    vreinterpret_u16_u8(output_0123.val[0]),
    vreinterpret_u16_u8(output_0123.val[1])
  } };

  /* Store 4x4 block to memory: each vst2_lane_u16 writes one output row
   * (four pixels as two interleaved 16-bit pairs).
   */
  JSAMPROW outptr0 = output_buf[0] + output_col;
  JSAMPROW outptr1 = output_buf[1] + output_col;
  JSAMPROW outptr2 = output_buf[2] + output_col;
  JSAMPROW outptr3 = output_buf[3] + output_col;
  vst2_lane_u16((uint16_t *)outptr0, output_01_23, 0);
  vst2_lane_u16((uint16_t *)outptr1, output_01_23, 1);
  vst2_lane_u16((uint16_t *)outptr2, output_01_23, 2);
  vst2_lane_u16((uint16_t *)outptr3, output_01_23, 3);
}