/* jidctfst-neon.c (19854B) */
1 /* 2 * jidctfst-neon.c - fast integer IDCT (Arm Neon) 3 * 4 * Copyright (C) 2020, Arm Limited. All Rights Reserved. 5 * 6 * This software is provided 'as-is', without any express or implied 7 * warranty. In no event will the authors be held liable for any damages 8 * arising from the use of this software. 9 * 10 * Permission is granted to anyone to use this software for any purpose, 11 * including commercial applications, and to alter it and redistribute it 12 * freely, subject to the following restrictions: 13 * 14 * 1. The origin of this software must not be misrepresented; you must not 15 * claim that you wrote the original software. If you use this software 16 * in a product, an acknowledgment in the product documentation would be 17 * appreciated but is not required. 18 * 2. Altered source versions must be plainly marked as such, and must not be 19 * misrepresented as being the original software. 20 * 3. This notice may not be removed or altered from any source distribution. 21 */ 22 23 #define JPEG_INTERNALS 24 #include "../../jinclude.h" 25 #include "../../jpeglib.h" 26 #include "../../jsimd.h" 27 #include "../../jdct.h" 28 #include "../../jsimddct.h" 29 #include "../jsimd.h" 30 #include "align.h" 31 32 #include <arm_neon.h> 33 34 35 /* jsimd_idct_ifast_neon() performs dequantization and a fast, not so accurate 36 * inverse DCT (Discrete Cosine Transform) on one block of coefficients. It 37 * uses the same calculations and produces exactly the same output as IJG's 38 * original jpeg_idct_ifast() function, which can be found in jidctfst.c. 39 * 40 * Scaled integer constants are used to avoid floating-point arithmetic: 41 * 0.082392200 = 2688 * 2^-15 42 * 0.414213562 = 13568 * 2^-15 43 * 0.847759065 = 27776 * 2^-15 44 * 0.613125930 = 20096 * 2^-15 45 * 46 * See jidctfst.c for further details of the IDCT algorithm. Where possible, 47 * the variable names and comments here in jsimd_idct_ifast_neon() match up 48 * with those in jpeg_idct_ifast(). 
49 */ 50 51 #define PASS1_BITS 2 52 53 #define F_0_082 2688 54 #define F_0_414 13568 55 #define F_0_847 27776 56 #define F_0_613 20096 57 58 59 ALIGN(16) static const int16_t jsimd_idct_ifast_neon_consts[] = { 60 F_0_082, F_0_414, F_0_847, F_0_613 61 }; 62 63 void jsimd_idct_ifast_neon(void *dct_table, JCOEFPTR coef_block, 64 JSAMPARRAY output_buf, JDIMENSION output_col) 65 { 66 IFAST_MULT_TYPE *quantptr = dct_table; 67 68 /* Load DCT coefficients. */ 69 int16x8_t row0 = vld1q_s16(coef_block + 0 * DCTSIZE); 70 int16x8_t row1 = vld1q_s16(coef_block + 1 * DCTSIZE); 71 int16x8_t row2 = vld1q_s16(coef_block + 2 * DCTSIZE); 72 int16x8_t row3 = vld1q_s16(coef_block + 3 * DCTSIZE); 73 int16x8_t row4 = vld1q_s16(coef_block + 4 * DCTSIZE); 74 int16x8_t row5 = vld1q_s16(coef_block + 5 * DCTSIZE); 75 int16x8_t row6 = vld1q_s16(coef_block + 6 * DCTSIZE); 76 int16x8_t row7 = vld1q_s16(coef_block + 7 * DCTSIZE); 77 78 /* Load quantization table values for DC coefficients. */ 79 int16x8_t quant_row0 = vld1q_s16(quantptr + 0 * DCTSIZE); 80 /* Dequantize DC coefficients. */ 81 row0 = vmulq_s16(row0, quant_row0); 82 83 /* Construct bitmap to test if all AC coefficients are 0. */ 84 int16x8_t bitmap = vorrq_s16(row1, row2); 85 bitmap = vorrq_s16(bitmap, row3); 86 bitmap = vorrq_s16(bitmap, row4); 87 bitmap = vorrq_s16(bitmap, row5); 88 bitmap = vorrq_s16(bitmap, row6); 89 bitmap = vorrq_s16(bitmap, row7); 90 91 int64_t left_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 0); 92 int64_t right_ac_bitmap = vgetq_lane_s64(vreinterpretq_s64_s16(bitmap), 1); 93 94 /* Load IDCT conversion constants. */ 95 const int16x4_t consts = vld1_s16(jsimd_idct_ifast_neon_consts); 96 97 if (left_ac_bitmap == 0 && right_ac_bitmap == 0) { 98 /* All AC coefficients are zero. 99 * Compute DC values and duplicate into vectors. 
100 */ 101 int16x8_t dcval = row0; 102 row1 = dcval; 103 row2 = dcval; 104 row3 = dcval; 105 row4 = dcval; 106 row5 = dcval; 107 row6 = dcval; 108 row7 = dcval; 109 } else if (left_ac_bitmap == 0) { 110 /* AC coefficients are zero for columns 0, 1, 2, and 3. 111 * Use DC values for these columns. 112 */ 113 int16x4_t dcval = vget_low_s16(row0); 114 115 /* Commence regular fast IDCT computation for columns 4, 5, 6, and 7. */ 116 117 /* Load quantization table. */ 118 int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE + 4); 119 int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE + 4); 120 int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE + 4); 121 int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE + 4); 122 int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE + 4); 123 int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE + 4); 124 int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE + 4); 125 126 /* Even part: dequantize DCT coefficients. */ 127 int16x4_t tmp0 = vget_high_s16(row0); 128 int16x4_t tmp1 = vmul_s16(vget_high_s16(row2), quant_row2); 129 int16x4_t tmp2 = vmul_s16(vget_high_s16(row4), quant_row4); 130 int16x4_t tmp3 = vmul_s16(vget_high_s16(row6), quant_row6); 131 132 int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */ 133 int16x4_t tmp11 = vsub_s16(tmp0, tmp2); 134 135 int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */ 136 int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3); 137 int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1); 138 tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3); 139 tmp12 = vsub_s16(tmp12, tmp13); 140 141 tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */ 142 tmp3 = vsub_s16(tmp10, tmp13); 143 tmp1 = vadd_s16(tmp11, tmp12); 144 tmp2 = vsub_s16(tmp11, tmp12); 145 146 /* Odd part: dequantize DCT coefficients. 
*/ 147 int16x4_t tmp4 = vmul_s16(vget_high_s16(row1), quant_row1); 148 int16x4_t tmp5 = vmul_s16(vget_high_s16(row3), quant_row3); 149 int16x4_t tmp6 = vmul_s16(vget_high_s16(row5), quant_row5); 150 int16x4_t tmp7 = vmul_s16(vget_high_s16(row7), quant_row7); 151 152 int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */ 153 int16x4_t neg_z10 = vsub_s16(tmp5, tmp6); 154 int16x4_t z11 = vadd_s16(tmp4, tmp7); 155 int16x4_t z12 = vsub_s16(tmp4, tmp7); 156 157 tmp7 = vadd_s16(z11, z13); /* phase 5 */ 158 int16x4_t z11_sub_z13 = vsub_s16(z11, z13); 159 tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1); 160 tmp11 = vadd_s16(tmp11, z11_sub_z13); 161 162 int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10); 163 int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2); 164 z5 = vadd_s16(z5, z10_add_z12); 165 tmp10 = vqdmulh_lane_s16(z12, consts, 0); 166 tmp10 = vadd_s16(tmp10, z12); 167 tmp10 = vsub_s16(tmp10, z5); 168 tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3); 169 tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10)); 170 tmp12 = vadd_s16(tmp12, z5); 171 172 tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */ 173 tmp5 = vsub_s16(tmp11, tmp6); 174 tmp4 = vadd_s16(tmp10, tmp5); 175 176 row0 = vcombine_s16(dcval, vadd_s16(tmp0, tmp7)); 177 row7 = vcombine_s16(dcval, vsub_s16(tmp0, tmp7)); 178 row1 = vcombine_s16(dcval, vadd_s16(tmp1, tmp6)); 179 row6 = vcombine_s16(dcval, vsub_s16(tmp1, tmp6)); 180 row2 = vcombine_s16(dcval, vadd_s16(tmp2, tmp5)); 181 row5 = vcombine_s16(dcval, vsub_s16(tmp2, tmp5)); 182 row4 = vcombine_s16(dcval, vadd_s16(tmp3, tmp4)); 183 row3 = vcombine_s16(dcval, vsub_s16(tmp3, tmp4)); 184 } else if (right_ac_bitmap == 0) { 185 /* AC coefficients are zero for columns 4, 5, 6, and 7. 186 * Use DC values for these columns. 187 */ 188 int16x4_t dcval = vget_high_s16(row0); 189 190 /* Commence regular fast IDCT computation for columns 0, 1, 2, and 3. */ 191 192 /* Load quantization table. 
*/ 193 int16x4_t quant_row1 = vld1_s16(quantptr + 1 * DCTSIZE); 194 int16x4_t quant_row2 = vld1_s16(quantptr + 2 * DCTSIZE); 195 int16x4_t quant_row3 = vld1_s16(quantptr + 3 * DCTSIZE); 196 int16x4_t quant_row4 = vld1_s16(quantptr + 4 * DCTSIZE); 197 int16x4_t quant_row5 = vld1_s16(quantptr + 5 * DCTSIZE); 198 int16x4_t quant_row6 = vld1_s16(quantptr + 6 * DCTSIZE); 199 int16x4_t quant_row7 = vld1_s16(quantptr + 7 * DCTSIZE); 200 201 /* Even part: dequantize DCT coefficients. */ 202 int16x4_t tmp0 = vget_low_s16(row0); 203 int16x4_t tmp1 = vmul_s16(vget_low_s16(row2), quant_row2); 204 int16x4_t tmp2 = vmul_s16(vget_low_s16(row4), quant_row4); 205 int16x4_t tmp3 = vmul_s16(vget_low_s16(row6), quant_row6); 206 207 int16x4_t tmp10 = vadd_s16(tmp0, tmp2); /* phase 3 */ 208 int16x4_t tmp11 = vsub_s16(tmp0, tmp2); 209 210 int16x4_t tmp13 = vadd_s16(tmp1, tmp3); /* phases 5-3 */ 211 int16x4_t tmp1_sub_tmp3 = vsub_s16(tmp1, tmp3); 212 int16x4_t tmp12 = vqdmulh_lane_s16(tmp1_sub_tmp3, consts, 1); 213 tmp12 = vadd_s16(tmp12, tmp1_sub_tmp3); 214 tmp12 = vsub_s16(tmp12, tmp13); 215 216 tmp0 = vadd_s16(tmp10, tmp13); /* phase 2 */ 217 tmp3 = vsub_s16(tmp10, tmp13); 218 tmp1 = vadd_s16(tmp11, tmp12); 219 tmp2 = vsub_s16(tmp11, tmp12); 220 221 /* Odd part: dequantize DCT coefficients. 
*/ 222 int16x4_t tmp4 = vmul_s16(vget_low_s16(row1), quant_row1); 223 int16x4_t tmp5 = vmul_s16(vget_low_s16(row3), quant_row3); 224 int16x4_t tmp6 = vmul_s16(vget_low_s16(row5), quant_row5); 225 int16x4_t tmp7 = vmul_s16(vget_low_s16(row7), quant_row7); 226 227 int16x4_t z13 = vadd_s16(tmp6, tmp5); /* phase 6 */ 228 int16x4_t neg_z10 = vsub_s16(tmp5, tmp6); 229 int16x4_t z11 = vadd_s16(tmp4, tmp7); 230 int16x4_t z12 = vsub_s16(tmp4, tmp7); 231 232 tmp7 = vadd_s16(z11, z13); /* phase 5 */ 233 int16x4_t z11_sub_z13 = vsub_s16(z11, z13); 234 tmp11 = vqdmulh_lane_s16(z11_sub_z13, consts, 1); 235 tmp11 = vadd_s16(tmp11, z11_sub_z13); 236 237 int16x4_t z10_add_z12 = vsub_s16(z12, neg_z10); 238 int16x4_t z5 = vqdmulh_lane_s16(z10_add_z12, consts, 2); 239 z5 = vadd_s16(z5, z10_add_z12); 240 tmp10 = vqdmulh_lane_s16(z12, consts, 0); 241 tmp10 = vadd_s16(tmp10, z12); 242 tmp10 = vsub_s16(tmp10, z5); 243 tmp12 = vqdmulh_lane_s16(neg_z10, consts, 3); 244 tmp12 = vadd_s16(tmp12, vadd_s16(neg_z10, neg_z10)); 245 tmp12 = vadd_s16(tmp12, z5); 246 247 tmp6 = vsub_s16(tmp12, tmp7); /* phase 2 */ 248 tmp5 = vsub_s16(tmp11, tmp6); 249 tmp4 = vadd_s16(tmp10, tmp5); 250 251 row0 = vcombine_s16(vadd_s16(tmp0, tmp7), dcval); 252 row7 = vcombine_s16(vsub_s16(tmp0, tmp7), dcval); 253 row1 = vcombine_s16(vadd_s16(tmp1, tmp6), dcval); 254 row6 = vcombine_s16(vsub_s16(tmp1, tmp6), dcval); 255 row2 = vcombine_s16(vadd_s16(tmp2, tmp5), dcval); 256 row5 = vcombine_s16(vsub_s16(tmp2, tmp5), dcval); 257 row4 = vcombine_s16(vadd_s16(tmp3, tmp4), dcval); 258 row3 = vcombine_s16(vsub_s16(tmp3, tmp4), dcval); 259 } else { 260 /* Some AC coefficients are non-zero; full IDCT calculation required. */ 261 262 /* Load quantization table. 
*/ 263 int16x8_t quant_row1 = vld1q_s16(quantptr + 1 * DCTSIZE); 264 int16x8_t quant_row2 = vld1q_s16(quantptr + 2 * DCTSIZE); 265 int16x8_t quant_row3 = vld1q_s16(quantptr + 3 * DCTSIZE); 266 int16x8_t quant_row4 = vld1q_s16(quantptr + 4 * DCTSIZE); 267 int16x8_t quant_row5 = vld1q_s16(quantptr + 5 * DCTSIZE); 268 int16x8_t quant_row6 = vld1q_s16(quantptr + 6 * DCTSIZE); 269 int16x8_t quant_row7 = vld1q_s16(quantptr + 7 * DCTSIZE); 270 271 /* Even part: dequantize DCT coefficients. */ 272 int16x8_t tmp0 = row0; 273 int16x8_t tmp1 = vmulq_s16(row2, quant_row2); 274 int16x8_t tmp2 = vmulq_s16(row4, quant_row4); 275 int16x8_t tmp3 = vmulq_s16(row6, quant_row6); 276 277 int16x8_t tmp10 = vaddq_s16(tmp0, tmp2); /* phase 3 */ 278 int16x8_t tmp11 = vsubq_s16(tmp0, tmp2); 279 280 int16x8_t tmp13 = vaddq_s16(tmp1, tmp3); /* phases 5-3 */ 281 int16x8_t tmp1_sub_tmp3 = vsubq_s16(tmp1, tmp3); 282 int16x8_t tmp12 = vqdmulhq_lane_s16(tmp1_sub_tmp3, consts, 1); 283 tmp12 = vaddq_s16(tmp12, tmp1_sub_tmp3); 284 tmp12 = vsubq_s16(tmp12, tmp13); 285 286 tmp0 = vaddq_s16(tmp10, tmp13); /* phase 2 */ 287 tmp3 = vsubq_s16(tmp10, tmp13); 288 tmp1 = vaddq_s16(tmp11, tmp12); 289 tmp2 = vsubq_s16(tmp11, tmp12); 290 291 /* Odd part: dequantize DCT coefficients. 
*/ 292 int16x8_t tmp4 = vmulq_s16(row1, quant_row1); 293 int16x8_t tmp5 = vmulq_s16(row3, quant_row3); 294 int16x8_t tmp6 = vmulq_s16(row5, quant_row5); 295 int16x8_t tmp7 = vmulq_s16(row7, quant_row7); 296 297 int16x8_t z13 = vaddq_s16(tmp6, tmp5); /* phase 6 */ 298 int16x8_t neg_z10 = vsubq_s16(tmp5, tmp6); 299 int16x8_t z11 = vaddq_s16(tmp4, tmp7); 300 int16x8_t z12 = vsubq_s16(tmp4, tmp7); 301 302 tmp7 = vaddq_s16(z11, z13); /* phase 5 */ 303 int16x8_t z11_sub_z13 = vsubq_s16(z11, z13); 304 tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1); 305 tmp11 = vaddq_s16(tmp11, z11_sub_z13); 306 307 int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10); 308 int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2); 309 z5 = vaddq_s16(z5, z10_add_z12); 310 tmp10 = vqdmulhq_lane_s16(z12, consts, 0); 311 tmp10 = vaddq_s16(tmp10, z12); 312 tmp10 = vsubq_s16(tmp10, z5); 313 tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3); 314 tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10)); 315 tmp12 = vaddq_s16(tmp12, z5); 316 317 tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */ 318 tmp5 = vsubq_s16(tmp11, tmp6); 319 tmp4 = vaddq_s16(tmp10, tmp5); 320 321 row0 = vaddq_s16(tmp0, tmp7); 322 row7 = vsubq_s16(tmp0, tmp7); 323 row1 = vaddq_s16(tmp1, tmp6); 324 row6 = vsubq_s16(tmp1, tmp6); 325 row2 = vaddq_s16(tmp2, tmp5); 326 row5 = vsubq_s16(tmp2, tmp5); 327 row4 = vaddq_s16(tmp3, tmp4); 328 row3 = vsubq_s16(tmp3, tmp4); 329 } 330 331 /* Transpose rows to work on columns in pass 2. 
*/ 332 int16x8x2_t rows_01 = vtrnq_s16(row0, row1); 333 int16x8x2_t rows_23 = vtrnq_s16(row2, row3); 334 int16x8x2_t rows_45 = vtrnq_s16(row4, row5); 335 int16x8x2_t rows_67 = vtrnq_s16(row6, row7); 336 337 int32x4x2_t rows_0145_l = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[0]), 338 vreinterpretq_s32_s16(rows_45.val[0])); 339 int32x4x2_t rows_0145_h = vtrnq_s32(vreinterpretq_s32_s16(rows_01.val[1]), 340 vreinterpretq_s32_s16(rows_45.val[1])); 341 int32x4x2_t rows_2367_l = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[0]), 342 vreinterpretq_s32_s16(rows_67.val[0])); 343 int32x4x2_t rows_2367_h = vtrnq_s32(vreinterpretq_s32_s16(rows_23.val[1]), 344 vreinterpretq_s32_s16(rows_67.val[1])); 345 346 int32x4x2_t cols_04 = vzipq_s32(rows_0145_l.val[0], rows_2367_l.val[0]); 347 int32x4x2_t cols_15 = vzipq_s32(rows_0145_h.val[0], rows_2367_h.val[0]); 348 int32x4x2_t cols_26 = vzipq_s32(rows_0145_l.val[1], rows_2367_l.val[1]); 349 int32x4x2_t cols_37 = vzipq_s32(rows_0145_h.val[1], rows_2367_h.val[1]); 350 351 int16x8_t col0 = vreinterpretq_s16_s32(cols_04.val[0]); 352 int16x8_t col1 = vreinterpretq_s16_s32(cols_15.val[0]); 353 int16x8_t col2 = vreinterpretq_s16_s32(cols_26.val[0]); 354 int16x8_t col3 = vreinterpretq_s16_s32(cols_37.val[0]); 355 int16x8_t col4 = vreinterpretq_s16_s32(cols_04.val[1]); 356 int16x8_t col5 = vreinterpretq_s16_s32(cols_15.val[1]); 357 int16x8_t col6 = vreinterpretq_s16_s32(cols_26.val[1]); 358 int16x8_t col7 = vreinterpretq_s16_s32(cols_37.val[1]); 359 360 /* 1-D IDCT, pass 2 */ 361 362 /* Even part */ 363 int16x8_t tmp10 = vaddq_s16(col0, col4); 364 int16x8_t tmp11 = vsubq_s16(col0, col4); 365 366 int16x8_t tmp13 = vaddq_s16(col2, col6); 367 int16x8_t col2_sub_col6 = vsubq_s16(col2, col6); 368 int16x8_t tmp12 = vqdmulhq_lane_s16(col2_sub_col6, consts, 1); 369 tmp12 = vaddq_s16(tmp12, col2_sub_col6); 370 tmp12 = vsubq_s16(tmp12, tmp13); 371 372 int16x8_t tmp0 = vaddq_s16(tmp10, tmp13); 373 int16x8_t tmp3 = vsubq_s16(tmp10, tmp13); 374 int16x8_t 
tmp1 = vaddq_s16(tmp11, tmp12); 375 int16x8_t tmp2 = vsubq_s16(tmp11, tmp12); 376 377 /* Odd part */ 378 int16x8_t z13 = vaddq_s16(col5, col3); 379 int16x8_t neg_z10 = vsubq_s16(col3, col5); 380 int16x8_t z11 = vaddq_s16(col1, col7); 381 int16x8_t z12 = vsubq_s16(col1, col7); 382 383 int16x8_t tmp7 = vaddq_s16(z11, z13); /* phase 5 */ 384 int16x8_t z11_sub_z13 = vsubq_s16(z11, z13); 385 tmp11 = vqdmulhq_lane_s16(z11_sub_z13, consts, 1); 386 tmp11 = vaddq_s16(tmp11, z11_sub_z13); 387 388 int16x8_t z10_add_z12 = vsubq_s16(z12, neg_z10); 389 int16x8_t z5 = vqdmulhq_lane_s16(z10_add_z12, consts, 2); 390 z5 = vaddq_s16(z5, z10_add_z12); 391 tmp10 = vqdmulhq_lane_s16(z12, consts, 0); 392 tmp10 = vaddq_s16(tmp10, z12); 393 tmp10 = vsubq_s16(tmp10, z5); 394 tmp12 = vqdmulhq_lane_s16(neg_z10, consts, 3); 395 tmp12 = vaddq_s16(tmp12, vaddq_s16(neg_z10, neg_z10)); 396 tmp12 = vaddq_s16(tmp12, z5); 397 398 int16x8_t tmp6 = vsubq_s16(tmp12, tmp7); /* phase 2 */ 399 int16x8_t tmp5 = vsubq_s16(tmp11, tmp6); 400 int16x8_t tmp4 = vaddq_s16(tmp10, tmp5); 401 402 col0 = vaddq_s16(tmp0, tmp7); 403 col7 = vsubq_s16(tmp0, tmp7); 404 col1 = vaddq_s16(tmp1, tmp6); 405 col6 = vsubq_s16(tmp1, tmp6); 406 col2 = vaddq_s16(tmp2, tmp5); 407 col5 = vsubq_s16(tmp2, tmp5); 408 col4 = vaddq_s16(tmp3, tmp4); 409 col3 = vsubq_s16(tmp3, tmp4); 410 411 /* Scale down by a factor of 8, narrowing to 8-bit. */ 412 int8x16_t cols_01_s8 = vcombine_s8(vqshrn_n_s16(col0, PASS1_BITS + 3), 413 vqshrn_n_s16(col1, PASS1_BITS + 3)); 414 int8x16_t cols_45_s8 = vcombine_s8(vqshrn_n_s16(col4, PASS1_BITS + 3), 415 vqshrn_n_s16(col5, PASS1_BITS + 3)); 416 int8x16_t cols_23_s8 = vcombine_s8(vqshrn_n_s16(col2, PASS1_BITS + 3), 417 vqshrn_n_s16(col3, PASS1_BITS + 3)); 418 int8x16_t cols_67_s8 = vcombine_s8(vqshrn_n_s16(col6, PASS1_BITS + 3), 419 vqshrn_n_s16(col7, PASS1_BITS + 3)); 420 /* Clamp to range [0-255]. 
*/ 421 uint8x16_t cols_01 = 422 vreinterpretq_u8_s8 423 (vaddq_s8(cols_01_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); 424 uint8x16_t cols_45 = 425 vreinterpretq_u8_s8 426 (vaddq_s8(cols_45_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); 427 uint8x16_t cols_23 = 428 vreinterpretq_u8_s8 429 (vaddq_s8(cols_23_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); 430 uint8x16_t cols_67 = 431 vreinterpretq_u8_s8 432 (vaddq_s8(cols_67_s8, vreinterpretq_s8_u8(vdupq_n_u8(CENTERJSAMPLE)))); 433 434 /* Transpose block to prepare for store. */ 435 uint32x4x2_t cols_0415 = vzipq_u32(vreinterpretq_u32_u8(cols_01), 436 vreinterpretq_u32_u8(cols_45)); 437 uint32x4x2_t cols_2637 = vzipq_u32(vreinterpretq_u32_u8(cols_23), 438 vreinterpretq_u32_u8(cols_67)); 439 440 uint8x16x2_t cols_0145 = vtrnq_u8(vreinterpretq_u8_u32(cols_0415.val[0]), 441 vreinterpretq_u8_u32(cols_0415.val[1])); 442 uint8x16x2_t cols_2367 = vtrnq_u8(vreinterpretq_u8_u32(cols_2637.val[0]), 443 vreinterpretq_u8_u32(cols_2637.val[1])); 444 uint16x8x2_t rows_0426 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[0]), 445 vreinterpretq_u16_u8(cols_2367.val[0])); 446 uint16x8x2_t rows_1537 = vtrnq_u16(vreinterpretq_u16_u8(cols_0145.val[1]), 447 vreinterpretq_u16_u8(cols_2367.val[1])); 448 449 uint8x16_t rows_04 = vreinterpretq_u8_u16(rows_0426.val[0]); 450 uint8x16_t rows_15 = vreinterpretq_u8_u16(rows_1537.val[0]); 451 uint8x16_t rows_26 = vreinterpretq_u8_u16(rows_0426.val[1]); 452 uint8x16_t rows_37 = vreinterpretq_u8_u16(rows_1537.val[1]); 453 454 JSAMPROW outptr0 = output_buf[0] + output_col; 455 JSAMPROW outptr1 = output_buf[1] + output_col; 456 JSAMPROW outptr2 = output_buf[2] + output_col; 457 JSAMPROW outptr3 = output_buf[3] + output_col; 458 JSAMPROW outptr4 = output_buf[4] + output_col; 459 JSAMPROW outptr5 = output_buf[5] + output_col; 460 JSAMPROW outptr6 = output_buf[6] + output_col; 461 JSAMPROW outptr7 = output_buf[7] + output_col; 462 463 /* Store DCT block to memory. 
*/ 464 vst1q_lane_u64((uint64_t *)outptr0, vreinterpretq_u64_u8(rows_04), 0); 465 vst1q_lane_u64((uint64_t *)outptr1, vreinterpretq_u64_u8(rows_15), 0); 466 vst1q_lane_u64((uint64_t *)outptr2, vreinterpretq_u64_u8(rows_26), 0); 467 vst1q_lane_u64((uint64_t *)outptr3, vreinterpretq_u64_u8(rows_37), 0); 468 vst1q_lane_u64((uint64_t *)outptr4, vreinterpretq_u64_u8(rows_04), 1); 469 vst1q_lane_u64((uint64_t *)outptr5, vreinterpretq_u64_u8(rows_15), 1); 470 vst1q_lane_u64((uint64_t *)outptr6, vreinterpretq_u64_u8(rows_26), 1); 471 vst1q_lane_u64((uint64_t *)outptr7, vreinterpretq_u64_u8(rows_37), 1); 472 }