jfdctint-neon.c (15054B)
/*
 * jfdctint-neon.c - accurate integer FDCT (Arm Neon)
 *
 * Copyright (C) 2020, Arm Limited.  All Rights Reserved.
 * Copyright (C) 2020, D. R. Commander.  All Rights Reserved.
 *
 * This software is provided 'as-is', without any express or implied
 * warranty.  In no event will the authors be held liable for any damages
 * arising from the use of this software.
 *
 * Permission is granted to anyone to use this software for any purpose,
 * including commercial applications, and to alter it and redistribute it
 * freely, subject to the following restrictions:
 *
 * 1. The origin of this software must not be misrepresented; you must not
 *    claim that you wrote the original software. If you use this software
 *    in a product, an acknowledgment in the product documentation would be
 *    appreciated but is not required.
 * 2. Altered source versions must be plainly marked as such, and must not be
 *    misrepresented as being the original software.
 * 3. This notice may not be removed or altered from any source distribution.
 */

#define JPEG_INTERNALS
#include "../../jinclude.h"
#include "../../jpeglib.h"
#include "../../jsimd.h"
#include "../../jdct.h"
#include "../../jsimddct.h"
#include "../jsimd.h"
#include "align.h"
#include "neon-compat.h"

#include <arm_neon.h>


/* jsimd_fdct_islow_neon() performs a slower but more accurate forward DCT
 * (Discrete Cosine Transform) on one block of samples.  It uses the same
 * calculations and produces exactly the same output as IJG's original
 * jpeg_fdct_islow() function, which can be found in jfdctint.c.
 *
 * Scaled integer constants are used to avoid floating-point arithmetic:
 *    0.298631336 =  2446 * 2^-13
 *    0.390180644 =  3196 * 2^-13
 *    0.541196100 =  4433 * 2^-13
 *    0.765366865 =  6270 * 2^-13
 *    0.899976223 =  7373 * 2^-13
 *    1.175875602 =  9633 * 2^-13
 *    1.501321110 = 12299 * 2^-13
 *    1.847759065 = 15137 * 2^-13
 *    1.961570560 = 16069 * 2^-13
 *    2.053119869 = 16819 * 2^-13
 *    2.562915447 = 20995 * 2^-13
 *    3.072711026 = 25172 * 2^-13
 *
 * See jfdctint.c for further details of the DCT algorithm.  Where possible,
 * the variable names and comments here in jsimd_fdct_islow_neon() match up
 * with those in jpeg_fdct_islow().
 */

#define CONST_BITS  13
#define PASS1_BITS  2

#define DESCALE_P1  (CONST_BITS - PASS1_BITS)
#define DESCALE_P2  (CONST_BITS + PASS1_BITS)

#define F_0_298  2446
#define F_0_390  3196
#define F_0_541  4433
#define F_0_765  6270
#define F_0_899  7373
#define F_1_175  9633
#define F_1_501  12299
#define F_1_847  15137
#define F_1_961  16069
#define F_2_053  16819
#define F_2_562  20995
#define F_3_072  25172


ALIGN(16) static const int16_t jsimd_fdct_islow_neon_consts[] = {
  F_0_298, -F_0_390,  F_0_541,  F_0_765,
 -F_0_899,  F_1_175,  F_1_501, -F_1_847,
 -F_1_961,  F_2_053, -F_2_562,  F_3_072
};
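/* Each constant above is the real-valued coefficient in Q13 fixed-point
 * form, so multiplying by a coefficient becomes a 16x16 -> 32-bit integer
 * multiply followed by a rounding right shift.  A worked example (added for
 * illustration): scaling 100 by 0.541196100 using F_0_541 = 4433 gives
 *
 *   (100 * 4433 + (1 << 12)) >> 13 = 54  ~=  100 * 0.541196100 = 54.1
 *
 * In the code below the shifts also fold in the inter-pass scaling, so
 * DESCALE_P1 and DESCALE_P2 are used rather than CONST_BITS itself.
 */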
void jsimd_fdct_islow_neon(DCTELEM *data)
{
  /* Load DCT constants. */
#ifdef HAVE_VLD1_S16_X3
  const int16x4x3_t consts = vld1_s16_x3(jsimd_fdct_islow_neon_consts);
#else
  /* GCC does not currently support the intrinsic vld1_<type>_x3(). */
  const int16x4_t consts1 = vld1_s16(jsimd_fdct_islow_neon_consts);
  const int16x4_t consts2 = vld1_s16(jsimd_fdct_islow_neon_consts + 4);
  const int16x4_t consts3 = vld1_s16(jsimd_fdct_islow_neon_consts + 8);
  const int16x4x3_t consts = { { consts1, consts2, consts3 } };
#endif

  /* Load an 8x8 block of samples into Neon registers.  De-interleaving loads
   * are used, followed by vuzp to transpose the block such that we have a
   * column of samples per vector - allowing all rows to be processed at once.
   */
  int16x8x4_t s_rows_0123 = vld4q_s16(data);
  int16x8x4_t s_rows_4567 = vld4q_s16(data + 4 * DCTSIZE);

  int16x8x2_t cols_04 = vuzpq_s16(s_rows_0123.val[0], s_rows_4567.val[0]);
  int16x8x2_t cols_15 = vuzpq_s16(s_rows_0123.val[1], s_rows_4567.val[1]);
  int16x8x2_t cols_26 = vuzpq_s16(s_rows_0123.val[2], s_rows_4567.val[2]);
  int16x8x2_t cols_37 = vuzpq_s16(s_rows_0123.val[3], s_rows_4567.val[3]);

  int16x8_t col0 = cols_04.val[0];
  int16x8_t col1 = cols_15.val[0];
  int16x8_t col2 = cols_26.val[0];
  int16x8_t col3 = cols_37.val[0];
  int16x8_t col4 = cols_04.val[1];
  int16x8_t col5 = cols_15.val[1];
  int16x8_t col6 = cols_26.val[1];
  int16x8_t col7 = cols_37.val[1];

  /* Pass 1: process rows. */

  int16x8_t tmp0 = vaddq_s16(col0, col7);
  int16x8_t tmp7 = vsubq_s16(col0, col7);
  int16x8_t tmp1 = vaddq_s16(col1, col6);
  int16x8_t tmp6 = vsubq_s16(col1, col6);
  int16x8_t tmp2 = vaddq_s16(col2, col5);
  int16x8_t tmp5 = vsubq_s16(col2, col5);
  int16x8_t tmp3 = vaddq_s16(col3, col4);
  int16x8_t tmp4 = vsubq_s16(col3, col4);

  /* Even part */
  int16x8_t tmp10 = vaddq_s16(tmp0, tmp3);
  int16x8_t tmp13 = vsubq_s16(tmp0, tmp3);
  int16x8_t tmp11 = vaddq_s16(tmp1, tmp2);
  int16x8_t tmp12 = vsubq_s16(tmp1, tmp2);

  col0 = vshlq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
  col4 = vshlq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);

  int16x8_t tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
  int32x4_t z1_l =
    vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
  int32x4_t z1_h =
    vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);

  int32x4_t col2_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
  int32x4_t col2_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
  col2 = vcombine_s16(vrshrn_n_s32(col2_scaled_l, DESCALE_P1),
                      vrshrn_n_s32(col2_scaled_h, DESCALE_P1));

  int32x4_t col6_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
  int32x4_t col6_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
  col6 = vcombine_s16(vrshrn_n_s32(col6_scaled_l, DESCALE_P1),
                      vrshrn_n_s32(col6_scaled_h, DESCALE_P1));
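  /* The even part above mirrors the rotation in jpeg_fdct_islow():
   *   z1   = (tmp12 + tmp13) * 0.541196100
   *   col2 = DESCALE(z1 + tmp13 * 0.765366865, DESCALE_P1)
   *   col6 = DESCALE(z1 - tmp12 * 1.847759065, DESCALE_P1)
   * The subtraction is expressed as a multiply-accumulate (vmlal) because
   * the constant is stored negated (-F_1_847) in the consts table.
   */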
  /* Odd part */
  int16x8_t z1 = vaddq_s16(tmp4, tmp7);
  int16x8_t z2 = vaddq_s16(tmp5, tmp6);
  int16x8_t z3 = vaddq_s16(tmp4, tmp6);
  int16x8_t z4 = vaddq_s16(tmp5, tmp7);
  /* sqrt(2) * c3 */
  int32x4_t z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
  int32x4_t z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);

  /* sqrt(2) * (-c1+c3+c5-c7) */
  int32x4_t tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
  int32x4_t tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
  /* sqrt(2) * ( c1+c3-c5+c7) */
  int32x4_t tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
  int32x4_t tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
  /* sqrt(2) * ( c1+c3+c5-c7) */
  int32x4_t tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
  int32x4_t tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
  /* sqrt(2) * ( c1+c3-c5-c7) */
  int32x4_t tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
  int32x4_t tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);

  /* sqrt(2) * (c7-c3) */
  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
  /* sqrt(2) * (-c1-c3) */
  int32x4_t z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
  int32x4_t z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
  /* sqrt(2) * (-c3-c5) */
  int32x4_t z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
  int32x4_t z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
  /* sqrt(2) * (c5-c3) */
  int32x4_t z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
  int32x4_t z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);

  z3_l = vaddq_s32(z3_l, z5_l);
  z3_h = vaddq_s32(z3_h, z5_h);
  z4_l = vaddq_s32(z4_l, z5_l);
  z4_h = vaddq_s32(z4_h, z5_h);

  tmp4_l = vaddq_s32(tmp4_l, z1_l);
  tmp4_h = vaddq_s32(tmp4_h, z1_h);
  tmp4_l = vaddq_s32(tmp4_l, z3_l);
  tmp4_h = vaddq_s32(tmp4_h, z3_h);
  col7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P1),
                      vrshrn_n_s32(tmp4_h, DESCALE_P1));

  tmp5_l = vaddq_s32(tmp5_l, z2_l);
  tmp5_h = vaddq_s32(tmp5_h, z2_h);
  tmp5_l = vaddq_s32(tmp5_l, z4_l);
  tmp5_h = vaddq_s32(tmp5_h, z4_h);
  col5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P1),
                      vrshrn_n_s32(tmp5_h, DESCALE_P1));

  tmp6_l = vaddq_s32(tmp6_l, z2_l);
  tmp6_h = vaddq_s32(tmp6_h, z2_h);
  tmp6_l = vaddq_s32(tmp6_l, z3_l);
  tmp6_h = vaddq_s32(tmp6_h, z3_h);
  col3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P1),
                      vrshrn_n_s32(tmp6_h, DESCALE_P1));

  tmp7_l = vaddq_s32(tmp7_l, z1_l);
  tmp7_h = vaddq_s32(tmp7_h, z1_h);
  tmp7_l = vaddq_s32(tmp7_l, z4_l);
  tmp7_h = vaddq_s32(tmp7_h, z4_h);
  col1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P1),
                      vrshrn_n_s32(tmp7_h, DESCALE_P1));
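  /* vrshrn_n_s32(x, DESCALE_P1) is the Neon equivalent of
   * DESCALE(x, CONST_BITS - PASS1_BITS) in jfdctint.c: it adds
   * 1 << (DESCALE_P1 - 1) for rounding, shifts right by DESCALE_P1 = 11 and
   * narrows back to 16 bits, leaving the pass 1 results scaled up by
   * 2^PASS1_BITS.
   */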
  /* Transpose to work on columns in pass 2. */
  int16x8x2_t cols_01 = vtrnq_s16(col0, col1);
  int16x8x2_t cols_23 = vtrnq_s16(col2, col3);
  int16x8x2_t cols_45 = vtrnq_s16(col4, col5);
  int16x8x2_t cols_67 = vtrnq_s16(col6, col7);

  int32x4x2_t cols_0145_l = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[0]),
                                      vreinterpretq_s32_s16(cols_45.val[0]));
  int32x4x2_t cols_0145_h = vtrnq_s32(vreinterpretq_s32_s16(cols_01.val[1]),
                                      vreinterpretq_s32_s16(cols_45.val[1]));
  int32x4x2_t cols_2367_l = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[0]),
                                      vreinterpretq_s32_s16(cols_67.val[0]));
  int32x4x2_t cols_2367_h = vtrnq_s32(vreinterpretq_s32_s16(cols_23.val[1]),
                                      vreinterpretq_s32_s16(cols_67.val[1]));

  int32x4x2_t rows_04 = vzipq_s32(cols_0145_l.val[0], cols_2367_l.val[0]);
  int32x4x2_t rows_15 = vzipq_s32(cols_0145_h.val[0], cols_2367_h.val[0]);
  int32x4x2_t rows_26 = vzipq_s32(cols_0145_l.val[1], cols_2367_l.val[1]);
  int32x4x2_t rows_37 = vzipq_s32(cols_0145_h.val[1], cols_2367_h.val[1]);

  int16x8_t row0 = vreinterpretq_s16_s32(rows_04.val[0]);
  int16x8_t row1 = vreinterpretq_s16_s32(rows_15.val[0]);
  int16x8_t row2 = vreinterpretq_s16_s32(rows_26.val[0]);
  int16x8_t row3 = vreinterpretq_s16_s32(rows_37.val[0]);
  int16x8_t row4 = vreinterpretq_s16_s32(rows_04.val[1]);
  int16x8_t row5 = vreinterpretq_s16_s32(rows_15.val[1]);
  int16x8_t row6 = vreinterpretq_s16_s32(rows_26.val[1]);
  int16x8_t row7 = vreinterpretq_s16_s32(rows_37.val[1]);

  /* Pass 2: process columns. */

  tmp0 = vaddq_s16(row0, row7);
  tmp7 = vsubq_s16(row0, row7);
  tmp1 = vaddq_s16(row1, row6);
  tmp6 = vsubq_s16(row1, row6);
  tmp2 = vaddq_s16(row2, row5);
  tmp5 = vsubq_s16(row2, row5);
  tmp3 = vaddq_s16(row3, row4);
  tmp4 = vsubq_s16(row3, row4);

  /* Even part */
  tmp10 = vaddq_s16(tmp0, tmp3);
  tmp13 = vsubq_s16(tmp0, tmp3);
  tmp11 = vaddq_s16(tmp1, tmp2);
  tmp12 = vsubq_s16(tmp1, tmp2);

  row0 = vrshrq_n_s16(vaddq_s16(tmp10, tmp11), PASS1_BITS);
  row4 = vrshrq_n_s16(vsubq_s16(tmp10, tmp11), PASS1_BITS);

  tmp12_add_tmp13 = vaddq_s16(tmp12, tmp13);
  z1_l = vmull_lane_s16(vget_low_s16(tmp12_add_tmp13), consts.val[0], 2);
  z1_h = vmull_lane_s16(vget_high_s16(tmp12_add_tmp13), consts.val[0], 2);

  int32x4_t row2_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp13), consts.val[0], 3);
  int32x4_t row2_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp13), consts.val[0], 3);
  row2 = vcombine_s16(vrshrn_n_s32(row2_scaled_l, DESCALE_P2),
                      vrshrn_n_s32(row2_scaled_h, DESCALE_P2));

  int32x4_t row6_scaled_l =
    vmlal_lane_s16(z1_l, vget_low_s16(tmp12), consts.val[1], 3);
  int32x4_t row6_scaled_h =
    vmlal_lane_s16(z1_h, vget_high_s16(tmp12), consts.val[1], 3);
  row6 = vcombine_s16(vrshrn_n_s32(row6_scaled_l, DESCALE_P2),
                      vrshrn_n_s32(row6_scaled_h, DESCALE_P2));
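  /* Pass 2 removes the 2^PASS1_BITS scaling left over from pass 1: the DC
   * terms above use a rounding shift by PASS1_BITS, while the multiplied
   * terms descale by DESCALE_P2 = CONST_BITS + PASS1_BITS.  As in
   * jpeg_fdct_islow(), the final outputs remain scaled up by an overall
   * factor of 8 relative to a true DCT (see the scaling notes in jfdctint.c).
   */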
  /* Odd part */
  z1 = vaddq_s16(tmp4, tmp7);
  z2 = vaddq_s16(tmp5, tmp6);
  z3 = vaddq_s16(tmp4, tmp6);
  z4 = vaddq_s16(tmp5, tmp7);
  /* sqrt(2) * c3 */
  z5_l = vmull_lane_s16(vget_low_s16(z3), consts.val[1], 1);
  z5_h = vmull_lane_s16(vget_high_s16(z3), consts.val[1], 1);
  z5_l = vmlal_lane_s16(z5_l, vget_low_s16(z4), consts.val[1], 1);
  z5_h = vmlal_lane_s16(z5_h, vget_high_s16(z4), consts.val[1], 1);

  /* sqrt(2) * (-c1+c3+c5-c7) */
  tmp4_l = vmull_lane_s16(vget_low_s16(tmp4), consts.val[0], 0);
  tmp4_h = vmull_lane_s16(vget_high_s16(tmp4), consts.val[0], 0);
  /* sqrt(2) * ( c1+c3-c5+c7) */
  tmp5_l = vmull_lane_s16(vget_low_s16(tmp5), consts.val[2], 1);
  tmp5_h = vmull_lane_s16(vget_high_s16(tmp5), consts.val[2], 1);
  /* sqrt(2) * ( c1+c3+c5-c7) */
  tmp6_l = vmull_lane_s16(vget_low_s16(tmp6), consts.val[2], 3);
  tmp6_h = vmull_lane_s16(vget_high_s16(tmp6), consts.val[2], 3);
  /* sqrt(2) * ( c1+c3-c5-c7) */
  tmp7_l = vmull_lane_s16(vget_low_s16(tmp7), consts.val[1], 2);
  tmp7_h = vmull_lane_s16(vget_high_s16(tmp7), consts.val[1], 2);

  /* sqrt(2) * (c7-c3) */
  z1_l = vmull_lane_s16(vget_low_s16(z1), consts.val[1], 0);
  z1_h = vmull_lane_s16(vget_high_s16(z1), consts.val[1], 0);
  /* sqrt(2) * (-c1-c3) */
  z2_l = vmull_lane_s16(vget_low_s16(z2), consts.val[2], 2);
  z2_h = vmull_lane_s16(vget_high_s16(z2), consts.val[2], 2);
  /* sqrt(2) * (-c3-c5) */
  z3_l = vmull_lane_s16(vget_low_s16(z3), consts.val[2], 0);
  z3_h = vmull_lane_s16(vget_high_s16(z3), consts.val[2], 0);
  /* sqrt(2) * (c5-c3) */
  z4_l = vmull_lane_s16(vget_low_s16(z4), consts.val[0], 1);
  z4_h = vmull_lane_s16(vget_high_s16(z4), consts.val[0], 1);

  z3_l = vaddq_s32(z3_l, z5_l);
  z3_h = vaddq_s32(z3_h, z5_h);
  z4_l = vaddq_s32(z4_l, z5_l);
  z4_h = vaddq_s32(z4_h, z5_h);

  tmp4_l = vaddq_s32(tmp4_l, z1_l);
  tmp4_h = vaddq_s32(tmp4_h, z1_h);
  tmp4_l = vaddq_s32(tmp4_l, z3_l);
  tmp4_h = vaddq_s32(tmp4_h, z3_h);
  row7 = vcombine_s16(vrshrn_n_s32(tmp4_l, DESCALE_P2),
                      vrshrn_n_s32(tmp4_h, DESCALE_P2));

  tmp5_l = vaddq_s32(tmp5_l, z2_l);
  tmp5_h = vaddq_s32(tmp5_h, z2_h);
  tmp5_l = vaddq_s32(tmp5_l, z4_l);
  tmp5_h = vaddq_s32(tmp5_h, z4_h);
  row5 = vcombine_s16(vrshrn_n_s32(tmp5_l, DESCALE_P2),
                      vrshrn_n_s32(tmp5_h, DESCALE_P2));

  tmp6_l = vaddq_s32(tmp6_l, z2_l);
  tmp6_h = vaddq_s32(tmp6_h, z2_h);
  tmp6_l = vaddq_s32(tmp6_l, z3_l);
  tmp6_h = vaddq_s32(tmp6_h, z3_h);
  row3 = vcombine_s16(vrshrn_n_s32(tmp6_l, DESCALE_P2),
                      vrshrn_n_s32(tmp6_h, DESCALE_P2));

  tmp7_l = vaddq_s32(tmp7_l, z1_l);
  tmp7_h = vaddq_s32(tmp7_h, z1_h);
  tmp7_l = vaddq_s32(tmp7_l, z4_l);
  tmp7_h = vaddq_s32(tmp7_h, z4_h);
  row1 = vcombine_s16(vrshrn_n_s32(tmp7_l, DESCALE_P2),
                      vrshrn_n_s32(tmp7_h, DESCALE_P2));

  vst1q_s16(data + 0 * DCTSIZE, row0);
  vst1q_s16(data + 1 * DCTSIZE, row1);
  vst1q_s16(data + 2 * DCTSIZE, row2);
  vst1q_s16(data + 3 * DCTSIZE, row3);
  vst1q_s16(data + 4 * DCTSIZE, row4);
  vst1q_s16(data + 5 * DCTSIZE, row5);
  vst1q_s16(data + 6 * DCTSIZE, row6);
  vst1q_s16(data + 7 * DCTSIZE, row7);
}
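/* The sketch below (guarded by the hypothetical macro FDCT_NEON_DEMO) is an
 * illustrative addition, not part of libjpeg-turbo: it shows how the kernel
 * might be exercised in isolation.  jsimd_fdct_islow_neon() transforms the
 * 8x8 block of 16-bit DCTELEM samples in place, in row-major order.
 */
#ifdef FDCT_NEON_DEMO

#include <stdio.h>

int main(void)
{
  DCTELEM block[DCTSIZE2];
  int i;

  /* Fill the block with a horizontal ramp of level-shifted samples. */
  for (i = 0; i < DCTSIZE2; i++)
    block[i] = (DCTELEM)((i % DCTSIZE) * 16 - 64);

  jsimd_fdct_islow_neon(block);  /* in-place forward DCT */

  /* A purely horizontal pattern has no vertical frequency content, so only
   * the first row of coefficients should hold significant energy.
   */
  for (i = 0; i < DCTSIZE2; i++)
    printf("%6d%s", block[i], ((i + 1) % DCTSIZE) ? " " : "\n");
  return 0;
}

#endif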