av1_inv_txfm_neon.c (154792B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 14 #include "config/aom_config.h" 15 #include "config/aom_dsp_rtcd.h" 16 #include "config/av1_rtcd.h" 17 18 #include "aom_dsp/arm/transpose_neon.h" 19 #include "av1/common/av1_inv_txfm1d.h" 20 #include "av1/common/av1_inv_txfm1d_cfg.h" 21 #include "av1/common/av1_txfm.h" 22 #include "av1/common/enums.h" 23 #include "av1/common/idct.h" 24 #include "av1/common/arm/av1_inv_txfm_neon.h" 25 26 // 1D itx types 27 typedef enum ATTRIBUTE_PACKED { 28 IDCT_1D, 29 IADST_1D, 30 IFLIPADST_1D = IADST_1D, 31 IIDENTITY_1D, 32 ITX_TYPES_1D, 33 } ITX_TYPE_1D; 34 35 static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = { 36 IDCT_1D, IADST_1D, IDCT_1D, IADST_1D, 37 IFLIPADST_1D, IDCT_1D, IFLIPADST_1D, IADST_1D, 38 IFLIPADST_1D, IIDENTITY_1D, IDCT_1D, IIDENTITY_1D, 39 IADST_1D, IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D, 40 }; 41 42 static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = { 43 IDCT_1D, IDCT_1D, IADST_1D, IADST_1D, 44 IDCT_1D, IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D, 45 IADST_1D, IIDENTITY_1D, IIDENTITY_1D, IDCT_1D, 46 IIDENTITY_1D, IADST_1D, IIDENTITY_1D, IFLIPADST_1D, 47 }; 48 49 // 1D functions 50 static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = { 51 { av1_idct4, av1_iadst4, av1_iidentity4_c }, 52 { av1_idct8, av1_iadst8, av1_iidentity8_c }, 53 { av1_idct16, av1_iadst16, av1_iidentity16_c }, 54 { av1_idct32, NULL, NULL }, 55 { av1_idct64, NULL, NULL }, 56 }; 57 58 static inline void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in, 59 uint8_t *output, int stride, 60 int flipud, 61 const int height) { 62 int j = flipud ? (height - 1) : 0; 63 const int step = flipud ? -1 : 1; 64 int16x8_t temp_output; 65 for (int i = 0; i < height; ++i, j += step) { 66 temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output))); 67 temp_output = vaddq_s16(temp_output, in[j]); 68 vst1_u8(output, vqmovun_s16(temp_output)); 69 output += stride; 70 } 71 } 72 73 static inline uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred, 74 int16x8_t res0, 75 int16x8_t res1) { 76 int16x8_t temp_output[2]; 77 uint8x16_t temp_output_8q; 78 temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred))); 79 temp_output[0] = vaddq_s16(temp_output[0], res0); 80 temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred))); 81 temp_output[1] = vaddq_s16(temp_output[1], res1); 82 temp_output_8q = 83 vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1])); 84 return temp_output_8q; 85 } 86 87 static inline void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in, 88 uint8_t *output, int stride, 89 int flipud, int height) { 90 uint8x16_t temp_output_8q; 91 int j = flipud ? (height - 1) : 0; 92 const int step = flipud ? -1 : 1; 93 for (int i = 0; i < height; ++i, j += step) { 94 temp_output_8q = vld1q_u8(output + i * stride); 95 temp_output_8q = 96 lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]); 97 vst1q_u8((output + i * stride), temp_output_8q); 98 } 99 } 100 101 static inline void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size, 102 int value) { 103 for (int i = 0; i < size; i++) { 104 a[i] = vdupq_n_s16((int16_t)value); 105 } 106 } 107 108 static inline void btf_16_lane_0_1_neon(const int16x8_t in0, 109 const int16x8_t in1, const int16x4_t c, 110 int16x8_t *t0, int16x8_t *t1) { 111 int32x4_t s0[2], s1[2]; 112 int16x4_t v0[2], v1[2]; 113 114 s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); 115 s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); 116 s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); 117 s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); 118 119 s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1); 120 s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1); 121 s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0); 122 s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0); 123 124 v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); 125 v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); 126 v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); 127 v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); 128 129 *t0 = vcombine_s16(v0[0], v0[1]); 130 *t1 = vcombine_s16(v1[0], v1[1]); 131 } 132 133 static inline void btf_16_lane_1_0_neon(const int16x8_t in0, 134 const int16x8_t in1, const int16x4_t c, 135 int16x8_t *t0, int16x8_t *t1) { 136 int32x4_t s0[2], s1[2]; 137 int16x4_t v0[2], v1[2]; 138 139 s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1); 140 s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1); 141 s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0); 142 s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0); 143 144 s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0); 145 s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0); 146 s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1); 147 s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1); 148 149 v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); 150 v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); 151 v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); 152 v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); 153 154 *t0 = vcombine_s16(v0[0], v0[1]); 155 *t1 = vcombine_s16(v1[0], v1[1]); 156 } 157 158 static inline void btf_16_lane_2_3_neon(const int16x8_t in0, 159 const int16x8_t in1, const int16x4_t c, 160 int16x8_t *t0, int16x8_t *t1) { 161 int32x4_t s0[2], s1[2]; 162 int16x4_t v0[2], v1[2]; 163 164 s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); 165 s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); 166 s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); 167 s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); 168 169 s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3); 170 s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3); 171 s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2); 172 s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2); 173 174 v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); 175 v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); 176 v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); 177 v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); 178 179 *t0 = vcombine_s16(v0[0], v0[1]); 180 *t1 = vcombine_s16(v1[0], v1[1]); 181 } 182 183 static inline void btf_16_neon(const int16x8_t in0, int16_t coef1, 184 int16_t coef2, int16x8_t *t0, int16x8_t *t1) { 185 int32x4_t s0_l, s0_h, s1_l, s1_h; 186 int16x4_t v0[2], v1[2]; 187 188 s0_l = vmull_n_s16(vget_low_s16(in0), coef1); 189 s0_h = vmull_n_s16(vget_high_s16(in0), coef1); 190 s1_l = vmull_n_s16(vget_low_s16(in0), coef2); 191 s1_h = vmull_n_s16(vget_high_s16(in0), coef2); 192 193 v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT); 194 v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT); 195 v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT); 196 v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT); 197 198 *t0 = vcombine_s16(v0[0], v0[1]); 199 *t1 = vcombine_s16(v1[0], v1[1]); 200 } 201 202 static inline void btf_16_lane_3_2_neon(const int16x8_t in0, 203 const int16x8_t in1, const int16x4_t c, 204 int16x8_t *t0, int16x8_t *t1) { 205 int32x4_t s0[2], s1[2]; 206 int16x4_t v0[2], v1[2]; 207 208 s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3); 209 s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3); 210 s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2); 211 s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2); 212 213 s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2); 214 s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2); 215 s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3); 216 s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3); 217 218 v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT); 219 v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT); 220 v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT); 221 v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT); 222 223 *t0 = vcombine_s16(v0[0], v0[1]); 224 *t1 = vcombine_s16(v1[0], v1[1]); 225 } 226 227 static inline void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) { 228 int32x4_t t0[2], t1[2]; 229 int16x4_t v0[2], v1[2]; 230 231 // Don't add/sub before multiply, which will overflow in iadst8. 232 const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0); 233 const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0); 234 const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0); 235 const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0); 236 237 t0[0] = vaddq_s32(x0_lo, x1_lo); 238 t0[1] = vaddq_s32(x0_hi, x1_hi); 239 t1[0] = vsubq_s32(x0_lo, x1_lo); 240 t1[1] = vsubq_s32(x0_hi, x1_hi); 241 242 v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT); 243 v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT); 244 v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT); 245 v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT); 246 247 x[0] = vcombine_s16(v0[0], v0[1]); 248 x[1] = vcombine_s16(v1[0], v1[1]); 249 } 250 251 static inline int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1, 252 const int16_t c2, const int16_t c3) { 253 int16x4_t val = vdup_n_s16(c0); 254 val = vset_lane_s16(c1, val, 1); 255 val = vset_lane_s16(c2, val, 2); 256 val = vset_lane_s16(c3, val, 3); 257 return val; 258 } 259 260 static inline void iadst8_neon(int16x8_t *const in, int16x8_t *out, 261 int8_t cos_bit) { 262 const int32_t *cospi = cospi_arr(cos_bit); 263 264 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], 265 (int16_t)cospi[20], (int16_t)cospi[44]); 266 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28], 267 (int16_t)cospi[52], (int16_t)cospi[12]); 268 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 269 (int16_t)cospi[16], (int16_t)cospi[48]); 270 271 int16x8_t x[8]; 272 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; 273 274 // Stage 1 275 x[0] = in[7]; 276 x[1] = in[0]; 277 x[2] = in[5]; 278 x[3] = in[2]; 279 x[4] = in[3]; 280 x[5] = in[4]; 281 x[6] = in[1]; 282 x[7] = in[6]; 283 284 // Stage 2 285 btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); 286 btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); 287 btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); 288 btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); 289 290 // Stage 3 291 x[0] = vqaddq_s16(s0, s4); 292 x[1] = vqaddq_s16(s1, s5); 293 x[2] = vqaddq_s16(s2, s6); 294 x[3] = vqaddq_s16(s3, s7); 295 x[4] = vqsubq_s16(s0, s4); 296 x[5] = vqsubq_s16(s1, s5); 297 x[6] = vqsubq_s16(s2, s6); 298 x[7] = vqsubq_s16(s3, s7); 299 300 // Stage 4 301 s0 = x[0]; 302 s1 = x[1]; 303 s2 = x[2]; 304 s3 = x[3]; 305 btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); 306 btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6); 307 308 // Stage 5 309 x[0] = vqaddq_s16(s0, s2); 310 x[1] = vqaddq_s16(s1, s3); 311 x[2] = vqsubq_s16(s0, s2); 312 x[3] = vqsubq_s16(s1, s3); 313 x[4] = vqaddq_s16(s4, s6); 314 x[5] = vqaddq_s16(s5, s7); 315 x[6] = vqsubq_s16(s4, s6); 316 x[7] = vqsubq_s16(s5, s7); 317 318 // stage 6 319 btf_16_half_neon(x + 2, c2); 320 btf_16_half_neon(x + 6, c2); 321 322 // Stage 7 323 out[0] = x[0]; 324 out[1] = vqnegq_s16(x[4]); 325 out[2] = x[6]; 326 out[3] = vqnegq_s16(x[2]); 327 out[4] = x[3]; 328 out[5] = vqnegq_s16(x[7]); 329 out[6] = x[5]; 330 out[7] = vqnegq_s16(x[1]); 331 } 332 333 static inline void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out, 334 int8_t cos_bit) { 335 const int32_t *cospi = cospi_arr(cos_bit); 336 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 337 (int16_t)cospi[16], (int16_t)cospi[48]); 338 339 int16x8_t x[8]; 340 int16x8_t s0, s1, s4, s5; 341 342 // Stage 1 343 x[1] = in[0]; 344 345 // Stage 2 346 347 btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1); 348 349 // Stage 3 350 x[0] = s0; 351 x[1] = s1; 352 x[4] = s0; 353 x[5] = s1; 354 355 // Stage 4 356 s0 = x[0]; 357 s1 = x[1]; 358 btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5); 359 360 // Stage 5 361 x[0] = s0; 362 x[1] = s1; 363 x[2] = s0; 364 x[3] = s1; 365 x[4] = s4; 366 x[5] = s5; 367 x[6] = s4; 368 x[7] = s5; 369 370 // stage 6 371 btf_16_half_neon(x + 2, c2); 372 btf_16_half_neon(x + 6, c2); 373 374 // Stage 7 375 out[0] = x[0]; 376 out[1] = vqnegq_s16(x[4]); 377 out[2] = x[6]; 378 out[3] = vqnegq_s16(x[2]); 379 out[4] = x[3]; 380 out[5] = vqnegq_s16(x[7]); 381 out[6] = x[5]; 382 out[7] = vqnegq_s16(x[1]); 383 } 384 385 static inline void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { 386 const int32_t *cospi = cospi_arr(cos_bit); 387 int16x8_t step1[8], step2[8]; 388 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 389 (int16_t)cospi[40], (int16_t)cospi[24]); 390 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 391 (int16_t)cospi[16], (int16_t)cospi[48]); 392 393 // stage 2 394 btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]); 395 btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]); 396 397 // stage 3 398 btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]); 399 btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]); 400 step2[4] = vqaddq_s16(step1[4], step1[5]); 401 step2[5] = vqsubq_s16(step1[4], step1[5]); 402 step2[6] = vqsubq_s16(step1[7], step1[6]); 403 step2[7] = vqaddq_s16(step1[7], step1[6]); 404 405 // stage 4 406 step1[0] = vqaddq_s16(step2[0], step2[3]); 407 step1[1] = vqaddq_s16(step2[1], step2[2]); 408 step1[2] = vqsubq_s16(step2[1], step2[2]); 409 step1[3] = vqsubq_s16(step2[0], step2[3]); 410 btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]); 411 412 // stage 5 413 out[0] = vqaddq_s16(step1[0], step2[7]); 414 out[1] = vqaddq_s16(step1[1], step1[6]); 415 out[2] = vqaddq_s16(step1[2], step1[5]); 416 out[3] = vqaddq_s16(step1[3], step2[4]); 417 out[4] = vqsubq_s16(step1[3], step2[4]); 418 out[5] = vqsubq_s16(step1[2], step1[5]); 419 out[6] = vqsubq_s16(step1[1], step1[6]); 420 out[7] = vqsubq_s16(step1[0], step2[7]); 421 } 422 423 static inline void idct8_low1_neon(int16x8_t *in, int16x8_t *out, 424 int8_t cos_bit) { 425 const int32_t *cospi = cospi_arr(cos_bit); 426 int16x8_t step1; 427 int32x4_t t32[2]; 428 429 // stage 1 430 // stage 2 431 // stage 3 432 t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]); 433 t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]); 434 435 step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), 436 vrshrn_n_s32(t32[1], INV_COS_BIT)); 437 438 // stage 4 439 // stage 5 440 out[0] = step1; 441 out[1] = step1; 442 out[2] = step1; 443 out[3] = step1; 444 out[4] = step1; 445 out[5] = step1; 446 out[6] = step1; 447 out[7] = step1; 448 } 449 450 static void round_shift_array_16_neon(int16x8_t *arr, int size, int bit) { 451 assert(!(size % 4)); 452 if (!bit) return; 453 const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit)); 454 for (int i = 0; i < size; i++) { 455 arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8); 456 } 457 } 458 459 static inline void flip_buf_ud_neon(int16x8_t *input, int size) { 460 int16x8_t temp[8]; 461 for (int i = 0; i < size; ++i) { 462 temp[i] = input[size - 1 - i]; 463 } 464 for (int i = 0; i < size; ++i) { 465 input[i] = temp[i]; 466 } 467 } 468 469 static inline void load_buffer_32bit_to_16bit_neon(const int32_t *input, 470 int stride, 471 int16x8_t *const a, 472 int out_size) { 473 for (int i = 0; i < out_size; ++i) { 474 a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)), 475 vmovn_s32(vld1q_s32(input + 4))); 476 input += stride; 477 } 478 } 479 480 static const int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 481 4 * 4096, 4 * 5793 }; 482 483 static inline void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output, 484 int txw_idx, int8_t size, int bit) { 485 const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit)); 486 int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]); 487 int16x4_t low_i16, high_i16; 488 int32x4_t low_i32, high_i32; 489 for (int i = 0; i < size; i++) { 490 int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale); 491 int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale); 492 low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4); 493 high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4); 494 low_i16 = vqmovn_s32(low_i32); 495 high_i16 = vqmovn_s32(high_i32); 496 output[i] = vcombine_s16(low_i16, high_i16); 497 } 498 } 499 500 static inline void round_shift_for_rect(int16x8_t *input, int16x8_t *output, 501 int size) { 502 int32x4_t out_low, out_high; 503 int16x4_t low, high; 504 505 for (int z = 0; z < size; ++z) { 506 out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2); 507 out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2); 508 509 low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits); 510 high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits); 511 512 output[z] = vcombine_s16(low, high); 513 } 514 } 515 516 static inline void idct16_low1_neon(int16x8_t *in, int16x8_t *out, 517 int8_t cos_bit) { 518 const int32_t *cospi = cospi_arr(cos_bit); 519 int16x8_t step1; 520 int32x4_t t32[2]; 521 522 // stage 4 523 524 t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); 525 t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); 526 step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), 527 vrshrn_n_s32(t32[1], INV_COS_BIT)); 528 529 // stage 6 530 // stage 7 531 out[0] = step1; 532 out[1] = step1; 533 out[2] = step1; 534 out[3] = step1; 535 out[4] = step1; 536 out[5] = step1; 537 out[6] = step1; 538 out[7] = step1; 539 out[8] = step1; 540 out[9] = step1; 541 out[10] = step1; 542 out[11] = step1; 543 out[12] = step1; 544 out[13] = step1; 545 out[14] = step1; 546 out[15] = step1; 547 } 548 549 static inline void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { 550 const int32_t *cospi = cospi_arr(cos_bit); 551 int16x8_t step1[16], step2[16]; 552 553 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], 554 (int16_t)cospi[36], (int16_t)cospi[28]); 555 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], 556 (int16_t)cospi[52], (int16_t)cospi[12]); 557 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 558 (int16_t)cospi[40], (int16_t)cospi[24]); 559 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 560 (int16_t)cospi[16], (int16_t)cospi[48]); 561 const int16x4_t c4 = 562 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), 563 (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); 564 // stage 2 565 566 btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]); 567 btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]); 568 btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]); 569 btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]); 570 571 step2[0] = in[0]; 572 step2[1] = in[8]; 573 step2[2] = in[4]; 574 step2[3] = in[12]; 575 step2[4] = in[2]; 576 step2[5] = in[10]; 577 step2[6] = in[6]; 578 step2[7] = in[14]; 579 580 // stage 3 581 582 btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]); 583 btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]); 584 585 step1[0] = step2[0]; 586 step1[1] = step2[1]; 587 step1[2] = step2[2]; 588 step1[3] = step2[3]; 589 step1[8] = vqaddq_s16(step2[8], step2[9]); 590 step1[9] = vqsubq_s16(step2[8], step2[9]); 591 step1[10] = vqsubq_s16(step2[11], step2[10]); 592 step1[11] = vqaddq_s16(step2[11], step2[10]); 593 step1[12] = vqaddq_s16(step2[12], step2[13]); 594 step1[13] = vqsubq_s16(step2[12], step2[13]); 595 step1[14] = vqsubq_s16(step2[15], step2[14]); 596 step1[15] = vqaddq_s16(step2[15], step2[14]); 597 598 // stage 4 599 600 btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]); 601 btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]); 602 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); 603 btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]); 604 605 step2[4] = vqaddq_s16(step1[4], step1[5]); 606 step2[5] = vqsubq_s16(step1[4], step1[5]); 607 step2[6] = vqsubq_s16(step1[7], step1[6]); 608 step2[7] = vqaddq_s16(step1[7], step1[6]); 609 step2[8] = step1[8]; 610 step2[11] = step1[11]; 611 step2[12] = step1[12]; 612 step2[15] = step1[15]; 613 614 // stage 5 615 616 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); 617 618 step1[0] = vqaddq_s16(step2[0], step2[3]); 619 step1[1] = vqaddq_s16(step2[1], step2[2]); 620 step1[2] = vqsubq_s16(step2[1], step2[2]); 621 step1[3] = vqsubq_s16(step2[0], step2[3]); 622 step1[4] = step2[4]; 623 step1[7] = step2[7]; 624 step1[8] = vqaddq_s16(step2[8], step2[11]); 625 step1[9] = vqaddq_s16(step2[9], step2[10]); 626 step1[10] = vqsubq_s16(step2[9], step2[10]); 627 step1[11] = vqsubq_s16(step2[8], step2[11]); 628 step1[12] = vqsubq_s16(step2[15], step2[12]); 629 step1[13] = vqsubq_s16(step2[14], step2[13]); 630 step1[14] = vqaddq_s16(step2[14], step2[13]); 631 step1[15] = vqaddq_s16(step2[15], step2[12]); 632 633 // stage 6 634 635 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); 636 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); 637 638 step2[0] = vqaddq_s16(step1[0], step1[7]); 639 step2[1] = vqaddq_s16(step1[1], step1[6]); 640 step2[2] = vqaddq_s16(step1[2], step1[5]); 641 step2[3] = vqaddq_s16(step1[3], step1[4]); 642 step2[4] = vqsubq_s16(step1[3], step1[4]); 643 step2[5] = vqsubq_s16(step1[2], step1[5]); 644 step2[6] = vqsubq_s16(step1[1], step1[6]); 645 step2[7] = vqsubq_s16(step1[0], step1[7]); 646 step2[8] = step1[8]; 647 step2[9] = step1[9]; 648 step2[14] = step1[14]; 649 step2[15] = step1[15]; 650 651 // stage 7 652 out[0] = vqaddq_s16(step2[0], step2[15]); 653 out[1] = vqaddq_s16(step2[1], step2[14]); 654 out[2] = vqaddq_s16(step2[2], step2[13]); 655 out[3] = vqaddq_s16(step2[3], step2[12]); 656 out[4] = vqaddq_s16(step2[4], step2[11]); 657 out[5] = vqaddq_s16(step2[5], step2[10]); 658 out[6] = vqaddq_s16(step2[6], step2[9]); 659 out[7] = vqaddq_s16(step2[7], step2[8]); 660 out[8] = vqsubq_s16(step2[7], step2[8]); 661 out[9] = vqsubq_s16(step2[6], step2[9]); 662 out[10] = vqsubq_s16(step2[5], step2[10]); 663 out[11] = vqsubq_s16(step2[4], step2[11]); 664 out[12] = vqsubq_s16(step2[3], step2[12]); 665 out[13] = vqsubq_s16(step2[2], step2[13]); 666 out[14] = vqsubq_s16(step2[1], step2[14]); 667 out[15] = vqsubq_s16(step2[0], step2[15]); 668 } 669 670 static inline void idct16_low8_neon(int16x8_t *in, int16x8_t *out, 671 int8_t cos_bit) { 672 const int32_t *cospi = cospi_arr(cos_bit); 673 int16x8_t step1[16], step2[16]; 674 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 675 (int16_t)cospi[16], (int16_t)cospi[48]); 676 const int16x4_t c1 = 677 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), 678 (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); 679 680 // stage 1 681 // stage 2 682 683 step2[0] = in[0]; 684 step2[2] = in[4]; 685 step2[4] = in[2]; 686 step2[6] = in[6]; 687 688 btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]); 689 btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]); 690 btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]); 691 btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]); 692 693 // stage 3 694 695 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); 696 btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); 697 698 step1[0] = step2[0]; 699 step1[2] = step2[2]; 700 step1[8] = vqaddq_s16(step2[8], step2[9]); 701 step1[9] = vqsubq_s16(step2[8], step2[9]); 702 step1[10] = vqsubq_s16(step2[11], step2[10]); 703 step1[11] = vqaddq_s16(step2[11], step2[10]); 704 step1[12] = vqaddq_s16(step2[12], step2[13]); 705 step1[13] = vqsubq_s16(step2[12], step2[13]); 706 step1[14] = vqsubq_s16(step2[15], step2[14]); 707 step1[15] = vqaddq_s16(step2[15], step2[14]); 708 709 // stage 4 710 711 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); 712 btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); 713 btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]); 714 btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]); 715 716 step2[4] = vqaddq_s16(step1[4], step1[5]); 717 step2[5] = vqsubq_s16(step1[4], step1[5]); 718 step2[6] = vqsubq_s16(step1[7], step1[6]); 719 step2[7] = vqaddq_s16(step1[7], step1[6]); 720 step2[8] = step1[8]; 721 step2[11] = step1[11]; 722 step2[12] = step1[12]; 723 step2[15] = step1[15]; 724 725 // stage 5 726 727 btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]); 728 step1[0] = vqaddq_s16(step2[0], step2[3]); 729 step1[1] = vqaddq_s16(step2[1], step2[2]); 730 step1[2] = vqsubq_s16(step2[1], step2[2]); 731 step1[3] = vqsubq_s16(step2[0], step2[3]); 732 step1[4] = step2[4]; 733 step1[7] = step2[7]; 734 step1[8] = vqaddq_s16(step2[8], step2[11]); 735 step1[9] = vqaddq_s16(step2[9], step2[10]); 736 step1[10] = vqsubq_s16(step2[9], step2[10]); 737 step1[11] = vqsubq_s16(step2[8], step2[11]); 738 step1[12] = vqsubq_s16(step2[15], step2[12]); 739 step1[13] = vqsubq_s16(step2[14], step2[13]); 740 step1[14] = vqaddq_s16(step2[14], step2[13]); 741 step1[15] = vqaddq_s16(step2[15], step2[12]); 742 743 // stage 6 744 btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]); 745 btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]); 746 747 step2[0] = vqaddq_s16(step1[0], step1[7]); 748 step2[1] = vqaddq_s16(step1[1], step1[6]); 749 step2[2] = vqaddq_s16(step1[2], step1[5]); 750 step2[3] = vqaddq_s16(step1[3], step1[4]); 751 step2[4] = vqsubq_s16(step1[3], step1[4]); 752 step2[5] = vqsubq_s16(step1[2], step1[5]); 753 step2[6] = vqsubq_s16(step1[1], step1[6]); 754 step2[7] = vqsubq_s16(step1[0], step1[7]); 755 step2[8] = step1[8]; 756 step2[9] = step1[9]; 757 step2[14] = step1[14]; 758 step2[15] = step1[15]; 759 760 // stage 7 761 762 out[0] = vqaddq_s16(step2[0], step2[15]); 763 out[1] = vqaddq_s16(step2[1], step2[14]); 764 out[2] = vqaddq_s16(step2[2], step2[13]); 765 out[3] = vqaddq_s16(step2[3], step2[12]); 766 out[4] = vqaddq_s16(step2[4], step2[11]); 767 out[5] = vqaddq_s16(step2[5], step2[10]); 768 out[6] = vqaddq_s16(step2[6], step2[9]); 769 out[7] = vqaddq_s16(step2[7], step2[8]); 770 out[8] = vqsubq_s16(step2[7], step2[8]); 771 out[9] = vqsubq_s16(step2[6], step2[9]); 772 out[10] = vqsubq_s16(step2[5], step2[10]); 773 out[11] = vqsubq_s16(step2[4], step2[11]); 774 out[12] = vqsubq_s16(step2[3], step2[12]); 775 out[13] = vqsubq_s16(step2[2], step2[13]); 776 out[14] = vqsubq_s16(step2[1], step2[14]); 777 out[15] = vqsubq_s16(step2[0], step2[15]); 778 } 779 780 static inline void iadst16_neon(int16x8_t *const in, int16x8_t *out, 781 int8_t cos_bit) { 782 const int32_t *cospi = cospi_arr(cos_bit); 783 784 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], 785 (int16_t)cospi[10], (int16_t)cospi[54]); 786 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], 787 (int16_t)cospi[26], (int16_t)cospi[38]); 788 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30], 789 (int16_t)cospi[42], (int16_t)cospi[22]); 790 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14], 791 (int16_t)cospi[58], (int16_t)cospi[6]); 792 const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 793 (int16_t)cospi[40], (int16_t)cospi[24]); 794 const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 795 (int16_t)cospi[16], (int16_t)cospi[48]); 796 797 int16x8_t x[16]; 798 int16x8_t t[14]; 799 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; 800 int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; 801 802 // Stage 1 803 x[0] = in[15]; 804 x[1] = in[0]; 805 x[2] = in[13]; 806 x[3] = in[2]; 807 x[4] = in[11]; 808 x[5] = in[4]; 809 x[6] = in[9]; 810 x[7] = in[6]; 811 x[8] = in[7]; 812 x[9] = in[8]; 813 x[10] = in[5]; 814 x[11] = in[10]; 815 x[12] = in[3]; 816 x[13] = in[12]; 817 x[14] = in[1]; 818 x[15] = in[14]; 819 820 // Stage 2 821 btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1); 822 btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3); 823 btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5); 824 btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7); 825 btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9); 826 btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11); 827 btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13); 828 btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15); 829 830 // Stage 3 831 x[0] = vqaddq_s16(s0, s8); 832 x[1] = vqaddq_s16(s1, s9); 833 x[2] = vqaddq_s16(s2, s10); 834 x[3] = vqaddq_s16(s3, s11); 835 x[4] = vqaddq_s16(s4, s12); 836 x[5] = vqaddq_s16(s5, s13); 837 x[6] = vqaddq_s16(s6, s14); 838 x[7] = vqaddq_s16(s7, s15); 839 x[8] = vqsubq_s16(s0, s8); 840 x[9] = vqsubq_s16(s1, s9); 841 x[10] = vqsubq_s16(s2, s10); 842 x[11] = vqsubq_s16(s3, s11); 843 x[12] = vqsubq_s16(s4, s12); 844 x[13] = vqsubq_s16(s5, s13); 845 x[14] = vqsubq_s16(s6, s14); 846 x[15] = vqsubq_s16(s7, s15); 847 848 // Stage 4 849 t[0] = x[0]; 850 t[1] = x[1]; 851 t[2] = x[2]; 852 t[3] = x[3]; 853 t[4] = x[4]; 854 t[5] = x[5]; 855 t[6] = x[6]; 856 t[7] = x[7]; 857 btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9); 858 btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11); 859 btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12); 860 btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14); 861 862 // Stage 5 863 x[0] = vqaddq_s16(t[0], t[4]); 864 x[1] = vqaddq_s16(t[1], t[5]); 865 x[2] = vqaddq_s16(t[2], t[6]); 866 x[3] = vqaddq_s16(t[3], t[7]); 867 x[4] = vqsubq_s16(t[0], t[4]); 868 x[5] = vqsubq_s16(t[1], t[5]); 869 x[6] = vqsubq_s16(t[2], t[6]); 870 x[7] = vqsubq_s16(t[3], t[7]); 871 x[8] = vqaddq_s16(s8, s12); 872 x[9] = vqaddq_s16(s9, s13); 873 x[10] = vqaddq_s16(s10, s14); 874 x[11] = vqaddq_s16(s11, s15); 875 x[12] = vqsubq_s16(s8, s12); 876 x[13] = vqsubq_s16(s9, s13); 877 x[14] = vqsubq_s16(s10, s14); 878 x[15] = vqsubq_s16(s11, s15); 879 880 // stage 6 881 t[0] = x[0]; 882 t[1] = x[1]; 883 t[2] = x[2]; 884 t[3] = x[3]; 885 btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5); 886 btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6); 887 t[8] = x[8]; 888 t[9] = x[9]; 889 t[10] = x[10]; 890 t[11] = x[11]; 891 btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13); 892 btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14); 893 894 // Stage 7 895 x[0] = vqaddq_s16(t[0], t[2]); 896 x[1] = vqaddq_s16(t[1], t[3]); 897 x[2] = vqsubq_s16(t[0], t[2]); 898 x[3] = vqsubq_s16(t[1], t[3]); 899 x[4] = vqaddq_s16(s4, s6); 900 x[5] = vqaddq_s16(s5, s7); 901 x[6] = vqsubq_s16(s4, s6); 902 x[7] = vqsubq_s16(s5, s7); 903 x[8] = vqaddq_s16(t[8], t[10]); 904 x[9] = vqaddq_s16(t[9], t[11]); 905 x[10] = vqsubq_s16(t[8], t[10]); 906 x[11] = vqsubq_s16(t[9], t[11]); 907 x[12] = vqaddq_s16(s12, s14); 908 x[13] = vqaddq_s16(s13, s15); 909 x[14] = vqsubq_s16(s12, s14); 910 x[15] = vqsubq_s16(s13, s15); 911 912 // Stage 8 913 btf_16_half_neon(x + 2, c5); 914 btf_16_half_neon(x + 6, c5); 915 btf_16_half_neon(x + 10, c5); 916 btf_16_half_neon(x + 14, c5); 917 918 // Stage 9 919 out[0] = x[0]; 920 out[1] = vqnegq_s16(x[8]); 921 out[2] = x[12]; 922 out[3] = vqnegq_s16(x[4]); 923 out[4] = x[6]; 924 out[5] = vqnegq_s16(x[14]); 925 out[6] = x[10]; 926 out[7] = vqnegq_s16(x[2]); 927 out[8] = x[3]; 928 out[9] = vqnegq_s16(x[11]); 929 out[10] = x[15]; 930 out[11] = vqnegq_s16(x[7]); 931 out[12] = x[5]; 932 out[13] = vqnegq_s16(x[13]); 933 out[14] = x[9]; 934 out[15] = vqnegq_s16(x[1]); 935 } 936 937 static inline void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out, 938 int8_t cos_bit) { 939 const int32_t *cospi = cospi_arr(cos_bit); 940 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 941 (int16_t)cospi[40], (int16_t)cospi[24]); 942 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 943 (int16_t)cospi[16], (int16_t)cospi[48]); 944 945 int16x8_t x[16]; 946 int16x8_t t[10]; 947 int16x8_t s0, s1, s4, s5; 948 int16x8_t s8, s9, s12, s13; 949 950 // Stage 1 951 x[1] = in[0]; 952 953 // Stage 2 954 btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); 955 956 // Stage 3 957 x[0] = s0; 958 x[1] = s1; 959 x[8] = s0; 960 x[9] = s1; 961 962 // Stage 4 963 t[0] = x[0]; 964 t[1] = x[1]; 965 btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); 966 967 // Stage 5 968 x[0] = t[0]; 969 x[1] = t[1]; 970 x[4] = t[0]; 971 x[5] = t[1]; 972 x[8] = s8; 973 x[9] = s9; 974 x[12] = s8; 975 x[13] = s9; 976 977 // stage 6 978 t[0] = x[0]; 979 t[1] = x[1]; 980 btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); 981 t[8] = x[8]; 982 t[9] = x[9]; 983 btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); 984 985 // Stage 7 986 x[0] = t[0]; 987 x[1] = t[1]; 988 x[2] = t[0]; 989 x[3] = t[1]; 990 x[4] = s4; 991 x[5] = s5; 992 x[6] = s4; 993 x[7] = s5; 994 x[8] = t[8]; 995 x[9] = t[9]; 996 x[10] = t[8]; 997 x[11] = t[9]; 998 x[12] = s12; 999 x[13] = s13; 1000 x[14] = s12; 1001 x[15] = s13; 1002 1003 // Stage 8 1004 btf_16_half_neon(x + 2, c1); 1005 btf_16_half_neon(x + 6, c1); 1006 btf_16_half_neon(x + 10, c1); 1007 btf_16_half_neon(x + 14, c1); 1008 1009 // Stage 9 1010 out[0] = x[0]; 1011 out[1] = vqnegq_s16(x[8]); 1012 out[2] = x[12]; 1013 out[3] = vqnegq_s16(x[4]); 1014 out[4] = x[6]; 1015 out[5] = vqnegq_s16(x[14]); 1016 out[6] = x[10]; 1017 out[7] = vqnegq_s16(x[2]); 1018 out[8] = x[3]; 1019 out[9] = vqnegq_s16(x[11]); 1020 out[10] = x[15]; 1021 out[11] = vqnegq_s16(x[7]); 1022 out[12] = x[5]; 1023 out[13] = vqnegq_s16(x[13]); 1024 out[14] = x[9]; 1025 out[15] = vqnegq_s16(x[1]); 1026 } 1027 1028 static inline void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out, 1029 int8_t cos_bit) { 1030 const int32_t *cospi = cospi_arr(cos_bit); 1031 1032 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 1033 (int16_t)cospi[40], (int16_t)cospi[24]); 1034 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 1035 (int16_t)cospi[16], (int16_t)cospi[48]); 1036 1037 int16x8_t x[16]; 1038 int16x8_t t[14]; 1039 int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; 1040 int16x8_t s8, s9, s10, s11, s12, s13, s14, s15; 1041 1042 // Stage 1 1043 x[1] = in[0]; 1044 x[3] = in[2]; 1045 x[5] = in[4]; 1046 x[7] = in[6]; 1047 x[8] = in[7]; 1048 x[10] = in[5]; 1049 x[12] = in[3]; 1050 x[14] = in[1]; 1051 1052 // Stage 2 1053 btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1); 1054 btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3); 1055 btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5); 1056 btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7); 1057 1058 btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9); 1059 btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11); 1060 btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13); 1061 btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15); 1062 1063 // Stage 3 1064 x[0] = vqaddq_s16(s0, s8); 1065 x[1] = vqaddq_s16(s1, s9); 1066 x[2] = vqaddq_s16(s2, s10); 1067 x[3] = vqaddq_s16(s3, s11); 1068 x[4] = vqaddq_s16(s4, s12); 1069 x[5] = vqaddq_s16(s5, s13); 1070 x[6] = vqaddq_s16(s6, s14); 1071 x[7] = vqaddq_s16(s7, s15); 1072 x[8] = vqsubq_s16(s0, s8); 1073 x[9] = vqsubq_s16(s1, s9); 1074 x[10] = vqsubq_s16(s2, s10); 1075 x[11] = vqsubq_s16(s3, s11); 1076 x[12] = vqsubq_s16(s4, s12); 1077 x[13] = vqsubq_s16(s5, s13); 1078 x[14] = vqsubq_s16(s6, s14); 1079 x[15] = vqsubq_s16(s7, s15); 1080 1081 // Stage 4 1082 t[0] = x[0]; 1083 t[1] = x[1]; 1084 t[2] = x[2]; 1085 t[3] = x[3]; 1086 t[4] = x[4]; 1087 t[5] = x[5]; 1088 t[6] = x[6]; 1089 t[7] = x[7]; 1090 btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9); 1091 btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11); 1092 btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12); 1093 btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14); 1094 1095 // Stage 5 1096 x[0] = vqaddq_s16(t[0], t[4]); 1097 x[1] = vqaddq_s16(t[1], t[5]); 1098 x[2] = vqaddq_s16(t[2], t[6]); 1099 x[3] = vqaddq_s16(t[3], t[7]); 1100 x[4] = vqsubq_s16(t[0], t[4]); 1101 x[5] = vqsubq_s16(t[1], t[5]); 1102 x[6] = vqsubq_s16(t[2], t[6]); 1103 x[7] = vqsubq_s16(t[3], t[7]); 1104 x[8] = vqaddq_s16(s8, s12); 1105 x[9] = vqaddq_s16(s9, s13); 1106 x[10] = vqaddq_s16(s10, s14); 1107 x[11] = vqaddq_s16(s11, s15); 1108 x[12] = vqsubq_s16(s8, s12); 1109 x[13] = vqsubq_s16(s9, s13); 1110 x[14] = vqsubq_s16(s10, s14); 1111 x[15] = vqsubq_s16(s11, s15); 1112 1113 // stage 6 1114 t[0] = x[0]; 1115 t[1] = x[1]; 1116 t[2] = x[2]; 1117 t[3] = x[3]; 1118 btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5); 1119 btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6); 1120 t[8] = x[8]; 1121 t[9] = x[9]; 1122 t[10] = x[10]; 1123 t[11] = x[11]; 1124 btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13); 1125 btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14); 1126 1127 // Stage 7 1128 x[0] = vqaddq_s16(t[0], t[2]); 1129 x[1] = vqaddq_s16(t[1], t[3]); 1130 x[2] = vqsubq_s16(t[0], t[2]); 1131 x[3] = vqsubq_s16(t[1], t[3]); 1132 x[4] = vqaddq_s16(s4, s6); 1133 x[5] = vqaddq_s16(s5, s7); 1134 x[6] = vqsubq_s16(s4, s6); 1135 x[7] = vqsubq_s16(s5, s7); 1136 x[8] = vqaddq_s16(t[8], t[10]); 1137 x[9] = vqaddq_s16(t[9], t[11]); 1138 x[10] = vqsubq_s16(t[8], t[10]); 1139 x[11] = vqsubq_s16(t[9], t[11]); 1140 x[12] = vqaddq_s16(s12, s14); 1141 x[13] = vqaddq_s16(s13, s15); 1142 x[14] = vqsubq_s16(s12, s14); 1143 x[15] = vqsubq_s16(s13, s15); 1144 1145 // Stage 8 1146 btf_16_half_neon(x + 2, c1); 1147 btf_16_half_neon(x + 6, c1); 1148 btf_16_half_neon(x + 10, c1); 1149 btf_16_half_neon(x + 14, c1); 1150 1151 // Stage 9 1152 out[0] = x[0]; 1153 out[1] = vqnegq_s16(x[8]); 1154 out[2] = x[12]; 1155 out[3] = vqnegq_s16(x[4]); 1156 out[4] = x[6]; 1157 out[5] = vqnegq_s16(x[14]); 1158 out[6] = x[10]; 1159 out[7] = vqnegq_s16(x[2]); 1160 out[8] = x[3]; 1161 out[9] = vqnegq_s16(x[11]); 1162 out[10] = x[15]; 1163 out[11] = vqnegq_s16(x[7]); 1164 out[12] = x[5]; 1165 out[13] = vqnegq_s16(x[13]); 1166 out[14] = x[9]; 1167 out[15] = vqnegq_s16(x[1]); 1168 } 1169 1170 static inline void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) { 1171 const int32_t *cospi = cospi_arr(cos_bit); 1172 int16x8_t step1[32], step2[32]; 1173 1174 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62], 1175 (int16_t)cospi[34], (int16_t)cospi[30]); 1176 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46], 1177 (int16_t)cospi[50], (int16_t)cospi[14]); 1178 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54], 1179 (int16_t)cospi[42], (int16_t)cospi[22]); 1180 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38], 1181 (int16_t)cospi[58], (int16_t)cospi[6]); 1182 const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], 1183 (int16_t)cospi[36], (int16_t)cospi[28]); 1184 const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], 1185 (int16_t)cospi[52], (int16_t)cospi[12]); 1186 const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 1187 (int16_t)cospi[40], (int16_t)cospi[24]); 1188 const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 1189 (int16_t)cospi[16], (int16_t)cospi[48]); 1190 const int16x4_t c8 = 1191 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), 1192 (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); 1193 const int16x4_t c9 = 1194 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), 1195 (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); 1196 1197 // stage 2 1198 1199 btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]); 1200 btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]); 1201 btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]); 1202 btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]); 1203 btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]); 1204 btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]); 1205 btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]); 1206 btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]); 1207 1208 step2[0] = in[0]; 1209 step2[1] = in[16]; 1210 step2[2] = in[8]; 1211 step2[3] = in[24]; 1212 step2[4] = in[4]; 1213 step2[5] = in[20]; 1214 step2[6] = in[12]; 1215 step2[7] = in[28]; 1216 step2[8] = in[2]; 1217 step2[9] = in[18]; 1218 step2[10] = in[10]; 1219 step2[11] = in[26]; 1220 step2[12] = in[6]; 1221 step2[13] = in[22]; 1222 step2[14] = in[14]; 1223 step2[15] = in[30]; 1224 1225 // stage 3 1226 1227 btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]); 1228 btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]); 1229 btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]); 1230 btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]); 1231 1232 step1[0] = step2[0]; 1233 step1[1] = step2[1]; 1234 step1[2] = step2[2]; 1235 step1[3] = step2[3]; 1236 step1[4] = step2[4]; 1237 step1[5] = step2[5]; 1238 step1[6] = step2[6]; 1239 step1[7] = step2[7]; 1240 1241 step1[16] = vqaddq_s16(step2[16], step2[17]); 1242 step1[17] = vqsubq_s16(step2[16], step2[17]); 1243 step1[18] = vqsubq_s16(step2[19], step2[18]); 1244 step1[19] = vqaddq_s16(step2[19], step2[18]); 1245 step1[20] = vqaddq_s16(step2[20], step2[21]); 1246 step1[21] = vqsubq_s16(step2[20], step2[21]); 1247 step1[22] = vqsubq_s16(step2[23], step2[22]); 1248 step1[23] = vqaddq_s16(step2[23], step2[22]); 1249 step1[24] = vqaddq_s16(step2[24], step2[25]); 1250 step1[25] = vqsubq_s16(step2[24], step2[25]); 1251 step1[26] = vqsubq_s16(step2[27], step2[26]); 1252 step1[27] = vqaddq_s16(step2[27], step2[26]); 1253 step1[28] = vqaddq_s16(step2[28], step2[29]); 1254 step1[29] = vqsubq_s16(step2[28], step2[29]); 1255 step1[30] = vqsubq_s16(step2[31], step2[30]); 1256 step1[31] = vqaddq_s16(step2[31], step2[30]); 1257 1258 // stage 4 1259 1260 btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]); 1261 btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]); 1262 btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]); 1263 btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]); 1264 btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]); 1265 btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]); 1266 1267 step2[0] = step1[0]; 1268 step2[1] = step1[1]; 1269 step2[2] = step1[2]; 1270 step2[3] = step1[3]; 1271 step2[8] = vqaddq_s16(step1[8], step1[9]); 1272 step2[9] = vqsubq_s16(step1[8], step1[9]); 1273 step2[10] = vqsubq_s16(step1[11], step1[10]); 1274 step2[11] = vqaddq_s16(step1[11], step1[10]); 1275 step2[12] = vqaddq_s16(step1[12], step1[13]); 1276 step2[13] = vqsubq_s16(step1[12], step1[13]); 1277 step2[14] = vqsubq_s16(step1[15], step1[14]); 1278 step2[15] = vqaddq_s16(step1[15], step1[14]); 1279 step2[16] = step1[16]; 1280 step2[19] = step1[19]; 1281 step2[20] = step1[20]; 1282 step2[23] = step1[23]; 1283 step2[24] = step1[24]; 1284 step2[27] = step1[27]; 1285 step2[28] = step1[28]; 1286 step2[31] = step1[31]; 1287 1288 // stage 5 1289 1290 btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]); 1291 btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]); 1292 btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]); 1293 btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]); 1294 1295 step1[4] = vqaddq_s16(step2[4], step2[5]); 1296 step1[5] = vqsubq_s16(step2[4], step2[5]); 1297 step1[6] = vqsubq_s16(step2[7], step2[6]); 1298 step1[7] = vqaddq_s16(step2[7], step2[6]); 1299 step1[8] = step2[8]; 1300 step1[11] = step2[11]; 1301 step1[12] = step2[12]; 1302 step1[15] = step2[15]; 1303 step1[16] = vqaddq_s16(step2[16], step2[19]); 1304 step1[17] = vqaddq_s16(step2[17], step2[18]); 1305 step1[18] = vqsubq_s16(step2[17], step2[18]); 1306 step1[19] = vqsubq_s16(step2[16], step2[19]); 1307 step1[20] = vqsubq_s16(step2[23], step2[20]); 1308 step1[21] = vqsubq_s16(step2[22], step2[21]); 1309 step1[22] = vqaddq_s16(step2[22], step2[21]); 1310 step1[23] = vqaddq_s16(step2[23], step2[20]); 1311 step1[24] = vqaddq_s16(step2[24], step2[27]); 1312 step1[25] = vqaddq_s16(step2[25], step2[26]); 1313 step1[26] = vqsubq_s16(step2[25], step2[26]); 1314 step1[27] = vqsubq_s16(step2[24], step2[27]); 1315 step1[28] = vqsubq_s16(step2[31], step2[28]); 1316 step1[29] = vqsubq_s16(step2[30], step2[29]); 1317 step1[30] = vqaddq_s16(step2[30], step2[29]); 1318 step1[31] = vqaddq_s16(step2[31], step2[28]); 1319 1320 // stage 6 1321 1322 btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]); 1323 btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]); 1324 btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]); 1325 btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]); 1326 btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]); 1327 1328 step2[0] = vqaddq_s16(step1[0], step1[3]); 1329 step2[1] = vqaddq_s16(step1[1], step1[2]); 1330 step2[2] = vqsubq_s16(step1[1], step1[2]); 1331 step2[3] = vqsubq_s16(step1[0], step1[3]); 1332 step2[4] = step1[4]; 1333 step2[7] = step1[7]; 1334 step2[8] = vqaddq_s16(step1[8], step1[11]); 1335 step2[9] = vqaddq_s16(step1[9], step1[10]); 1336 step2[10] = vqsubq_s16(step1[9], step1[10]); 1337 step2[11] = vqsubq_s16(step1[8], step1[11]); 1338 step2[12] = vqsubq_s16(step1[15], step1[12]); 1339 step2[13] = vqsubq_s16(step1[14], step1[13]); 1340 step2[14] = vqaddq_s16(step1[14], step1[13]); 1341 step2[15] = vqaddq_s16(step1[15], step1[12]); 1342 step2[16] = step1[16]; 1343 step2[17] = step1[17]; 1344 step2[22] = step1[22]; 1345 step2[23] = step1[23]; 1346 step2[24] = step1[24]; 1347 step2[25] = step1[25]; 1348 step2[30] = step1[30]; 1349 step2[31] = step1[31]; 1350 1351 // stage 7 1352 1353 btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]); 1354 btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]); 1355 1356 step1[0] = vqaddq_s16(step2[0], step2[7]); 1357 step1[1] = vqaddq_s16(step2[1], step2[6]); 1358 step1[2] = vqaddq_s16(step2[2], step2[5]); 1359 step1[3] = vqaddq_s16(step2[3], step2[4]); 1360 step1[4] = vqsubq_s16(step2[3], step2[4]); 1361 step1[5] = vqsubq_s16(step2[2], step2[5]); 1362 step1[6] = vqsubq_s16(step2[1], step2[6]); 1363 step1[7] = vqsubq_s16(step2[0], step2[7]); 1364 step1[8] = step2[8]; 1365 step1[9] = step2[9]; 1366 step1[14] = step2[14]; 1367 step1[15] = step2[15]; 1368 step1[16] = vqaddq_s16(step2[16], step2[23]); 1369 step1[17] = vqaddq_s16(step2[17], step2[22]); 1370 step1[18] = vqaddq_s16(step2[18], step2[21]); 1371 step1[19] = vqaddq_s16(step2[19], step2[20]); 1372 step1[20] = vqsubq_s16(step2[19], step2[20]); 1373 step1[21] = vqsubq_s16(step2[18], step2[21]); 1374 step1[22] = vqsubq_s16(step2[17], step2[22]); 1375 step1[23] = vqsubq_s16(step2[16], step2[23]); 1376 step1[24] = vqsubq_s16(step2[31], step2[24]); 1377 step1[25] = vqsubq_s16(step2[30], step2[25]); 1378 step1[26] = vqsubq_s16(step2[29], step2[26]); 1379 step1[27] = vqsubq_s16(step2[28], step2[27]); 1380 step1[28] = vqaddq_s16(step2[27], step2[28]); 1381 step1[29] = vqaddq_s16(step2[26], step2[29]); 1382 step1[30] = vqaddq_s16(step2[25], step2[30]); 1383 step1[31] = vqaddq_s16(step2[24], step2[31]); 1384 1385 // stage 8 1386 1387 btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]); 1388 btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]); 1389 btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]); 1390 btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]); 1391 1392 step2[0] = vqaddq_s16(step1[0], step1[15]); 1393 step2[1] = vqaddq_s16(step1[1], step1[14]); 1394 step2[2] = vqaddq_s16(step1[2], step1[13]); 1395 step2[3] = vqaddq_s16(step1[3], step1[12]); 1396 step2[4] = vqaddq_s16(step1[4], step1[11]); 1397 step2[5] = vqaddq_s16(step1[5], step1[10]); 1398 step2[6] = vqaddq_s16(step1[6], step1[9]); 1399 step2[7] = vqaddq_s16(step1[7], step1[8]); 1400 step2[8] = vqsubq_s16(step1[7], step1[8]); 1401 step2[9] = vqsubq_s16(step1[6], step1[9]); 1402 step2[10] = vqsubq_s16(step1[5], step1[10]); 1403 step2[11] = vqsubq_s16(step1[4], step1[11]); 1404 step2[12] = vqsubq_s16(step1[3], step1[12]); 1405 step2[13] = vqsubq_s16(step1[2], step1[13]); 1406 step2[14] = vqsubq_s16(step1[1], step1[14]); 1407 step2[15] = vqsubq_s16(step1[0], step1[15]); 1408 step2[16] = step1[16]; 1409 step2[17] = step1[17]; 1410 step2[18] = step1[18]; 1411 step2[19] = step1[19]; 1412 step2[28] = step1[28]; 1413 step2[29] = step1[29]; 1414 step2[30] = step1[30]; 1415 step2[31] = step1[31]; 1416 1417 // stage 9 1418 1419 out[0] = vqaddq_s16(step2[0], step2[31]); 1420 out[1] = vqaddq_s16(step2[1], step2[30]); 1421 out[2] = vqaddq_s16(step2[2], step2[29]); 1422 out[3] = vqaddq_s16(step2[3], step2[28]); 1423 out[4] = vqaddq_s16(step2[4], step2[27]); 1424 out[5] = vqaddq_s16(step2[5], step2[26]); 1425 out[6] = vqaddq_s16(step2[6], step2[25]); 1426 out[7] = vqaddq_s16(step2[7], step2[24]); 1427 out[8] = vqaddq_s16(step2[8], step2[23]); 1428 out[9] = vqaddq_s16(step2[9], step2[22]); 1429 out[10] = vqaddq_s16(step2[10], step2[21]); 1430 out[11] = vqaddq_s16(step2[11], step2[20]); 1431 out[12] = vqaddq_s16(step2[12], step2[19]); 1432 out[13] = vqaddq_s16(step2[13], step2[18]); 1433 out[14] = vqaddq_s16(step2[14], step2[17]); 1434 out[15] = vqaddq_s16(step2[15], step2[16]); 1435 out[16] = vqsubq_s16(step2[15], step2[16]); 1436 out[17] = vqsubq_s16(step2[14], step2[17]); 1437 out[18] = vqsubq_s16(step2[13], step2[18]); 1438 out[19] = vqsubq_s16(step2[12], step2[19]); 1439 out[20] = vqsubq_s16(step2[11], step2[20]); 1440 out[21] = vqsubq_s16(step2[10], step2[21]); 1441 out[22] = vqsubq_s16(step2[9], step2[22]); 1442 out[23] = vqsubq_s16(step2[8], step2[23]); 1443 out[24] = vqsubq_s16(step2[7], step2[24]); 1444 out[25] = vqsubq_s16(step2[6], step2[25]); 1445 out[26] = vqsubq_s16(step2[5], step2[26]); 1446 out[27] = vqsubq_s16(step2[4], step2[27]); 1447 out[28] = vqsubq_s16(step2[3], step2[28]); 1448 out[29] = vqsubq_s16(step2[2], step2[29]); 1449 out[30] = vqsubq_s16(step2[1], step2[30]); 1450 out[31] = vqsubq_s16(step2[0], step2[31]); 1451 } 1452 1453 static inline void idct32_low1_neon(int16x8_t *in, int16x8_t *out, 1454 int8_t cos_bit) { 1455 const int32_t *cospi = cospi_arr(cos_bit); 1456 int16x8_t step1; 1457 int32x4_t t32[2]; 1458 1459 // stage 1 1460 // stage 2 1461 // stage 3 1462 // stage 4 1463 // stage 5 1464 1465 t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]); 1466 t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]); 1467 step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), 1468 vrshrn_n_s32(t32[1], INV_COS_BIT)); 1469 1470 // stage 6 1471 // stage 7 1472 // stage 8 1473 // stage 9 1474 1475 out[0] = step1; 1476 out[1] = step1; 1477 out[2] = step1; 1478 out[3] = step1; 1479 out[4] = step1; 1480 out[5] = step1; 1481 out[6] = step1; 1482 out[7] = step1; 1483 out[8] = step1; 1484 out[9] = step1; 1485 out[10] = step1; 1486 out[11] = step1; 1487 out[12] = step1; 1488 out[13] = step1; 1489 out[14] = step1; 1490 out[15] = step1; 1491 out[16] = step1; 1492 out[17] = step1; 1493 out[18] = step1; 1494 out[19] = step1; 1495 out[20] = step1; 1496 out[21] = step1; 1497 out[22] = step1; 1498 out[23] = step1; 1499 out[24] = step1; 1500 out[25] = step1; 1501 out[26] = step1; 1502 out[27] = step1; 1503 out[28] = step1; 1504 out[29] = step1; 1505 out[30] = step1; 1506 out[31] = step1; 1507 } 1508 1509 static inline void idct32_low8_neon(int16x8_t *in, int16x8_t *out, 1510 int8_t cos_bit) { 1511 const int32_t *cospi = cospi_arr(cos_bit); 1512 int16x8_t step1[32], step2[32]; 1513 int32x4_t t32[16]; 1514 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 1515 (int16_t)cospi[40], (int16_t)cospi[24]); 1516 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 1517 (int16_t)cospi[16], cospi[48]); 1518 const int16x4_t c2 = 1519 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), 1520 (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); 1521 const int16x4_t c3 = 1522 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), 1523 (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); 1524 // stage 1 1525 // stage 2 1526 1527 step2[0] = in[0]; 1528 step2[4] = in[4]; 1529 step2[8] = in[2]; 1530 step2[12] = in[6]; 1531 1532 btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); 1533 btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); 1534 btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); 1535 btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); 1536 1537 // stage 3 1538 step1[0] = step2[0]; 1539 step1[4] = step2[4]; 1540 1541 btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); 1542 btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); 1543 1544 step1[16] = step2[16]; 1545 step1[17] = step2[16]; 1546 step1[18] = step2[19]; 1547 step1[19] = step2[19]; 1548 step1[20] = step2[20]; 1549 step1[21] = step2[20]; 1550 step1[22] = step2[23]; 1551 step1[23] = step2[23]; 1552 step1[24] = step2[24]; 1553 step1[25] = step2[24]; 1554 step1[26] = step2[27]; 1555 step1[27] = step2[27]; 1556 step1[28] = step2[28]; 1557 step1[29] = step2[28]; 1558 step1[30] = step2[31]; 1559 step1[31] = step2[31]; 1560 1561 // stage 4 1562 1563 btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); 1564 btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); 1565 btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); 1566 btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); 1567 btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); 1568 1569 step2[0] = step1[0]; 1570 step2[8] = step1[8]; 1571 step2[9] = step1[8]; 1572 step2[10] = step1[11]; 1573 step2[11] = step1[11]; 1574 step2[12] = step1[12]; 1575 step2[13] = step1[12]; 1576 step2[14] = step1[15]; 1577 step2[15] = step1[15]; 1578 step2[16] = step1[16]; 1579 step2[19] = step1[19]; 1580 step2[20] = step1[20]; 1581 step2[23] = step1[23]; 1582 step2[24] = step1[24]; 1583 step2[27] = step1[27]; 1584 step2[28] = step1[28]; 1585 step2[31] = step1[31]; 1586 1587 // stage 5 1588 1589 t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); 1590 t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); 1591 step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), 1592 vrshrn_n_s32(t32[1], INV_COS_BIT)); 1593 1594 btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); 1595 btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); 1596 1597 step1[4] = step2[4]; 1598 step1[5] = step2[4]; 1599 step1[6] = step2[7]; 1600 step1[7] = step2[7]; 1601 step1[8] = step2[8]; 1602 step1[11] = step2[11]; 1603 step1[12] = step2[12]; 1604 step1[15] = step2[15]; 1605 step1[16] = vqaddq_s16(step2[16], step2[19]); 1606 step1[17] = vqaddq_s16(step2[17], step2[18]); 1607 step1[18] = vqsubq_s16(step2[17], step2[18]); 1608 step1[19] = vqsubq_s16(step2[16], step2[19]); 1609 step1[20] = vqsubq_s16(step2[23], step2[20]); 1610 step1[21] = vqsubq_s16(step2[22], step2[21]); 1611 step1[22] = vqaddq_s16(step2[22], step2[21]); 1612 step1[23] = vqaddq_s16(step2[23], step2[20]); 1613 step1[24] = vqaddq_s16(step2[24], step2[27]); 1614 step1[25] = vqaddq_s16(step2[25], step2[26]); 1615 step1[26] = vqsubq_s16(step2[25], step2[26]); 1616 step1[27] = vqsubq_s16(step2[24], step2[27]); 1617 step1[28] = vqsubq_s16(step2[31], step2[28]); 1618 step1[29] = vqsubq_s16(step2[30], step2[29]); 1619 step1[30] = vqaddq_s16(step2[30], step2[29]); 1620 step1[31] = vqaddq_s16(step2[31], step2[28]); 1621 1622 // stage 6 1623 1624 btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); 1625 btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); 1626 btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); 1627 btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); 1628 btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); 1629 1630 step2[0] = step1[0]; 1631 step2[1] = step1[0]; 1632 step2[2] = step1[0]; 1633 step2[3] = step1[0]; 1634 step2[4] = step1[4]; 1635 step2[7] = step1[7]; 1636 step2[8] = vqaddq_s16(step1[8], step1[11]); 1637 step2[9] = vqaddq_s16(step1[9], step1[10]); 1638 step2[10] = vqsubq_s16(step1[9], step1[10]); 1639 step2[11] = vqsubq_s16(step1[8], step1[11]); 1640 step2[12] = vqsubq_s16(step1[15], step1[12]); 1641 step2[13] = vqsubq_s16(step1[14], step1[13]); 1642 step2[14] = vqaddq_s16(step1[14], step1[13]); 1643 step2[15] = vqaddq_s16(step1[15], step1[12]); 1644 step2[16] = step1[16]; 1645 step2[17] = step1[17]; 1646 step2[22] = step1[22]; 1647 step2[23] = step1[23]; 1648 step2[24] = step1[24]; 1649 step2[25] = step1[25]; 1650 step2[30] = step1[30]; 1651 step2[31] = step1[31]; 1652 1653 // stage 7 1654 1655 btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); 1656 btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); 1657 1658 step1[0] = vqaddq_s16(step2[0], step2[7]); 1659 step1[1] = vqaddq_s16(step2[1], step2[6]); 1660 step1[2] = vqaddq_s16(step2[2], step2[5]); 1661 step1[3] = vqaddq_s16(step2[3], step2[4]); 1662 step1[4] = vqsubq_s16(step2[3], step2[4]); 1663 step1[5] = vqsubq_s16(step2[2], step2[5]); 1664 step1[6] = vqsubq_s16(step2[1], step2[6]); 1665 step1[7] = vqsubq_s16(step2[0], step2[7]); 1666 step1[8] = step2[8]; 1667 step1[9] = step2[9]; 1668 step1[14] = step2[14]; 1669 step1[15] = step2[15]; 1670 step1[16] = vqaddq_s16(step2[16], step2[23]); 1671 step1[17] = vqaddq_s16(step2[17], step2[22]); 1672 step1[18] = vqaddq_s16(step2[18], step2[21]); 1673 step1[19] = vqaddq_s16(step2[19], step2[20]); 1674 step1[20] = vqsubq_s16(step2[19], step2[20]); 1675 step1[21] = vqsubq_s16(step2[18], step2[21]); 1676 step1[22] = vqsubq_s16(step2[17], step2[22]); 1677 step1[23] = vqsubq_s16(step2[16], step2[23]); 1678 step1[24] = vqsubq_s16(step2[31], step2[24]); 1679 step1[25] = vqsubq_s16(step2[30], step2[25]); 1680 step1[26] = vqsubq_s16(step2[29], step2[26]); 1681 step1[27] = vqsubq_s16(step2[28], step2[27]); 1682 step1[28] = vqaddq_s16(step2[27], step2[28]); 1683 step1[29] = vqaddq_s16(step2[26], step2[29]); 1684 step1[30] = vqaddq_s16(step2[25], step2[30]); 1685 step1[31] = vqaddq_s16(step2[24], step2[31]); 1686 1687 // stage 8 1688 1689 btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); 1690 btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); 1691 btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); 1692 btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); 1693 1694 step2[0] = vqaddq_s16(step1[0], step1[15]); 1695 step2[1] = vqaddq_s16(step1[1], step1[14]); 1696 step2[2] = vqaddq_s16(step1[2], step1[13]); 1697 step2[3] = vqaddq_s16(step1[3], step1[12]); 1698 step2[4] = vqaddq_s16(step1[4], step1[11]); 1699 step2[5] = vqaddq_s16(step1[5], step1[10]); 1700 step2[6] = vqaddq_s16(step1[6], step1[9]); 1701 step2[7] = vqaddq_s16(step1[7], step1[8]); 1702 step2[8] = vqsubq_s16(step1[7], step1[8]); 1703 step2[9] = vqsubq_s16(step1[6], step1[9]); 1704 step2[10] = vqsubq_s16(step1[5], step1[10]); 1705 step2[11] = vqsubq_s16(step1[4], step1[11]); 1706 step2[12] = vqsubq_s16(step1[3], step1[12]); 1707 step2[13] = vqsubq_s16(step1[2], step1[13]); 1708 step2[14] = vqsubq_s16(step1[1], step1[14]); 1709 step2[15] = vqsubq_s16(step1[0], step1[15]); 1710 step2[16] = step1[16]; 1711 step2[17] = step1[17]; 1712 step2[18] = step1[18]; 1713 step2[19] = step1[19]; 1714 step2[28] = step1[28]; 1715 step2[29] = step1[29]; 1716 step2[30] = step1[30]; 1717 step2[31] = step1[31]; 1718 1719 // stage 9 1720 1721 out[0] = vqaddq_s16(step2[0], step2[31]); 1722 out[1] = vqaddq_s16(step2[1], step2[30]); 1723 out[2] = vqaddq_s16(step2[2], step2[29]); 1724 out[3] = vqaddq_s16(step2[3], step2[28]); 1725 out[4] = vqaddq_s16(step2[4], step2[27]); 1726 out[5] = vqaddq_s16(step2[5], step2[26]); 1727 out[6] = vqaddq_s16(step2[6], step2[25]); 1728 out[7] = vqaddq_s16(step2[7], step2[24]); 1729 out[8] = vqaddq_s16(step2[8], step2[23]); 1730 out[9] = vqaddq_s16(step2[9], step2[22]); 1731 out[10] = vqaddq_s16(step2[10], step2[21]); 1732 out[11] = vqaddq_s16(step2[11], step2[20]); 1733 out[12] = vqaddq_s16(step2[12], step2[19]); 1734 out[13] = vqaddq_s16(step2[13], step2[18]); 1735 out[14] = vqaddq_s16(step2[14], step2[17]); 1736 out[15] = vqaddq_s16(step2[15], step2[16]); 1737 out[16] = vqsubq_s16(step2[15], step2[16]); 1738 out[17] = vqsubq_s16(step2[14], step2[17]); 1739 out[18] = vqsubq_s16(step2[13], step2[18]); 1740 out[19] = vqsubq_s16(step2[12], step2[19]); 1741 out[20] = vqsubq_s16(step2[11], step2[20]); 1742 out[21] = vqsubq_s16(step2[10], step2[21]); 1743 out[22] = vqsubq_s16(step2[9], step2[22]); 1744 out[23] = vqsubq_s16(step2[8], step2[23]); 1745 out[24] = vqsubq_s16(step2[7], step2[24]); 1746 out[25] = vqsubq_s16(step2[6], step2[25]); 1747 out[26] = vqsubq_s16(step2[5], step2[26]); 1748 out[27] = vqsubq_s16(step2[4], step2[27]); 1749 out[28] = vqsubq_s16(step2[3], step2[28]); 1750 out[29] = vqsubq_s16(step2[2], step2[29]); 1751 out[30] = vqsubq_s16(step2[1], step2[30]); 1752 out[31] = vqsubq_s16(step2[0], step2[31]); 1753 } 1754 1755 static inline void idct32_low16_neon(int16x8_t *in, int16x8_t *out, 1756 int8_t cos_bit) { 1757 const int32_t *cospi = cospi_arr(cos_bit); 1758 int16x8_t step1[32], step2[32]; 1759 int32x4_t t32[16]; 1760 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 1761 (int16_t)cospi[40], (int16_t)cospi[24]); 1762 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 1763 (int16_t)cospi[16], (int16_t)cospi[48]); 1764 const int16x4_t c2 = 1765 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), 1766 (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); 1767 const int16x4_t c3 = 1768 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), 1769 (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); 1770 1771 // stage 1 1772 // stage 2 1773 1774 btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]); 1775 btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]); 1776 btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]); 1777 btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]); 1778 btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]); 1779 btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]); 1780 btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]); 1781 btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]); 1782 1783 step2[0] = in[0]; 1784 step2[2] = in[8]; 1785 step2[4] = in[4]; 1786 step2[6] = in[12]; 1787 step2[8] = in[2]; 1788 step2[10] = in[10]; 1789 step2[12] = in[6]; 1790 step2[14] = in[14]; 1791 1792 // stage 3 1793 1794 btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]); 1795 btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]); 1796 btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]); 1797 btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]); 1798 1799 step1[0] = step2[0]; 1800 step1[2] = step2[2]; 1801 step1[4] = step2[4]; 1802 step1[6] = step2[6]; 1803 step1[16] = vqaddq_s16(step2[16], step2[17]); 1804 step1[17] = vqsubq_s16(step2[16], step2[17]); 1805 step1[18] = vqsubq_s16(step2[19], step2[18]); 1806 step1[19] = vqaddq_s16(step2[19], step2[18]); 1807 step1[20] = vqaddq_s16(step2[20], step2[21]); 1808 step1[21] = vqsubq_s16(step2[20], step2[21]); 1809 step1[22] = vqsubq_s16(step2[23], step2[22]); 1810 step1[23] = vqaddq_s16(step2[23], step2[22]); 1811 step1[24] = vqaddq_s16(step2[24], step2[25]); 1812 step1[25] = vqsubq_s16(step2[24], step2[25]); 1813 step1[26] = vqsubq_s16(step2[27], step2[26]); 1814 step1[27] = vqaddq_s16(step2[27], step2[26]); 1815 step1[28] = vqaddq_s16(step2[28], step2[29]); 1816 step1[29] = vqsubq_s16(step2[28], step2[29]); 1817 step1[30] = vqsubq_s16(step2[31], step2[30]); 1818 step1[31] = vqaddq_s16(step2[31], step2[30]); 1819 1820 // stage 4 1821 1822 btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]); 1823 btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]); 1824 btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]); 1825 btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]); 1826 btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]); 1827 btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]); 1828 1829 step2[0] = step1[0]; 1830 step2[2] = step1[2]; 1831 step2[8] = vqaddq_s16(step1[8], step1[9]); 1832 step2[9] = vqsubq_s16(step1[8], step1[9]); 1833 step2[10] = vqsubq_s16(step1[11], step1[10]); 1834 step2[11] = vqaddq_s16(step1[11], step1[10]); 1835 step2[12] = vqaddq_s16(step1[12], step1[13]); 1836 step2[13] = vqsubq_s16(step1[12], step1[13]); 1837 step2[14] = vqsubq_s16(step1[15], step1[14]); 1838 step2[15] = vqaddq_s16(step1[15], step1[14]); 1839 step2[16] = step1[16]; 1840 step2[19] = step1[19]; 1841 step2[20] = step1[20]; 1842 step2[23] = step1[23]; 1843 step2[24] = step1[24]; 1844 step2[27] = step1[27]; 1845 step2[28] = step1[28]; 1846 step2[31] = step1[31]; 1847 1848 // stage 5 1849 1850 t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]); 1851 t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]); 1852 1853 step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), 1854 vrshrn_n_s32(t32[1], INV_COS_BIT)); 1855 1856 btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]); 1857 btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]); 1858 btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]); 1859 1860 step1[4] = vqaddq_s16(step2[4], step2[5]); 1861 step1[5] = vqsubq_s16(step2[4], step2[5]); 1862 step1[6] = vqsubq_s16(step2[7], step2[6]); 1863 step1[7] = vqaddq_s16(step2[7], step2[6]); 1864 step1[8] = step2[8]; 1865 step1[11] = step2[11]; 1866 step1[12] = step2[12]; 1867 step1[15] = step2[15]; 1868 step1[16] = vqaddq_s16(step2[16], step2[19]); 1869 step1[17] = vqaddq_s16(step2[17], step2[18]); 1870 step1[18] = vqsubq_s16(step2[17], step2[18]); 1871 step1[19] = vqsubq_s16(step2[16], step2[19]); 1872 step1[20] = vqsubq_s16(step2[23], step2[20]); 1873 step1[21] = vqsubq_s16(step2[22], step2[21]); 1874 step1[22] = vqaddq_s16(step2[22], step2[21]); 1875 step1[23] = vqaddq_s16(step2[23], step2[20]); 1876 step1[24] = vqaddq_s16(step2[24], step2[27]); 1877 step1[25] = vqaddq_s16(step2[25], step2[26]); 1878 step1[26] = vqsubq_s16(step2[25], step2[26]); 1879 step1[27] = vqsubq_s16(step2[24], step2[27]); 1880 step1[28] = vqsubq_s16(step2[31], step2[28]); 1881 step1[29] = vqsubq_s16(step2[30], step2[29]); 1882 step1[30] = vqaddq_s16(step2[30], step2[29]); 1883 step1[31] = vqaddq_s16(step2[31], step2[28]); 1884 1885 // stage 6 1886 1887 btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]); 1888 btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]); 1889 btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]); 1890 btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]); 1891 btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]); 1892 1893 step2[0] = vqaddq_s16(step1[0], step1[3]); 1894 step2[1] = vqaddq_s16(step1[0], step1[2]); 1895 step2[2] = vqsubq_s16(step1[0], step1[2]); 1896 step2[3] = vqsubq_s16(step1[0], step1[3]); 1897 step2[4] = step1[4]; 1898 step2[7] = step1[7]; 1899 step2[8] = vqaddq_s16(step1[8], step1[11]); 1900 step2[9] = vqaddq_s16(step1[9], step1[10]); 1901 step2[10] = vqsubq_s16(step1[9], step1[10]); 1902 step2[11] = vqsubq_s16(step1[8], step1[11]); 1903 step2[12] = vqsubq_s16(step1[15], step1[12]); 1904 step2[13] = vqsubq_s16(step1[14], step1[13]); 1905 step2[14] = vqaddq_s16(step1[14], step1[13]); 1906 step2[15] = vqaddq_s16(step1[15], step1[12]); 1907 step2[16] = step1[16]; 1908 step2[17] = step1[17]; 1909 step2[22] = step1[22]; 1910 step2[23] = step1[23]; 1911 step2[24] = step1[24]; 1912 step2[25] = step1[25]; 1913 step2[30] = step1[30]; 1914 step2[31] = step1[31]; 1915 1916 // stage 7 1917 1918 btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]); 1919 btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]); 1920 1921 step1[0] = vqaddq_s16(step2[0], step2[7]); 1922 step1[1] = vqaddq_s16(step2[1], step2[6]); 1923 step1[2] = vqaddq_s16(step2[2], step2[5]); 1924 step1[3] = vqaddq_s16(step2[3], step2[4]); 1925 step1[4] = vqsubq_s16(step2[3], step2[4]); 1926 step1[5] = vqsubq_s16(step2[2], step2[5]); 1927 step1[6] = vqsubq_s16(step2[1], step2[6]); 1928 step1[7] = vqsubq_s16(step2[0], step2[7]); 1929 step1[8] = step2[8]; 1930 step1[9] = step2[9]; 1931 step1[14] = step2[14]; 1932 step1[15] = step2[15]; 1933 step1[16] = vqaddq_s16(step2[16], step2[23]); 1934 step1[17] = vqaddq_s16(step2[17], step2[22]); 1935 step1[18] = vqaddq_s16(step2[18], step2[21]); 1936 step1[19] = vqaddq_s16(step2[19], step2[20]); 1937 step1[20] = vqsubq_s16(step2[19], step2[20]); 1938 step1[21] = vqsubq_s16(step2[18], step2[21]); 1939 step1[22] = vqsubq_s16(step2[17], step2[22]); 1940 step1[23] = vqsubq_s16(step2[16], step2[23]); 1941 step1[24] = vqsubq_s16(step2[31], step2[24]); 1942 step1[25] = vqsubq_s16(step2[30], step2[25]); 1943 step1[26] = vqsubq_s16(step2[29], step2[26]); 1944 step1[27] = vqsubq_s16(step2[28], step2[27]); 1945 step1[28] = vqaddq_s16(step2[27], step2[28]); 1946 step1[29] = vqaddq_s16(step2[26], step2[29]); 1947 step1[30] = vqaddq_s16(step2[25], step2[30]); 1948 step1[31] = vqaddq_s16(step2[24], step2[31]); 1949 1950 // stage 8 1951 1952 btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]); 1953 btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]); 1954 btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]); 1955 btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]); 1956 1957 step2[0] = vqaddq_s16(step1[0], step1[15]); 1958 step2[1] = vqaddq_s16(step1[1], step1[14]); 1959 step2[2] = vqaddq_s16(step1[2], step1[13]); 1960 step2[3] = vqaddq_s16(step1[3], step1[12]); 1961 step2[4] = vqaddq_s16(step1[4], step1[11]); 1962 step2[5] = vqaddq_s16(step1[5], step1[10]); 1963 step2[6] = vqaddq_s16(step1[6], step1[9]); 1964 step2[7] = vqaddq_s16(step1[7], step1[8]); 1965 step2[8] = vqsubq_s16(step1[7], step1[8]); 1966 step2[9] = vqsubq_s16(step1[6], step1[9]); 1967 step2[10] = vqsubq_s16(step1[5], step1[10]); 1968 step2[11] = vqsubq_s16(step1[4], step1[11]); 1969 step2[12] = vqsubq_s16(step1[3], step1[12]); 1970 step2[13] = vqsubq_s16(step1[2], step1[13]); 1971 step2[14] = vqsubq_s16(step1[1], step1[14]); 1972 step2[15] = vqsubq_s16(step1[0], step1[15]); 1973 step2[16] = step1[16]; 1974 step2[17] = step1[17]; 1975 step2[18] = step1[18]; 1976 step2[19] = step1[19]; 1977 step2[28] = step1[28]; 1978 step2[29] = step1[29]; 1979 step2[30] = step1[30]; 1980 step2[31] = step1[31]; 1981 1982 // stage 9 1983 1984 out[0] = vqaddq_s16(step2[0], step2[31]); 1985 out[1] = vqaddq_s16(step2[1], step2[30]); 1986 out[2] = vqaddq_s16(step2[2], step2[29]); 1987 out[3] = vqaddq_s16(step2[3], step2[28]); 1988 out[4] = vqaddq_s16(step2[4], step2[27]); 1989 out[5] = vqaddq_s16(step2[5], step2[26]); 1990 out[6] = vqaddq_s16(step2[6], step2[25]); 1991 out[7] = vqaddq_s16(step2[7], step2[24]); 1992 out[8] = vqaddq_s16(step2[8], step2[23]); 1993 out[9] = vqaddq_s16(step2[9], step2[22]); 1994 out[10] = vqaddq_s16(step2[10], step2[21]); 1995 out[11] = vqaddq_s16(step2[11], step2[20]); 1996 out[12] = vqaddq_s16(step2[12], step2[19]); 1997 out[13] = vqaddq_s16(step2[13], step2[18]); 1998 out[14] = vqaddq_s16(step2[14], step2[17]); 1999 out[15] = vqaddq_s16(step2[15], step2[16]); 2000 out[16] = vqsubq_s16(step2[15], step2[16]); 2001 out[17] = vqsubq_s16(step2[14], step2[17]); 2002 out[18] = vqsubq_s16(step2[13], step2[18]); 2003 out[19] = vqsubq_s16(step2[12], step2[19]); 2004 out[20] = vqsubq_s16(step2[11], step2[20]); 2005 out[21] = vqsubq_s16(step2[10], step2[21]); 2006 out[22] = vqsubq_s16(step2[9], step2[22]); 2007 out[23] = vqsubq_s16(step2[8], step2[23]); 2008 out[24] = vqsubq_s16(step2[7], step2[24]); 2009 out[25] = vqsubq_s16(step2[6], step2[25]); 2010 out[26] = vqsubq_s16(step2[5], step2[26]); 2011 out[27] = vqsubq_s16(step2[4], step2[27]); 2012 out[28] = vqsubq_s16(step2[3], step2[28]); 2013 out[29] = vqsubq_s16(step2[2], step2[29]); 2014 out[30] = vqsubq_s16(step2[1], step2[30]); 2015 out[31] = vqsubq_s16(step2[0], step2[31]); 2016 } 2017 static inline void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1, 2018 int8_t cos_bit) { 2019 const int32_t *cospi = cospi_arr(cos_bit); 2020 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 2021 (int16_t)cospi[16], (int16_t)cospi[48]); 2022 2023 btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]); 2024 btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]); 2025 btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]); 2026 btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]); 2027 2028 step1[0] = vqaddq_s16(step2[0], step2[15]); 2029 step1[1] = vqaddq_s16(step2[1], step2[14]); 2030 step1[2] = vqaddq_s16(step2[2], step2[13]); 2031 step1[3] = vqaddq_s16(step2[3], step2[12]); 2032 step1[4] = vqaddq_s16(step2[4], step2[11]); 2033 step1[5] = vqaddq_s16(step2[5], step2[10]); 2034 step1[6] = vqaddq_s16(step2[6], step2[9]); 2035 step1[7] = vqaddq_s16(step2[7], step2[8]); 2036 step1[8] = vqsubq_s16(step2[7], step2[8]); 2037 step1[9] = vqsubq_s16(step2[6], step2[9]); 2038 step1[10] = vqsubq_s16(step2[5], step2[10]); 2039 step1[11] = vqsubq_s16(step2[4], step2[11]); 2040 step1[12] = vqsubq_s16(step2[3], step2[12]); 2041 step1[13] = vqsubq_s16(step2[2], step2[13]); 2042 step1[14] = vqsubq_s16(step2[1], step2[14]); 2043 step1[15] = vqsubq_s16(step2[0], step2[15]); 2044 step1[16] = step2[16]; 2045 step1[17] = step2[17]; 2046 step1[18] = step2[18]; 2047 step1[19] = step2[19]; 2048 step1[28] = step2[28]; 2049 step1[29] = step2[29]; 2050 step1[30] = step2[30]; 2051 step1[31] = step2[31]; 2052 step1[32] = vqaddq_s16(step2[32], step2[47]); 2053 step1[33] = vqaddq_s16(step2[33], step2[46]); 2054 step1[34] = vqaddq_s16(step2[34], step2[45]); 2055 step1[35] = vqaddq_s16(step2[35], step2[44]); 2056 step1[36] = vqaddq_s16(step2[36], step2[43]); 2057 step1[37] = vqaddq_s16(step2[37], step2[42]); 2058 step1[38] = vqaddq_s16(step2[38], step2[41]); 2059 step1[39] = vqaddq_s16(step2[39], step2[40]); 2060 step1[40] = vqsubq_s16(step2[39], step2[40]); 2061 step1[41] = vqsubq_s16(step2[38], step2[41]); 2062 step1[42] = vqsubq_s16(step2[37], step2[42]); 2063 step1[43] = vqsubq_s16(step2[36], step2[43]); 2064 step1[44] = vqsubq_s16(step2[35], step2[44]); 2065 step1[45] = vqsubq_s16(step2[34], step2[45]); 2066 step1[46] = vqsubq_s16(step2[33], step2[46]); 2067 step1[47] = vqsubq_s16(step2[32], step2[47]); 2068 step1[48] = vqsubq_s16(step2[63], step2[48]); 2069 step1[49] = vqsubq_s16(step2[62], step2[49]); 2070 step1[50] = vqsubq_s16(step2[61], step2[50]); 2071 step1[51] = vqsubq_s16(step2[60], step2[51]); 2072 step1[52] = vqsubq_s16(step2[59], step2[52]); 2073 step1[53] = vqsubq_s16(step2[58], step2[53]); 2074 step1[54] = vqsubq_s16(step2[57], step2[54]); 2075 step1[55] = vqsubq_s16(step2[56], step2[55]); 2076 step1[56] = vqaddq_s16(step2[56], step2[55]); 2077 step1[57] = vqaddq_s16(step2[57], step2[54]); 2078 step1[58] = vqaddq_s16(step2[58], step2[53]); 2079 step1[59] = vqaddq_s16(step2[59], step2[52]); 2080 step1[60] = vqaddq_s16(step2[60], step2[51]); 2081 step1[61] = vqaddq_s16(step2[61], step2[50]); 2082 step1[62] = vqaddq_s16(step2[62], step2[49]); 2083 step1[63] = vqaddq_s16(step2[63], step2[48]); 2084 } 2085 2086 static inline void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2, 2087 int8_t cos_bit) { 2088 const int32_t *cospi = cospi_arr(cos_bit); 2089 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 2090 (int16_t)cospi[16], (int16_t)cospi[48]); 2091 2092 btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]); 2093 btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]); 2094 btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]); 2095 btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]); 2096 btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]); 2097 btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]); 2098 btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]); 2099 btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]); 2100 2101 step2[0] = vqaddq_s16(step1[0], step1[31]); 2102 step2[1] = vqaddq_s16(step1[1], step1[30]); 2103 step2[2] = vqaddq_s16(step1[2], step1[29]); 2104 step2[3] = vqaddq_s16(step1[3], step1[28]); 2105 step2[4] = vqaddq_s16(step1[4], step1[27]); 2106 step2[5] = vqaddq_s16(step1[5], step1[26]); 2107 step2[6] = vqaddq_s16(step1[6], step1[25]); 2108 step2[7] = vqaddq_s16(step1[7], step1[24]); 2109 step2[8] = vqaddq_s16(step1[8], step1[23]); 2110 step2[9] = vqaddq_s16(step1[9], step1[22]); 2111 step2[10] = vqaddq_s16(step1[10], step1[21]); 2112 step2[11] = vqaddq_s16(step1[11], step1[20]); 2113 step2[12] = vqaddq_s16(step1[12], step1[19]); 2114 step2[13] = vqaddq_s16(step1[13], step1[18]); 2115 step2[14] = vqaddq_s16(step1[14], step1[17]); 2116 step2[15] = vqaddq_s16(step1[15], step1[16]); 2117 step2[16] = vqsubq_s16(step1[15], step1[16]); 2118 step2[17] = vqsubq_s16(step1[14], step1[17]); 2119 step2[18] = vqsubq_s16(step1[13], step1[18]); 2120 step2[19] = vqsubq_s16(step1[12], step1[19]); 2121 step2[20] = vqsubq_s16(step1[11], step1[20]); 2122 step2[21] = vqsubq_s16(step1[10], step1[21]); 2123 step2[22] = vqsubq_s16(step1[9], step1[22]); 2124 step2[23] = vqsubq_s16(step1[8], step1[23]); 2125 step2[24] = vqsubq_s16(step1[7], step1[24]); 2126 step2[25] = vqsubq_s16(step1[6], step1[25]); 2127 step2[26] = vqsubq_s16(step1[5], step1[26]); 2128 step2[27] = vqsubq_s16(step1[4], step1[27]); 2129 step2[28] = vqsubq_s16(step1[3], step1[28]); 2130 step2[29] = vqsubq_s16(step1[2], step1[29]); 2131 step2[30] = vqsubq_s16(step1[1], step1[30]); 2132 step2[31] = vqsubq_s16(step1[0], step1[31]); 2133 step2[32] = step1[32]; 2134 step2[33] = step1[33]; 2135 step2[34] = step1[34]; 2136 step2[35] = step1[35]; 2137 step2[36] = step1[36]; 2138 step2[37] = step1[37]; 2139 step2[38] = step1[38]; 2140 step2[39] = step1[39]; 2141 step2[56] = step1[56]; 2142 step2[57] = step1[57]; 2143 step2[58] = step1[58]; 2144 step2[59] = step1[59]; 2145 step2[60] = step1[60]; 2146 step2[61] = step1[61]; 2147 step2[62] = step1[62]; 2148 step2[63] = step1[63]; 2149 } 2150 2151 static inline void idct64_low32_neon(int16x8_t *in, int16x8_t *out, 2152 int8_t cos_bit) { 2153 const int32_t *cospi = cospi_arr(cos_bit); 2154 int16x8_t step2[64], step1[64]; 2155 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], 2156 (int16_t)cospi[36], (int16_t)cospi[28]); 2157 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], 2158 (int16_t)cospi[52], (int16_t)cospi[12]); 2159 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 2160 (int16_t)cospi[40], (int16_t)cospi[24]); 2161 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 2162 (int16_t)cospi[16], (int16_t)cospi[48]); 2163 const int16x4_t c4 = 2164 set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), 2165 (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); 2166 const int16x4_t c5 = 2167 set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), 2168 (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); 2169 const int16x4_t c6 = 2170 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), 2171 (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); 2172 const int16x4_t c7 = 2173 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), 2174 (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); 2175 2176 // stage 1 2177 // stage 2 2178 2179 step2[0] = in[0]; 2180 step2[2] = in[16]; 2181 step2[4] = in[8]; 2182 step2[6] = in[24]; 2183 step2[8] = in[4]; 2184 step2[10] = in[20]; 2185 step2[12] = in[12]; 2186 step2[14] = in[28]; 2187 step2[16] = in[2]; 2188 step2[18] = in[18]; 2189 step2[20] = in[10]; 2190 step2[22] = in[26]; 2191 step2[24] = in[6]; 2192 step2[26] = in[22]; 2193 step2[28] = in[14]; 2194 step2[30] = in[30]; 2195 2196 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); 2197 btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]); 2198 btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]); 2199 btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); 2200 btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); 2201 btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]); 2202 btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]); 2203 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); 2204 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); 2205 btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]); 2206 btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]); 2207 btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); 2208 btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); 2209 btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]); 2210 btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]); 2211 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); 2212 2213 // stage 3 2214 2215 step1[0] = step2[0]; 2216 step1[2] = step2[2]; 2217 step1[4] = step2[4]; 2218 step1[6] = step2[6]; 2219 step1[8] = step2[8]; 2220 step1[10] = step2[10]; 2221 step1[12] = step2[12]; 2222 step1[14] = step2[14]; 2223 2224 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); 2225 btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]); 2226 btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]); 2227 btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); 2228 btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); 2229 btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]); 2230 btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]); 2231 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); 2232 2233 step1[32] = vqaddq_s16(step2[32], step2[33]); 2234 step1[33] = vqsubq_s16(step2[32], step2[33]); 2235 step1[34] = vqsubq_s16(step2[35], step2[34]); 2236 step1[35] = vqaddq_s16(step2[35], step2[34]); 2237 step1[36] = vqaddq_s16(step2[36], step2[37]); 2238 step1[37] = vqsubq_s16(step2[36], step2[37]); 2239 step1[38] = vqsubq_s16(step2[39], step2[38]); 2240 step1[39] = vqaddq_s16(step2[39], step2[38]); 2241 step1[40] = vqaddq_s16(step2[40], step2[41]); 2242 step1[41] = vqsubq_s16(step2[40], step2[41]); 2243 step1[42] = vqsubq_s16(step2[43], step2[42]); 2244 step1[43] = vqaddq_s16(step2[43], step2[42]); 2245 step1[44] = vqaddq_s16(step2[44], step2[45]); 2246 step1[45] = vqsubq_s16(step2[44], step2[45]); 2247 step1[46] = vqsubq_s16(step2[47], step2[46]); 2248 step1[47] = vqaddq_s16(step2[47], step2[46]); 2249 step1[48] = vqaddq_s16(step2[48], step2[49]); 2250 step1[49] = vqsubq_s16(step2[48], step2[49]); 2251 step1[50] = vqsubq_s16(step2[51], step2[50]); 2252 step1[51] = vqaddq_s16(step2[51], step2[50]); 2253 step1[52] = vqaddq_s16(step2[52], step2[53]); 2254 step1[53] = vqsubq_s16(step2[52], step2[53]); 2255 step1[54] = vqsubq_s16(step2[55], step2[54]); 2256 step1[55] = vqaddq_s16(step2[55], step2[54]); 2257 step1[56] = vqaddq_s16(step2[56], step2[57]); 2258 step1[57] = vqsubq_s16(step2[56], step2[57]); 2259 step1[58] = vqsubq_s16(step2[59], step2[58]); 2260 step1[59] = vqaddq_s16(step2[59], step2[58]); 2261 step1[60] = vqaddq_s16(step2[60], step2[61]); 2262 step1[61] = vqsubq_s16(step2[60], step2[61]); 2263 step1[62] = vqsubq_s16(step2[63], step2[62]); 2264 step1[63] = vqaddq_s16(step2[63], step2[62]); 2265 2266 // stage 4 2267 2268 step2[0] = step1[0]; 2269 step2[2] = step1[2]; 2270 step2[4] = step1[4]; 2271 step2[6] = step1[6]; 2272 2273 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); 2274 btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]); 2275 btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]); 2276 btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); 2277 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); 2278 btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); 2279 btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); 2280 btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); 2281 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); 2282 btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); 2283 btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); 2284 btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); 2285 2286 step2[16] = vqaddq_s16(step1[16], step1[17]); 2287 step2[17] = vqsubq_s16(step1[16], step1[17]); 2288 step2[18] = vqsubq_s16(step1[19], step1[18]); 2289 step2[19] = vqaddq_s16(step1[19], step1[18]); 2290 step2[20] = vqaddq_s16(step1[20], step1[21]); 2291 step2[21] = vqsubq_s16(step1[20], step1[21]); 2292 step2[22] = vqsubq_s16(step1[23], step1[22]); 2293 step2[23] = vqaddq_s16(step1[23], step1[22]); 2294 step2[24] = vqaddq_s16(step1[24], step1[25]); 2295 step2[25] = vqsubq_s16(step1[24], step1[25]); 2296 step2[26] = vqsubq_s16(step1[27], step1[26]); 2297 step2[27] = vqaddq_s16(step1[27], step1[26]); 2298 step2[28] = vqaddq_s16(step1[28], step1[29]); 2299 step2[29] = vqsubq_s16(step1[28], step1[29]); 2300 step2[30] = vqsubq_s16(step1[31], step1[30]); 2301 step2[31] = vqaddq_s16(step1[31], step1[30]); 2302 step2[32] = step1[32]; 2303 step2[35] = step1[35]; 2304 step2[36] = step1[36]; 2305 step2[39] = step1[39]; 2306 step2[40] = step1[40]; 2307 step2[43] = step1[43]; 2308 step2[44] = step1[44]; 2309 step2[47] = step1[47]; 2310 step2[48] = step1[48]; 2311 step2[51] = step1[51]; 2312 step2[52] = step1[52]; 2313 step2[55] = step1[55]; 2314 step2[56] = step1[56]; 2315 step2[59] = step1[59]; 2316 step2[60] = step1[60]; 2317 step2[63] = step1[63]; 2318 2319 // stage 5 2320 2321 step1[0] = step2[0]; 2322 step1[2] = step2[2]; 2323 2324 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); 2325 btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]); 2326 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); 2327 btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); 2328 btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); 2329 btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); 2330 2331 step1[8] = vqaddq_s16(step2[8], step2[9]); 2332 step1[9] = vqsubq_s16(step2[8], step2[9]); 2333 step1[10] = vqsubq_s16(step2[11], step2[10]); 2334 step1[11] = vqaddq_s16(step2[11], step2[10]); 2335 step1[12] = vqaddq_s16(step2[12], step2[13]); 2336 step1[13] = vqsubq_s16(step2[12], step2[13]); 2337 step1[14] = vqsubq_s16(step2[15], step2[14]); 2338 step1[15] = vqaddq_s16(step2[15], step2[14]); 2339 step1[16] = step2[16]; 2340 step1[19] = step2[19]; 2341 step1[20] = step2[20]; 2342 step1[23] = step2[23]; 2343 step1[24] = step2[24]; 2344 step1[27] = step2[27]; 2345 step1[28] = step2[28]; 2346 step1[31] = step2[31]; 2347 step1[32] = vqaddq_s16(step2[32], step2[35]); 2348 step1[33] = vqaddq_s16(step2[33], step2[34]); 2349 step1[34] = vqsubq_s16(step2[33], step2[34]); 2350 step1[35] = vqsubq_s16(step2[32], step2[35]); 2351 step1[36] = vqsubq_s16(step2[39], step2[36]); 2352 step1[37] = vqsubq_s16(step2[38], step2[37]); 2353 step1[38] = vqaddq_s16(step2[38], step2[37]); 2354 step1[39] = vqaddq_s16(step2[39], step2[36]); 2355 step1[40] = vqaddq_s16(step2[40], step2[43]); 2356 step1[41] = vqaddq_s16(step2[41], step2[42]); 2357 step1[42] = vqsubq_s16(step2[41], step2[42]); 2358 step1[43] = vqsubq_s16(step2[40], step2[43]); 2359 step1[44] = vqsubq_s16(step2[47], step2[44]); 2360 step1[45] = vqsubq_s16(step2[46], step2[45]); 2361 step1[46] = vqaddq_s16(step2[46], step2[45]); 2362 step1[47] = vqaddq_s16(step2[47], step2[44]); 2363 step1[48] = vqaddq_s16(step2[48], step2[51]); 2364 step1[49] = vqaddq_s16(step2[49], step2[50]); 2365 step1[50] = vqsubq_s16(step2[49], step2[50]); 2366 step1[51] = vqsubq_s16(step2[48], step2[51]); 2367 step1[52] = vqsubq_s16(step2[55], step2[52]); 2368 step1[53] = vqsubq_s16(step2[54], step2[53]); 2369 step1[54] = vqaddq_s16(step2[54], step2[53]); 2370 step1[55] = vqaddq_s16(step2[55], step2[52]); 2371 step1[56] = vqaddq_s16(step2[56], step2[59]); 2372 step1[57] = vqaddq_s16(step2[57], step2[58]); 2373 step1[58] = vqsubq_s16(step2[57], step2[58]); 2374 step1[59] = vqsubq_s16(step2[56], step2[59]); 2375 step1[60] = vqsubq_s16(step2[63], step2[60]); 2376 step1[61] = vqsubq_s16(step2[62], step2[61]); 2377 step1[62] = vqaddq_s16(step2[62], step2[61]); 2378 step1[63] = vqaddq_s16(step2[63], step2[60]); 2379 2380 // stage 6 2381 2382 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); 2383 btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]); 2384 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); 2385 btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); 2386 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); 2387 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); 2388 btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); 2389 btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); 2390 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); 2391 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); 2392 btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); 2393 btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); 2394 2395 step2[4] = vqaddq_s16(step1[4], step1[5]); 2396 step2[5] = vqsubq_s16(step1[4], step1[5]); 2397 step2[6] = vqsubq_s16(step1[7], step1[6]); 2398 step2[7] = vqaddq_s16(step1[7], step1[6]); 2399 step2[8] = step1[8]; 2400 step2[11] = step1[11]; 2401 step2[12] = step1[12]; 2402 step2[15] = step1[15]; 2403 step2[16] = vqaddq_s16(step1[16], step1[19]); 2404 step2[17] = vqaddq_s16(step1[17], step1[18]); 2405 step2[18] = vqsubq_s16(step1[17], step1[18]); 2406 step2[19] = vqsubq_s16(step1[16], step1[19]); 2407 step2[20] = vqsubq_s16(step1[23], step1[20]); 2408 step2[21] = vqsubq_s16(step1[22], step1[21]); 2409 step2[22] = vqaddq_s16(step1[22], step1[21]); 2410 step2[23] = vqaddq_s16(step1[23], step1[20]); 2411 step2[24] = vqaddq_s16(step1[24], step1[27]); 2412 step2[25] = vqaddq_s16(step1[25], step1[26]); 2413 step2[26] = vqsubq_s16(step1[25], step1[26]); 2414 step2[27] = vqsubq_s16(step1[24], step1[27]); 2415 step2[28] = vqsubq_s16(step1[31], step1[28]); 2416 step2[29] = vqsubq_s16(step1[30], step1[29]); 2417 step2[30] = vqaddq_s16(step1[30], step1[29]); 2418 step2[31] = vqaddq_s16(step1[31], step1[28]); 2419 step2[32] = step1[32]; 2420 step2[33] = step1[33]; 2421 step2[38] = step1[38]; 2422 step2[39] = step1[39]; 2423 step2[40] = step1[40]; 2424 step2[41] = step1[41]; 2425 step2[46] = step1[46]; 2426 step2[47] = step1[47]; 2427 step2[48] = step1[48]; 2428 step2[49] = step1[49]; 2429 step2[54] = step1[54]; 2430 step2[55] = step1[55]; 2431 step2[56] = step1[56]; 2432 step2[57] = step1[57]; 2433 step2[62] = step1[62]; 2434 step2[63] = step1[63]; 2435 2436 // stage 7 2437 2438 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); 2439 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); 2440 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); 2441 btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); 2442 btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); 2443 2444 step1[0] = vqaddq_s16(step2[0], step2[3]); 2445 step1[1] = vqaddq_s16(step2[1], step2[2]); 2446 step1[2] = vqsubq_s16(step2[1], step2[2]); 2447 step1[3] = vqsubq_s16(step2[0], step2[3]); 2448 step1[4] = step2[4]; 2449 step1[7] = step2[7]; 2450 step1[8] = vqaddq_s16(step2[8], step2[11]); 2451 step1[9] = vqaddq_s16(step2[9], step2[10]); 2452 step1[10] = vqsubq_s16(step2[9], step2[10]); 2453 step1[11] = vqsubq_s16(step2[8], step2[11]); 2454 step1[12] = vqsubq_s16(step2[15], step2[12]); 2455 step1[13] = vqsubq_s16(step2[14], step2[13]); 2456 step1[14] = vqaddq_s16(step2[14], step2[13]); 2457 step1[15] = vqaddq_s16(step2[15], step2[12]); 2458 step1[16] = step2[16]; 2459 step1[17] = step2[17]; 2460 step1[22] = step2[22]; 2461 step1[23] = step2[23]; 2462 step1[24] = step2[24]; 2463 step1[25] = step2[25]; 2464 step1[30] = step2[30]; 2465 step1[31] = step2[31]; 2466 step1[32] = vqaddq_s16(step2[32], step2[39]); 2467 step1[33] = vqaddq_s16(step2[33], step2[38]); 2468 step1[34] = vqaddq_s16(step2[34], step2[37]); 2469 step1[35] = vqaddq_s16(step2[35], step2[36]); 2470 step1[36] = vqsubq_s16(step2[35], step2[36]); 2471 step1[37] = vqsubq_s16(step2[34], step2[37]); 2472 step1[38] = vqsubq_s16(step2[33], step2[38]); 2473 step1[39] = vqsubq_s16(step2[32], step2[39]); 2474 step1[40] = vqsubq_s16(step2[47], step2[40]); 2475 step1[41] = vqsubq_s16(step2[46], step2[41]); 2476 step1[42] = vqsubq_s16(step2[45], step2[42]); 2477 step1[43] = vqsubq_s16(step2[44], step2[43]); 2478 step1[44] = vqaddq_s16(step2[43], step2[44]); 2479 step1[45] = vqaddq_s16(step2[42], step2[45]); 2480 step1[46] = vqaddq_s16(step2[41], step2[46]); 2481 step1[47] = vqaddq_s16(step2[40], step2[47]); 2482 step1[48] = vqaddq_s16(step2[48], step2[55]); 2483 step1[49] = vqaddq_s16(step2[49], step2[54]); 2484 step1[50] = vqaddq_s16(step2[50], step2[53]); 2485 step1[51] = vqaddq_s16(step2[51], step2[52]); 2486 step1[52] = vqsubq_s16(step2[51], step2[52]); 2487 step1[53] = vqsubq_s16(step2[50], step2[53]); 2488 step1[54] = vqsubq_s16(step2[49], step2[54]); 2489 step1[55] = vqsubq_s16(step2[48], step2[55]); 2490 step1[56] = vqsubq_s16(step2[63], step2[56]); 2491 step1[57] = vqsubq_s16(step2[62], step2[57]); 2492 step1[58] = vqsubq_s16(step2[61], step2[58]); 2493 step1[59] = vqsubq_s16(step2[60], step2[59]); 2494 step1[60] = vqaddq_s16(step2[59], step2[60]); 2495 step1[61] = vqaddq_s16(step2[58], step2[61]); 2496 step1[62] = vqaddq_s16(step2[57], step2[62]); 2497 step1[63] = vqaddq_s16(step2[56], step2[63]); 2498 2499 // stage 8 2500 2501 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); 2502 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); 2503 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); 2504 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); 2505 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); 2506 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); 2507 btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); 2508 btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); 2509 btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); 2510 btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); 2511 2512 step2[0] = vqaddq_s16(step1[0], step1[7]); 2513 step2[1] = vqaddq_s16(step1[1], step1[6]); 2514 step2[2] = vqaddq_s16(step1[2], step1[5]); 2515 step2[3] = vqaddq_s16(step1[3], step1[4]); 2516 step2[4] = vqsubq_s16(step1[3], step1[4]); 2517 step2[5] = vqsubq_s16(step1[2], step1[5]); 2518 step2[6] = vqsubq_s16(step1[1], step1[6]); 2519 step2[7] = vqsubq_s16(step1[0], step1[7]); 2520 step2[8] = step1[8]; 2521 step2[9] = step1[9]; 2522 step2[14] = step1[14]; 2523 step2[15] = step1[15]; 2524 step2[16] = vqaddq_s16(step1[16], step1[23]); 2525 step2[17] = vqaddq_s16(step1[17], step1[22]); 2526 step2[18] = vqaddq_s16(step1[18], step1[21]); 2527 step2[19] = vqaddq_s16(step1[19], step1[20]); 2528 step2[20] = vqsubq_s16(step1[19], step1[20]); 2529 step2[21] = vqsubq_s16(step1[18], step1[21]); 2530 step2[22] = vqsubq_s16(step1[17], step1[22]); 2531 step2[23] = vqsubq_s16(step1[16], step1[23]); 2532 step2[24] = vqsubq_s16(step1[31], step1[24]); 2533 step2[25] = vqsubq_s16(step1[30], step1[25]); 2534 step2[26] = vqsubq_s16(step1[29], step1[26]); 2535 step2[27] = vqsubq_s16(step1[28], step1[27]); 2536 step2[28] = vqaddq_s16(step1[28], step1[27]); 2537 step2[29] = vqaddq_s16(step1[29], step1[26]); 2538 step2[30] = vqaddq_s16(step1[30], step1[25]); 2539 step2[31] = vqaddq_s16(step1[31], step1[24]); 2540 step2[32] = step1[32]; 2541 step2[33] = step1[33]; 2542 step2[34] = step1[34]; 2543 step2[35] = step1[35]; 2544 step2[44] = step1[44]; 2545 step2[45] = step1[45]; 2546 step2[46] = step1[46]; 2547 step2[47] = step1[47]; 2548 step2[48] = step1[48]; 2549 step2[49] = step1[49]; 2550 step2[50] = step1[50]; 2551 step2[51] = step1[51]; 2552 step2[60] = step1[60]; 2553 step2[61] = step1[61]; 2554 step2[62] = step1[62]; 2555 step2[63] = step1[63]; 2556 2557 // stage 9 2558 idct64_stage9_neon(step2, step1, cos_bit); 2559 2560 // stage 10 2561 idct64_stage10_neon(step1, step2, cos_bit); 2562 2563 // stage 11 2564 2565 out[0] = vqaddq_s16(step2[0], step2[63]); 2566 out[1] = vqaddq_s16(step2[1], step2[62]); 2567 out[2] = vqaddq_s16(step2[2], step2[61]); 2568 out[3] = vqaddq_s16(step2[3], step2[60]); 2569 out[4] = vqaddq_s16(step2[4], step2[59]); 2570 out[5] = vqaddq_s16(step2[5], step2[58]); 2571 out[6] = vqaddq_s16(step2[6], step2[57]); 2572 out[7] = vqaddq_s16(step2[7], step2[56]); 2573 out[8] = vqaddq_s16(step2[8], step2[55]); 2574 out[9] = vqaddq_s16(step2[9], step2[54]); 2575 out[10] = vqaddq_s16(step2[10], step2[53]); 2576 out[11] = vqaddq_s16(step2[11], step2[52]); 2577 out[12] = vqaddq_s16(step2[12], step2[51]); 2578 out[13] = vqaddq_s16(step2[13], step2[50]); 2579 out[14] = vqaddq_s16(step2[14], step2[49]); 2580 out[15] = vqaddq_s16(step2[15], step2[48]); 2581 out[16] = vqaddq_s16(step2[16], step2[47]); 2582 out[17] = vqaddq_s16(step2[17], step2[46]); 2583 out[18] = vqaddq_s16(step2[18], step2[45]); 2584 out[19] = vqaddq_s16(step2[19], step2[44]); 2585 out[20] = vqaddq_s16(step2[20], step2[43]); 2586 out[21] = vqaddq_s16(step2[21], step2[42]); 2587 out[22] = vqaddq_s16(step2[22], step2[41]); 2588 out[23] = vqaddq_s16(step2[23], step2[40]); 2589 out[24] = vqaddq_s16(step2[24], step2[39]); 2590 out[25] = vqaddq_s16(step2[25], step2[38]); 2591 out[26] = vqaddq_s16(step2[26], step2[37]); 2592 out[27] = vqaddq_s16(step2[27], step2[36]); 2593 out[28] = vqaddq_s16(step2[28], step2[35]); 2594 out[29] = vqaddq_s16(step2[29], step2[34]); 2595 out[30] = vqaddq_s16(step2[30], step2[33]); 2596 out[31] = vqaddq_s16(step2[31], step2[32]); 2597 out[32] = vqsubq_s16(step2[31], step2[32]); 2598 out[33] = vqsubq_s16(step2[30], step2[33]); 2599 out[34] = vqsubq_s16(step2[29], step2[34]); 2600 out[35] = vqsubq_s16(step2[28], step2[35]); 2601 out[36] = vqsubq_s16(step2[27], step2[36]); 2602 out[37] = vqsubq_s16(step2[26], step2[37]); 2603 out[38] = vqsubq_s16(step2[25], step2[38]); 2604 out[39] = vqsubq_s16(step2[24], step2[39]); 2605 out[40] = vqsubq_s16(step2[23], step2[40]); 2606 out[41] = vqsubq_s16(step2[22], step2[41]); 2607 out[42] = vqsubq_s16(step2[21], step2[42]); 2608 out[43] = vqsubq_s16(step2[20], step2[43]); 2609 out[44] = vqsubq_s16(step2[19], step2[44]); 2610 out[45] = vqsubq_s16(step2[18], step2[45]); 2611 out[46] = vqsubq_s16(step2[17], step2[46]); 2612 out[47] = vqsubq_s16(step2[16], step2[47]); 2613 out[48] = vqsubq_s16(step2[15], step2[48]); 2614 out[49] = vqsubq_s16(step2[14], step2[49]); 2615 out[50] = vqsubq_s16(step2[13], step2[50]); 2616 out[51] = vqsubq_s16(step2[12], step2[51]); 2617 out[52] = vqsubq_s16(step2[11], step2[52]); 2618 out[53] = vqsubq_s16(step2[10], step2[53]); 2619 out[54] = vqsubq_s16(step2[9], step2[54]); 2620 out[55] = vqsubq_s16(step2[8], step2[55]); 2621 out[56] = vqsubq_s16(step2[7], step2[56]); 2622 out[57] = vqsubq_s16(step2[6], step2[57]); 2623 out[58] = vqsubq_s16(step2[5], step2[58]); 2624 out[59] = vqsubq_s16(step2[4], step2[59]); 2625 out[60] = vqsubq_s16(step2[3], step2[60]); 2626 out[61] = vqsubq_s16(step2[2], step2[61]); 2627 out[62] = vqsubq_s16(step2[1], step2[62]); 2628 out[63] = vqsubq_s16(step2[0], step2[63]); 2629 } 2630 2631 static inline void idct64_low1_neon(int16x8_t *input, int16x8_t *out, 2632 int8_t cos_bit) { 2633 const int32_t *cospi = cospi_arr(cos_bit); 2634 int16x8_t step1; 2635 int32x4_t t32[2]; 2636 2637 // stage 1 2638 // stage 2 2639 // stage 3 2640 // stage 4 2641 // stage 5 2642 // stage 6 2643 2644 t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]); 2645 t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]); 2646 2647 step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT), 2648 vrshrn_n_s32(t32[1], INV_COS_BIT)); 2649 // stage 7 2650 // stage 8 2651 // stage 9 2652 // stage 10 2653 // stage 11 2654 out[0] = step1; 2655 out[1] = step1; 2656 out[2] = step1; 2657 out[3] = step1; 2658 out[4] = step1; 2659 out[5] = step1; 2660 out[6] = step1; 2661 out[7] = step1; 2662 out[8] = step1; 2663 out[9] = step1; 2664 out[10] = step1; 2665 out[11] = step1; 2666 out[12] = step1; 2667 out[13] = step1; 2668 out[14] = step1; 2669 out[15] = step1; 2670 out[16] = step1; 2671 out[17] = step1; 2672 out[18] = step1; 2673 out[19] = step1; 2674 out[20] = step1; 2675 out[21] = step1; 2676 out[22] = step1; 2677 out[23] = step1; 2678 out[24] = step1; 2679 out[25] = step1; 2680 out[26] = step1; 2681 out[27] = step1; 2682 out[28] = step1; 2683 out[29] = step1; 2684 out[30] = step1; 2685 out[31] = step1; 2686 out[32] = step1; 2687 out[33] = step1; 2688 out[34] = step1; 2689 out[35] = step1; 2690 out[36] = step1; 2691 out[37] = step1; 2692 out[38] = step1; 2693 out[39] = step1; 2694 out[40] = step1; 2695 out[41] = step1; 2696 out[42] = step1; 2697 out[43] = step1; 2698 out[44] = step1; 2699 out[45] = step1; 2700 out[46] = step1; 2701 out[47] = step1; 2702 out[48] = step1; 2703 out[49] = step1; 2704 out[50] = step1; 2705 out[51] = step1; 2706 out[52] = step1; 2707 out[53] = step1; 2708 out[54] = step1; 2709 out[55] = step1; 2710 out[56] = step1; 2711 out[57] = step1; 2712 out[58] = step1; 2713 out[59] = step1; 2714 out[60] = step1; 2715 out[61] = step1; 2716 out[62] = step1; 2717 out[63] = step1; 2718 } 2719 2720 static inline void idct64_low8_neon(int16x8_t *in, int16x8_t *out, 2721 int8_t cos_bit) { 2722 const int32_t *cospi = cospi_arr(cos_bit); 2723 int16x8_t step2[64], step1[64]; 2724 2725 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], 2726 (int16_t)cospi[36], (int16_t)cospi[28]); 2727 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], 2728 (int16_t)cospi[52], (int16_t)cospi[12]); 2729 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 2730 (int16_t)cospi[40], (int16_t)cospi[24]); 2731 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 2732 (int16_t)cospi[16], (int16_t)cospi[48]); 2733 const int16x4_t c4 = 2734 set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]), 2735 (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); 2736 const int16x4_t c5 = 2737 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), 2738 (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); 2739 const int16x4_t c6 = 2740 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), 2741 (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); 2742 2743 // stage 1 2744 // stage 2 2745 2746 step2[0] = in[0]; 2747 step2[8] = in[4]; 2748 step2[16] = in[2]; 2749 step2[24] = in[6]; 2750 2751 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); 2752 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); 2753 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); 2754 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); 2755 2756 // stage 3 2757 2758 step1[0] = step2[0]; 2759 step1[8] = step2[8]; 2760 2761 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); 2762 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); 2763 2764 step1[32] = step2[32]; 2765 step1[33] = step2[32]; 2766 step1[38] = step2[39]; 2767 step1[39] = step2[39]; 2768 step1[40] = step2[40]; 2769 step1[41] = step2[40]; 2770 step1[46] = step2[47]; 2771 step1[47] = step2[47]; 2772 step1[48] = step2[48]; 2773 step1[49] = step2[48]; 2774 step1[54] = step2[55]; 2775 step1[55] = step2[55]; 2776 step1[56] = step2[56]; 2777 step1[57] = step2[56]; 2778 step1[62] = step2[63]; 2779 step1[63] = step2[63]; 2780 2781 // stage 4 2782 2783 step2[0] = step1[0]; 2784 2785 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); 2786 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); 2787 btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); 2788 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); 2789 btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]); 2790 2791 step2[16] = step1[16]; 2792 step2[17] = step1[16]; 2793 step2[22] = step1[23]; 2794 step2[23] = step1[23]; 2795 step2[24] = step1[24]; 2796 step2[25] = step1[24]; 2797 step2[30] = step1[31]; 2798 step2[31] = step1[31]; 2799 step2[32] = step1[32]; 2800 step2[39] = step1[39]; 2801 step2[40] = step1[40]; 2802 step2[47] = step1[47]; 2803 step2[48] = step1[48]; 2804 step2[55] = step1[55]; 2805 step2[56] = step1[56]; 2806 step2[63] = step1[63]; 2807 2808 // stage 5 2809 2810 step1[0] = step2[0]; 2811 2812 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); 2813 btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]); 2814 2815 step1[8] = step2[8]; 2816 step1[9] = step2[8]; 2817 step1[14] = step2[15]; 2818 step1[15] = step2[15]; 2819 2820 step1[16] = step2[16]; 2821 step1[23] = step2[23]; 2822 step1[24] = step2[24]; 2823 step1[31] = step2[31]; 2824 step1[32] = step2[32]; 2825 step1[33] = step2[33]; 2826 step1[34] = step2[33]; 2827 step1[35] = step2[32]; 2828 step1[36] = step2[39]; 2829 step1[37] = step2[38]; 2830 step1[38] = step2[38]; 2831 step1[39] = step2[39]; 2832 step1[40] = step2[40]; 2833 step1[41] = step2[41]; 2834 step1[42] = step2[41]; 2835 step1[43] = step2[40]; 2836 step1[44] = step2[47]; 2837 step1[45] = step2[46]; 2838 step1[46] = step2[46]; 2839 step1[47] = step2[47]; 2840 step1[48] = step2[48]; 2841 step1[49] = step2[49]; 2842 step1[50] = step2[49]; 2843 step1[51] = step2[48]; 2844 step1[52] = step2[55]; 2845 step1[53] = step2[54]; 2846 step1[54] = step2[54]; 2847 step1[55] = step2[55]; 2848 step1[56] = step2[56]; 2849 step1[57] = step2[57]; 2850 step1[58] = step2[57]; 2851 step1[59] = step2[56]; 2852 step1[60] = step2[63]; 2853 step1[61] = step2[62]; 2854 step1[62] = step2[62]; 2855 step1[63] = step2[63]; 2856 2857 // stage 6 2858 2859 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); 2860 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); 2861 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); 2862 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); 2863 btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]); 2864 btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]); 2865 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); 2866 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); 2867 btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]); 2868 btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]); 2869 2870 step2[8] = step1[8]; 2871 step2[15] = step1[15]; 2872 step2[16] = step1[16]; 2873 step2[17] = step1[17]; 2874 step2[18] = step1[17]; 2875 step2[19] = step1[16]; 2876 step2[20] = step1[23]; 2877 step2[21] = step1[22]; 2878 step2[22] = step1[22]; 2879 step2[23] = step1[23]; 2880 step2[24] = step1[24]; 2881 step2[25] = step1[25]; 2882 step2[26] = step1[25]; 2883 step2[27] = step1[24]; 2884 step2[28] = step1[31]; 2885 step2[29] = step1[30]; 2886 step2[30] = step1[30]; 2887 step2[31] = step1[31]; 2888 step2[32] = step1[32]; 2889 step2[33] = step1[33]; 2890 step2[38] = step1[38]; 2891 step2[39] = step1[39]; 2892 step2[40] = step1[40]; 2893 step2[41] = step1[41]; 2894 step2[46] = step1[46]; 2895 step2[47] = step1[47]; 2896 step2[48] = step1[48]; 2897 step2[49] = step1[49]; 2898 step2[54] = step1[54]; 2899 step2[55] = step1[55]; 2900 step2[56] = step1[56]; 2901 step2[57] = step1[57]; 2902 step2[62] = step1[62]; 2903 step2[63] = step1[63]; 2904 2905 // stage 7 2906 2907 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); 2908 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); 2909 btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]); 2910 btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]); 2911 2912 step1[0] = step2[0]; 2913 step1[1] = step2[1]; 2914 step1[2] = step2[1]; 2915 step1[3] = step2[0]; 2916 step1[8] = step2[8]; 2917 step1[9] = step2[9]; 2918 step1[10] = step2[9]; 2919 step1[11] = step2[8]; 2920 step1[12] = step2[15]; 2921 step1[13] = step2[14]; 2922 step1[14] = step2[14]; 2923 step1[15] = step2[15]; 2924 step1[16] = step2[16]; 2925 step1[17] = step2[17]; 2926 step1[22] = step2[22]; 2927 step1[23] = step2[23]; 2928 step1[24] = step2[24]; 2929 step1[25] = step2[25]; 2930 step1[30] = step2[30]; 2931 step1[31] = step2[31]; 2932 step1[32] = vqaddq_s16(step2[32], step2[39]); 2933 step1[33] = vqaddq_s16(step2[33], step2[38]); 2934 step1[34] = vqaddq_s16(step2[34], step2[37]); 2935 step1[35] = vqaddq_s16(step2[35], step2[36]); 2936 step1[36] = vqsubq_s16(step2[35], step2[36]); 2937 step1[37] = vqsubq_s16(step2[34], step2[37]); 2938 step1[38] = vqsubq_s16(step2[33], step2[38]); 2939 step1[39] = vqsubq_s16(step2[32], step2[39]); 2940 step1[40] = vqsubq_s16(step2[47], step2[40]); 2941 step1[41] = vqsubq_s16(step2[46], step2[41]); 2942 step1[42] = vqsubq_s16(step2[45], step2[42]); 2943 step1[43] = vqsubq_s16(step2[44], step2[43]); 2944 step1[44] = vqaddq_s16(step2[43], step2[44]); 2945 step1[45] = vqaddq_s16(step2[42], step2[45]); 2946 step1[46] = vqaddq_s16(step2[41], step2[46]); 2947 step1[47] = vqaddq_s16(step2[40], step2[47]); 2948 step1[48] = vqaddq_s16(step2[48], step2[55]); 2949 step1[49] = vqaddq_s16(step2[49], step2[54]); 2950 step1[50] = vqaddq_s16(step2[50], step2[53]); 2951 step1[51] = vqaddq_s16(step2[51], step2[52]); 2952 step1[52] = vqsubq_s16(step2[51], step2[52]); 2953 step1[53] = vqsubq_s16(step2[50], step2[53]); 2954 step1[54] = vqsubq_s16(step2[49], step2[54]); 2955 step1[55] = vqsubq_s16(step2[48], step2[55]); 2956 step1[56] = vqsubq_s16(step2[63], step2[56]); 2957 step1[57] = vqsubq_s16(step2[62], step2[57]); 2958 step1[58] = vqsubq_s16(step2[61], step2[58]); 2959 step1[59] = vqsubq_s16(step2[60], step2[59]); 2960 step1[60] = vqaddq_s16(step2[59], step2[60]); 2961 step1[61] = vqaddq_s16(step2[58], step2[61]); 2962 step1[62] = vqaddq_s16(step2[57], step2[62]); 2963 step1[63] = vqaddq_s16(step2[56], step2[63]); 2964 2965 // stage 8 2966 2967 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); 2968 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); 2969 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); 2970 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); 2971 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); 2972 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); 2973 btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]); 2974 btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]); 2975 btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]); 2976 btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]); 2977 2978 step2[0] = step1[0]; 2979 step2[1] = step1[1]; 2980 step2[2] = step1[2]; 2981 step2[3] = step1[3]; 2982 step2[4] = step1[3]; 2983 step2[5] = step1[2]; 2984 step2[6] = step1[1]; 2985 step2[7] = step1[0]; 2986 step2[8] = step1[8]; 2987 step2[9] = step1[9]; 2988 step2[14] = step1[14]; 2989 step2[15] = step1[15]; 2990 step2[16] = vqaddq_s16(step1[16], step1[23]); 2991 step2[17] = vqaddq_s16(step1[17], step1[22]); 2992 step2[18] = vqaddq_s16(step1[18], step1[21]); 2993 step2[19] = vqaddq_s16(step1[19], step1[20]); 2994 step2[20] = vqsubq_s16(step1[19], step1[20]); 2995 step2[21] = vqsubq_s16(step1[18], step1[21]); 2996 step2[22] = vqsubq_s16(step1[17], step1[22]); 2997 step2[23] = vqsubq_s16(step1[16], step1[23]); 2998 step2[24] = vqsubq_s16(step1[31], step1[24]); 2999 step2[25] = vqsubq_s16(step1[30], step1[25]); 3000 step2[26] = vqsubq_s16(step1[29], step1[26]); 3001 step2[27] = vqsubq_s16(step1[28], step1[27]); 3002 step2[28] = vqaddq_s16(step1[28], step1[27]); 3003 step2[29] = vqaddq_s16(step1[29], step1[26]); 3004 step2[30] = vqaddq_s16(step1[30], step1[25]); 3005 step2[31] = vqaddq_s16(step1[31], step1[24]); 3006 step2[32] = step1[32]; 3007 step2[33] = step1[33]; 3008 step2[34] = step1[34]; 3009 step2[35] = step1[35]; 3010 step2[44] = step1[44]; 3011 step2[45] = step1[45]; 3012 step2[46] = step1[46]; 3013 step2[47] = step1[47]; 3014 step2[48] = step1[48]; 3015 step2[49] = step1[49]; 3016 step2[50] = step1[50]; 3017 step2[51] = step1[51]; 3018 step2[60] = step1[60]; 3019 step2[61] = step1[61]; 3020 step2[62] = step1[62]; 3021 step2[63] = step1[63]; 3022 3023 // stage 9 3024 idct64_stage9_neon(step2, step1, cos_bit); 3025 3026 // stage 10 3027 idct64_stage10_neon(step1, step2, cos_bit); 3028 3029 // stage 11 3030 3031 out[0] = vqaddq_s16(step2[0], step2[63]); 3032 out[1] = vqaddq_s16(step2[1], step2[62]); 3033 out[2] = vqaddq_s16(step2[2], step2[61]); 3034 out[3] = vqaddq_s16(step2[3], step2[60]); 3035 out[4] = vqaddq_s16(step2[4], step2[59]); 3036 out[5] = vqaddq_s16(step2[5], step2[58]); 3037 out[6] = vqaddq_s16(step2[6], step2[57]); 3038 out[7] = vqaddq_s16(step2[7], step2[56]); 3039 out[8] = vqaddq_s16(step2[8], step2[55]); 3040 out[9] = vqaddq_s16(step2[9], step2[54]); 3041 out[10] = vqaddq_s16(step2[10], step2[53]); 3042 out[11] = vqaddq_s16(step2[11], step2[52]); 3043 out[12] = vqaddq_s16(step2[12], step2[51]); 3044 out[13] = vqaddq_s16(step2[13], step2[50]); 3045 out[14] = vqaddq_s16(step2[14], step2[49]); 3046 out[15] = vqaddq_s16(step2[15], step2[48]); 3047 out[16] = vqaddq_s16(step2[16], step2[47]); 3048 out[17] = vqaddq_s16(step2[17], step2[46]); 3049 out[18] = vqaddq_s16(step2[18], step2[45]); 3050 out[19] = vqaddq_s16(step2[19], step2[44]); 3051 out[20] = vqaddq_s16(step2[20], step2[43]); 3052 out[21] = vqaddq_s16(step2[21], step2[42]); 3053 out[22] = vqaddq_s16(step2[22], step2[41]); 3054 out[23] = vqaddq_s16(step2[23], step2[40]); 3055 out[24] = vqaddq_s16(step2[24], step2[39]); 3056 out[25] = vqaddq_s16(step2[25], step2[38]); 3057 out[26] = vqaddq_s16(step2[26], step2[37]); 3058 out[27] = vqaddq_s16(step2[27], step2[36]); 3059 out[28] = vqaddq_s16(step2[28], step2[35]); 3060 out[29] = vqaddq_s16(step2[29], step2[34]); 3061 out[30] = vqaddq_s16(step2[30], step2[33]); 3062 out[31] = vqaddq_s16(step2[31], step2[32]); 3063 out[32] = vqsubq_s16(step2[31], step2[32]); 3064 out[33] = vqsubq_s16(step2[30], step2[33]); 3065 out[34] = vqsubq_s16(step2[29], step2[34]); 3066 out[35] = vqsubq_s16(step2[28], step2[35]); 3067 out[36] = vqsubq_s16(step2[27], step2[36]); 3068 out[37] = vqsubq_s16(step2[26], step2[37]); 3069 out[38] = vqsubq_s16(step2[25], step2[38]); 3070 out[39] = vqsubq_s16(step2[24], step2[39]); 3071 out[40] = vqsubq_s16(step2[23], step2[40]); 3072 out[41] = vqsubq_s16(step2[22], step2[41]); 3073 out[42] = vqsubq_s16(step2[21], step2[42]); 3074 out[43] = vqsubq_s16(step2[20], step2[43]); 3075 out[44] = vqsubq_s16(step2[19], step2[44]); 3076 out[45] = vqsubq_s16(step2[18], step2[45]); 3077 out[46] = vqsubq_s16(step2[17], step2[46]); 3078 out[47] = vqsubq_s16(step2[16], step2[47]); 3079 out[48] = vqsubq_s16(step2[15], step2[48]); 3080 out[49] = vqsubq_s16(step2[14], step2[49]); 3081 out[50] = vqsubq_s16(step2[13], step2[50]); 3082 out[51] = vqsubq_s16(step2[12], step2[51]); 3083 out[52] = vqsubq_s16(step2[11], step2[52]); 3084 out[53] = vqsubq_s16(step2[10], step2[53]); 3085 out[54] = vqsubq_s16(step2[9], step2[54]); 3086 out[55] = vqsubq_s16(step2[8], step2[55]); 3087 out[56] = vqsubq_s16(step2[7], step2[56]); 3088 out[57] = vqsubq_s16(step2[6], step2[57]); 3089 out[58] = vqsubq_s16(step2[5], step2[58]); 3090 out[59] = vqsubq_s16(step2[4], step2[59]); 3091 out[60] = vqsubq_s16(step2[3], step2[60]); 3092 out[61] = vqsubq_s16(step2[2], step2[61]); 3093 out[62] = vqsubq_s16(step2[1], step2[62]); 3094 out[63] = vqsubq_s16(step2[0], step2[63]); 3095 } 3096 3097 static inline void idct64_low16_neon(int16x8_t *in, int16x8_t *out, 3098 int8_t cos_bit) { 3099 const int32_t *cospi = cospi_arr(cos_bit); 3100 int16x8_t step2[64], step1[64]; 3101 3102 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60], 3103 (int16_t)cospi[36], (int16_t)cospi[28]); 3104 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44], 3105 (int16_t)cospi[52], (int16_t)cospi[12]); 3106 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56], 3107 (int16_t)cospi[40], (int16_t)cospi[24]); 3108 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32], 3109 (int16_t)cospi[16], (int16_t)cospi[48]); 3110 const int16x4_t c4 = 3111 set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]), 3112 (int16_t)(-cospi[36]), (int16_t)(-cospi[28])); 3113 const int16x4_t c5 = 3114 set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]), 3115 (int16_t)(-cospi[52]), (int16_t)(-cospi[12])); 3116 const int16x4_t c6 = 3117 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]), 3118 (int16_t)(-cospi[40]), (int16_t)(-cospi[24])); 3119 const int16x4_t c7 = 3120 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]), 3121 (int16_t)(-cospi[16]), (int16_t)(-cospi[48])); 3122 3123 // stage 1 3124 // stage 2 3125 3126 step2[0] = in[0]; 3127 step2[4] = in[8]; 3128 step2[8] = in[4]; 3129 step2[12] = in[12]; 3130 step2[16] = in[2]; 3131 step2[20] = in[10]; 3132 step2[24] = in[6]; 3133 step2[28] = in[14]; 3134 3135 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]); 3136 btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]); 3137 btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]); 3138 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]); 3139 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]); 3140 btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]); 3141 btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]); 3142 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]); 3143 3144 // stage 3 3145 3146 step1[0] = step2[0]; 3147 step1[4] = step2[4]; 3148 step1[8] = step2[8]; 3149 step1[12] = step2[12]; 3150 3151 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]); 3152 btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]); 3153 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]); 3154 btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]); 3155 3156 step1[32] = step2[32]; 3157 step1[33] = step2[32]; 3158 step1[34] = step2[35]; 3159 step1[35] = step2[35]; 3160 step1[36] = step2[36]; 3161 step1[37] = step2[36]; 3162 step1[38] = step2[39]; 3163 step1[39] = step2[39]; 3164 step1[40] = step2[40]; 3165 step1[41] = step2[40]; 3166 step1[42] = step2[43]; 3167 step1[43] = step2[43]; 3168 step1[44] = step2[44]; 3169 step1[45] = step2[44]; 3170 step1[46] = step2[47]; 3171 step1[47] = step2[47]; 3172 step1[48] = step2[48]; 3173 step1[49] = step2[48]; 3174 step1[50] = step2[51]; 3175 step1[51] = step2[51]; 3176 step1[52] = step2[52]; 3177 step1[53] = step2[52]; 3178 step1[54] = step2[55]; 3179 step1[55] = step2[55]; 3180 step1[56] = step2[56]; 3181 step1[57] = step2[56]; 3182 step1[58] = step2[59]; 3183 step1[59] = step2[59]; 3184 step1[60] = step2[60]; 3185 step1[61] = step2[60]; 3186 step1[62] = step2[63]; 3187 step1[63] = step2[63]; 3188 3189 // stage 4 3190 3191 step2[0] = step1[0]; 3192 step2[4] = step1[4]; 3193 3194 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]); 3195 btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]); 3196 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]); 3197 btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]); 3198 btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]); 3199 btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]); 3200 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]); 3201 btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]); 3202 btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]); 3203 btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]); 3204 3205 step2[16] = step1[16]; 3206 step2[17] = step1[16]; 3207 step2[18] = step1[19]; 3208 step2[19] = step1[19]; 3209 step2[20] = step1[20]; 3210 step2[21] = step1[20]; 3211 step2[22] = step1[23]; 3212 step2[23] = step1[23]; 3213 step2[24] = step1[24]; 3214 step2[25] = step1[24]; 3215 step2[26] = step1[27]; 3216 step2[27] = step1[27]; 3217 step2[28] = step1[28]; 3218 step2[29] = step1[28]; 3219 step2[30] = step1[31]; 3220 step2[31] = step1[31]; 3221 step2[32] = step1[32]; 3222 step2[35] = step1[35]; 3223 step2[36] = step1[36]; 3224 step2[39] = step1[39]; 3225 step2[40] = step1[40]; 3226 step2[43] = step1[43]; 3227 step2[44] = step1[44]; 3228 step2[47] = step1[47]; 3229 step2[48] = step1[48]; 3230 step2[51] = step1[51]; 3231 step2[52] = step1[52]; 3232 step2[55] = step1[55]; 3233 step2[56] = step1[56]; 3234 step2[59] = step1[59]; 3235 step2[60] = step1[60]; 3236 step2[63] = step1[63]; 3237 3238 // stage 5 3239 3240 step1[0] = step2[0]; 3241 3242 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]); 3243 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]); 3244 btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]); 3245 btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]); 3246 btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]); 3247 3248 step1[8] = step2[8]; 3249 step1[9] = step2[8]; 3250 step1[10] = step2[11]; 3251 step1[11] = step2[11]; 3252 step1[12] = step2[12]; 3253 step1[13] = step2[12]; 3254 step1[14] = step2[15]; 3255 step1[15] = step2[15]; 3256 step1[16] = step2[16]; 3257 step1[19] = step2[19]; 3258 step1[20] = step2[20]; 3259 step1[23] = step2[23]; 3260 step1[24] = step2[24]; 3261 step1[27] = step2[27]; 3262 step1[28] = step2[28]; 3263 step1[31] = step2[31]; 3264 step1[32] = vqaddq_s16(step2[32], step2[35]); 3265 step1[33] = vqaddq_s16(step2[33], step2[34]); 3266 step1[34] = vqsubq_s16(step2[33], step2[34]); 3267 step1[35] = vqsubq_s16(step2[32], step2[35]); 3268 step1[36] = vqsubq_s16(step2[39], step2[36]); 3269 step1[37] = vqsubq_s16(step2[38], step2[37]); 3270 step1[38] = vqaddq_s16(step2[38], step2[37]); 3271 step1[39] = vqaddq_s16(step2[39], step2[36]); 3272 step1[40] = vqaddq_s16(step2[40], step2[43]); 3273 step1[41] = vqaddq_s16(step2[41], step2[42]); 3274 step1[42] = vqsubq_s16(step2[41], step2[42]); 3275 step1[43] = vqsubq_s16(step2[40], step2[43]); 3276 step1[44] = vqsubq_s16(step2[47], step2[44]); 3277 step1[45] = vqsubq_s16(step2[46], step2[45]); 3278 step1[46] = vqaddq_s16(step2[46], step2[45]); 3279 step1[47] = vqaddq_s16(step2[47], step2[44]); 3280 step1[48] = vqaddq_s16(step2[48], step2[51]); 3281 step1[49] = vqaddq_s16(step2[49], step2[50]); 3282 step1[50] = vqsubq_s16(step2[49], step2[50]); 3283 step1[51] = vqsubq_s16(step2[48], step2[51]); 3284 step1[52] = vqsubq_s16(step2[55], step2[52]); 3285 step1[53] = vqsubq_s16(step2[54], step2[53]); 3286 step1[54] = vqaddq_s16(step2[54], step2[53]); 3287 step1[55] = vqaddq_s16(step2[55], step2[52]); 3288 step1[56] = vqaddq_s16(step2[56], step2[59]); 3289 step1[57] = vqaddq_s16(step2[57], step2[58]); 3290 step1[58] = vqsubq_s16(step2[57], step2[58]); 3291 step1[59] = vqsubq_s16(step2[56], step2[59]); 3292 step1[60] = vqsubq_s16(step2[63], step2[60]); 3293 step1[61] = vqsubq_s16(step2[62], step2[61]); 3294 step1[62] = vqaddq_s16(step2[62], step2[61]); 3295 step1[63] = vqaddq_s16(step2[63], step2[60]); 3296 3297 // stage 6 3298 3299 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]); 3300 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]); 3301 btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]); 3302 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]); 3303 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]); 3304 btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]); 3305 btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]); 3306 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]); 3307 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]); 3308 btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]); 3309 btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]); 3310 3311 step2[4] = step1[4]; 3312 step2[5] = step1[4]; 3313 step2[6] = step1[7]; 3314 step2[7] = step1[7]; 3315 step2[8] = step1[8]; 3316 step2[11] = step1[11]; 3317 step2[12] = step1[12]; 3318 step2[15] = step1[15]; 3319 step2[16] = vqaddq_s16(step1[16], step1[19]); 3320 step2[17] = vqaddq_s16(step1[17], step1[18]); 3321 step2[18] = vqsubq_s16(step1[17], step1[18]); 3322 step2[19] = vqsubq_s16(step1[16], step1[19]); 3323 step2[20] = vqsubq_s16(step1[23], step1[20]); 3324 step2[21] = vqsubq_s16(step1[22], step1[21]); 3325 step2[22] = vqaddq_s16(step1[22], step1[21]); 3326 step2[23] = vqaddq_s16(step1[23], step1[20]); 3327 step2[24] = vqaddq_s16(step1[24], step1[27]); 3328 step2[25] = vqaddq_s16(step1[25], step1[26]); 3329 step2[26] = vqsubq_s16(step1[25], step1[26]); 3330 step2[27] = vqsubq_s16(step1[24], step1[27]); 3331 step2[28] = vqsubq_s16(step1[31], step1[28]); 3332 step2[29] = vqsubq_s16(step1[30], step1[29]); 3333 step2[30] = vqaddq_s16(step1[30], step1[29]); 3334 step2[31] = vqaddq_s16(step1[31], step1[28]); 3335 step2[32] = step1[32]; 3336 step2[33] = step1[33]; 3337 step2[38] = step1[38]; 3338 step2[39] = step1[39]; 3339 step2[40] = step1[40]; 3340 step2[41] = step1[41]; 3341 step2[46] = step1[46]; 3342 step2[47] = step1[47]; 3343 step2[48] = step1[48]; 3344 step2[49] = step1[49]; 3345 step2[54] = step1[54]; 3346 step2[55] = step1[55]; 3347 step2[56] = step1[56]; 3348 step2[57] = step1[57]; 3349 step2[62] = step1[62]; 3350 step2[63] = step1[63]; 3351 3352 // stage 7 3353 3354 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]); 3355 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]); 3356 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]); 3357 btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]); 3358 btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]); 3359 3360 step1[0] = step2[0]; 3361 step1[1] = step2[1]; 3362 step1[2] = step2[1]; 3363 step1[3] = step2[0]; 3364 step1[4] = step2[4]; 3365 step1[7] = step2[7]; 3366 step1[8] = vqaddq_s16(step2[8], step2[11]); 3367 step1[9] = vqaddq_s16(step2[9], step2[10]); 3368 step1[10] = vqsubq_s16(step2[9], step2[10]); 3369 step1[11] = vqsubq_s16(step2[8], step2[11]); 3370 step1[12] = vqsubq_s16(step2[15], step2[12]); 3371 step1[13] = vqsubq_s16(step2[14], step2[13]); 3372 step1[14] = vqaddq_s16(step2[14], step2[13]); 3373 step1[15] = vqaddq_s16(step2[15], step2[12]); 3374 step1[16] = step2[16]; 3375 step1[17] = step2[17]; 3376 step1[22] = step2[22]; 3377 step1[23] = step2[23]; 3378 step1[24] = step2[24]; 3379 step1[25] = step2[25]; 3380 step1[30] = step2[30]; 3381 step1[31] = step2[31]; 3382 step1[32] = vqaddq_s16(step2[32], step2[39]); 3383 step1[33] = vqaddq_s16(step2[33], step2[38]); 3384 step1[34] = vqaddq_s16(step2[34], step2[37]); 3385 step1[35] = vqaddq_s16(step2[35], step2[36]); 3386 step1[36] = vqsubq_s16(step2[35], step2[36]); 3387 step1[37] = vqsubq_s16(step2[34], step2[37]); 3388 step1[38] = vqsubq_s16(step2[33], step2[38]); 3389 step1[39] = vqsubq_s16(step2[32], step2[39]); 3390 step1[40] = vqsubq_s16(step2[47], step2[40]); 3391 step1[41] = vqsubq_s16(step2[46], step2[41]); 3392 step1[42] = vqsubq_s16(step2[45], step2[42]); 3393 step1[43] = vqsubq_s16(step2[44], step2[43]); 3394 step1[44] = vqaddq_s16(step2[43], step2[44]); 3395 step1[45] = vqaddq_s16(step2[42], step2[45]); 3396 step1[46] = vqaddq_s16(step2[41], step2[46]); 3397 step1[47] = vqaddq_s16(step2[40], step2[47]); 3398 step1[48] = vqaddq_s16(step2[48], step2[55]); 3399 step1[49] = vqaddq_s16(step2[49], step2[54]); 3400 step1[50] = vqaddq_s16(step2[50], step2[53]); 3401 step1[51] = vqaddq_s16(step2[51], step2[52]); 3402 step1[52] = vqsubq_s16(step2[51], step2[52]); 3403 step1[53] = vqsubq_s16(step2[50], step2[53]); 3404 step1[54] = vqsubq_s16(step2[49], step2[54]); 3405 step1[55] = vqsubq_s16(step2[48], step2[55]); 3406 step1[56] = vqsubq_s16(step2[63], step2[56]); 3407 step1[57] = vqsubq_s16(step2[62], step2[57]); 3408 step1[58] = vqsubq_s16(step2[61], step2[58]); 3409 step1[59] = vqsubq_s16(step2[60], step2[59]); 3410 step1[60] = vqaddq_s16(step2[59], step2[60]); 3411 step1[61] = vqaddq_s16(step2[58], step2[61]); 3412 step1[62] = vqaddq_s16(step2[57], step2[62]); 3413 step1[63] = vqaddq_s16(step2[56], step2[63]); 3414 3415 // stage 8 3416 3417 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]); 3418 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]); 3419 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]); 3420 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]); 3421 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]); 3422 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]); 3423 btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]); 3424 btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]); 3425 btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]); 3426 btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]); 3427 3428 step2[0] = vqaddq_s16(step1[0], step1[7]); 3429 step2[1] = vqaddq_s16(step1[1], step1[6]); 3430 step2[2] = vqaddq_s16(step1[2], step1[5]); 3431 step2[3] = vqaddq_s16(step1[3], step1[4]); 3432 step2[4] = vqsubq_s16(step1[3], step1[4]); 3433 step2[5] = vqsubq_s16(step1[2], step1[5]); 3434 step2[6] = vqsubq_s16(step1[1], step1[6]); 3435 step2[7] = vqsubq_s16(step1[0], step1[7]); 3436 step2[8] = step1[8]; 3437 step2[9] = step1[9]; 3438 step2[14] = step1[14]; 3439 step2[15] = step1[15]; 3440 step2[16] = vqaddq_s16(step1[16], step1[23]); 3441 step2[17] = vqaddq_s16(step1[17], step1[22]); 3442 step2[18] = vqaddq_s16(step1[18], step1[21]); 3443 step2[19] = vqaddq_s16(step1[19], step1[20]); 3444 step2[20] = vqsubq_s16(step1[19], step1[20]); 3445 step2[21] = vqsubq_s16(step1[18], step1[21]); 3446 step2[22] = vqsubq_s16(step1[17], step1[22]); 3447 step2[23] = vqsubq_s16(step1[16], step1[23]); 3448 step2[24] = vqsubq_s16(step1[31], step1[24]); 3449 step2[25] = vqsubq_s16(step1[30], step1[25]); 3450 step2[26] = vqsubq_s16(step1[29], step1[26]); 3451 step2[27] = vqsubq_s16(step1[28], step1[27]); 3452 step2[28] = vqaddq_s16(step1[28], step1[27]); 3453 step2[29] = vqaddq_s16(step1[29], step1[26]); 3454 step2[30] = vqaddq_s16(step1[30], step1[25]); 3455 step2[31] = vqaddq_s16(step1[31], step1[24]); 3456 step2[32] = step1[32]; 3457 step2[33] = step1[33]; 3458 step2[34] = step1[34]; 3459 step2[35] = step1[35]; 3460 step2[44] = step1[44]; 3461 step2[45] = step1[45]; 3462 step2[46] = step1[46]; 3463 step2[47] = step1[47]; 3464 step2[48] = step1[48]; 3465 step2[49] = step1[49]; 3466 step2[50] = step1[50]; 3467 step2[51] = step1[51]; 3468 step2[60] = step1[60]; 3469 step2[61] = step1[61]; 3470 step2[62] = step1[62]; 3471 step2[63] = step1[63]; 3472 3473 // stage 9 3474 idct64_stage9_neon(step2, step1, cos_bit); 3475 3476 // stage 10 3477 idct64_stage10_neon(step1, step2, cos_bit); 3478 3479 // stage 11 3480 3481 out[0] = vqaddq_s16(step2[0], step2[63]); 3482 out[1] = vqaddq_s16(step2[1], step2[62]); 3483 out[2] = vqaddq_s16(step2[2], step2[61]); 3484 out[3] = vqaddq_s16(step2[3], step2[60]); 3485 out[4] = vqaddq_s16(step2[4], step2[59]); 3486 out[5] = vqaddq_s16(step2[5], step2[58]); 3487 out[6] = vqaddq_s16(step2[6], step2[57]); 3488 out[7] = vqaddq_s16(step2[7], step2[56]); 3489 out[8] = vqaddq_s16(step2[8], step2[55]); 3490 out[9] = vqaddq_s16(step2[9], step2[54]); 3491 out[10] = vqaddq_s16(step2[10], step2[53]); 3492 out[11] = vqaddq_s16(step2[11], step2[52]); 3493 out[12] = vqaddq_s16(step2[12], step2[51]); 3494 out[13] = vqaddq_s16(step2[13], step2[50]); 3495 out[14] = vqaddq_s16(step2[14], step2[49]); 3496 out[15] = vqaddq_s16(step2[15], step2[48]); 3497 out[16] = vqaddq_s16(step2[16], step2[47]); 3498 out[17] = vqaddq_s16(step2[17], step2[46]); 3499 out[18] = vqaddq_s16(step2[18], step2[45]); 3500 out[19] = vqaddq_s16(step2[19], step2[44]); 3501 out[20] = vqaddq_s16(step2[20], step2[43]); 3502 out[21] = vqaddq_s16(step2[21], step2[42]); 3503 out[22] = vqaddq_s16(step2[22], step2[41]); 3504 out[23] = vqaddq_s16(step2[23], step2[40]); 3505 out[24] = vqaddq_s16(step2[24], step2[39]); 3506 out[25] = vqaddq_s16(step2[25], step2[38]); 3507 out[26] = vqaddq_s16(step2[26], step2[37]); 3508 out[27] = vqaddq_s16(step2[27], step2[36]); 3509 out[28] = vqaddq_s16(step2[28], step2[35]); 3510 out[29] = vqaddq_s16(step2[29], step2[34]); 3511 out[30] = vqaddq_s16(step2[30], step2[33]); 3512 out[31] = vqaddq_s16(step2[31], step2[32]); 3513 out[32] = vqsubq_s16(step2[31], step2[32]); 3514 out[33] = vqsubq_s16(step2[30], step2[33]); 3515 out[34] = vqsubq_s16(step2[29], step2[34]); 3516 out[35] = vqsubq_s16(step2[28], step2[35]); 3517 out[36] = vqsubq_s16(step2[27], step2[36]); 3518 out[37] = vqsubq_s16(step2[26], step2[37]); 3519 out[38] = vqsubq_s16(step2[25], step2[38]); 3520 out[39] = vqsubq_s16(step2[24], step2[39]); 3521 out[40] = vqsubq_s16(step2[23], step2[40]); 3522 out[41] = vqsubq_s16(step2[22], step2[41]); 3523 out[42] = vqsubq_s16(step2[21], step2[42]); 3524 out[43] = vqsubq_s16(step2[20], step2[43]); 3525 out[44] = vqsubq_s16(step2[19], step2[44]); 3526 out[45] = vqsubq_s16(step2[18], step2[45]); 3527 out[46] = vqsubq_s16(step2[17], step2[46]); 3528 out[47] = vqsubq_s16(step2[16], step2[47]); 3529 out[48] = vqsubq_s16(step2[15], step2[48]); 3530 out[49] = vqsubq_s16(step2[14], step2[49]); 3531 out[50] = vqsubq_s16(step2[13], step2[50]); 3532 out[51] = vqsubq_s16(step2[12], step2[51]); 3533 out[52] = vqsubq_s16(step2[11], step2[52]); 3534 out[53] = vqsubq_s16(step2[10], step2[53]); 3535 out[54] = vqsubq_s16(step2[9], step2[54]); 3536 out[55] = vqsubq_s16(step2[8], step2[55]); 3537 out[56] = vqsubq_s16(step2[7], step2[56]); 3538 out[57] = vqsubq_s16(step2[6], step2[57]); 3539 out[58] = vqsubq_s16(step2[5], step2[58]); 3540 out[59] = vqsubq_s16(step2[4], step2[59]); 3541 out[60] = vqsubq_s16(step2[3], step2[60]); 3542 out[61] = vqsubq_s16(step2[2], step2[61]); 3543 out[62] = vqsubq_s16(step2[1], step2[62]); 3544 out[63] = vqsubq_s16(step2[0], step2[63]); 3545 } 3546 3547 // Functions for blocks with eob at DC and within 3548 // topleft 8x8, 16x16, 32x32 corner 3549 static const transform_neon 3550 lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = { 3551 { 3552 { NULL, NULL, NULL, NULL }, 3553 { NULL, NULL, NULL, NULL }, 3554 { NULL, NULL, NULL, NULL }, 3555 }, 3556 { { idct8_low1_neon, idct8_neon, NULL, NULL }, 3557 { iadst8_low1_neon, iadst8_neon, NULL, NULL }, 3558 { NULL, NULL, NULL, NULL } }, 3559 { 3560 { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL }, 3561 { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL }, 3562 { NULL, NULL, NULL, NULL }, 3563 }, 3564 { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon }, 3565 { NULL, NULL, NULL, NULL }, 3566 { NULL, NULL, NULL, NULL } }, 3567 { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon, 3568 idct64_low32_neon }, 3569 { NULL, NULL, NULL, NULL }, 3570 { NULL, NULL, NULL, NULL } } 3571 }; 3572 3573 static inline void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input, 3574 uint8_t *output, int stride, 3575 TX_TYPE tx_type, 3576 TX_SIZE tx_size, int eob) { 3577 (void)tx_type; 3578 int16x8_t a[32 * 4]; 3579 int16x8_t b[32 * 4]; 3580 int eobx, eoby; 3581 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); 3582 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 3583 const int txw_idx = get_txw_idx(tx_size); 3584 const int txh_idx = get_txh_idx(tx_size); 3585 const int txfm_size_col = tx_size_wide[tx_size]; 3586 const int txfm_size_row = tx_size_high[tx_size]; 3587 lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), 3588 0); 3589 lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), 3590 0); 3591 const int buf_size_w_div8 = txfm_size_col >> 3; 3592 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); 3593 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; 3594 const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; 3595 const int input_stride = txfm_size_row; 3596 int temp_b = 0; 3597 3598 for (int i = 0; i < buf_size_nonzero_h_div8; i++) { 3599 int16x8_t *cur_a = &a[i * txfm_size_col]; 3600 load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, 3601 buf_size_nonzero_w); 3602 input += 8; 3603 if (abs(rect_type) == 1) { 3604 round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); 3605 } 3606 identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w, 3607 -shift[0]); 3608 for (int j = 0; j < buf_size_w_div8; ++j) { 3609 transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); 3610 } 3611 temp_b += 8; 3612 } 3613 for (int j = 0; j < buf_size_w_div8; ++j) { 3614 identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], 3615 txh_idx, txfm_size_row, -shift[1]); 3616 } 3617 if (txfm_size_col >= 16) { 3618 for (int i = 0; i < (txfm_size_col >> 4); i++) { 3619 lowbd_add_flip_buffer_16xn_neon( 3620 &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); 3621 } 3622 } else if (txfm_size_col == 8) { 3623 lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); 3624 } 3625 } 3626 3627 static inline void lowbd_inv_txfm2d_add_v_identity_neon( 3628 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, 3629 TX_SIZE tx_size, int eob) { 3630 int16x8_t a[16 * 2]; 3631 int16x8_t b[16 * 2]; 3632 int eobx, eoby, ud_flip, lr_flip; 3633 get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob); 3634 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 3635 const int txw_idx = get_txw_idx(tx_size); 3636 const int txh_idx = get_txh_idx(tx_size); 3637 const int txfm_size_col = tx_size_wide[tx_size]; 3638 const int txfm_size_row = tx_size_high[tx_size]; 3639 lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3), 3640 0); 3641 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); 3642 const int buf_size_w_div8 = txfm_size_col >> 3; 3643 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; 3644 const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; 3645 const int input_stride = txfm_size_row; 3646 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; 3647 int temp_b = 0; 3648 const transform_neon row_txfm = 3649 lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; 3650 3651 assert(row_txfm != NULL); 3652 3653 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 3654 3655 for (int i = 0; i < buf_size_nonzero_h_div8; i++) { 3656 int16x8_t *cur_a = &a[i * txfm_size_col]; 3657 load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, 3658 buf_size_nonzero_w); 3659 input += 8; 3660 if (abs(rect_type) == 1) { 3661 round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); 3662 } 3663 row_txfm(cur_a, cur_a, INV_COS_BIT); 3664 round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); 3665 if (lr_flip == 1) { 3666 for (int j = 0; j < buf_size_w_div8; ++j) { 3667 flip_buf_ud_neon(&cur_a[j * 8], 8); 3668 transpose_arrays_s16_8x8( 3669 &cur_a[j * 8], 3670 &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); 3671 } 3672 temp_b += 8; 3673 } else { 3674 for (int j = 0; j < buf_size_w_div8; ++j) { 3675 transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); 3676 } 3677 temp_b += 8; 3678 } 3679 } 3680 for (int j = 0; j < buf_size_w_div8; ++j) { 3681 identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row], 3682 txh_idx, txfm_size_row, -shift[1]); 3683 } 3684 if (txfm_size_col >= 16) { 3685 for (int i = 0; i < (txfm_size_col >> 4); i++) { 3686 lowbd_add_flip_buffer_16xn_neon( 3687 &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row); 3688 } 3689 } else if (txfm_size_col == 8) { 3690 lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row); 3691 } 3692 } 3693 3694 static inline void lowbd_inv_txfm2d_add_h_identity_neon( 3695 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, 3696 TX_SIZE tx_size, int eob) { 3697 int16x8_t a[16 * 2]; 3698 int16x8_t b[16 * 2]; 3699 int eobx, eoby, ud_flip, lr_flip; 3700 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob); 3701 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 3702 const int txw_idx = get_txw_idx(tx_size); 3703 const int txh_idx = get_txh_idx(tx_size); 3704 const int txfm_size_col = tx_size_wide[tx_size]; 3705 const int txfm_size_row = tx_size_high[tx_size]; 3706 lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3), 3707 0); 3708 const int buf_size_w_div8 = txfm_size_col >> 3; 3709 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); 3710 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; 3711 const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; 3712 const int input_stride = txfm_size_row; 3713 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; 3714 int temp_b = 0; 3715 const transform_neon col_txfm = 3716 lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; 3717 3718 assert(col_txfm != NULL); 3719 3720 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 3721 3722 for (int i = 0; i < buf_size_nonzero_h_div8; i++) { 3723 int16x8_t *cur_a = &a[i * txfm_size_col]; 3724 load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, 3725 buf_size_nonzero_w); 3726 input += 8; 3727 if (abs(rect_type) == 1) { 3728 round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); 3729 } 3730 identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w, 3731 -shift[0]); 3732 for (int j = 0; j < buf_size_w_div8; ++j) { 3733 transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); 3734 } 3735 temp_b += 8; 3736 } 3737 for (int j = 0; j < buf_size_w_div8; ++j) { 3738 col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); 3739 round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]); 3740 } 3741 if (txfm_size_col >= 16) { 3742 for (int i = 0; i < (txfm_size_col >> 4); i++) { 3743 lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], 3744 output + 16 * i, stride, ud_flip, 3745 txfm_size_row); 3746 } 3747 } else if (txfm_size_col == 8) { 3748 lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); 3749 } 3750 } 3751 3752 static inline void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input, 3753 uint8_t *output, int stride, 3754 TX_TYPE tx_type, int eob) { 3755 (void)eob; 3756 TX_SIZE tx_size = TX_4X4; 3757 DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]); 3758 int32_t *temp_in = txfm_buf; 3759 3760 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 3761 const int txw_idx = get_txw_idx(tx_size); 3762 const int txh_idx = get_txh_idx(tx_size); 3763 const int txfm_size_col = tx_size_wide[tx_size]; 3764 const int txfm_size_row = tx_size_high[tx_size]; 3765 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); 3766 int32_t *temp_out = temp_in + buf_offset; 3767 int32_t *buf = temp_out + buf_offset; 3768 int32_t *buf_ptr = buf; 3769 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 }; 3770 int r; 3771 const transform_1d_neon row_txfm = 3772 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; 3773 const transform_1d_neon col_txfm = 3774 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; 3775 3776 int ud_flip, lr_flip; 3777 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 3778 3779 for (int i = 0; i < txfm_size_row; i++) { 3780 for (int c = 0; c < txfm_size_col; ++c) 3781 temp_in[c] = input[c * txfm_size_row]; 3782 row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); 3783 3784 input++; 3785 buf_ptr += txfm_size_col; 3786 } 3787 3788 for (int c = 0; c < txfm_size_col; ++c) { 3789 if (lr_flip == 0) { 3790 for (r = 0; r < txfm_size_row; ++r) 3791 temp_in[r] = buf[r * txfm_size_col + c]; 3792 } else { 3793 // flip left right 3794 for (r = 0; r < txfm_size_row; ++r) 3795 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; 3796 } 3797 clamp_buf(temp_in, txfm_size_row, 16); 3798 col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); 3799 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); 3800 3801 if (ud_flip == 0) { 3802 for (r = 0; r < txfm_size_row; ++r) { 3803 output[r * stride + c] = 3804 clip_pixel(output[r * stride + c] + temp_out[r]); 3805 } 3806 } else { 3807 // flip upside down 3808 for (r = 0; r < txfm_size_row; ++r) { 3809 output[r * stride + c] = clip_pixel(output[r * stride + c] + 3810 temp_out[txfm_size_row - r - 1]); 3811 } 3812 } 3813 } 3814 } 3815 3816 static void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output, 3817 int stride, TX_TYPE tx_type, 3818 int eob) { 3819 (void)eob; 3820 TX_SIZE tx_size = TX_4X8; 3821 DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]); 3822 int32_t *temp_in = txfm_buf; 3823 3824 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 3825 const int txw_idx = get_txw_idx(tx_size); 3826 const int txh_idx = get_txh_idx(tx_size); 3827 const int txfm_size_col = tx_size_wide[tx_size]; 3828 const int txfm_size_row = tx_size_high[tx_size]; 3829 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); 3830 int32_t *temp_out = temp_in + buf_offset; 3831 int32_t *buf = temp_out + buf_offset; 3832 int32_t *buf_ptr = buf; 3833 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 3834 16, 16, 16, 16 }; 3835 int r; 3836 const transform_1d_neon row_txfm = 3837 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; 3838 const transform_1d_neon col_txfm = 3839 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; 3840 3841 int ud_flip, lr_flip; 3842 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 3843 3844 for (int i = 0; i < txfm_size_row; i++) { 3845 for (int c = 0; c < txfm_size_col; c++) 3846 temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2, 3847 NewSqrt2Bits); 3848 3849 row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); 3850 input++; 3851 buf_ptr += txfm_size_col; 3852 } 3853 3854 for (int c = 0; c < txfm_size_col; ++c) { 3855 if (lr_flip == 0) { 3856 for (r = 0; r < txfm_size_row; ++r) 3857 temp_in[r] = buf[r * txfm_size_col + c]; 3858 } else { 3859 // flip left right 3860 for (r = 0; r < txfm_size_row; ++r) 3861 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; 3862 } 3863 clamp_buf(temp_in, txfm_size_row, 16); 3864 col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); 3865 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); 3866 3867 if (ud_flip == 0) { 3868 for (r = 0; r < txfm_size_row; ++r) { 3869 output[r * stride + c] = 3870 clip_pixel(output[r * stride + c] + temp_out[r]); 3871 } 3872 } else { 3873 // flip upside down 3874 for (r = 0; r < txfm_size_row; ++r) { 3875 output[r * stride + c] = clip_pixel(output[r * stride + c] + 3876 temp_out[txfm_size_row - r - 1]); 3877 } 3878 } 3879 } 3880 } 3881 3882 static void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output, 3883 int stride, TX_TYPE tx_type, 3884 int eob) { 3885 (void)eob; 3886 TX_SIZE tx_size = TX_8X4; 3887 DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]); 3888 int32_t *temp_in = txfm_buf; 3889 3890 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 3891 const int txw_idx = get_txw_idx(tx_size); 3892 const int txh_idx = get_txh_idx(tx_size); 3893 const int txfm_size_col = tx_size_wide[tx_size]; 3894 const int txfm_size_row = tx_size_high[tx_size]; 3895 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); 3896 int32_t *temp_out = temp_in + buf_offset; 3897 int32_t *buf = temp_out + buf_offset; 3898 int32_t *buf_ptr = buf; 3899 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 3900 16, 16, 16, 16 }; 3901 int r; 3902 const transform_1d_neon row_txfm = 3903 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; 3904 const transform_1d_neon col_txfm = 3905 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; 3906 3907 int ud_flip, lr_flip; 3908 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 3909 3910 for (int i = 0; i < txfm_size_row; i++) { 3911 for (int c = 0; c < txfm_size_col; c++) 3912 temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2, 3913 NewSqrt2Bits); 3914 3915 row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); 3916 input++; 3917 buf_ptr += txfm_size_col; 3918 } 3919 3920 for (int c = 0; c < txfm_size_col; ++c) { 3921 if (lr_flip == 0) { 3922 for (r = 0; r < txfm_size_row; ++r) 3923 temp_in[r] = buf[r * txfm_size_col + c]; 3924 } else { 3925 // flip left right 3926 for (r = 0; r < txfm_size_row; ++r) 3927 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; 3928 } 3929 clamp_buf(temp_in, txfm_size_row, 16); 3930 col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); 3931 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); 3932 3933 if (ud_flip == 0) { 3934 for (r = 0; r < txfm_size_row; ++r) { 3935 output[r * stride + c] = 3936 clip_pixel(output[r * stride + c] + temp_out[r]); 3937 } 3938 } else { 3939 // flip upside down 3940 for (r = 0; r < txfm_size_row; ++r) { 3941 output[r * stride + c] = clip_pixel(output[r * stride + c] + 3942 temp_out[txfm_size_row - r - 1]); 3943 } 3944 } 3945 } 3946 } 3947 3948 static void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, 3949 uint8_t *output, int stride, 3950 TX_TYPE tx_type, int eob) { 3951 (void)eob; 3952 TX_SIZE tx_size = TX_4X16; 3953 DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]); 3954 int32_t *temp_in = txfm_buf; 3955 3956 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 3957 const int txw_idx = get_txw_idx(tx_size); 3958 const int txh_idx = get_txh_idx(tx_size); 3959 const int txfm_size_col = tx_size_wide[tx_size]; 3960 const int txfm_size_row = tx_size_high[tx_size]; 3961 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); 3962 int32_t *temp_out = temp_in + buf_offset; 3963 int32_t *buf = temp_out + buf_offset; 3964 int32_t *buf_ptr = buf; 3965 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 3966 16, 16, 16, 16, 16 }; 3967 int r; 3968 const transform_1d_neon row_txfm = 3969 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; 3970 const transform_1d_neon col_txfm = 3971 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; 3972 3973 int ud_flip, lr_flip; 3974 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 3975 3976 for (int i = 0; i < txfm_size_row; i++) { 3977 for (int c = 0; c < txfm_size_col; c++) 3978 temp_in[c] = input[c * txfm_size_row]; 3979 row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); 3980 av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); 3981 input++; 3982 buf_ptr += txfm_size_col; 3983 } 3984 3985 for (int c = 0; c < txfm_size_col; ++c) { 3986 if (lr_flip == 0) { 3987 for (r = 0; r < txfm_size_row; ++r) 3988 temp_in[r] = buf[r * txfm_size_col + c]; 3989 } else { 3990 // flip left right 3991 for (r = 0; r < txfm_size_row; ++r) 3992 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; 3993 } 3994 clamp_buf(temp_in, txfm_size_row, 16); 3995 col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); 3996 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); 3997 3998 if (ud_flip == 0) { 3999 for (r = 0; r < txfm_size_row; ++r) { 4000 output[r * stride + c] = 4001 clip_pixel(output[r * stride + c] + temp_out[r]); 4002 } 4003 } else { 4004 // flip upside down 4005 for (r = 0; r < txfm_size_row; ++r) { 4006 output[r * stride + c] = clip_pixel(output[r * stride + c] + 4007 temp_out[txfm_size_row - r - 1]); 4008 } 4009 } 4010 } 4011 } 4012 4013 static void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, 4014 uint8_t *output, int stride, 4015 TX_TYPE tx_type, int eob) { 4016 (void)eob; 4017 TX_SIZE tx_size = TX_16X4; 4018 DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]); 4019 int32_t *temp_in = txfm_buf; 4020 4021 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 4022 const int txw_idx = get_txw_idx(tx_size); 4023 const int txh_idx = get_txh_idx(tx_size); 4024 const int txfm_size_col = tx_size_wide[tx_size]; 4025 const int txfm_size_row = tx_size_high[tx_size]; 4026 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col); 4027 int32_t *temp_out = temp_in + buf_offset; 4028 int32_t *buf = temp_out + buf_offset; 4029 int32_t *buf_ptr = buf; 4030 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 4031 16, 16, 16, 16, 16 }; 4032 int r; 4033 const transform_1d_neon row_txfm = 4034 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]]; 4035 const transform_1d_neon col_txfm = 4036 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]]; 4037 4038 int ud_flip, lr_flip; 4039 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 4040 4041 for (int i = 0; i < txfm_size_row; i++) { 4042 for (int c = 0; c < txfm_size_col; c++) 4043 temp_in[c] = input[c * txfm_size_row]; 4044 row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range); 4045 av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]); 4046 input++; 4047 buf_ptr += txfm_size_col; 4048 } 4049 4050 for (int c = 0; c < txfm_size_col; ++c) { 4051 if (lr_flip == 0) { 4052 for (r = 0; r < txfm_size_row; ++r) 4053 temp_in[r] = buf[r * txfm_size_col + c]; 4054 } else { 4055 // flip left right 4056 for (r = 0; r < txfm_size_row; ++r) 4057 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)]; 4058 } 4059 clamp_buf(temp_in, txfm_size_row, 16); 4060 col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range); 4061 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]); 4062 4063 if (ud_flip == 0) { 4064 for (r = 0; r < txfm_size_row; ++r) { 4065 output[r * stride + c] = 4066 clip_pixel(output[r * stride + c] + temp_out[r]); 4067 } 4068 } else { 4069 // flip upside down 4070 for (r = 0; r < txfm_size_row; ++r) { 4071 output[r * stride + c] = clip_pixel(output[r * stride + c] + 4072 temp_out[txfm_size_row - r - 1]); 4073 } 4074 } 4075 } 4076 } 4077 4078 static inline void lowbd_inv_txfm2d_add_no_identity_neon( 4079 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, 4080 TX_SIZE tx_size, int eob) { 4081 int16x8_t a[64 * 8]; 4082 int16x8_t b[64 * 8]; 4083 int eobx, eoby, ud_flip, lr_flip; 4084 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob); 4085 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size]; 4086 const int txw_idx = get_txw_idx(tx_size); 4087 const int txh_idx = get_txh_idx(tx_size); 4088 const int txfm_size_col = tx_size_wide[tx_size]; 4089 const int txfm_size_row = tx_size_high[tx_size]; 4090 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row); 4091 const int buf_size_w_div8 = txfm_size_col >> 3; 4092 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3; 4093 const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3; 4094 const int input_stride = AOMMIN(32, txfm_size_row); 4095 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx]; 4096 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby]; 4097 int temp_b = 0; 4098 4099 const transform_neon row_txfm = 4100 lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x]; 4101 const transform_neon col_txfm = 4102 lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y]; 4103 4104 assert(col_txfm != NULL); 4105 assert(row_txfm != NULL); 4106 4107 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 4108 4109 for (int i = 0; i < buf_size_nonzero_h_div8; i++) { 4110 int16x8_t *cur_a = &a[i * txfm_size_col]; 4111 load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a, 4112 buf_size_nonzero_w); 4113 input += 8; 4114 if (abs(rect_type) == 1) { 4115 round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w); 4116 } 4117 row_txfm(cur_a, cur_a, INV_COS_BIT); 4118 round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]); 4119 if (lr_flip == 1) { 4120 for (int j = 0; j < buf_size_w_div8; ++j) { 4121 flip_buf_ud_neon(&cur_a[j * 8], 8); 4122 transpose_arrays_s16_8x8( 4123 &cur_a[j * 8], 4124 &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]); 4125 } 4126 temp_b += 8; 4127 } else { 4128 for (int j = 0; j < buf_size_w_div8; ++j) { 4129 transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]); 4130 } 4131 temp_b += 8; 4132 } 4133 } 4134 for (int j = 0; j < buf_size_w_div8; ++j) { 4135 col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT); 4136 round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row, -shift[1]); 4137 } 4138 4139 if (txfm_size_col >= 16) { 4140 for (int i = 0; i < (txfm_size_col >> 4); i++) { 4141 lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2], 4142 output + 16 * i, stride, ud_flip, 4143 txfm_size_row); 4144 } 4145 } else if (txfm_size_col == 8) { 4146 lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row); 4147 } 4148 } 4149 4150 static inline void lowbd_inv_txfm2d_add_universe_neon( 4151 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type, 4152 TX_SIZE tx_size, int eob) { 4153 switch (tx_type) { 4154 case IDTX: 4155 lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size, 4156 eob); 4157 break; 4158 4159 case H_DCT: 4160 case H_ADST: 4161 case H_FLIPADST: 4162 lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type, 4163 tx_size, eob); 4164 break; 4165 4166 case V_DCT: 4167 case V_ADST: 4168 case V_FLIPADST: 4169 lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type, 4170 tx_size, eob); 4171 break; 4172 4173 default: 4174 lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type, 4175 tx_size, eob); 4176 break; 4177 } 4178 } 4179 4180 // This function is used by av1_inv_txfm2d_test.cc. 4181 void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, 4182 int stride, TX_TYPE tx_type, TX_SIZE tx_size, 4183 int eob); 4184 4185 void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output, 4186 int stride, TX_TYPE tx_type, TX_SIZE tx_size, 4187 int eob) { 4188 switch (tx_size) { 4189 case TX_4X4: 4190 lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob); 4191 break; 4192 4193 case TX_4X8: 4194 lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob); 4195 break; 4196 4197 case TX_8X4: 4198 lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob); 4199 break; 4200 4201 case TX_4X16: 4202 lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob); 4203 break; 4204 4205 case TX_16X4: 4206 lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob); 4207 break; 4208 4209 default: 4210 lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type, 4211 tx_size, eob); 4212 break; 4213 } 4214 } 4215 void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride, 4216 const TxfmParam *txfm_param) { 4217 const TX_TYPE tx_type = txfm_param->tx_type; 4218 if (!txfm_param->lossless) { 4219 av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type, 4220 txfm_param->tx_size, txfm_param->eob); 4221 } else { 4222 av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param); 4223 } 4224 }