av1_fwd_txfm2d_neon.c (127018B)
1 /* 2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 #include <assert.h> 14 15 #include "aom_dsp/arm/mem_neon.h" 16 #include "aom_dsp/arm/transpose_neon.h" 17 #include "aom_dsp/txfm_common.h" 18 #include "aom_ports/mem.h" 19 #include "av1/common/av1_txfm.h" 20 #include "av1/encoder/av1_fwd_txfm1d_cfg.h" 21 #include "config/aom_config.h" 22 #include "config/av1_rtcd.h" 23 #include "shift_neon.h" 24 #include "txfm_neon.h" 25 26 #define TXFM_COS_BIT_MAX 13 27 28 // A note on butterfly helper naming: 29 // 30 // butterfly_[input_ty]_[acc_ty]_[input_num]_[weight_num]_[weight_neg]_neon 31 // e.g. butterfly_s32_s32_x4_0231_neon 32 // | | | ^ Weights are applied as indices 0, 2, 3, 1 33 // | | | (see more detail below) 34 // | | ^ (int32)x4 input/output parameters 35 // | ^ 32-bit accumulators internally 36 // ^ 32-bit input/output parameters 37 // 38 // Weights are stored as 4-tuples in Q2.13 format as (w0, 1-w0, -w0, w0-1) to 39 // avoid needing separate negation instructions. 
// This is represented in the helper naming by referring to the lane index in
// the loaded tuple that each multiply is performed with:
//
//          in0   in1
//      /----------
// out0 |  w0    w1   ==> out0 = in0 * w0 + in1 * w1
// out1 |  w2    w3   ==> out1 = in0 * w2 + in1 * w3
//
// So for indices 0231 from the earlier example, we end up with:
//
//           in0       in1
//      /------------------
// out0 | (lane 0) (lane 2)   ==> out0 = in0 *  w0    + in1 * -w0
// out1 | (lane 3) (lane 1)   ==> out1 = in0 * (w0-1) + in1 * (1-w0)

// Butterfly on 32-bit inputs with 32-bit accumulation, weight lanes
// (0, 1 / 1, 2); results are rounded down by TXFM_COS_BIT_MAX.
static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0112_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  // Widen the Q2.13 weight tuple once; lanes are selected per multiply.
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 1);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

// As above but with weight lanes (0, 3 / 3, 2).
static AOM_FORCE_INLINE void butterfly_s32_s32_x4_0332_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 1);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 1);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 0);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

// As above but with weight lanes (1, 0 / 0, 3).
static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1003_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
  o0 = vmlaq_lane_s32(o0, in1, vget_low_s32(w0101), 0);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_low_s32(w0101), 0);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

// As above but with weight lanes (1, 2 / 2, 3).
static AOM_FORCE_INLINE void butterfly_s32_s32_x4_1223_neon(
    const int16x4_t w0101_s16, const int32x4_t in0, const int32x4_t in1,
    int32x4_t *out0, int32x4_t *out1) {
  int32x4_t w0101 = vmovl_s16(w0101_s16);
  int32x4_t o0 = vmulq_lane_s32(in0, vget_low_s32(w0101), 1);
  o0 = vmlaq_lane_s32(o0, in1, vget_high_s32(w0101), 0);
  int32x4_t o1 = vmulq_lane_s32(in0, vget_high_s32(w0101), 0);
  o1 = vmlaq_lane_s32(o1, in1, vget_high_s32(w0101), 1);
  *out0 = vrshrq_n_s32(o0, TXFM_COS_BIT_MAX);
  *out1 = vrshrq_n_s32(o1, TXFM_COS_BIT_MAX);
}

// Butterfly on 16-bit inputs with 32-bit accumulation; the parameterized
// lane indices select which of the 4 weight lanes feed each multiply.
// Note the saturating narrow (vqrshrn) back to 16 bits here, unlike the
// non-saturating narrow used by the x8 variant below.
#define butterfly_s16_s32_x4_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
                                  out0, out1)                                 \
  do {                                                                        \
    int32x4_t u0 = vmull_lane_s16(in0, wvec, lane0);                          \
    u0 = vmlal_lane_s16(u0, in1, wvec, lane1);                                \
    int32x4_t v0 = vmull_lane_s16(in0, wvec, lane2);                          \
    v0 = vmlal_lane_s16(v0, in1, wvec, lane3);                                \
    *out0 = vqrshrn_n_s32(u0, TXFM_COS_BIT_MAX);                              \
    *out1 = vqrshrn_n_s32(v0, TXFM_COS_BIT_MAX);                              \
  } while (0)

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0112_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_0332_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1003_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
}
// Four-lane 16-bit butterfly with weight lanes (1, 2 / 2, 3).
static AOM_FORCE_INLINE void butterfly_s16_s32_x4_1223_neon(
    const int16x4_t w0101, const int16x4_t in0, const int16x4_t in1,
    int16x4_t *out0, int16x4_t *out1) {
  butterfly_s16_s32_x4_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
}

// As butterfly_s16_s32_x4_neon but for 8-lane vectors: each input is split
// into low/high halves, multiply-accumulated against the selected weight
// lanes, rounded back down by TXFM_COS_BIT_MAX and recombined. Uses a
// non-saturating narrow (vrshrn), unlike the x4 variant.
#define butterfly_s16_s32_x8_neon(wvec, lane0, lane1, lane2, lane3, in0, in1, \
                                  out0, out1)                                 \
  do {                                                                        \
    int32x4_t sum0_lo = vmull_lane_s16(vget_low_s16(in0), wvec, lane0);       \
    sum0_lo = vmlal_lane_s16(sum0_lo, vget_low_s16(in1), wvec, lane1);        \
    int32x4_t sum0_hi = vmull_lane_s16(vget_high_s16(in0), wvec, lane0);      \
    sum0_hi = vmlal_lane_s16(sum0_hi, vget_high_s16(in1), wvec, lane1);       \
    int32x4_t sum1_lo = vmull_lane_s16(vget_low_s16(in0), wvec, lane2);       \
    sum1_lo = vmlal_lane_s16(sum1_lo, vget_low_s16(in1), wvec, lane3);        \
    int32x4_t sum1_hi = vmull_lane_s16(vget_high_s16(in0), wvec, lane2);      \
    sum1_hi = vmlal_lane_s16(sum1_hi, vget_high_s16(in1), wvec, lane3);       \
    const int16x4_t res0_lo = vrshrn_n_s32(sum0_lo, TXFM_COS_BIT_MAX);        \
    const int16x4_t res0_hi = vrshrn_n_s32(sum0_hi, TXFM_COS_BIT_MAX);        \
    const int16x4_t res1_lo = vrshrn_n_s32(sum1_lo, TXFM_COS_BIT_MAX);        \
    const int16x4_t res1_hi = vrshrn_n_s32(sum1_hi, TXFM_COS_BIT_MAX);        \
    *out0 = vcombine_s16(res0_lo, res0_hi);                                   \
    *out1 = vcombine_s16(res1_lo, res1_hi);                                   \
  } while (0)

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0112_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 0, 1, 1, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_0332_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 0, 3, 3, 2, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1003_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 1, 0, 0, 3, in0, in1, out0, out1);
}

static AOM_FORCE_INLINE void butterfly_s16_s32_x8_1223_neon(
    const int16x4_t w0101, const int16x8_t in0, const int16x8_t in1,
    int16x8_t *out0, int16x8_t *out1) {
  butterfly_s16_s32_x8_neon(w0101, 1, 2, 2, 3, in0, in1, out0, out1);
}

// Copy `size` vectors from in[] to out[] in reverse order.
static AOM_FORCE_INLINE void flip_buf_4_neon(int16x4_t *in, int16x4_t *out,
                                             int size) {
  for (int j = size - 1; j >= 0; --j) {
    out[j] = in[size - j - 1];
  }
}

// Copy `size` vectors from in[] to out[] in reverse order.
static AOM_FORCE_INLINE void flip_buf_8_neon(int16x8_t *in, int16x8_t *out,
                                             int size) {
  for (int j = size - 1; j >= 0; --j) {
    out[j] = in[size - j - 1];
  }
}

// Store each row as 4 lanes from in1[] followed by 4 lanes from in2[].
static AOM_FORCE_INLINE void store_buffer_interleaved_s32_x8(
    int32_t *const out, const int32x4_t *const in1, const int32x4_t *const in2,
    const int stride, const int out_size) {
  int32_t *dst = out;
  for (int r = 0; r < out_size; ++r) {
    vst1q_s32(dst, in1[r]);
    vst1q_s32(dst + 4, in2[r]);
    dst += stride;
  }
}

// Gather `out_size` rows of 4 coefficients, one row per stride step.
static AOM_FORCE_INLINE void load_buffer_s16_x4(const int16_t *in,
                                                const int stride,
                                                int16x4_t *const out,
                                                const int out_size) {
  for (int r = 0; r < out_size; ++r) {
    out[r] = vld1_s16(in + r * stride);
  }
}

// Gather `out_size` rows of 8 coefficients, one row per stride step.
static AOM_FORCE_INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
                                                int16x8_t *out, int out_size) {
  for (int r = 0; r < out_size; ++r) {
    out[r] = vld1q_s16(in);
    in += stride;
  }
}

// Widen each 4-lane 16-bit vector to 32 bits and store one per row.
static AOM_FORCE_INLINE void store_buffer_s16_x4(const int16x4_t *const in,
                                                 int32_t *const out,
                                                 const int stride,
                                                 const int out_size) {
  for (int r = 0; r < out_size; ++r) {
    vst1q_s32(out + r * stride, vmovl_s16(in[r]));
  }
}

// Widen each 8-lane 16-bit vector to 32 bits (low then high half) and store
// one per row.
static AOM_FORCE_INLINE void store_buffer_s16_x8(const int16x8_t *const in,
                                                 int32_t *const out,
                                                 const int stride,
                                                 const int out_size) {
  for (int r = 0; r < out_size; ++r) {
    const int16x8_t v = in[r];
    vst1q_s32(out + r * stride + 0, vmovl_s16(vget_low_s16(v)));
    vst1q_s32(out + r * stride + 4, vmovl_s16(vget_high_s16(v)));
  }
}

// A note on naming:
//   round_shift_[sqrt2]_s16_s32_4x1_neon(...)
//   |           |     |    |  ^ "1" => a single vector, "n" => vector array
//   |           |     |    ^ input/output vector element count
//   |           |     ^ output element type
//   |           ^ input element type
//   ^ multiplicand and shift identifier

// a * sqrt(2) with rounding, saturating-narrowed back to 16 bits.
static AOM_FORCE_INLINE int16x4_t
round_shift_sqrt2_s16_s16_4x1_neon(int16x4_t a) {
  return vqrshrn_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
}

static AOM_FORCE_INLINE int16x8_t
round_shift_sqrt2_s16_s16_8x1_neon(int16x8_t a) {
  const int16x4_t lo = round_shift_sqrt2_s16_s16_4x1_neon(vget_low_s16(a));
  const int16x4_t hi = round_shift_sqrt2_s16_s16_4x1_neon(vget_high_s16(a));
  return vcombine_s16(lo, hi);
}

// a * 2*sqrt(2) with rounding, saturating-narrowed back to 16 bits.
static AOM_FORCE_INLINE int16x4_t
round_shift_2sqrt2_s16_s16_4x1_neon(int16x4_t a) {
  return vqrshrn_n_s32(vmull_n_s16(a, 2 * NewSqrt2), NewSqrt2Bits);
}

static AOM_FORCE_INLINE int16x8_t
round_shift_2sqrt2_s16_s16_8x1_neon(int16x8_t a) {
  const int16x4_t lo = round_shift_2sqrt2_s16_s16_4x1_neon(vget_low_s16(a));
  const int16x4_t hi = round_shift_2sqrt2_s16_s16_4x1_neon(vget_high_s16(a));
  return vcombine_s16(lo, hi);
}

// a * sqrt(2) with rounding, keeping the widened 32-bit result.
static AOM_FORCE_INLINE int32x4_t
round_shift_sqrt2_s16_s32_4x1_neon(int16x4_t a) {
  return vrshrq_n_s32(vmull_n_s16(a, NewSqrt2), NewSqrt2Bits);
}

// a * sqrt(2) with rounding on 32-bit input/output.
static AOM_FORCE_INLINE int32x4_t
round_shift_sqrt2_s32_s32_4x1_neon(int32x4_t a) {
  return vrshrq_n_s32(vmulq_n_s32(a, NewSqrt2), NewSqrt2Bits);
}

// Generates a wrapper applying `fn` element-wise over a vector array.
#define ROUND_SHIFT_SQRT_LOOP_HELPER(name, type0, type1, fn)                 \
  static AOM_FORCE_INLINE void name(const type0 *in, type1 *out, int size) { \
    for (int j = 0; j < size; ++j) {                                         \
      out[j] = fn(in[j]);                                                    \
    }                                                                        \
  }

ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s32_s32_4xn_neon, int32x4_t,
                             int32x4_t, round_shift_sqrt2_s32_s32_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_4xn_neon, int16x4_t,
                             int16x4_t, round_shift_sqrt2_s16_s16_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_sqrt2_s16_s16_8xn_neon, int16x8_t,
                             int16x8_t, round_shift_sqrt2_s16_s16_8x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_4xn_neon, int16x4_t,
                             int16x4_t, round_shift_2sqrt2_s16_s16_4x1_neon)
ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
                             int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)

// Store rows scaled by sqrt(2) (rectangular-transform scaling), widening the
// 16-bit coefficients to 32 bits on store.
static AOM_FORCE_INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
                                                      int32_t *const out,
                                                      const int stride,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i]));
  }
}

// 8-wide variant of store_rect_buffer_s16_x4: low then high half per row.
static AOM_FORCE_INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
                                                      int32_t *const out,
                                                      const int stride,
                                                      const int out_size) {
  for (int i = 0; i < out_size; ++i) {
    vst1q_s32(out + i * stride + 0,
              round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i])));
    vst1q_s32(out + i * stride + 4,
              round_shift_sqrt2_s16_s32_4x1_neon(vget_high_s16(in[i])));
  }
}

// 4-point forward ADST across four 4-lane columns, using the Q2.13 sinpi
// weight table. v[] holds widened partial products; u[] the four results
// before the final rounding narrow.
static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  int32x4_t u[6], v[6];
  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
  const int16x4_t u01 = vqadd_s16(input[0], input[1]);

  v[5] = vmull_lane_s16(input[2], sinpi, 2);
  v[0] = vmull_lane_s16(input[1], sinpi, 1);
  v[0] = vmlal_lane_s16(v[0], input[0], sinpi, 0);
  v[1] = vmlal_lane_s16(v[5], input[3], sinpi, 3);
  v[2] = vmull_lane_s16(u01, sinpi, 2);
  v[3] = vmull_lane_s16(input[0], sinpi, 3);
  v[3] = vmlsl_lane_s16(v[3], input[1], sinpi, 0);
  v[4] = vmlsl_lane_s16(v[5], input[3], sinpi, 1);

  u[0] = vaddq_s32(v[0], v[1]);
  u[1] = vmlsl_lane_s16(v[2], input[3], sinpi, 2);
  u[2] = vsubq_s32(v[3], v[4]);
  u[3] = vsubq_s32(u[2], u[0]);
  // Add 3 * (input[2] * sinpi[2]) back in via the shared v[5] product.
  u[3] = vmlaq_n_s32(u[3], v[5], 3);

  output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
  output[1] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
  output[2] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
  output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
}

// 8-point forward ADST across 4-lane columns, built as butterfly stages over
// the Q2.13 cospi table (see stage comments).
static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);

  // stage 1-2
  int16x4_t x2[8];
  butterfly_s16_s32_x4_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]);
  butterfly_s16_s32_x4_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]);

  // stage 3
  int16x4_t x3[8];
  x3[0] = vqadd_s16(input[0], x2[2]);
  x3[1] = vqsub_s16(x2[3], input[7]);
  x3[2] = vqsub_s16(input[0], x2[2]);
  x3[3] = vqadd_s16(input[7], x2[3]);
  x3[4] = vqsub_s16(x2[6], input[1]);
  x3[5] = vqadd_s16(input[6], x2[7]);
  x3[6] = vqadd_s16(input[1], x2[6]);
  x3[7] = vqsub_s16(input[6], x2[7]);

  // stage 4
  int16x4_t x4[8];
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x4[4], &x4[5]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x4[6], &x4[7]);

  // stage 5
  int16x4_t x5[8];
  x5[0] = vqadd_s16(x3[0], x4[4]);
  x5[1] = vqadd_s16(x3[1], x4[5]);
  x5[2] = vqadd_s16(x3[2], x4[6]);
  x5[3] = vqsub_s16(x4[7], x3[3]);
  x5[4] = vqsub_s16(x3[0], x4[4]);
  x5[5] = vqsub_s16(x3[1], x4[5]);
  x5[6] = vqsub_s16(x3[2], x4[6]);
  x5[7] = vqadd_s16(x3[3], x4[7]);

  // stage 6-7
  butterfly_s16_s32_x4_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
  butterfly_s16_s32_x4_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
  butterfly_s16_s32_x4_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
  butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
}

// 4-point forward ADST across eight 16-bit lanes; the low and high halves of
// each input vector are accumulated separately (u_lo/u_hi) and recombined on
// output.
static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  int32x4_t u_lo[4], u_hi[4];
  const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
  const int16x8_t u01 = vqaddq_s16(input[0], input[1]);

  u_lo[0] = vmull_lane_s16(vget_low_s16(input[1]), sinpi, 1);
  u_hi[0] = vmull_lane_s16(vget_high_s16(input[1]), sinpi, 1);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[0]), sinpi, 0);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[0]), sinpi, 0);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[3]), sinpi, 3);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[3]), sinpi, 3);

  u_lo[0] = vmlal_lane_s16(u_lo[0], vget_low_s16(input[2]), sinpi, 2);
  u_hi[0] = vmlal_lane_s16(u_hi[0], vget_high_s16(input[2]), sinpi, 2);

  u_lo[1] = vmull_lane_s16(vget_low_s16(u01), sinpi, 2);
  u_hi[1] = vmull_lane_s16(vget_high_s16(u01), sinpi, 2);

  u_lo[2] = vmull_lane_s16(vget_low_s16(input[0]), sinpi, 3);
  u_hi[2] = vmull_lane_s16(vget_high_s16(input[0]), sinpi, 3);

  u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[1]), sinpi, 0);
  u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[1]), sinpi, 0);

  u_lo[2] = vmlal_lane_s16(u_lo[2], vget_low_s16(input[3]), sinpi, 1);
  u_hi[2] = vmlal_lane_s16(u_hi[2], vget_high_s16(input[3]), sinpi, 1);

  u_lo[2] = vmlsl_lane_s16(u_lo[2], vget_low_s16(input[2]), sinpi, 2);
  u_hi[2] = vmlsl_lane_s16(u_hi[2], vget_high_s16(input[2]), sinpi, 2);

  u_lo[1] = vmlsl_lane_s16(u_lo[1], vget_low_s16(input[3]), sinpi, 2);
  u_hi[1] = vmlsl_lane_s16(u_hi[1], vget_high_s16(input[3]), sinpi, 2);

  u_lo[3] = vsubq_s32(u_lo[2], u_lo[0]);
  u_hi[3] = vsubq_s32(u_hi[2], u_hi[0]);

  // 3 * sinpi[2] weight for the final accumulation into u[3].
  const int16x4_t sinpix3 = vmul_n_s16(sinpi, 3);
  u_lo[3] = vmlal_lane_s16(u_lo[3], vget_low_s16(input[2]), sinpix3, 2);
  u_hi[3] = vmlal_lane_s16(u_hi[3], vget_high_s16(input[2]), sinpix3, 2);

  output[0] = vcombine_s16(vrshrn_n_s32(u_lo[0], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[0], TXFM_COS_BIT_MAX));
  output[1] = vcombine_s16(vrshrn_n_s32(u_lo[1], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[1], TXFM_COS_BIT_MAX));
  output[2] = vcombine_s16(vrshrn_n_s32(u_lo[2], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[2], TXFM_COS_BIT_MAX));
  output[3] = vcombine_s16(vrshrn_n_s32(u_lo[3], TXFM_COS_BIT_MAX),
                           vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX));
}

// 4-point forward DCT across four 4-lane columns. Note the non-saturating
// adds/subs here (vadd/vsub) in contrast to the saturating stages elsewhere.
static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
                                          int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);
  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]);

  int16x4_t in12a = vadd_s16(input[1], input[2]);
  int16x4_t in12s = vsub_s16(input[1], input[2]);
  int16x4_t in03a = vadd_s16(input[0], input[3]);
  int16x4_t in03s = vsub_s16(input[0], input[3]);

  // cospi[4 * 0] is the cospi32 (DC) weight.
  int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]);
  int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]);

  int32x4_t u[4];
  u[0] = vaddq_s32(u0ad1, u0ad2);
  u[1] = vsubq_s32(u0ad2, u0ad1);
  u[2] = vmull_lane_s16(in12s, cospi16, 1);
  u[2] = vmlal_lane_s16(u[2], in03s, cospi16, 0);
  u[3] = vmull_lane_s16(in03s, cospi16, 1);
  u[3] = vmlsl_lane_s16(u[3], in12s, cospi16, 0);

  output[0] = vrshrn_n_s32(u[0], TXFM_COS_BIT_MAX);
  output[1] = vrshrn_n_s32(u[2], TXFM_COS_BIT_MAX);
  output[2] = vrshrn_n_s32(u[1], TXFM_COS_BIT_MAX);
  output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
}

// Butterfly pre-processing:
// e.g.
n=4: 483 // out[0] = in[0] + in[3] 484 // out[1] = in[1] + in[2] 485 // out[2] = in[1] - in[2] 486 // out[3] = in[0] - in[3] 487 488 static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x4(const int16x4_t *input, 489 int16x4_t *output, 490 int n) { 491 for (int i = 0; i < n / 2; ++i) { 492 output[i] = vqadd_s16(input[i], input[n - i - 1]); 493 } 494 for (int i = 0; i < n / 2; ++i) { 495 output[n / 2 + i] = vqsub_s16(input[n / 2 - i - 1], input[n / 2 + i]); 496 } 497 } 498 499 static AOM_FORCE_INLINE void butterfly_dct_pre_s16_x8(const int16x8_t *input, 500 int16x8_t *output, 501 int n) { 502 for (int i = 0; i < n / 2; ++i) { 503 output[i] = vqaddq_s16(input[i], input[n - i - 1]); 504 } 505 for (int i = 0; i < n / 2; ++i) { 506 output[n / 2 + i] = vqsubq_s16(input[n / 2 - i - 1], input[n / 2 + i]); 507 } 508 } 509 510 static AOM_FORCE_INLINE void butterfly_dct_pre_s32_x4(const int32x4_t *input, 511 int32x4_t *output, 512 int n) { 513 for (int i = 0; i < n / 2; ++i) { 514 output[i] = vqaddq_s32(input[i], input[n - i - 1]); 515 } 516 for (int i = 0; i < n / 2; ++i) { 517 output[n / 2 + i] = vqsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]); 518 } 519 } 520 521 // Butterfly post-processing: 522 // e.g. 
n=8: 523 // out[0] = in0[0] + in1[3]; 524 // out[1] = in0[1] + in1[2]; 525 // out[2] = in0[1] - in1[2]; 526 // out[3] = in0[0] - in1[3]; 527 // out[4] = in0[7] - in1[4]; 528 // out[5] = in0[6] - in1[5]; 529 // out[6] = in0[6] + in1[5]; 530 // out[7] = in0[7] + in1[4]; 531 532 static AOM_FORCE_INLINE void butterfly_dct_post_s16_x4(const int16x4_t *in0, 533 const int16x4_t *in1, 534 int16x4_t *output, 535 int n) { 536 for (int i = 0; i < n / 4; ++i) { 537 output[i] = vqadd_s16(in0[i], in1[n / 2 - i - 1]); 538 } 539 for (int i = 0; i < n / 4; ++i) { 540 output[n / 4 + i] = vqsub_s16(in0[n / 4 - i - 1], in1[n / 4 + i]); 541 } 542 for (int i = 0; i < n / 4; ++i) { 543 output[n / 2 + i] = vqsub_s16(in0[n - i - 1], in1[n / 2 + i]); 544 } 545 for (int i = 0; i < n / 4; ++i) { 546 output[(3 * n) / 4 + i] = 547 vqadd_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); 548 } 549 } 550 551 static AOM_FORCE_INLINE void butterfly_dct_post_s16_x8(const int16x8_t *in0, 552 const int16x8_t *in1, 553 int16x8_t *output, 554 int n) { 555 for (int i = 0; i < n / 4; ++i) { 556 output[i] = vqaddq_s16(in0[i], in1[n / 2 - i - 1]); 557 } 558 for (int i = 0; i < n / 4; ++i) { 559 output[n / 4 + i] = vqsubq_s16(in0[n / 4 - i - 1], in1[n / 4 + i]); 560 } 561 for (int i = 0; i < n / 4; ++i) { 562 output[n / 2 + i] = vqsubq_s16(in0[n - i - 1], in1[n / 2 + i]); 563 } 564 for (int i = 0; i < n / 4; ++i) { 565 output[(3 * n) / 4 + i] = 566 vqaddq_s16(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); 567 } 568 } 569 570 static AOM_FORCE_INLINE void butterfly_dct_post_s32_x4(const int32x4_t *in0, 571 const int32x4_t *in1, 572 int32x4_t *output, 573 int n) { 574 for (int i = 0; i < n / 4; ++i) { 575 output[i] = vqaddq_s32(in0[i], in1[n / 2 - i - 1]); 576 } 577 for (int i = 0; i < n / 4; ++i) { 578 output[n / 4 + i] = vqsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]); 579 } 580 for (int i = 0; i < n / 4; ++i) { 581 output[n / 2 + i] = vqsubq_s32(in0[n - i - 1], in1[n / 2 + i]); 582 } 583 for (int i = 0; i 
< n / 4; ++i) { 584 output[(3 * n) / 4 + i] = 585 vqaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); 586 } 587 } 588 589 static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input, 590 int16x8_t *output, int cos_bit) { 591 const int16_t *cospi = cospi_arr_q13(cos_bit); 592 593 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); 594 595 const int16x4_t cospi32 = vget_low_s16(cospi32_16); 596 const int16x4_t cospi16 = vget_high_s16(cospi32_16); 597 598 // stage 1 599 int16x8_t x1[4]; 600 butterfly_dct_pre_s16_x8(input, x1, 4); 601 602 // stage 2 603 int16x8_t x2[4]; 604 butterfly_s16_s32_x8_0112_neon(cospi32, x1[0], x1[1], &x2[0], &x2[1]); 605 butterfly_s16_s32_x8_0112_neon(cospi16, x1[3], x1[2], &x2[2], &x2[3]); 606 607 // stage 3 608 output[0] = x2[0]; 609 output[1] = x2[2]; 610 output[2] = x2[1]; 611 output[3] = x2[3]; 612 } 613 614 static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input, 615 int16x4_t *output, int cos_bit) { 616 const int16_t *cospi = cospi_arr_q13(cos_bit); 617 618 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); 619 const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); 620 621 const int16x4_t cospi32 = vget_low_s16(cospi32_16); 622 const int16x4_t cospi16 = vget_high_s16(cospi32_16); 623 const int16x4_t cospi8 = vget_low_s16(cospi8_24); 624 const int16x4_t cospi24 = vget_high_s16(cospi8_24); 625 626 // stage 1 627 int16x4_t x1[8]; 628 butterfly_dct_pre_s16_x4(input, x1, 8); 629 630 // stage 2 631 int16x4_t x2[8]; 632 butterfly_dct_pre_s16_x4(x1, x2, 4); 633 butterfly_s16_s32_x4_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]); 634 635 // stage 3 636 int16x4_t x3[8]; 637 butterfly_s16_s32_x4_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]); 638 butterfly_s16_s32_x4_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]); 639 butterfly_dct_post_s16_x4(x1 + 4, x2 + 4, x3 + 4, 4); 640 641 // stage 4-5 642 butterfly_s16_s32_x4_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]); 643 
butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]); 644 } 645 646 static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input, 647 int16x8_t *output, int cos_bit) { 648 const int16_t *cospi = cospi_arr_q13(cos_bit); 649 650 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); 651 const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); 652 653 const int16x4_t cospi32 = vget_low_s16(cospi32_16); 654 const int16x4_t cospi16 = vget_high_s16(cospi32_16); 655 const int16x4_t cospi8 = vget_low_s16(cospi8_24); 656 const int16x4_t cospi24 = vget_high_s16(cospi8_24); 657 658 // stage 1 659 int16x8_t x1[8]; 660 butterfly_dct_pre_s16_x8(input, x1, 8); 661 662 // stage 2 663 int16x8_t x2[8]; 664 butterfly_dct_pre_s16_x8(x1, x2, 4); 665 butterfly_s16_s32_x8_0112_neon(cospi32, x1[6], x1[5], &x2[6], &x2[5]); 666 667 // stage 3 668 int16x8_t x3[8]; 669 butterfly_s16_s32_x8_0112_neon(cospi32, x2[0], x2[1], &output[0], &output[4]); 670 butterfly_s16_s32_x8_0112_neon(cospi16, x2[3], x2[2], &output[2], &output[6]); 671 butterfly_dct_post_s16_x8(x1 + 4, x2 + 4, x3 + 4, 4); 672 673 // stage 4-5 674 butterfly_s16_s32_x8_0112_neon(cospi8, x3[7], x3[4], &output[1], &output[7]); 675 butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]); 676 } 677 678 static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input, 679 int16x4_t *output, int cos_bit) { 680 const int16_t *cospi = cospi_arr_q13(cos_bit); 681 682 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); 683 const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); 684 const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); 685 const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); 686 687 const int16x4_t cospi32 = vget_low_s16(cospi32_16); 688 const int16x4_t cospi16 = vget_high_s16(cospi32_16); 689 const int16x4_t cospi8 = vget_low_s16(cospi8_24); 690 const int16x4_t cospi24 = vget_high_s16(cospi8_24); 691 const int16x4_t cospi4 = vget_low_s16(cospi4_12); 692 const int16x4_t 
cospi12 = vget_high_s16(cospi4_12); 693 const int16x4_t cospi20 = vget_low_s16(cospi20_28); 694 const int16x4_t cospi28 = vget_high_s16(cospi20_28); 695 696 // stage 1 697 int16x4_t x1[16]; 698 butterfly_dct_pre_s16_x4(input, x1, 16); 699 700 // stage 2 701 int16x4_t x2[16]; 702 butterfly_dct_pre_s16_x4(x1, x2, 8); 703 butterfly_s16_s32_x4_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]); 704 butterfly_s16_s32_x4_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]); 705 706 // stage 3 707 int16x4_t x3[16]; 708 butterfly_dct_pre_s16_x4(x2, x3, 4); 709 butterfly_s16_s32_x4_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]); 710 butterfly_dct_post_s16_x4(x1 + 8, x2 + 8, x3 + 8, 8); 711 712 // stage 4 713 int16x4_t x4[16]; 714 butterfly_s16_s32_x4_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]); 715 butterfly_s16_s32_x4_0112_neon(cospi16, x3[3], x3[2], &output[4], 716 &output[12]); 717 butterfly_dct_post_s16_x4(x2 + 4, x3 + 4, x4 + 4, 4); 718 butterfly_s16_s32_x4_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]); 719 butterfly_s16_s32_x4_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]); 720 721 // stage 5 722 int16x4_t x5[16]; 723 butterfly_s16_s32_x4_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]); 724 butterfly_s16_s32_x4_1003_neon(cospi24, x4[6], x4[5], &output[10], 725 &output[6]); 726 butterfly_dct_post_s16_x4(x3 + 8, x4 + 8, x5 + 8, 4); 727 butterfly_dct_post_s16_x4(x3 + 12, x4 + 12, x5 + 12, 4); 728 729 // stage 6-7 730 butterfly_s16_s32_x4_0112_neon(cospi4, x5[15], x5[8], &output[1], 731 &output[15]); 732 butterfly_s16_s32_x4_1003_neon(cospi28, x5[14], x5[9], &output[9], 733 &output[7]); 734 butterfly_s16_s32_x4_0112_neon(cospi20, x5[13], x5[10], &output[5], 735 &output[11]); 736 butterfly_s16_s32_x4_1003_neon(cospi12, x5[12], x5[11], &output[13], 737 &output[3]); 738 } 739 740 static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input, 741 int16x8_t *output, int cos_bit) { 742 const int16_t *cospi = cospi_arr_q13(cos_bit); 743 
744 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); 745 const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); 746 const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); 747 const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); 748 749 const int16x4_t cospi32 = vget_low_s16(cospi32_16); 750 const int16x4_t cospi16 = vget_high_s16(cospi32_16); 751 const int16x4_t cospi8 = vget_low_s16(cospi8_24); 752 const int16x4_t cospi24 = vget_high_s16(cospi8_24); 753 const int16x4_t cospi4 = vget_low_s16(cospi4_12); 754 const int16x4_t cospi12 = vget_high_s16(cospi4_12); 755 const int16x4_t cospi20 = vget_low_s16(cospi20_28); 756 const int16x4_t cospi28 = vget_high_s16(cospi20_28); 757 758 // stage 1 759 int16x8_t x1[16]; 760 butterfly_dct_pre_s16_x8(input, x1, 16); 761 762 // stage 2 763 int16x8_t x2[16]; 764 butterfly_dct_pre_s16_x8(x1, x2, 8); 765 butterfly_s16_s32_x8_0112_neon(cospi32, x1[13], x1[10], &x2[13], &x2[10]); 766 butterfly_s16_s32_x8_0112_neon(cospi32, x1[12], x1[11], &x2[12], &x2[11]); 767 768 // stage 3 769 int16x8_t x3[16]; 770 butterfly_dct_pre_s16_x8(x2, x3, 4); 771 butterfly_s16_s32_x8_0112_neon(cospi32, x2[6], x2[5], &x3[6], &x3[5]); 772 butterfly_dct_post_s16_x8(x1 + 8, x2 + 8, x3 + 8, 8); 773 774 // stage 4 775 int16x8_t x4[16]; 776 butterfly_s16_s32_x8_0112_neon(cospi32, x3[0], x3[1], &output[0], &output[8]); 777 butterfly_s16_s32_x8_0112_neon(cospi16, x3[3], x3[2], &output[4], 778 &output[12]); 779 butterfly_dct_post_s16_x8(x2 + 4, x3 + 4, x4 + 4, 4); 780 butterfly_s16_s32_x8_0112_neon(cospi16, x3[14], x3[9], &x4[14], &x4[9]); 781 butterfly_s16_s32_x8_1223_neon(cospi16, x3[13], x3[10], &x4[13], &x4[10]); 782 783 // stage 5 784 int16x8_t x5[16]; 785 butterfly_s16_s32_x8_0112_neon(cospi8, x4[7], x4[4], &output[2], &output[14]); 786 butterfly_s16_s32_x8_1003_neon(cospi24, x4[6], x4[5], &output[10], 787 &output[6]); 788 butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 4); 789 butterfly_dct_post_s16_x8(x3 + 12, x4 + 12, x5 + 12, 4); 790 791 // stage 
// fdct8x16 stages 6-7: final butterflies producing the odd-indexed outputs.
butterfly_s16_s32_x8_0112_neon(cospi4, x5[15], x5[8], &output[1],
                               &output[15]);
butterfly_s16_s32_x8_1003_neon(cospi28, x5[14], x5[9], &output[9],
                               &output[7]);
butterfly_s16_s32_x8_0112_neon(cospi20, x5[13], x5[10], &output[5],
                               &output[11]);
butterfly_s16_s32_x8_1003_neon(cospi12, x5[12], x5[11], &output[13],
                               &output[3]);
}

// Forward 32-point DCT over 8 columns of 16-bit lanes. `input`/`output` are
// arrays of 32 int16x8_t vectors; intermediates use saturating adds and the
// Q2.13 cospi butterfly helpers, matching the scalar av1_fdct32 stages.
static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  // Each load picks up two adjacent 4-tuples of butterfly weights.
  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // stage 1
  int16x8_t x1[32];
  butterfly_dct_pre_s16_x8(input, x1, 32);

  // stage 2
  int16x8_t x2[32];
  butterfly_dct_pre_s16_x8(x1, x2, 16);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[27], x1[20], &x2[27], &x2[20]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[26], x1[21], &x2[26], &x2[21]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[25], x1[22], &x2[25], &x2[22]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x1[24], x1[23], &x2[24], &x2[23]);

  // stage 3
  int16x8_t x3[32];
  butterfly_dct_pre_s16_x8(x2, x3, 8);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[13], x2[10], &x3[13], &x3[10]);
  butterfly_s16_s32_x8_0112_neon(cospi32, x2[12], x2[11], &x3[12], &x3[11]);
  butterfly_dct_post_s16_x8(x1 + 16, x2 + 16, x3 + 16, 16);

  // stage 4
  int16x8_t x4[32];
  butterfly_dct_pre_s16_x8(x3, x4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi32, x3[6], x3[5], &x4[6], &x4[5]);
  butterfly_dct_post_s16_x8(x2 + 8, x3 + 8, x4 + 8, 8);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[29], x3[18], &x4[29], &x4[18]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[28], x3[19], &x4[28], &x4[19]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[27], x3[20], &x4[27], &x4[20]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x3[26], x3[21], &x4[26], &x4[21]);

  // stage 5: the first butterflies already produce final even outputs.
  int16x8_t x5[32];
  butterfly_s16_s32_x8_0112_neon(cospi32, x4[0], x4[1], &output[0],
                                 &output[16]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x4[3], x4[2], &output[8],
                                 &output[24]);
  butterfly_dct_post_s16_x8(x3 + 4, x4 + 4, x5 + 4, 4);
  butterfly_s16_s32_x8_0112_neon(cospi16, x4[14], x4[9], &x5[14], &x5[9]);
  butterfly_s16_s32_x8_1223_neon(cospi16, x4[13], x4[10], &x5[13], &x5[10]);
  butterfly_dct_post_s16_x8(x3 + 16, x4 + 16, x5 + 16, 8);
  butterfly_dct_post_s16_x8(x3 + 24, x4 + 24, x5 + 24, 8);

  // stage 6
  int16x8_t x6[32];
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[7], x5[4], &output[4], &output[28]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[6], x5[5], &output[20],
                                 &output[12]);
butterfly_dct_post_s16_x8(x4 + 8, x5 + 8, x6 + 8, 4); 879 butterfly_dct_post_s16_x8(x4 + 12, x5 + 12, x6 + 12, 4); 880 butterfly_s16_s32_x8_0112_neon(cospi8, x5[30], x5[17], &x6[30], &x6[17]); 881 butterfly_s16_s32_x8_1223_neon(cospi8, x5[29], x5[18], &x6[29], &x6[18]); 882 butterfly_s16_s32_x8_1003_neon(cospi24, x5[26], x5[21], &x6[26], &x6[21]); 883 butterfly_s16_s32_x8_0332_neon(cospi24, x5[25], x5[22], &x6[25], &x6[22]); 884 885 // stage 7 886 int16x8_t x7[32]; 887 butterfly_s16_s32_x8_0112_neon(cospi4, x6[15], x6[8], &output[2], 888 &output[30]); 889 butterfly_s16_s32_x8_1003_neon(cospi28, x6[14], x6[9], &output[18], 890 &output[14]); 891 butterfly_s16_s32_x8_0112_neon(cospi20, x6[13], x6[10], &output[10], 892 &output[22]); 893 butterfly_s16_s32_x8_1003_neon(cospi12, x6[12], x6[11], &output[26], 894 &output[6]); 895 butterfly_dct_post_s16_x8(x5 + 16, x6 + 16, x7 + 16, 4); 896 butterfly_dct_post_s16_x8(x5 + 20, x6 + 20, x7 + 20, 4); 897 butterfly_dct_post_s16_x8(x5 + 24, x6 + 24, x7 + 24, 4); 898 butterfly_dct_post_s16_x8(x5 + 28, x6 + 28, x7 + 28, 4); 899 900 butterfly_s16_s32_x8_0112_neon(cospi2, x7[31], x7[16], &output[1], 901 &output[31]); 902 butterfly_s16_s32_x8_1003_neon(cospi30, x7[30], x7[17], &output[17], 903 &output[15]); 904 butterfly_s16_s32_x8_0112_neon(cospi18, x7[29], x7[18], &output[9], 905 &output[23]); 906 butterfly_s16_s32_x8_1003_neon(cospi14, x7[28], x7[19], &output[25], 907 &output[7]); 908 butterfly_s16_s32_x8_0112_neon(cospi10, x7[27], x7[20], &output[5], 909 &output[27]); 910 butterfly_s16_s32_x8_1003_neon(cospi22, x7[26], x7[21], &output[21], 911 &output[11]); 912 butterfly_s16_s32_x8_0112_neon(cospi26, x7[25], x7[22], &output[13], 913 &output[19]); 914 butterfly_s16_s32_x8_1003_neon(cospi6, x7[24], x7[23], &output[29], 915 &output[3]); 916 } 917 918 static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input, 919 int16x8_t *output, int cos_bit) { 920 const int16_t *cospi = cospi_arr_q13(cos_bit); 921 922 const int16x8_t 
cospi32_16 = vld1q_s16(&cospi[4 * 0]); 923 const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); 924 const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); 925 const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); 926 const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); 927 const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); 928 const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); 929 const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); 930 const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]); 931 const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]); 932 const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]); 933 const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]); 934 const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]); 935 const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]); 936 const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]); 937 const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]); 938 939 const int16x4_t cospi32 = vget_low_s16(cospi32_16); 940 const int16x4_t cospi16 = vget_high_s16(cospi32_16); 941 const int16x4_t cospi8 = vget_low_s16(cospi8_24); 942 const int16x4_t cospi24 = vget_high_s16(cospi8_24); 943 const int16x4_t cospi4 = vget_low_s16(cospi4_12); 944 const int16x4_t cospi12 = vget_high_s16(cospi4_12); 945 const int16x4_t cospi20 = vget_low_s16(cospi20_28); 946 const int16x4_t cospi28 = vget_high_s16(cospi20_28); 947 const int16x4_t cospi2 = vget_low_s16(cospi2_6); 948 const int16x4_t cospi6 = vget_high_s16(cospi2_6); 949 const int16x4_t cospi10 = vget_low_s16(cospi10_14); 950 const int16x4_t cospi14 = vget_high_s16(cospi10_14); 951 const int16x4_t cospi18 = vget_low_s16(cospi18_22); 952 const int16x4_t cospi22 = vget_high_s16(cospi18_22); 953 const int16x4_t cospi26 = vget_low_s16(cospi26_30); 954 const int16x4_t cospi30 = vget_high_s16(cospi26_30); 955 const int16x4_t cospi1 = vget_low_s16(cospi1_3); 956 const int16x4_t cospi3 = vget_high_s16(cospi1_3); 957 const int16x4_t cospi5 = vget_low_s16(cospi5_7); 958 const 
int16x4_t cospi7 = vget_high_s16(cospi5_7); 959 const int16x4_t cospi9 = vget_low_s16(cospi9_11); 960 const int16x4_t cospi11 = vget_high_s16(cospi9_11); 961 const int16x4_t cospi13 = vget_low_s16(cospi13_15); 962 const int16x4_t cospi15 = vget_high_s16(cospi13_15); 963 const int16x4_t cospi17 = vget_low_s16(cospi17_19); 964 const int16x4_t cospi19 = vget_high_s16(cospi17_19); 965 const int16x4_t cospi21 = vget_low_s16(cospi21_23); 966 const int16x4_t cospi23 = vget_high_s16(cospi21_23); 967 const int16x4_t cospi25 = vget_low_s16(cospi25_27); 968 const int16x4_t cospi27 = vget_high_s16(cospi25_27); 969 const int16x4_t cospi29 = vget_low_s16(cospi29_31); 970 const int16x4_t cospi31 = vget_high_s16(cospi29_31); 971 972 // stage 1 973 int16x8_t x1[64]; 974 butterfly_dct_pre_s16_x8(input, x1, 64); 975 976 // stage 2 977 int16x8_t x2[64]; 978 butterfly_dct_pre_s16_x8(x1, x2, 32); 979 butterfly_s16_s32_x8_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]); 980 butterfly_s16_s32_x8_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]); 981 butterfly_s16_s32_x8_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]); 982 butterfly_s16_s32_x8_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]); 983 butterfly_s16_s32_x8_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]); 984 butterfly_s16_s32_x8_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]); 985 butterfly_s16_s32_x8_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]); 986 butterfly_s16_s32_x8_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]); 987 988 // stage 3 989 int16x8_t x3[64]; 990 butterfly_dct_pre_s16_x8(x2, x3, 16); 991 x3[16] = x2[16]; 992 x3[17] = x2[17]; 993 x3[18] = x2[18]; 994 x3[19] = x2[19]; 995 butterfly_s16_s32_x8_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]); 996 butterfly_s16_s32_x8_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]); 997 butterfly_s16_s32_x8_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]); 998 butterfly_s16_s32_x8_0112_neon(cospi32, x2[24], x2[23], &x3[24], 
&x3[23]); 999 x3[28] = x2[28]; 1000 x3[29] = x2[29]; 1001 x3[30] = x2[30]; 1002 x3[31] = x2[31]; 1003 butterfly_dct_post_s16_x8(x1 + 32, x2 + 32, x3 + 32, 32); 1004 1005 // stage 4 1006 int16x8_t x4[64]; 1007 butterfly_dct_pre_s16_x8(x3, x4, 8); 1008 butterfly_s16_s32_x8_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]); 1009 butterfly_s16_s32_x8_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]); 1010 butterfly_dct_post_s16_x8(x3 + 16, x3 + 16, x4 + 16, 16); 1011 butterfly_s16_s32_x8_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]); 1012 butterfly_s16_s32_x8_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]); 1013 butterfly_s16_s32_x8_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]); 1014 butterfly_s16_s32_x8_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]); 1015 butterfly_s16_s32_x8_1223_neon(cospi16, x3[55], x3[40], &x4[55], &x4[40]); 1016 butterfly_s16_s32_x8_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]); 1017 butterfly_s16_s32_x8_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]); 1018 butterfly_s16_s32_x8_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]); 1019 1020 // stage 5 1021 int16x8_t x5[64]; 1022 butterfly_dct_pre_s16_x8(x4, x5, 4); 1023 butterfly_s16_s32_x8_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]); 1024 butterfly_dct_post_s16_x8(x3 + 8, x4 + 8, x5 + 8, 8); 1025 butterfly_s16_s32_x8_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]); 1026 butterfly_s16_s32_x8_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]); 1027 butterfly_s16_s32_x8_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]); 1028 butterfly_s16_s32_x8_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]); 1029 butterfly_dct_post_s16_x8(x3 + 32, x4 + 32, x5 + 32, 16); 1030 butterfly_dct_post_s16_x8(x3 + 48, x4 + 48, x5 + 48, 16); 1031 1032 // stage 6 1033 int16x8_t x6[64]; 1034 butterfly_s16_s32_x8_0112_neon(cospi32, x5[1], x5[0], &x6[0], &x6[1]); 1035 butterfly_s16_s32_x8_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]); 1036 
butterfly_dct_post_s16_x8(x4 + 4, x5 + 4, x6 + 4, 4); 1037 butterfly_s16_s32_x8_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]); 1038 butterfly_s16_s32_x8_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]); 1039 butterfly_dct_post_s16_x8(x4 + 16, x5 + 16, x6 + 16, 8); 1040 butterfly_dct_post_s16_x8(x4 + 24, x5 + 24, x6 + 24, 8); 1041 butterfly_s16_s32_x8_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]); 1042 butterfly_s16_s32_x8_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]); 1043 butterfly_s16_s32_x8_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]); 1044 butterfly_s16_s32_x8_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]); 1045 butterfly_s16_s32_x8_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]); 1046 butterfly_s16_s32_x8_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]); 1047 butterfly_s16_s32_x8_0332_neon(cospi24, x5[51], x5[44], &x6[51], &x6[44]); 1048 butterfly_s16_s32_x8_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]); 1049 1050 // stage 7 1051 int16x8_t x7[64]; 1052 butterfly_s16_s32_x8_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]); 1053 butterfly_s16_s32_x8_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]); 1054 butterfly_dct_post_s16_x8(x5 + 8, x6 + 8, x7 + 8, 4); 1055 butterfly_dct_post_s16_x8(x5 + 12, x6 + 12, x7 + 12, 4); 1056 butterfly_s16_s32_x8_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]); 1057 butterfly_s16_s32_x8_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]); 1058 butterfly_s16_s32_x8_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]); 1059 butterfly_s16_s32_x8_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]); 1060 butterfly_dct_post_s16_x8(x5 + 32, x6 + 32, x7 + 32, 8); 1061 butterfly_dct_post_s16_x8(x5 + 40, x6 + 40, x7 + 40, 8); 1062 butterfly_dct_post_s16_x8(x5 + 48, x6 + 48, x7 + 48, 8); 1063 butterfly_dct_post_s16_x8(x5 + 56, x6 + 56, x7 + 56, 8); 1064 1065 // stage 8 1066 int16x8_t x8[64]; 1067 butterfly_s16_s32_x8_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]); 1068 
butterfly_s16_s32_x8_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]); 1069 butterfly_s16_s32_x8_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]); 1070 butterfly_s16_s32_x8_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]); 1071 butterfly_dct_post_s16_x8(x6 + 16, x7 + 16, x8 + 16, 4); 1072 butterfly_dct_post_s16_x8(x6 + 20, x7 + 20, x8 + 20, 4); 1073 butterfly_dct_post_s16_x8(x6 + 24, x7 + 24, x8 + 24, 4); 1074 butterfly_dct_post_s16_x8(x6 + 28, x7 + 28, x8 + 28, 4); 1075 butterfly_s16_s32_x8_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]); 1076 butterfly_s16_s32_x8_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]); 1077 butterfly_s16_s32_x8_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]); 1078 butterfly_s16_s32_x8_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]); 1079 butterfly_s16_s32_x8_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]); 1080 butterfly_s16_s32_x8_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]); 1081 butterfly_s16_s32_x8_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]); 1082 butterfly_s16_s32_x8_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]); 1083 1084 // stage 9 1085 int16x8_t x9[64]; 1086 butterfly_s16_s32_x8_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]); 1087 butterfly_s16_s32_x8_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]); 1088 butterfly_s16_s32_x8_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]); 1089 butterfly_s16_s32_x8_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]); 1090 butterfly_s16_s32_x8_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]); 1091 butterfly_s16_s32_x8_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]); 1092 butterfly_s16_s32_x8_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]); 1093 butterfly_s16_s32_x8_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]); 1094 butterfly_dct_post_s16_x8(x7 + 32, x8 + 32, x9 + 32, 4); 1095 butterfly_dct_post_s16_x8(x7 + 36, x8 + 36, x9 + 36, 4); 1096 butterfly_dct_post_s16_x8(x7 + 40, x8 + 40, x9 + 40, 4); 1097 
butterfly_dct_post_s16_x8(x7 + 44, x8 + 44, x9 + 44, 4); 1098 butterfly_dct_post_s16_x8(x7 + 48, x8 + 48, x9 + 48, 4); 1099 butterfly_dct_post_s16_x8(x7 + 52, x8 + 52, x9 + 52, 4); 1100 butterfly_dct_post_s16_x8(x7 + 56, x8 + 56, x9 + 56, 4); 1101 butterfly_dct_post_s16_x8(x7 + 60, x8 + 60, x9 + 60, 4); 1102 1103 // stage 10 1104 butterfly_s16_s32_x8_0112_neon(cospi1, x9[63], x9[32], &output[1], 1105 &output[63]); 1106 butterfly_s16_s32_x8_1003_neon(cospi31, x9[62], x9[33], &output[33], 1107 &output[31]); 1108 butterfly_s16_s32_x8_0112_neon(cospi17, x9[61], x9[34], &output[17], 1109 &output[47]); 1110 butterfly_s16_s32_x8_1003_neon(cospi15, x9[60], x9[35], &output[49], 1111 &output[15]); 1112 butterfly_s16_s32_x8_0112_neon(cospi9, x9[59], x9[36], &output[9], 1113 &output[55]); 1114 butterfly_s16_s32_x8_1003_neon(cospi23, x9[58], x9[37], &output[41], 1115 &output[23]); 1116 butterfly_s16_s32_x8_0112_neon(cospi25, x9[57], x9[38], &output[25], 1117 &output[39]); 1118 butterfly_s16_s32_x8_1003_neon(cospi7, x9[56], x9[39], &output[57], 1119 &output[7]); 1120 butterfly_s16_s32_x8_0112_neon(cospi5, x9[55], x9[40], &output[5], 1121 &output[59]); 1122 butterfly_s16_s32_x8_1003_neon(cospi27, x9[54], x9[41], &output[37], 1123 &output[27]); 1124 butterfly_s16_s32_x8_0112_neon(cospi21, x9[53], x9[42], &output[21], 1125 &output[43]); 1126 butterfly_s16_s32_x8_1003_neon(cospi11, x9[52], x9[43], &output[53], 1127 &output[11]); 1128 butterfly_s16_s32_x8_0112_neon(cospi13, x9[51], x9[44], &output[13], 1129 &output[51]); 1130 butterfly_s16_s32_x8_1003_neon(cospi19, x9[50], x9[45], &output[45], 1131 &output[19]); 1132 butterfly_s16_s32_x8_0112_neon(cospi29, x9[49], x9[46], &output[29], 1133 &output[35]); 1134 butterfly_s16_s32_x8_1003_neon(cospi3, x9[48], x9[47], &output[61], 1135 &output[3]); 1136 1137 // stage 11 1138 output[0] = x6[0]; 1139 output[2] = x9[16]; 1140 output[4] = x8[8]; 1141 output[6] = x9[24]; 1142 output[8] = x7[4]; 1143 output[10] = x9[20]; 1144 output[12] = 
x8[12]; 1145 output[14] = x9[28]; 1146 output[16] = x6[2]; 1147 output[18] = x9[18]; 1148 output[20] = x8[10]; 1149 output[22] = x9[26]; 1150 output[24] = x7[6]; 1151 output[26] = x9[22]; 1152 output[28] = x8[14]; 1153 output[30] = x9[30]; 1154 output[32] = x6[1]; 1155 output[34] = x9[17]; 1156 output[36] = x8[9]; 1157 output[38] = x9[25]; 1158 output[40] = x7[5]; 1159 output[42] = x9[21]; 1160 output[44] = x8[13]; 1161 output[46] = x9[29]; 1162 output[48] = x6[3]; 1163 output[52] = x8[11]; 1164 output[54] = x9[27]; 1165 output[56] = x7[7]; 1166 output[58] = x9[23]; 1167 output[60] = x8[15]; 1168 output[62] = x9[31]; 1169 } 1170 1171 static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input, 1172 int16x8_t *output, int cos_bit) { 1173 const int16_t *cospi = cospi_arr_q13(cos_bit); 1174 1175 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); 1176 const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); 1177 const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); 1178 1179 const int16x4_t cospi32 = vget_low_s16(cospi32_16); 1180 const int16x4_t cospi16 = vget_high_s16(cospi32_16); 1181 const int16x4_t cospi4 = vget_low_s16(cospi4_12); 1182 const int16x4_t cospi12 = vget_high_s16(cospi4_12); 1183 const int16x4_t cospi20 = vget_low_s16(cospi20_28); 1184 const int16x4_t cospi28 = vget_high_s16(cospi20_28); 1185 1186 // stage 2 1187 int16x8_t x2[8]; 1188 butterfly_s16_s32_x8_0332_neon(cospi32, input[4], input[3], &x2[2], &x2[3]); 1189 butterfly_s16_s32_x8_0112_neon(cospi32, input[2], input[5], &x2[7], &x2[6]); 1190 1191 // stage 3 1192 int16x8_t x3[8]; 1193 x3[0] = vqaddq_s16(input[0], x2[2]); 1194 x3[1] = vqsubq_s16(x2[3], input[7]); 1195 x3[2] = vqsubq_s16(input[0], x2[2]); 1196 x3[3] = vqaddq_s16(input[7], x2[3]); 1197 x3[4] = vqsubq_s16(x2[6], input[1]); 1198 x3[5] = vqaddq_s16(input[6], x2[7]); 1199 x3[6] = vqaddq_s16(input[1], x2[6]); 1200 x3[7] = vqsubq_s16(input[6], x2[7]); 1201 1202 // stage 4 1203 butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], 
                                 &x3[4], &x3[5]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);

  // stage 5
  int16x8_t x5[8];
  x5[0] = vqaddq_s16(x3[0], x3[4]);
  x5[1] = vqaddq_s16(x3[1], x3[5]);
  x5[2] = vqaddq_s16(x3[2], x3[6]);
  x5[3] = vqsubq_s16(x3[7], x3[3]);
  x5[4] = vqsubq_s16(x3[0], x3[4]);
  x5[5] = vqsubq_s16(x3[1], x3[5]);
  x5[6] = vqsubq_s16(x3[2], x3[6]);
  x5[7] = vqaddq_s16(x3[3], x3[7]);

  // stage 6: final butterflies straight into the (permuted) outputs.
  butterfly_s16_s32_x8_0112_neon(cospi4, x5[0], x5[1], &output[7], &output[0]);
  butterfly_s16_s32_x8_0112_neon(cospi20, x5[2], x5[3], &output[5], &output[2]);
  butterfly_s16_s32_x8_1003_neon(cospi28, x5[4], x5[5], &output[3], &output[4]);
  butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
}

// Forward 16-point ADST over 4 columns of 16-bit lanes (int16x4_t lanes);
// mirrors fadst8x16_neon below but on half-width vectors.
static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
                                            int16x4_t *output, int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // stage 2
  int16x4_t x2[8];
  butterfly_s16_s32_x4_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
  butterfly_s16_s32_x4_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
  butterfly_s16_s32_x4_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
  butterfly_s16_s32_x4_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);

  // stage 3 (saturating adds/subs keep intermediates in int16 range)
  int16x4_t x3[16];
  x3[0] = vqadd_s16(input[0], x2[0]);
  x3[1] = vqsub_s16(x2[1], input[15]);
  x3[2] = vqsub_s16(input[0], x2[0]);
  x3[3] = vqadd_s16(input[15], x2[1]);
  x3[4] = vqsub_s16(x2[2], input[3]);
  x3[5] = vqadd_s16(input[12], x2[3]);
  x3[6] = vqadd_s16(input[3], x2[2]);
  x3[7] = vqsub_s16(input[12], x2[3]);
  x3[8] = vqsub_s16(x2[4], input[1]);
  x3[9] = vqadd_s16(input[14], x2[5]);
  x3[10] = vqadd_s16(input[1], x2[4]);
  x3[11] = vqsub_s16(input[14], x2[5]);
  x3[12] = vqadd_s16(input[2], x2[6]);
  x3[13] = vqsub_s16(x2[7], input[13]);
  x3[14] = vqsub_s16(input[2], x2[6]);
  x3[15] = vqadd_s16(input[13], x2[7]);

  // stage 4 (in-place butterflies on x3)
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
  butterfly_s16_s32_x4_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
  butterfly_s16_s32_x4_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);

  // stage 5
  int16x4_t x5[16];
  x5[0] = vqadd_s16(x3[0], x3[4]);
  x5[1] = vqadd_s16(x3[1], x3[5]);
  x5[2] = vqadd_s16(x3[2], x3[6]);
  x5[3] = vqsub_s16(x3[7], x3[3]);
  x5[4] = vqsub_s16(x3[0], x3[4]);
  x5[5] = vqsub_s16(x3[1], x3[5]);
  x5[6] = vqsub_s16(x3[2], x3[6]);
  x5[7] = vqadd_s16(x3[3], x3[7]);
  x5[8] = vqadd_s16(x3[8], x3[12]);
  x5[9] = vqadd_s16(x3[9], x3[13]);
  x5[10] = vqsub_s16(x3[14], x3[10]);
  x5[11] = vqadd_s16(x3[11], x3[15]);
  x5[12] = vqsub_s16(x3[8], x3[12]);
  x5[13] = vqsub_s16(x3[9], x3[13]);
  x5[14] = vqadd_s16(x3[10], x3[14]);
  x5[15] = vqsub_s16(x3[11], x3[15]);

  // stage 6 (in-place butterflies on x5)
  butterfly_s16_s32_x4_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
  butterfly_s16_s32_x4_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
  butterfly_s16_s32_x4_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);

  // stage 7
  int16x4_t x7[16];
  x7[0] = vqadd_s16(x5[0], x5[8]);
  x7[1] = vqadd_s16(x5[1], x5[9]);
  x7[2] = vqadd_s16(x5[2], x5[10]);
  x7[3] = vqadd_s16(x5[3], x5[11]);
  x7[4] = vqadd_s16(x5[4], x5[12]);
  x7[5] = vqadd_s16(x5[5], x5[13]);
  x7[6] = vqadd_s16(x5[6], x5[14]);
  x7[7] = vqsub_s16(x5[15], x5[7]);
  x7[8] = vqsub_s16(x5[0], x5[8]);
  x7[9] = vqsub_s16(x5[1], x5[9]);
  x7[10] = vqsub_s16(x5[2], x5[10]);
  x7[11] = vqsub_s16(x5[3], x5[11]);
  x7[12] = vqsub_s16(x5[4], x5[12]);
  x7[13] = vqsub_s16(x5[5], x5[13]);
  x7[14] = vqsub_s16(x5[6], x5[14]);
  x7[15] = vqadd_s16(x5[7], x5[15]);

  // stage 8: final butterflies into the permuted outputs.
  butterfly_s16_s32_x4_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
  butterfly_s16_s32_x4_0112_neon(cospi10, x7[2], x7[3], &output[13],
                                 &output[2]);
  butterfly_s16_s32_x4_0112_neon(cospi18, x7[4], x7[5], &output[11],
                                 &output[4]);
  butterfly_s16_s32_x4_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
  butterfly_s16_s32_x4_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
  butterfly_s16_s32_x4_1003_neon(cospi22, x7[10], x7[11], &output[5],
                                 &output[10]);
  butterfly_s16_s32_x4_1003_neon(cospi14, x7[12], x7[13], &output[3],
                                 &output[12]);
  butterfly_s16_s32_x4_0112_neon(cospi6, x7[14], x7[15], &output[14],
                                 &output[1]);
}

// Forward 16-point ADST over 8 columns of 16-bit lanes; same stage
// structure as fadst4x16_neon above on full-width vectors.
static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
                                            int16x8_t *output, int cos_bit) {
  const
int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
  const int16x4_t cospi30 = vget_high_s16(cospi26_30);

  // stage 2
  int16x8_t x2[8];
  butterfly_s16_s32_x8_0332_neon(cospi32, input[8], input[7], &x2[0], &x2[1]);
  butterfly_s16_s32_x8_0112_neon(cospi32, input[4], input[11], &x2[3], &x2[2]);
  butterfly_s16_s32_x8_0112_neon(cospi32, input[6], input[9], &x2[5], &x2[4]);
  butterfly_s16_s32_x8_0332_neon(cospi32, input[10], input[5], &x2[6], &x2[7]);

  // stage 3 (saturating adds/subs keep intermediates in int16 range)
  int16x8_t x3[16];
  x3[0] = vqaddq_s16(input[0], x2[0]);
  x3[1] = vqsubq_s16(x2[1], input[15]);
  x3[2] = vqsubq_s16(input[0], x2[0]);
  x3[3] = vqaddq_s16(input[15], x2[1]);
  x3[4] = vqsubq_s16(x2[2], input[3]);
  x3[5] = vqaddq_s16(input[12], x2[3]);
  x3[6] = vqaddq_s16(input[3], x2[2]);
  x3[7] = vqsubq_s16(input[12], x2[3]);
  x3[8] = vqsubq_s16(x2[4], input[1]);
  x3[9] = vqaddq_s16(input[14], x2[5]);
  x3[10] = vqaddq_s16(input[1], x2[4]);
  x3[11] = vqsubq_s16(input[14], x2[5]);
  x3[12] = vqaddq_s16(input[2], x2[6]);
  x3[13] = vqsubq_s16(x2[7], input[13]);
  x3[14] = vqsubq_s16(input[2], x2[6]);
  x3[15] = vqaddq_s16(input[13], x2[7]);

  // stage 4 (in-place butterflies on x3)
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[4], x3[5], &x3[4], &x3[5]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[7], x3[6], &x3[6], &x3[7]);
  butterfly_s16_s32_x8_0112_neon(cospi16, x3[12], x3[13], &x3[12], &x3[13]);
  butterfly_s16_s32_x8_0332_neon(cospi16, x3[14], x3[15], &x3[15], &x3[14]);

  // stage 5
  int16x8_t x5[16];
  x5[0] = vqaddq_s16(x3[0], x3[4]);
  x5[1] = vqaddq_s16(x3[1], x3[5]);
  x5[2] = vqaddq_s16(x3[2], x3[6]);
  x5[3] = vqsubq_s16(x3[7], x3[3]);
  x5[4] = vqsubq_s16(x3[0], x3[4]);
  x5[5] = vqsubq_s16(x3[1], x3[5]);
  x5[6] = vqsubq_s16(x3[2], x3[6]);
  x5[7] = vqaddq_s16(x3[3], x3[7]);
  x5[8] = vqaddq_s16(x3[8], x3[12]);
  x5[9] = vqaddq_s16(x3[9], x3[13]);
  x5[10] = vqsubq_s16(x3[14], x3[10]);
  x5[11] = vqaddq_s16(x3[11], x3[15]);
  x5[12] = vqsubq_s16(x3[8], x3[12]);
  x5[13] = vqsubq_s16(x3[9], x3[13]);
  x5[14] = vqaddq_s16(x3[10], x3[14]);
  x5[15] = vqsubq_s16(x3[11], x3[15]);

  // stage 6 (in-place butterflies on x5)
  butterfly_s16_s32_x8_0112_neon(cospi8, x5[8], x5[9], &x5[8], &x5[9]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[10], x5[11], &x5[10], &x5[11]);
  butterfly_s16_s32_x8_1003_neon(cospi8, x5[13], x5[12], &x5[13], &x5[12]);
  butterfly_s16_s32_x8_1003_neon(cospi24, x5[15], x5[14], &x5[14], &x5[15]);

  // stage 7
  int16x8_t x7[16];
  x7[0] = vqaddq_s16(x5[0], x5[8]);
  x7[1] = vqaddq_s16(x5[1], x5[9]);
  x7[2] = vqaddq_s16(x5[2], x5[10]);
  x7[3] = vqaddq_s16(x5[3], x5[11]);
  x7[4] = vqaddq_s16(x5[4], x5[12]);
  x7[5] = vqaddq_s16(x5[5], x5[13]);
  x7[6] = vqaddq_s16(x5[6], x5[14]);
  x7[7] = vqsubq_s16(x5[15], x5[7]);
  x7[8] = vqsubq_s16(x5[0], x5[8]);
  x7[9] = vqsubq_s16(x5[1], x5[9]);
  x7[10] = vqsubq_s16(x5[2], x5[10]);
  x7[11] = vqsubq_s16(x5[3], x5[11]);
  x7[12] = vqsubq_s16(x5[4], x5[12]);
  x7[13] = vqsubq_s16(x5[5], x5[13]);
  x7[14] = vqsubq_s16(x5[6], x5[14]);
  x7[15] = vqaddq_s16(x5[7], x5[15]);

  // stage 8: final butterflies into the permuted outputs.
  butterfly_s16_s32_x8_0112_neon(cospi2, x7[0], x7[1], &output[15], &output[0]);
  butterfly_s16_s32_x8_0112_neon(cospi10, x7[2], x7[3], &output[13],
                                 &output[2]);
  butterfly_s16_s32_x8_0112_neon(cospi18, x7[4], x7[5], &output[11],
                                 &output[4]);
  butterfly_s16_s32_x8_0112_neon(cospi26, x7[6], x7[7], &output[9], &output[6]);
  butterfly_s16_s32_x8_1003_neon(cospi30, x7[8], x7[9], &output[7], &output[8]);
  butterfly_s16_s32_x8_1003_neon(cospi22, x7[10], x7[11], &output[5],
                                 &output[10]);
  butterfly_s16_s32_x8_1003_neon(cospi14, x7[12], x7[13], &output[3],
                                 &output[12]);
  butterfly_s16_s32_x8_0112_neon(cospi6, x7[14], x7[15], &output[14],
                                 &output[1]);
}

// Identity (IDTX) 1-D transforms: pure scaling, no butterflies. The scale is
// round(sqrt(2) * 2^k) folded into the helpers; cos_bit is unused.
static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input,
                                               int16x4_t *const output,
                                               const int cos_bit) {
  (void)cos_bit;
  round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4);
}

static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input,
                                               int16x8_t *const output,
                                               const int cos_bit) {
  (void)cos_bit;
  round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4);
}

static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input,
                                               int16x4_t *output, int cos_bit) {
  (void)cos_bit;
  shift_left_1_s16_x4(input, output, 8);
}

static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input,
                                               int16x8_t *output, int cos_bit) {
  (void)cos_bit;
  shift_left_1_s16_x8(input, output, 8);
}

static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input,
                                                int16x4_t *output,
                                                int cos_bit) {
  (void)cos_bit;
  round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16);
}

static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input,
                                                int16x8_t *output,
                                                int cos_bit) {
  (void)cos_bit;
  round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16);
}

static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input,
                                                int16x8_t *output,
                                                int cos_bit) {
  (void)cos_bit;
  shift_left_2_s16_x8(input, output, 32);
}

// Stamps out a column-pass wrapper: loads n rows of `tw` int16 lanes from
// `input` with the given stride, pre-scales by 4 (<< 2), then runs the named
// 1-D transform.
#define TRANSFORM_COL(name, tw, n)                                          \
  static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \
                              int stride, int cos_bit) {                    \
    int16x##tw##_t buf0[n];                                                 \
    load_buffer_s16_x##tw(input, stride, buf0, n);                          \
    shift_left_2_s16_x##tw(buf0, buf0, n);                                  \
    name##_neon(buf0, output, cos_bit);                                     \
  }

TRANSFORM_COL(fadst4x4, 4, 4)
TRANSFORM_COL(fadst4x8, 4, 8)
TRANSFORM_COL(fadst4x16, 4, 16)
TRANSFORM_COL(fadst8x4, 8, 4)
TRANSFORM_COL(fadst8x8, 8, 8)
TRANSFORM_COL(fadst8x16, 8, 16)
TRANSFORM_COL(fdct4x4, 4, 4)
TRANSFORM_COL(fdct4x8, 4, 8)
TRANSFORM_COL(fdct4x16, 4, 16)
TRANSFORM_COL(fdct8x4, 8, 4)
TRANSFORM_COL(fdct8x8, 8, 8)
TRANSFORM_COL(fdct8x16, 8, 16)
TRANSFORM_COL(fdct8x32, 8, 32)
TRANSFORM_COL(fidentity4x4, 4, 4)
TRANSFORM_COL(fidentity4x8, 4, 8)
TRANSFORM_COL(fidentity4x16, 4, 16)
TRANSFORM_COL(fidentity8x4, 8, 4)
TRANSFORM_COL(fidentity8x8, 8, 8)
TRANSFORM_COL(fidentity8x16, 8, 16)
TRANSFORM_COL(fidentity8x32, 8, 32)

// Stamps out a row-pass wrapper: runs the named 1-D transform, then widens
// and stores the result to the int32 coefficient buffer.
#define TRANSFORM_ROW(name, tw, n)                                          \
  static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \
                              int stride, int cos_bit) {                    \
    int16x##tw##_t buf0[n];                                                 \
    name##_neon(input, buf0, cos_bit);                                      \
    store_buffer_s16_x##tw(buf0, output, stride, n);                        \
  }

// As TRANSFORM_ROW, but the store applies the rectangular-block sqrt(2)
// rescale (store_rect_buffer_*).
#define TRANSFORM_ROW_RECT(name, tw, n)                         \
  static void name##_row_rect_neon(const int16x##tw##_t *input, \
                                   int32_t                      \
*output, int stride, int cos_bit) { \ 1544 int16x##tw##_t buf0[n]; \ 1545 name##_neon(input, buf0, cos_bit); \ 1546 store_rect_buffer_s16_x##tw(buf0, output, stride, n); \ 1547 } 1548 1549 TRANSFORM_ROW(fadst4x4, 4, 4) 1550 TRANSFORM_ROW(fadst4x16, 4, 16) 1551 TRANSFORM_ROW(fadst8x4, 8, 4) 1552 TRANSFORM_ROW(fadst8x8, 8, 8) 1553 TRANSFORM_ROW(fadst8x16, 8, 16) 1554 TRANSFORM_ROW(fdct4x4, 4, 4) 1555 TRANSFORM_ROW(fdct4x16, 4, 16) 1556 TRANSFORM_ROW(fdct8x4, 8, 4) 1557 TRANSFORM_ROW(fdct8x8, 8, 8) 1558 TRANSFORM_ROW(fdct8x16, 8, 16) 1559 TRANSFORM_ROW(fdct8x32, 8, 32) 1560 TRANSFORM_ROW(fidentity4x4, 4, 4) 1561 TRANSFORM_ROW(fidentity4x16, 4, 16) 1562 TRANSFORM_ROW(fidentity8x4, 8, 4) 1563 TRANSFORM_ROW(fidentity8x8, 8, 8) 1564 TRANSFORM_ROW(fidentity8x16, 8, 16) 1565 TRANSFORM_ROW(fidentity8x32, 8, 32) 1566 1567 TRANSFORM_ROW_RECT(fadst4x8, 4, 8) 1568 TRANSFORM_ROW_RECT(fadst8x4, 8, 4) 1569 TRANSFORM_ROW_RECT(fadst8x8, 8, 8) 1570 TRANSFORM_ROW_RECT(fadst8x16, 8, 16) 1571 TRANSFORM_ROW_RECT(fdct4x8, 4, 8) 1572 TRANSFORM_ROW_RECT(fdct8x4, 8, 4) 1573 TRANSFORM_ROW_RECT(fdct8x8, 8, 8) 1574 TRANSFORM_ROW_RECT(fdct8x16, 8, 16) 1575 TRANSFORM_ROW_RECT(fdct8x32, 8, 32) 1576 TRANSFORM_ROW_RECT(fidentity4x8, 4, 8) 1577 TRANSFORM_ROW_RECT(fidentity8x4, 8, 4) 1578 TRANSFORM_ROW_RECT(fidentity8x8, 8, 8) 1579 TRANSFORM_ROW_RECT(fidentity8x16, 8, 16) 1580 TRANSFORM_ROW_RECT(fidentity8x32, 8, 32) 1581 1582 typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input, 1583 int16x4_t *output, int cos_bit); 1584 typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input, 1585 int16x8_t *output, int cos_bit); 1586 1587 typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input, 1588 int16x4_t *output, int stride, 1589 int cos_bit); 1590 typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input, 1591 int16x8_t *output, int stride, 1592 int cos_bit); 1593 1594 typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input, 1595 int32_t *output, int stride, 1596 int 
cos_bit);
typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input,
                                            int32_t *output, int stride,
                                            int cos_bit);

// Per-TX_TYPE dispatch tables, indexed in TX_TYPES enum order. Column tables
// select on the vertical component of the 2D type, row tables on the
// horizontal component; FLIPADST shares the ADST kernels (flipping is done
// by the 2D drivers via ud_adjust_input_and_stride / flip_buf_*).
static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
  fdct4x8_col_neon,       // DCT_DCT
  fadst4x8_col_neon,      // ADST_DCT
  fdct4x8_col_neon,       // DCT_ADST
  fadst4x8_col_neon,      // ADST_ADST
  fadst4x8_col_neon,      // FLIPADST_DCT
  fdct4x8_col_neon,       // DCT_FLIPADST
  fadst4x8_col_neon,      // FLIPADST_FLIPADST
  fadst4x8_col_neon,      // ADST_FLIPADST
  fadst4x8_col_neon,      // FLIPADST_ADST
  fidentity4x8_col_neon,  // IDTX
  fdct4x8_col_neon,       // V_DCT
  fidentity4x8_col_neon,  // H_DCT
  fadst4x8_col_neon,      // V_ADST
  fidentity4x8_col_neon,  // H_ADST
  fadst4x8_col_neon,      // V_FLIPADST
  fidentity4x8_col_neon   // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_row_neon,       // DCT_DCT
  fdct8x4_row_neon,       // ADST_DCT
  fadst8x4_row_neon,      // DCT_ADST
  fadst8x4_row_neon,      // ADST_ADST
  fdct8x4_row_neon,       // FLIPADST_DCT
  fadst8x4_row_neon,      // DCT_FLIPADST
  fadst8x4_row_neon,      // FLIPADST_FLIPADST
  fadst8x4_row_neon,      // ADST_FLIPADST
  fadst8x4_row_neon,      // FLIPADST_ADST
  fidentity8x4_row_neon,  // IDTX
  fidentity8x4_row_neon,  // V_DCT
  fdct8x4_row_neon,       // H_DCT
  fidentity8x4_row_neon,  // V_ADST
  fadst8x4_row_neon,      // H_ADST
  fidentity8x4_row_neon,  // V_FLIPADST
  fadst8x4_row_neon       // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_row_rect_neon,       // DCT_DCT
  fdct8x4_row_rect_neon,       // ADST_DCT
  fadst8x4_row_rect_neon,      // DCT_ADST
  fadst8x4_row_rect_neon,      // ADST_ADST
  fdct8x4_row_rect_neon,       // FLIPADST_DCT
  fadst8x4_row_rect_neon,      // DCT_FLIPADST
  fadst8x4_row_rect_neon,      // FLIPADST_FLIPADST
  fadst8x4_row_rect_neon,      // ADST_FLIPADST
  fadst8x4_row_rect_neon,      // FLIPADST_ADST
  fidentity8x4_row_rect_neon,  // IDTX
  fidentity8x4_row_rect_neon,  // V_DCT
  fdct8x4_row_rect_neon,       // H_DCT
  fidentity8x4_row_rect_neon,  // V_ADST
  fadst8x4_row_rect_neon,      // H_ADST
  fidentity8x4_row_rect_neon,  // V_FLIPADST
  fadst8x4_row_rect_neon       // H_FLIPADST
};

static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
  fdct8x4_col_neon,       // DCT_DCT
  fadst8x4_col_neon,      // ADST_DCT
  fdct8x4_col_neon,       // DCT_ADST
  fadst8x4_col_neon,      // ADST_ADST
  fadst8x4_col_neon,      // FLIPADST_DCT
  fdct8x4_col_neon,       // DCT_FLIPADST
  fadst8x4_col_neon,      // FLIPADST_FLIPADST
  fadst8x4_col_neon,      // ADST_FLIPADST
  fadst8x4_col_neon,      // FLIPADST_ADST
  fidentity8x4_col_neon,  // IDTX
  fdct8x4_col_neon,       // V_DCT
  fidentity8x4_col_neon,  // H_DCT
  fadst8x4_col_neon,      // V_ADST
  fidentity8x4_col_neon,  // H_ADST
  fadst8x4_col_neon,      // V_FLIPADST
  fidentity8x4_col_neon   // H_FLIPADST
};

static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = {
  fdct4x8_row_rect_neon,       // DCT_DCT
  fdct4x8_row_rect_neon,       // ADST_DCT
  fadst4x8_row_rect_neon,      // DCT_ADST
  fadst4x8_row_rect_neon,      // ADST_ADST
  fdct4x8_row_rect_neon,       // FLIPADST_DCT
  fadst4x8_row_rect_neon,      // DCT_FLIPADST
  fadst4x8_row_rect_neon,      // FLIPADST_FLIPADST
  fadst4x8_row_rect_neon,      // ADST_FLIPADST
  fadst4x8_row_rect_neon,      // FLIPADST_ADST
  fidentity4x8_row_rect_neon,  // IDTX
  fidentity4x8_row_rect_neon,  // V_DCT
  fdct4x8_row_rect_neon,       // H_DCT
  fidentity4x8_row_rect_neon,  // V_ADST
  fadst4x8_row_rect_neon,      // H_ADST
  fidentity4x8_row_rect_neon,  // V_FLIPADST
  fadst4x8_row_rect_neon       // H_FLIPADST
};

static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_col_neon,       // DCT_DCT
  fadst8x8_col_neon,      // ADST_DCT
  fdct8x8_col_neon,       // DCT_ADST
  fadst8x8_col_neon,      // ADST_ADST
  fadst8x8_col_neon,      // FLIPADST_DCT
  fdct8x8_col_neon,       // DCT_FLIPADST
  fadst8x8_col_neon,      // FLIPADST_FLIPADST
  fadst8x8_col_neon,      // ADST_FLIPADST
  fadst8x8_col_neon,      // FLIPADST_ADST
  fidentity8x8_col_neon,  // IDTX
  fdct8x8_col_neon,       // V_DCT
  fidentity8x8_col_neon,  // H_DCT
  fadst8x8_col_neon,      // V_ADST
  fidentity8x8_col_neon,  // H_ADST
  fadst8x8_col_neon,      // V_FLIPADST
  fidentity8x8_col_neon,  // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_row_neon,       // DCT_DCT
  fdct8x8_row_neon,       // ADST_DCT
  fadst8x8_row_neon,      // DCT_ADST
  fadst8x8_row_neon,      // ADST_ADST
  fdct8x8_row_neon,       // FLIPADST_DCT
  fadst8x8_row_neon,      // DCT_FLIPADST
  fadst8x8_row_neon,      // FLIPADST_FLIPADST
  fadst8x8_row_neon,      // ADST_FLIPADST
  fadst8x8_row_neon,      // FLIPADST_ADST
  fidentity8x8_row_neon,  // IDTX
  fidentity8x8_row_neon,  // V_DCT
  fdct8x8_row_neon,       // H_DCT
  fidentity8x8_row_neon,  // V_ADST
  fadst8x8_row_neon,      // H_ADST
  fidentity8x8_row_neon,  // V_FLIPADST
  fadst8x8_row_neon       // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = {
  fdct8x8_row_rect_neon,       // DCT_DCT
  fdct8x8_row_rect_neon,       // ADST_DCT
  fadst8x8_row_rect_neon,      // DCT_ADST
  fadst8x8_row_rect_neon,      // ADST_ADST
  fdct8x8_row_rect_neon,       // FLIPADST_DCT
  fadst8x8_row_rect_neon,      // DCT_FLIPADST
  fadst8x8_row_rect_neon,      // FLIPADST_FLIPADST
  fadst8x8_row_rect_neon,      // ADST_FLIPADST
  fadst8x8_row_rect_neon,      // FLIPADST_ADST
  fidentity8x8_row_rect_neon,  // IDTX
  fidentity8x8_row_rect_neon,  // V_DCT
  fdct8x8_row_rect_neon,       // H_DCT
  fidentity8x8_row_rect_neon,  // V_ADST
  fadst8x8_row_rect_neon,      // H_ADST
  fidentity8x8_row_rect_neon,  // V_FLIPADST
  fadst8x8_row_rect_neon       // H_FLIPADST
};

static const
col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
  fdct4x16_col_neon,       // DCT_DCT
  fadst4x16_col_neon,      // ADST_DCT
  fdct4x16_col_neon,       // DCT_ADST
  fadst4x16_col_neon,      // ADST_ADST
  fadst4x16_col_neon,      // FLIPADST_DCT
  fdct4x16_col_neon,       // DCT_FLIPADST
  fadst4x16_col_neon,      // FLIPADST_FLIPADST
  fadst4x16_col_neon,      // ADST_FLIPADST
  fadst4x16_col_neon,      // FLIPADST_ADST
  fidentity4x16_col_neon,  // IDTX
  fdct4x16_col_neon,       // V_DCT
  fidentity4x16_col_neon,  // H_DCT
  fadst4x16_col_neon,      // V_ADST
  fidentity4x16_col_neon,  // H_ADST
  fadst4x16_col_neon,      // V_FLIPADST
  fidentity4x16_col_neon   // H_FLIPADST
};

static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
  fdct4x16_row_neon,       // DCT_DCT
  fdct4x16_row_neon,       // ADST_DCT
  fadst4x16_row_neon,      // DCT_ADST
  fadst4x16_row_neon,      // ADST_ADST
  fdct4x16_row_neon,       // FLIPADST_DCT
  fadst4x16_row_neon,      // DCT_FLIPADST
  fadst4x16_row_neon,      // FLIPADST_FLIPADST
  fadst4x16_row_neon,      // ADST_FLIPADST
  fadst4x16_row_neon,      // FLIPADST_ADST
  fidentity4x16_row_neon,  // IDTX
  fidentity4x16_row_neon,  // V_DCT
  fdct4x16_row_neon,       // H_DCT
  fidentity4x16_row_neon,  // V_ADST
  fadst4x16_row_neon,      // H_ADST
  fidentity4x16_row_neon,  // V_FLIPADST
  fadst4x16_row_neon       // H_FLIPADST
};

static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_col_neon,       // DCT_DCT
  fadst8x16_col_neon,      // ADST_DCT
  fdct8x16_col_neon,       // DCT_ADST
  fadst8x16_col_neon,      // ADST_ADST
  fadst8x16_col_neon,      // FLIPADST_DCT
  fdct8x16_col_neon,       // DCT_FLIPADST
  fadst8x16_col_neon,      // FLIPADST_FLIPADST
  fadst8x16_col_neon,      // ADST_FLIPADST
  fadst8x16_col_neon,      // FLIPADST_ADST
  fidentity8x16_col_neon,  // IDTX
  fdct8x16_col_neon,       // V_DCT
  fidentity8x16_col_neon,  // H_DCT
  fadst8x16_col_neon,      // V_ADST
  fidentity8x16_col_neon,  // H_ADST
  fadst8x16_col_neon,      // V_FLIPADST
  fidentity8x16_col_neon   // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_row_neon,       // DCT_DCT
  fdct8x16_row_neon,       // ADST_DCT
  fadst8x16_row_neon,      // DCT_ADST
  fadst8x16_row_neon,      // ADST_ADST
  fdct8x16_row_neon,       // FLIPADST_DCT
  fadst8x16_row_neon,      // DCT_FLIPADST
  fadst8x16_row_neon,      // FLIPADST_FLIPADST
  fadst8x16_row_neon,      // ADST_FLIPADST
  fadst8x16_row_neon,      // FLIPADST_ADST
  fidentity8x16_row_neon,  // IDTX
  fidentity8x16_row_neon,  // V_DCT
  fdct8x16_row_neon,       // H_DCT
  fidentity8x16_row_neon,  // V_ADST
  fadst8x16_row_neon,      // H_ADST
  fidentity8x16_row_neon,  // V_FLIPADST
  fadst8x16_row_neon       // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = {
  fdct8x16_row_rect_neon,       // DCT_DCT
  fdct8x16_row_rect_neon,       // ADST_DCT
  fadst8x16_row_rect_neon,      // DCT_ADST
  fadst8x16_row_rect_neon,      // ADST_ADST
  fdct8x16_row_rect_neon,       // FLIPADST_DCT
  fadst8x16_row_rect_neon,      // DCT_FLIPADST
  fadst8x16_row_rect_neon,      // FLIPADST_FLIPADST
  fadst8x16_row_rect_neon,      // ADST_FLIPADST
  fadst8x16_row_rect_neon,      // FLIPADST_ADST
  fidentity8x16_row_rect_neon,  // IDTX
  fidentity8x16_row_rect_neon,  // V_DCT
  fdct8x16_row_rect_neon,       // H_DCT
  fidentity8x16_row_rect_neon,  // V_ADST
  fadst8x16_row_rect_neon,      // H_ADST
  fidentity8x16_row_rect_neon,  // V_FLIPADST
  fadst8x16_row_rect_neon       // H_FLIPADST
};

// 32-point tables: AV1 has no 32-point ADST, so any type requiring ADST on
// the 32-length dimension has no NEON kernel (NULL); callers must check and
// fall back to the C implementation.
static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
  fdct8x32_row_neon,       // DCT_DCT
  NULL,                    // ADST_DCT
  NULL,                    // DCT_ADST
  NULL,                    // ADST_ADST
  NULL,                    // FLIPADST_DCT
  NULL,                    // DCT_FLIPADST
  NULL,                    // FLIPADST_FLIPADST
  NULL,                    // ADST_FLIPADST
  NULL,                    // FLIPADST_ADST
  fidentity8x32_row_neon,  // IDTX
  fidentity8x32_row_neon,  // V_DCT
  fdct8x32_row_neon,       // H_DCT
  NULL,                    // V_ADST
  NULL,                    // H_ADST
  NULL,                    // V_FLIPADST
  NULL                     // H_FLIPADST
};

static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = {
  fdct8x32_row_rect_neon,       // DCT_DCT
  NULL,                         // ADST_DCT
  NULL,                         // DCT_ADST
  NULL,                         // ADST_ADST
  NULL,                         // FLIPADST_DCT
  NULL,                         // DCT_FLIPADST
  NULL,                         // FLIPADST_FLIPADST
  NULL,                         // ADST_FLIPADST
  NULL,                         // FLIPADST_ADST
  fidentity8x32_row_rect_neon,  // IDTX
  fidentity8x32_row_rect_neon,  // V_DCT
  fdct8x32_row_rect_neon,       // H_DCT
  NULL,                         // V_ADST
  NULL,                         // H_ADST
  NULL,                         // V_FLIPADST
  NULL                          // H_FLIPADST
};

static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
  fdct8x32_col_neon,       // DCT_DCT
  NULL,                    // ADST_DCT
  NULL,                    // DCT_ADST
  NULL,                    // ADST_ADST
  NULL,                    // FLIPADST_DCT
  NULL,                    // DCT_FLIPADST
  NULL,                    // FLIPADST_FLIPADST
  NULL,                    // ADST_FLIPADST
  NULL,                    // FLIPADST_ADST
  fidentity8x32_col_neon,  // IDTX
  fdct8x32_col_neon,       // V_DCT
  fidentity8x32_col_neon,  // H_DCT
  NULL,                    // V_ADST
  NULL,                    // H_ADST
  NULL,                    // V_FLIPADST
  NULL                     // H_FLIPADST
};

// 4x4 low-bitdepth 2D forward transform: column transform, 4x4 transpose,
// then row transform straight to the int32 output. UD flips are handled by
// reversing the input rows, LR flips by reversing the vector buffer before
// the row pass.
static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);

  int16x4_t buf0[4], buf1[4];
  switch (tx_type) {
    case DCT_DCT:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case ADST_DCT:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case DCT_ADST:
      fdct4x4_col_neon(input, buf0,
stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case ADST_ADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case FLIPADST_DCT:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case DCT_FLIPADST:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      // Horizontal flip before the row pass implements the FLIPADST rows.
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
    case FLIPADST_FLIPADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
    case ADST_FLIPADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
    case FLIPADST_ADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case IDTX:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case V_DCT:
      fdct4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case H_DCT:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fdct4x4_row_neon(buf1, output, 4, 13);
      break;
    case V_ADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case H_ADST:
      fidentity4x4_col_neon(input,
buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fadst4x4_row_neon(buf1, output, 4, 13);
      break;
    case V_FLIPADST:
      fadst4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      fidentity4x4_row_neon(buf1, output, 4, 13);
      break;
    case H_FLIPADST:
      fidentity4x4_col_neon(input, buf0, stride, 13);
      transpose_arrays_s16_4x4(buf0, buf1);
      flip_buf_4_neon(buf1, buf0, 4);
      fadst4x4_row_neon(buf0, output, 4, 13);
      break;
  }
}

// 4x8 (width 4, height 8): length-8 column transform on 4-lane vectors,
// intermediate rounding shift, transpose, then a rectangular-scaled length-4
// row transform on 8-lane vectors.
static void lowbd_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x4_t buf0[8];
  int16x8_t buf1[8];
  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
  col_txfm(input, buf0, stride, 13);
  shift_right_1_round_s16_x4(buf0, buf0, 8);
  transpose_arrays_s16_4x8(buf0, buf1);

  if (lr_flip) {
    int16x8_t buf2[8];
    flip_buf_8_neon(buf1, buf2, 4);
    row_txfm(buf2, output, 8, 13);
  } else {
    row_txfm(buf1, output, 8, 13);
  }
}

// 4x16 (width 4, height 16): length-16 column transform, then two batches of
// length-4 row transforms over the transposed halves.
static void lowbd_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x4_t buf0[16];
  int16x8_t buf1[16];
  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
  col_txfm(input, buf0, stride, 13);
  shift_right_1_round_s16_x4(buf0, buf0, 16);
  transpose_arrays_s16_4x8(buf0, buf1);
  transpose_arrays_s16_4x8(buf0 + 8, buf1
+ 8);

  for (int i = 0; i < 2; i++) {
    if (lr_flip) {
      int16x8_t buf2[16];
      flip_buf_8_neon(buf1 + 8 * i, buf2, 4);
      row_txfm(buf2, output + 8 * i, 16, 12);
    } else {
      int16x8_t *buf = buf1 + 8 * i;
      row_txfm(buf, output + 8 * i, 16, 12);
    }
  }
}

// 8x4 (width 8, height 4): length-4 column transform on 8-lane vectors, then
// a rectangular-scaled length-8 row transform on 4-lane vectors.
static void lowbd_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[8];
  int16x4_t buf1[8];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
  const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
  col_txfm(input, buf0, stride, 13);
  shift_right_1_round_s16_x8(buf0, buf0, 4);
  transpose_arrays_s16_8x4(buf0, buf1);

  if (lr_flip) {
    int16x4_t buf2[8];
    flip_buf_4_neon(buf1, buf2, 8);
    row_txfm(buf2, output, 4, 13);
  } else {
    row_txfm(buf1, output, 4, 13);
  }
}

// 8x8: fully unrolled per-type dispatch, column pass -> rounding shift ->
// transpose -> row pass (with a horizontal flip for *_FLIPADST row types).
static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output,
                                      int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  int16x8_t buf0[8], buf1[8];

  switch (tx_type) {
    case DCT_DCT:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case ADST_DCT:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case DCT_ADST:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0,
8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case ADST_ADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case FLIPADST_DCT:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case DCT_FLIPADST:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      // Horizontal flip before the row pass implements the FLIPADST rows.
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
    case FLIPADST_FLIPADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
    case ADST_FLIPADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
    case FLIPADST_ADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case IDTX:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case V_DCT:
      fdct8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case H_DCT:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fdct8x8_row_neon(buf1, output, 8, 13);
      break;
    case V_ADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case H_ADST:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fadst8x8_row_neon(buf1, output, 8, 13);
      break;
    case V_FLIPADST:
      fadst8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      fidentity8x8_row_neon(buf1, output, 8, 13);
      break;
    case H_FLIPADST:
      fidentity8x8_col_neon(input, buf0, stride, 13);
      shift_right_1_round_s16_x8(buf0, buf0, 8);
      transpose_arrays_s16_8x8(buf0, buf1);
      flip_buf_8_neon(buf1, buf0, 8);
      fadst8x8_row_neon(buf0, output, 8, 13);
      break;
  }
}

// 8x16: one length-16 column pass, then two 8x8-transposed halves fed to the
// rectangular-scaled length-8 row transform.
static void lowbd_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
  col_txfm(input, buf0, stride, 13);
  shift_right_2_round_s16_x8(buf0, buf0, 16);
  transpose_arrays_s16_8x8(buf0, buf1);
  transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);

  for (int i = 0; i < 2; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
      row_txfm(buf0, output + 8 * i, 16, 13);
    } else {
      int16x8_t *buf = buf1 + 8 *
i;
      row_txfm(buf, output + 8 * i, 16, 13);
    }
  }
}

// 8x32: length-32 column pass (NEON kernels exist only for DCT/IDTX types in
// col_txfm8x32_arr; other types are presumably never selected for this size
// -- NOTE(review): no NULL fallback here, unlike the 16x32/32x16/32x32
// drivers -- confirm against the allowed tx_type set).
static void lowbd_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[32];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
  col_txfm(input, buf0, stride, 12);
  shift_right_2_round_s16_x8(buf0, buf0, 32);
  transpose_arrays_s16_8x8(buf0, buf1);
  transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
  transpose_arrays_s16_8x8(buf0 + 16, buf1 + 16);
  transpose_arrays_s16_8x8(buf0 + 24, buf1 + 24);

  for (int i = 0; i < 4; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
      row_txfm(buf0, output + 8 * i, 32, 12);
    } else {
      int16x8_t *buf = buf1 + 8 * i;
      row_txfm(buf, output + 8 * i, 32, 12);
    }
  }
}

// 16x4: two 8-wide column batches, then one length-16 row transform on
// 4-lane vectors.
static void lowbd_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16];
  int16x4_t buf1[16];
  int16x4_t buf2[16];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
  const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
  for (int i = 0; i < 2; i++) {
    col_txfm(input + 8 * i, buf0, stride, 13);
    shift_right_1_round_s16_x8(buf0, buf0, 4);
    transpose_arrays_s16_8x4(buf0, buf1 + 8 * i);
  }

  if (lr_flip) {
    flip_buf_4_neon(buf1, buf2, 16);
    row_txfm(buf2, output, 4, 13);
  } else {
    row_txfm(buf1, output, 4, 13);
  }
}

static void
lowbd_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *output,
                           int stride, TX_TYPE tx_type, int bd) {
  // 16x8: two 8-wide column batches of a length-8 transform, then one
  // rectangular-scaled length-16 row pass over the transposed buffer.
  (void)bd;
  int16x8_t buf0[16], buf1[16];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
  for (int i = 0; i < 2; i++) {
    col_txfm(input + 8 * i, buf0, stride, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 8);
    transpose_arrays_s16_8x8(buf0, buf1 + 8 * i);
  }

  if (lr_flip) {
    flip_buf_8_neon(buf1, buf0, 16);
    row_txfm(buf0, output, 8, 13);
  } else {
    row_txfm(buf1, output, 8, 13);
  }
}

// 16x16: two 8-wide column batches of a length-16 transform, then two
// length-16 row passes over the 8x8-transposed tiles.
static void lowbd_fwd_txfm2d_16x16_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[16], buf1[32];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
  int ud_flip, lr_flip;

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
  for (int i = 0; i < 2; i++) {
    col_txfm(input + 8 * i, buf0, stride, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 16);
    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i);
  }

  for (int i = 0; i < 2; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
      row_txfm(buf0, output + 8 * i, 16, 12);
    } else {
      int16x8_t *buf = buf1 + 16 * i;
      row_txfm(buf, output + 8 * i, 16, 12);
    }
  }
}

// 16x32: length-32 column pass (DCT/IDTX only in NEON; anything else falls
// back to the C implementation), then rectangular-scaled length-16 row
// passes.
static void lowbd_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32],
buf1[64];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];

  // No NEON 32-point kernel for this tx_type: use the matching C version.
  if (col_txfm == NULL || row_txfm == NULL) {
    av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
    return;
  }

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
  for (int i = 0; i < 2; i++) {
    col_txfm(input + 8 * i, buf0, stride, 12);
    shift_right_4_round_s16_x8(buf0, buf0, 32);
    transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 16 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 16 + 8 * i);
  }

  for (int i = 0; i < 4; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
      row_txfm(buf0, output + 8 * i, 32, 13);
    } else {
      int16x8_t *buf = buf1 + 16 * i;
      row_txfm(buf, output + 8 * i, 32, 13);
    }
  }
}

// 32x8: four 8-wide column batches of a length-8 transform, then one
// length-32 row pass (DCT/IDTX only in NEON; anything else falls back to the
// C implementation).
static void lowbd_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *output,
                                       int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[32];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];

  if (col_txfm == NULL || row_txfm == NULL) {
    // Bug fix: fall back to the matching 32x8 C transform. The previous code
    // called av1_fwd_txfm2d_32x16_c here, which computes the wrong transform
    // and writes 32x16 = 512 coefficients into this function's 32x8 = 256
    // coefficient output buffer.
    av1_fwd_txfm2d_32x8_c(input, output, stride, tx_type, bd);
    return;
  }

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
  for (int i = 0; i < 4; i++) {
    col_txfm(input + 8 * i, buf0, stride, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 8);
    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
  }

  if (lr_flip) {
    flip_buf_8_neon(buf1,
buf0, 32);
    row_txfm(buf0, output, 8, 12);
  } else {
    row_txfm(buf1, output, 8, 12);
  }
}

// 32x16: four 8-wide column batches of a length-16 transform, then two
// rectangular-scaled length-32 row passes (DCT/IDTX only in NEON; anything
// else falls back to the C implementation).
static void lowbd_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[64];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];

  if (col_txfm == NULL || row_txfm == NULL) {
    av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
    return;
  }

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
  for (int i = 0; i < 4; i++) {
    col_txfm(input + 8 * i, buf0, stride, 13);
    shift_right_4_round_s16_x8(buf0, buf0, 16);
    transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i);
  }

  for (int i = 0; i < 2; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
      row_txfm(buf0, output + 8 * i, 16, 13);
    } else {
      int16x8_t *buf = buf1 + 32 * i;
      row_txfm(buf, output + 8 * i, 16, 13);
    }
  }
}

// 32x32: four 8-wide column batches of a length-32 transform, then four
// length-32 row passes (DCT/IDTX only in NEON; anything else falls back to
// the C implementation).
static void lowbd_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  int16x8_t buf0[32], buf1[128];
  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];

  if (col_txfm == NULL || row_txfm == NULL) {
    av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
    return;
  }

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
  for (int i = 0; i < 4; i++) {
    col_txfm(input + 8 * i, buf0, stride, 12);
    shift_right_4_round_s16_x8(buf0, buf0, 32);
    transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 2 * 8, buf1 + 2 * 32 + 8 * i);
    transpose_arrays_s16_8x8(buf0 + 3 * 8, buf1 + 3 * 32 + 8 * i);
  }

  for (int i = 0; i < 4; i++) {
    if (lr_flip) {
      flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
      row_txfm(buf0, output + 8 * i, 32, 12);
    } else {
      int16x8_t *buf = buf1 + 32 * i;
      row_txfm(buf, output + 8 * i, 32, 12);
    }
  }
}

// 64x16 (DCT_DCT only): length-16 column pass over eight 8-wide batches,
// length-64 row pass; only the low-frequency top half of the row output is
// kept, the rest is zeroed below.
static void lowbd_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  int16x8_t buf0[64], buf1[128];
  const transform_1d_lbd_8_neon col_txfm = fdct8x16_neon;
  const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon;

  for (int i = 0; i < 8; i++) {
    load_buffer_s16_x8(input + 8 * i, stride, buf0, 16);
    shift_left_2_s16_x8(buf0, buf0, 16);
    col_txfm(buf0, buf0, 13);
    shift_right_4_round_s16_x8(buf0, buf0, 16);
    for (int j = 0; j < 2; ++j) {
      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
    }
  }

  for (int i = 0; i < 2; i++) {
    int16x8_t *buf = buf1 + 64 * i;
    row_txfm(buf, buf, 12);
    store_buffer_s16_x8(buf, output + 8 * i, 16, 32);
  }
  // Zero out the bottom 16x32 area.
  memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
}

// 16x64 (DCT_DCT only): length-64 column pass over two 8-wide batches, then
// length-16 row passes over the first four transposed strips (only the top
// 32 of 64 column outputs are consumed).
static void lowbd_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  assert(tx_type == DCT_DCT);
  int16x8_t buf0[64], buf1[128];
  const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
  const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon;

  for (int i = 0; i < 2; i++) {
    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
    col_txfm(buf0, buf0, 13);
    shift_right_2_round_s16_x8(buf0, buf0, 64);
    for (int j = 0; j < 8; ++j) {
      transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 16 + 8 * i);
    }
  }

  for (int i = 0; i < 4; i++) {
    int16x8_t *buf = buf1 + 16 * i;
    row_txfm(buf, buf, 12);
    store_buffer_s16_x8(buf, output + 8 * i, 32, 16);
  }
}

// 32-point forward DCT on 4-lane int32 vectors. The cospi table is loaded in
// 4-element Q2.13 tuples (see the butterfly helper comment at the top of the
// file).
static void fdct32_neon(const int32x4_t *input, int32x4_t *output,
                        int cos_bit) {
  const int16_t *cospi = cospi_arr_q13(cos_bit);

  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);

  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
  const
int16x4_t cospi2 = vget_low_s16(cospi2_6); 2529 const int16x4_t cospi6 = vget_high_s16(cospi2_6); 2530 const int16x4_t cospi10 = vget_low_s16(cospi10_14); 2531 const int16x4_t cospi14 = vget_high_s16(cospi10_14); 2532 const int16x4_t cospi18 = vget_low_s16(cospi18_22); 2533 const int16x4_t cospi22 = vget_high_s16(cospi18_22); 2534 const int16x4_t cospi26 = vget_low_s16(cospi26_30); 2535 const int16x4_t cospi30 = vget_high_s16(cospi26_30); 2536 2537 int32x4_t buf0[32]; 2538 int32x4_t buf1[32]; 2539 2540 // stage 1 2541 butterfly_dct_pre_s32_x4(input, buf1, 32); 2542 2543 // stage 2 2544 butterfly_dct_pre_s32_x4(buf1, buf0, 16); 2545 buf0[16] = buf1[16]; 2546 buf0[17] = buf1[17]; 2547 buf0[18] = buf1[18]; 2548 buf0[19] = buf1[19]; 2549 butterfly_s32_s32_x4_0112_neon(cospi32, buf1[27], buf1[20], &buf0[27], 2550 &buf0[20]); 2551 butterfly_s32_s32_x4_0112_neon(cospi32, buf1[26], buf1[21], &buf0[26], 2552 &buf0[21]); 2553 butterfly_s32_s32_x4_0112_neon(cospi32, buf1[25], buf1[22], &buf0[25], 2554 &buf0[22]); 2555 butterfly_s32_s32_x4_0112_neon(cospi32, buf1[24], buf1[23], &buf0[24], 2556 &buf0[23]); 2557 buf0[28] = buf1[28]; 2558 buf0[29] = buf1[29]; 2559 buf0[30] = buf1[30]; 2560 buf0[31] = buf1[31]; 2561 2562 // stage 3 2563 butterfly_dct_pre_s32_x4(buf0, buf1, 8); 2564 buf1[8] = buf0[8]; 2565 buf1[9] = buf0[9]; 2566 butterfly_s32_s32_x4_0112_neon(cospi32, buf0[13], buf0[10], &buf1[13], 2567 &buf1[10]); 2568 butterfly_s32_s32_x4_0112_neon(cospi32, buf0[12], buf0[11], &buf1[12], 2569 &buf1[11]); 2570 buf1[14] = buf0[14]; 2571 buf1[15] = buf0[15]; 2572 butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 16); 2573 2574 // stage 4 2575 butterfly_dct_pre_s32_x4(buf1, buf0, 4); 2576 buf0[4] = buf1[4]; 2577 butterfly_s32_s32_x4_0112_neon(cospi32, buf1[6], buf1[5], &buf0[6], &buf0[5]); 2578 buf0[7] = buf1[7]; 2579 butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 8); 2580 buf0[16] = buf1[16]; 2581 buf0[17] = buf1[17]; 2582 butterfly_s32_s32_x4_0112_neon(cospi16, 
buf1[29], buf1[18], &buf0[29], 2583 &buf0[18]); 2584 butterfly_s32_s32_x4_0112_neon(cospi16, buf1[28], buf1[19], &buf0[28], 2585 &buf0[19]); 2586 butterfly_s32_s32_x4_1223_neon(cospi16, buf1[27], buf1[20], &buf0[27], 2587 &buf0[20]); 2588 butterfly_s32_s32_x4_1223_neon(cospi16, buf1[26], buf1[21], &buf0[26], 2589 &buf0[21]); 2590 buf0[22] = buf1[22]; 2591 buf0[23] = buf1[23]; 2592 buf0[24] = buf1[24]; 2593 buf0[25] = buf1[25]; 2594 buf0[30] = buf1[30]; 2595 buf0[31] = buf1[31]; 2596 2597 // stage 5 2598 butterfly_s32_s32_x4_0112_neon(cospi32, buf0[0], buf0[1], &buf1[0], &buf1[1]); 2599 butterfly_s32_s32_x4_0112_neon(cospi16, buf0[3], buf0[2], &buf1[2], &buf1[3]); 2600 butterfly_dct_post_s32_x4(buf0 + 4, buf0 + 4, buf1 + 4, 4); 2601 buf1[8] = buf0[8]; 2602 butterfly_s32_s32_x4_0112_neon(cospi16, buf0[14], buf0[9], &buf1[14], 2603 &buf1[9]); 2604 butterfly_s32_s32_x4_1223_neon(cospi16, buf0[13], buf0[10], &buf1[13], 2605 &buf1[10]); 2606 buf1[11] = buf0[11]; 2607 buf1[12] = buf0[12]; 2608 buf1[15] = buf0[15]; 2609 butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 8); 2610 butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 8); 2611 2612 // stage 6 2613 buf0[0] = buf1[0]; 2614 buf0[1] = buf1[1]; 2615 buf0[2] = buf1[2]; 2616 buf0[3] = buf1[3]; 2617 butterfly_s32_s32_x4_0112_neon(cospi8, buf1[7], buf1[4], &buf0[4], &buf0[7]); 2618 butterfly_s32_s32_x4_1003_neon(cospi24, buf1[6], buf1[5], &buf0[5], &buf0[6]); 2619 butterfly_dct_post_s32_x4(buf1 + 8, buf1 + 8, buf0 + 8, 4); 2620 butterfly_dct_post_s32_x4(buf1 + 12, buf1 + 12, buf0 + 12, 4); 2621 buf0[16] = buf1[16]; 2622 butterfly_s32_s32_x4_0112_neon(cospi8, buf1[30], buf1[17], &buf0[30], 2623 &buf0[17]); 2624 butterfly_s32_s32_x4_1223_neon(cospi8, buf1[29], buf1[18], &buf0[29], 2625 &buf0[18]); 2626 buf0[19] = buf1[19]; 2627 buf0[20] = buf1[20]; 2628 butterfly_s32_s32_x4_1003_neon(cospi24, buf1[26], buf1[21], &buf0[26], 2629 &buf0[21]); 2630 butterfly_s32_s32_x4_0332_neon(cospi24, buf1[25], buf1[22], 
&buf0[25], 2631 &buf0[22]); 2632 buf0[23] = buf1[23]; 2633 buf0[24] = buf1[24]; 2634 buf0[27] = buf1[27]; 2635 buf0[28] = buf1[28]; 2636 buf0[31] = buf1[31]; 2637 2638 // stage 7 2639 buf1[0] = buf0[0]; 2640 buf1[1] = buf0[1]; 2641 buf1[2] = buf0[2]; 2642 buf1[3] = buf0[3]; 2643 buf1[4] = buf0[4]; 2644 buf1[5] = buf0[5]; 2645 buf1[6] = buf0[6]; 2646 buf1[7] = buf0[7]; 2647 butterfly_s32_s32_x4_0112_neon(cospi4, buf0[15], buf0[8], &buf1[8], 2648 &buf1[15]); 2649 butterfly_s32_s32_x4_1003_neon(cospi28, buf0[14], buf0[9], &buf1[9], 2650 &buf1[14]); 2651 butterfly_s32_s32_x4_0112_neon(cospi20, buf0[13], buf0[10], &buf1[10], 2652 &buf1[13]); 2653 butterfly_s32_s32_x4_1003_neon(cospi12, buf0[12], buf0[11], &buf1[11], 2654 &buf1[12]); 2655 butterfly_dct_post_s32_x4(buf0 + 16, buf0 + 16, buf1 + 16, 4); 2656 butterfly_dct_post_s32_x4(buf0 + 20, buf0 + 20, buf1 + 20, 4); 2657 butterfly_dct_post_s32_x4(buf0 + 24, buf0 + 24, buf1 + 24, 4); 2658 butterfly_dct_post_s32_x4(buf0 + 28, buf0 + 28, buf1 + 28, 4); 2659 2660 // stage 8 2661 buf0[0] = buf1[0]; 2662 buf0[1] = buf1[1]; 2663 buf0[2] = buf1[2]; 2664 buf0[3] = buf1[3]; 2665 buf0[4] = buf1[4]; 2666 buf0[5] = buf1[5]; 2667 buf0[6] = buf1[6]; 2668 buf0[7] = buf1[7]; 2669 buf0[8] = buf1[8]; 2670 buf0[9] = buf1[9]; 2671 buf0[10] = buf1[10]; 2672 buf0[11] = buf1[11]; 2673 buf0[12] = buf1[12]; 2674 buf0[13] = buf1[13]; 2675 buf0[14] = buf1[14]; 2676 buf0[15] = buf1[15]; 2677 butterfly_s32_s32_x4_0112_neon(cospi2, buf1[31], buf1[16], &buf0[16], 2678 &buf0[31]); 2679 butterfly_s32_s32_x4_1003_neon(cospi30, buf1[30], buf1[17], &buf0[17], 2680 &buf0[30]); 2681 butterfly_s32_s32_x4_0112_neon(cospi18, buf1[29], buf1[18], &buf0[18], 2682 &buf0[29]); 2683 butterfly_s32_s32_x4_1003_neon(cospi14, buf1[28], buf1[19], &buf0[19], 2684 &buf0[28]); 2685 butterfly_s32_s32_x4_0112_neon(cospi10, buf1[27], buf1[20], &buf0[20], 2686 &buf0[27]); 2687 butterfly_s32_s32_x4_1003_neon(cospi22, buf1[26], buf1[21], &buf0[21], 2688 &buf0[26]); 2689 
butterfly_s32_s32_x4_0112_neon(cospi26, buf1[25], buf1[22], &buf0[22], 2690 &buf0[25]); 2691 butterfly_s32_s32_x4_1003_neon(cospi6, buf1[24], buf1[23], &buf0[23], 2692 &buf0[24]); 2693 2694 // stage 9 2695 output[0] = buf0[0]; 2696 output[1] = buf0[16]; 2697 output[2] = buf0[8]; 2698 output[3] = buf0[24]; 2699 output[4] = buf0[4]; 2700 output[5] = buf0[20]; 2701 output[6] = buf0[12]; 2702 output[7] = buf0[28]; 2703 output[8] = buf0[2]; 2704 output[9] = buf0[18]; 2705 output[10] = buf0[10]; 2706 output[11] = buf0[26]; 2707 output[12] = buf0[6]; 2708 output[13] = buf0[22]; 2709 output[14] = buf0[14]; 2710 output[15] = buf0[30]; 2711 output[16] = buf0[1]; 2712 output[17] = buf0[17]; 2713 output[18] = buf0[9]; 2714 output[19] = buf0[25]; 2715 output[20] = buf0[5]; 2716 output[21] = buf0[21]; 2717 output[22] = buf0[13]; 2718 output[23] = buf0[29]; 2719 output[24] = buf0[3]; 2720 output[25] = buf0[19]; 2721 output[26] = buf0[11]; 2722 output[27] = buf0[27]; 2723 output[28] = buf0[7]; 2724 output[29] = buf0[23]; 2725 output[30] = buf0[15]; 2726 output[31] = buf0[31]; 2727 } 2728 2729 static void fdct64_neon(const int32x4_t *input, int32x4_t *output, 2730 int cos_bit) { 2731 const int16_t *cospi = cospi_arr_q13(cos_bit); 2732 2733 const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); 2734 const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]); 2735 const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]); 2736 const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]); 2737 const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]); 2738 const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]); 2739 const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]); 2740 const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]); 2741 const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]); 2742 const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]); 2743 const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]); 2744 const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]); 2745 const int16x8_t cospi17_19 = 
vld1q_s16(&cospi[4 * 24]); 2746 const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]); 2747 const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]); 2748 const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]); 2749 2750 const int16x4_t cospi32 = vget_low_s16(cospi32_16); 2751 const int16x4_t cospi16 = vget_high_s16(cospi32_16); 2752 const int16x4_t cospi8 = vget_low_s16(cospi8_24); 2753 const int16x4_t cospi24 = vget_high_s16(cospi8_24); 2754 const int16x4_t cospi4 = vget_low_s16(cospi4_12); 2755 const int16x4_t cospi12 = vget_high_s16(cospi4_12); 2756 const int16x4_t cospi20 = vget_low_s16(cospi20_28); 2757 const int16x4_t cospi28 = vget_high_s16(cospi20_28); 2758 const int16x4_t cospi2 = vget_low_s16(cospi2_6); 2759 const int16x4_t cospi6 = vget_high_s16(cospi2_6); 2760 const int16x4_t cospi10 = vget_low_s16(cospi10_14); 2761 const int16x4_t cospi14 = vget_high_s16(cospi10_14); 2762 const int16x4_t cospi18 = vget_low_s16(cospi18_22); 2763 const int16x4_t cospi22 = vget_high_s16(cospi18_22); 2764 const int16x4_t cospi26 = vget_low_s16(cospi26_30); 2765 const int16x4_t cospi30 = vget_high_s16(cospi26_30); 2766 const int16x4_t cospi1 = vget_low_s16(cospi1_3); 2767 const int16x4_t cospi3 = vget_high_s16(cospi1_3); 2768 const int16x4_t cospi5 = vget_low_s16(cospi5_7); 2769 const int16x4_t cospi7 = vget_high_s16(cospi5_7); 2770 const int16x4_t cospi9 = vget_low_s16(cospi9_11); 2771 const int16x4_t cospi11 = vget_high_s16(cospi9_11); 2772 const int16x4_t cospi13 = vget_low_s16(cospi13_15); 2773 const int16x4_t cospi15 = vget_high_s16(cospi13_15); 2774 const int16x4_t cospi17 = vget_low_s16(cospi17_19); 2775 const int16x4_t cospi19 = vget_high_s16(cospi17_19); 2776 const int16x4_t cospi21 = vget_low_s16(cospi21_23); 2777 const int16x4_t cospi23 = vget_high_s16(cospi21_23); 2778 const int16x4_t cospi25 = vget_low_s16(cospi25_27); 2779 const int16x4_t cospi27 = vget_high_s16(cospi25_27); 2780 const int16x4_t cospi29 = vget_low_s16(cospi29_31); 2781 const int16x4_t cospi31 
= vget_high_s16(cospi29_31); 2782 2783 // stage 1 2784 int32x4_t x1[64]; 2785 butterfly_dct_pre_s32_x4(input, x1, 64); 2786 2787 // stage 2 2788 int32x4_t x2[64]; 2789 butterfly_dct_pre_s32_x4(x1, x2, 32); 2790 butterfly_s32_s32_x4_0112_neon(cospi32, x1[55], x1[40], &x2[55], &x2[40]); 2791 butterfly_s32_s32_x4_0112_neon(cospi32, x1[54], x1[41], &x2[54], &x2[41]); 2792 butterfly_s32_s32_x4_0112_neon(cospi32, x1[53], x1[42], &x2[53], &x2[42]); 2793 butterfly_s32_s32_x4_0112_neon(cospi32, x1[52], x1[43], &x2[52], &x2[43]); 2794 butterfly_s32_s32_x4_0112_neon(cospi32, x1[51], x1[44], &x2[51], &x2[44]); 2795 butterfly_s32_s32_x4_0112_neon(cospi32, x1[50], x1[45], &x2[50], &x2[45]); 2796 butterfly_s32_s32_x4_0112_neon(cospi32, x1[49], x1[46], &x2[49], &x2[46]); 2797 butterfly_s32_s32_x4_0112_neon(cospi32, x1[48], x1[47], &x2[48], &x2[47]); 2798 2799 // stage 3 2800 int32x4_t x3[64]; 2801 butterfly_dct_pre_s32_x4(x2, x3, 16); 2802 butterfly_s32_s32_x4_0112_neon(cospi32, x2[27], x2[20], &x3[27], &x3[20]); 2803 butterfly_s32_s32_x4_0112_neon(cospi32, x2[26], x2[21], &x3[26], &x3[21]); 2804 butterfly_s32_s32_x4_0112_neon(cospi32, x2[25], x2[22], &x3[25], &x3[22]); 2805 butterfly_s32_s32_x4_0112_neon(cospi32, x2[24], x2[23], &x3[24], &x3[23]); 2806 butterfly_dct_post_s32_x4(x1 + 32, x2 + 32, x3 + 32, 32); 2807 2808 // stage 4 2809 int32x4_t x4[64]; 2810 butterfly_dct_pre_s32_x4(x3, x4, 8); 2811 butterfly_s32_s32_x4_0112_neon(cospi32, x3[13], x3[10], &x4[13], &x4[10]); 2812 butterfly_s32_s32_x4_0112_neon(cospi32, x3[12], x3[11], &x4[12], &x4[11]); 2813 butterfly_dct_post_s32_x4(x2 + 16, x3 + 16, x4 + 16, 16); 2814 butterfly_s32_s32_x4_0112_neon(cospi16, x3[59], x3[36], &x4[59], &x4[36]); 2815 butterfly_s32_s32_x4_0112_neon(cospi16, x3[58], x3[37], &x4[58], &x4[37]); 2816 butterfly_s32_s32_x4_0112_neon(cospi16, x3[57], x3[38], &x4[57], &x4[38]); 2817 butterfly_s32_s32_x4_0112_neon(cospi16, x3[56], x3[39], &x4[56], &x4[39]); 2818 butterfly_s32_s32_x4_1223_neon(cospi16, x3[55], 
x3[40], &x4[55], &x4[40]); 2819 butterfly_s32_s32_x4_1223_neon(cospi16, x3[54], x3[41], &x4[54], &x4[41]); 2820 butterfly_s32_s32_x4_1223_neon(cospi16, x3[53], x3[42], &x4[53], &x4[42]); 2821 butterfly_s32_s32_x4_1223_neon(cospi16, x3[52], x3[43], &x4[52], &x4[43]); 2822 2823 // stage 5 2824 int32x4_t x5[64]; 2825 butterfly_dct_pre_s32_x4(x4, x5, 4); 2826 butterfly_s32_s32_x4_0112_neon(cospi32, x4[6], x4[5], &x5[6], &x5[5]); 2827 butterfly_dct_post_s32_x4(x3 + 8, x4 + 8, x5 + 8, 8); 2828 butterfly_s32_s32_x4_0112_neon(cospi16, x4[29], x4[18], &x5[29], &x5[18]); 2829 butterfly_s32_s32_x4_0112_neon(cospi16, x4[28], x4[19], &x5[28], &x5[19]); 2830 butterfly_s32_s32_x4_1223_neon(cospi16, x4[27], x4[20], &x5[27], &x5[20]); 2831 butterfly_s32_s32_x4_1223_neon(cospi16, x4[26], x4[21], &x5[26], &x5[21]); 2832 butterfly_dct_post_s32_x4(x3 + 32, x4 + 32, x5 + 32, 16); 2833 butterfly_dct_post_s32_x4(x3 + 48, x4 + 48, x5 + 48, 16); 2834 2835 // stage 6 2836 int32x4_t x6[64]; 2837 butterfly_s32_s32_x4_0112_neon(cospi32, x5[0], x5[1], &x6[0], &x6[1]); 2838 butterfly_s32_s32_x4_0112_neon(cospi16, x5[3], x5[2], &x6[2], &x6[3]); 2839 butterfly_dct_post_s32_x4(x4 + 4, x5 + 4, x6 + 4, 4); 2840 butterfly_s32_s32_x4_0112_neon(cospi16, x5[14], x5[9], &x6[14], &x6[9]); 2841 butterfly_s32_s32_x4_1223_neon(cospi16, x5[13], x5[10], &x6[13], &x6[10]); 2842 butterfly_dct_post_s32_x4(x4 + 16, x5 + 16, x6 + 16, 8); 2843 butterfly_dct_post_s32_x4(x4 + 24, x5 + 24, x6 + 24, 8); 2844 butterfly_s32_s32_x4_0112_neon(cospi8, x5[61], x5[34], &x6[61], &x6[34]); 2845 butterfly_s32_s32_x4_0112_neon(cospi8, x5[60], x5[35], &x6[60], &x6[35]); 2846 butterfly_s32_s32_x4_1223_neon(cospi8, x5[59], x5[36], &x6[59], &x6[36]); 2847 butterfly_s32_s32_x4_1223_neon(cospi8, x5[58], x5[37], &x6[58], &x6[37]); 2848 butterfly_s32_s32_x4_1003_neon(cospi24, x5[53], x5[42], &x6[53], &x6[42]); 2849 butterfly_s32_s32_x4_1003_neon(cospi24, x5[52], x5[43], &x6[52], &x6[43]); 2850 butterfly_s32_s32_x4_0332_neon(cospi24, x5[51], 
x5[44], &x6[51], &x6[44]); 2851 butterfly_s32_s32_x4_0332_neon(cospi24, x5[50], x5[45], &x6[50], &x6[45]); 2852 2853 // stage 7 2854 int32x4_t x7[64]; 2855 butterfly_s32_s32_x4_0112_neon(cospi8, x6[7], x6[4], &x7[4], &x7[7]); 2856 butterfly_s32_s32_x4_1003_neon(cospi24, x6[6], x6[5], &x7[5], &x7[6]); 2857 butterfly_dct_post_s32_x4(x5 + 8, x6 + 8, x7 + 8, 4); 2858 butterfly_dct_post_s32_x4(x5 + 12, x6 + 12, x7 + 12, 4); 2859 butterfly_s32_s32_x4_0112_neon(cospi8, x6[30], x6[17], &x7[30], &x7[17]); 2860 butterfly_s32_s32_x4_1223_neon(cospi8, x6[29], x6[18], &x7[29], &x7[18]); 2861 butterfly_s32_s32_x4_1003_neon(cospi24, x6[26], x6[21], &x7[26], &x7[21]); 2862 butterfly_s32_s32_x4_0332_neon(cospi24, x6[25], x6[22], &x7[25], &x7[22]); 2863 butterfly_dct_post_s32_x4(x5 + 32, x6 + 32, x7 + 32, 8); 2864 butterfly_dct_post_s32_x4(x5 + 40, x6 + 40, x7 + 40, 8); 2865 butterfly_dct_post_s32_x4(x5 + 48, x6 + 48, x7 + 48, 8); 2866 butterfly_dct_post_s32_x4(x5 + 56, x6 + 56, x7 + 56, 8); 2867 2868 // stage 8 2869 int32x4_t x8[64]; 2870 butterfly_s32_s32_x4_0112_neon(cospi4, x7[15], x7[8], &x8[8], &x8[15]); 2871 butterfly_s32_s32_x4_1003_neon(cospi28, x7[14], x7[9], &x8[9], &x8[14]); 2872 butterfly_s32_s32_x4_0112_neon(cospi20, x7[13], x7[10], &x8[10], &x8[13]); 2873 butterfly_s32_s32_x4_1003_neon(cospi12, x7[12], x7[11], &x8[11], &x8[12]); 2874 butterfly_dct_post_s32_x4(x6 + 16, x7 + 16, x8 + 16, 4); 2875 butterfly_dct_post_s32_x4(x6 + 20, x7 + 20, x8 + 20, 4); 2876 butterfly_dct_post_s32_x4(x6 + 24, x7 + 24, x8 + 24, 4); 2877 butterfly_dct_post_s32_x4(x6 + 28, x7 + 28, x8 + 28, 4); 2878 butterfly_s32_s32_x4_0112_neon(cospi4, x7[62], x7[33], &x8[62], &x8[33]); 2879 butterfly_s32_s32_x4_1223_neon(cospi4, x7[61], x7[34], &x8[61], &x8[34]); 2880 butterfly_s32_s32_x4_1003_neon(cospi28, x7[58], x7[37], &x8[58], &x8[37]); 2881 butterfly_s32_s32_x4_0332_neon(cospi28, x7[57], x7[38], &x8[57], &x8[38]); 2882 butterfly_s32_s32_x4_0112_neon(cospi20, x7[54], x7[41], &x8[54], &x8[41]); 2883 
butterfly_s32_s32_x4_1223_neon(cospi20, x7[53], x7[42], &x8[53], &x8[42]); 2884 butterfly_s32_s32_x4_1003_neon(cospi12, x7[50], x7[45], &x8[50], &x8[45]); 2885 butterfly_s32_s32_x4_0332_neon(cospi12, x7[49], x7[46], &x8[49], &x8[46]); 2886 2887 // stage 9 2888 int32x4_t x9[64]; 2889 butterfly_s32_s32_x4_0112_neon(cospi2, x8[31], x8[16], &x9[16], &x9[31]); 2890 butterfly_s32_s32_x4_1003_neon(cospi30, x8[30], x8[17], &x9[17], &x9[30]); 2891 butterfly_s32_s32_x4_0112_neon(cospi18, x8[29], x8[18], &x9[18], &x9[29]); 2892 butterfly_s32_s32_x4_1003_neon(cospi14, x8[28], x8[19], &x9[19], &x9[28]); 2893 butterfly_s32_s32_x4_0112_neon(cospi10, x8[27], x8[20], &x9[20], &x9[27]); 2894 butterfly_s32_s32_x4_1003_neon(cospi22, x8[26], x8[21], &x9[21], &x9[26]); 2895 butterfly_s32_s32_x4_0112_neon(cospi26, x8[25], x8[22], &x9[22], &x9[25]); 2896 butterfly_s32_s32_x4_1003_neon(cospi6, x8[24], x8[23], &x9[23], &x9[24]); 2897 butterfly_dct_post_s32_x4(x7 + 32, x8 + 32, x9 + 32, 4); 2898 butterfly_dct_post_s32_x4(x7 + 36, x8 + 36, x9 + 36, 4); 2899 butterfly_dct_post_s32_x4(x7 + 40, x8 + 40, x9 + 40, 4); 2900 butterfly_dct_post_s32_x4(x7 + 44, x8 + 44, x9 + 44, 4); 2901 butterfly_dct_post_s32_x4(x7 + 48, x8 + 48, x9 + 48, 4); 2902 butterfly_dct_post_s32_x4(x7 + 52, x8 + 52, x9 + 52, 4); 2903 butterfly_dct_post_s32_x4(x7 + 56, x8 + 56, x9 + 56, 4); 2904 butterfly_dct_post_s32_x4(x7 + 60, x8 + 60, x9 + 60, 4); 2905 2906 // stage 10 2907 int32x4_t x10[64]; 2908 butterfly_s32_s32_x4_0112_neon(cospi1, x9[63], x9[32], &x10[32], &x10[63]); 2909 butterfly_s32_s32_x4_1003_neon(cospi31, x9[62], x9[33], &x10[33], &x10[62]); 2910 butterfly_s32_s32_x4_0112_neon(cospi17, x9[61], x9[34], &x10[34], &x10[61]); 2911 butterfly_s32_s32_x4_1003_neon(cospi15, x9[60], x9[35], &x10[35], &x10[60]); 2912 butterfly_s32_s32_x4_0112_neon(cospi9, x9[59], x9[36], &x10[36], &x10[59]); 2913 butterfly_s32_s32_x4_1003_neon(cospi23, x9[58], x9[37], &x10[37], &x10[58]); 2914 butterfly_s32_s32_x4_0112_neon(cospi25, 
x9[57], x9[38], &x10[38], &x10[57]); 2915 butterfly_s32_s32_x4_1003_neon(cospi7, x9[56], x9[39], &x10[39], &x10[56]); 2916 butterfly_s32_s32_x4_0112_neon(cospi5, x9[55], x9[40], &x10[40], &x10[55]); 2917 butterfly_s32_s32_x4_1003_neon(cospi27, x9[54], x9[41], &x10[41], &x10[54]); 2918 butterfly_s32_s32_x4_0112_neon(cospi21, x9[53], x9[42], &x10[42], &x10[53]); 2919 butterfly_s32_s32_x4_1003_neon(cospi11, x9[52], x9[43], &x10[43], &x10[52]); 2920 butterfly_s32_s32_x4_0112_neon(cospi13, x9[51], x9[44], &x10[44], &x10[51]); 2921 butterfly_s32_s32_x4_1003_neon(cospi19, x9[50], x9[45], &x10[45], &x10[50]); 2922 butterfly_s32_s32_x4_0112_neon(cospi29, x9[49], x9[46], &x10[46], &x10[49]); 2923 butterfly_s32_s32_x4_1003_neon(cospi3, x9[48], x9[47], &x10[47], &x10[48]); 2924 2925 // stage 11, only store into the low 32 output indices. 2926 output[0] = x6[0]; 2927 output[1] = x10[32]; 2928 output[2] = x9[16]; 2929 output[3] = x10[48]; 2930 output[4] = x8[8]; 2931 output[5] = x10[40]; 2932 output[6] = x9[24]; 2933 output[7] = x10[56]; 2934 output[8] = x7[4]; 2935 output[9] = x10[36]; 2936 output[10] = x9[20]; 2937 output[11] = x10[52]; 2938 output[12] = x8[12]; 2939 output[13] = x10[44]; 2940 output[14] = x9[28]; 2941 output[15] = x10[60]; 2942 output[16] = x6[2]; 2943 output[17] = x10[34]; 2944 output[18] = x9[18]; 2945 output[19] = x10[50]; 2946 output[20] = x8[10]; 2947 output[21] = x10[42]; 2948 output[22] = x9[26]; 2949 output[23] = x10[58]; 2950 output[24] = x7[6]; 2951 output[25] = x10[38]; 2952 output[26] = x9[22]; 2953 output[27] = x10[54]; 2954 output[28] = x8[14]; 2955 output[29] = x10[46]; 2956 output[30] = x9[30]; 2957 output[31] = x10[62]; 2958 } 2959 2960 static void lowbd_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output, 2961 int stride, TX_TYPE tx_type, int bd) { 2962 (void)bd; 2963 (void)tx_type; 2964 assert(tx_type == DCT_DCT); 2965 int16x8_t buf0[64], buf1[512]; 2966 const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; 2967 2968 for (int i = 
0; i < 8; i++) { 2969 load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); 2970 col_txfm(buf0, buf0, 13); 2971 shift_right_2_round_s16_x8(buf0, buf0, 64); 2972 for (int j = 0; j < 4; ++j) { 2973 transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); 2974 } 2975 } 2976 for (int i = 0; i < 4; i++) { 2977 int32x4_t bufA[64]; 2978 int32x4_t bufB[64]; 2979 int16x8_t *buf = buf1 + 64 * i; 2980 for (int j = 0; j < 64; ++j) { 2981 bufA[j] = vmovl_s16(vget_low_s16(buf[j])); 2982 bufB[j] = vmovl_s16(vget_high_s16(buf[j])); 2983 } 2984 fdct64_neon(bufA, bufA, 10); 2985 fdct64_neon(bufB, bufB, 10); 2986 shift_right_2_round_s32_x4(bufA, bufA, 32); 2987 shift_right_2_round_s32_x4(bufB, bufB, 32); 2988 store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); 2989 } 2990 } 2991 2992 static void lowbd_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *output, 2993 int stride, TX_TYPE tx_type, int bd) { 2994 (void)bd; 2995 int16x8_t buf0[64], buf1[256]; 2996 const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type]; 2997 2998 for (int i = 0; i < 8; i++) { 2999 col_txfm(input + 8 * i, buf0, stride, 12); 3000 shift_right_4_round_s16_x8(buf0, buf0, 32); 3001 for (int j = 0; j < 4; ++j) { 3002 transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i); 3003 } 3004 } 3005 assert(tx_type == DCT_DCT); 3006 for (int i = 0; i < 4; i++) { 3007 int32x4_t bufA[64]; 3008 int32x4_t bufB[64]; 3009 int16x8_t *buf = buf1 + 64 * i; 3010 for (int j = 0; j < 64; ++j) { 3011 bufA[j] = vmovl_s16(vget_low_s16(buf[j])); 3012 bufB[j] = vmovl_s16(vget_high_s16(buf[j])); 3013 } 3014 fdct64_neon(bufA, bufA, 11); 3015 fdct64_neon(bufB, bufB, 11); 3016 shift_right_2_round_s32_x4(bufA, bufA, 32); 3017 shift_right_2_round_s32_x4(bufB, bufB, 32); 3018 round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); 3019 round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32); 3020 store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); 3021 } 3022 } 3023 3024 static void 
lowbd_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *output, 3025 int stride, TX_TYPE tx_type, int bd) { 3026 (void)bd; 3027 (void)tx_type; 3028 assert(tx_type == DCT_DCT); 3029 int16x8_t buf0[64], buf1[256]; 3030 const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon; 3031 3032 for (int i = 0; i < 4; i++) { 3033 load_buffer_s16_x8(input + 8 * i, stride, buf0, 64); 3034 col_txfm(buf0, buf0, 13); 3035 shift_right_2_round_s16_x8(buf0, buf0, 64); 3036 for (int j = 0; j < 4; ++j) { 3037 transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 32 + 8 * i); 3038 } 3039 } 3040 3041 for (int i = 0; i < 4; i++) { 3042 int32x4_t bufA[32]; 3043 int32x4_t bufB[32]; 3044 int16x8_t *buf = buf1 + 32 * i; 3045 for (int j = 0; j < 32; ++j) { 3046 bufA[j] = vmovl_s16(vget_low_s16(buf[j])); 3047 bufB[j] = vmovl_s16(vget_high_s16(buf[j])); 3048 } 3049 fdct32_neon(bufA, bufA, 11); 3050 fdct32_neon(bufB, bufB, 11); 3051 shift_right_2_round_s32_x4(bufA, bufA, 32); 3052 shift_right_2_round_s32_x4(bufB, bufB, 32); 3053 round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); 3054 round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32); 3055 store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); 3056 } 3057 } 3058 3059 static FwdTxfm2dFunc lowbd_fwd_txfm_func_ls[TX_SIZES_ALL] = { 3060 lowbd_fwd_txfm2d_4x4_neon, // 4x4 transform 3061 lowbd_fwd_txfm2d_8x8_neon, // 8x8 transform 3062 lowbd_fwd_txfm2d_16x16_neon, // 16x16 transform 3063 lowbd_fwd_txfm2d_32x32_neon, // 32x32 transform 3064 lowbd_fwd_txfm2d_64x64_neon, // 64x64 transform 3065 lowbd_fwd_txfm2d_4x8_neon, // 4x8 transform 3066 lowbd_fwd_txfm2d_8x4_neon, // 8x4 transform 3067 lowbd_fwd_txfm2d_8x16_neon, // 8x16 transform 3068 lowbd_fwd_txfm2d_16x8_neon, // 16x8 transform 3069 lowbd_fwd_txfm2d_16x32_neon, // 16x32 transform 3070 lowbd_fwd_txfm2d_32x16_neon, // 32x16 transform 3071 lowbd_fwd_txfm2d_32x64_neon, // 32x64 transform 3072 lowbd_fwd_txfm2d_64x32_neon, // 64x32 transform 3073 lowbd_fwd_txfm2d_4x16_neon, // 4x16 transform 
3074 lowbd_fwd_txfm2d_16x4_neon, // 16x4 transform 3075 lowbd_fwd_txfm2d_8x32_neon, // 8x32 transform 3076 lowbd_fwd_txfm2d_32x8_neon, // 32x8 transform 3077 lowbd_fwd_txfm2d_16x64_neon, // 16x64 transform 3078 lowbd_fwd_txfm2d_64x16_neon, // 64x16 transform 3079 }; 3080 3081 void av1_lowbd_fwd_txfm_neon(const int16_t *src_diff, tran_low_t *coeff, 3082 int diff_stride, TxfmParam *txfm_param) { 3083 FwdTxfm2dFunc fwd_txfm2d_func = lowbd_fwd_txfm_func_ls[txfm_param->tx_size]; 3084 if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { 3085 av1_lowbd_fwd_txfm_c(src_diff, coeff, diff_stride, txfm_param); 3086 } else { 3087 fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, 3088 txfm_param->bd); 3089 } 3090 }