highbd_fwd_txfm_neon.c (98194B)
1 /* 2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 #include <assert.h> 14 15 #include "aom_dsp/arm/transpose_neon.h" 16 #include "aom_dsp/txfm_common.h" 17 #include "aom_ports/mem.h" 18 #include "av1/common/av1_txfm.h" 19 #include "av1/encoder/av1_fwd_txfm1d_cfg.h" 20 #include "config/aom_config.h" 21 #include "config/av1_rtcd.h" 22 #include "shift_neon.h" 23 #include "txfm_neon.h" 24 25 static AOM_FORCE_INLINE void transpose_arrays_s32_64x64(const int32x4_t *in, 26 int32x4_t *out) { 27 // This is not quite the same as the other transposes defined in 28 // transpose_neon.h: We only write the low 64x32 sub-matrix since the rest is 29 // unused by the following row transform. 30 for (int j = 0; j < 8; ++j) { 31 for (int i = 0; i < 16; ++i) { 32 transpose_arrays_s32_4x4(in + 64 * i + 4 * j, out + 64 * j + 4 * i); 33 } 34 } 35 } 36 37 // A note on butterfly helper naming: 38 // 39 // butterfly_[weight_indices]_neon 40 // e.g. butterfly_0312_neon 41 // ^ Weights are applied as indices 0, 3, 2, 1 42 // (see more detail below) 43 // 44 // Weight indices are treated as an index into the 4-tuple of the weight 45 // itself, plus related and negated constants: w=(w0, 1-w0, -w0, w0-1). 
46 // This is then represented in the helper naming by referring to the lane index 47 // in the loaded tuple that each multiply is performed with: 48 // 49 // in0 in1 50 // /------------ 51 // out0 | w[0] w[1] ==> out0 = in0 * w[0] + in1 * w[1] 52 // out1 | w[2] w[3] ==> out1 = in0 * w[2] + in1 * w[3] 53 // 54 // So for indices 0321 from the earlier example, we end up with: 55 // 56 // in0 in1 57 // /------------------ 58 // out0 | (lane 0) (lane 3) ==> out0 = in0 * w0 + in1 * (w0-1) 59 // out1 | (lane 2) (lane 1) ==> out1 = in0 * -w0 + in1 * (1-w0) 60 61 #define butterfly_half_neon(wvec, lane0, lane1, in0, in1, out, v_bit) \ 62 do { \ 63 int32x2x2_t wvecs = { { wvec, vneg_s32(wvec) } }; \ 64 int32x4_t x = vmulq_lane_s32(in0, wvecs.val[lane0 / 2], lane0 % 2); \ 65 x = vmlaq_lane_s32(x, in1, wvecs.val[lane1 / 2], lane1 % 2); \ 66 *out = vrshlq_s32(x, v_bit); \ 67 } while (false) 68 69 static AOM_FORCE_INLINE void butterfly_0112_neon( 70 const int32_t *cospi, const int widx0, const int32x4_t n0, 71 const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, 72 const int32x4_t v_bit) { 73 int32x2_t w01 = vld1_s32(cospi + 2 * widx0); 74 butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit); 75 butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit); 76 } 77 78 static AOM_FORCE_INLINE void butterfly_2312_neon( 79 const int32_t *cospi, const int widx0, const int32x4_t n0, 80 const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, 81 const int32x4_t v_bit) { 82 int32x2_t w01 = vld1_s32(cospi + 2 * widx0); 83 butterfly_half_neon(w01, 2, 3, n0, n1, out0, v_bit); 84 butterfly_half_neon(w01, 1, 2, n0, n1, out1, v_bit); 85 } 86 87 static AOM_FORCE_INLINE void butterfly_0332_neon( 88 const int32_t *cospi, const int widx0, const int32x4_t n0, 89 const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, 90 const int32x4_t v_bit) { 91 int32x2_t w01 = vld1_s32(cospi + 2 * widx0); 92 butterfly_half_neon(w01, 0, 3, n0, n1, out0, v_bit); 93 butterfly_half_neon(w01, 3, 2, n0, n1, out1, v_bit); 94 } 95 
96 static AOM_FORCE_INLINE void butterfly_0130_neon( 97 const int32_t *cospi, const int widx0, const int32x4_t n0, 98 const int32x4_t n1, int32x4_t *out0, int32x4_t *out1, 99 const int32x4_t v_bit) { 100 int32x2_t w01 = vld1_s32(cospi + 2 * widx0); 101 butterfly_half_neon(w01, 0, 1, n0, n1, out0, v_bit); 102 butterfly_half_neon(w01, 3, 0, n0, n1, out1, v_bit); 103 } 104 105 static AOM_FORCE_INLINE void butterfly_cospi32_0002_neon( 106 const int32_t *cospi, const int32x4_t n0, const int32x4_t n1, 107 int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { 108 int32x2_t w01 = vld1_s32(cospi + 2 * 32); 109 butterfly_half_neon(w01, 0, 0, n0, n1, out0, v_bit); 110 butterfly_half_neon(w01, 0, 2, n0, n1, out1, v_bit); 111 } 112 113 static AOM_FORCE_INLINE void butterfly_cospi32_0222_neon( 114 const int32_t *cospi, const int32x4_t n0, const int32x4_t n1, 115 int32x4_t *out0, int32x4_t *out1, const int32x4_t v_bit) { 116 int32x2_t w01 = vld1_s32(cospi + 2 * 32); 117 butterfly_half_neon(w01, 0, 2, n0, n1, out0, v_bit); 118 butterfly_half_neon(w01, 2, 2, n0, n1, out1, v_bit); 119 } 120 121 static AOM_FORCE_INLINE void round_rect_array_s32_neon(const int32x4_t *input, 122 int32x4_t *output, 123 const int size) { 124 const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2); 125 int i = 0; 126 do { 127 const int32x4_t r1 = vmulq_s32(input[i], sqrt2); 128 output[i] = vrshrq_n_s32(r1, NewSqrt2Bits); 129 } while (++i < size); 130 } 131 132 static AOM_FORCE_INLINE void round_shift2_rect_array_s32_neon( 133 const int32x4_t *input, int32x4_t *output, const int size) { 134 const int32x4_t sqrt2 = vdupq_n_s32(NewSqrt2); 135 int i = 0; 136 do { 137 const int32x4_t r0 = vrshrq_n_s32(input[i], 2); 138 const int32x4_t r1 = vmulq_s32(r0, sqrt2); 139 output[i] = vrshrq_n_s32(r1, NewSqrt2Bits); 140 } while (++i < size); 141 } 142 143 #define LOAD_BUFFER_4XH(h) \ 144 static AOM_FORCE_INLINE void load_buffer_4x##h( \ 145 const int16_t *input, int32x4_t *in, int stride, int fliplr) { \ 146 if (fliplr) { 
\ 147 for (int i = 0; i < (h); ++i) { \ 148 int16x4_t a = vld1_s16(input + i * stride); \ 149 a = vrev64_s16(a); \ 150 in[i] = vshll_n_s16(a, 2); \ 151 } \ 152 } else { \ 153 for (int i = 0; i < (h); ++i) { \ 154 int16x4_t a = vld1_s16(input + i * stride); \ 155 in[i] = vshll_n_s16(a, 2); \ 156 } \ 157 } \ 158 } 159 160 // AArch32 does not permit the argument to vshll_n_s16 to be zero, so need to 161 // avoid the expression even though the compiler can prove that the code path 162 // is never taken if `shift == 0`. 163 #define shift_left_long_s16(a, shift) \ 164 ((shift) == 0 ? vmovl_s16(a) : vshll_n_s16((a), (shift) == 0 ? 1 : (shift))) 165 166 #define LOAD_BUFFER_WXH(w, h, shift) \ 167 static AOM_FORCE_INLINE void load_buffer_##w##x##h( \ 168 const int16_t *input, int32x4_t *in, int stride, int fliplr) { \ 169 assert(w >= 8); \ 170 if (fliplr) { \ 171 for (int i = 0; i < (h); ++i) { \ 172 for (int j = 0; j < (w) / 8; ++j) { \ 173 int16x8_t a = vld1q_s16(input + i * stride + j * 8); \ 174 a = vrev64q_s16(a); \ 175 int j2 = (w) / 8 - j - 1; \ 176 in[i + (h) * (2 * j2 + 0)] = \ 177 shift_left_long_s16(vget_high_s16(a), (shift)); \ 178 in[i + (h) * (2 * j2 + 1)] = \ 179 shift_left_long_s16(vget_low_s16(a), (shift)); \ 180 } \ 181 } \ 182 } else { \ 183 for (int i = 0; i < (h); ++i) { \ 184 for (int j = 0; j < (w) / 8; ++j) { \ 185 int16x8_t a = vld1q_s16(input + i * stride + j * 8); \ 186 in[i + (h) * (2 * j + 0)] = \ 187 shift_left_long_s16(vget_low_s16(a), (shift)); \ 188 in[i + (h) * (2 * j + 1)] = \ 189 shift_left_long_s16(vget_high_s16(a), (shift)); \ 190 } \ 191 } \ 192 } \ 193 } 194 195 LOAD_BUFFER_4XH(4) 196 LOAD_BUFFER_4XH(8) 197 LOAD_BUFFER_4XH(16) 198 LOAD_BUFFER_4XH(32) 199 LOAD_BUFFER_WXH(8, 8, 2) 200 LOAD_BUFFER_WXH(16, 16, 2) 201 LOAD_BUFFER_WXH(32, 64, 0) 202 LOAD_BUFFER_WXH(64, 32, 2) 203 LOAD_BUFFER_WXH(64, 64, 0) 204 205 #if !CONFIG_REALTIME_ONLY 206 LOAD_BUFFER_WXH(16, 64, 0) 207 LOAD_BUFFER_WXH(64, 16, 2) 208 #endif // !CONFIG_REALTIME_ONLY 209 
210 #define STORE_BUFFER_WXH(w, h) \ 211 static AOM_FORCE_INLINE void store_buffer_##w##x##h( \ 212 const int32x4_t *in, int32_t *out, int stride) { \ 213 for (int i = 0; i < (w); ++i) { \ 214 for (int j = 0; j < (h) / 4; ++j) { \ 215 vst1q_s32(&out[i * stride + j * 4], in[i + j * (w)]); \ 216 } \ 217 } \ 218 } 219 220 STORE_BUFFER_WXH(4, 4) 221 STORE_BUFFER_WXH(8, 4) 222 STORE_BUFFER_WXH(8, 8) 223 STORE_BUFFER_WXH(16, 4) 224 STORE_BUFFER_WXH(16, 16) 225 STORE_BUFFER_WXH(32, 4) 226 STORE_BUFFER_WXH(32, 32) 227 STORE_BUFFER_WXH(64, 32) 228 229 #if !CONFIG_REALTIME_ONLY 230 STORE_BUFFER_WXH(16, 32) 231 STORE_BUFFER_WXH(64, 16) 232 #endif // !CONFIG_REALTIME_ONLY 233 234 static AOM_FORCE_INLINE void highbd_fdct4_x4_neon(const int32x4_t *in, 235 int32x4_t *out, int bit) { 236 const int32_t *const cospi = cospi_arr_s32(bit); 237 const int32x4_t cospi32 = vdupq_n_s32(cospi[2 * 32]); 238 const int32x2_t cospi16_48 = vld1_s32(&cospi[2 * 16]); 239 240 const int32x4_t a0 = vaddq_s32(in[0], in[3]); 241 const int32x4_t a1 = vsubq_s32(in[0], in[3]); 242 const int32x4_t a2 = vaddq_s32(in[1], in[2]); 243 const int32x4_t a3 = vsubq_s32(in[1], in[2]); 244 245 const int32x4_t b0 = vmulq_s32(a0, cospi32); 246 const int32x4_t b1 = vmulq_lane_s32(a1, cospi16_48, 1); 247 const int32x4_t b2 = vmulq_s32(a2, cospi32); 248 const int32x4_t b3 = vmulq_lane_s32(a3, cospi16_48, 1); 249 250 const int32x4_t c0 = vaddq_s32(b0, b2); 251 const int32x4_t c1 = vsubq_s32(b0, b2); 252 const int32x4_t c2 = vmlaq_lane_s32(b3, a1, cospi16_48, 0); 253 const int32x4_t c3 = vmlsq_lane_s32(b1, a3, cospi16_48, 0); 254 255 const int32x4_t v_bit = vdupq_n_s32(-bit); 256 const int32x4_t d0 = vrshlq_s32(c0, v_bit); 257 const int32x4_t d1 = vrshlq_s32(c1, v_bit); 258 const int32x4_t d2 = vrshlq_s32(c2, v_bit); 259 const int32x4_t d3 = vrshlq_s32(c3, v_bit); 260 261 out[0] = d0; 262 out[1] = d2; 263 out[2] = d1; 264 out[3] = d3; 265 } 266 267 static AOM_FORCE_INLINE void highbd_fadst4_x4_neon(const int32x4_t *in, 268 
int32x4_t *out, int bit) { 269 const int32x4_t sinpi = vld1q_s32(sinpi_arr(bit) + 1); 270 271 const int32x4_t a0 = vaddq_s32(in[0], in[1]); 272 const int32x4_t a1 = vmulq_lane_s32(in[0], vget_low_s32(sinpi), 0); 273 const int32x4_t a2 = vmulq_lane_s32(in[0], vget_high_s32(sinpi), 1); 274 const int32x4_t a3 = vmulq_lane_s32(in[2], vget_high_s32(sinpi), 0); 275 276 const int32x4_t b0 = vmlaq_lane_s32(a1, in[1], vget_low_s32(sinpi), 1); 277 const int32x4_t b1 = vmlsq_lane_s32(a2, in[1], vget_low_s32(sinpi), 0); 278 const int32x4_t b2 = vsubq_s32(a0, in[3]); 279 280 const int32x4_t c0 = vmlaq_lane_s32(b0, in[3], vget_high_s32(sinpi), 1); 281 const int32x4_t c1 = vmlaq_lane_s32(b1, in[3], vget_low_s32(sinpi), 1); 282 const int32x4_t c2 = vmulq_lane_s32(b2, vget_high_s32(sinpi), 0); 283 284 const int32x4_t d0 = vaddq_s32(c0, a3); 285 const int32x4_t d1 = vsubq_s32(c1, a3); 286 const int32x4_t d2 = vsubq_s32(c1, c0); 287 288 const int32x4_t e0 = vaddq_s32(d2, a3); 289 290 const int32x4_t v_bit = vdupq_n_s32(-bit); 291 out[0] = vrshlq_s32(d0, v_bit); 292 out[1] = vrshlq_s32(c2, v_bit); 293 out[2] = vrshlq_s32(d1, v_bit); 294 out[3] = vrshlq_s32(e0, v_bit); 295 } 296 297 static AOM_FORCE_INLINE void highbd_fidentity4_x4_neon(const int32x4_t *in, 298 int32x4_t *out, 299 int bit) { 300 (void)bit; 301 int32x4_t fact = vdupq_n_s32(NewSqrt2); 302 303 for (int i = 0; i < 4; i++) { 304 const int32x4_t a_low = vmulq_s32(in[i], fact); 305 out[i] = vrshrq_n_s32(a_low, NewSqrt2Bits); 306 } 307 } 308 309 void av1_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *coeff, 310 int input_stride, TX_TYPE tx_type, int bd) { 311 (void)bd; 312 313 int ud_flip, lr_flip; 314 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 315 ud_adjust_input_and_stride(ud_flip, &input, &input_stride, 4); 316 317 // Workspace for column/row-wise transforms. 
318 int32x4_t buf[4]; 319 320 switch (tx_type) { 321 case DCT_DCT: 322 load_buffer_4x4(input, buf, input_stride, 0); 323 highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 324 transpose_arrays_s32_4x4(buf, buf); 325 highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 326 store_buffer_4x4(buf, coeff, /*stride=*/4); 327 break; 328 case ADST_DCT: 329 load_buffer_4x4(input, buf, input_stride, 0); 330 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 331 transpose_arrays_s32_4x4(buf, buf); 332 highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 333 store_buffer_4x4(buf, coeff, /*stride=*/4); 334 break; 335 case DCT_ADST: 336 load_buffer_4x4(input, buf, input_stride, 0); 337 highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 338 transpose_arrays_s32_4x4(buf, buf); 339 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 340 store_buffer_4x4(buf, coeff, /*stride=*/4); 341 break; 342 case ADST_ADST: 343 load_buffer_4x4(input, buf, input_stride, 0); 344 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 345 transpose_arrays_s32_4x4(buf, buf); 346 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 347 store_buffer_4x4(buf, coeff, /*stride=*/4); 348 break; 349 case FLIPADST_DCT: 350 load_buffer_4x4(input, buf, input_stride, 0); 351 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 352 transpose_arrays_s32_4x4(buf, buf); 353 highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 354 store_buffer_4x4(buf, coeff, /*stride=*/4); 355 break; 356 case DCT_FLIPADST: 357 load_buffer_4x4(input, buf, input_stride, 1); 358 highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 359 transpose_arrays_s32_4x4(buf, buf); 360 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 361 store_buffer_4x4(buf, coeff, /*stride=*/4); 362 break; 363 case FLIPADST_FLIPADST: 364 load_buffer_4x4(input, buf, input_stride, 1); 365 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 366 
transpose_arrays_s32_4x4(buf, buf); 367 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 368 store_buffer_4x4(buf, coeff, /*stride=*/4); 369 break; 370 case ADST_FLIPADST: 371 load_buffer_4x4(input, buf, input_stride, 1); 372 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 373 transpose_arrays_s32_4x4(buf, buf); 374 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 375 store_buffer_4x4(buf, coeff, /*stride=*/4); 376 break; 377 case FLIPADST_ADST: 378 load_buffer_4x4(input, buf, input_stride, 0); 379 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 380 transpose_arrays_s32_4x4(buf, buf); 381 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 382 store_buffer_4x4(buf, coeff, /*stride=*/4); 383 break; 384 case IDTX: 385 load_buffer_4x4(input, buf, input_stride, 0); 386 highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 387 transpose_arrays_s32_4x4(buf, buf); 388 highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 389 store_buffer_4x4(buf, coeff, /*stride=*/4); 390 break; 391 case V_DCT: 392 load_buffer_4x4(input, buf, input_stride, 0); 393 highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 394 transpose_arrays_s32_4x4(buf, buf); 395 highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 396 store_buffer_4x4(buf, coeff, /*stride=*/4); 397 break; 398 case H_DCT: 399 load_buffer_4x4(input, buf, input_stride, 0); 400 highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 401 transpose_arrays_s32_4x4(buf, buf); 402 highbd_fdct4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 403 store_buffer_4x4(buf, coeff, /*stride=*/4); 404 break; 405 case V_ADST: 406 load_buffer_4x4(input, buf, input_stride, 0); 407 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 408 transpose_arrays_s32_4x4(buf, buf); 409 highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 410 store_buffer_4x4(buf, coeff, /*stride=*/4); 411 break; 412 case H_ADST: 413 load_buffer_4x4(input, buf, 
input_stride, 0); 414 highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 415 transpose_arrays_s32_4x4(buf, buf); 416 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_col[0][0]); 417 store_buffer_4x4(buf, coeff, /*stride=*/4); 418 break; 419 case V_FLIPADST: 420 load_buffer_4x4(input, buf, input_stride, 0); 421 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 422 transpose_arrays_s32_4x4(buf, buf); 423 highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 424 store_buffer_4x4(buf, coeff, /*stride=*/4); 425 break; 426 case H_FLIPADST: 427 load_buffer_4x4(input, buf, input_stride, 1); 428 highbd_fidentity4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 429 transpose_arrays_s32_4x4(buf, buf); 430 highbd_fadst4_x4_neon(buf, buf, av1_fwd_cos_bit_row[0][0]); 431 store_buffer_4x4(buf, coeff, /*stride=*/4); 432 break; 433 default: assert(0); 434 } 435 } 436 437 // Butterfly pre-processing: 438 // e.g. n=4: 439 // out[0] = in[0] + in[3] 440 // out[1] = in[1] + in[2] 441 // out[2] = in[1] - in[2] 442 // out[3] = in[0] - in[3] 443 444 static AOM_FORCE_INLINE void butterfly_dct_pre(const int32x4_t *input, 445 int32x4_t *output, int n) { 446 for (int i = 0; i < n / 2; ++i) { 447 output[i] = vaddq_s32(input[i], input[n - i - 1]); 448 } 449 for (int i = 0; i < n / 2; ++i) { 450 output[n / 2 + i] = vsubq_s32(input[n / 2 - i - 1], input[n / 2 + i]); 451 } 452 } 453 454 // Butterfly post-processing: 455 // e.g. 
n=8: 456 // out[0] = in0[0] + in1[3]; 457 // out[1] = in0[1] + in1[2]; 458 // out[2] = in0[1] - in1[2]; 459 // out[3] = in0[0] - in1[3]; 460 // out[4] = in0[7] - in1[4]; 461 // out[5] = in0[6] - in1[5]; 462 // out[6] = in0[6] + in1[5]; 463 // out[7] = in0[7] + in1[4]; 464 465 static AOM_FORCE_INLINE void butterfly_dct_post(const int32x4_t *in0, 466 const int32x4_t *in1, 467 int32x4_t *output, int n) { 468 for (int i = 0; i < n / 4; ++i) { 469 output[i] = vaddq_s32(in0[i], in1[n / 2 - i - 1]); 470 } 471 for (int i = 0; i < n / 4; ++i) { 472 output[n / 4 + i] = vsubq_s32(in0[n / 4 - i - 1], in1[n / 4 + i]); 473 } 474 for (int i = 0; i < n / 4; ++i) { 475 output[n / 2 + i] = vsubq_s32(in0[n - i - 1], in1[n / 2 + i]); 476 } 477 for (int i = 0; i < n / 4; ++i) { 478 output[(3 * n) / 4 + i] = 479 vaddq_s32(in0[(3 * n) / 4 + i], in1[(3 * n) / 4 - i - 1]); 480 } 481 } 482 483 static AOM_FORCE_INLINE void highbd_fdct8_x4_neon(const int32x4_t *in, 484 int32x4_t *out, int bit) { 485 const int32_t *const cospi = cospi_arr_s32(bit); 486 const int32x4_t v_bit = vdupq_n_s32(-bit); 487 488 // stage 1 489 int32x4_t a[8]; 490 butterfly_dct_pre(in, a, 8); 491 492 // stage 2 493 int32x4_t b[8]; 494 butterfly_dct_pre(a, b, 4); 495 butterfly_0130_neon(cospi, 32, a[5], a[6], &b[6], &b[5], v_bit); 496 497 // stage 3 498 int32x4_t c[8]; 499 butterfly_0130_neon(cospi, 32, b[1], b[0], &c[0], &c[1], v_bit); 500 butterfly_0112_neon(cospi, 16, b[3], b[2], &c[2], &c[3], v_bit); 501 butterfly_dct_post(a + 4, b + 4, c + 4, 4); 502 503 // stage 4-5 504 butterfly_0112_neon(cospi, 8, c[7], c[4], &out[1], &out[7], v_bit); 505 butterfly_0130_neon(cospi, 24, c[5], c[6], &out[5], &out[3], v_bit); 506 507 out[0] = c[0]; 508 out[2] = c[2]; 509 out[4] = c[1]; 510 out[6] = c[3]; 511 } 512 513 static AOM_FORCE_INLINE void highbd_fadst8_x4_neon(const int32x4_t *in, 514 int32x4_t *out, int bit) { 515 const int32_t *const cospi = cospi_arr_s32(bit); 516 const int32x4_t v_bit = vdupq_n_s32(-bit); 517 518 
int32x4_t u0, u1, u2, u3, u4, u5, u6, u7; 519 int32x4_t v0, v1, v2, v3, v4, v5, v6, v7; 520 521 // stage 0-1 522 u0 = in[0]; 523 u1 = in[7]; 524 u2 = in[3]; 525 u3 = in[4]; 526 u4 = in[1]; 527 u5 = in[6]; 528 u6 = in[2]; 529 u7 = in[5]; 530 531 // stage 2 532 v0 = u0; 533 v1 = u1; 534 butterfly_cospi32_0222_neon(cospi, u3, u2, &v2, &v3, v_bit); 535 v4 = u4; 536 v5 = u5; 537 butterfly_cospi32_0002_neon(cospi, u6, u7, &v7, &v6, v_bit); 538 539 // stage 3 540 u0 = vaddq_s32(v0, v2); 541 u1 = vsubq_s32(v3, v1); 542 u2 = vsubq_s32(v0, v2); 543 u3 = vaddq_s32(v1, v3); 544 u4 = vsubq_s32(v6, v4); 545 u5 = vaddq_s32(v5, v7); 546 u6 = vaddq_s32(v4, v6); 547 u7 = vsubq_s32(v5, v7); 548 549 // stage 4 550 v0 = u0; 551 v1 = u1; 552 v2 = u2; 553 v3 = u3; 554 555 butterfly_0112_neon(cospi, 16, u4, u5, &v4, &v5, v_bit); 556 butterfly_0112_neon(cospi, 16, u7, u6, &v6, &v7, v_bit); 557 558 // stage 5 559 u0 = vaddq_s32(v0, v4); 560 u1 = vaddq_s32(v1, v5); 561 u2 = vaddq_s32(v2, v6); 562 u3 = vsubq_s32(v7, v3); 563 u4 = vsubq_s32(v0, v4); 564 u5 = vsubq_s32(v1, v5); 565 u6 = vsubq_s32(v2, v6); 566 u7 = vaddq_s32(v3, v7); 567 568 // stage 6 569 butterfly_0112_neon(cospi, 4, u0, u1, &v0, &v1, v_bit); 570 butterfly_0112_neon(cospi, 20, u2, u3, &v2, &v3, v_bit); 571 butterfly_0130_neon(cospi, 28, u5, u4, &v4, &v5, v_bit); 572 butterfly_0112_neon(cospi, 12, u6, u7, &v7, &v6, v_bit); 573 574 // stage 7 575 out[0] = v1; 576 out[1] = v6; 577 out[2] = v3; 578 out[3] = v4; 579 out[4] = v5; 580 out[5] = v2; 581 out[6] = v7; 582 out[7] = v0; 583 } 584 585 static AOM_FORCE_INLINE void highbd_fidentity8_x4_neon(const int32x4_t *in, 586 int32x4_t *out, 587 int bit) { 588 (void)bit; 589 out[0] = vshlq_n_s32(in[0], 1); 590 out[1] = vshlq_n_s32(in[1], 1); 591 out[2] = vshlq_n_s32(in[2], 1); 592 out[3] = vshlq_n_s32(in[3], 1); 593 out[4] = vshlq_n_s32(in[4], 1); 594 out[5] = vshlq_n_s32(in[5], 1); 595 out[6] = vshlq_n_s32(in[6], 1); 596 out[7] = vshlq_n_s32(in[7], 1); 597 } 598 599 static 
AOM_FORCE_INLINE void highbd_fdct8_xn_neon(const int32x4_t *in, 600 int32x4_t *out, int bit, 601 int howmany) { 602 const int stride = 8; 603 int i = 0; 604 do { 605 highbd_fdct8_x4_neon(in + i * stride, out + i * stride, bit); 606 } while (++i < howmany); 607 } 608 609 static AOM_FORCE_INLINE void highbd_fadst8_xn_neon(const int32x4_t *in, 610 int32x4_t *out, int bit, 611 int howmany) { 612 const int stride = 8; 613 int i = 0; 614 do { 615 highbd_fadst8_x4_neon(in + i * stride, out + i * stride, bit); 616 } while (++i < howmany); 617 } 618 619 static AOM_FORCE_INLINE void highbd_fidentity8_xn_neon(const int32x4_t *in, 620 int32x4_t *out, int bit, 621 int howmany) { 622 (void)bit; 623 const int stride = 8; 624 int i = 0; 625 do { 626 highbd_fidentity8_x4_neon(in + i * stride, out + i * stride, bit); 627 } while (++i < howmany); 628 } 629 630 void av1_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *coeff, int stride, 631 TX_TYPE tx_type, int bd) { 632 (void)bd; 633 634 int ud_flip, lr_flip; 635 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 636 ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); 637 638 // Workspaces for column/row-wise transforms. 
639 int32x4_t buf0[16], buf1[16]; 640 641 switch (tx_type) { 642 case DCT_DCT: 643 load_buffer_8x8(input, buf0, stride, 0); 644 highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 645 shift_right_1_round_s32_x4(buf0, buf0, 16); 646 transpose_arrays_s32_8x8(buf0, buf1); 647 highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); 648 store_buffer_8x8(buf1, coeff, /*stride=*/8); 649 break; 650 case ADST_DCT: 651 load_buffer_8x8(input, buf0, stride, 0); 652 highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 653 shift_right_1_round_s32_x4(buf0, buf0, 16); 654 transpose_arrays_s32_8x8(buf0, buf1); 655 highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); 656 store_buffer_8x8(buf1, coeff, /*stride=*/8); 657 break; 658 case DCT_ADST: 659 load_buffer_8x8(input, buf0, stride, 0); 660 highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 661 shift_right_1_round_s32_x4(buf0, buf0, 16); 662 transpose_arrays_s32_8x8(buf0, buf1); 663 highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); 664 store_buffer_8x8(buf1, coeff, /*stride=*/8); 665 break; 666 case ADST_ADST: 667 load_buffer_8x8(input, buf0, stride, 0); 668 highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 669 shift_right_1_round_s32_x4(buf0, buf0, 16); 670 transpose_arrays_s32_8x8(buf0, buf1); 671 highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); 672 store_buffer_8x8(buf1, coeff, /*stride=*/8); 673 break; 674 case FLIPADST_DCT: 675 load_buffer_8x8(input, buf0, stride, 0); 676 highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 677 shift_right_1_round_s32_x4(buf0, buf0, 16); 678 transpose_arrays_s32_8x8(buf0, buf1); 679 highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); 680 store_buffer_8x8(buf1, coeff, /*stride=*/8); 681 break; 682 case DCT_FLIPADST: 683 load_buffer_8x8(input, buf0, stride, 1); 684 highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 685 shift_right_1_round_s32_x4(buf0, buf0, 
16); 686 transpose_arrays_s32_8x8(buf0, buf1); 687 highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); 688 store_buffer_8x8(buf1, coeff, /*stride=*/8); 689 break; 690 case FLIPADST_FLIPADST: 691 load_buffer_8x8(input, buf0, stride, 1); 692 highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 693 shift_right_1_round_s32_x4(buf0, buf0, 16); 694 transpose_arrays_s32_8x8(buf0, buf1); 695 highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); 696 store_buffer_8x8(buf1, coeff, /*stride=*/8); 697 break; 698 case ADST_FLIPADST: 699 load_buffer_8x8(input, buf0, stride, 1); 700 highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 701 shift_right_1_round_s32_x4(buf0, buf0, 16); 702 transpose_arrays_s32_8x8(buf0, buf1); 703 highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); 704 store_buffer_8x8(buf1, coeff, /*stride=*/8); 705 break; 706 case FLIPADST_ADST: 707 load_buffer_8x8(input, buf0, stride, 0); 708 highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 709 shift_right_1_round_s32_x4(buf0, buf0, 16); 710 transpose_arrays_s32_8x8(buf0, buf1); 711 highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[1][1], 2); 712 store_buffer_8x8(buf1, coeff, /*stride=*/8); 713 break; 714 case IDTX: 715 load_buffer_8x8(input, buf0, stride, 0); 716 highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 717 shift_right_1_round_s32_x4(buf0, buf0, 16); 718 transpose_arrays_s32_8x8(buf0, buf1); 719 highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); 720 store_buffer_8x8(buf1, coeff, /*stride=*/8); 721 break; 722 case V_DCT: 723 load_buffer_8x8(input, buf0, stride, 0); 724 highbd_fdct8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 725 shift_right_1_round_s32_x4(buf0, buf0, 16); 726 transpose_arrays_s32_8x8(buf0, buf1); 727 highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); 728 store_buffer_8x8(buf1, coeff, /*stride=*/8); 729 break; 730 case H_DCT: 731 
load_buffer_8x8(input, buf0, stride, 0); 732 highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 733 shift_right_1_round_s32_x4(buf0, buf0, 16); 734 transpose_arrays_s32_8x8(buf0, buf1); 735 highbd_fdct8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); 736 store_buffer_8x8(buf1, coeff, /*stride=*/8); 737 break; 738 case V_ADST: 739 load_buffer_8x8(input, buf0, stride, 0); 740 highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 741 shift_right_1_round_s32_x4(buf0, buf0, 16); 742 transpose_arrays_s32_8x8(buf0, buf1); 743 highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); 744 store_buffer_8x8(buf1, coeff, /*stride=*/8); 745 break; 746 case H_ADST: 747 load_buffer_8x8(input, buf0, stride, 0); 748 highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 749 shift_right_1_round_s32_x4(buf0, buf0, 16); 750 transpose_arrays_s32_8x8(buf0, buf1); 751 highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); 752 store_buffer_8x8(buf1, coeff, /*stride=*/8); 753 break; 754 case V_FLIPADST: 755 load_buffer_8x8(input, buf0, stride, 0); 756 highbd_fadst8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 757 shift_right_1_round_s32_x4(buf0, buf0, 16); 758 transpose_arrays_s32_8x8(buf0, buf1); 759 highbd_fidentity8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); 760 store_buffer_8x8(buf1, coeff, /*stride=*/8); 761 break; 762 case H_FLIPADST: 763 load_buffer_8x8(input, buf0, stride, 1); 764 highbd_fidentity8_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[1][1], 2); 765 shift_right_1_round_s32_x4(buf0, buf0, 16); 766 transpose_arrays_s32_8x8(buf0, buf1); 767 highbd_fadst8_xn_neon(buf1, buf1, av1_fwd_cos_bit_col[1][1], 2); 768 store_buffer_8x8(buf1, coeff, /*stride=*/8); 769 break; 770 default: assert(0); 771 } 772 } 773 774 static void highbd_fdct16_x4_neon(const int32x4_t *in, int32x4_t *out, 775 int bit) { 776 const int32_t *const cospi = cospi_arr_s32(bit); 777 const int32x4_t v_bit = vdupq_n_s32(-bit); 778 779 
int32x4_t u[16], v[16]; 780 781 // stage 1 782 butterfly_dct_pre(in, u, 16); 783 784 // stage 2 785 butterfly_dct_pre(u, v, 8); 786 v[8] = u[8]; 787 v[9] = u[9]; 788 butterfly_cospi32_0002_neon(cospi, u[13], u[10], &v[13], &v[10], v_bit); 789 butterfly_cospi32_0002_neon(cospi, u[12], u[11], &v[12], &v[11], v_bit); 790 v[14] = u[14]; 791 v[15] = u[15]; 792 793 // stage 3 794 butterfly_dct_pre(v, u, 4); 795 u[4] = v[4]; 796 butterfly_cospi32_0002_neon(cospi, v[6], v[5], &u[6], &u[5], v_bit); 797 u[7] = v[7]; 798 butterfly_dct_post(v + 8, v + 8, u + 8, 8); 799 800 // stage 4 801 butterfly_cospi32_0002_neon(cospi, u[0], u[1], &v[0], &v[1], v_bit); 802 butterfly_0112_neon(cospi, 16, u[3], u[2], &v[2], &v[3], v_bit); 803 butterfly_dct_post(u + 4, u + 4, v + 4, 4); 804 v[8] = u[8]; 805 butterfly_0112_neon(cospi, 16, u[14], u[9], &v[14], &v[9], v_bit); 806 butterfly_2312_neon(cospi, 16, u[13], u[10], &v[10], &v[13], v_bit); 807 v[11] = u[11]; 808 v[12] = u[12]; 809 v[15] = u[15]; 810 811 // stage 5 812 u[0] = v[0]; 813 u[1] = v[1]; 814 u[2] = v[2]; 815 u[3] = v[3]; 816 butterfly_0112_neon(cospi, 8, v[7], v[4], &u[4], &u[7], v_bit); 817 butterfly_0130_neon(cospi, 24, v[5], v[6], &u[5], &u[6], v_bit); 818 butterfly_dct_post(v + 8, v + 8, u + 8, 4); 819 butterfly_dct_post(v + 12, v + 12, u + 12, 4); 820 821 // stage 6 822 v[0] = u[0]; 823 v[1] = u[1]; 824 v[2] = u[2]; 825 v[3] = u[3]; 826 v[4] = u[4]; 827 v[5] = u[5]; 828 v[6] = u[6]; 829 v[7] = u[7]; 830 butterfly_0112_neon(cospi, 4, u[15], u[8], &v[8], &v[15], v_bit); 831 butterfly_0130_neon(cospi, 28, u[9], u[14], &v[9], &v[14], v_bit); 832 butterfly_0112_neon(cospi, 20, u[13], u[10], &v[10], &v[13], v_bit); 833 butterfly_0130_neon(cospi, 12, u[11], u[12], &v[11], &v[12], v_bit); 834 835 out[0] = v[0]; 836 out[1] = v[8]; 837 out[2] = v[4]; 838 out[3] = v[12]; 839 out[4] = v[2]; 840 out[5] = v[10]; 841 out[6] = v[6]; 842 out[7] = v[14]; 843 out[8] = v[1]; 844 out[9] = v[9]; 845 out[10] = v[5]; 846 out[11] = v[13]; 847 
out[12] = v[3]; 848 out[13] = v[11]; 849 out[14] = v[7]; 850 out[15] = v[15]; 851 } 852 853 static void highbd_fadst16_x4_neon(const int32x4_t *in, int32x4_t *out, 854 int bit) { 855 const int32_t *const cospi = cospi_arr_s32(bit); 856 const int32x4_t v_bit = vdupq_n_s32(-bit); 857 858 int32x4_t u[16], v[16]; 859 860 // stage 0-1 861 u[0] = in[0]; 862 u[1] = in[15]; 863 u[2] = in[7]; 864 u[3] = in[8]; 865 u[4] = in[3]; 866 u[5] = in[12]; 867 u[6] = in[4]; 868 u[7] = in[11]; 869 u[8] = in[1]; 870 u[9] = in[14]; 871 u[10] = in[6]; 872 u[11] = in[9]; 873 u[12] = in[2]; 874 u[13] = in[13]; 875 u[14] = in[5]; 876 u[15] = in[10]; 877 878 // stage 2 879 v[0] = u[0]; 880 v[1] = u[1]; 881 butterfly_cospi32_0222_neon(cospi, u[3], u[2], &v[2], &v[3], v_bit); 882 v[4] = u[4]; 883 v[5] = u[5]; 884 butterfly_cospi32_0002_neon(cospi, u[6], u[7], &v[7], &v[6], v_bit); 885 v[8] = u[8]; 886 v[9] = u[9]; 887 butterfly_cospi32_0002_neon(cospi, u[10], u[11], &v[11], &v[10], v_bit); 888 v[12] = u[12]; 889 v[13] = u[13]; 890 butterfly_cospi32_0222_neon(cospi, u[15], u[14], &v[14], &v[15], v_bit); 891 892 // stage 3 893 u[0] = vaddq_s32(v[0], v[2]); 894 u[1] = vsubq_s32(v[3], v[1]); 895 u[2] = vsubq_s32(v[0], v[2]); 896 u[3] = vaddq_s32(v[1], v[3]); 897 u[4] = vsubq_s32(v[6], v[4]); 898 u[5] = vaddq_s32(v[5], v[7]); 899 u[6] = vaddq_s32(v[4], v[6]); 900 u[7] = vsubq_s32(v[5], v[7]); 901 u[8] = vsubq_s32(v[10], v[8]); 902 u[9] = vaddq_s32(v[9], v[11]); 903 u[10] = vaddq_s32(v[8], v[10]); 904 u[11] = vsubq_s32(v[9], v[11]); 905 u[12] = vaddq_s32(v[12], v[14]); 906 u[13] = vsubq_s32(v[15], v[13]); 907 u[14] = vsubq_s32(v[12], v[14]); 908 u[15] = vaddq_s32(v[13], v[15]); 909 910 // stage 4 911 v[0] = u[0]; 912 v[1] = u[1]; 913 v[2] = u[2]; 914 v[3] = u[3]; 915 butterfly_0112_neon(cospi, 16, u[4], u[5], &v[4], &v[5], v_bit); 916 butterfly_0112_neon(cospi, 16, u[7], u[6], &v[6], &v[7], v_bit); 917 918 v[8] = u[8]; 919 v[9] = u[9]; 920 v[10] = u[10]; 921 v[11] = u[11]; 922 923 
butterfly_0112_neon(cospi, 16, u[12], u[13], &v[12], &v[13], v_bit); 924 butterfly_0332_neon(cospi, 16, u[14], u[15], &v[15], &v[14], v_bit); 925 926 // stage 5 927 u[0] = vaddq_s32(v[0], v[4]); 928 u[1] = vaddq_s32(v[1], v[5]); 929 u[2] = vaddq_s32(v[2], v[6]); 930 u[3] = vsubq_s32(v[7], v[3]); 931 u[4] = vsubq_s32(v[0], v[4]); 932 u[5] = vsubq_s32(v[1], v[5]); 933 u[6] = vsubq_s32(v[2], v[6]); 934 u[7] = vaddq_s32(v[3], v[7]); 935 u[8] = vaddq_s32(v[8], v[12]); 936 u[9] = vaddq_s32(v[9], v[13]); 937 u[10] = vsubq_s32(v[14], v[10]); 938 u[11] = vaddq_s32(v[11], v[15]); 939 u[12] = vsubq_s32(v[8], v[12]); 940 u[13] = vsubq_s32(v[9], v[13]); 941 u[14] = vaddq_s32(v[10], v[14]); 942 u[15] = vsubq_s32(v[11], v[15]); 943 944 // stage 6 945 v[0] = u[0]; 946 v[1] = u[1]; 947 v[2] = u[2]; 948 v[3] = u[3]; 949 v[4] = u[4]; 950 v[5] = u[5]; 951 v[6] = u[6]; 952 v[7] = u[7]; 953 954 butterfly_0112_neon(cospi, 8, u[8], u[9], &v[8], &v[9], v_bit); 955 butterfly_0130_neon(cospi, 8, u[12], u[13], &v[13], &v[12], v_bit); 956 butterfly_0130_neon(cospi, 24, u[11], u[10], &v[10], &v[11], v_bit); 957 butterfly_0130_neon(cospi, 24, u[14], u[15], &v[14], &v[15], v_bit); 958 959 // stage 7 960 u[0] = vaddq_s32(v[0], v[8]); 961 u[1] = vaddq_s32(v[1], v[9]); 962 u[2] = vaddq_s32(v[2], v[10]); 963 u[3] = vaddq_s32(v[3], v[11]); 964 u[4] = vaddq_s32(v[4], v[12]); 965 u[5] = vaddq_s32(v[5], v[13]); 966 u[6] = vaddq_s32(v[6], v[14]); 967 u[7] = vsubq_s32(v[15], v[7]); 968 u[8] = vsubq_s32(v[0], v[8]); 969 u[9] = vsubq_s32(v[1], v[9]); 970 u[10] = vsubq_s32(v[2], v[10]); 971 u[11] = vsubq_s32(v[3], v[11]); 972 u[12] = vsubq_s32(v[4], v[12]); 973 u[13] = vsubq_s32(v[5], v[13]); 974 u[14] = vsubq_s32(v[6], v[14]); 975 u[15] = vaddq_s32(v[7], v[15]); 976 977 // stage 8 978 butterfly_0112_neon(cospi, 2, u[0], u[1], &v[0], &v[1], v_bit); 979 butterfly_0112_neon(cospi, 10, u[2], u[3], &v[2], &v[3], v_bit); 980 butterfly_0112_neon(cospi, 18, u[4], u[5], &v[4], &v[5], v_bit); 981 
butterfly_0112_neon(cospi, 26, u[6], u[7], &v[6], &v[7], v_bit); 982 butterfly_0130_neon(cospi, 30, u[9], u[8], &v[8], &v[9], v_bit); 983 butterfly_0130_neon(cospi, 22, u[11], u[10], &v[10], &v[11], v_bit); 984 butterfly_0130_neon(cospi, 14, u[13], u[12], &v[12], &v[13], v_bit); 985 butterfly_0112_neon(cospi, 6, u[14], u[15], &v[15], &v[14], v_bit); 986 987 // stage 9 988 out[0] = v[1]; 989 out[1] = v[14]; 990 out[2] = v[3]; 991 out[3] = v[12]; 992 out[4] = v[5]; 993 out[5] = v[10]; 994 out[6] = v[7]; 995 out[7] = v[8]; 996 out[8] = v[9]; 997 out[9] = v[6]; 998 out[10] = v[11]; 999 out[11] = v[4]; 1000 out[12] = v[13]; 1001 out[13] = v[2]; 1002 out[14] = v[15]; 1003 out[15] = v[0]; 1004 } 1005 1006 static void highbd_fidentity16_x4_neon(const int32x4_t *in, int32x4_t *out, 1007 int bit) { 1008 (void)bit; 1009 const int32x4_t fact = vdupq_n_s32(2 * NewSqrt2); 1010 const int32x4_t offset = vdupq_n_s32(1 << (NewSqrt2Bits - 1)); 1011 1012 for (int i = 0; i < 16; i++) { 1013 int32x4_t a = vmulq_s32(in[i], fact); 1014 a = vaddq_s32(a, offset); 1015 out[i] = vshrq_n_s32(a, NewSqrt2Bits); 1016 } 1017 } 1018 1019 static void highbd_fdct16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, 1020 const int howmany) { 1021 const int stride = 16; 1022 int i = 0; 1023 do { 1024 highbd_fdct16_x4_neon(in + i * stride, out + i * stride, bit); 1025 } while (++i < howmany); 1026 } 1027 1028 static void highbd_fadst16_xn_neon(const int32x4_t *in, int32x4_t *out, int bit, 1029 int howmany) { 1030 const int stride = 16; 1031 int i = 0; 1032 do { 1033 highbd_fadst16_x4_neon(in + i * stride, out + i * stride, bit); 1034 } while (++i < howmany); 1035 } 1036 1037 static void highbd_fidentity16_xn_neon(const int32x4_t *in, int32x4_t *out, 1038 int bit, int howmany) { 1039 const int stride = 16; 1040 int i = 0; 1041 do { 1042 highbd_fidentity16_x4_neon(in + i * stride, out + i * stride, bit); 1043 } while (++i < howmany); 1044 } 1045 1046 void av1_fwd_txfm2d_16x16_neon(const int16_t *input, 
int32_t *coeff, int stride, 1047 TX_TYPE tx_type, int bd) { 1048 (void)bd; 1049 int ud_flip, lr_flip; 1050 get_flip_cfg(tx_type, &ud_flip, &lr_flip); 1051 ud_adjust_input_and_stride(ud_flip, &input, &stride, 16); 1052 1053 // Workspaces for column/row-wise transforms. 1054 int32x4_t buf0[64], buf1[64]; 1055 1056 switch (tx_type) { 1057 case DCT_DCT: 1058 load_buffer_16x16(input, buf0, stride, 0); 1059 highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1060 shift_right_2_round_s32_x4(buf0, buf0, 64); 1061 transpose_arrays_s32_16x16(buf0, buf1); 1062 highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1063 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1064 break; 1065 case ADST_DCT: 1066 load_buffer_16x16(input, buf0, stride, 0); 1067 highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1068 shift_right_2_round_s32_x4(buf0, buf0, 64); 1069 transpose_arrays_s32_16x16(buf0, buf1); 1070 highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1071 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1072 break; 1073 case DCT_ADST: 1074 load_buffer_16x16(input, buf0, stride, 0); 1075 highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1076 shift_right_2_round_s32_x4(buf0, buf0, 64); 1077 transpose_arrays_s32_16x16(buf0, buf1); 1078 highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1079 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1080 break; 1081 case ADST_ADST: 1082 load_buffer_16x16(input, buf0, stride, 0); 1083 highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1084 shift_right_2_round_s32_x4(buf0, buf0, 64); 1085 transpose_arrays_s32_16x16(buf0, buf1); 1086 highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1087 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1088 break; 1089 case FLIPADST_DCT: 1090 load_buffer_16x16(input, buf0, stride, 0); 1091 highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1092 shift_right_2_round_s32_x4(buf0, buf0, 64); 
1093 transpose_arrays_s32_16x16(buf0, buf1); 1094 highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1095 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1096 break; 1097 case DCT_FLIPADST: 1098 load_buffer_16x16(input, buf0, stride, 1); 1099 highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1100 shift_right_2_round_s32_x4(buf0, buf0, 64); 1101 transpose_arrays_s32_16x16(buf0, buf1); 1102 highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1103 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1104 break; 1105 case FLIPADST_FLIPADST: 1106 load_buffer_16x16(input, buf0, stride, 1); 1107 highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1108 shift_right_2_round_s32_x4(buf0, buf0, 64); 1109 transpose_arrays_s32_16x16(buf0, buf1); 1110 highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1111 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1112 break; 1113 case ADST_FLIPADST: 1114 load_buffer_16x16(input, buf0, stride, 1); 1115 highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1116 shift_right_2_round_s32_x4(buf0, buf0, 64); 1117 transpose_arrays_s32_16x16(buf0, buf1); 1118 highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1119 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1120 break; 1121 case FLIPADST_ADST: 1122 load_buffer_16x16(input, buf0, stride, 0); 1123 highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1124 shift_right_2_round_s32_x4(buf0, buf0, 64); 1125 transpose_arrays_s32_16x16(buf0, buf1); 1126 highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1127 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1128 break; 1129 case IDTX: 1130 load_buffer_16x16(input, buf0, stride, 0); 1131 highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1132 shift_right_2_round_s32_x4(buf0, buf0, 64); 1133 transpose_arrays_s32_16x16(buf0, buf1); 1134 highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1135 
store_buffer_16x16(buf1, coeff, /*stride=*/16); 1136 break; 1137 case V_DCT: 1138 load_buffer_16x16(input, buf0, stride, 0); 1139 highbd_fdct16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1140 shift_right_2_round_s32_x4(buf0, buf0, 64); 1141 transpose_arrays_s32_16x16(buf0, buf1); 1142 highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1143 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1144 break; 1145 case H_DCT: 1146 load_buffer_16x16(input, buf0, stride, 0); 1147 highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1148 shift_right_2_round_s32_x4(buf0, buf0, 64); 1149 transpose_arrays_s32_16x16(buf0, buf1); 1150 highbd_fdct16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1151 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1152 break; 1153 case V_ADST: 1154 load_buffer_16x16(input, buf0, stride, 0); 1155 highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1156 shift_right_2_round_s32_x4(buf0, buf0, 64); 1157 transpose_arrays_s32_16x16(buf0, buf1); 1158 highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1159 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1160 break; 1161 case H_ADST: 1162 load_buffer_16x16(input, buf0, stride, 0); 1163 highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1164 shift_right_2_round_s32_x4(buf0, buf0, 64); 1165 transpose_arrays_s32_16x16(buf0, buf1); 1166 highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1167 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1168 break; 1169 case V_FLIPADST: 1170 load_buffer_16x16(input, buf0, stride, 0); 1171 highbd_fadst16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1172 shift_right_2_round_s32_x4(buf0, buf0, 64); 1173 transpose_arrays_s32_16x16(buf0, buf1); 1174 highbd_fidentity16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1175 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1176 break; 1177 case H_FLIPADST: 1178 load_buffer_16x16(input, buf0, stride, 1); 1179 
highbd_fidentity16_xn_neon(buf0, buf0, av1_fwd_cos_bit_col[2][2], 4); 1180 shift_right_2_round_s32_x4(buf0, buf0, 64); 1181 transpose_arrays_s32_16x16(buf0, buf1); 1182 highbd_fadst16_xn_neon(buf1, buf1, av1_fwd_cos_bit_row[2][2], 4); 1183 store_buffer_16x16(buf1, coeff, /*stride=*/16); 1184 break; 1185 default: assert(0); 1186 } 1187 } 1188 1189 typedef void (*fwd_transform_1d_col_neon)(const int16_t *in, int32x4_t *out, 1190 int stride, int bit, int lr_flip); 1191 typedef void (*fwd_transform_1d_col_many_neon)(const int16_t *in, 1192 int32x4_t *out, int stride, 1193 int bit, int lr_flip, 1194 int howmany, int hm_stride); 1195 1196 typedef void (*fwd_transform_1d_row_neon)(const int32x4_t *in, int32_t *out, 1197 int bit, int stride); 1198 typedef void (*fwd_transform_1d_row_many_neon)(const int32x4_t *in, 1199 int32_t *out, int bit, 1200 int howmany, int hm_stride, 1201 int stride); 1202 1203 // Construct component kernels that include the load_buffer and store_buffer 1204 // stages to avoid the need to spill loaded data to the stack between these and 1205 // the txfm kernel calls. 1206 // The TRANSFORM_*_ONE cases are only ever called in situations where the 1207 // howmany parameter would be one, so no need for the loop at all in these 1208 // cases. 
1209 1210 #define TRANSFORM_COL_ONE(name, n) \ 1211 static void highbd_##name##_col_neon(const int16_t *input, \ 1212 int32x4_t *output, int stride, \ 1213 int cos_bit, int lr_flip) { \ 1214 int32x4_t buf0[n]; \ 1215 load_buffer_4x##n(input, buf0, stride, lr_flip); \ 1216 highbd_##name##_x4_neon(buf0, output, cos_bit); \ 1217 } 1218 1219 #define TRANSFORM_COL_MANY(name, n) \ 1220 static void highbd_##name##_col_many_neon( \ 1221 const int16_t *input, int32x4_t *output, int stride, int cos_bit, \ 1222 int lr_flip, int howmany, int hm_stride) { \ 1223 int i = 0; \ 1224 do { \ 1225 int32x4_t buf0[n]; \ 1226 load_buffer_4x##n(input + 4 * i, buf0, stride, lr_flip); \ 1227 highbd_##name##_x4_neon(buf0, output + i * hm_stride, cos_bit); \ 1228 } while (++i < howmany); \ 1229 } 1230 1231 #define TRANSFORM_ROW_ONE(name, n) \ 1232 static void highbd_##name##_row_neon( \ 1233 const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \ 1234 int32x4_t buf0[n]; \ 1235 highbd_##name##_x4_neon(input, buf0, cos_bit); \ 1236 store_buffer_##n##x4(buf0, output, stride); \ 1237 } 1238 1239 #define TRANSFORM_ROW_RECT_ONE(name, n) \ 1240 static void highbd_##name##_row_rect_neon( \ 1241 const int32x4_t *input, int32_t *output, int cos_bit, int stride) { \ 1242 int32x4_t buf0[n]; \ 1243 highbd_##name##_x4_neon(input, buf0, cos_bit); \ 1244 round_rect_array_s32_neon(buf0, buf0, (n)); \ 1245 store_buffer_##n##x4(buf0, output, stride); \ 1246 } 1247 1248 #define TRANSFORM_ROW_MANY(name, n) \ 1249 static void highbd_##name##_row_many_neon( \ 1250 const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \ 1251 int hm_stride, int stride) { \ 1252 int i = 0; \ 1253 do { \ 1254 int32x4_t buf0[n]; \ 1255 highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \ 1256 store_buffer_##n##x4(buf0, output + 4 * i, stride); \ 1257 } while (++i < howmany); \ 1258 } 1259 1260 #define TRANSFORM_ROW_RECT_MANY(name, n) \ 1261 static void highbd_##name##_row_rect_many_neon( \ 1262 
const int32x4_t *input, int32_t *output, int cos_bit, int howmany, \ 1263 int hm_stride, int stride) { \ 1264 int i = 0; \ 1265 do { \ 1266 int32x4_t buf0[n]; \ 1267 highbd_##name##_x4_neon(input + hm_stride * i, buf0, cos_bit); \ 1268 round_rect_array_s32_neon(buf0, buf0, (n)); \ 1269 store_buffer_##n##x4(buf0, output + 4 * i, stride); \ 1270 } while (++i < howmany); \ 1271 } 1272 1273 TRANSFORM_COL_ONE(fdct8, 8) 1274 TRANSFORM_COL_ONE(fadst8, 8) 1275 TRANSFORM_COL_ONE(fidentity8, 8) 1276 1277 TRANSFORM_COL_MANY(fdct4, 4) 1278 TRANSFORM_COL_MANY(fdct8, 8) 1279 TRANSFORM_COL_MANY(fdct16, 16) 1280 TRANSFORM_COL_MANY(fadst4, 4) 1281 TRANSFORM_COL_MANY(fadst8, 8) 1282 TRANSFORM_COL_MANY(fadst16, 16) 1283 TRANSFORM_COL_MANY(fidentity4, 4) 1284 TRANSFORM_COL_MANY(fidentity8, 8) 1285 TRANSFORM_COL_MANY(fidentity16, 16) 1286 1287 TRANSFORM_ROW_ONE(fdct16, 16) 1288 TRANSFORM_ROW_ONE(fadst16, 16) 1289 TRANSFORM_ROW_ONE(fidentity16, 16) 1290 1291 TRANSFORM_ROW_RECT_ONE(fdct8, 8) 1292 TRANSFORM_ROW_RECT_ONE(fadst8, 8) 1293 TRANSFORM_ROW_RECT_ONE(fidentity8, 8) 1294 1295 #if !CONFIG_REALTIME_ONLY 1296 TRANSFORM_ROW_MANY(fdct4, 4) 1297 TRANSFORM_ROW_MANY(fdct8, 8) 1298 TRANSFORM_ROW_MANY(fadst4, 4) 1299 TRANSFORM_ROW_MANY(fadst8, 8) 1300 TRANSFORM_ROW_MANY(fidentity4, 4) 1301 TRANSFORM_ROW_MANY(fidentity8, 8) 1302 #endif 1303 1304 TRANSFORM_ROW_RECT_MANY(fdct4, 4) 1305 TRANSFORM_ROW_RECT_MANY(fdct8, 8) 1306 TRANSFORM_ROW_RECT_MANY(fdct16, 16) 1307 TRANSFORM_ROW_RECT_MANY(fadst4, 4) 1308 TRANSFORM_ROW_RECT_MANY(fadst8, 8) 1309 TRANSFORM_ROW_RECT_MANY(fadst16, 16) 1310 TRANSFORM_ROW_RECT_MANY(fidentity4, 4) 1311 TRANSFORM_ROW_RECT_MANY(fidentity8, 8) 1312 TRANSFORM_ROW_RECT_MANY(fidentity16, 16) 1313 1314 static const fwd_transform_1d_col_many_neon 1315 col_highbd_txfm8_xn_arr[TX_TYPES] = { 1316 highbd_fdct8_col_many_neon, // DCT_DCT 1317 highbd_fadst8_col_many_neon, // ADST_DCT 1318 highbd_fdct8_col_many_neon, // DCT_ADST 1319 highbd_fadst8_col_many_neon, // ADST_ADST 1320 
highbd_fadst8_col_many_neon, // FLIPADST_DCT 1321 highbd_fdct8_col_many_neon, // DCT_FLIPADST 1322 highbd_fadst8_col_many_neon, // FLIPADST_FLIPADST 1323 highbd_fadst8_col_many_neon, // ADST_FLIPADST 1324 highbd_fadst8_col_many_neon, // FLIPADST_ADST 1325 highbd_fidentity8_col_many_neon, // IDTX 1326 highbd_fdct8_col_many_neon, // V_DCT 1327 highbd_fidentity8_col_many_neon, // H_DCT 1328 highbd_fadst8_col_many_neon, // V_ADST 1329 highbd_fidentity8_col_many_neon, // H_ADST 1330 highbd_fadst8_col_many_neon, // V_FLIPADST 1331 highbd_fidentity8_col_many_neon // H_FLIPADST 1332 }; 1333 1334 static const fwd_transform_1d_col_neon col_highbd_txfm8_x4_arr[TX_TYPES] = { 1335 highbd_fdct8_col_neon, // DCT_DCT 1336 highbd_fadst8_col_neon, // ADST_DCT 1337 highbd_fdct8_col_neon, // DCT_ADST 1338 highbd_fadst8_col_neon, // ADST_ADST 1339 highbd_fadst8_col_neon, // FLIPADST_DCT 1340 highbd_fdct8_col_neon, // DCT_FLIPADST 1341 highbd_fadst8_col_neon, // FLIPADST_FLIPADST 1342 highbd_fadst8_col_neon, // ADST_FLIPADST 1343 highbd_fadst8_col_neon, // FLIPADST_ADST 1344 highbd_fidentity8_col_neon, // IDTX 1345 highbd_fdct8_col_neon, // V_DCT 1346 highbd_fidentity8_col_neon, // H_DCT 1347 highbd_fadst8_col_neon, // V_ADST 1348 highbd_fidentity8_col_neon, // H_ADST 1349 highbd_fadst8_col_neon, // V_FLIPADST 1350 highbd_fidentity8_col_neon // H_FLIPADST 1351 }; 1352 1353 static const fwd_transform_1d_col_many_neon 1354 col_highbd_txfm16_xn_arr[TX_TYPES] = { 1355 highbd_fdct16_col_many_neon, // DCT_DCT 1356 highbd_fadst16_col_many_neon, // ADST_DCT 1357 highbd_fdct16_col_many_neon, // DCT_ADST 1358 highbd_fadst16_col_many_neon, // ADST_ADST 1359 highbd_fadst16_col_many_neon, // FLIPADST_DCT 1360 highbd_fdct16_col_many_neon, // DCT_FLIPADST 1361 highbd_fadst16_col_many_neon, // FLIPADST_FLIPADST 1362 highbd_fadst16_col_many_neon, // ADST_FLIPADST 1363 highbd_fadst16_col_many_neon, // FLIPADST_ADST 1364 highbd_fidentity16_col_many_neon, // IDTX 1365 highbd_fdct16_col_many_neon, // V_DCT 
1366 highbd_fidentity16_col_many_neon, // H_DCT 1367 highbd_fadst16_col_many_neon, // V_ADST 1368 highbd_fidentity16_col_many_neon, // H_ADST 1369 highbd_fadst16_col_many_neon, // V_FLIPADST 1370 highbd_fidentity16_col_many_neon // H_FLIPADST 1371 }; 1372 1373 static const fwd_transform_1d_col_many_neon 1374 col_highbd_txfm4_xn_arr[TX_TYPES] = { 1375 highbd_fdct4_col_many_neon, // DCT_DCT 1376 highbd_fadst4_col_many_neon, // ADST_DCT 1377 highbd_fdct4_col_many_neon, // DCT_ADST 1378 highbd_fadst4_col_many_neon, // ADST_ADST 1379 highbd_fadst4_col_many_neon, // FLIPADST_DCT 1380 highbd_fdct4_col_many_neon, // DCT_FLIPADST 1381 highbd_fadst4_col_many_neon, // FLIPADST_FLIPADST 1382 highbd_fadst4_col_many_neon, // ADST_FLIPADST 1383 highbd_fadst4_col_many_neon, // FLIPADST_ADST 1384 highbd_fidentity4_col_many_neon, // IDTX 1385 highbd_fdct4_col_many_neon, // V_DCT 1386 highbd_fidentity4_col_many_neon, // H_DCT 1387 highbd_fadst4_col_many_neon, // V_ADST 1388 highbd_fidentity4_col_many_neon, // H_ADST 1389 highbd_fadst4_col_many_neon, // V_FLIPADST 1390 highbd_fidentity4_col_many_neon // H_FLIPADST 1391 }; 1392 1393 static const fwd_transform_1d_row_neon row_highbd_txfm16_xn_arr[TX_TYPES] = { 1394 highbd_fdct16_row_neon, // DCT_DCT 1395 highbd_fdct16_row_neon, // ADST_DCT 1396 highbd_fadst16_row_neon, // DCT_ADST 1397 highbd_fadst16_row_neon, // ADST_ADST 1398 highbd_fdct16_row_neon, // FLIPADST_DCT 1399 highbd_fadst16_row_neon, // DCT_FLIPADST 1400 highbd_fadst16_row_neon, // FLIPADST_FLIPADST 1401 highbd_fadst16_row_neon, // ADST_FLIPADST 1402 highbd_fadst16_row_neon, // FLIPADST_ADST 1403 highbd_fidentity16_row_neon, // IDTX 1404 highbd_fidentity16_row_neon, // V_DCT 1405 highbd_fdct16_row_neon, // H_DCT 1406 highbd_fidentity16_row_neon, // V_ADST 1407 highbd_fadst16_row_neon, // H_ADST 1408 highbd_fidentity16_row_neon, // V_FLIPADST 1409 highbd_fadst16_row_neon // H_FLIPADST 1410 }; 1411 1412 static const fwd_transform_1d_row_many_neon 1413 
row_rect_highbd_txfm16_xn_arr[TX_TYPES] = { 1414 highbd_fdct16_row_rect_many_neon, // DCT_DCT 1415 highbd_fdct16_row_rect_many_neon, // ADST_DCT 1416 highbd_fadst16_row_rect_many_neon, // DCT_ADST 1417 highbd_fadst16_row_rect_many_neon, // ADST_ADST 1418 highbd_fdct16_row_rect_many_neon, // FLIPADST_DCT 1419 highbd_fadst16_row_rect_many_neon, // DCT_FLIPADST 1420 highbd_fadst16_row_rect_many_neon, // FLIPADST_FLIPADST 1421 highbd_fadst16_row_rect_many_neon, // ADST_FLIPADST 1422 highbd_fadst16_row_rect_many_neon, // FLIPADST_ADST 1423 highbd_fidentity16_row_rect_many_neon, // IDTX 1424 highbd_fidentity16_row_rect_many_neon, // V_DCT 1425 highbd_fdct16_row_rect_many_neon, // H_DCT 1426 highbd_fidentity16_row_rect_many_neon, // V_ADST 1427 highbd_fadst16_row_rect_many_neon, // H_ADST 1428 highbd_fidentity16_row_rect_many_neon, // V_FLIPADST 1429 highbd_fadst16_row_rect_many_neon // H_FLIPADST 1430 }; 1431 1432 #if !CONFIG_REALTIME_ONLY 1433 static const fwd_transform_1d_row_many_neon 1434 row_highbd_txfm8_xn_arr[TX_TYPES] = { 1435 highbd_fdct8_row_many_neon, // DCT_DCT 1436 highbd_fdct8_row_many_neon, // ADST_DCT 1437 highbd_fadst8_row_many_neon, // DCT_ADST 1438 highbd_fadst8_row_many_neon, // ADST_ADST 1439 highbd_fdct8_row_many_neon, // FLIPADST_DCT 1440 highbd_fadst8_row_many_neon, // DCT_FLIPADST 1441 highbd_fadst8_row_many_neon, // FLIPADST_FLIPADST 1442 highbd_fadst8_row_many_neon, // ADST_FLIPADST 1443 highbd_fadst8_row_many_neon, // FLIPADST_ADST 1444 highbd_fidentity8_row_many_neon, // IDTX 1445 highbd_fidentity8_row_many_neon, // V_DCT 1446 highbd_fdct8_row_many_neon, // H_DCT 1447 highbd_fidentity8_row_many_neon, // V_ADST 1448 highbd_fadst8_row_many_neon, // H_ADST 1449 highbd_fidentity8_row_many_neon, // V_FLIPADST 1450 highbd_fadst8_row_many_neon // H_FLIPADST 1451 }; 1452 #endif 1453 1454 static const fwd_transform_1d_row_many_neon 1455 row_rect_highbd_txfm8_xn_arr[TX_TYPES] = { 1456 highbd_fdct8_row_rect_many_neon, // DCT_DCT 1457 
highbd_fdct8_row_rect_many_neon, // ADST_DCT 1458 highbd_fadst8_row_rect_many_neon, // DCT_ADST 1459 highbd_fadst8_row_rect_many_neon, // ADST_ADST 1460 highbd_fdct8_row_rect_many_neon, // FLIPADST_DCT 1461 highbd_fadst8_row_rect_many_neon, // DCT_FLIPADST 1462 highbd_fadst8_row_rect_many_neon, // FLIPADST_FLIPADST 1463 highbd_fadst8_row_rect_many_neon, // ADST_FLIPADST 1464 highbd_fadst8_row_rect_many_neon, // FLIPADST_ADST 1465 highbd_fidentity8_row_rect_many_neon, // IDTX 1466 highbd_fidentity8_row_rect_many_neon, // V_DCT 1467 highbd_fdct8_row_rect_many_neon, // H_DCT 1468 highbd_fidentity8_row_rect_many_neon, // V_ADST 1469 highbd_fadst8_row_rect_many_neon, // H_ADST 1470 highbd_fidentity8_row_rect_many_neon, // V_FLIPADST 1471 highbd_fadst8_row_rect_many_neon // H_FLIPADST 1472 }; 1473 1474 static const fwd_transform_1d_row_neon row_highbd_txfm8_x4_arr[TX_TYPES] = { 1475 highbd_fdct8_row_rect_neon, // DCT_DCT 1476 highbd_fdct8_row_rect_neon, // ADST_DCT 1477 highbd_fadst8_row_rect_neon, // DCT_ADST 1478 highbd_fadst8_row_rect_neon, // ADST_ADST 1479 highbd_fdct8_row_rect_neon, // FLIPADST_DCT 1480 highbd_fadst8_row_rect_neon, // DCT_FLIPADST 1481 highbd_fadst8_row_rect_neon, // FLIPADST_FLIPADST 1482 highbd_fadst8_row_rect_neon, // ADST_FLIPADST 1483 highbd_fadst8_row_rect_neon, // FLIPADST_ADST 1484 highbd_fidentity8_row_rect_neon, // IDTX 1485 highbd_fidentity8_row_rect_neon, // V_DCT 1486 highbd_fdct8_row_rect_neon, // H_DCT 1487 highbd_fidentity8_row_rect_neon, // V_ADST 1488 highbd_fadst8_row_rect_neon, // H_ADST 1489 highbd_fidentity8_row_rect_neon, // V_FLIPADST 1490 highbd_fadst8_row_rect_neon // H_FLIPADST 1491 }; 1492 1493 #if !CONFIG_REALTIME_ONLY 1494 static const fwd_transform_1d_row_many_neon 1495 row_highbd_txfm4_xn_arr[TX_TYPES] = { 1496 highbd_fdct4_row_many_neon, // DCT_DCT 1497 highbd_fdct4_row_many_neon, // ADST_DCT 1498 highbd_fadst4_row_many_neon, // DCT_ADST 1499 highbd_fadst4_row_many_neon, // ADST_ADST 1500 highbd_fdct4_row_many_neon, 
// FLIPADST_DCT 1501 highbd_fadst4_row_many_neon, // DCT_FLIPADST 1502 highbd_fadst4_row_many_neon, // FLIPADST_FLIPADST 1503 highbd_fadst4_row_many_neon, // ADST_FLIPADST 1504 highbd_fadst4_row_many_neon, // FLIPADST_ADST 1505 highbd_fidentity4_row_many_neon, // IDTX 1506 highbd_fidentity4_row_many_neon, // V_DCT 1507 highbd_fdct4_row_many_neon, // H_DCT 1508 highbd_fidentity4_row_many_neon, // V_ADST 1509 highbd_fadst4_row_many_neon, // H_ADST 1510 highbd_fidentity4_row_many_neon, // V_FLIPADST 1511 highbd_fadst4_row_many_neon // H_FLIPADST 1512 }; 1513 #endif 1514 1515 static const fwd_transform_1d_row_many_neon 1516 row_rect_highbd_txfm4_xn_arr[TX_TYPES] = { 1517 highbd_fdct4_row_rect_many_neon, // DCT_DCT 1518 highbd_fdct4_row_rect_many_neon, // ADST_DCT 1519 highbd_fadst4_row_rect_many_neon, // DCT_ADST 1520 highbd_fadst4_row_rect_many_neon, // ADST_ADST 1521 highbd_fdct4_row_rect_many_neon, // FLIPADST_DCT 1522 highbd_fadst4_row_rect_many_neon, // DCT_FLIPADST 1523 highbd_fadst4_row_rect_many_neon, // FLIPADST_FLIPADST 1524 highbd_fadst4_row_rect_many_neon, // ADST_FLIPADST 1525 highbd_fadst4_row_rect_many_neon, // FLIPADST_ADST 1526 highbd_fidentity4_row_rect_many_neon, // IDTX 1527 highbd_fidentity4_row_rect_many_neon, // V_DCT 1528 highbd_fdct4_row_rect_many_neon, // H_DCT 1529 highbd_fidentity4_row_rect_many_neon, // V_ADST 1530 highbd_fadst4_row_rect_many_neon, // H_ADST 1531 highbd_fidentity4_row_rect_many_neon, // V_FLIPADST 1532 highbd_fadst4_row_rect_many_neon // H_FLIPADST 1533 }; 1534 1535 static void highbd_fdct32_x4_neon(const int32x4_t *input, int32x4_t *output, 1536 int cos_bit) { 1537 const int32_t *const cospi = cospi_arr_s32(cos_bit); 1538 const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit); 1539 1540 // Workspaces for intermediate transform steps. 
1541 int32x4_t buf0[32]; 1542 int32x4_t buf1[32]; 1543 1544 // stage 1 1545 butterfly_dct_pre(input, buf1, 32); 1546 1547 // stage 2 1548 butterfly_dct_pre(buf1, buf0, 16); 1549 buf0[16] = buf1[16]; 1550 buf0[17] = buf1[17]; 1551 buf0[18] = buf1[18]; 1552 buf0[19] = buf1[19]; 1553 butterfly_0112_neon(cospi, 32, buf1[27], buf1[20], &buf0[27], &buf0[20], 1554 v_cos_bit); 1555 butterfly_0112_neon(cospi, 32, buf1[26], buf1[21], &buf0[26], &buf0[21], 1556 v_cos_bit); 1557 butterfly_0112_neon(cospi, 32, buf1[25], buf1[22], &buf0[25], &buf0[22], 1558 v_cos_bit); 1559 butterfly_0112_neon(cospi, 32, buf1[24], buf1[23], &buf0[24], &buf0[23], 1560 v_cos_bit); 1561 buf0[28] = buf1[28]; 1562 buf0[29] = buf1[29]; 1563 buf0[30] = buf1[30]; 1564 buf0[31] = buf1[31]; 1565 1566 // stage 3 1567 butterfly_dct_pre(buf0, buf1, 8); 1568 buf1[8] = buf0[8]; 1569 buf1[9] = buf0[9]; 1570 butterfly_0112_neon(cospi, 32, buf0[13], buf0[10], &buf1[13], &buf1[10], 1571 v_cos_bit); 1572 butterfly_0112_neon(cospi, 32, buf0[12], buf0[11], &buf1[12], &buf1[11], 1573 v_cos_bit); 1574 buf1[14] = buf0[14]; 1575 buf1[15] = buf0[15]; 1576 butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 16); 1577 1578 // stage 4 1579 butterfly_dct_pre(buf1, buf0, 4); 1580 buf0[4] = buf1[4]; 1581 butterfly_0112_neon(cospi, 32, buf1[6], buf1[5], &buf0[6], &buf0[5], 1582 v_cos_bit); 1583 buf0[7] = buf1[7]; 1584 butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 8); 1585 buf0[16] = buf1[16]; 1586 buf0[17] = buf1[17]; 1587 butterfly_0112_neon(cospi, 16, buf1[29], buf1[18], &buf0[29], &buf0[18], 1588 v_cos_bit); 1589 butterfly_0112_neon(cospi, 16, buf1[28], buf1[19], &buf0[28], &buf0[19], 1590 v_cos_bit); 1591 butterfly_2312_neon(cospi, 16, buf1[27], buf1[20], &buf0[20], &buf0[27], 1592 v_cos_bit); 1593 butterfly_2312_neon(cospi, 16, buf1[26], buf1[21], &buf0[21], &buf0[26], 1594 v_cos_bit); 1595 buf0[22] = buf1[22]; 1596 buf0[23] = buf1[23]; 1597 buf0[24] = buf1[24]; 1598 buf0[25] = buf1[25]; 1599 buf0[30] = buf1[30]; 1600 
buf0[31] = buf1[31]; 1601 1602 // stage 5 1603 butterfly_0112_neon(cospi, 32, buf0[0], buf0[1], &buf1[0], &buf1[1], 1604 v_cos_bit); 1605 butterfly_0112_neon(cospi, 16, buf0[3], buf0[2], &buf1[2], &buf1[3], 1606 v_cos_bit); 1607 butterfly_dct_post(buf0 + 4, buf0 + 4, buf1 + 4, 4); 1608 buf1[8] = buf0[8]; 1609 butterfly_0112_neon(cospi, 16, buf0[14], buf0[9], &buf1[14], &buf1[9], 1610 v_cos_bit); 1611 butterfly_2312_neon(cospi, 16, buf0[13], buf0[10], &buf1[10], &buf1[13], 1612 v_cos_bit); 1613 buf1[11] = buf0[11]; 1614 buf1[12] = buf0[12]; 1615 buf1[15] = buf0[15]; 1616 butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 8); 1617 butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 8); 1618 1619 // stage 6 1620 buf0[0] = buf1[0]; 1621 buf0[1] = buf1[1]; 1622 buf0[2] = buf1[2]; 1623 buf0[3] = buf1[3]; 1624 1625 butterfly_0112_neon(cospi, 8, buf1[7], buf1[4], &buf0[4], &buf0[7], 1626 v_cos_bit); 1627 butterfly_0112_neon(cospi, 8, buf1[30], buf1[17], &buf0[30], &buf0[17], 1628 v_cos_bit); 1629 butterfly_2312_neon(cospi, 8, buf1[29], buf1[18], &buf0[18], &buf0[29], 1630 v_cos_bit); 1631 butterfly_dct_post(buf1 + 8, buf1 + 8, buf0 + 8, 4); 1632 butterfly_dct_post(buf1 + 12, buf1 + 12, buf0 + 12, 4); 1633 buf0[16] = buf1[16]; 1634 buf0[19] = buf1[19]; 1635 buf0[20] = buf1[20]; 1636 1637 butterfly_0130_neon(cospi, 24, buf1[5], buf1[6], &buf0[5], &buf0[6], 1638 v_cos_bit); 1639 butterfly_0130_neon(cospi, 24, buf1[21], buf1[26], &buf0[26], &buf0[21], 1640 v_cos_bit); 1641 butterfly_0332_neon(cospi, 24, buf1[25], buf1[22], &buf0[25], &buf0[22], 1642 v_cos_bit); 1643 1644 buf0[23] = buf1[23]; 1645 buf0[24] = buf1[24]; 1646 buf0[27] = buf1[27]; 1647 buf0[28] = buf1[28]; 1648 buf0[31] = buf1[31]; 1649 1650 // stage 7 1651 buf1[0] = buf0[0]; 1652 buf1[1] = buf0[1]; 1653 buf1[2] = buf0[2]; 1654 buf1[3] = buf0[3]; 1655 buf1[4] = buf0[4]; 1656 buf1[5] = buf0[5]; 1657 buf1[6] = buf0[6]; 1658 buf1[7] = buf0[7]; 1659 butterfly_0112_neon(cospi, 4, buf0[15], buf0[8], &buf1[8], &buf1[15], 
1660 v_cos_bit); 1661 butterfly_0130_neon(cospi, 28, buf0[9], buf0[14], &buf1[9], &buf1[14], 1662 v_cos_bit); 1663 butterfly_0112_neon(cospi, 20, buf0[13], buf0[10], &buf1[10], &buf1[13], 1664 v_cos_bit); 1665 butterfly_0130_neon(cospi, 12, buf0[11], buf0[12], &buf1[11], &buf1[12], 1666 v_cos_bit); 1667 butterfly_dct_post(buf0 + 16, buf0 + 16, buf1 + 16, 4); 1668 butterfly_dct_post(buf0 + 20, buf0 + 20, buf1 + 20, 4); 1669 butterfly_dct_post(buf0 + 24, buf0 + 24, buf1 + 24, 4); 1670 butterfly_dct_post(buf0 + 28, buf0 + 28, buf1 + 28, 4); 1671 1672 // stage 8 1673 buf0[0] = buf1[0]; 1674 buf0[1] = buf1[1]; 1675 buf0[2] = buf1[2]; 1676 buf0[3] = buf1[3]; 1677 buf0[4] = buf1[4]; 1678 buf0[5] = buf1[5]; 1679 buf0[6] = buf1[6]; 1680 buf0[7] = buf1[7]; 1681 buf0[8] = buf1[8]; 1682 buf0[9] = buf1[9]; 1683 buf0[10] = buf1[10]; 1684 buf0[11] = buf1[11]; 1685 buf0[12] = buf1[12]; 1686 buf0[13] = buf1[13]; 1687 buf0[14] = buf1[14]; 1688 buf0[15] = buf1[15]; 1689 butterfly_0112_neon(cospi, 2, buf1[31], buf1[16], &buf0[16], &buf0[31], 1690 v_cos_bit); 1691 butterfly_0130_neon(cospi, 30, buf1[17], buf1[30], &buf0[17], &buf0[30], 1692 v_cos_bit); 1693 butterfly_0112_neon(cospi, 18, buf1[29], buf1[18], &buf0[18], &buf0[29], 1694 v_cos_bit); 1695 butterfly_0130_neon(cospi, 14, buf1[19], buf1[28], &buf0[19], &buf0[28], 1696 v_cos_bit); 1697 butterfly_0112_neon(cospi, 10, buf1[27], buf1[20], &buf0[20], &buf0[27], 1698 v_cos_bit); 1699 butterfly_0130_neon(cospi, 22, buf1[21], buf1[26], &buf0[21], &buf0[26], 1700 v_cos_bit); 1701 butterfly_0112_neon(cospi, 26, buf1[25], buf1[22], &buf0[22], &buf0[25], 1702 v_cos_bit); 1703 butterfly_0130_neon(cospi, 6, buf1[23], buf1[24], &buf0[23], &buf0[24], 1704 v_cos_bit); 1705 1706 // stage 9 1707 output[0] = buf0[0]; 1708 output[1] = buf0[16]; 1709 output[2] = buf0[8]; 1710 output[3] = buf0[24]; 1711 output[4] = buf0[4]; 1712 output[5] = buf0[20]; 1713 output[6] = buf0[12]; 1714 output[7] = buf0[28]; 1715 output[8] = buf0[2]; 1716 output[9] = 
buf0[18];
  output[10] = buf0[10];
  output[11] = buf0[26];
  output[12] = buf0[6];
  output[13] = buf0[22];
  output[14] = buf0[14];
  output[15] = buf0[30];
  output[16] = buf0[1];
  output[17] = buf0[17];
  output[18] = buf0[9];
  output[19] = buf0[25];
  output[20] = buf0[5];
  output[21] = buf0[21];
  output[22] = buf0[13];
  output[23] = buf0[29];
  output[24] = buf0[3];
  output[25] = buf0[19];
  output[26] = buf0[11];
  output[27] = buf0[27];
  output[28] = buf0[7];
  output[29] = buf0[23];
  output[30] = buf0[15];
  output[31] = buf0[31];
}

// 64-point forward DCT over an array of 64 int32x4_t vectors.
//
//   input   - 64 source vectors. Only read by stage 1, so `input` and
//             `output` may alias (every caller in this file passes the same
//             pointer for both).
//   output  - 64 result vectors, written only in stage 11.
//   cos_bit - selects the cosine table via cospi_arr_s32() and is the number
//             of bits dropped by the rounding shift in each butterfly.
//
// Stages 1-10 form the butterfly network; every stage writes a fresh xN[64]
// array, so no stage reads values it has already overwritten. Stage 11 only
// reorders the results.
static void highbd_fdct64_x4_neon(const int32x4_t *input, int32x4_t *output,
                                  int8_t cos_bit) {
  const int32_t *const cospi = cospi_arr_s32(cos_bit);
  // The butterfly helpers finish with vrshlq_s32(x, v_bit); a negative shift
  // count turns that into a rounding right shift by cos_bit.
  const int32x4_t v_cos_bit = vdupq_n_s32(-cos_bit);

  // stage 1
  int32x4_t x1[64];
  butterfly_dct_pre(input, x1, 64);

  // stage 2
  // Entries 32..39 and 56..63 are carried over; 40..55 are combined pairwise
  // (40 with 55, 41 with 54, ...) using weight index 32.
  int32x4_t x2[64];
  butterfly_dct_pre(x1, x2, 32);
  x2[32] = x1[32];
  x2[33] = x1[33];
  x2[34] = x1[34];
  x2[35] = x1[35];
  x2[36] = x1[36];
  x2[37] = x1[37];
  x2[38] = x1[38];
  x2[39] = x1[39];
  butterfly_0112_neon(cospi, 32, x1[55], x1[40], &x2[55], &x2[40], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[54], x1[41], &x2[54], &x2[41], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[53], x1[42], &x2[53], &x2[42], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[52], x1[43], &x2[52], &x2[43], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[51], x1[44], &x2[51], &x2[44], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[50], x1[45], &x2[50], &x2[45], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[49], x1[46], &x2[49], &x2[46], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x1[48], x1[47], &x2[48], &x2[47], v_cos_bit);
  x2[56] = x1[56];
  x2[57] = x1[57];
  x2[58] = x1[58];
  x2[59] = x1[59];
  x2[60] = x1[60];
  x2[61] = x1[61];
  x2[62] = x1[62];
  x2[63] = x1[63];

  // stage 3
  int32x4_t x3[64];
  butterfly_dct_pre(x2, x3, 16);
  x3[16] = x2[16];
  x3[17] = x2[17];
  x3[18] = x2[18];
  x3[19] = x2[19];
  butterfly_0112_neon(cospi, 32, x2[27], x2[20], &x3[27], &x3[20], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x2[26], x2[21], &x3[26], &x3[21], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x2[25], x2[22], &x3[25], &x3[22], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x2[24], x2[23], &x3[24], &x3[23], v_cos_bit);
  x3[28] = x2[28];
  x3[29] = x2[29];
  x3[30] = x2[30];
  x3[31] = x2[31];
  butterfly_dct_post(x2 + 32, x2 + 32, x3 + 32, 32);

  // stage 4
  int32x4_t x4[64];
  butterfly_dct_pre(x3, x4, 8);
  x4[8] = x3[8];
  x4[9] = x3[9];
  butterfly_0112_neon(cospi, 32, x3[13], x3[10], &x4[13], &x4[10], v_cos_bit);
  butterfly_0112_neon(cospi, 32, x3[12], x3[11], &x4[12], &x4[11], v_cos_bit);
  x4[14] = x3[14];
  x4[15] = x3[15];
  butterfly_dct_post(x3 + 16, x3 + 16, x4 + 16, 16);
  x4[32] = x3[32];
  x4[33] = x3[33];
  x4[34] = x3[34];
  x4[35] = x3[35];
  butterfly_0112_neon(cospi, 16, x3[59], x3[36], &x4[59], &x4[36], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x3[58], x3[37], &x4[58], &x4[37], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x3[57], x3[38], &x4[57], &x4[38], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x3[56], x3[39], &x4[56], &x4[39], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[55], x3[40], &x4[40], &x4[55], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[54], x3[41], &x4[41], &x4[54], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[53], x3[42], &x4[42], &x4[53], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x3[52], x3[43], &x4[43], &x4[52], v_cos_bit);
  x4[44] = x3[44];
  x4[45] = x3[45];
  x4[46] = x3[46];
  x4[47] = x3[47];
  x4[48] = x3[48];
  x4[49] = x3[49];
  x4[50] = x3[50];
  x4[51] = x3[51];
  x4[60] = x3[60];
  x4[61] = x3[61];
  x4[62] = x3[62];
  x4[63] = x3[63];

  // stage 5
  int32x4_t x5[64];
  butterfly_dct_pre(x4, x5, 4);
  x5[4] = x4[4];
  butterfly_0112_neon(cospi, 32, x4[6], x4[5], &x5[6], &x5[5], v_cos_bit);
  x5[7] = x4[7];
  butterfly_dct_post(x4 + 8, x4 + 8, x5 + 8, 8);
  x5[16] = x4[16];
  x5[17] = x4[17];
  butterfly_0112_neon(cospi, 16, x4[29], x4[18], &x5[29], &x5[18], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x4[28], x4[19], &x5[28], &x5[19], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x4[27], x4[20], &x5[20], &x5[27], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x4[26], x4[21], &x5[21], &x5[26], v_cos_bit);
  x5[22] = x4[22];
  x5[23] = x4[23];
  x5[24] = x4[24];
  x5[25] = x4[25];
  x5[30] = x4[30];
  x5[31] = x4[31];
  butterfly_dct_post(x4 + 32, x4 + 32, x5 + 32, 16);
  butterfly_dct_post(x4 + 48, x4 + 48, x5 + 48, 16);

  // stage 6
  // From here on x6[0..3] are final values: later stages only copy them.
  int32x4_t x6[64];
  butterfly_0112_neon(cospi, 32, x5[0], x5[1], &x6[0], &x6[1], v_cos_bit);
  butterfly_0112_neon(cospi, 16, x5[3], x5[2], &x6[2], &x6[3], v_cos_bit);
  butterfly_dct_post(x5 + 4, x5 + 4, x6 + 4, 4);
  x6[8] = x5[8];
  butterfly_0112_neon(cospi, 16, x5[14], x5[9], &x6[14], &x6[9], v_cos_bit);
  butterfly_2312_neon(cospi, 16, x5[13], x5[10], &x6[10], &x6[13], v_cos_bit);
  x6[11] = x5[11];
  x6[12] = x5[12];
  x6[15] = x5[15];
  butterfly_dct_post(x5 + 16, x5 + 16, x6 + 16, 8);
  butterfly_dct_post(x5 + 24, x5 + 24, x6 + 24, 8);
  x6[32] = x5[32];
  x6[33] = x5[33];
  butterfly_0112_neon(cospi, 8, x5[61], x5[34], &x6[61], &x6[34], v_cos_bit);
  butterfly_0112_neon(cospi, 8, x5[60], x5[35], &x6[60], &x6[35], v_cos_bit);
  butterfly_2312_neon(cospi, 8, x5[59], x5[36], &x6[36], &x6[59], v_cos_bit);
  butterfly_2312_neon(cospi, 8, x5[58], x5[37], &x6[37], &x6[58], v_cos_bit);
  x6[38] = x5[38];
  x6[39] = x5[39];
  x6[40] = x5[40];
  x6[41] = x5[41];
  butterfly_0130_neon(cospi, 24, x5[42], x5[53], &x6[53], &x6[42], v_cos_bit);
  butterfly_0130_neon(cospi, 24, x5[43], x5[52], &x6[52], &x6[43], v_cos_bit);
  butterfly_0332_neon(cospi, 24, x5[51], x5[44], &x6[51], &x6[44], v_cos_bit);
  butterfly_0332_neon(cospi, 24, x5[50], x5[45], &x6[50], &x6[45], v_cos_bit);
  x6[46] = x5[46];
  x6[47] = x5[47];
  x6[48] = x5[48];
  x6[49] = x5[49];
  x6[54] = x5[54];
  x6[55] = x5[55];
  x6[56] = x5[56];
  x6[57] = x5[57];
  x6[62] = x5[62];
  x6[63] = x5[63];

  // stage 7
  int32x4_t x7[64];
  x7[0] = x6[0];
  x7[1] = x6[1];
  x7[2] = x6[2];
  x7[3] = x6[3];
  butterfly_0112_neon(cospi, 8, x6[7], x6[4], &x7[4], &x7[7], v_cos_bit);
  butterfly_0130_neon(cospi, 24, x6[5], x6[6], &x7[5], &x7[6], v_cos_bit);
  butterfly_dct_post(x6 + 8, x6 + 8, x7 + 8, 4);
  butterfly_dct_post(x6 + 12, x6 + 12, x7 + 12, 4);
  x7[16] = x6[16];
  butterfly_0112_neon(cospi, 8, x6[30], x6[17], &x7[30], &x7[17], v_cos_bit);
  butterfly_2312_neon(cospi, 8, x6[29], x6[18], &x7[18], &x7[29], v_cos_bit);
  x7[19] = x6[19];
  x7[20] = x6[20];
  butterfly_0130_neon(cospi, 24, x6[21], x6[26], &x7[26], &x7[21], v_cos_bit);
  butterfly_0332_neon(cospi, 24, x6[25], x6[22], &x7[25], &x7[22], v_cos_bit);
  x7[23] = x6[23];
  x7[24] = x6[24];
  x7[27] = x6[27];
  x7[28] = x6[28];
  x7[31] = x6[31];
  butterfly_dct_post(x6 + 32, x6 + 32, x7 + 32, 8);
  butterfly_dct_post(x6 + 40, x6 + 40, x7 + 40, 8);
  butterfly_dct_post(x6 + 48, x6 + 48, x7 + 48, 8);
  butterfly_dct_post(x6 + 56, x6 + 56, x7 + 56, 8);

  // stage 8
  // Entries 0..7 are final and simply carried over.
  int32x4_t x8[64];
  x8[0] = x7[0];
  x8[1] = x7[1];
  x8[2] = x7[2];
  x8[3] = x7[3];
  x8[4] = x7[4];
  x8[5] = x7[5];
  x8[6] = x7[6];
  x8[7] = x7[7];

  butterfly_0112_neon(cospi, 4, x7[15], x7[8], &x8[8], &x8[15], v_cos_bit);
  butterfly_0130_neon(cospi, 28, x7[9], x7[14], &x8[9], &x8[14], v_cos_bit);
  butterfly_0112_neon(cospi, 20, x7[13], x7[10], &x8[10], &x8[13], v_cos_bit);
  butterfly_0130_neon(cospi, 12, x7[11], x7[12], &x8[11], &x8[12], v_cos_bit);
  butterfly_dct_post(x7 + 16, x7 + 16, x8 + 16, 4);
  butterfly_dct_post(x7 + 20, x7 + 20, x8 + 20, 4);
  butterfly_dct_post(x7 + 24, x7 + 24, x8 + 24, 4);
  butterfly_dct_post(x7 + 28, x7 + 28, x8 + 28, 4);
  x8[32] = x7[32];
  butterfly_0112_neon(cospi, 4, x7[62], x7[33], &x8[62], &x8[33], v_cos_bit);
  butterfly_2312_neon(cospi, 4, x7[61], x7[34], &x8[34], &x8[61], v_cos_bit);
  x8[35] = x7[35];
  x8[36] = x7[36];
  butterfly_0130_neon(cospi, 28, x7[37], x7[58], &x8[58], &x8[37], v_cos_bit);
  butterfly_0332_neon(cospi, 28, x7[57], x7[38], &x8[57], &x8[38], v_cos_bit);
  x8[39] = x7[39];
  x8[40] = x7[40];
  butterfly_0112_neon(cospi, 20, x7[54], x7[41], &x8[54], &x8[41], v_cos_bit);
  butterfly_2312_neon(cospi, 20, x7[53], x7[42], &x8[42], &x8[53], v_cos_bit);
  x8[43] = x7[43];
  x8[44] = x7[44];
  butterfly_0130_neon(cospi, 12, x7[45], x7[50], &x8[50], &x8[45], v_cos_bit);
  butterfly_0332_neon(cospi, 12, x7[49], x7[46], &x8[49], &x8[46], v_cos_bit);
  x8[47] = x7[47];
  x8[48] = x7[48];
  x8[51] = x7[51];
  x8[52] = x7[52];
  x8[55] = x7[55];
  x8[56] = x7[56];
  x8[59] = x7[59];
  x8[60] = x7[60];
  x8[63] = x7[63];

  // stage 9
  // Entries 0..15 are final and simply carried over.
  int32x4_t x9[64];
  x9[0] = x8[0];
  x9[1] = x8[1];
  x9[2] = x8[2];
  x9[3] = x8[3];
  x9[4] = x8[4];
  x9[5] = x8[5];
  x9[6] = x8[6];
  x9[7] = x8[7];
  x9[8] = x8[8];
  x9[9] = x8[9];
  x9[10] = x8[10];
  x9[11] = x8[11];
  x9[12] = x8[12];
  x9[13] = x8[13];
  x9[14] = x8[14];
  x9[15] = x8[15];
  butterfly_0112_neon(cospi, 2, x8[31], x8[16], &x9[16], &x9[31], v_cos_bit);
  butterfly_0130_neon(cospi, 30, x8[17], x8[30], &x9[17], &x9[30], v_cos_bit);
  butterfly_0112_neon(cospi, 18, x8[29], x8[18], &x9[18], &x9[29], v_cos_bit);
  butterfly_0130_neon(cospi, 14, x8[19], x8[28], &x9[19], &x9[28], v_cos_bit);
  butterfly_0112_neon(cospi, 10, x8[27], x8[20], &x9[20], &x9[27], v_cos_bit);
  butterfly_0130_neon(cospi, 22, x8[21], x8[26], &x9[21], &x9[26], v_cos_bit);
  butterfly_0112_neon(cospi, 26, x8[25], x8[22], &x9[22], &x9[25], v_cos_bit);
  butterfly_0130_neon(cospi, 6, x8[23], x8[24], &x9[23], &x9[24], v_cos_bit);
  butterfly_dct_post(x8 + 32, x8 + 32, x9 + 32, 4);
  butterfly_dct_post(x8 + 36, x8 + 36, x9 + 36, 4);
  butterfly_dct_post(x8 + 40, x8 + 40, x9 + 40, 4);
  butterfly_dct_post(x8 + 44, x8 + 44, x9 + 44, 4);
  butterfly_dct_post(x8 + 48, x8 + 48, x9 + 48, 4);
  butterfly_dct_post(x8 + 52, x8 + 52, x9 + 52, 4);
  butterfly_dct_post(x8 + 56, x8 + 56, x9 + 56, 4);
  butterfly_dct_post(x8 + 60, x8 + 60, x9 + 60, 4);

  // stage 10
  // Entries 0..31 are final and simply carried over; 32..63 are combined in
  // mirrored pairs (32 with 63, 33 with 62, ...).
  int32x4_t x10[64];
  x10[0] = x9[0];
  x10[1] = x9[1];
  x10[2] = x9[2];
  x10[3] = x9[3];
  x10[4] = x9[4];
  x10[5] = x9[5];
  x10[6] = x9[6];
  x10[7] = x9[7];
  x10[8] = x9[8];
  x10[9] = x9[9];
  x10[10] = x9[10];
  x10[11] = x9[11];
  x10[12] = x9[12];
  x10[13] = x9[13];
  x10[14] = x9[14];
  x10[15] = x9[15];
  x10[16] = x9[16];
  x10[17] = x9[17];
  x10[18] = x9[18];
  x10[19] = x9[19];
  x10[20] = x9[20];
  x10[21] = x9[21];
  x10[22] = x9[22];
  x10[23] = x9[23];
  x10[24] = x9[24];
  x10[25] = x9[25];
  x10[26] = x9[26];
  x10[27] = x9[27];
  x10[28] = x9[28];
  x10[29] = x9[29];
  x10[30] = x9[30];
  x10[31] = x9[31];
  butterfly_0112_neon(cospi, 1, x9[63], x9[32], &x10[32], &x10[63], v_cos_bit);
  butterfly_0130_neon(cospi, 31, x9[33], x9[62], &x10[33], &x10[62], v_cos_bit);
  butterfly_0112_neon(cospi, 17, x9[61], x9[34], &x10[34], &x10[61], v_cos_bit);
  butterfly_0130_neon(cospi, 15, x9[35], x9[60], &x10[35], &x10[60], v_cos_bit);
  butterfly_0112_neon(cospi, 9, x9[59], x9[36], &x10[36], &x10[59], v_cos_bit);
  butterfly_0130_neon(cospi, 23, x9[37], x9[58], &x10[37], &x10[58], v_cos_bit);
  butterfly_0112_neon(cospi, 25, x9[57], x9[38], &x10[38], &x10[57], v_cos_bit);
  butterfly_0130_neon(cospi, 7, x9[39], x9[56], &x10[39], &x10[56], v_cos_bit);
  butterfly_0112_neon(cospi, 5, x9[55], x9[40], &x10[40], &x10[55], v_cos_bit);
  butterfly_0130_neon(cospi, 27, x9[41], x9[54], &x10[41], &x10[54], v_cos_bit);
  butterfly_0112_neon(cospi, 21, x9[53], x9[42], &x10[42], &x10[53], v_cos_bit);
  butterfly_0130_neon(cospi, 11, x9[43], x9[52], &x10[43], &x10[52], v_cos_bit);
  butterfly_0112_neon(cospi, 13, x9[51], x9[44], &x10[44], &x10[51], v_cos_bit);
  butterfly_0130_neon(cospi, 19, x9[45], x9[50], &x10[45], &x10[50], v_cos_bit);
  butterfly_0112_neon(cospi, 29, x9[49], x9[46], &x10[46], &x10[49], v_cos_bit);
  butterfly_0130_neon(cospi, 3, x9[47], x9[48], &x10[47], &x10[48], v_cos_bit);

  // stage 11
  // Pure reordering: output[i] takes x10[j], where j is i with its six index
  // bits reversed (1 -> 32, 2 -> 16, 3 -> 48, ...).
  output[0] = x10[0];
  output[1] = x10[32];
  output[2] = x10[16];
  output[3] = x10[48];
  output[4] = x10[8];
  output[5] = x10[40];
  output[6] = x10[24];
  output[7] = x10[56];
  output[8] = x10[4];
  output[9] = x10[36];
  output[10] = x10[20];
  output[11] = x10[52];
  output[12] = x10[12];
  output[13] = x10[44];
  output[14] = x10[28];
  output[15] = x10[60];
  output[16] = x10[2];
  output[17] = x10[34];
  output[18] = x10[18];
  output[19] = x10[50];
  output[20] = x10[10];
  output[21] = x10[42];
  output[22] = x10[26];
  output[23] = x10[58];
  output[24] = x10[6];
  output[25] = x10[38];
  output[26] = x10[22];
  output[27] = x10[54];
  output[28] = x10[14];
  output[29] = x10[46];
  output[30] = x10[30];
  output[31] = x10[62];
  output[32] = x10[1];
  output[33] = x10[33];
  output[34] = x10[17];
  output[35] = x10[49];
  output[36] = x10[9];
  output[37] = x10[41];
  output[38] = x10[25];
  output[39] = x10[57];
  output[40] = x10[5];
  output[41] = x10[37];
  output[42] = x10[21];
  output[43] = x10[53];
  output[44] = x10[13];
  output[45] = x10[45];
  output[46] = x10[29];
  output[47] = x10[61];
  output[48] = x10[3];
  output[49] = x10[35];
  output[50] = x10[19];
  output[51] = x10[51];
  output[52] = x10[11];
  output[53] = x10[43];
  output[54] = x10[27];
  output[55] = x10[59];
  output[56] = x10[7];
  output[57] = x10[39];
  output[58] = x10[23];
  output[59] = x10[55];
  output[60] = x10[15];
  output[61] = x10[47];
  output[62] = x10[31];
  output[63] = x10[63];
}

// 32-point identity transform: every one of the 32 input vectors is scaled
// by 4 (left shift by 2). cos_bit is accepted only so the signature matches
// the other 1-D kernels; it is not used.
static void highbd_fidentity32_x4_neon(const int32x4_t *input,
                                       int32x4_t *output, int cos_bit) {
  (void)cos_bit;
  for (int i = 0; i < 32; i++) {
    output[i] = vshlq_n_s32(input[i], 2);
  }
}

// Instantiate the column wrappers for the 32-point kernels. The macro is
// defined elsewhere in this file; judging by the table below it produces
// highbd_<name>_col_many_neon.
TRANSFORM_COL_MANY(fdct32, 32)
TRANSFORM_COL_MANY(fidentity32, 32)

// Column-transform dispatch table for 32-point transforms, indexed by
// TX_TYPE. Only DCT_DCT and IDTX have an implementation; callers in this
// file invoke the selected entry without a NULL check, so they must only be
// reached with one of those two transform types.
static const fwd_transform_1d_col_many_neon
    col_highbd_txfm32_x4_arr[TX_TYPES] = {
      highbd_fdct32_col_many_neon,       // DCT_DCT
      NULL,                              // ADST_DCT
      NULL,                              // DCT_ADST
      NULL,                              // ADST_ADST
      NULL,                              // FLIPADST_DCT
      NULL,                              // DCT_FLIPADST
      NULL,                              // FLIPADST_FLIPADST
      NULL,                              // ADST_FLIPADST
      NULL,                              // FLIPADST_ADST
      highbd_fidentity32_col_many_neon,  // IDTX
      NULL,                              // V_DCT
      NULL,                              // H_DCT
      NULL,                              // V_ADST
      NULL,                              // H_ADST
      NULL,                              // V_FLIPADST
      NULL                               // H_FLIPADST
    };

// Row wrappers for the 32-point kernels (macro defined elsewhere in this
// file).
TRANSFORM_ROW_MANY(fdct32, 32)
TRANSFORM_ROW_MANY(fidentity32, 32)

// Row-transform dispatch table for 32-point transforms, indexed by TX_TYPE.
// As with the column table, only DCT_DCT and IDTX are populated and callers
// do not check for NULL.
static const fwd_transform_1d_row_many_neon
    row_highbd_txfm32_x4_arr[TX_TYPES] = {
      highbd_fdct32_row_many_neon,       // DCT_DCT
      NULL,                              // ADST_DCT
      NULL,                              // DCT_ADST
      NULL,                              // ADST_ADST
      NULL,                              // FLIPADST_DCT
      NULL,                              // DCT_FLIPADST
      NULL,                              // FLIPADST_FLIPADST
      NULL,                              // ADST_FLIPADST
      NULL,                              // FLIPADST_ADST
      highbd_fidentity32_row_many_neon,  // IDTX
      NULL,                              // V_DCT
      NULL,                              // H_DCT
      NULL,                              // V_ADST
      NULL,                              // H_ADST
      NULL,                              // V_FLIPADST
      NULL                               // H_FLIPADST
    };

// "Rect" row wrappers for the 32-point kernels, used by the 32x16 transform
// below. NOTE(review): presumably these fold the rectangular-block scaling
// into the row pass - confirm against the TRANSFORM_ROW_RECT_MANY definition.
TRANSFORM_ROW_RECT_MANY(fdct32, 32)
TRANSFORM_ROW_RECT_MANY(fidentity32, 32)

// Rect row-transform dispatch table for 32-point transforms, indexed by
// TX_TYPE. Only DCT_DCT and IDTX are populated; callers do not check for
// NULL.
static const fwd_transform_1d_row_many_neon
    row_rect_highbd_txfm32_x4_arr[TX_TYPES] = {
      highbd_fdct32_row_rect_many_neon,       // DCT_DCT
      NULL,                                   // ADST_DCT
      NULL,                                   // DCT_ADST
      NULL,                                   // ADST_ADST
      NULL,                                   // FLIPADST_DCT
      NULL,                                   // DCT_FLIPADST
      NULL,                                   // FLIPADST_FLIPADST
      NULL,                                   // ADST_FLIPADST
      NULL,                                   // FLIPADST_ADST
      highbd_fidentity32_row_rect_many_neon,  // IDTX
      NULL,                                   // V_DCT
      NULL,                                   // H_DCT
      NULL,                                   // V_ADST
      NULL,                                   // H_ADST
      NULL,                                   // V_FLIPADST
      NULL                                    // H_FLIPADST
    };

// The 2-D entry points below all follow the same pipeline:
//   1. column transform of the int16 residual into int32x4_t vectors,
//   2. an intermediate shift (shift_right_N_round_s32_x4),
//   3. transpose,
//   4. row transform, which also stores the coefficients.
// `bd` is unused by every one of them. The cos-bit tables are indexed as
// [width index][height index] with 4 -> 0, 8 -> 1, 16 -> 2, 32 -> 3, 64 -> 4,
// as can be read off the call sites.

// 16x8 forward transform: 8-point columns, 16-point rect rows. The same
// cos bit (taken from the column table) is used for both passes.
void av1_fwd_txfm2d_16x8_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm8_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm16_xn_arr[tx_type];
  int bit = av1_fwd_cos_bit_col[2][1];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  // Column-wise transform.
  // For a left/right flip the destination starts at the last 8-vector group
  // and uses a negative group stride; presumably this stores the four column
  // groups in reverse order (the wrapper itself is not visible here).
  int32x4_t buf0[32];
  if (lr_flip) {
    col_txfm(input, buf0 + 3 * 8, stride, bit, /*lr_flip=*/1, /*howmany=*/4,
             /*hm_stride=*/-8);
  } else {
    col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/4,
             /*hm_stride=*/8);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 32);

  int32x4_t buf1[32];
  transpose_arrays_s32_16x8(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bit, /*howmany=*/2, /*hm_stride=*/16, /*stride=*/8);
}

// 8x16 forward transform: 16-point columns, 8-point rect rows. The same
// cos bit (taken from the column table) is used for both passes.
void av1_fwd_txfm2d_8x16_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm16_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm8_xn_arr[tx_type];
  int bit = av1_fwd_cos_bit_col[1][2];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Column-wise transform.
  // lr_flip: start at the second 16-vector group and step backwards.
  int32x4_t buf0[32];
  if (lr_flip) {
    col_txfm(input, buf0 + 16, stride, bit, /*lr_flip=*/1, /*howmany=*/2,
             /*hm_stride=*/-16);
  } else {
    col_txfm(input, buf0, stride, bit, /*lr_flip=*/0, /*howmany=*/2,
             /*hm_stride=*/16);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 32);

  int32x4_t buf1[32];
  transpose_arrays_s32_8x16(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bit, /*howmany=*/4, /*hm_stride=*/8, /*stride=*/16);
}

#if !CONFIG_REALTIME_ONLY
// 4x16 forward transform: one group of 16-point columns, then four groups of
// 4-point rows. Only built when CONFIG_REALTIME_ONLY is off.
void av1_fwd_txfm2d_4x16_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  int bitcol = av1_fwd_cos_bit_col[0][2];
  int bitrow = av1_fwd_cos_bit_row[0][2];
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm16_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_highbd_txfm4_xn_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Column-wise transform.
  // With a single column group there is nothing to reorder, so both branches
  // differ only in the lr_flip argument.
  int32x4_t buf0[16];
  if (lr_flip) {
    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/1, /*howmany=*/1,
             /*hm_stride=*/0);
  } else {
    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/1,
             /*hm_stride=*/0);
  }
  shift_right_1_round_s32_x4(buf0, buf0, 16);

  int32x4_t buf1[16];
  transpose_arrays_s32_4x16(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/4, /*stride=*/16);
}
#endif

// 16x4 forward transform: four groups of 4-point columns, then a single
// 16-point row pass (the row table here uses the non-"many" function type).
void av1_fwd_txfm2d_16x4_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  int bitcol = av1_fwd_cos_bit_col[2][0];
  int bitrow = av1_fwd_cos_bit_row[2][0];
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm4_xn_arr[tx_type];
  const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm16_xn_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);

  // Column-wise transform.
  // lr_flip: start at the last 4-vector group and step backwards.
  int32x4_t buf0[16];
  if (lr_flip) {
    col_txfm(input, buf0 + 3 * 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/4,
             /*hm_stride=*/-4);
  } else {
    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
             /*hm_stride=*/4);
  }

  shift_right_1_round_s32_x4(buf0, buf0, 16);
  // Transposed in place (source and destination are the same buffer).
  transpose_arrays_s32_4x16(buf0, buf0);

  // Row-wise transform.
  row_txfm(buf0, coeff, bitrow, /*stride=*/4);
}

// 16x32 forward transform: 32-point columns, 16-point rect rows. No flip
// handling: the 32-point tables only provide DCT_DCT and IDTX.
void av1_fwd_txfm2d_16x32_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm32_x4_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm16_xn_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[2][3];
  int bitrow = av1_fwd_cos_bit_row[2][3];

  // Column-wise transform.
  int32x4_t buf0[128];
  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/4,
           /*hm_stride=*/32);
  shift_right_4_round_s32_x4(buf0, buf0, 128);

  int32x4_t buf1[128];
  transpose_arrays_s32_16x32(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/16, /*stride=*/32);
}

// 32x64 forward transform. tx_type is ignored: the DCT kernels are called
// directly. Columns use the 64-point DCT, rows the 32-point DCT, followed by
// the rect scaling helper. The result is written with store_buffer_32x32.
// NOTE(review): presumably only the lower-frequency half of the 64-point
// dimension is kept - confirm against the store helper and the C reference.
void av1_fwd_txfm2d_32x64_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  int bitcol = av1_fwd_cos_bit_col[3][4];
  int bitrow = av1_fwd_cos_bit_row[3][4];

  // Column-wise transform.
  int32x4_t buf0[512];
  load_buffer_32x64(input, buf0, stride, 0);
  for (int i = 0; i < 8; i++) {
    highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 512);

  int32x4_t buf1[512];
  transpose_arrays_s32_32x64(buf0, buf1);

  // Row-wise transform.
  for (int i = 0; i < 16; i++) {
    highbd_fdct32_x4_neon(buf1 + i * 32, buf1 + i * 32, bitrow);
  }
  round_shift2_rect_array_s32_neon(buf1, buf1, 512);
  store_buffer_32x32(buf1, coeff, /*stride=*/32);
}

// 64x32 forward transform. tx_type is ignored: the DCT kernels are called
// directly. Columns use the 32-point DCT, rows the 64-point DCT, followed by
// the rect scaling helper.
void av1_fwd_txfm2d_64x32_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;
  int bitcol = av1_fwd_cos_bit_col[4][3];
  int bitrow = av1_fwd_cos_bit_row[4][3];

  // Column-wise transform.
  int32x4_t buf0[512];
  load_buffer_64x32(input, buf0, stride, 0);
  for (int i = 0; i < 16; i++) {
    highbd_fdct32_x4_neon(buf0 + i * 32, buf0 + i * 32, bitcol);
  }
  shift_right_4_round_s32_x4(buf0, buf0, 512);

  int32x4_t buf1[512];
  transpose_arrays_s32_64x32(buf0, buf1);

  // Row-wise transform.
  for (int i = 0; i < 8; i++) {
    highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
  }
  round_shift2_rect_array_s32_neon(buf1, buf1, 512);
  store_buffer_64x32(buf1, coeff, /*stride=*/32);
}

// 32x16 forward transform: 16-point columns, 32-point rect rows. No flip
// handling: the 32-point row table only provides DCT_DCT and IDTX.
void av1_fwd_txfm2d_32x16_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm16_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm32_x4_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[3][2];
  int bitrow = av1_fwd_cos_bit_row[3][2];

  // Column-wise transform.
  int32x4_t buf0[128];
  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
           /*hm_stride=*/16);
  shift_right_4_round_s32_x4(buf0, buf0, 128);

  int32x4_t buf1[128];
  transpose_arrays_s32_32x16(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/4, /*hm_stride=*/32, /*stride=*/16);
}

#if !CONFIG_REALTIME_ONLY
// 8x32 forward transform: 32-point columns, 8-point rows. Only built when
// CONFIG_REALTIME_ONLY is off.
void av1_fwd_txfm2d_8x32_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm32_x4_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_highbd_txfm8_xn_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[1][3];
  int bitrow = av1_fwd_cos_bit_row[1][3];

  // Column-wise transform.
  int32x4_t buf0[64];
  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
           /*hm_stride=*/32);
  shift_right_2_round_s32_x4(buf0, buf0, 64);

  int32x4_t buf1[64];
  transpose_arrays_s32_8x32(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/8, /*hm_stride=*/8, /*stride=*/32);
}

// 32x8 forward transform: 8-point columns, 32-point rows. Only built when
// CONFIG_REALTIME_ONLY is off.
void av1_fwd_txfm2d_32x8_neon(const int16_t *input, int32_t *coeff, int stride,
                              TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm8_xn_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_highbd_txfm32_x4_arr[tx_type];
  int bitcol = av1_fwd_cos_bit_col[3][1];
  int bitrow = av1_fwd_cos_bit_row[3][1];

  // Column-wise transform.
  int32x4_t buf0[64];
  col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/8,
           /*hm_stride=*/8);
  shift_right_2_round_s32_x4(buf0, buf0, 64);

  int32x4_t buf1[64];
  transpose_arrays_s32_32x8(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4 * 8,
           /*stride=*/8);
}
#endif

// 4x8 forward transform: a single 8-point column pass (non-"many" function
// type, lr_flip forwarded directly), then two groups of 4-point rect rows.
void av1_fwd_txfm2d_4x8_neon(const int16_t *input, int32_t *coeff, int stride,
                             TX_TYPE tx_type, int bd) {
  (void)bd;
  int bitcol = av1_fwd_cos_bit_col[0][1];
  int bitrow = av1_fwd_cos_bit_row[0][1];
  const fwd_transform_1d_col_neon col_txfm = col_highbd_txfm8_x4_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_rect_highbd_txfm4_xn_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);

  // Column-wise transform.
  int32x4_t buf0[8];
  col_txfm(input, buf0, stride, bitcol, lr_flip);
  shift_right_1_round_s32_x4(buf0, buf0, 8);

  int32x4_t buf1[8];
  transpose_arrays_s32_4x8(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*howmany=*/2, /*hm_stride=*/4, /*stride=*/8);
}

// 8x4 forward transform: two groups of 4-point columns, then a single
// 8-point row pass (non-"many" function type).
void av1_fwd_txfm2d_8x4_neon(const int16_t *input, int32_t *coeff, int stride,
                             TX_TYPE tx_type, int bd) {
  (void)bd;
  const int bitcol = av1_fwd_cos_bit_col[1][0];
  const int bitrow = av1_fwd_cos_bit_row[1][0];
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm4_xn_arr[tx_type];
  const fwd_transform_1d_row_neon row_txfm = row_highbd_txfm8_x4_arr[tx_type];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);

  // Column-wise transform.
  // lr_flip: start at the second 4-vector group and step backwards.
  int32x4_t buf0[8];
  if (lr_flip) {
    col_txfm(input, buf0 + 4, stride, bitcol, /*lr_flip=*/1, /*howmany=*/2,
             /*hm_stride=*/-4);
  } else {
    col_txfm(input, buf0, stride, bitcol, /*lr_flip=*/0, /*howmany=*/2,
             /*hm_stride=*/4);
  }

  shift_right_1_round_s32_x4(buf0, buf0, 8);

  int32x4_t buf1[8];
  transpose_arrays_s32_8x4(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, coeff, bitrow, /*stride=*/4);
}

#if !CONFIG_REALTIME_ONLY
// 16x64 forward transform. tx_type is consulted only for the flip
// configuration; the DCT kernels are called directly. Columns use the
// 64-point DCT. The row pass runs the 16-point DCT with a count of 8 and the
// result is written with store_buffer_16x32.
// NOTE(review): presumably only the lower-frequency half of the 64-point
// dimension is transformed and stored - confirm against the helpers.
// Only built when CONFIG_REALTIME_ONLY is off.
void av1_fwd_txfm2d_16x64_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  const int bitcol = av1_fwd_cos_bit_col[2][4];
  const int bitrow = av1_fwd_cos_bit_row[2][4];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 64);

  // Column-wise transform.
  int32x4_t buf0[256];
  load_buffer_16x64(input, buf0, stride, lr_flip);
  for (int i = 0; i < 4; i++) {
    highbd_fdct64_x4_neon(buf0 + i * 64, buf0 + i * 64, bitcol);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 256);

  int32x4_t buf1[256];
  transpose_arrays_s32_16x64(buf0, buf1);

  // Row-wise transform.
  highbd_fdct16_xn_neon(buf1, buf1, bitrow, 8);
  store_buffer_16x32(buf1, coeff, /*stride=*/32);
}

// 64x16 forward transform. tx_type is consulted only for the flip
// configuration; the DCT kernels are called directly. Columns use the
// 16-point DCT, rows the 64-point DCT. After storing, the 512 coefficients
// starting at coeff[512] are cleared to zero.
// Only built when CONFIG_REALTIME_ONLY is off.
void av1_fwd_txfm2d_64x16_neon(const int16_t *input, int32_t *coeff, int stride,
                               TX_TYPE tx_type, int bd) {
  (void)bd;
  const int bitcol = av1_fwd_cos_bit_col[4][2];
  const int bitrow = av1_fwd_cos_bit_row[4][2];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
  ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);

  // Column-wise transform.
  int32x4_t buf0[256];
  load_buffer_64x16(input, buf0, stride, lr_flip);
  highbd_fdct16_xn_neon(buf0, buf0, bitcol, 16);
  shift_right_4_round_s32_x4(buf0, buf0, 256);

  int32x4_t buf1[256];
  transpose_arrays_s32_64x16(buf0, buf1);

  // Row-wise transform.
  for (int i = 0; i < 4; i++) {
    highbd_fdct64_x4_neon(buf1 + i * 64, buf1 + i * 64, bitrow);
  }
  store_buffer_64x16(buf1, coeff, /*stride=*/16);
  // Zero the second half of the 1024-entry coefficient buffer.
  memset(coeff + 16 * 32, 0, 16 * 32 * sizeof(*coeff));
}
#endif

// 32x32 forward transform: 32-point columns and rows via the dispatch tables
// (DCT_DCT or IDTX only). The cos bit is hard-coded to 12 for both passes
// instead of being read from the av1_fwd_cos_bit_* tables.
// NOTE(review): presumably 12 equals the [3][3] table entries - confirm.
void av1_fwd_txfm2d_32x32_neon(const int16_t *input, int32_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  const fwd_transform_1d_col_many_neon col_txfm =
      col_highbd_txfm32_x4_arr[tx_type];
  const fwd_transform_1d_row_many_neon row_txfm =
      row_highbd_txfm32_x4_arr[tx_type];

  // Column-wise transform.
  int32x4_t buf0[256];
  col_txfm(input, buf0, stride, /*cos_bit=*/12, /*lr_flip=*/0, /*howmany=*/8,
           /*hm_stride=*/32);
  shift_right_4_round_s32_x4(buf0, buf0, 256);

  int32x4_t buf1[256];
  transpose_arrays_s32_32x32(buf0, buf1);

  // Row-wise transform.
  row_txfm(buf1, output, /*cos_bit=*/12, /*howmany=*/8, /*hm_stride=*/32,
           /*stride=*/32);
}

// 64x64 forward transform. tx_type is ignored: the 64-point DCT is called
// directly, with hard-coded cos bits (13 for columns, 10 for rows).
// NOTE(review): presumably these equal the [4][4] entries of the
// av1_fwd_cos_bit_* tables - confirm.
//
// All 16 column groups are transformed, but transpose_arrays_s32_64x64()
// only produces the 64x32 sub-matrix that the row pass consumes, so the row
// pass covers just 8 groups (512 vectors) and store_buffer_64x32 writes the
// result.
void av1_fwd_txfm2d_64x64_neon(const int16_t *input, int32_t *output,
                               int stride, TX_TYPE tx_type, int bd) {
  (void)bd;
  (void)tx_type;

  // Column-wise transform.
  int32x4_t buf0[1024];
  load_buffer_64x64(input, buf0, stride, 0);
  for (int col = 0; col < 16; col++) {
    highbd_fdct64_x4_neon(buf0 + col * 64, buf0 + col * 64, 13);
  }
  shift_right_2_round_s32_x4(buf0, buf0, 1024);

  int32x4_t buf1[1024];
  transpose_arrays_s32_64x64(buf0, buf1);

  // Row-wise transform.
  for (int col = 0; col < 8; col++) {
    highbd_fdct64_x4_neon(buf1 + col * 64, buf1 + col * 64, 10);
  }
  shift_right_2_round_s32_x4(buf1, buf1, 512);
  store_buffer_64x32(buf1, output, /*stride=*/32);
}