highbd_intrapred_neon.c (123001B)
/*
 * Copyright (c) 2022, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// DC

// The highbd_dc_store_WxH helpers broadcast a single DC value across a WxH
// block, one row per iteration. `stride` is in elements (uint16_t), not bytes.

static inline void highbd_dc_store_4xh(uint16_t *dst, ptrdiff_t stride, int h,
                                       uint16x4_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1_u16(dst + i * stride, dc);
  }
}

static inline void highbd_dc_store_8xh(uint16_t *dst, ptrdiff_t stride, int h,
                                       uint16x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u16(dst + i * stride, dc);
  }
}

static inline void highbd_dc_store_16xh(uint16_t *dst, ptrdiff_t stride, int h,
                                        uint16x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u16(dst + i * stride, dc);
    vst1q_u16(dst + i * stride + 8, dc);
  }
}

static inline void highbd_dc_store_32xh(uint16_t *dst, ptrdiff_t stride, int h,
                                        uint16x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u16(dst + i * stride, dc);
    vst1q_u16(dst + i * stride + 8, dc);
    vst1q_u16(dst + i * stride + 16, dc);
    vst1q_u16(dst + i * stride + 24, dc);
  }
}

static inline void highbd_dc_store_64xh(uint16_t *dst, ptrdiff_t stride, int h,
                                        uint16x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u16(dst + i * stride, dc);
    vst1q_u16(dst + i * stride + 8, dc);
    vst1q_u16(dst + i * stride + 16, dc);
    vst1q_u16(dst + i * stride + 24, dc);
    vst1q_u16(dst + i * stride + 32, dc);
    vst1q_u16(dst + i * stride + 40, dc);
    vst1q_u16(dst + i * stride + 48, dc);
    vst1q_u16(dst + i * stride + 56, dc);
  }
}

// Reduce the 8 lanes of `a` to a single 32-bit sum, replicated to all four
// lanes of the result.
static inline uint32x4_t horizontal_add_and_broadcast_long_u16x8(uint16x8_t a) {
  // Need to assume input is up to 16 bits wide from dc 64x64 partial sum, so
  // promote first.
  const uint32x4_t b = vpaddlq_u16(a);
#if AOM_ARCH_AARCH64
  const uint32x4_t c = vpaddq_u32(b, b);
  return vpaddq_u32(c, c);
#else
  const uint32x2_t c = vadd_u32(vget_low_u32(b), vget_high_u32(b));
  const uint32x2_t d = vpadd_u32(c, c);
  return vcombine_u32(d, d);
#endif
}

// The highbd_dc_load_partial_sum_N helpers load N edge pixels and reduce them
// to a single uint16x8_t of lane-wise partial sums; the comments track the
// worst-case bit width assuming up-to-12-bit input pixels.

static inline uint16x8_t highbd_dc_load_partial_sum_4(const uint16_t *left) {
  // Nothing to do since sum is already one vector, but saves needing to
  // special case w=4 or h=4 cases. The combine will be zero cost for a sane
  // compiler since vld1 already sets the top half of a vector to zero as part
  // of the operation.
  return vcombine_u16(vld1_u16(left), vdup_n_u16(0));
}

static inline uint16x8_t highbd_dc_load_partial_sum_8(const uint16_t *left) {
  // Nothing to do since sum is already one vector, but saves needing to
  // special case w=8 or h=8 cases.
  return vld1q_u16(left);
}

static inline uint16x8_t highbd_dc_load_partial_sum_16(const uint16_t *left) {
  const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
  const uint16x8_t a1 = vld1q_u16(left + 8);
  return vaddq_u16(a0, a1);  // up to 13 bits
}

static inline uint16x8_t highbd_dc_load_partial_sum_32(const uint16_t *left) {
  const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
  const uint16x8_t a1 = vld1q_u16(left + 8);
  const uint16x8_t a2 = vld1q_u16(left + 16);
  const uint16x8_t a3 = vld1q_u16(left + 24);
  const uint16x8_t b0 = vaddq_u16(a0, a1);  // up to 13 bits
  const uint16x8_t b1 = vaddq_u16(a2, a3);
  return vaddq_u16(b0, b1);  // up to 14 bits
}

static inline uint16x8_t highbd_dc_load_partial_sum_64(const uint16_t *left) {
  const uint16x8_t a0 = vld1q_u16(left + 0);  // up to 12 bits
  const uint16x8_t a1 = vld1q_u16(left + 8);
  const uint16x8_t a2 = vld1q_u16(left + 16);
  const uint16x8_t a3 = vld1q_u16(left + 24);
  const uint16x8_t a4 = vld1q_u16(left + 32);
  const uint16x8_t a5 = vld1q_u16(left + 40);
  const uint16x8_t a6 = vld1q_u16(left + 48);
  const uint16x8_t a7 = vld1q_u16(left + 56);
  const uint16x8_t b0 = vaddq_u16(a0, a1);  // up to 13 bits
  const uint16x8_t b1 = vaddq_u16(a2, a3);
  const uint16x8_t b2 = vaddq_u16(a4, a5);
  const uint16x8_t b3 = vaddq_u16(a6, a7);
  const uint16x8_t c0 = vaddq_u16(b0, b1);  // up to 14 bits
  const uint16x8_t c1 = vaddq_u16(b2, b3);
  return vaddq_u16(c0, c1);  // up to 15 bits
}

// Square-block DC predictor: dc = round((sum(above) + sum(left)) / (w + h)),
// where w == h so the division is a rounding right shift by `shift`.
#define HIGHBD_DC_PREDICTOR(w, h, shift)                               \
  void aom_highbd_dc_predictor_##w##x##h##_neon(                       \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,          \
      const uint16_t *left, int bd) {                                  \
    (void)bd;                                                          \
    const uint16x8_t a = highbd_dc_load_partial_sum_##w(above);        \
    const uint16x8_t l = highbd_dc_load_partial_sum_##h(left);         \
    const uint32x4_t sum =                                             \
        horizontal_add_and_broadcast_long_u16x8(vaddq_u16(a, l));      \
    const uint16x4_t dc0 = vrshrn_n_u32(sum, shift);                   \
    highbd_dc_store_##w##xh(dst, stride, (h), vdupq_lane_u16(dc0, 0)); \
  }

void aom_highbd_dc_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  // In the rectangular cases we simply extend the shorter vector to uint16x8
  // in order to accumulate, however in the 4x4 case there is no shorter vector
  // to extend so it is beneficial to do the whole calculation in uint16x4
  // instead.
  (void)bd;
  const uint16x4_t a = vld1_u16(above);  // up to 12 bits
  const uint16x4_t l = vld1_u16(left);
  uint16x4_t sum = vpadd_u16(a, l);  // up to 13 bits
  sum = vpadd_u16(sum, sum);         // up to 14 bits
  sum = vpadd_u16(sum, sum);
  const uint16x4_t dc = vrshr_n_u16(sum, 3);
  highbd_dc_store_4xh(dst, stride, 4, dc);
}

HIGHBD_DC_PREDICTOR(8, 8, 4)
HIGHBD_DC_PREDICTOR(16, 16, 5)
HIGHBD_DC_PREDICTOR(32, 32, 6)
HIGHBD_DC_PREDICTOR(64, 64, 7)

#undef HIGHBD_DC_PREDICTOR

// Fixed-point division for the rectangular DC predictors, where (w + h) is
// not a power of two: divide by the power-of-two factor with a shift, then by
// the remaining odd factor with a multiply-and-shift.
static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier, int shift2) {
  const int interm = num >> shift1;
  return interm * multiplier >> shift2;
}

// Reciprocal multipliers for dividing by 3 and by 5 (with HIGHBD_DC_SHIFT2).
#define HIGHBD_DC_MULTIPLIER_1X2 0xAAAB
#define HIGHBD_DC_MULTIPLIER_1X4 0x6667
#define HIGHBD_DC_SHIFT2 17

// Computes round(sum / (bw + bh)): (bw + bh) / 2 is added for rounding before
// the fixed-point division.
static inline int highbd_dc_predictor_rect(int bw, int bh, int sum, int shift1,
                                           uint32_t multiplier) {
  return divide_using_multiply_shift(sum + ((bw + bh) >> 1), shift1, multiplier,
                                     HIGHBD_DC_SHIFT2);
}

#undef HIGHBD_DC_SHIFT2

// Rectangular DC predictor; `q` selects vdup_n_u16 (w == 4) vs vdupq_n_u16.
#define HIGHBD_DC_PREDICTOR_RECT(w, h, q, shift, mult)                  \
  void aom_highbd_dc_predictor_##w##x##h##_neon(                        \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,           \
      const uint16_t *left, int bd) {                                   \
    (void)bd;                                                           \
    uint16x8_t sum_above = highbd_dc_load_partial_sum_##w(above);       \
    uint16x8_t sum_left = highbd_dc_load_partial_sum_##h(left);         \
    uint16x8_t sum_vec = vaddq_u16(sum_left, sum_above);                \
    int sum = horizontal_add_u16x8(sum_vec);                            \
    int dc0 = highbd_dc_predictor_rect((w), (h), sum, (shift), (mult)); \
    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u16(dc0));    \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(4, 16, , 2, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 32, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(16, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 64, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(32, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(64, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X4)
HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
#else
HIGHBD_DC_PREDICTOR_RECT(4, 8, , 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 4, q, 2, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(8, 16, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 8, q, 3, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(16, 32, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(32, 16, q, 4, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(32, 64, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
HIGHBD_DC_PREDICTOR_RECT(64, 32, q, 5, HIGHBD_DC_MULTIPLIER_1X2)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_DC_PREDICTOR_RECT
#undef HIGHBD_DC_MULTIPLIER_1X2
#undef HIGHBD_DC_MULTIPLIER_1X4

// -----------------------------------------------------------------------------
// DC_128

// Fills the block with the mid-range value for the given bit depth:
// 0x80 << (bd - 8), i.e. 128 at 8-bit, 512 at 10-bit, 2048 at 12-bit.
#define HIGHBD_DC_PREDICTOR_128(w, h, q)                      \
  void aom_highbd_dc_128_predictor_##w##x##h##_neon(          \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                         \
    (void)above;                                              \
    (void)bd;                                                 \
    (void)left;                                               \
    highbd_dc_store_##w##xh(dst, stride, (h),                 \
                            vdup##q##_n_u16(0x80 << (bd - 8))); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_DC_PREDICTOR_128(4, 4, )
HIGHBD_DC_PREDICTOR_128(4, 8, )
HIGHBD_DC_PREDICTOR_128(4, 16, )
HIGHBD_DC_PREDICTOR_128(8, 4, q)
HIGHBD_DC_PREDICTOR_128(8, 8, q)
HIGHBD_DC_PREDICTOR_128(8, 16, q)
HIGHBD_DC_PREDICTOR_128(8, 32, q)
HIGHBD_DC_PREDICTOR_128(16, 4, q)
HIGHBD_DC_PREDICTOR_128(16, 8, q)
HIGHBD_DC_PREDICTOR_128(16, 16, q)
HIGHBD_DC_PREDICTOR_128(16, 32, q)
HIGHBD_DC_PREDICTOR_128(16, 64, q)
HIGHBD_DC_PREDICTOR_128(32, 8, q)
HIGHBD_DC_PREDICTOR_128(32, 16, q)
HIGHBD_DC_PREDICTOR_128(32, 32, q)
HIGHBD_DC_PREDICTOR_128(32, 64, q)
HIGHBD_DC_PREDICTOR_128(64, 16, q)
HIGHBD_DC_PREDICTOR_128(64, 32, q)
HIGHBD_DC_PREDICTOR_128(64, 64, q)
#else
HIGHBD_DC_PREDICTOR_128(4, 4, )
HIGHBD_DC_PREDICTOR_128(4, 8, )
HIGHBD_DC_PREDICTOR_128(8, 4, q)
HIGHBD_DC_PREDICTOR_128(8, 8, q)
HIGHBD_DC_PREDICTOR_128(8, 16, q)
HIGHBD_DC_PREDICTOR_128(16, 8, q)
HIGHBD_DC_PREDICTOR_128(16, 16, q)
HIGHBD_DC_PREDICTOR_128(16, 32, q)
HIGHBD_DC_PREDICTOR_128(32, 16, q)
HIGHBD_DC_PREDICTOR_128(32, 32, q)
HIGHBD_DC_PREDICTOR_128(32, 64, q)
HIGHBD_DC_PREDICTOR_128(64, 32, q)
HIGHBD_DC_PREDICTOR_128(64, 64, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_DC_PREDICTOR_128

// -----------------------------------------------------------------------------
// DC_LEFT

// The highbd_dc_load_sum_N helpers fully reduce N edge pixels to a single
// 32-bit sum in lane 0 (broadcast for N >= 8), ready for vrshrn_n_u32.

static inline uint32x4_t highbd_dc_load_sum_4(const uint16_t *left) {
  const uint16x4_t a = vld1_u16(left);   // up to 12 bits
  const uint16x4_t b = vpadd_u16(a, a);  // up to 13 bits
  return vcombine_u32(vpaddl_u16(b), vdup_n_u32(0));
}

static inline uint32x4_t highbd_dc_load_sum_8(const uint16_t *left) {
  return horizontal_add_and_broadcast_long_u16x8(vld1q_u16(left));
}

static inline uint32x4_t highbd_dc_load_sum_16(const uint16_t *left) {
  return horizontal_add_and_broadcast_long_u16x8(
      highbd_dc_load_partial_sum_16(left));
}

static inline uint32x4_t highbd_dc_load_sum_32(const uint16_t *left) {
  return horizontal_add_and_broadcast_long_u16x8(
      highbd_dc_load_partial_sum_32(left));
}

static inline uint32x4_t highbd_dc_load_sum_64(const uint16_t *left) {
  return horizontal_add_and_broadcast_long_u16x8(
      highbd_dc_load_partial_sum_64(left));
}

// DC using only the left edge: dc = round(sum(left) / h), h == 1 << shift.
#define DC_PREDICTOR_LEFT(w, h, shift, q)                                  \
  void aom_highbd_dc_left_predictor_##w##x##h##_neon(                      \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
      const uint16_t *left, int bd) {                                      \
    (void)above;                                                           \
    (void)bd;                                                              \
    const uint32x4_t sum = highbd_dc_load_sum_##h(left);                   \
    const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift));                     \
    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_LEFT(4, 4, 2, )
DC_PREDICTOR_LEFT(4, 8, 3, )
DC_PREDICTOR_LEFT(4, 16, 4, )
DC_PREDICTOR_LEFT(8, 4, 2, q)
DC_PREDICTOR_LEFT(8, 8, 3, q)
DC_PREDICTOR_LEFT(8, 16, 4, q)
DC_PREDICTOR_LEFT(8, 32, 5, q)
DC_PREDICTOR_LEFT(16, 4, 2, q)
DC_PREDICTOR_LEFT(16, 8, 3, q)
DC_PREDICTOR_LEFT(16, 16, 4, q)
DC_PREDICTOR_LEFT(16, 32, 5, q)
DC_PREDICTOR_LEFT(16, 64, 6, q)
DC_PREDICTOR_LEFT(32, 8, 3, q)
DC_PREDICTOR_LEFT(32, 16, 4, q)
DC_PREDICTOR_LEFT(32, 32, 5, q)
DC_PREDICTOR_LEFT(32, 64, 6, q)
DC_PREDICTOR_LEFT(64, 16, 4, q)
DC_PREDICTOR_LEFT(64, 32, 5, q)
DC_PREDICTOR_LEFT(64, 64, 6, q)
#else
DC_PREDICTOR_LEFT(4, 4, 2, )
DC_PREDICTOR_LEFT(4, 8, 3, )
DC_PREDICTOR_LEFT(8, 4, 2, q)
DC_PREDICTOR_LEFT(8, 8, 3, q)
DC_PREDICTOR_LEFT(8, 16, 4, q)
DC_PREDICTOR_LEFT(16, 8, 3, q)
DC_PREDICTOR_LEFT(16, 16, 4, q)
DC_PREDICTOR_LEFT(16, 32, 5, q)
DC_PREDICTOR_LEFT(32, 16, 4, q)
DC_PREDICTOR_LEFT(32, 32, 5, q)
DC_PREDICTOR_LEFT(32, 64, 6, q)
DC_PREDICTOR_LEFT(64, 32, 5, q)
DC_PREDICTOR_LEFT(64, 64, 6, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef DC_PREDICTOR_LEFT

// -----------------------------------------------------------------------------
// DC_TOP

// DC using only the above edge: dc = round(sum(above) / w), w == 1 << shift.
#define DC_PREDICTOR_TOP(w, h, shift, q)                                   \
  void aom_highbd_dc_top_predictor_##w##x##h##_neon(                       \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,              \
      const uint16_t *left, int bd) {                                      \
    (void)bd;                                                              \
    (void)left;                                                            \
    const uint32x4_t sum = highbd_dc_load_sum_##w(above);                  \
    const uint16x4_t dc0 = vrshrn_n_u32(sum, (shift));                     \
    highbd_dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u16(dc0, 0)); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_TOP(4, 4, 2, )
DC_PREDICTOR_TOP(4, 8, 2, )
DC_PREDICTOR_TOP(4, 16, 2, )
DC_PREDICTOR_TOP(8, 4, 3, q)
DC_PREDICTOR_TOP(8, 8, 3, q)
DC_PREDICTOR_TOP(8, 16, 3, q)
DC_PREDICTOR_TOP(8, 32, 3, q)
DC_PREDICTOR_TOP(16, 4, 4, q)
DC_PREDICTOR_TOP(16, 8, 4, q)
DC_PREDICTOR_TOP(16, 16, 4, q)
DC_PREDICTOR_TOP(16, 32, 4, q)
DC_PREDICTOR_TOP(16, 64, 4, q)
DC_PREDICTOR_TOP(32, 8, 5, q)
DC_PREDICTOR_TOP(32, 16, 5, q)
DC_PREDICTOR_TOP(32, 32, 5, q)
DC_PREDICTOR_TOP(32, 64, 5, q)
DC_PREDICTOR_TOP(64, 16, 6, q)
DC_PREDICTOR_TOP(64, 32, 6, q)
DC_PREDICTOR_TOP(64, 64, 6, q)
#else
DC_PREDICTOR_TOP(4, 4, 2, )
DC_PREDICTOR_TOP(4, 8, 2, )
DC_PREDICTOR_TOP(8, 4, 3, q)
DC_PREDICTOR_TOP(8, 8, 3, q)
DC_PREDICTOR_TOP(8, 16, 3, q)
DC_PREDICTOR_TOP(16, 8, 4, q)
DC_PREDICTOR_TOP(16, 16, 4, q)
DC_PREDICTOR_TOP(16, 32, 4, q)
DC_PREDICTOR_TOP(32, 16, 5, q)
DC_PREDICTOR_TOP(32, 32, 5, q)
DC_PREDICTOR_TOP(32, 64, 5, q)
DC_PREDICTOR_TOP(64, 32, 6, q)
DC_PREDICTOR_TOP(64, 64, 6, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef DC_PREDICTOR_TOP

// -----------------------------------------------------------------------------
// V_PRED

// Vertical prediction: every row is a copy of the `above` edge.
#define HIGHBD_V_NXM(W, H)                                    \
  void aom_highbd_v_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                         \
    (void)left;                                               \
    (void)bd;                                                 \
    vertical##W##xh_neon(dst, stride, above, H);              \
  }

static inline uint16x8x2_t load_uint16x8x2(uint16_t const *ptr) {
  uint16x8x2_t x;
  // Clang/gcc uses ldp here.
  x.val[0] = vld1q_u16(ptr);
  x.val[1] = vld1q_u16(ptr + 8);
  return x;
}

static inline void store_uint16x8x2(uint16_t *ptr, uint16x8x2_t x) {
  vst1q_u16(ptr, x.val[0]);
  vst1q_u16(ptr + 8, x.val[1]);
}

// The verticalWxh_neon helpers store two rows per iteration, so `height` is
// assumed even (all AV1 block heights are).

static inline void vertical4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *const above, int height) {
  const uint16x4_t row = vld1_u16(above);
  int y = height;
  do {
    vst1_u16(dst, row);
    vst1_u16(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static inline void vertical8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                    const uint16_t *const above, int height) {
  const uint16x8_t row = vld1q_u16(above);
  int y = height;
  do {
    vst1q_u16(dst, row);
    vst1q_u16(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static inline void vertical16xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x2_t row = load_uint16x8x2(above);
  int y = height;
  do {
    store_uint16x8x2(dst, row);
    store_uint16x8x2(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static inline uint16x8x4_t load_uint16x8x4(uint16_t const *ptr) {
  uint16x8x4_t x;
  // Clang/gcc uses ldp here.
  x.val[0] = vld1q_u16(ptr);
  x.val[1] = vld1q_u16(ptr + 8);
  x.val[2] = vld1q_u16(ptr + 16);
  x.val[3] = vld1q_u16(ptr + 24);
  return x;
}

static inline void store_uint16x8x4(uint16_t *ptr, uint16x8x4_t x) {
  vst1q_u16(ptr, x.val[0]);
  vst1q_u16(ptr + 8, x.val[1]);
  vst1q_u16(ptr + 16, x.val[2]);
  vst1q_u16(ptr + 24, x.val[3]);
}

static inline void vertical32xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  const uint16x8x4_t row = load_uint16x8x4(above);
  int y = height;
  do {
    store_uint16x8x4(dst, row);
    store_uint16x8x4(dst + stride, row);
    dst += stride << 1;
    y -= 2;
  } while (y != 0);
}

static inline void vertical64xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const above, int height) {
  // Track the two 32-element halves of each row with separate pointers.
  uint16_t *dst32 = dst + 32;
  const uint16x8x4_t row = load_uint16x8x4(above);
  const uint16x8x4_t row32 = load_uint16x8x4(above + 32);
  int y = height;
  do {
    store_uint16x8x4(dst, row);
    store_uint16x8x4(dst32, row32);
    store_uint16x8x4(dst + stride, row);
    store_uint16x8x4(dst32 + stride, row32);
    dst += stride << 1;
    dst32 += stride << 1;
    y -= 2;
  } while (y != 0);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_V_NXM(4, 4)
HIGHBD_V_NXM(4, 8)
HIGHBD_V_NXM(4, 16)

HIGHBD_V_NXM(8, 4)
HIGHBD_V_NXM(8, 8)
HIGHBD_V_NXM(8, 16)
HIGHBD_V_NXM(8, 32)

HIGHBD_V_NXM(16, 4)
HIGHBD_V_NXM(16, 8)
HIGHBD_V_NXM(16, 16)
HIGHBD_V_NXM(16, 32)
HIGHBD_V_NXM(16, 64)

HIGHBD_V_NXM(32, 8)
HIGHBD_V_NXM(32, 16)
HIGHBD_V_NXM(32, 32)
HIGHBD_V_NXM(32, 64)

HIGHBD_V_NXM(64, 16)
HIGHBD_V_NXM(64, 32)
HIGHBD_V_NXM(64, 64)
#else
HIGHBD_V_NXM(4, 4)
HIGHBD_V_NXM(4, 8)

HIGHBD_V_NXM(8, 4)
HIGHBD_V_NXM(8, 8)
HIGHBD_V_NXM(8, 16)

HIGHBD_V_NXM(16, 8)
HIGHBD_V_NXM(16, 16)
HIGHBD_V_NXM(16, 32)

HIGHBD_V_NXM(32, 16)
HIGHBD_V_NXM(32, 32)
HIGHBD_V_NXM(32, 64)

HIGHBD_V_NXM(64, 32)
HIGHBD_V_NXM(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// -----------------------------------------------------------------------------
// H_PRED

// Horizontal prediction: row y is filled with left[y]. The highbd_h_store_WxN
// helpers broadcast one left pixel per row across a width-W row group.

static inline void highbd_h_store_4x4(uint16_t *dst, ptrdiff_t stride,
                                      uint16x4_t left) {
  vst1_u16(dst + 0 * stride, vdup_lane_u16(left, 0));
  vst1_u16(dst + 1 * stride, vdup_lane_u16(left, 1));
  vst1_u16(dst + 2 * stride, vdup_lane_u16(left, 2));
  vst1_u16(dst + 3 * stride, vdup_lane_u16(left, 3));
}

static inline void highbd_h_store_8x4(uint16_t *dst, ptrdiff_t stride,
                                      uint16x4_t left) {
  vst1q_u16(dst + 0 * stride, vdupq_lane_u16(left, 0));
  vst1q_u16(dst + 1 * stride, vdupq_lane_u16(left, 1));
  vst1q_u16(dst + 2 * stride, vdupq_lane_u16(left, 2));
  vst1q_u16(dst + 3 * stride, vdupq_lane_u16(left, 3));
}

static inline void highbd_h_store_16x1(uint16_t *dst, uint16x8_t left) {
  vst1q_u16(dst + 0, left);
  vst1q_u16(dst + 8, left);
}

static inline void highbd_h_store_16x4(uint16_t *dst, ptrdiff_t stride,
                                       uint16x4_t left) {
  highbd_h_store_16x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
  highbd_h_store_16x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
  highbd_h_store_16x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
  highbd_h_store_16x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
}

static inline void highbd_h_store_32x1(uint16_t *dst, uint16x8_t left) {
  vst1q_u16(dst + 0, left);
  vst1q_u16(dst + 8, left);
  vst1q_u16(dst + 16, left);
  vst1q_u16(dst + 24, left);
}

static inline void highbd_h_store_32x4(uint16_t *dst, ptrdiff_t stride,
                                       uint16x4_t left) {
  highbd_h_store_32x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
  highbd_h_store_32x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
  highbd_h_store_32x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
  highbd_h_store_32x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
}

static inline void highbd_h_store_64x1(uint16_t *dst, uint16x8_t left) {
  vst1q_u16(dst + 0, left);
  vst1q_u16(dst + 8, left);
  vst1q_u16(dst + 16, left);
  vst1q_u16(dst + 24, left);
  vst1q_u16(dst + 32, left);
  vst1q_u16(dst + 40, left);
  vst1q_u16(dst + 48, left);
  vst1q_u16(dst + 56, left);
}

static inline void highbd_h_store_64x4(uint16_t *dst, ptrdiff_t stride,
                                       uint16x4_t left) {
  highbd_h_store_64x1(dst + 0 * stride, vdupq_lane_u16(left, 0));
  highbd_h_store_64x1(dst + 1 * stride, vdupq_lane_u16(left, 1));
  highbd_h_store_64x1(dst + 2 * stride, vdupq_lane_u16(left, 2));
  highbd_h_store_64x1(dst + 3 * stride, vdupq_lane_u16(left, 3));
}

void aom_highbd_h_predictor_4x4_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  highbd_h_store_4x4(dst, stride, vld1_u16(left));
}

void aom_highbd_h_predictor_4x8_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  uint16x8_t l = vld1q_u16(left);
  highbd_h_store_4x4(dst + 0 * stride, stride, vget_low_u16(l));
  highbd_h_store_4x4(dst + 4 * stride, stride, vget_high_u16(l));
}

void aom_highbd_h_predictor_8x4_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  highbd_h_store_8x4(dst, stride, vld1_u16(left));
}

void aom_highbd_h_predictor_8x8_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *above,
                                     const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  uint16x8_t l = vld1q_u16(left);
  highbd_h_store_8x4(dst + 0 * stride, stride, vget_low_u16(l));
  highbd_h_store_8x4(dst + 4 * stride, stride, vget_high_u16(l));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_highbd_h_predictor_16x4_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  highbd_h_store_16x4(dst, stride, vld1_u16(left));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_highbd_h_predictor_16x8_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  uint16x8_t l = vld1q_u16(left);
  highbd_h_store_16x4(dst + 0 * stride, stride, vget_low_u16(l));
  highbd_h_store_16x4(dst + 4 * stride, stride, vget_high_u16(l));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_highbd_h_predictor_32x8_neon(uint16_t *dst, ptrdiff_t stride,
                                      const uint16_t *above,
                                      const uint16_t *left, int bd) {
  (void)above;
  (void)bd;
  uint16x8_t l = vld1q_u16(left);
  highbd_h_store_32x4(dst + 0 * stride, stride, vget_low_u16(l));
  highbd_h_store_32x4(dst + 4 * stride, stride, vget_high_u16(l));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// For cases where height >= 16 we use pairs of loads to get LDP instructions.
#define HIGHBD_H_WXH_LARGE(w, h)                                            \
  void aom_highbd_h_predictor_##w##x##h##_neon(                             \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,               \
      const uint16_t *left, int bd) {                                       \
    (void)above;                                                            \
    (void)bd;                                                               \
    for (int i = 0; i < (h) / 16; ++i) {                                    \
      uint16x8_t l0 = vld1q_u16(left + 0);                                  \
      uint16x8_t l1 = vld1q_u16(left + 8);                                  \
      highbd_h_store_##w##x4(dst + 0 * stride, stride, vget_low_u16(l0));   \
      highbd_h_store_##w##x4(dst + 4 * stride, stride, vget_high_u16(l0));  \
      highbd_h_store_##w##x4(dst + 8 * stride, stride, vget_low_u16(l1));   \
      highbd_h_store_##w##x4(dst + 12 * stride, stride, vget_high_u16(l1)); \
      left += 16;                                                           \
      dst += 16 * stride;                                                   \
    }                                                                       \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_H_WXH_LARGE(4, 16)
HIGHBD_H_WXH_LARGE(8, 16)
HIGHBD_H_WXH_LARGE(8, 32)
HIGHBD_H_WXH_LARGE(16, 16)
HIGHBD_H_WXH_LARGE(16, 32)
HIGHBD_H_WXH_LARGE(16, 64)
HIGHBD_H_WXH_LARGE(32, 16)
HIGHBD_H_WXH_LARGE(32, 32)
HIGHBD_H_WXH_LARGE(32, 64)
HIGHBD_H_WXH_LARGE(64, 16)
HIGHBD_H_WXH_LARGE(64, 32)
HIGHBD_H_WXH_LARGE(64, 64)
#else
HIGHBD_H_WXH_LARGE(8, 16)
HIGHBD_H_WXH_LARGE(16, 16)
HIGHBD_H_WXH_LARGE(16, 32)
HIGHBD_H_WXH_LARGE(32, 16)
HIGHBD_H_WXH_LARGE(32, 32)
HIGHBD_H_WXH_LARGE(32, 64)
HIGHBD_H_WXH_LARGE(64, 32)
HIGHBD_H_WXH_LARGE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_H_WXH_LARGE

// -----------------------------------------------------------------------------
// PAETH

// Paeth prediction for widths 4 and 8: each output pixel is whichever of
// {left, top, top_left} is closest to left + top - top_left, with ties broken
// in that order. The comparisons are computed branchlessly with lane masks.
static inline void highbd_paeth_4or8_x_h_neon(uint16_t *dest, ptrdiff_t stride,
                                              const uint16_t *const top_row,
                                              const uint16_t *const left_column,
                                              int width, int height) {
  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
  uint16x8_t top;
  if (width == 4) {
    top = vcombine_u16(vld1_u16(top_row), vdup_n_u16(0));
  } else {  // width == 8
    top = vld1q_u16(top_row);
  }

  for (int y = 0; y < height; ++y) {
    const uint16x8_t left = vdupq_n_u16(left_column[y]);

    const uint16x8_t left_dist = vabdq_u16(top, top_left);
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    const uint16x8_t top_left_dist =
        vabdq_u16(vaddq_u16(top, left), top_left_x2);

    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);
    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);
    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);

    // if (left_dist <= top_dist && left_dist <= top_left_dist)
    const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
    //   dest[x] = left_column[y];
    // Fill all the unused spaces with 'top'. They will be overwritten when
    // the positions for top_left are known.
    uint16x8_t result = vbslq_u16(left_mask, left, top);
    // else if (top_dist <= top_left_dist)
    //   dest[x] = top_row[x];
    // Add these values to the mask. They were already set.
    const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
    // else
    //   dest[x] = top_left;
    result = vbslq_u16(left_or_top_mask, result, top_left);

    if (width == 4) {
      vst1_u16(dest, vget_low_u16(result));
    } else {  // width == 8
      vst1q_u16(dest, result);
    }
    dest += stride;
  }
}

#define HIGHBD_PAETH_NXM(W, H)                                \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                         \
    (void)bd;                                                 \
    highbd_paeth_4or8_x_h_neon(dst, stride, above, left, W, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_PAETH_NXM(4, 4)
HIGHBD_PAETH_NXM(4, 8)
HIGHBD_PAETH_NXM(4, 16)
HIGHBD_PAETH_NXM(8, 4)
HIGHBD_PAETH_NXM(8, 8)
HIGHBD_PAETH_NXM(8, 16)
HIGHBD_PAETH_NXM(8, 32)
#else
HIGHBD_PAETH_NXM(4, 4)
HIGHBD_PAETH_NXM(4, 8)
HIGHBD_PAETH_NXM(8, 4)
HIGHBD_PAETH_NXM(8, 8)
HIGHBD_PAETH_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// Select the closest values and collect them.
static inline uint16x8_t select_paeth(const uint16x8_t top,
                                      const uint16x8_t left,
                                      const uint16x8_t top_left,
                                      const uint16x8_t left_le_top,
                                      const uint16x8_t left_le_top_left,
                                      const uint16x8_t top_le_top_left) {
  // if (left_dist <= top_dist && left_dist <= top_left_dist)
  const uint16x8_t left_mask = vandq_u16(left_le_top, left_le_top_left);
  //   dest[x] = left_column[y];
  // Fill all the unused spaces with 'top'. They will be overwritten when
  // the positions for top_left are known.
  const uint16x8_t result = vbslq_u16(left_mask, left, top);
  // else if (top_dist <= top_left_dist)
  //   dest[x] = top_row[x];
  // Add these values to the mask. They were already set.
  const uint16x8_t left_or_top_mask = vorrq_u16(left_mask, top_le_top_left);
  // else
  //   dest[x] = top_left;
  return vbslq_u16(left_or_top_mask, result, top_left);
}

// Compute and store 8 Paeth-predicted pixels for column group `num`.
// Relies on top[], top_left, top_left_x2, left, top_dist and dest being in
// scope at the expansion site.
#define PAETH_PREDICTOR(num)                                                  \
  do {                                                                        \
    const uint16x8_t left_dist = vabdq_u16(top[num], top_left);               \
    const uint16x8_t top_left_dist =                                          \
        vabdq_u16(vaddq_u16(top[num], left), top_left_x2);                    \
    const uint16x8_t left_le_top = vcleq_u16(left_dist, top_dist);            \
    const uint16x8_t left_le_top_left = vcleq_u16(left_dist, top_left_dist);  \
    const uint16x8_t top_le_top_left = vcleq_u16(top_dist, top_left_dist);    \
    const uint16x8_t result =                                                 \
        select_paeth(top[num], left, top_left, left_le_top, left_le_top_left, \
                     top_le_top_left);                                        \
    vst1q_u16(dest + (num * 8), result);                                      \
  } while (0)

#define LOAD_TOP_ROW(num) vld1q_u16(top_row + (num * 8))

// Paeth prediction for widths 16, 32 and 64: the top row is loaded once into
// up to eight vectors, then each output row reuses them.
static inline void highbd_paeth16_plus_x_h_neon(
    uint16_t *dest, ptrdiff_t stride, const uint16_t *const top_row,
    const uint16_t *const left_column, int width, int height) {
  const uint16x8_t top_left = vdupq_n_u16(top_row[-1]);
  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
  uint16x8_t top[8];
  top[0] = LOAD_TOP_ROW(0);
  top[1] = LOAD_TOP_ROW(1);
  if (width > 16) {
    top[2] = LOAD_TOP_ROW(2);
    top[3] = LOAD_TOP_ROW(3);
    if (width == 64) {
      top[4] = LOAD_TOP_ROW(4);
      top[5] = LOAD_TOP_ROW(5);
      top[6] = LOAD_TOP_ROW(6);
      top[7] = LOAD_TOP_ROW(7);
    }
  }

  for (int y = 0; y < height; ++y) {
    const uint16x8_t left = vdupq_n_u16(left_column[y]);
    const uint16x8_t top_dist = vabdq_u16(left, top_left);
    PAETH_PREDICTOR(0);
    PAETH_PREDICTOR(1);
    if (width > 16) {
      PAETH_PREDICTOR(2);
      PAETH_PREDICTOR(3);
      if (width == 64) {
        PAETH_PREDICTOR(4);
        PAETH_PREDICTOR(5);
        PAETH_PREDICTOR(6);
        PAETH_PREDICTOR(7);
      }
    }
    dest += stride;
  }
}

#define HIGHBD_PAETH_NXM_WIDE(W, H)                               \
  void aom_highbd_paeth_predictor_##W##x##H##_neon(               \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *above,     \
      const uint16_t *left, int bd) {                             \
    (void)bd;                                                     \
    highbd_paeth16_plus_x_h_neon(dst, stride, above, left, W, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_PAETH_NXM_WIDE(16, 4)
HIGHBD_PAETH_NXM_WIDE(16, 8)
HIGHBD_PAETH_NXM_WIDE(16, 16)
HIGHBD_PAETH_NXM_WIDE(16, 32)
HIGHBD_PAETH_NXM_WIDE(16, 64)
HIGHBD_PAETH_NXM_WIDE(32, 8)
HIGHBD_PAETH_NXM_WIDE(32, 16)
HIGHBD_PAETH_NXM_WIDE(32, 32)
HIGHBD_PAETH_NXM_WIDE(32, 64)
HIGHBD_PAETH_NXM_WIDE(64, 16)
HIGHBD_PAETH_NXM_WIDE(64, 32)
HIGHBD_PAETH_NXM_WIDE(64, 64)
#else
HIGHBD_PAETH_NXM_WIDE(16, 8)
HIGHBD_PAETH_NXM_WIDE(16, 16)
HIGHBD_PAETH_NXM_WIDE(16, 32)
HIGHBD_PAETH_NXM_WIDE(32, 16)
HIGHBD_PAETH_NXM_WIDE(32, 32)
HIGHBD_PAETH_NXM_WIDE(32, 64)
HIGHBD_PAETH_NXM_WIDE(64, 32)
HIGHBD_PAETH_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// -----------------------------------------------------------------------------
// SMOOTH

// 256 - v = vneg_s8(v)
// NOTE(review): this byte-wise negation trick presumably relies on every
// weight being in [0, 256] so only the low byte of each u16 lane is
// significant — confirm against the smooth_weights_u16 table.
static inline uint16x4_t negate_s8(const uint16x4_t v) {
  return vreinterpret_u16_s8(vneg_s8(vreinterpret_s8_u16(v)));
}

// Smooth prediction, width 4: each pixel is a weighted blend of the top row,
// left column, top-right and bottom-left corner pixels.
static inline void highbd_smooth_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *const top_row,
                                          const uint16_t *const left_column,
                                          const int height) {
  const uint16_t top_right = top_row[3];
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4_t top_v = vld1_u16(top_row);
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
  const uint16x4_t weights_x_v = vld1_u16(smooth_weights_u16);
  const uint16x4_t scaled_weights_x = negate_s8(weights_x_v);
  // The top-right contribution is invariant across rows, so hoist it.
  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);

  for (int y = 0; y < height; ++y) {
    // Each variable in the running summation is named for the last item to be
    // accumulated.
    const uint32x4_t weighted_top =
        vmlal_n_u16(weighted_tr, top_v, weights_y[y]);
    const uint32x4_t weighted_left =
        vmlal_n_u16(weighted_top, weights_x_v, left_column[y]);
    const uint32x4_t weighted_bl =
        vmlal_n_u16(weighted_left, bottom_left_v, 256 - weights_y[y]);

    const uint16x4_t pred =
        vrshrn_n_u32(weighted_bl, SMOOTH_WEIGHT_LOG2_SCALE + 1);
    vst1_u16(dst, pred);
    dst += stride;
  }
}

// Common code between 8xH and [16|32|64]xH.
// Finishes one 8-pixel group: folds in the row-dependent top and left terms,
// then rounds and narrows to the final prediction.
static inline void highbd_calculate_pred8(
    uint16_t *dst, const uint32x4_t weighted_corners_low,
    const uint32x4_t weighted_corners_high, const uint16x4x2_t top_vals,
    const uint16x4x2_t weights_x, const uint16_t left_y,
    const uint16_t weight_y) {
  // Each variable in the running summation is named for the last item to be
  // accumulated.
  const uint32x4_t weighted_top_low =
      vmlal_n_u16(weighted_corners_low, top_vals.val[0], weight_y);
  const uint32x4_t weighted_edges_low =
      vmlal_n_u16(weighted_top_low, weights_x.val[0], left_y);

  const uint16x4_t pred_low =
      vrshrn_n_u32(weighted_edges_low, SMOOTH_WEIGHT_LOG2_SCALE + 1);
  vst1_u16(dst, pred_low);

  const uint32x4_t weighted_top_high =
      vmlal_n_u16(weighted_corners_high, top_vals.val[1], weight_y);
  const uint32x4_t weighted_edges_high =
      vmlal_n_u16(weighted_top_high, weights_x.val[1], left_y);

  const uint16x4_t pred_high =
      vrshrn_n_u32(weighted_edges_high, SMOOTH_WEIGHT_LOG2_SCALE + 1);
  vst1_u16(dst + 4, pred_high);
}

static void highbd_smooth_8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                   const uint16_t *const top_row,
                                   const uint16_t *const left_column,
                                   const int height) {
  const uint16_t top_right = top_row[7];
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4x2_t top_vals = { { vld1_u16(top_row),
                                    vld1_u16(top_row + 4) } };
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);
  const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
                                     vld1_u16(smooth_weights_u16 + 8) } };
  const uint32x4_t weighted_tr_low =
      vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
  const uint32x4_t weighted_tr_high =
      vmull_n_u16(negate_s8(weights_x.val[1]), top_right);

  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
    const uint32x4_t weighted_corners_low =
        vaddq_u32(weighted_bl, weighted_tr_low);
    const uint32x4_t weighted_corners_high =
        vaddq_u32(weighted_bl, weighted_tr_high);
    highbd_calculate_pred8(dst, weighted_corners_low, weighted_corners_high,
                           top_vals, weights_x, left_column[y], weights_y[y]);
    dst += stride;
  }
}

#define HIGHBD_SMOOTH_NXM(W, H)                                 \
  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_NXM(4, 4)
HIGHBD_SMOOTH_NXM(4, 8)
HIGHBD_SMOOTH_NXM(8, 4)
HIGHBD_SMOOTH_NXM(8, 8)
HIGHBD_SMOOTH_NXM(4, 16)
HIGHBD_SMOOTH_NXM(8, 16)
HIGHBD_SMOOTH_NXM(8, 32)
#else
HIGHBD_SMOOTH_NXM(4, 4)
HIGHBD_SMOOTH_NXM(4, 8)
HIGHBD_SMOOTH_NXM(8, 4)
HIGHBD_SMOOTH_NXM(8, 8)
HIGHBD_SMOOTH_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_NXM

// For width 16 and above.
// SMOOTH predictor for 16/32/64-wide blocks. Same weighted-corner scheme as
// the 8-wide version but iterating over (W >> 3) 8-pixel chunks per row.
#define HIGHBD_SMOOTH_PREDICTOR(W)                                            \
  static void highbd_smooth_##W##xh_neon(                                     \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,         \
      const uint16_t *const left_column, const int height) {                  \
    const uint16_t top_right = top_row[(W) - 1];                              \
    const uint16_t bottom_left = left_column[height - 1];                     \
    const uint16_t *const weights_y = smooth_weights_u16 + height - 4;        \
                                                                              \
    /* Precompute weighted values that don't vary with |y|. */                \
    uint32x4_t weighted_tr_low[(W) >> 3];                                     \
    uint32x4_t weighted_tr_high[(W) >> 3];                                    \
    for (int i = 0; i < (W) >> 3; ++i) {                                      \
      const int x = i << 3;                                                   \
      const uint16x4_t weights_x_low =                                        \
          vld1_u16(smooth_weights_u16 + (W) - 4 + x);                         \
      weighted_tr_low[i] = vmull_n_u16(negate_s8(weights_x_low), top_right);  \
      const uint16x4_t weights_x_high =                                       \
          vld1_u16(smooth_weights_u16 + (W) + x);                             \
      weighted_tr_high[i] =                                                   \
          vmull_n_u16(negate_s8(weights_x_high), top_right);                  \
    }                                                                         \
                                                                              \
    const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                 \
    for (int y = 0; y < height; ++y) {                                        \
      const uint32x4_t weighted_bl =                                          \
          vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                     \
      uint16_t *dst_x = dst;                                                  \
      for (int i = 0; i < (W) >> 3; ++i) {                                    \
        const int x = i << 3;                                                 \
        const uint16x4x2_t top_vals = { { vld1_u16(top_row + x),              \
                                          vld1_u16(top_row + x + 4) } };      \
        const uint32x4_t weighted_corners_low =                               \
            vaddq_u32(weighted_bl, weighted_tr_low[i]);                       \
        const uint32x4_t weighted_corners_high =                              \
            vaddq_u32(weighted_bl, weighted_tr_high[i]);                      \
        /* Accumulate weighted edge values and store. */                      \
        const uint16x4x2_t weights_x = {                                      \
          { vld1_u16(smooth_weights_u16 + (W) - 4 + x),                       \
            vld1_u16(smooth_weights_u16 + (W) + x) }                          \
        };                                                                    \
        highbd_calculate_pred8(dst_x, weighted_corners_low,                   \
                               weighted_corners_high, top_vals, weights_x,    \
                               left_column[y], weights_y[y]);                 \
        dst_x += 8;                                                           \
      }                                                                       \
      dst += stride;                                                          \
    }                                                                         \
  }

HIGHBD_SMOOTH_PREDICTOR(16)
HIGHBD_SMOOTH_PREDICTOR(32)
HIGHBD_SMOOTH_PREDICTOR(64)

#undef HIGHBD_SMOOTH_PREDICTOR

// Public entry points for the wide (16/32/64) SMOOTH predictors.
#define HIGHBD_SMOOTH_NXM_WIDE(W, H)                            \
  void aom_highbd_smooth_predictor_##W##x##H##_neon(            \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, \
      const uint16_t *left, int bd) {                           \
    (void)bd;                                                   \
    highbd_smooth_##W##xh_neon(dst, y_stride, above, left, H);  \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_NXM_WIDE(64, 64)
#else
HIGHBD_SMOOTH_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_NXM_WIDE

// SMOOTH_V predictor for 4-wide blocks: vertical-only interpolation between
// the above row and the bottom-left sample (no horizontal term, so the
// rounding shift is SMOOTH_WEIGHT_LOG2_SCALE without the extra +1).
static void highbd_smooth_v_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                     const uint16_t *const top_row,
                                     const uint16_t *const left_column,
                                     const int height) {
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4_t top_v = vld1_u16(top_row);
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);

  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);
    const uint32x4_t weighted_top =
        vmlal_n_u16(weighted_bl, top_v, weights_y[y]);
    vst1_u16(dst, vrshrn_n_u32(weighted_top, SMOOTH_WEIGHT_LOG2_SCALE));

    dst += stride;
  }
}

// SMOOTH_V predictor for 8-wide blocks.
static void highbd_smooth_v_8xh_neon(uint16_t *dst, const ptrdiff_t stride,
                                     const uint16_t *const top_row,
                                     const uint16_t *const left_column,
                                     const int height) {
  const uint16_t bottom_left = left_column[height - 1];
  const uint16_t *const weights_y = smooth_weights_u16 + height - 4;

  const uint16x4_t top_low = vld1_u16(top_row);
  const uint16x4_t top_high = vld1_u16(top_row + 4);
  const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);

  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_bl =
        vmull_n_u16(bottom_left_v, 256 - weights_y[y]);

    const uint32x4_t weighted_top_low =
        vmlal_n_u16(weighted_bl, top_low, weights_y[y]);
    vst1_u16(dst, vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));

    const uint32x4_t weighted_top_high =
        vmlal_n_u16(weighted_bl, top_high, weights_y[y]);
    vst1_u16(dst + 4,
             vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE));
    dst += stride;
  }
}

// Public entry points for the narrow (4/8-wide) SMOOTH_V predictors.
#define HIGHBD_SMOOTH_V_NXM(W, H)                                \
  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_V_NXM(4, 4)
HIGHBD_SMOOTH_V_NXM(4, 8)
HIGHBD_SMOOTH_V_NXM(4, 16)
HIGHBD_SMOOTH_V_NXM(8, 4)
HIGHBD_SMOOTH_V_NXM(8, 8)
HIGHBD_SMOOTH_V_NXM(8, 16)
HIGHBD_SMOOTH_V_NXM(8, 32)
#else
HIGHBD_SMOOTH_V_NXM(4, 4)
HIGHBD_SMOOTH_V_NXM(4, 8)
HIGHBD_SMOOTH_V_NXM(8, 4)
HIGHBD_SMOOTH_V_NXM(8, 8)
HIGHBD_SMOOTH_V_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_V_NXM

// For width 16 and above.
// SMOOTH_V for 16/32/64-wide blocks: top-row vectors are loaded once up
// front, then each row blends them against the bottom-left sample.
#define HIGHBD_SMOOTH_V_PREDICTOR(W)                                         \
  static void highbd_smooth_v_##W##xh_neon(                                  \
      uint16_t *dst, const ptrdiff_t stride, const uint16_t *const top_row,  \
      const uint16_t *const left_column, const int height) {                 \
    const uint16_t bottom_left = left_column[height - 1];                    \
    const uint16_t *const weights_y = smooth_weights_u16 + height - 4;       \
                                                                             \
    uint16x4x2_t top_vals[(W) >> 3];                                         \
    for (int i = 0; i < (W) >> 3; ++i) {                                     \
      const int x = i << 3;                                                  \
      top_vals[i].val[0] = vld1_u16(top_row + x);                            \
      top_vals[i].val[1] = vld1_u16(top_row + x + 4);                        \
    }                                                                        \
                                                                             \
    const uint16x4_t bottom_left_v = vdup_n_u16(bottom_left);                \
    for (int y = 0; y < height; ++y) {                                       \
      const uint32x4_t weighted_bl =                                         \
          vmull_n_u16(bottom_left_v, 256 - weights_y[y]);                    \
                                                                             \
      uint16_t *dst_x = dst;                                                 \
      for (int i = 0; i < (W) >> 3; ++i) {                                   \
        const uint32x4_t weighted_top_low =                                  \
            vmlal_n_u16(weighted_bl, top_vals[i].val[0], weights_y[y]);      \
        vst1_u16(dst_x,                                                      \
                 vrshrn_n_u32(weighted_top_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
                                                                             \
        const uint32x4_t weighted_top_high =                                 \
            vmlal_n_u16(weighted_bl, top_vals[i].val[1], weights_y[y]);      \
        vst1_u16(dst_x + 4,                                                  \
                 vrshrn_n_u32(weighted_top_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
        dst_x += 8;                                                          \
      }                                                                      \
      dst += stride;                                                         \
    }                                                                        \
  }

HIGHBD_SMOOTH_V_PREDICTOR(16)
HIGHBD_SMOOTH_V_PREDICTOR(32)
HIGHBD_SMOOTH_V_PREDICTOR(64)

#undef HIGHBD_SMOOTH_V_PREDICTOR

// Public entry points for the wide (16/32/64) SMOOTH_V predictors.
#define HIGHBD_SMOOTH_V_NXM_WIDE(W, H)                           \
  void aom_highbd_smooth_v_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_v_##W##xh_neon(dst, y_stride, above, left, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_V_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
#else
HIGHBD_SMOOTH_V_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_V_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_V_NXM_WIDE

// SMOOTH_H predictor for 4-wide blocks: horizontal-only interpolation
// between the left-column sample and the top-right sample.
static inline void highbd_smooth_h_4xh_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *const top_row,
                                            const uint16_t *const left_column,
                                            const int height) {
  const uint16_t top_right = top_row[3];

  const uint16x4_t weights_x = vld1_u16(smooth_weights_u16);
  const uint16x4_t scaled_weights_x = negate_s8(weights_x);

  // Invariant across rows: (256 - weights_x) * top_right.
  const uint32x4_t weighted_tr = vmull_n_u16(scaled_weights_x, top_right);
  for (int y = 0; y < height; ++y) {
    const uint32x4_t weighted_left =
        vmlal_n_u16(weighted_tr, weights_x, left_column[y]);
    vst1_u16(dst, vrshrn_n_u32(weighted_left, SMOOTH_WEIGHT_LOG2_SCALE));
    dst += stride;
  }
}

// SMOOTH_H predictor for 8-wide blocks.
static inline void highbd_smooth_h_8xh_neon(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *const top_row,
                                            const uint16_t *const left_column,
                                            const int height) {
  const uint16_t top_right = top_row[7];

  const uint16x4x2_t weights_x = { { vld1_u16(smooth_weights_u16 + 4),
                                     vld1_u16(smooth_weights_u16 + 8) } };

  const uint32x4_t weighted_tr_low =
      vmull_n_u16(negate_s8(weights_x.val[0]), top_right);
  const uint32x4_t weighted_tr_high =
      vmull_n_u16(negate_s8(weights_x.val[1]), top_right);

  for (int y = 0; y < height; ++y) {
    const uint16_t left_y = left_column[y];
    const uint32x4_t weighted_left_low =
        vmlal_n_u16(weighted_tr_low, weights_x.val[0], left_y);
    vst1_u16(dst, vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));

    const uint32x4_t weighted_left_high =
        vmlal_n_u16(weighted_tr_high, weights_x.val[1], left_y);
    vst1_u16(dst + 4,
             vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE));
    dst += stride;
  }
}

// Public entry points for the narrow (4/8-wide) SMOOTH_H predictors.
#define HIGHBD_SMOOTH_H_NXM(W, H)                                \
  void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_H_NXM(4, 4)
HIGHBD_SMOOTH_H_NXM(4, 8)
HIGHBD_SMOOTH_H_NXM(4, 16)
HIGHBD_SMOOTH_H_NXM(8, 4)
HIGHBD_SMOOTH_H_NXM(8, 8)
HIGHBD_SMOOTH_H_NXM(8, 16)
HIGHBD_SMOOTH_H_NXM(8, 32)
#else
HIGHBD_SMOOTH_H_NXM(4, 4)
HIGHBD_SMOOTH_H_NXM(4, 8)
HIGHBD_SMOOTH_H_NXM(8, 4)
HIGHBD_SMOOTH_H_NXM(8, 8)
HIGHBD_SMOOTH_H_NXM(8, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_H_NXM

// For width 16 and above.
// SMOOTH_H for 16/32/64-wide blocks: per-chunk x-weights and the weighted
// top-right terms are hoisted out of the row loop since they are invariant
// with |y|.
#define HIGHBD_SMOOTH_H_PREDICTOR(W)                                          \
  static void highbd_smooth_h_##W##xh_neon(                                   \
      uint16_t *dst, ptrdiff_t stride, const uint16_t *const top_row,         \
      const uint16_t *const left_column, const int height) {                  \
    const uint16_t top_right = top_row[(W) - 1];                              \
                                                                              \
    uint16x4_t weights_x_low[(W) >> 3];                                       \
    uint16x4_t weights_x_high[(W) >> 3];                                      \
    uint32x4_t weighted_tr_low[(W) >> 3];                                     \
    uint32x4_t weighted_tr_high[(W) >> 3];                                    \
    for (int i = 0; i < (W) >> 3; ++i) {                                      \
      const int x = i << 3;                                                   \
      weights_x_low[i] = vld1_u16(smooth_weights_u16 + (W) - 4 + x);          \
      weighted_tr_low[i] =                                                    \
          vmull_n_u16(negate_s8(weights_x_low[i]), top_right);                \
      weights_x_high[i] = vld1_u16(smooth_weights_u16 + (W) + x);             \
      weighted_tr_high[i] =                                                   \
          vmull_n_u16(negate_s8(weights_x_high[i]), top_right);               \
    }                                                                         \
                                                                              \
    for (int y = 0; y < height; ++y) {                                        \
      uint16_t *dst_x = dst;                                                  \
      const uint16_t left_y = left_column[y];                                 \
      for (int i = 0; i < (W) >> 3; ++i) {                                    \
        const uint32x4_t weighted_left_low =                                  \
            vmlal_n_u16(weighted_tr_low[i], weights_x_low[i], left_y);        \
        vst1_u16(dst_x,                                                       \
                 vrshrn_n_u32(weighted_left_low, SMOOTH_WEIGHT_LOG2_SCALE));  \
                                                                              \
        const uint32x4_t weighted_left_high =                                 \
            vmlal_n_u16(weighted_tr_high[i], weights_x_high[i], left_y);      \
        vst1_u16(dst_x + 4,                                                   \
                 vrshrn_n_u32(weighted_left_high, SMOOTH_WEIGHT_LOG2_SCALE)); \
        dst_x += 8;                                                           \
      }                                                                       \
      dst += stride;                                                          \
    }                                                                         \
  }

HIGHBD_SMOOTH_H_PREDICTOR(16)
HIGHBD_SMOOTH_H_PREDICTOR(32)
HIGHBD_SMOOTH_H_PREDICTOR(64)

#undef HIGHBD_SMOOTH_H_PREDICTOR

// Public entry points for the wide (16/32/64) SMOOTH_H predictors.
#define HIGHBD_SMOOTH_H_NXM_WIDE(W, H)                           \
  void aom_highbd_smooth_h_predictor_##W##x##H##_neon(           \
      uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above,  \
      const uint16_t *left, int bd) {                            \
    (void)bd;                                                    \
    highbd_smooth_h_##W##xh_neon(dst, y_stride, above, left, H); \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
HIGHBD_SMOOTH_H_NXM_WIDE(16, 4)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
#else
HIGHBD_SMOOTH_H_NXM_WIDE(16, 8)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(16, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 16)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(32, 64)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 32)
HIGHBD_SMOOTH_H_NXM_WIDE(64, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef HIGHBD_SMOOTH_H_NXM_WIDE

// -----------------------------------------------------------------------------
// Z1

// Lane-index ramps used to compare per-lane positions against max_base_x.
static const int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 };
static const int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 };

// Interpolates 4 lanes between a0 and a1 with the 6-bit fractional position
// |shift| (0..63), rounding to nearest.
static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0,
                                                               uint16x4_t a1,
                                                               int shift) {
  // The C implementation of the z1 predictor uses (32 - shift) and a right
  // shift by 5, however we instead double shift to avoid an unnecessary right
  // shift by 1.
  uint32x4_t res = vmull_n_u16(a1, shift);
  res = vmlal_n_u16(res, a0, 64 - shift);
  return vrshrn_n_u32(res, 6);
}

// 8-lane variant of highbd_dr_z1_apply_shift_x4.
static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0,
                                                               uint16x8_t a1,
                                                               int shift) {
  return vcombine_u16(
      highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift),
      highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift));
}

// Byte-shuffle tables (one row per number of valid elements) that replicate
// the last in-bounds `above` sample into the out-of-range lanes.
// clang-format off
static const uint8_t kLoadMaxShuffles[] = {
  14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
  12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
  10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
   8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15, 14, 15,
   6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15, 14, 15,
   4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15, 14, 15,
   2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 14, 15,
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
};
// clang-format on

// Loads 8 uint16 values ending at the edge of the valid `above` range and
// saturates the trailing lanes to the final sample via kLoadMaxShuffles.
static inline uint16x8_t zn_load_masked_neon(const uint16_t *ptr,
                                             int shuffle_idx) {
  uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
  uint8x16_t src = vreinterpretq_u8_u16(vld1q_u16(ptr));
#if AOM_ARCH_AARCH64
  return vreinterpretq_u16_u8(vqtbl1q_u8(src, shuffle));
#else
  // Armv7 has no 128-bit tbl; emulate with two 64-bit vtbl2 lookups.
  uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
  uint8x8_t lo = vtbl2_u8(src2, vget_low_u8(shuffle));
  uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
  return vreinterpretq_u16_u8(vcombine_u8(lo, hi));
#endif
}

// Z1 prediction without edge upsampling: for each row, interpolate along
// `above` at a 26.6 fixed-point position that advances by dx per row; lanes
// past max_base_x are clamped to above[max_base_x].
static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst,
                                                   ptrdiff_t stride, int bw,
                                                   int bh,
                                                   const uint16_t *above,
                                                   int dx) {
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dx > 0);

  const int max_base_x = (bw + bh) - 1;
  const int above_max = above[max_base_x];

  const int16x8_t iota1x8 = vld1q_s16(iota1_s16);
  const int16x4_t iota1x4 = vget_low_s16(iota1x8);

  int x = dx;
  int r = 0;
  do {
    const int base = x >> 6;
    if (base >= max_base_x) {
      // All remaining rows are entirely past the edge: fill with above_max.
      for (int i = r; i < bh; ++i) {
        aom_memset16(dst, above_max, bw);
        dst += stride;
      }
      return;
    }

    // The C implementation of the z1 predictor when not upsampling uses:
    // ((x & 0x3f) >> 1)
    // The right shift is unnecessary here since we instead shift by +1 later,
    // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
    const int shift = x & 0x3e;

    if (bw == 4) {
      const uint16x4_t a0 = vld1_u16(&above[base]);
      const uint16x4_t a1 = vld1_u16(&above[base + 1]);
      const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift);
      const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4);
      const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
      vst1_u16(dst, res);
    } else {
      int c = 0;
      do {
        uint16x8_t a0;
        uint16x8_t a1;
        if (base + c >= max_base_x) {
          a0 = a1 = vdupq_n_u16(above_max);
        } else {
          // Use masked loads near the edge so we never read past
          // above[max_base_x] while still filling all 8 lanes.
          if (base + c + 7 >= max_base_x) {
            int shuffle_idx = max_base_x - base - c;
            a0 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
          } else {
            a0 = vld1q_u16(above + base + c);
          }
          if (base + c + 8 >= max_base_x) {
            int shuffle_idx = max_base_x - base - c - 1;
            a1 = zn_load_masked_neon(above + (max_base_x - 7), shuffle_idx);
          } else {
            a1 = vld1q_u16(above + base + c + 1);
          }
        }

        vst1q_u16(dst + c, highbd_dr_z1_apply_shift_x8(a0, a1, shift));
        c += 8;
      } while (c < bw);
    }

    dst += stride;
    x += dx;
  } while (++r < bh);
}

// Z1 prediction with 2x upsampled edge: `above` holds interleaved
// even/odd samples, loaded pairwise with vld2, and the fixed-point step is
// interpreted at double resolution.
static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst,
                                                   ptrdiff_t stride, int bw,
                                                   int bh,
                                                   const uint16_t *above,
                                                   int dx) {
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dx > 0);

  const int max_base_x = ((bw + bh) - 1) << 1;
  const int above_max = above[max_base_x];

  const int16x8_t iota2x8 = vld1q_s16(iota2_s16);
  const int16x4_t iota2x4 = vget_low_s16(iota2x8);

  int x = dx;
  int r = 0;
  do {
    const int base = x >> 5;
    if (base >= max_base_x) {
      // All remaining rows are entirely past the edge: fill with above_max.
      for (int i = r; i < bh; ++i) {
        aom_memset16(dst, above_max, bw);
        dst += stride;
      }
      return;
    }

    // The C implementation of the z1 predictor when upsampling uses:
    // (((x << 1) & 0x3f) >> 1)
    // The right shift is unnecessary here since we instead shift by +1 later,
    // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
    const int shift = (x << 1) & 0x3e;

    if (bw == 4) {
      const uint16x4x2_t a01 = vld2_u16(&above[base]);
      const uint16x4_t val =
          highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift);
      const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4);
      const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max));
      vst1_u16(dst, res);
    } else {
      int c = 0;
      do {
        const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]);
        const uint16x8_t val =
            highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift);
        const uint16x8_t cmp =
            vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8);
        const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max));
        vst1q_u16(dst + c, res);
        c += 8;
      } while (c < bw);
    }

    dst += stride;
    x += dx;
  } while (++r < bh);
}

// Directional prediction, zone 1: 0 < angle < 90
void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_above,
                                      int dx, int dy, int bd) {
  (void)left;
  (void)dy;
  (void)bd;
  assert(dy == 1);

  if (upsample_above) {
    highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx);
  } else {
    highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx);
  }
}

// -----------------------------------------------------------------------------
// Z2

#if AOM_ARCH_AARCH64
// Incrementally shift more elements from `above` into the result, merging with
// existing `left` elements.
// X0, X1, X2, X3
// Y0, X0, X1, X2
// Y0, Y1, X0, X1
// Y0, Y1, Y2, X0
// Y0, Y1, Y2, Y3
// clang-format off
static const uint8_t z2_merge_shuffles_u16x4[5][8] = {
  { 8, 9, 10, 11, 12, 13, 14, 15 },
  { 0, 1, 8, 9, 10, 11, 12, 13 },
  { 0, 1, 2, 3, 8, 9, 10, 11 },
  { 0, 1, 2, 3, 4, 5, 8, 9 },
  { 0, 1, 2, 3, 4, 5, 6, 7 },
};
// clang-format on

// Incrementally shift more elements from `above` into the result, merging with
// existing `left` elements.
// X0, X1, X2, X3, X4, X5, X6, X7
// Y0, X0, X1, X2, X3, X4, X5, X6
// Y0, Y1, X0, X1, X2, X3, X4, X5
// Y0, Y1, Y2, X0, X1, X2, X3, X4
// Y0, Y1, Y2, Y3, X0, X1, X2, X3
// Y0, Y1, Y2, Y3, Y4, X0, X1, X2
// Y0, Y1, Y2, Y3, Y4, Y5, X0, X1
// Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0
// Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7
// clang-format off
static const uint8_t z2_merge_shuffles_u16x8[9][16] = {
  { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
  { 0, 1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 },
  { 0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 },
  { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 },
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
};
// clang-format on

// Per-lane masks selecting the first n lanes (row n has n all-ones lanes).
// clang-format off
static const uint16_t z2_y_iter_masks_u16x4[5][4] = {
  { 0U, 0U, 0U, 0U },
  { 0xffffU, 0U, 0U, 0U },
  { 0xffffU, 0xffffU, 0U, 0U },
  { 0xffffU, 0xffffU, 0xffffU, 0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU },
};
// clang-format on

// clang-format off
static const uint16_t z2_y_iter_masks_u16x8[9][8] = {
  { 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U },
  { 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U, 0U },
  { 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U },
  { 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U },
  { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU },
};
// clang-format on

// Gathers 4 uint16 `left` samples (given by `indices`) from an 8-element
// vector via a byte tbl; the first n lanes are kept, the rest zeroed.
static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8(
    const uint16x8_t left_data, const int16x4_t indices, int base, int n) {
  // Need to adjust indices to operate on 0-based indices rather than
  // `base`-based indices and then adjust from uint16x4 indices to uint8x8
  // indices so we can use a tbl instruction (which only operates on bytes).
  uint8x8_t left_indices =
      vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
  left_indices = vtrn1_u8(left_indices, left_indices);
  left_indices = vadd_u8(left_indices, left_indices);
  left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
  const uint16x4_t ret = vreinterpret_u16_u8(
      vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices));
  return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
}

// As above, but gathers from a 16-element `left` window (two q registers).
static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16(
    const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) {
  // Need to adjust indices to operate on 0-based indices rather than
  // `base`-based indices and then adjust from uint16x4 indices to uint8x8
  // indices so we can use a tbl instruction (which only operates on bytes).
  uint8x8_t left_indices =
      vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base)));
  left_indices = vtrn1_u8(left_indices, left_indices);
  left_indices = vadd_u8(left_indices, left_indices);
  left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100)));
  uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
                             vreinterpretq_u8_u16(left_data.val[1]) } };
  const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices));
  return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n]));
}

// 8-lane variant: gathers 8 uint16 `left` samples from an 8-element vector.
static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8(
    const uint16x8_t left_data, const int16x8_t indices, int base, int n) {
  // Need to adjust indices to operate on 0-based indices rather than
  // `base`-based indices and then adjust from uint16x8 indices to uint8x16
  // indices so we can use a tbl instruction (which only operates on bytes).
  uint8x16_t left_indices =
      vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
  left_indices = vtrn1q_u8(left_indices, left_indices);
  left_indices = vaddq_u8(left_indices, left_indices);
  left_indices =
      vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
  const uint16x8_t ret = vreinterpretq_u16_u8(
      vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices));
  return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
}

// 8-lane variant: gathers 8 uint16 `left` samples from a 16-element window.
static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16(
    const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) {
  // Need to adjust indices to operate on 0-based indices rather than
  // `base`-based indices and then adjust from uint16x8 indices to uint8x16
  // indices so we can use a tbl instruction (which only operates on bytes).
  uint8x16_t left_indices =
      vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base)));
  left_indices = vtrn1q_u8(left_indices, left_indices);
  left_indices = vaddq_u8(left_indices, left_indices);
  left_indices =
      vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100)));
  uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]),
                             vreinterpretq_u8_u16(left_data.val[1]) } };
  const uint16x8_t ret =
      vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices));
  return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n]));
}
#endif  // AOM_ARCH_AARCH64

// General gather of the first n of 4 `left` samples at arbitrary `indices`,
// returned as a (sample, sample+1) pair via uzp for the z2 interpolation.
static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4(
    const uint16_t *left, const int16x4_t indices, int n) {
  assert(n > 0);
  assert(n <= 4);
  // Load two elements at a time and then uzp them into separate vectors, to
  // reduce the number of memory accesses.
  uint32x2_t ret0_u32 = vdup_n_u32(0);
  uint32x2_t ret1_u32 = vdup_n_u32(0);

  // Use a single vget_lane_u64 to minimize vector to general purpose register
  // transfers and then mask off the bits we actually want.
  const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0);
  const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
  const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
  const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
  const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);

  // At time of writing both Clang and GCC produced better code with these
  // nested if-statements compared to a switch statement with fallthrough.
  load_unaligned_u32_2x1_lane(ret0_u32, left + idx0, 0);
  if (n > 1) {
    load_unaligned_u32_2x1_lane(ret0_u32, left + idx1, 1);
    if (n > 2) {
      load_unaligned_u32_2x1_lane(ret1_u32, left + idx2, 0);
      if (n > 3) {
        load_unaligned_u32_2x1_lane(ret1_u32, left + idx3, 1);
      }
    }
  }
  return vuzp_u16(vreinterpret_u16_u32(ret0_u32),
                  vreinterpret_u16_u32(ret1_u32));
}

// 8-lane variant of highbd_dr_prediction_z2_gather_left_x4.
static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8(
    const uint16_t *left, const int16x8_t indices, int n) {
  assert(n > 0);
  assert(n <= 8);
  // Load two elements at a time and then uzp them into separate vectors, to
  // reduce the number of memory accesses.
  uint32x4_t ret0_u32 = vdupq_n_u32(0);
  uint32x4_t ret1_u32 = vdupq_n_u32(0);

  // Use a pair of vget_lane_u64 to minimize vector to general purpose register
  // transfers and then mask off the bits we actually want.
  const uint64_t indices0123 =
      vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0);
  const uint64_t indices4567 =
      vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1);
  const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU);
  const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU);
  const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU);
  const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU);
  const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU);
  const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU);
  const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU);
  const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU);

  // At time of writing both Clang and GCC produced better code with these
  // nested if-statements compared to a switch statement with fallthrough.
  load_unaligned_u32_4x1_lane(ret0_u32, left + idx0, 0);
  if (n > 1) {
    load_unaligned_u32_4x1_lane(ret0_u32, left + idx1, 1);
    if (n > 2) {
      load_unaligned_u32_4x1_lane(ret0_u32, left + idx2, 2);
      if (n > 3) {
        load_unaligned_u32_4x1_lane(ret0_u32, left + idx3, 3);
        if (n > 4) {
          load_unaligned_u32_4x1_lane(ret1_u32, left + idx4, 0);
          if (n > 5) {
            load_unaligned_u32_4x1_lane(ret1_u32, left + idx5, 1);
            if (n > 6) {
              load_unaligned_u32_4x1_lane(ret1_u32, left + idx6, 2);
              if (n > 7) {
                load_unaligned_u32_4x1_lane(ret1_u32, left + idx7, 3);
              }
            }
          }
        }
      }
    }
  }
  return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32),
                   vreinterpretq_u16_u32(ret1_u32));
}

// Combines the `left`-derived lanes (first base_shift lanes) with the
// `above`-derived lanes (remaining lanes) into one 4-lane result.
static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4(
    uint16x4_t out_x, uint16x4_t out_y, int base_shift) {
  assert(base_shift >= 0);
  assert(base_shift <= 4);
  // On AArch64 we can permute the data from the `above` and `left` vectors
  // into a single vector in a single load (of the permute vector) + tbl.
#if AOM_ARCH_AARCH64
  const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y),
                                 vreinterpret_u8_u16(out_x) } };
  return vreinterpret_u16_u8(
      vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift])));
#else
  uint16x4_t out = out_y;
  for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) {
    out[c2] = out_x[x_idx];
  }
  return out;
#endif
}

// 8-lane variant of highbd_dr_prediction_z2_merge_x4.
static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8(
    uint16x8_t out_x, uint16x8_t out_y, int base_shift) {
  assert(base_shift >= 0);
  assert(base_shift <= 8);
  // On AArch64 we can permute the data from the `above` and `left` vectors
  // into a single vector in a single load (of the permute vector) + tbl.
1889 #if AOM_ARCH_AARCH64 1890 const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y), 1891 vreinterpretq_u8_u16(out_x) } }; 1892 return vreinterpretq_u16_u8( 1893 vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift]))); 1894 #else 1895 uint16x8_t out = out_y; 1896 for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) { 1897 out[c2] = out_x[x_idx]; 1898 } 1899 return out; 1900 #endif 1901 } 1902 1903 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4( 1904 uint16x4_t a0, uint16x4_t a1, int16x4_t shift) { 1905 uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift)); 1906 res = 1907 vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift))); 1908 return vrshrn_n_u32(res, 5); 1909 } 1910 1911 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8( 1912 uint16x8_t a0, uint16x8_t a1, int16x8_t shift) { 1913 return vcombine_u16( 1914 highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), 1915 vget_low_s16(shift)), 1916 highbd_dr_prediction_z2_apply_shift_x4( 1917 vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift))); 1918 } 1919 1920 static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4( 1921 const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1, 1922 const uint16_t *left, int dx, int dy, int r, int c) { 1923 const int16x4_t iota = vld1_s16(iota1_s16); 1924 1925 const int x0 = (c << 6) - (r + 1) * dx; 1926 const int y0 = (r << 6) - (c + 1) * dy; 1927 1928 const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6)); 1929 const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy)); 1930 const int16x4_t shift_x0123 = 1931 vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1); 1932 const int16x4_t shift_y0123 = 1933 vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1); 1934 const int16x4_t base_y0123 = vshr_n_s16(y0123, 6); 1935 1936 const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c; 1937 1938 // Based on the value of 
`base_shift` there are three possible cases to 1939 // compute the result: 1940 // 1) base_shift <= 0: We can load and operate entirely on data from the 1941 // `above` input vector. 1942 // 2) base_shift < vl: We can load from `above[-1]` and shift 1943 // `vl - base_shift` elements across to the end of the 1944 // vector, then compute the remainder from `left`. 1945 // 3) base_shift >= vl: We can load and operate entirely on data from the 1946 // `left` input vector. 1947 1948 if (base_shift <= 0) { 1949 const int base_x = x0 >> 6; 1950 const uint16x4_t a0 = vld1_u16(above + base_x); 1951 const uint16x4_t a1 = vld1_u16(above + base_x + 1); 1952 return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); 1953 } else if (base_shift < 4) { 1954 const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4( 1955 left + 1, base_y0123, base_shift); 1956 const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4( 1957 l01.val[0], l01.val[1], shift_y0123); 1958 1959 // No need to reload from above in the loop, just use pre-loaded constants. 
1960 const uint16x4_t out16_x = 1961 highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123); 1962 1963 return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift); 1964 } else { 1965 const uint16x4x2_t l01 = 1966 highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4); 1967 return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1], 1968 shift_y0123); 1969 } 1970 } 1971 1972 static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8( 1973 const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1, 1974 const uint16_t *left, int dx, int dy, int r, int c) { 1975 const int16x8_t iota = vld1q_s16(iota1_s16); 1976 1977 const int x0 = (c << 6) - (r + 1) * dx; 1978 const int y0 = (r << 6) - (c + 1) * dy; 1979 1980 const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6)); 1981 const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy)); 1982 const int16x8_t shift_x01234567 = 1983 vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1); 1984 const int16x8_t shift_y01234567 = 1985 vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1); 1986 const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6); 1987 1988 const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c; 1989 1990 // Based on the value of `base_shift` there are three possible cases to 1991 // compute the result: 1992 // 1) base_shift <= 0: We can load and operate entirely on data from the 1993 // `above` input vector. 1994 // 2) base_shift < vl: We can load from `above[-1]` and shift 1995 // `vl - base_shift` elements across to the end of the 1996 // vector, then compute the remainder from `left`. 1997 // 3) base_shift >= vl: We can load and operate entirely on data from the 1998 // `left` input vector. 
1999 2000 if (base_shift <= 0) { 2001 const int base_x = x0 >> 6; 2002 const uint16x8_t a0 = vld1q_u16(above + base_x); 2003 const uint16x8_t a1 = vld1q_u16(above + base_x + 1); 2004 return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); 2005 } else if (base_shift < 8) { 2006 const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8( 2007 left + 1, base_y01234567, base_shift); 2008 const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8( 2009 l01.val[0], l01.val[1], shift_y01234567); 2010 2011 // No need to reload from above in the loop, just use pre-loaded constants. 2012 const uint16x8_t out16_x = 2013 highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567); 2014 2015 return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift); 2016 } else { 2017 const uint16x8x2_t l01 = 2018 highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8); 2019 return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1], 2020 shift_y01234567); 2021 } 2022 } 2023 2024 // Left array is accessed from -1 through `bh - 1` inclusive. 2025 // Above array is accessed from -1 through `bw - 1` inclusive. 
2026 #define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh) \ 2027 static void highbd_dr_prediction_z2_##bw##x##bh##_neon( \ 2028 uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ 2029 const uint16_t *left, int upsample_above, int upsample_left, int dx, \ 2030 int dy, int bd) { \ 2031 (void)bd; \ 2032 (void)upsample_above; \ 2033 (void)upsample_left; \ 2034 assert(!upsample_above); \ 2035 assert(!upsample_left); \ 2036 assert(bw % 4 == 0); \ 2037 assert(bh % 4 == 0); \ 2038 assert(dx > 0); \ 2039 assert(dy > 0); \ 2040 \ 2041 uint16_t left_data[bh + 1]; \ 2042 memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t)); \ 2043 \ 2044 uint16x8_t a0, a1; \ 2045 if (bw == 4) { \ 2046 a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0)); \ 2047 a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0)); \ 2048 } else { \ 2049 a0 = vld1q_u16(above - 1); \ 2050 a1 = vld1q_u16(above + 0); \ 2051 } \ 2052 \ 2053 int r = 0; \ 2054 do { \ 2055 if (bw == 4) { \ 2056 vst1_u16(dst, highbd_dr_prediction_z2_step_x4( \ 2057 above, vget_low_u16(a0), vget_low_u16(a1), \ 2058 left_data, dx, dy, r, 0)); \ 2059 } else { \ 2060 int c = 0; \ 2061 do { \ 2062 vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8( \ 2063 above, a0, a1, left_data, dx, dy, r, c)); \ 2064 c += 8; \ 2065 } while (c < bw); \ 2066 } \ 2067 dst += stride; \ 2068 } while (++r < bh); \ 2069 } 2070 2071 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 2072 HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16) 2073 HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16) 2074 HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32) 2075 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4) 2076 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8) 2077 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16) 2078 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32) 2079 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64) 2080 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8) 2081 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16) 2082 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32) 2083 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64) 2084 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16) 2085 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32) 2086 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 
64) 2087 #else 2088 HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16) 2089 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8) 2090 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16) 2091 HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32) 2092 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32) 2093 HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64) 2094 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32) 2095 HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64) 2096 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 2097 2098 #undef HIGHBD_DR_PREDICTOR_Z2_WXH 2099 2100 typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride, 2101 const uint16_t *above, 2102 const uint16_t *left, 2103 int upsample_above, 2104 int upsample_left, int dx, int dy, 2105 int bd); 2106 2107 static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride, 2108 const uint16_t *above, 2109 const uint16_t *left, 2110 int upsample_above, 2111 int upsample_left, int dx, int dy, 2112 int bd) { 2113 (void)bd; 2114 assert(dx > 0); 2115 assert(dy > 0); 2116 2117 const int frac_bits_x = 6 - upsample_above; 2118 const int frac_bits_y = 6 - upsample_left; 2119 const int min_base_x = -(1 << (upsample_above + frac_bits_x)); 2120 2121 // if `upsample_left` then we need -2 through 6 inclusive from `left`. 2122 // else we only need -1 through 3 inclusive. 
2123 2124 #if AOM_ARCH_AARCH64 2125 uint16x8_t left_data0, left_data1; 2126 if (upsample_left) { 2127 left_data0 = vld1q_u16(left - 2); 2128 left_data1 = vld1q_u16(left - 1); 2129 } else { 2130 left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0)); 2131 left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0)); 2132 } 2133 #endif 2134 2135 const int16x4_t iota0123 = vld1_s16(iota1_s16); 2136 const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1); 2137 2138 for (int r = 0; r < 4; ++r) { 2139 const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; 2140 const int x0 = (r + 1) * dx; 2141 const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0)); 2142 const int base_x0 = (-x0) >> frac_bits_x; 2143 if (base_shift <= 0) { 2144 uint16x4_t a0, a1; 2145 int16x4_t shift_x0123; 2146 if (upsample_above) { 2147 const uint16x4x2_t a01 = vld2_u16(above + base_x0); 2148 a0 = a01.val[0]; 2149 a1 = a01.val[1]; 2150 shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); 2151 } else { 2152 a0 = vld1_u16(above + base_x0); 2153 a1 = vld1_u16(above + base_x0 + 1); 2154 shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1); 2155 } 2156 vst1_u16(dst, 2157 highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123)); 2158 } else if (base_shift < 4) { 2159 // Calculate Y component from `left`. 2160 const int y_iters = base_shift; 2161 const int16x4_t y0123 = 2162 vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); 2163 const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); 2164 const int16x4_t shift_y0123 = vshr_n_s16( 2165 vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); 2166 uint16x4_t l0, l1; 2167 #if AOM_ARCH_AARCH64 2168 const int left_data_base = upsample_left ? 
-2 : -1; 2169 l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123, 2170 left_data_base, y_iters); 2171 l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123, 2172 left_data_base, y_iters); 2173 #else 2174 const uint16x4x2_t l01 = 2175 highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters); 2176 l0 = l01.val[0]; 2177 l1 = l01.val[1]; 2178 #endif 2179 2180 const uint16x4_t out_y = 2181 highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123); 2182 2183 // Calculate X component from `above`. 2184 const int16x4_t shift_x0123 = vshr_n_s16( 2185 vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)), 2186 1); 2187 uint16x4_t a0, a1; 2188 if (upsample_above) { 2189 const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); 2190 a0 = a01.val[0]; 2191 a1 = a01.val[1]; 2192 } else { 2193 a0 = vld1_u16(above - 1); 2194 a1 = vld1_u16(above + 0); 2195 } 2196 const uint16x4_t out_x = 2197 highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); 2198 2199 // Combine X and Y vectors. 2200 const uint16x4_t out = 2201 highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift); 2202 vst1_u16(dst, out); 2203 } else { 2204 const int16x4_t y0123 = 2205 vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); 2206 const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); 2207 const int16x4_t shift_y0123 = vshr_n_s16( 2208 vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); 2209 uint16x4_t l0, l1; 2210 #if AOM_ARCH_AARCH64 2211 const int left_data_base = upsample_left ? 
-2 : -1; 2212 l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123, 2213 left_data_base, 4); 2214 l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123, 2215 left_data_base, 4); 2216 #else 2217 const uint16x4x2_t l01 = 2218 highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4); 2219 l0 = l01.val[0]; 2220 l1 = l01.val[1]; 2221 #endif 2222 vst1_u16(dst, 2223 highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123)); 2224 } 2225 dst += stride; 2226 } 2227 } 2228 2229 static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride, 2230 const uint16_t *above, 2231 const uint16_t *left, 2232 int upsample_above, 2233 int upsample_left, int dx, int dy, 2234 int bd) { 2235 (void)bd; 2236 assert(dx > 0); 2237 assert(dy > 0); 2238 2239 const int frac_bits_x = 6 - upsample_above; 2240 const int frac_bits_y = 6 - upsample_left; 2241 const int min_base_x = -(1 << (upsample_above + frac_bits_x)); 2242 2243 // if `upsample_left` then we need -2 through 14 inclusive from `left`. 2244 // else we only need -1 through 6 inclusive. 
2245 2246 #if AOM_ARCH_AARCH64 2247 uint16x8x2_t left_data0, left_data1; 2248 if (upsample_left) { 2249 left_data0 = vld1q_u16_x2(left - 2); 2250 left_data1 = vld1q_u16_x2(left - 1); 2251 } else { 2252 left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } }; 2253 left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } }; 2254 } 2255 #endif 2256 2257 const int16x4_t iota0123 = vld1_s16(iota1_s16); 2258 const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1); 2259 2260 for (int r = 0; r < 8; ++r) { 2261 const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; 2262 const int x0 = (r + 1) * dx; 2263 const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0)); 2264 const int base_x0 = (-x0) >> frac_bits_x; 2265 if (base_shift <= 0) { 2266 uint16x4_t a0, a1; 2267 int16x4_t shift_x0123; 2268 if (upsample_above) { 2269 const uint16x4x2_t a01 = vld2_u16(above + base_x0); 2270 a0 = a01.val[0]; 2271 a1 = a01.val[1]; 2272 shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); 2273 } else { 2274 a0 = vld1_u16(above + base_x0); 2275 a1 = vld1_u16(above + base_x0 + 1); 2276 shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F)); 2277 } 2278 vst1_u16(dst, 2279 highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123)); 2280 } else if (base_shift < 4) { 2281 // Calculate Y component from `left`. 2282 const int y_iters = base_shift; 2283 const int16x4_t y0123 = 2284 vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); 2285 const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); 2286 const int16x4_t shift_y0123 = vshr_n_s16( 2287 vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); 2288 2289 uint16x4_t l0, l1; 2290 #if AOM_ARCH_AARCH64 2291 const int left_data_base = upsample_left ? 
-2 : -1; 2292 l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16( 2293 left_data0, base_y0123, left_data_base, y_iters); 2294 l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16( 2295 left_data1, base_y0123, left_data_base, y_iters); 2296 #else 2297 const uint16x4x2_t l01 = 2298 highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters); 2299 l0 = l01.val[0]; 2300 l1 = l01.val[1]; 2301 #endif 2302 2303 const uint16x4_t out_y = 2304 highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123); 2305 2306 // Calculate X component from `above`. 2307 uint16x4_t a0, a1; 2308 int16x4_t shift_x0123; 2309 if (upsample_above) { 2310 const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); 2311 a0 = a01.val[0]; 2312 a1 = a01.val[1]; 2313 shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); 2314 } else { 2315 a0 = vld1_u16(above - 1); 2316 a1 = vld1_u16(above + 0); 2317 shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F)); 2318 } 2319 const uint16x4_t out_x = 2320 highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); 2321 2322 // Combine X and Y vectors. 2323 const uint16x4_t out = 2324 highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift); 2325 vst1_u16(dst, out); 2326 } else { 2327 const int16x4_t y0123 = 2328 vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); 2329 const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); 2330 const int16x4_t shift_y0123 = vshr_n_s16( 2331 vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); 2332 2333 uint16x4_t l0, l1; 2334 #if AOM_ARCH_AARCH64 2335 const int left_data_base = upsample_left ? 
-2 : -1; 2336 l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123, 2337 left_data_base, 4); 2338 l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123, 2339 left_data_base, 4); 2340 #else 2341 const uint16x4x2_t l01 = 2342 highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4); 2343 l0 = l01.val[0]; 2344 l1 = l01.val[1]; 2345 #endif 2346 2347 vst1_u16(dst, 2348 highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123)); 2349 } 2350 dst += stride; 2351 } 2352 } 2353 2354 static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride, 2355 const uint16_t *above, 2356 const uint16_t *left, 2357 int upsample_above, 2358 int upsample_left, int dx, int dy, 2359 int bd) { 2360 (void)bd; 2361 assert(dx > 0); 2362 assert(dy > 0); 2363 2364 const int frac_bits_x = 6 - upsample_above; 2365 const int frac_bits_y = 6 - upsample_left; 2366 const int min_base_x = -(1 << (upsample_above + frac_bits_x)); 2367 2368 // if `upsample_left` then we need -2 through 6 inclusive from `left`. 2369 // else we only need -1 through 3 inclusive. 
2370 2371 #if AOM_ARCH_AARCH64 2372 uint16x8_t left_data0, left_data1; 2373 if (upsample_left) { 2374 left_data0 = vld1q_u16(left - 2); 2375 left_data1 = vld1q_u16(left - 1); 2376 } else { 2377 left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0)); 2378 left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0)); 2379 } 2380 #endif 2381 2382 const int16x8_t iota01234567 = vld1q_s16(iota1_s16); 2383 const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1); 2384 2385 for (int r = 0; r < 4; ++r) { 2386 const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; 2387 const int x0 = (r + 1) * dx; 2388 const int16x8_t x01234567 = 2389 vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0)); 2390 const int base_x0 = (-x0) >> frac_bits_x; 2391 if (base_shift <= 0) { 2392 uint16x8_t a0, a1; 2393 int16x8_t shift_x01234567; 2394 if (upsample_above) { 2395 const uint16x8x2_t a01 = vld2q_u16(above + base_x0); 2396 a0 = a01.val[0]; 2397 a1 = a01.val[1]; 2398 shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); 2399 } else { 2400 a0 = vld1q_u16(above + base_x0); 2401 a1 = vld1q_u16(above + base_x0 + 1); 2402 shift_x01234567 = 2403 vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); 2404 } 2405 vst1q_u16( 2406 dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567)); 2407 } else if (base_shift < 8) { 2408 // Calculate Y component from `left`. 2409 const int y_iters = base_shift; 2410 const int16x8_t y01234567 = 2411 vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); 2412 const int16x8_t base_y01234567 = 2413 vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); 2414 const int16x8_t shift_y01234567 = 2415 vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), 2416 vdupq_n_s16(0x3F)), 2417 1); 2418 2419 uint16x8_t l0, l1; 2420 #if AOM_ARCH_AARCH64 2421 const int left_data_base = upsample_left ? 
-2 : -1; 2422 l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( 2423 left_data0, base_y01234567, left_data_base, y_iters); 2424 l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( 2425 left_data1, base_y01234567, left_data_base, y_iters); 2426 #else 2427 const uint16x8x2_t l01 = 2428 highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters); 2429 l0 = l01.val[0]; 2430 l1 = l01.val[1]; 2431 #endif 2432 2433 const uint16x8_t out_y = 2434 highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567); 2435 2436 // Calculate X component from `above`. 2437 uint16x8_t a0, a1; 2438 int16x8_t shift_x01234567; 2439 if (upsample_above) { 2440 const uint16x8x2_t a01 = 2441 vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); 2442 a0 = a01.val[0]; 2443 a1 = a01.val[1]; 2444 shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); 2445 } else { 2446 a0 = vld1q_u16(above - 1); 2447 a1 = vld1q_u16(above + 0); 2448 shift_x01234567 = 2449 vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); 2450 } 2451 const uint16x8_t out_x = 2452 highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); 2453 2454 // Combine X and Y vectors. 2455 const uint16x8_t out = 2456 highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift); 2457 vst1q_u16(dst, out); 2458 } else { 2459 const int16x8_t y01234567 = 2460 vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); 2461 const int16x8_t base_y01234567 = 2462 vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); 2463 const int16x8_t shift_y01234567 = 2464 vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), 2465 vdupq_n_s16(0x3F)), 2466 1); 2467 2468 uint16x8_t l0, l1; 2469 #if AOM_ARCH_AARCH64 2470 const int left_data_base = upsample_left ? 
-2 : -1; 2471 l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( 2472 left_data0, base_y01234567, left_data_base, 8); 2473 l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( 2474 left_data1, base_y01234567, left_data_base, 8); 2475 #else 2476 const uint16x8x2_t l01 = 2477 highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8); 2478 l0 = l01.val[0]; 2479 l1 = l01.val[1]; 2480 #endif 2481 2482 vst1q_u16( 2483 dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567)); 2484 } 2485 dst += stride; 2486 } 2487 } 2488 2489 static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride, 2490 const uint16_t *above, 2491 const uint16_t *left, 2492 int upsample_above, 2493 int upsample_left, int dx, int dy, 2494 int bd) { 2495 (void)bd; 2496 assert(dx > 0); 2497 assert(dy > 0); 2498 2499 const int frac_bits_x = 6 - upsample_above; 2500 const int frac_bits_y = 6 - upsample_left; 2501 const int min_base_x = -(1 << (upsample_above + frac_bits_x)); 2502 2503 // if `upsample_left` then we need -2 through 14 inclusive from `left`. 2504 // else we only need -1 through 6 inclusive. 
2505 2506 #if AOM_ARCH_AARCH64 2507 uint16x8x2_t left_data0, left_data1; 2508 if (upsample_left) { 2509 left_data0 = vld1q_u16_x2(left - 2); 2510 left_data1 = vld1q_u16_x2(left - 1); 2511 } else { 2512 left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } }; 2513 left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } }; 2514 } 2515 #endif 2516 2517 const int16x8_t iota01234567 = vld1q_s16(iota1_s16); 2518 const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1); 2519 2520 for (int r = 0; r < 8; ++r) { 2521 const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; 2522 const int x0 = (r + 1) * dx; 2523 const int16x8_t x01234567 = 2524 vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0)); 2525 const int base_x0 = (-x0) >> frac_bits_x; 2526 if (base_shift <= 0) { 2527 uint16x8_t a0, a1; 2528 int16x8_t shift_x01234567; 2529 if (upsample_above) { 2530 const uint16x8x2_t a01 = vld2q_u16(above + base_x0); 2531 a0 = a01.val[0]; 2532 a1 = a01.val[1]; 2533 shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); 2534 } else { 2535 a0 = vld1q_u16(above + base_x0); 2536 a1 = vld1q_u16(above + base_x0 + 1); 2537 shift_x01234567 = 2538 vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); 2539 } 2540 vst1q_u16( 2541 dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567)); 2542 } else if (base_shift < 8) { 2543 // Calculate Y component from `left`. 2544 const int y_iters = base_shift; 2545 const int16x8_t y01234567 = 2546 vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); 2547 const int16x8_t base_y01234567 = 2548 vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); 2549 const int16x8_t shift_y01234567 = 2550 vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), 2551 vdupq_n_s16(0x3F)), 2552 1); 2553 2554 uint16x8_t l0, l1; 2555 #if AOM_ARCH_AARCH64 2556 const int left_data_base = upsample_left ? 
-2 : -1; 2557 l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( 2558 left_data0, base_y01234567, left_data_base, y_iters); 2559 l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( 2560 left_data1, base_y01234567, left_data_base, y_iters); 2561 #else 2562 const uint16x8x2_t l01 = 2563 highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters); 2564 l0 = l01.val[0]; 2565 l1 = l01.val[1]; 2566 #endif 2567 2568 const uint16x8_t out_y = 2569 highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567); 2570 2571 // Calculate X component from `above`. 2572 uint16x8_t a0, a1; 2573 int16x8_t shift_x01234567; 2574 if (upsample_above) { 2575 const uint16x8x2_t a01 = 2576 vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); 2577 a0 = a01.val[0]; 2578 a1 = a01.val[1]; 2579 shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); 2580 } else { 2581 a0 = vld1q_u16(above - 1); 2582 a1 = vld1q_u16(above + 0); 2583 shift_x01234567 = 2584 vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); 2585 } 2586 const uint16x8_t out_x = 2587 highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); 2588 2589 // Combine X and Y vectors. 2590 const uint16x8_t out = 2591 highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift); 2592 vst1q_u16(dst, out); 2593 } else { 2594 const int16x8_t y01234567 = 2595 vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); 2596 const int16x8_t base_y01234567 = 2597 vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); 2598 const int16x8_t shift_y01234567 = 2599 vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), 2600 vdupq_n_s16(0x3F)), 2601 1); 2602 2603 uint16x8_t l0, l1; 2604 #if AOM_ARCH_AARCH64 2605 const int left_data_base = upsample_left ? 
-2 : -1;
      // NOTE(review): this is the tail of the 8-wide zone-2 row loop whose
      // head lies above this excerpt; the fragment above completes the
      // `left_data_base` ternary started on the previous line. Code kept
      // byte-identical.
      l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
          left_data0, base_y01234567, left_data_base, 8);
      l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16(
          left_data1, base_y01234567, left_data_base, 8);
#else
      // Non-AArch64: gather the left-column samples instead of using TBL.
      const uint16x8x2_t l01 =
          highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8);
      l0 = l01.val[0];
      l1 = l01.val[1];
#endif

      // All 8 lanes come from `left`: interpolate and store the whole row.
      vst1q_u16(
          dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567));
    }
    dst += stride;
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Shape-specialised zone-2 kernels, indexed as [get_msb(bw)][get_msb(bh)]
// (see av1_highbd_dr_prediction_z2_neon below). NULL entries are block
// shapes this build configuration never dispatches.
static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
    &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL,
    NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
    &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon,
    &highbd_dr_prediction_z2_8x32_neon, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon,
    &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon,
    &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon },
  { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon,
    &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon,
    &highbd_dr_prediction_z2_32x64_neon },
  { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon,
    &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon },
};
#else
// Realtime-only builds support a reduced set of block shapes, so fewer
// specialised kernels are required.
static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = {
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon,
    &highbd_dr_prediction_z2_4x8_neon, NULL, NULL, NULL },
  { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon,
    &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon, NULL,
    NULL },
  { NULL, NULL, NULL, &highbd_dr_prediction_z2_16x8_neon,
    &highbd_dr_prediction_z2_16x16_neon, &highbd_dr_prediction_z2_16x32_neon,
    NULL },
  { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_32x32_neon,
    &highbd_dr_prediction_z2_32x64_neon },
  { NULL, NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x32_neon,
    &highbd_dr_prediction_z2_64x64_neon },
};
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// Directional prediction, zone 2: 90 < angle < 180
//
// Dispatches to the bw-by-bh specialised kernel; the table lookup is keyed
// on log2 of each dimension via get_msb(), so bw/bh must be powers of two
// with a non-NULL entry in dr_predictor_z2_arr_neon (asserted below).
void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_above,
                                      int upsample_left, int dx, int dy,
                                      int bd) {
  highbd_dr_prediction_z2_ptr f =
      dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)];
  assert(f != NULL);
  f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd);
}

// -----------------------------------------------------------------------------
// Z3

// Both the lane to use and the shift amount must be immediates.
// Interpolates 4 output pixels: (in0 * s0[lane] + in1 * s1[lane]) with a
// rounding narrowing shift, then replaces any lane whose source index
// (iota + base) reached max_base_y with the replicated last left pixel.
// `max_base_y` and `left_max` are picked up from the caller's scope.
// Both the lane to use and the shift amount must be immediates.
#define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \
                                       lane, shift)                       \
  do {                                                                    \
    uint32x4_t val = vmull_lane_u16((in0), (s0), (lane));                 \
    val = vmlal_lane_u16(val, (in1), (s1), (lane));                       \
    const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base));            \
    const uint16x4_t res = vrshrn_n_u32(val, (shift));                    \
    *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res,         \
                      vdup_n_u16(left_max));                              \
  } while (0)

// 8-wide variant of the step above. Unlike the x4 variant it performs no
// max_base_y clamp itself; NOTE(review): callers appear to rely on clamped
// loads (z3_load_left_neon) or on the layout of the `left` buffer to keep
// lanes in range -- confirm against the callers below.
#define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1,  \
                                       lane, shift)                        \
  do {                                                                     \
    uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane));   \
    val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane));      \
    uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane));  \
    val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane));     \
    *(out) = vcombine_u16(vrshrn_n_u32(val_lo, (shift)),                   \
                          vrshrn_n_u32(val_hi, (shift)));                  \
  } while (0)

// Returns two overlapping 8-lane vectors, left0[ofs..ofs+7] and
// left0[ofs+1..ofs+8]. When a straight load would read at or past max_ofs,
// the data is instead re-read from the window ending at max_ofs and
// shuffled with zn_load_masked_neon. NOTE(review): zn_load_masked_neon is
// defined earlier in this file; presumably it replicates the last in-range
// pixel into the out-of-range lanes -- verify against its definition.
static inline uint16x8x2_t z3_load_left_neon(const uint16_t *left0, int ofs,
                                             int max_ofs) {
  uint16x8_t r0;
  uint16x8_t r1;
  if (ofs + 7 >= max_ofs) {
    int shuffle_idx = max_ofs - ofs;
    r0 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
  } else {
    r0 = vld1q_u16(left0 + ofs);
  }
  if (ofs + 8 >= max_ofs) {
    int shuffle_idx = max_ofs - ofs - 1;
    r1 = zn_load_masked_neon(left0 + (max_ofs - 7), shuffle_idx);
  } else {
    r1 = vld1q_u16(left0 + ofs + 1);
  }
  return (uint16x8x2_t){ { r0, r1 } };
}

// Zone-3 predictor without left-column upsampling. Each output pixel is an
// interpolation between left[base] and left[base + 1], where base advances
// by dy/64 per destination row (frac_bits == 6). Columns are produced 4 at
// a time -- fully unrolled so the lane-indexed multiplies get immediate lane
// numbers -- then transposed from column-major to row-major before storing.
static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst,
                                                   ptrdiff_t stride, int bw,
                                                   int bh, const uint16_t *left,
                                                   int dy) {
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dy > 0);

  // Factor out left + 1 to give the compiler a better chance of recognising
  // that the offsets used for the loads from left and left + 1 are otherwise
  // identical.
  const uint16_t *left1 = left + 1;

  const int max_base_y = (bw + bh - 1);
  const int left_max = left[max_base_y];
  const int frac_bits = 6;

  const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16));
  const uint16x4_t iota1x4 = vget_low_u16(iota1x8);

  // The C implementation of the z3 predictor when not upsampling uses:
  // ((y & 0x3f) >> 1)
  // The right shift is unnecessary here since we instead shift by +1 later,
  // so adjust the mask to 0x3e to ensure we don't consider the extra bit.
  const uint16x4_t shift_mask = vdup_n_u16(0x3e);

  if (bh == 4) {
    int y = dy;
    int c = 0;
    do {
      // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
      // multiply instructions.
      const uint16x4_t shifts1 =
          vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
      const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
      const int base0 = (y + 0 * dy) >> frac_bits;
      const int base1 = (y + 1 * dy) >> frac_bits;
      const int base2 = (y + 2 * dy) >> frac_bits;
      const int base3 = (y + 3 * dy) >> frac_bits;
      uint16x4_t out[4];
      if (base0 >= max_base_y) {
        // Entire column is past the end of `left`: fill with the last pixel.
        out[0] = vdup_n_u16(left_max);
      } else {
        const uint16x4_t l00 = vld1_u16(left + base0);
        const uint16x4_t l01 = vld1_u16(left1 + base0);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01,
                                       shifts0, shifts1, 0, 6);
      }
      if (base1 >= max_base_y) {
        out[1] = vdup_n_u16(left_max);
      } else {
        const uint16x4_t l10 = vld1_u16(left + base1);
        const uint16x4_t l11 = vld1_u16(left1 + base1);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11,
                                       shifts0, shifts1, 1, 6);
      }
      if (base2 >= max_base_y) {
        out[2] = vdup_n_u16(left_max);
      } else {
        const uint16x4_t l20 = vld1_u16(left + base2);
        const uint16x4_t l21 = vld1_u16(left1 + base2);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21,
                                       shifts0, shifts1, 2, 6);
      }
      if (base3 >= max_base_y) {
        out[3] = vdup_n_u16(left_max);
      } else {
        const uint16x4_t l30 = vld1_u16(left + base3);
        const uint16x4_t l31 = vld1_u16(left1 + base3);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31,
                                       shifts0, shifts1, 3, 6);
      }
      // Columns were computed into rows of out[]; transpose to raster order.
      transpose_array_inplace_u16_4x4(out);
      for (int r2 = 0; r2 < 4; ++r2) {
        vst1_u16(dst + r2 * stride + c, out[r2]);
      }
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  } else {
    int y = dy;
    int c = 0;
    do {
      int r = 0;
      do {
        // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
        // multiply instructions.
        const uint16x4_t shifts1 =
            vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
        const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1);
        const int base0 = ((y + 0 * dy) >> frac_bits) + r;
        const int base1 = ((y + 1 * dy) >> frac_bits) + r;
        const int base2 = ((y + 2 * dy) >> frac_bits) + r;
        const int base3 = ((y + 3 * dy) >> frac_bits) + r;
        uint16x8_t out[4];
        if (base0 >= max_base_y) {
          out[0] = vdupq_n_u16(left_max);
        } else {
          // z3_load_left_neon clamps the tail lanes, so the unclamped x8
          // step macro is safe here.
          const uint16x8x2_t l0 = z3_load_left_neon(left, base0, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l0.val[0],
                                         l0.val[1], shifts0, shifts1, 0, 6);
        }
        if (base1 >= max_base_y) {
          out[1] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l1 = z3_load_left_neon(left, base1, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l1.val[0],
                                         l1.val[1], shifts0, shifts1, 1, 6);
        }
        if (base2 >= max_base_y) {
          out[2] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l2 = z3_load_left_neon(left, base2, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l2.val[0],
                                         l2.val[1], shifts0, shifts1, 2, 6);
        }
        if (base3 >= max_base_y) {
          out[3] = vdupq_n_u16(left_max);
        } else {
          const uint16x8x2_t l3 = z3_load_left_neon(left, base3, max_base_y);
          HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l3.val[0],
                                         l3.val[1], shifts0, shifts1, 3, 6);
        }
        // Transpose the 4x8 tile, then store it as two 4x4 raster halves.
        transpose_array_inplace_u16_4x8(out);
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
        }
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
        }
        r += 8;
      } while (r < bh);
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  }
}

// Zone-3 predictor with a 2x upsampled left column (frac_bits == 5, so base
// advances by dy/32 in upsampled coordinates). vld2 de-interleaves the
// upsampled array so val[0]/val[1] hold the two pixels each output blends.
static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst,
                                                   ptrdiff_t stride, int bw,
                                                   int bh, const uint16_t *left,
                                                   int dy) {
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dy > 0);

  const int max_base_y = (bw + bh - 1) << 1;
  const int left_max = left[max_base_y];
  const int frac_bits = 5;

  const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16));
  const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16));
  const uint16x4_t iota2x4 = vget_low_u16(iota2x8);

  // The C implementation of the z3 predictor when upsampling uses:
  // (((x << 1) & 0x3f) >> 1)
  // The two shifts are unnecessary here since the lowest bit is guaranteed to
  // be zero when the mask is applied, so adjust the mask to 0x1f to avoid
  // needing the shifts at all.
  const uint16x4_t shift_mask = vdup_n_u16(0x1F);

  if (bh == 4) {
    int y = dy;
    int c = 0;
    do {
      // Fully unroll the 4x4 block to allow us to use immediate lane-indexed
      // multiply instructions.
      const uint16x4_t shifts1 =
          vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
      const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
      const int base0 = (y + 0 * dy) >> frac_bits;
      const int base1 = (y + 1 * dy) >> frac_bits;
      const int base2 = (y + 2 * dy) >> frac_bits;
      const int base3 = (y + 3 * dy) >> frac_bits;
      const uint16x4x2_t l0 = vld2_u16(left + base0);
      const uint16x4x2_t l1 = vld2_u16(left + base1);
      const uint16x4x2_t l2 = vld2_u16(left + base2);
      const uint16x4x2_t l3 = vld2_u16(left + base3);
      uint16x4_t out[4];
      // The x4 step clamps against max_base_y internally (iota2 strides by 2
      // to match the upsampled index space).
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0],
                                     l0.val[1], shifts0, shifts1, 0, 5);
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0],
                                     l1.val[1], shifts0, shifts1, 1, 5);
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0],
                                     l2.val[1], shifts0, shifts1, 2, 5);
      HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0],
                                     l3.val[1], shifts0, shifts1, 3, 5);
      transpose_array_inplace_u16_4x4(out);
      for (int r2 = 0; r2 < 4; ++r2) {
        vst1_u16(dst + r2 * stride + c, out[r2]);
      }
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  } else {
    assert(bh % 8 == 0);

    int y = dy;
    int c = 0;
    do {
      int r = 0;
      do {
        // Fully unroll the 4x8 block to allow us to use immediate lane-indexed
        // multiply instructions.
        const uint16x4_t shifts1 =
            vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask);
        const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1);
        // r is in output rows; multiply by 2 for the upsampled index space.
        const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2);
        const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2);
        const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2);
        const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2);
        const uint16x8x2_t l0 = vld2q_u16(left + base0);
        const uint16x8x2_t l1 = vld2q_u16(left + base1);
        const uint16x8x2_t l2 = vld2q_u16(left + base2);
        const uint16x8x2_t l3 = vld2q_u16(left + base3);
        uint16x8_t out[4];
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0],
                                       l0.val[1], shifts0, shifts1, 0, 5);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0],
                                       l1.val[1], shifts0, shifts1, 1, 5);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0],
                                       l2.val[1], shifts0, shifts1, 2, 5);
        HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0],
                                       l3.val[1], shifts0, shifts1, 3, 5);
        transpose_array_inplace_u16_4x8(out);
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2]));
        }
        for (int r2 = 0; r2 < 4; ++r2) {
          vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2]));
        }
        r += 8;
      } while (r < bh);
      y += 4 * dy;
      c += 4;
    } while (c < bw);
  }
}

// Directional prediction, zone 3: 180 < angle < 270
//
// Dispatches on upsample_left; `above`, `dx` and `bd` are unused by the
// NEON implementation ((void)dx keeps NDEBUG builds warning-free since dx
// only appears in an assert).
void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_left,
                                      int dx, int dy, int bd) {
  (void)above;
  (void)dx;
  (void)bd;
  assert(bw % 4 == 0);
  assert(bh % 4 == 0);
  assert(dx == 1);
  assert(dy > 0);

  if (upsample_left) {
    highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy);
  } else {
    highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy);
  }
}

#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4
#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8