intrapred_neon.c (134752B)
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/reinterpret_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_dsp/intrapred_common.h"

//------------------------------------------------------------------------------
// DC 4x4

// Sum the 4 bytes at `in`. The pairwise adds leave the total in lane 0 of
// the returned vector and the upper half is zeroed, so the result can be
// added directly to another partial sum.
static inline uint16x8_t dc_load_sum_4(const uint8_t *in) {
  const uint8x8_t a = load_u8_4x1(in);
  const uint16x4_t p0 = vpaddl_u8(a);
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  return vcombine_u16(p1, vdup_n_u16(0));
}

// Store the low 4 bytes of `dc` to each of `h` rows of a 4-wide block.
static inline void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x8_t dc) {
  for (int i = 0; i < h; ++i) {
    store_u8_4x1(dst + i * stride, dc);
  }
}

// DC: fill the 4x4 block with (sum(above[0..3]) + sum(left[0..3]) + 4) >> 3.
void aom_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_4(above);
  const uint16x8_t sum_left = dc_load_sum_4(left);
  const uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 3);
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

// DC-left: rounded average of the 4 left samples only.
void aom_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_4(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 2);
  (void)above;
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

// DC-top: rounded average of the 4 above samples only.
void aom_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_4(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 2);
  (void)left;
  dc_store_4xh(dst, stride, 4, vdup_lane_u8(dc0, 0));
}

// DC-128: no neighbors available, fill with the mid-grey value 128.
void aom_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t dc0 = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_4xh(dst, stride, 4, dc0);
}

//------------------------------------------------------------------------------
// DC 8x8

// Sum the 8 bytes at `in`; total in lane 0, upper half zeroed.
static inline uint16x8_t dc_load_sum_8(const uint8_t *in) {
  // This isn't used in the case where we want to load both above and left
  // vectors, since we want to avoid performing the reduction twice.
  const uint8x8_t a = vld1_u8(in);
  const uint16x4_t p0 = vpaddl_u8(a);
  const uint16x4_t p1 = vpadd_u16(p0, p0);
  const uint16x4_t p2 = vpadd_u16(p1, p1);
  return vcombine_u16(p2, vdup_n_u16(0));
}

// Reduce the 8 lanes of `a` to a single sum replicated across all lanes.
static inline uint16x8_t horizontal_add_and_broadcast_u16x8(uint16x8_t a) {
#if AOM_ARCH_AARCH64
  // On AArch64 we could also use vdupq_n_u16(vaddvq_u16(a)) here to save an
  // instruction, however the addv instruction is usually slightly more
  // expensive than a pairwise addition, so the need for immediately
  // broadcasting the result again seems to negate any benefit.
  const uint16x8_t b = vpaddq_u16(a, a);
  const uint16x8_t c = vpaddq_u16(b, b);
  return vpaddq_u16(c, c);
#else
  const uint16x4_t b = vadd_u16(vget_low_u16(a), vget_high_u16(a));
  const uint16x4_t c = vpadd_u16(b, b);
  const uint16x4_t d = vpadd_u16(c, c);
  return vcombine_u16(d, d);
#endif
}

// Store `dc` to each of `h` rows of an 8-wide block.
static inline void dc_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x8_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1_u8(dst + i * stride, dc);
  }
}

// DC: (sum(above[0..7]) + sum(left[0..7]) + 8) >> 4 broadcast to 8x8.
void aom_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t sum_top = vld1_u8(above);
  const uint8x8_t sum_left = vld1_u8(left);
  uint16x8_t sum = vaddl_u8(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 4);
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

// DC-left: rounded average of the 8 left samples only.
void aom_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_8(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 3);
  (void)above;
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

// DC-top: rounded average of the 8 above samples only.
void aom_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_8(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 3);
  (void)left;
  dc_store_8xh(dst, stride, 8, vdup_lane_u8(dc0, 0));
}

// DC-128: fill with mid-grey.
void aom_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  const uint8x8_t dc0 = vdup_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_8xh(dst, stride, 8, dc0);
}

//------------------------------------------------------------------------------
// DC 16x16

// Pairwise-sum 16 bytes down to 8 uint16 partial sums.
static inline uint16x8_t dc_load_partial_sum_16(const uint8_t *in) {
  const uint8x16_t a = vld1q_u8(in);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vpaddlq_u8(a);
}

// Full reduction of 16 bytes: sum broadcast across all lanes.
static inline uint16x8_t dc_load_sum_16(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_16(in));
}

// Store `dc` to each of `h` rows of a 16-wide block.
static inline void dc_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
  }
}

// DC: (sum(above[0..15]) + sum(left[0..15]) + 16) >> 5 broadcast to 16x16.
void aom_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_16(above);
  const uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 5);
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

// DC-left: rounded average of the 16 left samples only.
void aom_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_16(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 4);
  (void)above;
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

// DC-top: rounded average of the 16 above samples only.
void aom_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_16(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 4);
  (void)left;
  dc_store_16xh(dst, stride, 16, vdupq_lane_u8(dc0, 0));
}

// DC-128: fill with mid-grey.
void aom_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_16xh(dst, stride, 16, dc0);
}

//------------------------------------------------------------------------------
// DC 32x32

// Pairwise-sum 32 bytes down to 8 uint16 partial sums.
static inline uint16x8_t dc_load_partial_sum_32(const uint8_t *in) {
  const uint8x16_t a0 = vld1q_u8(in);
  const uint8x16_t a1 = vld1q_u8(in + 16);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vpadalq_u8(vpaddlq_u8(a0), a1);
}

// Full reduction of 32 bytes: sum broadcast across all lanes.
static inline uint16x8_t dc_load_sum_32(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_32(in));
}

// Store `dc` to each of `h` rows of a 32-wide block (two 16-byte stores).
static inline void dc_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
    vst1q_u8(dst + i * stride + 16, dc);
  }
}

// DC: (sum(above[0..31]) + sum(left[0..31]) + 32) >> 6 broadcast to 32x32.
void aom_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_32(above);
  const uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 6);
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

// DC-left: rounded average of the 32 left samples only.
void aom_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_32(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 5);
  (void)above;
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

// DC-top: rounded average of the 32 above samples only.
void aom_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_32(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 5);
  (void)left;
  dc_store_32xh(dst, stride, 32, vdupq_lane_u8(dc0, 0));
}

// DC-128: fill with mid-grey.
void aom_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_32xh(dst, stride, 32, dc0);
}

//------------------------------------------------------------------------------
// DC 64x64

// Pairwise-sum 64 bytes down to 8 uint16 partial sums.
static inline uint16x8_t dc_load_partial_sum_64(const uint8_t *in) {
  const uint8x16_t a0 = vld1q_u8(in);
  const uint8x16_t a1 = vld1q_u8(in + 16);
  const uint8x16_t a2 = vld1q_u8(in + 32);
  const uint8x16_t a3 = vld1q_u8(in + 48);
  const uint16x8_t p01 = vpadalq_u8(vpaddlq_u8(a0), a1);
  const uint16x8_t p23 = vpadalq_u8(vpaddlq_u8(a2), a3);
  // delay the remainder of the reduction until
  // horizontal_add_and_broadcast_u16x8, since we want to do it once rather
  // than twice in the case we are loading both above and left.
  return vaddq_u16(p01, p23);
}

// Full reduction of 64 bytes: sum broadcast across all lanes.
static inline uint16x8_t dc_load_sum_64(const uint8_t *in) {
  return horizontal_add_and_broadcast_u16x8(dc_load_partial_sum_64(in));
}

// Store `dc` to each of `h` rows of a 64-wide block (four 16-byte stores).
static inline void dc_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
                                 uint8x16_t dc) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, dc);
    vst1q_u8(dst + i * stride + 16, dc);
    vst1q_u8(dst + i * stride + 32, dc);
    vst1q_u8(dst + i * stride + 48, dc);
  }
}

// DC: (sum(above[0..63]) + sum(left[0..63]) + 64) >> 7 broadcast to 64x64.
void aom_dc_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_partial_sum_64(above);
  const uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum = vaddq_u16(sum_left, sum_top);
  sum = horizontal_add_and_broadcast_u16x8(sum);
  const uint8x8_t dc0 = vrshrn_n_u16(sum, 7);
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

// DC-left: rounded average of the 64 left samples only.
void aom_dc_left_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  const uint16x8_t sum_left = dc_load_sum_64(left);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_left, 6);
  (void)above;
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

// DC-top: rounded average of the 64 above samples only.
void aom_dc_top_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint16x8_t sum_top = dc_load_sum_64(above);
  const uint8x8_t dc0 = vrshrn_n_u16(sum_top, 6);
  (void)left;
  dc_store_64xh(dst, stride, 64, vdupq_lane_u8(dc0, 0));
}

// DC-128: fill with mid-grey.
void aom_dc_128_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const uint8x16_t dc0 = vdupq_n_u8(0x80);
  (void)above;
  (void)left;
  dc_store_64xh(dst, stride, 64, dc0);
}

//------------------------------------------------------------------------------
// DC rectangular cases

// Q16 fixed-point reciprocals used to divide by a non-power-of-two neighbor
// count: 0x5556 ~= 2^16 / 3 (for 1:2 aspect blocks, w + h = 3 * min(w, h)),
// 0x3334 ~= 2^16 / 5 (for 1:4 aspect blocks, w + h = 5 * min(w, h)).
#define DC_MULTIPLIER_1X2 0x5556
#define DC_MULTIPLIER_1X4 0x3334

#define DC_SHIFT2 16

// Approximate num / (3 << shift1) or num / (5 << shift1) as
// (num >> shift1) * multiplier >> shift2, avoiding an integer division.
static inline int divide_using_multiply_shift(int num, int shift1,
                                              int multiplier, int shift2) {
  const int interm = num >> shift1;
  return interm * multiplier >> shift2;
}

// Rounded division of the neighbor sum by (bw + bh): shift1 must be
// log2(min(bw, bh)) and multiplier the matching DC_MULTIPLIER_* constant.
static inline int calculate_dc_from_sum(int bw, int bh, uint32_t sum,
                                        int shift1, int multiplier) {
  const int expected_dc = divide_using_multiply_shift(
      sum + ((bw + bh) >> 1), shift1, multiplier, DC_SHIFT2);
  assert(expected_dc < (1 << 8));
  return expected_dc;
}

#undef DC_SHIFT2

void aom_dc_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  // The 4 above samples sit in the low lanes of `a`; the 8-lane widening add
  // relies on load_u8_4x1 leaving the unused upper lanes zero.
  uint8x8_t a = load_u8_4x1(above);
  uint8x8_t l = vld1_u8(left);
  uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
  uint32_t dc = calculate_dc_from_sum(4, 8, sum, 2, DC_MULTIPLIER_1X2);
  dc_store_4xh(dst, stride, 8, vdup_n_u8(dc));
}

void aom_dc_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint8x8_t l = load_u8_4x1(left);
  uint32_t sum = horizontal_add_u16x8(vaddl_u8(a, l));
  uint32_t dc = calculate_dc_from_sum(8, 4, sum, 2, DC_MULTIPLIER_1X2);
  dc_store_8xh(dst, stride, 4, vdup_n_u8(dc));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = load_u8_4x1(above);
  uint8x16_t l = vld1q_u8(left);
  // Pairwise-sum the 16 left samples, then widen-accumulate the 4 above ones.
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(4, 16, sum, 2, DC_MULTIPLIER_1X4);
  dc_store_4xh(dst, stride, 16, vdup_n_u8(dc));
}

void aom_dc_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x16_t a = vld1q_u8(above);
  uint8x8_t l = load_u8_4x1(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 4, sum, 2, DC_MULTIPLIER_1X4);
  dc_store_16xh(dst, stride, 4, vdupq_n_u8(dc));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint8x16_t l = vld1q_u8(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(l), a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(8, 16, sum, 3, DC_MULTIPLIER_1X2);
  dc_store_8xh(dst, stride, 16, vdup_n_u8(dc));
}

void aom_dc_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x16_t a = vld1q_u8(above);
  uint8x8_t l = vld1_u8(left);
  uint16x8_t sum_al = vaddw_u8(vpaddlq_u8(a), l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 8, sum, 3, DC_MULTIPLIER_1X2);
  dc_store_16xh(dst, stride, 8, vdupq_n_u8(dc));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint8x8_t a = vld1_u8(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddw_u8(sum_left, a);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(8, 32, sum, 3, DC_MULTIPLIER_1X4);
  dc_store_8xh(dst, stride, 32, vdup_n_u8(dc));
}

void aom_dc_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_top = dc_load_partial_sum_32(above);
  uint8x8_t l = vld1_u8(left);
  uint16x8_t sum_al = vaddw_u8(sum_top, l);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 8, sum, 3, DC_MULTIPLIER_1X4);
  dc_store_32xh(dst, stride, 8, vdupq_n_u8(dc));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_16(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 32, sum, 4, DC_MULTIPLIER_1X2);
  dc_store_16xh(dst, stride, 32, vdupq_n_u8(dc));
}

void aom_dc_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_32(above);
  uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 16, sum, 4, DC_MULTIPLIER_1X2);
  dc_store_32xh(dst, stride, 16, vdupq_n_u8(dc));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_16(above);
  uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum_al = vaddq_u16(sum_left, sum_above);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(16, 64, sum, 4, DC_MULTIPLIER_1X4);
  dc_store_16xh(dst, stride, 64, vdupq_n_u8(dc));
}

void aom_dc_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_64(above);
  uint16x8_t sum_left = dc_load_partial_sum_16(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(64, 16, sum, 4, DC_MULTIPLIER_1X4);
  dc_store_64xh(dst, stride, 16, vdupq_n_u8(dc));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_32(above);
  uint16x8_t sum_left = dc_load_partial_sum_64(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(32, 64, sum, 5, DC_MULTIPLIER_1X2);
  dc_store_32xh(dst, stride, 64, vdupq_n_u8(dc));
}

void aom_dc_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  uint16x8_t sum_above = dc_load_partial_sum_64(above);
  uint16x8_t sum_left = dc_load_partial_sum_32(left);
  uint16x8_t sum_al = vaddq_u16(sum_above, sum_left);
  uint32_t sum = horizontal_add_u16x8(sum_al);
  uint32_t dc = calculate_dc_from_sum(64, 32, sum, 5, DC_MULTIPLIER_1X2);
  dc_store_64xh(dst, stride, 32, vdupq_n_u8(dc));
}

#undef DC_MULTIPLIER_1X2
#undef DC_MULTIPLIER_1X4

// Stamp out a DC-128 predictor for a w x h block; `q` selects the 128-bit
// vdupq variant for widths >= 16.
#define DC_PREDICTOR_128(w, h, q)                                            \
  void aom_dc_128_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
                                             const uint8_t *above,           \
                                             const uint8_t *left) {          \
    (void)above;                                                             \
    (void)left;                                                              \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_n_u8(0x80));                \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_128(4, 16, )
DC_PREDICTOR_128(8, 32, )
DC_PREDICTOR_128(16, 4, q)
DC_PREDICTOR_128(16, 64, q)
DC_PREDICTOR_128(32, 8, q)
DC_PREDICTOR_128(64, 16, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_128(4, 8, )
DC_PREDICTOR_128(8, 4, )
DC_PREDICTOR_128(8, 16, )
DC_PREDICTOR_128(16, 8, q)
DC_PREDICTOR_128(16, 32, q)
DC_PREDICTOR_128(32, 16, q)
DC_PREDICTOR_128(32, 64, q)
DC_PREDICTOR_128(64, 32, q)

#undef DC_PREDICTOR_128

// Stamp out a DC-left predictor for a w x h block; `shift` must be log2(h)
// since only the h left samples are averaged.
#define DC_PREDICTOR_LEFT(w, h, shift, q)                                     \
  void aom_dc_left_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
                                              const uint8_t *above,           \
                                              const uint8_t *left) {          \
    (void)above;                                                              \
    const uint16x8_t sum = dc_load_sum_##h(left);                             \
    const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                         \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));            \
  }

DC_PREDICTOR_LEFT(4, 8, 3, )
DC_PREDICTOR_LEFT(8, 4, 2, )
DC_PREDICTOR_LEFT(8, 16, 4, )
DC_PREDICTOR_LEFT(16, 8, 3, q)
DC_PREDICTOR_LEFT(16, 32, 5, q)
DC_PREDICTOR_LEFT(32, 16, 4, q)
DC_PREDICTOR_LEFT(32, 64, 6, q)
DC_PREDICTOR_LEFT(64, 32, 5, q)
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_LEFT(4, 16, 4, )
DC_PREDICTOR_LEFT(16, 4, 2, q)
DC_PREDICTOR_LEFT(8, 32, 5, )
DC_PREDICTOR_LEFT(32, 8, 3, q)
DC_PREDICTOR_LEFT(16, 64, 6, q)
DC_PREDICTOR_LEFT(64, 16, 4, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef DC_PREDICTOR_LEFT

// Stamp out a DC-top predictor for a w x h block; `shift` must be log2(w)
// since only the w above samples are averaged.
#define DC_PREDICTOR_TOP(w, h, shift, q)                                     \
  void aom_dc_top_predictor_##w##x##h##_neon(uint8_t *dst, ptrdiff_t stride, \
                                             const uint8_t *above,           \
                                             const uint8_t *left) {          \
    (void)left;                                                              \
    const uint16x8_t sum = dc_load_sum_##w(above);                           \
    const uint8x8_t dc0 = vrshrn_n_u16(sum, (shift));                        \
    dc_store_##w##xh(dst, stride, (h), vdup##q##_lane_u8(dc0, 0));           \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_TOP(8, 32, 3, )
DC_PREDICTOR_TOP(4, 16, 2, )
DC_PREDICTOR_TOP(16, 4, 4, q)
DC_PREDICTOR_TOP(16, 64, 4, q)
DC_PREDICTOR_TOP(32, 8, 5, q)
DC_PREDICTOR_TOP(64, 16, 6, q)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
DC_PREDICTOR_TOP(4, 8, 2, )
DC_PREDICTOR_TOP(8, 4, 3, )
DC_PREDICTOR_TOP(8, 16, 3, )
DC_PREDICTOR_TOP(16, 8, 4, q)
DC_PREDICTOR_TOP(16, 32, 4, q)
DC_PREDICTOR_TOP(32, 16, 5, q)
DC_PREDICTOR_TOP(32, 64, 5, q)
DC_PREDICTOR_TOP(64, 32, 6, q)

#undef DC_PREDICTOR_TOP

// -----------------------------------------------------------------------------
// Vertical prediction: every row of the block is a copy of the `above` row.

// Replicate the low 4 bytes of `d0` down h rows.
static inline void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h,
                               uint8x8_t d0) {
  for (int i = 0; i < h; ++i) {
    store_u8_4x1(dst + i * stride, d0);
  }
}

// Replicate the 8-byte row `d0` down h rows.
static inline void v_store_8xh(uint8_t *dst, ptrdiff_t stride, int h,
                               uint8x8_t d0) {
  for (int i = 0; i < h; ++i) {
    vst1_u8(dst + i * stride, d0);
  }
}

// Replicate the 16-byte row `d0` down h rows.
static inline void v_store_16xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + i * stride, d0);
  }
}

// Replicate the 32-byte row (d0 | d1) down h rows.
static inline void v_store_32xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0, uint8x16_t d1) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + 0, d0);
    vst1q_u8(dst + 16, d1);
    dst += stride;
  }
}

// Replicate the 64-byte row (d0 | d1 | d2 | d3) down h rows.
static inline void v_store_64xh(uint8_t *dst, ptrdiff_t stride, int h,
                                uint8x16_t d0, uint8x16_t d1, uint8x16_t d2,
                                uint8x16_t d3) {
  for (int i = 0; i < h; ++i) {
    vst1q_u8(dst + 0, d0);
    vst1q_u8(dst + 16, d1);
    vst1q_u8(dst + 32, d2);
    vst1q_u8(dst + 48, d3);
    dst += stride;
  }
}

void aom_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 4, load_u8_4x1(above));
}

void aom_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 8, vld1_u8(above));
}

void aom_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 16, vld1q_u8(above));
}

void aom_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 32, d0, d1);
}

void aom_v_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 8, load_u8_4x1(above));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_4xh(dst, stride, 16, load_u8_4x1(above));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 4, vld1_u8(above));
}

void aom_v_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 16, vld1_u8(above));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_8xh(dst, stride, 32, vld1_u8(above));
}

void aom_v_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 4, vld1q_u8(above));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 8, vld1q_u8(above));
}

void aom_v_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 32, vld1q_u8(above));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)left;
  v_store_16xh(dst, stride, 64, vld1q_u8(above));
}

void aom_v_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 8, d0, d1);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 16, d0, d1);
}

void aom_v_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  (void)left;
  v_store_32xh(dst, stride, 64, d0, d1);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  const uint8x16_t d2 = vld1q_u8(above + 32);
  const uint8x16_t d3 = vld1q_u8(above + 48);
  (void)left;
  v_store_64xh(dst, stride, 16, d0, d1, d2, d3);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  const uint8x16_t d2 = vld1q_u8(above + 32);
  const uint8x16_t d3 = vld1q_u8(above + 48);
  (void)left;
  v_store_64xh(dst, stride, 32, d0, d1, d2, d3);
}

void aom_v_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(above);
  const uint8x16_t d1 = vld1q_u8(above + 16);
  const uint8x16_t d2 = vld1q_u8(above + 32);
  const uint8x16_t d3 = vld1q_u8(above + 48);
  (void)left;
  v_store_64xh(dst, stride, 64, d0, d1, d2, d3);
}

// -----------------------------------------------------------------------------
// Horizontal prediction: row i of the block is filled with left[i], i.e.
// lane i of the source vector is broadcast across that row.

// Fill 8 rows of a 4-wide block from lanes 0..7 of `d0`.
static inline void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
  store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
  store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
  store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
  store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4));
  store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5));
  store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6));
  store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7));
}

// Fill 8 rows of an 8-wide block from lanes 0..7 of `d0`.
static inline void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
  vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
  vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
  vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
  vst1_u8(dst + 4 * stride, vdup_lane_u8(d0, 4));
  vst1_u8(dst + 5 * stride, vdup_lane_u8(d0, 5));
  vst1_u8(dst + 6 * stride, vdup_lane_u8(d0, 6));
  vst1_u8(dst + 7 * stride, vdup_lane_u8(d0, 7));
}

// Fill 8 rows of a 16-wide block from lanes 0..7 of `d0`.
static inline void h_store_16x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 4 * stride, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 5 * stride, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 6 * stride, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 7 * stride, vdupq_lane_u8(d0, 7));
}

// Fill 8 rows of a 32-wide block (two 16-byte stores per row) from lanes
// 0..7 of `d0`.
static inline void h_store_32x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
}

// Fill 8 rows of a 64-wide block (four 16-byte stores per row) from lanes
// 0..7 of `d0`.
static inline void h_store_64x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) {
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 0));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 1));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 2));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 3));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 3));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 4));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 4));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 5));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 5));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 6));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 6));
  dst += stride;
  vst1q_u8(dst + 0, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 16, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 32, vdupq_lane_u8(d0, 7));
  vst1q_u8(dst + 48, vdupq_lane_u8(d0, 7));
}

// H: each of the 4 rows is filled with the corresponding left sample.
void aom_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = load_u8_4x1(left);
  (void)above;
  store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0));
  store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1));
  store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2));
  store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3));
}

// Horizontal (H_PRED) predictors: every pixel of output row r is a copy of
// left[r]; the `above` row is unused. The h_store_WxH helpers broadcast one
// left pixel per row across the block width.

void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_8x8(dst, stride, d0);
}

void aom_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_16x8(dst, stride, vget_low_u8(d0));
  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

void aom_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  (void)above;
  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_32x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
}

void aom_h_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_4x8(dst, stride, d0);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_h_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_4x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_4x8(dst + 8 * stride, stride, vget_high_u8(d0));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_h_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                              const uint8_t *above, const uint8_t *left) {
  // Only four left pixels are needed; each one is broadcast across an
  // 8-wide row.
  const uint8x8_t d0 = load_u8_4x1(left);
  (void)above;
  vst1_u8(dst + 0 * stride, vdup_lane_u8(d0, 0));
  vst1_u8(dst + 1 * stride, vdup_lane_u8(d0, 1));
  vst1_u8(dst + 2 * stride, vdup_lane_u8(d0, 2));
  vst1_u8(dst + 3 * stride, vdup_lane_u8(d0, 3));
}

void aom_h_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_h_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  (void)above;
  h_store_8x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_8x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_8x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_8x8(dst + 24 * stride, stride, vget_high_u8(d1));
}

void aom_h_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = load_u8_4x1(left);
  (void)above;
  vst1q_u8(dst + 0 * stride, vdupq_lane_u8(d0, 0));
  vst1q_u8(dst + 1 * stride, vdupq_lane_u8(d0, 1));
  vst1q_u8(dst + 2 * stride, vdupq_lane_u8(d0, 2));
  vst1q_u8(dst + 3 * stride, vdupq_lane_u8(d0, 3));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_h_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_16x8(dst, stride, d0);
}

void aom_h_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  (void)above;
  h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_h_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  const uint8x16_t d2 = vld1q_u8(left + 32);
  const uint8x16_t d3 = vld1q_u8(left + 48);
  (void)above;
  h_store_16x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_16x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_16x8(dst + 16 * stride, stride, vget_low_u8(d1));
  h_store_16x8(dst + 24 * stride, stride, vget_high_u8(d1));
  h_store_16x8(dst + 32 * stride, stride, vget_low_u8(d2));
  h_store_16x8(dst + 40 * stride, stride, vget_high_u8(d2));
  h_store_16x8(dst + 48 * stride, stride, vget_low_u8(d3));
  h_store_16x8(dst + 56 * stride, stride, vget_high_u8(d3));
}

void aom_h_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left) {
  const uint8x8_t d0 = vld1_u8(left);
  (void)above;
  h_store_32x8(dst, stride, d0);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_h_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
}

void aom_h_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left + 0);
  const uint8x16_t d1 = vld1q_u8(left + 16);
  const uint8x16_t d2 = vld1q_u8(left + 32);
  const uint8x16_t d3 = vld1q_u8(left + 48);
  (void)above;
  h_store_32x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_32x8(dst + 8 * stride, stride, vget_high_u8(d0));
  h_store_32x8(dst + 16 *
stride, stride, vget_low_u8(d1));
  h_store_32x8(dst + 24 * stride, stride, vget_high_u8(d1));
  h_store_32x8(dst + 32 * stride, stride, vget_low_u8(d2));
  h_store_32x8(dst + 40 * stride, stride, vget_high_u8(d2));
  h_store_32x8(dst + 48 * stride, stride, vget_low_u8(d3));
  h_store_32x8(dst + 56 * stride, stride, vget_high_u8(d3));
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_h_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const uint8x16_t d0 = vld1q_u8(left);
  (void)above;
  h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
  h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_h_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  // Process 16 rows (one 16-byte load of `left`) per iteration.
  for (int i = 0; i < 2; ++i) {
    const uint8x16_t d0 = vld1q_u8(left);
    h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
    h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
    left += 16;
    dst += 16 * stride;
  }
}

void aom_h_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  // Process 16 rows (one 16-byte load of `left`) per iteration.
  for (int i = 0; i < 4; ++i) {
    const uint8x16_t d0 = vld1q_u8(left);
    h_store_64x8(dst + 0 * stride, stride, vget_low_u8(d0));
    h_store_64x8(dst + 8 * stride, stride, vget_high_u8(d0));
    left += 16;
    dst += 16 * stride;
  }
}

/* ---------------------P R E D I C T I O N Z 1--------------------------- */

// Low bit depth functions
// BaseMask[i] has its first i bytes set to 0xff and the remaining bytes set
// to 0. It is loaded below and used with VBSL to blend predicted pixels with
// the replicated top-right pixel once the sample position passes max_base_x.
static DECLARE_ALIGNED(32, const uint8_t, BaseMask[33][32]) = {
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
};

// Zone-1 kernel producing one 8-byte output vector per predicted row.
// H is the block width handled per vector (4 or 8), W the number of rows.
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon_64(
    int H, int W, uint8x8_t *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]);

  int x = dx;
  for (int r = 0; r < W; r++) {
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      // All remaining rows are past the edge: fill with the last above pixel.
      for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }

    if (base_max_diff > H) base_max_diff = H;

    uint8x8x2_t a01_128;
    uint16x8_t shift;
    if (upsample_above) {
      // LD2 de-interleaves the upsampled edge into even/odd samples.
      a01_128 = vld2_u8(above + base);
      shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1);
    } else {
      a01_128.val[0] = vld1_u8(above + base);
      a01_128.val[1] = vld1_u8(above + base + 1);
      shift = vdupq_n_u16((x & 0x3f) >> 1);
    }
    uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]);
    uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32));
    uint16x8_t res = vmlaq_u16(a32, diff, shift);

    // Blend interpolated pixels with the replicated edge pixel for lanes
    // past max_base_x.
    uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]);
    dst[r] = vbsl_u8(mask, vshrn_n_u16(res, 5), a_mbase_x);

    x += dx;
  }
}

static void dr_prediction_z1_4xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  uint8x8_t dstvec[16];

  dr_prediction_z1_HxW_internal_neon_64(4, N, dstvec, above, upsample_above,
                                        dx);
  // Store only the low 4 bytes of each 8-byte result vector.
  for (int i = 0; i < N; i++) {
    vst1_lane_u32((uint32_t *)(dst + stride * i),
                  vreinterpret_u32_u8(dstvec[i]), 0);
  }
}

static void dr_prediction_z1_8xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  uint8x8_t dstvec[32];

  dr_prediction_z1_HxW_internal_neon_64(8, N, dstvec, above, upsample_above,
                                        dx);
  for (int i = 0; i < N; i++) {
    vst1_u8(dst + stride * i, dstvec[i]);
  }
}

// As dr_prediction_z1_HxW_internal_neon_64 but producing one 16-byte vector
// per row (block width 16).
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_neon(
    int H, int W, uint8x16_t *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);

  int x = dx;
  for (int r = 0; r < W; r++) {
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }

    if (base_max_diff > H) base_max_diff = H;

    uint16x8_t shift;
    uint8x16_t a0_128, a1_128;
    if (upsample_above) {
      uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base);
      a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]);
      a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8);
      shift = vdupq_n_u16(x & 0x1f);
    } else {
      a0_128 = vld1q_u8(above + base);
      a1_128 = vld1q_u8(above + base + 1);
      shift = vdupq_n_u16((x & 0x3f) >> 1);
    }
    uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
    uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
    uint16x8_t a32_lo =
        vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
    uint16x8_t a32_hi =
        vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
    uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
    uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
    uint8x16_t v_temp =
        vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));

    uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]);
    dst[r] = vbslq_u8(mask, v_temp, a_mbase_x);

    x += dx;
  }
}

static void dr_prediction_z1_16xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
  uint8x16_t dstvec[64];

  dr_prediction_z1_HxW_internal_neon(16, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    vst1q_u8(dst + stride * i, dstvec[i]);
  }
}

// 32-wide zone-1 kernel; upsampling is never enabled at this width, so
// frac_bits is fixed at 6.
static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon(
    int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) {
  const int frac_bits = 6;
  const int max_base_x = ((32 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);

  int x = dx;
  for (int r = 0; r < N; r++) {
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base);
    if (base_max_diff <= 0) {
      for (int i = r; i < N; ++i) {
        dstvec[i].val[0] = a_mbase_x;  // save 32 values
        dstvec[i].val[1] = a_mbase_x;
      }
      return;
    }
    if (base_max_diff > 32) base_max_diff = 32;

    uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);

    // Two 16-byte halves per 32-wide row.
    uint8x16_t res16[2];
    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
      int mdiff = base_max_diff - j;
      if (mdiff <= 0) {
        res16[jj] = a_mbase_x;
      } else {
        uint8x16_t a0_128 = vld1q_u8(above + base + j);
        uint8x16_t a1_128 = vld1q_u8(above + base + j + 1);
        uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
        uint16x8_t diff_hi =
            vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
        uint16x8_t a32_lo =
            vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
        uint16x8_t a32_hi =
            vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
        uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
        uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);

        res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5));
      }
    }

    uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]);
    uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16);
    dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x);
    dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x);
    x += dx;
  }
}

static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int dx) {
  uint8x16x2_t dstvec[64];

  dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx);
  for (int i = 0; i < N; i++) {
    vst1q_u8(dst + stride * i, dstvec[i].val[0]);
    vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]);
  }
}

// Row i of this table shifts a 16-byte vector right by (15 - i) positions
// while replicating the final byte into the vacated lanes; used to clamp
// loads that would otherwise read past above[max_base_x].
// clang-format off
static const uint8_t kLoadMaxShuffles[] = {
  15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
  10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
   9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
   8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15, 15,
   7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15, 15,
   6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15, 15,
   5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15, 15,
   4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15, 15,
   3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15, 15,
   2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15, 15,
   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 15,
   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
};
// clang-format on

// Load 16 bytes from ptr, permuted through kLoadMaxShuffles[shuffle_idx] so
// that lanes beyond the valid range repeat the last valid byte.
static inline uint8x16_t z1_load_masked_neon(const uint8_t *ptr,
                                             int shuffle_idx) {
  uint8x16_t shuffle = vld1q_u8(&kLoadMaxShuffles[16 * shuffle_idx]);
  uint8x16_t src = vld1q_u8(ptr);
#if AOM_ARCH_AARCH64
  return vqtbl1q_u8(src, shuffle);
#else
  uint8x8x2_t src2 = { { vget_low_u8(src), vget_high_u8(src) } };
  uint8x8_t lo =
vtbl2_u8(src2, vget_low_u8(shuffle));
  uint8x8_t hi = vtbl2_u8(src2, vget_high_u8(shuffle));
  return vcombine_u8(lo, hi);
#endif
}

static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int dx) {
  const int frac_bits = 6;
  const int max_base_x = ((64 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

  const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]);

  int x = dx;
  for (int r = 0; r < N; r++, dst += stride) {
    int base = x >> frac_bits;
    if (base >= max_base_x) {
      // Remaining rows replicate the last above pixel across all 64 columns.
      for (int i = r; i < N; ++i) {
        vst1q_u8(dst, a_mbase_x);
        vst1q_u8(dst + 16, a_mbase_x);
        vst1q_u8(dst + 32, a_mbase_x);
        vst1q_u8(dst + 48, a_mbase_x);
        dst += stride;
      }
      return;
    }

    uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1);
    uint8x16_t base_inc128 =
        vaddq_u8(vdupq_n_u8(base), vcombine_u8(vcreate_u8(0x0706050403020100),
                                               vcreate_u8(0x0F0E0D0C0B0A0908)));

    for (int j = 0; j < 64; j += 16) {
      if (base + j >= max_base_x) {
        vst1q_u8(dst + j, a_mbase_x);
      } else {
        uint8x16_t a0_128;
        uint8x16_t a1_128;
        // Use a shuffled load near the edge so we never read past
        // above[max_base_x]; in-range lanes still get the right samples.
        if (base + j + 15 >= max_base_x) {
          int shuffle_idx = max_base_x - base - j;
          a0_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
        } else {
          a0_128 = vld1q_u8(above + base + j);
        }
        if (base + j + 16 >= max_base_x) {
          int shuffle_idx = max_base_x - base - j - 1;
          a1_128 = z1_load_masked_neon(above + (max_base_x - 15), shuffle_idx);
        } else {
          a1_128 = vld1q_u8(above + base + j + 1);
        }

        uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128));
        uint16x8_t diff_hi =
            vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128));
        uint16x8_t a32_lo =
            vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32));
        uint16x8_t a32_hi =
            vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32));
        uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift);
        uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift);
        vst1q_u8(dst + j,
                 vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)));

        base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16));
      }
    }
    x += dx;
  }
}

// Directional prediction, zone 1: 0 < angle < 90
void av1_dr_prediction_z1_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int dx, int dy) {
  (void)left;
  (void)dy;

  // Dispatch on block width; widths 32/64 never use edge upsampling.
  switch (bw) {
    case 4:
      dr_prediction_z1_4xN_neon(bh, dst, stride, above, upsample_above, dx);
      break;
    case 8:
      dr_prediction_z1_8xN_neon(bh, dst, stride, above, upsample_above, dx);
      break;
    case 16:
      dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx);
      break;
    case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break;
    case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break;
    default: break;
  }
}

/* ---------------------P R E D I C T I O N Z 2--------------------------- */

#if !AOM_ARCH_AARCH64
// LoadMaskz2[i] covers the first 4 * (i + 1) bytes with 0xff.
static DECLARE_ALIGNED(16, const uint8_t, LoadMaskz2[4][16]) = {
  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
    0, 0, 0 },
  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff }
};
#endif  // !AOM_ARCH_AARCH64

// Zone-2 helper: fetch the two above-row taps and the interpolation shifts
// for a 4-wide row.
static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon(
    const uint8_t *above, int upsample_above, int dx, int base_x, int y,
    uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) {
  uint16x4_t r6 = vcreate_u16(0x00C0008000400000);
  uint16x4_t ydx = vdup_n_u16(y * dx);
  if (upsample_above) {
    // Cannot use LD2 here since we only want to load eight bytes, but LD2 can
    // only load either 16 or 32.
    uint8x8_t v_tmp = vld1_u8(above + base_x);
    *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0];
    *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1];
    *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f));
  } else {
    *a0_x = load_unaligned_u8_4x1(above + base_x);
    *a1_x = load_unaligned_u8_4x1(above + base_x + 1);
    *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f));
  }
}

// Zone-2 helper: gather the two left-column taps and the interpolation shift
// for a 4-wide row. On AArch64 the caller passes the left edge pre-loaded in
// left_vals for TBL-based gathering; otherwise lane loads are used.
static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon(
#if AOM_ARCH_AARCH64
    uint8x16x2_t left_vals,
#else
    const uint8_t *left,
#endif
    int upsample_left, int dy, int r, int min_base_y, int frac_bits_y,
    uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) {
  int16x4_t dy64 = vdup_n_s16(dy);
  int16x4_t v_1234 = vcreate_s16(0x0004000300020001);
  int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
  int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
  int16x4_t v_r6 = vdup_n_s16(r << 6);
  int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
  int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);

  // Values in base_y_c64 range from -2 through 14 inclusive.
  base_y_c64 = vmax_s16(base_y_c64, min_base_y64);

#if AOM_ARCH_AARCH64
  uint8x8_t left_idx0 =
      vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2)));  // [0, 16]
  uint8x8_t left_idx1 =
      vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3)));  // [1, 17]

  *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0));
  *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1));
#else  // !AOM_ARCH_AARCH64
  DECLARE_ALIGNED(32, int16_t, base_y_c[4]);

  vst1_s16(base_y_c, base_y_c64);
  uint8x8_t a0_y_u8 = vdup_n_u8(0);
  a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0);
  a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2);
  a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4);
  a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6);

  base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1));
  vst1_s16(base_y_c, base_y_c64);
  uint8x8_t a1_y_u8 = vdup_n_u8(0);
  a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0);
  a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2);
  a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4);
  a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6);

  *a0_y = vreinterpret_u16_u8(a0_y_u8);
  *a1_y = vreinterpret_u16_u8(a1_y_u8);
#endif  // AOM_ARCH_AARCH64

  if (upsample_left) {
    *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f));
  } else {
    *shift1 =
        vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f));
  }
}

// Zone-2 helper: interpolate one 8-wide row from the above edge.
static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon(
    const uint8_t *above, int upsample_above, int dx, int base_x, int y) {
  uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
                                  vcreate_u16(0x0008000700060005));
  uint16x8_t ydx = vdupq_n_u16(y * dx);
  uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6);

  uint16x8_t shift0;
  uint8x8_t a0_x0;
uint8x8_t a1_x0; 1616 if (upsample_above) { 1617 uint8x8x2_t v_tmp = vld2_u8(above + base_x); 1618 a0_x0 = v_tmp.val[0]; 1619 a1_x0 = v_tmp.val[1]; 1620 shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f)); 1621 } else { 1622 a0_x0 = vld1_u8(above + base_x); 1623 a1_x0 = vld1_u8(above + base_x + 1); 1624 shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f)); 1625 } 1626 1627 uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x] 1628 uint16x8_t a32 = 1629 vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16 1630 uint16x8_t res = vmlaq_u16(a32, diff0, shift0); 1631 return vshrn_n_u16(res, 5); 1632 } 1633 1634 static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon( 1635 #if AOM_ARCH_AARCH64 1636 uint8x16x3_t left_vals, 1637 #else 1638 const uint8_t *left, 1639 #endif 1640 int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) { 1641 int16x8_t v_r6 = vdupq_n_s16(r << 6); 1642 int16x8_t dy128 = vdupq_n_s16(dy); 1643 int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y); 1644 int16x8_t min_base_y128 = vdupq_n_s16(min_base_y); 1645 1646 uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), 1647 vcreate_u16(0x0008000700060005)); 1648 int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128); 1649 int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y); 1650 1651 // Values in base_y_c128 range from -2 through 31 inclusive. 
1652 base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128); 1653 1654 #if AOM_ARCH_AARCH64 1655 uint8x16_t left_idx0 = 1656 vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33] 1657 uint8x16_t left_idx1 = 1658 vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34] 1659 uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); 1660 1661 uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01); 1662 uint8x8_t a0_x1 = vget_low_u8(a01_x); 1663 uint8x8_t a1_x1 = vget_high_u8(a01_x); 1664 #else // !AOM_ARCH_AARCH64 1665 uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128); 1666 uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128); 1667 #endif // AOM_ARCH_AARCH64 1668 1669 uint16x8_t shift1; 1670 if (upsample_left) { 1671 shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f)); 1672 } else { 1673 shift1 = vshrq_n_u16( 1674 vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1); 1675 } 1676 1677 uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1); 1678 uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32)); 1679 uint16x8_t res = vmlaq_u16(a32, diff1, shift1); 1680 return vshrn_n_u16(res, 5); 1681 } 1682 1683 static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon( 1684 const uint8_t *above, int dx, int base_x, int y, int j) { 1685 uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000), 1686 vcreate_u16(0x0007000600050004)), 1687 vcombine_u16(vcreate_u16(0x000B000A00090008), 1688 vcreate_u16(0x000F000E000D000C)) } }; 1689 uint16x8_t j256 = vdupq_n_u16(j); 1690 uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx)); 1691 1692 const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j); 1693 const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1); 1694 uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6); 1695 uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6); 1696 uint16x8_t shift0 = 1697 vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1); 
  uint16x8_t shift1 =
      vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1);
  // a[x+1] - a[x]
  uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128));
  uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128));
  // a[x] * 32 + 16
  uint16x8_t a32_0 =
      vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32));
  uint16x8_t a32_1 =
      vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32));
  uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0);
  uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1);
  return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
}

// Produces 16 left-derived pixels of row r (columns j..j+15) for the zone-2
// HxW kernel: each lane fetches left[base_y] and left[base_y + 1] and blends
// them with a 5-bit fractional weight.
static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon(
#if AOM_ARCH_AARCH64
    uint8x16x4_t left_vals0, uint8x16x4_t left_vals1,
#else
    const uint8_t *left,
#endif
    int dy, int r, int j) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_y = -1;

  int16x8_t min_base_y256 = vdupq_n_s16(min_base_y);
  int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1);
  int16x8_t dy256 = vdupq_n_s16(dy);
  uint16x8_t j256 = vdupq_n_u16(j);

  // Column indices 0..15, then 1..16 (the +1 accounts for y = r + 1 style
  // offsetting used by the zone-2 math).
  uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000),
                                        vcreate_u16(0x0007000600050004)),
                           vcombine_u16(vcreate_u16(0x000B000A00090008),
                                        vcreate_u16(0x000F000E000D000C)) } };
  uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)),
                           vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } };

  int16x8_t v_r6 = vdupq_n_s16(r << 6);

  int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0]));
  int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1]));
  // NOTE(review): the unsigned min against half_min_base_y (reinterpreted
  // from -1) bounds the c * dy product before it is subtracted from r << 6;
  // verify against the scalar reference for the exact wrap-around intent.
  int16x8_t mul16_lo = vreinterpretq_s16_u16(
      vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)),
                vreinterpretq_u16_s16(half_min_base_y256)));
  int16x8_t mul16_hi = vreinterpretq_s16_u16(
      vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)),
                vreinterpretq_u16_s16(half_min_base_y256)));
  int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo);
  int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi);

  int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6);
  int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6);

  // Clamp so no lane indexes before left[min_base_y].
  base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo);
  base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi);

#if !AOM_ARCH_AARCH64
  int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7);
  int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0);
  int16_t offset_diff = max_y - min_y;

  uint8x8_t a0_y0;
  uint8x8_t a0_y1;
  uint8x8_t a1_y0;
  uint8x8_t a1_y1;
  if (offset_diff < 16) {
    // Avoid gathers where the data we want is close together in memory.
    // We don't need this for AArch64 since we can already use TBL to cover the
    // full range of possible values.
    assert(offset_diff >= 0);
    int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3);

    // Re-base all indices onto min_y so they fit a two-register VTBL.
    int16x8x2_t base_y_offset;
    base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256);
    base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256);

    int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]),
                                             vqmovn_s16(base_y_offset.val[1]));

    uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]);
    uint8x16_t a0_y128 = vld1q_u8(left + min_y);
    uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1);
    a0_y128 = vandq_u8(a0_y128, v_loadmaskz2);
    a1_y128 = vandq_u8(a1_y128, v_loadmaskz2);

    uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128));
    uint8x8_t v_index_high =
        vget_high_u8(vreinterpretq_u8_s8(base_y_offset128));
    uint8x8x2_t v_tmp, v_res;
    v_tmp.val[0] = vget_low_u8(a0_y128);
    v_tmp.val[1] = vget_high_u8(a0_y128);
    v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
    v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
    a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
    v_tmp.val[0] = vget_low_u8(a1_y128);
    v_tmp.val[1] = vget_high_u8(a1_y128);
    v_res.val[0] = vtbl2_u8(v_tmp, v_index_low);
    v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
    a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);

    a0_y0 = vget_low_u8(a0_y128);
    a0_y1 = vget_high_u8(a0_y128);
    a1_y0 = vget_low_u8(a1_y128);
    a1_y1 = vget_high_u8(a1_y128);
  } else {
    // Indices are spread out: fall back to per-lane gathers.
    a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo);
    a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi);
    a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo);
    a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi);
  }
#else
  // Values in left_idx{0,1} range from 0 through 63 inclusive.
  uint8x16_t left_idx0 =
      vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1)));
  uint8x16_t left_idx1 =
      vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1)));
  uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);

  uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01);
  uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01);

  uint8x8_t a0_y0 = vget_low_u8(a0_y01);
  uint8x8_t a0_y1 = vget_high_u8(a0_y01);
  uint8x8_t a1_y0 = vget_low_u8(a1_y01);
  uint8x8_t a1_y1 = vget_high_u8(a1_y01);
#endif  // !AOM_ARCH_AARCH64

  uint16x8_t shifty_lo = vshrq_n_u16(
      vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1);
  uint16x8_t shifty_hi = vshrq_n_u16(
      vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1);

  // a[x+1] - a[x]
  uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0);
  uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1);
  // a[x] * 32 + 16
  uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32));
  uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32));

  uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo);
  uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi);

  return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5));
}

// Zone-2 directional predictor for 4-wide blocks (N rows). Each row is split
// at the corner: lanes whose above-edge index reaches min_base_x take the
// above-interpolated result, the remaining leading lanes take the
// left-interpolated result, and BaseMask[base_min_diff] selects between the
// two partial rows.
static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int upsample_above, int upsample_left,
                                      int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

#if AOM_ARCH_AARCH64
  // Use ext rather than loading left + 14 directly to avoid over-read.
  const uint8x16_t left_m2 = vld1q_u8(left - 2);
  const uint8x16_t left_0 = vld1q_u8(left);
  const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
  const uint8x16x2_t left_vals = { { left_m2, left_14 } };
#define LEFT left_vals
#else  // !AOM_ARCH_AARCH64
#define LEFT left
#endif  // AOM_ARCH_AARCH64

  for (int r = 0; r < N; r++) {
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    // Number of leading pixels in this row that fall left of the above edge.
    const int base_min_diff =
        (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >>
        upsample_above;

    if (base_min_diff <= 0) {
      // Whole row comes from the above edge.
      uint8x8_t a0_x_u8, a1_x_u8;
      uint16x4_t shift0;
      dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
                                      &a0_x_u8, &a1_x_u8, &shift0);
      uint8x8_t a0_x = a0_x_u8;
      uint8x8_t a1_x = a1_x_u8;

      uint16x8_t diff = vsubl_u8(a1_x, a0_x);  // a[x+1] - a[x]
      uint16x8_t a32 =
          vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32));  // a[x] * 32 + 16
      uint16x8_t res =
          vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0)));
      uint8x8_t resx = vshrn_n_u16(res, 5);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0);
    } else if (base_min_diff < 4) {
      // Row straddles the corner: compute both halves and blend.
      uint8x8_t a0_x_u8, a1_x_u8;
      uint16x4_t shift0;
      dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y,
                                      &a0_x_u8, &a1_x_u8, &shift0);
      uint16x8_t a0_x = vmovl_u8(a0_x_u8);
      uint16x8_t a1_x = vmovl_u8(a1_x_u8);

      uint16x4_t a0_y;
      uint16x4_t a1_y;
      uint16x4_t shift1;
      dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
                                     frac_bits_y, &a0_y, &a1_y, &shift1);
      a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y);
      a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y);

      uint16x8_t shift = vcombine_u16(shift0, shift1);
      uint16x8_t diff = vsubq_u16(a1_x, a0_x);  // a[x+1] - a[x]
      uint16x8_t a32 =
          vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32);  // a[x] * 32 + 16
      uint16x8_t res = vmlaq_u16(a32, diff, shift);
      uint8x8_t resx = vshrn_n_u16(res, 5);
      // Left-derived pixels were computed in the high half; shift them down.
      uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4);

      uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
      uint8x8_t v_resxy = vbsl_u8(mask, resy, resx);
      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
    } else {
      // Whole row comes from the left edge.
      uint16x4_t a0_y, a1_y;
      uint16x4_t shift1;
      dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y,
                                     frac_bits_y, &a0_y, &a1_y, &shift1);
      uint16x4_t diff = vsub_u16(a1_y, a0_y);  // a[x+1] - a[x]
      uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32);  // a[x] * 32 + 16
      uint16x4_t res = vmla_u16(a32, diff, shift1);
      uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5);

      vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0);
    }

    dst += stride;
  }
#undef LEFT
}

// Zone-2 directional predictor for 8-wide blocks (N rows). Same split-at-the-
// corner scheme as the 4-wide kernel: above-derived and left-derived partial
// rows are blended with BaseMask[base_min_diff].
static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int upsample_above, int upsample_left,
                                      int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5

#if AOM_ARCH_AARCH64
  // Use ext rather than loading left + 30 directly to avoid over-read.
  const uint8x16_t left_m2 = vld1q_u8(left - 2);
  const uint8x16_t left_0 = vld1q_u8(left + 0);
  const uint8x16_t left_16 = vld1q_u8(left + 16);
  const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
  const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
  const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
#define LEFT left_vals
#else  // !AOM_ARCH_AARCH64
#define LEFT left
#endif  // AOM_ARCH_AARCH64

  for (int r = 0; r < N; r++) {
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    // Number of leading pixels in this row that fall left of the above edge.
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;

    if (base_min_diff <= 0) {
      // Whole row comes from the above edge.
      uint8x8_t resx =
          dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
      vst1_u8(dst, resx);
    } else if (base_min_diff < 8) {
      // Row straddles the corner: blend above- and left-derived results.
      uint8x8_t resx =
          dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y);
      uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
          LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
      uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
      uint8x8_t resxy = vbsl_u8(mask, resy, resx);
      vst1_u8(dst, resxy);
    } else {
      // Whole row comes from the left edge.
      uint8x8_t resy = dr_prediction_z2_Nx8_left_neon(
          LEFT, upsample_left, dy, r, min_base_y, frac_bits_y);
      vst1_u8(dst, resy);
    }

    dst += stride;
  }
#undef LEFT
}

// Zone-2 directional predictor for blocks 16 or more wide (H rows, W cols),
// processed in 16-pixel column groups. Edge upsampling is always disabled for
// these sizes (see comment below).
static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst,
                                      ptrdiff_t stride, const uint8_t *above,
                                      const uint8_t *left, int dx, int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;

#if AOM_ARCH_AARCH64
  // Pre-load left[-1..62] (left_vals0) and left[0..63] (left_vals1) so the
  // per-row helper can fetch left[base_y] and left[base_y + 1] with TBL.
  const uint8x16_t left_m1 = vld1q_u8(left - 1);
  const uint8x16_t left_0 = vld1q_u8(left + 0);
  const uint8x16_t left_16 = vld1q_u8(left + 16);
  const uint8x16_t left_32 = vld1q_u8(left + 32);
  const uint8x16_t left_48 = vld1q_u8(left + 48);
  const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15);
  const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15);
  const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
  const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
  const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
#define LEFT left_vals0, left_vals1
#else  // !AOM_ARCH_AARCH64
#define LEFT left
#endif  // AOM_ARCH_AARCH64

  for (int r = 0; r < H; r++) {
    int y = r + 1;
    int base_x = (-y * dx) >> 6;
    for (int j = 0; j < W; j += 16) {
      const int base_min_diff = min_base_x - base_x - j;

      if (base_min_diff <= 0) {
        uint8x16_t resx =
            dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
        vst1q_u8(dst + j, resx);
      } else if (base_min_diff < 16) {
        uint8x16_t resx =
            dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j);
        uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
        uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
        uint8x16_t resxy = vbslq_u8(mask, resy, resx);
        vst1q_u8(dst + j, resxy);
      } else {
        uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j);
        vst1q_u8(dst + j, resy);
      }
    }  // for j
    dst += stride;
  }
#undef LEFT
}

// Directional prediction, zone 2: 90 < angle < 180
void av1_dr_prediction_z2_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int upsample_left, int dx,
                               int dy) {
  assert(dx > 0);
  assert(dy > 0);

  // Dispatch on block width; >= 16-wide blocks share the HxW kernel.
  switch (bw) {
    case 4:
      dr_prediction_z2_Nx4_neon(bh, dst, stride, above, left, upsample_above,
                                upsample_left, dx, dy);
      break;
    case 8:
      dr_prediction_z2_Nx8_neon(bh, dst, stride, above, left, upsample_above,
                                upsample_left, dx, dy);
      break;
    default:
      dr_prediction_z2_HxW_neon(bh, bw, dst,
                                stride, above, left, dx, dy);
      break;
  }
}

/* ---------------------P R E D I C T I O N Z 3--------------------------- */
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Transpose helper for the z3 predictors: zips four 16-byte rows into
// column-major order (see the 4x16 wrapper below for usage).
static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x,
                                                         uint8x16x2_t *d) {
  uint8x16x2_t w0 = vzipq_u8(x[0], x[1]);
  uint8x16x2_t w1 = vzipq_u8(x[2], x[3]);

  d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]),
                                              vreinterpretq_u16_u8(w1.val[0])));
  d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]),
                                              vreinterpretq_u16_u8(w1.val[1])));
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// Transpose helper for the z3 4x4 wrapper: zips four 8-byte rows (only the
// low 4 bytes of each are stored by the caller).
static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x,
                                                        uint8x8x2_t *d) {
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);

  *d = aom_reinterpret_u8_u16_x2(
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
}

// Transpose helper for the z3 4x8 wrapper: zips four 8-byte rows into two
// pairs of interleaved outputs.
static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x,
                                                        uint8x8x2_t *d) {
  uint8x8x2_t w0 = vzip_u8(x[0], x[1]);
  uint8x8x2_t w1 = vzip_u8(x[2], x[3]);

  d[0] = aom_reinterpret_u8_u16_x2(
      vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])));
  d[1] = aom_reinterpret_u8_u16_x2(
      vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])));
}

static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc,
                                         uint8_t *dst, ptrdiff_t pitchDst) {
  // The same as the normal transposes in transpose_neon.h, but with a stride
  // between consecutive vectors of elements.
  uint8x16_t r[16];
  uint8x16_t d[16];
  for (int i = 0; i < 16; i++) {
    r[i] = vld1q_u8(src + i * pitchSrc);
  }
  transpose_arrays_u8_16x16(r, d);
  for (int i = 0; i < 16; i++) {
    vst1q_u8(dst + i * pitchDst, d[i]);
  }
}

// Transposes a (width x height) region in 16x16 tiles; both dimensions must
// be multiples of 16.
static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src,
                                           ptrdiff_t pitchSrc, uint8_t *dst,
                                           ptrdiff_t pitchDst, int width,
                                           int height) {
  for (int j = 0; j < height; j += 16) {
    for (int i = 0; i < width; i += 16) {
      z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc,
                                   dst + j * pitchDst + i, pitchDst);
    }
  }
}

// Zone-3 prediction is implemented as the transpose of a zone-1 prediction:
// run the z1 row kernel against the left edge, then transpose the result
// into the destination block. The wrappers below differ only in block size
// and in how the transposed rows are stored.
static void dr_prediction_z3_4x4_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  uint8x8_t dstvec[4];
  uint8x8x2_t dest;

  dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy);
  z3_transpose_arrays_u8_4x4(dstvec, &dest);
  store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]);
  store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]);
}

static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  uint8x8_t dstvec[8];
  uint8x8_t d[8];

  dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_8x8(dstvec, d);
  store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]);
}

static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  uint8x8_t dstvec[4];
  uint8x8x2_t d[2];

  dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy);
  z3_transpose_arrays_u8_8x4(dstvec, d);
  store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]);
  store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]);
  store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]);
  store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]);
}

static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  uint8x8_t dstvec[8];
  uint8x8_t d[8];

  dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_8x8(dstvec, d);
  store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]);
}

static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  uint8x16_t dstvec[8];
  uint8x8_t d[16];

  dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_16x8(dstvec, d);
  for (int i = 0; i < 16; i++) {
    vst1_u8(dst + i * stride, d[i]);
  }
}

static void dr_prediction_z3_16x8_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  uint8x8_t dstvec[16];
  uint8x16_t d[8];

  dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_8x16(dstvec, d);
  for (int i = 0; i < 8; i++) {
    vst1q_u8(dst + i * stride, d[i]);
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static void dr_prediction_z3_4x16_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  uint8x16_t dstvec[4];
  uint8x16x2_t d[2];

  dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy);
  z3_transpose_arrays_u8_16x4(dstvec, d);
  store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]);
  store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]);
  store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]);
  store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]);
}

static void
dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride,
                           const uint8_t *left, int upsample_left,
                           int dy) {
  uint8x8_t dstvec[16];
  uint8x16_t d[8];

  dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_8x16(dstvec, d);
  // Only the first 4 transposed rows are valid for a 16x4 block.
  for (int i = 0; i < 4; i++) {
    vst1q_u8(dst + i * stride, d[i]);
  }
}

static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  (void)upsample_left;
  uint8x16x2_t dstvec[16];
  uint8x16_t d[32];
  uint8x16_t v_zero = vdupq_n_u8(0);

  dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy);
  // Pad the unused upper rows with zeros so the 32x16 transpose is well
  // defined; only the low half of each output vector is stored below.
  for (int i = 8; i < 16; i++) {
    dstvec[i].val[0] = v_zero;
    dstvec[i].val[1] = v_zero;
  }
  transpose_arrays_u8_32x16(dstvec, d);
  for (int i = 0; i < 32; i++) {
    vst1_u8(dst + i * stride, vget_low_u8(d[i]));
  }
}

static void dr_prediction_z3_32x8_neon(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  uint8x8_t dstvec[32];
  uint8x16_t d[16];

  dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_8x16(dstvec, d);
  transpose_arrays_u8_8x16(dstvec + 16, d + 8);
  for (int i = 0; i < 8; i++) {
    vst1q_u8(dst + i * stride, d[i]);
    vst1q_u8(dst + i * stride + 16, d[i + 8]);
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

static void dr_prediction_z3_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8x16_t dstvec[16];
  uint8x16_t d[16];

  dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy);
  transpose_arrays_u8_16x16(dstvec, d);
  for (int i = 0; i < 16; i++) {
    vst1q_u8(dst + i * stride, d[i]);
  }
}

static void
dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *left, int upsample_left,
                            int dy) {
  (void)upsample_left;
  uint8x16x2_t dstvec[32];
  uint8x16_t d[64];

  dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy);
  transpose_arrays_u8_32x16(dstvec, d);
  transpose_arrays_u8_32x16(dstvec + 16, d + 32);
  for (int i = 0; i < 32; i++) {
    vst1q_u8(dst + i * stride, d[i]);
    vst1q_u8(dst + i * stride + 16, d[i + 32]);
  }
}

// Large sizes go through a temporary buffer: z1-predict into dstT, then
// transpose 16x16 tiles into the destination.
static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  (void)upsample_left;
  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);

  dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy);
  z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64);
}

static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  (void)upsample_left;
  uint8x16x2_t dstvec[16];
  uint8x16_t d[32];

  dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy);
  transpose_arrays_u8_32x16(dstvec, d);
  for (int i = 0; i < 16; i++) {
    vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]);
    vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]);
  }
}

static void dr_prediction_z3_32x16_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8x16_t dstvec[32];

  dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 32; i += 16) {
    uint8x16_t d[16];
    transpose_arrays_u8_16x16(dstvec + i, d);
    for (int j = 0; j < 16; j++) {
      vst1q_u8(dst + j * stride + i, d[j]);
    }
  }
}

static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  (void)upsample_left;
  uint8_t
      dstT[64 * 32];

  dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy);
  z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64);
}

static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  (void)upsample_left;
  uint8_t dstT[32 * 64];

  dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy);
  z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  (void)upsample_left;
  uint8_t dstT[64 * 16];

  dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy);
  z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64);
}

static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8x16_t dstvec[64];

  dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 64; i += 16) {
    uint8x16_t d[16];
    transpose_arrays_u8_16x16(dstvec + i, d);
    for (int j = 0; j < 16; ++j) {
      vst1q_u8(dst + j * stride + i, d[j]);
    }
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *left, int upsample_left,
                                    int dy);

// Size-specific z3 implementations, indexed [get_msb(bw)][get_msb(bh)].
// Entries are NULL for block sizes that are never dispatched in the given
// build configuration (av1_dr_prediction_z3_neon asserts non-NULL).
#if CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER
static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon, NULL,
    NULL, NULL },
  { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
    dr_prediction_z3_8x16_neon, NULL, NULL },
  { NULL, NULL, NULL, dr_prediction_z3_16x8_neon, dr_prediction_z3_16x16_neon,
    dr_prediction_z3_16x32_neon, NULL },
  { NULL, NULL, NULL, NULL, dr_prediction_z3_32x16_neon,
    dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
  { NULL, NULL, NULL, NULL, NULL, dr_prediction_z3_64x32_neon,
    dr_prediction_z3_64x64_neon },
};
#else
static const dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = {
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, NULL, NULL, NULL, NULL, NULL },
  { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon,
    dr_prediction_z3_4x16_neon, NULL, NULL },
  { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon,
    dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL },
  { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon,
    dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon,
    dr_prediction_z3_16x64_neon },
  { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon,
    dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon },
  { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon,
    dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon },
};
#endif  // CONFIG_REALTIME_ONLY && !CONFIG_AV1_DECODER

void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_left, int dx, int dy) {
  (void)above;
  (void)dx;
  assert(dx == 1);
  assert(dy > 0);

  dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)];
  assert(f != NULL);
  f(dst, stride, left, upsample_left, dy);
}

// -----------------------------------------------------------------------------
// SMOOTH_PRED

// 256 - v = vneg_s8(v)
static inline uint8x8_t negate_s8(const uint8x8_t v) {
  return vreinterpret_u8_s8(vneg_s8(vreinterpret_s8_u8(v)));
}

// SMOOTH_PRED for 4-wide blocks: each pixel is a weighted blend of the top
// row with bottom_left (vertical term) and the left column with top_right
// (horizontal term), using the smooth_weights tables.
static void smooth_4xh_neon(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *const top_row,
                            const uint8_t *const left_column,
                            const int height) {
  const uint8_t top_right = top_row[3];
  const uint8_t bottom_left = left_column[height - 1];
  // Weights for an N-sized edge start at smooth_weights[N - 4].
  const uint8_t *const weights_y = smooth_weights + height - 4;

  uint8x8_t top_v = load_u8_4x1(top_row);
  const uint8x8_t top_right_v = vdup_n_u8(top_right);
  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
  uint8x8_t weights_x_v = load_u8_4x1(smooth_weights);
  // scaled_weights_x = 256 - weights_x; the (weight, 256 - weight) pair
  // blends each pixel with its opposite corner.
  const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
  // Horizontal corner term is loop-invariant; hoisted out of the row loop.
  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);

  assert(height > 0);
  int y = 0;
  do {
    const uint8x8_t left_v = vdup_n_u8(left_column[y]);
    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
    const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
    const uint16x8_t weighted_bl = vmull_u8(scaled_weights_y, bottom_left_v);
    const uint16x8_t weighted_top_bl =
        vmlal_u8(weighted_bl, weights_y_v, top_v);
    const uint16x8_t weighted_left_tr =
        vmlal_u8(weighted_tr, weights_x_v, left_v);
    // Maximum value of each parameter: 0xFF00
    const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
    const uint8x8_t result = vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);

    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(result), 0);
    dst += stride;
  } while (++y != height);
}

// Average the vertical and horizontal blend terms and narrow back to pixels.
static inline uint8x8_t calculate_pred(const uint16x8_t weighted_top_bl,
                                       const uint16x8_t weighted_left_tr) {
  // Maximum value of each parameter: 0xFF00
  const uint16x8_t avg = vhaddq_u16(weighted_top_bl, weighted_left_tr);
  return vrshrn_n_u16(avg, SMOOTH_WEIGHT_LOG2_SCALE);
}

// Compute one 8-pixel SMOOTH row from precomputed weights and the hoisted
// top_right corner term (weighted_tr).
static inline uint8x8_t calculate_weights_and_pred(
    const uint8x8_t top, const uint8x8_t left, const uint16x8_t weighted_tr,
    const uint8x8_t bottom_left, const uint8x8_t weights_x,
    const uint8x8_t scaled_weights_y, const uint8x8_t weights_y) {
  const uint16x8_t weighted_top = vmull_u8(weights_y, top);
  const uint16x8_t weighted_top_bl =
      vmlal_u8(weighted_top, scaled_weights_y, bottom_left);
  const uint16x8_t weighted_left_tr = vmlal_u8(weighted_tr, weights_x, left);
  return calculate_pred(weighted_top_bl, weighted_left_tr);
}

// SMOOTH_PRED for 8-wide blocks; same scheme as smooth_4xh_neon with full
// 8-lane loads and stores.
static void smooth_8xh_neon(uint8_t *dst, ptrdiff_t stride,
                            const uint8_t *const top_row,
                            const uint8_t *const left_column,
                            const int height) {
  const uint8_t top_right = top_row[7];
  const uint8_t bottom_left = left_column[height - 1];
  const uint8_t *const weights_y = smooth_weights + height - 4;

  const uint8x8_t top_v = vld1_u8(top_row);
  const uint8x8_t top_right_v = vdup_n_u8(top_right);
  const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);
  const uint8x8_t weights_x_v = vld1_u8(smooth_weights + 4);
  const uint8x8_t scaled_weights_x = negate_s8(weights_x_v);
  const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v);

  assert(height > 0);
  int y = 0;
  do {
    const uint8x8_t left_v = vdup_n_u8(left_column[y]);
    const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);
    const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);
    const uint8x8_t result =
        calculate_weights_and_pred(top_v, left_v, weighted_tr, bottom_left_v,
                                   weights_x_v, scaled_weights_y, weights_y_v);

    vst1_u8(dst, result);
    dst += stride;
  } while (++y != height);
}

// Stamp out the public aom_smooth_predictor_WxH_neon wrappers for narrow
// widths (4 and 8).
#define SMOOTH_NXM(W, H)                                                       \
  void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
                                             const uint8_t *above,             \
                                             const uint8_t *left) {            \
    smooth_##W##xh_neon(dst, y_stride, above, left, H);                        \
  }

SMOOTH_NXM(4, 4)
SMOOTH_NXM(4, 8)
SMOOTH_NXM(8, 4)
SMOOTH_NXM(8, 8)
SMOOTH_NXM(8, 16)
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_NXM(4, 16)
SMOOTH_NXM(8, 32)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#undef SMOOTH_NXM

// 16-lane variant of calculate_weights_and_pred: processes one 16-pixel
// strip as two 8-lane halves around the widening multiplies.
static inline uint8x16_t calculate_weights_and_predq(
    const uint8x16_t top, const uint8x8_t left, const uint8x8_t top_right,
    const uint8x8_t weights_y, const uint8x16_t weights_x,
    const uint8x16_t scaled_weights_x, const uint16x8_t weighted_bl) {
  const uint16x8_t weighted_top_bl_low =
      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
  const uint16x8_t weighted_left_tr_low =
      vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
  const uint8x8_t result_low =
      calculate_pred(weighted_top_bl_low, weighted_left_tr_low);

  const uint16x8_t weighted_top_bl_high =
      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
  const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
  const uint16x8_t weighted_left_tr_high =
      vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
  const uint8x8_t result_high =
      calculate_pred(weighted_top_bl_high, weighted_left_tr_high);

  return vcombine_u8(result_low, result_high);
}

// 256 - v = vneg_s8(v)
static inline uint8x16_t negate_s8q(const uint8x16_t v) {
  return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(v)));
}

// For width 16 and above.
// Shared SMOOTH predictor body for widths 16, 32 and 64. The (W)-dependent
// branches are compile-time constant, so each expansion keeps only the loads,
// blends and stores needed for its width.
#define SMOOTH_PREDICTOR(W)                                                 \
  static void smooth_##W##xh_neon(                                         \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,         \
      const uint8_t *const left_column, const int height) {                 \
    const uint8_t top_right = top_row[(W) - 1];                             \
    const uint8_t bottom_left = left_column[height - 1];                    \
    const uint8_t *const weights_y = smooth_weights + height - 4;           \
                                                                            \
    uint8x16_t top_v[4];                                                    \
    top_v[0] = vld1q_u8(top_row);                                           \
    if ((W) > 16) {                                                         \
      top_v[1] = vld1q_u8(top_row + 16);                                    \
      if ((W) == 64) {                                                      \
        top_v[2] = vld1q_u8(top_row + 32);                                  \
        top_v[3] = vld1q_u8(top_row + 48);                                  \
      }                                                                     \
    }                                                                       \
                                                                            \
    const uint8x8_t top_right_v = vdup_n_u8(top_right);                     \
    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);                 \
                                                                            \
    uint8x16_t weights_x_v[4];                                              \
    weights_x_v[0] = vld1q_u8(smooth_weights + (W) - 4);                    \
    if ((W) > 16) {                                                         \
      weights_x_v[1] = vld1q_u8(smooth_weights + (W) + 16 - 4);             \
      if ((W) == 64) {                                                      \
        weights_x_v[2] = vld1q_u8(smooth_weights + (W) + 32 - 4);           \
        weights_x_v[3] = vld1q_u8(smooth_weights + (W) + 48 - 4);           \
      }                                                                     \
    }                                                                       \
                                                                            \
    uint8x16_t scaled_weights_x[4];                                         \
    scaled_weights_x[0] = negate_s8q(weights_x_v[0]);                       \
    if ((W) > 16) {                                                         \
      scaled_weights_x[1] = negate_s8q(weights_x_v[1]);                     \
      if ((W) == 64) {                                                      \
        scaled_weights_x[2] = negate_s8q(weights_x_v[2]);                   \
        scaled_weights_x[3] = negate_s8q(weights_x_v[3]);                   \
      }                                                                     \
    }                                                                       \
                                                                            \
    for (int y = 0; y < height; ++y) {                                      \
      const uint8x8_t left_v = vdup_n_u8(left_column[y]);                   \
      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);                \
      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);            \
      const uint16x8_t weighted_bl =                                        \
          vmull_u8(scaled_weights_y, bottom_left_v);                        \
                                                                            \
      vst1q_u8(dst, calculate_weights_and_predq(                            \
                        top_v[0], left_v, top_right_v, weights_y_v,         \
                        weights_x_v[0], scaled_weights_x[0], weighted_bl)); \
                                                                            \
      if ((W) > 16) {                                                       \
        vst1q_u8(dst + 16,                                                  \
                 calculate_weights_and_predq(                               \
                     top_v[1], left_v, top_right_v, weights_y_v,            \
                     weights_x_v[1], scaled_weights_x[1], weighted_bl));    \
        if ((W) == 64) {                                                    \
          vst1q_u8(dst + 32,                                                \
                   calculate_weights_and_predq(                             \
                       top_v[2], left_v, top_right_v, weights_y_v,          \
                       weights_x_v[2], scaled_weights_x[2], weighted_bl));  \
          vst1q_u8(dst + 48,                                                \
                   calculate_weights_and_predq(                             \
                       top_v[3], left_v, top_right_v, weights_y_v,          \
                       weights_x_v[3], scaled_weights_x[3], weighted_bl));  \
        }                                                                   \
      }                                                                     \
                                                                            \
      dst += stride;                                                        \
    }                                                                       \
  }

SMOOTH_PREDICTOR(16)
SMOOTH_PREDICTOR(32)
SMOOTH_PREDICTOR(64)

#undef SMOOTH_PREDICTOR

// Stamp out the public aom_smooth_predictor_WxH_neon wrappers for wide
// blocks (width >= 16).
#define SMOOTH_NXM_WIDE(W, H)                                                  \
  void aom_smooth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t y_stride, \
                                             const uint8_t *above,             \
                                             const uint8_t *left) {            \
    smooth_##W##xh_neon(dst, y_stride, above, left, H);                        \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_NXM_WIDE(16, 4)
SMOOTH_NXM_WIDE(16, 64)
SMOOTH_NXM_WIDE(32, 8)
SMOOTH_NXM_WIDE(64, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_NXM_WIDE(16, 8)
SMOOTH_NXM_WIDE(16, 16)
SMOOTH_NXM_WIDE(16, 32)
SMOOTH_NXM_WIDE(32, 16)
SMOOTH_NXM_WIDE(32, 32)
SMOOTH_NXM_WIDE(32, 64)
SMOOTH_NXM_WIDE(64, 32)
SMOOTH_NXM_WIDE(64, 64)

#undef SMOOTH_NXM_WIDE

// -----------------------------------------------------------------------------
// SMOOTH_V_PRED

// For widths 4 and 8.
// SMOOTH_V_PRED for narrow widths: each pixel blends the top row with
// bottom_left using only the vertical weight table.
#define SMOOTH_V_PREDICTOR(W)                                         \
  static void smooth_v_##W##xh_neon(                                  \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,   \
      const uint8_t *const left_column, const int height) {           \
    const uint8_t bottom_left = left_column[height - 1];              \
    const uint8_t *const weights_y = smooth_weights + height - 4;     \
                                                                      \
    uint8x8_t top_v;                                                  \
    if ((W) == 4) {                                                   \
      top_v = load_u8_4x1(top_row);                                   \
    } else { /* width == 8 */                                         \
      top_v = vld1_u8(top_row);                                       \
    }                                                                 \
                                                                      \
    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);           \
                                                                      \
    assert(height > 0);                                               \
    int y = 0;                                                        \
    do {                                                              \
      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);          \
      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);      \
                                                                      \
      const uint16x8_t weighted_top = vmull_u8(weights_y_v, top_v);   \
      const uint16x8_t weighted_top_bl =                              \
          vmlal_u8(weighted_top, scaled_weights_y, bottom_left_v);    \
      const uint8x8_t pred =                                          \
          vrshrn_n_u16(weighted_top_bl, SMOOTH_WEIGHT_LOG2_SCALE);    \
                                                                      \
      if ((W) == 4) {                                                 \
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0); \
      } else { /* width == 8 */                                       \
        vst1_u8(dst, pred);                                           \
      }                                                               \
      dst += stride;                                                  \
    } while (++y != height);                                          \
  }

SMOOTH_V_PREDICTOR(4)
SMOOTH_V_PREDICTOR(8)

#undef SMOOTH_V_PREDICTOR

// Public aom_smooth_v_predictor_WxH_neon wrappers for widths 4 and 8.
#define SMOOTH_V_NXM(W, H)                                    \
  void aom_smooth_v_predictor_##W##x##H##_neon(               \
      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
      const uint8_t *left) {                                  \
    smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_V_NXM(4, 16)
SMOOTH_V_NXM(8, 32)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_V_NXM(4, 4)
SMOOTH_V_NXM(4, 8)
SMOOTH_V_NXM(8, 4)
SMOOTH_V_NXM(8, 8)
SMOOTH_V_NXM(8, 16)

#undef SMOOTH_V_NXM

// Blend one 16-pixel strip of the top row with the precomputed bottom-left
// term and narrow back to pixels.
static inline uint8x16_t calculate_vertical_weights_and_pred(
    const uint8x16_t top, const uint8x8_t weights_y,
    const uint16x8_t weighted_bl) {
  const uint16x8_t pred_low =
      vmlal_u8(weighted_bl, weights_y, vget_low_u8(top));
  const uint16x8_t pred_high =
      vmlal_u8(weighted_bl, weights_y, vget_high_u8(top));
  const uint8x8_t pred_scaled_low =
      vrshrn_n_u16(pred_low, SMOOTH_WEIGHT_LOG2_SCALE);
  const uint8x8_t pred_scaled_high =
      vrshrn_n_u16(pred_high, SMOOTH_WEIGHT_LOG2_SCALE);
  return vcombine_u8(pred_scaled_low, pred_scaled_high);
}

// For width 16 and above.
#define SMOOTH_V_PREDICTOR(W)                                            \
  static void smooth_v_##W##xh_neon(                                     \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,      \
      const uint8_t *const left_column, const int height) {              \
    const uint8_t bottom_left = left_column[height - 1];                 \
    const uint8_t *const weights_y = smooth_weights + height - 4;        \
                                                                         \
    uint8x16_t top_v[4];                                                 \
    top_v[0] = vld1q_u8(top_row);                                        \
    if ((W) > 16) {                                                      \
      top_v[1] = vld1q_u8(top_row + 16);                                 \
      if ((W) == 64) {                                                   \
        top_v[2] = vld1q_u8(top_row + 32);                               \
        top_v[3] = vld1q_u8(top_row + 48);                               \
      }                                                                  \
    }                                                                    \
                                                                         \
    const uint8x8_t bottom_left_v = vdup_n_u8(bottom_left);              \
                                                                         \
    assert(height > 0);                                                  \
    int y = 0;                                                           \
    do {                                                                 \
      const uint8x8_t weights_y_v = vdup_n_u8(weights_y[y]);             \
      const uint8x8_t scaled_weights_y = negate_s8(weights_y_v);         \
      const uint16x8_t weighted_bl =                                     \
          vmull_u8(scaled_weights_y, bottom_left_v);                     \
                                                                         \
      const uint8x16_t pred_0 = calculate_vertical_weights_and_pred(     \
          top_v[0], weights_y_v, weighted_bl);                           \
      vst1q_u8(dst, pred_0);                                             \
                                                                         \
      if ((W) > 16) {                                                    \
        const uint8x16_t pred_1 = calculate_vertical_weights_and_pred(   \
            top_v[1], weights_y_v, weighted_bl);                         \
        vst1q_u8(dst + 16, pred_1);                                      \
                                                                         \
        if ((W) == 64) {                                                 \
          const uint8x16_t pred_2 = calculate_vertical_weights_and_pred( \
              top_v[2], weights_y_v, weighted_bl);                       \
          vst1q_u8(dst + 32, pred_2);                                    \
                                                                         \
          const uint8x16_t pred_3 = calculate_vertical_weights_and_pred( \
              top_v[3], weights_y_v, weighted_bl);                       \
          vst1q_u8(dst + 48, pred_3);                                    \
        }                                                                \
      }                                                                  \
                                                                         \
      dst += stride;                                                     \
    } while (++y != height);                                             \
  }

SMOOTH_V_PREDICTOR(16)
SMOOTH_V_PREDICTOR(32)
SMOOTH_V_PREDICTOR(64)

#undef SMOOTH_V_PREDICTOR

// Public aom_smooth_v_predictor_WxH_neon wrappers for widths >= 16.
#define SMOOTH_V_NXM_WIDE(W, H)                               \
  void aom_smooth_v_predictor_##W##x##H##_neon(               \
      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
      const uint8_t *left) {                                  \
    smooth_v_##W##xh_neon(dst, y_stride, above, left, H);     \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_V_NXM_WIDE(16, 4)
SMOOTH_V_NXM_WIDE(32, 8)
SMOOTH_V_NXM_WIDE(64, 16)
SMOOTH_V_NXM_WIDE(16, 64)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_V_NXM_WIDE(16, 8)
SMOOTH_V_NXM_WIDE(16, 16)
SMOOTH_V_NXM_WIDE(16, 32)
SMOOTH_V_NXM_WIDE(32, 16)
SMOOTH_V_NXM_WIDE(32, 32)
SMOOTH_V_NXM_WIDE(32, 64)
SMOOTH_V_NXM_WIDE(64, 32)
SMOOTH_V_NXM_WIDE(64, 64)

#undef SMOOTH_V_NXM_WIDE

// -----------------------------------------------------------------------------
// SMOOTH_H_PRED

// SMOOTH_H_PRED for narrow widths: each pixel blends the left column with
// top_right using only the horizontal weight table.
// For widths 4 and 8.
#define SMOOTH_H_PREDICTOR(W)                                               \
  static void smooth_h_##W##xh_neon(                                        \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,         \
      const uint8_t *const left_column, const int height) {                 \
    const uint8_t top_right = top_row[(W) - 1];                             \
                                                                            \
    const uint8x8_t top_right_v = vdup_n_u8(top_right);                     \
    /* Over-reads for 4xN but still within the array. */                    \
    const uint8x8_t weights_x = vld1_u8(smooth_weights + (W) - 4);          \
    const uint8x8_t scaled_weights_x = negate_s8(weights_x);                \
    const uint16x8_t weighted_tr = vmull_u8(scaled_weights_x, top_right_v); \
                                                                            \
    assert(height > 0);                                                     \
    int y = 0;                                                              \
    do {                                                                    \
      const uint8x8_t left_v = vdup_n_u8(left_column[y]);                   \
      const uint16x8_t weighted_left_tr =                                   \
          vmlal_u8(weighted_tr, weights_x, left_v);                         \
      const uint8x8_t pred =                                                \
          vrshrn_n_u16(weighted_left_tr, SMOOTH_WEIGHT_LOG2_SCALE);         \
                                                                            \
      if ((W) == 4) {                                                       \
        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(pred), 0);       \
      } else { /* width == 8 */                                             \
        vst1_u8(dst, pred);                                                 \
      }                                                                     \
      dst += stride;                                                        \
    } while (++y != height);                                                \
  }

SMOOTH_H_PREDICTOR(4)
SMOOTH_H_PREDICTOR(8)

#undef SMOOTH_H_PREDICTOR

// Public aom_smooth_h_predictor_WxH_neon wrappers for widths 4 and 8.
#define SMOOTH_H_NXM(W, H)                                    \
  void aom_smooth_h_predictor_##W##x##H##_neon(               \
      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
      const uint8_t *left) {                                  \
    smooth_h_##W##xh_neon(dst, y_stride, above, left, H);     \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_H_NXM(4, 16)
SMOOTH_H_NXM(8, 32)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_H_NXM(4, 4)
SMOOTH_H_NXM(4, 8)
SMOOTH_H_NXM(8, 4)
SMOOTH_H_NXM(8, 8)
SMOOTH_H_NXM(8, 16)

#undef SMOOTH_H_NXM

// Blend one 16-pixel strip of weights against the left pixel and the
// top-right corner, then narrow back to pixels.
static inline uint8x16_t calculate_horizontal_weights_and_pred(
    const uint8x8_t left, const uint8x8_t top_right, const uint8x16_t weights_x,
    const uint8x16_t scaled_weights_x) {
  const uint16x8_t weighted_left_low = vmull_u8(vget_low_u8(weights_x), left);
  const uint16x8_t weighted_left_tr_low =
      vmlal_u8(weighted_left_low, vget_low_u8(scaled_weights_x), top_right);
  const uint8x8_t pred_scaled_low =
      vrshrn_n_u16(weighted_left_tr_low, SMOOTH_WEIGHT_LOG2_SCALE);

  const uint16x8_t weighted_left_high = vmull_u8(vget_high_u8(weights_x), left);
  const uint16x8_t weighted_left_tr_high =
      vmlal_u8(weighted_left_high, vget_high_u8(scaled_weights_x), top_right);
  const uint8x8_t pred_scaled_high =
      vrshrn_n_u16(weighted_left_tr_high, SMOOTH_WEIGHT_LOG2_SCALE);

  return vcombine_u8(pred_scaled_low, pred_scaled_high);
}

// For width 16 and above.
#define SMOOTH_H_PREDICTOR(W)                                              \
  static void smooth_h_##W##xh_neon(                                       \
      uint8_t *dst, ptrdiff_t stride, const uint8_t *const top_row,        \
      const uint8_t *const left_column, const int height) {                \
    const uint8_t top_right = top_row[(W) - 1];                            \
                                                                           \
    const uint8x8_t top_right_v = vdup_n_u8(top_right);                    \
                                                                           \
    uint8x16_t weights_x[4];                                               \
    weights_x[0] = vld1q_u8(smooth_weights + (W) - 4);                     \
    if ((W) > 16) {                                                        \
      weights_x[1] = vld1q_u8(smooth_weights + (W) + 16 - 4);              \
      if ((W) == 64) {                                                     \
        weights_x[2] = vld1q_u8(smooth_weights + (W) + 32 - 4);            \
        weights_x[3] = vld1q_u8(smooth_weights + (W) + 48 - 4);            \
      }                                                                    \
    }                                                                      \
                                                                           \
    uint8x16_t scaled_weights_x[4];                                        \
    scaled_weights_x[0] = negate_s8q(weights_x[0]);                        \
    if ((W) > 16) {                                                        \
      scaled_weights_x[1] = negate_s8q(weights_x[1]);                      \
      if ((W) == 64) {                                                     \
        scaled_weights_x[2] = negate_s8q(weights_x[2]);                    \
        scaled_weights_x[3] = negate_s8q(weights_x[3]);                    \
      }                                                                    \
    }                                                                      \
                                                                           \
    assert(height > 0);                                                    \
    int y = 0;                                                             \
    do {                                                                   \
      const uint8x8_t left_v = vdup_n_u8(left_column[y]);                  \
                                                                           \
      const uint8x16_t pred_0 = calculate_horizontal_weights_and_pred(     \
          left_v, top_right_v, weights_x[0], scaled_weights_x[0]);         \
      vst1q_u8(dst, pred_0);                                               \
                                                                           \
      if ((W) > 16) {                                                      \
        const uint8x16_t pred_1 = calculate_horizontal_weights_and_pred(   \
            left_v, top_right_v, weights_x[1], scaled_weights_x[1]);       \
        vst1q_u8(dst + 16, pred_1);                                        \
                                                                           \
        if ((W) == 64) {                                                   \
          const uint8x16_t pred_2 = calculate_horizontal_weights_and_pred( \
              left_v, top_right_v, weights_x[2], scaled_weights_x[2]);     \
          vst1q_u8(dst + 32, pred_2);                                      \
                                                                           \
          const uint8x16_t pred_3 = calculate_horizontal_weights_and_pred( \
              left_v, top_right_v, weights_x[3], scaled_weights_x[3]);     \
          vst1q_u8(dst + 48, pred_3);                                      \
        }                                                                  \
      }                                                                    \
      dst += stride;                                                       \
    } while (++y != height);                                               \
  }

SMOOTH_H_PREDICTOR(16)
SMOOTH_H_PREDICTOR(32)
SMOOTH_H_PREDICTOR(64)

#undef SMOOTH_H_PREDICTOR

// Public aom_smooth_h_predictor_WxH_neon wrappers for widths >= 16.
#define SMOOTH_H_NXM_WIDE(W, H)                               \
  void aom_smooth_h_predictor_##W##x##H##_neon(               \
      uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, \
      const uint8_t *left) {                                  \
    smooth_h_##W##xh_neon(dst, y_stride, above, left, H);     \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_H_NXM_WIDE(16, 4)
SMOOTH_H_NXM_WIDE(16, 64)
SMOOTH_H_NXM_WIDE(32, 8)
SMOOTH_H_NXM_WIDE(64, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
SMOOTH_H_NXM_WIDE(16, 8)
SMOOTH_H_NXM_WIDE(16, 16)
SMOOTH_H_NXM_WIDE(16, 32)
SMOOTH_H_NXM_WIDE(32, 16)
SMOOTH_H_NXM_WIDE(32, 32)
SMOOTH_H_NXM_WIDE(32, 64)
SMOOTH_H_NXM_WIDE(64, 32)
SMOOTH_H_NXM_WIDE(64, 64)

#undef SMOOTH_H_NXM_WIDE

// -----------------------------------------------------------------------------
// PAETH

// Paeth predictor for widths 4 and 8: pick, per pixel, whichever of
// left/top/top_left is closest to left + top - top_left. Note the distance
// for choosing 'left' reduces to |top - top_left| (and vice versa).
static inline void paeth_4or8_x_h_neon(uint8_t *dest, ptrdiff_t stride,
                                       const uint8_t *const top_row,
                                       const uint8_t *const left_column,
                                       int width, int height) {
  const uint8x8_t top_left = vdup_n_u8(top_row[-1]);
  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
  uint8x8_t top;
  if (width == 4) {
    top = load_u8_4x1(top_row);
  } else {  // width == 8
    top = vld1_u8(top_row);
  }

  assert(height > 0);
  int y = 0;
  do {
    const uint8x8_t left = vdup_n_u8(left_column[y]);

    const uint8x8_t left_dist = vabd_u8(top, top_left);
    const uint8x8_t top_dist = vabd_u8(left, top_left);
    // top_left candidate distance needs 16 bits: |top + left - 2*top_left|.
    const uint16x8_t top_left_dist =
        vabdq_u16(vaddl_u8(top, left), top_left_x2);

    const uint8x8_t left_le_top = vcle_u8(left_dist, top_dist);
    const uint8x8_t left_le_top_left =
        vmovn_u16(vcleq_u16(vmovl_u8(left_dist), top_left_dist));
    const uint8x8_t top_le_top_left =
        vmovn_u16(vcleq_u16(vmovl_u8(top_dist), top_left_dist));

    // if (left_dist <= top_dist && left_dist <= top_left_dist)
    const uint8x8_t left_mask = vand_u8(left_le_top, left_le_top_left);
    // dest[x] = left_column[y];
    // Fill all the unused spaces with 'top'. They will be overwritten when
    // the positions for top_left are known.
    uint8x8_t result = vbsl_u8(left_mask, left, top);
    // else if (top_dist <= top_left_dist)
    // dest[x] = top_row[x];
    // Add these values to the mask. They were already set.
    const uint8x8_t left_or_top_mask = vorr_u8(left_mask, top_le_top_left);
    // else
    // dest[x] = top_left;
    result = vbsl_u8(left_or_top_mask, result, top_left);

    if (width == 4) {
      store_u8_4x1(dest, result);
    } else {  // width == 8
      vst1_u8(dest, result);
    }
    dest += stride;
  } while (++y != height);
}

// Public aom_paeth_predictor_WxH_neon wrappers for widths 4 and 8.
#define PAETH_NXM(W, H)                                                     \
  void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
                                            const uint8_t *above,           \
                                            const uint8_t *left) {          \
    paeth_4or8_x_h_neon(dst, stride, above, left, W, H);                    \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
PAETH_NXM(4, 16)
PAETH_NXM(8, 32)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
PAETH_NXM(4, 4)
PAETH_NXM(4, 8)
PAETH_NXM(8, 4)
PAETH_NXM(8, 8)
PAETH_NXM(8, 16)

// Calculate X distance <= TopLeft distance and pack the resulting mask into
// uint8x8_t.
static inline uint8x16_t x_le_top_left(const uint8x16_t x_dist,
                                       const uint16x8_t top_left_dist_low,
                                       const uint16x8_t top_left_dist_high) {
  // Saturating narrow is safe for the comparison: x_dist is at most 255, so
  // clamping top_left_dist to 255 cannot change the <= result.
  const uint8x16_t top_left_dist = vcombine_u8(vqmovn_u16(top_left_dist_low),
                                               vqmovn_u16(top_left_dist_high));
  return vcleq_u8(x_dist, top_left_dist);
}

// Select the closest values and collect them.
static inline uint8x16_t select_paeth(const uint8x16_t top,
                                      const uint8x16_t left,
                                      const uint8x16_t top_left,
                                      const uint8x16_t left_le_top,
                                      const uint8x16_t left_le_top_left,
                                      const uint8x16_t top_le_top_left) {
  // if (left_dist <= top_dist && left_dist <= top_left_dist)
  const uint8x16_t left_mask = vandq_u8(left_le_top, left_le_top_left);
  // dest[x] = left_column[y];
  // Fill all the unused spaces with 'top'. They will be overwritten when
  // the positions for top_left are known.
  uint8x16_t result = vbslq_u8(left_mask, left, top);
  // else if (top_dist <= top_left_dist)
  // dest[x] = top_row[x];
  // Add these values to the mask. They were already set.
  const uint8x16_t left_or_top_mask = vorrq_u8(left_mask, top_le_top_left);
  // else
  // dest[x] = top_left;
  return vbslq_u8(left_or_top_mask, result, top_left);
}

// Generate numbered and high/low versions of top_left_dist.
#define TOP_LEFT_DIST(num)                                              \
  const uint16x8_t top_left_##num##_dist_low = vabdq_u16(               \
      vaddl_u8(vget_low_u8(top[num]), vget_low_u8(left)), top_left_x2); \
  const uint16x8_t top_left_##num##_dist_high = vabdq_u16(              \
      vaddl_u8(vget_high_u8(top[num]), vget_low_u8(left)), top_left_x2)

// Generate numbered versions of XLeTopLeft with x = left.
#define LEFT_LE_TOP_LEFT(num)                                     \
  const uint8x16_t left_le_top_left_##num =                       \
      x_le_top_left(left_##num##_dist, top_left_##num##_dist_low, \
                    top_left_##num##_dist_high)

// Generate numbered versions of XLeTopLeft with x = top.
#define TOP_LE_TOP_LEFT(num)                              \
  const uint8x16_t top_le_top_left_##num = x_le_top_left( \
      top_dist, top_left_##num##_dist_low, top_left_##num##_dist_high)

// Paeth predictor for widths >= 16, processed in 16-pixel strips. As in the
// narrow version, the distance for choosing 'left' depends on top
// (|top - top_left|) and the distance for choosing 'top' depends on left.
static inline void paeth16_plus_x_h_neon(uint8_t *dest, ptrdiff_t stride,
                                         const uint8_t *const top_row,
                                         const uint8_t *const left_column,
                                         int width, int height) {
  const uint8x16_t top_left = vdupq_n_u8(top_row[-1]);
  const uint16x8_t top_left_x2 = vdupq_n_u16(top_row[-1] + top_row[-1]);
  uint8x16_t top[4];
  top[0] = vld1q_u8(top_row);
  if (width > 16) {
    top[1] = vld1q_u8(top_row + 16);
    if (width == 64) {
      top[2] = vld1q_u8(top_row + 32);
      top[3] = vld1q_u8(top_row + 48);
    }
  }

  assert(height > 0);
  int y = 0;
  do {
    const uint8x16_t left = vdupq_n_u8(left_column[y]);

    // 'left' is constant across the row, so top_dist is shared by all strips.
    const uint8x16_t top_dist = vabdq_u8(left, top_left);

    const uint8x16_t left_0_dist = vabdq_u8(top[0], top_left);
    TOP_LEFT_DIST(0);
    const uint8x16_t left_0_le_top = vcleq_u8(left_0_dist, top_dist);
    LEFT_LE_TOP_LEFT(0);
    TOP_LE_TOP_LEFT(0);

    const uint8x16_t result_0 =
        select_paeth(top[0], left, top_left, left_0_le_top, left_le_top_left_0,
                     top_le_top_left_0);
    vst1q_u8(dest, result_0);

    if (width > 16) {
      const uint8x16_t left_1_dist = vabdq_u8(top[1], top_left);
      TOP_LEFT_DIST(1);
      const uint8x16_t left_1_le_top = vcleq_u8(left_1_dist, top_dist);
      LEFT_LE_TOP_LEFT(1);
      TOP_LE_TOP_LEFT(1);

      const uint8x16_t result_1 =
          select_paeth(top[1], left, top_left, left_1_le_top,
                       left_le_top_left_1, top_le_top_left_1);
      vst1q_u8(dest + 16, result_1);

      if (width == 64) {
        const uint8x16_t left_2_dist = vabdq_u8(top[2], top_left);
        TOP_LEFT_DIST(2);
        const uint8x16_t left_2_le_top = vcleq_u8(left_2_dist, top_dist);
        LEFT_LE_TOP_LEFT(2);
        TOP_LE_TOP_LEFT(2);

        const uint8x16_t result_2 =
            select_paeth(top[2], left, top_left, left_2_le_top,
                         left_le_top_left_2, top_le_top_left_2);
        vst1q_u8(dest + 32, result_2);

        const uint8x16_t left_3_dist = vabdq_u8(top[3], top_left);
        TOP_LEFT_DIST(3);
        const uint8x16_t left_3_le_top = vcleq_u8(left_3_dist, top_dist);
        LEFT_LE_TOP_LEFT(3);
        TOP_LE_TOP_LEFT(3);

        const uint8x16_t result_3 =
            select_paeth(top[3], left, top_left, left_3_le_top,
                         left_le_top_left_3, top_le_top_left_3);
        vst1q_u8(dest + 48, result_3);
      }
    }

    dest += stride;
  } while (++y != height);
}

// Public aom_paeth_predictor_WxH_neon wrappers for widths >= 16.
#define PAETH_NXM_WIDE(W, H)                                                \
  void aom_paeth_predictor_##W##x##H##_neon(uint8_t *dst, ptrdiff_t stride, \
                                            const uint8_t *above,           \
                                            const uint8_t *left) {          \
    paeth16_plus_x_h_neon(dst, stride, above, left, W, H);                  \
  }

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
PAETH_NXM_WIDE(16, 4)
PAETH_NXM_WIDE(16, 64)
PAETH_NXM_WIDE(32, 8)
PAETH_NXM_WIDE(64, 16)
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
PAETH_NXM_WIDE(16, 8)
PAETH_NXM_WIDE(16, 16)
PAETH_NXM_WIDE(16, 32)
PAETH_NXM_WIDE(32, 16)
PAETH_NXM_WIDE(32, 32)
PAETH_NXM_WIDE(32, 64)
PAETH_NXM_WIDE(64, 32)
PAETH_NXM_WIDE(64, 64)