highbd_subpel_variance_neon.c (57374B)
/*
 * Copyright (c) 2023 The WebM project authors. All rights reserved.
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "aom_dsp/variance.h"

// The bilinear filters look like this:
//
// {{ 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
//  { 64, 64 }, { 48, 80 }, { 32, 96 }, { 16, 112 }}
//
// We can factor out the highest common multiple, such that the sum of both
// weights will be 8 instead of 128. The benefits of this are two-fold:
//
// 1) We can infer the filter values from the filter_offset parameter in the
// bilinear filter functions below - we don't have to actually load the values
// from memory:
// f0 = 8 - filter_offset
// f1 = filter_offset
//
// 2) Scaling the pixel values by 8, instead of 128 enables us to operate on
// 16-bit data types at all times, rather than widening out to 32-bit and
// requiring double the number of data processing instructions. (12-bit * 8 =
// 15-bit.)

// Process a block exactly 4 wide and any height.
static void highbd_var_filter_block2d_bil_w4(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  // The two filter taps sum to 8 (see file-level comment), so the blend is
  // normalized below with a rounding shift by 3.
  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t f1 = vdup_n_u16(filter_offset);

  int i = dst_height;
  do {
    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);

    uint16x4_t blend = vmul_u16(s0, f0);
    blend = vmla_u16(blend, s1, f1);
    blend = vrshr_n_u16(blend, 3);

    vst1_u16(dst_ptr, blend);

    src_ptr += src_stride;
    dst_ptr += 4;
  } while (--i != 0);
}

// Process a block which is a multiple of 8 and any height.
static void highbd_var_filter_block2d_bil_large(const uint16_t *src_ptr,
                                                uint16_t *dst_ptr,
                                                int src_stride, int pixel_step,
                                                int dst_width, int dst_height,
                                                int filter_offset) {
  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t f1 = vdupq_n_u16(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);

      uint16x8_t blend = vmulq_u16(s0, f0);
      blend = vmlaq_u16(blend, s1, f1);
      blend = vrshrq_n_u16(blend, 3);

      vst1q_u16(dst_ptr + j, blend);

      j += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Fixed-width wrappers around the generic large-block bilinear filter. They
// exist so the WXH macros below can token-paste the block width into a
// function name.
static void highbd_var_filter_block2d_bil_w8(const uint16_t *src_ptr,
                                             uint16_t *dst_ptr, int src_stride,
                                             int pixel_step, int dst_height,
                                             int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      8, dst_height, filter_offset);
}

static void highbd_var_filter_block2d_bil_w16(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      16, dst_height, filter_offset);
}

static void highbd_var_filter_block2d_bil_w32(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      32, dst_height, filter_offset);
}

static void highbd_var_filter_block2d_bil_w64(const uint16_t *src_ptr,
                                              uint16_t *dst_ptr, int src_stride,
                                              int pixel_step, int dst_height,
                                              int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      64, dst_height, filter_offset);
}

static void highbd_var_filter_block2d_bil_w128(const uint16_t *src_ptr,
                                               uint16_t *dst_ptr,
                                               int src_stride, int pixel_step,
                                               int dst_height,
                                               int filter_offset) {
  highbd_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step,
                                      128, dst_height, filter_offset);
}

// Half-pel (filter_offset == 4) special case: the bilinear blend degenerates
// to a rounding average of the two source pixels.
static void highbd_var_filter_block2d_avg(const uint16_t *src_ptr,
                                          uint16_t *dst_ptr, int src_stride,
                                          int pixel_step, int dst_width,
                                          int dst_height) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t avg = vrhaddq_u16(s0, s1);
      vst1q_u16(dst_ptr + j, avg);

      j += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// General sub-pixel variance: horizontal bilinear pass into tmp0 (h+1 rows so
// the vertical pass has a row of look-ahead), vertical pass into tmp1, then
// the full-pel variance kernel on the filtered block.
#define HBD_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                          \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {                    \
    uint16_t tmp0[w * (h + 1)];                                               \
    uint16_t tmp1[w * h];                                                     \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
                                                                              \
    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \
                                       xoffset);                              \
    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);         \
                                                                              \
    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
                                                     w, ref, ref_stride, sse); \
  }

// Specialized sub-pixel variance for large blocks: dispatch on the three
// interesting offsets per axis - 0 (copy), 4 (half-pel average) and the
// general bilinear case - to skip filter passes that are identity or can use
// the cheaper averaging kernel.
#define HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)              \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_variance##w##x##h##_neon(    \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {                \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                             \
                                                                              \
    if (xoffset == 0) {                                                       \
      if (yoffset == 0) {                                                     \
        return aom_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse);   \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride,   \
                                      w, h);                                  \
        return aom_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                \
      } else {                                                                \
        uint16_t tmp[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride,          \
                                           src_stride, h, yoffset);           \
        return aom_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                \
      }                                                                       \
    } else if (xoffset == 4) {                                                \
      uint16_t tmp0[w * (h + 1)];                                             \
      if (yoffset == 0) {                                                     \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);    \
        return aom_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);               \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp1[w * (h + 1)];                                           \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,        \
                                      (h + 1));                               \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                \
        return aom_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      } else {                                                                \
        uint16_t tmp1[w * (h + 1)];                                           \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,        \
                                      (h + 1));                               \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);     \
        return aom_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      }                                                                       \
    } else {                                                                  \
      uint16_t tmp0[w * (h + 1)];                                             \
      if (yoffset == 0) {                                                     \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,   \
                                           xoffset);                          \
        return aom_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);               \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp1[w * h];                                                 \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,      \
                                           (h + 1), xoffset);                 \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                \
        return aom_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      } else {                                                                \
        uint16_t tmp1[w * h];                                                 \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,      \
                                           (h + 1), xoffset);                 \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);     \
        return aom_highbd_##bitdepth##_variance##w##x##h(                     \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);               \
      }                                                                       \
    }                                                                         \
  }

// 8-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)

HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)

HBD_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 10-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)

HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)

HBD_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 12-bit
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)

HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)

HBD_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)

HBD_SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// Combine bilinear filter with aom_highbd_comp_avg_pred for blocks having
// width 4.
static void highbd_avg_pred_var_filter_block2d_bil_w4(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  // Same bilinear blend as highbd_var_filter_block2d_bil_w4, followed by a
  // rounding average with second_pred before the store.
  const uint16x4_t f0 = vdup_n_u16(8 - filter_offset);
  const uint16x4_t f1 = vdup_n_u16(filter_offset);

  int i = dst_height;
  do {
    uint16x4_t s0 = load_unaligned_u16_4x1(src_ptr);
    uint16x4_t s1 = load_unaligned_u16_4x1(src_ptr + pixel_step);
    uint16x4_t p = vld1_u16(second_pred);

    uint16x4_t blend = vmul_u16(s0, f0);
    blend = vmla_u16(blend, s1, f1);
    blend = vrshr_n_u16(blend, 3);

    vst1_u16(dst_ptr, vrhadd_u16(blend, p));

    src_ptr += src_stride;
    dst_ptr += 4;
    second_pred += 4;
  } while (--i != 0);
}

// Combine bilinear filter with aom_highbd_comp_avg_pred for large blocks.
static void highbd_avg_pred_var_filter_block2d_bil_large(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint16_t *second_pred) {
  const uint16x8_t f0 = vdupq_n_u16(8 - filter_offset);
  const uint16x8_t f1 = vdupq_n_u16(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t p = vld1q_u16(second_pred);

      uint16x8_t blend = vmulq_u16(s0, f0);
      blend = vmlaq_u16(blend, s1, f1);
      blend = vrshrq_n_u16(blend, 3);

      vst1q_u16(dst_ptr + j, vrhaddq_u16(blend, p));

      j += 8;
      second_pred += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Fixed-width wrappers so the WXH macros below can token-paste the block
// width into a function name.
static void highbd_avg_pred_var_filter_block2d_bil_w8(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 8, dst_height,
                                               filter_offset, second_pred);
}

static void highbd_avg_pred_var_filter_block2d_bil_w16(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 16, dst_height,
                                               filter_offset, second_pred);
}

static void highbd_avg_pred_var_filter_block2d_bil_w32(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 32, dst_height,
                                               filter_offset, second_pred);
}

static void highbd_avg_pred_var_filter_block2d_bil_w64(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 64, dst_height,
                                               filter_offset, second_pred);
}

static void highbd_avg_pred_var_filter_block2d_bil_w128(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint16_t *second_pred) {
  highbd_avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                               pixel_step, 128, dst_height,
                                               filter_offset, second_pred);
}

// Combine averaging subpel filter with aom_highbd_comp_avg_pred.
static void highbd_avg_pred_var_filter_block2d_avg(
    const uint16_t *src_ptr, uint16_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, const uint16_t *second_pred) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint16x8_t s0 = vld1q_u16(src_ptr + j);
      uint16x8_t s1 = vld1q_u16(src_ptr + j + pixel_step);
      uint16x8_t avg = vrhaddq_u16(s0, s1);

      uint16x8_t p = vld1q_u16(second_pred);
      avg = vrhaddq_u16(avg, p);

      vst1q_u16(dst_ptr + j, avg);

      j += 8;
      second_pred += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Implementation of aom_highbd_comp_avg_pred for blocks having width >= 16.
static void highbd_avg_pred(const uint16_t *src_ptr, uint16_t *dst_ptr,
                            int src_stride, int dst_width, int dst_height,
                            const uint16_t *second_pred) {
  int i = dst_height;

  // We only specialize on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  do {
    int j = 0;
    do {
      uint16x8_t s = vld1q_u16(src_ptr + j);
      uint16x8_t p = vld1q_u16(second_pred);

      uint16x8_t avg = vrhaddq_u16(s, p);

      vst1q_u16(dst_ptr + j, avg);

      j += 8;
      second_pred += 8;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// General compound sub-pixel variance: horizontal then vertical bilinear
// pass, with the second prediction averaged in during the vertical pass.
#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)                       \
  uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon(     \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
      const uint8_t *second_pred) {                                            \
    uint16_t tmp0[w * (h + 1)];                                                \
    uint16_t tmp1[w * h];                                                      \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
                                                                               \
    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
                                       xoffset);                               \
    highbd_avg_pred_var_filter_block2d_bil_w##w(                               \
        tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));       \
                                                                               \
    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \
                                                     w, ref, ref_stride, sse); \
  }

// Specialized compound sub-pixel variance for large blocks: dispatch on
// offsets 0 / 4 / general per axis, folding the second-prediction average
// into the final filter pass.
#define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h)           \
  unsigned int aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,         \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                       \
      const uint8_t *second_pred) {                                            \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
                                                                               \
    if (xoffset == 0) {                                                        \
      uint16_t tmp[w * h];                                                     \
      if (yoffset == 0) {                                                      \
        highbd_avg_pred(src_ptr, tmp, source_stride, w, h,                     \
                        CONVERT_TO_SHORTPTR(second_pred));                     \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      } else if (yoffset == 4) {                                               \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            src_ptr, tmp, source_stride, source_stride, w, h,                  \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      } else {                                                                 \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            src_ptr, tmp, source_stride, source_stride, h, yoffset,            \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse);                 \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            src_ptr, tmp0, source_stride, 1, w, h,                             \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * (h + 1)];                                            \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
                                      (h + 1));                                \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * (h + 1)];                                            \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, source_stride, 1, w,      \
                                      (h + 1));                                \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    } else {                                                                   \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            src_ptr, tmp0, source_stride, 1, h, xoffset,                       \
            CONVERT_TO_SHORTPTR(second_pred));                                 \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
                                           (h + 1), xoffset);                  \
        highbd_avg_pred_var_filter_block2d_avg(                                \
            tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred));         \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, source_stride, 1,    \
                                           (h + 1), xoffset);                  \
        highbd_avg_pred_var_filter_block2d_bil_w##w(                           \
            tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred));   \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    }                                                                          \
  }

// 8-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 8)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 8)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 16)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 8)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 128)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 16)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 4)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 8)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(8, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 10-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 8)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 8)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 16)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 8)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 128)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 4, 16)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 8, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 4)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 16, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 32, 8)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(10, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 12-bit
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 8)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 4)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 8)
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 16)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 8)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 16)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 32)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 128)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 64)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 4, 16)

HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 8, 32)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 4)
HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 16, 64)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 32, 8)

HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// General masked sub-pixel variance: two bilinear passes, then blend the
// filtered block with second_pred under the mask before taking the variance.
#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                    \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon(       \
          const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
          const uint8_t *ref, int ref_stride, const uint8_t *second_pred,      \
          const uint8_t *msk, int msk_stride, int invert_mask,                 \
          unsigned int *sse) {                                                 \
    uint16_t tmp0[w * (h + 1)];                                                \
    uint16_t tmp1[w * (h + 1)];                                                \
    uint16_t tmp2[w * h];                                                      \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
    highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1),  \
                                       xoffset);                               \
    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
    aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w,   \
                                   h, CONVERT_TO_BYTEPTR(tmp1), w, msk,        \
                                   msk_stride, invert_mask);                   \
    return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp2), \
                                                     w, ref, ref_stride, sse); \
  }

// Specialized masked sub-pixel variance for large blocks: dispatch on
// offsets 0 / 4 / general per axis to avoid redundant filter passes, then
// apply the mask blend and variance.
#define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)        \
  unsigned int                                                                 \
      aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon(       \
          const uint8_t *src, int src_stride, int xoffset, int yoffset,        \
          const uint8_t *ref, int ref_stride, const uint8_t *second_pred,      \
          const uint8_t *msk, int msk_stride, int invert_mask,                 \
          unsigned int *sse) {                                                 \
    uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src);                              \
    if (xoffset == 0) {                                                        \
      uint16_t tmp0[w * h];                                                    \
      if (yoffset == 0) {                                                      \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred,  \
                                       w, h, src, src_stride, msk, msk_stride, \
                                       invert_mask);                           \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, src_stride,   \
                                      w, h);                                   \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride,          \
                                           src_stride, h, yoffset);            \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      }                                                                        \
    } else if (xoffset == 4) {                                                 \
      uint16_t tmp0[w * (h + 1)];                                              \
      if (yoffset == 0) {                                                      \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h);     \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp1[w * h];                                                  \
        uint16_t tmp2[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                      (h + 1));                                \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp1[w * h];                                                  \
        uint16_t tmp2[w * h];                                                  \
        highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w,         \
                                      (h + 1));                                \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
      }                                                                        \
    } else {                                                                   \
      if (yoffset == 0) {                                                      \
        uint16_t tmp0[w * h];                                                  \
        uint16_t tmp1[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h,    \
                                           xoffset);                           \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse);                \
      } else if (yoffset == 4) {                                               \
        uint16_t tmp0[w * (h + 1)];                                            \
        uint16_t tmp1[w * h];                                                  \
        uint16_t tmp2[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                           (h + 1), xoffset);                  \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                 \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
      } else {                                                                 \
        uint16_t tmp0[w * (h + 1)];                                            \
        uint16_t tmp1[w * (h + 1)];                                            \
        uint16_t tmp2[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1,       \
                                           (h + 1), xoffset);                  \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);      \
        aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred,  \
                                       w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \
                                       msk_stride, invert_mask);               \
        return aom_highbd_##bitdepth##_variance##w##x##h(                      \
            CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse);                \
      }                                                                        \
    }                                                                          \
  }

// 8-bit
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)

// Per-block-size instantiations of the high-bitdepth masked sub-pixel
// variance kernels defined above. The *_SPECIALIZED_* variant is used for
// block widths >= 16 and the plain variant for 4- and 8-wide blocks; the
// rarely-used block shapes are compiled only for non-realtime builds.
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 10-bit
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

// 12-bit
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)

#if !CONFIG_REALTIME_ONLY
HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)

HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)

HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
#endif  // !CONFIG_REALTIME_ONLY

#if !CONFIG_REALTIME_ONLY
// Generic OBMC sub-pixel variance: unconditionally run the horizontal
// bilinear pass into tmp0 (h + 1 rows, so the vertical pass has one row of
// look-ahead), then the vertical bilinear pass into tmp1, and finally hand
// the filtered block to the bitdepth-specific OBMC variance kernel.
// Used for the small block widths (4 and 8) where branching on the filter
// offsets is not worthwhile.
#define HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)                  \
  unsigned int                                                                \
      aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon(        \
          const uint8_t *pre, int pre_stride, int xoffset, int yoffset,       \
          const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {      \
    uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);                             \
    uint16_t tmp0[w * (h + 1)];                                               \
    uint16_t tmp1[w * h];                                                     \
    highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h + 1,   \
                                       xoffset);                              \
    highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);         \
    return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(             \
        CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                        \
  }

// Specialized OBMC sub-pixel variance: branch on the two cheap filter
// offsets in each direction - 0 (no filtering needed, pass the source
// through) and 4 (coefficients {4, 4}, i.e. a plain 2-tap average handled by
// highbd_var_filter_block2d_avg) - and only fall back to the full bilinear
// pass for other offsets. Used for the wider blocks where these fast paths
// pay off.
// NOTE(review): in the xoffset == 4 branches below, tmp1 is declared
// w * (h + 1) although only w * h elements appear to be written/read -
// presumably harmless stack over-allocation; confirm before shrinking.
#define SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h)      \
  unsigned int                                                                \
      aom_highbd_##bitdepth##_obmc_sub_pixel_variance##w##x##h##_neon(        \
          const uint8_t *pre, int pre_stride, int xoffset, int yoffset,       \
          const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {      \
    uint16_t *pre_ptr = CONVERT_TO_SHORTPTR(pre);                             \
    if (xoffset == 0) {                                                       \
      if (yoffset == 0) {                                                     \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
            pre, pre_stride, wsrc, mask, sse);                                \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp[w * h];                                                  \
        highbd_var_filter_block2d_avg(pre_ptr, tmp, pre_stride, pre_stride,   \
                                      w, h);                                  \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
            CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse);                     \
      } else {                                                                \
        uint16_t tmp[w * h];                                                  \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp, pre_stride,          \
                                           pre_stride, h, yoffset);           \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
            CONVERT_TO_BYTEPTR(tmp), w, wsrc, mask, sse);                     \
      }                                                                       \
    } else if (xoffset == 4) {                                                \
      uint16_t tmp0[w * (h + 1)];                                             \
      if (yoffset == 0) {                                                     \
        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w, h);    \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
            CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse);                    \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp1[w * (h + 1)];                                           \
        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w,        \
                                      h + 1);                                 \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                    \
      } else {                                                                \
        uint16_t tmp1[w * (h + 1)];                                           \
        highbd_var_filter_block2d_avg(pre_ptr, tmp0, pre_stride, 1, w,        \
                                      h + 1);                                 \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);     \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                    \
      }                                                                       \
    } else {                                                                  \
      uint16_t tmp0[w * (h + 1)];                                             \
      if (yoffset == 0) {                                                     \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1, h,   \
                                           xoffset);                          \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
            CONVERT_TO_BYTEPTR(tmp0), w, wsrc, mask, sse);                    \
      } else if (yoffset == 4) {                                              \
        uint16_t tmp1[w * h];                                                 \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1,      \
                                           h + 1, xoffset);                   \
        highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                    \
      } else {                                                                \
        uint16_t tmp1[w * h];                                                 \
        highbd_var_filter_block2d_bil_w##w(pre_ptr, tmp0, pre_stride, 1,      \
                                           h + 1, xoffset);                   \
        highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);     \
        return aom_highbd_##bitdepth##_obmc_variance##w##x##h##_neon(         \
            CONVERT_TO_BYTEPTR(tmp1), w, wsrc, mask, sse);                    \
      }                                                                       \
    }                                                                         \
  }

// Per-block-size instantiations of the OBMC sub-pixel variance kernels:
// generic variant for 4/8-wide (and the 16-wide shapes where the fast paths
// do not pay off), specialized variant for the remaining wide blocks.
// 8-bit
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 16)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 16)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 32)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 64, 128)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 128, 128)

// 10-bit
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 4, 16)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 16)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 8, 32)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 16, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 32, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 64)
// Remaining 10-bit and all 12-bit OBMC sub-pixel variance instantiations;
// the whole OBMC family is compiled only for non-realtime builds.
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 64, 128)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(10, 128, 128)

// 12-bit
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 4, 16)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 8)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 16)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 8, 32)

HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 4)
HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 16, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 8)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 32, 64)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 16)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 32)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 64, 128)

SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 64)
SPECIALIZED_HIGHBD_OBMC_SUBPEL_VARIANCE_WXH_NEON(12, 128, 128)
#endif  // !CONFIG_REALTIME_ONLY