/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>  // assert() is used below but <assert.h> was not included.

#include "config/aom_dsp_rtcd.h"
#include "config/aom_config.h"

#include "aom_ports/mem.h"
#include "aom/aom_integer.h"

#include "aom_dsp/variance.h"
#include "aom_dsp/arm/mem_neon.h"

// Bilinear filter for a 4-wide block. `pixel_step` selects the filter
// direction (1 = horizontal neighbour, src_stride = vertical neighbour).
// Two 4-pixel rows are packed into each 8-lane vector, so two output rows
// are produced per iteration; dst_height is assumed to be even.
static void var_filter_block2d_bil_w4(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  // Bilinear taps: (8 - offset) on the first sample, offset on the second.
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t s1 = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    uint16x8_t blend = vmull_u8(s0, f0);
    blend = vmlal_u8(blend, s1, f1);
    // Rounding narrow: (blend + 4) >> 3 back to 8 bits.
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
    vst1_u8(dst_ptr, blend_u8);

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    i -= 2;
  } while (i != 0);
}

// Bilinear filter for an 8-wide block; one row per iteration.
static void var_filter_block2d_bil_w8(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                      int src_stride, int pixel_step,
                                      int dst_height, int filter_offset) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = vld1_u8(src_ptr);
    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
    uint16x8_t blend = vmull_u8(s0, f0);
    blend = vmlal_u8(blend, s1, f1);
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);
    vst1_u8(dst_ptr, blend_u8);

    src_ptr += src_stride;
    dst_ptr += 8;
  } while (--i != 0);
}

// Bilinear filter for blocks with width >= 16, processed 16 pixels at a
// time; dst_width must be a multiple of 16.
static void var_filter_block2d_bil_large(const uint8_t *src_ptr,
                                         uint8_t *dst_ptr, int src_stride,
                                         int pixel_step, int dst_width,
                                         int dst_height, int filter_offset) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      // Widen to 16 bits and blend the low and high halves separately.
      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
      uint8x16_t blend_u8 =
          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));
      vst1q_u8(dst_ptr + j, blend_u8);

      j += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 16,
                               dst_height, filter_offset);
}

static void var_filter_block2d_bil_w32(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 32,
                               dst_height, filter_offset);
}

static void var_filter_block2d_bil_w64(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                       int src_stride, int pixel_step,
                                       int dst_height, int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 64,
                               dst_height, filter_offset);
}

static void var_filter_block2d_bil_w128(const uint8_t *src_ptr,
                                        uint8_t *dst_ptr, int src_stride,
                                        int pixel_step, int dst_height,
                                        int filter_offset) {
  var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride, pixel_step, 128,
                               dst_height, filter_offset);
}

// Specialization of the bilinear filter for filter_offset == 4: the blend
// reduces to a rounded average (vrhaddq_u8) of the two samples.
static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
                                   int src_stride, int pixel_step,
                                   int dst_width, int dst_height) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint8x16_t avg = vrhaddq_u8(s0, s1);
      vst1q_u8(dst_ptr + j, avg);

      j += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Generic path: horizontal bilinear pass into tmp0 (h + padding rows), then
// vertical pass into tmp1, then plain variance against the reference.
#define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                   \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
      const uint8_t *ref, int ref_stride, uint32_t *sse) {               \
    uint8_t tmp0[w * (h + padding)];                                     \
    uint8_t tmp1[w * h];                                                 \
    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                xoffset);                                \
    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
    return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
  }

// Specialized path for large blocks: offsets 0 (copy) and 4 (average) skip
// or cheapen the corresponding filter pass.
#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                  \
  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                       \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, unsigned int *sse) {               \
    if (xoffset == 0) {                                                      \
      if (yoffset == 0) {                                                    \
        return aom_variance##w##x##h(src, src_stride, ref, ref_stride, sse); \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp[w * h];                                                  \
        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);      \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
      } else {                                                               \
        uint8_t tmp[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,     \
                                    yoffset);                                \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);          \
      }                                                                      \
    } else if (xoffset == 4) {                                               \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);              \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * (h + padding)];                                     \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * (h + padding)];                                     \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    } else {                                                                 \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);   \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    }                                                                        \
  }

SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)

SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)

SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY

SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)

SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)

SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)

SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)

#endif  // !CONFIG_REALTIME_ONLY

#undef SUBPEL_VARIANCE_WXH_NEON
#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON

// Combine bilinear filter with aom_comp_avg_pred for blocks having width 4.
static void avg_pred_var_filter_block2d_bil_w4(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  // Bilinear taps: (8 - offset) on the first sample, offset on the second.
  const uint8x8_t filt0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t filt1 = vdup_n_u8(filter_offset);

  // Two 4-wide rows are packed per 8-lane vector, so each iteration emits
  // two output rows; dst_height is assumed even.
  int rows_remaining = dst_height;
  do {
    uint8x8_t rows_a = load_unaligned_u8(src_ptr, src_stride);
    uint8x8_t rows_b = load_unaligned_u8(src_ptr + pixel_step, src_stride);
    uint16x8_t weighted = vmull_u8(rows_a, filt0);
    weighted = vmlal_u8(weighted, rows_b, filt1);
    // Rounding narrow back to 8 bits: (weighted + 4) >> 3.
    uint8x8_t filtered = vrshrn_n_u16(weighted, 3);

    // Fold in the second prediction with a rounded halving add, then store.
    uint8x8_t pred = vld1_u8(second_pred);
    vst1_u8(dst_ptr, vrhadd_u8(filtered, pred));

    src_ptr += 2 * src_stride;
    dst_ptr += 2 * 4;
    second_pred += 2 * 4;
    rows_remaining -= 2;
  } while (rows_remaining != 0);
}

// Combine bilinear filter with aom_comp_avg_pred for blocks having width 8.
static void avg_pred_var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                               uint8_t *dst_ptr, int src_stride,
                                               int pixel_step, int dst_height,
                                               int filter_offset,
                                               const uint8_t *second_pred) {
  // Bilinear taps: (8 - offset) on the first sample, offset on the second.
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    uint8x8_t s0 = vld1_u8(src_ptr);
    uint8x8_t s1 = vld1_u8(src_ptr + pixel_step);
    uint16x8_t blend = vmull_u8(s0, f0);
    blend = vmlal_u8(blend, s1, f1);
    // Rounding narrow back to 8 bits: (blend + 4) >> 3.
    uint8x8_t blend_u8 = vrshrn_n_u16(blend, 3);

    // Rounded average with the second prediction.
    uint8x8_t p = vld1_u8(second_pred);
    uint8x8_t avg = vrhadd_u8(blend_u8, p);

    vst1_u8(dst_ptr, avg);

    src_ptr += src_stride;
    dst_ptr += 8;
    second_pred += 8;
    // Was `--i > 0`; use `!= 0` for consistency with every other row loop in
    // this file (identical behavior for the positive heights used here).
  } while (--i != 0);
}

// Combine bilinear filter with aom_comp_avg_pred for large blocks.
static void avg_pred_var_filter_block2d_bil_large(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_width, int dst_height, int filter_offset,
    const uint8_t *second_pred) {
  const uint8x8_t f0 = vdup_n_u8(8 - filter_offset);
  const uint8x8_t f1 = vdup_n_u8(filter_offset);

  int i = dst_height;
  do {
    int j = 0;
    do {
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      // Widen to 16 bits and blend the low and high halves separately.
      uint16x8_t blend_l = vmull_u8(vget_low_u8(s0), f0);
      blend_l = vmlal_u8(blend_l, vget_low_u8(s1), f1);
      uint16x8_t blend_h = vmull_u8(vget_high_u8(s0), f0);
      blend_h = vmlal_u8(blend_h, vget_high_u8(s1), f1);
      uint8x16_t blend_u8 =
          vcombine_u8(vrshrn_n_u16(blend_l, 3), vrshrn_n_u16(blend_h, 3));

      uint8x16_t p = vld1q_u8(second_pred);
      uint8x16_t avg = vrhaddq_u8(blend_u8, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Combine bilinear filter with aom_comp_avg_pred for blocks having width 16.
static void avg_pred_var_filter_block2d_bil_w16(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 16, dst_height,
                                        filter_offset, second_pred);
}

// Combine bilinear filter with aom_comp_avg_pred for blocks having width 32.
static void avg_pred_var_filter_block2d_bil_w32(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 32, dst_height,
                                        filter_offset, second_pred);
}

// Combine bilinear filter with aom_comp_avg_pred for blocks having width 64.
static void avg_pred_var_filter_block2d_bil_w64(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 64, dst_height,
                                        filter_offset, second_pred);
}

// Combine bilinear filter with aom_comp_avg_pred for blocks having width 128.
static void avg_pred_var_filter_block2d_bil_w128(
    const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride, int pixel_step,
    int dst_height, int filter_offset, const uint8_t *second_pred) {
  avg_pred_var_filter_block2d_bil_large(src_ptr, dst_ptr, src_stride,
                                        pixel_step, 128, dst_height,
                                        filter_offset, second_pred);
}

// Combine averaging subpel filter with aom_comp_avg_pred.
static void avg_pred_var_filter_block2d_avg(const uint8_t *src_ptr,
                                            uint8_t *dst_ptr, int src_stride,
                                            int pixel_step, int dst_width,
                                            int dst_height,
                                            const uint8_t *second_pred) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int i = dst_height;
  do {
    int j = 0;
    do {
      // Rounded average of the two filter taps (the filter_offset == 4 case),
      // then a second rounded average with the companion prediction.
      uint8x16_t s0 = vld1q_u8(src_ptr + j);
      uint8x16_t s1 = vld1q_u8(src_ptr + j + pixel_step);
      uint8x16_t avg = vrhaddq_u8(s0, s1);

      uint8x16_t p = vld1q_u8(second_pred);
      avg = vrhaddq_u8(avg, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      // second_pred is read contiguously, 16 bytes per 16 output pixels.
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Implementation of aom_comp_avg_pred for blocks having width >= 16.
static void avg_pred(const uint8_t *src_ptr, uint8_t *dst_ptr, int src_stride,
                     int dst_width, int dst_height,
                     const uint8_t *second_pred) {
  // We only specialise on the filter values for large block sizes (>= 16x16.)
  assert(dst_width >= 16 && dst_width % 16 == 0);

  int i = dst_height;
  do {
    int j = 0;
    do {
      // Rounded average of source and companion prediction, 16 pixels at a
      // time.
      uint8x16_t s = vld1q_u8(src_ptr + j);
      uint8x16_t p = vld1q_u8(second_pred);

      uint8x16_t avg = vrhaddq_u8(s, p);

      vst1q_u8(dst_ptr + j, avg);

      j += 16;
      second_pred += 16;
    } while (j < dst_width);

    src_ptr += src_stride;
    dst_ptr += dst_width;
  } while (--i != 0);
}

// Generic compound path: horizontal bilinear pass into tmp0, then a fused
// vertical pass + comp-avg into tmp1, then variance against the reference.
#define SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)                         \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon(                  \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,      \
      const uint8_t *ref, int ref_stride, uint32_t *sse,                    \
      const uint8_t *second_pred) {                                         \
    uint8_t tmp0[w * (h + padding)];                                        \
    uint8_t tmp1[w * h];                                                    \
    var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, (h + padding), \
                                xoffset);                                   \
    avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,      \
                                         second_pred);                      \
    return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);            \
  }

// Specialized compound path for large blocks: offsets 0 and 4 use the
// cheaper copy/average kernels; the comp-avg is fused into the last pass.
#define SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(w, h, padding)               \
  unsigned int aom_sub_pixel_avg_variance##w##x##h##_neon(                    \
      const uint8_t *src, int source_stride, int xoffset, int yoffset,        \
      const uint8_t *ref, int ref_stride, unsigned int *sse,                  \
      const uint8_t *second_pred) {                                           \
    if (xoffset == 0) {                                                       \
      uint8_t tmp[w * h];                                                     \
      if (yoffset == 0) {                                                     \
        avg_pred(src, tmp, source_stride, w, h, second_pred);                 \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      } else if (yoffset == 4) {                                              \
        avg_pred_var_filter_block2d_avg(src, tmp, source_stride,              \
                                        source_stride, w, h, second_pred);    \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      } else {                                                                \
        avg_pred_var_filter_block2d_bil_w##w(                                 \
            src, tmp, source_stride, source_stride, h, yoffset, second_pred); \
        return aom_variance##w##x##h(tmp, w, ref, ref_stride, sse);           \
      }                                                                       \
    } else if (xoffset == 4) {                                                \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        avg_pred_var_filter_block2d_avg(src, tmp0, source_stride, 1, w, h,    \
                                        second_pred);                         \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * (h + padding)];                                      \
        var_filter_block2d_avg(src, tmp0, source_stride, 1, w,                \
                               (h + padding));                                \
        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp1[w * (h + padding)];                                      \
        var_filter_block2d_avg(src, tmp0, source_stride, 1, w,                \
                               (h + padding));                                \
        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,    \
                                             second_pred);                    \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      }                                                                       \
    } else {                                                                  \
      uint8_t tmp0[w * (h + padding)];                                        \
      if (yoffset == 0) {                                                     \
        avg_pred_var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1, h,  \
                                             xoffset, second_pred);           \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);          \
      } else if (yoffset == 4) {                                              \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,              \
                                    (h + padding), xoffset);                  \
        avg_pred_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h, second_pred); \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      } else {                                                                \
        uint8_t tmp1[w * h];                                                  \
        var_filter_block2d_bil_w##w(src, tmp0, source_stride, 1,              \
                                    (h + padding), xoffset);                  \
        avg_pred_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset,    \
                                             second_pred);                    \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);          \
      }                                                                       \
    }                                                                         \
  }

SUBPEL_AVG_VARIANCE_WXH_NEON(4, 4, 2)
SUBPEL_AVG_VARIANCE_WXH_NEON(4, 8, 2)

SUBPEL_AVG_VARIANCE_WXH_NEON(8, 4, 1)
SUBPEL_AVG_VARIANCE_WXH_NEON(8, 8, 1)
SUBPEL_AVG_VARIANCE_WXH_NEON(8, 16, 1)

SUBPEL_AVG_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(128, 128, 1)

#if !CONFIG_REALTIME_ONLY

SUBPEL_AVG_VARIANCE_WXH_NEON(4, 16, 2)

SUBPEL_AVG_VARIANCE_WXH_NEON(8, 32, 1)

SUBPEL_AVG_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(32, 8, 1)

SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(64, 16, 1)

#endif  // !CONFIG_REALTIME_ONLY

#undef SUBPEL_AVG_VARIANCE_WXH_NEON
#undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON

#if !CONFIG_REALTIME_ONLY

// Generic OBMC path: two bilinear passes, then OBMC variance against the
// weighted source/mask.
#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                   \
  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon(            \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,    \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {   \
    uint8_t tmp0[w * (h + padding)];                                   \
    uint8_t tmp1[w * h];                                               \
    var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
                                xoffset);                              \
    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);         \
    return aom_obmc_variance##w##x##h(tmp1, w, wsrc, mask, sse);       \
  }

// Specialized OBMC path for large blocks: offsets 0 and 4 use the cheaper
// copy/average kernels.
#define SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)            \
  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon(                 \
      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,         \
      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {        \
    if (xoffset == 0) {                                                     \
      if (yoffset == 0) {                                                   \
        return aom_obmc_variance##w##x##h##_neon(pre, pre_stride, wsrc,     \
                                                 mask, sse);                \
      } else if (yoffset == 4) {                                            \
        uint8_t tmp[w * h];                                                 \
        var_filter_block2d_avg(pre, tmp, pre_stride, pre_stride, w, h);     \
        return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse);  \
      } else {                                                              \
        uint8_t tmp[w * h];                                                 \
        var_filter_block2d_bil_w##w(pre, tmp, pre_stride, pre_stride, h,    \
                                    yoffset);                               \
        return aom_obmc_variance##w##x##h##_neon(tmp, w, wsrc, mask, sse);  \
      }                                                                     \
    } else if (xoffset == 4) {                                              \
      uint8_t tmp0[w * (h + padding)];                                      \
      if (yoffset == 0) {                                                   \
        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h);             \
        return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \
      } else if (yoffset == 4) {                                            \
        uint8_t tmp1[w * (h + padding)];                                    \
        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding);   \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                     \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
      } else {                                                              \
        uint8_t tmp1[w * (h + padding)];                                    \
        var_filter_block2d_avg(pre, tmp0, pre_stride, 1, w, h + padding);   \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
      }                                                                     \
    } else {                                                                \
      uint8_t tmp0[w * (h + padding)];                                      \
      if (yoffset == 0) {                                                   \
        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h, xoffset);  \
        return aom_obmc_variance##w##x##h##_neon(tmp0, w, wsrc, mask, sse); \
      } else if (yoffset == 4) {                                            \
        uint8_t tmp1[w * h];                                                \
        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding,  \
                                    xoffset);                               \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                     \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
      } else {                                                              \
        uint8_t tmp1[w * h];                                                \
        var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding,  \
                                    xoffset);                               \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);          \
        return aom_obmc_variance##w##x##h##_neon(tmp1, w, wsrc, mask, sse); \
      }                                                                     \
    }                                                                       \
  }

OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)

OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)

OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)

SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)

#undef OBMC_SUBPEL_VARIANCE_WXH_NEON
#undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON
#endif  // !CONFIG_REALTIME_ONLY

// Generic masked path: two bilinear passes, then aom_comp_mask_pred_neon,
// then variance against the reference.
#define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                        \
  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,         \
      const uint8_t *msk, int msk_stride, int invert_mask,                    \
      unsigned int *sse) {                                                    \
    uint8_t tmp0[w * (h + padding)];                                          \
    uint8_t tmp1[w * h];                                                      \
    uint8_t tmp2[w * h];                                                      \
    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),      \
                                xoffset);                                     \
    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);                \
    aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,            \
                            msk_stride, invert_mask);                         \
    return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);              \
  }

// Specialized masked path for large blocks: offsets 0 and 4 use the cheaper
// copy/average kernels before the masked blend.
#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)           \
  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon(                \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,        \
      const uint8_t *msk, int msk_stride, int invert_mask,                   \
      unsigned int *sse) {                                                   \
    if (xoffset == 0) {                                                      \
      uint8_t tmp0[w * h];                                                   \
      if (yoffset == 0) {                                                    \
        aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride,    \
                                msk, msk_stride, invert_mask);               \
        return aom_variance##w##x##h(tmp0, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h);     \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,       \
                                msk_stride, invert_mask);                    \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h,    \
                                    yoffset);                                \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,       \
                                msk_stride, invert_mask);                    \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      }                                                                      \
    } else if (xoffset == 4) {                                               \
      uint8_t tmp0[w * (h + padding)];                                       \
      if (yoffset == 0) {                                                    \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);              \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,       \
                                msk_stride, invert_mask);                    \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp1[w * h];                                                 \
        uint8_t tmp2[w * h];                                                 \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,       \
                                msk_stride, invert_mask);                    \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp1[w * h];                                                 \
        uint8_t tmp2[w * h];                                                 \
        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));  \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,       \
                                msk_stride, invert_mask);                    \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);         \
      }                                                                      \
    } else {                                                                 \
      if (yoffset == 0) {                                                    \
        uint8_t tmp0[w * h];                                                 \
        uint8_t tmp1[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);   \
        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,       \
                                msk_stride, invert_mask);                    \
        return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
      } else if (yoffset == 4) {                                             \
        uint8_t tmp0[w * (h + padding)];                                     \
        uint8_t tmp1[w * h];                                                 \
        uint8_t tmp2[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                      \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,       \
                                msk_stride, invert_mask);                    \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);         \
      } else {                                                               \
        uint8_t tmp0[w * (h + padding)];                                     \
        uint8_t tmp1[w * (h + padding)];                                     \
        uint8_t tmp2[w * h];                                                 \
        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding), \
                                    xoffset);                                \
        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);           \
        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,       \
                                msk_stride, invert_mask);                    \
        return aom_variance##w##x##h(tmp2, w, ref, ref_stride, sse);         \
      }                                                                      \
    }                                                                        \
  }

MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)

MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)

MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
// Large masked block sizes use the specialized (offset-aware) path.
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)

SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)

SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)

SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)

// Realtime mode doesn't use 4x rectangular blocks.
#if !CONFIG_REALTIME_ONLY
MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
#endif  // !CONFIG_REALTIME_ONLY

#undef MASKED_SUBPEL_VARIANCE_WXH_NEON
#undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON