highbd_variance_sse2.c (38855B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <emmintrin.h> // SSE2 14 15 #include "config/aom_config.h" 16 #include "config/aom_dsp_rtcd.h" 17 18 #include "aom_dsp/x86/synonyms.h" 19 #include "aom_ports/mem.h" 20 21 #include "av1/common/filter.h" 22 #include "av1/common/reconinter.h" 23 24 typedef uint32_t (*high_variance_fn_t)(const uint16_t *src, int src_stride, 25 const uint16_t *ref, int ref_stride, 26 uint32_t *sse, int *sum); 27 28 uint32_t aom_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride, 29 const uint16_t *ref, int ref_stride, 30 uint32_t *sse, int *sum); 31 32 uint32_t aom_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride, 33 const uint16_t *ref, int ref_stride, 34 uint32_t *sse, int *sum); 35 36 static void highbd_8_variance_sse2(const uint16_t *src, int src_stride, 37 const uint16_t *ref, int ref_stride, int w, 38 int h, uint32_t *sse, int *sum, 39 high_variance_fn_t var_fn, int block_size) { 40 int i, j; 41 42 *sse = 0; 43 *sum = 0; 44 45 for (i = 0; i < h; i += block_size) { 46 for (j = 0; j < w; j += block_size) { 47 unsigned int sse0; 48 int sum0; 49 var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, 50 ref_stride, &sse0, &sum0); 51 *sse += sse0; 52 *sum += sum0; 53 } 54 } 55 } 56 57 static void highbd_10_variance_sse2(const uint16_t *src, int src_stride, 58 const uint16_t *ref, int ref_stride, int w, 59 int h, uint32_t *sse, int *sum, 60 high_variance_fn_t var_fn, int block_size) 
{ 61 int i, j; 62 uint64_t sse_long = 0; 63 int32_t sum_long = 0; 64 65 for (i = 0; i < h; i += block_size) { 66 for (j = 0; j < w; j += block_size) { 67 unsigned int sse0; 68 int sum0; 69 var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, 70 ref_stride, &sse0, &sum0); 71 sse_long += sse0; 72 sum_long += sum0; 73 } 74 } 75 *sum = ROUND_POWER_OF_TWO(sum_long, 2); 76 *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); 77 } 78 79 static void highbd_12_variance_sse2(const uint16_t *src, int src_stride, 80 const uint16_t *ref, int ref_stride, int w, 81 int h, uint32_t *sse, int *sum, 82 high_variance_fn_t var_fn, int block_size) { 83 int i, j; 84 uint64_t sse_long = 0; 85 int32_t sum_long = 0; 86 87 for (i = 0; i < h; i += block_size) { 88 for (j = 0; j < w; j += block_size) { 89 unsigned int sse0; 90 int sum0; 91 var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j, 92 ref_stride, &sse0, &sum0); 93 sse_long += sse0; 94 sum_long += sum0; 95 } 96 } 97 *sum = ROUND_POWER_OF_TWO(sum_long, 4); 98 *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); 99 } 100 101 #define VAR_FN(w, h, block_size, shift) \ 102 uint32_t aom_highbd_8_variance##w##x##h##_sse2( \ 103 const uint8_t *src8, int src_stride, const uint8_t *ref8, \ 104 int ref_stride, uint32_t *sse) { \ 105 int sum; \ 106 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 107 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ 108 highbd_8_variance_sse2( \ 109 src, src_stride, ref, ref_stride, w, h, sse, &sum, \ 110 aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ 111 return *sse - (uint32_t)(((int64_t)sum * sum) >> shift); \ 112 } \ 113 \ 114 uint32_t aom_highbd_10_variance##w##x##h##_sse2( \ 115 const uint8_t *src8, int src_stride, const uint8_t *ref8, \ 116 int ref_stride, uint32_t *sse) { \ 117 int sum; \ 118 int64_t var; \ 119 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 120 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ 121 highbd_10_variance_sse2( \ 122 src, src_stride, ref, 
ref_stride, w, h, sse, &sum, \ 123 aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ 124 var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ 125 return (var >= 0) ? (uint32_t)var : 0; \ 126 } \ 127 \ 128 uint32_t aom_highbd_12_variance##w##x##h##_sse2( \ 129 const uint8_t *src8, int src_stride, const uint8_t *ref8, \ 130 int ref_stride, uint32_t *sse) { \ 131 int sum; \ 132 int64_t var; \ 133 uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ 134 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ 135 highbd_12_variance_sse2( \ 136 src, src_stride, ref, ref_stride, w, h, sse, &sum, \ 137 aom_highbd_calc##block_size##x##block_size##var_sse2, block_size); \ 138 var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ 139 return (var >= 0) ? (uint32_t)var : 0; \ 140 } 141 142 VAR_FN(128, 128, 16, 14) 143 VAR_FN(128, 64, 16, 13) 144 VAR_FN(64, 128, 16, 13) 145 VAR_FN(64, 64, 16, 12) 146 VAR_FN(64, 32, 16, 11) 147 VAR_FN(32, 64, 16, 11) 148 VAR_FN(32, 32, 16, 10) 149 VAR_FN(32, 16, 16, 9) 150 VAR_FN(16, 32, 16, 9) 151 VAR_FN(16, 16, 16, 8) 152 VAR_FN(16, 8, 8, 7) 153 VAR_FN(8, 16, 8, 7) 154 VAR_FN(8, 8, 8, 6) 155 156 #if !CONFIG_REALTIME_ONLY 157 VAR_FN(8, 32, 8, 8) 158 VAR_FN(32, 8, 8, 8) 159 VAR_FN(16, 64, 16, 10) 160 VAR_FN(64, 16, 16, 10) 161 #endif // !CONFIG_REALTIME_ONLY 162 163 #undef VAR_FN 164 165 unsigned int aom_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride, 166 const uint8_t *ref8, int ref_stride, 167 unsigned int *sse) { 168 int sum; 169 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 170 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 171 highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, 172 aom_highbd_calc16x16var_sse2, 16); 173 return *sse; 174 } 175 176 unsigned int aom_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride, 177 const uint8_t *ref8, int ref_stride, 178 unsigned int *sse) { 179 int sum; 180 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 181 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 182 
highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, 183 aom_highbd_calc16x16var_sse2, 16); 184 return *sse; 185 } 186 187 unsigned int aom_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride, 188 const uint8_t *ref8, int ref_stride, 189 unsigned int *sse) { 190 int sum; 191 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 192 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 193 highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum, 194 aom_highbd_calc16x16var_sse2, 16); 195 return *sse; 196 } 197 198 unsigned int aom_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride, 199 const uint8_t *ref8, int ref_stride, 200 unsigned int *sse) { 201 int sum; 202 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 203 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 204 highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, 205 aom_highbd_calc8x8var_sse2, 8); 206 return *sse; 207 } 208 209 unsigned int aom_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride, 210 const uint8_t *ref8, int ref_stride, 211 unsigned int *sse) { 212 int sum; 213 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 214 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 215 highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, 216 aom_highbd_calc8x8var_sse2, 8); 217 return *sse; 218 } 219 220 unsigned int aom_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride, 221 const uint8_t *ref8, int ref_stride, 222 unsigned int *sse) { 223 int sum; 224 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 225 uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); 226 highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8, sse, &sum, 227 aom_highbd_calc8x8var_sse2, 8); 228 return *sse; 229 } 230 231 // The 2 unused parameters are place holders for PIC enabled build. 
// These definitions are for functions defined in
// highbd_subpel_variance_impl_sse2.asm
#define DECL(w, opt)                                                         \
  int aom_highbd_sub_pixel_variance##w##xh_##opt(                            \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, int height,                 \
      unsigned int *sse, void *unused0, void *unused);
#define DECLS(opt) \
  DECL(8, opt)     \
  DECL(16, opt)

DECLS(sse2)

#undef DECLS
#undef DECL

// Emits aom_highbd_{8,10,12}_sub_pixel_variance<w>x<h>_<opt>().
// Each wrapper splits the block into wf-wide columns (and, for w > 64,
// two 64-wide halves via row_rep) handled by the asm kernel, then combines
// the partial se/sse.  wlog2/hlog2 are log2(w)/log2(h).  The 10/12-bit
// variants renormalize to the 8-bit scale and clamp negative results to 0;
// the 12-bit variant additionally processes at most 16 rows per kernel
// call so the kernel's 32-bit sse cannot overflow.
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t aom_highbd_8_sub_pixel_variance##w##x##h##_##opt(                   \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int se = 0;                                                                \
    unsigned int sse = 0;                                                      \
    unsigned int sse2;                                                         \
    int row_rep = (w > 64) ? 2 : 1;                                            \
    for (int wd_64 = 0; wd_64 < row_rep; wd_64++) {                            \
      src += wd_64 * 64;                                                       \
      dst += wd_64 * 64;                                                       \
      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse2,      \
          NULL, NULL);                                                         \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf) {                                                            \
        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        if (w > wf * 2) {                                                      \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,      \
              dst_stride, h, &sse2, NULL, NULL);                               \
          se += se2;                                                           \
          sse += sse2;                                                         \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,      \
              dst_stride, h, &sse2, NULL, NULL);                               \
          se += se2;                                                           \
          sse += sse2;                                                         \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int se = 0;                                                                \
    int row_rep = (w > 64) ? 2 : 1;                                            \
    for (int wd_64 = 0; wd_64 < row_rep; wd_64++) {                            \
      src += wd_64 * 64;                                                       \
      dst += wd_64 * 64;                                                       \
      int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
          src, src_stride, x_offset, y_offset, dst, dst_stride, h, &sse, NULL, \
          NULL);                                                               \
      se += se2;                                                               \
      long_sse += sse;                                                         \
      if (w > wf) {                                                            \
        uint32_t sse2;                                                         \
        se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                     \
            src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride, h, \
            &sse2, NULL, NULL);                                                \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,      \
              dst_stride, h, &sse2, NULL, NULL);                               \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,      \
              dst_stride, h, &sse2, NULL, NULL);                               \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 4);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_sub_pixel_variance##w##x##h##_##opt(                  \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) {                \
    int start_row;                                                             \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    int64_t var;                                                               \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    int row_rep = (w > 64) ? 2 : 1;                                            \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      uint16_t *src_tmp = src + (start_row * src_stride);                      \
      uint16_t *dst_tmp = dst + (start_row * dst_stride);                      \
      for (int wd_64 = 0; wd_64 < row_rep; wd_64++) {                          \
        src_tmp += wd_64 * 64;                                                 \
        dst_tmp += wd_64 * 64;                                                 \
        int se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
            src_tmp, src_stride, x_offset, y_offset, dst_tmp, dst_stride,      \
            height, &sse2, NULL, NULL);                                        \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf) {                                                          \
          se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                   \
              src_tmp + wf, src_stride, x_offset, y_offset, dst_tmp + wf,      \
              dst_stride, height, &sse2, NULL, NULL);                          \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          if (w > wf * 2) {                                                    \
            se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
                src_tmp + 2 * wf, src_stride, x_offset, y_offset,              \
                dst_tmp + 2 * wf, dst_stride, height, &sse2, NULL, NULL);      \
            se += se2;                                                         \
            long_sse += sse2;                                                  \
            se2 = aom_highbd_sub_pixel_variance##wf##xh_##opt(                 \
                src_tmp + 3 * wf, src_stride, x_offset, y_offset,              \
                dst_tmp + 3 * wf, dst_stride, height, &sse2, NULL, NULL);      \
            se += se2;                                                         \
            long_sse += sse2;                                                  \
          }                                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

#if CONFIG_REALTIME_ONLY
#define FNS(opt)                         \
  FN(128, 128, 16, 7, 7, opt, (int64_t)) \
  FN(128, 64, 16, 7, 6, opt, (int64_t))  \
  FN(64, 128, 16, 6, 7, opt, (int64_t))  \
  FN(64, 64, 16, 6, 6, opt, (int64_t))   \
  FN(64, 32, 16, 6, 5, opt, (int64_t))   \
  FN(32, 64, 16, 5, 6, opt, (int64_t))   \
  FN(32, 32, 16, 5, 5, opt, (int64_t))   \
  FN(32, 16, 16, 5, 4, opt, (int64_t))   \
  FN(16, 32, 16, 4, 5, opt, (int64_t))   \
  FN(16, 16, 16, 4, 4, opt, (int64_t))   \
  FN(16, 8, 16, 4, 3, opt, (int64_t))    \
  FN(8, 16, 8, 3, 4, opt, (int64_t))     \
  FN(8, 8, 8, 3, 3, opt, (int64_t))      \
  FN(8, 4, 8, 3, 2, opt, (int64_t))
#else  // !CONFIG_REALTIME_ONLY
#define FNS(opt)                         \
  FN(128, 128, 16, 7, 7, opt, (int64_t)) \
  FN(128, 64, 16, 7, 6, opt, (int64_t))  \
  FN(64, 128, 16, 6, 7, opt, (int64_t))  \
  FN(64, 64, 16, 6, 6, opt, (int64_t))   \
  FN(64, 32, 16, 6, 5, opt, (int64_t))   \
  FN(32, 64, 16, 5, 6, opt, (int64_t))   \
  FN(32, 32, 16, 5, 5, opt, (int64_t))   \
  FN(32, 16, 16, 5, 4, opt, (int64_t))   \
  FN(16, 32, 16, 4, 5, opt, (int64_t))   \
  FN(16, 16, 16, 4, 4, opt, (int64_t))   \
  FN(16, 8, 16, 4, 3, opt, (int64_t))    \
  FN(8, 16, 8, 3, 4, opt, (int64_t))     \
  FN(8, 8, 8, 3, 3, opt, (int64_t))      \
  FN(8, 4, 8, 3, 2, opt, (int64_t))      \
  FN(16, 4, 16, 4, 2, opt, (int64_t))    \
  FN(8, 32, 8, 3, 5, opt, (int64_t))     \
  FN(32, 8, 16, 5, 3, opt, (int64_t))    \
  FN(16, 64, 16, 4, 6, opt, (int64_t))   \
  FN(64, 16, 16, 6, 4, opt, (int64_t))
#endif  // CONFIG_REALTIME_ONLY

FNS(sse2)

#undef FNS
#undef FN

// The 2 unused parameters are place holders for PIC enabled build.
#define DECL(w, opt)                                                         \
  int aom_highbd_sub_pixel_avg_variance##w##xh_##opt(                        \
      const uint16_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
      const uint16_t *dst, ptrdiff_t dst_stride, const uint16_t *sec,        \
      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,    \
      void *unused);
#define DECLS(opt) \
  DECL(16, opt)    \
  DECL(8, opt)

DECLS(sse2)
#undef DECL
#undef DECLS

// Emits aom_highbd_{8,10,12}_sub_pixel_avg_variance<w>x<h>_<opt>().
// Same column-splitting scheme as the non-avg wrappers; `sec` is the
// second prediction, passed with stride w.  The 12-bit variant limits
// each kernel call to at most 16 rows to keep the 32-bit partial sse
// from overflowing.
#define FN(w, h, wf, wlog2, hlog2, opt, cast)                                  \
  uint32_t aom_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt(               \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride,      \
          sec + wf, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,        \
            dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL);                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,        \
            dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL);                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    *sse_ptr = sse;                                                            \
    return sse - (uint32_t)((cast se * se) >> (wlog2 + hlog2));                \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    int se = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                  \
        src, src_stride, x_offset, y_offset, dst, dst_stride, sec, w, h, &sse, \
        NULL, NULL);                                                           \
    if (w > wf) {                                                              \
      uint32_t sse2;                                                           \
      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + wf, src_stride, x_offset, y_offset, dst + wf, dst_stride,      \
          sec + wf, w, h, &sse2, NULL, NULL);                                  \
      se += se2;                                                               \
      sse += sse2;                                                             \
      if (w > wf * 2) {                                                        \
        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 2 * wf, src_stride, x_offset, y_offset, dst + 2 * wf,        \
            dst_stride, sec + 2 * wf, w, h, &sse2, NULL, NULL);                \
        se += se2;                                                             \
        sse += sse2;                                                           \
        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + 3 * wf, src_stride, x_offset, y_offset, dst + 3 * wf,        \
            dst_stride, sec + 3 * wf, w, h, &sse2, NULL, NULL);                \
        se += se2;                                                             \
        sse += sse2;                                                           \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 2);                                            \
    sse = ROUND_POWER_OF_TWO(sse, 4);                                          \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }                                                                            \
                                                                               \
  uint32_t aom_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt(              \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset,         \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr,                  \
      const uint8_t *sec8) {                                                   \
    int start_row;                                                             \
    int64_t var;                                                               \
    uint32_t sse;                                                              \
    int se = 0;                                                                \
    uint64_t long_sse = 0;                                                     \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                 \
    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);                                 \
    uint16_t *sec = CONVERT_TO_SHORTPTR(sec8);                                 \
    for (start_row = 0; start_row < h; start_row += 16) {                      \
      uint32_t sse2;                                                           \
      int height = h - start_row < 16 ? h - start_row : 16;                    \
      int se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
          src + (start_row * src_stride), src_stride, x_offset, y_offset,      \
          dst + (start_row * dst_stride), dst_stride, sec + (start_row * w),   \
          w, height, &sse2, NULL, NULL);                                       \
      se += se2;                                                               \
      long_sse += sse2;                                                        \
      if (w > wf) {                                                            \
        se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(                 \
            src + wf + (start_row * src_stride), src_stride, x_offset,         \
            y_offset, dst + wf + (start_row * dst_stride), dst_stride,         \
            sec + wf + (start_row * w), w, height, &sse2, NULL, NULL);         \
        se += se2;                                                             \
        long_sse += sse2;                                                      \
        if (w > wf * 2) {                                                      \
          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
              src + 2 * wf + (start_row * src_stride), src_stride, x_offset,   \
              y_offset, dst + 2 * wf + (start_row * dst_stride), dst_stride,   \
              sec + 2 * wf + (start_row * w), w, height, &sse2, NULL, NULL);   \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
          se2 = aom_highbd_sub_pixel_avg_variance##wf##xh_##opt(               \
              src + 3 * wf + (start_row * src_stride), src_stride, x_offset,   \
              y_offset, dst + 3 * wf + (start_row * dst_stride), dst_stride,   \
              sec + 3 * wf + (start_row * w), w, height, &sse2, NULL, NULL);   \
          se += se2;                                                           \
          long_sse += sse2;                                                    \
        }                                                                      \
      }                                                                        \
    }                                                                          \
    se = ROUND_POWER_OF_TWO(se, 4);                                            \
    sse = (uint32_t)ROUND_POWER_OF_TWO(long_sse, 8);                           \
    *sse_ptr = sse;                                                            \
    var = (int64_t)(sse) - ((cast se * se) >> (wlog2 + hlog2));                \
    return (var >= 0) ? (uint32_t)var : 0;                                     \
  }

#if CONFIG_REALTIME_ONLY
#define FNS(opt)                        \
  FN(64, 64, 16, 6, 6, opt, (int64_t))  \
  FN(64, 32, 16, 6, 5, opt, (int64_t))  \
  FN(32, 64, 16, 5, 6, opt, (int64_t))  \
  FN(32, 32, 16, 5, 5, opt, (int64_t))  \
  FN(32, 16, 16, 5, 4, opt, (int64_t))  \
  FN(16, 32, 16, 4, 5, opt, (int64_t))  \
  FN(16, 16, 16, 4, 4, opt, (int64_t))  \
  FN(16, 8, 16, 4, 3, opt, (int64_t))   \
  FN(8, 16, 8, 3, 4, opt, (int64_t))    \
  FN(8, 8, 8, 3, 3, opt, (int64_t))     \
  FN(8, 4, 8, 3, 2, opt, (int64_t))
#else  // !CONFIG_REALTIME_ONLY
#define FNS(opt)                        \
  FN(64, 64, 16, 6, 6, opt, (int64_t))  \
  FN(64, 32, 16, 6, 5, opt, (int64_t))  \
  FN(32, 64, 16, 5, 6, opt, (int64_t))  \
  FN(32, 32, 16, 5, 5, opt, (int64_t))  \
  FN(32, 16, 16, 5, 4, opt, (int64_t))  \
  FN(16, 32, 16, 4, 5, opt, (int64_t))  \
  FN(16, 16, 16, 4, 4, opt, (int64_t))  \
  FN(16, 8, 16, 4, 3, opt, (int64_t))   \
  FN(8, 16, 8, 3, 4, opt, (int64_t))    \
  FN(8, 8, 8, 3, 3, opt, (int64_t))     \
  FN(8, 4, 8, 3, 2, opt, (int64_t))     \
  FN(16, 4, 16, 4, 2, opt, (int64_t))   \
  FN(8, 32, 8, 3, 5, opt, (int64_t))    \
  FN(32, 8, 16, 5, 3, opt, (int64_t))   \
  FN(16, 64, 16, 4, 6, opt, (int64_t))  \
  FN(64, 16, 16, 6, 4, opt, (int64_t))
#endif  // CONFIG_REALTIME_ONLY

FNS(sse2)

#undef FNS
#undef FN

// Sum of squared differences (src - dst)^2 over a 4-wide by h-tall block,
// processing two rows (8 pixels) per iteration.  The 16-bit differences are
// zero-extended into 32-bit lanes so _mm_madd_epi16 produces diff * diff
// (the zero high half contributes nothing), then widened to 64-bit lanes
// for accumulation.
static uint64_t mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
                                          uint16_t *src, int sstride, int h) {
  uint64_t sum = 0;
  __m128i reg0_4x16, reg1_4x16;
  __m128i src_8x16;
  __m128i dst_8x16;
  __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
  __m128i sub_result_8x16;
  const __m128i zeros = _mm_setzero_si128();
  __m128i square_result = _mm_setzero_si128();
  for (int i = 0; i < h; i += 2) {
    // Pack two 4-pixel rows into one 8x16 register.
    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
    dst_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);

    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
    src_8x16 = _mm_unpacklo_epi64(reg0_4x16, reg1_4x16);

    sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);

    res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
    res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);

    res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
    res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);

    res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
    res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
    res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
    res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);

    square_result = _mm_add_epi64(
        square_result,
        _mm_add_epi64(
            _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
            res3_4x64));
  }

  // Horizontal add of the two 64-bit accumulator lanes.
  const __m128i sum_1x64 =
      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
  xx_storel_64(&sum, sum_1x64);
  return sum;
}

// Same as mse_4xh_16bit_highbd_sse2 but for 8-wide blocks, one row
// (8 pixels) per iteration.
static uint64_t mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride,
                                          uint16_t *src, int sstride, int h) {
  uint64_t sum = 0;
  __m128i src_8x16;
  __m128i dst_8x16;
  __m128i res0_4x32, res1_4x32, res0_4x64, res1_4x64, res2_4x64, res3_4x64;
  __m128i sub_result_8x16;
  const __m128i zeros = _mm_setzero_si128();
  __m128i square_result = _mm_setzero_si128();

  for (int i = 0; i < h; i++) {
    dst_8x16 = _mm_loadu_si128((__m128i *)&dst[i * dstride]);
    src_8x16 = _mm_loadu_si128((__m128i *)&src[i * sstride]);

    sub_result_8x16 = _mm_sub_epi16(src_8x16, dst_8x16);

    res0_4x32 = _mm_unpacklo_epi16(sub_result_8x16, zeros);
    res1_4x32 = _mm_unpackhi_epi16(sub_result_8x16, zeros);

    res0_4x32 = _mm_madd_epi16(res0_4x32, res0_4x32);
    res1_4x32 = _mm_madd_epi16(res1_4x32, res1_4x32);

    res0_4x64 = _mm_unpacklo_epi32(res0_4x32, zeros);
    res1_4x64 = _mm_unpackhi_epi32(res0_4x32, zeros);
    res2_4x64 = _mm_unpacklo_epi32(res1_4x32, zeros);
    res3_4x64 = _mm_unpackhi_epi32(res1_4x32, zeros);

    square_result = _mm_add_epi64(
        square_result,
        _mm_add_epi64(
            _mm_add_epi64(_mm_add_epi64(res0_4x64, res1_4x64), res2_4x64),
            res3_4x64));
  }

  const __m128i sum_1x64 =
      _mm_add_epi64(square_result, _mm_srli_si128(square_result, 8));
  xx_storel_64(&sum, sum_1x64);
  return sum;
}

// Dispatches to the 4- or 8-wide SSD kernel.  Only w,h in {4,8} are
// supported (enforced by the assert).
uint64_t aom_mse_wxh_16bit_highbd_sse2(uint16_t *dst, int dstride,
                                       uint16_t *src, int sstride, int w,
                                       int h) {
  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
         "w=8/4 and h=8/4 must satisfy");
  switch (w) {
    case 4: return mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
    case 8: return mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h);
    default: assert(0 && "unsupported width"); return -1;
  }
}