/*
 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/synonyms.h"

typedef void (*high_variance_fn_t)(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   uint32_t *sse, int *sum);
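
/*
 * Sub-pixel variance kernel for the 10-bit sub_pixel_variance functions
 * below. The source block is interpolated with the 2-tap bilinear filters
 * selected by xoffset/yoffset (eighth-pel units, 0..7), then the sum and SSE
 * of the difference against dst are accumulated. Offsets 0 and 4 get
 * specialized paths: offset 0 needs no filtering at all, and offset 4
 * (half-pel, taps 64/64) reduces to a rounded average, since
 * ((64 * a + 64 * b + 64) >> 7) == ((a + b + 1) >> 1) == _mm256_avg_epu16.
 * The block is walked in vertical stripes 16 pixels wide, so output_width
 * must be a multiple of 16 (8-wide blocks are routed to SSE2 kernels
 * instead; see HIGHBD_SUBPIX_VAR below); `inc` shortens the inner row loop
 * (16 / inc iterations) for 4- and 8-row blocks.
 */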
static uint32_t aom_highbd_var_filter_block2d_bil_avx2(
    const uint8_t *src_ptr8, unsigned int src_pixels_per_line, int pixel_step,
    unsigned int output_height, unsigned int output_width,
    const uint32_t xoffset, const uint32_t yoffset, const uint8_t *dst_ptr8,
    int dst_stride, uint32_t *sse) {
  // Each filter is a pair of 16-bit taps packed into one 32-bit lane, so a
  // single _mm256_madd_epi16 applies both taps at once.
  const __m256i filter1 =
      _mm256_set1_epi32((int)(bilinear_filters_2t[xoffset][1] << 16) |
                        bilinear_filters_2t[xoffset][0]);
  const __m256i filter2 =
      _mm256_set1_epi32((int)(bilinear_filters_2t[yoffset][1] << 16) |
                        bilinear_filters_2t[yoffset][0]);
  const __m256i one = _mm256_set1_epi16(1);
  // Rounding bias for the filter: 64 == 1 << (FILTER_BITS - 1), applied
  // before the >> 7 shifts below.
  const int bitshift = 0x40;
  (void)pixel_step;
  unsigned int i, j, prev = 0, curr = 2;
  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
  uint16_t *dst_ptr = CONVERT_TO_SHORTPTR(dst_ptr8);
  uint16_t *src_ptr_ref = src_ptr;
  uint16_t *dst_ptr_ref = dst_ptr;
  int64_t sum_long = 0;
  uint64_t sse_long = 0;
  unsigned int rshift = 0, inc = 1;
  __m256i rbias = _mm256_set1_epi32(bitshift);
  // Ping-pong buffer holding the previously processed row for the vertical
  // pass; only indices 0 and 1 are used.
  __m256i opointer[8];
  unsigned int range;
  if (xoffset == 0) {
    if (yoffset == 0) {  // xoffset==0 && yoffset==0
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;
        }
        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();
        for (i = 0; i < 16 / inc; ++i) {
          __m256i V_S_SRC = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;
          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }
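
        // Cross-lane reduction (repeated after each stripe below): widen the
        // 16-bit row sums to 32 bits by multiply-adding against 1, interleave
        // them with the 32-bit SSE lanes, then fold 256 -> 128 -> 64 bits so
        // lane 0 holds the stripe sum and lane 1 the stripe SSE. Scalar
        // equivalent, with d[] the per-pixel differences of the stripe:
        //   for (k = 0; k < n; ++k) { sum_long += d[k]; sse_long += d[k] * d[k]; }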
        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else if (yoffset == 4) {  // xoffset==0 && yoffset==4
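      // Vertical half-pel only: average each pair of adjacent rows. The
      // previous row is kept in the opointer[] ping-pong buffer so every
      // source row is loaded exactly once.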
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;

          __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;
          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else {  // xoffset==0 && yoffset==1,2,3,5,6,7
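      // General vertical taps: interleave the previous and current rows so
      // each 32-bit lane holds a (prev, curr) pixel pair, multiply-accumulate
      // against the packed (tap0, tap1) filter, round with the +64 bias and
      // shift by FILTER_BITS (7), then repack the 32-bit results to 16 bits.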
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          opointer[0] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          opointer[curr] = _mm256_loadu_si256((const __m256i *)src_ptr);
          src_ptr += src_pixels_per_line;

          __m256i V_S_M1 =
              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
          __m256i V_S_M2 =
              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

          __m256i V_S_S1 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
          __m256i V_S_S2 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);
    }
  } else if (xoffset == 4) {
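    // Horizontal half-pel: each incoming row is pre-averaged with its copy
    // shifted one pixel to the right (the unaligned load at src_ptr + 1)
    // before any vertical processing.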
    if (yoffset == 0) {  // xoffset==4 && yoffset==0
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;
          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;

          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);

          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;

          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);

          __m256i V_S_M1 =
              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
          __m256i V_S_M2 =
              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

          __m256i V_S_S1 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
          __m256i V_S_S2 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else if (yoffset == 4) {  // xoffset==4 && yoffset==4
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);
          __m256i V_S_SRC = _mm256_avg_epu16(opointer[curr], opointer[prev]);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;
          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);
          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);

    } else {  // xoffset==4 && yoffset==1,2,3,5,6,7
      range = output_width / 16;
      if (output_height == 8) inc = 2;
      if (output_height == 4) inc = 4;
      for (j = 0; j < range * output_height * inc / 16; j++) {
        if (j % (output_height * inc / 16) == 0) {
          src_ptr = src_ptr_ref;
          src_ptr_ref += 16;
          dst_ptr = dst_ptr_ref;
          dst_ptr_ref += 16;

          __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[0] = _mm256_avg_epu16(V_H_D1, V_H_D2);
          curr = 0;
        }

        __m256i sum1 = _mm256_setzero_si256();
        __m256i sse1 = _mm256_setzero_si256();

        for (i = 0; i < 16 / inc; ++i) {
          prev = curr;
          curr = (curr == 0) ? 1 : 0;
          __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
          __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
          src_ptr += src_pixels_per_line;
          opointer[curr] = _mm256_avg_epu16(V_V_D1, V_V_D2);

          __m256i V_S_M1 =
              _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
          __m256i V_S_M2 =
              _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

          __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
          __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

          __m256i V_S_S1 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
          __m256i V_S_S2 =
              _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

          __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

          __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
          dst_ptr += dst_stride;

          __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
          __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

          sum1 = _mm256_add_epi16(sum1, V_R_SUB);
          sse1 = _mm256_add_epi32(sse1, V_R_MAD);
        }

        __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
        __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
        __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
        __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
        const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
        const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
        __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
        v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
        sum_long += _mm_extract_epi32(v_d, 0);
        sse_long += _mm_extract_epi32(v_d, 1);
      }

      rshift = get_msb(output_height) + get_msb(output_width);
    }
  } else if (yoffset == 0) {  // xoffset==1,2,3,5,6,7 && yoffset==0
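    // Horizontal taps only: filter each row with filter1 and compare the
    // result directly against dst; no vertical pass is needed.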
    range = output_width / 16;
    if (output_height == 8) inc = 2;
    if (output_height == 4) inc = 4;
    for (j = 0; j < range * output_height * inc / 16; j++) {
      if (j % (output_height * inc / 16) == 0) {
        src_ptr = src_ptr_ref;
        src_ptr_ref += 16;
        dst_ptr = dst_ptr_ref;
        dst_ptr_ref += 16;

        curr = 0;
      }

      __m256i sum1 = _mm256_setzero_si256();
      __m256i sse1 = _mm256_setzero_si256();

      for (i = 0; i < 16 / inc; ++i) {
        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;
        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
        __m256i V_V_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
        __m256i V_V_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);

        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
        dst_ptr += dst_stride;
        __m256i V_R_SUB = _mm256_sub_epi16(opointer[curr], V_D_DST);
        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
      }

      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
      sum_long += _mm_extract_epi32(v_d, 0);
      sse_long += _mm_extract_epi32(v_d, 1);
    }

    rshift = get_msb(output_height) + get_msb(output_width);

  } else if (yoffset == 4) {  // xoffset==1,2,3,5,6,7 && yoffset==4

    range = output_width / 16;
    if (output_height == 8) inc = 2;
    if (output_height == 4) inc = 4;
    for (j = 0; j < range * output_height * inc / 16; j++) {
      if (j % (output_height * inc / 16) == 0) {
        src_ptr = src_ptr_ref;
        src_ptr_ref += 16;
        dst_ptr = dst_ptr_ref;
        dst_ptr_ref += 16;

        __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;

        __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
        __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);

        __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
        __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);

        __m256i V_H_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
        __m256i V_H_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);

        opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);

        curr = 0;
      }

      __m256i sum1 = _mm256_setzero_si256();
      __m256i sse1 = _mm256_setzero_si256();

      for (i = 0; i < 16 / inc; ++i) {
        prev = curr;
        curr = (curr == 0) ? 1 : 0;
        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;
        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
        __m256i V_V_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
        __m256i V_V_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);

        __m256i V_S_SRC = _mm256_avg_epu16(opointer[prev], opointer[curr]);

        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
        dst_ptr += dst_stride;

        __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
      }

      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
      sum_long += _mm_extract_epi32(v_d, 0);
      sse_long += _mm_extract_epi32(v_d, 1);
    }

    rshift = get_msb(output_height) + get_msb(output_width);

  } else {  // xoffset==1,2,3,5,6,7 && yoffset==1,2,3,5,6,7
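    // Fully general case: every new row is filtered horizontally with
    // filter1 into the ping-pong buffer, then the previous and current
    // filtered rows are combined vertically with filter2.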
    range = output_width / 16;
    if (output_height == 8) inc = 2;
    if (output_height == 4) inc = 4;
    unsigned int nloop = 16 / inc;
    for (j = 0; j < range * output_height * inc / 16; j++) {
      if (j % (output_height * inc / 16) == 0) {
        src_ptr = src_ptr_ref;
        src_ptr_ref += 16;
        dst_ptr = dst_ptr_ref;
        dst_ptr_ref += 16;

        __m256i V_H_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_H_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;

        __m256i V_H_M1 = _mm256_unpacklo_epi16(V_H_D1, V_H_D2);
        __m256i V_H_M2 = _mm256_unpackhi_epi16(V_H_D1, V_H_D2);

        __m256i V_H_MAD1 = _mm256_madd_epi16(V_H_M1, filter1);
        __m256i V_H_MAD2 = _mm256_madd_epi16(V_H_M2, filter1);

        __m256i V_H_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD1, rbias), 7);
        __m256i V_H_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_H_MAD2, rbias), 7);

        opointer[0] = _mm256_packus_epi32(V_H_S1, V_H_S2);

        curr = 0;
      }

      __m256i sum1 = _mm256_setzero_si256();
      __m256i sse1 = _mm256_setzero_si256();

      for (i = 0; i < nloop; ++i) {
        prev = curr;
        curr = !curr;
        __m256i V_V_D1 = _mm256_loadu_si256((const __m256i *)src_ptr);
        __m256i V_V_D2 = _mm256_loadu_si256((const __m256i *)(src_ptr + 1));
        src_ptr += src_pixels_per_line;
        __m256i V_V_M1 = _mm256_unpacklo_epi16(V_V_D1, V_V_D2);
        __m256i V_V_M2 = _mm256_unpackhi_epi16(V_V_D1, V_V_D2);
        __m256i V_V_MAD1 = _mm256_madd_epi16(V_V_M1, filter1);
        __m256i V_V_MAD2 = _mm256_madd_epi16(V_V_M2, filter1);
        __m256i V_V_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD1, rbias), 7);
        __m256i V_V_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_V_MAD2, rbias), 7);
        opointer[curr] = _mm256_packus_epi32(V_V_S1, V_V_S2);

        __m256i V_S_M1 = _mm256_unpacklo_epi16(opointer[prev], opointer[curr]);
        __m256i V_S_M2 = _mm256_unpackhi_epi16(opointer[prev], opointer[curr]);

        __m256i V_S_MAD1 = _mm256_madd_epi16(V_S_M1, filter2);
        __m256i V_S_MAD2 = _mm256_madd_epi16(V_S_M2, filter2);

        __m256i V_S_S1 =
            _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD1, rbias), 7);
        __m256i V_S_S2 =
            _mm256_srli_epi32(_mm256_add_epi32(V_S_MAD2, rbias), 7);

        __m256i V_S_SRC = _mm256_packus_epi32(V_S_S1, V_S_S2);

        __m256i V_D_DST = _mm256_loadu_si256((const __m256i *)dst_ptr);
        dst_ptr += dst_stride;

        __m256i V_R_SUB = _mm256_sub_epi16(V_S_SRC, V_D_DST);
        __m256i V_R_MAD = _mm256_madd_epi16(V_R_SUB, V_R_SUB);

        sum1 = _mm256_add_epi16(sum1, V_R_SUB);
        sse1 = _mm256_add_epi32(sse1, V_R_MAD);
      }

      __m256i v_sum0 = _mm256_madd_epi16(sum1, one);
      __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, sse1);
      __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, sse1);
      __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
      const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
      const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
      __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
      v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
      sum_long += _mm_extract_epi32(v_d, 0);
      sse_long += _mm_extract_epi32(v_d, 1);
    }

    rshift = get_msb(output_height) + get_msb(output_width);
  }

  // 10-bit normalization (matches highbd_10_variance_avx2 below): round the
  // accumulated SSE down by 4 bits and the sum by 2 bits, then subtract the
  // squared-mean term; rshift == log2(output_width * output_height) since
  // both dimensions are powers of two.
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
  int sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);

  int32_t var = *sse - (uint32_t)(((int64_t)sum * sum) >> rshift);

  return (var > 0) ? var : 0;
}
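
// 8x8 sum/SSE kernel: two 8-pixel rows are loaded as 128-bit halves and
// fused into one 256-bit register, so each iteration consumes two rows. The
// 16-bit row sums are sign-extended to 32 bits before the final reduction.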
static void highbd_calc8x8var_avx2(const uint16_t *src, int src_stride,
                                   const uint16_t *ref, int ref_stride,
                                   uint32_t *sse, int *sum) {
  __m256i v_sum_d = _mm256_setzero_si256();
  __m256i v_sse_d = _mm256_setzero_si256();
  for (int i = 0; i < 8; i += 2) {
    const __m128i v_p_a0 = _mm_loadu_si128((const __m128i *)src);
    const __m128i v_p_a1 = _mm_loadu_si128((const __m128i *)(src + src_stride));
    const __m128i v_p_b0 = _mm_loadu_si128((const __m128i *)ref);
    const __m128i v_p_b1 = _mm_loadu_si128((const __m128i *)(ref + ref_stride));
    __m256i v_p_a = _mm256_castsi128_si256(v_p_a0);
    __m256i v_p_b = _mm256_castsi128_si256(v_p_b0);
    v_p_a = _mm256_inserti128_si256(v_p_a, v_p_a1, 1);
    v_p_b = _mm256_inserti128_si256(v_p_b, v_p_b1, 1);
    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
    src += src_stride * 2;
    ref += ref_stride * 2;
  }
  __m256i v_sum00 = _mm256_cvtepi16_epi32(_mm256_castsi256_si128(v_sum_d));
  __m256i v_sum01 = _mm256_cvtepi16_epi32(_mm256_extracti128_si256(v_sum_d, 1));
  __m256i v_sum0 = _mm256_add_epi32(v_sum00, v_sum01);
  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
  *sum = _mm_extract_epi32(v_d, 0);
  *sse = _mm_extract_epi32(v_d, 1);
}

static void highbd_calc16x16var_avx2(const uint16_t *src, int src_stride,
                                     const uint16_t *ref, int ref_stride,
                                     uint32_t *sse, int *sum) {
  __m256i v_sum_d = _mm256_setzero_si256();
  __m256i v_sse_d = _mm256_setzero_si256();
  const __m256i one = _mm256_set1_epi16(1);
  for (int i = 0; i < 16; ++i) {
    const __m256i v_p_a = _mm256_loadu_si256((const __m256i *)src);
    const __m256i v_p_b = _mm256_loadu_si256((const __m256i *)ref);
    const __m256i v_diff = _mm256_sub_epi16(v_p_a, v_p_b);
    const __m256i v_sqrdiff = _mm256_madd_epi16(v_diff, v_diff);
    v_sum_d = _mm256_add_epi16(v_sum_d, v_diff);
    v_sse_d = _mm256_add_epi32(v_sse_d, v_sqrdiff);
    src += src_stride;
    ref += ref_stride;
  }
  __m256i v_sum0 = _mm256_madd_epi16(v_sum_d, one);
  __m256i v_d_l = _mm256_unpacklo_epi32(v_sum0, v_sse_d);
  __m256i v_d_h = _mm256_unpackhi_epi32(v_sum0, v_sse_d);
  __m256i v_d_lh = _mm256_add_epi32(v_d_l, v_d_h);
  const __m128i v_d0_d = _mm256_castsi256_si128(v_d_lh);
  const __m128i v_d1_d = _mm256_extracti128_si256(v_d_lh, 1);
  __m128i v_d = _mm_add_epi32(v_d0_d, v_d1_d);
  v_d = _mm_add_epi32(v_d, _mm_srli_si128(v_d, 8));
  *sum = _mm_extract_epi32(v_d, 0);
  *sse = _mm_extract_epi32(v_d, 1);
}
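
// Tiles the w x h area into block_size x block_size blocks, accumulates
// sum/SSE with var_fn, then normalizes for 10-bit input: the sum is rounded
// down by 2 bits and the SSE by 4 bits, as if the pixels had been 8-bit.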
static void highbd_10_variance_avx2(const uint16_t *src, int src_stride,
                                    const uint16_t *ref, int ref_stride, int w,
                                    int h, uint32_t *sse, int *sum,
                                    high_variance_fn_t var_fn, int block_size) {
  int i, j;
  uint64_t sse_long = 0;
  int32_t sum_long = 0;

  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      var_fn(src + src_stride * i + j, src_stride, ref + ref_stride * i + j,
             ref_stride, &sse0, &sum0);
      sse_long += sse0;
      sum_long += sum0;
    }
  }
  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
}
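
// Emits aom_highbd_10_variance<w>x<h>_avx2. With the normalized sum and SSE,
// variance = SSE - sum^2 / (w * h); `shift` is log2(w * h), so the
// subtracted term is the squared mean in the same units as the SSE.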
#define VAR_FN(w, h, block_size, shift)                                       \
  uint32_t aom_highbd_10_variance##w##x##h##_avx2(                            \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, uint32_t *sse) {                                        \
    int sum;                                                                  \
    int64_t var;                                                              \
    uint16_t *src = CONVERT_TO_SHORTPTR(src8);                                \
    uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);                                \
    highbd_10_variance_avx2(src, src_stride, ref, ref_stride, w, h, sse,      \
                            &sum,                                             \
                            highbd_calc##block_size##x##block_size##var_avx2, \
                            block_size);                                      \
    var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift);                  \
    return (var >= 0) ? (uint32_t)var : 0;                                    \
  }

VAR_FN(128, 128, 16, 14)
VAR_FN(128, 64, 16, 13)
VAR_FN(64, 128, 16, 13)
VAR_FN(64, 64, 16, 12)
VAR_FN(64, 32, 16, 11)
VAR_FN(32, 64, 16, 11)
VAR_FN(32, 32, 16, 10)
VAR_FN(32, 16, 16, 9)
VAR_FN(16, 32, 16, 9)
VAR_FN(16, 16, 16, 8)
VAR_FN(16, 8, 8, 7)
VAR_FN(8, 16, 8, 7)
VAR_FN(8, 8, 8, 6)

#if !CONFIG_REALTIME_ONLY
VAR_FN(16, 64, 16, 10)
VAR_FN(32, 8, 8, 8)
VAR_FN(64, 16, 16, 10)
VAR_FN(8, 32, 8, 8)
#endif  // !CONFIG_REALTIME_ONLY

#undef VAR_FN

unsigned int aom_highbd_10_mse16x16_avx2(const uint8_t *src8, int src_stride,
                                         const uint8_t *ref8, int ref_stride,
                                         unsigned int *sse) {
  int sum;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  highbd_10_variance_avx2(src, src_stride, ref, ref_stride, 16, 16, sse, &sum,
                          highbd_calc16x16var_avx2, 16);
  return *sse;
}

// Forward declarations of the SSE2 kernels used for the 8-wide fallbacks in
// HIGHBD_SUBPIX_VAR below.
#define SSE2_HEIGHT(H)                                                 \
  uint32_t aom_highbd_10_sub_pixel_variance8x##H##_sse2(               \
      const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
      const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr);

SSE2_HEIGHT(8)
SSE2_HEIGHT(16)

#undef SSE2_HEIGHT

// 8-wide blocks do not fill a 16-pixel stripe, so the 8x8 and 8x16 sizes are
// routed to the SSE2 kernels declared above; all other sizes use the AVX2
// filter path. The W/H comparisons are compile-time constants, so each
// expansion keeps only one branch.
#define HIGHBD_SUBPIX_VAR(W, H)                                              \
  uint32_t aom_highbd_10_sub_pixel_variance##W##x##H##_avx2(                 \
      const uint8_t *src, int src_stride, int xoffset, int yoffset,          \
      const uint8_t *dst, int dst_stride, uint32_t *sse) {                   \
    if (W == 8 && H == 16)                                                   \
      return aom_highbd_10_sub_pixel_variance8x16_sse2(                      \
          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
    else if (W == 8 && H == 8)                                               \
      return aom_highbd_10_sub_pixel_variance8x8_sse2(                       \
          src, src_stride, xoffset, yoffset, dst, dst_stride, sse);          \
    else                                                                     \
      return aom_highbd_var_filter_block2d_bil_avx2(                         \
          src, src_stride, 1, H, W, xoffset, yoffset, dst, dst_stride, sse); \
  }

HIGHBD_SUBPIX_VAR(128, 128)
HIGHBD_SUBPIX_VAR(128, 64)
HIGHBD_SUBPIX_VAR(64, 128)
HIGHBD_SUBPIX_VAR(64, 64)
HIGHBD_SUBPIX_VAR(64, 32)
HIGHBD_SUBPIX_VAR(32, 64)
HIGHBD_SUBPIX_VAR(32, 32)
HIGHBD_SUBPIX_VAR(32, 16)
HIGHBD_SUBPIX_VAR(16, 32)
HIGHBD_SUBPIX_VAR(16, 16)
HIGHBD_SUBPIX_VAR(16, 8)
HIGHBD_SUBPIX_VAR(8, 16)
HIGHBD_SUBPIX_VAR(8, 8)

#undef HIGHBD_SUBPIX_VAR
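
// MSE kernels for aom_mse_wxh_16bit_highbd_avx2. The absolute differences
// are zero-extended to 32-bit lanes before squaring, so each
// _mm256_madd_epi16 lane squares a single value, and the 32-bit squares are
// widened to 64 bits for accumulation (safe for the 12-bit-or-smaller pixel
// values this path handles).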
static uint64_t mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
                                          uint16_t *src, int sstride, int h) {
  uint64_t sum = 0;
  __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16;
  __m256i src0_8x16, src1_8x16, src_16x16;
  __m256i dst0_8x16, dst1_8x16, dst_16x16;
  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
  __m256i sub_result;
  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  for (int i = 0; i < h; i += 4) {
    // Gather four 4-pixel rows into one 256-bit register.
    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 0) * dstride]));
    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 1) * dstride]));
    reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 2) * dstride]));
    reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&dst[(i + 3) * dstride]));
    dst0_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
    dst1_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
    dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);

    reg0_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 0) * sstride]));
    reg1_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 1) * sstride]));
    reg2_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 2) * sstride]));
    reg3_4x16 = _mm_loadl_epi64((__m128i const *)(&src[(i + 3) * sstride]));
    src0_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg0_4x16, reg1_4x16));
    src1_8x16 =
        _mm256_castsi128_si256(_mm_unpacklo_epi64(reg2_4x16, reg3_4x16));
    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);

    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));

    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);

    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);

    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);

    square_result = _mm256_add_epi64(
        square_result,
        _mm256_add_epi64(
            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
            res3_4x64));
  }
  const __m128i sum_2x64 =
      _mm_add_epi64(_mm256_castsi256_si128(square_result),
                    _mm256_extracti128_si256(square_result, 1));
  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
  xx_storel_64(&sum, sum_1x64);
  return sum;
}

static uint64_t mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride,
                                          uint16_t *src, int sstride, int h) {
  uint64_t sum = 0;
  __m256i src0_8x16, src1_8x16, src_16x16;
  __m256i dst0_8x16, dst1_8x16, dst_16x16;
  __m256i res0_4x64, res1_4x64, res2_4x64, res3_4x64;
  __m256i sub_result;
  const __m256i zeros = _mm256_broadcastsi128_si256(_mm_setzero_si128());
  __m256i square_result = _mm256_broadcastsi128_si256(_mm_setzero_si128());

  for (int i = 0; i < h; i += 2) {
    // Gather two 8-pixel rows into one 256-bit register.
    dst0_8x16 =
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&dst[i * dstride]));
    dst1_8x16 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&dst[(i + 1) * dstride]));
    dst_16x16 = _mm256_permute2x128_si256(dst0_8x16, dst1_8x16, 0x20);

    src0_8x16 =
        _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)&src[i * sstride]));
    src1_8x16 = _mm256_castsi128_si256(
        _mm_loadu_si128((__m128i *)&src[(i + 1) * sstride]));
    src_16x16 = _mm256_permute2x128_si256(src0_8x16, src1_8x16, 0x20);

    sub_result = _mm256_abs_epi16(_mm256_sub_epi16(src_16x16, dst_16x16));

    src_16x16 = _mm256_unpacklo_epi16(sub_result, zeros);
    dst_16x16 = _mm256_unpackhi_epi16(sub_result, zeros);

    src_16x16 = _mm256_madd_epi16(src_16x16, src_16x16);
    dst_16x16 = _mm256_madd_epi16(dst_16x16, dst_16x16);

    res0_4x64 = _mm256_unpacklo_epi32(src_16x16, zeros);
    res1_4x64 = _mm256_unpackhi_epi32(src_16x16, zeros);
    res2_4x64 = _mm256_unpacklo_epi32(dst_16x16, zeros);
    res3_4x64 = _mm256_unpackhi_epi32(dst_16x16, zeros);

    square_result = _mm256_add_epi64(
        square_result,
        _mm256_add_epi64(
            _mm256_add_epi64(_mm256_add_epi64(res0_4x64, res1_4x64), res2_4x64),
            res3_4x64));
  }

  const __m128i sum_2x64 =
      _mm_add_epi64(_mm256_castsi256_si128(square_result),
                    _mm256_extracti128_si256(square_result, 1));
  const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8));
  xx_storel_64(&sum, sum_1x64);
  return sum;
}

uint64_t aom_mse_wxh_16bit_highbd_avx2(uint16_t *dst, int dstride,
                                       uint16_t *src, int sstride, int w,
                                       int h) {
  assert((w == 8 || w == 4) && (h == 8 || h == 4) &&
         "w must be 4 or 8 and h must be 4 or 8");
  switch (w) {
    case 4: return mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
    case 8: return mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h);
    default: assert(0 && "unsupported width"); return -1;
  }
}