blend_a64_mask_sse4.c (64681B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <smmintrin.h> // SSE4.1 13 14 #include <assert.h> 15 16 #include "aom/aom_integer.h" 17 #include "aom_ports/mem.h" 18 #include "aom_dsp/aom_dsp_common.h" 19 #include "aom_dsp/blend.h" 20 21 #include "aom_dsp/x86/synonyms.h" 22 #include "aom_dsp/x86/blend_sse4.h" 23 #include "aom_dsp/x86/blend_mask_sse4.h" 24 25 #include "config/aom_dsp_rtcd.h" 26 27 ////////////////////////////////////////////////////////////////////////////// 28 // No sub-sampling 29 ////////////////////////////////////////////////////////////////////////////// 30 31 static void blend_a64_mask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, 32 const uint8_t *src0, uint32_t src0_stride, 33 const uint8_t *src1, uint32_t src1_stride, 34 const uint8_t *mask, uint32_t mask_stride, 35 int w, int h) { 36 (void)w; 37 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 38 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 39 do { 40 const __m128i v_m0_b = xx_loadl_32(mask); 41 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 42 const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); 43 xx_storel_32(dst, v_res_b); 44 45 dst += dst_stride; 46 src0 += src0_stride; 47 src1 += src1_stride; 48 mask += mask_stride; 49 } while (--h); 50 } 51 52 static void blend_a64_mask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, 53 const uint8_t *src0, uint32_t src0_stride, 54 const uint8_t *src1, uint32_t src1_stride, 55 const uint8_t *mask, uint32_t mask_stride, 56 int w, int h) { 57 (void)w; 58 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 59 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 60 do { 61 const __m128i v_m0_b = xx_loadl_64(mask); 62 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 63 const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); 64 xx_storel_64(dst, v_res_b); 65 66 dst += dst_stride; 67 src0 += src0_stride; 68 src1 += src1_stride; 69 mask += mask_stride; 70 } while (--h); 71 } 72 73 static void blend_a64_mask_w16n_sse4_1( 74 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 75 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 76 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 77 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 78 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 79 80 do { 81 int c; 82 for (c = 0; c < w; c += 16) { 83 const __m128i v_m0_b = xx_loadu_128(mask + c); 84 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 85 86 const __m128i v_res_b = 87 blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); 88 89 xx_storeu_128(dst + c, v_res_b); 90 } 91 dst += dst_stride; 92 src0 += src0_stride; 93 src1 += src1_stride; 94 mask += mask_stride; 95 } while (--h); 96 } 97 98 ////////////////////////////////////////////////////////////////////////////// 99 // Horizontal sub-sampling 100 ////////////////////////////////////////////////////////////////////////////// 101 102 static void blend_a64_mask_sx_w4_sse4_1( 103 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 104 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 105 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 106 (void)w; 107 108 const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); 109 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 110 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 111 do { 112 const __m128i v_r_b = xx_loadl_64(mask); 113 const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); 114 const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); 115 const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); 116 const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); 117 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 118 119 const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); 120 xx_storel_32(dst, v_res_b); 121 122 dst += dst_stride; 123 src0 += src0_stride; 124 src1 += src1_stride; 125 mask += mask_stride; 126 } while (--h); 127 } 128 129 static void blend_a64_mask_sx_w8_sse4_1( 130 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 131 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 132 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 133 (void)w; 134 135 const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); 136 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 137 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 138 do { 139 const __m128i v_r_b = xx_loadu_128(mask); 140 const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r_b, v_shuffle_b); 141 const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r0_s_b); 142 const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r0_s_b); 143 const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); 144 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 145 146 const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); 147 148 xx_storel_64(dst, v_res_b); 149 150 dst += dst_stride; 151 src0 += src0_stride; 152 src1 += src1_stride; 153 mask += mask_stride; 154 } while (--h); 155 } 156 157 static void blend_a64_mask_sx_w16n_sse4_1( 158 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 159 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 160 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 161 const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); 162 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 163 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 164 165 do { 166 int c; 167 for (c = 0; c < w; c += 16) { 168 const __m128i v_r0_b = xx_loadu_128(mask + 2 * c); 169 const __m128i v_r1_b = xx_loadu_128(mask + 2 * c + 16); 170 const __m128i v_r0_s_b = _mm_shuffle_epi8(v_r0_b, v_shuffle_b); 171 const __m128i v_r1_s_b = _mm_shuffle_epi8(v_r1_b, v_shuffle_b); 172 const __m128i v_r_lo_b = _mm_unpacklo_epi64(v_r0_s_b, v_r1_s_b); 173 const __m128i v_r_hi_b = _mm_unpackhi_epi64(v_r0_s_b, v_r1_s_b); 174 const __m128i v_m0_b = _mm_avg_epu8(v_r_lo_b, v_r_hi_b); 175 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 176 177 const __m128i v_res_b = 178 blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); 179 180 xx_storeu_128(dst + c, v_res_b); 181 } 182 dst += dst_stride; 183 src0 += src0_stride; 184 src1 += src1_stride; 185 mask += mask_stride; 186 } while (--h); 187 } 188 189 ////////////////////////////////////////////////////////////////////////////// 190 // Vertical sub-sampling 191 ////////////////////////////////////////////////////////////////////////////// 192 193 static void blend_a64_mask_sy_w4_sse4_1( 194 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 195 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 196 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 197 (void)w; 198 199 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 200 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 201 202 do { 203 const __m128i v_ra_b = xx_loadl_32(mask); 204 const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); 205 const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); 206 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 207 208 const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); 209 210 xx_storel_32(dst, v_res_b); 211 212 dst += dst_stride; 213 src0 += src0_stride; 214 src1 += src1_stride; 215 mask += 2 * mask_stride; 216 } while (--h); 217 } 218 219 static void blend_a64_mask_sy_w8_sse4_1( 220 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 221 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 222 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 223 (void)w; 224 225 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 226 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 227 do { 228 const __m128i v_ra_b = xx_loadl_64(mask); 229 const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); 230 const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); 231 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 232 const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); 233 234 xx_storel_64(dst, v_res_b); 235 236 dst += dst_stride; 237 src0 += src0_stride; 238 src1 += src1_stride; 239 mask += 2 * mask_stride; 240 } while (--h); 241 } 242 243 static void blend_a64_mask_sy_w16n_sse4_1( 244 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 245 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 246 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 247 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 248 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 249 do { 250 int c; 251 for (c = 0; c < w; c += 16) { 252 const __m128i v_ra_b = xx_loadu_128(mask + c); 253 const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride); 254 const __m128i v_m0_b = _mm_avg_epu8(v_ra_b, v_rb_b); 255 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 256 257 const __m128i v_res_b = 258 blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); 259 260 xx_storeu_128(dst + c, v_res_b); 261 } 262 dst += dst_stride; 263 src0 += src0_stride; 264 src1 += src1_stride; 265 mask += 2 * mask_stride; 266 } while (--h); 267 } 268 269 ////////////////////////////////////////////////////////////////////////////// 270 // Horizontal and Vertical sub-sampling 271 ////////////////////////////////////////////////////////////////////////////// 272 273 static void blend_a64_mask_sx_sy_w4_sse4_1( 274 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 275 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 276 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 277 const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); 278 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 279 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 280 (void)w; 281 282 do { 283 const __m128i v_ra_b = xx_loadl_64(mask); 284 const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); 285 const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); 286 const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); 287 const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); 288 const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); 289 const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); 290 const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); 291 const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); 292 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 293 294 const __m128i v_res_b = blend_4_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); 295 296 xx_storel_32(dst, v_res_b); 297 298 dst += dst_stride; 299 src0 += src0_stride; 300 src1 += src1_stride; 301 mask += 2 * mask_stride; 302 } while (--h); 303 } 304 305 static void blend_a64_mask_sx_sy_w8_sse4_1( 306 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 307 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 308 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 309 const __m128i v_shuffle_b = xx_loadu_128(g_blend_a64_mask_shuffle); 310 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 311 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 312 (void)w; 313 314 do { 315 const __m128i v_ra_b = xx_loadu_128(mask); 316 const __m128i v_rb_b = xx_loadu_128(mask + mask_stride); 317 318 const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); 319 const __m128i v_r_s_b = _mm_shuffle_epi8(v_rvs_b, v_shuffle_b); 320 const __m128i v_r0_s_w = _mm_cvtepu8_epi16(v_r_s_b); 321 const __m128i v_r1_s_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_r_s_b, 8)); 322 const __m128i v_rs_w = _mm_add_epi16(v_r0_s_w, v_r1_s_w); 323 const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); 324 const __m128i v_m0_b = _mm_packus_epi16(v_m0_w, v_m0_w); 325 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 326 327 const __m128i v_res_b = blend_8_u8(src0, src1, &v_m0_b, &v_m1_b, &_r); 328 329 xx_storel_64(dst, v_res_b); 330 331 dst += dst_stride; 332 src0 += src0_stride; 333 src1 += src1_stride; 334 mask += 2 * mask_stride; 335 } while (--h); 336 } 337 338 static void blend_a64_mask_sx_sy_w16n_sse4_1( 339 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 340 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 341 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 342 const __m128i v_zmask_b = 343 _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); 344 const __m128i v_maxval_b = _mm_set1_epi8(AOM_BLEND_A64_MAX_ALPHA); 345 const __m128i _r = _mm_set1_epi16(1 << (15 - AOM_BLEND_A64_ROUND_BITS)); 346 do { 347 int c; 348 for (c = 0; c < w; c += 16) { 349 const __m128i v_ral_b = xx_loadu_128(mask + 2 * c); 350 const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16); 351 const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c); 352 const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16); 353 const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b); 354 const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b); 355 const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b); 356 const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b); 357 const __m128i v_rvsbl_w = 358 _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1), v_zmask_b); 359 const __m128i v_rvsbh_w = 360 _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1), v_zmask_b); 361 const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w); 362 const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w); 363 364 const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2); 365 const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2); 366 const __m128i v_m0_b = _mm_packus_epi16(v_m0l_w, v_m0h_w); 367 const __m128i v_m1_b = _mm_sub_epi8(v_maxval_b, v_m0_b); 368 369 const __m128i v_res_b = 370 blend_16_u8(src0 + c, src1 + c, &v_m0_b, &v_m1_b, &_r); 371 372 xx_storeu_128(dst + c, v_res_b); 373 } 374 dst += dst_stride; 375 src0 += src0_stride; 376 src1 += src1_stride; 377 mask += 2 * mask_stride; 378 } while (--h); 379 } 380 381 ////////////////////////////////////////////////////////////////////////////// 382 // Dispatch 383 ////////////////////////////////////////////////////////////////////////////// 384 385 void aom_blend_a64_mask_sse4_1(uint8_t *dst, uint32_t dst_stride, 386 const uint8_t *src0, uint32_t src0_stride, 387 const uint8_t *src1, uint32_t src1_stride, 388 const uint8_t *mask, uint32_t mask_stride, int w, 389 int h, int subw, int subh) { 390 typedef void (*blend_fn)( 391 uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, 392 uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, 393 const uint8_t *mask, uint32_t mask_stride, int w, int h); 394 395 // Dimensions are: width_index X subx X suby 396 static const blend_fn blend[3][2][2] = { 397 { // w % 16 == 0 398 { blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1 }, 399 { blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1 } }, 400 { // w == 4 401 { blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1 }, 402 { blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1 } }, 403 { // w == 8 404 { blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1 }, 405 { blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1 } } 406 }; 407 408 assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); 409 assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); 410 411 assert(h >= 1); 412 assert(w >= 1); 413 assert(IS_POWER_OF_TWO(h)); 414 assert(IS_POWER_OF_TWO(w)); 415 416 if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) 417 aom_blend_a64_mask_c(dst, dst_stride, src0, src0_stride, src1, src1_stride, 418 mask, mask_stride, w, h, subw, subh); 419 } else { 420 blend[(w >> 2) & 3][subw != 0][subh != 0](dst, dst_stride, src0, 421 src0_stride, src1, src1_stride, 422 mask, mask_stride, w, h); 423 } 424 } 425 426 #if CONFIG_AV1_HIGHBITDEPTH 427 ////////////////////////////////////////////////////////////////////////////// 428 // No sub-sampling 429 ////////////////////////////////////////////////////////////////////////////// 430 431 static inline void blend_a64_mask_bn_w4_sse4_1( 432 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 433 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 434 const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { 435 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 436 437 do { 438 const __m128i v_m0_b = xx_loadl_32(mask); 439 const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); 440 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 441 442 const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); 443 444 xx_storel_64(dst, v_res_w); 445 446 dst += dst_stride; 447 src0 += src0_stride; 448 src1 += src1_stride; 449 mask += mask_stride; 450 } while (--h); 451 } 452 453 static void blend_a64_mask_b10_w4_sse4_1( 454 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 455 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 456 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 457 (void)w; 458 blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, 459 src1_stride, mask, mask_stride, h, blend_4_b10); 460 } 461 462 static void blend_a64_mask_b12_w4_sse4_1( 463 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 464 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 465 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 466 (void)w; 467 blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, 468 src1_stride, mask, mask_stride, h, blend_4_b12); 469 } 470 471 static inline void blend_a64_mask_bn_w8n_sse4_1( 472 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 473 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 474 const uint8_t *mask, uint32_t mask_stride, int w, int h, 475 blend_unit_fn blend) { 476 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 477 478 do { 479 int c; 480 for (c = 0; c < w; c += 8) { 481 const __m128i v_m0_b = xx_loadl_64(mask + c); 482 const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b); 483 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 484 485 const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); 486 487 xx_storeu_128(dst + c, v_res_w); 488 } 489 dst += dst_stride; 490 src0 += src0_stride; 491 src1 += src1_stride; 492 mask += mask_stride; 493 } while (--h); 494 } 495 496 static void blend_a64_mask_b10_w8n_sse4_1( 497 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 498 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 499 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 500 blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, 501 src1_stride, mask, mask_stride, w, h, 502 blend_8_b10); 503 } 504 505 static void blend_a64_mask_b12_w8n_sse4_1( 506 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 507 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 508 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 509 blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, 510 src1_stride, mask, mask_stride, w, h, 511 blend_8_b12); 512 } 513 514 ////////////////////////////////////////////////////////////////////////////// 515 // Horizontal sub-sampling 516 ////////////////////////////////////////////////////////////////////////////// 517 518 static inline void blend_a64_mask_bn_sx_w4_sse4_1( 519 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 520 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 521 const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { 522 const __m128i v_zmask_b = 523 _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); 524 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 525 526 do { 527 const __m128i v_r_b = xx_loadl_64(mask); 528 const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); 529 530 const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); 531 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 532 533 const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); 534 535 xx_storel_64(dst, v_res_w); 536 537 dst += dst_stride; 538 src0 += src0_stride; 539 src1 += src1_stride; 540 mask += mask_stride; 541 } while (--h); 542 } 543 544 static void blend_a64_mask_b10_sx_w4_sse4_1( 545 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 546 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 547 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 548 (void)w; 549 blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, 550 src1_stride, mask, mask_stride, h, 551 blend_4_b10); 552 } 553 554 static void blend_a64_mask_b12_sx_w4_sse4_1( 555 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 556 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 557 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 558 (void)w; 559 blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, 560 src1_stride, mask, mask_stride, h, 561 blend_4_b12); 562 } 563 564 static inline void blend_a64_mask_bn_sx_w8n_sse4_1( 565 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 566 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 567 const uint8_t *mask, uint32_t mask_stride, int w, int h, 568 blend_unit_fn blend) { 569 const __m128i v_zmask_b = 570 _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); 571 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 572 573 do { 574 int c; 575 for (c = 0; c < w; c += 8) { 576 const __m128i v_r_b = xx_loadu_128(mask + 2 * c); 577 const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1)); 578 579 const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b); 580 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 581 582 const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); 583 584 xx_storeu_128(dst + c, v_res_w); 585 } 586 dst += dst_stride; 587 src0 += src0_stride; 588 src1 += src1_stride; 589 mask += mask_stride; 590 } while (--h); 591 } 592 593 static void blend_a64_mask_b10_sx_w8n_sse4_1( 594 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 595 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 596 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 597 blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, 598 src1_stride, mask, mask_stride, w, h, 599 blend_8_b10); 600 } 601 602 static void blend_a64_mask_b12_sx_w8n_sse4_1( 603 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 604 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 605 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 606 blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, 607 src1_stride, mask, mask_stride, w, h, 608 blend_8_b12); 609 } 610 611 ////////////////////////////////////////////////////////////////////////////// 612 // Vertical sub-sampling 613 ////////////////////////////////////////////////////////////////////////////// 614 615 static inline void blend_a64_mask_bn_sy_w4_sse4_1( 616 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 617 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 618 const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { 619 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 620 621 do { 622 const __m128i v_ra_b = xx_loadl_32(mask); 623 const __m128i v_rb_b = xx_loadl_32(mask + mask_stride); 624 const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); 625 626 const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); 627 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 628 629 const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); 630 631 xx_storel_64(dst, v_res_w); 632 633 dst += dst_stride; 634 src0 += src0_stride; 635 src1 += src1_stride; 636 mask += 2 * mask_stride; 637 } while (--h); 638 } 639 640 static void blend_a64_mask_b10_sy_w4_sse4_1( 641 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 642 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 643 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 644 (void)w; 645 blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, 646 src1_stride, mask, mask_stride, h, 647 blend_4_b10); 648 } 649 650 static void blend_a64_mask_b12_sy_w4_sse4_1( 651 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 652 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 653 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 654 (void)w; 655 blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, 656 src1_stride, mask, mask_stride, h, 657 blend_4_b12); 658 } 659 660 static inline void blend_a64_mask_bn_sy_w8n_sse4_1( 661 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 662 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 663 const uint8_t *mask, uint32_t mask_stride, int w, int h, 664 blend_unit_fn blend) { 665 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 666 667 do { 668 int c; 669 for (c = 0; c < w; c += 8) { 670 const __m128i v_ra_b = xx_loadl_64(mask + c); 671 const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride); 672 const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b); 673 674 const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b); 675 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 676 677 const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); 678 679 xx_storeu_128(dst + c, v_res_w); 680 } 681 dst += dst_stride; 682 src0 += src0_stride; 683 src1 += src1_stride; 684 mask += 2 * mask_stride; 685 } while (--h); 686 } 687 688 static void blend_a64_mask_b10_sy_w8n_sse4_1( 689 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 690 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 691 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 692 blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, 693 src1_stride, mask, mask_stride, w, h, 694 blend_8_b10); 695 } 696 697 static void blend_a64_mask_b12_sy_w8n_sse4_1( 698 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 699 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 700 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 701 blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, 702 src1_stride, mask, mask_stride, w, h, 703 blend_8_b12); 704 } 705 706 ////////////////////////////////////////////////////////////////////////////// 707 // Horizontal and Vertical sub-sampling 708 ////////////////////////////////////////////////////////////////////////////// 709 710 static inline void blend_a64_mask_bn_sx_sy_w4_sse4_1( 711 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 712 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 713 const uint8_t *mask, uint32_t mask_stride, int h, blend_unit_fn blend) { 714 const __m128i v_zmask_b = 715 _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); 716 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 717 718 do { 719 const __m128i v_ra_b = xx_loadl_64(mask); 720 const __m128i v_rb_b = xx_loadl_64(mask + mask_stride); 721 const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); 722 const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); 723 const __m128i v_rvsb_w = 724 _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); 725 const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); 726 727 const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); 728 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 729 730 const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); 731 732 xx_storel_64(dst, v_res_w); 733 734 dst += dst_stride; 735 src0 += src0_stride; 736 src1 += src1_stride; 737 mask += 2 * mask_stride; 738 } while (--h); 739 } 740 741 static void blend_a64_mask_b10_sx_sy_w4_sse4_1( 742 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 743 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 744 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 745 (void)w; 746 blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, 747 src1_stride, mask, mask_stride, h, 748 blend_4_b10); 749 } 750 751 static void blend_a64_mask_b12_sx_sy_w4_sse4_1( 752 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 753 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 754 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 755 (void)w; 756 blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, 757 src1_stride, mask, mask_stride, h, 758 blend_4_b12); 759 } 760 761 static inline void blend_a64_mask_bn_sx_sy_w8n_sse4_1( 762 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 763 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 764 const uint8_t *mask, uint32_t mask_stride, int w, int h, 765 blend_unit_fn blend) { 766 const __m128i v_zmask_b = 767 _mm_set_epi8(0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1); 768 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 769 770 do { 771 int c; 772 for (c = 0; c < w; c += 8) { 773 const __m128i v_ra_b = xx_loadu_128(mask + 2 * c); 774 const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride); 775 const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b); 776 const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b); 777 const __m128i v_rvsb_w = 778 _mm_and_si128(_mm_srli_si128(v_rvs_b, 1), v_zmask_b); 779 const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w); 780 781 const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2); 782 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 783 784 const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); 785 786 xx_storeu_128(dst + c, v_res_w); 787 } 788 dst += dst_stride; 789 src0 += src0_stride; 790 src1 += src1_stride; 791 mask += 2 * mask_stride; 792 } while (--h); 793 } 794 795 static void blend_a64_mask_b10_sx_sy_w8n_sse4_1( 796 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 797 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 798 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 799 blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, 800 src1_stride, mask, mask_stride, w, h, 801 blend_8_b10); 802 } 803 804 static void blend_a64_mask_b12_sx_sy_w8n_sse4_1( 805 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 806 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 807 const uint8_t *mask, uint32_t mask_stride, int w, int h) { 808 blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, 809 src1_stride, mask, mask_stride, w, h, 810 blend_8_b12); 811 } 812 813 ////////////////////////////////////////////////////////////////////////////// 814 // Dispatch 815 ////////////////////////////////////////////////////////////////////////////// 816 void aom_highbd_blend_a64_mask_sse4_1(uint8_t *dst_8, uint32_t dst_stride, 817 const uint8_t *src0_8, 818 uint32_t src0_stride, 819 const uint8_t *src1_8, 820 uint32_t src1_stride, const uint8_t *mask, 821 uint32_t mask_stride, int w, int h, 822 int subw, int subh, int bd) { 823 typedef void (*blend_fn)( 824 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 825 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 826 const uint8_t *mask, uint32_t mask_stride, int w, int h); 827 828 // Dimensions are: bd_index X width_index X subw X subh 829 static const blend_fn blend[2][2][2][2] = { 830 { // bd == 8 or 10 831 { // w % 8 == 0 832 { blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1 }, 833 { blend_a64_mask_b10_sx_w8n_sse4_1, 834 blend_a64_mask_b10_sx_sy_w8n_sse4_1 } }, 835 { // w == 4 836 { blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1 }, 837 { blend_a64_mask_b10_sx_w4_sse4_1, 838 blend_a64_mask_b10_sx_sy_w4_sse4_1 } } }, 839 { // bd == 12 840 { // w % 8 == 0 841 { blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1 }, 842 { blend_a64_mask_b12_sx_w8n_sse4_1, 843 blend_a64_mask_b12_sx_sy_w8n_sse4_1 } }, 844 { // w == 4 845 { blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1 }, 846 { blend_a64_mask_b12_sx_w4_sse4_1, 847 blend_a64_mask_b12_sx_sy_w4_sse4_1 } } } 848 }; 849 850 assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); 851 assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); 852 853 assert(h >= 1); 854 assert(w >= 1); 855 assert(IS_POWER_OF_TWO(h)); 856 assert(IS_POWER_OF_TWO(w)); 857 858 assert(bd == 8 || bd == 10 || bd == 12); 859 if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) 860 aom_highbd_blend_a64_mask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, 861 src1_stride, mask, mask_stride, w, h, subw, 862 subh, bd); 863 } else { 864 uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); 865 const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); 866 const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); 867 868 blend[bd == 12][(w >> 2) & 1][subw != 0][subh != 0]( 869 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 870 mask_stride, w, h); 871 } 872 } 873 #endif // CONFIG_AV1_HIGHBITDEPTH 874 875 static inline void blend_a64_d16_mask_w16_sse41( 876 uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, 877 const __m128i *m0, const __m128i *m1, const __m128i *v_round_offset, 878 const __m128i *v_maxval, int shift) { 879 const __m128i max_minus_m0 = _mm_sub_epi16(*v_maxval, *m0); 880 const __m128i max_minus_m1 = _mm_sub_epi16(*v_maxval, *m1); 881 const __m128i s0_0 = xx_loadu_128(src0); 882 const __m128i s0_1 = xx_loadu_128(src0 + 8); 883 const __m128i s1_0 = xx_loadu_128(src1); 884 const __m128i s1_1 = xx_loadu_128(src1 + 8); 885 __m128i res0_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_0, s1_0), 886 _mm_unpacklo_epi16(*m0, max_minus_m0)); 887 __m128i res0_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_0, s1_0), 888 _mm_unpackhi_epi16(*m0, max_minus_m0)); 889 __m128i res1_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0_1, s1_1), 890 _mm_unpacklo_epi16(*m1, max_minus_m1)); 891 __m128i res1_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0_1, s1_1), 892 _mm_unpackhi_epi16(*m1, max_minus_m1)); 893 res0_lo = _mm_srai_epi32(_mm_sub_epi32(res0_lo, *v_round_offset), shift); 894 res0_hi = _mm_srai_epi32(_mm_sub_epi32(res0_hi, *v_round_offset), shift); 895 res1_lo = _mm_srai_epi32(_mm_sub_epi32(res1_lo, *v_round_offset), shift); 896 res1_hi = _mm_srai_epi32(_mm_sub_epi32(res1_hi, *v_round_offset), shift); 897 const __m128i res0 = _mm_packs_epi32(res0_lo, res0_hi); 898 const __m128i res1 = _mm_packs_epi32(res1_lo, res1_hi); 899 const __m128i res = _mm_packus_epi16(res0, res1); 900 901 _mm_storeu_si128((__m128i *)(dst), res); 902 } 903 904 static inline void lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( 905 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 906 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 907 const uint8_t *mask, uint32_t mask_stride, int h, int w, 908 const __m128i *round_offset, int shift) { 909 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 910 for (int i = 0; i < h; ++i) { 911 for (int j = 0; j < w; j += 16) { 912 const __m128i m = xx_loadu_128(mask + j); 913 const __m128i m0 = _mm_cvtepu8_epi16(m); 914 const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m, 8)); 915 916 blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, 917 round_offset, &v_maxval, shift); 918 } 919 mask += mask_stride; 920 dst += dst_stride; 921 src0 += src0_stride; 922 src1 += src1_stride; 923 } 924 } 925 926 static inline void lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( 927 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 928 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 929 const uint8_t *mask, uint32_t mask_stride, int h, int w, 930 const __m128i *round_offset, int shift) { 931 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 932 const __m128i one_b = _mm_set1_epi8(1); 933 const __m128i two_w = _mm_set1_epi16(2); 934 for (int i = 0; i < h; ++i) { 935 for (int j = 0; j < w; j += 16) { 936 const __m128i m_i00 = xx_loadu_128(mask + 2 * j); 937 const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); 938 const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); 939 const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); 940 941 const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); 942 const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); 943 const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); 944 const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); 945 const __m128i m0 = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); 946 const __m128i m1 = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); 947 948 blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, 949 round_offset, &v_maxval, shift); 950 } 951 mask += mask_stride << 1; 952 dst += dst_stride; 953 src0 += src0_stride; 954 src1 += src1_stride; 955 } 956 } 957 958 static inline void lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( 959 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 960 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 961 const uint8_t *mask, uint32_t mask_stride, int h, int w, 962 const __m128i *round_offset, int shift) { 963 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 964 const __m128i one_b = _mm_set1_epi8(1); 965 const __m128i zeros = _mm_setzero_si128(); 966 for (int i = 0; i < h; ++i) { 967 for (int j = 0; j < w; j += 16) { 968 const __m128i m_i00 = xx_loadu_128(mask + 2 * j); 969 const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); 970 const __m128i m0_ac = _mm_maddubs_epi16(m_i00, one_b); 971 const __m128i m1_ac = _mm_maddubs_epi16(m_i01, one_b); 972 const __m128i m0 = _mm_avg_epu16(m0_ac, zeros); 973 const __m128i m1 = _mm_avg_epu16(m1_ac, zeros); 974 975 blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, 976 round_offset, &v_maxval, shift); 977 } 978 mask += mask_stride; 979 dst += dst_stride; 980 src0 += src0_stride; 981 src1 += src1_stride; 982 } 983 } 984 985 static inline void lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( 986 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 987 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 988 const uint8_t *mask, uint32_t mask_stride, int h, int w, 989 const __m128i *round_offset, int shift) { 990 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 991 const __m128i zeros = _mm_setzero_si128(); 992 for (int i = 0; i < h; ++i) { 993 for (int j = 0; j < w; j += 16) { 994 const __m128i m_i00 = xx_loadu_128(mask + j); 995 const __m128i m_i10 = xx_loadu_128(mask + mask_stride + j); 996 997 const __m128i m_ac = _mm_avg_epu8(_mm_adds_epu8(m_i00, m_i10), zeros); 998 const __m128i m0 = _mm_cvtepu8_epi16(m_ac); 999 const __m128i m1 = _mm_cvtepu8_epi16(_mm_srli_si128(m_ac, 8)); 1000 1001 blend_a64_d16_mask_w16_sse41(dst + j, src0 + j, src1 + j, &m0, &m1, 1002 round_offset, &v_maxval, shift); 1003 } 1004 mask += mask_stride << 1; 1005 dst += dst_stride; 1006 src0 += src0_stride; 1007 src1 += src1_stride; 1008 } 1009 } 1010 1011 void aom_lowbd_blend_a64_d16_mask_sse4_1( 1012 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 1013 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 1014 const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, 1015 ConvolveParams *conv_params) { 1016 const int bd = 8; 1017 const int round_bits = 1018 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 1019 1020 const int round_offset = 1021 ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - 1022 (1 << (round_bits - 1))) 1023 << AOM_BLEND_A64_ROUND_BITS; 1024 1025 const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; 1026 assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); 1027 assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); 1028 1029 assert(h >= 4); 1030 assert(w >= 4); 1031 assert(IS_POWER_OF_TWO(h)); 1032 assert(IS_POWER_OF_TWO(w)); 1033 1034 const __m128i v_round_offset = _mm_set1_epi32(round_offset); 1035 1036 if (subw == 0 && subh == 0) { 1037 switch (w) { 1038 case 4: 1039 aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( 1040 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1041 mask_stride, h, &v_round_offset, shift); 1042 break; 1043 case 8: 1044 aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( 1045 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1046 mask_stride, h, &v_round_offset, shift); 1047 break; 1048 default: 1049 lowbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( 1050 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1051 mask_stride, h, w, &v_round_offset, shift); 1052 break; 1053 } 1054 1055 } else if (subw == 1 && subh == 1) { 1056 switch (w) { 1057 case 4: 1058 aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( 1059 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1060 mask_stride, h, &v_round_offset, shift); 1061 break; 1062 case 8: 1063 aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( 1064 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1065 mask_stride, h, &v_round_offset, shift); 1066 break; 1067 default: 1068 lowbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( 1069 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1070 mask_stride, h, w, &v_round_offset, shift); 1071 break; 1072 } 1073 } else if (subw == 1 && subh == 0) { 1074 switch (w) { 1075 case 4: 1076 aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( 1077 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1078 mask_stride, h, &v_round_offset, shift); 1079 break; 1080 case 8: 1081 aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( 1082 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1083 mask_stride, h, &v_round_offset, shift); 1084 break; 1085 default: 1086 lowbd_blend_a64_d16_mask_subw1_subh0_w16_sse4_1( 1087 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1088 mask_stride, h, w, &v_round_offset, shift); 1089 break; 1090 } 1091 } else { 1092 switch (w) { 1093 case 4: 1094 aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( 1095 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1096 mask_stride, h, &v_round_offset, shift); 1097 break; 1098 case 8: 1099 aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( 1100 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1101 mask_stride, h, &v_round_offset, shift); 1102 break; 1103 default: 1104 lowbd_blend_a64_d16_mask_subw0_subh1_w16_sse4_1( 1105 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1106 mask_stride, h, w, &v_round_offset, shift); 1107 break; 1108 } 1109 } 1110 } 1111 1112 ////////////////////////////////////////////////////////////////////////////// 1113 // aom_highbd_blend_a64_d16_mask_sse4_1() 1114 ////////////////////////////////////////////////////////////////////////////// 1115 #if CONFIG_AV1_HIGHBITDEPTH 1116 static inline void highbd_blend_a64_d16_mask_w4_sse4_1( 1117 uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, 1118 const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, 1119 const __m128i *mask0b, const __m128i *round_offset, int shift, 1120 const __m128i *clip_low, const __m128i *clip_high, 1121 const __m128i *mask_max) { 1122 // Load 4 pixels from each of 4 rows from each source 1123 const __m128i s0a = xx_loadu_2x64(src0, src0 + src0_stride); 1124 const __m128i s0b = 1125 xx_loadu_2x64(src0 + 2 * src0_stride, src0 + 3 * src0_stride); 1126 const __m128i s1a = xx_loadu_2x64(src1, src1 + src1_stride); 1127 const __m128i s1b = 1128 xx_loadu_2x64(src1 + 2 * src1_stride, src1 + 3 * src1_stride); 1129 1130 // Generate the inverse masks 1131 const __m128i mask1a = _mm_sub_epi16(*mask_max, *mask0a); 1132 const __m128i mask1b = _mm_sub_epi16(*mask_max, *mask0b); 1133 1134 // Multiply each mask by the respective source 1135 const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); 1136 const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); 1137 const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); 1138 const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); 1139 const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); 1140 const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); 1141 const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); 1142 const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); 1143 1144 const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); 1145 const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); 1146 const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); 1147 const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); 1148 const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); 1149 const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); 1150 const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); 1151 const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); 1152 1153 const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); 1154 const __m128i sumal = _mm_add_epi32(mul0al, mul1al); 1155 const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); 1156 const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); 1157 1158 const __m128i roundah = 1159 _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); 1160 const __m128i roundbh = 1161 _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); 1162 const __m128i roundal = 1163 _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); 1164 const __m128i roundbl = 1165 _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); 1166 1167 const __m128i packa = _mm_packs_epi32(roundal, roundah); 1168 const __m128i packb = _mm_packs_epi32(roundbl, roundbh); 1169 1170 const __m128i clipa = 1171 _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); 1172 const __m128i clipb = 1173 _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); 1174 1175 xx_storel_64(dst, _mm_srli_si128(clipa, 8)); 1176 xx_storel_64(dst + dst_stride, clipa); 1177 xx_storel_64(dst + 2 * dst_stride, _mm_srli_si128(clipb, 8)); 1178 xx_storel_64(dst + 3 * dst_stride, clipb); 1179 } 1180 1181 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( 1182 uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 1183 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 1184 const uint8_t *mask, uint32_t mask_stride, int h, 1185 const __m128i *round_offset, int shift, const __m128i *clip_low, 1186 const __m128i *clip_high, const __m128i *mask_max) { 1187 do { 1188 const __m128i mask0a8 = 1189 _mm_set_epi32(0, 0, *(int32_t *)mask, *(int32_t *)(mask + mask_stride)); 1190 const __m128i mask0b8 = 1191 _mm_set_epi32(0, 0, *(int32_t *)(mask + 2 * mask_stride), 1192 *(int32_t *)(mask + 3 * mask_stride)); 1193 const __m128i mask0a = _mm_cvtepu8_epi16(mask0a8); 1194 const __m128i mask0b = _mm_cvtepu8_epi16(mask0b8); 1195 1196 highbd_blend_a64_d16_mask_w4_sse4_1( 1197 dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, 1198 round_offset, shift, clip_low, clip_high, mask_max); 1199 1200 dst += dst_stride * 4; 1201 src0 += src0_stride * 4; 1202 src1 += src1_stride * 4; 1203 mask += mask_stride * 4; 1204 } while (h -= 4); 1205 } 1206 1207 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( 1208 uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 1209 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 1210 const uint8_t *mask, uint32_t mask_stride, int h, 1211 const __m128i *round_offset, int shift, const __m128i *clip_low, 1212 const __m128i *clip_high, const __m128i *mask_max) { 1213 const __m128i one_b = _mm_set1_epi8(1); 1214 const __m128i two_w = _mm_set1_epi16(2); 1215 do { 1216 // Load 8 pixels from each of 8 rows of mask, 1217 // (saturating) add together rows then use madd to add adjacent pixels 1218 // Finally, divide each value by 4 (with rounding) 1219 const __m128i m02 = _mm_set_epi64x(*(int64_t *)(mask), 1220 *(int64_t *)(mask + 2 * mask_stride)); 1221 const __m128i m13 = _mm_set_epi64x(*(int64_t *)(mask + mask_stride), 1222 *(int64_t *)(mask + 3 * mask_stride)); 1223 const __m128i m0123 = _mm_maddubs_epi16(_mm_adds_epu8(m02, m13), one_b); 1224 const __m128i mask_0a = _mm_srli_epi16(_mm_add_epi16(m0123, two_w), 2); 1225 const __m128i m46 = _mm_set_epi64x(*(int64_t *)(mask + 4 * mask_stride), 1226 *(int64_t *)(mask + 6 * mask_stride)); 1227 const __m128i m57 = _mm_set_epi64x(*(int64_t *)(mask + 5 * mask_stride), 1228 *(int64_t *)(mask + 7 * mask_stride)); 1229 const __m128i m4567 = _mm_maddubs_epi16(_mm_adds_epu8(m46, m57), one_b); 1230 const __m128i mask_0b = _mm_srli_epi16(_mm_add_epi16(m4567, two_w), 2); 1231 1232 highbd_blend_a64_d16_mask_w4_sse4_1( 1233 dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_0a, 1234 &mask_0b, round_offset, shift, clip_low, clip_high, mask_max); 1235 1236 dst += dst_stride * 4; 1237 src0 += src0_stride * 4; 1238 src1 += src1_stride * 4; 1239 mask += mask_stride * 8; 1240 } while (h -= 4); 1241 } 1242 1243 static inline void highbd_blend_a64_d16_mask_w8_sse4_1( 1244 uint16_t *dst, int dst_stride, const CONV_BUF_TYPE *src0, int src0_stride, 1245 const CONV_BUF_TYPE *src1, int src1_stride, const __m128i *mask0a, 1246 const __m128i *mask0b, const __m128i *round_offset, int shift, 1247 const __m128i *clip_low, const __m128i *clip_high, 1248 const __m128i *max_mask) { 1249 // Load 8x pixels from each of 2 rows from each source 1250 const __m128i s0a = xx_loadu_128(src0); 1251 const __m128i s0b = xx_loadu_128(src0 + src0_stride); 1252 const __m128i s1a = xx_loadu_128(src1); 1253 const __m128i s1b = xx_loadu_128(src1 + src1_stride); 1254 1255 // Generate inverse masks 1256 const __m128i mask1a = _mm_sub_epi16(*max_mask, *mask0a); 1257 const __m128i mask1b = _mm_sub_epi16(*max_mask, *mask0b); 1258 1259 // Multiply sources by respective masks 1260 const __m128i mul0a_highs = _mm_mulhi_epu16(*mask0a, s0a); 1261 const __m128i mul0a_lows = _mm_mullo_epi16(*mask0a, s0a); 1262 const __m128i mul0ah = _mm_unpackhi_epi16(mul0a_lows, mul0a_highs); 1263 const __m128i mul0al = _mm_unpacklo_epi16(mul0a_lows, mul0a_highs); 1264 1265 const __m128i mul1a_highs = _mm_mulhi_epu16(mask1a, s1a); 1266 const __m128i mul1a_lows = _mm_mullo_epi16(mask1a, s1a); 1267 const __m128i mul1ah = _mm_unpackhi_epi16(mul1a_lows, mul1a_highs); 1268 const __m128i mul1al = _mm_unpacklo_epi16(mul1a_lows, mul1a_highs); 1269 1270 const __m128i sumah = _mm_add_epi32(mul0ah, mul1ah); 1271 const __m128i sumal = _mm_add_epi32(mul0al, mul1al); 1272 1273 const __m128i mul0b_highs = _mm_mulhi_epu16(*mask0b, s0b); 1274 const __m128i mul0b_lows = _mm_mullo_epi16(*mask0b, s0b); 1275 const __m128i mul0bh = _mm_unpackhi_epi16(mul0b_lows, mul0b_highs); 1276 const __m128i mul0bl = _mm_unpacklo_epi16(mul0b_lows, mul0b_highs); 1277 1278 const __m128i mul1b_highs = _mm_mulhi_epu16(mask1b, s1b); 1279 const __m128i mul1b_lows = _mm_mullo_epi16(mask1b, s1b); 1280 const __m128i mul1bh = _mm_unpackhi_epi16(mul1b_lows, mul1b_highs); 1281 const __m128i mul1bl = _mm_unpacklo_epi16(mul1b_lows, mul1b_highs); 1282 1283 const __m128i sumbh = _mm_add_epi32(mul0bh, mul1bh); 1284 const __m128i sumbl = _mm_add_epi32(mul0bl, mul1bl); 1285 1286 const __m128i roundah = 1287 _mm_srai_epi32(_mm_sub_epi32(sumah, *round_offset), shift); 1288 const __m128i roundal = 1289 _mm_srai_epi32(_mm_sub_epi32(sumal, *round_offset), shift); 1290 const __m128i roundbh = 1291 _mm_srai_epi32(_mm_sub_epi32(sumbh, *round_offset), shift); 1292 const __m128i roundbl = 1293 _mm_srai_epi32(_mm_sub_epi32(sumbl, *round_offset), shift); 1294 1295 const __m128i packa = _mm_packs_epi32(roundal, roundah); 1296 const __m128i clipa = 1297 _mm_min_epi16(_mm_max_epi16(packa, *clip_low), *clip_high); 1298 const __m128i packb = _mm_packs_epi32(roundbl, roundbh); 1299 const __m128i clipb = 1300 _mm_min_epi16(_mm_max_epi16(packb, *clip_low), *clip_high); 1301 1302 xx_storeu_128(dst, clipa); 1303 xx_storeu_128(dst + dst_stride, clipb); 1304 } 1305 1306 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( 1307 uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 1308 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 1309 const uint8_t *mask, uint32_t mask_stride, int h, 1310 const __m128i *round_offset, int shift, const __m128i *clip_low, 1311 const __m128i *clip_high, const __m128i *max_mask) { 1312 do { 1313 const __m128i mask0a = _mm_cvtepu8_epi16(xx_loadl_64(mask)); 1314 const __m128i mask0b = _mm_cvtepu8_epi16(xx_loadl_64(mask + mask_stride)); 1315 highbd_blend_a64_d16_mask_w8_sse4_1( 1316 dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask0a, &mask0b, 1317 round_offset, shift, clip_low, clip_high, max_mask); 1318 1319 dst += dst_stride * 2; 1320 src0 += src0_stride * 2; 1321 src1 += src1_stride * 2; 1322 mask += mask_stride * 2; 1323 } while (h -= 2); 1324 } 1325 1326 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( 1327 uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 1328 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 1329 const uint8_t *mask, uint32_t mask_stride, int h, 1330 const __m128i *round_offset, int shift, const __m128i *clip_low, 1331 const __m128i *clip_high, const __m128i *max_mask) { 1332 const __m128i one_b = _mm_set1_epi8(1); 1333 const __m128i two_w = _mm_set1_epi16(2); 1334 do { 1335 const __m128i mask_thisrowa = xx_loadu_128(mask); 1336 const __m128i mask_nextrowa = xx_loadu_128(mask + mask_stride); 1337 const __m128i mask_thisrowb = xx_loadu_128(mask + 2 * mask_stride); 1338 const __m128i mask_nextrowb = xx_loadu_128(mask + 3 * mask_stride); 1339 const __m128i mask_bothrowsa = _mm_adds_epu8(mask_thisrowa, mask_nextrowa); 1340 const __m128i mask_bothrowsb = _mm_adds_epu8(mask_thisrowb, mask_nextrowb); 1341 const __m128i mask_16a = _mm_maddubs_epi16(mask_bothrowsa, one_b); 1342 const __m128i mask_16b = _mm_maddubs_epi16(mask_bothrowsb, one_b); 1343 const __m128i mask_sa = _mm_srli_epi16(_mm_add_epi16(mask_16a, two_w), 2); 1344 const __m128i mask_sb = _mm_srli_epi16(_mm_add_epi16(mask_16b, two_w), 2); 1345 1346 highbd_blend_a64_d16_mask_w8_sse4_1( 1347 dst, dst_stride, src0, src0_stride, src1, src1_stride, &mask_sa, 1348 &mask_sb, round_offset, shift, clip_low, clip_high, max_mask); 1349 1350 dst += dst_stride * 2; 1351 src0 += src0_stride * 2; 1352 src1 += src1_stride * 2; 1353 mask += mask_stride * 4; 1354 } while (h -= 2); 1355 } 1356 1357 static inline void highbd_blend_a64_d16_mask_w16_sse4_1( 1358 uint16_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, 1359 const __m128i *round_offset, int shift, const __m128i *mask0l, 1360 const __m128i *mask0h, const __m128i *clip_low, const __m128i *clip_high, 1361 const __m128i *mask_max) { 1362 // Load 16x u16 pixels for this row from each src 1363 const __m128i s0l = xx_loadu_128(src0); 1364 const __m128i s0h = xx_loadu_128(src0 + 8); 1365 const __m128i s1l = xx_loadu_128(src1); 1366 const __m128i s1h = xx_loadu_128(src1 + 8); 1367 1368 // Calculate inverse masks 1369 const __m128i mask1h = _mm_sub_epi16(*mask_max, *mask0h); 1370 const __m128i mask1l = _mm_sub_epi16(*mask_max, *mask0l); 1371 1372 const __m128i mul0_highs = _mm_mulhi_epu16(*mask0h, s0h); 1373 const __m128i mul0_lows = _mm_mullo_epi16(*mask0h, s0h); 1374 const __m128i mul0h = _mm_unpackhi_epi16(mul0_lows, mul0_highs); 1375 const __m128i mul0l = _mm_unpacklo_epi16(mul0_lows, mul0_highs); 1376 1377 const __m128i mul1_highs = _mm_mulhi_epu16(mask1h, s1h); 1378 const __m128i mul1_lows = _mm_mullo_epi16(mask1h, s1h); 1379 const __m128i mul1h = _mm_unpackhi_epi16(mul1_lows, mul1_highs); 1380 const __m128i mul1l = _mm_unpacklo_epi16(mul1_lows, mul1_highs); 1381 1382 const __m128i mulhh = _mm_add_epi32(mul0h, mul1h); 1383 const __m128i mulhl = _mm_add_epi32(mul0l, mul1l); 1384 1385 const __m128i mul2_highs = _mm_mulhi_epu16(*mask0l, s0l); 1386 const __m128i mul2_lows = _mm_mullo_epi16(*mask0l, s0l); 1387 const __m128i mul2h = _mm_unpackhi_epi16(mul2_lows, mul2_highs); 1388 const __m128i mul2l = _mm_unpacklo_epi16(mul2_lows, mul2_highs); 1389 1390 const __m128i mul3_highs = _mm_mulhi_epu16(mask1l, s1l); 1391 const __m128i mul3_lows = _mm_mullo_epi16(mask1l, s1l); 1392 const __m128i mul3h = _mm_unpackhi_epi16(mul3_lows, mul3_highs); 1393 const __m128i mul3l = _mm_unpacklo_epi16(mul3_lows, mul3_highs); 1394 1395 const __m128i mullh = _mm_add_epi32(mul2h, mul3h); 1396 const __m128i mulll = _mm_add_epi32(mul2l, mul3l); 1397 1398 const __m128i reshh = 1399 _mm_srai_epi32(_mm_sub_epi32(mulhh, *round_offset), shift); 1400 const __m128i reshl = 1401 _mm_srai_epi32(_mm_sub_epi32(mulhl, *round_offset), shift); 1402 const __m128i reslh = 1403 _mm_srai_epi32(_mm_sub_epi32(mullh, *round_offset), shift); 1404 const __m128i resll = 1405 _mm_srai_epi32(_mm_sub_epi32(mulll, *round_offset), shift); 1406 1407 // Signed saturating pack from i32 to i16: 1408 const __m128i packh = _mm_packs_epi32(reshl, reshh); 1409 const __m128i packl = _mm_packs_epi32(resll, reslh); 1410 1411 // Clip the values to the valid range 1412 const __m128i cliph = 1413 _mm_min_epi16(_mm_max_epi16(packh, *clip_low), *clip_high); 1414 const __m128i clipl = 1415 _mm_min_epi16(_mm_max_epi16(packl, *clip_low), *clip_high); 1416 1417 // Store 16 pixels 1418 xx_storeu_128(dst, clipl); 1419 xx_storeu_128(dst + 8, cliph); 1420 } 1421 1422 static inline void highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( 1423 uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 1424 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 1425 const uint8_t *mask, uint32_t mask_stride, int h, int w, 1426 const __m128i *round_offset, int shift, const __m128i *clip_low, 1427 const __m128i *clip_high, const __m128i *mask_max) { 1428 for (int i = 0; i < h; i++) { 1429 for (int j = 0; j < w; j += 16) { 1430 // Load 16x u8 alpha-mask values and pad to u16 1431 const __m128i masks_u8 = xx_loadu_128(mask + j); 1432 const __m128i mask0l = _mm_cvtepu8_epi16(masks_u8); 1433 const __m128i mask0h = _mm_cvtepu8_epi16(_mm_srli_si128(masks_u8, 8)); 1434 1435 highbd_blend_a64_d16_mask_w16_sse4_1( 1436 dst + j, src0 + j, src1 + j, round_offset, shift, &mask0l, &mask0h, 1437 clip_low, clip_high, mask_max); 1438 } 1439 dst += dst_stride; 1440 src0 += src0_stride; 1441 src1 += src1_stride; 1442 mask += mask_stride; 1443 } 1444 } 1445 1446 static inline void highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( 1447 uint16_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 1448 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 1449 const uint8_t *mask, uint32_t mask_stride, int h, int w, 1450 const __m128i *round_offset, int shift, const __m128i *clip_low, 1451 const __m128i *clip_high, const __m128i *mask_max) { 1452 const __m128i one_b = _mm_set1_epi8(1); 1453 const __m128i two_w = _mm_set1_epi16(2); 1454 for (int i = 0; i < h; i++) { 1455 for (int j = 0; j < w; j += 16) { 1456 const __m128i m_i00 = xx_loadu_128(mask + 2 * j); 1457 const __m128i m_i01 = xx_loadu_128(mask + 2 * j + 16); 1458 const __m128i m_i10 = xx_loadu_128(mask + mask_stride + 2 * j); 1459 const __m128i m_i11 = xx_loadu_128(mask + mask_stride + 2 * j + 16); 1460 1461 const __m128i m0_ac = _mm_adds_epu8(m_i00, m_i10); 1462 const __m128i m1_ac = _mm_adds_epu8(m_i01, m_i11); 1463 const __m128i m0_acbd = _mm_maddubs_epi16(m0_ac, one_b); 1464 const __m128i m1_acbd = _mm_maddubs_epi16(m1_ac, one_b); 1465 const __m128i mask_l = _mm_srli_epi16(_mm_add_epi16(m0_acbd, two_w), 2); 1466 const __m128i mask_h = _mm_srli_epi16(_mm_add_epi16(m1_acbd, two_w), 2); 1467 1468 highbd_blend_a64_d16_mask_w16_sse4_1( 1469 dst + j, src0 + j, src1 + j, round_offset, shift, &mask_l, &mask_h, 1470 clip_low, clip_high, mask_max); 1471 } 1472 dst += dst_stride; 1473 src0 += src0_stride; 1474 src1 += src1_stride; 1475 mask += mask_stride * 2; 1476 } 1477 } 1478 1479 void aom_highbd_blend_a64_d16_mask_sse4_1( 1480 uint8_t *dst8, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 1481 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 1482 const uint8_t *mask, uint32_t mask_stride, int w, int h, int subw, int subh, 1483 ConvolveParams *conv_params, const int bd) { 1484 uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); 1485 const int round_bits = 1486 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; 1487 const int32_t round_offset = 1488 ((1 << (round_bits + bd)) + (1 << (round_bits + bd - 1)) - 1489 (1 << (round_bits - 1))) 1490 << AOM_BLEND_A64_ROUND_BITS; 1491 const __m128i v_round_offset = _mm_set1_epi32(round_offset); 1492 const int shift = round_bits + AOM_BLEND_A64_ROUND_BITS; 1493 1494 const __m128i clip_low = _mm_setzero_si128(); 1495 const __m128i clip_high = _mm_set1_epi16((1 << bd) - 1); 1496 const __m128i mask_max = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 1497 1498 assert(IMPLIES((void *)src0 == dst, src0_stride == dst_stride)); 1499 assert(IMPLIES((void *)src1 == dst, src1_stride == dst_stride)); 1500 1501 assert(h >= 4); 1502 assert(w >= 4); 1503 assert(IS_POWER_OF_TWO(h)); 1504 assert(IS_POWER_OF_TWO(w)); 1505 1506 if (subw == 0 && subh == 0) { 1507 switch (w) { 1508 case 4: 1509 highbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( 1510 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1511 mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, 1512 &mask_max); 1513 break; 1514 case 8: 1515 highbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( 1516 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1517 mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, 1518 &mask_max); 1519 break; 1520 default: // >=16 1521 highbd_blend_a64_d16_mask_subw0_subh0_w16_sse4_1( 1522 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1523 mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, 1524 &mask_max); 1525 break; 1526 } 1527 1528 } else if (subw == 1 && subh == 1) { 1529 switch (w) { 1530 case 4: 1531 highbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( 1532 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1533 mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, 1534 &mask_max); 1535 break; 1536 case 8: 1537 highbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( 1538 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1539 mask_stride, h, &v_round_offset, shift, &clip_low, &clip_high, 1540 &mask_max); 1541 break; 1542 default: // >=16 1543 highbd_blend_a64_d16_mask_subw1_subh1_w16_sse4_1( 1544 dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, 1545 mask_stride, h, w, &v_round_offset, shift, &clip_low, &clip_high, 1546 &mask_max); 1547 break; 1548 } 1549 } else { 1550 // Sub-sampling in only one axis doesn't seem to happen very much, so fall 1551 // back to the vanilla C implementation instead of having all the optimised 1552 // code for these. 1553 aom_highbd_blend_a64_d16_mask_c(dst8, dst_stride, src0, src0_stride, src1, 1554 src1_stride, mask, mask_stride, w, h, subw, 1555 subh, conv_params, bd); 1556 } 1557 } 1558 #endif // CONFIG_AV1_HIGHBITDEPTH