masked_sad_intrin_ssse3.c (18612B)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <stdio.h>
#include <tmmintrin.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/blend.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/synonyms.h"

#include "aom_dsp/x86/masked_sad_intrin_ssse3.h"

// For width a multiple of 16
static inline unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

#define MASKSADMXN_SSSE3(m, n)                                                \
  unsigned int aom_masked_sad##m##x##n##_ssse3(                               \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
      return masked_sad_ssse3(src, src_stride, ref, ref_stride, second_pred,  \
                              m, msk, msk_stride, m, n);                      \
    else                                                                      \
      return masked_sad_ssse3(src, src_stride, second_pred, m, ref,           \
                              ref_stride, msk, msk_stride, m, n);             \
  }

#define MASKSAD8XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
      return aom_masked_sad8xh_ssse3(src, src_stride, ref, ref_stride,        \
                                     second_pred, 8, msk, msk_stride, n);     \
    else                                                                      \
      return aom_masked_sad8xh_ssse3(src, src_stride, second_pred, 8, ref,    \
                                     ref_stride, msk, msk_stride, n);         \
  }

#define MASKSAD4XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *second_pred, const uint8_t *msk, int msk_stride,         \
      int invert_mask) {                                                      \
    if (!invert_mask)                                                         \
      return aom_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride,        \
                                     second_pred, 4, msk, msk_stride, n);     \
    else                                                                      \
      return aom_masked_sad4xh_ssse3(src, src_stride, second_pred, 4, ref,    \
                                     ref_stride, msk, msk_stride, n);         \
  }

MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)
MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)
MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)

#if !CONFIG_REALTIME_ONLY
MASKSAD4XN_SSSE3(16)
MASKSADMXN_SSSE3(16, 4)
MASKSAD8XN_SSSE3(32)
MASKSADMXN_SSSE3(32, 8)
MASKSADMXN_SSSE3(16, 64)
MASKSADMXN_SSSE3(64, 16)
#endif  // !CONFIG_REALTIME_ONLY
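
// For reference, a scalar sketch of what every kernel in this file computes
// (the helper name below is illustrative only; ROUND_POWER_OF_TWO is the
// rounding-shift macro from aom_dsp/aom_dsp_common.h): each pixel is an
// AOM_BLEND_A64 blend of the two predictors, with mask values in
// [0, AOM_BLEND_A64_MAX_ALPHA] = [0, 64], and the blended prediction is
// SAD'd against the source.
//
//   unsigned int masked_sad_c_sketch(const uint8_t *src, int src_stride,
//                                    const uint8_t *a, int a_stride,
//                                    const uint8_t *b, int b_stride,
//                                    const uint8_t *m, int m_stride,
//                                    int width, int height) {
//     unsigned int sad = 0;
//     for (int y = 0; y < height; y++) {
//       for (int x = 0; x < width; x++) {
//         const int pred = ROUND_POWER_OF_TWO(
//             m[x] * a[x] + (AOM_BLEND_A64_MAX_ALPHA - m[x]) * b[x],
//             AOM_BLEND_A64_ROUND_BITS);
//         sad += abs(pred - src[x]);
//       }
//       src += src_stride;
//       a += a_stride;
//       b += b_stride;
//       m += m_stride;
//     }
//     return sad;
//   }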

static inline unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
      const __m128i m_inv = _mm_sub_epi8(mask_max, m);

      // Calculate 16 predicted pixels.
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m128i data_l = _mm_unpacklo_epi8(a, b);
      const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
      __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
      pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi8(a, b);
      const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
      __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
      pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

      const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
      res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
  unsigned int sad = (unsigned int)(_mm_cvtsi128_si32(res) +
                                    _mm_cvtsi128_si32(_mm_srli_si128(res, 8)));
  return sad;
}

unsigned int aom_masked_sad8xh_ssse3(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *a_ptr, int a_stride,
                                     const uint8_t *b_ptr, int b_stride,
                                     const uint8_t *m_ptr, int m_stride,
                                     int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  // Process two rows per iteration: row 0 sits in the low 64 bits of 'src'
  // and 'm', row 1 in the high 64 bits.
  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

    // Row 1's pixels are in the low halves of a1/b1, but its mask occupies
    // the high half of 'm'; hence unpacklo for the data and unpackhi for
    // the mask.
    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  unsigned int sad = (unsigned int)(_mm_cvtsi128_si32(res) +
                                    _mm_cvtsi128_si32(_mm_srli_si128(res, 8)));
  return sad;
}
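
// Lane-level view of the blend used in both functions above:
// _mm_unpacklo_epi8(a, b) interleaves the predictors as a0,b0,a1,b1,... and
// _mm_unpacklo_epi8(m, m_inv) interleaves the weights as
// m0,64-m0,m1,64-m1,..., so _mm_maddubs_epi16 produces the 16-bit lanes
//     m0*a0 + (64-m0)*b0, m1*a1 + (64-m1)*b1, ...
// i.e. the un-rounded blend. The largest possible lane value is
// 64 * 255 = 16320, comfortably below the signed 16-bit limit at which
// _mm_maddubs_epi16 saturates.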

unsigned int aom_masked_sad4xh_ssse3(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *a_ptr, int a_stride,
                                     const uint8_t *b_ptr, int b_stride,
                                     const uint8_t *m_ptr, int m_stride,
                                     int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    // Load two rows at a time; this seems to be a bit faster
    // than four rows at a time in this case.
    const __m128i src =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)src_ptr),
                           _mm_cvtsi32_si128(*(int *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)a_ptr),
                           _mm_cvtsi32_si128(*(int *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)b_ptr),
                           _mm_cvtsi32_si128(*(int *)&b_ptr[b_stride]));
    const __m128i m =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(int *)m_ptr),
                           _mm_cvtsi32_si128(*(int *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data = _mm_unpacklo_epi8(a, b);
    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // At this point, the SAD is stored in lane 0 of 'res'
  return (unsigned int)_mm_cvtsi128_si32(res);
}
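
// The high bit-depth kernels below follow the same structure, with three
// changes. Pixels are 16 bits wide (recovered via CONVERT_TO_SHORTPTR), so
// the blend uses _mm_madd_epi16 on 16-bit interleaves and rounds manually
// with an explicit add-and-shift instead of xx_roundn_epu16. Masks are
// zero-extended from 8 to 16 bits before use. And since SSE has no 16-bit
// SAD instruction, |pred - src| is formed with _mm_abs_epi16 and summed
// into four 32-bit lanes via _mm_madd_epi16(diff, one).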

#if CONFIG_AV1_HIGHBITDEPTH
// For width a multiple of 8
static inline unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height);

#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,        \
      int msk_stride, int invert_mask) {                                      \
    if (!invert_mask)                                                         \
      return highbd_masked_sad_ssse3(src8, src_stride, ref8, ref_stride,      \
                                     second_pred8, m, msk, msk_stride, m, n); \
    else                                                                      \
      return highbd_masked_sad_ssse3(src8, src_stride, second_pred8, m, ref8, \
                                     ref_stride, msk, msk_stride, m, n);      \
  }

#define HIGHBD_MASKSAD4XN_SSSE3(n)                                            \
  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                            \
      const uint8_t *src8, int src_stride, const uint8_t *ref8,               \
      int ref_stride, const uint8_t *second_pred8, const uint8_t *msk,        \
      int msk_stride, int invert_mask) {                                      \
    if (!invert_mask)                                                         \
      return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, ref8,           \
                                            ref_stride, second_pred8, 4, msk, \
                                            msk_stride, n);                   \
    else                                                                      \
      return aom_highbd_masked_sad4xh_ssse3(src8, src_stride, second_pred8,   \
                                            4, ref8, ref_stride, msk,         \
                                            msk_stride, n);                   \
  }

HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)
HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)

#if !CONFIG_REALTIME_ONLY
HIGHBD_MASKSAD4XN_SSSE3(16)
HIGHBD_MASKSADMXN_SSSE3(16, 4)
HIGHBD_MASKSADMXN_SSSE3(8, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 8)
HIGHBD_MASKSADMXN_SSSE3(16, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 16)
#endif  // !CONFIG_REALTIME_ONLY

static inline unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end.
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return sad;
}
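
// Reduction detail for the hadd pair above: with 32-bit lanes
// {s0, s1, s2, s3}, the first _mm_hadd_epi32(res, res) yields
// {s0+s1, s2+s3, s0+s1, s2+s3}, and the second leaves s0+s1+s2+s3 in
// lane 0, which _mm_cvtsi128_si32 extracts.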

unsigned int aom_highbd_masked_sad4xh_ssse3(const uint8_t *src8,
                                            int src_stride, const uint8_t *a8,
                                            int a_stride, const uint8_t *b8,
                                            int b_stride, const uint8_t *m_ptr,
                                            int m_stride, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)m_ptr),
                           _mm_cvtsi32_si128(*(const int *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return sad;
}
#endif  // CONFIG_AV1_HIGHBITDEPTH
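
// Usage sketch (buffers and values here are hypothetical): the entry points
// generated above are normally reached through the aom_dsp_rtcd dispatch
// table rather than called directly, but a direct call looks like
//
//   uint8_t src[16 * 16], ref[16 * 16], second_pred[16 * 16], msk[16 * 16];
//   /* ... fill src/ref/second_pred; msk entries must lie in [0, 64] ... */
//   unsigned int sad = aom_masked_sad16x16_ssse3(
//       src, /*src_stride=*/16, ref, /*ref_stride=*/16, second_pred, msk,
//       /*msk_stride=*/16, /*invert_mask=*/0);
//
// Note that 'second_pred' carries no stride argument; the wrappers assume it
// is packed with stride equal to the block width.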