sse_sse4.c (12186B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <smmintrin.h> 14 15 #include "config/aom_config.h" 16 #include "config/aom_dsp_rtcd.h" 17 18 #include "aom_ports/mem.h" 19 #include "aom/aom_integer.h" 20 #include "aom_dsp/x86/synonyms.h" 21 22 static inline int64_t summary_all_sse4(const __m128i *sum_all) { 23 int64_t sum; 24 const __m128i sum0 = _mm_cvtepu32_epi64(*sum_all); 25 const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum_all, 8)); 26 const __m128i sum_2x64 = _mm_add_epi64(sum0, sum1); 27 const __m128i sum_1x64 = _mm_add_epi64(sum_2x64, _mm_srli_si128(sum_2x64, 8)); 28 xx_storel_64(&sum, sum_1x64); 29 return sum; 30 } 31 32 #if CONFIG_AV1_HIGHBITDEPTH 33 static inline void summary_32_sse4(const __m128i *sum32, __m128i *sum64) { 34 const __m128i sum0 = _mm_cvtepu32_epi64(*sum32); 35 const __m128i sum1 = _mm_cvtepu32_epi64(_mm_srli_si128(*sum32, 8)); 36 *sum64 = _mm_add_epi64(sum0, *sum64); 37 *sum64 = _mm_add_epi64(sum1, *sum64); 38 } 39 #endif 40 41 static inline void sse_w16_sse4_1(__m128i *sum, const uint8_t *a, 42 const uint8_t *b) { 43 const __m128i v_a0 = xx_loadu_128(a); 44 const __m128i v_b0 = xx_loadu_128(b); 45 const __m128i v_a00_w = _mm_cvtepu8_epi16(v_a0); 46 const __m128i v_a01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_a0, 8)); 47 const __m128i v_b00_w = _mm_cvtepu8_epi16(v_b0); 48 const __m128i v_b01_w = _mm_cvtepu8_epi16(_mm_srli_si128(v_b0, 8)); 49 const __m128i v_d00_w = _mm_sub_epi16(v_a00_w, v_b00_w); 50 const __m128i v_d01_w = _mm_sub_epi16(v_a01_w, v_b01_w); 51 *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d00_w, v_d00_w)); 52 *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d01_w, v_d01_w)); 53 } 54 55 static inline void sse4x2_sse4_1(const uint8_t *a, int a_stride, 56 const uint8_t *b, int b_stride, __m128i *sum) { 57 const __m128i v_a0 = xx_loadl_32(a); 58 const __m128i v_a1 = xx_loadl_32(a + a_stride); 59 const __m128i v_b0 = xx_loadl_32(b); 60 const __m128i v_b1 = xx_loadl_32(b + b_stride); 61 const __m128i v_a_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_a0, v_a1)); 62 const __m128i v_b_w = _mm_cvtepu8_epi16(_mm_unpacklo_epi32(v_b0, v_b1)); 63 const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); 64 *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); 65 } 66 67 static inline void sse8_sse4_1(const uint8_t *a, const uint8_t *b, 68 __m128i *sum) { 69 const __m128i v_a0 = xx_loadl_64(a); 70 const __m128i v_b0 = xx_loadl_64(b); 71 const __m128i v_a_w = _mm_cvtepu8_epi16(v_a0); 72 const __m128i v_b_w = _mm_cvtepu8_epi16(v_b0); 73 const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); 74 *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); 75 } 76 77 int64_t aom_sse_sse4_1(const uint8_t *a, int a_stride, const uint8_t *b, 78 int b_stride, int width, int height) { 79 int y = 0; 80 int64_t sse = 0; 81 __m128i sum = _mm_setzero_si128(); 82 switch (width) { 83 case 4: 84 do { 85 sse4x2_sse4_1(a, a_stride, b, b_stride, &sum); 86 a += a_stride << 1; 87 b += b_stride << 1; 88 y += 2; 89 } while (y < height); 90 sse = summary_all_sse4(&sum); 91 break; 92 case 8: 93 do { 94 sse8_sse4_1(a, b, &sum); 95 a += a_stride; 96 b += b_stride; 97 y += 1; 98 } while (y < height); 99 sse = summary_all_sse4(&sum); 100 break; 101 case 16: 102 do { 103 sse_w16_sse4_1(&sum, a, b); 104 a += a_stride; 105 b += b_stride; 106 y += 1; 107 } while (y < height); 108 sse = summary_all_sse4(&sum); 109 break; 110 case 32: 111 do { 112 sse_w16_sse4_1(&sum, a, b); 113 sse_w16_sse4_1(&sum, a + 16, b + 16); 114 a += a_stride; 115 b += b_stride; 116 y += 1; 117 } while (y < height); 118 sse = summary_all_sse4(&sum); 119 break; 120 case 64: 121 do { 122 sse_w16_sse4_1(&sum, a, b); 123 sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); 124 sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); 125 sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); 126 a += a_stride; 127 b += b_stride; 128 y += 1; 129 } while (y < height); 130 sse = summary_all_sse4(&sum); 131 break; 132 case 128: 133 do { 134 sse_w16_sse4_1(&sum, a, b); 135 sse_w16_sse4_1(&sum, a + 16 * 1, b + 16 * 1); 136 sse_w16_sse4_1(&sum, a + 16 * 2, b + 16 * 2); 137 sse_w16_sse4_1(&sum, a + 16 * 3, b + 16 * 3); 138 sse_w16_sse4_1(&sum, a + 16 * 4, b + 16 * 4); 139 sse_w16_sse4_1(&sum, a + 16 * 5, b + 16 * 5); 140 sse_w16_sse4_1(&sum, a + 16 * 6, b + 16 * 6); 141 sse_w16_sse4_1(&sum, a + 16 * 7, b + 16 * 7); 142 a += a_stride; 143 b += b_stride; 144 y += 1; 145 } while (y < height); 146 sse = summary_all_sse4(&sum); 147 break; 148 default: 149 if (width & 0x07) { 150 do { 151 int i = 0; 152 do { 153 sse8_sse4_1(a + i, b + i, &sum); 154 sse8_sse4_1(a + i + a_stride, b + i + b_stride, &sum); 155 i += 8; 156 } while (i + 4 < width); 157 sse4x2_sse4_1(a + i, a_stride, b + i, b_stride, &sum); 158 a += (a_stride << 1); 159 b += (b_stride << 1); 160 y += 2; 161 } while (y < height); 162 } else { 163 do { 164 int i = 0; 165 do { 166 sse8_sse4_1(a + i, b + i, &sum); 167 i += 8; 168 } while (i < width); 169 a += a_stride; 170 b += b_stride; 171 y += 1; 172 } while (y < height); 173 } 174 sse = summary_all_sse4(&sum); 175 break; 176 } 177 178 return sse; 179 } 180 181 #if CONFIG_AV1_HIGHBITDEPTH 182 static inline void highbd_sse_w4x2_sse4_1(__m128i *sum, const uint16_t *a, 183 int a_stride, const uint16_t *b, 184 int b_stride) { 185 const __m128i v_a0 = xx_loadl_64(a); 186 const __m128i v_a1 = xx_loadl_64(a + a_stride); 187 const __m128i v_b0 = xx_loadl_64(b); 188 const __m128i v_b1 = xx_loadl_64(b + b_stride); 189 const __m128i v_a_w = _mm_unpacklo_epi64(v_a0, v_a1); 190 const __m128i v_b_w = _mm_unpacklo_epi64(v_b0, v_b1); 191 const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); 192 *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); 193 } 194 195 static inline void highbd_sse_w8_sse4_1(__m128i *sum, const uint16_t *a, 196 const uint16_t *b) { 197 const __m128i v_a_w = xx_loadu_128(a); 198 const __m128i v_b_w = xx_loadu_128(b); 199 const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w); 200 *sum = _mm_add_epi32(*sum, _mm_madd_epi16(v_d_w, v_d_w)); 201 } 202 203 int64_t aom_highbd_sse_sse4_1(const uint8_t *a8, int a_stride, 204 const uint8_t *b8, int b_stride, int width, 205 int height) { 206 int32_t y = 0; 207 int64_t sse = 0; 208 uint16_t *a = CONVERT_TO_SHORTPTR(a8); 209 uint16_t *b = CONVERT_TO_SHORTPTR(b8); 210 __m128i sum = _mm_setzero_si128(); 211 switch (width) { 212 case 4: 213 do { 214 highbd_sse_w4x2_sse4_1(&sum, a, a_stride, b, b_stride); 215 a += a_stride << 1; 216 b += b_stride << 1; 217 y += 2; 218 } while (y < height); 219 sse = summary_all_sse4(&sum); 220 break; 221 case 8: 222 do { 223 highbd_sse_w8_sse4_1(&sum, a, b); 224 a += a_stride; 225 b += b_stride; 226 y += 1; 227 } while (y < height); 228 sse = summary_all_sse4(&sum); 229 break; 230 case 16: 231 do { 232 int l = 0; 233 __m128i sum32 = _mm_setzero_si128(); 234 do { 235 highbd_sse_w8_sse4_1(&sum32, a, b); 236 highbd_sse_w8_sse4_1(&sum32, a + 8, b + 8); 237 a += a_stride; 238 b += b_stride; 239 l += 1; 240 } while (l < 64 && l < (height - y)); 241 summary_32_sse4(&sum32, &sum); 242 y += 64; 243 } while (y < height); 244 xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); 245 break; 246 case 32: 247 do { 248 int l = 0; 249 __m128i sum32 = _mm_setzero_si128(); 250 do { 251 highbd_sse_w8_sse4_1(&sum32, a, b); 252 highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); 253 highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); 254 highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); 255 a += a_stride; 256 b += b_stride; 257 l += 1; 258 } while (l < 32 && l < (height - y)); 259 summary_32_sse4(&sum32, &sum); 260 y += 32; 261 } while (y < height); 262 xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); 263 break; 264 case 64: 265 do { 266 int l = 0; 267 __m128i sum32 = _mm_setzero_si128(); 268 do { 269 highbd_sse_w8_sse4_1(&sum32, a, b); 270 highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); 271 highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); 272 highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); 273 highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); 274 highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); 275 highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); 276 highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); 277 a += a_stride; 278 b += b_stride; 279 l += 1; 280 } while (l < 16 && l < (height - y)); 281 summary_32_sse4(&sum32, &sum); 282 y += 16; 283 } while (y < height); 284 xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); 285 break; 286 case 128: 287 do { 288 int l = 0; 289 __m128i sum32 = _mm_setzero_si128(); 290 do { 291 highbd_sse_w8_sse4_1(&sum32, a, b); 292 highbd_sse_w8_sse4_1(&sum32, a + 8 * 1, b + 8 * 1); 293 highbd_sse_w8_sse4_1(&sum32, a + 8 * 2, b + 8 * 2); 294 highbd_sse_w8_sse4_1(&sum32, a + 8 * 3, b + 8 * 3); 295 highbd_sse_w8_sse4_1(&sum32, a + 8 * 4, b + 8 * 4); 296 highbd_sse_w8_sse4_1(&sum32, a + 8 * 5, b + 8 * 5); 297 highbd_sse_w8_sse4_1(&sum32, a + 8 * 6, b + 8 * 6); 298 highbd_sse_w8_sse4_1(&sum32, a + 8 * 7, b + 8 * 7); 299 highbd_sse_w8_sse4_1(&sum32, a + 8 * 8, b + 8 * 8); 300 highbd_sse_w8_sse4_1(&sum32, a + 8 * 9, b + 8 * 9); 301 highbd_sse_w8_sse4_1(&sum32, a + 8 * 10, b + 8 * 10); 302 highbd_sse_w8_sse4_1(&sum32, a + 8 * 11, b + 8 * 11); 303 highbd_sse_w8_sse4_1(&sum32, a + 8 * 12, b + 8 * 12); 304 highbd_sse_w8_sse4_1(&sum32, a + 8 * 13, b + 8 * 13); 305 highbd_sse_w8_sse4_1(&sum32, a + 8 * 14, b + 8 * 14); 306 highbd_sse_w8_sse4_1(&sum32, a + 8 * 15, b + 8 * 15); 307 a += a_stride; 308 b += b_stride; 309 l += 1; 310 } while (l < 8 && l < (height - y)); 311 summary_32_sse4(&sum32, &sum); 312 y += 8; 313 } while (y < height); 314 xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); 315 break; 316 default: 317 if (width & 0x7) { 318 do { 319 __m128i sum32 = _mm_setzero_si128(); 320 int i = 0; 321 do { 322 highbd_sse_w8_sse4_1(&sum32, a + i, b + i); 323 highbd_sse_w8_sse4_1(&sum32, a + i + a_stride, b + i + b_stride); 324 i += 8; 325 } while (i + 4 < width); 326 highbd_sse_w4x2_sse4_1(&sum32, a + i, a_stride, b + i, b_stride); 327 a += (a_stride << 1); 328 b += (b_stride << 1); 329 y += 2; 330 summary_32_sse4(&sum32, &sum); 331 } while (y < height); 332 } else { 333 do { 334 int l = 0; 335 __m128i sum32 = _mm_setzero_si128(); 336 do { 337 int i = 0; 338 do { 339 highbd_sse_w8_sse4_1(&sum32, a + i, b + i); 340 i += 8; 341 } while (i < width); 342 a += a_stride; 343 b += b_stride; 344 l += 1; 345 } while (l < 8 && l < (height - y)); 346 summary_32_sse4(&sum32, &sum); 347 y += 8; 348 } while (y < height); 349 } 350 xx_storel_64(&sse, _mm_add_epi64(sum, _mm_srli_si128(sum, 8))); 351 break; 352 } 353 return sse; 354 } 355 #endif // CONFIG_AV1_HIGHBITDEPTH