intrapred_sse4.c
/*
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <emmintrin.h>  // SSE2
#include <smmintrin.h>  // SSE4.1

#include "config/av1_rtcd.h"
#include "aom_dsp/x86/intrapred_x86.h"
#include "aom_dsp/x86/intrapred_utils.h"
#include "aom_dsp/x86/lpf_common_sse2.h"

// Low bit depth functions
static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = {
  { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } },
  {
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
  },
};

/* clang-format on */
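// Mask[0][n] keeps the low min(n, 16) bytes of a 16-byte vector and zeros the
// rest; Mask[1][n] plays the same role for the upper half of a 32-byte row
// that is split across two registers (entries 0..16 are all-zero masks). A
// usage sketch -- this is how the zone-1 kernels below splice computed pixels
// with the replicated last reference sample:
//   dst[r] = _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]);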
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1(
    int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0, a1, a32, a16;
  __m128i diff, c3f;
  __m128i a_mbase_x;

  a16 = _mm_set1_epi16(16);
  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
  c3f = _mm_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < W; r++) {
    __m128i b, res, res1, shift;
    __m128i a0_above, a1_above;

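    // Scalar sketch of the row computed below (illustrative only; assumes
    // upsample_above == 0 and ignores the EvenOddMaskx shuffle used when
    // upsampling):
    //   const int base = x >> frac_bits;
    //   const int shift_s = (x & 0x3f) >> 1;  // fractional position, 0..31
    //   for (int c = 0; c < base_max_diff; ++c)
    //     row[c] = (above[base + c] * 32 + 16 +
    //               (above[base + c + 1] - above[base + c]) * shift_s) >> 5;
    //   for (int c = base_max_diff; c < H; ++c) row[c] = above[max_base_x];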
    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;  // fill remaining rows with the max-base pixel
      }
      return;
    }
    if (base_max_diff > H) base_max_diff = H;
    a0_above = _mm_loadu_si128((__m128i *)(above + base));
    a1_above = _mm_loadu_si128((__m128i *)(above + base + 1));

    if (upsample_above) {
      a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]);
      a1_above = _mm_srli_si128(a0_above, 8);

      shift = _mm_srli_epi16(
          _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f),
          1);
    } else {
      shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
    }
    // lower half
    a0 = _mm_cvtepu8_epi16(a0_above);
    a1 = _mm_cvtepu8_epi16(a1_above);

    diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
    a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
    a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

    b = _mm_mullo_epi16(diff, shift);
    res = _mm_add_epi16(a32, b);
    res = _mm_srli_epi16(res, 5);

    // upper half
    a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
    a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

    diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
    a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
    a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

    b = _mm_mullo_epi16(diff, shift);
    res1 = _mm_add_epi16(a32, b);
    res1 = _mm_srli_epi16(res1, 5);

    res = _mm_packus_epi16(res, res1);

    dst[r] =
        _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]);
    x += dx;
  }
}

static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        int upsample_above, int dx) {
  __m128i dstvec[16];

  dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
  }
}

static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        int upsample_above, int dx) {
  __m128i dstvec[32];

  dr_prediction_z1_HxW_internal_sse4_1(8, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         int upsample_above, int dx) {
  __m128i dstvec[64];

  dr_prediction_z1_HxW_internal_sse4_1(16, N, dstvec, above, upsample_above,
                                       dx);
  for (int i = 0; i < N; i++) {
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1(
    int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above,
    int upsample_above, int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((32 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0, a1, a32, a16;
  __m128i a_mbase_x, diff, c3f;

  a16 = _mm_set1_epi16(16);
  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
  c3f = _mm_set1_epi16(0x3f);

  int x = dx;
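  // Each iteration of the loop below produces one 32-pixel row as two 16-byte
  // halves: dstvec[r] holds pixels 0..15 and dstvec_h[r] holds pixels 16..31,
  // masked with Mask[0] and Mask[1] respectively.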
  for (int r = 0; r < N; r++) {
    __m128i b, res, res1, res16[2];
    __m128i a0_above, a1_above;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base);
    if (base_max_diff <= 0) {
      for (int i = r; i < N; ++i) {
        dstvec[i] = a_mbase_x;  // save 32 values
        dstvec_h[i] = a_mbase_x;
      }
      return;
    }
    if (base_max_diff > 32) base_max_diff = 32;
    __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);

    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
      int mdiff = base_max_diff - j;
      if (mdiff <= 0) {
        res16[jj] = a_mbase_x;
      } else {
        a0_above = _mm_loadu_si128((__m128i *)(above + base + j));
        a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1));

        // lower half
        a0 = _mm_cvtepu8_epi16(a0_above);
        a1 = _mm_cvtepu8_epi16(a1_above);

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
        b = _mm_mullo_epi16(diff, shift);

        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);

        // upper half
        a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
        a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);

        res16[jj] = _mm_packus_epi16(res, res1);  // 16 8bit values
      }
    }

    dstvec[r] =
        _mm_blendv_epi8(a_mbase_x, res16[0],
                        *(__m128i *)Mask[0][base_max_diff]);  // 16 8bit values

    dstvec_h[r] =
        _mm_blendv_epi8(a_mbase_x, res16[1],
                        *(__m128i *)Mask[1][base_max_diff]);  // 16 8bit values
    x += dx;
  }
}

static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         int upsample_above, int dx) {
  __m128i dstvec[64], dstvec_h[64];
  dr_prediction_z1_32xN_internal_sse4_1(N, dstvec, dstvec_h, above,
                                        upsample_above, dx);
  for (int i = 0; i < N; i++) {
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
    _mm_storeu_si128((__m128i *)(dst + stride * i + 16), dstvec_h[i]);
  }
}

static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         int upsample_above, int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((64 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0, a1, a32, a16;
  __m128i a_mbase_x, diff, c3f;
  __m128i max_base, base_inc, mask;

  a16 = _mm_set1_epi16(16);
  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
  max_base = _mm_set1_epi8(max_base_x);
  c3f = _mm_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < N; r++, dst += stride) {
    __m128i b, res, res1;
    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        _mm_storeu_si128((__m128i *)dst, a_mbase_x);  // save 64 values
        _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x);
        _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x);
        _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x);
        dst += stride;
      }
      return;
    }
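    // In the loop below, lanes of the straddling 16-pixel group whose source
    // index would pass max_base_x are overwritten with above[max_base_x]:
    // base_inc holds the per-lane indices, _mm_subs_epu8 saturates to zero for
    // lanes at or past the limit, so the _mm_cmpgt_epi8 mask is false there
    // and the blend selects a_mbase_x.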
    __m128i shift =
        _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);  // 8 elements

    __m128i a0_above, a1_above, res_val;
    for (int j = 0; j < 64; j += 16) {
      int mdif = max_base_x - (base + j);
      if (mdif <= 0) {
        _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x);
      } else {
        a0_above =
            _mm_loadu_si128((__m128i *)(above + base + j));  // load 16 elements
        a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j));

        // lower half
        a0 = _mm_cvtepu8_epi16(a0_above);
        a1 = _mm_cvtepu8_epi16(a1_above);

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
        b = _mm_mullo_epi16(diff, shift);

        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);

        // upper half
        a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
        a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);

        res = _mm_packus_epi16(res, res1);  // 16 8bit values

        base_inc =
            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));

        mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
                              _mm_setzero_si128());
        res_val = _mm_blendv_epi8(a_mbase_x, res, mask);
        _mm_storeu_si128((__m128i *)(dst + j), res_val);
      }
    }
    x += dx;
  }
}

// Directional prediction, zone 1: 0 < angle < 90
void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                 const uint8_t *above, const uint8_t *left,
                                 int upsample_above, int dx, int dy) {
  (void)left;
  (void)dy;
  switch (bw) {
    case 4:
      dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 8:
      dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 16:
      dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 32:
      dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    case 64:
      dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
      break;
    default: assert(0 && "Invalid block size");
  }
  return;
}

static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, int upsample_above,
                                        int upsample_left, int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0_x, a1_x, a32, diff;

  const __m128i c3f = _mm_set1_epi16(0x3f);
  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
  const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
  const __m128i dy_reg = _mm_set1_epi16(dy);
  const __m128i a16 = _mm_set1_epi16(16);
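  // Zone-2 rows draw from both references: columns whose source index base_x
  // is still inside above[] are interpolated from the top row, the rest are
  // interpolated from left[], and the two partial results are spliced with
  // Mask[0][base_min_diff]. Scalar sketch for one pixel taken from the left
  // edge (illustrative only; assumes upsample_left == 0):
  //   const int y_frac = (r << 6) - (j + 1) * dy;  // 6-bit fractional position
  //   const int base_y = y_frac >> frac_bits_y;
  //   pred = (left[base_y] * 32 + 16 +
  //           (left[base_y + 1] - left[base_y]) * ((y_frac & 0x3f) >> 1)) >> 5;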
  for (int r = 0; r < N; r++) {
    __m128i b, res, shift, r6, ydx;
    __m128i resx, resy, resxy;
    __m128i a0_above, a1_above;
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 4) {
      base_min_diff = 4;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 3) {
      a0_x = _mm_setzero_si128();
      a1_x = _mm_setzero_si128();
      shift = _mm_setzero_si128();
    } else {
      a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      ydx = _mm_set1_epi16(y * dx);
      r6 = _mm_slli_epi16(c1234, 6);

      if (upsample_above) {
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
        a1_above = _mm_srli_si128(a0_above, 8);

        shift = _mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
            1);
      } else {
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
        a1_above = _mm_srli_si128(a0_above, 1);

        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
      }
      a0_x = _mm_cvtepu8_epi16(a0_above);
      a1_x = _mm_cvtepu8_epi16(a1_above);
    }
    // y calc
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
      __m128i y_c, base_y_c_reg, mask, c1234_;
      c1234_ = _mm_srli_si128(c1234, 2);
      r6 = _mm_set1_epi16(r << 6);
      y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg));
      base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
      mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
      base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);

      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
      base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);

      if (upsample_left) {
        shifty = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
      }
      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
      shift = _mm_unpacklo_epi64(shift, shifty);
    }

    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

    b = _mm_mullo_epi16(diff, shift);
    res = _mm_add_epi16(a32, b);
    res = _mm_srli_epi16(res, 5);

    resx = _mm_packus_epi16(res, res);
    resy = _mm_srli_si128(resx, 4);

    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
    *(int *)(dst) = _mm_cvtsi128_si32(resxy);
    dst += stride;
  }
}

static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, int upsample_above,
                                        int upsample_left, int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i diff, a32;
  __m128i a0_x, a1_x, a0_y, a1_y;
  __m128i a0_above, a1_above;

  const __m128i a16 = _mm_set1_epi16(16);
  const __m128i c3f = _mm_set1_epi16(0x3f);
  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
  const __m128i dy_reg = _mm_set1_epi16(dy);
  const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);

  for (int r = 0; r < N; r++) {
    __m128i b, res, res1, shift;
    __m128i resx, resy, resxy, r6, ydx;

    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 8) {
      base_min_diff = 8;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 7) {
      resx = _mm_setzero_si128();
    } else {
      a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      ydx = _mm_set1_epi16(y * dx);
      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
      if (upsample_above) {
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
        a1_above = _mm_srli_si128(a0_above, 8);

        shift = _mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
            1);
      } else {
        a1_above = _mm_srli_si128(a0_above, 1);
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
        a1_above =
            _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);

        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
      }
      a0_x = _mm_cvtepu8_epi16(a0_above);
      a1_x = _mm_cvtepu8_epi16(a1_above);

      diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
      a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
      a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

      b = _mm_mullo_epi16(diff, shift);
      res = _mm_add_epi16(a32, b);
      res = _mm_srli_epi16(res, 5);
      resx = _mm_packus_epi16(res, res);
    }

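    // Rows whose base_x index falls left of above[] are predicted from the
    // left edge instead: base_y = ((r << 6) - c * dy) >> frac_bits_y is
    // clamped to min_base_y, gathered from left[] one lane at a time, and the
    // interpolated result replaces the leftmost base_min_diff pixels through
    // the blend below.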
    // y calc
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
      __m128i y_c, base_y_c_reg, mask;
      r6 = _mm_set1_epi16(r << 6);
      y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg));
      base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
      mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
      base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);

      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);
      base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);

      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);

      if (upsample_left) {
        shift = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
      } else {
        shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
      }

      diff = _mm_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
      a32 = _mm_slli_epi16(a0_y, 5);     // a[x] * 32
      a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

      b = _mm_mullo_epi16(diff, shift);
      res1 = _mm_add_epi16(a32, b);
      res1 = _mm_srli_epi16(res1, 5);

      resy = _mm_packus_epi16(res1, res1);
      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
      _mm_storel_epi64((__m128i *)dst, resxy);
    } else {
      _mm_storel_epi64((__m128i *)dst, resx);
    }

    dst += stride;
  }
}

static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst,
                                        ptrdiff_t stride, const uint8_t *above,
                                        const uint8_t *left, int upsample_above,
                                        int upsample_left, int dx, int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;
  const int min_base_y = -1;
  (void)upsample_above;
  (void)upsample_left;
  const int frac_bits_x = 6;
  const int frac_bits_y = 6;

  __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32;
  __m128i diff, shifty, shifty_h;
  __m128i a0_above, a1_above;

  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
  const __m128i a16 = _mm_set1_epi16(16);
  const __m128i c1 = _mm_srli_epi16(a16, 4);
  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
  const __m128i c3f = _mm_set1_epi16(0x3f);
  const __m128i dy256 = _mm_set1_epi16(dy);
  const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
  const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i c1234 = _mm_add_epi16(c0123, c1);
  const __m128i c1234_h = _mm_add_epi16(c0123_h, c1);

  for (int r = 0; r < H; r++) {
    __m128i b, res, res1, shift, reg_j, r6, ydx;
    __m128i resx, resy;
    __m128i resxy;
    int y = r + 1;
    ydx = _mm_set1_epi16((int16_t)(y * dx));

    int base_x = (-y * dx) >> frac_bits_x;
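    // The row is processed in 16-pixel slices: a slice entirely inside
    // above[] takes the x-path only, a slice entirely left of min_base_x
    // takes the y-path only, and the straddling slice computes both and
    // blends them with Mask[0][base_min_diff].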
    for (int j = 0; j < W; j += 16) {
      reg_j = _mm_set1_epi16(j);
      int base_shift = 0;
      if ((base_x + j) < (min_base_x - 1)) {
        base_shift = (min_base_x - (base_x + j) - 1);
      }
      int base_min_diff = (min_base_x - base_x - j);
      if (base_min_diff > 16) {
        base_min_diff = 16;
      } else {
        if (base_min_diff < 0) base_min_diff = 0;
      }

      if (base_shift < 16) {
        a0_above =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
        a1_above =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
        a1_above =
            _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);

        a0_x = _mm_cvtepu8_epi16(a0_above);
        a1_x = _mm_cvtepu8_epi16(a1_above);

        r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6);
        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);

        diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);  // 16 16-bit values

        a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
        a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

        r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6);
        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);

        diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);  // 16 16-bit values

        resx = _mm_packus_epi16(res, res1);
      } else {
        resx = _mm_setzero_si128();
      }

      // y calc
      if (base_x < min_base_x) {
        __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h;
        __m128i mask, mask_h, mul16, mul16_h;
        r6 = _mm_set1_epi16(r << 6);
        c_reg = _mm_add_epi16(reg_j, c1234);
        c_reg_h = _mm_add_epi16(reg_j, c1234_h);
        mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256),
                              _mm_srli_epi16(min_y_base, 1));
        mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256),
                                _mm_srli_epi16(min_y_base, 1));
        y_reg = _mm_sub_epi16(r6, mul16);
        y_reg_h = _mm_sub_epi16(r6, mul16_h);

        base_y = _mm_srai_epi16(y_reg, frac_bits_y);
        base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y);
        mask = _mm_cmpgt_epi16(min_y_base, base_y);
        mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h);

        base_y = _mm_blendv_epi8(base_y, min_y_base, mask);
        base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h);
        int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7);
        int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0);
        int16_t offset_diff = max_y - min_y;

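        // When all 16 base_y indices fall within a 16-byte window of left[]
        // (offset_diff < 16), a single unaligned load plus _mm_shuffle_epi8
        // acts as a cheap gather; otherwise the lanes are fetched one by one
        // through the base_y_c spill buffer.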
        if (offset_diff < 16) {
          __m128i min_y_reg = _mm_set1_epi16(min_y);

          __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg);
          __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg);
          __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h);

          __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y));
          __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1));
          __m128i LoadMask =
              _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4]));

          a0_mask = _mm_and_si128(a0_mask, LoadMask);
          a1_mask = _mm_and_si128(a1_mask, LoadMask);

          a0_mask = _mm_shuffle_epi8(a0_mask, y_offset);
          a1_mask = _mm_shuffle_epi8(a1_mask, y_offset);
          a0_y = _mm_cvtepu8_epi16(a0_mask);
          a1_y = _mm_cvtepu8_epi16(a1_mask);
          a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8));
          a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8));
        } else {
          base_y = _mm_andnot_si128(mask, base_y);
          base_y_h = _mm_andnot_si128(mask_h, base_y_h);
          _mm_store_si128((__m128i *)base_y_c, base_y);
          _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);

          a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                                left[base_y_c[2]], left[base_y_c[3]],
                                left[base_y_c[4]], left[base_y_c[5]],
                                left[base_y_c[6]], left[base_y_c[7]]);
          a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
                                  left[base_y_c[10]], left[base_y_c[11]],
                                  left[base_y_c[12]], left[base_y_c[13]],
                                  left[base_y_c[14]], left[base_y_c[15]]);
          base_y = _mm_add_epi16(base_y, c1);
          base_y_h = _mm_add_epi16(base_y_h, c1);
          _mm_store_si128((__m128i *)base_y_c, base_y);
          _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);

          a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                                left[base_y_c[2]], left[base_y_c[3]],
                                left[base_y_c[4]], left[base_y_c[5]],
                                left[base_y_c[6]], left[base_y_c[7]]);
          a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
                                  left[base_y_c[10]], left[base_y_c[11]],
                                  left[base_y_c[12]], left[base_y_c[13]],
                                  left[base_y_c[14]], left[base_y_c[15]]);
        }
        shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1);
        shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1);

        diff = _mm_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_y, 5);     // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shifty);
        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);  // 16 16-bit values

        diff = _mm_sub_epi16(a1_y_h, a0_y_h);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_y_h, 5);       // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);         // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shifty_h);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);  // 16 16-bit values
        resy = _mm_packus_epi16(res, res1);
      } else {
        resy = _mm_setzero_si128();
      }
      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
      _mm_storeu_si128((__m128i *)(dst + j), resxy);
    }  // for j
    dst += stride;
  }
}

// Directional prediction, zone 2: 90 < angle < 180
void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                 const uint8_t *above, const uint8_t *left,
                                 int upsample_above, int upsample_left, int dx,
                                 int dy) {
  assert(dx > 0);
  assert(dy > 0);
  switch (bw) {
    case 4:
      dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above,
                                  upsample_left, dx, dy);
      break;
    case 8:
      dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above,
                                  upsample_left, dx, dy);
      break;
    default:
      dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left,
                                  upsample_above, upsample_left, dx, dy);
  }
  return;
}

// z3 functions
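// Zone 3 (180 < angle < 270) reads only the left edge, so each block is
// computed by running the zone-1 predictor along `left` as if it were a top
// row and then transposing the result into place. Sketch of what the 4x4
// variant below does:
//   __m128i tmp[4], out[4];
//   dr_prediction_z1_HxW_internal_sse4_1(4, 4, tmp, left, upsample_left, dy);
//   transpose4x8_8x4_low_sse2(&tmp[0], &tmp[1], &tmp[2], &tmp[3], &out[0],
//                             &out[1], &out[2], &out[3]);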
static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[4], d[4];

  dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                            &d[0], &d[1], &d[2], &d[3]);

  *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
  *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
  *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
  *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
  return;
}

static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_HxW_internal_sse4_1(8, 8, dstvec, left, upsample_left, dy);
  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
                    &d[3]);

  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
}

static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[4], d[8];

  dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
  for (int i = 0; i < 8; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[8], d[4];

  dr_prediction_z1_HxW_internal_sse4_1(4, 8, dstvec, left, upsample_left, dy);
  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
                        &d[1], &d[2], &d[3]);
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
}

static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_HxW_internal_sse4_1(16, 8, dstvec, left, upsample_left, dy);
  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
  for (int i = 0; i < 8; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
                     _mm_srli_si128(d[i], 8));
  }
}

static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(8, 16, dstvec, left, upsample_left, dy);
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[4], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy);
  transpose4x16_sse2(dstvec, d);
  for (int i = 0; i < 16; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[16], d[8];

  dr_prediction_z1_HxW_internal_sse4_1(4, 16, dstvec, left, upsample_left, dy);
  for (int i = 4; i < 8; i++) {
    d[i] = _mm_setzero_si128();
  }
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 4; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];

  dr_prediction_z1_32xN_internal_sse4_1(8, dstvec, dstvec_h, left,
                                        upsample_left, dy);
  for (int i = 8; i < 16; i++) {
    dstvec[i] = _mm_setzero_si128();
    dstvec_h[i] = _mm_setzero_si128();
  }
  transpose16x16_sse2(dstvec, d);
  transpose16x16_sse2(dstvec_h, d_h);

  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
  }
  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), d_h[i]);
  }
}

static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *left, int upsample_left,
                                         int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(8, 32, dstvec, left, upsample_left, dy);

  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);
  transpose16x8_8x16_sse2(
      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
      &d[6 + 8], &d[7 + 8]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 16, dstvec, left, upsample_left, dy);
  transpose16x16_sse2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i dstvec[32], d[32], dstvec_h[32], d_h[32];

  dr_prediction_z1_32xN_internal_sse4_1(32, dstvec, dstvec_h, left,
                                        upsample_left, dy);
  transpose16x16_sse2(dstvec, d);
  transpose16x16_sse2(dstvec_h, d_h);
  transpose16x16_sse2(dstvec + 16, d + 16);
  transpose16x16_sse2(dstvec_h + 16, d_h + 16);
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
    _mm_storeu_si128((__m128i *)(dst + j * stride + 16), d[j + 16]);
  }
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), d_h[j + 16]);
  }
}

static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t dstT[64 * 64];
  dr_prediction_z1_64xN_sse4_1(64, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 64, 64);
}

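// For the 64-pixel cases the rows are too long to transpose in registers, so
// the zone-1 prediction is first written to a stack buffer (dstT) with the
// axes swapped and then moved into dst with the generic transpose() helper,
// as in dr_prediction_z3_64x64_sse4_1 above and the 32x64/64x32/16x64
// variants below.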
static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];

  dr_prediction_z1_32xN_internal_sse4_1(16, dstvec, dstvec_h, left,
                                        upsample_left, dy);
  transpose16x16_sse2(dstvec, d);
  transpose16x16_sse2(dstvec_h, d_h);
  // store
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
  }
}

static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 32, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 32; i += 16) {
    transpose16x16_sse2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}

static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t dstT[64 * 32];
  dr_prediction_z1_64xN_sse4_1(32, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 32, 64);
}

static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t dstT[32 * 64];
  dr_prediction_z1_32xN_sse4_1(64, dstT, 32, left, upsample_left, dy);
  transpose(dstT, 32, dst, stride, 64, 32);
  return;
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  uint8_t dstT[64 * 16];
  dr_prediction_z1_64xN_sse4_1(16, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 16, 64);
}

static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *left,
                                          int upsample_left, int dy) {
  __m128i dstvec[64], d[16];

  dr_prediction_z1_HxW_internal_sse4_1(16, 64, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 64; i += 16) {
    transpose16x16_sse2(dstvec + i, d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                 const uint8_t *above, const uint8_t *left,
                                 int upsample_left, int dx, int dy) {
  (void)above;
  (void)dx;
  assert(dx == 1);
  assert(dy > 0);

  if (bw == bh) {
    switch (bw) {
      case 4:
        dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 8:
        dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 16:
        dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 32:
        dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 64:
        dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      default: assert(0 && "Invalid block size");
    }
  } else {
    if (bw < bh) {
      if (bw + bw == bh) {
        switch (bw) {
          case 4:
            dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
        }
      } else {
        switch (bw) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
          case 4:
            dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        }
      }
    } else {
      if (bh + bh == bw) {
        switch (bh) {
          case 4:
            dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
        }
      } else {
        switch (bh) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
          case 4:
            dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        }
      }
    }
  }
}
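// Usage sketch (illustrative): these kernels are normally reached through the
// av1_rtcd dispatch table rather than called directly; on SSE4.1-capable
// targets a call such as
//   av1_dr_prediction_z1(dst, stride, bw, bh, above, left, upsample_above,
//                        dx, dy);
// resolves to av1_dr_prediction_z1_sse4_1 above.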