intrapred_avx2.c (184008B)
1 /* 2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 14 #include "config/av1_rtcd.h" 15 #include "aom_dsp/x86/intrapred_x86.h" 16 #include "aom_dsp/x86/intrapred_utils.h" 17 #include "aom_dsp/x86/lpf_common_sse2.h" 18 19 static inline __m256i dc_sum_64(const uint8_t *ref) { 20 const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref); 21 const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32)); 22 const __m256i zero = _mm256_setzero_si256(); 23 __m256i y0 = _mm256_sad_epu8(x0, zero); 24 __m256i y1 = _mm256_sad_epu8(x1, zero); 25 y0 = _mm256_add_epi64(y0, y1); 26 __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1); 27 y0 = _mm256_add_epi64(u0, y0); 28 u0 = _mm256_unpackhi_epi64(y0, y0); 29 return _mm256_add_epi16(y0, u0); 30 } 31 32 static inline __m256i dc_sum_32(const uint8_t *ref) { 33 const __m256i x = _mm256_loadu_si256((const __m256i *)ref); 34 const __m256i zero = _mm256_setzero_si256(); 35 __m256i y = _mm256_sad_epu8(x, zero); 36 __m256i u = _mm256_permute2x128_si256(y, y, 1); 37 y = _mm256_add_epi64(u, y); 38 u = _mm256_unpackhi_epi64(y, y); 39 return _mm256_add_epi16(y, u); 40 } 41 42 static inline void row_store_32xh(const __m256i *r, int height, uint8_t *dst, 43 ptrdiff_t stride) { 44 for (int i = 0; i < height; ++i) { 45 _mm256_storeu_si256((__m256i *)dst, *r); 46 dst += stride; 47 } 48 } 49 50 static inline void row_store_32x2xh(const __m256i *r0, const __m256i *r1, 51 int height, uint8_t *dst, 52 ptrdiff_t stride) { 53 for (int i 
= 0; i < height; ++i) { 54 _mm256_storeu_si256((__m256i *)dst, *r0); 55 _mm256_storeu_si256((__m256i *)(dst + 32), *r1); 56 dst += stride; 57 } 58 } 59 60 static inline void row_store_64xh(const __m256i *r, int height, uint8_t *dst, 61 ptrdiff_t stride) { 62 for (int i = 0; i < height; ++i) { 63 _mm256_storeu_si256((__m256i *)dst, *r); 64 _mm256_storeu_si256((__m256i *)(dst + 32), *r); 65 dst += stride; 66 } 67 } 68 69 #if CONFIG_AV1_HIGHBITDEPTH 70 static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = { 71 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, 72 { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, 73 { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, 74 { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, 75 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 }, 76 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 }, 77 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 }, 78 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, 79 }; 80 81 static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = { 82 { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 }, 83 { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 }, 84 { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 }, 85 { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 } 86 }; 87 88 static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = { 89 { 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29, 90 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 }, 91 { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 92 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 }, 93 { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 94 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 }, 95 { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 96 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 }, 97 { 0, 1, 0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 98 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 }, 99 { 0, 1, 
0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 100 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 }, 101 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 102 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 }, 103 { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 104 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 } 105 }; 106 107 static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = { 108 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 109 { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 110 { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 111 { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 112 { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 113 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 114 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 115 0 }, 116 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 117 0, 0 }, 118 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 119 0, 0, 0, 0 }, 120 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 121 0, 0, 0, 0, 0, 0 }, 122 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 123 0xffff, 0, 0, 0, 0, 0, 0 }, 124 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 125 0xffff, 0xffff, 0, 0, 0, 0, 0 }, 126 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 127 0xffff, 0xffff, 0xffff, 0, 0, 0, 0 }, 128 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 129 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 }, 130 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 131 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 }, 132 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 133 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 }, 134 { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 
0xffff, 0xffff, 0xffff, 0xffff, 135 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff } 136 }; 137 138 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 139 static inline void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) { 140 __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15; 141 142 r0 = _mm_unpacklo_epi16(x[0], x[1]); 143 r1 = _mm_unpacklo_epi16(x[2], x[3]); 144 r2 = _mm_unpacklo_epi16(x[4], x[5]); 145 r3 = _mm_unpacklo_epi16(x[6], x[7]); 146 147 r4 = _mm_unpacklo_epi16(x[8], x[9]); 148 r5 = _mm_unpacklo_epi16(x[10], x[11]); 149 r6 = _mm_unpacklo_epi16(x[12], x[13]); 150 r7 = _mm_unpacklo_epi16(x[14], x[15]); 151 152 r8 = _mm_unpacklo_epi32(r0, r1); 153 r9 = _mm_unpackhi_epi32(r0, r1); 154 r10 = _mm_unpacklo_epi32(r2, r3); 155 r11 = _mm_unpackhi_epi32(r2, r3); 156 157 r12 = _mm_unpacklo_epi32(r4, r5); 158 r13 = _mm_unpackhi_epi32(r4, r5); 159 r14 = _mm_unpacklo_epi32(r6, r7); 160 r15 = _mm_unpackhi_epi32(r6, r7); 161 162 r0 = _mm_unpacklo_epi64(r8, r9); 163 r1 = _mm_unpackhi_epi64(r8, r9); 164 r2 = _mm_unpacklo_epi64(r10, r11); 165 r3 = _mm_unpackhi_epi64(r10, r11); 166 167 r4 = _mm_unpacklo_epi64(r12, r13); 168 r5 = _mm_unpackhi_epi64(r12, r13); 169 r6 = _mm_unpacklo_epi64(r14, r15); 170 r7 = _mm_unpackhi_epi64(r14, r15); 171 172 d[0] = _mm_unpacklo_epi64(r0, r2); 173 d[1] = _mm_unpacklo_epi64(r4, r6); 174 d[2] = _mm_unpacklo_epi64(r1, r3); 175 d[3] = _mm_unpacklo_epi64(r5, r7); 176 177 d[4] = _mm_unpackhi_epi64(r0, r2); 178 d[5] = _mm_unpackhi_epi64(r4, r6); 179 d[6] = _mm_unpackhi_epi64(r1, r3); 180 d[7] = _mm_unpackhi_epi64(r5, r7); 181 } 182 183 static inline void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) { 184 __m256i w0, w1, w2, w3, ww0, ww1; 185 186 w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 187 w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 188 w2 = _mm256_unpackhi_epi16(x[0], x[1]); // 40 50 41 51 42 52 43 53 189 w3 = _mm256_unpackhi_epi16(x[2], x[3]); // 60 70 
61 71 62 72 63 73 190 191 ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 192 ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 193 194 d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 195 d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 196 197 ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 198 ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 199 200 d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 201 d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 202 } 203 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 204 205 static inline void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) { 206 __m256i w0, w1, w2, w3, ww0, ww1; 207 208 w0 = _mm256_unpacklo_epi16(x[0], x[1]); // 00 10 01 11 02 12 03 13 209 w1 = _mm256_unpacklo_epi16(x[2], x[3]); // 20 30 21 31 22 32 23 33 210 w2 = _mm256_unpacklo_epi16(x[4], x[5]); // 40 50 41 51 42 52 43 53 211 w3 = _mm256_unpacklo_epi16(x[6], x[7]); // 60 70 61 71 62 72 63 73 212 213 ww0 = _mm256_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 214 ww1 = _mm256_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 215 216 d[0] = _mm256_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 217 d[1] = _mm256_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 218 219 ww0 = _mm256_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 220 ww1 = _mm256_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 221 222 d[2] = _mm256_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 223 d[3] = _mm256_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 224 225 w0 = _mm256_unpackhi_epi16(x[0], x[1]); // 04 14 05 15 06 16 07 17 226 w1 = _mm256_unpackhi_epi16(x[2], x[3]); // 24 34 25 35 26 36 27 37 227 w2 = _mm256_unpackhi_epi16(x[4], x[5]); // 44 54 45 55 46 56 47 57 228 w3 = _mm256_unpackhi_epi16(x[6], x[7]); // 64 74 65 75 66 76 67 77 229 230 ww0 = _mm256_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 231 
ww1 = _mm256_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 232 233 d[4] = _mm256_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 234 d[5] = _mm256_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 235 236 ww0 = _mm256_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 237 ww1 = _mm256_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 238 239 d[6] = _mm256_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 240 d[7] = _mm256_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 241 } 242 243 static inline void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) { 244 __m256i w0, w1, w2, w3, ww0, ww1; 245 __m256i dd[16]; 246 w0 = _mm256_unpacklo_epi16(x[0], x[1]); 247 w1 = _mm256_unpacklo_epi16(x[2], x[3]); 248 w2 = _mm256_unpacklo_epi16(x[4], x[5]); 249 w3 = _mm256_unpacklo_epi16(x[6], x[7]); 250 251 ww0 = _mm256_unpacklo_epi32(w0, w1); // 252 ww1 = _mm256_unpacklo_epi32(w2, w3); // 253 254 dd[0] = _mm256_unpacklo_epi64(ww0, ww1); 255 dd[1] = _mm256_unpackhi_epi64(ww0, ww1); 256 257 ww0 = _mm256_unpackhi_epi32(w0, w1); // 258 ww1 = _mm256_unpackhi_epi32(w2, w3); // 259 260 dd[2] = _mm256_unpacklo_epi64(ww0, ww1); 261 dd[3] = _mm256_unpackhi_epi64(ww0, ww1); 262 263 w0 = _mm256_unpackhi_epi16(x[0], x[1]); 264 w1 = _mm256_unpackhi_epi16(x[2], x[3]); 265 w2 = _mm256_unpackhi_epi16(x[4], x[5]); 266 w3 = _mm256_unpackhi_epi16(x[6], x[7]); 267 268 ww0 = _mm256_unpacklo_epi32(w0, w1); // 269 ww1 = _mm256_unpacklo_epi32(w2, w3); // 270 271 dd[4] = _mm256_unpacklo_epi64(ww0, ww1); 272 dd[5] = _mm256_unpackhi_epi64(ww0, ww1); 273 274 ww0 = _mm256_unpackhi_epi32(w0, w1); // 275 ww1 = _mm256_unpackhi_epi32(w2, w3); // 276 277 dd[6] = _mm256_unpacklo_epi64(ww0, ww1); 278 dd[7] = _mm256_unpackhi_epi64(ww0, ww1); 279 280 w0 = _mm256_unpacklo_epi16(x[8], x[9]); 281 w1 = _mm256_unpacklo_epi16(x[10], x[11]); 282 w2 = _mm256_unpacklo_epi16(x[12], x[13]); 283 w3 = _mm256_unpacklo_epi16(x[14], x[15]); 284 285 ww0 = _mm256_unpacklo_epi32(w0, w1); 286 ww1 = 
_mm256_unpacklo_epi32(w2, w3); 287 288 dd[8] = _mm256_unpacklo_epi64(ww0, ww1); 289 dd[9] = _mm256_unpackhi_epi64(ww0, ww1); 290 291 ww0 = _mm256_unpackhi_epi32(w0, w1); 292 ww1 = _mm256_unpackhi_epi32(w2, w3); 293 294 dd[10] = _mm256_unpacklo_epi64(ww0, ww1); 295 dd[11] = _mm256_unpackhi_epi64(ww0, ww1); 296 297 w0 = _mm256_unpackhi_epi16(x[8], x[9]); 298 w1 = _mm256_unpackhi_epi16(x[10], x[11]); 299 w2 = _mm256_unpackhi_epi16(x[12], x[13]); 300 w3 = _mm256_unpackhi_epi16(x[14], x[15]); 301 302 ww0 = _mm256_unpacklo_epi32(w0, w1); 303 ww1 = _mm256_unpacklo_epi32(w2, w3); 304 305 dd[12] = _mm256_unpacklo_epi64(ww0, ww1); 306 dd[13] = _mm256_unpackhi_epi64(ww0, ww1); 307 308 ww0 = _mm256_unpackhi_epi32(w0, w1); 309 ww1 = _mm256_unpackhi_epi32(w2, w3); 310 311 dd[14] = _mm256_unpacklo_epi64(ww0, ww1); 312 dd[15] = _mm256_unpackhi_epi64(ww0, ww1); 313 314 for (int i = 0; i < 8; i++) { 315 d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1); 316 d[i + 8] = _mm256_insertf128_si256(dd[i + 8], 317 _mm256_extracti128_si256(dd[i], 1), 0); 318 } 319 } 320 #endif // CONFIG_AV1_HIGHBITDEPTH 321 322 void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, 323 const uint8_t *above, const uint8_t *left) { 324 const __m256i sum_above = dc_sum_32(above); 325 __m256i sum_left = dc_sum_32(left); 326 sum_left = _mm256_add_epi16(sum_left, sum_above); 327 const __m256i thirtytwo = _mm256_set1_epi16(32); 328 sum_left = _mm256_add_epi16(sum_left, thirtytwo); 329 sum_left = _mm256_srai_epi16(sum_left, 6); 330 const __m256i zero = _mm256_setzero_si256(); 331 __m256i row = _mm256_shuffle_epi8(sum_left, zero); 332 row_store_32xh(&row, 32, dst, stride); 333 } 334 335 void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, 336 const uint8_t *above, 337 const uint8_t *left) { 338 __m256i sum = dc_sum_32(above); 339 (void)left; 340 341 const __m256i sixteen = _mm256_set1_epi16(16); 342 sum = _mm256_add_epi16(sum, sixteen); 343 sum = 
_mm256_srai_epi16(sum, 5); 344 const __m256i zero = _mm256_setzero_si256(); 345 __m256i row = _mm256_shuffle_epi8(sum, zero); 346 row_store_32xh(&row, 32, dst, stride); 347 } 348 349 void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, 350 const uint8_t *above, 351 const uint8_t *left) { 352 __m256i sum = dc_sum_32(left); 353 (void)above; 354 355 const __m256i sixteen = _mm256_set1_epi16(16); 356 sum = _mm256_add_epi16(sum, sixteen); 357 sum = _mm256_srai_epi16(sum, 5); 358 const __m256i zero = _mm256_setzero_si256(); 359 __m256i row = _mm256_shuffle_epi8(sum, zero); 360 row_store_32xh(&row, 32, dst, stride); 361 } 362 363 void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, 364 const uint8_t *above, 365 const uint8_t *left) { 366 (void)above; 367 (void)left; 368 const __m256i row = _mm256_set1_epi8((int8_t)0x80); 369 row_store_32xh(&row, 32, dst, stride); 370 } 371 372 void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, 373 const uint8_t *above, const uint8_t *left) { 374 const __m256i row = _mm256_loadu_si256((const __m256i *)above); 375 (void)left; 376 row_store_32xh(&row, 32, dst, stride); 377 } 378 379 // There are 32 rows togeter. This function does line: 380 // 0,1,2,3, and 16,17,18,19. The next call would do 381 // 4,5,6,7, and 20,21,22,23. So 4 times of calling 382 // would finish 32 rows. 
383 static inline void h_predictor_32x8line(const __m256i *row, uint8_t *dst, 384 ptrdiff_t stride) { 385 __m256i t[4]; 386 __m256i m = _mm256_setzero_si256(); 387 const __m256i inc = _mm256_set1_epi8(4); 388 int i; 389 390 for (i = 0; i < 4; i++) { 391 t[i] = _mm256_shuffle_epi8(*row, m); 392 __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0); 393 __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11); 394 _mm256_storeu_si256((__m256i *)dst, r0); 395 _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1); 396 dst += stride; 397 m = _mm256_add_epi8(m, inc); 398 } 399 } 400 401 void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, 402 const uint8_t *above, const uint8_t *left) { 403 (void)above; 404 const __m256i left_col = _mm256_loadu_si256((__m256i const *)left); 405 406 __m256i u = _mm256_unpacklo_epi8(left_col, left_col); 407 408 __m256i v = _mm256_unpacklo_epi8(u, u); 409 h_predictor_32x8line(&v, dst, stride); 410 dst += stride << 2; 411 412 v = _mm256_unpackhi_epi8(u, u); 413 h_predictor_32x8line(&v, dst, stride); 414 dst += stride << 2; 415 416 u = _mm256_unpackhi_epi8(left_col, left_col); 417 418 v = _mm256_unpacklo_epi8(u, u); 419 h_predictor_32x8line(&v, dst, stride); 420 dst += stride << 2; 421 422 v = _mm256_unpackhi_epi8(u, u); 423 h_predictor_32x8line(&v, dst, stride); 424 } 425 426 // ----------------------------------------------------------------------------- 427 // Rectangle 428 void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, 429 const uint8_t *above, const uint8_t *left) { 430 const __m128i top_sum = dc_sum_32_sse2(above); 431 __m128i left_sum = dc_sum_16_sse2(left); 432 left_sum = _mm_add_epi16(top_sum, left_sum); 433 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum); 434 sum += 24; 435 sum /= 48; 436 const __m256i row = _mm256_set1_epi8((int8_t)sum); 437 row_store_32xh(&row, 16, dst, stride); 438 } 439 440 void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, 441 const uint8_t *above, const 
uint8_t *left) { 442 const __m256i sum_above = dc_sum_32(above); 443 __m256i sum_left = dc_sum_64(left); 444 sum_left = _mm256_add_epi16(sum_left, sum_above); 445 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); 446 sum += 48; 447 sum /= 96; 448 const __m256i row = _mm256_set1_epi8((int8_t)sum); 449 row_store_32xh(&row, 64, dst, stride); 450 } 451 452 void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, 453 const uint8_t *above, const uint8_t *left) { 454 const __m256i sum_above = dc_sum_64(above); 455 __m256i sum_left = dc_sum_64(left); 456 sum_left = _mm256_add_epi16(sum_left, sum_above); 457 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); 458 sum += 64; 459 sum /= 128; 460 const __m256i row = _mm256_set1_epi8((int8_t)sum); 461 row_store_64xh(&row, 64, dst, stride); 462 } 463 464 void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, 465 const uint8_t *above, const uint8_t *left) { 466 const __m256i sum_above = dc_sum_64(above); 467 __m256i sum_left = dc_sum_32(left); 468 sum_left = _mm256_add_epi16(sum_left, sum_above); 469 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); 470 sum += 48; 471 sum /= 96; 472 const __m256i row = _mm256_set1_epi8((int8_t)sum); 473 row_store_64xh(&row, 32, dst, stride); 474 } 475 476 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 477 void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, 478 const uint8_t *above, const uint8_t *left) { 479 const __m256i sum_above = dc_sum_64(above); 480 __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left)); 481 sum_left = _mm256_add_epi16(sum_left, sum_above); 482 uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left)); 483 sum += 40; 484 sum /= 80; 485 const __m256i row = _mm256_set1_epi8((int8_t)sum); 486 row_store_64xh(&row, 16, dst, stride); 487 } 488 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 489 490 void 
aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, 491 const uint8_t *above, 492 const uint8_t *left) { 493 __m256i sum = dc_sum_32(above); 494 (void)left; 495 496 const __m256i sixteen = _mm256_set1_epi16(16); 497 sum = _mm256_add_epi16(sum, sixteen); 498 sum = _mm256_srai_epi16(sum, 5); 499 const __m256i zero = _mm256_setzero_si256(); 500 __m256i row = _mm256_shuffle_epi8(sum, zero); 501 row_store_32xh(&row, 16, dst, stride); 502 } 503 504 void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, 505 const uint8_t *above, 506 const uint8_t *left) { 507 __m256i sum = dc_sum_32(above); 508 (void)left; 509 510 const __m256i sixteen = _mm256_set1_epi16(16); 511 sum = _mm256_add_epi16(sum, sixteen); 512 sum = _mm256_srai_epi16(sum, 5); 513 const __m256i zero = _mm256_setzero_si256(); 514 __m256i row = _mm256_shuffle_epi8(sum, zero); 515 row_store_32xh(&row, 64, dst, stride); 516 } 517 518 void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, 519 const uint8_t *above, 520 const uint8_t *left) { 521 __m256i sum = dc_sum_64(above); 522 (void)left; 523 524 const __m256i thirtytwo = _mm256_set1_epi16(32); 525 sum = _mm256_add_epi16(sum, thirtytwo); 526 sum = _mm256_srai_epi16(sum, 6); 527 const __m256i zero = _mm256_setzero_si256(); 528 __m256i row = _mm256_shuffle_epi8(sum, zero); 529 row_store_64xh(&row, 64, dst, stride); 530 } 531 532 void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, 533 const uint8_t *above, 534 const uint8_t *left) { 535 __m256i sum = dc_sum_64(above); 536 (void)left; 537 538 const __m256i thirtytwo = _mm256_set1_epi16(32); 539 sum = _mm256_add_epi16(sum, thirtytwo); 540 sum = _mm256_srai_epi16(sum, 6); 541 const __m256i zero = _mm256_setzero_si256(); 542 __m256i row = _mm256_shuffle_epi8(sum, zero); 543 row_store_64xh(&row, 32, dst, stride); 544 } 545 546 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 547 void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, 548 const uint8_t 
*above, 549 const uint8_t *left) { 550 __m256i sum = dc_sum_64(above); 551 (void)left; 552 553 const __m256i thirtytwo = _mm256_set1_epi16(32); 554 sum = _mm256_add_epi16(sum, thirtytwo); 555 sum = _mm256_srai_epi16(sum, 6); 556 const __m256i zero = _mm256_setzero_si256(); 557 __m256i row = _mm256_shuffle_epi8(sum, zero); 558 row_store_64xh(&row, 16, dst, stride); 559 } 560 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 561 562 void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, 563 const uint8_t *above, 564 const uint8_t *left) { 565 __m128i sum = dc_sum_16_sse2(left); 566 (void)above; 567 568 const __m128i eight = _mm_set1_epi16(8); 569 sum = _mm_add_epi16(sum, eight); 570 sum = _mm_srai_epi16(sum, 4); 571 const __m128i zero = _mm_setzero_si128(); 572 const __m128i r = _mm_shuffle_epi8(sum, zero); 573 const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); 574 row_store_32xh(&row, 16, dst, stride); 575 } 576 577 void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, 578 const uint8_t *above, 579 const uint8_t *left) { 580 __m256i sum = dc_sum_64(left); 581 (void)above; 582 583 const __m256i thirtytwo = _mm256_set1_epi16(32); 584 sum = _mm256_add_epi16(sum, thirtytwo); 585 sum = _mm256_srai_epi16(sum, 6); 586 const __m256i zero = _mm256_setzero_si256(); 587 __m256i row = _mm256_shuffle_epi8(sum, zero); 588 row_store_32xh(&row, 64, dst, stride); 589 } 590 591 void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, 592 const uint8_t *above, 593 const uint8_t *left) { 594 __m256i sum = dc_sum_64(left); 595 (void)above; 596 597 const __m256i thirtytwo = _mm256_set1_epi16(32); 598 sum = _mm256_add_epi16(sum, thirtytwo); 599 sum = _mm256_srai_epi16(sum, 6); 600 const __m256i zero = _mm256_setzero_si256(); 601 __m256i row = _mm256_shuffle_epi8(sum, zero); 602 row_store_64xh(&row, 64, dst, stride); 603 } 604 605 void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, 606 const 
uint8_t *above, 607 const uint8_t *left) { 608 __m256i sum = dc_sum_32(left); 609 (void)above; 610 611 const __m256i sixteen = _mm256_set1_epi16(16); 612 sum = _mm256_add_epi16(sum, sixteen); 613 sum = _mm256_srai_epi16(sum, 5); 614 const __m256i zero = _mm256_setzero_si256(); 615 __m256i row = _mm256_shuffle_epi8(sum, zero); 616 row_store_64xh(&row, 32, dst, stride); 617 } 618 619 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 620 void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, 621 const uint8_t *above, 622 const uint8_t *left) { 623 __m128i sum = dc_sum_16_sse2(left); 624 (void)above; 625 626 const __m128i eight = _mm_set1_epi16(8); 627 sum = _mm_add_epi16(sum, eight); 628 sum = _mm_srai_epi16(sum, 4); 629 const __m128i zero = _mm_setzero_si128(); 630 const __m128i r = _mm_shuffle_epi8(sum, zero); 631 const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1); 632 row_store_64xh(&row, 16, dst, stride); 633 } 634 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 635 636 void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, 637 const uint8_t *above, 638 const uint8_t *left) { 639 (void)above; 640 (void)left; 641 const __m256i row = _mm256_set1_epi8((int8_t)0x80); 642 row_store_32xh(&row, 16, dst, stride); 643 } 644 645 void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, 646 const uint8_t *above, 647 const uint8_t *left) { 648 (void)above; 649 (void)left; 650 const __m256i row = _mm256_set1_epi8((int8_t)0x80); 651 row_store_32xh(&row, 64, dst, stride); 652 } 653 654 void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, 655 const uint8_t *above, 656 const uint8_t *left) { 657 (void)above; 658 (void)left; 659 const __m256i row = _mm256_set1_epi8((int8_t)0x80); 660 row_store_64xh(&row, 64, dst, stride); 661 } 662 663 void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, 664 const uint8_t *above, 665 const uint8_t *left) { 666 (void)above; 667 (void)left; 668 const 
__m256i row = _mm256_set1_epi8((int8_t)0x80); 669 row_store_64xh(&row, 32, dst, stride); 670 } 671 672 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 673 void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, 674 const uint8_t *above, 675 const uint8_t *left) { 676 (void)above; 677 (void)left; 678 const __m256i row = _mm256_set1_epi8((int8_t)0x80); 679 row_store_64xh(&row, 16, dst, stride); 680 } 681 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 682 683 void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, 684 const uint8_t *above, const uint8_t *left) { 685 const __m256i row = _mm256_loadu_si256((const __m256i *)above); 686 (void)left; 687 row_store_32xh(&row, 16, dst, stride); 688 } 689 690 void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, 691 const uint8_t *above, const uint8_t *left) { 692 const __m256i row = _mm256_loadu_si256((const __m256i *)above); 693 (void)left; 694 row_store_32xh(&row, 64, dst, stride); 695 } 696 697 void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, 698 const uint8_t *above, const uint8_t *left) { 699 const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); 700 const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); 701 (void)left; 702 row_store_32x2xh(&row0, &row1, 64, dst, stride); 703 } 704 705 void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, 706 const uint8_t *above, const uint8_t *left) { 707 const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); 708 const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); 709 (void)left; 710 row_store_32x2xh(&row0, &row1, 32, dst, stride); 711 } 712 713 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 714 void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, 715 const uint8_t *above, const uint8_t *left) { 716 const __m256i row0 = _mm256_loadu_si256((const __m256i *)above); 717 const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32)); 718 (void)left; 
719 row_store_32x2xh(&row0, &row1, 16, dst, stride); 720 } 721 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 722 723 // ----------------------------------------------------------------------------- 724 // PAETH_PRED 725 726 // Return 16 16-bit pixels in one row (__m256i) 727 static inline __m256i paeth_pred(const __m256i *left, const __m256i *top, 728 const __m256i *topleft) { 729 const __m256i base = 730 _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft); 731 732 __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left)); 733 __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top)); 734 __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft)); 735 736 __m256i mask1 = _mm256_cmpgt_epi16(pl, pt); 737 mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl)); 738 __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl); 739 740 pl = _mm256_andnot_si256(mask1, *left); 741 742 ptl = _mm256_and_si256(mask2, *topleft); 743 pt = _mm256_andnot_si256(mask2, *top); 744 pt = _mm256_or_si256(pt, ptl); 745 pt = _mm256_and_si256(mask1, pt); 746 747 return _mm256_or_si256(pt, pl); 748 } 749 750 // Return 16 8-bit pixels in one row (__m128i) 751 static inline __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top, 752 const __m256i *topleft) { 753 const __m256i p0 = paeth_pred(left, top, topleft); 754 const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); 755 const __m256i p = _mm256_packus_epi16(p0, p1); 756 return _mm256_castsi256_si128(p); 757 } 758 759 static inline __m256i get_top_vector(const uint8_t *above) { 760 const __m128i x = _mm_load_si128((const __m128i *)above); 761 const __m128i zero = _mm_setzero_si128(); 762 const __m128i t0 = _mm_unpacklo_epi8(x, zero); 763 const __m128i t1 = _mm_unpackhi_epi8(x, zero); 764 return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1); 765 } 766 767 void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride, 768 const uint8_t *above, const uint8_t *left) { 769 __m128i x = 
_mm_loadl_epi64((const __m128i *)left); 770 const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); 771 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); 772 __m256i rep = _mm256_set1_epi16((short)0x8000); 773 const __m256i one = _mm256_set1_epi16(1); 774 const __m256i top = get_top_vector(above); 775 776 int i; 777 for (i = 0; i < 8; ++i) { 778 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 779 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); 780 781 _mm_store_si128((__m128i *)dst, row); 782 dst += stride; 783 rep = _mm256_add_epi16(rep, one); 784 } 785 } 786 787 static inline __m256i get_left_vector(const uint8_t *left) { 788 const __m128i x = _mm_load_si128((const __m128i *)left); 789 return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1); 790 } 791 792 void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride, 793 const uint8_t *above, const uint8_t *left) { 794 const __m256i l = get_left_vector(left); 795 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); 796 __m256i rep = _mm256_set1_epi16((short)0x8000); 797 const __m256i one = _mm256_set1_epi16(1); 798 const __m256i top = get_top_vector(above); 799 800 int i; 801 for (i = 0; i < 16; ++i) { 802 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 803 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); 804 805 _mm_store_si128((__m128i *)dst, row); 806 dst += stride; 807 rep = _mm256_add_epi16(rep, one); 808 } 809 } 810 811 void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride, 812 const uint8_t *above, const uint8_t *left) { 813 __m256i l = get_left_vector(left); 814 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); 815 __m256i rep = _mm256_set1_epi16((short)0x8000); 816 const __m256i one = _mm256_set1_epi16(1); 817 const __m256i top = get_top_vector(above); 818 819 int i; 820 for (i = 0; i < 16; ++i) { 821 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 822 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); 823 824 
_mm_store_si128((__m128i *)dst, row); 825 dst += stride; 826 rep = _mm256_add_epi16(rep, one); 827 } 828 829 l = get_left_vector(left + 16); 830 rep = _mm256_set1_epi16((short)0x8000); 831 for (i = 0; i < 16; ++i) { 832 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 833 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); 834 835 _mm_store_si128((__m128i *)dst, row); 836 dst += stride; 837 rep = _mm256_add_epi16(rep, one); 838 } 839 } 840 841 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 842 void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride, 843 const uint8_t *above, const uint8_t *left) { 844 const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]); 845 const __m256i one = _mm256_set1_epi16(1); 846 const __m256i top = get_top_vector(above); 847 848 for (int j = 0; j < 4; ++j) { 849 const __m256i l = get_left_vector(left + j * 16); 850 __m256i rep = _mm256_set1_epi16((short)0x8000); 851 for (int i = 0; i < 16; ++i) { 852 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 853 const __m128i row = paeth_16x1_pred(&l16, &top, &tl16); 854 855 _mm_store_si128((__m128i *)dst, row); 856 dst += stride; 857 rep = _mm256_add_epi16(rep, one); 858 } 859 } 860 } 861 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 862 863 // Return 32 8-bit pixels in one row (__m256i) 864 static inline __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0, 865 const __m256i *top1, 866 const __m256i *topleft) { 867 __m256i p0 = paeth_pred(left, top0, topleft); 868 __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe); 869 const __m256i x0 = _mm256_packus_epi16(p0, p1); 870 871 p0 = paeth_pred(left, top1, topleft); 872 p1 = _mm256_permute4x64_epi64(p0, 0xe); 873 const __m256i x1 = _mm256_packus_epi16(p0, p1); 874 875 return _mm256_permute2x128_si256(x0, x1, 0x20); 876 } 877 878 void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, 879 const uint8_t *above, const uint8_t *left) { 880 const __m256i l = get_left_vector(left); 881 const __m256i t0 = 
get_top_vector(above); 882 const __m256i t1 = get_top_vector(above + 16); 883 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); 884 __m256i rep = _mm256_set1_epi16((short)0x8000); 885 const __m256i one = _mm256_set1_epi16(1); 886 887 int i; 888 for (i = 0; i < 16; ++i) { 889 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 890 891 const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl); 892 893 _mm256_storeu_si256((__m256i *)dst, r); 894 895 dst += stride; 896 rep = _mm256_add_epi16(rep, one); 897 } 898 } 899 900 void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride, 901 const uint8_t *above, const uint8_t *left) { 902 __m256i l = get_left_vector(left); 903 const __m256i t0 = get_top_vector(above); 904 const __m256i t1 = get_top_vector(above + 16); 905 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); 906 __m256i rep = _mm256_set1_epi16((short)0x8000); 907 const __m256i one = _mm256_set1_epi16(1); 908 909 int i; 910 for (i = 0; i < 16; ++i) { 911 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 912 913 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); 914 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); 915 916 _mm_store_si128((__m128i *)dst, r0); 917 _mm_store_si128((__m128i *)(dst + 16), r1); 918 919 dst += stride; 920 rep = _mm256_add_epi16(rep, one); 921 } 922 923 l = get_left_vector(left + 16); 924 rep = _mm256_set1_epi16((short)0x8000); 925 for (i = 0; i < 16; ++i) { 926 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 927 928 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); 929 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); 930 931 _mm_store_si128((__m128i *)dst, r0); 932 _mm_store_si128((__m128i *)(dst + 16), r1); 933 934 dst += stride; 935 rep = _mm256_add_epi16(rep, one); 936 } 937 } 938 939 void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, 940 const uint8_t *above, const uint8_t *left) { 941 const __m256i t0 = get_top_vector(above); 942 const __m256i t1 = get_top_vector(above + 16); 943 const 
__m256i tl = _mm256_set1_epi16((int16_t)above[-1]); 944 const __m256i one = _mm256_set1_epi16(1); 945 946 int i, j; 947 for (j = 0; j < 4; ++j) { 948 const __m256i l = get_left_vector(left + j * 16); 949 __m256i rep = _mm256_set1_epi16((short)0x8000); 950 for (i = 0; i < 16; ++i) { 951 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 952 953 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); 954 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); 955 956 _mm_store_si128((__m128i *)dst, r0); 957 _mm_store_si128((__m128i *)(dst + 16), r1); 958 959 dst += stride; 960 rep = _mm256_add_epi16(rep, one); 961 } 962 } 963 } 964 965 void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, 966 const uint8_t *above, const uint8_t *left) { 967 const __m256i t0 = get_top_vector(above); 968 const __m256i t1 = get_top_vector(above + 16); 969 const __m256i t2 = get_top_vector(above + 32); 970 const __m256i t3 = get_top_vector(above + 48); 971 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); 972 const __m256i one = _mm256_set1_epi16(1); 973 974 int i, j; 975 for (j = 0; j < 2; ++j) { 976 const __m256i l = get_left_vector(left + j * 16); 977 __m256i rep = _mm256_set1_epi16((short)0x8000); 978 for (i = 0; i < 16; ++i) { 979 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 980 981 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); 982 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); 983 const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); 984 const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); 985 986 _mm_store_si128((__m128i *)dst, r0); 987 _mm_store_si128((__m128i *)(dst + 16), r1); 988 _mm_store_si128((__m128i *)(dst + 32), r2); 989 _mm_store_si128((__m128i *)(dst + 48), r3); 990 991 dst += stride; 992 rep = _mm256_add_epi16(rep, one); 993 } 994 } 995 } 996 997 void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, 998 const uint8_t *above, const uint8_t *left) { 999 const __m256i t0 = get_top_vector(above); 1000 const __m256i t1 = 
get_top_vector(above + 16); 1001 const __m256i t2 = get_top_vector(above + 32); 1002 const __m256i t3 = get_top_vector(above + 48); 1003 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); 1004 const __m256i one = _mm256_set1_epi16(1); 1005 1006 int i, j; 1007 for (j = 0; j < 4; ++j) { 1008 const __m256i l = get_left_vector(left + j * 16); 1009 __m256i rep = _mm256_set1_epi16((short)0x8000); 1010 for (i = 0; i < 16; ++i) { 1011 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 1012 1013 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); 1014 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); 1015 const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); 1016 const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); 1017 1018 _mm_store_si128((__m128i *)dst, r0); 1019 _mm_store_si128((__m128i *)(dst + 16), r1); 1020 _mm_store_si128((__m128i *)(dst + 32), r2); 1021 _mm_store_si128((__m128i *)(dst + 48), r3); 1022 1023 dst += stride; 1024 rep = _mm256_add_epi16(rep, one); 1025 } 1026 } 1027 } 1028 1029 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1030 void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, 1031 const uint8_t *above, const uint8_t *left) { 1032 const __m256i t0 = get_top_vector(above); 1033 const __m256i t1 = get_top_vector(above + 16); 1034 const __m256i t2 = get_top_vector(above + 32); 1035 const __m256i t3 = get_top_vector(above + 48); 1036 const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]); 1037 const __m256i one = _mm256_set1_epi16(1); 1038 1039 int i; 1040 const __m256i l = get_left_vector(left); 1041 __m256i rep = _mm256_set1_epi16((short)0x8000); 1042 for (i = 0; i < 16; ++i) { 1043 const __m256i l16 = _mm256_shuffle_epi8(l, rep); 1044 1045 const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl); 1046 const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl); 1047 const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl); 1048 const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl); 1049 1050 _mm_store_si128((__m128i *)dst, r0); 1051 
_mm_store_si128((__m128i *)(dst + 16), r1); 1052 _mm_store_si128((__m128i *)(dst + 32), r2); 1053 _mm_store_si128((__m128i *)(dst + 48), r3); 1054 1055 dst += stride; 1056 rep = _mm256_add_epi16(rep, one); 1057 } 1058 } 1059 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 1060 1061 #if CONFIG_AV1_HIGHBITDEPTH 1062 1063 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2( 1064 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { 1065 const int frac_bits = 6 - upsample_above; 1066 const int max_base_x = ((N + 4) - 1) << upsample_above; 1067 1068 assert(dx > 0); 1069 // pre-filter above pixels 1070 // store in temp buffers: 1071 // above[x] * 32 + 16 1072 // above[x+1] - above[x] 1073 // final pixels will be calculated as: 1074 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1075 __m256i a0, a1, a32, a16; 1076 __m256i diff, c3f; 1077 __m128i a_mbase_x, max_base_x128, base_inc128, mask128; 1078 __m128i a0_128, a1_128; 1079 a16 = _mm256_set1_epi16(16); 1080 a_mbase_x = _mm_set1_epi16(above[max_base_x]); 1081 max_base_x128 = _mm_set1_epi16(max_base_x); 1082 c3f = _mm256_set1_epi16(0x3f); 1083 1084 int x = dx; 1085 for (int r = 0; r < N; r++) { 1086 __m256i b, res, shift; 1087 __m128i res1; 1088 1089 int base = x >> frac_bits; 1090 if (base >= max_base_x) { 1091 for (int i = r; i < N; ++i) { 1092 dst[i] = a_mbase_x; // save 4 values 1093 } 1094 return; 1095 } 1096 1097 a0_128 = _mm_loadu_si128((__m128i *)(above + base)); 1098 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); 1099 1100 if (upsample_above) { 1101 a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]); 1102 a1_128 = _mm_srli_si128(a0_128, 8); 1103 1104 base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8, 1105 base + 10, base + 12, base + 14); 1106 shift = _mm256_srli_epi16( 1107 _mm256_and_si256( 1108 _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), 1109 _mm256_set1_epi16(0x3f)), 1110 1); 1111 } 
else { 1112 base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4, 1113 base + 5, base + 6, base + 7); 1114 shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); 1115 } 1116 a0 = _mm256_castsi128_si256(a0_128); 1117 a1 = _mm256_castsi128_si256(a1_128); 1118 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] 1119 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 1120 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 1121 1122 b = _mm256_mullo_epi16(diff, shift); 1123 res = _mm256_add_epi16(a32, b); 1124 res = _mm256_srli_epi16(res, 5); 1125 res1 = _mm256_castsi256_si128(res); 1126 1127 mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128); 1128 dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); 1129 x += dx; 1130 } 1131 } 1132 1133 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2( 1134 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { 1135 const int frac_bits = 6 - upsample_above; 1136 const int max_base_x = ((N + 4) - 1) << upsample_above; 1137 1138 assert(dx > 0); 1139 // pre-filter above pixels 1140 // store in temp buffers: 1141 // above[x] * 32 + 16 1142 // above[x+1] - above[x] 1143 // final pixels will be calculated as: 1144 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1145 __m256i a0, a1, a32, a16; 1146 __m256i diff; 1147 __m128i a_mbase_x, max_base_x128, base_inc128, mask128; 1148 1149 a16 = _mm256_set1_epi32(16); 1150 a_mbase_x = _mm_set1_epi16(above[max_base_x]); 1151 max_base_x128 = _mm_set1_epi32(max_base_x); 1152 1153 int x = dx; 1154 for (int r = 0; r < N; r++) { 1155 __m256i b, res, shift; 1156 __m128i res1; 1157 1158 int base = x >> frac_bits; 1159 if (base >= max_base_x) { 1160 for (int i = r; i < N; ++i) { 1161 dst[i] = a_mbase_x; // save 4 values 1162 } 1163 return; 1164 } 1165 1166 a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); 1167 a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); 1168 
1169 if (upsample_above) { 1170 a0 = _mm256_permutevar8x32_epi32( 1171 a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); 1172 a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); 1173 base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6); 1174 shift = _mm256_srli_epi32( 1175 _mm256_and_si256( 1176 _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), 1177 _mm256_set1_epi32(0x3f)), 1178 1); 1179 } else { 1180 base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3); 1181 shift = _mm256_srli_epi32( 1182 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); 1183 } 1184 1185 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] 1186 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 1187 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 1188 1189 b = _mm256_mullo_epi32(diff, shift); 1190 res = _mm256_add_epi32(a32, b); 1191 res = _mm256_srli_epi32(res, 5); 1192 1193 res1 = _mm256_castsi256_si128(res); 1194 res1 = _mm_packus_epi32(res1, res1); 1195 1196 mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128); 1197 mask128 = _mm_packs_epi32(mask128, mask128); // goto 16 bit 1198 dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128); 1199 x += dx; 1200 } 1201 } 1202 1203 static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst, 1204 ptrdiff_t stride, 1205 const uint16_t *above, 1206 int upsample_above, int dx, 1207 int bd) { 1208 __m128i dstvec[16]; 1209 if (bd < 12) { 1210 highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above, 1211 dx); 1212 } else { 1213 highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above, 1214 upsample_above, dx); 1215 } 1216 for (int i = 0; i < N; i++) { 1217 _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]); 1218 } 1219 } 1220 1221 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2( 1222 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { 1223 const int frac_bits = 6 - upsample_above; 1224 const int max_base_x = ((8 + N) - 
1) << upsample_above; 1225 1226 assert(dx > 0); 1227 // pre-filter above pixels 1228 // store in temp buffers: 1229 // above[x] * 32 + 16 1230 // above[x+1] - above[x] 1231 // final pixels will be calculated as: 1232 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1233 __m256i a0, a1, a0_1, a1_1, a32, a16; 1234 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; 1235 1236 a16 = _mm256_set1_epi32(16); 1237 a_mbase_x = _mm256_set1_epi16(above[max_base_x]); 1238 max_base_x256 = _mm256_set1_epi32(max_base_x); 1239 1240 int x = dx; 1241 for (int r = 0; r < N; r++) { 1242 __m256i b, res, res1, shift; 1243 1244 int base = x >> frac_bits; 1245 if (base >= max_base_x) { 1246 for (int i = r; i < N; ++i) { 1247 dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values 1248 } 1249 return; 1250 } 1251 1252 a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); 1253 a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); 1254 1255 if (upsample_above) { 1256 a0 = _mm256_permutevar8x32_epi32( 1257 a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); 1258 a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1)); 1259 1260 a0_1 = 1261 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); 1262 a0_1 = _mm256_permutevar8x32_epi32( 1263 a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0)); 1264 a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1)); 1265 1266 a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1); 1267 a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1); 1268 base_inc256 = 1269 _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8, 1270 base + 10, base + 12, base + 14); 1271 shift = _mm256_srli_epi32( 1272 _mm256_and_si256( 1273 _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above), 1274 _mm256_set1_epi32(0x3f)), 1275 1); 1276 } else { 1277 base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3, 1278 base + 4, base + 5, base + 6, base + 
7); 1279 shift = _mm256_srli_epi32( 1280 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); 1281 } 1282 1283 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] 1284 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 1285 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 1286 1287 b = _mm256_mullo_epi32(diff, shift); 1288 res = _mm256_add_epi32(a32, b); 1289 res = _mm256_srli_epi32(res, 5); 1290 1291 res1 = _mm256_packus_epi32( 1292 res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))); 1293 1294 mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256); 1295 mask256 = _mm256_packs_epi32( 1296 mask256, _mm256_castsi128_si256( 1297 _mm256_extracti128_si256(mask256, 1))); // goto 16 bit 1298 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); 1299 dst[r] = _mm256_castsi256_si128(res1); 1300 x += dx; 1301 } 1302 } 1303 1304 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2( 1305 int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) { 1306 const int frac_bits = 6 - upsample_above; 1307 const int max_base_x = ((8 + N) - 1) << upsample_above; 1308 1309 assert(dx > 0); 1310 // pre-filter above pixels 1311 // store in temp buffers: 1312 // above[x] * 32 + 16 1313 // above[x+1] - above[x] 1314 // final pixels will be calculated as: 1315 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1316 __m256i a0, a1, a32, a16, c3f; 1317 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; 1318 __m128i a0_x128, a1_x128; 1319 1320 a16 = _mm256_set1_epi16(16); 1321 a_mbase_x = _mm256_set1_epi16(above[max_base_x]); 1322 max_base_x256 = _mm256_set1_epi16(max_base_x); 1323 c3f = _mm256_set1_epi16(0x3f); 1324 1325 int x = dx; 1326 for (int r = 0; r < N; r++) { 1327 __m256i b, res, res1, shift; 1328 1329 int base = x >> frac_bits; 1330 if (base >= max_base_x) { 1331 for (int i = r; i < N; ++i) { 1332 dst[i] = _mm256_castsi256_si128(a_mbase_x); // save 8 values 1333 } 1334 return; 1335 } 1336 1337 a0_x128 = 
_mm_loadu_si128((__m128i *)(above + base)); 1338 if (upsample_above) { 1339 __m128i mask, atmp0, atmp1, atmp2, atmp3; 1340 a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8)); 1341 atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]); 1342 atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]); 1343 atmp2 = 1344 _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); 1345 atmp3 = 1346 _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16)); 1347 mask = 1348 _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15)); 1349 a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask); 1350 mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16), 1351 _mm_set1_epi8(15)); 1352 a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask); 1353 1354 base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6, 1355 base + 8, base + 10, base + 12, base + 14, 1356 0, 0, 0, 0, 0, 0, 0, 0); 1357 shift = _mm256_srli_epi16( 1358 _mm256_and_si256( 1359 _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), 1360 1); 1361 } else { 1362 a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1)); 1363 base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, 1364 base + 4, base + 5, base + 6, base + 7, 0, 1365 0, 0, 0, 0, 0, 0, 0); 1366 shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); 1367 } 1368 a0 = _mm256_castsi128_si256(a0_x128); 1369 a1 = _mm256_castsi128_si256(a1_x128); 1370 1371 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] 1372 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 1373 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 1374 1375 b = _mm256_mullo_epi16(diff, shift); 1376 res = _mm256_add_epi16(a32, b); 1377 res = _mm256_srli_epi16(res, 5); 1378 1379 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); 1380 res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256); 1381 dst[r] = _mm256_castsi256_si128(res1); 1382 x += dx; 1383 } 1384 } 1385 1386 static void 
highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst, 1387 ptrdiff_t stride, 1388 const uint16_t *above, 1389 int upsample_above, int dx, 1390 int bd) { 1391 __m128i dstvec[32]; 1392 if (bd < 12) { 1393 highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above, 1394 dx); 1395 } else { 1396 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above, 1397 upsample_above, dx); 1398 } 1399 for (int i = 0; i < N; i++) { 1400 _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]); 1401 } 1402 } 1403 1404 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2( 1405 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { 1406 // here upsample_above is 0 by design of av1_use_intra_edge_upsample 1407 (void)upsample_above; 1408 const int frac_bits = 6; 1409 const int max_base_x = ((16 + N) - 1); 1410 1411 // pre-filter above pixels 1412 // store in temp buffers: 1413 // above[x] * 32 + 16 1414 // above[x+1] - above[x] 1415 // final pixels will be calculated as: 1416 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1417 __m256i a0, a0_1, a1, a1_1, a32, a16; 1418 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; 1419 1420 a16 = _mm256_set1_epi32(16); 1421 a_mbase_x = _mm256_set1_epi16(above[max_base_x]); 1422 max_base_x256 = _mm256_set1_epi16(max_base_x); 1423 1424 int x = dx; 1425 for (int r = 0; r < N; r++) { 1426 __m256i b, res[2], res1; 1427 1428 int base = x >> frac_bits; 1429 if (base >= max_base_x) { 1430 for (int i = r; i < N; ++i) { 1431 dstvec[i] = a_mbase_x; // save 16 values 1432 } 1433 return; 1434 } 1435 __m256i shift = _mm256_srli_epi32( 1436 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); 1437 1438 a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base))); 1439 a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1))); 1440 1441 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] 1442 a32 = _mm256_slli_epi32(a0, 5); 
// a[x] * 32 1443 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 1444 b = _mm256_mullo_epi32(diff, shift); 1445 1446 res[0] = _mm256_add_epi32(a32, b); 1447 res[0] = _mm256_srli_epi32(res[0], 5); 1448 res[0] = _mm256_packus_epi32( 1449 res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); 1450 1451 int mdif = max_base_x - base; 1452 if (mdif > 8) { 1453 a0_1 = 1454 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8))); 1455 a1_1 = 1456 _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9))); 1457 1458 diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] 1459 a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 1460 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 1461 b = _mm256_mullo_epi32(diff, shift); 1462 1463 res[1] = _mm256_add_epi32(a32, b); 1464 res[1] = _mm256_srli_epi32(res[1], 5); 1465 res[1] = _mm256_packus_epi32( 1466 res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); 1467 } else { 1468 res[1] = a_mbase_x; 1469 } 1470 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), 1471 1); // 16 16bit values 1472 1473 base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, 1474 base + 4, base + 5, base + 6, base + 7, 1475 base + 8, base + 9, base + 10, base + 11, 1476 base + 12, base + 13, base + 14, base + 15); 1477 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); 1478 dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256); 1479 x += dx; 1480 } 1481 } 1482 1483 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2( 1484 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { 1485 // here upsample_above is 0 by design of av1_use_intra_edge_upsample 1486 (void)upsample_above; 1487 const int frac_bits = 6; 1488 const int max_base_x = ((16 + N) - 1); 1489 1490 // pre-filter above pixels 1491 // store in temp buffers: 1492 // above[x] * 32 + 16 1493 // above[x+1] - above[x] 1494 // final pixels will be calculated as: 1495 
// (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1496 __m256i a0, a1, a32, a16, c3f; 1497 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; 1498 1499 a16 = _mm256_set1_epi16(16); 1500 a_mbase_x = _mm256_set1_epi16(above[max_base_x]); 1501 max_base_x256 = _mm256_set1_epi16(max_base_x); 1502 c3f = _mm256_set1_epi16(0x3f); 1503 1504 int x = dx; 1505 for (int r = 0; r < N; r++) { 1506 __m256i b, res; 1507 1508 int base = x >> frac_bits; 1509 if (base >= max_base_x) { 1510 for (int i = r; i < N; ++i) { 1511 dstvec[i] = a_mbase_x; // save 16 values 1512 } 1513 return; 1514 } 1515 __m256i shift = 1516 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); 1517 1518 a0 = _mm256_loadu_si256((__m256i *)(above + base)); 1519 a1 = _mm256_loadu_si256((__m256i *)(above + base + 1)); 1520 1521 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] 1522 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 1523 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 1524 b = _mm256_mullo_epi16(diff, shift); 1525 1526 res = _mm256_add_epi16(a32, b); 1527 res = _mm256_srli_epi16(res, 5); // 16 16bit values 1528 1529 base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3, 1530 base + 4, base + 5, base + 6, base + 7, 1531 base + 8, base + 9, base + 10, base + 11, 1532 base + 12, base + 13, base + 14, base + 15); 1533 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); 1534 dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256); 1535 x += dx; 1536 } 1537 } 1538 1539 static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst, 1540 ptrdiff_t stride, 1541 const uint16_t *above, 1542 int upsample_above, int dx, 1543 int bd) { 1544 __m256i dstvec[64]; 1545 if (bd < 12) { 1546 highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above, 1547 dx); 1548 } else { 1549 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above, 1550 upsample_above, dx); 1551 } 1552 for (int i = 0; i < N; i++) { 1553 
_mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); 1554 } 1555 } 1556 1557 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2( 1558 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { 1559 // here upsample_above is 0 by design of av1_use_intra_edge_upsample 1560 (void)upsample_above; 1561 const int frac_bits = 6; 1562 const int max_base_x = ((32 + N) - 1); 1563 1564 // pre-filter above pixels 1565 // store in temp buffers: 1566 // above[x] * 32 + 16 1567 // above[x+1] - above[x] 1568 // final pixels will be calculated as: 1569 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1570 __m256i a0, a0_1, a1, a1_1, a32, a16, c3f; 1571 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; 1572 1573 a16 = _mm256_set1_epi32(16); 1574 a_mbase_x = _mm256_set1_epi16(above[max_base_x]); 1575 max_base_x256 = _mm256_set1_epi16(max_base_x); 1576 c3f = _mm256_set1_epi16(0x3f); 1577 1578 int x = dx; 1579 for (int r = 0; r < N; r++) { 1580 __m256i b, res[2], res1; 1581 1582 int base = x >> frac_bits; 1583 if (base >= max_base_x) { 1584 for (int i = r; i < N; ++i) { 1585 dstvec[i] = a_mbase_x; // save 32 values 1586 dstvec[i + N] = a_mbase_x; 1587 } 1588 return; 1589 } 1590 1591 __m256i shift = 1592 _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1); 1593 1594 for (int j = 0; j < 32; j += 16) { 1595 int mdif = max_base_x - (base + j); 1596 if (mdif <= 0) { 1597 res1 = a_mbase_x; 1598 } else { 1599 a0 = _mm256_cvtepu16_epi32( 1600 _mm_loadu_si128((__m128i *)(above + base + j))); 1601 a1 = _mm256_cvtepu16_epi32( 1602 _mm_loadu_si128((__m128i *)(above + base + 1 + j))); 1603 1604 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] 1605 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 1606 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 1607 b = _mm256_mullo_epi32(diff, shift); 1608 1609 res[0] = _mm256_add_epi32(a32, b); 1610 res[0] = _mm256_srli_epi32(res[0], 5); 1611 res[0] = 
_mm256_packus_epi32( 1612 res[0], 1613 _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); 1614 if (mdif > 8) { 1615 a0_1 = _mm256_cvtepu16_epi32( 1616 _mm_loadu_si128((__m128i *)(above + base + 8 + j))); 1617 a1_1 = _mm256_cvtepu16_epi32( 1618 _mm_loadu_si128((__m128i *)(above + base + 9 + j))); 1619 1620 diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] 1621 a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 1622 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 1623 b = _mm256_mullo_epi32(diff, shift); 1624 1625 res[1] = _mm256_add_epi32(a32, b); 1626 res[1] = _mm256_srli_epi32(res[1], 5); 1627 res[1] = _mm256_packus_epi32( 1628 res[1], 1629 _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); 1630 } else { 1631 res[1] = a_mbase_x; 1632 } 1633 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), 1634 1); // 16 16bit values 1635 base_inc256 = _mm256_setr_epi16( 1636 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, 1637 base + j + 5, base + j + 6, base + j + 7, base + j + 8, 1638 base + j + 9, base + j + 10, base + j + 11, base + j + 12, 1639 base + j + 13, base + j + 14, base + j + 15); 1640 1641 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); 1642 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); 1643 } 1644 if (!j) { 1645 dstvec[r] = res1; 1646 } else { 1647 dstvec[r + N] = res1; 1648 } 1649 } 1650 x += dx; 1651 } 1652 } 1653 1654 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2( 1655 int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) { 1656 // here upsample_above is 0 by design of av1_use_intra_edge_upsample 1657 (void)upsample_above; 1658 const int frac_bits = 6; 1659 const int max_base_x = ((32 + N) - 1); 1660 1661 // pre-filter above pixels 1662 // store in temp buffers: 1663 // above[x] * 32 + 16 1664 // above[x+1] - above[x] 1665 // final pixels will be calculated as: 1666 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1667 
__m256i a0, a1, a32, a16, c3f; 1668 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; 1669 1670 a16 = _mm256_set1_epi16(16); 1671 a_mbase_x = _mm256_set1_epi16(above[max_base_x]); 1672 max_base_x256 = _mm256_set1_epi16(max_base_x); 1673 c3f = _mm256_set1_epi16(0x3f); 1674 1675 int x = dx; 1676 for (int r = 0; r < N; r++) { 1677 __m256i b, res; 1678 1679 int base = x >> frac_bits; 1680 if (base >= max_base_x) { 1681 for (int i = r; i < N; ++i) { 1682 dstvec[i] = a_mbase_x; // save 32 values 1683 dstvec[i + N] = a_mbase_x; 1684 } 1685 return; 1686 } 1687 1688 __m256i shift = 1689 _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); 1690 1691 for (int j = 0; j < 32; j += 16) { 1692 int mdif = max_base_x - (base + j); 1693 if (mdif <= 0) { 1694 res = a_mbase_x; 1695 } else { 1696 a0 = _mm256_loadu_si256((__m256i *)(above + base + j)); 1697 a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j)); 1698 1699 diff = _mm256_sub_epi16(a1, a0); // a[x+1] - a[x] 1700 a32 = _mm256_slli_epi16(a0, 5); // a[x] * 32 1701 a32 = _mm256_add_epi16(a32, a16); // a[x] * 32 + 16 1702 b = _mm256_mullo_epi16(diff, shift); 1703 1704 res = _mm256_add_epi16(a32, b); 1705 res = _mm256_srli_epi16(res, 5); 1706 1707 base_inc256 = _mm256_setr_epi16( 1708 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, 1709 base + j + 5, base + j + 6, base + j + 7, base + j + 8, 1710 base + j + 9, base + j + 10, base + j + 11, base + j + 12, 1711 base + j + 13, base + j + 14, base + j + 15); 1712 1713 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); 1714 res = _mm256_blendv_epi8(a_mbase_x, res, mask256); 1715 } 1716 if (!j) { 1717 dstvec[r] = res; 1718 } else { 1719 dstvec[r + N] = res; 1720 } 1721 } 1722 x += dx; 1723 } 1724 } 1725 1726 static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst, 1727 ptrdiff_t stride, 1728 const uint16_t *above, 1729 int upsample_above, int dx, 1730 int bd) { 1731 __m256i dstvec[128]; 1732 if (bd < 12) { 1733 
highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, 1734 dx); 1735 } else { 1736 highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above, 1737 upsample_above, dx); 1738 } 1739 for (int i = 0; i < N; i++) { 1740 _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]); 1741 _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]); 1742 } 1743 } 1744 1745 static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst, 1746 ptrdiff_t stride, 1747 const uint16_t *above, 1748 int upsample_above, 1749 int dx) { 1750 // here upsample_above is 0 by design of av1_use_intra_edge_upsample 1751 (void)upsample_above; 1752 const int frac_bits = 6; 1753 const int max_base_x = ((64 + N) - 1); 1754 1755 // pre-filter above pixels 1756 // store in temp buffers: 1757 // above[x] * 32 + 16 1758 // above[x+1] - above[x] 1759 // final pixels will be calculated as: 1760 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1761 __m256i a0, a0_1, a1, a1_1, a32, a16; 1762 __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256; 1763 1764 a16 = _mm256_set1_epi32(16); 1765 a_mbase_x = _mm256_set1_epi16(above[max_base_x]); 1766 max_base_x256 = _mm256_set1_epi16(max_base_x); 1767 1768 int x = dx; 1769 for (int r = 0; r < N; r++, dst += stride) { 1770 __m256i b, res[2], res1; 1771 1772 int base = x >> frac_bits; 1773 if (base >= max_base_x) { 1774 for (int i = r; i < N; ++i) { 1775 _mm256_storeu_si256((__m256i *)dst, a_mbase_x); // save 32 values 1776 _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x); 1777 _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x); 1778 _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x); 1779 dst += stride; 1780 } 1781 return; 1782 } 1783 1784 __m256i shift = _mm256_srli_epi32( 1785 _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1); 1786 1787 __m128i a0_128, a0_1_128, a1_128, a1_1_128; 1788 for (int j = 0; j < 64; j += 16) { 1789 int mdif = max_base_x - (base + j); 
1790 if (mdif <= 0) { 1791 _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x); 1792 } else { 1793 a0_128 = _mm_loadu_si128((__m128i *)(above + base + j)); 1794 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j)); 1795 a0 = _mm256_cvtepu16_epi32(a0_128); 1796 a1 = _mm256_cvtepu16_epi32(a1_128); 1797 1798 diff = _mm256_sub_epi32(a1, a0); // a[x+1] - a[x] 1799 a32 = _mm256_slli_epi32(a0, 5); // a[x] * 32 1800 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 1801 b = _mm256_mullo_epi32(diff, shift); 1802 1803 res[0] = _mm256_add_epi32(a32, b); 1804 res[0] = _mm256_srli_epi32(res[0], 5); 1805 res[0] = _mm256_packus_epi32( 1806 res[0], 1807 _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1))); 1808 if (mdif > 8) { 1809 a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j)); 1810 a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j)); 1811 a0_1 = _mm256_cvtepu16_epi32(a0_1_128); 1812 a1_1 = _mm256_cvtepu16_epi32(a1_1_128); 1813 1814 diff = _mm256_sub_epi32(a1_1, a0_1); // a[x+1] - a[x] 1815 a32 = _mm256_slli_epi32(a0_1, 5); // a[x] * 32 1816 a32 = _mm256_add_epi32(a32, a16); // a[x] * 32 + 16 1817 b = _mm256_mullo_epi32(diff, shift); 1818 1819 res[1] = _mm256_add_epi32(a32, b); 1820 res[1] = _mm256_srli_epi32(res[1], 5); 1821 res[1] = _mm256_packus_epi32( 1822 res[1], 1823 _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1))); 1824 } else { 1825 res[1] = a_mbase_x; 1826 } 1827 res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]), 1828 1); // 16 16bit values 1829 base_inc256 = _mm256_setr_epi16( 1830 base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4, 1831 base + j + 5, base + j + 6, base + j + 7, base + j + 8, 1832 base + j + 9, base + j + 10, base + j + 11, base + j + 12, 1833 base + j + 13, base + j + 14, base + j + 15); 1834 1835 mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256); 1836 res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256); 1837 _mm256_storeu_si256((__m256i *)(dst + j), res1); 
// (closes the loop nest of the preceding z1 kernel, whose definition starts
// before this chunk)
      }
    }
    x += dx;
  }
}

// Zone-1 directional intra prediction (0 < angle < 90) for 64-wide blocks of
// N rows, 16-bit pixels with 16-bit intermediates.  The dispatcher below
// selects this only for bd < 12, where 32 * pixel + 16 still fits in a
// 16-bit lane (for 12-bit input it would overflow; the 32-bit variant is
// used instead).
static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
                                              ptrdiff_t stride,
                                              const uint16_t *above,
                                              int upsample_above, int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  // Index of the last usable reference pixel in `above`.
  const int max_base_x = ((64 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a32, a16, c3f;
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;

  a16 = _mm256_set1_epi16(16);
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
  max_base_x256 = _mm256_set1_epi16(max_base_x);
  c3f = _mm256_set1_epi16(0x3f);

  int x = dx;  // running position along `above`, in 1/64-pel units
  for (int r = 0; r < N; r++, dst += stride) {
    __m256i b, res;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      // All remaining rows are past the reference: replicate the last
      // reference pixel across the rest of the block and return.
      for (int i = r; i < N; ++i) {
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // 16 16-bit pixels
        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
        dst += stride;
      }
      return;
    }

    // Interpolation fraction: (x & 63) >> 1, i.e. a 5-bit weight in [0, 31].
    __m256i shift =
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);

    for (int j = 0; j < 64; j += 16) {
      int mdif = max_base_x - (base + j);
      if (mdif <= 0) {
        // This 16-pixel chunk lies entirely past the reference.
        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
      } else {
        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));

        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
        b = _mm256_mullo_epi16(diff, shift);

        res = _mm256_add_epi16(a32, b);
        res = _mm256_srli_epi16(res, 5);

        // Per-lane reference index, used to clamp lanes that run past
        // max_base_x to the replicated last pixel.
        base_inc256 = _mm256_setr_epi16(
            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
            base + j + 13, base + j + 14, base + j + 15);

        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
        _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16bit values
      }
    }
    x += dx;
  }
}

// Directional prediction, zone 1: 0 < angle < 90.
// Dispatches on block width; `left` and `dy` are unused in zone 1.
void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_above,
                                      int dx, int dy, int bd) {
  (void)left;
  (void)dy;

  switch (bw) {
    case 4:
      highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
                                       dx, bd);
      break;
    case 8:
      highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
                                       dx, bd);
      break;
    case 16:
      highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
                                        dx, bd);
      break;
    case 32:
      highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
                                        dx, bd);
      break;
    case 64:
      // 16-bit intermediates are only safe below 12-bit depth; at bd == 12
      // use the 32-bit-intermediate variant.
      if (bd < 12) {
        highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
                                          upsample_above, dx);
      } else {
        highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
                                                upsample_above, dx);
      }
      break;
    default: break;
  }
  return;
}

// Transpose one 16x16 tile of 16-bit pixels from src (row pitch pitchSrc)
// into dst (row pitch pitchDst).
static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
                                      uint16_t *dst, ptrdiff_t pitchDst) {
  __m256i r[16];
  __m256i d[16];
  for (int j = 0; j < 16; j++) {
    r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
1962 } 1963 highbd_transpose16x16_avx2(r, d); 1964 for (int j = 0; j < 16; j++) { 1965 _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]); 1966 } 1967 } 1968 1969 static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc, 1970 uint16_t *dst, ptrdiff_t pitchDst, int width, 1971 int height) { 1972 for (int j = 0; j < height; j += 16) 1973 for (int i = 0; i < width; i += 16) 1974 highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, 1975 dst + j * pitchDst + i, pitchDst); 1976 } 1977 1978 static void highbd_dr_prediction_32bit_z2_Nx4_avx2( 1979 int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above, 1980 const uint16_t *left, int upsample_above, int upsample_left, int dx, 1981 int dy) { 1982 const int min_base_x = -(1 << upsample_above); 1983 const int min_base_y = -(1 << upsample_left); 1984 const int frac_bits_x = 6 - upsample_above; 1985 const int frac_bits_y = 6 - upsample_left; 1986 1987 assert(dx > 0); 1988 // pre-filter above pixels 1989 // store in temp buffers: 1990 // above[x] * 32 + 16 1991 // above[x+1] - above[x] 1992 // final pixels will be calculated as: 1993 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 1994 __m256i a0_x, a1_x, a32, a16; 1995 __m256i diff; 1996 __m128i c3f, min_base_y128; 1997 1998 a16 = _mm256_set1_epi32(16); 1999 c3f = _mm_set1_epi32(0x3f); 2000 min_base_y128 = _mm_set1_epi32(min_base_y); 2001 2002 for (int r = 0; r < N; r++) { 2003 __m256i b, res, shift; 2004 __m128i resx, resy, resxy; 2005 __m128i a0_x128, a1_x128; 2006 int y = r + 1; 2007 int base_x = (-y * dx) >> frac_bits_x; 2008 int base_shift = 0; 2009 if (base_x < (min_base_x - 1)) { 2010 base_shift = (min_base_x - base_x - 1) >> upsample_above; 2011 } 2012 int base_min_diff = 2013 (min_base_x - base_x + upsample_above) >> upsample_above; 2014 if (base_min_diff > 4) { 2015 base_min_diff = 4; 2016 } else { 2017 if (base_min_diff < 0) base_min_diff = 0; 2018 } 2019 2020 if (base_shift > 3) { 2021 a0_x = _mm256_setzero_si256(); 
      a1_x = _mm256_setzero_si256();
      shift = _mm256_setzero_si256();
    } else {
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      if (upsample_above) {
        // Upsampled edge stores pixels interleaved; deinterleave with the
        // even/odd shuffle table, then a[x+1] is the high half.
        a0_x128 = _mm_shuffle_epi8(a0_x128,
                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
        a1_x128 = _mm_srli_si128(a0_x128, 8);

        shift = _mm256_castsi128_si256(_mm_srli_epi32(
            _mm_and_si128(
                _mm_slli_epi32(
                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
                    upsample_above),
                c3f),
            1));
      } else {
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
        a1_x128 = _mm_srli_si128(a0_x128, 2);

        // 5-bit interpolation fraction per column: ((c*64 - y*dx) & 63) >> 1.
        shift = _mm256_castsi128_si256(_mm_srli_epi32(
            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
                          c3f),
            1));
      }
      a0_x = _mm256_cvtepu16_epi32(a0_x128);
      a1_x = _mm256_cvtepu16_epi32(a1_x128);
    }
    // y calc: left-edge interpolation goes into the upper 128-bit lane so
    // both halves are computed by one 256-bit multiply/add below.
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
      DECLARE_ALIGNED(32, int, base_y_c[4]);
      r6 = _mm_set1_epi32(r << 6);
      dy128 = _mm_set1_epi32(dy);
      c1234 = _mm_setr_epi32(1, 2, 3, 4);
      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
      // Clamp indices below min_base_y to 0 so the gather below stays in
      // bounds; those lanes are masked out at the final blend.
      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);

      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]]);
      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);

      if (upsample_left) {
        shifty = _mm_srli_epi32(
            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
      }
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
      shift = _mm256_inserti128_si256(shift, shifty, 1);
    }

    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16

    b = _mm256_mullo_epi32(diff, shift);
    res = _mm256_add_epi32(a32, b);
    res = _mm256_srli_epi32(res, 5);

    resx = _mm256_castsi256_si128(res);
    resx = _mm_packus_epi32(resx, resx);

    resy = _mm256_extracti128_si256(res, 1);
    resy = _mm_packus_epi32(resy, resy);

    // Take the first base_min_diff pixels from the left-edge result, the
    // rest from the above-edge result.
    resxy =
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
    _mm_storel_epi64((__m128i *)(dst), resxy);
    dst += stride;
  }
}

// Zone-2 directional intra prediction for 4-wide blocks of N rows, 16-bit
// intermediates (companion to the 32-bit variant above; same algorithm,
// narrower arithmetic).
static void highbd_dr_prediction_z2_Nx4_avx2(
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
    int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0_x, a1_x, a32, a16;
  __m256i diff;
  __m128i c3f, min_base_y128;

  a16 = _mm256_set1_epi16(16);
  c3f = _mm_set1_epi16(0x3f);
  min_base_y128 = _mm_set1_epi16(min_base_y);

  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i resx, resy, resxy;
    __m128i a0_x128, a1_x128;
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x <
(min_base_x - 1)) {
      // Leading columns whose reference lies left of `above`; they are
      // replaced by left-edge pixels at the final blend.
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 4) {
      base_min_diff = 4;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 3) {
      // Whole row comes from the left edge; x-side inputs are unused.
      a0_x = _mm256_setzero_si256();
      a1_x = _mm256_setzero_si256();
      shift = _mm256_setzero_si256();
    } else {
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      if (upsample_above) {
        // Deinterleave the upsampled edge; a[x+1] is the high half.
        a0_x128 = _mm_shuffle_epi8(a0_x128,
                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
        a1_x128 = _mm_srli_si128(a0_x128, 8);

        shift = _mm256_castsi128_si256(_mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
                                              (2 << 6) - y * dx,
                                              (3 << 6) - y * dx, 0, 0, 0, 0),
                               upsample_above),
                c3f),
            1));
      } else {
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
        a1_x128 = _mm_srli_si128(a0_x128, 2);

        // 5-bit fraction per column: ((c*64 - y*dx) & 63) >> 1.
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
            _mm_and_si128(
                _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
                               (3 << 6) - y * dx, 0, 0, 0, 0),
                c3f),
            1));
      }
      a0_x = _mm256_castsi128_si256(a0_x128);
      a1_x = _mm256_castsi128_si256(a1_x128);
    }
    // y calc: left-edge inputs go into the upper 128-bit lane so one 256-bit
    // multiply/add below covers both edges.
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
      r6 = _mm_set1_epi16(r << 6);
      dy128 = _mm_set1_epi16(dy);
      c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
      // Clamp out-of-range indices to 0 to keep the scalar gather in bounds.
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);

      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
                            0, 0);

      if (upsample_left) {
        shifty = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
      }
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
      shift = _mm256_inserti128_si256(shift, shifty, 1);
    }

    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

    b = _mm256_mullo_epi16(diff, shift);
    res = _mm256_add_epi16(a32, b);
    res = _mm256_srli_epi16(res, 5);

    resx = _mm256_castsi256_si128(res);
    resy = _mm256_extracti128_si256(res, 1);
    // First base_min_diff pixels from the left-edge result, rest from above.
    resxy =
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
    _mm_storel_epi64((__m128i *)(dst), resxy);
    dst += stride;
  }
}

// Zone-2 directional intra prediction for 8-wide blocks of N rows, 32-bit
// intermediates.  Same above/left blend scheme as the Nx4 variants, widened
// to 8 columns.
static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
    int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
  __m256i diff;
  __m128i a0_x128, a1_x128;

  a16 =
_mm256_set1_epi32(16);
  c3f = _mm256_set1_epi32(0x3f);
  min_base_y256 = _mm256_set1_epi32(min_base_y);

  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i resx, resy, resxy;
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    // Lanes (clamped to 8) that must be taken from the left edge.
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 8) {
      base_min_diff = 8;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 7) {
      // Whole row comes from the left edge.
      resx = _mm_setzero_si128();
    } else {
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      if (upsample_above) {
        // Deinterleave 16 upsampled pixels spread across two loads into
        // even (a0) and odd (a1) streams via the shuffle tables; the
        // cmpgt(...,15) masks pick which load each output byte came from.
        __m128i mask, atmp0, atmp1, atmp2, atmp3;
        a1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
        atmp0 = _mm_shuffle_epi8(a0_x128,
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
        atmp1 = _mm_shuffle_epi8(a1_x128,
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
        atmp2 = _mm_shuffle_epi8(
            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
        atmp3 = _mm_shuffle_epi8(
            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
                              _mm_set1_epi8(15));
        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
        mask = _mm_cmpgt_epi8(
            *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
            _mm_set1_epi8(15));
        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
        shift = _mm256_srli_epi32(
            _mm256_and_si256(
                _mm256_slli_epi32(
                    _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
                                      (2 << 6) - y * dx, (3 << 6) - y * dx,
                                      (4 << 6) - y * dx, (5 << 6) - y * dx,
                                      (6 << 6) - y * dx, (7 << 6) - y * dx),
                    upsample_above),
                c3f),
            1);
      } else {
        a1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
        a1_x128 =
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);

        // 5-bit fraction per column: ((c*64 - y*dx) & 63) >> 1.
        shift = _mm256_srli_epi32(
            _mm256_and_si256(
                _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
                                  (3 << 6) - y * dx, (4 << 6) - y * dx,
                                  (5 << 6) - y * dx, (6 << 6) - y * dx,
                                  (7 << 6) - y * dx),
                c3f),
            1);
      }
      a0_x = _mm256_cvtepu16_epi32(a0_x128);
      a1_x = _mm256_cvtepu16_epi32(a1_x128);

      diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
      a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16

      b = _mm256_mullo_epi32(diff, shift);
      res = _mm256_add_epi32(a32, b);
      res = _mm256_srli_epi32(res, 5);

      // packus crosses the two 128-bit lanes, so move the high lane down
      // before packing 8x32 -> 8x16.
      resx = _mm256_castsi256_si128(_mm256_packus_epi32(
          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
    }
    // y calc
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int, base_y_c[8]);
      __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
      r6 = _mm256_set1_epi32(r << 6);
      dy256 = _mm256_set1_epi32(dy);
      c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
      y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
      base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
      // Clamp out-of-range left indices to 0 to keep the gather in bounds.
      mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
      base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
      _mm256_store_si256((__m256i *)base_y_c, base_y_c256);

      a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
          left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
          left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
          left[base_y_c[6]], left[base_y_c[7]]));
      a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
          left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
          left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
          left[base_y_c[6] + 1], left[base_y_c[7] + 1]));

      if (upsample_left) {
        shift = _mm256_srli_epi32(
            _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
            1);
      } else {
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
      }
      diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
      a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16

      b = _mm256_mullo_epi32(diff, shift);
      res = _mm256_add_epi32(a32, b);
      res = _mm256_srli_epi32(res, 5);

      resy = _mm256_castsi256_si128(_mm256_packus_epi32(
          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
    } else {
      resy = resx;
    }
    // First base_min_diff pixels from the left result, rest from above.
    resxy =
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
    _mm_storeu_si128((__m128i *)(dst), resxy);
    dst += stride;
  }
}

// Zone-2 directional intra prediction for 8-wide blocks of N rows, 16-bit
// intermediates (companion to the 32-bit variant above).
static void highbd_dr_prediction_z2_Nx8_avx2(
    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
    int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i c3f, min_base_y128;
  __m256i a0_x, a1_x, diff, a32, a16;
  __m128i a0_x128, a1_x128;

  a16 = _mm256_set1_epi16(16);
  c3f = _mm_set1_epi16(0x3f);
  min_base_y128 = _mm_set1_epi16(min_base_y);

  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i resx, resy, resxy;
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    // Lanes (clamped to 8) that must come from the left edge.
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 8) {
      base_min_diff = 8;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 7) {
      // Whole row comes from the left edge; x-side inputs are unused.
      a0_x = _mm256_setzero_si256();
      a1_x = _mm256_setzero_si256();
      shift = _mm256_setzero_si256();
    } else {
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      if (upsample_above) {
        // Deinterleave 16 upsampled pixels from two loads into even (a0)
        // and odd (a1) streams; the cmpgt(...,15) masks select which of the
        // two loads each output byte originated from.
        __m128i mask, atmp0, atmp1, atmp2, atmp3;
        a1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
        atmp0 = _mm_shuffle_epi8(a0_x128,
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
        atmp1 = _mm_shuffle_epi8(a1_x128,
                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
        atmp2 = _mm_shuffle_epi8(
            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
        atmp3 = _mm_shuffle_epi8(
            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
                              _mm_set1_epi8(15));
        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
        mask = _mm_cmpgt_epi8(
            *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
            _mm_set1_epi8(15));
        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);

        shift = _mm256_castsi128_si256(_mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(
                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
                    upsample_above),
                c3f),
            1));
      } else {
        a1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
        a1_x128 =
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);

        // 5-bit fraction per column: ((c*64 - y*dx) & 63) >> 1.
        shift = _mm256_castsi128_si256(_mm_srli_epi16(
            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
                          c3f),
            1));
      }
      a0_x = _mm256_castsi128_si256(a0_x128);
      a1_x = _mm256_castsi128_si256(a1_x128);
    }

    // y calc: left-edge inputs occupy the upper 128-bit lane so one 256-bit
    // multiply/add covers both edges.
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
      r6 = _mm_set1_epi16(r << 6);
      dy128 = _mm_set1_epi16(dy);
      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
      // Clamp out-of-range left indices to 0 to keep the gather in bounds.
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);

      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);
      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);

      if (upsample_left) {
        shifty = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
      }
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
      shift = _mm256_inserti128_si256(shift, shifty, 1);
    }

    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

    b = _mm256_mullo_epi16(diff, shift);
    res = _mm256_add_epi16(a32, b);
    res = _mm256_srli_epi16(res, 5);

    resx = _mm256_castsi256_si128(res);
    resy = _mm256_extracti128_si256(res, 1);

    // First base_min_diff pixels from the left result, rest from above.
    resxy =
        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
    _mm_storeu_si128((__m128i *)(dst), resxy);
    dst += stride;
  }
}

// Zone-2 directional intra prediction for general HxW blocks (W a multiple
// of 16), 32-bit intermediates.  Processes 16 columns per iteration as two
// 8-lane halves (resx[0]/resx[1], resy[0]/resy[1]) and blends above- vs
// left-derived pixels per 16-lane chunk.
static void highbd_dr_prediction_32bit_z2_HxW_avx2(
    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
    int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;
  const int min_base_y = -1;
  (void)upsample_above;
  (void)upsample_left;
  const int frac_bits_x = 6;
  const int frac_bits_y = 6;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
  __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
  DECLARE_ALIGNED(32, int, base_y_c[16]);

  a16 = _mm256_set1_epi32(16);
  c1 = _mm256_srli_epi32(a16, 4);  // all-ones (1) per lane
  c8 = _mm256_srli_epi32(a16, 1);  // 8 per lane
  min_base_y256 = _mm256_set1_epi32(min_base_y);
  c3f = _mm256_set1_epi32(0x3f);
  dy256 = _mm256_set1_epi32(dy);
  c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
  c1234 = _mm256_add_epi32(c0123, c1);

  for (int r = 0; r < H; r++) {
    __m256i b, res, shift, ydx;
    __m256i resx[2], resy[2];
    __m256i resxy, j256, r6;
    for (int j = 0; j < W; j += 16) {
      j256 = _mm256_set1_epi32(j);
      int y = r + 1;
      ydx = _mm256_set1_epi32(y * dx);

      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
      int base_shift = 0;
      if ((base_x) < (min_base_x - 1)) {
base_shift = (min_base_x - base_x - 1);
      }
      // Lanes (clamped to 16) that must come from the left edge.
      int base_min_diff = (min_base_x - base_x);
      if (base_min_diff > 16) {
        base_min_diff = 16;
      } else {
        if (base_min_diff < 0) base_min_diff = 0;
      }

      // First 8 columns of this 16-column chunk, interpolated from `above`.
      if (base_shift > 7) {
        resx[0] = _mm256_setzero_si256();
      } else {
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
        a1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
        a1_x128 =
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);

        a0_x = _mm256_cvtepu16_epi32(a0_x128);
        a1_x = _mm256_cvtepu16_epi32(a1_x128);

        // 5-bit fraction per column: (((j+c)*64 - y*dx) & 63) >> 1.
        r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
        shift = _mm256_srli_epi32(
            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);

        diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
        a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16

        b = _mm256_mullo_epi32(diff, shift);
        res = _mm256_add_epi32(a32, b);
        res = _mm256_srli_epi32(res, 5);

        resx[0] = _mm256_packus_epi32(
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
      }
      // Second 8 columns of the chunk.
      int base_shift8 = 0;
      if ((base_x + 8) < (min_base_x - 1)) {
        base_shift8 = (min_base_x - (base_x + 8) - 1);
      }
      if (base_shift8 > 7) {
        resx[1] = _mm256_setzero_si256();
      } else {
        a0_1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
        a1_1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);

        a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
        a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);

        r6 = _mm256_slli_epi32(
            _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
        shift = _mm256_srli_epi32(
            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);

        diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
        a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
        a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
        b = _mm256_mullo_epi32(diff, shift);

        resx[1] = _mm256_add_epi32(a32, b);
        resx[1] = _mm256_srli_epi32(resx[1], 5);
        resx[1] = _mm256_packus_epi32(
            resx[1],
            _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
      }
      resx[0] =
          _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
                                  1);  // 16 16bit values

      // y calc: same 16 columns interpolated from the `left` edge.
      resy[0] = _mm256_setzero_si256();
      if ((base_x < min_base_x)) {
        __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
        r6 = _mm256_set1_epi32(r << 6);
        c256 = _mm256_add_epi32(j256, c1234);
        y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
        base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
        // Clamp out-of-range left indices to 0; those lanes are discarded
        // by the final mask blend.
        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
        c256 = _mm256_add_epi32(c256, c8);
        y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
        base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
        _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);

        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
            left[base_y_c[6]], left[base_y_c[7]]));
        a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
            left[base_y_c[6] + 1], left[base_y_c[7] + 1]));

        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);

        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16

        b = _mm256_mullo_epi32(diff, shift);
        res = _mm256_add_epi32(a32, b);
        res = _mm256_srli_epi32(res, 5);

        resy[0] = _mm256_packus_epi32(
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));

        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
            left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
            left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
            left[base_y_c[14]], left[base_y_c[15]]));
        a1_y = _mm256_cvtepu16_epi32(
            _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
                           left[base_y_c[10] + 1], left[base_y_c[11] + 1],
                           left[base_y_c[12] + 1], left[base_y_c[13] + 1],
                           left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
        shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);

        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16

        b = _mm256_mullo_epi32(diff, shift);
        res = _mm256_add_epi32(a32, b);
        res = _mm256_srli_epi32(res, 5);

        resy[1] = _mm256_packus_epi32(
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));

        resy[0] =
            _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
                                    1);  // 16 16bit values
      }

      // First base_min_diff lanes from the left result, rest from above.
      resxy = _mm256_blendv_epi8(resx[0], resy[0],
                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
    }  // for j
    dst += stride;
  }
}

// Zone-2 directional intra prediction for general HxW blocks (W a multiple
// of 16), 16-bit intermediates (companion to the 32-bit variant above).
static void highbd_dr_prediction_z2_HxW_avx2(
    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t
*above,
    const uint16_t *left, int upsample_above, int upsample_left, int dx,
    int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;
  const int min_base_y = -1;
  (void)upsample_above;
  (void)upsample_left;
  const int frac_bits_x = 6;
  const int frac_bits_y = 6;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0_x, a1_x, a32, a16, c3f, c1;
  __m256i diff, min_base_y256, dy256, c1234, c0123;
  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);

  a16 = _mm256_set1_epi16(16);
  c1 = _mm256_srli_epi16(a16, 4);  // 1 per lane
  min_base_y256 = _mm256_set1_epi16(min_base_y);
  c3f = _mm256_set1_epi16(0x3f);
  dy256 = _mm256_set1_epi16(dy);
  c0123 =
      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  c1234 = _mm256_add_epi16(c0123, c1);

  for (int r = 0; r < H; r++) {
    __m256i b, res, shift;
    __m256i resx, resy, ydx;
    __m256i resxy, j256, r6;
    __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
    int y = r + 1;
    ydx = _mm256_set1_epi16((short)(y * dx));

    for (int j = 0; j < W; j += 16) {
      j256 = _mm256_set1_epi16(j);
      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
      int base_shift = 0;
      if ((base_x) < (min_base_x - 1)) {
        base_shift = (min_base_x - (base_x)-1);
      }
      // Lanes (clamped to 16) that must come from the left edge.
      int base_min_diff = (min_base_x - base_x);
      if (base_min_diff > 16) {
        base_min_diff = 16;
      } else {
        if (base_min_diff < 0) base_min_diff = 0;
      }

      // Low 8 columns of this 16-column chunk from `above`.
      if (base_shift < 8) {
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
        a1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
        a1_x128 =
            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);

        a0_x = _mm256_castsi128_si256(a0_x128);
        a1_x = _mm256_castsi128_si256(a1_x128);
      } else {
        a0_x = _mm256_setzero_si256();
        a1_x = _mm256_setzero_si256();
      }

      // High 8 columns of the chunk.
      int base_shift1 = 0;
      if (base_shift > 8) {
        base_shift1 = base_shift - 8;
      }
      if (base_shift1 < 8) {
        a0_1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
        a1_1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);

        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
      }
      // 5-bit fraction per column: (((j+c)*64 - y*dx) & 63) >> 1.
      r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
      shift = _mm256_srli_epi16(
          _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);

      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

      b = _mm256_mullo_epi16(diff, shift);
      res = _mm256_add_epi16(a32, b);
      resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values

      // y calc: same 16 columns from the `left` edge.
      resy = _mm256_setzero_si256();
      __m256i a0_y, a1_y, shifty;
      if ((base_x < min_base_x)) {
        __m256i c256, y_c256, base_y_c256, mask256, mul16;
        r6 = _mm256_set1_epi16(r << 6);
        c256 = _mm256_add_epi16(j256, c1234);
        // Saturate the product so the 16-bit subtraction below cannot wrap.
        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
                                 _mm256_srli_epi16(min_base_y256, 1));
        y_c256 = _mm256_sub_epi16(r6, mul16);
        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
        // Clamp out-of-range left indices to 0; those lanes are discarded
        // by the final mask blend.
        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);

        a0_y = _mm256_setr_epi16(
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
            left[base_y_c[15]]);
        // Reuse base_y_c for the +1 neighbours.
        base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);

        a1_y = _mm256_setr_epi16(
            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
            left[base_y_c[15]]);

        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);

        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm256_mullo_epi16(diff, shifty);
        res = _mm256_add_epi16(a32, b);
        resy = _mm256_srli_epi16(res, 5);
      }

      // First base_min_diff lanes from the left result, rest from above.
      resxy = _mm256_blendv_epi8(resx, resy,
                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
    }  // for j
    dst += stride;
  }
}

// Directional prediction, zone 2: 90 < angle < 180.
// Dispatches on block width; bd selects 16-bit vs 32-bit-intermediate
// kernels (continuation of this switch lies beyond this chunk).
void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_above,
                                      int upsample_left, int dx, int dy,
                                      int bd) {
  (void)bd;
  assert(dx > 0);
  assert(dy > 0);
  switch (bw) {
    case 4:
      if (bd < 12) {
        highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
                                         upsample_above, upsample_left,
                                         dx, dy);
      } else {
        // 12-bit input: use the 32-bit-lane variant (see the _32bit_ helpers).
        highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
                                               upsample_above, upsample_left,
                                               dx, dy);
      }
      break;
    case 8:
      if (bd < 12) {
        highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
                                         upsample_above, upsample_left, dx, dy);
      } else {
        highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
                                               upsample_above, upsample_left,
                                               dx, dy);
      }
      break;
    default:
      // All widths >= 16 share the generic HxW kernel.
      if (bd < 12) {
        highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
                                         upsample_above, upsample_left, dx, dy);
      } else {
        highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
                                               upsample_above, upsample_left,
                                               dx, dy);
      }
      break;
  }
}

// Directional prediction, zone 3 functions
// Each z3 predictor runs the corresponding zone-1 kernel on the left edge
// into temporary vectors, then transposes the result into the destination.
// The bd < 12 test selects 16-bit lane arithmetic; 12-bit input takes the
// 32-bit-lane variant.

// 4x4 zone-3 predictor: zone-1 on the left edge, then a 4x4 transpose;
// each row is 4 pixels (8 bytes), stored with a 64-bit store.
static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *left,
                                             int upsample_left, int dy,
                                             int bd) {
  __m128i dstvec[4], d[4];
  if (bd < 12) {
    highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
                                              dy);
  } else {
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
                                                    upsample_left, dy);
  }
  highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
                                   &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
  return;
}

// 8x8 zone-3 predictor: zone-1 on the left edge, 8x8 transpose, full-row
// 128-bit stores.
static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *left,
                                             int upsample_left, int dy,
                                             int bd) {
  __m128i dstvec[8], d[8];
  if (bd < 12) {
    highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
                                              dy);
  } else {
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
                                                    upsample_left, dy);
  }
  highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                           &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
                           &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
                           &d[7]);
  // Write the transposed 8-pixel rows.
  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

// 4x8 zone-3 predictor: four 8-wide zone-1 rows transposed into eight
// 4-pixel rows; each output row is the low 64 bits of a transposed vector.
static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *left,
                                             int upsample_left, int dy,
                                             int bd) {
  __m128i dstvec[4], d[8];
  if (bd < 12) {
    highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
                                              dy);
  } else {
    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
                                                    upsample_left, dy);
  }

  highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                               &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
                               &d[7]);
  for (int i = 0; i < 8; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
  }
}

// 8x4 zone-3 predictor: eight 4-wide zone-1 rows transposed into four
// 8-pixel rows, written with full 128-bit stores.
static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                             const uint16_t *left,
                                             int upsample_left, int dy,
                                             int bd) {
  __m128i dstvec[8], d[4];
  if (bd < 12) {
    highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
                                              dy);
  } else {
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
                                                    upsample_left, dy);
  }

  highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                               &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
                               &d[0], &d[1], &d[2], &d[3]);
  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
}

// 8x16 zone-3 predictor: eight 16-wide zone-1 rows transposed; low lanes
// give rows 0-7, high lanes rows 8-15.
static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst,
ptrdiff_t stride, 2999 const uint16_t *left, 3000 int upsample_left, int dy, 3001 int bd) { 3002 __m256i dstvec[8], d[8]; 3003 if (bd < 12) { 3004 highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left, 3005 dy); 3006 } else { 3007 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left, 3008 upsample_left, dy); 3009 } 3010 highbd_transpose8x16_16x8_avx2(dstvec, d); 3011 for (int i = 0; i < 8; i++) { 3012 _mm_storeu_si128((__m128i *)(dst + i * stride), 3013 _mm256_castsi256_si128(d[i])); 3014 } 3015 for (int i = 8; i < 16; i++) { 3016 _mm_storeu_si128((__m128i *)(dst + i * stride), 3017 _mm256_extracti128_si256(d[i - 8], 1)); 3018 } 3019 } 3020 3021 static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride, 3022 const uint16_t *left, 3023 int upsample_left, int dy, 3024 int bd) { 3025 __m128i dstvec[16], d[16]; 3026 if (bd < 12) { 3027 highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left, 3028 dy); 3029 } else { 3030 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left, 3031 upsample_left, dy); 3032 } 3033 for (int i = 0; i < 16; i += 8) { 3034 highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], 3035 &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], 3036 &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], 3037 &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], 3038 &d[5 + i], &d[6 + i], &d[7 + i]); 3039 } 3040 for (int i = 0; i < 8; i++) { 3041 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); 3042 _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); 3043 } 3044 } 3045 3046 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 3047 static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride, 3048 const uint16_t *left, 3049 int upsample_left, int dy, 3050 int bd) { 3051 __m256i dstvec[4], d[4], d1; 3052 if (bd < 12) { 3053 highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left, 3054 dy); 3055 } else { 3056 
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
                                                     upsample_left, dy);
  }
  highbd_transpose4x16_avx2(dstvec, d);
  // Each transposed vector holds four 4-pixel rows: rows i, i+4 (after an
  // 8-byte shift), i+8 (high lane), and i+12 (high lane of the shifted copy).
  for (int i = 0; i < 4; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride),
                     _mm256_castsi256_si128(d[i]));
    d1 = _mm256_bsrli_epi128(d[i], 8);
    _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
                     _mm256_castsi256_si128(d1));
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
                     _mm256_extracti128_si256(d[i], 1));
    _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
                     _mm256_extracti128_si256(d1, 1));
  }
}

// 16x4 zone-3 predictor: sixteen 4-wide zone-1 rows transposed into four
// 16-pixel rows; each output row is two 128-bit stores.
static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
                                              int upsample_left, int dy,
                                              int bd) {
  __m128i dstvec[16], d[8];
  if (bd < 12) {
    highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
                                              dy);
  } else {
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
                                                    upsample_left, dy);
  }
  highbd_transpose16x4_8x8_sse2(dstvec, d);

  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
  _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
  _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
  _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
}

// 8x32 zone-3 predictor: eight 32-wide zone-1 rows (two vectors per row)
// transposed in two 8x16 batches.
static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                              const uint16_t *left,
                                              int upsample_left, int dy,
                                              int bd) {
  __m256i dstvec[16], d[16];
  if (bd < 12) {
    highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
                                               dy);
  } else {
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
                                                     upsample_left,
dy); 3108 } 3109 3110 for (int i = 0; i < 16; i += 8) { 3111 highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); 3112 } 3113 3114 for (int i = 0; i < 8; i++) { 3115 _mm_storeu_si128((__m128i *)(dst + i * stride), 3116 _mm256_castsi256_si128(d[i])); 3117 } 3118 for (int i = 0; i < 8; i++) { 3119 _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), 3120 _mm256_extracti128_si256(d[i], 1)); 3121 } 3122 for (int i = 8; i < 16; i++) { 3123 _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride), 3124 _mm256_castsi256_si128(d[i])); 3125 } 3126 for (int i = 8; i < 16; i++) { 3127 _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride), 3128 _mm256_extracti128_si256(d[i], 1)); 3129 } 3130 } 3131 3132 static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride, 3133 const uint16_t *left, 3134 int upsample_left, int dy, 3135 int bd) { 3136 __m128i dstvec[32], d[32]; 3137 if (bd < 12) { 3138 highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left, 3139 dy); 3140 } else { 3141 highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left, 3142 upsample_left, dy); 3143 } 3144 3145 for (int i = 0; i < 32; i += 8) { 3146 highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i], 3147 &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i], 3148 &dstvec[6 + i], &dstvec[7 + i], &d[0 + i], 3149 &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i], 3150 &d[5 + i], &d[6 + i], &d[7 + i]); 3151 } 3152 for (int i = 0; i < 8; i++) { 3153 _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]); 3154 _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]); 3155 _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]); 3156 _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]); 3157 } 3158 } 3159 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 3160 3161 static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride, 3162 const uint16_t *left, 3163 int upsample_left, int dy, 3164 int bd) { 3165 __m256i dstvec[16], d[16]; 
  if (bd < 12) {
    highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
                                               dy);
  } else {
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
                                                     upsample_left, dy);
  }

  highbd_transpose16x16_avx2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
  }
}

// 32x32 zone-3 predictor: processed as four 16x16 quadrants.  dstvec holds
// 32 rows of 32 pixels as 64 vectors (two per row); each quadrant is
// transposed independently and stored at its (row, column) offset.
static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy,
                                               int bd) {
  __m256i dstvec[64], d[16];
  if (bd < 12) {
    highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
                                               dy);
  } else {
    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
                                                     upsample_left, dy);
  }
  // Top-left quadrant.
  highbd_transpose16x16_avx2(dstvec, d);
  for (int j = 0; j < 16; j++) {
    _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
  }
  // Top-right quadrant.
  highbd_transpose16x16_avx2(dstvec + 16, d);
  for (int j = 0; j < 16; j++) {
    _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
  }
  // Bottom-left quadrant.
  highbd_transpose16x16_avx2(dstvec + 32, d);
  for (int j = 0; j < 16; j++) {
    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
  }
  // Bottom-right quadrant.
  highbd_transpose16x16_avx2(dstvec + 48, d);
  for (int j = 0; j < 16; j++) {
    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
  }
}

// 64x64 zone-3 predictor: render the zone-1 prediction into an aligned
// scratch buffer, then transpose the whole 64x64 block into dst.
static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy,
                                               int bd) {
  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
  if (bd < 12) {
    highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
  } else {
    highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
                                            dy);
  }
  highbd_transpose(dstT, 64, dst, stride, 64, 64);
}

static void
highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride, 3226 const uint16_t *left, 3227 int upsample_left, int dy, 3228 int bd) { 3229 __m256i dstvec[32], d[32]; 3230 if (bd < 12) { 3231 highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, 3232 dy); 3233 } else { 3234 highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left, 3235 upsample_left, dy); 3236 } 3237 for (int i = 0; i < 32; i += 8) { 3238 highbd_transpose8x16_16x8_avx2(dstvec + i, d + i); 3239 } 3240 // store 3241 for (int j = 0; j < 32; j += 16) { 3242 for (int i = 0; i < 8; i++) { 3243 _mm_storeu_si128((__m128i *)(dst + (i + j) * stride), 3244 _mm256_castsi256_si128(d[(i + j)])); 3245 } 3246 for (int i = 0; i < 8; i++) { 3247 _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8), 3248 _mm256_castsi256_si128(d[(i + j) + 8])); 3249 } 3250 for (int i = 8; i < 16; i++) { 3251 _mm256_storeu_si256( 3252 (__m256i *)(dst + (i + j) * stride), 3253 _mm256_inserti128_si256( 3254 d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0)); 3255 } 3256 } 3257 } 3258 3259 static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride, 3260 const uint16_t *left, 3261 int upsample_left, int dy, 3262 int bd) { 3263 __m256i dstvec[32], d[16]; 3264 if (bd < 12) { 3265 highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left, 3266 dy); 3267 } else { 3268 highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left, 3269 upsample_left, dy); 3270 } 3271 for (int i = 0; i < 32; i += 16) { 3272 highbd_transpose16x16_avx2((dstvec + i), d); 3273 for (int j = 0; j < 16; j++) { 3274 _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]); 3275 } 3276 } 3277 } 3278 3279 static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride, 3280 const uint16_t *left, 3281 int upsample_left, int dy, 3282 int bd) { 3283 uint16_t dstT[64 * 32]; 3284 if (bd < 12) { 3285 highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, 
                                      upsample_left, dy);
  } else {
    highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
                                            dy);
  }
  highbd_transpose(dstT, 64, dst, stride, 32, 64);
}

// 64x32 zone-3 predictor: scratch-buffer + transpose, like 64x64.  The
// bd split is handled inside highbd_dr_prediction_z1_32xN_avx2, which
// takes bd directly.
static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy,
                                               int bd) {
  DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
  highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
  highbd_transpose(dstT, 32, dst, stride, 64, 32);
  return;
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// 16x64 zone-3 predictor: 16 rows of 64 rendered into an aligned scratch
// buffer, then transposed into dst.
static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy,
                                               int bd) {
  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
  if (bd < 12) {
    highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
  } else {
    highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
                                            dy);
  }
  highbd_transpose(dstT, 64, dst, stride, 16, 64);
}

// 64x16 zone-3 predictor: four independent 16x16 transposes laid out
// side-by-side across the 64-pixel width.
static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
                                               const uint16_t *left,
                                               int upsample_left, int dy,
                                               int bd) {
  __m256i dstvec[64], d[16];
  if (bd < 12) {
    highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
                                               dy);
  } else {
    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
                                                     upsample_left, dy);
  }
  for (int i = 0; i < 64; i += 16) {
    highbd_transpose16x16_avx2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
    }
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// Zone-3 dispatcher: picks the block-size-specific transpose-based
// implementation above.
void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
                                      int bh, const uint16_t *above,
                                      const uint16_t *left, int upsample_left,
                                      int dx, int dy, int
bd) { 3343 (void)above; 3344 (void)dx; 3345 3346 assert(dx == 1); 3347 assert(dy > 0); 3348 if (bw == bh) { 3349 switch (bw) { 3350 case 4: 3351 highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy, 3352 bd); 3353 break; 3354 case 8: 3355 highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy, 3356 bd); 3357 break; 3358 case 16: 3359 highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy, 3360 bd); 3361 break; 3362 case 32: 3363 highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy, 3364 bd); 3365 break; 3366 case 64: 3367 highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy, 3368 bd); 3369 break; 3370 } 3371 } else { 3372 if (bw < bh) { 3373 if (bw + bw == bh) { 3374 switch (bw) { 3375 case 4: 3376 highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, 3377 dy, bd); 3378 break; 3379 case 8: 3380 highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, 3381 dy, bd); 3382 break; 3383 case 16: 3384 highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, 3385 dy, bd); 3386 break; 3387 case 32: 3388 highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, 3389 dy, bd); 3390 break; 3391 } 3392 } else { 3393 switch (bw) { 3394 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 3395 case 4: 3396 highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, 3397 dy, bd); 3398 break; 3399 case 8: 3400 highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, 3401 dy, bd); 3402 break; 3403 case 16: 3404 highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, 3405 dy, bd); 3406 break; 3407 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 3408 } 3409 } 3410 } else { 3411 if (bh + bh == bw) { 3412 switch (bh) { 3413 case 4: 3414 highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, 3415 dy, bd); 3416 break; 3417 case 8: 3418 highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, 3419 dy, 
bd); 3420 break; 3421 case 16: 3422 highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, 3423 dy, bd); 3424 break; 3425 case 32: 3426 highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, 3427 dy, bd); 3428 break; 3429 } 3430 } else { 3431 switch (bh) { 3432 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 3433 case 4: 3434 highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, 3435 dy, bd); 3436 break; 3437 case 8: 3438 highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, 3439 dy, bd); 3440 break; 3441 case 16: 3442 highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, 3443 dy, bd); 3444 break; 3445 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 3446 } 3447 } 3448 } 3449 } 3450 return; 3451 } 3452 #endif // CONFIG_AV1_HIGHBITDEPTH 3453 3454 // Low bit depth functions 3455 static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { 3456 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3457 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3458 { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3459 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3460 { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3461 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3462 { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3463 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3464 { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3465 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3466 { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3467 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3468 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3469 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3470 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3471 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3472 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 3473 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0 }, 3474 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 3475 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3476 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 3477 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3478 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3479 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3480 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3481 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3482 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3483 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3484 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3485 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3486 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3487 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3488 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3489 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 3490 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3491 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3492 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 3493 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3494 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3495 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 3496 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3497 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3498 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 3499 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3500 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3501 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 3502 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3503 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3504 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 3505 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3506 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3507 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 3508 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3509 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3510 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 3511 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3512 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3513 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3514 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3515 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3516 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3517 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 3518 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3519 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3520 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, 3521 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3522 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3523 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0 }, 3524 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3525 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3526 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0 }, 3527 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3528 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3529 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0 }, 3530 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3531 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3532 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0 }, 3533 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3534 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3535 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0 }, 3536 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3537 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3538 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0 }, 3539 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3540 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3541 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 }, 3542 { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3543 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 3544 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, 3545 }; 3546 3547 /* clang-format on */ 3548 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2( 3549 int H, int W, __m128i *dst, const uint8_t *above, int upsample_above, 3550 int dx) { 3551 const int frac_bits = 6 - upsample_above; 3552 const int max_base_x = ((W + H) - 1) << upsample_above; 3553 3554 assert(dx > 0); 3555 // pre-filter above pixels 3556 // store in temp buffers: 3557 // above[x] * 32 + 16 3558 // above[x+1] - above[x] 3559 // final pixels will be calculated as: 3560 // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 3561 __m256i a0, a1, a32, a16; 3562 __m256i diff, c3f; 3563 __m128i a_mbase_x; 3564 3565 a16 = _mm256_set1_epi16(16); 3566 a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]); 3567 c3f = _mm256_set1_epi16(0x3f); 3568 3569 int x = dx; 3570 for (int r = 0; r < W; r++) { 3571 __m256i b, res, shift; 3572 __m128i res1, a0_128, a1_128; 3573 3574 int base = x >> frac_bits; 3575 int base_max_diff = (max_base_x - base) >> upsample_above; 3576 if (base_max_diff <= 0) { 3577 for (int i = r; i < W; ++i) { 3578 dst[i] = a_mbase_x; // save 4 values 3579 } 3580 return; 3581 } 3582 if (base_max_diff > H) base_max_diff = H; 3583 a0_128 = _mm_loadu_si128((__m128i *)(above + base)); 3584 a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1)); 3585 3586 if (upsample_above) { 3587 a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]); 3588 a1_128 = _mm_srli_si128(a0_128, 8); 3589 3590 shift = _mm256_srli_epi16( 3591 _mm256_and_si256( 3592 _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f), 3593 1); 3594 } else { 3595 shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1); 3596 } 3597 a0 = _mm256_cvtepu8_epi16(a0_128); 3598 a1 = 
         _mm256_cvtepu8_epi16(a1_128);

    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi16(diff, shift);
    res = _mm256_add_epi16(a32, b);
    res = _mm256_srli_epi16(res, 5);

    res = _mm256_packus_epi16(
        res, _mm256_castsi128_si256(
                 _mm256_extracti128_si256(res, 1)));  // goto 8 bit
    res1 = _mm256_castsi256_si128(res);               // 16 8bit values

    // Keep interpolated pixels up to base_max_diff; past the edge use the
    // replicated last above pixel.
    dst[r] =
        _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
    x += dx;
  }
}

// Zone-1 predictor, width 4: each dstvec lane packs a 4-pixel row; only the
// low 32 bits of each vector are written out.
static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  __m128i dstvec[16];

  dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
  }
}

// Zone-1 predictor, width 8: store the low 64 bits (8 pixels) per row.
static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, int upsample_above,
                                      int dx) {
  __m128i dstvec[32];

  dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

// Zone-1 predictor, width 16: one full 128-bit store per row.
static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
  __m128i dstvec[64];

  dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
    int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((32 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  // above[x] * 32 + 16
  // above[x+1] - above[x]
  // final pixels will be calculated as:
  // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a32, a16;
  __m256i a_mbase_x, diff, c3f;

  a16 = _mm256_set1_epi16(16);
  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
  c3f = _mm256_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, res16[2];
    __m128i a0_128, a1_128;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base);
    if (base_max_diff <= 0) {
      // Ran off the end of the above row: all remaining rows replicate the
      // last reference pixel.
      for (int i = r; i < N; ++i) {
        dstvec[i] = a_mbase_x;  // save 32 values
      }
      return;
    }
    if (base_max_diff > 32) base_max_diff = 32;
    __m256i shift =
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);

    // Process the 32-wide row as two 16-pixel halves.
    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
      int mdiff = base_max_diff - j;
      if (mdiff <= 0) {
        res16[jj] = a_mbase_x;
      } else {
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
        a0 = _mm256_cvtepu8_epi16(a0_128);
        a1 = _mm256_cvtepu8_epi16(a1_128);

        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
        b = _mm256_mullo_epi16(diff, shift);

        res = _mm256_add_epi16(a32, b);
        res = _mm256_srli_epi16(res, 5);
        res16[jj] = _mm256_packus_epi16(
            res, _mm256_castsi128_si256(
                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
      }
    }
    res16[1] =
        _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
                                1);  // 32 8bit values

    dstvec[r] = _mm256_blendv_epi8(
        a_mbase_x, res16[1],
        *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
    x += dx;
  }
}

// Zone 1 prediction, 32-wide: compute rows into registers, then store 32
// bytes per row.
static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
  __m256i dstvec[64];
  dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
  for (int i = 0; i < N; i++) {
    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
  }
}

// Zone 1 prediction, 64-wide: writes each row directly to dst in four
// 16-pixel chunks.
static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above, int upsample_above,
                                       int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((64 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  // above[x] * 32 + 16
  // above[x+1] - above[x]
  // final pixels will be calculated as:
  // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a32, a16;
  __m256i a_mbase_x, diff, c3f;
  __m128i max_base_x128, base_inc128, mask128;

  a16 = _mm256_set1_epi16(16);
  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
  max_base_x128 = _mm_set1_epi8(max_base_x);
  c3f = _mm256_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < N; r++, dst += stride) {
    __m256i b, res;
    int base = x >> frac_bits;
    if (base >= max_base_x) {
      // Past the end of the above row: fill remaining rows with the last
      // reference pixel.
      for (int i = r; i < N; ++i) {
        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
        dst += stride;
      }
      return;
    }

    __m256i shift =
        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);

    __m128i a0_128, a1_128, res128;
    for (int j = 0; j < 64; j += 16) {
      int mdif = max_base_x - (base + j);
      if (mdif <= 0) {
        _mm_storeu_si128((__m128i *)(dst + j),
                         _mm256_castsi256_si128(a_mbase_x));
      } else {
        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
        a0 = _mm256_cvtepu8_epi16(a0_128);
        a1 = _mm256_cvtepu8_epi16(a1_128);

        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
        b = _mm256_mullo_epi16(diff, shift);

        res = _mm256_add_epi16(a32, b);
        res = _mm256_srli_epi16(res, 5);
        res = _mm256_packus_epi16(
            res, _mm256_castsi128_si256(
                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values

        // Per-pixel reference indices, used to mask out positions at or
        // beyond max_base_x.
        base_inc128 =
            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));

        mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
                                 _mm_setzero_si128());
        res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
                                 _mm256_castsi256_si128(res), mask128);
        _mm_storeu_si128((__m128i *)(dst + j), res128);
      }
    }
    x += dx;
  }
}

// Directional prediction, zone 1: 0 < angle < 90
void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int dx, int dy) {
  (void)left;
  (void)dy;
  // Dispatch on block width; each helper processes bh rows.
  switch (bw) {
    case 4:
      dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
      break;
    case 8:
      dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
      break;
    case 16:
      dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
      break;
    case 32:
      dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
      break;
    case 64:
      dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
      break;
    default: break;
  }
  return;
}

// Zone 2 prediction, 4-wide: each row blends pixels interpolated from the
// above row with pixels interpolated from the left column.
static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int upsample_above, int upsample_left,
                                      int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  // above[x] * 32 + 16
  // above[x+1] - above[x]
  // final pixels will be calculated as:
  // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0_x, a1_x, a32, a16, diff;
  __m128i c3f, min_base_y128, c1234, dy128;

  a16 = _mm_set1_epi16(16);
  c3f = _mm_set1_epi16(0x3f);
  min_base_y128 = _mm_set1_epi16(min_base_y);
  c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
  dy128 = _mm_set1_epi16(dy);

  for (int r = 0; r < N; r++) {
    __m128i b, res, shift, r6, ydx;
    __m128i resx, resy, resxy;
    __m128i a0_x128, a1_x128;
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    // Number of leading pixels in this row that come from the left column
    // rather than the above row (clamped to [0, 4]).
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 4) {
      base_min_diff = 4;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 3) {
      a0_x = _mm_setzero_si128();
      a1_x = _mm_setzero_si128();
      shift = _mm_setzero_si128();
    } else {
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x +
                                            base_shift));
      ydx = _mm_set1_epi16(y * dx);
      r6 = _mm_slli_epi16(c1234, 6);

      if (upsample_above) {
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
        a1_x128 = _mm_srli_si128(a0_x128, 8);

        shift = _mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
            1);
      } else {
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
        a1_x128 = _mm_srli_si128(a0_x128, 1);

        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
      }
      a0_x = _mm_cvtepu8_epi16(a0_x128);
      a1_x = _mm_cvtepu8_epi16(a1_x128);
    }
    // y calc
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
      __m128i y_c128, base_y_c128, mask128, c1234_;
      c1234_ = _mm_srli_si128(c1234, 2);
      r6 = _mm_set1_epi16(r << 6);
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
      // Clamp left-column indices below min_base_y to 0.
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);

      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
      base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);

      if (upsample_left) {
        shifty = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
      }
      // Pack the x-path and y-path operands into one register so a single
      // interpolation below serves both.
      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
      shift = _mm_unpacklo_epi64(shift, shifty);
    }

    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

    b = _mm_mullo_epi16(diff, shift);
    res = _mm_add_epi16(a32, b);
    res = _mm_srli_epi16(res, 5);

    resx = _mm_packus_epi16(res, res);
    resy = _mm_srli_si128(resx, 4);

    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
    *(int *)(dst) = _mm_cvtsi128_si32(resxy);
    dst += stride;
  }
}

// Zone 2 prediction, 8-wide: same structure as the 4-wide kernel, with the
// x-path in the low 128-bit lane and the y-path in the high lane.
static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above, const uint8_t *left,
                                      int upsample_above, int upsample_left,
                                      int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  // above[x] * 32 + 16
  // above[x+1] - above[x]
  // final pixels will be calculated as:
  // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i diff, a32, a16;
  __m256i a0_x, a1_x;
  __m128i a0_x128, a1_x128, min_base_y128, c3f;
  __m128i c1234, dy128;

  a16 = _mm256_set1_epi16(16);
  c3f = _mm_set1_epi16(0x3f);
  min_base_y128 = _mm_set1_epi16(min_base_y);
  dy128 = _mm_set1_epi16(dy);
  c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);

  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i resx, resy, resxy, r6, ydx;

    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    // Count of leading pixels taken from the left column (clamped to
    // [0, 8]).
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 8) {
      base_min_diff = 8;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if
       (base_shift > 7) {
      a0_x = _mm256_setzero_si256();
      a1_x = _mm256_setzero_si256();
      shift = _mm256_setzero_si256();
    } else {
      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      ydx = _mm_set1_epi16(y * dx);
      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
      if (upsample_above) {
        a0_x128 =
            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
        a1_x128 = _mm_srli_si128(a0_x128, 8);

        shift = _mm256_castsi128_si256(_mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
            1));
      } else {
        a1_x128 = _mm_srli_si128(a0_x128, 1);
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);

        shift = _mm256_castsi128_si256(
            _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
      }
      a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
      a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
    }

    // y calc
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
      __m128i y_c128, base_y_c128, mask128;
      r6 = _mm_set1_epi16(r << 6);
      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
      // Clamp left-column indices below min_base_y to 0.
      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);

      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);
      base_y_c128 = _mm_add_epi16(
          base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
      _mm_store_si128((__m128i *)base_y_c, base_y_c128);

      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);

      if (upsample_left) {
        shifty = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
      }

      // High 128-bit lane carries the y-path operands; one 256-bit
      // interpolation below then computes both paths at once.
      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
      shift = _mm256_inserti128_si256(shift, shifty, 1);
    }

    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

    b = _mm256_mullo_epi16(diff, shift);
    res = _mm256_add_epi16(a32, b);
    res = _mm256_srli_epi16(res, 5);

    resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
                            _mm256_castsi256_si128(res));
    resy = _mm256_extracti128_si256(res, 1);
    resy = _mm_packus_epi16(resy, resy);

    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
    _mm_storel_epi64((__m128i *)(dst), resxy);
    dst += stride;
  }
}

// Zone 2 prediction for blocks 16 or more pixels wide, processed 16
// columns at a time.
static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
                                      ptrdiff_t stride, const uint8_t *above,
                                      const uint8_t *left, int upsample_above,
                                      int upsample_left, int dx, int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;
  const int min_base_y = -1;
  (void)upsample_above;
  (void)upsample_left;
  const int frac_bits_x = 6;
  const int frac_bits_y = 6;

  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
  __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
  __m128i a0_x128, a1_x128;

  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
  a16 = _mm256_set1_epi16(16);
  c1 = _mm256_srli_epi16(a16, 4);
  min_base_y256 =
      _mm256_set1_epi16(min_base_y);
  c3f = _mm256_set1_epi16(0x3f);
  dy256 = _mm256_set1_epi16(dy);
  c0123 =
      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
  c1234 = _mm256_add_epi16(c0123, c1);

  for (int r = 0; r < H; r++) {
    __m256i b, res, shift, j256, r6, ydx;
    __m128i resx, resy;
    __m128i resxy;
    int y = r + 1;
    ydx = _mm256_set1_epi16((int16_t)(y * dx));

    int base_x = (-y * dx) >> frac_bits_x;
    for (int j = 0; j < W; j += 16) {
      j256 = _mm256_set1_epi16(j);
      int base_shift = 0;
      if ((base_x + j) < (min_base_x - 1)) {
        base_shift = (min_base_x - (base_x + j) - 1);
      }
      // Count of leading pixels in this 16-wide chunk taken from the left
      // column (clamped to [0, 16]).
      int base_min_diff = (min_base_x - base_x - j);
      if (base_min_diff > 16) {
        base_min_diff = 16;
      } else {
        if (base_min_diff < 0) base_min_diff = 0;
      }

      if (base_shift < 16) {
        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
        a1_x128 =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);

        a0_x = _mm256_cvtepu8_epi16(a0_x128);
        a1_x = _mm256_cvtepu8_epi16(a1_x128);

        r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
        shift = _mm256_srli_epi16(
            _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);

        diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
        a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm256_mullo_epi16(diff, shift);
        res = _mm256_add_epi16(a32, b);
        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
        resx = _mm256_castsi256_si128(_mm256_packus_epi16(
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
      } else {
        resx = _mm_setzero_si128();
      }

      // y calc
      if (base_x < min_base_x) {
        __m256i c256, y_c256, base_y_c256, mask256, mul16;
        r6 = _mm256_set1_epi16(r << 6);
        c256 = _mm256_add_epi16(j256, c1234);
        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
                                 _mm256_srli_epi16(min_base_y256, 1));
        y_c256 = _mm256_sub_epi16(r6, mul16);

        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);

        base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
        // Indices are monotonically decreasing: lane 15 of the high half
        // holds the minimum, lane 0 of the low half the maximum.
        int16_t min_y = (int16_t)_mm_extract_epi16(
            _mm256_extracti128_si256(base_y_c256, 1), 7);
        int16_t max_y =
            (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
        int16_t offset_diff = max_y - min_y;

        if (offset_diff < 16) {
          // The 16 left-column samples span fewer than 16 bytes: fetch them
          // with one masked load plus a shuffle instead of 16 scalar loads.
          __m256i min_y256 = _mm256_set1_epi16(min_y);

          __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
          __m128i base_y_offset128 =
              _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
                              _mm256_extracti128_si256(base_y_offset, 1));

          __m128i a0_y128 = _mm_maskload_epi32(
              (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
          __m128i a1_y128 =
              _mm_maskload_epi32((int *)(left + min_y + 1),
                                 *(__m128i *)LoadMaskz2[offset_diff / 4]);
          a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
          a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
          a0_y = _mm256_cvtepu8_epi16(a0_y128);
          a1_y = _mm256_cvtepu8_epi16(a1_y128);
        } else {
          // Slow path: gather the 16 samples individually.
          base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);

          a0_y = _mm256_setr_epi16(
              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
              left[base_y_c[15]]);
          base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);

          a1_y = _mm256_setr_epi16(
              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
              left[base_y_c[15]]);
        }
        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);

        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm256_mullo_epi16(diff, shifty);
        res = _mm256_add_epi16(a32, b);
        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
        resy = _mm256_castsi256_si128(_mm256_packus_epi16(
            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
      } else {
        resy = _mm_setzero_si128();
      }
      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
      _mm_storeu_si128((__m128i *)(dst + j), resxy);
    }  // for j
    dst += stride;
  }
}

// Directional prediction, zone 2: 90 < angle < 180
void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_above, int upsample_left, int dx,
                               int dy) {
  assert(dx > 0);
  assert(dy > 0);
  switch (bw) {
    case 4:
      dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
                                upsample_left, dx, dy);
      break;
    case 8:
      dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
                                upsample_left, dx, dy);
      break;
    default:
      dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
                                upsample_above, upsample_left, dx, dy);
      break;
  }
  return;
}

// z3 functions
// Byte-level transpose helper for the zone-3 kernels: reads 16 __m256i rows
// from x and writes the transposed arrangement into d via epi8/16/32/64
// unpack stages.
static inline void transpose16x32_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
  __m256i w10, w11, w12, w13, w14, w15;

  w0 = _mm256_unpacklo_epi8(x[0], x[1]);
  w1 = _mm256_unpacklo_epi8(x[2], x[3]);
  w2 = _mm256_unpacklo_epi8(x[4], x[5]);
  w3 = _mm256_unpacklo_epi8(x[6], x[7]);

  w8 = _mm256_unpacklo_epi8(x[8], x[9]);
  w9 = _mm256_unpacklo_epi8(x[10], x[11]);
  w10 = _mm256_unpacklo_epi8(x[12], x[13]);
  w11 = _mm256_unpacklo_epi8(x[14], x[15]);

  w4 = _mm256_unpacklo_epi16(w0, w1);
  w5 = _mm256_unpacklo_epi16(w2, w3);
  w12 = _mm256_unpacklo_epi16(w8, w9);
  w13 = _mm256_unpacklo_epi16(w10, w11);

  w6 = _mm256_unpacklo_epi32(w4, w5);
  w7 = _mm256_unpackhi_epi32(w4, w5);
  w14 = _mm256_unpacklo_epi32(w12, w13);
  w15 = _mm256_unpackhi_epi32(w12, w13);

  // Store first 4-line result
  d[0] = _mm256_unpacklo_epi64(w6, w14);
  d[1] = _mm256_unpackhi_epi64(w6, w14);
  d[2] = _mm256_unpacklo_epi64(w7, w15);
  d[3] = _mm256_unpackhi_epi64(w7, w15);

  w4 = _mm256_unpackhi_epi16(w0, w1);
  w5 = _mm256_unpackhi_epi16(w2, w3);
  w12 = _mm256_unpackhi_epi16(w8, w9);
  w13 = _mm256_unpackhi_epi16(w10, w11);

  w6 = _mm256_unpacklo_epi32(w4, w5);
  w7 = _mm256_unpackhi_epi32(w4, w5);
  w14 = _mm256_unpacklo_epi32(w12, w13);
  w15 = _mm256_unpackhi_epi32(w12, w13);

  // Store second 4-line result
  d[4] = _mm256_unpacklo_epi64(w6, w14);
  d[5] = _mm256_unpackhi_epi64(w6, w14);
  d[6] = _mm256_unpacklo_epi64(w7, w15);
  d[7] = _mm256_unpackhi_epi64(w7, w15);

  // upper half
  w0 = _mm256_unpackhi_epi8(x[0], x[1]);
  w1 = _mm256_unpackhi_epi8(x[2], x[3]);
  w2 = _mm256_unpackhi_epi8(x[4], x[5]);
  w3 = _mm256_unpackhi_epi8(x[6], x[7]);

  w8 = _mm256_unpackhi_epi8(x[8], x[9]);
  w9 = _mm256_unpackhi_epi8(x[10], x[11]);
  w10 = _mm256_unpackhi_epi8(x[12], x[13]);
  w11 = _mm256_unpackhi_epi8(x[14], x[15]);

  w4 = _mm256_unpacklo_epi16(w0, w1);
  w5 = _mm256_unpacklo_epi16(w2, w3);
  w12 = _mm256_unpacklo_epi16(w8, w9);
  w13 = _mm256_unpacklo_epi16(w10, w11);

  w6 = _mm256_unpacklo_epi32(w4, w5);
  w7 = _mm256_unpackhi_epi32(w4, w5);
  w14 = _mm256_unpacklo_epi32(w12, w13);
  w15 = _mm256_unpackhi_epi32(w12, w13);

  // Store first 4-line result
  d[8] = _mm256_unpacklo_epi64(w6, w14);
  d[9] = _mm256_unpackhi_epi64(w6, w14);
  d[10] = _mm256_unpacklo_epi64(w7, w15);
  d[11] = _mm256_unpackhi_epi64(w7, w15);

  w4 = _mm256_unpackhi_epi16(w0, w1);
  w5 = _mm256_unpackhi_epi16(w2, w3);
  w12 = _mm256_unpackhi_epi16(w8, w9);
  w13 = _mm256_unpackhi_epi16(w10, w11);

  w6 = _mm256_unpacklo_epi32(w4, w5);
  w7 = _mm256_unpackhi_epi32(w4, w5);
  w14 = _mm256_unpacklo_epi32(w12, w13);
  w15 = _mm256_unpackhi_epi32(w12, w13);

  // Store second 4-line result
  d[12] = _mm256_unpacklo_epi64(w6, w14);
  d[13] = _mm256_unpackhi_epi64(w6, w14);
  d[14] = _mm256_unpacklo_epi64(w7, w15);
  d[15] = _mm256_unpackhi_epi64(w7, w15);
}

// Zone 3 4x4: run zone-1 prediction along the left edge, then transpose
// into dst.
static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[4], d[4];

  dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                            &d[0], &d[1], &d[2], &d[3]);

  *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
  *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
  *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
  *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
  return;
}

// Zone 3 8x8: zone-1 prediction on the left edge, then an 8x8 transpose.
static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int
                                          dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
                    &d[3]);

  // Each d[i] holds two output rows (low/high 8 bytes).
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
}

// Zone 3 4x8: predict 8-wide rows from the left edge, transpose to 4x8.
static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[4], d[8];

  dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
  for (int i = 0; i < 8; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

// Zone 3 8x4: predict 4-wide rows from the left edge, transpose to 8x4.
static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *left, int upsample_left,
                                      int dy) {
  __m128i dstvec[8], d[4];

  dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
                        &d[1], &d[2], &d[3]);
  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
}

// Zone 3 8x16: predict 16-wide rows from the left edge, transpose to 8x16.
static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[8], d[8];

  dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
  for (int i = 0; i < 8; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
                     _mm_srli_si128(d[i], 8));
  }
}

// Zone 3 16x8: predict 8-wide rows from the left edge, transpose to 16x8.
static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Zone 3 4x16: predict 16-wide rows from the left edge, transpose to 4x16.
static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[4], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
  transpose4x16_sse2(dstvec, d);
  for (int i = 0; i < 16; i++) {
    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
  }
}

// Zone 3 16x4: predict 4-wide rows from the left edge, transpose to 16x4.
static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[16], d[8];

  dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
  // Only the first four output registers are used; zero the rest so the
  // 16x8 transpose helper has defined inputs.
  for (int i = 4; i < 8; i++) {
    d[i] = _mm_setzero_si128();
  }
  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);

  for (int i = 0; i < 4; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

// Zone 3 8x32: predict 32-wide rows from the left edge, transpose to 8x32.
static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m256i dstvec[16], d[16];

  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
  // Zero unused rows before the full 16x32 transpose.
  for (int i = 8; i < 16; i++) {
    dstvec[i] = _mm256_setzero_si256();
  }
  transpose16x32_avx2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + i * stride),
                     _mm256_castsi256_si128(d[i]));
  }
  for (int i = 0; i < 16; i++) {
    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
                     _mm256_extracti128_si256(d[i], 1));
  }
}

// Zone 3 32x8: predict 8-wide rows from the left edge, transpose two 16-row
// halves to 32x8.
static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *left, int upsample_left,
                                       int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);

  transpose16x8_8x16_sse2(
      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
      &d[3], &d[4], &d[5], &d[6], &d[7]);
  transpose16x8_8x16_sse2(
      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
      &d[6 + 8], &d[7 + 8]);

  for (int i = 0; i < 8; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// Zone 3 16x16: zone-1 prediction on the left edge plus a 16x16 transpose.
static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[16], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
  transpose16x16_sse2(dstvec, d);

  for (int i = 0; i < 16; i++) {
    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
  }
}

// Zone 3 32x32: two 16x32 transposes cover the 32 predicted rows.
static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m256i dstvec[32], d[32];

  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
  transpose16x32_avx2(dstvec, d);
  transpose16x32_avx2(dstvec + 16, d + 16);
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride),
                     _mm256_castsi256_si128(d[j]));
    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
                     _mm256_castsi256_si128(d[j + 16]));
  }
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
                     _mm256_extracti128_si256(d[j], 1));
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
                     _mm256_extracti128_si256(d[j + 16], 1));
  }
}

// Zone 3 64x64: predict into a temporary buffer and transpose into dst.
static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 64, 64);
}

// Zone 3 16x32: predict 32-wide rows from the left edge, transpose to
// 16x32.
static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m256i dstvec[16], d[16];

  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
  transpose16x32_avx2(dstvec, d);
  // store
  for (int j = 0; j < 16; j++) {
    _mm_storeu_si128((__m128i *)(dst + j * stride),
                     _mm256_castsi256_si128(d[j]));
    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
                     _mm256_extracti128_si256(d[j], 1));
  }
}

// Zone 3 32x16: predict 16-wide rows, transpose in two 16x16 tiles.
static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[32], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 32; i += 16) {
    transpose16x16_sse2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}

// Zone 3 32x64: predict into a temporary buffer and transpose into dst.
static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t dstT[64 * 32];
  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 32, 64);
}

// Zone 3 64x32: predict into a temporary buffer and transpose into dst.
static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t dstT[32 * 64];
  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
  transpose(dstT, 32, dst, stride, 64, 32);
  return;
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
// Zone 3 16x64: predict into a temporary buffer and transpose into dst.
static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  uint8_t dstT[64 * 16];
  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
  transpose(dstT, 64, dst, stride, 16, 64);
}

// Zone 3 64x16: predict 16-wide rows, transpose in four 16x16 tiles.
static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *left, int upsample_left,
                                        int dy) {
  __m128i dstvec[64], d[16];

  dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
  for (int i = 0; i < 64; i += 16) {
    transpose16x16_sse2((dstvec + i), d);
    for (int j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
    }
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                               const uint8_t *above, const uint8_t *left,
                               int upsample_left, int dx, int dy) {
  (void)above;
  (void)dx;
  assert(dx == 1);
  assert(dy > 0);

  if (bw == bh) {
    switch (bw) {
      case 4:
        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 8:
        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 16:
        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 32:
        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
        break;
      case 64:
        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
        break;
    }
  } else {
    if (bw < bh) {
      if (bw + bw == bh) {
        switch (bw) {
          case 4:
            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
            break;
        }
      } else {
        switch (bw) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
          case 4:
            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
            break;
case 8: 4700 dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy); 4701 break; 4702 case 16: 4703 dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy); 4704 break; 4705 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 4706 } 4707 } 4708 } else { 4709 if (bh + bh == bw) { 4710 switch (bh) { 4711 case 4: 4712 dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy); 4713 break; 4714 case 8: 4715 dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy); 4716 break; 4717 case 16: 4718 dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy); 4719 break; 4720 case 32: 4721 dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy); 4722 break; 4723 } 4724 } else { 4725 switch (bh) { 4726 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 4727 case 4: 4728 dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy); 4729 break; 4730 case 8: 4731 dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy); 4732 break; 4733 case 16: 4734 dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy); 4735 break; 4736 #endif // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER 4737 } 4738 } 4739 } 4740 } 4741 }