convolve_avx2.c (39711B)
1 /* 2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 14 #include "config/av1_rtcd.h" 15 16 #if CONFIG_SVT_AV1 17 #include "third_party/SVT-AV1/convolve_avx2.h" 18 #endif 19 20 #include "aom_dsp/aom_dsp_common.h" 21 #include "aom_dsp/x86/convolve_avx2.h" 22 #include "aom_dsp/x86/convolve_common_intrin.h" 23 #include "aom_dsp/x86/synonyms.h" 24 25 static inline void av1_convolve_y_sr_general_avx2( 26 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, 27 int h, const InterpFilterParams *filter_params_y, const int subpel_y_qn) { 28 // right shift is F-1 because we are already dividing 29 // filter co-efficients by 2 30 const int right_shift_bits = (FILTER_BITS - 1); 31 __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits); 32 __m256i right_shift_const = _mm256_set1_epi16((1 << right_shift_bits) >> 1); 33 34 __m256i coeffs[6], s[12]; 35 __m128i d[10]; 36 37 int i, vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); 38 39 if (vert_tap == 6) 40 prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs); 41 else if (vert_tap == 12) { 42 prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs); 43 } else { 44 prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs); 45 } 46 47 // vert_filt as 4 tap 48 if (vert_tap == 4) { 49 const int fo_vert = 1; 50 const uint8_t *const src_ptr = src - fo_vert * src_stride; 51 for (int j = 0; j < w; j += 16) { 52 const uint8_t *data = &src_ptr[j]; 53 d[0] = _mm_loadu_si128((__m128i *)(data 
+ 0 * src_stride)); 54 d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); 55 d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); 56 d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); 57 d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); 58 59 // Load lines a and b. Line a to lower 128, line b to upper 128 60 const __m256i src_01a = _mm256_permute2x128_si256( 61 _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); 62 63 const __m256i src_12a = _mm256_permute2x128_si256( 64 _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); 65 66 const __m256i src_23a = _mm256_permute2x128_si256( 67 _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); 68 69 const __m256i src_34a = _mm256_permute2x128_si256( 70 _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); 71 72 s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); 73 s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); 74 75 s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); 76 s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); 77 78 for (i = 0; i < h; i += 2) { 79 data = &src_ptr[i * src_stride + j]; 80 d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); 81 const __m256i src_45a = _mm256_permute2x128_si256( 82 _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); 83 84 d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride)); 85 const __m256i src_56a = _mm256_permute2x128_si256( 86 _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20); 87 88 s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); 89 s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); 90 91 const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1); 92 /* rounding code */ 93 // shift by F - 1 94 const __m256i res_16b_lo = _mm256_sra_epi16( 95 _mm256_add_epi16(res_lo, right_shift_const), right_shift); 96 // 8 bit conversion and saturation to uint8 97 __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); 98 99 if (w - j > 8) { 100 const __m256i res_hi = 
convolve_lowbd_4tap(s + 3, coeffs + 1); 101 102 /* rounding code */ 103 // shift by F - 1 104 const __m256i res_16b_hi = _mm256_sra_epi16( 105 _mm256_add_epi16(res_hi, right_shift_const), right_shift); 106 // 8 bit conversion and saturation to uint8 107 __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); 108 109 __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); 110 111 const __m128i res_0 = _mm256_castsi256_si128(res_a); 112 const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); 113 114 _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); 115 _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], 116 res_1); 117 } else { 118 const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); 119 const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); 120 if (w - j > 4) { 121 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); 122 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], 123 res_1); 124 } else if (w - j > 2) { 125 xx_storel_32(&dst[i * dst_stride + j], res_0); 126 xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); 127 } else { 128 __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; 129 __m128i *const p_1 = 130 (__m128i *)&dst[i * dst_stride + j + dst_stride]; 131 *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); 132 *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); 133 } 134 } 135 s[0] = s[1]; 136 s[1] = s[2]; 137 138 s[3] = s[4]; 139 s[4] = s[5]; 140 } 141 } 142 } else if (vert_tap == 6) { 143 const int fo_vert = vert_tap / 2 - 1; 144 const uint8_t *const src_ptr = src - fo_vert * src_stride; 145 146 for (int j = 0; j < w; j += 16) { 147 const uint8_t *data = &src_ptr[j]; 148 __m256i src6; 149 150 d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); 151 d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); 152 d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride)); 153 d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); 154 // Load lines 
a and b. Line a to lower 128, line b to upper 128 155 const __m256i src_01a = _mm256_permute2x128_si256( 156 _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); 157 158 const __m256i src_12a = _mm256_permute2x128_si256( 159 _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); 160 161 const __m256i src_23a = _mm256_permute2x128_si256( 162 _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); 163 164 src6 = _mm256_castsi128_si256( 165 _mm_loadu_si128((__m128i *)(data + 4 * src_stride))); 166 const __m256i src_34a = 167 _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20); 168 169 s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); 170 s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); 171 172 s[3] = _mm256_unpackhi_epi8(src_01a, src_12a); 173 s[4] = _mm256_unpackhi_epi8(src_23a, src_34a); 174 175 for (i = 0; i < h; i += 2) { 176 data = &src_ptr[i * src_stride + j]; 177 const __m256i src_45a = _mm256_permute2x128_si256( 178 src6, 179 _mm256_castsi128_si256( 180 _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), 181 0x20); 182 183 src6 = _mm256_castsi128_si256( 184 _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); 185 const __m256i src_56a = _mm256_permute2x128_si256( 186 _mm256_castsi128_si256( 187 _mm_loadu_si128((__m128i *)(data + 5 * src_stride))), 188 src6, 0x20); 189 190 s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); 191 s[5] = _mm256_unpackhi_epi8(src_45a, src_56a); 192 193 const __m256i res_lo = convolve_lowbd_6tap(s, coeffs); 194 195 /* rounding code */ 196 // shift by F - 1 197 const __m256i res_16b_lo = _mm256_sra_epi16( 198 _mm256_add_epi16(res_lo, right_shift_const), right_shift); 199 // 8 bit conversion and saturation to uint8 200 __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); 201 202 if (w - j > 8) { 203 const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs); 204 205 /* rounding code */ 206 // shift by F - 1 207 const __m256i res_16b_hi = _mm256_sra_epi16( 208 
_mm256_add_epi16(res_hi, right_shift_const), right_shift); 209 // 8 bit conversion and saturation to uint8 210 __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); 211 212 __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); 213 214 const __m128i res_0 = _mm256_castsi256_si128(res_a); 215 const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); 216 217 _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); 218 _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], 219 res_1); 220 } else { 221 const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); 222 const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); 223 if (w - j > 4) { 224 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); 225 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], 226 res_1); 227 } else if (w - j > 2) { 228 xx_storel_32(&dst[i * dst_stride + j], res_0); 229 xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); 230 } else { 231 __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; 232 __m128i *const p_1 = 233 (__m128i *)&dst[i * dst_stride + j + dst_stride]; 234 *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); 235 *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); 236 } 237 } 238 s[0] = s[1]; 239 s[1] = s[2]; 240 s[3] = s[4]; 241 s[4] = s[5]; 242 } 243 } 244 } else if (vert_tap == 12) { // vert_tap == 12 245 const int fo_vert = filter_params_y->taps / 2 - 1; 246 const uint8_t *const src_ptr = src - fo_vert * src_stride; 247 const __m256i v_zero = _mm256_setzero_si256(); 248 right_shift = _mm_cvtsi32_si128(FILTER_BITS); 249 right_shift_const = _mm256_set1_epi32((1 << FILTER_BITS) >> 1); 250 251 for (int j = 0; j < w; j += 8) { 252 const uint8_t *data = &src_ptr[j]; 253 __m256i src10; 254 255 d[0] = _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)); 256 d[1] = _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)); 257 d[2] = _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)); 258 d[3] = 
_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)); 259 d[4] = _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)); 260 d[5] = _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)); 261 d[6] = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)); 262 d[7] = _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)); 263 d[8] = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)); 264 d[9] = _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)); 265 // Load lines a and b. Line a to lower 128, line b to upper 128 266 const __m256i src_01a = _mm256_permute2x128_si256( 267 _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); 268 269 const __m256i src_12a = _mm256_permute2x128_si256( 270 _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); 271 272 const __m256i src_23a = _mm256_permute2x128_si256( 273 _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); 274 275 const __m256i src_34a = _mm256_permute2x128_si256( 276 _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); 277 278 const __m256i src_45a = _mm256_permute2x128_si256( 279 _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); 280 281 const __m256i src_56a = _mm256_permute2x128_si256( 282 _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[6]), 0x20); 283 284 const __m256i src_67a = _mm256_permute2x128_si256( 285 _mm256_castsi128_si256(d[6]), _mm256_castsi128_si256(d[7]), 0x20); 286 287 const __m256i src_78a = _mm256_permute2x128_si256( 288 _mm256_castsi128_si256(d[7]), _mm256_castsi128_si256(d[8]), 0x20); 289 290 const __m256i src_89a = _mm256_permute2x128_si256( 291 _mm256_castsi128_si256(d[8]), _mm256_castsi128_si256(d[9]), 0x20); 292 293 src10 = _mm256_castsi128_si256( 294 _mm_loadl_epi64((__m128i *)(data + 10 * src_stride))); 295 const __m256i src_910a = 296 _mm256_permute2x128_si256(_mm256_castsi128_si256(d[9]), src10, 0x20); 297 298 const __m256i src_01 = _mm256_unpacklo_epi8(src_01a, v_zero); 299 const __m256i src_12 = _mm256_unpacklo_epi8(src_12a, 
v_zero); 300 const __m256i src_23 = _mm256_unpacklo_epi8(src_23a, v_zero); 301 const __m256i src_34 = _mm256_unpacklo_epi8(src_34a, v_zero); 302 const __m256i src_45 = _mm256_unpacklo_epi8(src_45a, v_zero); 303 const __m256i src_56 = _mm256_unpacklo_epi8(src_56a, v_zero); 304 const __m256i src_67 = _mm256_unpacklo_epi8(src_67a, v_zero); 305 const __m256i src_78 = _mm256_unpacklo_epi8(src_78a, v_zero); 306 const __m256i src_89 = _mm256_unpacklo_epi8(src_89a, v_zero); 307 const __m256i src_910 = _mm256_unpacklo_epi8(src_910a, v_zero); 308 309 s[0] = _mm256_unpacklo_epi16(src_01, src_12); 310 s[1] = _mm256_unpacklo_epi16(src_23, src_34); 311 s[2] = _mm256_unpacklo_epi16(src_45, src_56); 312 s[3] = _mm256_unpacklo_epi16(src_67, src_78); 313 s[4] = _mm256_unpacklo_epi16(src_89, src_910); 314 315 s[6] = _mm256_unpackhi_epi16(src_01, src_12); 316 s[7] = _mm256_unpackhi_epi16(src_23, src_34); 317 s[8] = _mm256_unpackhi_epi16(src_45, src_56); 318 s[9] = _mm256_unpackhi_epi16(src_67, src_78); 319 s[10] = _mm256_unpackhi_epi16(src_89, src_910); 320 321 for (i = 0; i < h; i += 2) { 322 data = &src_ptr[i * src_stride + j]; 323 const __m256i src_1011a = _mm256_permute2x128_si256( 324 src10, 325 _mm256_castsi128_si256( 326 _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), 327 0x20); 328 329 src10 = _mm256_castsi128_si256( 330 _mm_loadl_epi64((__m128i *)(data + 12 * src_stride))); 331 332 const __m256i src_1112a = _mm256_permute2x128_si256( 333 _mm256_castsi128_si256( 334 _mm_loadl_epi64((__m128i *)(data + 11 * src_stride))), 335 src10, 0x20); 336 337 const __m256i src_1011 = _mm256_unpacklo_epi8(src_1011a, v_zero); 338 const __m256i src_1112 = _mm256_unpacklo_epi8(src_1112a, v_zero); 339 340 s[5] = _mm256_unpacklo_epi16(src_1011, src_1112); 341 s[11] = _mm256_unpackhi_epi16(src_1011, src_1112); 342 343 const __m256i res_lo = convolve_12taps(s, coeffs); 344 345 const __m256i res_32b_lo = _mm256_sra_epi32( 346 _mm256_add_epi32(res_lo, right_shift_const), right_shift); 347 // 8 
bit conversion and saturation to uint8 348 __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); 349 __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); 350 351 if (w - j > 4) { 352 const __m256i res_hi = convolve_12taps(s + 6, coeffs); 353 354 const __m256i res_32b_hi = _mm256_sra_epi32( 355 _mm256_add_epi32(res_hi, right_shift_const), right_shift); 356 __m256i res_16b_hi = _mm256_packs_epi32(res_32b_hi, res_32b_hi); 357 // 8 bit conversion and saturation to uint8 358 __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); 359 360 __m256i res_a = _mm256_unpacklo_epi32(res_8b_lo, res_8b_hi); 361 362 const __m128i res_0 = _mm256_extracti128_si256(res_a, 0); 363 const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); 364 365 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); 366 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], 367 res_1); 368 } else { 369 const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); 370 const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); 371 if (w - j > 2) { 372 *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); 373 *(int *)&dst[i * dst_stride + j + dst_stride] = 374 _mm_cvtsi128_si32(res_1); 375 } else { 376 *(uint16_t *)&dst[i * dst_stride + j] = 377 (uint16_t)_mm_cvtsi128_si32(res_0); 378 *(uint16_t *)&dst[i * dst_stride + j + dst_stride] = 379 (uint16_t)_mm_cvtsi128_si32(res_1); 380 } 381 } 382 s[0] = s[1]; 383 s[1] = s[2]; 384 s[2] = s[3]; 385 s[3] = s[4]; 386 s[4] = s[5]; 387 388 s[6] = s[7]; 389 s[7] = s[8]; 390 s[8] = s[9]; 391 s[9] = s[10]; 392 s[10] = s[11]; 393 } 394 } 395 } else { 396 const int fo_vert = filter_params_y->taps / 2 - 1; 397 const uint8_t *const src_ptr = src - fo_vert * src_stride; 398 399 for (int j = 0; j < w; j += 16) { 400 const uint8_t *data = &src_ptr[j]; 401 __m256i src6; 402 403 d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride)); 404 d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride)); 405 d[2] = 
_mm_loadu_si128((__m128i *)(data + 2 * src_stride)); 406 d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride)); 407 d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride)); 408 d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride)); 409 // Load lines a and b. Line a to lower 128, line b to upper 128 410 const __m256i src_01a = _mm256_permute2x128_si256( 411 _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20); 412 413 const __m256i src_12a = _mm256_permute2x128_si256( 414 _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20); 415 416 const __m256i src_23a = _mm256_permute2x128_si256( 417 _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20); 418 419 const __m256i src_34a = _mm256_permute2x128_si256( 420 _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20); 421 422 const __m256i src_45a = _mm256_permute2x128_si256( 423 _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20); 424 425 src6 = _mm256_castsi128_si256( 426 _mm_loadu_si128((__m128i *)(data + 6 * src_stride))); 427 const __m256i src_56a = 428 _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20); 429 430 s[0] = _mm256_unpacklo_epi8(src_01a, src_12a); 431 s[1] = _mm256_unpacklo_epi8(src_23a, src_34a); 432 s[2] = _mm256_unpacklo_epi8(src_45a, src_56a); 433 434 s[4] = _mm256_unpackhi_epi8(src_01a, src_12a); 435 s[5] = _mm256_unpackhi_epi8(src_23a, src_34a); 436 s[6] = _mm256_unpackhi_epi8(src_45a, src_56a); 437 438 for (i = 0; i < h; i += 2) { 439 data = &src_ptr[i * src_stride + j]; 440 const __m256i src_67a = _mm256_permute2x128_si256( 441 src6, 442 _mm256_castsi128_si256( 443 _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), 444 0x20); 445 446 src6 = _mm256_castsi128_si256( 447 _mm_loadu_si128((__m128i *)(data + 8 * src_stride))); 448 const __m256i src_78a = _mm256_permute2x128_si256( 449 _mm256_castsi128_si256( 450 _mm_loadu_si128((__m128i *)(data + 7 * src_stride))), 451 src6, 0x20); 452 453 s[3] = 
_mm256_unpacklo_epi8(src_67a, src_78a); 454 s[7] = _mm256_unpackhi_epi8(src_67a, src_78a); 455 456 const __m256i res_lo = convolve_lowbd(s, coeffs); 457 458 /* rounding code */ 459 // shift by F - 1 460 const __m256i res_16b_lo = _mm256_sra_epi16( 461 _mm256_add_epi16(res_lo, right_shift_const), right_shift); 462 // 8 bit conversion and saturation to uint8 463 __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); 464 465 if (w - j > 8) { 466 const __m256i res_hi = convolve_lowbd(s + 4, coeffs); 467 468 /* rounding code */ 469 // shift by F - 1 470 const __m256i res_16b_hi = _mm256_sra_epi16( 471 _mm256_add_epi16(res_hi, right_shift_const), right_shift); 472 // 8 bit conversion and saturation to uint8 473 __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi); 474 475 __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi); 476 477 const __m128i res_0 = _mm256_castsi256_si128(res_a); 478 const __m128i res_1 = _mm256_extracti128_si256(res_a, 1); 479 480 _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0); 481 _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride], 482 res_1); 483 } else { 484 const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo); 485 const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); 486 if (w - j > 4) { 487 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0); 488 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride], 489 res_1); 490 } else if (w - j > 2) { 491 xx_storel_32(&dst[i * dst_stride + j], res_0); 492 xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1); 493 } else { 494 __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j]; 495 __m128i *const p_1 = 496 (__m128i *)&dst[i * dst_stride + j + dst_stride]; 497 *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); 498 *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); 499 } 500 } 501 s[0] = s[1]; 502 s[1] = s[2]; 503 s[2] = s[3]; 504 505 s[4] = s[5]; 506 s[5] = s[6]; 507 s[6] = s[7]; 508 } 509 } 510 } 511 
} 512 513 void av1_convolve_y_sr_avx2(const uint8_t *src, int32_t src_stride, 514 uint8_t *dst, int32_t dst_stride, int32_t w, 515 int32_t h, 516 const InterpFilterParams *filter_params_y, 517 const int32_t subpel_y_qn) { 518 #if CONFIG_SVT_AV1 519 const int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); 520 521 if (vert_tap == 12) { 522 av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, 523 filter_params_y, subpel_y_qn); 524 } else { 525 av1_convolve_y_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, 526 filter_params_y, subpel_y_qn); 527 } 528 #else 529 av1_convolve_y_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, 530 filter_params_y, subpel_y_qn); 531 #endif 532 } 533 534 static inline void av1_convolve_x_sr_general_avx2( 535 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, 536 int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, 537 ConvolveParams *conv_params) { 538 const int bits = FILTER_BITS - conv_params->round_0; 539 const __m128i round_shift = _mm_cvtsi32_si128(bits); 540 __m256i round_0_const = 541 _mm256_set1_epi16((1 << (conv_params->round_0 - 1)) >> 1); 542 __m128i round_0_shift = _mm_cvtsi32_si128(conv_params->round_0 - 1); 543 __m256i round_const = _mm256_set1_epi16((1 << bits) >> 1); 544 int i, horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); 545 546 assert(bits >= 0); 547 assert((FILTER_BITS - conv_params->round_1) >= 0 || 548 ((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS)); 549 assert(conv_params->round_0 > 0); 550 551 __m256i coeffs[6], filt[4]; 552 filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2)); 553 filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32)); 554 555 if (horiz_tap == 6) 556 prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs); 557 else if (horiz_tap == 12) { 558 prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs); 559 } else { 560 
prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs); 561 } 562 563 // horz_filt as 4 tap 564 if (horiz_tap == 4) { 565 const int fo_horiz = 1; 566 const uint8_t *const src_ptr = src - fo_horiz; 567 if (w <= 8) { 568 for (i = 0; i < h; i += 2) { 569 const __m256i data = _mm256_permute2x128_si256( 570 _mm256_castsi128_si256( 571 _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), 572 _mm256_castsi128_si256(_mm_loadu_si128( 573 (__m128i *)(&src_ptr[i * src_stride + src_stride]))), 574 0x20); 575 576 __m256i res_16b = convolve_lowbd_x_4tap(data, coeffs + 1, filt); 577 578 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), 579 round_0_shift); 580 581 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), 582 round_shift); 583 584 /* rounding code */ 585 // 8 bit conversion and saturation to uint8 586 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); 587 588 const __m128i res_0 = _mm256_castsi256_si128(res_8b); 589 const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); 590 591 if (w > 4) { 592 _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); 593 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); 594 } else if (w > 2) { 595 xx_storel_32(&dst[i * dst_stride], res_0); 596 xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); 597 } else { 598 __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; 599 __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; 600 *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); 601 *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); 602 } 603 } 604 } else { 605 for (i = 0; i < h; ++i) { 606 for (int j = 0; j < w; j += 16) { 607 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 608 // 18 19 20 21 22 23 609 const __m256i data = _mm256_inserti128_si256( 610 _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), 611 _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), 612 1); 613 614 __m256i res_16b = 
convolve_lowbd_x_4tap(data, coeffs + 1, filt); 615 616 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), 617 round_0_shift); 618 619 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), 620 round_shift); 621 622 /* rounding code */ 623 // 8 bit conversion and saturation to uint8 624 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); 625 626 // Store values into the destination buffer 627 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 628 res_8b = _mm256_permute4x64_epi64(res_8b, 216); 629 __m128i res = _mm256_castsi256_si128(res_8b); 630 _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); 631 } 632 } 633 } 634 } else if (horiz_tap == 6) { 635 const int fo_horiz = horiz_tap / 2 - 1; 636 const uint8_t *const src_ptr = src - fo_horiz; 637 filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); 638 filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); 639 640 if (w <= 8) { 641 for (i = 0; i < h; i += 2) { 642 const __m256i data = _mm256_permute2x128_si256( 643 _mm256_castsi128_si256( 644 _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), 645 _mm256_castsi128_si256(_mm_loadu_si128( 646 (__m128i *)(&src_ptr[i * src_stride + src_stride]))), 647 0x20); 648 649 __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); 650 651 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), 652 round_0_shift); 653 654 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), 655 round_shift); 656 657 /* rounding code */ 658 // 8 bit conversion and saturation to uint8 659 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); 660 661 const __m128i res_0 = _mm256_castsi256_si128(res_8b); 662 const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); 663 if (w > 4) { 664 _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); 665 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); 666 } else if (w > 2) { 667 xx_storel_32(&dst[i * dst_stride], res_0); 668 
xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); 669 } else { 670 __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; 671 __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; 672 *(uint16_t *)p_0 = _mm_cvtsi128_si32(res_0); 673 *(uint16_t *)p_1 = _mm_cvtsi128_si32(res_1); 674 } 675 } 676 } else { 677 for (i = 0; i < h; ++i) { 678 for (int j = 0; j < w; j += 16) { 679 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 680 // 18 19 20 21 22 23 681 const __m256i data = _mm256_inserti128_si256( 682 _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), 683 _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), 684 1); 685 686 __m256i res_16b = convolve_lowbd_x_6tap(data, coeffs, filt); 687 688 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), 689 round_0_shift); 690 691 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), 692 round_shift); 693 694 /* rounding code */ 695 // 8 bit conversion and saturation to uint8 696 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); 697 698 // Store values into the destination buffer 699 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 700 res_8b = _mm256_permute4x64_epi64(res_8b, 216); 701 __m128i res = _mm256_castsi256_si128(res_8b); 702 _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); 703 } 704 } 705 } 706 } else if (horiz_tap == 12) { // horiz_tap == 12 707 const int fo_horiz = filter_params_x->taps / 2 - 1; 708 const uint8_t *const src_ptr = src - fo_horiz; 709 const __m256i v_zero = _mm256_setzero_si256(); 710 round_0_const = _mm256_set1_epi32((1 << (conv_params->round_0)) >> 1); 711 round_const = _mm256_set1_epi32((1 << bits) >> 1); 712 round_0_shift = _mm_cvtsi32_si128(conv_params->round_0); 713 __m256i s[6]; 714 715 if (w <= 4) { 716 for (i = 0; i < h; i += 2) { 717 const __m256i data = _mm256_permute2x128_si256( 718 _mm256_castsi128_si256( 719 _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), 720 
_mm256_castsi128_si256(_mm_loadu_si128( 721 (__m128i *)(&src_ptr[i * src_stride + src_stride]))), 722 0x20); 723 // row0 0..7 row1 0..7 724 const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); 725 // row0 8..F row1 8..F 726 const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); 727 728 // row0 00 00 01 01 .. 03 03 row1 00 00 01 01 .. 03 03 729 const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); 730 // row0 04 04 .. 07 07 row1 04 04 .. 07 07 731 const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); 732 733 // row0 08 08 09 09 .. 0B 0B row1 08 08 09 09 .. 0B 0B 734 const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); 735 // row0 0C 0C .. 0F 0F row1 0C 0C .. 0F 0F 736 const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); 737 738 // 00 01 01 02 02 03 03 04 10 11 11 12 12 13 13 14 739 s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); 740 // 02 03 03 04 04 05 05 06 12 13 13 14 14 15 15 16 741 s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); 742 // 04 05 05 06 06 07 07 08 14 15 15 16 16 17 17 18 743 s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); 744 // 06 07 07 08 08 09 09 0A 16 17 17 18 18 19 19 1A 745 s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); 746 // 08 09 09 0A 0A 0B 0B 0C 18 19 19 1A 1A 1B 1B 1C 747 s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); 748 // 0A 0B 0B 0C 0C 0D 0D 0E 1A 1B 1B 1C 1C 1D 1D 1E 749 s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); 750 751 const __m256i res_lo = convolve_12taps(s, coeffs); 752 753 __m256i res_32b_lo = _mm256_sra_epi32( 754 _mm256_add_epi32(res_lo, round_0_const), round_0_shift); 755 756 // 00 01 02 03 10 12 13 14 757 res_32b_lo = _mm256_sra_epi32(_mm256_add_epi32(res_32b_lo, round_const), 758 round_shift); 759 // 8 bit conversion and saturation to uint8 760 // 00 01 02 03 00 01 02 03 10 11 12 13 10 11 12 13 761 __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); 762 // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 763 // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 764 
__m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); 765 766 // 00 01 02 03 00 01 02 03 00 01 02 03 00 01 02 03 767 const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); 768 // 10 11 12 13 10 11 12 13 10 11 12 13 10 11 12 13 769 const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); 770 if (w > 2) { 771 // 00 01 02 03 772 *(int *)&dst[i * dst_stride] = _mm_cvtsi128_si32(res_0); 773 // 10 11 12 13 774 *(int *)&dst[i * dst_stride + dst_stride] = _mm_cvtsi128_si32(res_1); 775 } else { 776 // 00 01 777 *(uint16_t *)&dst[i * dst_stride] = 778 (uint16_t)_mm_cvtsi128_si32(res_0); 779 // 10 11 780 *(uint16_t *)&dst[i * dst_stride + dst_stride] = 781 (uint16_t)_mm_cvtsi128_si32(res_1); 782 } 783 } 784 } else { 785 for (i = 0; i < h; i++) { 786 for (int j = 0; j < w; j += 8) { 787 const __m256i data = _mm256_permute2x128_si256( 788 _mm256_castsi128_si256( 789 _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride + j]))), 790 _mm256_castsi128_si256(_mm_loadu_si128( 791 (__m128i *)(&src_ptr[i * src_stride + j + 4]))), 792 0x20); 793 // row0 0..7 4..B 794 const __m256i s_16lo = _mm256_unpacklo_epi8(data, v_zero); 795 // row0 8..F C..13 796 const __m256i s_16hi = _mm256_unpackhi_epi8(data, v_zero); 797 798 // row0 00 00 01 01 .. 03 03 04 04 05 05 .. 07 07 799 const __m256i s_lolo = _mm256_unpacklo_epi16(s_16lo, s_16lo); 800 // row0 04 04 .. 07 07 08 08 .. 0B 0B 801 const __m256i s_lohi = _mm256_unpackhi_epi16(s_16lo, s_16lo); 802 803 // row0 08 08 09 09 .. 0B 0B 0C 0C 0D 0D .. 0F 0F 804 const __m256i s_hilo = _mm256_unpacklo_epi16(s_16hi, s_16hi); 805 // row0 0C 0C 0D 0D .. 0F 0F 10 10 11 11 .. 
13 13 806 const __m256i s_hihi = _mm256_unpackhi_epi16(s_16hi, s_16hi); 807 808 s[0] = _mm256_alignr_epi8(s_lohi, s_lolo, 2); 809 s[1] = _mm256_alignr_epi8(s_lohi, s_lolo, 10); 810 s[2] = _mm256_alignr_epi8(s_hilo, s_lohi, 2); 811 s[3] = _mm256_alignr_epi8(s_hilo, s_lohi, 10); 812 s[4] = _mm256_alignr_epi8(s_hihi, s_hilo, 2); 813 s[5] = _mm256_alignr_epi8(s_hihi, s_hilo, 10); 814 815 const __m256i res_lo = convolve_12taps(s, coeffs); 816 817 __m256i res_32b_lo = _mm256_sra_epi32( 818 _mm256_add_epi32(res_lo, round_0_const), round_0_shift); 819 820 res_32b_lo = _mm256_sra_epi32( 821 _mm256_add_epi32(res_32b_lo, round_const), round_shift); 822 // 8 bit conversion and saturation to uint8 823 __m256i res_16b_lo = _mm256_packs_epi32(res_32b_lo, res_32b_lo); 824 __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo); 825 const __m128i res_0 = _mm256_extracti128_si256(res_8b_lo, 0); 826 const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1); 827 *(int *)&dst[i * dst_stride + j] = _mm_cvtsi128_si32(res_0); 828 *(int *)&dst[i * dst_stride + j + 4] = _mm_cvtsi128_si32(res_1); 829 } 830 } 831 } 832 } else { 833 const int fo_horiz = filter_params_x->taps / 2 - 1; 834 const uint8_t *const src_ptr = src - fo_horiz; 835 filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2)); 836 filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3)); 837 838 if (w <= 8) { 839 for (i = 0; i < h; i += 2) { 840 const __m256i data = _mm256_permute2x128_si256( 841 _mm256_castsi128_si256( 842 _mm_loadu_si128((__m128i *)(&src_ptr[i * src_stride]))), 843 _mm256_castsi128_si256(_mm_loadu_si128( 844 (__m128i *)(&src_ptr[i * src_stride + src_stride]))), 845 0x20); 846 847 __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); 848 849 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), 850 round_0_shift); 851 852 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), 853 round_shift); 854 855 /* rounding code */ 856 // 8 bit 
conversion and saturation to uint8 857 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); 858 859 const __m128i res_0 = _mm256_castsi256_si128(res_8b); 860 const __m128i res_1 = _mm256_extracti128_si256(res_8b, 1); 861 if (w > 4) { 862 _mm_storel_epi64((__m128i *)&dst[i * dst_stride], res_0); 863 _mm_storel_epi64((__m128i *)&dst[i * dst_stride + dst_stride], res_1); 864 } else if (w > 2) { 865 xx_storel_32(&dst[i * dst_stride], res_0); 866 xx_storel_32(&dst[i * dst_stride + dst_stride], res_1); 867 } else { 868 __m128i *const p_0 = (__m128i *)&dst[i * dst_stride]; 869 __m128i *const p_1 = (__m128i *)&dst[i * dst_stride + dst_stride]; 870 *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0); 871 *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1); 872 } 873 } 874 } else { 875 for (i = 0; i < h; ++i) { 876 for (int j = 0; j < w; j += 16) { 877 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 8 9 10 11 12 13 14 15 16 17 878 // 18 19 20 21 22 23 879 const __m256i data = _mm256_inserti128_si256( 880 _mm256_loadu_si256((__m256i *)&src_ptr[(i * src_stride) + j]), 881 _mm_loadu_si128((__m128i *)&src_ptr[(i * src_stride) + (j + 8)]), 882 1); 883 884 __m256i res_16b = convolve_lowbd_x(data, coeffs, filt); 885 886 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_0_const), 887 round_0_shift); 888 889 res_16b = _mm256_sra_epi16(_mm256_add_epi16(res_16b, round_const), 890 round_shift); 891 892 /* rounding code */ 893 // 8 bit conversion and saturation to uint8 894 __m256i res_8b = _mm256_packus_epi16(res_16b, res_16b); 895 896 // Store values into the destination buffer 897 // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 898 res_8b = _mm256_permute4x64_epi64(res_8b, 216); 899 __m128i res = _mm256_castsi256_si128(res_8b); 900 _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res); 901 } 902 } 903 } 904 } 905 } 906

// Entry point for av1_convolve_x_sr on AVX2. It does no filtering itself; it
// only selects which implementation receives the (unmodified) arguments:
//   - CONFIG_SVT_AV1 enabled and the tap count reported by get_filter_tap()
//     is anything other than 12: av1_convolve_x_sr_specialized_avx2().
//   - otherwise (12-tap filter, or CONFIG_SVT_AV1 disabled):
//     av1_convolve_x_sr_general_avx2().
void av1_convolve_x_sr_avx2(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t w,
                            int32_t h,
                            const InterpFilterParams *filter_params_x,
                            const int32_t subpel_x_qn,
                            ConvolveParams *conv_params) {
#if CONFIG_SVT_AV1
  // Only 12-tap filters fall through to the general implementation below.
  if (get_filter_tap(filter_params_x, subpel_x_qn) != 12) {
    av1_convolve_x_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
                                       filter_params_x, subpel_x_qn,
                                       conv_params);
    return;
  }
#endif
  av1_convolve_x_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
                                 filter_params_x, subpel_x_qn, conv_params);
}