convolve_2d_avx2.c (6517B)
1 /* 2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <immintrin.h> 13 14 #include "config/av1_rtcd.h" 15 16 #if CONFIG_SVT_AV1 17 #include "third_party/SVT-AV1/convolve_2d_avx2.h" 18 #endif 19 20 #include "aom_dsp/x86/convolve_avx2.h" 21 #include "aom_dsp/aom_filter.h" 22 #include "aom_dsp/x86/synonyms.h" 23 24 #include "av1/common/convolve.h" 25 26 static void convolve_2d_sr_general_avx2( 27 const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, 28 int h, const InterpFilterParams *filter_params_x, 29 const InterpFilterParams *filter_params_y, const int subpel_x_qn, 30 const int subpel_y_qn, ConvolveParams *conv_params) { 31 if (filter_params_x->taps > 8) { 32 const int bd = 8; 33 int im_stride = 8, i; 34 DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); 35 const int bits = 36 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 37 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 38 39 assert(conv_params->round_0 > 0); 40 41 const __m256i round_const_h12 = _mm256_set1_epi32( 42 ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1))); 43 const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0); 44 45 const __m256i sum_round_v = _mm256_set1_epi32( 46 (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); 47 const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); 48 49 const __m256i round_const_v = _mm256_set1_epi32( 50 ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - 51 ((1 << (offset_bits - conv_params->round_1)) >> 1)); 52 const __m128i round_shift_v = _mm_cvtsi32_si128(bits); 53 54 __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 }; 55 56 int horiz_tap = 12; 57 int vert_tap = 12; 58 59 prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h); 60 prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v); 61 62 int im_h = h + vert_tap - 1; 63 const int fo_vert = vert_tap / 2 - 1; 64 const int fo_horiz = horiz_tap / 2 - 1; 65 const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; 66 67 for (int j = 0; j < w; j += 8) { 68 CONVOLVE_SR_HORIZONTAL_FILTER_12TAP 69 CONVOLVE_SR_VERTICAL_FILTER_12TAP 70 } 71 } else { 72 const int bd = 8; 73 int im_stride = 8, i; 74 DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]); 75 const int bits = 76 FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1; 77 const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; 78 79 assert(conv_params->round_0 > 0); 80 81 const __m256i round_const_h = 82 _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) + 83 (1 << (bd + FILTER_BITS - 2))); 84 const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1); 85 86 const __m256i sum_round_v = _mm256_set1_epi32( 87 (1 << offset_bits) + ((1 << conv_params->round_1) >> 1)); 88 const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1); 89 90 const __m256i round_const_v = _mm256_set1_epi32( 91 ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) - 92 ((1 << (offset_bits - conv_params->round_1)) >> 1)); 93 const __m128i round_shift_v = _mm_cvtsi32_si128(bits); 94 95 __m256i filt[4], coeffs_h[4], coeffs_v[4]; 96 97 prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); 98 prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); 99 100 int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn); 101 int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn); 102 103 if (horiz_tap == 6) 104 prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h); 105 else 106 prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h); 107 108 if (vert_tap == 6) 109 prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v); 110 else 111 prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v); 112 113 int im_h = h + vert_tap - 1; 114 const int fo_vert = vert_tap / 2 - 1; 115 const int fo_horiz = horiz_tap / 2 - 1; 116 const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; 117 118 filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2); 119 filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2); 120 filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2); 121 filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2); 122 123 for (int j = 0; j < w; j += 8) { 124 if (horiz_tap == 4) { 125 CONVOLVE_SR_HORIZONTAL_FILTER_4TAP 126 } else if (horiz_tap == 6) { 127 CONVOLVE_SR_HORIZONTAL_FILTER_6TAP 128 } else { 129 CONVOLVE_SR_HORIZONTAL_FILTER_8TAP 130 } 131 132 if (vert_tap == 4) { 133 CONVOLVE_SR_VERTICAL_FILTER_4TAP 134 } else if (vert_tap == 6) { 135 CONVOLVE_SR_VERTICAL_FILTER_6TAP 136 } else { 137 CONVOLVE_SR_VERTICAL_FILTER_8TAP 138 } 139 } 140 } 141 } 142 143 void av1_convolve_2d_sr_avx2( 144 const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, 145 int32_t w, int32_t h, const InterpFilterParams *filter_params_x, 146 const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn, 147 const int32_t subpel_y_qn, ConvolveParams *conv_params) { 148 #if CONFIG_SVT_AV1 149 const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn); 150 const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn); 151 152 const bool use_general = (tap_x == 12 || tap_y == 12); 153 if (use_general) { 154 convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, 155 filter_params_x, filter_params_y, subpel_x_qn, 156 subpel_y_qn, conv_params); 157 } else { 158 av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h, 159 filter_params_x, filter_params_y, 160 subpel_x_qn, subpel_y_qn, conv_params); 161 } 162 #else 163 convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h, 164 filter_params_x, filter_params_y, subpel_x_qn, 165 subpel_y_qn, conv_params); 166 #endif 167 }