[ tor-browser ].git.dasho

convolve_2d_avx2.c (6517B)
      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <immintrin.h>
     13 
     14 #include "config/av1_rtcd.h"
     15 
     16 #if CONFIG_SVT_AV1
     17 #include "third_party/SVT-AV1/convolve_2d_avx2.h"
     18 #endif
     19 
     20 #include "aom_dsp/x86/convolve_avx2.h"
     21 #include "aom_dsp/aom_filter.h"
     22 #include "aom_dsp/x86/synonyms.h"
     23 
     24 #include "av1/common/convolve.h"
     25 
     26 static void convolve_2d_sr_general_avx2(
     27    const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
     28    int h, const InterpFilterParams *filter_params_x,
     29    const InterpFilterParams *filter_params_y, const int subpel_x_qn,
     30    const int subpel_y_qn, ConvolveParams *conv_params) {
     31  if (filter_params_x->taps > 8) {
     32    const int bd = 8;
     33    int im_stride = 8, i;
     34    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
     35    const int bits =
     36        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
     37    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
     38 
     39    assert(conv_params->round_0 > 0);
     40 
     41    const __m256i round_const_h12 = _mm256_set1_epi32(
     42        ((1 << (conv_params->round_0)) >> 1) + (1 << (bd + FILTER_BITS - 1)));
     43    const __m128i round_shift_h12 = _mm_cvtsi32_si128(conv_params->round_0);
     44 
     45    const __m256i sum_round_v = _mm256_set1_epi32(
     46        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
     47    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
     48 
     49    const __m256i round_const_v = _mm256_set1_epi32(
     50        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
     51        ((1 << (offset_bits - conv_params->round_1)) >> 1));
     52    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
     53 
     54    __m256i coeffs_h[6] = { 0 }, coeffs_v[6] = { 0 };
     55 
     56    int horiz_tap = 12;
     57    int vert_tap = 12;
     58 
     59    prepare_coeffs_12taps(filter_params_x, subpel_x_qn, coeffs_h);
     60    prepare_coeffs_12taps(filter_params_y, subpel_y_qn, coeffs_v);
     61 
     62    int im_h = h + vert_tap - 1;
     63    const int fo_vert = vert_tap / 2 - 1;
     64    const int fo_horiz = horiz_tap / 2 - 1;
     65    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
     66 
     67    for (int j = 0; j < w; j += 8) {
     68      CONVOLVE_SR_HORIZONTAL_FILTER_12TAP
     69      CONVOLVE_SR_VERTICAL_FILTER_12TAP
     70    }
     71  } else {
     72    const int bd = 8;
     73    int im_stride = 8, i;
     74    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
     75    const int bits =
     76        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
     77    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
     78 
     79    assert(conv_params->round_0 > 0);
     80 
     81    const __m256i round_const_h =
     82        _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
     83                          (1 << (bd + FILTER_BITS - 2)));
     84    const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
     85 
     86    const __m256i sum_round_v = _mm256_set1_epi32(
     87        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
     88    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
     89 
     90    const __m256i round_const_v = _mm256_set1_epi32(
     91        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
     92        ((1 << (offset_bits - conv_params->round_1)) >> 1));
     93    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
     94 
     95    __m256i filt[4], coeffs_h[4], coeffs_v[4];
     96 
     97    prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
     98    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
     99 
    100    int horiz_tap = get_filter_tap(filter_params_x, subpel_x_qn);
    101    int vert_tap = get_filter_tap(filter_params_y, subpel_y_qn);
    102 
    103    if (horiz_tap == 6)
    104      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    105    else
    106      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
    107 
    108    if (vert_tap == 6)
    109      prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
    110    else
    111      prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
    112 
    113    int im_h = h + vert_tap - 1;
    114    const int fo_vert = vert_tap / 2 - 1;
    115    const int fo_horiz = horiz_tap / 2 - 1;
    116    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
    117 
    118    filt[0] = _mm256_load_si256((__m256i const *)filt1_global_avx2);
    119    filt[1] = _mm256_load_si256((__m256i const *)filt2_global_avx2);
    120    filt[2] = _mm256_load_si256((__m256i const *)filt3_global_avx2);
    121    filt[3] = _mm256_load_si256((__m256i const *)filt4_global_avx2);
    122 
    123    for (int j = 0; j < w; j += 8) {
    124      if (horiz_tap == 4) {
    125        CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
    126      } else if (horiz_tap == 6) {
    127        CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
    128      } else {
    129        CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
    130      }
    131 
    132      if (vert_tap == 4) {
    133        CONVOLVE_SR_VERTICAL_FILTER_4TAP
    134      } else if (vert_tap == 6) {
    135        CONVOLVE_SR_VERTICAL_FILTER_6TAP
    136      } else {
    137        CONVOLVE_SR_VERTICAL_FILTER_8TAP
    138      }
    139    }
    140  }
    141 }
    142 
    143 void av1_convolve_2d_sr_avx2(
    144    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
    145    int32_t w, int32_t h, const InterpFilterParams *filter_params_x,
    146    const InterpFilterParams *filter_params_y, const int32_t subpel_x_qn,
    147    const int32_t subpel_y_qn, ConvolveParams *conv_params) {
    148 #if CONFIG_SVT_AV1
    149  const int32_t tap_x = get_filter_tap(filter_params_x, subpel_x_qn);
    150  const int32_t tap_y = get_filter_tap(filter_params_y, subpel_y_qn);
    151 
    152  const bool use_general = (tap_x == 12 || tap_y == 12);
    153  if (use_general) {
    154    convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
    155                                filter_params_x, filter_params_y, subpel_x_qn,
    156                                subpel_y_qn, conv_params);
    157  } else {
    158    av1_convolve_2d_sr_specialized_avx2(src, src_stride, dst, dst_stride, w, h,
    159                                        filter_params_x, filter_params_y,
    160                                        subpel_x_qn, subpel_y_qn, conv_params);
    161  }
    162 #else
    163  convolve_2d_sr_general_avx2(src, src_stride, dst, dst_stride, w, h,
    164                              filter_params_x, filter_params_y, subpel_x_qn,
    165                              subpel_y_qn, conv_params);
    166 #endif
    167 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE