tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

cdef_block_avx2.c (15640B)


/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_avx2
#include "av1/common/cdef_block_simd.h"
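
/* cdef_block_simd.h carries the width-generic CDEF kernels; defining
   SIMD_FUNC before including it instantiates each of them with an _avx2
   suffix, so this file only has to supply the AVX2-specific helpers below. */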

/* partial A is a vector of 16-bit elements of the form:
   [x8 - - x1 | x16 - - x9] and partial B has the form:
   [0  y1 - y7 | 0 y9 - y15].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8
   constants are in const1 and const2. */
static inline __m256i fold_mul_and_sum_avx2(__m256i *partiala,
                                            __m256i *partialb,
                                            const __m256i *const1,
                                            const __m256i *const2) {
  // Mask used to shuffle the elements present in a 256-bit register.
  static const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908, 0x03020504,
                                             0x0f0e0100, 0x0b0a0d0c, 0x07060908,
                                             0x03020504, 0x0f0e0100 };
  __m256i tmp;
  /* Reverse partial B. */
  *partialb = _mm256_shuffle_epi8(
      *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));

  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = *partiala;
  *partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
  *partialb = _mm256_unpackhi_epi16(tmp, *partialb);

  /* Square and add the corresponding x and y values. */
  *partiala = _mm256_madd_epi16(*partiala, *partiala);
  *partialb = _mm256_madd_epi16(*partialb, *partialb);
  /* Multiply by constant. */
  *partiala = _mm256_mullo_epi32(*partiala, *const1);
  *partialb = _mm256_mullo_epi32(*partialb, *const2);
  /* Sum all results. */
  *partiala = _mm256_add_epi32(*partiala, *partialb);
  return *partiala;
}
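
/* Per 128-bit lane, the sequence above amounts to the scalar reduction
     for (k = 1; k <= 8; k++) sum += (x[k]*x[k] + y[k]*y[k]) * C[k];
   with y[8] taken as 0: _mm256_madd_epi16 forms x_k^2 + y_k^2 in one step
   because the interleave placed each x_k and y_k in adjacent 16-bit slots,
   and the closing add leaves four 32-bit partial sums per lane for
   hsum4_avx2() to reduce. */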

static inline __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2,
                                 __m256i *x3) {
  const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1);
  const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3);
  const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1);
  const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3);

  *x0 = _mm256_unpacklo_epi64(t0, t1);
  *x1 = _mm256_unpackhi_epi64(t0, t1);
  *x2 = _mm256_unpacklo_epi64(t2, t3);
  *x3 = _mm256_unpackhi_epi64(t2, t3);
  return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1),
                          _mm256_add_epi32(*x2, *x3));
}
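
/* The unpack ladder above is a 4x4 transpose of 32-bit elements within each
   128-bit lane, so slot j of the returned vector holds the horizontal sum of
   input xj for that lane. */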

/* Computes cost for directions 4, 5, 6 and 7. We can call this function
   again, after rotating the block, to compute the remaining directions. */
static inline __m256i compute_directions_avx2(__m256i *lines,
                                              int32_t cost_first_8x8[4],
                                              int32_t cost_second_8x8[4]) {
  __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  __m256i partial6;
  __m256i tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = _mm256_slli_si256(lines[0], 14);
  partial4b = _mm256_srli_si256(lines[0], 2);
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
  tmp = _mm256_add_epi16(lines[0], lines[1]);
  partial5a = _mm256_slli_si256(tmp, 10);
  partial5b = _mm256_srli_si256(tmp, 6);
  partial7a = _mm256_slli_si256(tmp, 4);
  partial7b = _mm256_srli_si256(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
  tmp = _mm256_add_epi16(lines[2], lines[3]);
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
  partial6 = _mm256_add_epi16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
  tmp = _mm256_add_epi16(lines[4], lines[5]);
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
  partial6 = _mm256_add_epi16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
  partial4a = _mm256_add_epi16(partial4a, lines[7]);
  tmp = _mm256_add_epi16(lines[6], lines[7]);
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
  partial6 = _mm256_add_epi16(partial6, tmp);

  const __m256i const_reg_1 =
      _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
  const __m256i const_reg_2 =
      _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
  const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
  const __m256i const_reg_4 =
      _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);
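  /* These weights are 840/n (840 is the least common multiple of 1..8):
     840, 420, 280, 210, 168, 140, 120, 105, matching partial sums built from
     n = 1..8 pixels, which puts every direction's cost on a comparable scale.
     The deferred division by 840 is approximated by the final >> 10 in
     cdef_find_dir_dual_avx2(). */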

  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
  partial7a =
      fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
  partial5a =
      fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
  partial6 = _mm256_madd_epi16(partial6, partial6);
  partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));

  partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
  _mm_storeu_si128((__m128i *)cost_first_8x8,
                   _mm256_castsi256_si128(partial4a));
  _mm_storeu_si128((__m128i *)cost_second_8x8,
                   _mm256_extractf128_si256(partial4a, 1));

  return partial4a;
}
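
/* The low 128 bits of the returned vector hold the four direction costs of
   the first 8x8 block and the high 128 bits those of the second, mirroring
   the two int32_t[4] arrays stored above. */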

/* Transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static inline void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
  const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
  const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]);
  const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]);
  const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
  const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
  const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]);
  const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]);
  const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);

  const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
  const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
  const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
  const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
  const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
  const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
  const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
  const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);

  res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1);
  res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1);
  res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3);
  res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3);
  res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5);
  res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5);
  res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7);
  res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7);
}

void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
                             int stride, int32_t *var_out_1st,
                             int32_t *var_out_2nd, int coeff_shift,
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  int32_t cost_first_8x8[8];
  int32_t cost_second_8x8[8];
  // Used to store the best cost for 2 8x8's.
  int32_t best_cost[2] = { 0 };
  // Best direction for 2 8x8's.
  int best_dir[2] = { 0 };

  const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
  const __m256i const_128_reg = _mm256_set1_epi16(128);
  __m256i lines[8];
  for (int i = 0; i < 8; i++) {
    const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
    const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);

    lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
    lines[i] = _mm256_sub_epi16(
        _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
  }
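
  /* Each lines[i] now holds row i of both blocks -- img1 in the low 128-bit
     lane and img2 in the high lane -- downshifted by coeff_shift and centered
     by subtracting 128, so both 8x8 searches share one 256-bit data path. */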

  /* Compute "mostly vertical" directions. */
  const __m256i dir47 =
      compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);

  /* Transpose and reverse the order of the lines. */
  array_reverse_transpose_8x8_avx2(lines, lines);

  /* Compute "mostly horizontal" directions. */
  const __m256i dir03 =
      compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);

  __m256i max = _mm256_max_epi32(dir03, dir47);
  max =
      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
                                            _mm256_slli_si256(max, 16 - (8))));
  max =
      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
                                            _mm256_slli_si256(max, 16 - (4))));

  const __m128i first_8x8_output = _mm256_castsi256_si128(max);
  const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
  const __m128i cmpeq_res_00 =
      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
  const __m128i cmpeq_res_01 =
      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
  const __m128i cmpeq_res_10 =
      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
  const __m128i cmpeq_res_11 =
      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
  const __m128i t_first_8x8 = _mm_packs_epi32(cmpeq_res_01, cmpeq_res_00);
  const __m128i t_second_8x8 = _mm_packs_epi32(cmpeq_res_11, cmpeq_res_10);

  best_cost[0] = _mm_cvtsi128_si32(first_8x8_output);
  best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
  best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
  best_dir[0] =
      get_msb(best_dir[0] ^ (best_dir[0] - 1));  // Count trailing zeros
  best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
  best_dir[1] =
      get_msb(best_dir[1] ^ (best_dir[1] - 1));  // Count trailing zeros
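
  /* _mm_movemask_epi8 yields a bitmask of the directions whose cost equals
     the maximum; best_dir ^ (best_dir - 1) keeps only the lowest set bit and
     the bits below it, so get_msb() of that value is the index of the first
     match, i.e. the number of trailing zeros. */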

  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
  *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];

  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var_out_1st >>= 10;
  *var_out_2nd >>= 10;
  *out_dir_1st_8x8 = best_dir[0];
  *out_dir_2nd_8x8 = best_dir[1];
}

void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  int j = 0;
  int remaining_width = width;
  assert(height % 2 == 0);
  assert(height > 0);
  assert(width > 0);
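
  /* The copy walks two rows at a time and narrows the chunk size in stages
     (32 -> 16 -> 8 -> 4 -> scalar), advancing j past each handled chunk so
     every tier only covers the width left over by the previous one. */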

  // Process 32 pixels at a time.
  if (remaining_width > 31) {
    int i = 0;
    do {
      j = 0;
      do {
        __m128i row00 =
            _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
        __m128i row01 = _mm_loadu_si128(
            (const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
        __m128i row10 =
            _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
        __m128i row11 = _mm_loadu_si128(
            (const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
                            _mm256_cvtepu8_epi16(row00));
        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
                            _mm256_cvtepu8_epi16(row01));
        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
                            _mm256_cvtepu8_epi16(row10));
        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
                            _mm256_cvtepu8_epi16(row11));
        j += 32;
      } while (j <= width - 32);
      i += 2;
    } while (i < height);
    remaining_width = width & 31;
  }

  // Process 16 pixels at a time.
  if (remaining_width > 15) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
                          _mm256_cvtepu8_epi16(row0));
      _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
                          _mm256_cvtepu8_epi16(row1));
      i += 2;
    } while (i < height);
    remaining_width = width & 15;
    j += 16;
  }

  // Process 8 pixels at a time.
  if (remaining_width > 7) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 7;
    j += 8;
  }

  // Process 4 pixels at a time.
  if (remaining_width > 3) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j]));
      __m128i row1 =
          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j]));
      _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 3;
    j += 4;
  }

  // Process the remaining pixels.
  if (remaining_width) {
    for (int i = 0; i < height; i++) {
      for (int k = j; k < width; k++) {
        dst[i * dstride + k] = src[i * sstride + k];
      }
    }
  }
}