tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

resize_avx2.c (36682B)


      1 /*
      2 * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #include <immintrin.h>
     12 #include <string.h>
     13 
     14 #include "config/av1_rtcd.h"
     15 
     16 #include "av1/common/resize.h"
     17 
     18 #include "aom_dsp/x86/synonyms.h"
     19 
     20 #define ROW_OFFSET 5
     21 #define CAST_HI(x) _mm256_castsi128_si256(x)
     22 #define CAST_LOW(x) _mm256_castsi256_si128(x)
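        // CAST_HI widens a 128-bit register to 256 bits (upper lane undefined) and
        // CAST_LOW takes the low 128-bit lane; both are register casts that emit no
        // instruction. ROW_OFFSET is how far the trailing 8-byte loads are pulled
        // back inside the row at the right frame edge; the loaded bytes are then
        // shifted back with _mm256_srli_si256 and re-padded through the
        // right-padding masks so no pixels past the frame are read.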
     23 
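        // Vertical pass for a 16-pixel-wide column strip: loads the next two input
        // rows (clamped to the last row near the bottom edge), interleaves them with
        // the rows already staged in s[], runs the 4-coefficient symmetric-even
        // convolution on both halves of the strip, then rounds, packs and clips the
        // result. res_8bit0 ends with the first output row in its low 128-bit lane
        // and the second output row in its high lane. Assumes l6..l11, s[],
        // coeffs_y, data, stride, i, height and the rounding/clip constants are in
        // scope.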
     24 #define PROCESS_RESIZE_Y_WD16                                               \
     25  const int idx1 = AOMMIN(height - 1, i + 5);                               \
     26  const int idx2 = AOMMIN(height - 1, i + 6);                               \
     27  l6 = l10;                                                                 \
     28  l7 = l11;                                                                 \
     29  l8 = _mm_loadu_si128((__m128i *)(data + idx1 * stride));                  \
     30  l9 = _mm_loadu_si128((__m128i *)(data + idx2 * stride));                  \
     31                                                                            \
     32  /* g0... g15 | i0... i15 */                                               \
     33  const __m256i s68 =                                                       \
     34      _mm256_permute2x128_si256(CAST_HI(l6), CAST_HI(l8), 0x20);            \
     35  /* h0... h15 | j0... j15 */                                               \
     36  const __m256i s79 =                                                       \
     37      _mm256_permute2x128_si256(CAST_HI(l7), CAST_HI(l9), 0x20);            \
     38                                                                            \
     39  /* g0h0... g7h7 | i0j0... i7j7 */                                        \
     40  s[3] = _mm256_unpacklo_epi8(s68, s79);                                    \
     41  /* g8h8... g15h15 | i8j8... i15j15 */                                    \
     42  s[8] = _mm256_unpackhi_epi8(s68, s79);                                    \
     43                                                                            \
     44  __m256i res_out[2] = { 0 };                                               \
     45  resize_convolve(s, coeffs_y, res_out);                                    \
     46                                                                            \
     47  /* r00... r07 */                                                          \
     48  __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits);   \
     49  /* r20... r27 */                                                          \
     50  __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits);   \
     51                                                                            \
     52  res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits);        \
     53  res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits);        \
     54                                                                            \
     55  __m256i res_out_b[2] = { 0 };                                             \
     56  resize_convolve(s + 5, coeffs_y, res_out_b);                              \
     57                                                                            \
     58  /* r08... r015 */                                                         \
     59  __m256i res_b_round_1 = _mm256_add_epi32(res_out_b[0], round_const_bits); \
     60  /* r28... r215 */                                                         \
     61  __m256i res_b_round_2 = _mm256_add_epi32(res_out_b[1], round_const_bits); \
     62  res_b_round_1 = _mm256_sra_epi32(res_b_round_1, round_shift_bits);        \
     63  res_b_round_2 = _mm256_sra_epi32(res_b_round_2, round_shift_bits);        \
     64                                                                            \
     65  /* r00... r03 r20... r23 | r04... r07 r24... r27 */                       \
     66  __m256i res_8bit0 = _mm256_packus_epi32(res_a_round_1, res_a_round_2);    \
     67  /* r08... r011 r28... r211 | r012... r015 r212... r215 */                 \
     68  __m256i res_8bit1 = _mm256_packus_epi32(res_b_round_1, res_b_round_2);    \
     69  /* r00... r07 | r20... r27 */                                             \
     70  res_8bit0 = _mm256_permute4x64_epi64(res_8bit0, 0xd8);                    \
     71  /* r08... r015 | r28... r215 */                                           \
     72  res_8bit1 = _mm256_permute4x64_epi64(res_8bit1, 0xd8);                    \
     73  /* r00... r015 | r20... r215 */                                           \
     74  res_8bit1 = _mm256_packus_epi16(res_8bit0, res_8bit1);                    \
     75  res_8bit0 = _mm256_min_epu8(res_8bit1, clip_pixel);                       \
     76  res_8bit0 = _mm256_max_epu8(res_8bit0, zero);
     77 
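        // 8-pixel-wide variant of the vertical pass: same scheme as above but with
        // 64-bit row loads and a single convolution; after packing, the first 8
        // bytes of each 128-bit lane of res_a_round_1 hold the two output rows.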
     78 #define PROCESS_RESIZE_Y_WD8                                              \
     79  const int idx1 = AOMMIN(height - 1, i + 5);                             \
     80  const int idx2 = AOMMIN(height - 1, i + 6);                             \
     81  l6 = l10;                                                               \
     82  l7 = l11;                                                               \
     83  l8 = _mm_loadl_epi64((__m128i *)(data + idx1 * stride));                \
     84  l9 = _mm_loadl_epi64((__m128i *)(data + idx2 * stride));                \
     85                                                                          \
     86  /* g0h0... g7h7 */                                                      \
     87  s67 = _mm_unpacklo_epi8(l6, l7);                                        \
     88  /* i0j0...i7j7 */                                                       \
     89  __m128i s89 = _mm_unpacklo_epi8(l8, l9);                                \
     90                                                                          \
     91  /* g0h0...g7h7 | i0j0...i7j7 */                                         \
     92  s[3] = _mm256_permute2x128_si256(CAST_HI(s67), CAST_HI(s89), 0x20);     \
     93                                                                          \
     94  __m256i res_out[2] = { 0 };                                             \
     95  resize_convolve(s, coeffs_y, res_out);                                  \
     96                                                                          \
     97  /* r00... r07 */                                                        \
     98  __m256i res_a_round_1 = _mm256_add_epi32(res_out[0], round_const_bits); \
     99  /* r20...r27 */                                                         \
    100  __m256i res_a_round_2 = _mm256_add_epi32(res_out[1], round_const_bits); \
    101  res_a_round_1 = _mm256_sra_epi32(res_a_round_1, round_shift_bits);      \
    102  res_a_round_2 = _mm256_sra_epi32(res_a_round_2, round_shift_bits);      \
    103                                                                          \
    104  /* r00...r03 r20...r23 | r04...r07 r24...r27 */                         \
    105  res_a_round_1 = _mm256_packus_epi32(res_a_round_1, res_a_round_2);      \
    106  /* r00...r07 | r20...r27 */                                             \
    107  res_a_round_1 = _mm256_permute4x64_epi64(res_a_round_1, 0xd8);          \
    108  res_a_round_1 = _mm256_packus_epi16(res_a_round_1, res_a_round_1);      \
    109  res_a_round_1 = _mm256_min_epu8(res_a_round_1, clip_pixel);             \
    110  res_a_round_1 = _mm256_max_epu8(res_a_round_1, zero);
    111 
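        // Horizontal pass for 32 input pixels of two adjacent rows: loads 32 + 8
        // pixels per row, replicates frame-edge pixels into the padding positions
        // with the shuffle masks, builds the byte-shifted source vectors for the
        // even output phases (steps of 2 input pixels), convolves, and stores 16
        // output pixels per row into intbuf. Assumes input, in_stride, intbuf,
        // dst_stride, i, j, filtered_length, filter_offset, row_offset, the padding
        // masks and the rounding/clip constants are in scope.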
    112 #define PROCESS_RESIZE_X_WD32                                                  \
    113  /* a0 a1 ..... a30 a31 */                                                    \
    114  __m256i row0 = _mm256_loadu_si256(                                           \
    115      (__m256i *)&input[i * in_stride + j - filter_offset]);                   \
    116  /* b0 b1 ..... b30 b31 */                                                    \
    117  __m256i row1 = _mm256_loadu_si256(                                           \
    118      (__m256i *)&input[(i + 1) * in_stride + j - filter_offset]);             \
    119  /* a0 .... a15 || b0.... b15 */                                              \
    120  __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);                    \
    121  /* a16 .... a31 || b16 .... b31 */                                           \
    122  __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);                    \
    123  filter_offset = 3;                                                           \
    124                                                                               \
    125  /* Pad start pixels to the left, while processing the first pixels in the    \
    126   * row. */                                                                   \
    127  if (j == 0) {                                                                \
    128    /* a0 a0 a0 a0 .... a12 || b0 b0 b0 b0 .... b12 */                         \
    129    row0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask);                       \
    130    /* a13 a14 a15 a16.....a28 || b13 b14 b15 b16.....b28 */                   \
    131    row1 = _mm256_alignr_epi8(r1, r0, 13);                                     \
    132    r0 = row0;                                                                 \
    133    r1 = row1;                                                                 \
    134  }                                                                            \
    135  const int is_last_cols32 = (j + 32 == filtered_length);                      \
    136  /* Avoid loading extra pixels at frame boundary.*/                           \
    137  if (is_last_cols32) row_offset = ROW_OFFSET;                                 \
    138  /* a29 a30 a31 a32 a33 a34 a35 a36 0 0 ....*/                                \
    139  __m128i row0_0 = _mm_loadl_epi64(                                            \
    140      (__m128i *)&input[i * in_stride + 32 + j - filter_offset - row_offset]); \
    141  /* b29 b30 b31 b32 b33 b34 b35 b36 0 0 .... */                               \
    142  __m128i row1_0 =                                                             \
    143      _mm_loadl_epi64((__m128i *)&input[(i + 1) * in_stride + 32 + j -         \
    144                                        filter_offset - row_offset]);          \
    145  __m256i r2 = _mm256_permute2x128_si256(                                      \
    146      _mm256_castsi128_si256(row0_0), _mm256_castsi128_si256(row1_0), 0x20);   \
    147                                                                               \
    148  /* Pad end pixels to the right, while processing the last pixels in the      \
    149   * row. */                                                                   \
    150  if (is_last_cols32) {                                                        \
    151    r2 = _mm256_shuffle_epi8(_mm256_srli_si256(r2, ROW_OFFSET),                \
    152                             wd32_end_pad_mask);                               \
    153  }                                                                            \
    154                                                                               \
    155  /* Process even pixels of the first row  */                                  \
    156  /* a0 a0 a0 a0 a1 a2 .... a12 | b0 b0 b0 b0 b1 b2 .... b12 */                \
    157  s0[0] = _mm256_alignr_epi8(r1, r0, 0);                                       \
    158  /* a0 a0 a1 a2 a3 a4 .... a14 | b0 b0 b1 b2 b3 b4 .... b14 */                \
    159  s0[1] = _mm256_alignr_epi8(r1, r0, 2);                                       \
    160  /* a1 a2 a3 a4 a5 a6 .... a16 | b1 b2 b3 b4 b5 b6 .... b16 */                \
    161  s0[2] = _mm256_alignr_epi8(r1, r0, 4);                                       \
    162  /* a3 a4 a5 a6 a7 a8 .... a18 | b3 b4 b5 b6 b7 b8 .... b18 */                \
    163  s0[3] = _mm256_alignr_epi8(r1, r0, 6);                                       \
    164                                                                               \
    165  /* Process even pixels of the second row  */                                 \
    166  /* a13 a14 a15 a16  ..... a28 | b13 b14 b15 b16 ..... b28 */                 \
    167  s1[0] = _mm256_alignr_epi8(r2, r1, 0);                                       \
    168  /* a15 a16 a17 a18  ..... a30 | b15 b16 b17 b18 ..... b30 */                 \
    169  s1[1] = _mm256_alignr_epi8(r2, r1, 2);                                       \
    170  /* a17 a18 a19 a20  ..... a32 | b17 b18 b19 b20 ..... b32 */                 \
    171  s1[2] = _mm256_alignr_epi8(r2, r1, 4);                                       \
    172  /* a19 a20 a21 a22  ..... a34 | b19 b20 b21 b22 ..... b34 */                 \
    173  s1[3] = _mm256_alignr_epi8(r2, r1, 6);                                       \
    174                                                                               \
    175  /* The register res_out_0 stores the result of start-16 pixels corresponding \
    176   * to the first and second rows whereas res_out_1 stores the end-16          \
    177   * pixels. */                                                                \
    178  __m256i res_out_0[2], res_out_1[2];                                          \
    179  res_out_1[0] = res_out_1[1] = zero;                                          \
    180  res_out_0[0] = res_out_0[1] = zero;                                          \
    181  resize_convolve(s0, coeffs_x, res_out_0);                                    \
    182  resize_convolve(s1, coeffs_x, res_out_1);                                    \
    183                                                                               \
    184  /* Result of 32 pixels of row0 (a0 to a32) */                                \
    185  res_out_0[0] = _mm256_sra_epi32(                                             \
    186      _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);     \
    187  res_out_1[0] = _mm256_sra_epi32(                                             \
    188      _mm256_add_epi32(res_out_1[0], round_const_bits), round_shift_bits);     \
    189  /* r00-r03 r08-r011 | r04-r07 r012-r015 */                                   \
    190  __m256i res_out_r0 = _mm256_packus_epi32(res_out_0[0], res_out_1[0]);        \
    191                                                                               \
    192  /* Result of 32 pixels of row1 (b0 to b32) */                                \
    193  res_out_0[1] = _mm256_sra_epi32(                                             \
    194      _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);     \
    195  res_out_1[1] = _mm256_sra_epi32(                                             \
    196      _mm256_add_epi32(res_out_1[1], round_const_bits), round_shift_bits);     \
    197  /* r10-r13 r18-r111 | r14-r17 r112-r115 */                                   \
    198  __m256i res_out_r1 = _mm256_packus_epi32(res_out_0[1], res_out_1[1]);        \
    199                                                                               \
    200  /* Convert the result from 16bit to 8bit */                                  \
    201  /* r00-r03 r08-r011 r10-r13 r18-r111 | r04-r07 r012-r015 r14-r17 r112-r115   \
    202   */                                                                          \
    203  __m256i res_out_r01 = _mm256_packus_epi16(res_out_r0, res_out_r1);           \
    204  __m256i res_out_row01 = _mm256_min_epu8(res_out_r01, clip_pixel);            \
    205  res_out_row01 = _mm256_max_epu8(res_out_row01, zero);                        \
    206  __m128i low_128 = CAST_LOW(res_out_row01);                                   \
    207  __m128i high_128 = _mm256_extracti128_si256(res_out_row01, 1);               \
    208                                                                               \
    209  _mm_storeu_si128((__m128i *)&intbuf[i * dst_stride + j / 2],                 \
    210                   _mm_unpacklo_epi32(low_128, high_128));                     \
    211  _mm_storeu_si128((__m128i *)&intbuf[(i + 1) * dst_stride + j / 2],           \
    212                   _mm_unpackhi_epi32(low_128, high_128));
    213 
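        // Core multiply-accumulate step: each s[k] holds interleaved pairs of input
        // samples and each coeffs[k] the matching pair of filter taps, so
        // _mm256_maddubs_epi16 (unsigned samples x signed taps) yields two-tap
        // partial sums per element. The partial sums are widened to 32 bits before
        // the final additions; see the overflow note inside the function.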
    214 static inline void resize_convolve(const __m256i *const s,
    215                                   const __m256i *const coeffs,
    216                                   __m256i *res_out) {
    217  const __m256i res_0 = _mm256_maddubs_epi16(s[0], coeffs[0]);
    218  const __m256i res_1 = _mm256_maddubs_epi16(s[1], coeffs[1]);
    219  const __m256i res_2 = _mm256_maddubs_epi16(s[2], coeffs[2]);
    220  const __m256i res_3 = _mm256_maddubs_epi16(s[3], coeffs[3]);
    221 
    222  const __m256i dst_0 = _mm256_add_epi16(res_0, res_1);
    223  const __m256i dst_1 = _mm256_add_epi16(res_2, res_3);
    224  // The sum of the convolve operation can exceed the signed 16-bit range, so
    225  // the final accumulation is done in 32 bits.
    226  const __m256i dst_00 = _mm256_cvtepi16_epi32(CAST_LOW(dst_0));
    227  const __m256i dst_01 =
    228      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_0, 1));
    229  const __m256i dst_10 = _mm256_cvtepi16_epi32(CAST_LOW(dst_1));
    230  const __m256i dst_11 =
    231      _mm256_cvtepi16_epi32(_mm256_extracti128_si256(dst_1, 1));
    232 
    233  res_out[0] = _mm256_add_epi32(dst_00, dst_10);
    234  res_out[1] = _mm256_add_epi32(dst_01, dst_11);
    235 }
    236 
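        // Packs the 4 half-filter taps into byte pairs laid out for
        // _mm256_maddubs_epi16: coeffs[2] = {f0,f1}, coeffs[3] = {f2,f3},
        // coeffs[1] = {f1,f0}, coeffs[0] = {f3,f2}, each pair broadcast across the
        // register. Combined with the interleaved sources built by the callers this
        // computes, per output sample x (a sketch of the scalar down2_symeven path;
        // exact edge handling differs):
        //   out[x] = clip_u8((f3*(in[2x-3] + in[2x+4]) + f2*(in[2x-2] + in[2x+3]) +
        //                     f1*(in[2x-1] + in[2x+2]) + f0*(in[2x]   + in[2x+1]) +
        //                     (1 << (FILTER_BITS - 1))) >> FILTER_BITS)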
    237 static inline void prepare_filter_coeffs(const int16_t *filter,
    238                                         __m256i *const coeffs /* [4] */) {
    239  // f0 f1 f2 f3 x x x x
    240  const __m128i sym_even_filter = _mm_loadl_epi64((__m128i *)filter);
    241  // f0 f1 f2 f3 f0 f1 f2 f3
    242  const __m128i tmp0 = _mm_shuffle_epi32(sym_even_filter, 0x44);
    243  // f0 f1 f2 f3 f1 f0 f3 f2
    244  const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, 0xb1);
    245 
    246  const __m128i filter_8bit = _mm_packs_epi16(tmp1, tmp1);
    247 
    248  // f0 f1 f0 f1 ..
    249  coeffs[2] = _mm256_broadcastw_epi16(filter_8bit);
    250  // f2 f3 f2 f3 ..
    251  coeffs[3] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 2));
    252  // f3 f2 f3 f2 ..
    253  coeffs[0] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 6));
    254  // f1 f0 f1 f0 ..
    255  coeffs[1] = _mm256_broadcastw_epi16(_mm_bsrli_si128(filter_8bit, 4));
    256 }
    257 
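        // Vertical 2:1 downscale of intbuf (width 'stride', 'height' rows) into
        // 'output' ('height2' rows), starting at column start_col. The main loop
        // covers 16-pixel column strips, an 8-pixel strip handles part of any
        // remainder, and whatever columns are left are delegated to the C version,
        // whose status is returned.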
    258 bool av1_resize_vert_dir_avx2(uint8_t *intbuf, uint8_t *output, int out_stride,
    259                              int height, int height2, int stride,
    260                              int start_col) {
    261  assert(start_col <= stride);
    262  // For the GM tool, the input layer height or width is guaranteed to be an even
    263  // number. Hence the function 'down2_symodd()' is not invoked and SIMD
    264  // optimization of the same is not implemented.
    265  // When the input height is less than 8 and even, the potential input
    266  // heights are limited to 2, 4, or 6. These scenarios require separate
    267  // handling due to padding requirements. Invoking the C function here will
    268  // eliminate the need for conditional statements within the subsequent SIMD
    269  // code to manage these cases.
    270  if (height & 1 || height < 8) {
    271    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
    272                                 stride, start_col);
    273  }
    274 
    275  __m256i s[10], coeffs_y[4];
    276  const int bits = FILTER_BITS;
    277 
    278  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
    279  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
    280  const uint8_t max_pixel = 255;
    281  const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
    282  const __m256i zero = _mm256_setzero_si256();
    283 
    284  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_y);
    285 
    286  const int num_col16 = stride / 16;
    287  int remain_col = stride % 16;
    288  // The core vertical SIMD processes 4 input rows simultaneously to generate
    289  // output corresponding to 2 rows. To streamline the core loop and eliminate
    290  // the need for conditional checks, the remaining rows (4 or 6) are processed
    291  // separately.
    292  const int remain_row = (height % 4 == 0) ? 4 : 6;
    293 
    294  for (int j = start_col; j < stride - remain_col; j += 16) {
    295    const uint8_t *data = &intbuf[j];
    296    const __m128i l3 = _mm_loadu_si128((__m128i *)(data + 0 * stride));
    297    // Pad the 3 rows above the frame by replicating the top-most row.
    298    const __m128i l0 = l3;
    299    const __m128i l1 = l3;
    300    const __m128i l2 = l3;
    301    const __m128i l4 = _mm_loadu_si128((__m128i *)(data + 1 * stride));
    302 
    303    __m128i l6, l7, l8, l9;
    304    __m128i l5 = _mm_loadu_si128((__m128i *)(data + 2 * stride));
    305    __m128i l10 = _mm_loadu_si128((__m128i *)(data + 3 * stride));
    306    __m128i l11 = _mm_loadu_si128((__m128i *)(data + 4 * stride));
    307 
    308    // a0...a15 | c0...c15
    309    const __m256i s02 =
    310        _mm256_permute2x128_si256(CAST_HI(l0), CAST_HI(l2), 0x20);
    311    // b0...b15 | d0...d15
    312    const __m256i s13 =
    313        _mm256_permute2x128_si256(CAST_HI(l1), CAST_HI(l3), 0x20);
    314    // c0...c15 | e0...e15
    315    const __m256i s24 =
    316        _mm256_permute2x128_si256(CAST_HI(l2), CAST_HI(l4), 0x20);
    317    // d0...d15 | f0...f15
    318    const __m256i s35 =
    319        _mm256_permute2x128_si256(CAST_HI(l3), CAST_HI(l5), 0x20);
    320    // e0...e15 | g0...g15
    321    const __m256i s46 =
    322        _mm256_permute2x128_si256(CAST_HI(l4), CAST_HI(l10), 0x20);
    323    // f0...f15 | h0...h15
    324    const __m256i s57 =
    325        _mm256_permute2x128_si256(CAST_HI(l5), CAST_HI(l11), 0x20);
    326 
    327    // a0b0...a7b7 | c0d0...c7d7
    328    s[0] = _mm256_unpacklo_epi8(s02, s13);
    329    // c0d0...c7d7 | e0f0...e7f7
    330    s[1] = _mm256_unpacklo_epi8(s24, s35);
    331    // e0f0...e7f7 | g0h0...g7h7
    332    s[2] = _mm256_unpacklo_epi8(s46, s57);
    333 
    334    // a8b8...a15b15 | c8d8...c15d15
    335    s[5] = _mm256_unpackhi_epi8(s02, s13);
    336    // c8d8...c15d15 | e8f8...e15f15
    337    s[6] = _mm256_unpackhi_epi8(s24, s35);
    338    // e8f8...e15f15 | g8h8...g15h15
    339    s[7] = _mm256_unpackhi_epi8(s46, s57);
    340 
    341    // height to be processed here
    342    const int process_ht = height - remain_row;
    343    for (int i = 0; i < process_ht; i += 4) {
    344      PROCESS_RESIZE_Y_WD16
    345 
    346      _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
    347                       CAST_LOW(res_8bit0));
    348 
    349      _mm_storeu_si128(
    350          (__m128i *)&output[(i / 2) * out_stride + j + out_stride],
    351          _mm256_extracti128_si256(res_8bit0, 1));
    352 
    353      // Load the required data for processing of next 4 input rows.
    354      const int idx7 = AOMMIN(height - 1, i + 7);
    355      const int idx8 = AOMMIN(height - 1, i + 8);
    356      l10 = _mm_loadu_si128((__m128i *)(data + idx7 * stride));
    357      l11 = _mm_loadu_si128((__m128i *)(data + idx8 * stride));
    358 
    359      const __m256i s810 =
    360          _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
    361      const __m256i s911 =
    362          _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
    363      // i0j0... i7j7 | k0l0... k7l7
    364      s[4] = _mm256_unpacklo_epi8(s810, s911);
    365      // i8j8... i15j15 | k8l8... k15l15
    366      s[9] = _mm256_unpackhi_epi8(s810, s911);
    367 
    368      s[0] = s[2];
    369      s[1] = s[3];
    370      s[2] = s[4];
    371 
    372      s[5] = s[7];
    373      s[6] = s[8];
    374      s[7] = s[9];
    375    }
    376 
    377    // Process the remaining last 4 or 6 rows here.
    378    int i = process_ht;
    379    while (i < height - 1) {
    380      PROCESS_RESIZE_Y_WD16
    381 
    382      _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
    383                       CAST_LOW(res_8bit0));
    384      i += 2;
    385 
    386      const int is_store_valid = (i < height - 1);
    387      if (is_store_valid)
    388        _mm_storeu_si128((__m128i *)&output[(i / 2) * out_stride + j],
    389                         _mm256_extracti128_si256(res_8bit0, 1));
    390      i += 2;
    391 
    392      // Check if there is any remaining height to process. If so, perform the
    393      // necessary data loading for processing the next row.
    394      if (i < height - 1) {
    395        l10 = l11 = l9;
    396        const __m256i s810 =
    397            _mm256_permute2x128_si256(CAST_HI(l8), CAST_HI(l10), 0x20);
    398        const __m256i s911 =
    399            _mm256_permute2x128_si256(CAST_HI(l9), CAST_HI(l11), 0x20);
    400        // i0j0... i7j7 | k0l0... k7l7
    401        s[4] = _mm256_unpacklo_epi8(s810, s911);
    402        // i8j8... i15j15 | k8l8... k15l15
    403        s[9] = _mm256_unpackhi_epi8(s810, s911);
    404 
    405        s[0] = s[2];
    406        s[1] = s[3];
    407        s[2] = s[4];
    408 
    409        s[5] = s[7];
    410        s[6] = s[8];
    411        s[7] = s[9];
    412      }
    413    }
    414  }
    415 
    416  if (remain_col > 7) {
    417    const int processed_wd = num_col16 * 16;
    418    remain_col = stride % 8;
    419 
    420    const uint8_t *data = &intbuf[processed_wd];
    421 
    422    const __m128i l3 = _mm_loadl_epi64((__m128i *)(data + 0 * stride));
    423    // Pad the 3 rows above the frame by replicating the top-most row.
    424    const __m128i l0 = l3;
    425    const __m128i l1 = l3;
    426    const __m128i l2 = l3;
    427    const __m128i l4 = _mm_loadl_epi64((__m128i *)(data + 1 * stride));
    428 
    429    __m128i l6, l7, l8, l9;
    430    __m128i l5 = _mm_loadl_epi64((__m128i *)(data + 2 * stride));
    431    __m128i l10 = _mm_loadl_epi64((__m128i *)(data + 3 * stride));
    432    __m128i l11 = _mm_loadl_epi64((__m128i *)(data + 4 * stride));
    433 
    434    // a0b0...a7b7
    435    const __m128i s01 = _mm_unpacklo_epi8(l0, l1);
    436    // c0d0...c7d7
    437    const __m128i s23 = _mm_unpacklo_epi8(l2, l3);
    438    // e0f0...e7f7
    439    const __m128i s45 = _mm_unpacklo_epi8(l4, l5);
    440    // g0h0...g7h7
    441    __m128i s67 = _mm_unpacklo_epi8(l10, l11);
    442 
    443    // a0b0...a7b7 | c0d0...c7d7
    444    s[0] = _mm256_permute2x128_si256(CAST_HI(s01), CAST_HI(s23), 0x20);
    445    // c0d0...c7d7 | e0f0...e7f7
    446    s[1] = _mm256_permute2x128_si256(CAST_HI(s23), CAST_HI(s45), 0x20);
    447    // e0f0...e7f7 | g0h0...g7h7
    448    s[2] = _mm256_permute2x128_si256(CAST_HI(s45), CAST_HI(s67), 0x20);
    449 
    450    // height to be processed here
    451    const int process_ht = height - remain_row;
    452    for (int i = 0; i < process_ht; i += 4) {
    453      PROCESS_RESIZE_Y_WD8
    454 
    455      _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
    456                       CAST_LOW(res_a_round_1));
    457 
    458      _mm_storel_epi64(
    459          (__m128i *)&output[(i / 2) * out_stride + processed_wd + out_stride],
    460          _mm256_extracti128_si256(res_a_round_1, 1));
    461 
    462      const int idx7 = AOMMIN(height - 1, i + 7);
    463      const int idx8 = AOMMIN(height - 1, i + 8);
    464      l10 = _mm_loadl_epi64((__m128i *)(data + idx7 * stride));
    465      l11 = _mm_loadl_epi64((__m128i *)(data + idx8 * stride));
    466 
    467      // k0l0... k7l7
    468      const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
    469      // i0j0... i7j7 | k0l0... k7l7
    470      s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);
    471 
    472      s[0] = s[2];
    473      s[1] = s[3];
    474      s[2] = s[4];
    475    }
    476 
    477    // Process the remaining last 4 or 6 rows here.
    478    int i = process_ht;
    479    while (i < height - 1) {
    480      PROCESS_RESIZE_Y_WD8
    481 
    482      _mm_storel_epi64((__m128i *)&output[(i / 2) * out_stride + processed_wd],
    483                       CAST_LOW(res_a_round_1));
    484 
    485      i += 2;
    486 
    487      const int is_store_valid = (i < height - 1);
    488      if (is_store_valid)
    489        _mm_storel_epi64(
    490            (__m128i *)&output[(i / 2) * out_stride + processed_wd],
    491            _mm256_extracti128_si256(res_a_round_1, 1));
    492      i += 2;
    493 
    494      // Check whether any rows remain to be processed. If so, load the data
    495      // required for the next iteration.
    496      if (i < height - 1) {
    497        l10 = l11 = l9;
    498        // k0l0... k7l7
    499        const __m128i s10s11 = _mm_unpacklo_epi8(l10, l11);
    500        // i0j0... i7j7 | k0l0... k7l7
    501        s[4] = _mm256_permute2x128_si256(CAST_HI(s89), CAST_HI(s10s11), 0x20);
    502 
    503        s[0] = s[2];
    504        s[1] = s[3];
    505        s[2] = s[4];
    506      }
    507    }
    508  }
    509 
    510  if (remain_col)
    511    return av1_resize_vert_dir_c(intbuf, output, out_stride, height, height2,
    512                                 stride, stride - remain_col);
    513 
    514  return true;
    515 }
    516 
    517 // Shuffle masks used by the 32- and 8-pixel wide paths to replicate edge
    518 // pixels where left or right padding is required.
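        // wd32_left_padding_mask replicates byte 0 into the three positions standing
        // in for samples left of the frame (plus the real first sample); the right
        // padding masks replicate the last valid sample, which the ROW_OFFSET shift
        // has placed at a known byte, into every position past the right edge.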
    519 static const uint8_t wd32_left_padding_mask[32] = { 0, 0, 0, 0, 1, 2,  3,  4,
    520                                                    5, 6, 7, 8, 9, 10, 11, 12,
    521                                                    0, 0, 0, 0, 1, 2,  3,  4,
    522                                                    5, 6, 7, 8, 9, 10, 11, 12 };
    523 
    524 static const uint8_t wd32_right_padding_mask[32] = { 0, 1, 2, 2, 2, 2, 2, 2,
    525                                                     2, 2, 2, 2, 2, 2, 2, 2,
    526                                                     0, 1, 2, 2, 2, 2, 2, 2,
    527                                                     2, 2, 2, 2, 2, 2, 2, 2 };
    528 
    529 static const uint8_t wd8_right_padding_mask[32] = {
    530  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10,
    531  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 10, 10, 10, 10, 10
    532 };
    533 
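        // Horizontal 2:1 downscale: each pair of input rows ('filtered_length'
        // pixels wide) is filtered into two rows of 'width2' pixels in intbuf.
        // Widths below 32 are handed to the SSE2 version; otherwise the 32-pixel
        // core loop runs, followed by 16- and 8-pixel tail paths and a scalar
        // down2_symeven() fallback for any columns that remain.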
    534 void av1_resize_horz_dir_avx2(const uint8_t *const input, int in_stride,
    535                              uint8_t *intbuf, int height, int filtered_length,
    536                              int width2) {
    537  assert(height % 2 == 0);
    538  // Invoke SSE2 for width less than 32.
    539  if (filtered_length < 32) {
    540    av1_resize_horz_dir_sse2(input, in_stride, intbuf, height, filtered_length,
    541                             width2);
    542    return;
    543  }
    544 
    545  const int filt_length = sizeof(av1_down2_symeven_half_filter);
    546  assert(filt_length % 2 == 0);
    547  (void)filt_length;
    548 
    549  __m256i s0[4], s1[4], coeffs_x[4];
    550 
    551  const int bits = FILTER_BITS;
    552  const int dst_stride = width2;
    553  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
    554  const __m256i round_const_bits = _mm256_set1_epi32((1 << bits) >> 1);
    555 
    556  const uint8_t max_pixel = 255;
    557  const __m256i clip_pixel = _mm256_set1_epi8((char)max_pixel);
    558  const __m256i zero = _mm256_setzero_si256();
    559 
    560  const __m256i wd32_start_pad_mask =
    561      _mm256_loadu_si256((__m256i *)wd32_left_padding_mask);
    562  const __m256i wd32_end_pad_mask =
    563      _mm256_loadu_si256((__m256i *)wd32_right_padding_mask);
    564  const __m256i wd8_end_pad_mask =
    565      _mm256_loadu_si256((__m256i *)wd8_right_padding_mask);
    566  prepare_filter_coeffs(av1_down2_symeven_half_filter, coeffs_x);
    567 
    568  // The core horizontal SIMD processes 32 input pixels from 2 rows at a time,
    569  // producing 16 output pixels for each of the 2 rows. To streamline the core
    570  // loop and eliminate the need for conditional checks, the remaining columns
    571  // (16 or 8) are processed separately.
    572  if (filtered_length % 32 == 0) {
    573    for (int i = 0; i < height; i += 2) {
    574      int filter_offset = 0;
    575      int row_offset = 0;
    576      for (int j = 0; j < filtered_length; j += 32) {
    577        PROCESS_RESIZE_X_WD32
    578      }
    579    }
    580  } else {
    581    for (int i = 0; i < height; i += 2) {
    582      int filter_offset = 0;
    583      int remain_col = filtered_length;
    584      int row_offset = 0;
    585      // To avoid pixel over-read at the frame boundary, processing of 32
    586      // pixels in the core loop is done only when a sufficient number of
    587      // pixels is available for the load. The remaining pixels are processed
    588      // separately.
    589      for (int j = 0; j <= filtered_length - 32; j += 32) {
    590        if (remain_col == 34 || remain_col == 36) {
    591          break;
    592        }
    593        PROCESS_RESIZE_X_WD32
    594        remain_col -= 32;
    595      }
    596 
    597      int wd_processed = filtered_length - remain_col;
    598      // To avoid pixel over-read at the frame boundary, processing of 16
    599      // pixels is done only when a sufficient number of pixels is available
    600      // for the load. The remaining pixels are processed separately.
    601      if (remain_col > 15 && remain_col != 18 && remain_col != 20) {
    602        remain_col = filtered_length - wd_processed - 16;
    603        const int in_idx = i * in_stride + wd_processed;
    604        const int out_idx = (i * dst_stride) + wd_processed / 2;
    605        // a0 a1 --- a15
    606        __m128i row0 =
    607            _mm_loadu_si128((__m128i *)&input[in_idx - filter_offset]);
    608        // b0 b1 --- b15
    609        __m128i row1 = _mm_loadu_si128(
    610            (__m128i *)&input[in_idx + in_stride - filter_offset]);
    611        // a0 a1 --- a15 || b0 b1 --- b15
    612        __m256i r0 =
    613            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
    614        if (filter_offset == 0) {
    615          r0 = _mm256_shuffle_epi8(r0, wd32_start_pad_mask);
    616        }
    617        filter_offset = 3;
    618        const int is_last_cols16 = wd_processed + 16 == filtered_length;
    619        if (is_last_cols16) row_offset = ROW_OFFSET;
    620 
    621        // a16 a17 --- a23
    622        row0 = _mm_loadl_epi64(
    623            (__m128i *)&input[in_idx + 16 - row_offset - filter_offset]);
    624        // b16 b17 --- b23
    625        row1 = _mm_loadl_epi64((__m128i *)&input[in_idx + 16 + in_stride -
    626                                                 row_offset - filter_offset]);
    627 
    628        // a16-a23 x x x x| b16-b23 x x x x
    629        __m256i r1 =
    630            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
    631 
    632        // Pad end pixels to the right, while processing the last pixels in the
    633        // row.
    634        if (is_last_cols16) {
    635          r1 = _mm256_shuffle_epi8(_mm256_srli_si256(r1, ROW_OFFSET),
    636                                   wd32_end_pad_mask);
    637        }
    638 
    639        // a0 a1 --- a15 || b0 b1 --- b15
    640        s0[0] = r0;
    641        // a2 a3 --- a17 || b2 b3 --- b17
    642        s0[1] = _mm256_alignr_epi8(r1, r0, 2);
    643        // a4 a5 --- a19 || b4 b5 --- b19
    644        s0[2] = _mm256_alignr_epi8(r1, r0, 4);
    645        // a6 a7 --- a21 || b6 b7 --- b21
    646        s0[3] = _mm256_alignr_epi8(r1, r0, 6);
    647 
    648        // result for 16 pixels (a0 to a15) of row0 and row1
    649        __m256i res_out_0[2];
    650        res_out_0[0] = res_out_0[1] = zero;
    651        resize_convolve(s0, coeffs_x, res_out_0);
    652 
    653        // r00-r07
    654        res_out_0[0] = _mm256_sra_epi32(
    655            _mm256_add_epi32(res_out_0[0], round_const_bits), round_shift_bits);
    656        // r10-r17
    657        res_out_0[1] = _mm256_sra_epi32(
    658            _mm256_add_epi32(res_out_0[1], round_const_bits), round_shift_bits);
    659        // r00-r03 r10-r13 r04-r07 r14-r17
    660        __m256i res_out_row01 = _mm256_packus_epi32(res_out_0[0], res_out_0[1]);
    661        // r00-r03 r10-r13 r00-r03 r10-r13 | r04-r07 r14-r17 r04-r07 r14-r17
    662        res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
    663        res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
    664        res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
    665        // r00-r03 r10-r13 r04-r07 r14-r17
    666        __m128i low_result =
    667            CAST_LOW(_mm256_permute4x64_epi64(res_out_row01, 0xd8));
    668        // r00-r03 r04-r07 r10-r13 r14-r17
    669        low_result = _mm_shuffle_epi32(low_result, 0xd8);
    670 
    671        _mm_storel_epi64((__m128i *)&intbuf[out_idx], low_result);
    672        _mm_storel_epi64((__m128i *)&intbuf[out_idx + dst_stride],
    673                         _mm_unpackhi_epi64(low_result, low_result));
    674      }
    675 
    676      // To avoid pixel over-read at the frame boundary, processing of 8
    677      // pixels is done only when a sufficient number of pixels is available
    678      // for the load. The remaining pixels are processed by the C function.
    679      wd_processed = filtered_length - remain_col;
    680      if (remain_col > 7 && remain_col != 10 && remain_col != 12) {
    681        remain_col = filtered_length - wd_processed - 8;
    682        const int in_idx = i * in_stride + wd_processed - filter_offset;
    683        const int out_idx = (i * dst_stride) + wd_processed / 2;
    684        const int is_last_cols_8 = wd_processed + 8 == filtered_length;
    685        if (is_last_cols_8) row_offset = ROW_OFFSET;
    686        // a0 a1 --- a15
    687        __m128i row0 = _mm_loadu_si128((__m128i *)&input[in_idx - row_offset]);
    688        // b0 b1 --- b15
    689        __m128i row1 =
    690            _mm_loadu_si128((__m128i *)&input[in_idx + in_stride - row_offset]);
    691        // a0 a1 --- a15 || b0 b1 --- b15
    692        __m256i r0 =
    693            _mm256_permute2x128_si256(CAST_HI(row0), CAST_HI(row1), 0x20);
    694 
    695        // Pad end pixels to the right, while processing the last pixels in the
    696        // row.
    697        if (is_last_cols_8)
    698          r0 = _mm256_shuffle_epi8(_mm256_srli_si256(r0, ROW_OFFSET),
    699                                   wd8_end_pad_mask);
    700 
    701        // a0 a1 a2 a3 a4 a5 a6 a7 | b0 b1 b2 b3 b4 b5 b6 b7
    702        s0[0] = r0;
    703        // a2 a3 a4 a5 a6 a7 a8 a9 | b2 b3 b4 b5 b6 b7 b8 b9
    704        s0[1] = _mm256_bsrli_epi128(r0, 2);
    705        // a4 a5 a6 a7 a8 a9 a10 a10 |  b4 b5 b6 b7 b8 b9 b10 b10
    706        s0[2] = _mm256_bsrli_epi128(r0, 4);
    707        // a6 a7 a8 a9 a10 a10 a10 a10 | b6 b7 b8 b9 b10 b10 b10 b10
    708        s0[3] = _mm256_bsrli_epi128(r0, 6);
    709 
    710        __m256i res_out_0[2];
    711        res_out_0[0] = res_out_0[1] = zero;
    712        resize_convolve(s0, coeffs_x, res_out_0);
    713 
    714        // r00 - r03 | r10 - r13
    715        __m256i res_out =
    716            _mm256_permute2x128_si256(res_out_0[0], res_out_0[1], 0x20);
    717        // r00 - r03 | r10 - r13
    718        res_out = _mm256_sra_epi32(_mm256_add_epi32(res_out, round_const_bits),
    719                                   round_shift_bits);
    720        // r00-r03 r00-r03 r10-r13 r10-r13
    721        __m256i res_out_row01 = _mm256_packus_epi32(res_out, res_out);
    722        // r00-r03 r00-r03 r00-r03 r00-r03 r10-r13 r10-r13 r10-r13 r10-r13
    723        res_out_row01 = _mm256_packus_epi16(res_out_row01, res_out_row01);
    724        res_out_row01 = _mm256_min_epu8(res_out_row01, clip_pixel);
    725        res_out_row01 = _mm256_max_epu8(res_out_row01, zero);
    726 
    727        xx_storel_32(intbuf + out_idx, CAST_LOW(res_out_row01));
    728        xx_storel_32(intbuf + out_idx + dst_stride,
    729                     _mm256_extracti128_si256(res_out_row01, 1));
    730      }
    731 
    732      wd_processed = filtered_length - remain_col;
    733      if (remain_col) {
    734        const int in_idx = (in_stride * i);
    735        const int out_idx = (wd_processed / 2) + width2 * i;
    736 
    737        down2_symeven(input + in_idx, filtered_length, intbuf + out_idx,
    738                      wd_processed);
    739        down2_symeven(input + in_idx + in_stride, filtered_length,
    740                      intbuf + out_idx + width2, wd_processed);
    741      }
    742    }
    743  }
    744 }