tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

intrapred_avx2.c (184008B)


/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/av1_rtcd.h"
#include "aom_dsp/x86/intrapred_x86.h"
#include "aom_dsp/x86/intrapred_utils.h"
#include "aom_dsp/x86/lpf_common_sse2.h"

static inline __m256i dc_sum_64(const uint8_t *ref) {
  const __m256i x0 = _mm256_loadu_si256((const __m256i *)ref);
  const __m256i x1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
  const __m256i zero = _mm256_setzero_si256();
  __m256i y0 = _mm256_sad_epu8(x0, zero);
  __m256i y1 = _mm256_sad_epu8(x1, zero);
  y0 = _mm256_add_epi64(y0, y1);
  __m256i u0 = _mm256_permute2x128_si256(y0, y0, 1);
  y0 = _mm256_add_epi64(u0, y0);
  u0 = _mm256_unpackhi_epi64(y0, y0);
  return _mm256_add_epi16(y0, u0);
}

static inline __m256i dc_sum_32(const uint8_t *ref) {
  const __m256i x = _mm256_loadu_si256((const __m256i *)ref);
  const __m256i zero = _mm256_setzero_si256();
  __m256i y = _mm256_sad_epu8(x, zero);
  __m256i u = _mm256_permute2x128_si256(y, y, 1);
  y = _mm256_add_epi64(u, y);
  u = _mm256_unpackhi_epi64(y, y);
  return _mm256_add_epi16(y, u);
}
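
// A scalar sketch of the horizontal-sum trick above, added for clarity (the
// helper name is illustrative, not part of libaom): _mm256_sad_epu8 against
// zero yields four 64-bit partial sums of eight bytes each, and the
// permute2x128/unpackhi additions fold them into one total in the low lane.
static inline int dc_sum_32_scalar_ref(const uint8_t *ref) {
  int sum = 0;
  for (int i = 0; i < 32; ++i) sum += ref[i];
  return sum;  // the same value dc_sum_32() leaves in its low 16 bits
}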

static inline void row_store_32xh(const __m256i *r, int height, uint8_t *dst,
                                  ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm256_storeu_si256((__m256i *)dst, *r);
    dst += stride;
  }
}

static inline void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
                                    int height, uint8_t *dst,
                                    ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm256_storeu_si256((__m256i *)dst, *r0);
    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
    dst += stride;
  }
}

static inline void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
                                  ptrdiff_t stride) {
  for (int i = 0; i < height; ++i) {
    _mm256_storeu_si256((__m256i *)dst, *r);
    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
    dst += stride;
  }
}

#if CONFIG_AV1_HIGHBITDEPTH
static DECLARE_ALIGNED(16, uint8_t, HighbdLoadMaskx[8][16]) = {
  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
  { 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
  { 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5, 6, 7 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3, 4, 5 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 3 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 },
};

static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx4[4][16]) = {
  { 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15 },
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 4, 5, 8, 9, 12, 13 },
  { 0, 1, 0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 6, 7, 10, 11 },
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 0, 1, 8, 9 }
};

static DECLARE_ALIGNED(16, uint8_t, HighbdEvenOddMaskx[8][32]) = {
  { 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25, 28, 29,
    2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27, 30, 31 },
  { 0, 1, 2, 3, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27,
    0, 1, 4, 5, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25, 28, 29 },
  { 0, 1, 0, 1, 4, 5, 8,  9,  12, 13, 16, 17, 20, 21, 24, 25,
    0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23, 26, 27 },
  { 0, 1, 0, 1, 0, 1, 6, 7, 10, 11, 14, 15, 18, 19, 22, 23,
    0, 1, 0, 1, 0, 1, 8, 9, 12, 13, 16, 17, 20, 21, 24, 25 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 8,  9,  12, 13, 16, 17, 20, 21,
    0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19, 22, 23 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 10, 11, 14, 15, 18, 19,
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17, 20, 21 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 12, 13, 16, 17,
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15, 18, 19 },
  { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 14, 15,
    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 16, 17 }
};

static DECLARE_ALIGNED(32, uint16_t, HighbdBaseMask[17][16]) = {
  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0, 0, 0, 0,
    0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0, 0,
    0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0,
    0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0xffff, 0, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0xffff, 0xffff, 0, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0 },
  { 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff,
    0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff, 0xffff }
};
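
// Note on the table above, added for clarity: HighbdBaseMask[k] has 0xffff in
// its first k 16-bit lanes and zeros elsewhere, so ANDing or blending through
// it keeps the first k predicted pixels of a vector and lets the directional
// predictors further down substitute the replicated last edge sample for
// lanes whose base position has run past the available samples.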

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
static inline void highbd_transpose16x4_8x8_sse2(__m128i *x, __m128i *d) {
  __m128i r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15;

  r0 = _mm_unpacklo_epi16(x[0], x[1]);
  r1 = _mm_unpacklo_epi16(x[2], x[3]);
  r2 = _mm_unpacklo_epi16(x[4], x[5]);
  r3 = _mm_unpacklo_epi16(x[6], x[7]);

  r4 = _mm_unpacklo_epi16(x[8], x[9]);
  r5 = _mm_unpacklo_epi16(x[10], x[11]);
  r6 = _mm_unpacklo_epi16(x[12], x[13]);
  r7 = _mm_unpacklo_epi16(x[14], x[15]);

  r8 = _mm_unpacklo_epi32(r0, r1);
  r9 = _mm_unpackhi_epi32(r0, r1);
  r10 = _mm_unpacklo_epi32(r2, r3);
  r11 = _mm_unpackhi_epi32(r2, r3);

  r12 = _mm_unpacklo_epi32(r4, r5);
  r13 = _mm_unpackhi_epi32(r4, r5);
  r14 = _mm_unpacklo_epi32(r6, r7);
  r15 = _mm_unpackhi_epi32(r6, r7);

  r0 = _mm_unpacklo_epi64(r8, r9);
  r1 = _mm_unpackhi_epi64(r8, r9);
  r2 = _mm_unpacklo_epi64(r10, r11);
  r3 = _mm_unpackhi_epi64(r10, r11);

  r4 = _mm_unpacklo_epi64(r12, r13);
  r5 = _mm_unpackhi_epi64(r12, r13);
  r6 = _mm_unpacklo_epi64(r14, r15);
  r7 = _mm_unpackhi_epi64(r14, r15);

  d[0] = _mm_unpacklo_epi64(r0, r2);
  d[1] = _mm_unpacklo_epi64(r4, r6);
  d[2] = _mm_unpacklo_epi64(r1, r3);
  d[3] = _mm_unpacklo_epi64(r5, r7);

  d[4] = _mm_unpackhi_epi64(r0, r2);
  d[5] = _mm_unpackhi_epi64(r4, r6);
  d[6] = _mm_unpackhi_epi64(r1, r3);
  d[7] = _mm_unpackhi_epi64(r5, r7);
}

static inline void highbd_transpose4x16_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, ww0, ww1;

  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
  w2 = _mm256_unpackhi_epi16(x[0], x[1]);  // 40 50 41 51 42 52 43 53
  w3 = _mm256_unpackhi_epi16(x[2], x[3]);  // 60 70 61 71 62 72 63 73

  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71

  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71

  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73

  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

static inline void highbd_transpose8x16_16x8_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, ww0, ww1;

  w0 = _mm256_unpacklo_epi16(x[0], x[1]);  // 00 10 01 11 02 12 03 13
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);  // 20 30 21 31 22 32 23 33
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);  // 40 50 41 51 42 52 43 53
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);  // 60 70 61 71 62 72 63 73

  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71

  d[0] = _mm256_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
  d[1] = _mm256_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71

  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73

  d[2] = _mm256_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
  d[3] = _mm256_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73

  w0 = _mm256_unpackhi_epi16(x[0], x[1]);  // 04 14 05 15 06 16 07 17
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);  // 24 34 25 35 26 36 27 37
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);  // 44 54 45 55 46 56 47 57
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);  // 64 74 65 75 66 76 67 77

  ww0 = _mm256_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
  ww1 = _mm256_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75

  d[4] = _mm256_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
  d[5] = _mm256_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75

  ww0 = _mm256_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
  ww1 = _mm256_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77

  d[6] = _mm256_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
  d[7] = _mm256_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
}

static inline void highbd_transpose16x16_avx2(__m256i *x, __m256i *d) {
  __m256i w0, w1, w2, w3, ww0, ww1;
  __m256i dd[16];
  w0 = _mm256_unpacklo_epi16(x[0], x[1]);
  w1 = _mm256_unpacklo_epi16(x[2], x[3]);
  w2 = _mm256_unpacklo_epi16(x[4], x[5]);
  w3 = _mm256_unpacklo_epi16(x[6], x[7]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);
  ww1 = _mm256_unpacklo_epi32(w2, w3);

  dd[0] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[1] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);
  ww1 = _mm256_unpackhi_epi32(w2, w3);

  dd[2] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[3] = _mm256_unpackhi_epi64(ww0, ww1);

  w0 = _mm256_unpackhi_epi16(x[0], x[1]);
  w1 = _mm256_unpackhi_epi16(x[2], x[3]);
  w2 = _mm256_unpackhi_epi16(x[4], x[5]);
  w3 = _mm256_unpackhi_epi16(x[6], x[7]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);
  ww1 = _mm256_unpacklo_epi32(w2, w3);

  dd[4] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[5] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);
  ww1 = _mm256_unpackhi_epi32(w2, w3);

  dd[6] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[7] = _mm256_unpackhi_epi64(ww0, ww1);

  w0 = _mm256_unpacklo_epi16(x[8], x[9]);
  w1 = _mm256_unpacklo_epi16(x[10], x[11]);
  w2 = _mm256_unpacklo_epi16(x[12], x[13]);
  w3 = _mm256_unpacklo_epi16(x[14], x[15]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);
  ww1 = _mm256_unpacklo_epi32(w2, w3);

  dd[8] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[9] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);
  ww1 = _mm256_unpackhi_epi32(w2, w3);

  dd[10] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[11] = _mm256_unpackhi_epi64(ww0, ww1);

  w0 = _mm256_unpackhi_epi16(x[8], x[9]);
  w1 = _mm256_unpackhi_epi16(x[10], x[11]);
  w2 = _mm256_unpackhi_epi16(x[12], x[13]);
  w3 = _mm256_unpackhi_epi16(x[14], x[15]);

  ww0 = _mm256_unpacklo_epi32(w0, w1);
  ww1 = _mm256_unpacklo_epi32(w2, w3);

  dd[12] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[13] = _mm256_unpackhi_epi64(ww0, ww1);

  ww0 = _mm256_unpackhi_epi32(w0, w1);
  ww1 = _mm256_unpackhi_epi32(w2, w3);

  dd[14] = _mm256_unpacklo_epi64(ww0, ww1);
  dd[15] = _mm256_unpackhi_epi64(ww0, ww1);

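  // At this point each 128-bit lane is transposed independently: dd[i]
  // (i < 8) holds the first half of output row i in its low lane and the
  // first half of row i + 8 in its high lane, while dd[i + 8] holds the
  // second halves. The loop below recombines the lanes into 16 full rows.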
  for (int i = 0; i < 8; i++) {
    d[i] = _mm256_insertf128_si256(dd[i], _mm256_castsi256_si128(dd[i + 8]), 1);
    d[i + 8] = _mm256_insertf128_si256(dd[i + 8],
                                       _mm256_extracti128_si256(dd[i], 1), 0);
  }
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_32(above);
  __m256i sum_left = dc_sum_32(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum_left = _mm256_add_epi16(sum_left, thirtytwo);
  sum_left = _mm256_srai_epi16(sum_left, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum_left, zero);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_dc_top_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_32(above);
  (void)left;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_dc_left_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_32(left);
  (void)above;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_dc_128_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_32xh(&row, 32, dst, stride);
}

void aom_v_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  (void)left;
  row_store_32xh(&row, 32, dst, stride);
}

// The 32 rows are handled together. Each call writes eight of them: lines
// 0,1,2,3 and 16,17,18,19 on the first call, lines 4,5,6,7 and 20,21,22,23
// on the next, and so on, so four calls complete all 32 rows.
static inline void h_predictor_32x8line(const __m256i *row, uint8_t *dst,
                                        ptrdiff_t stride) {
  __m256i t[4];
  __m256i m = _mm256_setzero_si256();
  const __m256i inc = _mm256_set1_epi8(4);
  int i;

  for (i = 0; i < 4; i++) {
    t[i] = _mm256_shuffle_epi8(*row, m);
    __m256i r0 = _mm256_permute2x128_si256(t[i], t[i], 0);
    __m256i r1 = _mm256_permute2x128_si256(t[i], t[i], 0x11);
    _mm256_storeu_si256((__m256i *)dst, r0);
    _mm256_storeu_si256((__m256i *)(dst + (stride << 4)), r1);
    dst += stride;
    m = _mm256_add_epi8(m, inc);
  }
}

void aom_h_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  (void)above;
  const __m256i left_col = _mm256_loadu_si256((__m256i const *)left);

  __m256i u = _mm256_unpacklo_epi8(left_col, left_col);

  __m256i v = _mm256_unpacklo_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
  dst += stride << 2;

  v = _mm256_unpackhi_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
  dst += stride << 2;

  u = _mm256_unpackhi_epi8(left_col, left_col);

  v = _mm256_unpacklo_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
  dst += stride << 2;

  v = _mm256_unpackhi_epi8(u, u);
  h_predictor_32x8line(&v, dst, stride);
}

// -----------------------------------------------------------------------------
// Rectangle
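// For the rectangular DC predictors below, width + height is not a power of
// two, so the rounded average cannot be a pure shift; the sums are reduced to
// a scalar and divided directly:
//   dc = (sum_above + sum_left + (w + h) / 2) / (w + h)
// e.g. bias 24 and divisor 48 for the 32x16 case.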
void aom_dc_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m128i top_sum = dc_sum_32_sse2(above);
  __m128i left_sum = dc_sum_16_sse2(left);
  left_sum = _mm_add_epi16(top_sum, left_sum);
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(left_sum);
  sum += 24;
  sum /= 48;
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_32(above);
  __m256i sum_left = dc_sum_64(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 48;
  sum /= 96;
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_64(above);
  __m256i sum_left = dc_sum_64(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 64;
  sum /= 128;
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_64(above);
  __m256i sum_left = dc_sum_32(left);
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 48;
  sum /= 96;
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  row_store_64xh(&row, 32, dst, stride);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
  const __m256i sum_above = dc_sum_64(above);
  __m256i sum_left = _mm256_castsi128_si256(dc_sum_16_sse2(left));
  sum_left = _mm256_add_epi16(sum_left, sum_above);
  uint16_t sum = (uint16_t)_mm_cvtsi128_si32(_mm256_castsi256_si128(sum_left));
  sum += 40;
  sum /= 80;
  const __m256i row = _mm256_set1_epi8((int8_t)sum);
  row_store_64xh(&row, 16, dst, stride);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_top_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_32(above);
  (void)left;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_32(above);
  (void)left;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_64(above);
  (void)left;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_64(above);
  (void)left;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 32, dst, stride);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m256i sum = dc_sum_64(above);
  (void)left;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 16, dst, stride);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i sum = dc_sum_16_sse2(left);
  (void)above;

  const __m128i eight = _mm_set1_epi16(8);
  sum = _mm_add_epi16(sum, eight);
  sum = _mm_srai_epi16(sum, 4);
  const __m128i zero = _mm_setzero_si128();
  const __m128i r = _mm_shuffle_epi8(sum, zero);
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_left_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_64(left);
  (void)above;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_left_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_64(left);
  (void)above;

  const __m256i thirtytwo = _mm256_set1_epi16(32);
  sum = _mm256_add_epi16(sum, thirtytwo);
  sum = _mm256_srai_epi16(sum, 6);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_left_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m256i sum = dc_sum_32(left);
  (void)above;

  const __m256i sixteen = _mm256_set1_epi16(16);
  sum = _mm256_add_epi16(sum, sixteen);
  sum = _mm256_srai_epi16(sum, 5);
  const __m256i zero = _mm256_setzero_si256();
  __m256i row = _mm256_shuffle_epi8(sum, zero);
  row_store_64xh(&row, 32, dst, stride);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_left_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i sum = dc_sum_16_sse2(left);
  (void)above;

  const __m128i eight = _mm_set1_epi16(8);
  sum = _mm_add_epi16(sum, eight);
  sum = _mm_srai_epi16(sum, 4);
  const __m128i zero = _mm_setzero_si128();
  const __m128i r = _mm_shuffle_epi8(sum, zero);
  const __m256i row = _mm256_inserti128_si256(_mm256_castsi128_si256(r), r, 1);
  row_store_64xh(&row, 16, dst, stride);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_dc_128_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_32xh(&row, 16, dst, stride);
}

void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_32xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_64xh(&row, 64, dst, stride);
}

void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_64xh(&row, 32, dst, stride);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  (void)above;
  (void)left;
  const __m256i row = _mm256_set1_epi8((int8_t)0x80);
  row_store_64xh(&row, 16, dst, stride);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  (void)left;
  row_store_32xh(&row, 16, dst, stride);
}

void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row = _mm256_loadu_si256((const __m256i *)above);
  (void)left;
  row_store_32xh(&row, 64, dst, stride);
}

void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  (void)left;
  row_store_32x2xh(&row0, &row1, 64, dst, stride);
}

void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  (void)left;
  row_store_32x2xh(&row0, &row1, 32, dst, stride);
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                const uint8_t *above, const uint8_t *left) {
  const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
  const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
  (void)left;
  row_store_32x2xh(&row0, &row1, 16, dst, stride);
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 16 16-bit pixels in one row (__m256i)
static inline __m256i paeth_pred(const __m256i *left, const __m256i *top,
                                 const __m256i *topleft) {
  const __m256i base =
      _mm256_sub_epi16(_mm256_add_epi16(*top, *left), *topleft);

  __m256i pl = _mm256_abs_epi16(_mm256_sub_epi16(base, *left));
  __m256i pt = _mm256_abs_epi16(_mm256_sub_epi16(base, *top));
  __m256i ptl = _mm256_abs_epi16(_mm256_sub_epi16(base, *topleft));

  __m256i mask1 = _mm256_cmpgt_epi16(pl, pt);
  mask1 = _mm256_or_si256(mask1, _mm256_cmpgt_epi16(pl, ptl));
  __m256i mask2 = _mm256_cmpgt_epi16(pt, ptl);

  pl = _mm256_andnot_si256(mask1, *left);

  ptl = _mm256_and_si256(mask2, *topleft);
  pt = _mm256_andnot_si256(mask2, *top);
  pt = _mm256_or_si256(pt, ptl);
  pt = _mm256_and_si256(mask1, pt);

  return _mm256_or_si256(pt, pl);
}
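
// A scalar sketch of the selection logic above, with an illustrative helper
// name (not part of libaom): the Paeth predictor picks whichever of left,
// top and top-left is closest to base = top + left - topleft, preferring
// left, then top, on ties; mask1/mask2 in paeth_pred() encode exactly these
// comparisons.
static inline uint16_t paeth_pred_scalar_ref(uint16_t left, uint16_t top,
                                             uint16_t topleft) {
  const int base = (int)top + (int)left - (int)topleft;
  const int pl = base > left ? base - left : left - base;
  const int pt = base > top ? base - top : top - base;
  const int ptl = base > topleft ? base - topleft : topleft - base;
  // Smallest distance wins; the tie order matches the vector masks above.
  if (pl <= pt && pl <= ptl) return left;
  return (pt <= ptl) ? top : topleft;
}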

// Return 16 8-bit pixels in one row (__m128i)
static inline __m128i paeth_16x1_pred(const __m256i *left, const __m256i *top,
                                      const __m256i *topleft) {
  const __m256i p0 = paeth_pred(left, top, topleft);
  const __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i p = _mm256_packus_epi16(p0, p1);
  return _mm256_castsi256_si128(p);
}

static inline __m256i get_top_vector(const uint8_t *above) {
  const __m128i x = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t0 = _mm_unpacklo_epi8(x, zero);
  const __m128i t1 = _mm_unpackhi_epi8(x, zero);
  return _mm256_inserti128_si256(_mm256_castsi128_si256(t0), t1, 1);
}

void aom_paeth_predictor_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i x = _mm_loadl_epi64((const __m128i *)left);
  const __m256i l = _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

static inline __m256i get_left_vector(const uint8_t *left) {
  const __m128i x = _mm_load_si128((const __m128i *)left);
  return _mm256_inserti128_si256(_mm256_castsi128_si256(x), x, 1);
}

void aom_paeth_predictor_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i l = get_left_vector(left);
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m256i l = get_left_vector(left);
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }

  l = get_left_vector(left + 16);
  rep = _mm256_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i tl16 = _mm256_set1_epi16((int16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);
  const __m256i top = get_top_vector(above);

  for (int j = 0; j < 4; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16((short)0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);

      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

// Return 32 8-bit pixels in one row (__m256i)
static inline __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
                                      const __m256i *top1,
                                      const __m256i *topleft) {
  __m256i p0 = paeth_pred(left, top0, topleft);
  __m256i p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i x0 = _mm256_packus_epi16(p0, p1);

  p0 = paeth_pred(left, top1, topleft);
  p1 = _mm256_permute4x64_epi64(p0, 0xe);
  const __m256i x1 = _mm256_packus_epi16(p0, p1);

  return _mm256_permute2x128_si256(x0, x1, 0x20);
}

void aom_paeth_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i l = get_left_vector(left);
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m256i r = paeth_32x1_pred(&l16, &t0, &t1, &tl);

    _mm256_storeu_si256((__m256i *)dst, r);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m256i l = get_left_vector(left);
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }

  l = get_left_vector(left + 16);
  rep = _mm256_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);

      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);

      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i t2 = get_top_vector(above + 32);
  const __m256i t3 = get_top_vector(above + 48);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);

      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);

      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i t2 = get_top_vector(above + 32);
  const __m256i t3 = get_top_vector(above + 48);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m256i l = get_left_vector(left + j * 16);
    __m256i rep = _mm256_set1_epi16((short)0x8000);
    for (i = 0; i < 16; ++i) {
      const __m256i l16 = _mm256_shuffle_epi8(l, rep);

      const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
      const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
      const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
      const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);

      dst += stride;
      rep = _mm256_add_epi16(rep, one);
    }
  }
}

#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
void aom_paeth_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m256i t0 = get_top_vector(above);
  const __m256i t1 = get_top_vector(above + 16);
  const __m256i t2 = get_top_vector(above + 32);
  const __m256i t3 = get_top_vector(above + 48);
  const __m256i tl = _mm256_set1_epi16((int16_t)above[-1]);
  const __m256i one = _mm256_set1_epi16(1);

  int i;
  const __m256i l = get_left_vector(left);
  __m256i rep = _mm256_set1_epi16((short)0x8000);
  for (i = 0; i < 16; ++i) {
    const __m256i l16 = _mm256_shuffle_epi8(l, rep);

    const __m128i r0 = paeth_16x1_pred(&l16, &t0, &tl);
    const __m128i r1 = paeth_16x1_pred(&l16, &t1, &tl);
    const __m128i r2 = paeth_16x1_pred(&l16, &t2, &tl);
    const __m128i r3 = paeth_16x1_pred(&l16, &t3, &tl);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);

    dst += stride;
    rep = _mm256_add_epi16(rep, one);
  }
}
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER

#if CONFIG_AV1_HIGHBITDEPTH

static AOM_FORCE_INLINE void highbd_dr_prediction_z1_4xN_internal_avx2(
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((N + 4) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a32, a16;
  __m256i diff, c3f;
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;
  __m128i a0_128, a1_128;
  a16 = _mm256_set1_epi16(16);
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
  max_base_x128 = _mm_set1_epi16(max_base_x);
  c3f = _mm256_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i res1;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }

    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));

    if (upsample_above) {
      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)HighbdEvenOddMaskx4[0]);
      a1_128 = _mm_srli_si128(a0_128, 8);

      base_inc128 = _mm_setr_epi16(base, base + 2, base + 4, base + 6, base + 8,
                                   base + 10, base + 12, base + 14);
      shift = _mm256_srli_epi16(
          _mm256_and_si256(
              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above),
              _mm256_set1_epi16(0x3f)),
          1);
    } else {
      base_inc128 = _mm_setr_epi16(base, base + 1, base + 2, base + 3, base + 4,
                                   base + 5, base + 6, base + 7);
      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
    }
    a0 = _mm256_castsi128_si256(a0_128);
    a1 = _mm256_castsi128_si256(a1_128);
    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi16(diff, shift);
    res = _mm256_add_epi16(a32, b);
    res = _mm256_srli_epi16(res, 5);
    res1 = _mm256_castsi256_si128(res);

    mask128 = _mm_cmpgt_epi16(max_base_x128, base_inc128);
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
    x += dx;
  }
}
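
// A scalar sketch of one predicted pixel from the loop above (the helper
// name is illustrative, not part of libaom): x is the accumulated row
// position r * dx, shift the 5-bit fractional weight, and positions at or
// past max_base_x replicate the last edge sample. Within a row, the other
// columns follow the same formula with base advanced by 1 (or 2 when
// upsampled) per column.
static inline uint16_t dr_z1_pixel_scalar_ref(const uint16_t *above,
                                              int max_base_x, int x,
                                              int upsample_above) {
  const int frac_bits = 6 - upsample_above;
  const int base = x >> frac_bits;
  if (base >= max_base_x) return above[max_base_x];
  const int shift = ((x << upsample_above) & 0x3f) >> 1;
  // (a * 32 + 16 + (b - a) * shift) >> 5 is the rounded blend of a and b.
  return (uint16_t)((above[base] * 32 + 16 +
                     (above[base + 1] - above[base]) * shift) >>
                    5);
}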

static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_4xN_internal_avx2(
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((N + 4) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a32, a16;
  __m256i diff;
  __m128i a_mbase_x, max_base_x128, base_inc128, mask128;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm_set1_epi16(above[max_base_x]);
  max_base_x128 = _mm_set1_epi32(max_base_x);

  int x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, shift;
    __m128i res1;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        dst[i] = a_mbase_x;  // save 4 values
      }
      return;
    }

    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));

    if (upsample_above) {
      a0 = _mm256_permutevar8x32_epi32(
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));
      base_inc128 = _mm_setr_epi32(base, base + 2, base + 4, base + 6);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
              _mm256_set1_epi32(0x3f)),
          1);
    } else {
      base_inc128 = _mm_setr_epi32(base, base + 1, base + 2, base + 3);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    }

    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi32(diff, shift);
    res = _mm256_add_epi32(a32, b);
    res = _mm256_srli_epi32(res, 5);

    res1 = _mm256_castsi256_si128(res);
    res1 = _mm_packus_epi32(res1, res1);

    mask128 = _mm_cmpgt_epi32(max_base_x128, base_inc128);
    mask128 = _mm_packs_epi32(mask128, mask128);  // narrow to 16 bit
    dst[r] = _mm_blendv_epi8(a_mbase_x, res1, mask128);
    x += dx;
  }
}

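// The dispatcher below selects between the two variants above: 16-bit lane
// arithmetic is only safe for bd < 12, since a 12-bit sample times 32
// (4095 * 32 = 131040) no longer fits a 16-bit lane, so 12-bit content
// takes the 32-bit path.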
static void highbd_dr_prediction_z1_4xN_avx2(int N, uint16_t *dst,
                                             ptrdiff_t stride,
                                             const uint16_t *above,
                                             int upsample_above, int dx,
                                             int bd) {
  __m128i dstvec[16];
  if (bd < 12) {
    highbd_dr_prediction_z1_4xN_internal_avx2(N, dstvec, above, upsample_above,
                                              dx);
  } else {
    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(N, dstvec, above,
                                                    upsample_above, dx);
  }
  for (int i = 0; i < N; i++) {
    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
  }
}

static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_8xN_internal_avx2(
    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
  const int frac_bits = 6 - upsample_above;
  const int max_base_x = ((8 + N) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m256i a0, a1, a0_1, a1_1, a32, a16;
  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;

  a16 = _mm256_set1_epi32(16);
  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
  max_base_x256 = _mm256_set1_epi32(max_base_x);

  int x = dx;
  for (int r = 0; r < N; r++) {
    __m256i b, res, res1, shift;

    int base = x >> frac_bits;
    if (base >= max_base_x) {
      for (int i = r; i < N; ++i) {
        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
      }
      return;
    }

    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));

    if (upsample_above) {
      a0 = _mm256_permutevar8x32_epi32(
          a0, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0, 1));

      a0_1 =
          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
      a0_1 = _mm256_permutevar8x32_epi32(
          a0_1, _mm256_set_epi32(7, 5, 3, 1, 6, 4, 2, 0));
      a1_1 = _mm256_castsi128_si256(_mm256_extracti128_si256(a0_1, 1));

      a0 = _mm256_inserti128_si256(a0, _mm256_castsi256_si128(a0_1), 1);
      a1 = _mm256_inserti128_si256(a1, _mm256_castsi256_si128(a1_1), 1);
      base_inc256 =
          _mm256_setr_epi32(base, base + 2, base + 4, base + 6, base + 8,
                            base + 10, base + 12, base + 14);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(
              _mm256_slli_epi32(_mm256_set1_epi32(x), upsample_above),
              _mm256_set1_epi32(0x3f)),
          1);
    } else {
      base_inc256 = _mm256_setr_epi32(base, base + 1, base + 2, base + 3,
                                      base + 4, base + 5, base + 6, base + 7);
      shift = _mm256_srli_epi32(
          _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
    }

    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16

    b = _mm256_mullo_epi32(diff, shift);
    res = _mm256_add_epi32(a32, b);
    res = _mm256_srli_epi32(res, 5);

    res1 = _mm256_packus_epi32(
        res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));

    mask256 = _mm256_cmpgt_epi32(max_base_x256, base_inc256);
   1295    mask256 = _mm256_packs_epi32(
   1296        mask256, _mm256_castsi128_si256(
   1297                     _mm256_extracti128_si256(mask256, 1)));  // narrow to 16 bit
   1298    res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
   1299    dst[r] = _mm256_castsi256_si128(res1);
   1300    x += dx;
   1301  }
   1302 }
   1303 
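        // 16-bit twin of the kernel above; callers guarantee bd < 12, so the
        // interpolation cannot overflow a 16-bit lane and no packus step is
        // needed.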
   1304 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_8xN_internal_avx2(
   1305    int N, __m128i *dst, const uint16_t *above, int upsample_above, int dx) {
   1306  const int frac_bits = 6 - upsample_above;
   1307  const int max_base_x = ((8 + N) - 1) << upsample_above;
   1308 
   1309  assert(dx > 0);
   1310  // pre-filter above pixels
   1311  // store in temp buffers:
   1312  //   above[x] * 32 + 16
   1313  //   above[x+1] - above[x]
   1314  // final pixels will be calculated as:
   1315  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1316  __m256i a0, a1, a32, a16, c3f;
   1317  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1318  __m128i a0_x128, a1_x128;
   1319 
   1320  a16 = _mm256_set1_epi16(16);
   1321  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1322  max_base_x256 = _mm256_set1_epi16(max_base_x);
   1323  c3f = _mm256_set1_epi16(0x3f);
   1324 
   1325  int x = dx;
   1326  for (int r = 0; r < N; r++) {
   1327    __m256i b, res, res1, shift;
   1328 
   1329    int base = x >> frac_bits;
   1330    if (base >= max_base_x) {
   1331      for (int i = r; i < N; ++i) {
   1332        dst[i] = _mm256_castsi256_si128(a_mbase_x);  // save 8 values
   1333      }
   1334      return;
   1335    }
   1336 
   1337    a0_x128 = _mm_loadu_si128((__m128i *)(above + base));
   1338    if (upsample_above) {
   1339      __m128i mask, atmp0, atmp1, atmp2, atmp3;
   1340      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 8));
   1341      atmp0 = _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
   1342      atmp1 = _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdEvenOddMaskx[0]);
   1343      atmp2 =
   1344          _mm_shuffle_epi8(a0_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
   1345      atmp3 =
   1346          _mm_shuffle_epi8(a1_x128, *(__m128i *)(HighbdEvenOddMaskx[0] + 16));
   1347      mask =
   1348          _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[0], _mm_set1_epi8(15));
   1349      a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
   1350      mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[0] + 16),
   1351                            _mm_set1_epi8(15));
   1352      a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
   1353 
   1354      base_inc256 = _mm256_setr_epi16(base, base + 2, base + 4, base + 6,
   1355                                      base + 8, base + 10, base + 12, base + 14,
   1356                                      0, 0, 0, 0, 0, 0, 0, 0);
   1357      shift = _mm256_srli_epi16(
   1358          _mm256_and_si256(
   1359              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
   1360          1);
   1361    } else {
   1362      a1_x128 = _mm_loadu_si128((__m128i *)(above + base + 1));
   1363      base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
   1364                                      base + 4, base + 5, base + 6, base + 7, 0,
   1365                                      0, 0, 0, 0, 0, 0, 0);
   1366      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
   1367    }
   1368    a0 = _mm256_castsi128_si256(a0_x128);
   1369    a1 = _mm256_castsi128_si256(a1_x128);
   1370 
   1371    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
   1372    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
   1373    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
   1374 
   1375    b = _mm256_mullo_epi16(diff, shift);
   1376    res = _mm256_add_epi16(a32, b);
   1377    res = _mm256_srli_epi16(res, 5);
   1378 
   1379    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
   1380    res1 = _mm256_blendv_epi8(a_mbase_x, res, mask256);
   1381    dst[r] = _mm256_castsi256_si128(res1);
   1382    x += dx;
   1383  }
   1384 }
   1385 
   1386 static void highbd_dr_prediction_z1_8xN_avx2(int N, uint16_t *dst,
   1387                                             ptrdiff_t stride,
   1388                                             const uint16_t *above,
   1389                                             int upsample_above, int dx,
   1390                                             int bd) {
   1391  __m128i dstvec[32];
   1392  if (bd < 12) {
   1393    highbd_dr_prediction_z1_8xN_internal_avx2(N, dstvec, above, upsample_above,
   1394                                              dx);
   1395  } else {
   1396    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(N, dstvec, above,
   1397                                                    upsample_above, dx);
   1398  }
   1399  for (int i = 0; i < N; i++) {
   1400    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
   1401  }
   1402 }
   1403 
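        // 16-wide rows with 32-bit intermediates: each row is built from two
        // 8-lane halves (res[0], res[1]) that are packed to 16 bits and
        // recombined into a single 256-bit register.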
   1404 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_16xN_internal_avx2(
   1405    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
   1406  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   1407  (void)upsample_above;
   1408  const int frac_bits = 6;
   1409  const int max_base_x = ((16 + N) - 1);
   1410 
   1411  // pre-filter above pixels
   1412  // store in temp buffers:
   1413  //   above[x] * 32 + 16
   1414  //   above[x+1] - above[x]
   1415  // final pixels will be calculated as:
   1416  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1417  __m256i a0, a0_1, a1, a1_1, a32, a16;
   1418  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1419 
   1420  a16 = _mm256_set1_epi32(16);
   1421  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1422  max_base_x256 = _mm256_set1_epi16(max_base_x);
   1423 
   1424  int x = dx;
   1425  for (int r = 0; r < N; r++) {
   1426    __m256i b, res[2], res1;
   1427 
   1428    int base = x >> frac_bits;
   1429    if (base >= max_base_x) {
   1430      for (int i = r; i < N; ++i) {
   1431        dstvec[i] = a_mbase_x;  // save 16 values
   1432      }
   1433      return;
   1434    }
   1435    __m256i shift = _mm256_srli_epi32(
   1436        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
   1437 
   1438    a0 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base)));
   1439    a1 = _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 1)));
   1440 
   1441    diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   1442    a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   1443    a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   1444    b = _mm256_mullo_epi32(diff, shift);
   1445 
   1446    res[0] = _mm256_add_epi32(a32, b);
   1447    res[0] = _mm256_srli_epi32(res[0], 5);
   1448    res[0] = _mm256_packus_epi32(
   1449        res[0], _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
   1450 
   1451    int mdif = max_base_x - base;
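           // If fewer than 9 valid samples remain, the upper half would read
           // entirely past the edge; substitute the replicated last sample and
           // let the mask below handle any partial overlap.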
   1452    if (mdif > 8) {
   1453      a0_1 =
   1454          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 8)));
   1455      a1_1 =
   1456          _mm256_cvtepu16_epi32(_mm_loadu_si128((__m128i *)(above + base + 9)));
   1457 
   1458      diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
   1459      a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
   1460      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   1461      b = _mm256_mullo_epi32(diff, shift);
   1462 
   1463      res[1] = _mm256_add_epi32(a32, b);
   1464      res[1] = _mm256_srli_epi32(res[1], 5);
   1465      res[1] = _mm256_packus_epi32(
   1466          res[1], _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
   1467    } else {
   1468      res[1] = a_mbase_x;
   1469    }
   1470    res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
   1471                                   1);  // 16 16bit values
   1472 
   1473    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
   1474                                    base + 4, base + 5, base + 6, base + 7,
   1475                                    base + 8, base + 9, base + 10, base + 11,
   1476                                    base + 12, base + 13, base + 14, base + 15);
   1477    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
   1478    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
   1479    x += dx;
   1480  }
   1481 }
   1482 
   1483 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_16xN_internal_avx2(
   1484    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
   1485  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   1486  (void)upsample_above;
   1487  const int frac_bits = 6;
   1488  const int max_base_x = ((16 + N) - 1);
   1489 
   1490  // pre-filter above pixels
   1491  // store in temp buffers:
   1492  //   above[x] * 32 + 16
   1493  //   above[x+1] - above[x]
   1494  // final pixels will be calculated as:
   1495  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1496  __m256i a0, a1, a32, a16, c3f;
   1497  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1498 
   1499  a16 = _mm256_set1_epi16(16);
   1500  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1501  max_base_x256 = _mm256_set1_epi16(max_base_x);
   1502  c3f = _mm256_set1_epi16(0x3f);
   1503 
   1504  int x = dx;
   1505  for (int r = 0; r < N; r++) {
   1506    __m256i b, res;
   1507 
   1508    int base = x >> frac_bits;
   1509    if (base >= max_base_x) {
   1510      for (int i = r; i < N; ++i) {
   1511        dstvec[i] = a_mbase_x;  // save 16 values
   1512      }
   1513      return;
   1514    }
   1515    __m256i shift =
   1516        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
   1517 
   1518    a0 = _mm256_loadu_si256((__m256i *)(above + base));
   1519    a1 = _mm256_loadu_si256((__m256i *)(above + base + 1));
   1520 
   1521    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
   1522    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
   1523    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
   1524    b = _mm256_mullo_epi16(diff, shift);
   1525 
   1526    res = _mm256_add_epi16(a32, b);
   1527    res = _mm256_srli_epi16(res, 5);  // 16 16bit values
   1528 
   1529    base_inc256 = _mm256_setr_epi16(base, base + 1, base + 2, base + 3,
   1530                                    base + 4, base + 5, base + 6, base + 7,
   1531                                    base + 8, base + 9, base + 10, base + 11,
   1532                                    base + 12, base + 13, base + 14, base + 15);
   1533    mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
   1534    dstvec[r] = _mm256_blendv_epi8(a_mbase_x, res, mask256);
   1535    x += dx;
   1536  }
   1537 }
   1538 
   1539 static void highbd_dr_prediction_z1_16xN_avx2(int N, uint16_t *dst,
   1540                                              ptrdiff_t stride,
   1541                                              const uint16_t *above,
   1542                                              int upsample_above, int dx,
   1543                                              int bd) {
   1544  __m256i dstvec[64];
   1545  if (bd < 12) {
   1546    highbd_dr_prediction_z1_16xN_internal_avx2(N, dstvec, above, upsample_above,
   1547                                               dx);
   1548  } else {
   1549    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(N, dstvec, above,
   1550                                                     upsample_above, dx);
   1551  }
   1552  for (int i = 0; i < N; i++) {
   1553    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
   1554  }
   1555 }
   1556 
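        // 32-wide rows, processed as two 16-pixel halves per row: the left
        // half of row r lands in dstvec[r] and the right half in dstvec[r + N].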
   1557 static AOM_FORCE_INLINE void highbd_dr_prediction_32bit_z1_32xN_internal_avx2(
   1558    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
   1559  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   1560  (void)upsample_above;
   1561  const int frac_bits = 6;
   1562  const int max_base_x = ((32 + N) - 1);
   1563 
   1564  // pre-filter above pixels
   1565  // store in temp buffers:
   1566  //   above[x] * 32 + 16
   1567  //   above[x+1] - above[x]
   1568  // final pixels will be calculated as:
   1569  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1570  __m256i a0, a0_1, a1, a1_1, a32, a16, c3f;
   1571  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1572 
   1573  a16 = _mm256_set1_epi32(16);
   1574  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1575  max_base_x256 = _mm256_set1_epi16(max_base_x);
   1576  c3f = _mm256_set1_epi16(0x3f);
   1577 
   1578  int x = dx;
   1579  for (int r = 0; r < N; r++) {
   1580    __m256i b, res[2], res1;
   1581 
   1582    int base = x >> frac_bits;
   1583    if (base >= max_base_x) {
   1584      for (int i = r; i < N; ++i) {
   1585        dstvec[i] = a_mbase_x;  // save 32 values
   1586        dstvec[i + N] = a_mbase_x;
   1587      }
   1588      return;
   1589    }
   1590 
   1591    __m256i shift =
   1592        _mm256_srli_epi32(_mm256_and_si256(_mm256_set1_epi32(x), c3f), 1);
   1593 
   1594    for (int j = 0; j < 32; j += 16) {
   1595      int mdif = max_base_x - (base + j);
   1596      if (mdif <= 0) {
   1597        res1 = a_mbase_x;
   1598      } else {
   1599        a0 = _mm256_cvtepu16_epi32(
   1600            _mm_loadu_si128((__m128i *)(above + base + j)));
   1601        a1 = _mm256_cvtepu16_epi32(
   1602            _mm_loadu_si128((__m128i *)(above + base + 1 + j)));
   1603 
   1604        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   1605        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   1606        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   1607        b = _mm256_mullo_epi32(diff, shift);
   1608 
   1609        res[0] = _mm256_add_epi32(a32, b);
   1610        res[0] = _mm256_srli_epi32(res[0], 5);
   1611        res[0] = _mm256_packus_epi32(
   1612            res[0],
   1613            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
   1614        if (mdif > 8) {
   1615          a0_1 = _mm256_cvtepu16_epi32(
   1616              _mm_loadu_si128((__m128i *)(above + base + 8 + j)));
   1617          a1_1 = _mm256_cvtepu16_epi32(
   1618              _mm_loadu_si128((__m128i *)(above + base + 9 + j)));
   1619 
   1620          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
   1621          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
   1622          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   1623          b = _mm256_mullo_epi32(diff, shift);
   1624 
   1625          res[1] = _mm256_add_epi32(a32, b);
   1626          res[1] = _mm256_srli_epi32(res[1], 5);
   1627          res[1] = _mm256_packus_epi32(
   1628              res[1],
   1629              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
   1630        } else {
   1631          res[1] = a_mbase_x;
   1632        }
   1633        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
   1634                                       1);  // 16 16bit values
   1635        base_inc256 = _mm256_setr_epi16(
   1636            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
   1637            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
   1638            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
   1639            base + j + 13, base + j + 14, base + j + 15);
   1640 
   1641        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
   1642        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
   1643      }
   1644      if (!j) {
   1645        dstvec[r] = res1;
   1646      } else {
   1647        dstvec[r + N] = res1;
   1648      }
   1649    }
   1650    x += dx;
   1651  }
   1652 }
   1653 
   1654 static AOM_FORCE_INLINE void highbd_dr_prediction_z1_32xN_internal_avx2(
   1655    int N, __m256i *dstvec, const uint16_t *above, int upsample_above, int dx) {
   1656  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   1657  (void)upsample_above;
   1658  const int frac_bits = 6;
   1659  const int max_base_x = ((32 + N) - 1);
   1660 
   1661  // pre-filter above pixels
   1662  // store in temp buffers:
   1663  //   above[x] * 32 + 16
   1664  //   above[x+1] - above[x]
   1665  // final pixels will be calculated as:
   1666  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1667  __m256i a0, a1, a32, a16, c3f;
   1668  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1669 
   1670  a16 = _mm256_set1_epi16(16);
   1671  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1672  max_base_x256 = _mm256_set1_epi16(max_base_x);
   1673  c3f = _mm256_set1_epi16(0x3f);
   1674 
   1675  int x = dx;
   1676  for (int r = 0; r < N; r++) {
   1677    __m256i b, res;
   1678 
   1679    int base = x >> frac_bits;
   1680    if (base >= max_base_x) {
   1681      for (int i = r; i < N; ++i) {
   1682        dstvec[i] = a_mbase_x;  // save 32 values
   1683        dstvec[i + N] = a_mbase_x;
   1684      }
   1685      return;
   1686    }
   1687 
   1688    __m256i shift =
   1689        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
   1690 
   1691    for (int j = 0; j < 32; j += 16) {
   1692      int mdif = max_base_x - (base + j);
   1693      if (mdif <= 0) {
   1694        res = a_mbase_x;
   1695      } else {
   1696        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
   1697        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
   1698 
   1699        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
   1700        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
   1701        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
   1702        b = _mm256_mullo_epi16(diff, shift);
   1703 
   1704        res = _mm256_add_epi16(a32, b);
   1705        res = _mm256_srli_epi16(res, 5);
   1706 
   1707        base_inc256 = _mm256_setr_epi16(
   1708            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
   1709            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
   1710            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
   1711            base + j + 13, base + j + 14, base + j + 15);
   1712 
   1713        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
   1714        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
   1715      }
   1716      if (!j) {
   1717        dstvec[r] = res;
   1718      } else {
   1719        dstvec[r + N] = res;
   1720      }
   1721    }
   1722    x += dx;
   1723  }
   1724 }
   1725 
   1726 static void highbd_dr_prediction_z1_32xN_avx2(int N, uint16_t *dst,
   1727                                              ptrdiff_t stride,
   1728                                              const uint16_t *above,
   1729                                              int upsample_above, int dx,
   1730                                              int bd) {
   1731  __m256i dstvec[128];
   1732  if (bd < 12) {
   1733    highbd_dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above,
   1734                                               dx);
   1735  } else {
   1736    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(N, dstvec, above,
   1737                                                     upsample_above, dx);
   1738  }
   1739  for (int i = 0; i < N; i++) {
   1740    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
   1741    _mm256_storeu_si256((__m256i *)(dst + stride * i + 16), dstvec[i + N]);
   1742  }
   1743 }
   1744 
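        // The 64-wide kernels below write each finished 16-pixel chunk
        // straight to dst instead of staging whole rows in a dstvec array.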
   1745 static void highbd_dr_prediction_32bit_z1_64xN_avx2(int N, uint16_t *dst,
   1746                                                    ptrdiff_t stride,
   1747                                                    const uint16_t *above,
   1748                                                    int upsample_above,
   1749                                                    int dx) {
   1750  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   1751  (void)upsample_above;
   1752  const int frac_bits = 6;
   1753  const int max_base_x = ((64 + N) - 1);
   1754 
   1755  // pre-filter above pixels
   1756  // store in temp buffers:
   1757  //   above[x] * 32 + 16
   1758  //   above[x+1] - above[x]
   1759  // final pixels will be calculated as:
   1760  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1761  __m256i a0, a0_1, a1, a1_1, a32, a16;
   1762  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1763 
   1764  a16 = _mm256_set1_epi32(16);
   1765  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1766  max_base_x256 = _mm256_set1_epi16(max_base_x);
   1767 
   1768  int x = dx;
   1769  for (int r = 0; r < N; r++, dst += stride) {
   1770    __m256i b, res[2], res1;
   1771 
   1772    int base = x >> frac_bits;
   1773    if (base >= max_base_x) {
   1774      for (int i = r; i < N; ++i) {
   1775        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
   1776        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
   1777        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
   1778        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
   1779        dst += stride;
   1780      }
   1781      return;
   1782    }
   1783 
   1784    __m256i shift = _mm256_srli_epi32(
   1785        _mm256_and_si256(_mm256_set1_epi32(x), _mm256_set1_epi32(0x3f)), 1);
   1786 
   1787    __m128i a0_128, a0_1_128, a1_128, a1_1_128;
   1788    for (int j = 0; j < 64; j += 16) {
   1789      int mdif = max_base_x - (base + j);
   1790      if (mdif <= 0) {
   1791        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
   1792      } else {
   1793        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
   1794        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
   1795        a0 = _mm256_cvtepu16_epi32(a0_128);
   1796        a1 = _mm256_cvtepu16_epi32(a1_128);
   1797 
   1798        diff = _mm256_sub_epi32(a1, a0);   // a[x+1] - a[x]
   1799        a32 = _mm256_slli_epi32(a0, 5);    // a[x] * 32
   1800        a32 = _mm256_add_epi32(a32, a16);  // a[x] * 32 + 16
   1801        b = _mm256_mullo_epi32(diff, shift);
   1802 
   1803        res[0] = _mm256_add_epi32(a32, b);
   1804        res[0] = _mm256_srli_epi32(res[0], 5);
   1805        res[0] = _mm256_packus_epi32(
   1806            res[0],
   1807            _mm256_castsi128_si256(_mm256_extracti128_si256(res[0], 1)));
   1808        if (mdif > 8) {
   1809          a0_1_128 = _mm_loadu_si128((__m128i *)(above + base + 8 + j));
   1810          a1_1_128 = _mm_loadu_si128((__m128i *)(above + base + 9 + j));
   1811          a0_1 = _mm256_cvtepu16_epi32(a0_1_128);
   1812          a1_1 = _mm256_cvtepu16_epi32(a1_1_128);
   1813 
   1814          diff = _mm256_sub_epi32(a1_1, a0_1);  // a[x+1] - a[x]
   1815          a32 = _mm256_slli_epi32(a0_1, 5);     // a[x] * 32
   1816          a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   1817          b = _mm256_mullo_epi32(diff, shift);
   1818 
   1819          res[1] = _mm256_add_epi32(a32, b);
   1820          res[1] = _mm256_srli_epi32(res[1], 5);
   1821          res[1] = _mm256_packus_epi32(
   1822              res[1],
   1823              _mm256_castsi128_si256(_mm256_extracti128_si256(res[1], 1)));
   1824        } else {
   1825          res[1] = a_mbase_x;
   1826        }
   1827        res1 = _mm256_inserti128_si256(res[0], _mm256_castsi256_si128(res[1]),
   1828                                       1);  // 16 16bit values
   1829        base_inc256 = _mm256_setr_epi16(
   1830            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
   1831            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
   1832            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
   1833            base + j + 13, base + j + 14, base + j + 15);
   1834 
   1835        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
   1836        res1 = _mm256_blendv_epi8(a_mbase_x, res1, mask256);
   1837        _mm256_storeu_si256((__m256i *)(dst + j), res1);
   1838      }
   1839    }
   1840    x += dx;
   1841  }
   1842 }
   1843 
   1844 static void highbd_dr_prediction_z1_64xN_avx2(int N, uint16_t *dst,
   1845                                              ptrdiff_t stride,
   1846                                              const uint16_t *above,
   1847                                              int upsample_above, int dx) {
   1848  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   1849  (void)upsample_above;
   1850  const int frac_bits = 6;
   1851  const int max_base_x = ((64 + N) - 1);
   1852 
   1853  // pre-filter above pixels
   1854  // store in temp buffers:
   1855  //   above[x] * 32 + 16
   1856  //   above[x+1] - above[x]
   1857  // final pixels will be calculated as:
   1858  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1859  __m256i a0, a1, a32, a16, c3f;
   1860  __m256i a_mbase_x, diff, max_base_x256, base_inc256, mask256;
   1861 
   1862  a16 = _mm256_set1_epi16(16);
   1863  a_mbase_x = _mm256_set1_epi16(above[max_base_x]);
   1864  max_base_x256 = _mm256_set1_epi16(max_base_x);
   1865  c3f = _mm256_set1_epi16(0x3f);
   1866 
   1867  int x = dx;
   1868  for (int r = 0; r < N; r++, dst += stride) {
   1869    __m256i b, res;
   1870 
   1871    int base = x >> frac_bits;
   1872    if (base >= max_base_x) {
   1873      for (int i = r; i < N; ++i) {
   1874        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 64 values
   1875        _mm256_storeu_si256((__m256i *)(dst + 16), a_mbase_x);
   1876        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
   1877        _mm256_storeu_si256((__m256i *)(dst + 48), a_mbase_x);
   1878        dst += stride;
   1879      }
   1880      return;
   1881    }
   1882 
   1883    __m256i shift =
   1884        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
   1885 
   1886    for (int j = 0; j < 64; j += 16) {
   1887      int mdif = max_base_x - (base + j);
   1888      if (mdif <= 0) {
   1889        _mm256_storeu_si256((__m256i *)(dst + j), a_mbase_x);
   1890      } else {
   1891        a0 = _mm256_loadu_si256((__m256i *)(above + base + j));
   1892        a1 = _mm256_loadu_si256((__m256i *)(above + base + 1 + j));
   1893 
   1894        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
   1895        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
   1896        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
   1897        b = _mm256_mullo_epi16(diff, shift);
   1898 
   1899        res = _mm256_add_epi16(a32, b);
   1900        res = _mm256_srli_epi16(res, 5);
   1901 
   1902        base_inc256 = _mm256_setr_epi16(
   1903            base + j, base + j + 1, base + j + 2, base + j + 3, base + j + 4,
   1904            base + j + 5, base + j + 6, base + j + 7, base + j + 8,
   1905            base + j + 9, base + j + 10, base + j + 11, base + j + 12,
   1906            base + j + 13, base + j + 14, base + j + 15);
   1907 
   1908        mask256 = _mm256_cmpgt_epi16(max_base_x256, base_inc256);
   1909        res = _mm256_blendv_epi8(a_mbase_x, res, mask256);
   1910        _mm256_storeu_si256((__m256i *)(dst + j), res);  // 16 16bit values
   1911      }
   1912    }
   1913    x += dx;
   1914  }
   1915 }
   1916 
   1917 // Directional prediction, zone 1: 0 < angle < 90
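        // x advances by dx per row, where dx carries 6 fractional bits (the
        // frac_bits shifts above), i.e. a step in 1/64-pel units.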
   1918 void av1_highbd_dr_prediction_z1_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
   1919                                      int bh, const uint16_t *above,
   1920                                      const uint16_t *left, int upsample_above,
   1921                                      int dx, int dy, int bd) {
   1922  (void)left;
   1923  (void)dy;
   1924 
   1925  switch (bw) {
   1926    case 4:
   1927      highbd_dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above,
   1928                                       dx, bd);
   1929      break;
   1930    case 8:
   1931      highbd_dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above,
   1932                                       dx, bd);
   1933      break;
   1934    case 16:
   1935      highbd_dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above,
   1936                                        dx, bd);
   1937      break;
   1938    case 32:
   1939      highbd_dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above,
   1940                                        dx, bd);
   1941      break;
   1942    case 64:
   1943      if (bd < 12) {
   1944        highbd_dr_prediction_z1_64xN_avx2(bh, dst, stride, above,
   1945                                          upsample_above, dx);
   1946      } else {
   1947        highbd_dr_prediction_32bit_z1_64xN_avx2(bh, dst, stride, above,
   1948                                                upsample_above, dx);
   1949      }
   1950      break;
   1951    default: break;
   1952  }
   1953  return;
   1954 }
   1955 
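        // Transpose helpers: move one 16x16 tile of 16-bit pixels from src
        // (pitchSrc) to dst (pitchDst), then tile that over width x height.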
   1956 static void highbd_transpose_TX_16X16(const uint16_t *src, ptrdiff_t pitchSrc,
   1957                                      uint16_t *dst, ptrdiff_t pitchDst) {
   1958  __m256i r[16];
   1959  __m256i d[16];
   1960  for (int j = 0; j < 16; j++) {
   1961    r[j] = _mm256_loadu_si256((__m256i *)(src + j * pitchSrc));
   1962  }
   1963  highbd_transpose16x16_avx2(r, d);
   1964  for (int j = 0; j < 16; j++) {
   1965    _mm256_storeu_si256((__m256i *)(dst + j * pitchDst), d[j]);
   1966  }
   1967 }
   1968 
   1969 static void highbd_transpose(const uint16_t *src, ptrdiff_t pitchSrc,
   1970                             uint16_t *dst, ptrdiff_t pitchDst, int width,
   1971                             int height) {
   1972  for (int j = 0; j < height; j += 16)
   1973    for (int i = 0; i < width; i += 16)
   1974      highbd_transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
   1975                                dst + j * pitchDst + i, pitchDst);
   1976 }
   1977 
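        // Zone 2 (90 < angle < 180): each row mixes two sources. Columns whose
        // projected base_x still lands in the above edge are interpolated
        // horizontally (resx); the remaining leftmost columns are interpolated
        // from the left edge (resy); HighbdBaseMask[base_min_diff] blends the
        // two. This Nx4 kernel keeps the x result in the low 128 bits and the
        // y result in the high 128 bits of one 256-bit register.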
   1978 static void highbd_dr_prediction_32bit_z2_Nx4_avx2(
   1979    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   1980    const uint16_t *left, int upsample_above, int upsample_left, int dx,
   1981    int dy) {
   1982  const int min_base_x = -(1 << upsample_above);
   1983  const int min_base_y = -(1 << upsample_left);
   1984  const int frac_bits_x = 6 - upsample_above;
   1985  const int frac_bits_y = 6 - upsample_left;
   1986 
   1987  assert(dx > 0);
   1988  // pre-filter above pixels
   1989  // store in temp buffers:
   1990  //   above[x] * 32 + 16
   1991  //   above[x+1] - above[x]
   1992  // final pixels will be calculated as:
   1993  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   1994  __m256i a0_x, a1_x, a32, a16;
   1995  __m256i diff;
   1996  __m128i c3f, min_base_y128;
   1997 
   1998  a16 = _mm256_set1_epi32(16);
   1999  c3f = _mm_set1_epi32(0x3f);
   2000  min_base_y128 = _mm_set1_epi32(min_base_y);
   2001 
   2002  for (int r = 0; r < N; r++) {
   2003    __m256i b, res, shift;
   2004    __m128i resx, resy, resxy;
   2005    __m128i a0_x128, a1_x128;
   2006    int y = r + 1;
   2007    int base_x = (-y * dx) >> frac_bits_x;
   2008    int base_shift = 0;
   2009    if (base_x < (min_base_x - 1)) {
   2010      base_shift = (min_base_x - base_x - 1) >> upsample_above;
   2011    }
   2012    int base_min_diff =
   2013        (min_base_x - base_x + upsample_above) >> upsample_above;
   2014    if (base_min_diff > 4) {
   2015      base_min_diff = 4;
   2016    } else {
   2017      if (base_min_diff < 0) base_min_diff = 0;
   2018    }
   2019 
   2020    if (base_shift > 3) {
   2021      a0_x = _mm256_setzero_si256();
   2022      a1_x = _mm256_setzero_si256();
   2023      shift = _mm256_setzero_si256();
   2024    } else {
   2025      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   2026      if (upsample_above) {
   2027        a0_x128 = _mm_shuffle_epi8(a0_x128,
   2028                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
   2029        a1_x128 = _mm_srli_si128(a0_x128, 8);
   2030 
   2031        shift = _mm256_castsi128_si256(_mm_srli_epi32(
   2032            _mm_and_si128(
   2033                _mm_slli_epi32(
   2034                    _mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
   2035                                   (2 << 6) - y * dx, (3 << 6) - y * dx),
   2036                    upsample_above),
   2037                c3f),
   2038            1));
   2039      } else {
   2040        a0_x128 =
   2041            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2042        a1_x128 = _mm_srli_si128(a0_x128, 2);
   2043 
   2044        shift = _mm256_castsi128_si256(_mm_srli_epi32(
   2045            _mm_and_si128(_mm_setr_epi32(-y * dx, (1 << 6) - y * dx,
   2046                                         (2 << 6) - y * dx, (3 << 6) - y * dx),
   2047                          c3f),
   2048            1));
   2049      }
   2050      a0_x = _mm256_cvtepu16_epi32(a0_x128);
   2051      a1_x = _mm256_cvtepu16_epi32(a1_x128);
   2052    }
   2053    // y calc
   2054    __m128i a0_y, a1_y, shifty;
   2055    if (base_x < min_base_x) {
   2056      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
   2057      DECLARE_ALIGNED(32, int, base_y_c[4]);
   2058      r6 = _mm_set1_epi32(r << 6);
   2059      dy128 = _mm_set1_epi32(dy);
   2060      c1234 = _mm_setr_epi32(1, 2, 3, 4);
   2061      y_c128 = _mm_sub_epi32(r6, _mm_mullo_epi32(c1234, dy128));
   2062      base_y_c128 = _mm_srai_epi32(y_c128, frac_bits_y);
   2063      mask128 = _mm_cmpgt_epi32(min_base_y128, base_y_c128);
   2064      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
   2065      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   2066 
   2067      a0_y = _mm_setr_epi32(left[base_y_c[0]], left[base_y_c[1]],
   2068                            left[base_y_c[2]], left[base_y_c[3]]);
   2069      a1_y = _mm_setr_epi32(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
   2070                            left[base_y_c[2] + 1], left[base_y_c[3] + 1]);
   2071 
   2072      if (upsample_left) {
   2073        shifty = _mm_srli_epi32(
   2074            _mm_and_si128(_mm_slli_epi32(y_c128, upsample_left), c3f), 1);
   2075      } else {
   2076        shifty = _mm_srli_epi32(_mm_and_si128(y_c128, c3f), 1);
   2077      }
   2078      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
   2079      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
   2080      shift = _mm256_inserti128_si256(shift, shifty, 1);
   2081    }
   2082 
   2083    diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
   2084    a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
   2085    a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   2086 
   2087    b = _mm256_mullo_epi32(diff, shift);
   2088    res = _mm256_add_epi32(a32, b);
   2089    res = _mm256_srli_epi32(res, 5);
   2090 
   2091    resx = _mm256_castsi256_si128(res);
   2092    resx = _mm_packus_epi32(resx, resx);
   2093 
   2094    resy = _mm256_extracti128_si256(res, 1);
   2095    resy = _mm_packus_epi32(resy, resy);
   2096 
   2097    resxy =
   2098        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
   2099    _mm_storel_epi64((__m128i *)(dst), resxy);
   2100    dst += stride;
   2101  }
   2102 }
   2103 
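        // 16-bit variant of the Nx4 zone 2 kernel; same x/y split, but the
        // four pixels stay in 16-bit lanes so no packus is needed at the end.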
   2104 static void highbd_dr_prediction_z2_Nx4_avx2(
   2105    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   2106    const uint16_t *left, int upsample_above, int upsample_left, int dx,
   2107    int dy) {
   2108  const int min_base_x = -(1 << upsample_above);
   2109  const int min_base_y = -(1 << upsample_left);
   2110  const int frac_bits_x = 6 - upsample_above;
   2111  const int frac_bits_y = 6 - upsample_left;
   2112 
   2113  assert(dx > 0);
   2114  // pre-filter above pixels
   2115  // store in temp buffers:
   2116  //   above[x] * 32 + 16
   2117  //   above[x+1] - above[x]
   2118  // final pixels will be calculated as:
   2119  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   2120  __m256i a0_x, a1_x, a32, a16;
   2121  __m256i diff;
   2122  __m128i c3f, min_base_y128;
   2123 
   2124  a16 = _mm256_set1_epi16(16);
   2125  c3f = _mm_set1_epi16(0x3f);
   2126  min_base_y128 = _mm_set1_epi16(min_base_y);
   2127 
   2128  for (int r = 0; r < N; r++) {
   2129    __m256i b, res, shift;
   2130    __m128i resx, resy, resxy;
   2131    __m128i a0_x128, a1_x128;
   2132    int y = r + 1;
   2133    int base_x = (-y * dx) >> frac_bits_x;
   2134    int base_shift = 0;
   2135    if (base_x < (min_base_x - 1)) {
   2136      base_shift = (min_base_x - base_x - 1) >> upsample_above;
   2137    }
   2138    int base_min_diff =
   2139        (min_base_x - base_x + upsample_above) >> upsample_above;
   2140    if (base_min_diff > 4) {
   2141      base_min_diff = 4;
   2142    } else {
   2143      if (base_min_diff < 0) base_min_diff = 0;
   2144    }
   2145 
   2146    if (base_shift > 3) {
   2147      a0_x = _mm256_setzero_si256();
   2148      a1_x = _mm256_setzero_si256();
   2149      shift = _mm256_setzero_si256();
   2150    } else {
   2151      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   2152      if (upsample_above) {
   2153        a0_x128 = _mm_shuffle_epi8(a0_x128,
   2154                                   *(__m128i *)HighbdEvenOddMaskx4[base_shift]);
   2155        a1_x128 = _mm_srli_si128(a0_x128, 8);
   2156 
   2157        shift = _mm256_castsi128_si256(_mm_srli_epi16(
   2158            _mm_and_si128(
   2159                _mm_slli_epi16(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
   2160                                              (2 << 6) - y * dx,
   2161                                              (3 << 6) - y * dx, 0, 0, 0, 0),
   2162                               upsample_above),
   2163                c3f),
   2164            1));
   2165      } else {
   2166        a0_x128 =
   2167            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2168        a1_x128 = _mm_srli_si128(a0_x128, 2);
   2169 
   2170        shift = _mm256_castsi128_si256(_mm_srli_epi16(
   2171            _mm_and_si128(
   2172                _mm_setr_epi16(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
   2173                               (3 << 6) - y * dx, 0, 0, 0, 0),
   2174                c3f),
   2175            1));
   2176      }
   2177      a0_x = _mm256_castsi128_si256(a0_x128);
   2178      a1_x = _mm256_castsi128_si256(a1_x128);
   2179    }
   2180    // y calc
   2181    __m128i a0_y, a1_y, shifty;
   2182    if (base_x < min_base_x) {
   2183      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
   2184      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
   2185      r6 = _mm_set1_epi16(r << 6);
   2186      dy128 = _mm_set1_epi16(dy);
   2187      c1234 = _mm_setr_epi16(1, 2, 3, 4, 0, 0, 0, 0);
   2188      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
   2189      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
   2190      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
   2191      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
   2192      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   2193 
   2194      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
   2195                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
   2196      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
   2197                            left[base_y_c[2] + 1], left[base_y_c[3] + 1], 0, 0,
   2198                            0, 0);
   2199 
   2200      if (upsample_left) {
   2201        shifty = _mm_srli_epi16(
   2202            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
   2203      } else {
   2204        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
   2205      }
   2206      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
   2207      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
   2208      shift = _mm256_inserti128_si256(shift, shifty, 1);
   2209    }
   2210 
   2211    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
   2212    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
   2213    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   2214 
   2215    b = _mm256_mullo_epi16(diff, shift);
   2216    res = _mm256_add_epi16(a32, b);
   2217    res = _mm256_srli_epi16(res, 5);
   2218 
   2219    resx = _mm256_castsi256_si128(res);
   2220    resy = _mm256_extracti128_si256(res, 1);
   2221    resxy =
   2222        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
   2223    _mm_storel_epi64((__m128i *)(dst), resxy);
   2224    dst += stride;
   2225  }
   2226 }
   2227 
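        // Nx8 zone 2 with 32-bit intermediates: the above-edge (x) and
        // left-edge (y) results are computed in two separate interpolation
        // passes and blended per row with HighbdBaseMask.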
   2228 static void highbd_dr_prediction_32bit_z2_Nx8_avx2(
   2229    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   2230    const uint16_t *left, int upsample_above, int upsample_left, int dx,
   2231    int dy) {
   2232  const int min_base_x = -(1 << upsample_above);
   2233  const int min_base_y = -(1 << upsample_left);
   2234  const int frac_bits_x = 6 - upsample_above;
   2235  const int frac_bits_y = 6 - upsample_left;
   2236 
   2237  // pre-filter above pixels
   2238  // store in temp buffers:
   2239  //   above[x] * 32 + 16
   2240  //   above[x+1] - above[x]
   2241  // final pixels will be calculated as:
   2242  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   2243  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c3f, min_base_y256;
   2244  __m256i diff;
   2245  __m128i a0_x128, a1_x128;
   2246 
   2247  a16 = _mm256_set1_epi32(16);
   2248  c3f = _mm256_set1_epi32(0x3f);
   2249  min_base_y256 = _mm256_set1_epi32(min_base_y);
   2250 
   2251  for (int r = 0; r < N; r++) {
   2252    __m256i b, res, shift;
   2253    __m128i resx, resy, resxy;
   2254    int y = r + 1;
   2255    int base_x = (-y * dx) >> frac_bits_x;
   2256    int base_shift = 0;
   2257    if (base_x < (min_base_x - 1)) {
   2258      base_shift = (min_base_x - base_x - 1) >> upsample_above;
   2259    }
   2260    int base_min_diff =
   2261        (min_base_x - base_x + upsample_above) >> upsample_above;
   2262    if (base_min_diff > 8) {
   2263      base_min_diff = 8;
   2264    } else {
   2265      if (base_min_diff < 0) base_min_diff = 0;
   2266    }
   2267 
   2268    if (base_shift > 7) {
   2269      resx = _mm_setzero_si128();
   2270    } else {
   2271      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   2272      if (upsample_above) {
   2273        __m128i mask, atmp0, atmp1, atmp2, atmp3;
   2274        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
   2275        atmp0 = _mm_shuffle_epi8(a0_x128,
   2276                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
   2277        atmp1 = _mm_shuffle_epi8(a1_x128,
   2278                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
   2279        atmp2 = _mm_shuffle_epi8(
   2280            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
   2281        atmp3 = _mm_shuffle_epi8(
   2282            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
   2283        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
   2284                              _mm_set1_epi8(15));
   2285        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
   2286        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
   2287                              _mm_set1_epi8(15));
   2288        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
   2289        shift = _mm256_srli_epi32(
   2290            _mm256_and_si256(
   2291                _mm256_slli_epi32(
   2292                    _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx,
   2293                                      (2 << 6) - y * dx, (3 << 6) - y * dx,
   2294                                      (4 << 6) - y * dx, (5 << 6) - y * dx,
   2295                                      (6 << 6) - y * dx, (7 << 6) - y * dx),
   2296                    upsample_above),
   2297                c3f),
   2298            1);
   2299      } else {
   2300        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
   2301        a0_x128 =
   2302            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2303        a1_x128 =
   2304            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2305 
   2306        shift = _mm256_srli_epi32(
   2307            _mm256_and_si256(
   2308                _mm256_setr_epi32(-y * dx, (1 << 6) - y * dx, (2 << 6) - y * dx,
   2309                                  (3 << 6) - y * dx, (4 << 6) - y * dx,
   2310                                  (5 << 6) - y * dx, (6 << 6) - y * dx,
   2311                                  (7 << 6) - y * dx),
   2312                c3f),
   2313            1);
   2314      }
   2315      a0_x = _mm256_cvtepu16_epi32(a0_x128);
   2316      a1_x = _mm256_cvtepu16_epi32(a1_x128);
   2317 
   2318      diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
   2319      a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
   2320      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   2321 
   2322      b = _mm256_mullo_epi32(diff, shift);
   2323      res = _mm256_add_epi32(a32, b);
   2324      res = _mm256_srli_epi32(res, 5);
   2325 
   2326      resx = _mm256_castsi256_si128(_mm256_packus_epi32(
   2327          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
   2328    }
   2329    // y calc
   2330    if (base_x < min_base_x) {
   2331      DECLARE_ALIGNED(32, int, base_y_c[8]);
   2332      __m256i r6, c256, dy256, y_c256, base_y_c256, mask256;
   2333      r6 = _mm256_set1_epi32(r << 6);
   2334      dy256 = _mm256_set1_epi32(dy);
   2335      c256 = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
   2336      y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
   2337      base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
   2338      mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
   2339      base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
   2340      _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
   2341 
   2342      a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
   2343          left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
   2344          left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
   2345          left[base_y_c[6]], left[base_y_c[7]]));
   2346      a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
   2347          left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
   2348          left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
   2349          left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
   2350 
   2351      if (upsample_left) {
   2352        shift = _mm256_srli_epi32(
   2353            _mm256_and_si256(_mm256_slli_epi32((y_c256), upsample_left), c3f),
   2354            1);
   2355      } else {
   2356        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
   2357      }
   2358      diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
   2359      a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
   2360      a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   2361 
   2362      b = _mm256_mullo_epi32(diff, shift);
   2363      res = _mm256_add_epi32(a32, b);
   2364      res = _mm256_srli_epi32(res, 5);
   2365 
   2366      resy = _mm256_castsi256_si128(_mm256_packus_epi32(
   2367          res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
   2368    } else {
   2369      resy = resx;
   2370    }
   2371    resxy =
   2372        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
   2373    _mm_storeu_si128((__m128i *)(dst), resxy);
   2374    dst += stride;
   2375  }
   2376 }
   2377 
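        // 16-bit Nx8 zone 2 kernel: one shared interpolation pass, with the
        // above-edge lanes in the low 128 bits and the left-edge lanes in the
        // high 128 bits.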
   2378 static void highbd_dr_prediction_z2_Nx8_avx2(
   2379    int N, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   2380    const uint16_t *left, int upsample_above, int upsample_left, int dx,
   2381    int dy) {
   2382  const int min_base_x = -(1 << upsample_above);
   2383  const int min_base_y = -(1 << upsample_left);
   2384  const int frac_bits_x = 6 - upsample_above;
   2385  const int frac_bits_y = 6 - upsample_left;
   2386 
   2387  // pre-filter above pixels
   2388  // store in temp buffers:
   2389  //   above[x] * 32 + 16
   2390  //   above[x+1] - above[x]
   2391  // final pixels will be calculated as:
   2392  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   2393  __m128i c3f, min_base_y128;
   2394  __m256i a0_x, a1_x, diff, a32, a16;
   2395  __m128i a0_x128, a1_x128;
   2396 
   2397  a16 = _mm256_set1_epi16(16);
   2398  c3f = _mm_set1_epi16(0x3f);
   2399  min_base_y128 = _mm_set1_epi16(min_base_y);
   2400 
   2401  for (int r = 0; r < N; r++) {
   2402    __m256i b, res, shift;
   2403    __m128i resx, resy, resxy;
   2404    int y = r + 1;
   2405    int base_x = (-y * dx) >> frac_bits_x;
   2406    int base_shift = 0;
   2407    if (base_x < (min_base_x - 1)) {
   2408      base_shift = (min_base_x - base_x - 1) >> upsample_above;
   2409    }
   2410    int base_min_diff =
   2411        (min_base_x - base_x + upsample_above) >> upsample_above;
   2412    if (base_min_diff > 8) {
   2413      base_min_diff = 8;
   2414    } else {
   2415      if (base_min_diff < 0) base_min_diff = 0;
   2416    }
   2417 
   2418    if (base_shift > 7) {
   2419      a0_x = _mm256_setzero_si256();
   2420      a1_x = _mm256_setzero_si256();
   2421      shift = _mm256_setzero_si256();
   2422    } else {
   2423      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   2424      if (upsample_above) {
   2425        __m128i mask, atmp0, atmp1, atmp2, atmp3;
   2426        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 8 + base_shift));
   2427        atmp0 = _mm_shuffle_epi8(a0_x128,
   2428                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
   2429        atmp1 = _mm_shuffle_epi8(a1_x128,
   2430                                 *(__m128i *)HighbdEvenOddMaskx[base_shift]);
   2431        atmp2 = _mm_shuffle_epi8(
   2432            a0_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
   2433        atmp3 = _mm_shuffle_epi8(
   2434            a1_x128, *(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16));
   2435        mask = _mm_cmpgt_epi8(*(__m128i *)HighbdEvenOddMaskx[base_shift],
   2436                              _mm_set1_epi8(15));
   2437        a0_x128 = _mm_blendv_epi8(atmp0, atmp1, mask);
   2438        mask = _mm_cmpgt_epi8(*(__m128i *)(HighbdEvenOddMaskx[base_shift] + 16),
   2439                              _mm_set1_epi8(15));
   2440        a1_x128 = _mm_blendv_epi8(atmp2, atmp3, mask);
   2441 
   2442        shift = _mm256_castsi128_si256(_mm_srli_epi16(
   2443            _mm_and_si128(
   2444                _mm_slli_epi16(
   2445                    _mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
   2446                                   (2 << 6) - y * dx, (3 << 6) - y * dx,
   2447                                   (4 << 6) - y * dx, (5 << 6) - y * dx,
   2448                                   (6 << 6) - y * dx, (7 << 6) - y * dx),
   2449                    upsample_above),
   2450                c3f),
   2451            1));
   2452      } else {
   2453        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + 1 + base_shift));
   2454        a0_x128 =
   2455            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2456        a1_x128 =
   2457            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2458 
   2459        shift = _mm256_castsi128_si256(_mm_srli_epi16(
   2460            _mm_and_si128(_mm_setr_epi16(-y * dx, (1 << 6) - y * dx,
   2461                                         (2 << 6) - y * dx, (3 << 6) - y * dx,
   2462                                         (4 << 6) - y * dx, (5 << 6) - y * dx,
   2463                                         (6 << 6) - y * dx, (7 << 6) - y * dx),
   2464                          c3f),
   2465            1));
   2466      }
   2467      a0_x = _mm256_castsi128_si256(a0_x128);
   2468      a1_x = _mm256_castsi128_si256(a1_x128);
   2469    }
   2470 
   2471    // y calc
   2472    __m128i a0_y, a1_y, shifty;
   2473    if (base_x < min_base_x) {
   2474      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
   2475      __m128i r6, c1234, dy128, y_c128, base_y_c128, mask128;
   2476      r6 = _mm_set1_epi16(r << 6);
   2477      dy128 = _mm_set1_epi16(dy);
   2478      c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
   2479      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
   2480      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
   2481      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
   2482      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
   2483      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   2484 
   2485      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
   2486                            left[base_y_c[2]], left[base_y_c[3]],
   2487                            left[base_y_c[4]], left[base_y_c[5]],
   2488                            left[base_y_c[6]], left[base_y_c[7]]);
   2489      a1_y = _mm_setr_epi16(left[base_y_c[0] + 1], left[base_y_c[1] + 1],
   2490                            left[base_y_c[2] + 1], left[base_y_c[3] + 1],
   2491                            left[base_y_c[4] + 1], left[base_y_c[5] + 1],
   2492                            left[base_y_c[6] + 1], left[base_y_c[7] + 1]);
   2493 
   2494      if (upsample_left) {
   2495        shifty = _mm_srli_epi16(
   2496            _mm_and_si128(_mm_slli_epi16((y_c128), upsample_left), c3f), 1);
   2497      } else {
   2498        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
   2499      }
   2500      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
   2501      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
   2502      shift = _mm256_inserti128_si256(shift, shifty, 1);
   2503    }
   2504 
   2505    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
   2506    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
   2507    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   2508 
   2509    b = _mm256_mullo_epi16(diff, shift);
   2510    res = _mm256_add_epi16(a32, b);
   2511    res = _mm256_srli_epi16(res, 5);
   2512 
   2513    resx = _mm256_castsi256_si128(res);
   2514    resy = _mm256_extracti128_si256(res, 1);
   2515 
   2516    resxy =
   2517        _mm_blendv_epi8(resx, resy, *(__m128i *)HighbdBaseMask[base_min_diff]);
   2518    _mm_storeu_si128((__m128i *)(dst), resxy);
   2519    dst += stride;
   2520  }
   2521 }
   2522 
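        // General HxW zone 2 with 32-bit intermediates: each row is produced
        // in 16-pixel chunks, and each chunk splits into two 8-lane halves
        // (resx[0], resx[1]) with independent base shifts against the above
        // edge.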
   2523 static void highbd_dr_prediction_32bit_z2_HxW_avx2(
   2524    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   2525    const uint16_t *left, int upsample_above, int upsample_left, int dx,
   2526    int dy) {
   2527  // here upsample_above and upsample_left are 0 by design of
   2528  // av1_use_intra_edge_upsample
   2529  const int min_base_x = -1;
   2530  const int min_base_y = -1;
   2531  (void)upsample_above;
   2532  (void)upsample_left;
   2533  const int frac_bits_x = 6;
   2534  const int frac_bits_y = 6;
   2535 
   2536  // pre-filter above pixels
   2537  // store in temp buffers:
   2538  //   above[x] * 32 + 16
   2539  //   above[x+1] - above[x]
   2540  // final pixels will be calculated as:
   2541  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
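          // Worked example with hypothetical values: above[x] = 100,
          // above[x + 1] = 108, shift = 16 (i.e. halfway to the next sample):
          //   (100 * 32 + 16 + 8 * 16) >> 5 = 3344 >> 5 = 104,
          // the midpoint of 100 and 108.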
   2542  __m256i a0_x, a1_x, a0_y, a1_y, a32, a0_1_x, a1_1_x, a16, c1;
   2543  __m256i diff, min_base_y256, c3f, dy256, c1234, c0123, c8;
   2544  __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
   2545  DECLARE_ALIGNED(32, int, base_y_c[16]);
   2546 
   2547  a16 = _mm256_set1_epi32(16);
   2548  c1 = _mm256_srli_epi32(a16, 4);  // 16 >> 4 == 1
   2549  c8 = _mm256_srli_epi32(a16, 1);  // 16 >> 1 == 8
   2550  min_base_y256 = _mm256_set1_epi32(min_base_y);
   2551  c3f = _mm256_set1_epi32(0x3f);
   2552  dy256 = _mm256_set1_epi32(dy);
   2553  c0123 = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
   2554  c1234 = _mm256_add_epi32(c0123, c1);
   2555 
   2556  for (int r = 0; r < H; r++) {
   2557    __m256i b, res, shift, ydx;
   2558    __m256i resx[2], resy[2];
   2559    __m256i resxy, j256, r6;
   2560    for (int j = 0; j < W; j += 16) {
   2561      j256 = _mm256_set1_epi32(j);
   2562      int y = r + 1;
   2563      ydx = _mm256_set1_epi32(y * dx);
   2564 
   2565      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
   2566      int base_shift = 0;
   2567      if (base_x < (min_base_x - 1)) {
   2568        base_shift = min_base_x - base_x - 1;
   2569      }
   2570      int base_min_diff = (min_base_x - base_x);
   2571      if (base_min_diff > 16) {
   2572        base_min_diff = 16;
   2573      } else if (base_min_diff < 0) {
   2574        base_min_diff = 0;
   2575      }
   2576 
   2577      if (base_shift > 7) {
   2578        resx[0] = _mm256_setzero_si256();
   2579      } else {
   2580        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   2581        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
   2582        a0_x128 =
   2583            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2584        a1_x128 =
   2585            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2586 
   2587        a0_x = _mm256_cvtepu16_epi32(a0_x128);
   2588        a1_x = _mm256_cvtepu16_epi32(a1_x128);
   2589 
   2590        r6 = _mm256_slli_epi32(_mm256_add_epi32(c0123, j256), 6);
   2591        shift = _mm256_srli_epi32(
   2592            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
   2593 
   2594        diff = _mm256_sub_epi32(a1_x, a0_x);  // a[x+1] - a[x]
   2595        a32 = _mm256_slli_epi32(a0_x, 5);     // a[x] * 32
   2596        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   2597 
   2598        b = _mm256_mullo_epi32(diff, shift);
   2599        res = _mm256_add_epi32(a32, b);
   2600        res = _mm256_srli_epi32(res, 5);
   2601 
   2602        resx[0] = _mm256_packus_epi32(
   2603            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
   2604      }
   2605      int base_shift8 = 0;
   2606      if ((base_x + 8) < (min_base_x - 1)) {
   2607        base_shift8 = (min_base_x - (base_x + 8) - 1);
   2608      }
   2609      if (base_shift8 > 7) {
   2610        resx[1] = _mm256_setzero_si256();
   2611      } else {
   2612        a0_1_x128 =
   2613            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 8));
   2614        a1_1_x128 =
   2615            _mm_loadu_si128((__m128i *)(above + base_x + base_shift8 + 9));
   2616        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
   2617                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
   2618        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
   2619                                     *(__m128i *)HighbdLoadMaskx[base_shift8]);
   2620 
   2621        a0_1_x = _mm256_cvtepu16_epi32(a0_1_x128);
   2622        a1_1_x = _mm256_cvtepu16_epi32(a1_1_x128);
   2623 
   2624        r6 = _mm256_slli_epi32(
   2625            _mm256_add_epi32(c0123, _mm256_add_epi32(j256, c8)), 6);
   2626        shift = _mm256_srli_epi32(
   2627            _mm256_and_si256(_mm256_sub_epi32(r6, ydx), c3f), 1);
   2628 
   2629        diff = _mm256_sub_epi32(a1_1_x, a0_1_x);  // a[x+1] - a[x]
   2630        a32 = _mm256_slli_epi32(a0_1_x, 5);       // a[x] * 32
   2631        a32 = _mm256_add_epi32(a32, a16);         // a[x] * 32 + 16
   2632        b = _mm256_mullo_epi32(diff, shift);
   2633 
   2634        resx[1] = _mm256_add_epi32(a32, b);
   2635        resx[1] = _mm256_srli_epi32(resx[1], 5);
   2636        resx[1] = _mm256_packus_epi32(
   2637            resx[1],
   2638            _mm256_castsi128_si256(_mm256_extracti128_si256(resx[1], 1)));
   2639      }
   2640      resx[0] =
   2641          _mm256_inserti128_si256(resx[0], _mm256_castsi256_si128(resx[1]),
   2642                                  1);  // 16 16bit values
   2643 
   2644      // y calc
   2645      resy[0] = _mm256_setzero_si256();
   2646      if (base_x < min_base_x) {
   2647        __m256i c256, y_c256, y_c_1_256, base_y_c256, mask256;
   2648        r6 = _mm256_set1_epi32(r << 6);
   2649        c256 = _mm256_add_epi32(j256, c1234);
   2650        y_c256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
   2651        base_y_c256 = _mm256_srai_epi32(y_c256, frac_bits_y);
   2652        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
   2653        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
   2654        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
   2655        c256 = _mm256_add_epi32(c256, c8);
   2656        y_c_1_256 = _mm256_sub_epi32(r6, _mm256_mullo_epi32(c256, dy256));
   2657        base_y_c256 = _mm256_srai_epi32(y_c_1_256, frac_bits_y);
   2658        mask256 = _mm256_cmpgt_epi32(min_base_y256, base_y_c256);
   2659        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
   2660        _mm256_store_si256((__m256i *)(base_y_c + 8), base_y_c256);
   2661 
   2662        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
   2663            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
   2664            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
   2665            left[base_y_c[6]], left[base_y_c[7]]));
   2666        a1_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
   2667            left[base_y_c[0] + 1], left[base_y_c[1] + 1], left[base_y_c[2] + 1],
   2668            left[base_y_c[3] + 1], left[base_y_c[4] + 1], left[base_y_c[5] + 1],
   2669            left[base_y_c[6] + 1], left[base_y_c[7] + 1]));
   2670 
   2671        shift = _mm256_srli_epi32(_mm256_and_si256(y_c256, c3f), 1);
   2672 
   2673        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
   2674        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
   2675        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   2676 
   2677        b = _mm256_mullo_epi32(diff, shift);
   2678        res = _mm256_add_epi32(a32, b);
   2679        res = _mm256_srli_epi32(res, 5);
   2680 
   2681        resy[0] = _mm256_packus_epi32(
   2682            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
   2683 
   2684        a0_y = _mm256_cvtepu16_epi32(_mm_setr_epi16(
   2685            left[base_y_c[8]], left[base_y_c[9]], left[base_y_c[10]],
   2686            left[base_y_c[11]], left[base_y_c[12]], left[base_y_c[13]],
   2687            left[base_y_c[14]], left[base_y_c[15]]));
   2688        a1_y = _mm256_cvtepu16_epi32(
   2689            _mm_setr_epi16(left[base_y_c[8] + 1], left[base_y_c[9] + 1],
   2690                           left[base_y_c[10] + 1], left[base_y_c[11] + 1],
   2691                           left[base_y_c[12] + 1], left[base_y_c[13] + 1],
   2692                           left[base_y_c[14] + 1], left[base_y_c[15] + 1]));
   2693        shift = _mm256_srli_epi32(_mm256_and_si256(y_c_1_256, c3f), 1);
   2694 
   2695        diff = _mm256_sub_epi32(a1_y, a0_y);  // a[x+1] - a[x]
   2696        a32 = _mm256_slli_epi32(a0_y, 5);     // a[x] * 32
   2697        a32 = _mm256_add_epi32(a32, a16);     // a[x] * 32 + 16
   2698 
   2699        b = _mm256_mullo_epi32(diff, shift);
   2700        res = _mm256_add_epi32(a32, b);
   2701        res = _mm256_srli_epi32(res, 5);
   2702 
   2703        resy[1] = _mm256_packus_epi32(
   2704            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1)));
   2705 
   2706        resy[0] =
   2707            _mm256_inserti128_si256(resy[0], _mm256_castsi256_si128(resy[1]),
   2708                                    1);  // 16 16bit values
   2709      }
   2710 
   2711      resxy = _mm256_blendv_epi8(resx[0], resy[0],
   2712                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
   2713      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
   2714    }  // for j
   2715    dst += stride;
   2716  }
   2717 }
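
        // For reference, a scalar sketch of the per-pixel value the z2
        // kernels above vectorize. The helper name is illustrative, not
        // libaom API; it assumes the HxW case (no edge upsampling, so
        // min_base_x == min_base_y == -1 and both frac_bits are 6).
        static inline uint16_t highbd_z2_pixel_sketch(const uint16_t *above,
                                                      const uint16_t *left,
                                                      int r, int c, int dx,
                                                      int dy) {
          const int x = (c << 6) - (r + 1) * dx;  // above-edge pos, 1/64 pel
          const int base_x = x >> 6;
          if (base_x >= -1) {
            const int shift = (x & 0x3f) >> 1;  // fractional weight, 0..31
            return (uint16_t)((above[base_x] * 32 + 16 +
                               (above[base_x + 1] - above[base_x]) * shift) >>
                              5);
          }
          // Past the left end of the above row: sample the left column.
          const int y = (r << 6) - (c + 1) * dy;  // left-edge pos, 1/64 pel
          const int base_y = y >> 6;
          assert(base_y >= -1);  // holds by the zone 2 angle constraints
          const int shift_y = (y & 0x3f) >> 1;
          return (uint16_t)((left[base_y] * 32 + 16 +
                             (left[base_y + 1] - left[base_y]) * shift_y) >>
                            5);
        }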
   2718 
   2719 static void highbd_dr_prediction_z2_HxW_avx2(
   2720    int H, int W, uint16_t *dst, ptrdiff_t stride, const uint16_t *above,
   2721    const uint16_t *left, int upsample_above, int upsample_left, int dx,
   2722    int dy) {
   2723  // here upsample_above and upsample_left are 0 by design of
   2724  // av1_use_intra_edge_upsample
   2725  const int min_base_x = -1;
   2726  const int min_base_y = -1;
   2727  (void)upsample_above;
   2728  (void)upsample_left;
   2729  const int frac_bits_x = 6;
   2730  const int frac_bits_y = 6;
   2731 
   2732  // pre-filter above pixels
   2733  // store in temp buffers:
   2734  //   above[x] * 32 + 16
   2735  //   above[x+1] - above[x]
   2736  // final pixels will be calculated as:
   2737  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   2738  __m256i a0_x, a1_x, a32, a16, c3f, c1;
   2739  __m256i diff, min_base_y256, dy256, c1234, c0123;
   2740  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
   2741 
   2742  a16 = _mm256_set1_epi16(16);
   2743  c1 = _mm256_srli_epi16(a16, 4);  // 16 >> 4 == 1
   2744  min_base_y256 = _mm256_set1_epi16(min_base_y);
   2745  c3f = _mm256_set1_epi16(0x3f);
   2746  dy256 = _mm256_set1_epi16(dy);
   2747  c0123 =
   2748      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
   2749  c1234 = _mm256_add_epi16(c0123, c1);
   2750 
   2751  for (int r = 0; r < H; r++) {
   2752    __m256i b, res, shift;
   2753    __m256i resx, resy, ydx;
   2754    __m256i resxy, j256, r6;
   2755    __m128i a0_x128, a1_x128, a0_1_x128, a1_1_x128;
   2756    int y = r + 1;
   2757    ydx = _mm256_set1_epi16((short)(y * dx));
   2758 
   2759    for (int j = 0; j < W; j += 16) {
   2760      j256 = _mm256_set1_epi16(j);
   2761      int base_x = ((j << 6) - y * dx) >> frac_bits_x;
   2762      int base_shift = 0;
   2763      if (base_x < (min_base_x - 1)) {
   2764        base_shift = min_base_x - base_x - 1;
   2765      }
   2766      int base_min_diff = (min_base_x - base_x);
   2767      if (base_min_diff > 16) {
   2768        base_min_diff = 16;
   2769      } else if (base_min_diff < 0) {
   2770        base_min_diff = 0;
   2771      }
   2772 
   2773      if (base_shift < 8) {
   2774        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   2775        a1_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1));
   2776        a0_x128 =
   2777            _mm_shuffle_epi8(a0_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2778        a1_x128 =
   2779            _mm_shuffle_epi8(a1_x128, *(__m128i *)HighbdLoadMaskx[base_shift]);
   2780 
   2781        a0_x = _mm256_castsi128_si256(a0_x128);
   2782        a1_x = _mm256_castsi128_si256(a1_x128);
   2783      } else {
   2784        a0_x = _mm256_setzero_si256();
   2785        a1_x = _mm256_setzero_si256();
   2786      }
   2787 
   2788      int base_shift1 = 0;
   2789      if (base_shift > 8) {
   2790        base_shift1 = base_shift - 8;
   2791      }
   2792      if (base_shift1 < 8) {
   2793        a0_1_x128 =
   2794            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 8));
   2795        a1_1_x128 =
   2796            _mm_loadu_si128((__m128i *)(above + base_x + base_shift1 + 9));
   2797        a0_1_x128 = _mm_shuffle_epi8(a0_1_x128,
   2798                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
   2799        a1_1_x128 = _mm_shuffle_epi8(a1_1_x128,
   2800                                     *(__m128i *)HighbdLoadMaskx[base_shift1]);
   2801 
   2802        a0_x = _mm256_inserti128_si256(a0_x, a0_1_x128, 1);
   2803        a1_x = _mm256_inserti128_si256(a1_x, a1_1_x128, 1);
   2804      }
   2805      r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
   2806      shift = _mm256_srli_epi16(
   2807          _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
   2808 
   2809      diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
   2810      a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
   2811      a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   2812 
   2813      b = _mm256_mullo_epi16(diff, shift);
   2814      res = _mm256_add_epi16(a32, b);
   2815      resx = _mm256_srli_epi16(res, 5);  // 16 16-bit values
   2816 
   2817      // y calc
   2818      resy = _mm256_setzero_si256();
   2819      __m256i a0_y, a1_y, shifty;
   2820      if (base_x < min_base_x) {
   2821        __m256i c256, y_c256, base_y_c256, mask256, mul16;
   2822        r6 = _mm256_set1_epi16(r << 6);
   2823        c256 = _mm256_add_epi16(j256, c1234);
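                // c256 * dy can overflow 16 bits for large j; min_base_y256 is
                // all ones, so srli by 1 yields 0x7fff, and the unsigned min
                // clamps the product before the signed subtract below.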
   2824        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
   2825                                 _mm256_srli_epi16(min_base_y256, 1));
   2826        y_c256 = _mm256_sub_epi16(r6, mul16);
   2827        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
   2828        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
   2829        base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
   2830        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
   2831 
   2832        a0_y = _mm256_setr_epi16(
   2833            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
   2834            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
   2835            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
   2836            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
   2837            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
   2838            left[base_y_c[15]]);
   2839        base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
   2840        _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
   2841 
   2842        a1_y = _mm256_setr_epi16(
   2843            left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
   2844            left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
   2845            left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
   2846            left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
   2847            left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
   2848            left[base_y_c[15]]);
   2849 
   2850        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
   2851 
   2852        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
   2853        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
   2854        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   2855 
   2856        b = _mm256_mullo_epi16(diff, shifty);
   2857        res = _mm256_add_epi16(a32, b);
   2858        resy = _mm256_srli_epi16(res, 5);
   2859      }
   2860 
   2861      resxy = _mm256_blendv_epi8(resx, resy,
   2862                                 *(__m256i *)HighbdBaseMask[base_min_diff]);
   2863      _mm256_storeu_si256((__m256i *)(dst + j), resxy);
   2864    }  // for j
   2865    dst += stride;
   2866  }
   2867 }
   2868 
   2869 // Directional prediction, zone 2: 90 < angle < 180
   2870 void av1_highbd_dr_prediction_z2_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
   2871                                      int bh, const uint16_t *above,
   2872                                      const uint16_t *left, int upsample_above,
   2873                                      int upsample_left, int dx, int dy,
   2874                                      int bd) {
   2876  assert(dx > 0);
   2877  assert(dy > 0);
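          // bd < 12 can use the 16-bit kernels: above[x] * 32 peaks at
          // 1023 * 32 = 32736. At bd == 12 it peaks at 4095 * 32 = 131040,
          // which overflows 16-bit lanes, hence the 32-bit variants.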
   2878  switch (bw) {
   2879    case 4:
   2880      if (bd < 12) {
   2881        highbd_dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left,
   2882                                         upsample_above, upsample_left, dx, dy);
   2883      } else {
   2884        highbd_dr_prediction_32bit_z2_Nx4_avx2(bh, dst, stride, above, left,
   2885                                               upsample_above, upsample_left,
   2886                                               dx, dy);
   2887      }
   2888      break;
   2889    case 8:
   2890      if (bd < 12) {
   2891        highbd_dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left,
   2892                                         upsample_above, upsample_left, dx, dy);
   2893      } else {
   2894        highbd_dr_prediction_32bit_z2_Nx8_avx2(bh, dst, stride, above, left,
   2895                                               upsample_above, upsample_left,
   2896                                               dx, dy);
   2897      }
   2898      break;
   2899    default:
   2900      if (bd < 12) {
   2901        highbd_dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
   2902                                         upsample_above, upsample_left, dx, dy);
   2903      } else {
   2904        highbd_dr_prediction_32bit_z2_HxW_avx2(bh, bw, dst, stride, above, left,
   2905                                               upsample_above, upsample_left,
   2906                                               dx, dy);
   2907      }
   2908      break;
   2909  }
   2910 }
   2911 
   2912 // Directional prediction, zone 3: 180 < angle < 270. Implemented as a
        // zone 1 sweep along the left edge followed by a transpose into dst.
   2913 static void highbd_dr_prediction_z3_4x4_avx2(uint16_t *dst, ptrdiff_t stride,
   2914                                             const uint16_t *left,
   2915                                             int upsample_left, int dy,
   2916                                             int bd) {
   2917  __m128i dstvec[4], d[4];
   2918  if (bd < 12) {
   2919    highbd_dr_prediction_z1_4xN_internal_avx2(4, dstvec, left, upsample_left,
   2920                                              dy);
   2921  } else {
   2922    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(4, dstvec, left,
   2923                                                    upsample_left, dy);
   2924  }
   2925  highbd_transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2],
   2926                                   &dstvec[3], &d[0], &d[1], &d[2], &d[3]);
   2927  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
   2928  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
   2929  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
   2930  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
   2932 }
   2933 
   2934 static void highbd_dr_prediction_z3_8x8_avx2(uint16_t *dst, ptrdiff_t stride,
   2935                                             const uint16_t *left,
   2936                                             int upsample_left, int dy,
   2937                                             int bd) {
   2938  __m128i dstvec[8], d[8];
   2939  if (bd < 12) {
   2940    highbd_dr_prediction_z1_8xN_internal_avx2(8, dstvec, left, upsample_left,
   2941                                              dy);
   2942  } else {
   2943    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(8, dstvec, left,
   2944                                                    upsample_left, dy);
   2945  }
   2946  highbd_transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
   2947                           &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
   2948                           &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
   2949                           &d[7]);
   2950  for (int i = 0; i < 8; i++) {
   2951    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   2952  }
   2953 }
   2954 
   2955 static void highbd_dr_prediction_z3_4x8_avx2(uint16_t *dst, ptrdiff_t stride,
   2956                                             const uint16_t *left,
   2957                                             int upsample_left, int dy,
   2958                                             int bd) {
   2959  __m128i dstvec[4], d[8];
   2960  if (bd < 12) {
   2961    highbd_dr_prediction_z1_8xN_internal_avx2(4, dstvec, left, upsample_left,
   2962                                              dy);
   2963  } else {
   2964    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(4, dstvec, left,
   2965                                                    upsample_left, dy);
   2966  }
   2967 
   2968  highbd_transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
   2969                               &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
   2970                               &d[7]);
   2971  for (int i = 0; i < 8; i++) {
   2972    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
   2973  }
   2974 }
   2975 
   2976 static void highbd_dr_prediction_z3_8x4_avx2(uint16_t *dst, ptrdiff_t stride,
   2977                                             const uint16_t *left,
   2978                                             int upsample_left, int dy,
   2979                                             int bd) {
   2980  __m128i dstvec[8], d[4];
   2981  if (bd < 12) {
   2982    highbd_dr_prediction_z1_4xN_internal_avx2(8, dstvec, left, upsample_left,
   2983                                              dy);
   2984  } else {
   2985    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(8, dstvec, left,
   2986                                                    upsample_left, dy);
   2987  }
   2988 
   2989  highbd_transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
   2990                               &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7],
   2991                               &d[0], &d[1], &d[2], &d[3]);
   2992  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
   2993  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[1]);
   2994  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[2]);
   2995  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[3]);
   2996 }
   2997 
   2998 static void highbd_dr_prediction_z3_8x16_avx2(uint16_t *dst, ptrdiff_t stride,
   2999                                              const uint16_t *left,
   3000                                              int upsample_left, int dy,
   3001                                              int bd) {
   3002  __m256i dstvec[8], d[8];
   3003  if (bd < 12) {
   3004    highbd_dr_prediction_z1_16xN_internal_avx2(8, dstvec, left, upsample_left,
   3005                                               dy);
   3006  } else {
   3007    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(8, dstvec, left,
   3008                                                     upsample_left, dy);
   3009  }
   3010  highbd_transpose8x16_16x8_avx2(dstvec, d);
   3011  for (int i = 0; i < 8; i++) {
   3012    _mm_storeu_si128((__m128i *)(dst + i * stride),
   3013                     _mm256_castsi256_si128(d[i]));
   3014  }
   3015  for (int i = 8; i < 16; i++) {
   3016    _mm_storeu_si128((__m128i *)(dst + i * stride),
   3017                     _mm256_extracti128_si256(d[i - 8], 1));
   3018  }
   3019 }
   3020 
   3021 static void highbd_dr_prediction_z3_16x8_avx2(uint16_t *dst, ptrdiff_t stride,
   3022                                              const uint16_t *left,
   3023                                              int upsample_left, int dy,
   3024                                              int bd) {
   3025  __m128i dstvec[16], d[16];
   3026  if (bd < 12) {
   3027    highbd_dr_prediction_z1_8xN_internal_avx2(16, dstvec, left, upsample_left,
   3028                                              dy);
   3029  } else {
   3030    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(16, dstvec, left,
   3031                                                    upsample_left, dy);
   3032  }
   3033  for (int i = 0; i < 16; i += 8) {
   3034    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
   3035                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
   3036                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
   3037                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
   3038                             &d[5 + i], &d[6 + i], &d[7 + i]);
   3039  }
   3040  for (int i = 0; i < 8; i++) {
   3041    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   3042    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
   3043  }
   3044 }
   3045 
   3046 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   3047 static void highbd_dr_prediction_z3_4x16_avx2(uint16_t *dst, ptrdiff_t stride,
   3048                                              const uint16_t *left,
   3049                                              int upsample_left, int dy,
   3050                                              int bd) {
   3051  __m256i dstvec[4], d[4], d1;
   3052  if (bd < 12) {
   3053    highbd_dr_prediction_z1_16xN_internal_avx2(4, dstvec, left, upsample_left,
   3054                                               dy);
   3055  } else {
   3056    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(4, dstvec, left,
   3057                                                     upsample_left, dy);
   3058  }
   3059  highbd_transpose4x16_avx2(dstvec, d);
   3060  for (int i = 0; i < 4; i++) {
   3061    _mm_storel_epi64((__m128i *)(dst + i * stride),
   3062                     _mm256_castsi256_si128(d[i]));
   3063    d1 = _mm256_bsrli_epi128(d[i], 8);
   3064    _mm_storel_epi64((__m128i *)(dst + (i + 4) * stride),
   3065                     _mm256_castsi256_si128(d1));
   3066    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
   3067                     _mm256_extracti128_si256(d[i], 1));
   3068    _mm_storel_epi64((__m128i *)(dst + (i + 12) * stride),
   3069                     _mm256_extracti128_si256(d1, 1));
   3070  }
   3071 }
   3072 
   3073 static void highbd_dr_prediction_z3_16x4_avx2(uint16_t *dst, ptrdiff_t stride,
   3074                                              const uint16_t *left,
   3075                                              int upsample_left, int dy,
   3076                                              int bd) {
   3077  __m128i dstvec[16], d[8];
   3078  if (bd < 12) {
   3079    highbd_dr_prediction_z1_4xN_internal_avx2(16, dstvec, left, upsample_left,
   3080                                              dy);
   3081  } else {
   3082    highbd_dr_prediction_32bit_z1_4xN_internal_avx2(16, dstvec, left,
   3083                                                    upsample_left, dy);
   3084  }
   3085  highbd_transpose16x4_8x8_sse2(dstvec, d);
   3086 
   3087  _mm_storeu_si128((__m128i *)(dst + 0 * stride), d[0]);
   3088  _mm_storeu_si128((__m128i *)(dst + 0 * stride + 8), d[1]);
   3089  _mm_storeu_si128((__m128i *)(dst + 1 * stride), d[2]);
   3090  _mm_storeu_si128((__m128i *)(dst + 1 * stride + 8), d[3]);
   3091  _mm_storeu_si128((__m128i *)(dst + 2 * stride), d[4]);
   3092  _mm_storeu_si128((__m128i *)(dst + 2 * stride + 8), d[5]);
   3093  _mm_storeu_si128((__m128i *)(dst + 3 * stride), d[6]);
   3094  _mm_storeu_si128((__m128i *)(dst + 3 * stride + 8), d[7]);
   3095 }
   3096 
   3097 static void highbd_dr_prediction_z3_8x32_avx2(uint16_t *dst, ptrdiff_t stride,
   3098                                              const uint16_t *left,
   3099                                              int upsample_left, int dy,
   3100                                              int bd) {
   3101  __m256i dstvec[16], d[16];
   3102  if (bd < 12) {
   3103    highbd_dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left,
   3104                                               dy);
   3105  } else {
   3106    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(8, dstvec, left,
   3107                                                     upsample_left, dy);
   3108  }
   3109 
   3110  for (int i = 0; i < 16; i += 8) {
   3111    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
   3112  }
   3113 
   3114  for (int i = 0; i < 8; i++) {
   3115    _mm_storeu_si128((__m128i *)(dst + i * stride),
   3116                     _mm256_castsi256_si128(d[i]));
   3117  }
   3118  for (int i = 0; i < 8; i++) {
   3119    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
   3120                     _mm256_extracti128_si256(d[i], 1));
   3121  }
   3122  for (int i = 8; i < 16; i++) {
   3123    _mm_storeu_si128((__m128i *)(dst + (i + 8) * stride),
   3124                     _mm256_castsi256_si128(d[i]));
   3125  }
   3126  for (int i = 8; i < 16; i++) {
   3127    _mm_storeu_si128((__m128i *)(dst + (i + 16) * stride),
   3128                     _mm256_extracti128_si256(d[i], 1));
   3129  }
   3130 }
   3131 
   3132 static void highbd_dr_prediction_z3_32x8_avx2(uint16_t *dst, ptrdiff_t stride,
   3133                                              const uint16_t *left,
   3134                                              int upsample_left, int dy,
   3135                                              int bd) {
   3136  __m128i dstvec[32], d[32];
   3137  if (bd < 12) {
   3138    highbd_dr_prediction_z1_8xN_internal_avx2(32, dstvec, left, upsample_left,
   3139                                              dy);
   3140  } else {
   3141    highbd_dr_prediction_32bit_z1_8xN_internal_avx2(32, dstvec, left,
   3142                                                    upsample_left, dy);
   3143  }
   3144 
   3145  for (int i = 0; i < 32; i += 8) {
   3146    highbd_transpose8x8_sse2(&dstvec[0 + i], &dstvec[1 + i], &dstvec[2 + i],
   3147                             &dstvec[3 + i], &dstvec[4 + i], &dstvec[5 + i],
   3148                             &dstvec[6 + i], &dstvec[7 + i], &d[0 + i],
   3149                             &d[1 + i], &d[2 + i], &d[3 + i], &d[4 + i],
   3150                             &d[5 + i], &d[6 + i], &d[7 + i]);
   3151  }
   3152  for (int i = 0; i < 8; i++) {
   3153    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   3154    _mm_storeu_si128((__m128i *)(dst + i * stride + 8), d[i + 8]);
   3155    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 16]);
   3156    _mm_storeu_si128((__m128i *)(dst + i * stride + 24), d[i + 24]);
   3157  }
   3158 }
   3159 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   3160 
   3161 static void highbd_dr_prediction_z3_16x16_avx2(uint16_t *dst, ptrdiff_t stride,
   3162                                               const uint16_t *left,
   3163                                               int upsample_left, int dy,
   3164                                               int bd) {
   3165  __m256i dstvec[16], d[16];
   3166  if (bd < 12) {
   3167    highbd_dr_prediction_z1_16xN_internal_avx2(16, dstvec, left, upsample_left,
   3168                                               dy);
   3169  } else {
   3170    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(16, dstvec, left,
   3171                                                     upsample_left, dy);
   3172  }
   3173 
   3174  highbd_transpose16x16_avx2(dstvec, d);
   3175 
   3176  for (int i = 0; i < 16; i++) {
   3177    _mm256_storeu_si256((__m256i *)(dst + i * stride), d[i]);
   3178  }
   3179 }
   3180 
   3181 static void highbd_dr_prediction_z3_32x32_avx2(uint16_t *dst, ptrdiff_t stride,
   3182                                               const uint16_t *left,
   3183                                               int upsample_left, int dy,
   3184                                               int bd) {
   3185  __m256i dstvec[64], d[16];
   3186  if (bd < 12) {
   3187    highbd_dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left,
   3188                                               dy);
   3189  } else {
   3190    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(32, dstvec, left,
   3191                                                     upsample_left, dy);
   3192  }
   3193  highbd_transpose16x16_avx2(dstvec, d);
   3194  for (int j = 0; j < 16; j++) {
   3195    _mm256_storeu_si256((__m256i *)(dst + j * stride), d[j]);
   3196  }
   3197  highbd_transpose16x16_avx2(dstvec + 16, d);
   3198  for (int j = 0; j < 16; j++) {
   3199    _mm256_storeu_si256((__m256i *)(dst + j * stride + 16), d[j]);
   3200  }
   3201  highbd_transpose16x16_avx2(dstvec + 32, d);
   3202  for (int j = 0; j < 16; j++) {
   3203    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride), d[j]);
   3204  }
   3205  highbd_transpose16x16_avx2(dstvec + 48, d);
   3206  for (int j = 0; j < 16; j++) {
   3207    _mm256_storeu_si256((__m256i *)(dst + (j + 16) * stride + 16), d[j]);
   3208  }
   3209 }
   3210 
   3211 static void highbd_dr_prediction_z3_64x64_avx2(uint16_t *dst, ptrdiff_t stride,
   3212                                               const uint16_t *left,
   3213                                               int upsample_left, int dy,
   3214                                               int bd) {
   3215  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 64]);
   3216  if (bd < 12) {
   3217    highbd_dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
   3218  } else {
   3219    highbd_dr_prediction_32bit_z1_64xN_avx2(64, dstT, 64, left, upsample_left,
   3220                                            dy);
   3221  }
   3222  highbd_transpose(dstT, 64, dst, stride, 64, 64);
   3223 }
   3224 
   3225 static void highbd_dr_prediction_z3_16x32_avx2(uint16_t *dst, ptrdiff_t stride,
   3226                                               const uint16_t *left,
   3227                                               int upsample_left, int dy,
   3228                                               int bd) {
   3229  __m256i dstvec[32], d[32];
   3230  if (bd < 12) {
   3231    highbd_dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left,
   3232                                               dy);
   3233  } else {
   3234    highbd_dr_prediction_32bit_z1_32xN_internal_avx2(16, dstvec, left,
   3235                                                     upsample_left, dy);
   3236  }
   3237  for (int i = 0; i < 32; i += 8) {
   3238    highbd_transpose8x16_16x8_avx2(dstvec + i, d + i);
   3239  }
   3240  // store
   3241  for (int j = 0; j < 32; j += 16) {
   3242    for (int i = 0; i < 8; i++) {
   3243      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride),
   3244                       _mm256_castsi256_si128(d[(i + j)]));
   3245    }
   3246    for (int i = 0; i < 8; i++) {
   3247      _mm_storeu_si128((__m128i *)(dst + (i + j) * stride + 8),
   3248                       _mm256_castsi256_si128(d[(i + j) + 8]));
   3249    }
   3250    for (int i = 8; i < 16; i++) {
   3251      _mm256_storeu_si256(
   3252          (__m256i *)(dst + (i + j) * stride),
   3253          _mm256_inserti128_si256(
   3254              d[(i + j)], _mm256_extracti128_si256(d[(i + j) - 8], 1), 0));
   3255    }
   3256  }
   3257 }
   3258 
   3259 static void highbd_dr_prediction_z3_32x16_avx2(uint16_t *dst, ptrdiff_t stride,
   3260                                               const uint16_t *left,
   3261                                               int upsample_left, int dy,
   3262                                               int bd) {
   3263  __m256i dstvec[32], d[16];
   3264  if (bd < 12) {
   3265    highbd_dr_prediction_z1_16xN_internal_avx2(32, dstvec, left, upsample_left,
   3266                                               dy);
   3267  } else {
   3268    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(32, dstvec, left,
   3269                                                     upsample_left, dy);
   3270  }
   3271  for (int i = 0; i < 32; i += 16) {
   3272    highbd_transpose16x16_avx2((dstvec + i), d);
   3273    for (int j = 0; j < 16; j++) {
   3274      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
   3275    }
   3276  }
   3277 }
   3278 
   3279 static void highbd_dr_prediction_z3_32x64_avx2(uint16_t *dst, ptrdiff_t stride,
   3280                                               const uint16_t *left,
   3281                                               int upsample_left, int dy,
   3282                                               int bd) {
   3283  uint16_t dstT[64 * 32];
   3284  if (bd < 12) {
   3285    highbd_dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
   3286  } else {
   3287    highbd_dr_prediction_32bit_z1_64xN_avx2(32, dstT, 64, left, upsample_left,
   3288                                            dy);
   3289  }
   3290  highbd_transpose(dstT, 64, dst, stride, 32, 64);
   3291 }
   3292 
   3293 static void highbd_dr_prediction_z3_64x32_avx2(uint16_t *dst, ptrdiff_t stride,
   3294                                               const uint16_t *left,
   3295                                               int upsample_left, int dy,
   3296                                               int bd) {
   3297  DECLARE_ALIGNED(16, uint16_t, dstT[32 * 64]);
   3298  highbd_dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy, bd);
   3299  highbd_transpose(dstT, 32, dst, stride, 64, 32);
   3301 }
   3302 
   3303 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   3304 static void highbd_dr_prediction_z3_16x64_avx2(uint16_t *dst, ptrdiff_t stride,
   3305                                               const uint16_t *left,
   3306                                               int upsample_left, int dy,
   3307                                               int bd) {
   3308  DECLARE_ALIGNED(16, uint16_t, dstT[64 * 16]);
   3309  if (bd < 12) {
   3310    highbd_dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
   3311  } else {
   3312    highbd_dr_prediction_32bit_z1_64xN_avx2(16, dstT, 64, left, upsample_left,
   3313                                            dy);
   3314  }
   3315  highbd_transpose(dstT, 64, dst, stride, 16, 64);
   3316 }
   3317 
   3318 static void highbd_dr_prediction_z3_64x16_avx2(uint16_t *dst, ptrdiff_t stride,
   3319                                               const uint16_t *left,
   3320                                               int upsample_left, int dy,
   3321                                               int bd) {
   3322  __m256i dstvec[64], d[16];
   3323  if (bd < 12) {
   3324    highbd_dr_prediction_z1_16xN_internal_avx2(64, dstvec, left, upsample_left,
   3325                                               dy);
   3326  } else {
   3327    highbd_dr_prediction_32bit_z1_16xN_internal_avx2(64, dstvec, left,
   3328                                                     upsample_left, dy);
   3329  }
   3330  for (int i = 0; i < 64; i += 16) {
   3331    highbd_transpose16x16_avx2((dstvec + i), d);
   3332    for (int j = 0; j < 16; j++) {
   3333      _mm256_storeu_si256((__m256i *)(dst + j * stride + i), d[j]);
   3334    }
   3335  }
   3336 }
   3337 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   3338 
   3339 void av1_highbd_dr_prediction_z3_avx2(uint16_t *dst, ptrdiff_t stride, int bw,
   3340                                      int bh, const uint16_t *above,
   3341                                      const uint16_t *left, int upsample_left,
   3342                                      int dx, int dy, int bd) {
   3343  (void)above;
   3344  (void)dx;
   3345 
   3346  assert(dx == 1);
   3347  assert(dy > 0);
   3348  if (bw == bh) {
   3349    switch (bw) {
   3350      case 4:
   3351        highbd_dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy,
   3352                                         bd);
   3353        break;
   3354      case 8:
   3355        highbd_dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy,
   3356                                         bd);
   3357        break;
   3358      case 16:
   3359        highbd_dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy,
   3360                                           bd);
   3361        break;
   3362      case 32:
   3363        highbd_dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy,
   3364                                           bd);
   3365        break;
   3366      case 64:
   3367        highbd_dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy,
   3368                                           bd);
   3369        break;
   3370    }
   3371  } else {
   3372    if (bw < bh) {
   3373      if (bw + bw == bh) {
   3374        switch (bw) {
   3375          case 4:
   3376            highbd_dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left,
   3377                                             dy, bd);
   3378            break;
   3379          case 8:
   3380            highbd_dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left,
   3381                                              dy, bd);
   3382            break;
   3383          case 16:
   3384            highbd_dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left,
   3385                                               dy, bd);
   3386            break;
   3387          case 32:
   3388            highbd_dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left,
   3389                                               dy, bd);
   3390            break;
   3391        }
   3392      } else {
   3393        switch (bw) {
   3394 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   3395          case 4:
   3396            highbd_dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left,
   3397                                              dy, bd);
   3398            break;
   3399          case 8:
   3400            highbd_dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left,
   3401                                              dy, bd);
   3402            break;
   3403          case 16:
   3404            highbd_dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left,
   3405                                               dy, bd);
   3406            break;
   3407 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   3408        }
   3409      }
   3410    } else {
   3411      if (bh + bh == bw) {
   3412        switch (bh) {
   3413          case 4:
   3414            highbd_dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left,
   3415                                             dy, bd);
   3416            break;
   3417          case 8:
   3418            highbd_dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left,
   3419                                              dy, bd);
   3420            break;
   3421          case 16:
   3422            highbd_dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left,
   3423                                               dy, bd);
   3424            break;
   3425          case 32:
   3426            highbd_dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left,
   3427                                               dy, bd);
   3428            break;
   3429        }
   3430      } else {
   3431        switch (bh) {
   3432 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   3433          case 4:
   3434            highbd_dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left,
   3435                                              dy, bd);
   3436            break;
   3437          case 8:
   3438            highbd_dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left,
   3439                                              dy, bd);
   3440            break;
   3441          case 16:
   3442            highbd_dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left,
   3443                                               dy, bd);
   3444            break;
   3445 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   3446        }
   3447      }
   3448    }
   3449  }
   3451 }
   3452 #endif  // CONFIG_AV1_HIGHBITDEPTH
   3453 
   3454 // Low bit depth functions
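        // BaseMask[k] has 0xff in its first k bytes and zero elsewhere;
        // blending with it keeps the first k computed pixels of a row and
        // takes the replicated above[max_base_x] sample for the rest.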
   3455 static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = {
   3456  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   3457    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   3458  { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   3459    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   3460  { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   3461    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   3462  { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   3463    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   3464  { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   3465    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   3466  { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   3467    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   3468  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   3469    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
   3470  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   3471    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0, 0 },
   3472  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0,
   3473    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0, 0 },
   3474  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
   3475    0,    0,    0,    0,    0,    0,    0,    0,    0,    0, 0, 0, 0, 0, 0, 0 },
   3476  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
   3477    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
   3478    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3479  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3480    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
   3481    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3482  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3483    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
   3484    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3485  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3486    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0,
   3487    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3488  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3489    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0,
   3490    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3491  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3492    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,
   3493    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3494  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3495    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,
   3496    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3497  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3498    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,
   3499    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3500  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3501    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,
   3502    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3503  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3504    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,
   3505    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3506  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3507    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,
   3508    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3509  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3510    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
   3511    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3512  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3513    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3514    0,    0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3515  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3516    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3517    0xff, 0,    0,    0,    0,    0,    0,    0,    0,    0 },
   3518  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3519    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3520    0xff, 0xff, 0,    0,    0,    0,    0,    0,    0,    0 },
   3521  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3522    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3523    0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0,    0 },
   3524  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3525    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3526    0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0,    0 },
   3527  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3528    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3529    0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0,    0 },
   3530  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3531    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3532    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0,    0 },
   3533  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3534    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3535    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0,    0 },
   3536  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3537    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3538    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,    0 },
   3539  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3540    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3541    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0 },
   3542  { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3543    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
   3544    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff },
   3545 };
   3546 
   3547 /* clang-format on */
   3548 static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_avx2(
   3549    int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
   3550    int dx) {
   3551  const int frac_bits = 6 - upsample_above;
   3552  const int max_base_x = ((W + H) - 1) << upsample_above;
   3553 
   3554  assert(dx > 0);
   3555  // pre-filter above pixels
   3556  // store in temp buffers:
   3557  //   above[x] * 32 + 16
   3558  //   above[x+1] - above[x]
   3559  // final pixels will be calculated as:
   3560  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   3561  __m256i a0, a1, a32, a16;
   3562  __m256i diff, c3f;
   3563  __m128i a_mbase_x;
   3564 
   3565  a16 = _mm256_set1_epi16(16);
   3566  a_mbase_x = _mm_set1_epi8((int8_t)above[max_base_x]);
   3567  c3f = _mm256_set1_epi16(0x3f);
   3568 
   3569  int x = dx;
   3570  for (int r = 0; r < W; r++) {
   3571    __m256i b, res, shift;
   3572    __m128i res1, a0_128, a1_128;
   3573 
   3574    int base = x >> frac_bits;
   3575    int base_max_diff = (max_base_x - base) >> upsample_above;
   3576    if (base_max_diff <= 0) {
   3577      for (int i = r; i < W; ++i) {
   3578        dst[i] = a_mbase_x;  // save 4 values
   3579      }
   3580      return;
   3581    }
   3582    if (base_max_diff > H) base_max_diff = H;
   3583    a0_128 = _mm_loadu_si128((__m128i *)(above + base));
   3584    a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1));
   3585 
   3586    if (upsample_above) {
   3587      a0_128 = _mm_shuffle_epi8(a0_128, *(__m128i *)EvenOddMaskx[0]);
   3588      a1_128 = _mm_srli_si128(a0_128, 8);
   3589 
   3590      shift = _mm256_srli_epi16(
   3591          _mm256_and_si256(
   3592              _mm256_slli_epi16(_mm256_set1_epi16(x), upsample_above), c3f),
   3593          1);
   3594    } else {
   3595      shift = _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
   3596    }
   3597    a0 = _mm256_cvtepu8_epi16(a0_128);
   3598    a1 = _mm256_cvtepu8_epi16(a1_128);
   3599 
   3600    diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
   3601    a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
   3602    a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
   3603 
   3604    b = _mm256_mullo_epi16(diff, shift);
   3605    res = _mm256_add_epi16(a32, b);
   3606    res = _mm256_srli_epi16(res, 5);
   3607 
   3608    res = _mm256_packus_epi16(
   3609        res, _mm256_castsi128_si256(
   3610                 _mm256_extracti128_si256(res, 1)));  // narrow to 8 bit
   3611    res1 = _mm256_castsi256_si128(res);               // 16 8bit values
   3612 
   3613    dst[r] =
   3614        _mm_blendv_epi8(a_mbase_x, res1, *(__m128i *)BaseMask[base_max_diff]);
   3615    x += dx;
   3616  }
   3617 }
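
        // For reference, a scalar sketch of one predicted pixel; the helper
        // name is illustrative, not libaom API. x is (r + 1) * dx, the row's
        // position on the above edge in 1/64 pel units before upsampling.
        static inline uint8_t z1_pixel_sketch(const uint8_t *above, int x,
                                              int c, int upsample_above,
                                              int max_base_x) {
          const int frac_bits = 6 - upsample_above;
          const int base = (x >> frac_bits) + (c << upsample_above);
          if (base >= max_base_x) return above[max_base_x];  // off the edge
          const int shift = ((x << upsample_above) & 0x3f) >> 1;  // 0..31
          return (uint8_t)((above[base] * 32 + 16 +
                            (above[base + 1] - above[base]) * shift) >> 5);
        }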
   3618 
   3619 static void dr_prediction_z1_4xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3620                                      const uint8_t *above, int upsample_above,
   3621                                      int dx) {
   3622  __m128i dstvec[16];
   3623 
   3624  dr_prediction_z1_HxW_internal_avx2(4, N, dstvec, above, upsample_above, dx);
   3625  for (int i = 0; i < N; i++) {
   3626    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
   3627  }
   3628 }
   3629 
   3630 static void dr_prediction_z1_8xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3631                                      const uint8_t *above, int upsample_above,
   3632                                      int dx) {
   3633  __m128i dstvec[32];
   3634 
   3635  dr_prediction_z1_HxW_internal_avx2(8, N, dstvec, above, upsample_above, dx);
   3636  for (int i = 0; i < N; i++) {
   3637    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
   3638  }
   3639 }
   3640 
   3641 static void dr_prediction_z1_16xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3642                                       const uint8_t *above, int upsample_above,
   3643                                       int dx) {
   3644  __m128i dstvec[64];
   3645 
   3646  dr_prediction_z1_HxW_internal_avx2(16, N, dstvec, above, upsample_above, dx);
   3647  for (int i = 0; i < N; i++) {
   3648    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
   3649  }
   3650 }
   3651 
   3652 static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_avx2(
   3653    int N, __m256i *dstvec, const uint8_t *above, int upsample_above, int dx) {
   3654  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
   3655  (void)upsample_above;
   3656  const int frac_bits = 6;
   3657  const int max_base_x = ((32 + N) - 1);
   3658 
   3659  // pre-filter above pixels
   3660  // store in temp buffers:
   3661  //   above[x] * 32 + 16
   3662  //   above[x+1] - above[x]
   3663  // final pixels will be calculated as:
   3664  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   3665  __m256i a0, a1, a32, a16;
   3666  __m256i a_mbase_x, diff, c3f;
   3667 
   3668  a16 = _mm256_set1_epi16(16);
   3669  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
   3670  c3f = _mm256_set1_epi16(0x3f);
   3671 
   3672  int x = dx;
   3673  for (int r = 0; r < N; r++) {
   3674    __m256i b, res, res16[2];
   3675    __m128i a0_128, a1_128;
   3676 
   3677    int base = x >> frac_bits;
   3678    int base_max_diff = (max_base_x - base);
   3679    if (base_max_diff <= 0) {
   3680      for (int i = r; i < N; ++i) {
   3681        dstvec[i] = a_mbase_x;  // save 32 values
   3682      }
   3683      return;
   3684    }
   3685    if (base_max_diff > 32) base_max_diff = 32;
   3686    __m256i shift =
   3687        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
   3688 
   3689    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
   3690      int mdiff = base_max_diff - j;
   3691      if (mdiff <= 0) {
   3692        res16[jj] = a_mbase_x;
   3693      } else {
   3694        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
   3695        a1_128 = _mm_loadu_si128((__m128i *)(above + base + j + 1));
   3696        a0 = _mm256_cvtepu8_epi16(a0_128);
   3697        a1 = _mm256_cvtepu8_epi16(a1_128);
   3698 
   3699        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
   3700        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
   3701        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
   3702        b = _mm256_mullo_epi16(diff, shift);
   3703 
   3704        res = _mm256_add_epi16(a32, b);
   3705        res = _mm256_srli_epi16(res, 5);
   3706        res16[jj] = _mm256_packus_epi16(
   3707            res, _mm256_castsi128_si256(
   3708                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
   3709      }
   3710    }
   3711    res16[1] =
   3712        _mm256_inserti128_si256(res16[0], _mm256_castsi256_si128(res16[1]),
   3713                                1);  // 32 8bit values
   3714 
   3715    dstvec[r] = _mm256_blendv_epi8(
   3716        a_mbase_x, res16[1],
   3717        *(__m256i *)BaseMask[base_max_diff]);  // 32 8bit values
   3718    x += dx;
   3719  }
   3720 }
   3721 
   3722 static void dr_prediction_z1_32xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3723                                       const uint8_t *above, int upsample_above,
   3724                                       int dx) {
   3725  __m256i dstvec[64];
   3726  dr_prediction_z1_32xN_internal_avx2(N, dstvec, above, upsample_above, dx);
   3727  for (int i = 0; i < N; i++) {
   3728    _mm256_storeu_si256((__m256i *)(dst + stride * i), dstvec[i]);
   3729  }
   3730 }
   3731 
   3732 static void dr_prediction_z1_64xN_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3733                                       const uint8_t *above, int upsample_above,
   3734                                       int dx) {
   3735  // upsample_above is always 0 here, by design of av1_use_intra_edge_upsample
   3736  (void)upsample_above;
   3737  const int frac_bits = 6;
   3738  const int max_base_x = ((64 + N) - 1);
   3739 
   3740  // pre-compute interpolation operands for the above pixels,
   3741  // held in register temporaries:
   3742  //   above[x] * 32 + 16
   3743  //   above[x+1] - above[x]
   3744  // final pixels will be calculated as:
   3745  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   3746  __m256i a0, a1, a32, a16;
   3747  __m256i a_mbase_x, diff, c3f;
   3748  __m128i max_base_x128, base_inc128, mask128;
   3749 
   3750  a16 = _mm256_set1_epi16(16);
   3751  a_mbase_x = _mm256_set1_epi8((int8_t)above[max_base_x]);
   3752  max_base_x128 = _mm_set1_epi8(max_base_x);
   3753  c3f = _mm256_set1_epi16(0x3f);
   3754 
   3755  int x = dx;
   3756  for (int r = 0; r < N; r++, dst += stride) {
   3757    __m256i b, res;
   3758    int base = x >> frac_bits;
   3759    if (base >= max_base_x) {
   3760      for (int i = r; i < N; ++i) {
   3761        _mm256_storeu_si256((__m256i *)dst, a_mbase_x);  // save 32 values
   3762        _mm256_storeu_si256((__m256i *)(dst + 32), a_mbase_x);
   3763        dst += stride;
   3764      }
   3765      return;
   3766    }
   3767 
   3768    __m256i shift =
   3769        _mm256_srli_epi16(_mm256_and_si256(_mm256_set1_epi16(x), c3f), 1);
   3770 
   3771    __m128i a0_128, a1_128, res128;
   3772    for (int j = 0; j < 64; j += 16) {
   3773      int mdif = max_base_x - (base + j);
   3774      if (mdif <= 0) {
   3775        _mm_storeu_si128((__m128i *)(dst + j),
   3776                         _mm256_castsi256_si128(a_mbase_x));
   3777      } else {
   3778        a0_128 = _mm_loadu_si128((__m128i *)(above + base + j));
   3779        a1_128 = _mm_loadu_si128((__m128i *)(above + base + 1 + j));
   3780        a0 = _mm256_cvtepu8_epi16(a0_128);
   3781        a1 = _mm256_cvtepu8_epi16(a1_128);
   3782 
   3783        diff = _mm256_sub_epi16(a1, a0);   // a[x+1] - a[x]
   3784        a32 = _mm256_slli_epi16(a0, 5);    // a[x] * 32
   3785        a32 = _mm256_add_epi16(a32, a16);  // a[x] * 32 + 16
   3786        b = _mm256_mullo_epi16(diff, shift);
   3787 
   3788        res = _mm256_add_epi16(a32, b);
   3789        res = _mm256_srli_epi16(res, 5);
   3790        res = _mm256_packus_epi16(
   3791            res, _mm256_castsi128_si256(
   3792                     _mm256_extracti128_si256(res, 1)));  // 16 8bit values
   3793 
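               // Clip against the edge: keep result bytes while the source
               // index base + j + lane stays below max_base_x, and substitute
               // the replicated last sample (a_mbase_x) beyond it.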
   3794        base_inc128 =
   3795            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
   3796                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
   3797                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
   3798                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
   3799                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
   3800                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
   3801                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
   3802                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));
   3803 
   3804        mask128 = _mm_cmpgt_epi8(_mm_subs_epu8(max_base_x128, base_inc128),
   3805                                 _mm_setzero_si128());
   3806        res128 = _mm_blendv_epi8(_mm256_castsi256_si128(a_mbase_x),
   3807                                 _mm256_castsi256_si128(res), mask128);
   3808        _mm_storeu_si128((__m128i *)(dst + j), res128);
   3809      }
   3810    }
   3811    x += dx;
   3812  }
   3813 }
   3814 
   3815 // Directional prediction, zone 1: 0 < angle < 90
   3816 void av1_dr_prediction_z1_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   3817                               const uint8_t *above, const uint8_t *left,
   3818                               int upsample_above, int dx, int dy) {
   3819  (void)left;
   3820  (void)dy;
   3821  switch (bw) {
   3822    case 4:
   3823      dr_prediction_z1_4xN_avx2(bh, dst, stride, above, upsample_above, dx);
   3824      break;
   3825    case 8:
   3826      dr_prediction_z1_8xN_avx2(bh, dst, stride, above, upsample_above, dx);
   3827      break;
   3828    case 16:
   3829      dr_prediction_z1_16xN_avx2(bh, dst, stride, above, upsample_above, dx);
   3830      break;
   3831    case 32:
   3832      dr_prediction_z1_32xN_avx2(bh, dst, stride, above, upsample_above, dx);
   3833      break;
   3834    case 64:
   3835      dr_prediction_z1_64xN_avx2(bh, dst, stride, above, upsample_above, dx);
   3836      break;
   3837    default: break;
   3838  }
   3840 }
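
        // Hedged usage sketch (not part of libaom): how the kernels dispatched
        // above cover a whole block, expressed with z1_sample_sketch() from
        // earlier. Row r starts at position (r + 1) * dx; each column advances
        // by one full sample, i.e. 64 in 6-bit fixed point.
        static inline void z1_block_sketch(uint8_t *dst, ptrdiff_t stride,
                                           int bw, int bh,
                                           const uint8_t *above, int dx) {
         const int max_base_x = bw + bh - 1;
         for (int r = 0; r < bh; ++r, dst += stride) {
           const int x0 = (r + 1) * dx;
           for (int c = 0; c < bw; ++c) {
             dst[c] = z1_sample_sketch(above, x0 + (c << 6), max_base_x);
           }
         }
        }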
   3841 
   3842 static void dr_prediction_z2_Nx4_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3843                                      const uint8_t *above, const uint8_t *left,
   3844                                      int upsample_above, int upsample_left,
   3845                                      int dx, int dy) {
   3846  const int min_base_x = -(1 << upsample_above);
   3847  const int min_base_y = -(1 << upsample_left);
   3848  const int frac_bits_x = 6 - upsample_above;
   3849  const int frac_bits_y = 6 - upsample_left;
   3850 
   3851  assert(dx > 0);
   3852  // pre-compute interpolation operands in register temporaries (the
   3853  // same scheme is applied to the left pixels in the y branch):
   3854  //   above[x] * 32 + 16
   3855  //   above[x+1] - above[x]
   3856  // final pixels will be calculated as:
   3857  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   3858  __m128i a0_x, a1_x, a32, a16, diff;
   3859  __m128i c3f, min_base_y128, c1234, dy128;
   3860 
   3861  a16 = _mm_set1_epi16(16);
   3862  c3f = _mm_set1_epi16(0x3f);
   3863  min_base_y128 = _mm_set1_epi16(min_base_y);
   3864  c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
   3865  dy128 = _mm_set1_epi16(dy);
   3866 
   3867  for (int r = 0; r < N; r++) {
   3868    __m128i b, res, shift, r6, ydx;
   3869    __m128i resx, resy, resxy;
   3870    __m128i a0_x128, a1_x128;
   3871    int y = r + 1;
   3872    int base_x = (-y * dx) >> frac_bits_x;
   3873    int base_shift = 0;
   3874    if (base_x < (min_base_x - 1)) {
   3875      base_shift = (min_base_x - base_x - 1) >> upsample_above;
   3876    }
   3877    int base_min_diff =
   3878        (min_base_x - base_x + upsample_above) >> upsample_above;
   3879    if (base_min_diff > 4) {
   3880      base_min_diff = 4;
   3881    } else {
   3882      if (base_min_diff < 0) base_min_diff = 0;
   3883    }
   3884 
   3885    if (base_shift > 3) {
   3886      a0_x = _mm_setzero_si128();
   3887      a1_x = _mm_setzero_si128();
   3888      shift = _mm_setzero_si128();
   3889    } else {
   3890      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   3891      ydx = _mm_set1_epi16(y * dx);
   3892      r6 = _mm_slli_epi16(c1234, 6);
   3893 
   3894      if (upsample_above) {
   3895        a0_x128 =
   3896            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
   3897        a1_x128 = _mm_srli_si128(a0_x128, 8);
   3898 
   3899        shift = _mm_srli_epi16(
   3900            _mm_and_si128(
   3901                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
   3902            1);
   3903      } else {
   3904        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
   3905        a1_x128 = _mm_srli_si128(a0_x128, 1);
   3906 
   3907        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
   3908      }
   3909      a0_x = _mm_cvtepu8_epi16(a0_x128);
   3910      a1_x = _mm_cvtepu8_epi16(a1_x128);
   3911    }
   3912    // y calc
   3913    __m128i a0_y, a1_y, shifty;
   3914    if (base_x < min_base_x) {
   3915      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
   3916      __m128i y_c128, base_y_c128, mask128, c1234_;
   3917      c1234_ = _mm_srli_si128(c1234, 2);
   3918      r6 = _mm_set1_epi16(r << 6);
   3919      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy128));
   3920      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
   3921      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
   3922      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
   3923      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   3924 
   3925      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
   3926                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
   3927      base_y_c128 = _mm_add_epi16(base_y_c128, _mm_srli_epi16(a16, 4));
   3928      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   3929      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
   3930                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
   3931 
   3932      if (upsample_left) {
   3933        shifty = _mm_srli_epi16(
   3934            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
   3935      } else {
   3936        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
   3937      }
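             // Merge the two paths into single registers: the low four 16-bit
             // lanes carry the above-edge operands, the high four the
             // left-edge ones, so one interpolation below serves both.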
   3938      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
   3939      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
   3940      shift = _mm_unpacklo_epi64(shift, shifty);
   3941    }
   3942 
   3943    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
   3944    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
   3945    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16
   3946 
   3947    b = _mm_mullo_epi16(diff, shift);
   3948    res = _mm_add_epi16(a32, b);
   3949    res = _mm_srli_epi16(res, 5);
   3950 
   3951    resx = _mm_packus_epi16(res, res);
   3952    resy = _mm_srli_si128(resx, 4);
   3953 
   3954    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
   3955    *(int *)(dst) = _mm_cvtsi128_si32(resxy);
   3956    dst += stride;
   3957  }
   3958 }
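
        // Reference-only scalar model of the zone-2 selection done by the
        // Nx4/Nx8/HxW kernels (no edge upsampling assumed; the helper name is
        // ours). A pixel whose projected position lands left of the above row
        // is predicted from the left column instead.
        static inline uint8_t z2_sample_sketch(const uint8_t *above,
                                               const uint8_t *left, int r,
                                               int c, int dx, int dy) {
         const int x = (c << 6) - (r + 1) * dx;  // project onto the above row
         if ((x >> 6) >= -1) {
           const int base = x >> 6, shift = (x & 0x3f) >> 1;
           return (uint8_t)((above[base] * (32 - shift) +
                             above[base + 1] * shift + 16) >>
                            5);
         }
         const int y = (r << 6) - (c + 1) * dy;  // project onto the left col
         int base = y >> 6;
         if (base < -1) base = 0;  // clamp, as the andnot masking above does
         const int shift = (y & 0x3f) >> 1;
         return (uint8_t)((left[base] * (32 - shift) +
                           left[base + 1] * shift + 16) >>
                          5);
        }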
   3959 
   3960 static void dr_prediction_z2_Nx8_avx2(int N, uint8_t *dst, ptrdiff_t stride,
   3961                                      const uint8_t *above, const uint8_t *left,
   3962                                      int upsample_above, int upsample_left,
   3963                                      int dx, int dy) {
   3964  const int min_base_x = -(1 << upsample_above);
   3965  const int min_base_y = -(1 << upsample_left);
   3966  const int frac_bits_x = 6 - upsample_above;
   3967  const int frac_bits_y = 6 - upsample_left;
   3968 
   3969  // pre-compute interpolation operands in register temporaries (the
   3970  // same scheme is applied to the left pixels in the y branch):
   3971  //   above[x] * 32 + 16
   3972  //   above[x+1] - above[x]
   3973  // final pixels will be calculated as:
   3974  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
   3975  __m256i diff, a32, a16;
   3976  __m256i a0_x, a1_x;
   3977  __m128i a0_x128, a1_x128, min_base_y128, c3f;
   3978  __m128i c1234, dy128;
   3979 
   3980  a16 = _mm256_set1_epi16(16);
   3981  c3f = _mm_set1_epi16(0x3f);
   3982  min_base_y128 = _mm_set1_epi16(min_base_y);
   3983  dy128 = _mm_set1_epi16(dy);
   3984  c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
   3985 
   3986  for (int r = 0; r < N; r++) {
   3987    __m256i b, res, shift;
   3988    __m128i resx, resy, resxy, r6, ydx;
   3989 
   3990    int y = r + 1;
   3991    int base_x = (-y * dx) >> frac_bits_x;
   3992    int base_shift = 0;
   3993    if (base_x < (min_base_x - 1)) {
   3994      base_shift = (min_base_x - base_x - 1) >> upsample_above;
   3995    }
   3996    int base_min_diff =
   3997        (min_base_x - base_x + upsample_above) >> upsample_above;
   3998    if (base_min_diff > 8) {
   3999      base_min_diff = 8;
   4000    } else {
   4001      if (base_min_diff < 0) base_min_diff = 0;
   4002    }
   4003 
   4004    if (base_shift > 7) {
   4005      a0_x = _mm256_setzero_si256();
   4006      a1_x = _mm256_setzero_si256();
   4007      shift = _mm256_setzero_si256();
   4008    } else {
   4009      a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
   4010      ydx = _mm_set1_epi16(y * dx);
   4011      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
   4012      if (upsample_above) {
   4013        a0_x128 =
   4014            _mm_shuffle_epi8(a0_x128, *(__m128i *)EvenOddMaskx[base_shift]);
   4015        a1_x128 = _mm_srli_si128(a0_x128, 8);
   4016 
   4017        shift = _mm256_castsi128_si256(_mm_srli_epi16(
   4018            _mm_and_si128(
   4019                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
   4020            1));
   4021      } else {
   4022        a1_x128 = _mm_srli_si128(a0_x128, 1);
   4023        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
   4024        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
   4025 
   4026        shift = _mm256_castsi128_si256(
   4027            _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1));
   4028      }
   4029      a0_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a0_x128));
   4030      a1_x = _mm256_castsi128_si256(_mm_cvtepu8_epi16(a1_x128));
   4031    }
   4032 
   4033    // y calc
   4034    __m128i a0_y, a1_y, shifty;
   4035    if (base_x < min_base_x) {
   4036      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
   4037      __m128i y_c128, base_y_c128, mask128;
   4038      r6 = _mm_set1_epi16(r << 6);
   4039      y_c128 = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy128));
   4040      base_y_c128 = _mm_srai_epi16(y_c128, frac_bits_y);
   4041      mask128 = _mm_cmpgt_epi16(min_base_y128, base_y_c128);
   4042      base_y_c128 = _mm_andnot_si128(mask128, base_y_c128);
   4043      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   4044 
   4045      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
   4046                            left[base_y_c[2]], left[base_y_c[3]],
   4047                            left[base_y_c[4]], left[base_y_c[5]],
   4048                            left[base_y_c[6]], left[base_y_c[7]]);
   4049      base_y_c128 = _mm_add_epi16(
   4050          base_y_c128, _mm_srli_epi16(_mm256_castsi256_si128(a16), 4));
   4051      _mm_store_si128((__m128i *)base_y_c, base_y_c128);
   4052 
   4053      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
   4054                            left[base_y_c[2]], left[base_y_c[3]],
   4055                            left[base_y_c[4]], left[base_y_c[5]],
   4056                            left[base_y_c[6]], left[base_y_c[7]]);
   4057 
   4058      if (upsample_left) {
   4059        shifty = _mm_srli_epi16(
   4060            _mm_and_si128(_mm_slli_epi16(y_c128, upsample_left), c3f), 1);
   4061      } else {
   4062        shifty = _mm_srli_epi16(_mm_and_si128(y_c128, c3f), 1);
   4063      }
   4064 
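             // Same merge as the Nx4 kernel, one level up: the low 128-bit
             // lane holds the above-edge operands, the high lane the
             // left-edge ones, so a single 256-bit interpolation covers both.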
   4065      a0_x = _mm256_inserti128_si256(a0_x, a0_y, 1);
   4066      a1_x = _mm256_inserti128_si256(a1_x, a1_y, 1);
   4067      shift = _mm256_inserti128_si256(shift, shifty, 1);
   4068    }
   4069 
   4070    diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
   4071    a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
   4072    a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   4073 
   4074    b = _mm256_mullo_epi16(diff, shift);
   4075    res = _mm256_add_epi16(a32, b);
   4076    res = _mm256_srli_epi16(res, 5);
   4077 
   4078    resx = _mm_packus_epi16(_mm256_castsi256_si128(res),
   4079                            _mm256_castsi256_si128(res));
   4080    resy = _mm256_extracti128_si256(res, 1);
   4081    resy = _mm_packus_epi16(resy, resy);
   4082 
   4083    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
   4084    _mm_storel_epi64((__m128i *)(dst), resxy);
   4085    dst += stride;
   4086  }
   4087 }
   4088 
   4089 static void dr_prediction_z2_HxW_avx2(int H, int W, uint8_t *dst,
   4090                                      ptrdiff_t stride, const uint8_t *above,
   4091                                      const uint8_t *left, int upsample_above,
   4092                                      int upsample_left, int dx, int dy) {
   4093  // upsample_above and upsample_left are always 0 here, by design of
   4094  // av1_use_intra_edge_upsample
   4095  const int min_base_x = -1;
   4096  const int min_base_y = -1;
   4097  (void)upsample_above;
   4098  (void)upsample_left;
   4099  const int frac_bits_x = 6;
   4100  const int frac_bits_y = 6;
   4101 
   4102  __m256i a0_x, a1_x, a0_y, a1_y, a32, a16, c1234, c0123;
   4103  __m256i diff, min_base_y256, c3f, shifty, dy256, c1;
   4104  __m128i a0_x128, a1_x128;
   4105 
   4106  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
   4107  a16 = _mm256_set1_epi16(16);
   4108  c1 = _mm256_srli_epi16(a16, 4);
   4109  min_base_y256 = _mm256_set1_epi16(min_base_y);
   4110  c3f = _mm256_set1_epi16(0x3f);
   4111  dy256 = _mm256_set1_epi16(dy);
   4112  c0123 =
   4113      _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
   4114  c1234 = _mm256_add_epi16(c0123, c1);
   4115 
   4116  for (int r = 0; r < H; r++) {
   4117    __m256i b, res, shift, j256, r6, ydx;
   4118    __m128i resx, resy;
   4119    __m128i resxy;
   4120    int y = r + 1;
   4121    ydx = _mm256_set1_epi16((int16_t)(y * dx));
   4122 
   4123    int base_x = (-y * dx) >> frac_bits_x;
   4124    for (int j = 0; j < W; j += 16) {
   4125      j256 = _mm256_set1_epi16(j);
   4126      int base_shift = 0;
   4127      if ((base_x + j) < (min_base_x - 1)) {
   4128        base_shift = (min_base_x - (base_x + j) - 1);
   4129      }
   4130      int base_min_diff = (min_base_x - base_x - j);
   4131      if (base_min_diff > 16) {
   4132        base_min_diff = 16;
   4133      } else {
   4134        if (base_min_diff < 0) base_min_diff = 0;
   4135      }
   4136 
   4137      if (base_shift < 16) {
   4138        a0_x128 = _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
   4139        a1_x128 =
   4140            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
   4141        a0_x128 = _mm_shuffle_epi8(a0_x128, *(__m128i *)LoadMaskx[base_shift]);
   4142        a1_x128 = _mm_shuffle_epi8(a1_x128, *(__m128i *)LoadMaskx[base_shift]);
   4143 
   4144        a0_x = _mm256_cvtepu8_epi16(a0_x128);
   4145        a1_x = _mm256_cvtepu8_epi16(a1_x128);
   4146 
   4147        r6 = _mm256_slli_epi16(_mm256_add_epi16(c0123, j256), 6);
   4148        shift = _mm256_srli_epi16(
   4149            _mm256_and_si256(_mm256_sub_epi16(r6, ydx), c3f), 1);
   4150 
   4151        diff = _mm256_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
   4152        a32 = _mm256_slli_epi16(a0_x, 5);     // a[x] * 32
   4153        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   4154 
   4155        b = _mm256_mullo_epi16(diff, shift);
   4156        res = _mm256_add_epi16(a32, b);
   4157        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
   4158        resx = _mm256_castsi256_si128(_mm256_packus_epi16(
   4159            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
   4160      } else {
   4161        resx = _mm_setzero_si128();
   4162      }
   4163 
   4164      // y calc
   4165      if (base_x < min_base_x) {
   4166        __m256i c256, y_c256, base_y_c256, mask256, mul16;
   4167        r6 = _mm256_set1_epi16(r << 6);
   4168        c256 = _mm256_add_epi16(j256, c1234);
   4169        mul16 = _mm256_min_epu16(_mm256_mullo_epi16(c256, dy256),
   4170                                 _mm256_srli_epi16(min_base_y256, 1));
   4171        y_c256 = _mm256_sub_epi16(r6, mul16);
   4172 
   4173        base_y_c256 = _mm256_srai_epi16(y_c256, frac_bits_y);
   4174        mask256 = _mm256_cmpgt_epi16(min_base_y256, base_y_c256);
   4175 
   4176        base_y_c256 = _mm256_blendv_epi8(base_y_c256, min_base_y256, mask256);
   4177        int16_t min_y = (int16_t)_mm_extract_epi16(
   4178            _mm256_extracti128_si256(base_y_c256, 1), 7);
   4179        int16_t max_y =
   4180            (int16_t)_mm_extract_epi16(_mm256_castsi256_si128(base_y_c256), 0);
   4181        int16_t offset_diff = max_y - min_y;
   4182 
   4183        if (offset_diff < 16) {
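                 // All 16 base_y indices fall inside one small window of
                 // `left`, so load that window once (masked 32-bit loads) and
                 // scatter it into lane order with a byte shuffle instead of
                 // doing 16 scalar gathers.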
   4184          __m256i min_y256 = _mm256_set1_epi16(min_y);
   4185 
   4186          __m256i base_y_offset = _mm256_sub_epi16(base_y_c256, min_y256);
   4187          __m128i base_y_offset128 =
   4188              _mm_packs_epi16(_mm256_extracti128_si256(base_y_offset, 0),
   4189                              _mm256_extracti128_si256(base_y_offset, 1));
   4190 
   4191          __m128i a0_y128 = _mm_maskload_epi32(
   4192              (int *)(left + min_y), *(__m128i *)LoadMaskz2[offset_diff / 4]);
   4193          __m128i a1_y128 =
   4194              _mm_maskload_epi32((int *)(left + min_y + 1),
   4195                                 *(__m128i *)LoadMaskz2[offset_diff / 4]);
   4196          a0_y128 = _mm_shuffle_epi8(a0_y128, base_y_offset128);
   4197          a1_y128 = _mm_shuffle_epi8(a1_y128, base_y_offset128);
   4198          a0_y = _mm256_cvtepu8_epi16(a0_y128);
   4199          a1_y = _mm256_cvtepu8_epi16(a1_y128);
   4200        } else {
   4201          base_y_c256 = _mm256_andnot_si256(mask256, base_y_c256);
   4202          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
   4203 
   4204          a0_y = _mm256_setr_epi16(
   4205              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
   4206              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
   4207              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
   4208              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
   4209              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
   4210              left[base_y_c[15]]);
   4211          base_y_c256 = _mm256_add_epi16(base_y_c256, c1);
   4212          _mm256_store_si256((__m256i *)base_y_c, base_y_c256);
   4213 
   4214          a1_y = _mm256_setr_epi16(
   4215              left[base_y_c[0]], left[base_y_c[1]], left[base_y_c[2]],
   4216              left[base_y_c[3]], left[base_y_c[4]], left[base_y_c[5]],
   4217              left[base_y_c[6]], left[base_y_c[7]], left[base_y_c[8]],
   4218              left[base_y_c[9]], left[base_y_c[10]], left[base_y_c[11]],
   4219              left[base_y_c[12]], left[base_y_c[13]], left[base_y_c[14]],
   4220              left[base_y_c[15]]);
   4221        }
   4222        shifty = _mm256_srli_epi16(_mm256_and_si256(y_c256, c3f), 1);
   4223 
   4224        diff = _mm256_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
   4225        a32 = _mm256_slli_epi16(a0_y, 5);     // a[x] * 32
   4226        a32 = _mm256_add_epi16(a32, a16);     // a[x] * 32 + 16
   4227 
   4228        b = _mm256_mullo_epi16(diff, shifty);
   4229        res = _mm256_add_epi16(a32, b);
   4230        res = _mm256_srli_epi16(res, 5);  // 16 16-bit values
   4231        resy = _mm256_castsi256_si128(_mm256_packus_epi16(
   4232            res, _mm256_castsi128_si256(_mm256_extracti128_si256(res, 1))));
   4233      } else {
   4234        resy = _mm_setzero_si128();
   4235      }
   4236      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)BaseMask[base_min_diff]);
   4237      _mm_storeu_si128((__m128i *)(dst + j), resxy);
   4238    }  // for j
   4239    dst += stride;
   4240  }
   4241 }
   4242 
   4243 // Directional prediction, zone 2: 90 < angle < 180
   4244 void av1_dr_prediction_z2_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   4245                               const uint8_t *above, const uint8_t *left,
   4246                               int upsample_above, int upsample_left, int dx,
   4247                               int dy) {
   4248  assert(dx > 0);
   4249  assert(dy > 0);
   4250  switch (bw) {
   4251    case 4:
   4252      dr_prediction_z2_Nx4_avx2(bh, dst, stride, above, left, upsample_above,
   4253                                upsample_left, dx, dy);
   4254      break;
   4255    case 8:
   4256      dr_prediction_z2_Nx8_avx2(bh, dst, stride, above, left, upsample_above,
   4257                                upsample_left, dx, dy);
   4258      break;
   4259    default:
   4260      dr_prediction_z2_HxW_avx2(bh, bw, dst, stride, above, left,
   4261                                upsample_above, upsample_left, dx, dy);
   4262      break;
   4263  }
   4265 }
   4266 
   4267 // z3 functions: zone 3 predicts from the left edge by running the
        // zone-1 kernels on `left` and transposing the result into dst.
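        // The helper below transposes 16 ymm rows of 32 bytes (a 16x32 block)
        // into d[0..15], where the low lane of d[i] is output row i and the
        // high lane is output row i + 16, via interleave (unpack) stages of
        // 8-, 16-, 32- and 64-bit granularity.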
   4268 static inline void transpose16x32_avx2(__m256i *x, __m256i *d) {
   4269  __m256i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
   4270  __m256i w10, w11, w12, w13, w14, w15;
   4271 
   4272  w0 = _mm256_unpacklo_epi8(x[0], x[1]);
   4273  w1 = _mm256_unpacklo_epi8(x[2], x[3]);
   4274  w2 = _mm256_unpacklo_epi8(x[4], x[5]);
   4275  w3 = _mm256_unpacklo_epi8(x[6], x[7]);
   4276 
   4277  w8 = _mm256_unpacklo_epi8(x[8], x[9]);
   4278  w9 = _mm256_unpacklo_epi8(x[10], x[11]);
   4279  w10 = _mm256_unpacklo_epi8(x[12], x[13]);
   4280  w11 = _mm256_unpacklo_epi8(x[14], x[15]);
   4281 
   4282  w4 = _mm256_unpacklo_epi16(w0, w1);
   4283  w5 = _mm256_unpacklo_epi16(w2, w3);
   4284  w12 = _mm256_unpacklo_epi16(w8, w9);
   4285  w13 = _mm256_unpacklo_epi16(w10, w11);
   4286 
   4287  w6 = _mm256_unpacklo_epi32(w4, w5);
   4288  w7 = _mm256_unpackhi_epi32(w4, w5);
   4289  w14 = _mm256_unpacklo_epi32(w12, w13);
   4290  w15 = _mm256_unpackhi_epi32(w12, w13);
   4291 
   4292  // Store first 4-line result
   4293  d[0] = _mm256_unpacklo_epi64(w6, w14);
   4294  d[1] = _mm256_unpackhi_epi64(w6, w14);
   4295  d[2] = _mm256_unpacklo_epi64(w7, w15);
   4296  d[3] = _mm256_unpackhi_epi64(w7, w15);
   4297 
   4298  w4 = _mm256_unpackhi_epi16(w0, w1);
   4299  w5 = _mm256_unpackhi_epi16(w2, w3);
   4300  w12 = _mm256_unpackhi_epi16(w8, w9);
   4301  w13 = _mm256_unpackhi_epi16(w10, w11);
   4302 
   4303  w6 = _mm256_unpacklo_epi32(w4, w5);
   4304  w7 = _mm256_unpackhi_epi32(w4, w5);
   4305  w14 = _mm256_unpacklo_epi32(w12, w13);
   4306  w15 = _mm256_unpackhi_epi32(w12, w13);
   4307 
   4308  // Store second 4-line result
   4309  d[4] = _mm256_unpacklo_epi64(w6, w14);
   4310  d[5] = _mm256_unpackhi_epi64(w6, w14);
   4311  d[6] = _mm256_unpacklo_epi64(w7, w15);
   4312  d[7] = _mm256_unpackhi_epi64(w7, w15);
   4313 
   4314  // upper half
   4315  w0 = _mm256_unpackhi_epi8(x[0], x[1]);
   4316  w1 = _mm256_unpackhi_epi8(x[2], x[3]);
   4317  w2 = _mm256_unpackhi_epi8(x[4], x[5]);
   4318  w3 = _mm256_unpackhi_epi8(x[6], x[7]);
   4319 
   4320  w8 = _mm256_unpackhi_epi8(x[8], x[9]);
   4321  w9 = _mm256_unpackhi_epi8(x[10], x[11]);
   4322  w10 = _mm256_unpackhi_epi8(x[12], x[13]);
   4323  w11 = _mm256_unpackhi_epi8(x[14], x[15]);
   4324 
   4325  w4 = _mm256_unpacklo_epi16(w0, w1);
   4326  w5 = _mm256_unpacklo_epi16(w2, w3);
   4327  w12 = _mm256_unpacklo_epi16(w8, w9);
   4328  w13 = _mm256_unpacklo_epi16(w10, w11);
   4329 
   4330  w6 = _mm256_unpacklo_epi32(w4, w5);
   4331  w7 = _mm256_unpackhi_epi32(w4, w5);
   4332  w14 = _mm256_unpacklo_epi32(w12, w13);
   4333  w15 = _mm256_unpackhi_epi32(w12, w13);
   4334 
   4335  // Store first 4-line result
   4336  d[8] = _mm256_unpacklo_epi64(w6, w14);
   4337  d[9] = _mm256_unpackhi_epi64(w6, w14);
   4338  d[10] = _mm256_unpacklo_epi64(w7, w15);
   4339  d[11] = _mm256_unpackhi_epi64(w7, w15);
   4340 
   4341  w4 = _mm256_unpackhi_epi16(w0, w1);
   4342  w5 = _mm256_unpackhi_epi16(w2, w3);
   4343  w12 = _mm256_unpackhi_epi16(w8, w9);
   4344  w13 = _mm256_unpackhi_epi16(w10, w11);
   4345 
   4346  w6 = _mm256_unpacklo_epi32(w4, w5);
   4347  w7 = _mm256_unpackhi_epi32(w4, w5);
   4348  w14 = _mm256_unpacklo_epi32(w12, w13);
   4349  w15 = _mm256_unpackhi_epi32(w12, w13);
   4350 
   4351  // Store second 4-line result
   4352  d[12] = _mm256_unpacklo_epi64(w6, w14);
   4353  d[13] = _mm256_unpackhi_epi64(w6, w14);
   4354  d[14] = _mm256_unpacklo_epi64(w7, w15);
   4355  d[15] = _mm256_unpackhi_epi64(w7, w15);
   4356 }
   4357 
   4358 static void dr_prediction_z3_4x4_avx2(uint8_t *dst, ptrdiff_t stride,
   4359                                      const uint8_t *left, int upsample_left,
   4360                                      int dy) {
   4361  __m128i dstvec[4], d[4];
   4362 
   4363  dr_prediction_z1_HxW_internal_avx2(4, 4, dstvec, left, upsample_left, dy);
   4364  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
   4365                            &d[0], &d[1], &d[2], &d[3]);
   4366 
   4367  *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
   4368  *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
   4369  *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
   4370  *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
   4372 }
   4373 
   4374 static void dr_prediction_z3_8x8_avx2(uint8_t *dst, ptrdiff_t stride,
   4375                                      const uint8_t *left, int upsample_left,
   4376                                      int dy) {
   4377  __m128i dstvec[8], d[8];
   4378 
   4379  dr_prediction_z1_HxW_internal_avx2(8, 8, dstvec, left, upsample_left, dy);
   4380  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
   4381                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
   4382                    &d[3]);
   4383 
   4384  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
   4385  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
   4386  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
   4387  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
   4388  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
   4389  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
   4390  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
   4391  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
   4392 }
   4393 
   4394 static void dr_prediction_z3_4x8_avx2(uint8_t *dst, ptrdiff_t stride,
   4395                                      const uint8_t *left, int upsample_left,
   4396                                      int dy) {
   4397  __m128i dstvec[4], d[8];
   4398 
   4399  dr_prediction_z1_HxW_internal_avx2(8, 4, dstvec, left, upsample_left, dy);
   4400  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
   4401                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
   4402  for (int i = 0; i < 8; i++) {
   4403    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
   4404  }
   4405 }
   4406 
   4407 static void dr_prediction_z3_8x4_avx2(uint8_t *dst, ptrdiff_t stride,
   4408                                      const uint8_t *left, int upsample_left,
   4409                                      int dy) {
   4410  __m128i dstvec[8], d[4];
   4411 
   4412  dr_prediction_z1_HxW_internal_avx2(4, 8, dstvec, left, upsample_left, dy);
   4413  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
   4414                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
   4415                        &d[1], &d[2], &d[3]);
   4416  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
   4417  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
   4418  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
   4419  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
   4420 }
   4421 
   4422 static void dr_prediction_z3_8x16_avx2(uint8_t *dst, ptrdiff_t stride,
   4423                                       const uint8_t *left, int upsample_left,
   4424                                       int dy) {
   4425  __m128i dstvec[8], d[8];
   4426 
   4427  dr_prediction_z1_HxW_internal_avx2(16, 8, dstvec, left, upsample_left, dy);
   4428  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
   4429                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
   4430                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
   4431  for (int i = 0; i < 8; i++) {
   4432    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
   4433    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
   4434                     _mm_srli_si128(d[i], 8));
   4435  }
   4436 }
   4437 
   4438 static void dr_prediction_z3_16x8_avx2(uint8_t *dst, ptrdiff_t stride,
   4439                                       const uint8_t *left, int upsample_left,
   4440                                       int dy) {
   4441  __m128i dstvec[16], d[16];
   4442 
   4443  dr_prediction_z1_HxW_internal_avx2(8, 16, dstvec, left, upsample_left, dy);
   4444  transpose16x8_8x16_sse2(
   4445      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
   4446      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
   4447      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
   4448      &d[3], &d[4], &d[5], &d[6], &d[7]);
   4449 
   4450  for (int i = 0; i < 8; i++) {
   4451    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   4452  }
   4453 }
   4454 
   4455 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   4456 static void dr_prediction_z3_4x16_avx2(uint8_t *dst, ptrdiff_t stride,
   4457                                       const uint8_t *left, int upsample_left,
   4458                                       int dy) {
   4459  __m128i dstvec[4], d[16];
   4460 
   4461  dr_prediction_z1_HxW_internal_avx2(16, 4, dstvec, left, upsample_left, dy);
   4462  transpose4x16_sse2(dstvec, d);
   4463  for (int i = 0; i < 16; i++) {
   4464    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
   4465  }
   4466 }
   4467 
   4468 static void dr_prediction_z3_16x4_avx2(uint8_t *dst, ptrdiff_t stride,
   4469                                       const uint8_t *left, int upsample_left,
   4470                                       int dy) {
   4471  __m128i dstvec[16], d[8];
   4472 
   4473  dr_prediction_z1_HxW_internal_avx2(4, 16, dstvec, left, upsample_left, dy);
   4474  for (int i = 4; i < 8; i++) {
   4475    d[i] = _mm_setzero_si128();
   4476  }
   4477  transpose16x8_8x16_sse2(
   4478      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
   4479      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
   4480      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
   4481      &d[3], &d[4], &d[5], &d[6], &d[7]);
   4482 
   4483  for (int i = 0; i < 4; i++) {
   4484    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   4485  }
   4486 }
   4487 
   4488 static void dr_prediction_z3_8x32_avx2(uint8_t *dst, ptrdiff_t stride,
   4489                                       const uint8_t *left, int upsample_left,
   4490                                       int dy) {
   4491  __m256i dstvec[16], d[16];
   4492 
   4493  dr_prediction_z1_32xN_internal_avx2(8, dstvec, left, upsample_left, dy);
   4494  for (int i = 8; i < 16; i++) {
   4495    dstvec[i] = _mm256_setzero_si256();
   4496  }
   4497  transpose16x32_avx2(dstvec, d);
   4498 
   4499  for (int i = 0; i < 16; i++) {
   4500    _mm_storel_epi64((__m128i *)(dst + i * stride),
   4501                     _mm256_castsi256_si128(d[i]));
   4502  }
   4503  for (int i = 0; i < 16; i++) {
   4504    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride),
   4505                     _mm256_extracti128_si256(d[i], 1));
   4506  }
   4507 }
   4508 
   4509 static void dr_prediction_z3_32x8_avx2(uint8_t *dst, ptrdiff_t stride,
   4510                                       const uint8_t *left, int upsample_left,
   4511                                       int dy) {
   4512  __m128i dstvec[32], d[16];
   4513 
   4514  dr_prediction_z1_HxW_internal_avx2(8, 32, dstvec, left, upsample_left, dy);
   4515 
   4516  transpose16x8_8x16_sse2(
   4517      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
   4518      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
   4519      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
   4520      &d[3], &d[4], &d[5], &d[6], &d[7]);
   4521  transpose16x8_8x16_sse2(
   4522      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
   4523      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
   4524      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
   4525      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
   4526      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
   4527      &d[6 + 8], &d[7 + 8]);
   4528 
   4529  for (int i = 0; i < 8; i++) {
   4530    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   4531    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
   4532  }
   4533 }
   4534 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   4535 
   4536 static void dr_prediction_z3_16x16_avx2(uint8_t *dst, ptrdiff_t stride,
   4537                                        const uint8_t *left, int upsample_left,
   4538                                        int dy) {
   4539  __m128i dstvec[16], d[16];
   4540 
   4541  dr_prediction_z1_HxW_internal_avx2(16, 16, dstvec, left, upsample_left, dy);
   4542  transpose16x16_sse2(dstvec, d);
   4543 
   4544  for (int i = 0; i < 16; i++) {
   4545    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   4546  }
   4547 }
   4548 
   4549 static void dr_prediction_z3_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
   4550                                        const uint8_t *left, int upsample_left,
   4551                                        int dy) {
   4552  __m256i dstvec[32], d[32];
   4553 
   4554  dr_prediction_z1_32xN_internal_avx2(32, dstvec, left, upsample_left, dy);
   4555  transpose16x32_avx2(dstvec, d);
   4556  transpose16x32_avx2(dstvec + 16, d + 16);
   4557  for (int j = 0; j < 16; j++) {
   4558    _mm_storeu_si128((__m128i *)(dst + j * stride),
   4559                     _mm256_castsi256_si128(d[j]));
   4560    _mm_storeu_si128((__m128i *)(dst + j * stride + 16),
   4561                     _mm256_castsi256_si128(d[j + 16]));
   4562  }
   4563  for (int j = 0; j < 16; j++) {
   4564    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
   4565                     _mm256_extracti128_si256(d[j], 1));
   4566    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16),
   4567                     _mm256_extracti128_si256(d[j + 16], 1));
   4568  }
   4569 }
   4570 
   4571 static void dr_prediction_z3_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
   4572                                        const uint8_t *left, int upsample_left,
   4573                                        int dy) {
   4574  DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]);
   4575  dr_prediction_z1_64xN_avx2(64, dstT, 64, left, upsample_left, dy);
   4576  transpose(dstT, 64, dst, stride, 64, 64);
   4577 }
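
        // Every z3 kernel exploits one identity: a zone-3 bw x bh block is the
        // transpose of a zone-1 prediction run along `left`. A hedged scalar
        // sketch (the helper name is ours; nothing here calls it):
        static inline void z3_block_sketch(uint8_t *dst, ptrdiff_t stride,
                                           int bw, int bh, const uint8_t *left,
                                           int dy) {
         const int max_base_y = bw + bh - 1;
         for (int r = 0; r < bh; ++r, dst += stride) {
           for (int c = 0; c < bw; ++c) {
             const int y = (c + 1) * dy + (r << 6);  // walk down `left`
             const int base = y >> 6, shift = (y & 0x3f) >> 1;
             dst[c] = (base >= max_base_y)
                          ? left[max_base_y]
                          : (uint8_t)((left[base] * (32 - shift) +
                                       left[base + 1] * shift + 16) >>
                                      5);
           }
         }
        }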
   4578 
   4579 static void dr_prediction_z3_16x32_avx2(uint8_t *dst, ptrdiff_t stride,
   4580                                        const uint8_t *left, int upsample_left,
   4581                                        int dy) {
   4582  __m256i dstvec[16], d[16];
   4583 
   4584  dr_prediction_z1_32xN_internal_avx2(16, dstvec, left, upsample_left, dy);
   4585  transpose16x32_avx2(dstvec, d);
   4586  // store
   4587  for (int j = 0; j < 16; j++) {
   4588    _mm_storeu_si128((__m128i *)(dst + j * stride),
   4589                     _mm256_castsi256_si128(d[j]));
   4590    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride),
   4591                     _mm256_extracti128_si256(d[j], 1));
   4592  }
   4593 }
   4594 
   4595 static void dr_prediction_z3_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
   4596                                        const uint8_t *left, int upsample_left,
   4597                                        int dy) {
   4598  __m128i dstvec[32], d[16];
   4599 
   4600  dr_prediction_z1_HxW_internal_avx2(16, 32, dstvec, left, upsample_left, dy);
   4601  for (int i = 0; i < 32; i += 16) {
   4602    transpose16x16_sse2((dstvec + i), d);
   4603    for (int j = 0; j < 16; j++) {
   4604      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
   4605    }
   4606  }
   4607 }
   4608 
   4609 static void dr_prediction_z3_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
   4610                                        const uint8_t *left, int upsample_left,
   4611                                        int dy) {
   4612  uint8_t dstT[64 * 32];
   4613  dr_prediction_z1_64xN_avx2(32, dstT, 64, left, upsample_left, dy);
   4614  transpose(dstT, 64, dst, stride, 32, 64);
   4615 }
   4616 
   4617 static void dr_prediction_z3_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
   4618                                        const uint8_t *left, int upsample_left,
   4619                                        int dy) {
   4620  uint8_t dstT[32 * 64];
   4621  dr_prediction_z1_32xN_avx2(64, dstT, 32, left, upsample_left, dy);
   4622  transpose(dstT, 32, dst, stride, 64, 32);
   4624 }
   4625 
   4626 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   4627 static void dr_prediction_z3_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
   4628                                        const uint8_t *left, int upsample_left,
   4629                                        int dy) {
   4630  uint8_t dstT[64 * 16];
   4631  dr_prediction_z1_64xN_avx2(16, dstT, 64, left, upsample_left, dy);
   4632  transpose(dstT, 64, dst, stride, 16, 64);
   4633 }
   4634 
   4635 static void dr_prediction_z3_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
   4636                                        const uint8_t *left, int upsample_left,
   4637                                        int dy) {
   4638  __m128i dstvec[64], d[16];
   4639 
   4640  dr_prediction_z1_HxW_internal_avx2(16, 64, dstvec, left, upsample_left, dy);
   4641  for (int i = 0; i < 64; i += 16) {
   4642    transpose16x16_sse2((dstvec + i), d);
   4643    for (int j = 0; j < 16; j++) {
   4644      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
   4645    }
   4646  }
   4647 }
   4648 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   4649 
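        // Directional prediction, zone 3: 180 < angle < 270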
   4650 void av1_dr_prediction_z3_avx2(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
   4651                               const uint8_t *above, const uint8_t *left,
   4652                               int upsample_left, int dx, int dy) {
   4653  (void)above;
   4654  (void)dx;
   4655  assert(dx == 1);
   4656  assert(dy > 0);
   4657 
   4658  if (bw == bh) {
   4659    switch (bw) {
   4660      case 4:
   4661        dr_prediction_z3_4x4_avx2(dst, stride, left, upsample_left, dy);
   4662        break;
   4663      case 8:
   4664        dr_prediction_z3_8x8_avx2(dst, stride, left, upsample_left, dy);
   4665        break;
   4666      case 16:
   4667        dr_prediction_z3_16x16_avx2(dst, stride, left, upsample_left, dy);
   4668        break;
   4669      case 32:
   4670        dr_prediction_z3_32x32_avx2(dst, stride, left, upsample_left, dy);
   4671        break;
   4672      case 64:
   4673        dr_prediction_z3_64x64_avx2(dst, stride, left, upsample_left, dy);
   4674        break;
   4675    }
   4676  } else {
   4677    if (bw < bh) {
   4678      if (bw + bw == bh) {
   4679        switch (bw) {
   4680          case 4:
   4681            dr_prediction_z3_4x8_avx2(dst, stride, left, upsample_left, dy);
   4682            break;
   4683          case 8:
   4684            dr_prediction_z3_8x16_avx2(dst, stride, left, upsample_left, dy);
   4685            break;
   4686          case 16:
   4687            dr_prediction_z3_16x32_avx2(dst, stride, left, upsample_left, dy);
   4688            break;
   4689          case 32:
   4690            dr_prediction_z3_32x64_avx2(dst, stride, left, upsample_left, dy);
   4691            break;
   4692        }
   4693      } else {
   4694        switch (bw) {
   4695 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   4696          case 4:
   4697            dr_prediction_z3_4x16_avx2(dst, stride, left, upsample_left, dy);
   4698            break;
   4699          case 8:
   4700            dr_prediction_z3_8x32_avx2(dst, stride, left, upsample_left, dy);
   4701            break;
   4702          case 16:
   4703            dr_prediction_z3_16x64_avx2(dst, stride, left, upsample_left, dy);
   4704            break;
   4705 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   4706        }
   4707      }
   4708    } else {
   4709      if (bh + bh == bw) {
   4710        switch (bh) {
   4711          case 4:
   4712            dr_prediction_z3_8x4_avx2(dst, stride, left, upsample_left, dy);
   4713            break;
   4714          case 8:
   4715            dr_prediction_z3_16x8_avx2(dst, stride, left, upsample_left, dy);
   4716            break;
   4717          case 16:
   4718            dr_prediction_z3_32x16_avx2(dst, stride, left, upsample_left, dy);
   4719            break;
   4720          case 32:
   4721            dr_prediction_z3_64x32_avx2(dst, stride, left, upsample_left, dy);
   4722            break;
   4723        }
   4724      } else {
   4725        switch (bh) {
   4726 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   4727          case 4:
   4728            dr_prediction_z3_16x4_avx2(dst, stride, left, upsample_left, dy);
   4729            break;
   4730          case 8:
   4731            dr_prediction_z3_32x8_avx2(dst, stride, left, upsample_left, dy);
   4732            break;
   4733          case 16:
   4734            dr_prediction_z3_64x16_avx2(dst, stride, left, upsample_left, dy);
   4735            break;
   4736 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   4737        }
   4738      }
   4739    }
   4740  }
   4741 }