tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intrapred_sse4.c (51898B)


      1 /*
      2 * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <emmintrin.h>  // SSE2
     13 #include <smmintrin.h>  /* SSE4.1 */
     14 
     15 #include "config/av1_rtcd.h"
     16 #include "aom_dsp/x86/intrapred_x86.h"
     17 #include "aom_dsp/x86/intrapred_utils.h"
     18 #include "aom_dsp/x86/lpf_common_sse2.h"
     19 
// Low bit depth functions

// Byte-select masks for _mm_blendv_epi8. Mask[0][n] has its first
// min(n, 16) bytes set to 0xff; Mask[1][n] has its first max(n - 16, 0)
// bytes set to 0xff. The pair (Mask[0][n], Mask[1][n]) therefore selects
// the first n valid bytes of an up-to-32-byte prediction row; lanes where
// the mask is zero fall back to the replicated max-base-x sample.
static DECLARE_ALIGNED(16, uint8_t, Mask[2][33][16]) = {
  { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
      0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0,
      0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0,
      0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0,
      0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0, 0 },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0 },
    // Entries 16..32 are fully set: the low 16-byte lane is entirely valid.
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff },
    { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
      0xff, 0xff, 0xff, 0xff } },
  {
      // Entries 0..16 are all zero: the high lane contributes nothing until
      // more than 16 bytes of the row are valid.
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0,
        0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0,
        0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0,
        0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0,
        0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0 },
      { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
        0xff, 0xff, 0xff, 0xff },
  },
};
    124 
    125 /* clang-format on */
// Core zone-1 (0 < angle < 90) kernel for narrow blocks. Per the callers
// below, H is the number of output pixels per row (4, 8 or 16 — one XMM
// register per row) and W is the number of rows; dst[r] receives the H
// predicted bytes for row r (byte lanes beyond H are don't-care).
static AOM_FORCE_INLINE void dr_prediction_z1_HxW_internal_sse4_1(
    int H, int W, __m128i *dst, const uint8_t *above, int upsample_above,
    int dx) {
  const int frac_bits = 6 - upsample_above;  // fractional bits of position x
  const int max_base_x = ((W + H) - 1) << upsample_above;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0, a1, a32, a16;
  __m128i diff, c3f;
  __m128i a_mbase_x;

  a16 = _mm_set1_epi16(16);
  // Replicated last usable edge sample; used past max_base_x.
  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
  c3f = _mm_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < W; r++) {
    __m128i b, res, res1, shift;
    __m128i a0_above, a1_above;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base) >> upsample_above;
    if (base_max_diff <= 0) {
      // This row (and every later one) starts past the edge: fill the
      // remaining rows with the replicated last sample and stop.
      for (int i = r; i < W; ++i) {
        dst[i] = a_mbase_x;
      }
      return;
    }
    if (base_max_diff > H) base_max_diff = H;
    a0_above = _mm_loadu_si128((__m128i *)(above + base));
    a1_above = _mm_loadu_si128((__m128i *)(above + base + 1));

    if (upsample_above) {
      // De-interleave even/odd samples of the upsampled edge so that
      // a0_above holds a[x] and a1_above holds a[x+1].
      a0_above = _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[0]);
      a1_above = _mm_srli_si128(a0_above, 8);

      shift = _mm_srli_epi16(
          _mm_and_si128(_mm_slli_epi16(_mm_set1_epi16(x), upsample_above), c3f),
          1);
    } else {
      // shift = (x & 0x3f) >> 1, i.e. the 5-bit interpolation weight.
      shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);
    }
    // lower half
    a0 = _mm_cvtepu8_epi16(a0_above);
    a1 = _mm_cvtepu8_epi16(a1_above);

    diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
    a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
    a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

    b = _mm_mullo_epi16(diff, shift);
    res = _mm_add_epi16(a32, b);
    res = _mm_srli_epi16(res, 5);

    // upper half
    a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
    a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

    diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
    a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
    a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

    b = _mm_mullo_epi16(diff, shift);
    res1 = _mm_add_epi16(a32, b);
    res1 = _mm_srli_epi16(res1, 5);

    res = _mm_packus_epi16(res, res1);

    // Keep the first base_max_diff interpolated bytes; lanes past
    // max_base_x take the replicated last sample instead.
    dst[r] =
        _mm_blendv_epi8(a_mbase_x, res, *(__m128i *)Mask[0][base_max_diff]);
    x += dx;
  }
}
    205 
    206 static void dr_prediction_z1_4xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
    207                                        const uint8_t *above,
    208                                        int upsample_above, int dx) {
    209  __m128i dstvec[16];
    210 
    211  dr_prediction_z1_HxW_internal_sse4_1(4, N, dstvec, above, upsample_above, dx);
    212  for (int i = 0; i < N; i++) {
    213    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(dstvec[i]);
    214  }
    215 }
    216 
    217 static void dr_prediction_z1_8xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
    218                                        const uint8_t *above,
    219                                        int upsample_above, int dx) {
    220  __m128i dstvec[32];
    221 
    222  dr_prediction_z1_HxW_internal_sse4_1(8, N, dstvec, above, upsample_above, dx);
    223  for (int i = 0; i < N; i++) {
    224    _mm_storel_epi64((__m128i *)(dst + stride * i), dstvec[i]);
    225  }
    226 }
    227 
    228 static void dr_prediction_z1_16xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
    229                                         const uint8_t *above,
    230                                         int upsample_above, int dx) {
    231  __m128i dstvec[64];
    232 
    233  dr_prediction_z1_HxW_internal_sse4_1(16, N, dstvec, above, upsample_above,
    234                                       dx);
    235  for (int i = 0; i < N; i++) {
    236    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
    237  }
    238 }
    239 
// Zone-1 kernel for 32-wide blocks: computes N rows of 32 pixels each.
// Each row is produced as two 16-byte halves — dstvec[r] holds pixels
// 0..15 and dstvec_h[r] holds pixels 16..31.
static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_sse4_1(
    int N, __m128i *dstvec, __m128i *dstvec_h, const uint8_t *above,
    int upsample_above, int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((32 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0, a1, a32, a16;
  __m128i a_mbase_x, diff, c3f;

  a16 = _mm_set1_epi16(16);
  // Replicated last usable edge sample; used past max_base_x.
  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
  c3f = _mm_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < N; r++) {
    __m128i b, res, res1, res16[2];
    __m128i a0_above, a1_above;

    int base = x >> frac_bits;
    int base_max_diff = (max_base_x - base);
    if (base_max_diff <= 0) {
      // Remaining rows are entirely past the edge: replicate the last
      // sample into both halves of each of them and stop.
      for (int i = r; i < N; ++i) {
        dstvec[i] = a_mbase_x;
        dstvec_h[i] = a_mbase_x;
      }
      return;
    }
    if (base_max_diff > 32) base_max_diff = 32;
    // shift = (x & 0x3f) >> 1, the 5-bit interpolation weight.
    __m128i shift = _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);

    // Process the row as two 16-byte halves (j = 0 then j = 16).
    for (int j = 0, jj = 0; j < 32; j += 16, jj++) {
      int mdiff = base_max_diff - j;
      if (mdiff <= 0) {
        // This half lies entirely past the edge.
        res16[jj] = a_mbase_x;
      } else {
        a0_above = _mm_loadu_si128((__m128i *)(above + base + j));
        a1_above = _mm_loadu_si128((__m128i *)(above + base + j + 1));

        // lower half
        a0 = _mm_cvtepu8_epi16(a0_above);
        a1 = _mm_cvtepu8_epi16(a1_above);

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
        b = _mm_mullo_epi16(diff, shift);

        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);

        // upper half
        a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
        a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);

        res16[jj] = _mm_packus_epi16(res, res1);  // 16 8bit values
      }
    }

    // Blend in the replicated last sample for lanes past max_base_x:
    // Mask[0] masks bytes 0..15, Mask[1] masks bytes 16..31.
    dstvec[r] =
        _mm_blendv_epi8(a_mbase_x, res16[0],
                        *(__m128i *)Mask[0][base_max_diff]);  // 16 8bit values

    dstvec_h[r] =
        _mm_blendv_epi8(a_mbase_x, res16[1],
                        *(__m128i *)Mask[1][base_max_diff]);  // 16 8bit values
    x += dx;
  }
}
    324 
    325 static void dr_prediction_z1_32xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
    326                                         const uint8_t *above,
    327                                         int upsample_above, int dx) {
    328  __m128i dstvec[64], dstvec_h[64];
    329  dr_prediction_z1_32xN_internal_sse4_1(N, dstvec, dstvec_h, above,
    330                                        upsample_above, dx);
    331  for (int i = 0; i < N; i++) {
    332    _mm_storeu_si128((__m128i *)(dst + stride * i), dstvec[i]);
    333    _mm_storeu_si128((__m128i *)(dst + stride * i + 16), dstvec_h[i]);
    334  }
    335 }
    336 
// Zone-1 prediction for 64-wide blocks: writes directly to dst, one row per
// iteration, processing each row as four 16-byte segments.
static void dr_prediction_z1_64xN_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                         const uint8_t *above,
                                         int upsample_above, int dx) {
  // here upsample_above is 0 by design of av1_use_intra_edge_upsample
  (void)upsample_above;
  const int frac_bits = 6;
  const int max_base_x = ((64 + N) - 1);

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0, a1, a32, a16;
  __m128i a_mbase_x, diff, c3f;
  __m128i max_base, base_inc, mask;

  a16 = _mm_set1_epi16(16);
  // Replicated last usable edge sample; used past max_base_x.
  a_mbase_x = _mm_set1_epi8((char)above[max_base_x]);
  max_base = _mm_set1_epi8(max_base_x);
  c3f = _mm_set1_epi16(0x3f);

  int x = dx;
  for (int r = 0; r < N; r++, dst += stride) {
    __m128i b, res, res1;
    int base = x >> frac_bits;
    if (base >= max_base_x) {
      // Remaining rows are entirely past the edge: fill each 64-byte row
      // with the replicated last sample and stop.
      for (int i = r; i < N; ++i) {
        _mm_storeu_si128((__m128i *)dst, a_mbase_x);
        _mm_storeu_si128((__m128i *)(dst + 16), a_mbase_x);
        _mm_storeu_si128((__m128i *)(dst + 32), a_mbase_x);
        _mm_storeu_si128((__m128i *)(dst + 48), a_mbase_x);
        dst += stride;
      }
      return;
    }

    // shift = (x & 0x3f) >> 1, the 5-bit interpolation weight.
    __m128i shift =
        _mm_srli_epi16(_mm_and_si128(_mm_set1_epi16(x), c3f), 1);  // 8 element

    __m128i a0_above, a1_above, res_val;
    for (int j = 0; j < 64; j += 16) {
      int mdif = max_base_x - (base + j);
      if (mdif <= 0) {
        // Segment entirely past the edge.
        _mm_storeu_si128((__m128i *)(dst + j), a_mbase_x);
      } else {
        a0_above =
            _mm_loadu_si128((__m128i *)(above + base + j));  // load 16 element
        a1_above = _mm_loadu_si128((__m128i *)(above + base + 1 + j));

        // lower half
        a0 = _mm_cvtepu8_epi16(a0_above);
        a1 = _mm_cvtepu8_epi16(a1_above);

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16
        b = _mm_mullo_epi16(diff, shift);

        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);

        // upper half
        a0 = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
        a1 = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

        diff = _mm_sub_epi16(a1, a0);   // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0, 5);    // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);  // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);

        res = _mm_packus_epi16(res, res1);  // 16 8bit values

        // Per-lane source positions, used to mask off lanes whose base
        // index reaches max_base_x.
        base_inc =
            _mm_setr_epi8((int8_t)(base + j), (int8_t)(base + j + 1),
                          (int8_t)(base + j + 2), (int8_t)(base + j + 3),
                          (int8_t)(base + j + 4), (int8_t)(base + j + 5),
                          (int8_t)(base + j + 6), (int8_t)(base + j + 7),
                          (int8_t)(base + j + 8), (int8_t)(base + j + 9),
                          (int8_t)(base + j + 10), (int8_t)(base + j + 11),
                          (int8_t)(base + j + 12), (int8_t)(base + j + 13),
                          (int8_t)(base + j + 14), (int8_t)(base + j + 15));

        // mask lane = 0xff where base_inc < max_base (saturating subtract
        // leaves a positive residue), selecting the interpolated value.
        mask = _mm_cmpgt_epi8(_mm_subs_epu8(max_base, base_inc),
                              _mm_setzero_si128());
        res_val = _mm_blendv_epi8(a_mbase_x, res, mask);
        _mm_storeu_si128((__m128i *)(dst + j), res_val);
      }
    }
    x += dx;
  }
}
    433 
    434 // Directional prediction, zone 1: 0 < angle < 90
    435 void av1_dr_prediction_z1_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
    436                                 const uint8_t *above, const uint8_t *left,
    437                                 int upsample_above, int dx, int dy) {
    438  (void)left;
    439  (void)dy;
    440  switch (bw) {
    441    case 4:
    442      dr_prediction_z1_4xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
    443      break;
    444    case 8:
    445      dr_prediction_z1_8xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
    446      break;
    447    case 16:
    448      dr_prediction_z1_16xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
    449      break;
    450    case 32:
    451      dr_prediction_z1_32xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
    452      break;
    453    case 64:
    454      dr_prediction_z1_64xN_sse4_1(bh, dst, stride, above, upsample_above, dx);
    455      break;
    456    default: assert(0 && "Invalid block size");
    457  }
    458  return;
    459 }
    460 
// Directional prediction, zone 2 (90 < angle < 180), for 4-wide blocks.
// Each output row blends pixels interpolated from the above edge (the "x"
// part) with pixels interpolated from the left edge (the "y" part); the
// first base_min_diff pixels of a row come from the left edge.
static void dr_prediction_z2_Nx4_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, int upsample_above,
                                        int upsample_left, int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  assert(dx > 0);
  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i a0_x, a1_x, a32, diff;

  const __m128i c3f = _mm_set1_epi16(0x3f);
  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
  const __m128i c1234 = _mm_setr_epi16(0, 1, 2, 3, 4, 0, 0, 0);
  const __m128i dy_reg = _mm_set1_epi16(dy);
  const __m128i a16 = _mm_set1_epi16(16);

  for (int r = 0; r < N; r++) {
    __m128i b, res, shift, r6, ydx;
    __m128i resx, resy, resxy;
    __m128i a0_above, a1_above;
    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    // base_shift: how many leading pixels of this row fall left of
    // min_base_x and must come from the left edge instead.
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 4) {
      base_min_diff = 4;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 3) {
      // Whole row comes from the left edge; x-part contributes nothing.
      a0_x = _mm_setzero_si128();
      a1_x = _mm_setzero_si128();
      shift = _mm_setzero_si128();
    } else {
      a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      ydx = _mm_set1_epi16(y * dx);
      r6 = _mm_slli_epi16(c1234, 6);  // per-lane column index * 64

      if (upsample_above) {
        // De-interleave even/odd samples of the upsampled edge.
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
        a1_above = _mm_srli_si128(a0_above, 8);

        shift = _mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
            1);
      } else {
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
        a1_above = _mm_srli_si128(a0_above, 1);

        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
      }
      a0_x = _mm_cvtepu8_epi16(a0_above);
      a1_x = _mm_cvtepu8_epi16(a1_above);
    }
    // y calc
    __m128i a0_y, a1_y, shifty;
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[8]);
      __m128i y_c, base_y_c_reg, mask, c1234_;
      c1234_ = _mm_srli_si128(c1234, 2);  // lanes 1..4 of c1234
      r6 = _mm_set1_epi16(r << 6);
      y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234_, dy_reg));
      base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
      // Clamp indices below min_base_y to zero.
      mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
      base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);

      // Gather left-edge samples at base_y and base_y + 1.
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);
      base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);
      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]], 0, 0, 0, 0);

      if (upsample_left) {
        shifty = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
      } else {
        shifty = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
      }
      // Pack x-part in the low 64 bits and y-part in the high 64 bits so
      // one interpolation pass computes both.
      a0_x = _mm_unpacklo_epi64(a0_x, a0_y);
      a1_x = _mm_unpacklo_epi64(a1_x, a1_y);
      shift = _mm_unpacklo_epi64(shift, shifty);
    }

    diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
    a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
    a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

    b = _mm_mullo_epi16(diff, shift);
    res = _mm_add_epi16(a32, b);
    res = _mm_srli_epi16(res, 5);

    resx = _mm_packus_epi16(res, res);
    resy = _mm_srli_si128(resx, 4);  // y-part bytes, aligned to position 0

    // Take the first base_min_diff bytes from the y-part, rest from x-part.
    resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
    *(int *)(dst) = _mm_cvtsi128_si32(resxy);
    dst += stride;
  }
}
    578 
// Directional prediction, zone 2 (90 < angle < 180), for 8-wide blocks.
// The x-part (above edge) and y-part (left edge) of each row are computed
// separately and blended: the first base_min_diff pixels come from the
// left edge, the rest from the above edge.
static void dr_prediction_z2_Nx8_sse4_1(int N, uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, int upsample_above,
                                        int upsample_left, int dx, int dy) {
  const int min_base_x = -(1 << upsample_above);
  const int min_base_y = -(1 << upsample_left);
  const int frac_bits_x = 6 - upsample_above;
  const int frac_bits_y = 6 - upsample_left;

  // pre-filter above pixels
  // store in temp buffers:
  //   above[x] * 32 + 16
  //   above[x+1] - above[x]
  // final pixels will be calculated as:
  //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
  __m128i diff, a32;
  __m128i a0_x, a1_x, a0_y, a1_y;
  __m128i a0_above, a1_above;

  const __m128i a16 = _mm_set1_epi16(16);
  const __m128i c3f = _mm_set1_epi16(0x3f);
  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
  const __m128i dy_reg = _mm_set1_epi16(dy);
  const __m128i c1234 = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);

  for (int r = 0; r < N; r++) {
    __m128i b, res, res1, shift;
    __m128i resx, resy, resxy, r6, ydx;

    int y = r + 1;
    int base_x = (-y * dx) >> frac_bits_x;
    // base_shift: how many leading pixels of this row fall left of
    // min_base_x and must come from the left edge instead.
    int base_shift = 0;
    if (base_x < (min_base_x - 1)) {
      base_shift = (min_base_x - base_x - 1) >> upsample_above;
    }
    int base_min_diff =
        (min_base_x - base_x + upsample_above) >> upsample_above;
    if (base_min_diff > 8) {
      base_min_diff = 8;
    } else {
      if (base_min_diff < 0) base_min_diff = 0;
    }

    if (base_shift > 7) {
      // Whole row comes from the left edge; x-part contributes nothing.
      resx = _mm_setzero_si128();
    } else {
      a0_above = _mm_loadu_si128((__m128i *)(above + base_x + base_shift));
      ydx = _mm_set1_epi16(y * dx);
      // Per-lane column index (0..7) * 64.
      r6 = _mm_slli_epi16(_mm_srli_si128(c1234, 2), 6);
      if (upsample_above) {
        // De-interleave even/odd samples of the upsampled edge.
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)EvenOddMaskx[base_shift]);
        a1_above = _mm_srli_si128(a0_above, 8);

        shift = _mm_srli_epi16(
            _mm_and_si128(
                _mm_slli_epi16(_mm_sub_epi16(r6, ydx), upsample_above), c3f),
            1);
      } else {
        a1_above = _mm_srli_si128(a0_above, 1);
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
        a1_above =
            _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);

        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);
      }
      a0_x = _mm_cvtepu8_epi16(a0_above);
      a1_x = _mm_cvtepu8_epi16(a1_above);

      diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
      a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
      a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

      b = _mm_mullo_epi16(diff, shift);
      res = _mm_add_epi16(a32, b);
      res = _mm_srli_epi16(res, 5);
      resx = _mm_packus_epi16(res, res);
    }

    // y calc
    if (base_x < min_base_x) {
      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
      __m128i y_c, base_y_c_reg, mask;
      r6 = _mm_set1_epi16(r << 6);
      y_c = _mm_sub_epi16(r6, _mm_mullo_epi16(c1234, dy_reg));
      base_y_c_reg = _mm_srai_epi16(y_c, frac_bits_y);
      // Clamp indices below min_base_y to zero.
      mask = _mm_cmpgt_epi16(min_y_base, base_y_c_reg);
      base_y_c_reg = _mm_andnot_si128(mask, base_y_c_reg);
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);

      // Gather left-edge samples at base_y and base_y + 1.
      a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);
      base_y_c_reg = _mm_add_epi16(base_y_c_reg, _mm_srli_epi16(a16, 4));
      _mm_store_si128((__m128i *)base_y_c, base_y_c_reg);

      a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                            left[base_y_c[2]], left[base_y_c[3]],
                            left[base_y_c[4]], left[base_y_c[5]],
                            left[base_y_c[6]], left[base_y_c[7]]);

      if (upsample_left) {
        shift = _mm_srli_epi16(
            _mm_and_si128(_mm_slli_epi16(y_c, upsample_left), c3f), 1);
      } else {
        shift = _mm_srli_epi16(_mm_and_si128(y_c, c3f), 1);
      }

      diff = _mm_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
      a32 = _mm_slli_epi16(a0_y, 5);     // a[x] * 32
      a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

      b = _mm_mullo_epi16(diff, shift);
      res1 = _mm_add_epi16(a32, b);
      res1 = _mm_srli_epi16(res1, 5);

      resy = _mm_packus_epi16(res1, res1);
      // Take the first base_min_diff bytes from the y-part, rest from
      // the x-part.
      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
      _mm_storel_epi64((__m128i *)dst, resxy);
    } else {
      _mm_storel_epi64((__m128i *)dst, resx);
    }

    dst += stride;
  }
}
    707 
// Directional prediction zone 2 (90 < angle < 180) for blocks with
// width >= 16.  The block is produced row by row, 16 output pixels per inner
// iteration.  Each pixel interpolates between two reference samples: columns
// whose projected position lands on the top edge use 'above' (x branch), the
// remaining columns use 'left' (y branch); the partial results are merged
// with a per-byte blend mask.
static void dr_prediction_z2_HxW_sse4_1(int H, int W, uint8_t *dst,
                                        ptrdiff_t stride, const uint8_t *above,
                                        const uint8_t *left, int upsample_above,
                                        int upsample_left, int dx, int dy) {
  // here upsample_above and upsample_left are 0 by design of
  // av1_use_intra_edge_upsample
  const int min_base_x = -1;
  const int min_base_y = -1;
  (void)upsample_above;
  (void)upsample_left;
  const int frac_bits_x = 6;  // positions carry 6 fractional bits (1/64 pel)
  const int frac_bits_y = 6;

  __m128i a0_x, a1_x, a0_y, a1_y, a0_y_h, a1_y_h, a32;
  __m128i diff, shifty, shifty_h;
  __m128i a0_above, a1_above;

  // Scratch used only by the scalar-gather fallback in the y branch.
  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
  const __m128i a16 = _mm_set1_epi16(16);
  const __m128i c1 = _mm_srli_epi16(a16, 4);  // all lanes == 1
  const __m128i min_y_base = _mm_set1_epi16(min_base_y);
  const __m128i c3f = _mm_set1_epi16(0x3f);  // fractional-bits mask
  const __m128i dy256 = _mm_set1_epi16(dy);
  const __m128i c0123 = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
  const __m128i c0123_h = _mm_setr_epi16(8, 9, 10, 11, 12, 13, 14, 15);
  const __m128i c1234 = _mm_add_epi16(c0123, c1);
  const __m128i c1234_h = _mm_add_epi16(c0123_h, c1);

  for (int r = 0; r < H; r++) {
    __m128i b, res, res1, shift, reg_j, r6, ydx;
    __m128i resx, resy;
    __m128i resxy;
    int y = r + 1;
    ydx = _mm_set1_epi16((int16_t)(y * dx));

    // Leftmost 'above' index (integer pel) that column 0 of this row maps to.
    int base_x = (-y * dx) >> frac_bits_x;
    for (int j = 0; j < W; j += 16) {
      reg_j = _mm_set1_epi16(j);
      // Leading columns of this 16-wide group that project left of the valid
      // 'above' range and therefore must not drive the load below.
      int base_shift = 0;
      if ((base_x + j) < (min_base_x - 1)) {
        base_shift = (min_base_x - (base_x + j) - 1);
      }
      // Number of columns (clamped to [0,16]) that take the y branch in the
      // final blend.
      int base_min_diff = (min_base_x - base_x - j);
      if (base_min_diff > 16) {
        base_min_diff = 16;
      } else {
        if (base_min_diff < 0) base_min_diff = 0;
      }

      // x calc: interpolate from 'above' when at least one of the 16 columns
      // references the top edge (base_shift == 16 means none do).
      if (base_shift < 16) {
        a0_above =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + j));
        a1_above =
            _mm_loadu_si128((__m128i *)(above + base_x + base_shift + 1 + j));
        // Realign so lane i holds the sample for output column i; lanes below
        // base_shift are don't-cares and get replaced by the final blend.
        a0_above =
            _mm_shuffle_epi8(a0_above, *(__m128i *)LoadMaskx[base_shift]);
        a1_above =
            _mm_shuffle_epi8(a1_above, *(__m128i *)LoadMaskx[base_shift]);

        // Low 8 columns, widened to 16 bits.
        a0_x = _mm_cvtepu8_epi16(a0_above);
        a1_x = _mm_cvtepu8_epi16(a1_above);

        // shift = (((x << 6) - y*dx) & 0x3f) >> 1: fractional weight in
        // [0,31] for the 5-bit interpolation below.
        r6 = _mm_slli_epi16(_mm_add_epi16(c0123, reg_j), 6);
        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);

        diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);  // 16 16-bit values

        // High 8 columns, same computation.
        a0_x = _mm_cvtepu8_epi16(_mm_srli_si128(a0_above, 8));
        a1_x = _mm_cvtepu8_epi16(_mm_srli_si128(a1_above, 8));

        r6 = _mm_slli_epi16(_mm_add_epi16(c0123_h, reg_j), 6);
        shift = _mm_srli_epi16(_mm_and_si128(_mm_sub_epi16(r6, ydx), c3f), 1);

        diff = _mm_sub_epi16(a1_x, a0_x);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_x, 5);     // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shift);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);  // 16 16-bit values

        resx = _mm_packus_epi16(res, res1);
      } else {
        resx = _mm_setzero_si128();
      }

      // y calc: interpolate from 'left' for columns whose projection misses
      // the top edge.
      if (base_x < min_base_x) {
        __m128i c_reg, c_reg_h, y_reg, y_reg_h, base_y, base_y_h;
        __m128i mask, mask_h, mul16, mul16_h;
        r6 = _mm_set1_epi16(r << 6);
        c_reg = _mm_add_epi16(reg_j, c1234);
        c_reg_h = _mm_add_epi16(reg_j, c1234_h);
        // Clamp the unsigned product (x+1)*dy to 0x7fff (== min_y_base >> 1)
        // so the signed 16-bit subtraction below cannot wrap; such large
        // products correspond to out-of-range base_y lanes anyway.
        mul16 = _mm_min_epu16(_mm_mullo_epi16(c_reg, dy256),
                              _mm_srli_epi16(min_y_base, 1));
        mul16_h = _mm_min_epu16(_mm_mullo_epi16(c_reg_h, dy256),
                                _mm_srli_epi16(min_y_base, 1));
        y_reg = _mm_sub_epi16(r6, mul16);
        y_reg_h = _mm_sub_epi16(r6, mul16_h);

        base_y = _mm_srai_epi16(y_reg, frac_bits_y);
        base_y_h = _mm_srai_epi16(y_reg_h, frac_bits_y);
        mask = _mm_cmpgt_epi16(min_y_base, base_y);
        mask_h = _mm_cmpgt_epi16(min_y_base, base_y_h);

        // Clamp indices below min_base_y up to min_base_y.
        base_y = _mm_blendv_epi8(base_y, min_y_base, mask);
        base_y_h = _mm_blendv_epi8(base_y_h, min_y_base, mask_h);
        // dy > 0, so base_y decreases across lanes: lane 0 of the low half is
        // the maximum, lane 7 of the high half the minimum.
        int16_t min_y = (int16_t)_mm_extract_epi16(base_y_h, 7);
        int16_t max_y = (int16_t)_mm_extract_epi16(base_y, 0);
        int16_t offset_diff = max_y - min_y;

        if (offset_diff < 16) {
          // Fast path: all 16 indices fit in one 16-byte window of 'left';
          // load the window once and gather via a byte shuffle.
          __m128i min_y_reg = _mm_set1_epi16(min_y);

          __m128i base_y_offset = _mm_sub_epi16(base_y, min_y_reg);
          __m128i base_y_offset_h = _mm_sub_epi16(base_y_h, min_y_reg);
          __m128i y_offset = _mm_packs_epi16(base_y_offset, base_y_offset_h);

          __m128i a0_mask = _mm_loadu_si128((__m128i *)(left + min_y));
          __m128i a1_mask = _mm_loadu_si128((__m128i *)(left + min_y + 1));
          __m128i LoadMask =
              _mm_loadu_si128((__m128i *)(LoadMaskz2[offset_diff / 4]));

          a0_mask = _mm_and_si128(a0_mask, LoadMask);
          a1_mask = _mm_and_si128(a1_mask, LoadMask);

          a0_mask = _mm_shuffle_epi8(a0_mask, y_offset);
          a1_mask = _mm_shuffle_epi8(a1_mask, y_offset);
          a0_y = _mm_cvtepu8_epi16(a0_mask);
          a1_y = _mm_cvtepu8_epi16(a1_mask);
          a0_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a0_mask, 8));
          a1_y_h = _mm_cvtepu8_epi16(_mm_srli_si128(a1_mask, 8));
        } else {
          // Slow path: the index range is too wide for one load; spill the
          // indices and gather element by element.  Clamped (negative) lanes
          // are zeroed first so the scalar loads read left[0].
          base_y = _mm_andnot_si128(mask, base_y);
          base_y_h = _mm_andnot_si128(mask_h, base_y_h);
          _mm_store_si128((__m128i *)base_y_c, base_y);
          _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);

          a0_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                                left[base_y_c[2]], left[base_y_c[3]],
                                left[base_y_c[4]], left[base_y_c[5]],
                                left[base_y_c[6]], left[base_y_c[7]]);
          a0_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
                                  left[base_y_c[10]], left[base_y_c[11]],
                                  left[base_y_c[12]], left[base_y_c[13]],
                                  left[base_y_c[14]], left[base_y_c[15]]);
          base_y = _mm_add_epi16(base_y, c1);
          base_y_h = _mm_add_epi16(base_y_h, c1);
          _mm_store_si128((__m128i *)base_y_c, base_y);
          _mm_store_si128((__m128i *)&base_y_c[8], base_y_h);

          a1_y = _mm_setr_epi16(left[base_y_c[0]], left[base_y_c[1]],
                                left[base_y_c[2]], left[base_y_c[3]],
                                left[base_y_c[4]], left[base_y_c[5]],
                                left[base_y_c[6]], left[base_y_c[7]]);
          a1_y_h = _mm_setr_epi16(left[base_y_c[8]], left[base_y_c[9]],
                                  left[base_y_c[10]], left[base_y_c[11]],
                                  left[base_y_c[12]], left[base_y_c[13]],
                                  left[base_y_c[14]], left[base_y_c[15]]);
        }
        // Fractional y position, halved to match the *32 rounding scheme.
        shifty = _mm_srli_epi16(_mm_and_si128(y_reg, c3f), 1);
        shifty_h = _mm_srli_epi16(_mm_and_si128(y_reg_h, c3f), 1);

        diff = _mm_sub_epi16(a1_y, a0_y);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_y, 5);     // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);     // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shifty);
        res = _mm_add_epi16(a32, b);
        res = _mm_srli_epi16(res, 5);  // 16 16-bit values

        diff = _mm_sub_epi16(a1_y_h, a0_y_h);  // a[x+1] - a[x]
        a32 = _mm_slli_epi16(a0_y_h, 5);       // a[x] * 32
        a32 = _mm_add_epi16(a32, a16);         // a[x] * 32 + 16

        b = _mm_mullo_epi16(diff, shifty_h);
        res1 = _mm_add_epi16(a32, b);
        res1 = _mm_srli_epi16(res1, 5);  // 16 16-bit values
        resy = _mm_packus_epi16(res, res1);
      } else {
        resy = _mm_setzero_si128();
      }
      // First base_min_diff bytes take the y result, the rest the x result.
      resxy = _mm_blendv_epi8(resx, resy, *(__m128i *)Mask[0][base_min_diff]);
      _mm_storeu_si128((__m128i *)(dst + j), resxy);
    }  // for j
    dst += stride;
  }
}
    902 
    903 // Directional prediction, zone 2: 90 < angle < 180
    904 void av1_dr_prediction_z2_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
    905                                 const uint8_t *above, const uint8_t *left,
    906                                 int upsample_above, int upsample_left, int dx,
    907                                 int dy) {
    908  assert(dx > 0);
    909  assert(dy > 0);
    910  switch (bw) {
    911    case 4:
    912      dr_prediction_z2_Nx4_sse4_1(bh, dst, stride, above, left, upsample_above,
    913                                  upsample_left, dx, dy);
    914      break;
    915    case 8:
    916      dr_prediction_z2_Nx8_sse4_1(bh, dst, stride, above, left, upsample_above,
    917                                  upsample_left, dx, dy);
    918      break;
    919    default:
    920      dr_prediction_z2_HxW_sse4_1(bh, bw, dst, stride, above, left,
    921                                  upsample_above, upsample_left, dx, dy);
    922  }
    923  return;
    924 }
    925 
    926 // z3 functions
    927 static void dr_prediction_z3_4x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
    928                                        const uint8_t *left, int upsample_left,
    929                                        int dy) {
    930  __m128i dstvec[4], d[4];
    931 
    932  dr_prediction_z1_HxW_internal_sse4_1(4, 4, dstvec, left, upsample_left, dy);
    933  transpose4x8_8x4_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
    934                            &d[0], &d[1], &d[2], &d[3]);
    935 
    936  *(int *)(dst + stride * 0) = _mm_cvtsi128_si32(d[0]);
    937  *(int *)(dst + stride * 1) = _mm_cvtsi128_si32(d[1]);
    938  *(int *)(dst + stride * 2) = _mm_cvtsi128_si32(d[2]);
    939  *(int *)(dst + stride * 3) = _mm_cvtsi128_si32(d[3]);
    940  return;
    941 }
    942 
    943 static void dr_prediction_z3_8x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
    944                                        const uint8_t *left, int upsample_left,
    945                                        int dy) {
    946  __m128i dstvec[8], d[8];
    947 
    948  dr_prediction_z1_HxW_internal_sse4_1(8, 8, dstvec, left, upsample_left, dy);
    949  transpose8x8_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4],
    950                    &dstvec[5], &dstvec[6], &dstvec[7], &d[0], &d[1], &d[2],
    951                    &d[3]);
    952 
    953  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
    954  _mm_storel_epi64((__m128i *)(dst + 1 * stride), _mm_srli_si128(d[0], 8));
    955  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[1]);
    956  _mm_storel_epi64((__m128i *)(dst + 3 * stride), _mm_srli_si128(d[1], 8));
    957  _mm_storel_epi64((__m128i *)(dst + 4 * stride), d[2]);
    958  _mm_storel_epi64((__m128i *)(dst + 5 * stride), _mm_srli_si128(d[2], 8));
    959  _mm_storel_epi64((__m128i *)(dst + 6 * stride), d[3]);
    960  _mm_storel_epi64((__m128i *)(dst + 7 * stride), _mm_srli_si128(d[3], 8));
    961 }
    962 
    963 static void dr_prediction_z3_4x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
    964                                        const uint8_t *left, int upsample_left,
    965                                        int dy) {
    966  __m128i dstvec[4], d[8];
    967 
    968  dr_prediction_z1_HxW_internal_sse4_1(8, 4, dstvec, left, upsample_left, dy);
    969  transpose4x8_8x4_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &d[0],
    970                        &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
    971  for (int i = 0; i < 8; i++) {
    972    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
    973  }
    974 }
    975 
    976 static void dr_prediction_z3_8x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
    977                                        const uint8_t *left, int upsample_left,
    978                                        int dy) {
    979  __m128i dstvec[8], d[4];
    980 
    981  dr_prediction_z1_HxW_internal_sse4_1(4, 8, dstvec, left, upsample_left, dy);
    982  transpose8x8_low_sse2(&dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3],
    983                        &dstvec[4], &dstvec[5], &dstvec[6], &dstvec[7], &d[0],
    984                        &d[1], &d[2], &d[3]);
    985  _mm_storel_epi64((__m128i *)(dst + 0 * stride), d[0]);
    986  _mm_storel_epi64((__m128i *)(dst + 1 * stride), d[1]);
    987  _mm_storel_epi64((__m128i *)(dst + 2 * stride), d[2]);
    988  _mm_storel_epi64((__m128i *)(dst + 3 * stride), d[3]);
    989 }
    990 
    991 static void dr_prediction_z3_8x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
    992                                         const uint8_t *left, int upsample_left,
    993                                         int dy) {
    994  __m128i dstvec[8], d[8];
    995 
    996  dr_prediction_z1_HxW_internal_sse4_1(16, 8, dstvec, left, upsample_left, dy);
    997  transpose8x16_16x8_sse2(dstvec, dstvec + 1, dstvec + 2, dstvec + 3,
    998                          dstvec + 4, dstvec + 5, dstvec + 6, dstvec + 7, d,
    999                          d + 1, d + 2, d + 3, d + 4, d + 5, d + 6, d + 7);
   1000  for (int i = 0; i < 8; i++) {
   1001    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
   1002    _mm_storel_epi64((__m128i *)(dst + (i + 8) * stride),
   1003                     _mm_srli_si128(d[i], 8));
   1004  }
   1005 }
   1006 
   1007 static void dr_prediction_z3_16x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1008                                         const uint8_t *left, int upsample_left,
   1009                                         int dy) {
   1010  __m128i dstvec[16], d[16];
   1011 
   1012  dr_prediction_z1_HxW_internal_sse4_1(8, 16, dstvec, left, upsample_left, dy);
   1013  transpose16x8_8x16_sse2(
   1014      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
   1015      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
   1016      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
   1017      &d[3], &d[4], &d[5], &d[6], &d[7]);
   1018 
   1019  for (int i = 0; i < 8; i++) {
   1020    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   1021  }
   1022 }
   1023 
   1024 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1025 static void dr_prediction_z3_4x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1026                                         const uint8_t *left, int upsample_left,
   1027                                         int dy) {
   1028  __m128i dstvec[4], d[16];
   1029 
   1030  dr_prediction_z1_HxW_internal_sse4_1(16, 4, dstvec, left, upsample_left, dy);
   1031  transpose4x16_sse2(dstvec, d);
   1032  for (int i = 0; i < 16; i++) {
   1033    *(int *)(dst + stride * i) = _mm_cvtsi128_si32(d[i]);
   1034  }
   1035 }
   1036 
   1037 static void dr_prediction_z3_16x4_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1038                                         const uint8_t *left, int upsample_left,
   1039                                         int dy) {
   1040  __m128i dstvec[16], d[8];
   1041 
   1042  dr_prediction_z1_HxW_internal_sse4_1(4, 16, dstvec, left, upsample_left, dy);
   1043  for (int i = 4; i < 8; i++) {
   1044    d[i] = _mm_setzero_si128();
   1045  }
   1046  transpose16x8_8x16_sse2(
   1047      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
   1048      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
   1049      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
   1050      &d[3], &d[4], &d[5], &d[6], &d[7]);
   1051 
   1052  for (int i = 0; i < 4; i++) {
   1053    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   1054  }
   1055 }
   1056 
   1057 static void dr_prediction_z3_8x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1058                                         const uint8_t *left, int upsample_left,
   1059                                         int dy) {
   1060  __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];
   1061 
   1062  dr_prediction_z1_32xN_internal_sse4_1(8, dstvec, dstvec_h, left,
   1063                                        upsample_left, dy);
   1064  for (int i = 8; i < 16; i++) {
   1065    dstvec[i] = _mm_setzero_si128();
   1066    dstvec_h[i] = _mm_setzero_si128();
   1067  }
   1068  transpose16x16_sse2(dstvec, d);
   1069  transpose16x16_sse2(dstvec_h, d_h);
   1070 
   1071  for (int i = 0; i < 16; i++) {
   1072    _mm_storel_epi64((__m128i *)(dst + i * stride), d[i]);
   1073  }
   1074  for (int i = 0; i < 16; i++) {
   1075    _mm_storel_epi64((__m128i *)(dst + (i + 16) * stride), d_h[i]);
   1076  }
   1077 }
   1078 
   1079 static void dr_prediction_z3_32x8_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1080                                         const uint8_t *left, int upsample_left,
   1081                                         int dy) {
   1082  __m128i dstvec[32], d[16];
   1083 
   1084  dr_prediction_z1_HxW_internal_sse4_1(8, 32, dstvec, left, upsample_left, dy);
   1085 
   1086  transpose16x8_8x16_sse2(
   1087      &dstvec[0], &dstvec[1], &dstvec[2], &dstvec[3], &dstvec[4], &dstvec[5],
   1088      &dstvec[6], &dstvec[7], &dstvec[8], &dstvec[9], &dstvec[10], &dstvec[11],
   1089      &dstvec[12], &dstvec[13], &dstvec[14], &dstvec[15], &d[0], &d[1], &d[2],
   1090      &d[3], &d[4], &d[5], &d[6], &d[7]);
   1091  transpose16x8_8x16_sse2(
   1092      &dstvec[0 + 16], &dstvec[1 + 16], &dstvec[2 + 16], &dstvec[3 + 16],
   1093      &dstvec[4 + 16], &dstvec[5 + 16], &dstvec[6 + 16], &dstvec[7 + 16],
   1094      &dstvec[8 + 16], &dstvec[9 + 16], &dstvec[10 + 16], &dstvec[11 + 16],
   1095      &dstvec[12 + 16], &dstvec[13 + 16], &dstvec[14 + 16], &dstvec[15 + 16],
   1096      &d[0 + 8], &d[1 + 8], &d[2 + 8], &d[3 + 8], &d[4 + 8], &d[5 + 8],
   1097      &d[6 + 8], &d[7 + 8]);
   1098 
   1099  for (int i = 0; i < 8; i++) {
   1100    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   1101    _mm_storeu_si128((__m128i *)(dst + i * stride + 16), d[i + 8]);
   1102  }
   1103 }
   1104 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1105 
   1106 static void dr_prediction_z3_16x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1107                                          const uint8_t *left,
   1108                                          int upsample_left, int dy) {
   1109  __m128i dstvec[16], d[16];
   1110 
   1111  dr_prediction_z1_HxW_internal_sse4_1(16, 16, dstvec, left, upsample_left, dy);
   1112  transpose16x16_sse2(dstvec, d);
   1113 
   1114  for (int i = 0; i < 16; i++) {
   1115    _mm_storeu_si128((__m128i *)(dst + i * stride), d[i]);
   1116  }
   1117 }
   1118 
   1119 static void dr_prediction_z3_32x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1120                                          const uint8_t *left,
   1121                                          int upsample_left, int dy) {
   1122  __m128i dstvec[32], d[32], dstvec_h[32], d_h[32];
   1123 
   1124  dr_prediction_z1_32xN_internal_sse4_1(32, dstvec, dstvec_h, left,
   1125                                        upsample_left, dy);
   1126  transpose16x16_sse2(dstvec, d);
   1127  transpose16x16_sse2(dstvec_h, d_h);
   1128  transpose16x16_sse2(dstvec + 16, d + 16);
   1129  transpose16x16_sse2(dstvec_h + 16, d_h + 16);
   1130  for (int j = 0; j < 16; j++) {
   1131    _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
   1132    _mm_storeu_si128((__m128i *)(dst + j * stride + 16), d[j + 16]);
   1133  }
   1134  for (int j = 0; j < 16; j++) {
   1135    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
   1136    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride + 16), d_h[j + 16]);
   1137  }
   1138 }
   1139 
   1140 static void dr_prediction_z3_64x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1141                                          const uint8_t *left,
   1142                                          int upsample_left, int dy) {
   1143  uint8_t dstT[64 * 64];
   1144  dr_prediction_z1_64xN_sse4_1(64, dstT, 64, left, upsample_left, dy);
   1145  transpose(dstT, 64, dst, stride, 64, 64);
   1146 }
   1147 
   1148 static void dr_prediction_z3_16x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1149                                          const uint8_t *left,
   1150                                          int upsample_left, int dy) {
   1151  __m128i dstvec[16], d[16], dstvec_h[16], d_h[16];
   1152 
   1153  dr_prediction_z1_32xN_internal_sse4_1(16, dstvec, dstvec_h, left,
   1154                                        upsample_left, dy);
   1155  transpose16x16_sse2(dstvec, d);
   1156  transpose16x16_sse2(dstvec_h, d_h);
   1157  // store
   1158  for (int j = 0; j < 16; j++) {
   1159    _mm_storeu_si128((__m128i *)(dst + j * stride), d[j]);
   1160    _mm_storeu_si128((__m128i *)(dst + (j + 16) * stride), d_h[j]);
   1161  }
   1162 }
   1163 
   1164 static void dr_prediction_z3_32x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1165                                          const uint8_t *left,
   1166                                          int upsample_left, int dy) {
   1167  __m128i dstvec[32], d[16];
   1168 
   1169  dr_prediction_z1_HxW_internal_sse4_1(16, 32, dstvec, left, upsample_left, dy);
   1170  for (int i = 0; i < 32; i += 16) {
   1171    transpose16x16_sse2((dstvec + i), d);
   1172    for (int j = 0; j < 16; j++) {
   1173      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
   1174    }
   1175  }
   1176 }
   1177 
   1178 static void dr_prediction_z3_32x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1179                                          const uint8_t *left,
   1180                                          int upsample_left, int dy) {
   1181  uint8_t dstT[64 * 32];
   1182  dr_prediction_z1_64xN_sse4_1(32, dstT, 64, left, upsample_left, dy);
   1183  transpose(dstT, 64, dst, stride, 32, 64);
   1184 }
   1185 
   1186 static void dr_prediction_z3_64x32_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1187                                          const uint8_t *left,
   1188                                          int upsample_left, int dy) {
   1189  uint8_t dstT[32 * 64];
   1190  dr_prediction_z1_32xN_sse4_1(64, dstT, 32, left, upsample_left, dy);
   1191  transpose(dstT, 32, dst, stride, 64, 32);
   1192  return;
   1193 }
   1194 
   1195 #if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1196 static void dr_prediction_z3_16x64_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1197                                          const uint8_t *left,
   1198                                          int upsample_left, int dy) {
   1199  uint8_t dstT[64 * 16];
   1200  dr_prediction_z1_64xN_sse4_1(16, dstT, 64, left, upsample_left, dy);
   1201  transpose(dstT, 64, dst, stride, 16, 64);
   1202 }
   1203 
   1204 static void dr_prediction_z3_64x16_sse4_1(uint8_t *dst, ptrdiff_t stride,
   1205                                          const uint8_t *left,
   1206                                          int upsample_left, int dy) {
   1207  __m128i dstvec[64], d[16];
   1208 
   1209  dr_prediction_z1_HxW_internal_sse4_1(16, 64, dstvec, left, upsample_left, dy);
   1210  for (int i = 0; i < 64; i += 16) {
   1211    transpose16x16_sse2(dstvec + i, d);
   1212    for (int j = 0; j < 16; j++) {
   1213      _mm_storeu_si128((__m128i *)(dst + j * stride + i), d[j]);
   1214    }
   1215  }
   1216 }
   1217 #endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
   1218 
// Directional prediction, zone 3: 180 < angle < 270.  Zone 3 reads only the
// left edge, so every specialized kernel below runs a zone-1 style
// prediction along 'left' and transposes the result into dst.  Dispatch is
// by aspect ratio: square, 1:2 / 2:1, and 1:4 / 4:1 (the 1:4 and 4:1 shapes
// are compiled only for decoder / non-realtime builds, per the #if guards).
void av1_dr_prediction_z3_sse4_1(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                                 const uint8_t *above, const uint8_t *left,
                                 int upsample_left, int dx, int dy) {
  (void)above;
  (void)dx;
  assert(dx == 1);
  assert(dy > 0);

  if (bw == bh) {
    // Square blocks.
    switch (bw) {
      case 4:
        dr_prediction_z3_4x4_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 8:
        dr_prediction_z3_8x8_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 16:
        dr_prediction_z3_16x16_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 32:
        dr_prediction_z3_32x32_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      case 64:
        dr_prediction_z3_64x64_sse4_1(dst, stride, left, upsample_left, dy);
        break;
      default: assert(0 && "Invalid block size");
    }
  } else {
    if (bw < bh) {
      if (bw + bw == bh) {
        // 1:2 tall blocks (bh == 2 * bw).
        switch (bw) {
          case 4:
            dr_prediction_z3_4x8_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x32_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_32x64_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
        }
      } else {
        // 1:4 tall blocks (bh == 4 * bw); only available in decoder /
        // non-realtime builds.
        switch (bw) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
          case 4:
            dr_prediction_z3_4x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_8x32_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_16x64_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        }
      }
    } else {
      if (bh + bh == bw) {
        // 2:1 wide blocks (bw == 2 * bh).
        switch (bh) {
          case 4:
            dr_prediction_z3_8x4_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_16x8_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_32x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 32:
            dr_prediction_z3_64x32_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
        }
      } else {
        // 4:1 wide blocks (bw == 4 * bh); only available in decoder /
        // non-realtime builds.
        switch (bh) {
#if !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
          case 4:
            dr_prediction_z3_16x4_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 8:
            dr_prediction_z3_32x8_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          case 16:
            dr_prediction_z3_64x16_sse4_1(dst, stride, left, upsample_left, dy);
            break;
          default: assert(0 && "Invalid block size");
#endif  // !CONFIG_REALTIME_ONLY || CONFIG_AV1_DECODER
        }
      }
    }
  }
}