tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

aom_convolve_copy_avx2.c (8446B)


      1 /*
      2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <immintrin.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 
     16 static inline void copy_128(const uint8_t *src, uint8_t *dst) {
     17  __m256i s[4];
     18  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
     19  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
     20  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 32));
     21  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 32));
     22  _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
     23  _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
     24  _mm256_storeu_si256((__m256i *)(dst + 2 * 32), s[2]);
     25  _mm256_storeu_si256((__m256i *)(dst + 3 * 32), s[3]);
     26 }
     27 
     28 void aom_convolve_copy_avx2(const uint8_t *src, ptrdiff_t src_stride,
     29                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
     30  // The w == 16 case uses _mm_store_si128(), which requires its output address
     31  // be aligned on a 16-byte boundary.
     32  if (w == 16) {
     33    assert(!((intptr_t)dst % 16));
     34    assert(!(dst_stride % 16));
     35  }
     36 
     37  if (w == 2) {
     38    do {
     39      memmove(dst, src, 2 * sizeof(*src));
     40      src += src_stride;
     41      dst += dst_stride;
     42      memmove(dst, src, 2 * sizeof(*src));
     43      src += src_stride;
     44      dst += dst_stride;
     45      h -= 2;
     46    } while (h);
     47  } else if (w == 4) {
     48    do {
     49      memmove(dst, src, 4 * sizeof(*src));
     50      src += src_stride;
     51      dst += dst_stride;
     52      memmove(dst, src, 4 * sizeof(*src));
     53      src += src_stride;
     54      dst += dst_stride;
     55      h -= 2;
     56    } while (h);
     57  } else if (w == 8) {
     58    do {
     59      __m128i s[2];
     60      s[0] = _mm_loadl_epi64((__m128i *)src);
     61      src += src_stride;
     62      s[1] = _mm_loadl_epi64((__m128i *)src);
     63      src += src_stride;
     64      _mm_storel_epi64((__m128i *)dst, s[0]);
     65      dst += dst_stride;
     66      _mm_storel_epi64((__m128i *)dst, s[1]);
     67      dst += dst_stride;
     68      h -= 2;
     69    } while (h);
     70  } else if (w == 16) {
     71    do {
     72      __m128i s[2];
     73      s[0] = _mm_loadu_si128((__m128i *)src);
     74      src += src_stride;
     75      s[1] = _mm_loadu_si128((__m128i *)src);
     76      src += src_stride;
     77      _mm_store_si128((__m128i *)dst, s[0]);
     78      dst += dst_stride;
     79      _mm_store_si128((__m128i *)dst, s[1]);
     80      dst += dst_stride;
     81      h -= 2;
     82    } while (h);
     83  } else if (w == 32) {
     84    do {
     85      __m256i s[2];
     86      s[0] = _mm256_loadu_si256((__m256i *)src);
     87      src += src_stride;
     88      s[1] = _mm256_loadu_si256((__m256i *)src);
     89      src += src_stride;
     90      _mm256_storeu_si256((__m256i *)dst, s[0]);
     91      dst += dst_stride;
     92      _mm256_storeu_si256((__m256i *)dst, s[1]);
     93      dst += dst_stride;
     94      h -= 2;
     95    } while (h);
     96  } else if (w == 64) {
     97    do {
     98      __m256i s[4];
     99      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
    100      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
    101      src += src_stride;
    102      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 32));
    103      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 32));
    104      src += src_stride;
    105      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[0]);
    106      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[1]);
    107      dst += dst_stride;
    108      _mm256_storeu_si256((__m256i *)(dst + 0 * 32), s[2]);
    109      _mm256_storeu_si256((__m256i *)(dst + 1 * 32), s[3]);
    110      dst += dst_stride;
    111      h -= 2;
    112    } while (h);
    113  } else {
    114    do {
    115      copy_128(src, dst);
    116      src += src_stride;
    117      dst += dst_stride;
    118      copy_128(src, dst);
    119      src += src_stride;
    120      dst += dst_stride;
    121      h -= 2;
    122    } while (h);
    123  }
    124 }
    125 
    126 #if CONFIG_AV1_HIGHBITDEPTH
    127 
    128 static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
    129  __m256i s[4];
    130  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
    131  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
    132  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
    133  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
    134  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
    135  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
    136  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
    137  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
    138 }
    139 
    140 static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
    141  __m256i s[8];
    142  s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
    143  s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
    144  s[2] = _mm256_loadu_si256((__m256i *)(src + 2 * 16));
    145  s[3] = _mm256_loadu_si256((__m256i *)(src + 3 * 16));
    146  s[4] = _mm256_loadu_si256((__m256i *)(src + 4 * 16));
    147  s[5] = _mm256_loadu_si256((__m256i *)(src + 5 * 16));
    148  s[6] = _mm256_loadu_si256((__m256i *)(src + 6 * 16));
    149  s[7] = _mm256_loadu_si256((__m256i *)(src + 7 * 16));
    150 
    151  _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
    152  _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
    153  _mm256_storeu_si256((__m256i *)(dst + 2 * 16), s[2]);
    154  _mm256_storeu_si256((__m256i *)(dst + 3 * 16), s[3]);
    155  _mm256_storeu_si256((__m256i *)(dst + 4 * 16), s[4]);
    156  _mm256_storeu_si256((__m256i *)(dst + 5 * 16), s[5]);
    157  _mm256_storeu_si256((__m256i *)(dst + 6 * 16), s[6]);
    158  _mm256_storeu_si256((__m256i *)(dst + 7 * 16), s[7]);
    159 }
    160 
    161 void aom_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride,
    162                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
    163                                   int h) {
    164  // The w == 8 case uses _mm_store_si128(), which requires its output address
    165  // be aligned on a 16-byte boundary.
    166  if (w == 8) {
    167    assert(!((intptr_t)dst % 16));
    168    assert(!(dst_stride % 8));
    169  }
    170 
    171  if (w == 2) {
    172    do {
    173      memmove(dst, src, 2 * sizeof(*src));
    174      src += src_stride;
    175      dst += dst_stride;
    176      memmove(dst, src, 2 * sizeof(*src));
    177      src += src_stride;
    178      dst += dst_stride;
    179      h -= 2;
    180    } while (h);
    181  } else if (w == 4) {
    182    do {
    183      __m128i s[2];
    184      s[0] = _mm_loadl_epi64((__m128i *)src);
    185      src += src_stride;
    186      s[1] = _mm_loadl_epi64((__m128i *)src);
    187      src += src_stride;
    188      _mm_storel_epi64((__m128i *)dst, s[0]);
    189      dst += dst_stride;
    190      _mm_storel_epi64((__m128i *)dst, s[1]);
    191      dst += dst_stride;
    192      h -= 2;
    193    } while (h);
    194  } else if (w == 8) {
    195    do {
    196      __m128i s[2];
    197      s[0] = _mm_loadu_si128((__m128i *)src);
    198      src += src_stride;
    199      s[1] = _mm_loadu_si128((__m128i *)src);
    200      src += src_stride;
    201      _mm_store_si128((__m128i *)dst, s[0]);
    202      dst += dst_stride;
    203      _mm_store_si128((__m128i *)dst, s[1]);
    204      dst += dst_stride;
    205      h -= 2;
    206    } while (h);
    207  } else if (w == 16) {
    208    do {
    209      __m256i s[2];
    210      s[0] = _mm256_loadu_si256((__m256i *)src);
    211      src += src_stride;
    212      s[1] = _mm256_loadu_si256((__m256i *)src);
    213      src += src_stride;
    214      _mm256_storeu_si256((__m256i *)dst, s[0]);
    215      dst += dst_stride;
    216      _mm256_storeu_si256((__m256i *)dst, s[1]);
    217      dst += dst_stride;
    218      h -= 2;
    219    } while (h);
    220  } else if (w == 32) {
    221    do {
    222      __m256i s[4];
    223      s[0] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
    224      s[1] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
    225      src += src_stride;
    226      s[2] = _mm256_loadu_si256((__m256i *)(src + 0 * 16));
    227      s[3] = _mm256_loadu_si256((__m256i *)(src + 1 * 16));
    228      src += src_stride;
    229      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[0]);
    230      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[1]);
    231      dst += dst_stride;
    232      _mm256_storeu_si256((__m256i *)(dst + 0 * 16), s[2]);
    233      _mm256_storeu_si256((__m256i *)(dst + 1 * 16), s[3]);
    234      dst += dst_stride;
    235      h -= 2;
    236    } while (h);
    237  } else if (w == 64) {
    238    do {
    239      highbd_copy_64(src, dst);
    240      src += src_stride;
    241      dst += dst_stride;
    242      highbd_copy_64(src, dst);
    243      src += src_stride;
    244      dst += dst_stride;
    245      h -= 2;
    246    } while (h);
    247  } else {
    248    assert(w == 128);
    249    do {
    250      highbd_copy_128(src, dst);
    251      src += src_stride;
    252      dst += dst_stride;
    253      highbd_copy_128(src, dst);
    254      src += src_stride;
    255      dst += dst_stride;
    256      h -= 2;
    257    } while (h);
    258  }
    259 }
    260 
    261 #endif  // CONFIG_AV1_HIGHBITDEPTH