tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

aom_convolve_copy_sse2.c (11394B)


      1 /*
      2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <immintrin.h>
     13 
     14 #include "config/aom_dsp_rtcd.h"
     15 
     16 static inline void copy_128(const uint8_t *src, uint8_t *dst) {
     17  __m128i s[8];
     18  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
     19  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
     20  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
     21  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
     22  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 16));
     23  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 16));
     24  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 16));
     25  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 16));
     26  _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
     27  _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
     28  _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
     29  _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
     30  _mm_store_si128((__m128i *)(dst + 4 * 16), s[4]);
     31  _mm_store_si128((__m128i *)(dst + 5 * 16), s[5]);
     32  _mm_store_si128((__m128i *)(dst + 6 * 16), s[6]);
     33  _mm_store_si128((__m128i *)(dst + 7 * 16), s[7]);
     34 }
     35 
     36 void aom_convolve_copy_sse2(const uint8_t *src, ptrdiff_t src_stride,
     37                            uint8_t *dst, ptrdiff_t dst_stride, int w, int h) {
     38  // The w >= 16 cases use _mm_store_si128(), which requires its output address
     39  // be aligned on a 16-byte boundary.
     40  if (w >= 16) {
     41    assert(!((intptr_t)dst % 16));
     42    assert(!(dst_stride % 16));
     43  }
     44 
     45  if (w == 2) {
     46    do {
     47      memmove(dst, src, 2 * sizeof(*src));
     48      src += src_stride;
     49      dst += dst_stride;
     50      memmove(dst, src, 2 * sizeof(*src));
     51      src += src_stride;
     52      dst += dst_stride;
     53      h -= 2;
     54    } while (h);
     55  } else if (w == 4) {
     56    do {
     57      memmove(dst, src, 4 * sizeof(*src));
     58      src += src_stride;
     59      dst += dst_stride;
     60      memmove(dst, src, 4 * sizeof(*src));
     61      src += src_stride;
     62      dst += dst_stride;
     63      h -= 2;
     64    } while (h);
     65  } else if (w == 8) {
     66    do {
     67      __m128i s[2];
     68      s[0] = _mm_loadl_epi64((__m128i *)src);
     69      src += src_stride;
     70      s[1] = _mm_loadl_epi64((__m128i *)src);
     71      src += src_stride;
     72      _mm_storel_epi64((__m128i *)dst, s[0]);
     73      dst += dst_stride;
     74      _mm_storel_epi64((__m128i *)dst, s[1]);
     75      dst += dst_stride;
     76      h -= 2;
     77    } while (h);
     78  } else if (w == 16) {
     79    do {
     80      __m128i s[2];
     81      s[0] = _mm_loadu_si128((__m128i *)src);
     82      src += src_stride;
     83      s[1] = _mm_loadu_si128((__m128i *)src);
     84      src += src_stride;
     85      _mm_store_si128((__m128i *)dst, s[0]);
     86      dst += dst_stride;
     87      _mm_store_si128((__m128i *)dst, s[1]);
     88      dst += dst_stride;
     89      h -= 2;
     90    } while (h);
     91  } else if (w == 32) {
     92    do {
     93      __m128i s[4];
     94      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
     95      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
     96      src += src_stride;
     97      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
     98      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
     99      src += src_stride;
    100      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
    101      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
    102      dst += dst_stride;
    103      _mm_store_si128((__m128i *)(dst + 0 * 16), s[2]);
    104      _mm_store_si128((__m128i *)(dst + 1 * 16), s[3]);
    105      dst += dst_stride;
    106      h -= 2;
    107    } while (h);
    108  } else if (w == 64) {
    109    do {
    110      __m128i s[8];
    111      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
    112      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
    113      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
    114      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
    115      src += src_stride;
    116      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 16));
    117      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 16));
    118      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 16));
    119      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 16));
    120      src += src_stride;
    121      _mm_store_si128((__m128i *)(dst + 0 * 16), s[0]);
    122      _mm_store_si128((__m128i *)(dst + 1 * 16), s[1]);
    123      _mm_store_si128((__m128i *)(dst + 2 * 16), s[2]);
    124      _mm_store_si128((__m128i *)(dst + 3 * 16), s[3]);
    125      dst += dst_stride;
    126      _mm_store_si128((__m128i *)(dst + 0 * 16), s[4]);
    127      _mm_store_si128((__m128i *)(dst + 1 * 16), s[5]);
    128      _mm_store_si128((__m128i *)(dst + 2 * 16), s[6]);
    129      _mm_store_si128((__m128i *)(dst + 3 * 16), s[7]);
    130      dst += dst_stride;
    131      h -= 2;
    132    } while (h);
    133  } else {
    134    do {
    135      copy_128(src, dst);
    136      src += src_stride;
    137      dst += dst_stride;
    138      copy_128(src, dst);
    139      src += src_stride;
    140      dst += dst_stride;
    141      h -= 2;
    142    } while (h);
    143  }
    144 }
    145 
    146 #if CONFIG_AV1_HIGHBITDEPTH
    147 static inline void highbd_copy_64(const uint16_t *src, uint16_t *dst) {
    148  __m128i s[8];
    149  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
    150  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
    151  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
    152  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
    153  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
    154  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
    155  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
    156  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
    157  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
    158  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
    159  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
    160  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
    161  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
    162  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
    163  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
    164  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
    165 }
    166 
    167 static inline void highbd_copy_128(const uint16_t *src, uint16_t *dst) {
    168  __m128i s[16];
    169  s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
    170  s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
    171  s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
    172  s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
    173  s[4] = _mm_loadu_si128((__m128i *)(src + 4 * 8));
    174  s[5] = _mm_loadu_si128((__m128i *)(src + 5 * 8));
    175  s[6] = _mm_loadu_si128((__m128i *)(src + 6 * 8));
    176  s[7] = _mm_loadu_si128((__m128i *)(src + 7 * 8));
    177  s[8] = _mm_loadu_si128((__m128i *)(src + 8 * 8));
    178  s[9] = _mm_loadu_si128((__m128i *)(src + 9 * 8));
    179  s[10] = _mm_loadu_si128((__m128i *)(src + 10 * 8));
    180  s[11] = _mm_loadu_si128((__m128i *)(src + 11 * 8));
    181  s[12] = _mm_loadu_si128((__m128i *)(src + 12 * 8));
    182  s[13] = _mm_loadu_si128((__m128i *)(src + 13 * 8));
    183  s[14] = _mm_loadu_si128((__m128i *)(src + 14 * 8));
    184  s[15] = _mm_loadu_si128((__m128i *)(src + 15 * 8));
    185  _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
    186  _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
    187  _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
    188  _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
    189  _mm_store_si128((__m128i *)(dst + 4 * 8), s[4]);
    190  _mm_store_si128((__m128i *)(dst + 5 * 8), s[5]);
    191  _mm_store_si128((__m128i *)(dst + 6 * 8), s[6]);
    192  _mm_store_si128((__m128i *)(dst + 7 * 8), s[7]);
    193  _mm_store_si128((__m128i *)(dst + 8 * 8), s[8]);
    194  _mm_store_si128((__m128i *)(dst + 9 * 8), s[9]);
    195  _mm_store_si128((__m128i *)(dst + 10 * 8), s[10]);
    196  _mm_store_si128((__m128i *)(dst + 11 * 8), s[11]);
    197  _mm_store_si128((__m128i *)(dst + 12 * 8), s[12]);
    198  _mm_store_si128((__m128i *)(dst + 13 * 8), s[13]);
    199  _mm_store_si128((__m128i *)(dst + 14 * 8), s[14]);
    200  _mm_store_si128((__m128i *)(dst + 15 * 8), s[15]);
    201 }
    202 
    203 void aom_highbd_convolve_copy_sse2(const uint16_t *src, ptrdiff_t src_stride,
    204                                   uint16_t *dst, ptrdiff_t dst_stride, int w,
    205                                   int h) {
    206  // The w >= 8 cases use _mm_store_si128(), which requires its output address
    207  // be aligned on a 16-byte boundary.
    208  if (w >= 8) {
    209    assert(!((intptr_t)dst % 16));
    210    assert(!(dst_stride % 8));
    211  }
    212 
    213  if (w == 2) {
    214    do {
    215      __m128i s = _mm_loadl_epi64((__m128i *)src);
    216      *(int *)dst = _mm_cvtsi128_si32(s);
    217      src += src_stride;
    218      dst += dst_stride;
    219      s = _mm_loadl_epi64((__m128i *)src);
    220      *(int *)dst = _mm_cvtsi128_si32(s);
    221      src += src_stride;
    222      dst += dst_stride;
    223      h -= 2;
    224    } while (h);
    225  } else if (w == 4) {
    226    do {
    227      __m128i s[2];
    228      s[0] = _mm_loadl_epi64((__m128i *)src);
    229      src += src_stride;
    230      s[1] = _mm_loadl_epi64((__m128i *)src);
    231      src += src_stride;
    232      _mm_storel_epi64((__m128i *)dst, s[0]);
    233      dst += dst_stride;
    234      _mm_storel_epi64((__m128i *)dst, s[1]);
    235      dst += dst_stride;
    236      h -= 2;
    237    } while (h);
    238  } else if (w == 8) {
    239    do {
    240      __m128i s[2];
    241      s[0] = _mm_loadu_si128((__m128i *)src);
    242      src += src_stride;
    243      s[1] = _mm_loadu_si128((__m128i *)src);
    244      src += src_stride;
    245      _mm_store_si128((__m128i *)dst, s[0]);
    246      dst += dst_stride;
    247      _mm_store_si128((__m128i *)dst, s[1]);
    248      dst += dst_stride;
    249      h -= 2;
    250    } while (h);
    251  } else if (w == 16) {
    252    do {
    253      __m128i s[4];
    254      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
    255      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
    256      src += src_stride;
    257      s[2] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
    258      s[3] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
    259      src += src_stride;
    260      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
    261      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
    262      dst += dst_stride;
    263      _mm_store_si128((__m128i *)(dst + 0 * 8), s[2]);
    264      _mm_store_si128((__m128i *)(dst + 1 * 8), s[3]);
    265      dst += dst_stride;
    266      h -= 2;
    267    } while (h);
    268  } else if (w == 32) {
    269    do {
    270      __m128i s[8];
    271      s[0] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
    272      s[1] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
    273      s[2] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
    274      s[3] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
    275      src += src_stride;
    276      s[4] = _mm_loadu_si128((__m128i *)(src + 0 * 8));
    277      s[5] = _mm_loadu_si128((__m128i *)(src + 1 * 8));
    278      s[6] = _mm_loadu_si128((__m128i *)(src + 2 * 8));
    279      s[7] = _mm_loadu_si128((__m128i *)(src + 3 * 8));
    280      src += src_stride;
    281      _mm_store_si128((__m128i *)(dst + 0 * 8), s[0]);
    282      _mm_store_si128((__m128i *)(dst + 1 * 8), s[1]);
    283      _mm_store_si128((__m128i *)(dst + 2 * 8), s[2]);
    284      _mm_store_si128((__m128i *)(dst + 3 * 8), s[3]);
    285      dst += dst_stride;
    286      _mm_store_si128((__m128i *)(dst + 0 * 8), s[4]);
    287      _mm_store_si128((__m128i *)(dst + 1 * 8), s[5]);
    288      _mm_store_si128((__m128i *)(dst + 2 * 8), s[6]);
    289      _mm_store_si128((__m128i *)(dst + 3 * 8), s[7]);
    290      dst += dst_stride;
    291      h -= 2;
    292    } while (h);
    293  } else if (w == 64) {
    294    do {
    295      highbd_copy_64(src, dst);
    296      src += src_stride;
    297      dst += dst_stride;
    298      highbd_copy_64(src, dst);
    299      src += src_stride;
    300      dst += dst_stride;
    301      h -= 2;
    302    } while (h);
    303  } else {
    304    do {
    305      highbd_copy_128(src, dst);
    306      src += src_stride;
    307      dst += dst_stride;
    308      highbd_copy_128(src, dst);
    309      src += src_stride;
    310      dst += dst_stride;
    311      h -= 2;
    312    } while (h);
    313  }
    314 }
    315 #endif  // CONFIG_AV1_HIGHBITDEPTH