tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mem_sse2.h (6619B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_
     13 #define AOM_AOM_DSP_X86_MEM_SSE2_H_
     14 
     15 #include <emmintrin.h>  // SSE2
     16 #include <string.h>
     17 
     18 #include "config/aom_config.h"
     19 
     20 #include "aom/aom_integer.h"
     21 
     22 static inline int16_t loadu_int16(const void *src) {
     23  int16_t v;
     24  memcpy(&v, src, sizeof(v));
     25  return v;
     26 }
     27 
     28 static inline int32_t loadu_int32(const void *src) {
     29  int32_t v;
     30  memcpy(&v, src, sizeof(v));
     31  return v;
     32 }
     33 
     34 static inline int64_t loadu_int64(const void *src) {
     35  int64_t v;
     36  memcpy(&v, src, sizeof(v));
     37  return v;
     38 }
     39 
     40 static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) {
     41  _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s));
     42 }
     43 
     44 static inline __m128i loadh_epi64(const void *const src, const __m128i s) {
     45  return _mm_castps_si128(
     46      _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
     47 }
     48 
     49 static inline __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src,
     50                                                  const int byte_stride) {
     51  return _mm_setr_epi32(loadu_int32((int8_t *)src + 0 * byte_stride),
     52                        loadu_int32((int8_t *)src + 1 * byte_stride),
     53                        loadu_int32((int8_t *)src + 2 * byte_stride),
     54                        loadu_int32((int8_t *)src + 3 * byte_stride));
     55 }
     56 
     57 static inline __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src,
     58                                                  const int byte_stride) {
     59  __m128i dst;
     60  dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride));
     61  dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst);
     62  return dst;
     63 }
     64 
     65 static inline void store_8bit_8x4_from_16x2(const __m128i *const s,
     66                                            uint8_t *const d,
     67                                            const ptrdiff_t stride) {
     68  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
     69  _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]);
     70  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]);
     71  _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]);
     72 }
     73 
     74 static inline void store_8bit_4x4(const __m128i *const s, uint8_t *const d,
     75                                  const ptrdiff_t stride) {
     76  *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]);
     77  *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]);
     78  *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]);
     79  *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]);
     80 }
     81 
     82 static inline void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d,
     83                                       const ptrdiff_t stride) {
     84  __m128i ss[4];
     85 
     86  ss[0] = s;
     87  ss[1] = _mm_srli_si128(s, 4);
     88  ss[2] = _mm_srli_si128(s, 8);
     89  ss[3] = _mm_srli_si128(s, 12);
     90  store_8bit_4x4(ss, d, stride);
     91 }
     92 
     93 static inline void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
     94                                 __m128i *const d) {
     95  d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
     96  d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride));
     97  d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride));
     98  d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride));
     99 }
    100 
    101 static inline void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride,
    102                                 __m128i *const d) {
    103  load_8bit_4x4(s + 0 * stride, stride, &d[0]);
    104  load_8bit_4x4(s + 4 * stride, stride, &d[4]);
    105 }
    106 
    107 static inline void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride,
    108                                 __m128i *const d) {
    109  d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride));
    110  d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride));
    111  d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride));
    112  d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride));
    113 }
    114 
    115 static inline void loadu_8bit_16x4(const uint8_t *const s,
    116                                   const ptrdiff_t stride, __m128i *const d) {
    117  d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride));
    118  d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride));
    119  d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride));
    120  d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride));
    121 }
    122 
    123 static inline void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride,
    124                                 __m128i *const d) {
    125  load_8bit_8x4(s + 0 * stride, stride, &d[0]);
    126  load_8bit_8x4(s + 4 * stride, stride, &d[4]);
    127 }
    128 
    129 static inline void load_8bit_16x8(const uint8_t *const s,
    130                                  const ptrdiff_t stride, __m128i *const d) {
    131  d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride));
    132  d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride));
    133  d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride));
    134  d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride));
    135  d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride));
    136  d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride));
    137  d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride));
    138  d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride));
    139 }
    140 
    141 static inline void loadu_8bit_16x8(const uint8_t *const s,
    142                                   const ptrdiff_t stride, __m128i *const d) {
    143  loadu_8bit_16x4(s + 0 * stride, stride, &d[0]);
    144  loadu_8bit_16x4(s + 4 * stride, stride, &d[4]);
    145 }
    146 
    147 static inline void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
    148                                  const ptrdiff_t stride) {
    149  _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]);
    150  _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]);
    151  _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]);
    152  _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]);
    153  _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]);
    154  _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]);
    155  _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]);
    156  _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
    157 }
    158 
    159 static inline void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
    160                                    const ptrdiff_t stride) {
    161  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
    162  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
    163  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
    164  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
    165 }
    166 
    167 #endif  // AOM_AOM_DSP_X86_MEM_SSE2_H_