mem_sse2.h (6619B)
1 /* 2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #ifndef AOM_AOM_DSP_X86_MEM_SSE2_H_ 13 #define AOM_AOM_DSP_X86_MEM_SSE2_H_ 14 15 #include <emmintrin.h> // SSE2 16 #include <string.h> 17 18 #include "config/aom_config.h" 19 20 #include "aom/aom_integer.h" 21 22 static inline int16_t loadu_int16(const void *src) { 23 int16_t v; 24 memcpy(&v, src, sizeof(v)); 25 return v; 26 } 27 28 static inline int32_t loadu_int32(const void *src) { 29 int32_t v; 30 memcpy(&v, src, sizeof(v)); 31 return v; 32 } 33 34 static inline int64_t loadu_int64(const void *src) { 35 int64_t v; 36 memcpy(&v, src, sizeof(v)); 37 return v; 38 } 39 40 static inline void _mm_storeh_epi64(__m128i *const d, const __m128i s) { 41 _mm_storeh_pi((__m64 *)d, _mm_castsi128_ps(s)); 42 } 43 44 static inline __m128i loadh_epi64(const void *const src, const __m128i s) { 45 return _mm_castps_si128( 46 _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); 47 } 48 49 static inline __m128i load_8bit_4x4_to_1_reg_sse2(const void *const src, 50 const int byte_stride) { 51 return _mm_setr_epi32(loadu_int32((int8_t *)src + 0 * byte_stride), 52 loadu_int32((int8_t *)src + 1 * byte_stride), 53 loadu_int32((int8_t *)src + 2 * byte_stride), 54 loadu_int32((int8_t *)src + 3 * byte_stride)); 55 } 56 57 static inline __m128i load_8bit_8x2_to_1_reg_sse2(const void *const src, 58 const int byte_stride) { 59 __m128i dst; 60 dst = _mm_loadl_epi64((__m128i *)((int8_t *)src + 0 * byte_stride)); 61 dst = loadh_epi64((int8_t *)src + 1 * byte_stride, dst); 62 return dst; 63 } 64 65 static inline void store_8bit_8x4_from_16x2(const __m128i *const s, 66 uint8_t *const d, 67 const ptrdiff_t stride) { 68 _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); 69 _mm_storeh_epi64((__m128i *)(d + 1 * stride), s[0]); 70 _mm_storel_epi64((__m128i *)(d + 2 * stride), s[1]); 71 _mm_storeh_epi64((__m128i *)(d + 3 * stride), s[1]); 72 } 73 74 static inline void store_8bit_4x4(const __m128i *const s, uint8_t *const d, 75 const ptrdiff_t stride) { 76 *(int *)(d + 0 * stride) = _mm_cvtsi128_si32(s[0]); 77 *(int *)(d + 1 * stride) = _mm_cvtsi128_si32(s[1]); 78 *(int *)(d + 2 * stride) = _mm_cvtsi128_si32(s[2]); 79 *(int *)(d + 3 * stride) = _mm_cvtsi128_si32(s[3]); 80 } 81 82 static inline void store_8bit_4x4_sse2(const __m128i s, uint8_t *const d, 83 const ptrdiff_t stride) { 84 __m128i ss[4]; 85 86 ss[0] = s; 87 ss[1] = _mm_srli_si128(s, 4); 88 ss[2] = _mm_srli_si128(s, 8); 89 ss[3] = _mm_srli_si128(s, 12); 90 store_8bit_4x4(ss, d, stride); 91 } 92 93 static inline void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, 94 __m128i *const d) { 95 d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); 96 d[1] = _mm_cvtsi32_si128(*(const int *)(s + 1 * stride)); 97 d[2] = _mm_cvtsi32_si128(*(const int *)(s + 2 * stride)); 98 d[3] = _mm_cvtsi32_si128(*(const int *)(s + 3 * stride)); 99 } 100 101 static inline void load_8bit_4x8(const uint8_t *const s, const ptrdiff_t stride, 102 __m128i *const d) { 103 load_8bit_4x4(s + 0 * stride, stride, &d[0]); 104 load_8bit_4x4(s + 4 * stride, stride, &d[4]); 105 } 106 107 static inline void load_8bit_8x4(const uint8_t *const s, const ptrdiff_t stride, 108 __m128i *const d) { 109 d[0] = _mm_loadl_epi64((const __m128i *)(s + 0 * stride)); 110 d[1] = _mm_loadl_epi64((const __m128i *)(s + 1 * stride)); 111 d[2] = _mm_loadl_epi64((const __m128i *)(s + 2 * stride)); 112 d[3] = _mm_loadl_epi64((const __m128i *)(s + 3 * stride)); 113 } 114 115 static inline void loadu_8bit_16x4(const uint8_t *const s, 116 const ptrdiff_t stride, __m128i *const d) { 117 d[0] = _mm_loadu_si128((const __m128i *)(s + 0 * stride)); 118 d[1] = _mm_loadu_si128((const __m128i *)(s + 1 * stride)); 119 d[2] = _mm_loadu_si128((const __m128i *)(s + 2 * stride)); 120 d[3] = _mm_loadu_si128((const __m128i *)(s + 3 * stride)); 121 } 122 123 static inline void load_8bit_8x8(const uint8_t *const s, const ptrdiff_t stride, 124 __m128i *const d) { 125 load_8bit_8x4(s + 0 * stride, stride, &d[0]); 126 load_8bit_8x4(s + 4 * stride, stride, &d[4]); 127 } 128 129 static inline void load_8bit_16x8(const uint8_t *const s, 130 const ptrdiff_t stride, __m128i *const d) { 131 d[0] = _mm_load_si128((const __m128i *)(s + 0 * stride)); 132 d[1] = _mm_load_si128((const __m128i *)(s + 1 * stride)); 133 d[2] = _mm_load_si128((const __m128i *)(s + 2 * stride)); 134 d[3] = _mm_load_si128((const __m128i *)(s + 3 * stride)); 135 d[4] = _mm_load_si128((const __m128i *)(s + 4 * stride)); 136 d[5] = _mm_load_si128((const __m128i *)(s + 5 * stride)); 137 d[6] = _mm_load_si128((const __m128i *)(s + 6 * stride)); 138 d[7] = _mm_load_si128((const __m128i *)(s + 7 * stride)); 139 } 140 141 static inline void loadu_8bit_16x8(const uint8_t *const s, 142 const ptrdiff_t stride, __m128i *const d) { 143 loadu_8bit_16x4(s + 0 * stride, stride, &d[0]); 144 loadu_8bit_16x4(s + 4 * stride, stride, &d[4]); 145 } 146 147 static inline void store_8bit_8x8(const __m128i *const s, uint8_t *const d, 148 const ptrdiff_t stride) { 149 _mm_storel_epi64((__m128i *)(d + 0 * stride), s[0]); 150 _mm_storel_epi64((__m128i *)(d + 1 * stride), s[1]); 151 _mm_storel_epi64((__m128i *)(d + 2 * stride), s[2]); 152 _mm_storel_epi64((__m128i *)(d + 3 * stride), s[3]); 153 _mm_storel_epi64((__m128i *)(d + 4 * stride), s[4]); 154 _mm_storel_epi64((__m128i *)(d + 5 * stride), s[5]); 155 _mm_storel_epi64((__m128i *)(d + 6 * stride), s[6]); 156 _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]); 157 } 158 159 static inline void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d, 160 const ptrdiff_t stride) { 161 _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]); 162 _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]); 163 _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]); 164 _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]); 165 } 166 167 #endif // AOM_AOM_DSP_X86_MEM_SSE2_H_