fwd_txfm_sse2.h (6963B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #ifndef AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ 13 #define AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_ 14 15 #ifdef __cplusplus 16 extern "C" { 17 #endif 18 19 static inline __m128i k_madd_epi32(__m128i a, __m128i b) { 20 __m128i buf0, buf1; 21 buf0 = _mm_mul_epu32(a, b); 22 a = _mm_srli_epi64(a, 32); 23 b = _mm_srli_epi64(b, 32); 24 buf1 = _mm_mul_epu32(a, b); 25 return _mm_add_epi64(buf0, buf1); 26 } 27 28 static inline __m128i k_packs_epi64(__m128i a, __m128i b) { 29 __m128i buf0 = _mm_shuffle_epi32(a, _MM_SHUFFLE(0, 0, 2, 0)); 30 __m128i buf1 = _mm_shuffle_epi32(b, _MM_SHUFFLE(0, 0, 2, 0)); 31 return _mm_unpacklo_epi64(buf0, buf1); 32 } 33 34 static inline int check_epi16_overflow_x2(const __m128i *preg0, 35 const __m128i *preg1) { 36 const __m128i max_overflow = _mm_set1_epi16(0x7fff); 37 const __m128i min_overflow = _mm_set1_epi16((short)0x8000); 38 __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), 39 _mm_cmpeq_epi16(*preg0, min_overflow)); 40 __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), 41 _mm_cmpeq_epi16(*preg1, min_overflow)); 42 cmp0 = _mm_or_si128(cmp0, cmp1); 43 return _mm_movemask_epi8(cmp0); 44 } 45 46 static inline int check_epi16_overflow_x4(const __m128i *preg0, 47 const __m128i *preg1, 48 const __m128i *preg2, 49 const __m128i *preg3) { 50 const __m128i max_overflow = _mm_set1_epi16(0x7fff); 51 const __m128i min_overflow = _mm_set1_epi16((short)0x8000); 52 __m128i cmp0 = _mm_or_si128(_mm_cmpeq_epi16(*preg0, max_overflow), 53 _mm_cmpeq_epi16(*preg0, min_overflow)); 54 __m128i cmp1 = _mm_or_si128(_mm_cmpeq_epi16(*preg1, max_overflow), 55 _mm_cmpeq_epi16(*preg1, min_overflow)); 56 __m128i cmp2 = _mm_or_si128(_mm_cmpeq_epi16(*preg2, max_overflow), 57 _mm_cmpeq_epi16(*preg2, min_overflow)); 58 __m128i cmp3 = _mm_or_si128(_mm_cmpeq_epi16(*preg3, max_overflow), 59 _mm_cmpeq_epi16(*preg3, min_overflow)); 60 cmp0 = _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)); 61 return _mm_movemask_epi8(cmp0); 62 } 63 64 static inline int check_epi16_overflow_x8( 65 const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, 66 const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, 67 const __m128i *preg6, const __m128i *preg7) { 68 int res0, res1; 69 res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); 70 res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); 71 return res0 + res1; 72 } 73 74 static inline int check_epi16_overflow_x12( 75 const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, 76 const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, 77 const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, 78 const __m128i *preg9, const __m128i *preg10, const __m128i *preg11) { 79 int res0, res1; 80 res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); 81 res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); 82 if (!res0) res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); 83 return res0 + res1; 84 } 85 86 static inline int check_epi16_overflow_x16( 87 const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, 88 const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, 89 const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, 90 const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, 91 const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, 92 const __m128i *preg15) { 93 int res0, res1; 94 res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); 95 res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); 96 if (!res0) { 97 res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); 98 if (!res1) res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); 99 } 100 return res0 + res1; 101 } 102 103 static inline int check_epi16_overflow_x32( 104 const __m128i *preg0, const __m128i *preg1, const __m128i *preg2, 105 const __m128i *preg3, const __m128i *preg4, const __m128i *preg5, 106 const __m128i *preg6, const __m128i *preg7, const __m128i *preg8, 107 const __m128i *preg9, const __m128i *preg10, const __m128i *preg11, 108 const __m128i *preg12, const __m128i *preg13, const __m128i *preg14, 109 const __m128i *preg15, const __m128i *preg16, const __m128i *preg17, 110 const __m128i *preg18, const __m128i *preg19, const __m128i *preg20, 111 const __m128i *preg21, const __m128i *preg22, const __m128i *preg23, 112 const __m128i *preg24, const __m128i *preg25, const __m128i *preg26, 113 const __m128i *preg27, const __m128i *preg28, const __m128i *preg29, 114 const __m128i *preg30, const __m128i *preg31) { 115 int res0, res1; 116 res0 = check_epi16_overflow_x4(preg0, preg1, preg2, preg3); 117 res1 = check_epi16_overflow_x4(preg4, preg5, preg6, preg7); 118 if (!res0) { 119 res0 = check_epi16_overflow_x4(preg8, preg9, preg10, preg11); 120 if (!res1) { 121 res1 = check_epi16_overflow_x4(preg12, preg13, preg14, preg15); 122 if (!res0) { 123 res0 = check_epi16_overflow_x4(preg16, preg17, preg18, preg19); 124 if (!res1) { 125 res1 = check_epi16_overflow_x4(preg20, preg21, preg22, preg23); 126 if (!res0) { 127 res0 = check_epi16_overflow_x4(preg24, preg25, preg26, preg27); 128 if (!res1) 129 res1 = check_epi16_overflow_x4(preg28, preg29, preg30, preg31); 130 } 131 } 132 } 133 } 134 } 135 return res0 + res1; 136 } 137 138 static inline void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { 139 const __m128i zero = _mm_setzero_si128(); 140 const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); 141 __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); 142 __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); 143 _mm_store_si128((__m128i *)(dst_ptr), out0); 144 _mm_store_si128((__m128i *)(dst_ptr + 4), out1); 145 } 146 147 static inline void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { 148 const __m128i zero = _mm_setzero_si128(); 149 const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); 150 __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); 151 __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); 152 _mm_storeu_si128((__m128i *)(dst_ptr), out0); 153 _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); 154 } 155 156 #ifdef __cplusplus 157 } // extern "C" 158 #endif 159 160 #endif // AOM_AOM_DSP_X86_FWD_TXFM_SSE2_H_