mmloadusi64.patch (3066B)
diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h
--- a/aom_dsp/x86/synonyms.h
+++ b/aom_dsp/x86/synonyms.h
@@ -41,23 +41,38 @@ static inline __m128i xx_loadl_64(const
 static inline __m128i xx_load_128(const void *a) {
   return _mm_load_si128((const __m128i *)a);
 }
 
 static inline __m128i xx_loadu_128(const void *a) {
   return _mm_loadu_si128((const __m128i *)a);
 }
 
+// _mm_loadu_si64 has been introduced in GCC 9, reimplement the function
+// manually on older compilers.
+// GCC reports its major version in __GNUC__; the defined() test keeps
+// compilers that do not define it at all on the regular implementation.
+#if defined(__GNUC__) && !defined(__clang__) && __GNUC__ < 9
+// Copy 64 bits from each of hi and lo with memcpy, then pack them into one
+// SSE register: hi becomes the upper half and lo the lower half.
+static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) {
+  __m64 hi_, lo_;
+  memcpy(&hi_, hi, sizeof(hi_));
+  memcpy(&lo_, lo, sizeof(lo_));
+  return _mm_set_epi64(hi_, lo_);
+}
+#else
 // Load 64 bits from each of hi and low, and pack into an SSE register
 // Since directly loading as `int64_t`s and using _mm_set_epi64 may violate
 // the strict aliasing rule, this takes a different approach
 static inline __m128i xx_loadu_2x64(const void *hi, const void *lo) {
   return _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)lo),
                             _mm_loadl_epi64((const __m128i *)hi));
 }
+#endif
 
 static inline void xx_storel_32(void *const a, const __m128i v) {
   const int val = _mm_cvtsi128_si32(v);
   memcpy(a, &val, sizeof(val));
 }
 
 static inline void xx_storel_64(void *const a, const __m128i v) {
   _mm_storel_epi64((__m128i *)a, v);
diff --git a/aom_dsp/x86/synonyms_avx2.h b/aom_dsp/x86/synonyms_avx2.h
--- a/aom_dsp/x86/synonyms_avx2.h
+++ b/aom_dsp/x86/synonyms_avx2.h
@@ -71,21 +71,39 @@ static inline __m256i yy_loadu_4x64(cons
   __m128d v23 = _mm_loadh_pd(v2, (const double *)e3);
   // Note this can be replaced with
   // `_mm256_castpd_si256(_mm256_set_m128d(v23, v01))` if immintrin.h contains
   // _mm256_set_m128d() with all supported compilers. This version is used to
   // match the behavior with yy_set_m128i().
   return yy_set_m128i(_mm_castpd_si128(v23), _mm_castpd_si128(v01));
 }
 
+// Encode the GCC version as MMmmpp (e.g. GCC 10.1.0 -> 100100). A private
+// name is used so that a GCC_VERSION macro defined elsewhere is neither
+// redefined nor removed by the #undef below.
+#define AOM_SYNONYMS_GCC_VERSION \
+  (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
+
+// _mm256_loadu2_m128i has been introduced in GCC 10.1
+#if defined(__GNUC__) && !defined(__clang__) && \
+    AOM_SYNONYMS_GCC_VERSION < 100100
+static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
+  __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
+  __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
+  return _mm256_set_m128i(mhi, mlo);
+}
+#else
 static inline __m256i yy_loadu2_128(const void *hi, const void *lo) {
   __m128i mhi = _mm_loadu_si128((const __m128i *)(hi));
   __m128i mlo = _mm_loadu_si128((const __m128i *)(lo));
   return yy_set_m128i(mhi, mlo);
 }
+#endif
+
+#undef AOM_SYNONYMS_GCC_VERSION
 
 static inline void yy_storeu2_128(void *hi, void *lo, const __m256i a) {
   _mm_storeu_si128((__m128i *)hi, _mm256_extracti128_si256(a, 1));
   _mm_storeu_si128((__m128i *)lo, _mm256_castsi256_si128(a));
 }
 
 static inline __m256i yy_roundn_epu16(__m256i v_val_w, int bits) {
   const __m256i v_s_w = _mm256_srli_epi16(v_val_w, bits - 1);