encodetxb_avx2.c (5004B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <emmintrin.h> // SSE2 14 #include <smmintrin.h> /* SSE4.1 */ 15 #include <immintrin.h> /* AVX2 */ 16 17 #include "aom/aom_integer.h" 18 #include "aom_dsp/x86/mem_sse2.h" 19 #include "av1/common/av1_common_int.h" 20 #include "av1/common/txb_common.h" 21 #include "aom_dsp/x86/synonyms.h" 22 #include "aom_dsp/x86/synonyms_avx2.h" 23 24 void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width, 25 const int height, uint8_t *const levels) { 26 const int stride = height + TX_PAD_HOR; 27 const __m256i y_zeros = _mm256_setzero_si256(); 28 29 const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); 30 uint8_t *bottom_buf_end = levels + (width + TX_PAD_BOTTOM) * stride; 31 uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31)); 32 33 do { 34 yy_storeu_256(bottom_buf, y_zeros); 35 bottom_buf += 32; 36 } while (bottom_buf < bottom_buf_end); 37 38 int i = 0; 39 uint8_t *ls = levels; 40 const tran_low_t *cf = coeff; 41 if (height == 4) { 42 do { 43 const __m256i c0 = yy_loadu_256(cf); 44 const __m256i c1 = yy_loadu_256(cf + 8); 45 const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1)); 46 const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros); 47 const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8); 48 const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8); 49 yy_storeu_256(ls, res); 50 ls += 32; 51 cf += 16; 52 i += 4; 53 } while (i < width); 54 } else if (height == 8) { 55 do { 56 const __m256i coeffA = yy_loadu_256(cf); 57 const __m256i coeffB = yy_loadu_256(cf + 8); 58 const __m256i coeffC = yy_loadu_256(cf + 16); 59 const __m256i coeffD = yy_loadu_256(cf + 24); 60 const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); 61 const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); 62 const __m256i absAB = _mm256_abs_epi16(coeffAB); 63 const __m256i absCD = _mm256_abs_epi16(coeffCD); 64 const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); 65 const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); 66 const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); 67 const __m128i res0 = _mm256_castsi256_si128(res); 68 const __m128i res1 = _mm256_extracti128_si256(res, 1); 69 xx_storel_64(ls, res0); 70 *(int32_t *)(ls + height) = 0; 71 xx_storel_64(ls + stride, _mm_srli_si128(res0, 8)); 72 *(int32_t *)(ls + height + stride) = 0; 73 xx_storel_64(ls + stride * 2, res1); 74 *(int32_t *)(ls + height + stride * 2) = 0; 75 xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8)); 76 *(int32_t *)(ls + height + stride * 3) = 0; 77 cf += 32; 78 ls += stride << 2; 79 i += 4; 80 } while (i < width); 81 } else if (height == 16) { 82 do { 83 const __m256i coeffA = yy_loadu_256(cf); 84 const __m256i coeffB = yy_loadu_256(cf + 8); 85 const __m256i coeffC = yy_loadu_256(cf + 16); 86 const __m256i coeffD = yy_loadu_256(cf + 24); 87 const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); 88 const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); 89 const __m256i absAB = _mm256_abs_epi16(coeffAB); 90 const __m256i absCD = _mm256_abs_epi16(coeffCD); 91 const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); 92 const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); 93 const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); 94 xx_storeu_128(ls, _mm256_castsi256_si128(res)); 95 xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1)); 96 cf += 32; 97 *(int32_t *)(ls + height) = 0; 98 *(int32_t *)(ls + stride + height) = 0; 99 ls += stride << 1; 100 i += 2; 101 } while (i < width); 102 } else { 103 do { 104 const __m256i coeffA = yy_loadu_256(cf); 105 const __m256i coeffB = yy_loadu_256(cf + 8); 106 const __m256i coeffC = yy_loadu_256(cf + 16); 107 const __m256i coeffD = yy_loadu_256(cf + 24); 108 const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); 109 const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); 110 const __m256i absAB = _mm256_abs_epi16(coeffAB); 111 const __m256i absCD = _mm256_abs_epi16(coeffCD); 112 const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); 113 const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); 114 const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); 115 yy_storeu_256(ls, res); 116 cf += 32; 117 *(int32_t *)(ls + height) = 0; 118 ls += stride; 119 i += 1; 120 } while (i < width); 121 } 122 }