intrapred_utils.h (7179B)
1 /* 2 * Copyright (c) 2021, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 #ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_ 12 #define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_ 13 14 #include <emmintrin.h> // SSE2 15 #include "aom/aom_integer.h" 16 #include "config/aom_config.h" 17 #include "config/aom_dsp_rtcd.h" 18 19 static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = { 20 { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, 21 { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 }, 22 { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 }, 23 { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 }, 24 { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 }, 25 { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 }, 26 { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 }, 27 { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 } 28 }; 29 30 static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = { 31 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, 32 { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 }, 33 { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 }, 34 { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 }, 35 { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 }, 36 { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 }, 37 { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 }, 38 { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 }, 39 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 }, 40 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 }, 41 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 }, 42 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 }, 43 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 }, 44 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 }, 45 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 }, 46 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 47 }; 48 49 static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = { 50 { -1, 0, 0, 0, 0, 0, 0, 0 }, { -1, -1, 0, 0, 0, 0, 0, 0 }, 51 { -1, -1, -1, 0, 0, 0, 0, 0 }, { -1, -1, -1, -1, 0, 0, 0, 0 }, 52 { -1, -1, -1, -1, -1, 0, 0, 0 }, { -1, -1, -1, -1, -1, -1, 0, 0 }, 53 { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 }, 54 }; 55 56 static inline void transpose4x16_sse2(__m128i *x, __m128i *d) { 57 __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3; 58 w0 = _mm_unpacklo_epi8(x[0], x[1]); 59 w1 = _mm_unpacklo_epi8(x[2], x[3]); 60 w2 = _mm_unpackhi_epi8(x[0], x[1]); 61 w3 = _mm_unpackhi_epi8(x[2], x[3]); 62 63 ww0 = _mm_unpacklo_epi16(w0, w1); 64 ww1 = _mm_unpacklo_epi16(w2, w3); 65 ww2 = _mm_unpackhi_epi16(w0, w1); 66 ww3 = _mm_unpackhi_epi16(w2, w3); 67 68 w0 = _mm_unpacklo_epi32(ww0, ww1); 69 w2 = _mm_unpacklo_epi32(ww2, ww3); 70 w1 = _mm_unpackhi_epi32(ww0, ww1); 71 w3 = _mm_unpackhi_epi32(ww2, ww3); 72 73 d[0] = _mm_unpacklo_epi64(w0, w2); 74 d[1] = _mm_unpackhi_epi64(w0, w2); 75 d[2] = _mm_unpacklo_epi64(w1, w3); 76 d[3] = _mm_unpackhi_epi64(w1, w3); 77 78 d[4] = _mm_srli_si128(d[0], 8); 79 d[5] = _mm_srli_si128(d[1], 8); 80 d[6] = _mm_srli_si128(d[2], 8); 81 d[7] = _mm_srli_si128(d[3], 8); 82 83 d[8] = _mm_srli_si128(d[0], 4); 84 d[9] = _mm_srli_si128(d[1], 4); 85 d[10] = _mm_srli_si128(d[2], 4); 86 d[11] = _mm_srli_si128(d[3], 4); 87 88 d[12] = _mm_srli_si128(d[0], 12); 89 d[13] = _mm_srli_si128(d[1], 12); 90 d[14] = _mm_srli_si128(d[2], 12); 91 d[15] = _mm_srli_si128(d[3], 12); 92 } 93 94 static inline void transpose16x16_sse2(__m128i *x, __m128i *d) { 95 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; 96 __m128i w10, w11, w12, w13, w14, w15; 97 98 w0 = _mm_unpacklo_epi8(x[0], x[1]); 99 w1 = _mm_unpacklo_epi8(x[2], x[3]); 100 w2 = _mm_unpacklo_epi8(x[4], x[5]); 101 w3 = _mm_unpacklo_epi8(x[6], x[7]); 102 103 w8 = _mm_unpacklo_epi8(x[8], x[9]); 104 w9 = _mm_unpacklo_epi8(x[10], x[11]); 105 w10 = _mm_unpacklo_epi8(x[12], x[13]); 106 w11 = _mm_unpacklo_epi8(x[14], x[15]); 107 108 w4 = _mm_unpacklo_epi16(w0, w1); 109 w5 = _mm_unpacklo_epi16(w2, w3); 110 w12 = _mm_unpacklo_epi16(w8, w9); 111 w13 = _mm_unpacklo_epi16(w10, w11); 112 113 w6 = _mm_unpacklo_epi32(w4, w5); 114 w7 = _mm_unpackhi_epi32(w4, w5); 115 w14 = _mm_unpacklo_epi32(w12, w13); 116 w15 = _mm_unpackhi_epi32(w12, w13); 117 118 // Store first 4-line result 119 d[0] = _mm_unpacklo_epi64(w6, w14); 120 d[1] = _mm_unpackhi_epi64(w6, w14); 121 d[2] = _mm_unpacklo_epi64(w7, w15); 122 d[3] = _mm_unpackhi_epi64(w7, w15); 123 124 w4 = _mm_unpackhi_epi16(w0, w1); 125 w5 = _mm_unpackhi_epi16(w2, w3); 126 w12 = _mm_unpackhi_epi16(w8, w9); 127 w13 = _mm_unpackhi_epi16(w10, w11); 128 129 w6 = _mm_unpacklo_epi32(w4, w5); 130 w7 = _mm_unpackhi_epi32(w4, w5); 131 w14 = _mm_unpacklo_epi32(w12, w13); 132 w15 = _mm_unpackhi_epi32(w12, w13); 133 134 // Store second 4-line result 135 d[4] = _mm_unpacklo_epi64(w6, w14); 136 d[5] = _mm_unpackhi_epi64(w6, w14); 137 d[6] = _mm_unpacklo_epi64(w7, w15); 138 d[7] = _mm_unpackhi_epi64(w7, w15); 139 140 // upper half 141 w0 = _mm_unpackhi_epi8(x[0], x[1]); 142 w1 = _mm_unpackhi_epi8(x[2], x[3]); 143 w2 = _mm_unpackhi_epi8(x[4], x[5]); 144 w3 = _mm_unpackhi_epi8(x[6], x[7]); 145 146 w8 = _mm_unpackhi_epi8(x[8], x[9]); 147 w9 = _mm_unpackhi_epi8(x[10], x[11]); 148 w10 = _mm_unpackhi_epi8(x[12], x[13]); 149 w11 = _mm_unpackhi_epi8(x[14], x[15]); 150 151 w4 = _mm_unpacklo_epi16(w0, w1); 152 w5 = _mm_unpacklo_epi16(w2, w3); 153 w12 = _mm_unpacklo_epi16(w8, w9); 154 w13 = _mm_unpacklo_epi16(w10, w11); 155 156 w6 = _mm_unpacklo_epi32(w4, w5); 157 w7 = _mm_unpackhi_epi32(w4, w5); 158 w14 = _mm_unpacklo_epi32(w12, w13); 159 w15 = _mm_unpackhi_epi32(w12, w13); 160 161 // Store first 4-line result 162 d[8] = _mm_unpacklo_epi64(w6, w14); 163 d[9] = _mm_unpackhi_epi64(w6, w14); 164 d[10] = _mm_unpacklo_epi64(w7, w15); 165 d[11] = _mm_unpackhi_epi64(w7, w15); 166 167 w4 = _mm_unpackhi_epi16(w0, w1); 168 w5 = _mm_unpackhi_epi16(w2, w3); 169 w12 = _mm_unpackhi_epi16(w8, w9); 170 w13 = _mm_unpackhi_epi16(w10, w11); 171 172 w6 = _mm_unpacklo_epi32(w4, w5); 173 w7 = _mm_unpackhi_epi32(w4, w5); 174 w14 = _mm_unpacklo_epi32(w12, w13); 175 w15 = _mm_unpackhi_epi32(w12, w13); 176 177 // Store second 4-line result 178 d[12] = _mm_unpacklo_epi64(w6, w14); 179 d[13] = _mm_unpackhi_epi64(w6, w14); 180 d[14] = _mm_unpacklo_epi64(w7, w15); 181 d[15] = _mm_unpackhi_epi64(w7, w15); 182 } 183 184 static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc, 185 uint8_t *dst, ptrdiff_t pitchDst) { 186 __m128i r[16]; 187 __m128i d[16]; 188 for (int j = 0; j < 16; j++) { 189 r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc)); 190 } 191 transpose16x16_sse2(r, d); 192 for (int j = 0; j < 16; j++) { 193 _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]); 194 } 195 } 196 197 static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, 198 ptrdiff_t pitchDst, int width, int height) { 199 for (int j = 0; j < height; j += 16) 200 for (int i = 0; i < width; i += 16) 201 transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, 202 dst + j * pitchDst + i, pitchDst); 203 } 204 205 #endif // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_