lpf_common_sse2.h (30103B)
1 /* 2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ 13 #define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_ 14 15 #include <emmintrin.h> // SSE2 16 17 #include "config/aom_config.h" 18 19 #define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8) 20 #define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8) 21 22 static inline void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1, 23 __m128i *x2, __m128i *x3, 24 __m128i *x4, __m128i *x5, 25 __m128i *d0, __m128i *d1, 26 __m128i *d2, __m128i *d3, 27 __m128i *d4, __m128i *d5) { 28 __m128i w0, w1, w2, w3, w4, w5, ww0; 29 30 // 00 01 02 03 04 05 xx xx 31 // 10 11 12 13 14 15 xx xx 32 // 20 21 22 23 24 25 xx xx 33 // 30 31 32 33 34 35 xx xx 34 // 40 41 42 43 44 45 xx xx 35 // 50 51 52 53 54 55 xx xx 36 37 w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 38 w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 39 w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 40 41 ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 42 *d0 = _mm_unpacklo_epi64(ww0, w2); // 00 10 20 30 40 50 41 51 43 *d1 = _mm_unpackhi_epi64(ww0, 44 _mm_srli_si128(w2, 4)); // 01 11 21 31 41 51 xx xx 45 46 ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 47 *d2 = _mm_unpacklo_epi64(ww0, 48 _mm_srli_si128(w2, 8)); // 02 12 22 32 42 52 xx xx 49 50 w3 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 xx xx xx xx 51 w4 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 xx xx xx xx 52 w5 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 xx xx xx xx 53 54 *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4)); // 03 13 23 33 43 53 55 56 ww0 = _mm_unpacklo_epi32(w3, w4); // 04 14 24 34 05 15 25 35 57 *d4 = _mm_unpacklo_epi64(ww0, w5); // 04 14 24 34 44 54 45 55 58 *d5 = _mm_unpackhi_epi64(ww0, 59 _mm_slli_si128(w5, 4)); // 05 15 25 35 45 55 xx xx 60 } 61 62 static inline void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, 63 __m128i *x2, __m128i *x3, 64 __m128i *d0, __m128i *d1, 65 __m128i *d2, __m128i *d3) { 66 __m128i zero = _mm_setzero_si128(); 67 __m128i w0, w1, ww0, ww1; 68 69 w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 70 w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 71 72 ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 73 ww1 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 74 75 *d0 = _mm_unpacklo_epi64(ww0, zero); // 00 10 20 30 xx xx xx xx 76 *d1 = _mm_unpackhi_epi64(ww0, zero); // 01 11 21 31 xx xx xx xx 77 *d2 = _mm_unpacklo_epi64(ww1, zero); // 02 12 22 32 xx xx xx xx 78 *d3 = _mm_unpackhi_epi64(ww1, zero); // 03 13 23 33 xx xx xx xx 79 } 80 81 static inline void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1, 82 __m128i *x2, __m128i *x3, 83 __m128i *d4, __m128i *d5, 84 __m128i *d6, __m128i *d7) { 85 __m128i w0, w1, ww2, ww3; 86 __m128i zero = _mm_setzero_si128(); 87 88 w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 89 w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 90 91 ww2 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 92 ww3 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 93 94 *d4 = _mm_unpacklo_epi64(ww2, zero); // 04 14 24 34 xx xx xx xx 95 *d5 = _mm_unpackhi_epi64(ww2, zero); // 05 15 25 35 xx xx xx xx 96 *d6 = _mm_unpacklo_epi64(ww3, zero); // 06 16 26 36 xx xx xx xx 97 *d7 = _mm_unpackhi_epi64(ww3, zero); // 07 17 27 37 xx xx xx xx 98 } 99 100 // here in and out pointers (x and d) should be different! we don't store their 101 // values inside 102 static inline void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, 103 __m128i *x2, __m128i *x3, 104 __m128i *d0, __m128i *d1, 105 __m128i *d2, __m128i *d3, 106 __m128i *d4, __m128i *d5, 107 __m128i *d6, __m128i *d7) { 108 // input 109 // x0 00 01 02 03 04 05 06 07 110 // x1 10 11 12 13 14 15 16 17 111 // x2 20 21 22 23 24 25 26 27 112 // x3 30 31 32 33 34 35 36 37 113 // output 114 // 00 10 20 30 xx xx xx xx 115 // 01 11 21 31 xx xx xx xx 116 // 02 12 22 32 xx xx xx xx 117 // 03 13 23 33 xx xx xx xx 118 // 04 14 24 34 xx xx xx xx 119 // 05 15 25 35 xx xx xx xx 120 // 06 16 26 36 xx xx xx xx 121 // 07 17 27 37 xx xx xx xx 122 highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3); 123 highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7); 124 } 125 126 static inline void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1, 127 __m128i *x2, __m128i *x3, 128 __m128i *x4, __m128i *x5, 129 __m128i *x6, __m128i *x7, 130 __m128i *d0, __m128i *d1, 131 __m128i *d2, __m128i *d3) { 132 __m128i w0, w1, w2, w3, ww0, ww1; 133 // x0 00 01 02 03 04 05 06 07 134 // x1 10 11 12 13 14 15 16 17 135 // x2 20 21 22 23 24 25 26 27 136 // x3 30 31 32 33 34 35 36 37 137 // x4 40 41 42 43 44 45 46 47 138 // x5 50 51 52 53 54 55 56 57 139 // x6 60 61 62 63 64 65 66 67 140 // x7 70 71 72 73 74 75 76 77 141 142 w0 = _mm_unpacklo_epi16(*x0, *x1); // 00 10 01 11 02 12 03 13 143 w1 = _mm_unpacklo_epi16(*x2, *x3); // 20 30 21 31 22 32 23 33 144 w2 = _mm_unpacklo_epi16(*x4, *x5); // 40 50 41 51 42 52 43 53 145 w3 = _mm_unpacklo_epi16(*x6, *x7); // 60 70 61 71 62 72 63 73 146 147 ww0 = _mm_unpacklo_epi32(w0, w1); // 00 10 20 30 01 11 21 31 148 ww1 = _mm_unpacklo_epi32(w2, w3); // 40 50 60 70 41 51 61 71 149 150 *d0 = _mm_unpacklo_epi64(ww0, ww1); // 00 10 20 30 40 50 60 70 151 *d1 = _mm_unpackhi_epi64(ww0, ww1); // 01 11 21 31 41 51 61 71 152 153 ww0 = _mm_unpackhi_epi32(w0, w1); // 02 12 22 32 03 13 23 33 154 ww1 = _mm_unpackhi_epi32(w2, w3); // 42 52 62 72 43 53 63 73 155 156 *d2 = _mm_unpacklo_epi64(ww0, ww1); // 02 12 22 32 42 52 62 72 157 *d3 = _mm_unpackhi_epi64(ww0, ww1); // 03 13 23 33 43 53 63 73 158 } 159 160 static inline void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1, 161 __m128i *x2, __m128i *x3, 162 __m128i *x4, __m128i *x5, 163 __m128i *x6, __m128i *x7, 164 __m128i *d4, __m128i *d5, 165 __m128i *d6, __m128i *d7) { 166 __m128i w0, w1, w2, w3, ww0, ww1; 167 // x0 00 01 02 03 04 05 06 07 168 // x1 10 11 12 13 14 15 16 17 169 // x2 20 21 22 23 24 25 26 27 170 // x3 30 31 32 33 34 35 36 37 171 // x4 40 41 42 43 44 45 46 47 172 // x5 50 51 52 53 54 55 56 57 173 // x6 60 61 62 63 64 65 66 67 174 // x7 70 71 72 73 74 75 76 77 175 w0 = _mm_unpackhi_epi16(*x0, *x1); // 04 14 05 15 06 16 07 17 176 w1 = _mm_unpackhi_epi16(*x2, *x3); // 24 34 25 35 26 36 27 37 177 w2 = _mm_unpackhi_epi16(*x4, *x5); // 44 54 45 55 46 56 47 57 178 w3 = _mm_unpackhi_epi16(*x6, *x7); // 64 74 65 75 66 76 67 77 179 180 ww0 = _mm_unpacklo_epi32(w0, w1); // 04 14 24 34 05 15 25 35 181 ww1 = _mm_unpacklo_epi32(w2, w3); // 44 54 64 74 45 55 65 75 182 183 *d4 = _mm_unpacklo_epi64(ww0, ww1); // 04 14 24 34 44 54 64 74 184 *d5 = _mm_unpackhi_epi64(ww0, ww1); // 05 15 25 35 45 55 65 75 185 186 ww0 = _mm_unpackhi_epi32(w0, w1); // 06 16 26 36 07 17 27 37 187 ww1 = _mm_unpackhi_epi32(w2, w3); // 46 56 66 76 47 57 67 77 188 189 *d6 = _mm_unpacklo_epi64(ww0, ww1); // 06 16 26 36 46 56 66 76 190 *d7 = _mm_unpackhi_epi64(ww0, ww1); // 07 17 27 37 47 57 67 77 191 } 192 193 // here in and out pointers (x and d) should be different! we don't store their 194 // values inside 195 static inline void highbd_transpose8x8_sse2( 196 __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, 197 __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, 198 __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, 199 __m128i *d7) { 200 highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3); 201 highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7); 202 } 203 204 // here in and out pointers (x and d arrays) should be different! we don't store 205 // their values inside 206 static inline void highbd_transpose8x16_sse2( 207 __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, 208 __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1, 209 __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6, 210 __m128i *d7) { 211 highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4, 212 d5, d6, d7); 213 highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1, 214 x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1, 215 d4 + 1, d5 + 1, d6 + 1, d7 + 1); 216 } 217 218 // Low bit depth functions 219 static inline void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1, 220 __m128i *x2, __m128i *x3, 221 __m128i *d0, __m128i *d1, 222 __m128i *d2, __m128i *d3) { 223 // input 224 // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx 225 // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx 226 // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx 227 // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx 228 // output 229 // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx 230 // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx 231 // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx 232 // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx 233 234 __m128i w0, w1; 235 236 w0 = _mm_unpacklo_epi8( 237 *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 238 w1 = _mm_unpacklo_epi8( 239 *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 240 241 *d0 = _mm_unpacklo_epi16( 242 w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 243 244 *d1 = _mm_srli_si128(*d0, 245 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx 246 *d2 = _mm_srli_si128(*d0, 247 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx 248 *d3 = _mm_srli_si128(*d0, 249 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx 250 } 251 252 static inline void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2, 253 __m128i *x3, __m128i *d0, __m128i *d1, 254 __m128i *d2, __m128i *d3, __m128i *d4, 255 __m128i *d5, __m128i *d6, 256 __m128i *d7) { 257 // input 258 // x0 00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx 259 // x1 10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx 260 // x2 20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx 261 // x3 30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx 262 // output 263 // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx 264 // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx 265 // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx 266 // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx 267 // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx 268 // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx 269 // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx 270 // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx 271 272 __m128i w0, w1, ww0, ww1; 273 274 w0 = _mm_unpacklo_epi8( 275 *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 276 w1 = _mm_unpacklo_epi8( 277 *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 278 279 ww0 = _mm_unpacklo_epi16( 280 w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 281 ww1 = _mm_unpackhi_epi16( 282 w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 283 284 *d0 = ww0; // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx 285 *d1 = _mm_srli_si128(ww0, 286 4); // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx 287 *d2 = _mm_srli_si128(ww0, 288 8); // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx 289 *d3 = _mm_srli_si128(ww0, 290 12); // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx 291 292 *d4 = ww1; // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx 293 *d5 = _mm_srli_si128(ww1, 294 4); // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx 295 *d6 = _mm_srli_si128(ww1, 296 8); // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx 297 *d7 = _mm_srli_si128(ww1, 298 12); // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx 299 } 300 301 static inline void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2, 302 __m128i *x3, __m128i *x4, __m128i *x5, 303 __m128i *x6, __m128i *x7, __m128i *d0, 304 __m128i *d1, __m128i *d2, 305 __m128i *d3) { 306 // input 307 // x0 00 01 02 03 04 05 06 07 308 // x1 10 11 12 13 14 15 16 17 309 // x2 20 21 22 23 24 25 26 27 310 // x3 30 31 32 33 34 35 36 37 311 // x4 40 41 42 43 44 45 46 47 312 // x5 50 51 52 53 54 55 56 57 313 // x6 60 61 62 63 64 65 66 67 314 // x7 70 71 72 73 74 75 76 77 315 // output 316 // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx 317 // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx 318 // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx 319 // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx 320 321 __m128i w0, w1, w2, w3, w4, w5; 322 323 w0 = _mm_unpacklo_epi8( 324 *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 325 326 w1 = _mm_unpacklo_epi8( 327 *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 328 329 w2 = _mm_unpacklo_epi8( 330 *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 331 332 w3 = _mm_unpacklo_epi8( 333 *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 334 335 w4 = _mm_unpacklo_epi16( 336 w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 337 w5 = _mm_unpacklo_epi16( 338 w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 339 340 *d0 = _mm_unpacklo_epi32( 341 w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 342 *d1 = _mm_srli_si128(*d0, 8); 343 *d2 = _mm_unpackhi_epi32( 344 w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 345 *d3 = _mm_srli_si128(*d2, 8); 346 } 347 348 static inline void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2, 349 __m128i *x3, __m128i *x4, __m128i *x5, 350 __m128i *x6, __m128i *x7, __m128i *d0d1, 351 __m128i *d2d3, __m128i *d4d5, 352 __m128i *d6d7) { 353 __m128i w0, w1, w2, w3, w4, w5, w6, w7; 354 // x0 00 01 02 03 04 05 06 07 355 // x1 10 11 12 13 14 15 16 17 356 w0 = _mm_unpacklo_epi8( 357 *x0, *x1); // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 358 359 // x2 20 21 22 23 24 25 26 27 360 // x3 30 31 32 33 34 35 36 37 361 w1 = _mm_unpacklo_epi8( 362 *x2, *x3); // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 363 364 // x4 40 41 42 43 44 45 46 47 365 // x5 50 51 52 53 54 55 56 57 366 w2 = _mm_unpacklo_epi8( 367 *x4, *x5); // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 368 369 // x6 60 61 62 63 64 65 66 67 370 // x7 70 71 72 73 74 75 76 77 371 w3 = _mm_unpacklo_epi8( 372 *x6, *x7); // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 373 374 w4 = _mm_unpacklo_epi16( 375 w0, w1); // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 376 w5 = _mm_unpacklo_epi16( 377 w2, w3); // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 378 379 *d0d1 = _mm_unpacklo_epi32( 380 w4, w5); // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 381 *d2d3 = _mm_unpackhi_epi32( 382 w4, w5); // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 383 384 w6 = _mm_unpackhi_epi16( 385 w0, w1); // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 386 w7 = _mm_unpackhi_epi16( 387 w2, w3); // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 388 389 *d4d5 = _mm_unpacklo_epi32( 390 w6, w7); // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 391 *d6d7 = _mm_unpackhi_epi32( 392 w6, w7); // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 393 } 394 395 static inline void transpose16x8_8x16_sse2( 396 __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, 397 __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9, 398 __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14, 399 __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3, 400 __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) { 401 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; 402 __m128i w10, w11, w12, w13, w14, w15; 403 404 w0 = _mm_unpacklo_epi8(*x0, *x1); 405 w1 = _mm_unpacklo_epi8(*x2, *x3); 406 w2 = _mm_unpacklo_epi8(*x4, *x5); 407 w3 = _mm_unpacklo_epi8(*x6, *x7); 408 409 w8 = _mm_unpacklo_epi8(*x8, *x9); 410 w9 = _mm_unpacklo_epi8(*x10, *x11); 411 w10 = _mm_unpacklo_epi8(*x12, *x13); 412 w11 = _mm_unpacklo_epi8(*x14, *x15); 413 414 w4 = _mm_unpacklo_epi16(w0, w1); 415 w5 = _mm_unpacklo_epi16(w2, w3); 416 w12 = _mm_unpacklo_epi16(w8, w9); 417 w13 = _mm_unpacklo_epi16(w10, w11); 418 419 w6 = _mm_unpacklo_epi32(w4, w5); 420 w7 = _mm_unpackhi_epi32(w4, w5); 421 w14 = _mm_unpacklo_epi32(w12, w13); 422 w15 = _mm_unpackhi_epi32(w12, w13); 423 424 // Store first 4-line result 425 *d0 = _mm_unpacklo_epi64(w6, w14); 426 *d1 = _mm_unpackhi_epi64(w6, w14); 427 *d2 = _mm_unpacklo_epi64(w7, w15); 428 *d3 = _mm_unpackhi_epi64(w7, w15); 429 430 w4 = _mm_unpackhi_epi16(w0, w1); 431 w5 = _mm_unpackhi_epi16(w2, w3); 432 w12 = _mm_unpackhi_epi16(w8, w9); 433 w13 = _mm_unpackhi_epi16(w10, w11); 434 435 w6 = _mm_unpacklo_epi32(w4, w5); 436 w7 = _mm_unpackhi_epi32(w4, w5); 437 w14 = _mm_unpacklo_epi32(w12, w13); 438 w15 = _mm_unpackhi_epi32(w12, w13); 439 440 // Store second 4-line result 441 *d4 = _mm_unpacklo_epi64(w6, w14); 442 *d5 = _mm_unpackhi_epi64(w6, w14); 443 *d6 = _mm_unpacklo_epi64(w7, w15); 444 *d7 = _mm_unpackhi_epi64(w7, w15); 445 } 446 447 static inline void transpose8x16_16x8_sse2( 448 __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4, 449 __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3, 450 __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11, 451 __m128i *d12d13, __m128i *d14d15) { 452 __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9; 453 __m128i w10, w11, w12, w13, w14, w15; 454 455 w0 = _mm_unpacklo_epi8(*x0, *x1); 456 w1 = _mm_unpacklo_epi8(*x2, *x3); 457 w2 = _mm_unpacklo_epi8(*x4, *x5); 458 w3 = _mm_unpacklo_epi8(*x6, *x7); 459 460 w8 = _mm_unpackhi_epi8(*x0, *x1); 461 w9 = _mm_unpackhi_epi8(*x2, *x3); 462 w10 = _mm_unpackhi_epi8(*x4, *x5); 463 w11 = _mm_unpackhi_epi8(*x6, *x7); 464 465 w4 = _mm_unpacklo_epi16(w0, w1); 466 w5 = _mm_unpacklo_epi16(w2, w3); 467 w12 = _mm_unpacklo_epi16(w8, w9); 468 w13 = _mm_unpacklo_epi16(w10, w11); 469 470 w6 = _mm_unpacklo_epi32(w4, w5); 471 w7 = _mm_unpackhi_epi32(w4, w5); 472 w14 = _mm_unpacklo_epi32(w12, w13); 473 w15 = _mm_unpackhi_epi32(w12, w13); 474 475 // Store first 4-line result 476 *d0d1 = _mm_unpacklo_epi64(w6, w14); 477 *d2d3 = _mm_unpackhi_epi64(w6, w14); 478 *d4d5 = _mm_unpacklo_epi64(w7, w15); 479 *d6d7 = _mm_unpackhi_epi64(w7, w15); 480 481 w4 = _mm_unpackhi_epi16(w0, w1); 482 w5 = _mm_unpackhi_epi16(w2, w3); 483 w12 = _mm_unpackhi_epi16(w8, w9); 484 w13 = _mm_unpackhi_epi16(w10, w11); 485 486 w6 = _mm_unpacklo_epi32(w4, w5); 487 w7 = _mm_unpackhi_epi32(w4, w5); 488 w14 = _mm_unpacklo_epi32(w12, w13); 489 w15 = _mm_unpackhi_epi32(w12, w13); 490 491 // Store second 4-line result 492 *d8d9 = _mm_unpacklo_epi64(w6, w14); 493 *d10d11 = _mm_unpackhi_epi64(w6, w14); 494 *d12d13 = _mm_unpacklo_epi64(w7, w15); 495 *d14d15 = _mm_unpackhi_epi64(w7, w15); 496 } 497 498 static inline void transpose_16x8(unsigned char *in0, unsigned char *in1, 499 int in_p, unsigned char *out, int out_p) { 500 __m128i x0, x1, x2, x3, x4, x5, x6, x7; 501 __m128i x8, x9, x10, x11, x12, x13, x14, x15; 502 503 x0 = _mm_loadl_epi64((__m128i *)in0); 504 x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p)); 505 x0 = _mm_unpacklo_epi8(x0, x1); 506 507 x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p)); 508 x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p)); 509 x1 = _mm_unpacklo_epi8(x2, x3); 510 511 x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p)); 512 x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p)); 513 x2 = _mm_unpacklo_epi8(x4, x5); 514 515 x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p)); 516 x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p)); 517 x3 = _mm_unpacklo_epi8(x6, x7); 518 x4 = _mm_unpacklo_epi16(x0, x1); 519 520 x8 = _mm_loadl_epi64((__m128i *)in1); 521 x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p)); 522 x8 = _mm_unpacklo_epi8(x8, x9); 523 x5 = _mm_unpacklo_epi16(x2, x3); 524 525 x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p)); 526 x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p)); 527 x9 = _mm_unpacklo_epi8(x10, x11); 528 529 x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p)); 530 x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p)); 531 x10 = _mm_unpacklo_epi8(x12, x13); 532 x12 = _mm_unpacklo_epi16(x8, x9); 533 534 x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p)); 535 x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p)); 536 x11 = _mm_unpacklo_epi8(x14, x15); 537 x13 = _mm_unpacklo_epi16(x10, x11); 538 539 x6 = _mm_unpacklo_epi32(x4, x5); 540 x7 = _mm_unpackhi_epi32(x4, x5); 541 x14 = _mm_unpacklo_epi32(x12, x13); 542 x15 = _mm_unpackhi_epi32(x12, x13); 543 544 // Store first 4-line result 545 _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14)); 546 _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14)); 547 _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15)); 548 _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15)); 549 550 x4 = _mm_unpackhi_epi16(x0, x1); 551 x5 = _mm_unpackhi_epi16(x2, x3); 552 x12 = _mm_unpackhi_epi16(x8, x9); 553 x13 = _mm_unpackhi_epi16(x10, x11); 554 555 x6 = _mm_unpacklo_epi32(x4, x5); 556 x7 = _mm_unpackhi_epi32(x4, x5); 557 x14 = _mm_unpacklo_epi32(x12, x13); 558 x15 = _mm_unpackhi_epi32(x12, x13); 559 560 // Store second 4-line result 561 _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14)); 562 _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14)); 563 _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15)); 564 _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15)); 565 } 566 567 static inline void transpose_16x8_to_8x16(unsigned char *src, int in_p, 568 unsigned char *dst, int out_p) { 569 // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0 570 // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1 571 // a2 b2 c2 d2 e2 f2 g2 h2 A2 B2 C2 D2 E2 F2 G2 H2 572 // a3 b3 c3 d3 e3 f3 g3 h3 A3 B3 C3 D3 E3 F3 G3 H3 573 // a4 b4 c4 d4 e4 f4 g4 h4 A4 B4 C4 D4 E4 F4 G4 H4 574 // a5 b5 c5 d5 e5 f5 g5 h5 A5 B5 C5 D5 E5 F5 G5 H5 575 // a6 b6 c6 d6 e6 f6 g6 h6 A6 B6 C6 D6 E6 F6 G6 H6 576 // a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7 577 const __m128i x0 = _mm_loadu_si128((__m128i *)(src)); 578 const __m128i x1 = _mm_loadu_si128((__m128i *)(src + (1 * in_p))); 579 const __m128i x2 = _mm_loadu_si128((__m128i *)(src + (2 * in_p))); 580 const __m128i x3 = _mm_loadu_si128((__m128i *)(src + (3 * in_p))); 581 const __m128i x4 = _mm_loadu_si128((__m128i *)(src + (4 * in_p))); 582 const __m128i x5 = _mm_loadu_si128((__m128i *)(src + (5 * in_p))); 583 const __m128i x6 = _mm_loadu_si128((__m128i *)(src + (6 * in_p))); 584 const __m128i x7 = _mm_loadu_si128((__m128i *)(src + (7 * in_p))); 585 586 // a0 a1 b0 b1 c0 c1 d0 d1 A0 A1 B0 B1 C0 C1 D0 D1 587 // e0 e1 f0 f1 g0 g1 h0 h1 E0 E1 F0 F1 G0 G1 H0 H1 588 // a2 a3 b2 b3 c2 c3 d2 d3 A2 A3 B2 B3 C2 C3 D2 D3 589 // e2 e3 f2 f3 g2 g3 h2 h3 E2 E3 F2 F3 G2 G3 H2 H3 590 // a4 a5 b4 b5 c4 c5 d4 d5 A4 A5 B4 B5 C4 C5 D4 D5 591 // e4 e5 f4 f5 g4 g5 h4 h5 E4 E5 F4 F5 G4 G5 H4 H5 592 // a6 a7 b6 b7 c6 c7 d6 d7 A6 A7 B6 B7 C6 C7 D6 D7 593 // e6 e7 f6 f7 g6 g7 h6 h7 E6 E7 F6 F7 G6 G7 H6 H7 594 const __m128i x_s10 = _mm_unpacklo_epi8(x0, x1); 595 const __m128i x_s11 = _mm_unpackhi_epi8(x0, x1); 596 const __m128i x_s12 = _mm_unpacklo_epi8(x2, x3); 597 const __m128i x_s13 = _mm_unpackhi_epi8(x2, x3); 598 const __m128i x_s14 = _mm_unpacklo_epi8(x4, x5); 599 const __m128i x_s15 = _mm_unpackhi_epi8(x4, x5); 600 const __m128i x_s16 = _mm_unpacklo_epi8(x6, x7); 601 const __m128i x_s17 = _mm_unpackhi_epi8(x6, x7); 602 603 // a0 a1 a2 a3 b0 b1 b2 b3 | A0 A1 A2 A3 B0 B1 B2 B3 604 // c0 c1 c2 c3 d0 d1 d2 d3 | C0 C1 C2 C3 D0 D1 D2 D3 605 // e0 e1 e2 e3 f0 f1 f2 f3 | E0 E1 E2 E3 F0 F1 F2 F3 606 // g0 g1 g2 g3 h0 h1 h2 h3 | G0 G1 G2 G3 H0 H1 H2 H3 607 // a4 a5 a6 a7 b4 b5 b6 b7 | A4 A5 A6 A7 B4 B5 B6 B7 608 // c4 c5 c6 c7 d4 d5 d6 d7 | C4 C5 C6 C7 D4 D5 D6 D7 609 // e4 e5 e6 e7 f4 f5 f6 f7 | E4 E5 E6 E7 F4 F5 F6 F7 610 // g4 g5 g6 g7 h4 h5 h6 h7 | G4 G5 G6 G7 H4 H5 H6 H7 611 const __m128i x_s20 = _mm_unpacklo_epi16(x_s10, x_s12); 612 const __m128i x_s21 = _mm_unpackhi_epi16(x_s10, x_s12); 613 const __m128i x_s22 = _mm_unpacklo_epi16(x_s11, x_s13); 614 const __m128i x_s23 = _mm_unpackhi_epi16(x_s11, x_s13); 615 const __m128i x_s24 = _mm_unpacklo_epi16(x_s14, x_s16); 616 const __m128i x_s25 = _mm_unpackhi_epi16(x_s14, x_s16); 617 const __m128i x_s26 = _mm_unpacklo_epi16(x_s15, x_s17); 618 const __m128i x_s27 = _mm_unpackhi_epi16(x_s15, x_s17); 619 620 // a0 a1 a2 a3 a4 a5 a6 a7 | A0 A1 A2 A3 A4 A5 A6 A7 621 // b0 b1 b2 b3 b4 b5 b6 b7 | B0 B1 B2 B3 B4 B5 B6 B7 622 // c0 c1 c2 c3 c4 c5 c6 c7 | C0 C1 C2 C3 C4 C5 C6 C7 623 // d0 d1 d2 d3 d4 d5 d6 d7 | D0 D1 D2 D3 D4 D5 D6 D7 624 // e0 e1 e2 e3 e4 e5 e6 e7 | E0 E1 E2 E3 E4 E5 E6 E7 625 // f0 f1 f2 f3 f4 f5 f6 f7 | F0 F1 F2 F3 F4 F5 F6 F7 626 // g0 g1 g2 g3 g4 g5 g6 g7 | G0 G1 G2 G3 G4 G5 G6 G7 627 // h0 h1 h2 h3 h4 h5 h6 h7 | H0 H1 H2 H3 H4 H5 H6 H7 628 const __m128i x_s30 = _mm_unpacklo_epi32(x_s20, x_s24); 629 const __m128i x_s31 = _mm_unpackhi_epi32(x_s20, x_s24); 630 const __m128i x_s32 = _mm_unpacklo_epi32(x_s21, x_s25); 631 const __m128i x_s33 = _mm_unpackhi_epi32(x_s21, x_s25); 632 const __m128i x_s34 = _mm_unpacklo_epi32(x_s22, x_s26); 633 const __m128i x_s35 = _mm_unpackhi_epi32(x_s22, x_s26); 634 const __m128i x_s36 = _mm_unpacklo_epi32(x_s23, x_s27); 635 const __m128i x_s37 = _mm_unpackhi_epi32(x_s23, x_s27); 636 637 mm_storelu(dst, x_s30); 638 mm_storehu(dst + (1 * out_p), x_s30); 639 mm_storelu(dst + (2 * out_p), x_s31); 640 mm_storehu(dst + (3 * out_p), x_s31); 641 mm_storelu(dst + (4 * out_p), x_s32); 642 mm_storehu(dst + (5 * out_p), x_s32); 643 mm_storelu(dst + (6 * out_p), x_s33); 644 mm_storehu(dst + (7 * out_p), x_s33); 645 mm_storelu(dst + (8 * out_p), x_s34); 646 mm_storehu(dst + (9 * out_p), x_s34); 647 mm_storelu(dst + (10 * out_p), x_s35); 648 mm_storehu(dst + (11 * out_p), x_s35); 649 mm_storelu(dst + (12 * out_p), x_s36); 650 mm_storehu(dst + (13 * out_p), x_s36); 651 mm_storelu(dst + (14 * out_p), x_s37); 652 mm_storehu(dst + (15 * out_p), x_s37); 653 } 654 655 static inline void transpose_8xn(unsigned char *src[], int in_p, 656 unsigned char *dst[], int out_p, 657 int num_8x8_to_transpose) { 658 int idx8x8 = 0; 659 __m128i x0, x1, x2, x3, x4, x5, x6, x7; 660 do { 661 unsigned char *in = src[idx8x8]; 662 unsigned char *out = dst[idx8x8]; 663 664 x0 = 665 _mm_loadl_epi64((__m128i *)(in + 0 * in_p)); // 00 01 02 03 04 05 06 07 666 x1 = 667 _mm_loadl_epi64((__m128i *)(in + 1 * in_p)); // 10 11 12 13 14 15 16 17 668 // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17 669 x0 = _mm_unpacklo_epi8(x0, x1); 670 671 x2 = 672 _mm_loadl_epi64((__m128i *)(in + 2 * in_p)); // 20 21 22 23 24 25 26 27 673 x3 = 674 _mm_loadl_epi64((__m128i *)(in + 3 * in_p)); // 30 31 32 33 34 35 36 37 675 // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37 676 x1 = _mm_unpacklo_epi8(x2, x3); 677 678 x4 = 679 _mm_loadl_epi64((__m128i *)(in + 4 * in_p)); // 40 41 42 43 44 45 46 47 680 x5 = 681 _mm_loadl_epi64((__m128i *)(in + 5 * in_p)); // 50 51 52 53 54 55 56 57 682 // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57 683 x2 = _mm_unpacklo_epi8(x4, x5); 684 685 x6 = 686 _mm_loadl_epi64((__m128i *)(in + 6 * in_p)); // 60 61 62 63 64 65 66 67 687 x7 = 688 _mm_loadl_epi64((__m128i *)(in + 7 * in_p)); // 70 71 72 73 74 75 76 77 689 // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77 690 x3 = _mm_unpacklo_epi8(x6, x7); 691 692 // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33 693 x4 = _mm_unpacklo_epi16(x0, x1); 694 // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73 695 x5 = _mm_unpacklo_epi16(x2, x3); 696 // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71 697 x6 = _mm_unpacklo_epi32(x4, x5); 698 mm_storelu(out + 0 * out_p, x6); // 00 10 20 30 40 50 60 70 699 mm_storehu(out + 1 * out_p, x6); // 01 11 21 31 41 51 61 71 700 // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73 701 x7 = _mm_unpackhi_epi32(x4, x5); 702 mm_storelu(out + 2 * out_p, x7); // 02 12 22 32 42 52 62 72 703 mm_storehu(out + 3 * out_p, x7); // 03 13 23 33 43 53 63 73 704 705 // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37 706 x4 = _mm_unpackhi_epi16(x0, x1); 707 // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77 708 x5 = _mm_unpackhi_epi16(x2, x3); 709 // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75 710 x6 = _mm_unpacklo_epi32(x4, x5); 711 mm_storelu(out + 4 * out_p, x6); // 04 14 24 34 44 54 64 74 712 mm_storehu(out + 5 * out_p, x6); // 05 15 25 35 45 55 65 75 713 // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77 714 x7 = _mm_unpackhi_epi32(x4, x5); 715 716 mm_storelu(out + 6 * out_p, x7); // 06 16 26 36 46 56 66 76 717 mm_storehu(out + 7 * out_p, x7); // 07 17 27 37 47 57 67 77 718 } while (++idx8x8 < num_8x8_to_transpose); 719 } 720 721 #endif // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_