tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

lpf_common_sse2.h (30103B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
     13 #define AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_
     14 
     15 #include <emmintrin.h>  // SSE2
     16 
     17 #include "config/aom_config.h"
     18 
// Copies the first 8 bytes of the object `v` to `dst` with memcpy. `v` must be
// an lvalue because the macro takes its address.
// NOTE(review): memcpy is declared in <string.h>, which is not among the
// includes visible in this header - confirm it is provided by
// config/aom_config.h or by every including file.
#define mm_storelu(dst, v) memcpy((dst), (const char *)&(v), 8)
// Copies 8 bytes starting at byte offset 8 of the object `v` to `dst` with
// memcpy (the upper half when `v` is a 16-byte vector). `v` must be an lvalue.
#define mm_storehu(dst, v) memcpy((dst), (const char *)&(v) + 8, 8)
     21 
     22 static inline void highbd_transpose6x6_sse2(__m128i *x0, __m128i *x1,
     23                                            __m128i *x2, __m128i *x3,
     24                                            __m128i *x4, __m128i *x5,
     25                                            __m128i *d0, __m128i *d1,
     26                                            __m128i *d2, __m128i *d3,
     27                                            __m128i *d4, __m128i *d5) {
     28  __m128i w0, w1, w2, w3, w4, w5, ww0;
     29 
     30  // 00 01 02 03 04 05 xx xx
     31  // 10 11 12 13 14 15 xx xx
     32  // 20 21 22 23 24 25 xx xx
     33  // 30 31 32 33 34 35 xx xx
     34  // 40 41 42 43 44 45 xx xx
     35  // 50 51 52 53 54 55 xx xx
     36 
     37  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
     38  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
     39  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
     40 
     41  ww0 = _mm_unpacklo_epi32(w0, w1);   // 00 10 20 30 01 11 21 31
     42  *d0 = _mm_unpacklo_epi64(ww0, w2);  // 00 10 20 30 40 50 41 51
     43  *d1 = _mm_unpackhi_epi64(ww0,
     44                           _mm_srli_si128(w2, 4));  // 01 11 21 31 41 51 xx xx
     45 
     46  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
     47  *d2 = _mm_unpacklo_epi64(ww0,
     48                           _mm_srli_si128(w2, 8));  // 02 12 22 32 42 52 xx xx
     49 
     50  w3 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 xx xx xx xx
     51  w4 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 xx xx xx xx
     52  w5 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 xx xx xx xx
     53 
     54  *d3 = _mm_unpackhi_epi64(ww0, _mm_srli_si128(w2, 4));  // 03 13 23 33 43 53
     55 
     56  ww0 = _mm_unpacklo_epi32(w3, w4);   //  04 14 24 34 05 15 25 35
     57  *d4 = _mm_unpacklo_epi64(ww0, w5);  //  04 14 24 34 44 54 45 55
     58  *d5 = _mm_unpackhi_epi64(ww0,
     59                           _mm_slli_si128(w5, 4));  // 05 15 25 35 45 55 xx xx
     60 }
     61 
     62 static inline void highbd_transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
     63                                                    __m128i *x2, __m128i *x3,
     64                                                    __m128i *d0, __m128i *d1,
     65                                                    __m128i *d2, __m128i *d3) {
     66  __m128i zero = _mm_setzero_si128();
     67  __m128i w0, w1, ww0, ww1;
     68 
     69  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
     70  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
     71 
     72  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
     73  ww1 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
     74 
     75  *d0 = _mm_unpacklo_epi64(ww0, zero);  // 00 10 20 30 xx xx xx xx
     76  *d1 = _mm_unpackhi_epi64(ww0, zero);  // 01 11 21 31 xx xx xx xx
     77  *d2 = _mm_unpacklo_epi64(ww1, zero);  // 02 12 22 32 xx xx xx xx
     78  *d3 = _mm_unpackhi_epi64(ww1, zero);  // 03 13 23 33 xx xx xx xx
     79 }
     80 
     81 static inline void highbd_transpose4x8_8x4_high_sse2(__m128i *x0, __m128i *x1,
     82                                                     __m128i *x2, __m128i *x3,
     83                                                     __m128i *d4, __m128i *d5,
     84                                                     __m128i *d6, __m128i *d7) {
     85  __m128i w0, w1, ww2, ww3;
     86  __m128i zero = _mm_setzero_si128();
     87 
     88  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
     89  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
     90 
     91  ww2 = _mm_unpacklo_epi32(w0, w1);  //  04 14 24 34 05 15 25 35
     92  ww3 = _mm_unpackhi_epi32(w0, w1);  //  06 16 26 36 07 17 27 37
     93 
     94  *d4 = _mm_unpacklo_epi64(ww2, zero);  // 04 14 24 34 xx xx xx xx
     95  *d5 = _mm_unpackhi_epi64(ww2, zero);  // 05 15 25 35 xx xx xx xx
     96  *d6 = _mm_unpacklo_epi64(ww3, zero);  // 06 16 26 36 xx xx xx xx
     97  *d7 = _mm_unpackhi_epi64(ww3, zero);  // 07 17 27 37 xx xx xx xx
     98 }
     99 
    100 // here in and out pointers (x and d) should be different! we don't store their
    101 // values inside
    102 static inline void highbd_transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1,
    103                                                __m128i *x2, __m128i *x3,
    104                                                __m128i *d0, __m128i *d1,
    105                                                __m128i *d2, __m128i *d3,
    106                                                __m128i *d4, __m128i *d5,
    107                                                __m128i *d6, __m128i *d7) {
    108  // input
    109  // x0 00 01 02 03 04 05 06 07
    110  // x1 10 11 12 13 14 15 16 17
    111  // x2 20 21 22 23 24 25 26 27
    112  // x3 30 31 32 33 34 35 36 37
    113  // output
    114  // 00 10 20 30 xx xx xx xx
    115  // 01 11 21 31 xx xx xx xx
    116  // 02 12 22 32 xx xx xx xx
    117  // 03 13 23 33 xx xx xx xx
    118  // 04 14 24 34 xx xx xx xx
    119  // 05 15 25 35 xx xx xx xx
    120  // 06 16 26 36 xx xx xx xx
    121  // 07 17 27 37 xx xx xx xx
    122  highbd_transpose4x8_8x4_low_sse2(x0, x1, x2, x3, d0, d1, d2, d3);
    123  highbd_transpose4x8_8x4_high_sse2(x0, x1, x2, x3, d4, d5, d6, d7);
    124 }
    125 
    126 static inline void highbd_transpose8x8_low_sse2(__m128i *x0, __m128i *x1,
    127                                                __m128i *x2, __m128i *x3,
    128                                                __m128i *x4, __m128i *x5,
    129                                                __m128i *x6, __m128i *x7,
    130                                                __m128i *d0, __m128i *d1,
    131                                                __m128i *d2, __m128i *d3) {
    132  __m128i w0, w1, w2, w3, ww0, ww1;
    133  // x0 00 01 02 03 04 05 06 07
    134  // x1 10 11 12 13 14 15 16 17
    135  // x2 20 21 22 23 24 25 26 27
    136  // x3 30 31 32 33 34 35 36 37
    137  // x4 40 41 42 43 44 45 46 47
    138  // x5 50 51 52 53 54 55 56 57
    139  // x6 60 61 62 63 64 65 66 67
    140  // x7 70 71 72 73 74 75 76 77
    141 
    142  w0 = _mm_unpacklo_epi16(*x0, *x1);  // 00 10 01 11 02 12 03 13
    143  w1 = _mm_unpacklo_epi16(*x2, *x3);  // 20 30 21 31 22 32 23 33
    144  w2 = _mm_unpacklo_epi16(*x4, *x5);  // 40 50 41 51 42 52 43 53
    145  w3 = _mm_unpacklo_epi16(*x6, *x7);  // 60 70 61 71 62 72 63 73
    146 
    147  ww0 = _mm_unpacklo_epi32(w0, w1);  // 00 10 20 30 01 11 21 31
    148  ww1 = _mm_unpacklo_epi32(w2, w3);  // 40 50 60 70 41 51 61 71
    149 
    150  *d0 = _mm_unpacklo_epi64(ww0, ww1);  // 00 10 20 30 40 50 60 70
    151  *d1 = _mm_unpackhi_epi64(ww0, ww1);  // 01 11 21 31 41 51 61 71
    152 
    153  ww0 = _mm_unpackhi_epi32(w0, w1);  // 02 12 22 32 03 13 23 33
    154  ww1 = _mm_unpackhi_epi32(w2, w3);  // 42 52 62 72 43 53 63 73
    155 
    156  *d2 = _mm_unpacklo_epi64(ww0, ww1);  // 02 12 22 32 42 52 62 72
    157  *d3 = _mm_unpackhi_epi64(ww0, ww1);  // 03 13 23 33 43 53 63 73
    158 }
    159 
    160 static inline void highbd_transpose8x8_high_sse2(__m128i *x0, __m128i *x1,
    161                                                 __m128i *x2, __m128i *x3,
    162                                                 __m128i *x4, __m128i *x5,
    163                                                 __m128i *x6, __m128i *x7,
    164                                                 __m128i *d4, __m128i *d5,
    165                                                 __m128i *d6, __m128i *d7) {
    166  __m128i w0, w1, w2, w3, ww0, ww1;
    167  // x0 00 01 02 03 04 05 06 07
    168  // x1 10 11 12 13 14 15 16 17
    169  // x2 20 21 22 23 24 25 26 27
    170  // x3 30 31 32 33 34 35 36 37
    171  // x4 40 41 42 43 44 45 46 47
    172  // x5 50 51 52 53 54 55 56 57
    173  // x6 60 61 62 63 64 65 66 67
    174  // x7 70 71 72 73 74 75 76 77
    175  w0 = _mm_unpackhi_epi16(*x0, *x1);  // 04 14 05 15 06 16 07 17
    176  w1 = _mm_unpackhi_epi16(*x2, *x3);  // 24 34 25 35 26 36 27 37
    177  w2 = _mm_unpackhi_epi16(*x4, *x5);  // 44 54 45 55 46 56 47 57
    178  w3 = _mm_unpackhi_epi16(*x6, *x7);  // 64 74 65 75 66 76 67 77
    179 
    180  ww0 = _mm_unpacklo_epi32(w0, w1);  // 04 14 24 34 05 15 25 35
    181  ww1 = _mm_unpacklo_epi32(w2, w3);  // 44 54 64 74 45 55 65 75
    182 
    183  *d4 = _mm_unpacklo_epi64(ww0, ww1);  // 04 14 24 34 44 54 64 74
    184  *d5 = _mm_unpackhi_epi64(ww0, ww1);  // 05 15 25 35 45 55 65 75
    185 
    186  ww0 = _mm_unpackhi_epi32(w0, w1);  // 06 16 26 36 07 17 27 37
    187  ww1 = _mm_unpackhi_epi32(w2, w3);  // 46 56 66 76 47 57 67 77
    188 
    189  *d6 = _mm_unpacklo_epi64(ww0, ww1);  // 06 16 26 36 46 56 66 76
    190  *d7 = _mm_unpackhi_epi64(ww0, ww1);  // 07 17 27 37 47 57 67 77
    191 }
    192 
    193 // here in and out pointers (x and d) should be different! we don't store their
    194 // values inside
    195 static inline void highbd_transpose8x8_sse2(
    196    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    197    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    198    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    199    __m128i *d7) {
    200  highbd_transpose8x8_low_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3);
    201  highbd_transpose8x8_high_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d4, d5, d6, d7);
    202 }
    203 
    204 // here in and out pointers (x and d arrays) should be different! we don't store
    205 // their values inside
    206 static inline void highbd_transpose8x16_sse2(
    207    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    208    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0, __m128i *d1,
    209    __m128i *d2, __m128i *d3, __m128i *d4, __m128i *d5, __m128i *d6,
    210    __m128i *d7) {
    211  highbd_transpose8x8_sse2(x0, x1, x2, x3, x4, x5, x6, x7, d0, d1, d2, d3, d4,
    212                           d5, d6, d7);
    213  highbd_transpose8x8_sse2(x0 + 1, x1 + 1, x2 + 1, x3 + 1, x4 + 1, x5 + 1,
    214                           x6 + 1, x7 + 1, d0 + 1, d1 + 1, d2 + 1, d3 + 1,
    215                           d4 + 1, d5 + 1, d6 + 1, d7 + 1);
    216 }
    217 
    218 // Low bit depth functions
    219 static inline void transpose4x8_8x4_low_sse2(__m128i *x0, __m128i *x1,
    220                                             __m128i *x2, __m128i *x3,
    221                                             __m128i *d0, __m128i *d1,
    222                                             __m128i *d2, __m128i *d3) {
    223  // input
    224  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
    225  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
    226  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
    227  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
    228  // output
    229  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
    230  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
    231  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
    232  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
    233 
    234  __m128i w0, w1;
    235 
    236  w0 = _mm_unpacklo_epi8(
    237      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    238  w1 = _mm_unpacklo_epi8(
    239      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    240 
    241  *d0 = _mm_unpacklo_epi16(
    242      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    243 
    244  *d1 = _mm_srli_si128(*d0,
    245                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
    246  *d2 = _mm_srli_si128(*d0,
    247                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
    248  *d3 = _mm_srli_si128(*d0,
    249                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
    250 }
    251 
    252 static inline void transpose4x8_8x4_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
    253                                         __m128i *x3, __m128i *d0, __m128i *d1,
    254                                         __m128i *d2, __m128i *d3, __m128i *d4,
    255                                         __m128i *d5, __m128i *d6,
    256                                         __m128i *d7) {
    257  // input
    258  // x0   00 01 02 03 04 05 06 07 xx xx xx xx xx xx xx xx
    259  // x1   10 11 12 13 14 15 16 17 xx xx xx xx xx xx xx xx
    260  // x2   20 21 22 23 24 25 26 27 xx xx xx xx xx xx xx xx
    261  // x3   30 31 32 33 34 35 36 37 xx xx xx xx xx xx xx xx
    262  // output
    263  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
    264  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
    265  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
    266  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
    267  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
    268  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
    269  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
    270  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
    271 
    272  __m128i w0, w1, ww0, ww1;
    273 
    274  w0 = _mm_unpacklo_epi8(
    275      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    276  w1 = _mm_unpacklo_epi8(
    277      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    278 
    279  ww0 = _mm_unpacklo_epi16(
    280      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    281  ww1 = _mm_unpackhi_epi16(
    282      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    283 
    284  *d0 = ww0;  // 00 10 20 30 xx xx xx xx xx xx xx xx xx xx xx xx
    285  *d1 = _mm_srli_si128(ww0,
    286                       4);  // 01 11 21 31 xx xx xx xx xx xx xx xx xx xx xx xx
    287  *d2 = _mm_srli_si128(ww0,
    288                       8);  // 02 12 22 32 xx xx xx xx xx xx xx xx xx xx xx xx
    289  *d3 = _mm_srli_si128(ww0,
    290                       12);  // 03 13 23 33 xx xx xx xx xx xx xx xx xx xx xx xx
    291 
    292  *d4 = ww1;  // 04 14 24 34 xx xx xx xx xx xx xx xx xx xx xx xx
    293  *d5 = _mm_srli_si128(ww1,
    294                       4);  // 05 15 25 35 xx xx xx xx xx xx xx xx xx xx xx xx
    295  *d6 = _mm_srli_si128(ww1,
    296                       8);  // 06 16 26 36 xx xx xx xx xx xx xx xx xx xx xx xx
    297  *d7 = _mm_srli_si128(ww1,
    298                       12);  // 07 17 27 37 xx xx xx xx xx xx xx xx xx xx xx xx
    299 }
    300 
    301 static inline void transpose8x8_low_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
    302                                         __m128i *x3, __m128i *x4, __m128i *x5,
    303                                         __m128i *x6, __m128i *x7, __m128i *d0,
    304                                         __m128i *d1, __m128i *d2,
    305                                         __m128i *d3) {
    306  // input
    307  // x0 00 01 02 03 04 05 06 07
    308  // x1 10 11 12 13 14 15 16 17
    309  // x2 20 21 22 23 24 25 26 27
    310  // x3 30 31 32 33 34 35 36 37
    311  // x4 40 41 42 43 44 45 46 47
    312  // x5  50 51 52 53 54 55 56 57
    313  // x6  60 61 62 63 64 65 66 67
    314  // x7 70 71 72 73 74 75 76 77
    315  // output
    316  // d0 00 10 20 30 40 50 60 70 xx xx xx xx xx xx xx
    317  // d1 01 11 21 31 41 51 61 71 xx xx xx xx xx xx xx xx
    318  // d2 02 12 22 32 42 52 62 72 xx xx xx xx xx xx xx xx
    319  // d3 03 13 23 33 43 53 63 73 xx xx xx xx xx xx xx xx
    320 
    321  __m128i w0, w1, w2, w3, w4, w5;
    322 
    323  w0 = _mm_unpacklo_epi8(
    324      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    325 
    326  w1 = _mm_unpacklo_epi8(
    327      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    328 
    329  w2 = _mm_unpacklo_epi8(
    330      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    331 
    332  w3 = _mm_unpacklo_epi8(
    333      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    334 
    335  w4 = _mm_unpacklo_epi16(
    336      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    337  w5 = _mm_unpacklo_epi16(
    338      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    339 
    340  *d0 = _mm_unpacklo_epi32(
    341      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    342  *d1 = _mm_srli_si128(*d0, 8);
    343  *d2 = _mm_unpackhi_epi32(
    344      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    345  *d3 = _mm_srli_si128(*d2, 8);
    346 }
    347 
    348 static inline void transpose8x8_sse2(__m128i *x0, __m128i *x1, __m128i *x2,
    349                                     __m128i *x3, __m128i *x4, __m128i *x5,
    350                                     __m128i *x6, __m128i *x7, __m128i *d0d1,
    351                                     __m128i *d2d3, __m128i *d4d5,
    352                                     __m128i *d6d7) {
    353  __m128i w0, w1, w2, w3, w4, w5, w6, w7;
    354  // x0 00 01 02 03 04 05 06 07
    355  // x1 10 11 12 13 14 15 16 17
    356  w0 = _mm_unpacklo_epi8(
    357      *x0, *x1);  // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    358 
    359  // x2 20 21 22 23 24 25 26 27
    360  // x3 30 31 32 33 34 35 36 37
    361  w1 = _mm_unpacklo_epi8(
    362      *x2, *x3);  // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    363 
    364  // x4 40 41 42 43 44 45 46 47
    365  // x5  50 51 52 53 54 55 56 57
    366  w2 = _mm_unpacklo_epi8(
    367      *x4, *x5);  // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    368 
    369  // x6  60 61 62 63 64 65 66 67
    370  // x7 70 71 72 73 74 75 76 77
    371  w3 = _mm_unpacklo_epi8(
    372      *x6, *x7);  // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    373 
    374  w4 = _mm_unpacklo_epi16(
    375      w0, w1);  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    376  w5 = _mm_unpacklo_epi16(
    377      w2, w3);  // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    378 
    379  *d0d1 = _mm_unpacklo_epi32(
    380      w4, w5);  // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    381  *d2d3 = _mm_unpackhi_epi32(
    382      w4, w5);  // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    383 
    384  w6 = _mm_unpackhi_epi16(
    385      w0, w1);  // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    386  w7 = _mm_unpackhi_epi16(
    387      w2, w3);  // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    388 
    389  *d4d5 = _mm_unpacklo_epi32(
    390      w6, w7);  // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    391  *d6d7 = _mm_unpackhi_epi32(
    392      w6, w7);  // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    393 }
    394 
    395 static inline void transpose16x8_8x16_sse2(
    396    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    397    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *x8, __m128i *x9,
    398    __m128i *x10, __m128i *x11, __m128i *x12, __m128i *x13, __m128i *x14,
    399    __m128i *x15, __m128i *d0, __m128i *d1, __m128i *d2, __m128i *d3,
    400    __m128i *d4, __m128i *d5, __m128i *d6, __m128i *d7) {
    401  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
    402  __m128i w10, w11, w12, w13, w14, w15;
    403 
    404  w0 = _mm_unpacklo_epi8(*x0, *x1);
    405  w1 = _mm_unpacklo_epi8(*x2, *x3);
    406  w2 = _mm_unpacklo_epi8(*x4, *x5);
    407  w3 = _mm_unpacklo_epi8(*x6, *x7);
    408 
    409  w8 = _mm_unpacklo_epi8(*x8, *x9);
    410  w9 = _mm_unpacklo_epi8(*x10, *x11);
    411  w10 = _mm_unpacklo_epi8(*x12, *x13);
    412  w11 = _mm_unpacklo_epi8(*x14, *x15);
    413 
    414  w4 = _mm_unpacklo_epi16(w0, w1);
    415  w5 = _mm_unpacklo_epi16(w2, w3);
    416  w12 = _mm_unpacklo_epi16(w8, w9);
    417  w13 = _mm_unpacklo_epi16(w10, w11);
    418 
    419  w6 = _mm_unpacklo_epi32(w4, w5);
    420  w7 = _mm_unpackhi_epi32(w4, w5);
    421  w14 = _mm_unpacklo_epi32(w12, w13);
    422  w15 = _mm_unpackhi_epi32(w12, w13);
    423 
    424  // Store first 4-line result
    425  *d0 = _mm_unpacklo_epi64(w6, w14);
    426  *d1 = _mm_unpackhi_epi64(w6, w14);
    427  *d2 = _mm_unpacklo_epi64(w7, w15);
    428  *d3 = _mm_unpackhi_epi64(w7, w15);
    429 
    430  w4 = _mm_unpackhi_epi16(w0, w1);
    431  w5 = _mm_unpackhi_epi16(w2, w3);
    432  w12 = _mm_unpackhi_epi16(w8, w9);
    433  w13 = _mm_unpackhi_epi16(w10, w11);
    434 
    435  w6 = _mm_unpacklo_epi32(w4, w5);
    436  w7 = _mm_unpackhi_epi32(w4, w5);
    437  w14 = _mm_unpacklo_epi32(w12, w13);
    438  w15 = _mm_unpackhi_epi32(w12, w13);
    439 
    440  // Store second 4-line result
    441  *d4 = _mm_unpacklo_epi64(w6, w14);
    442  *d5 = _mm_unpackhi_epi64(w6, w14);
    443  *d6 = _mm_unpacklo_epi64(w7, w15);
    444  *d7 = _mm_unpackhi_epi64(w7, w15);
    445 }
    446 
    447 static inline void transpose8x16_16x8_sse2(
    448    __m128i *x0, __m128i *x1, __m128i *x2, __m128i *x3, __m128i *x4,
    449    __m128i *x5, __m128i *x6, __m128i *x7, __m128i *d0d1, __m128i *d2d3,
    450    __m128i *d4d5, __m128i *d6d7, __m128i *d8d9, __m128i *d10d11,
    451    __m128i *d12d13, __m128i *d14d15) {
    452  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
    453  __m128i w10, w11, w12, w13, w14, w15;
    454 
    455  w0 = _mm_unpacklo_epi8(*x0, *x1);
    456  w1 = _mm_unpacklo_epi8(*x2, *x3);
    457  w2 = _mm_unpacklo_epi8(*x4, *x5);
    458  w3 = _mm_unpacklo_epi8(*x6, *x7);
    459 
    460  w8 = _mm_unpackhi_epi8(*x0, *x1);
    461  w9 = _mm_unpackhi_epi8(*x2, *x3);
    462  w10 = _mm_unpackhi_epi8(*x4, *x5);
    463  w11 = _mm_unpackhi_epi8(*x6, *x7);
    464 
    465  w4 = _mm_unpacklo_epi16(w0, w1);
    466  w5 = _mm_unpacklo_epi16(w2, w3);
    467  w12 = _mm_unpacklo_epi16(w8, w9);
    468  w13 = _mm_unpacklo_epi16(w10, w11);
    469 
    470  w6 = _mm_unpacklo_epi32(w4, w5);
    471  w7 = _mm_unpackhi_epi32(w4, w5);
    472  w14 = _mm_unpacklo_epi32(w12, w13);
    473  w15 = _mm_unpackhi_epi32(w12, w13);
    474 
    475  // Store first 4-line result
    476  *d0d1 = _mm_unpacklo_epi64(w6, w14);
    477  *d2d3 = _mm_unpackhi_epi64(w6, w14);
    478  *d4d5 = _mm_unpacklo_epi64(w7, w15);
    479  *d6d7 = _mm_unpackhi_epi64(w7, w15);
    480 
    481  w4 = _mm_unpackhi_epi16(w0, w1);
    482  w5 = _mm_unpackhi_epi16(w2, w3);
    483  w12 = _mm_unpackhi_epi16(w8, w9);
    484  w13 = _mm_unpackhi_epi16(w10, w11);
    485 
    486  w6 = _mm_unpacklo_epi32(w4, w5);
    487  w7 = _mm_unpackhi_epi32(w4, w5);
    488  w14 = _mm_unpacklo_epi32(w12, w13);
    489  w15 = _mm_unpackhi_epi32(w12, w13);
    490 
    491  // Store second 4-line result
    492  *d8d9 = _mm_unpacklo_epi64(w6, w14);
    493  *d10d11 = _mm_unpackhi_epi64(w6, w14);
    494  *d12d13 = _mm_unpacklo_epi64(w7, w15);
    495  *d14d15 = _mm_unpackhi_epi64(w7, w15);
    496 }
    497 
    498 static inline void transpose_16x8(unsigned char *in0, unsigned char *in1,
    499                                  int in_p, unsigned char *out, int out_p) {
    500  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
    501  __m128i x8, x9, x10, x11, x12, x13, x14, x15;
    502 
    503  x0 = _mm_loadl_epi64((__m128i *)in0);
    504  x1 = _mm_loadl_epi64((__m128i *)(in0 + in_p));
    505  x0 = _mm_unpacklo_epi8(x0, x1);
    506 
    507  x2 = _mm_loadl_epi64((__m128i *)(in0 + 2 * in_p));
    508  x3 = _mm_loadl_epi64((__m128i *)(in0 + 3 * in_p));
    509  x1 = _mm_unpacklo_epi8(x2, x3);
    510 
    511  x4 = _mm_loadl_epi64((__m128i *)(in0 + 4 * in_p));
    512  x5 = _mm_loadl_epi64((__m128i *)(in0 + 5 * in_p));
    513  x2 = _mm_unpacklo_epi8(x4, x5);
    514 
    515  x6 = _mm_loadl_epi64((__m128i *)(in0 + 6 * in_p));
    516  x7 = _mm_loadl_epi64((__m128i *)(in0 + 7 * in_p));
    517  x3 = _mm_unpacklo_epi8(x6, x7);
    518  x4 = _mm_unpacklo_epi16(x0, x1);
    519 
    520  x8 = _mm_loadl_epi64((__m128i *)in1);
    521  x9 = _mm_loadl_epi64((__m128i *)(in1 + in_p));
    522  x8 = _mm_unpacklo_epi8(x8, x9);
    523  x5 = _mm_unpacklo_epi16(x2, x3);
    524 
    525  x10 = _mm_loadl_epi64((__m128i *)(in1 + 2 * in_p));
    526  x11 = _mm_loadl_epi64((__m128i *)(in1 + 3 * in_p));
    527  x9 = _mm_unpacklo_epi8(x10, x11);
    528 
    529  x12 = _mm_loadl_epi64((__m128i *)(in1 + 4 * in_p));
    530  x13 = _mm_loadl_epi64((__m128i *)(in1 + 5 * in_p));
    531  x10 = _mm_unpacklo_epi8(x12, x13);
    532  x12 = _mm_unpacklo_epi16(x8, x9);
    533 
    534  x14 = _mm_loadl_epi64((__m128i *)(in1 + 6 * in_p));
    535  x15 = _mm_loadl_epi64((__m128i *)(in1 + 7 * in_p));
    536  x11 = _mm_unpacklo_epi8(x14, x15);
    537  x13 = _mm_unpacklo_epi16(x10, x11);
    538 
    539  x6 = _mm_unpacklo_epi32(x4, x5);
    540  x7 = _mm_unpackhi_epi32(x4, x5);
    541  x14 = _mm_unpacklo_epi32(x12, x13);
    542  x15 = _mm_unpackhi_epi32(x12, x13);
    543 
    544  // Store first 4-line result
    545  _mm_storeu_si128((__m128i *)out, _mm_unpacklo_epi64(x6, x14));
    546  _mm_storeu_si128((__m128i *)(out + out_p), _mm_unpackhi_epi64(x6, x14));
    547  _mm_storeu_si128((__m128i *)(out + 2 * out_p), _mm_unpacklo_epi64(x7, x15));
    548  _mm_storeu_si128((__m128i *)(out + 3 * out_p), _mm_unpackhi_epi64(x7, x15));
    549 
    550  x4 = _mm_unpackhi_epi16(x0, x1);
    551  x5 = _mm_unpackhi_epi16(x2, x3);
    552  x12 = _mm_unpackhi_epi16(x8, x9);
    553  x13 = _mm_unpackhi_epi16(x10, x11);
    554 
    555  x6 = _mm_unpacklo_epi32(x4, x5);
    556  x7 = _mm_unpackhi_epi32(x4, x5);
    557  x14 = _mm_unpacklo_epi32(x12, x13);
    558  x15 = _mm_unpackhi_epi32(x12, x13);
    559 
    560  // Store second 4-line result
    561  _mm_storeu_si128((__m128i *)(out + 4 * out_p), _mm_unpacklo_epi64(x6, x14));
    562  _mm_storeu_si128((__m128i *)(out + 5 * out_p), _mm_unpackhi_epi64(x6, x14));
    563  _mm_storeu_si128((__m128i *)(out + 6 * out_p), _mm_unpacklo_epi64(x7, x15));
    564  _mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
    565 }
    566 
    567 static inline void transpose_16x8_to_8x16(unsigned char *src, int in_p,
    568                                          unsigned char *dst, int out_p) {
    569  // a0 b0 c0 d0 e0 f0 g0 h0 A0 B0 C0 D0 E0 F0 G0 H0
    570  // a1 b1 c1 d1 e1 f1 g1 h1 A1 B1 C1 D1 E1 F1 G1 H1
    571  // a2 b2 c2 d2 e2 f2 g2 h2 A2 B2 C2 D2 E2 F2 G2 H2
    572  // a3 b3 c3 d3 e3 f3 g3 h3 A3 B3 C3 D3 E3 F3 G3 H3
    573  // a4 b4 c4 d4 e4 f4 g4 h4 A4 B4 C4 D4 E4 F4 G4 H4
    574  // a5 b5 c5 d5 e5 f5 g5 h5 A5 B5 C5 D5 E5 F5 G5 H5
    575  // a6 b6 c6 d6 e6 f6 g6 h6 A6 B6 C6 D6 E6 F6 G6 H6
    576  // a7 b7 c7 d7 e7 f7 g7 h7 A7 B7 C7 D7 E7 F7 G7 H7
    577  const __m128i x0 = _mm_loadu_si128((__m128i *)(src));
    578  const __m128i x1 = _mm_loadu_si128((__m128i *)(src + (1 * in_p)));
    579  const __m128i x2 = _mm_loadu_si128((__m128i *)(src + (2 * in_p)));
    580  const __m128i x3 = _mm_loadu_si128((__m128i *)(src + (3 * in_p)));
    581  const __m128i x4 = _mm_loadu_si128((__m128i *)(src + (4 * in_p)));
    582  const __m128i x5 = _mm_loadu_si128((__m128i *)(src + (5 * in_p)));
    583  const __m128i x6 = _mm_loadu_si128((__m128i *)(src + (6 * in_p)));
    584  const __m128i x7 = _mm_loadu_si128((__m128i *)(src + (7 * in_p)));
    585 
    586  // a0 a1 b0 b1 c0 c1 d0 d1 A0 A1 B0 B1 C0 C1 D0 D1
    587  // e0 e1 f0 f1 g0 g1 h0 h1 E0 E1 F0 F1 G0 G1 H0 H1
    588  // a2 a3 b2 b3 c2 c3 d2 d3 A2 A3 B2 B3 C2 C3 D2 D3
    589  // e2 e3 f2 f3 g2 g3 h2 h3 E2 E3 F2 F3 G2 G3 H2 H3
    590  // a4 a5 b4 b5 c4 c5 d4 d5 A4 A5 B4 B5 C4 C5 D4 D5
    591  // e4 e5 f4 f5 g4 g5 h4 h5 E4 E5 F4 F5 G4 G5 H4 H5
    592  // a6 a7 b6 b7 c6 c7 d6 d7 A6 A7 B6 B7 C6 C7 D6 D7
    593  // e6 e7 f6 f7 g6 g7 h6 h7 E6 E7 F6 F7 G6 G7 H6 H7
    594  const __m128i x_s10 = _mm_unpacklo_epi8(x0, x1);
    595  const __m128i x_s11 = _mm_unpackhi_epi8(x0, x1);
    596  const __m128i x_s12 = _mm_unpacklo_epi8(x2, x3);
    597  const __m128i x_s13 = _mm_unpackhi_epi8(x2, x3);
    598  const __m128i x_s14 = _mm_unpacklo_epi8(x4, x5);
    599  const __m128i x_s15 = _mm_unpackhi_epi8(x4, x5);
    600  const __m128i x_s16 = _mm_unpacklo_epi8(x6, x7);
    601  const __m128i x_s17 = _mm_unpackhi_epi8(x6, x7);
    602 
    603  // a0 a1 a2 a3 b0 b1 b2 b3 | A0 A1 A2 A3 B0 B1 B2 B3
    604  // c0 c1 c2 c3 d0 d1 d2 d3 | C0 C1 C2 C3 D0 D1 D2 D3
    605  // e0 e1 e2 e3 f0 f1 f2 f3 | E0 E1 E2 E3 F0 F1 F2 F3
    606  // g0 g1 g2 g3 h0 h1 h2 h3 | G0 G1 G2 G3 H0 H1 H2 H3
    607  // a4 a5 a6 a7 b4 b5 b6 b7 | A4 A5 A6 A7 B4 B5 B6 B7
    608  // c4 c5 c6 c7 d4 d5 d6 d7 | C4 C5 C6 C7 D4 D5 D6 D7
    609  // e4 e5 e6 e7 f4 f5 f6 f7 | E4 E5 E6 E7 F4 F5 F6 F7
    610  // g4 g5 g6 g7 h4 h5 h6 h7 | G4 G5 G6 G7 H4 H5 H6 H7
    611  const __m128i x_s20 = _mm_unpacklo_epi16(x_s10, x_s12);
    612  const __m128i x_s21 = _mm_unpackhi_epi16(x_s10, x_s12);
    613  const __m128i x_s22 = _mm_unpacklo_epi16(x_s11, x_s13);
    614  const __m128i x_s23 = _mm_unpackhi_epi16(x_s11, x_s13);
    615  const __m128i x_s24 = _mm_unpacklo_epi16(x_s14, x_s16);
    616  const __m128i x_s25 = _mm_unpackhi_epi16(x_s14, x_s16);
    617  const __m128i x_s26 = _mm_unpacklo_epi16(x_s15, x_s17);
    618  const __m128i x_s27 = _mm_unpackhi_epi16(x_s15, x_s17);
    619 
    620  // a0 a1 a2 a3 a4 a5 a6 a7 | A0 A1 A2 A3 A4 A5 A6 A7
    621  // b0 b1 b2 b3 b4 b5 b6 b7 | B0 B1 B2 B3 B4 B5 B6 B7
    622  // c0 c1 c2 c3 c4 c5 c6 c7 | C0 C1 C2 C3 C4 C5 C6 C7
    623  // d0 d1 d2 d3 d4 d5 d6 d7 | D0 D1 D2 D3 D4 D5 D6 D7
    624  // e0 e1 e2 e3 e4 e5 e6 e7 | E0 E1 E2 E3 E4 E5 E6 E7
    625  // f0 f1 f2 f3 f4 f5 f6 f7 | F0 F1 F2 F3 F4 F5 F6 F7
    626  // g0 g1 g2 g3 g4 g5 g6 g7 | G0 G1 G2 G3 G4 G5 G6 G7
    627  // h0 h1 h2 h3 h4 h5 h6 h7 | H0 H1 H2 H3 H4 H5 H6 H7
    628  const __m128i x_s30 = _mm_unpacklo_epi32(x_s20, x_s24);
    629  const __m128i x_s31 = _mm_unpackhi_epi32(x_s20, x_s24);
    630  const __m128i x_s32 = _mm_unpacklo_epi32(x_s21, x_s25);
    631  const __m128i x_s33 = _mm_unpackhi_epi32(x_s21, x_s25);
    632  const __m128i x_s34 = _mm_unpacklo_epi32(x_s22, x_s26);
    633  const __m128i x_s35 = _mm_unpackhi_epi32(x_s22, x_s26);
    634  const __m128i x_s36 = _mm_unpacklo_epi32(x_s23, x_s27);
    635  const __m128i x_s37 = _mm_unpackhi_epi32(x_s23, x_s27);
    636 
    637  mm_storelu(dst, x_s30);
    638  mm_storehu(dst + (1 * out_p), x_s30);
    639  mm_storelu(dst + (2 * out_p), x_s31);
    640  mm_storehu(dst + (3 * out_p), x_s31);
    641  mm_storelu(dst + (4 * out_p), x_s32);
    642  mm_storehu(dst + (5 * out_p), x_s32);
    643  mm_storelu(dst + (6 * out_p), x_s33);
    644  mm_storehu(dst + (7 * out_p), x_s33);
    645  mm_storelu(dst + (8 * out_p), x_s34);
    646  mm_storehu(dst + (9 * out_p), x_s34);
    647  mm_storelu(dst + (10 * out_p), x_s35);
    648  mm_storehu(dst + (11 * out_p), x_s35);
    649  mm_storelu(dst + (12 * out_p), x_s36);
    650  mm_storehu(dst + (13 * out_p), x_s36);
    651  mm_storelu(dst + (14 * out_p), x_s37);
    652  mm_storehu(dst + (15 * out_p), x_s37);
    653 }
    654 
    655 static inline void transpose_8xn(unsigned char *src[], int in_p,
    656                                 unsigned char *dst[], int out_p,
    657                                 int num_8x8_to_transpose) {
    658  int idx8x8 = 0;
    659  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
    660  do {
    661    unsigned char *in = src[idx8x8];
    662    unsigned char *out = dst[idx8x8];
    663 
    664    x0 =
    665        _mm_loadl_epi64((__m128i *)(in + 0 * in_p));  // 00 01 02 03 04 05 06 07
    666    x1 =
    667        _mm_loadl_epi64((__m128i *)(in + 1 * in_p));  // 10 11 12 13 14 15 16 17
    668    // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
    669    x0 = _mm_unpacklo_epi8(x0, x1);
    670 
    671    x2 =
    672        _mm_loadl_epi64((__m128i *)(in + 2 * in_p));  // 20 21 22 23 24 25 26 27
    673    x3 =
    674        _mm_loadl_epi64((__m128i *)(in + 3 * in_p));  // 30 31 32 33 34 35 36 37
    675    // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
    676    x1 = _mm_unpacklo_epi8(x2, x3);
    677 
    678    x4 =
    679        _mm_loadl_epi64((__m128i *)(in + 4 * in_p));  // 40 41 42 43 44 45 46 47
    680    x5 =
    681        _mm_loadl_epi64((__m128i *)(in + 5 * in_p));  // 50 51 52 53 54 55 56 57
    682    // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
    683    x2 = _mm_unpacklo_epi8(x4, x5);
    684 
    685    x6 =
    686        _mm_loadl_epi64((__m128i *)(in + 6 * in_p));  // 60 61 62 63 64 65 66 67
    687    x7 =
    688        _mm_loadl_epi64((__m128i *)(in + 7 * in_p));  // 70 71 72 73 74 75 76 77
    689    // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
    690    x3 = _mm_unpacklo_epi8(x6, x7);
    691 
    692    // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
    693    x4 = _mm_unpacklo_epi16(x0, x1);
    694    // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
    695    x5 = _mm_unpacklo_epi16(x2, x3);
    696    // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
    697    x6 = _mm_unpacklo_epi32(x4, x5);
    698    mm_storelu(out + 0 * out_p, x6);  // 00 10 20 30 40 50 60 70
    699    mm_storehu(out + 1 * out_p, x6);  // 01 11 21 31 41 51 61 71
    700    // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
    701    x7 = _mm_unpackhi_epi32(x4, x5);
    702    mm_storelu(out + 2 * out_p, x7);  // 02 12 22 32 42 52 62 72
    703    mm_storehu(out + 3 * out_p, x7);  // 03 13 23 33 43 53 63 73
    704 
    705    // 04 14 24 34 05 15 25 35 06 16 26 36 07 17 27 37
    706    x4 = _mm_unpackhi_epi16(x0, x1);
    707    // 44 54 64 74 45 55 65 75 46 56 66 76 47 57 67 77
    708    x5 = _mm_unpackhi_epi16(x2, x3);
    709    // 04 14 24 34 44 54 64 74 05 15 25 35 45 55 65 75
    710    x6 = _mm_unpacklo_epi32(x4, x5);
    711    mm_storelu(out + 4 * out_p, x6);  // 04 14 24 34 44 54 64 74
    712    mm_storehu(out + 5 * out_p, x6);  // 05 15 25 35 45 55 65 75
    713    // 06 16 26 36 46 56 66 76 07 17 27 37 47 57 67 77
    714    x7 = _mm_unpackhi_epi32(x4, x5);
    715 
    716    mm_storelu(out + 6 * out_p, x7);  // 06 16 26 36 46 56 66 76
    717    mm_storehu(out + 7 * out_p, x7);  // 07 17 27 37 47 57 67 77
    718  } while (++idx8x8 < num_8x8_to_transpose);
    719 }
    720 
    721 #endif  // AOM_AOM_DSP_X86_LPF_COMMON_SSE2_H_