tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

transpose_sse2.h (15251B)


      1 /*
      2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #ifndef AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
     13 #define AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_
     14 
     15 #include <emmintrin.h>  // SSE2
     16 
     17 #include "config/aom_config.h"
     18 
     19 static inline __m128i transpose_8bit_4x4(const __m128i *const in) {
     20  // Unpack 8 bit elements. Goes from:
     21  // in[0]: 00 01 02 03
     22  // in[1]: 10 11 12 13
     23  // in[2]: 20 21 22 23
     24  // in[3]: 30 31 32 33
     25  // to:
     26  // a0:    00 10 01 11  02 12 03 13
     27  // a1:    20 30 21 31  22 32 23 33
     28  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
     29  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
     30 
     31  // Unpack 16 bit elements resulting in:
     32  // 00 10 20 30  01 11 21 31  02 12 22 32  03 13 23 33
     33  return _mm_unpacklo_epi16(a0, a1);
     34 }
     35 
     36 static inline void transpose_8bit_8x8(const __m128i *const in,
     37                                      __m128i *const out) {
     38  // Unpack 8 bit elements. Goes from:
     39  // in[0]: 00 01 02 03 04 05 06 07
     40  // in[1]: 10 11 12 13 14 15 16 17
     41  // in[2]: 20 21 22 23 24 25 26 27
     42  // in[3]: 30 31 32 33 34 35 36 37
     43  // in[4]: 40 41 42 43 44 45 46 47
     44  // in[5]: 50 51 52 53 54 55 56 57
     45  // in[6]: 60 61 62 63 64 65 66 67
     46  // in[7]: 70 71 72 73 74 75 76 77
     47  // to:
     48  // a0:    00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
     49  // a1:    20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
     50  // a2:    40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
     51  // a3:    60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
     52  const __m128i a0 = _mm_unpacklo_epi8(in[0], in[1]);
     53  const __m128i a1 = _mm_unpacklo_epi8(in[2], in[3]);
     54  const __m128i a2 = _mm_unpacklo_epi8(in[4], in[5]);
     55  const __m128i a3 = _mm_unpacklo_epi8(in[6], in[7]);
     56 
     57  // Unpack 16 bit elements resulting in:
     58  // b0: 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
     59  // b1: 40 50 60 70 41 51 61 71  42 52 62 72 43 53 63 73
     60  // b2: 04 14 24 34 05 15 25 35  06 16 26 36 07 17 27 37
     61  // b3: 44 54 64 74 45 55 65 75  46 56 66 76 47 57 67 77
     62  const __m128i b0 = _mm_unpacklo_epi16(a0, a1);
     63  const __m128i b1 = _mm_unpackhi_epi16(a0, a1);
     64  const __m128i b2 = _mm_unpacklo_epi16(a2, a3);
     65  const __m128i b3 = _mm_unpackhi_epi16(a2, a3);
     66 
     67  // Unpack 32 bit elements resulting in:
     68  // c0: 00 10 20 30 40 50 60 70  01 11 21 31 41 51 61 71
     69  // c1: 02 12 22 32 42 52 62 72  03 13 23 33 43 53 63 73
     70  // c2: 04 14 24 34 44 54 64 74  05 15 25 35 45 55 65 75
     71  // c3: 06 16 26 36 46 56 66 76  07 17 27 37 47 57 67 77
     72  const __m128i c0 = _mm_unpacklo_epi32(b0, b2);
     73  const __m128i c1 = _mm_unpackhi_epi32(b0, b2);
     74  const __m128i c2 = _mm_unpacklo_epi32(b1, b3);
     75  const __m128i c3 = _mm_unpackhi_epi32(b1, b3);
     76 
     77  // Unpack 64 bit elements resulting in:
     78  // out[0]: 00 10 20 30 40 50 60 70
     79  // out[1]: 01 11 21 31 41 51 61 71
     80  // out[2]: 02 12 22 32 42 52 62 72
     81  // out[3]: 03 13 23 33 43 53 63 73
     82  // out[4]: 04 14 24 34 44 54 64 74
     83  // out[5]: 05 15 25 35 45 55 65 75
     84  // out[6]: 06 16 26 36 46 56 66 76
     85  // out[7]: 07 17 27 37 47 57 67 77
     86  out[0] = _mm_unpacklo_epi64(c0, c0);
     87  out[1] = _mm_unpackhi_epi64(c0, c0);
     88  out[2] = _mm_unpacklo_epi64(c1, c1);
     89  out[3] = _mm_unpackhi_epi64(c1, c1);
     90  out[4] = _mm_unpacklo_epi64(c2, c2);
     91  out[5] = _mm_unpackhi_epi64(c2, c2);
     92  out[6] = _mm_unpacklo_epi64(c3, c3);
     93  out[7] = _mm_unpackhi_epi64(c3, c3);
     94 }
     95 
     96 static inline void transpose_16bit_4x4(const __m128i *const in,
     97                                       __m128i *const out) {
     98  // Unpack 16 bit elements. Goes from:
     99  // in[0]: 00 01 02 03  XX XX XX XX
    100  // in[1]: 10 11 12 13  XX XX XX XX
    101  // in[2]: 20 21 22 23  XX XX XX XX
    102  // in[3]: 30 31 32 33  XX XX XX XX
    103  // to:
    104  // a0:    00 10 01 11  02 12 03 13
    105  // a1:    20 30 21 31  22 32 23 33
    106  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
    107  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
    108 
    109  // Unpack 32 bit elements resulting in:
    110  // out[0]: 00 10 20 30  01 11 21 31
    111  // out[1]: 01 11 21 31  __ __ __ __
    112  // out[2]: 02 12 22 32  03 13 23 33
    113  // out[3]: 03 13 23 33  __ __ __ __
    114  //
    115  // Note: The high 64 bits of the output registers are shown for informational
    116  // purposes only. Callers should only use the low 64 bits of the output
    117  // registers. "__" indicates zeros.
    118  out[0] = _mm_unpacklo_epi32(a0, a1);
    119  out[1] = _mm_srli_si128(out[0], 8);
    120  out[2] = _mm_unpackhi_epi32(a0, a1);
    121  out[3] = _mm_srli_si128(out[2], 8);
    122 }
    123 
    124 static inline void transpose_16bit_4x8(const __m128i *const in,
    125                                       __m128i *const out) {
    126  // Unpack 16 bit elements. Goes from:
    127  // in[0]: 00 01 02 03  XX XX XX XX
    128  // in[1]: 10 11 12 13  XX XX XX XX
    129  // in[2]: 20 21 22 23  XX XX XX XX
    130  // in[3]: 30 31 32 33  XX XX XX XX
    131  // in[4]: 40 41 42 43  XX XX XX XX
    132  // in[5]: 50 51 52 53  XX XX XX XX
    133  // in[6]: 60 61 62 63  XX XX XX XX
    134  // in[7]: 70 71 72 73  XX XX XX XX
    135  // to:
    136  // a0:    00 10 01 11  02 12 03 13
    137  // a1:    20 30 21 31  22 32 23 33
    138  // a2:    40 50 41 51  42 52 43 53
    139  // a3:    60 70 61 71  62 72 63 73
    140  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
    141  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
    142  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
    143  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
    144 
    145  // Unpack 32 bit elements resulting in:
    146  // b0: 00 10 20 30  01 11 21 31
    147  // b1: 40 50 60 70  41 51 61 71
    148  // b2: 02 12 22 32  03 13 23 33
    149  // b3: 42 52 62 72  43 53 63 73
    150  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
    151  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
    152  const __m128i b2 = _mm_unpackhi_epi32(a0, a1);
    153  const __m128i b3 = _mm_unpackhi_epi32(a2, a3);
    154 
    155  // Unpack 64 bit elements resulting in:
    156  // out[0]: 00 10 20 30  40 50 60 70
    157  // out[1]: 01 11 21 31  41 51 61 71
    158  // out[2]: 02 12 22 32  42 52 62 72
    159  // out[3]: 03 13 23 33  43 53 63 73
    160  out[0] = _mm_unpacklo_epi64(b0, b1);
    161  out[1] = _mm_unpackhi_epi64(b0, b1);
    162  out[2] = _mm_unpacklo_epi64(b2, b3);
    163  out[3] = _mm_unpackhi_epi64(b2, b3);
    164 }
    165 
    166 static inline void transpose_16bit_8x4(const __m128i *const in,
    167                                       __m128i *const out) {
    168  // Unpack 16 bit elements. Goes from:
    169  // in[0]: 00 01 02 03  04 05 06 07
    170  // in[1]: 10 11 12 13  14 15 16 17
    171  // in[2]: 20 21 22 23  24 25 26 27
    172  // in[3]: 30 31 32 33  34 35 36 37
    173 
    174  // to:
    175  // a0:    00 10 01 11  02 12 03 13
    176  // a1:    20 30 21 31  22 32 23 33
    177  // a4:    04 14 05 15  06 16 07 17
    178  // a5:    24 34 25 35  26 36 27 37
    179  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
    180  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
    181  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
    182  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
    183 
    184  // Unpack 32 bit elements resulting in:
    185  // b0: 00 10 20 30  01 11 21 31
    186  // b2: 04 14 24 34  05 15 25 35
    187  // b4: 02 12 22 32  03 13 23 33
    188  // b6: 06 16 26 36  07 17 27 37
    189  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
    190  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
    191  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
    192  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
    193 
    194  // Unpack 64 bit elements resulting in:
    195  // out[0]: 00 10 20 30  XX XX XX XX
    196  // out[1]: 01 11 21 31  XX XX XX XX
    197  // out[2]: 02 12 22 32  XX XX XX XX
    198  // out[3]: 03 13 23 33  XX XX XX XX
    199  // out[4]: 04 14 24 34  XX XX XX XX
    200  // out[5]: 05 15 25 35  XX XX XX XX
    201  // out[6]: 06 16 26 36  XX XX XX XX
    202  // out[7]: 07 17 27 37  XX XX XX XX
    203  const __m128i zeros = _mm_setzero_si128();
    204  out[0] = _mm_unpacklo_epi64(b0, zeros);
    205  out[1] = _mm_unpackhi_epi64(b0, zeros);
    206  out[2] = _mm_unpacklo_epi64(b4, zeros);
    207  out[3] = _mm_unpackhi_epi64(b4, zeros);
    208  out[4] = _mm_unpacklo_epi64(b2, zeros);
    209  out[5] = _mm_unpackhi_epi64(b2, zeros);
    210  out[6] = _mm_unpacklo_epi64(b6, zeros);
    211  out[7] = _mm_unpackhi_epi64(b6, zeros);
    212 }
    213 
    214 static inline void transpose_16bit_8x8(const __m128i *const in,
    215                                       __m128i *const out) {
    216  // Unpack 16 bit elements. Goes from:
    217  // in[0]: 00 01 02 03  04 05 06 07
    218  // in[1]: 10 11 12 13  14 15 16 17
    219  // in[2]: 20 21 22 23  24 25 26 27
    220  // in[3]: 30 31 32 33  34 35 36 37
    221  // in[4]: 40 41 42 43  44 45 46 47
    222  // in[5]: 50 51 52 53  54 55 56 57
    223  // in[6]: 60 61 62 63  64 65 66 67
    224  // in[7]: 70 71 72 73  74 75 76 77
    225  // to:
    226  // a0:    00 10 01 11  02 12 03 13
    227  // a1:    20 30 21 31  22 32 23 33
    228  // a2:    40 50 41 51  42 52 43 53
    229  // a3:    60 70 61 71  62 72 63 73
    230  // a4:    04 14 05 15  06 16 07 17
    231  // a5:    24 34 25 35  26 36 27 37
    232  // a6:    44 54 45 55  46 56 47 57
    233  // a7:    64 74 65 75  66 76 67 77
    234  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
    235  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
    236  const __m128i a2 = _mm_unpacklo_epi16(in[4], in[5]);
    237  const __m128i a3 = _mm_unpacklo_epi16(in[6], in[7]);
    238  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
    239  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
    240  const __m128i a6 = _mm_unpackhi_epi16(in[4], in[5]);
    241  const __m128i a7 = _mm_unpackhi_epi16(in[6], in[7]);
    242 
    243  // Unpack 32 bit elements resulting in:
    244  // b0: 00 10 20 30  01 11 21 31
    245  // b1: 40 50 60 70  41 51 61 71
    246  // b2: 04 14 24 34  05 15 25 35
    247  // b3: 44 54 64 74  45 55 65 75
    248  // b4: 02 12 22 32  03 13 23 33
    249  // b5: 42 52 62 72  43 53 63 73
    250  // b6: 06 16 26 36  07 17 27 37
    251  // b7: 46 56 66 76  47 57 67 77
    252  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
    253  const __m128i b1 = _mm_unpacklo_epi32(a2, a3);
    254  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
    255  const __m128i b3 = _mm_unpacklo_epi32(a6, a7);
    256  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
    257  const __m128i b5 = _mm_unpackhi_epi32(a2, a3);
    258  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
    259  const __m128i b7 = _mm_unpackhi_epi32(a6, a7);
    260 
    261  // Unpack 64 bit elements resulting in:
    262  // out[0]: 00 10 20 30  40 50 60 70
    263  // out[1]: 01 11 21 31  41 51 61 71
    264  // out[2]: 02 12 22 32  42 52 62 72
    265  // out[3]: 03 13 23 33  43 53 63 73
    266  // out[4]: 04 14 24 34  44 54 64 74
    267  // out[5]: 05 15 25 35  45 55 65 75
    268  // out[6]: 06 16 26 36  46 56 66 76
    269  // out[7]: 07 17 27 37  47 57 67 77
    270  out[0] = _mm_unpacklo_epi64(b0, b1);
    271  out[1] = _mm_unpackhi_epi64(b0, b1);
    272  out[2] = _mm_unpacklo_epi64(b4, b5);
    273  out[3] = _mm_unpackhi_epi64(b4, b5);
    274  out[4] = _mm_unpacklo_epi64(b2, b3);
    275  out[5] = _mm_unpackhi_epi64(b2, b3);
    276  out[6] = _mm_unpacklo_epi64(b6, b7);
    277  out[7] = _mm_unpackhi_epi64(b6, b7);
    278 }
    279 
    280 // Transpose in-place
    281 static inline void transpose_16bit_16x16(__m128i *const left,
    282                                         __m128i *const right) {
    283  __m128i tbuf[8];
    284  transpose_16bit_8x8(left, left);
    285  transpose_16bit_8x8(right, tbuf);
    286  transpose_16bit_8x8(left + 8, right);
    287  transpose_16bit_8x8(right + 8, right + 8);
    288 
    289  left[8] = tbuf[0];
    290  left[9] = tbuf[1];
    291  left[10] = tbuf[2];
    292  left[11] = tbuf[3];
    293  left[12] = tbuf[4];
    294  left[13] = tbuf[5];
    295  left[14] = tbuf[6];
    296  left[15] = tbuf[7];
    297 }
    298 
    299 static inline void transpose_32bit_4x4(const __m128i *const in,
    300                                       __m128i *const out) {
    301  // Unpack 32 bit elements. Goes from:
    302  // in[0]: 00 01 02 03
    303  // in[1]: 10 11 12 13
    304  // in[2]: 20 21 22 23
    305  // in[3]: 30 31 32 33
    306  // to:
    307  // a0:    00 10 01 11
    308  // a1:    20 30 21 31
    309  // a2:    02 12 03 13
    310  // a3:    22 32 23 33
    311 
    312  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
    313  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
    314  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
    315  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
    316 
    317  // Unpack 64 bit elements resulting in:
    318  // out[0]: 00 10 20 30
    319  // out[1]: 01 11 21 31
    320  // out[2]: 02 12 22 32
    321  // out[3]: 03 13 23 33
    322  out[0] = _mm_unpacklo_epi64(a0, a1);
    323  out[1] = _mm_unpackhi_epi64(a0, a1);
    324  out[2] = _mm_unpacklo_epi64(a2, a3);
    325  out[3] = _mm_unpackhi_epi64(a2, a3);
    326 }
    327 
    328 static inline void transpose_32bit_4x4x2(const __m128i *const in,
    329                                         __m128i *const out) {
    330  // Unpack 32 bit elements. Goes from:
    331  // in[0]: 00 01 02 03
    332  // in[1]: 10 11 12 13
    333  // in[2]: 20 21 22 23
    334  // in[3]: 30 31 32 33
    335  // in[4]: 04 05 06 07
    336  // in[5]: 14 15 16 17
    337  // in[6]: 24 25 26 27
    338  // in[7]: 34 35 36 37
    339  // to:
    340  // a0:    00 10 01 11
    341  // a1:    20 30 21 31
    342  // a2:    02 12 03 13
    343  // a3:    22 32 23 33
    344  // a4:    04 14 05 15
    345  // a5:    24 34 25 35
    346  // a6:    06 16 07 17
    347  // a7:    26 36 27 37
    348  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[1]);
    349  const __m128i a1 = _mm_unpacklo_epi32(in[2], in[3]);
    350  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[1]);
    351  const __m128i a3 = _mm_unpackhi_epi32(in[2], in[3]);
    352  const __m128i a4 = _mm_unpacklo_epi32(in[4], in[5]);
    353  const __m128i a5 = _mm_unpacklo_epi32(in[6], in[7]);
    354  const __m128i a6 = _mm_unpackhi_epi32(in[4], in[5]);
    355  const __m128i a7 = _mm_unpackhi_epi32(in[6], in[7]);
    356 
    357  // Unpack 64 bit elements resulting in:
    358  // out[0]: 00 10 20 30
    359  // out[1]: 01 11 21 31
    360  // out[2]: 02 12 22 32
    361  // out[3]: 03 13 23 33
    362  // out[4]: 04 14 24 34
    363  // out[5]: 05 15 25 35
    364  // out[6]: 06 16 26 36
    365  // out[7]: 07 17 27 37
    366  out[0] = _mm_unpacklo_epi64(a0, a1);
    367  out[1] = _mm_unpackhi_epi64(a0, a1);
    368  out[2] = _mm_unpacklo_epi64(a2, a3);
    369  out[3] = _mm_unpackhi_epi64(a2, a3);
    370  out[4] = _mm_unpacklo_epi64(a4, a5);
    371  out[5] = _mm_unpackhi_epi64(a4, a5);
    372  out[6] = _mm_unpacklo_epi64(a6, a7);
    373  out[7] = _mm_unpackhi_epi64(a6, a7);
    374 }
    375 
    376 static inline void transpose_32bit_8x4(const __m128i *const in,
    377                                       __m128i *const out) {
    378  // Unpack 32 bit elements. Goes from:
    379  // in[0]: 00 01 02 03
    380  // in[1]: 04 05 06 07
    381  // in[2]: 10 11 12 13
    382  // in[3]: 14 15 16 17
    383  // in[4]: 20 21 22 23
    384  // in[5]: 24 25 26 27
    385  // in[6]: 30 31 32 33
    386  // in[7]: 34 35 36 37
    387  // to:
    388  // a0: 00 10 01 11
    389  // a1: 20 30 21 31
    390  // a2: 02 12 03 13
    391  // a3: 22 32 23 33
    392  // a4: 04 14 05 15
    393  // a5: 24 34 25 35
    394  // a6: 06 16 07 17
    395  // a7: 26 36 27 37
    396  const __m128i a0 = _mm_unpacklo_epi32(in[0], in[2]);
    397  const __m128i a1 = _mm_unpacklo_epi32(in[4], in[6]);
    398  const __m128i a2 = _mm_unpackhi_epi32(in[0], in[2]);
    399  const __m128i a3 = _mm_unpackhi_epi32(in[4], in[6]);
    400  const __m128i a4 = _mm_unpacklo_epi32(in[1], in[3]);
    401  const __m128i a5 = _mm_unpacklo_epi32(in[5], in[7]);
    402  const __m128i a6 = _mm_unpackhi_epi32(in[1], in[3]);
    403  const __m128i a7 = _mm_unpackhi_epi32(in[5], in[7]);
    404 
    405  // Unpack 64 bit elements resulting in:
    406  // out[0]: 00 10 20 30
    407  // out[1]: 01 11 21 31
    408  // out[2]: 02 12 22 32
    409  // out[3]: 03 13 23 33
    410  // out[4]: 04 14 24 34
    411  // out[5]: 05 15 25 35
    412  // out[6]: 06 16 26 36
    413  // out[7]: 07 17 27 37
    414  out[0] = _mm_unpacklo_epi64(a0, a1);
    415  out[1] = _mm_unpackhi_epi64(a0, a1);
    416  out[2] = _mm_unpacklo_epi64(a2, a3);
    417  out[3] = _mm_unpackhi_epi64(a2, a3);
    418  out[4] = _mm_unpacklo_epi64(a4, a5);
    419  out[5] = _mm_unpackhi_epi64(a4, a5);
    420  out[6] = _mm_unpacklo_epi64(a6, a7);
    421  out[7] = _mm_unpackhi_epi64(a6, a7);
    422 }
    423 
    424 #endif  // AOM_AOM_DSP_X86_TRANSPOSE_SSE2_H_