tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

intrapred_utils.h (7179B)


      1 /*
      2 * Copyright (c) 2021, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 #ifndef AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
     12 #define AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_
     13 
     14 #include <emmintrin.h>  // SSE2
     15 #include "aom/aom_integer.h"
     16 #include "config/aom_config.h"
     17 #include "config/aom_dsp_rtcd.h"
     18 
     19 static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = {
     20  { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 },
     21  { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 },
     22  { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 },
     23  { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 },
     24  { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 },
     25  { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 },
     26  { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 },
     27  { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 }
     28 };
     29 
     30 static DECLARE_ALIGNED(16, uint8_t, LoadMaskx[16][16]) = {
     31  { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
     32  { 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 },
     33  { 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13 },
     34  { 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 },
     35  { 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 },
     36  { 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10 },
     37  { 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 },
     38  { 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8 },
     39  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7 },
     40  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6 },
     41  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 5 },
     42  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4 },
     43  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3 },
     44  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2 },
     45  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1 },
     46  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
     47 };
     48 
     49 static DECLARE_ALIGNED(32, int, LoadMaskz2[8][8]) = {
     50  { -1, 0, 0, 0, 0, 0, 0, 0 },       { -1, -1, 0, 0, 0, 0, 0, 0 },
     51  { -1, -1, -1, 0, 0, 0, 0, 0 },     { -1, -1, -1, -1, 0, 0, 0, 0 },
     52  { -1, -1, -1, -1, -1, 0, 0, 0 },   { -1, -1, -1, -1, -1, -1, 0, 0 },
     53  { -1, -1, -1, -1, -1, -1, -1, 0 }, { -1, -1, -1, -1, -1, -1, -1, -1 },
     54 };
     55 
     56 static inline void transpose4x16_sse2(__m128i *x, __m128i *d) {
     57  __m128i w0, w1, w2, w3, ww0, ww1, ww2, ww3;
     58  w0 = _mm_unpacklo_epi8(x[0], x[1]);
     59  w1 = _mm_unpacklo_epi8(x[2], x[3]);
     60  w2 = _mm_unpackhi_epi8(x[0], x[1]);
     61  w3 = _mm_unpackhi_epi8(x[2], x[3]);
     62 
     63  ww0 = _mm_unpacklo_epi16(w0, w1);
     64  ww1 = _mm_unpacklo_epi16(w2, w3);
     65  ww2 = _mm_unpackhi_epi16(w0, w1);
     66  ww3 = _mm_unpackhi_epi16(w2, w3);
     67 
     68  w0 = _mm_unpacklo_epi32(ww0, ww1);
     69  w2 = _mm_unpacklo_epi32(ww2, ww3);
     70  w1 = _mm_unpackhi_epi32(ww0, ww1);
     71  w3 = _mm_unpackhi_epi32(ww2, ww3);
     72 
     73  d[0] = _mm_unpacklo_epi64(w0, w2);
     74  d[1] = _mm_unpackhi_epi64(w0, w2);
     75  d[2] = _mm_unpacklo_epi64(w1, w3);
     76  d[3] = _mm_unpackhi_epi64(w1, w3);
     77 
     78  d[4] = _mm_srli_si128(d[0], 8);
     79  d[5] = _mm_srli_si128(d[1], 8);
     80  d[6] = _mm_srli_si128(d[2], 8);
     81  d[7] = _mm_srli_si128(d[3], 8);
     82 
     83  d[8] = _mm_srli_si128(d[0], 4);
     84  d[9] = _mm_srli_si128(d[1], 4);
     85  d[10] = _mm_srli_si128(d[2], 4);
     86  d[11] = _mm_srli_si128(d[3], 4);
     87 
     88  d[12] = _mm_srli_si128(d[0], 12);
     89  d[13] = _mm_srli_si128(d[1], 12);
     90  d[14] = _mm_srli_si128(d[2], 12);
     91  d[15] = _mm_srli_si128(d[3], 12);
     92 }
     93 
     94 static inline void transpose16x16_sse2(__m128i *x, __m128i *d) {
     95  __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9;
     96  __m128i w10, w11, w12, w13, w14, w15;
     97 
     98  w0 = _mm_unpacklo_epi8(x[0], x[1]);
     99  w1 = _mm_unpacklo_epi8(x[2], x[3]);
    100  w2 = _mm_unpacklo_epi8(x[4], x[5]);
    101  w3 = _mm_unpacklo_epi8(x[6], x[7]);
    102 
    103  w8 = _mm_unpacklo_epi8(x[8], x[9]);
    104  w9 = _mm_unpacklo_epi8(x[10], x[11]);
    105  w10 = _mm_unpacklo_epi8(x[12], x[13]);
    106  w11 = _mm_unpacklo_epi8(x[14], x[15]);
    107 
    108  w4 = _mm_unpacklo_epi16(w0, w1);
    109  w5 = _mm_unpacklo_epi16(w2, w3);
    110  w12 = _mm_unpacklo_epi16(w8, w9);
    111  w13 = _mm_unpacklo_epi16(w10, w11);
    112 
    113  w6 = _mm_unpacklo_epi32(w4, w5);
    114  w7 = _mm_unpackhi_epi32(w4, w5);
    115  w14 = _mm_unpacklo_epi32(w12, w13);
    116  w15 = _mm_unpackhi_epi32(w12, w13);
    117 
    118  // Store first 4-line result
    119  d[0] = _mm_unpacklo_epi64(w6, w14);
    120  d[1] = _mm_unpackhi_epi64(w6, w14);
    121  d[2] = _mm_unpacklo_epi64(w7, w15);
    122  d[3] = _mm_unpackhi_epi64(w7, w15);
    123 
    124  w4 = _mm_unpackhi_epi16(w0, w1);
    125  w5 = _mm_unpackhi_epi16(w2, w3);
    126  w12 = _mm_unpackhi_epi16(w8, w9);
    127  w13 = _mm_unpackhi_epi16(w10, w11);
    128 
    129  w6 = _mm_unpacklo_epi32(w4, w5);
    130  w7 = _mm_unpackhi_epi32(w4, w5);
    131  w14 = _mm_unpacklo_epi32(w12, w13);
    132  w15 = _mm_unpackhi_epi32(w12, w13);
    133 
    134  // Store second 4-line result
    135  d[4] = _mm_unpacklo_epi64(w6, w14);
    136  d[5] = _mm_unpackhi_epi64(w6, w14);
    137  d[6] = _mm_unpacklo_epi64(w7, w15);
    138  d[7] = _mm_unpackhi_epi64(w7, w15);
    139 
    140  // upper half
    141  w0 = _mm_unpackhi_epi8(x[0], x[1]);
    142  w1 = _mm_unpackhi_epi8(x[2], x[3]);
    143  w2 = _mm_unpackhi_epi8(x[4], x[5]);
    144  w3 = _mm_unpackhi_epi8(x[6], x[7]);
    145 
    146  w8 = _mm_unpackhi_epi8(x[8], x[9]);
    147  w9 = _mm_unpackhi_epi8(x[10], x[11]);
    148  w10 = _mm_unpackhi_epi8(x[12], x[13]);
    149  w11 = _mm_unpackhi_epi8(x[14], x[15]);
    150 
    151  w4 = _mm_unpacklo_epi16(w0, w1);
    152  w5 = _mm_unpacklo_epi16(w2, w3);
    153  w12 = _mm_unpacklo_epi16(w8, w9);
    154  w13 = _mm_unpacklo_epi16(w10, w11);
    155 
    156  w6 = _mm_unpacklo_epi32(w4, w5);
    157  w7 = _mm_unpackhi_epi32(w4, w5);
    158  w14 = _mm_unpacklo_epi32(w12, w13);
    159  w15 = _mm_unpackhi_epi32(w12, w13);
    160 
    161  // Store first 4-line result
    162  d[8] = _mm_unpacklo_epi64(w6, w14);
    163  d[9] = _mm_unpackhi_epi64(w6, w14);
    164  d[10] = _mm_unpacklo_epi64(w7, w15);
    165  d[11] = _mm_unpackhi_epi64(w7, w15);
    166 
    167  w4 = _mm_unpackhi_epi16(w0, w1);
    168  w5 = _mm_unpackhi_epi16(w2, w3);
    169  w12 = _mm_unpackhi_epi16(w8, w9);
    170  w13 = _mm_unpackhi_epi16(w10, w11);
    171 
    172  w6 = _mm_unpacklo_epi32(w4, w5);
    173  w7 = _mm_unpackhi_epi32(w4, w5);
    174  w14 = _mm_unpacklo_epi32(w12, w13);
    175  w15 = _mm_unpackhi_epi32(w12, w13);
    176 
    177  // Store second 4-line result
    178  d[12] = _mm_unpacklo_epi64(w6, w14);
    179  d[13] = _mm_unpackhi_epi64(w6, w14);
    180  d[14] = _mm_unpacklo_epi64(w7, w15);
    181  d[15] = _mm_unpackhi_epi64(w7, w15);
    182 }
    183 
    184 static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc,
    185                               uint8_t *dst, ptrdiff_t pitchDst) {
    186  __m128i r[16];
    187  __m128i d[16];
    188  for (int j = 0; j < 16; j++) {
    189    r[j] = _mm_loadu_si128((__m128i *)(src + j * pitchSrc));
    190  }
    191  transpose16x16_sse2(r, d);
    192  for (int j = 0; j < 16; j++) {
    193    _mm_storeu_si128((__m128i *)(dst + j * pitchDst), d[j]);
    194  }
    195 }
    196 
    197 static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst,
    198                      ptrdiff_t pitchDst, int width, int height) {
    199  for (int j = 0; j < height; j += 16)
    200    for (int i = 0; i < width; i += 16)
    201      transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc,
    202                         dst + j * pitchDst + i, pitchDst);
    203 }
    204 
    205 #endif  // AOM_AOM_DSP_X86_INTRAPRED_UTILS_H_