tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

encodetxb_sse4.c (3045B)


      1 /*
      2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
#include <assert.h>
#include <string.h>
#include <emmintrin.h>  // SSE2
#include <smmintrin.h>  /* SSE4.1 */

#include "aom/aom_integer.h"
#include "av1/common/av1_common_int.h"
#include "av1/common/txb_common.h"
#include "aom_dsp/x86/synonyms.h"
     20 
     21 void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width,
     22                                const int height, uint8_t *const levels) {
     23  const int stride = height + TX_PAD_HOR;
     24  const __m128i zeros = _mm_setzero_si128();
     25 
     26  const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride);
     27  uint8_t *bottom_buf = levels + stride * width;
     28  uint8_t *bottom_buf_end = bottom_buf + bottom_len;
     29  do {
     30    _mm_storeu_si128((__m128i *)(bottom_buf), zeros);
     31    bottom_buf += 16;
     32  } while (bottom_buf < bottom_buf_end);
     33 
     34  int i = 0;
     35  uint8_t *ls = levels;
     36  const tran_low_t *cf = coeff;
     37  if (height == 4) {
     38    do {
     39      const __m128i coeffA = xx_loadu_128(cf);
     40      const __m128i coeffB = xx_loadu_128(cf + 4);
     41      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
     42      const __m128i absAB = _mm_abs_epi16(coeffAB);
     43      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
     44      const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros);
     45      xx_storeu_128(ls, lsAB);
     46      ls += (stride << 1);
     47      cf += (height << 1);
     48      i += 2;
     49    } while (i < width);
     50  } else if (height == 8) {
     51    do {
     52      const __m128i coeffA = xx_loadu_128(cf);
     53      const __m128i coeffB = xx_loadu_128(cf + 4);
     54      const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
     55      const __m128i absAB = _mm_abs_epi16(coeffAB);
     56      const __m128i absAB8 = _mm_packs_epi16(absAB, zeros);
     57      xx_storeu_128(ls, absAB8);
     58      ls += stride;
     59      cf += height;
     60      i += 1;
     61    } while (i < width);
     62  } else {
     63    do {
     64      int j = 0;
     65      do {
     66        const __m128i coeffA = xx_loadu_128(cf);
     67        const __m128i coeffB = xx_loadu_128(cf + 4);
     68        const __m128i coeffC = xx_loadu_128(cf + 8);
     69        const __m128i coeffD = xx_loadu_128(cf + 12);
     70        const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB);
     71        const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD);
     72        const __m128i absAB = _mm_abs_epi16(coeffAB);
     73        const __m128i absCD = _mm_abs_epi16(coeffCD);
     74        const __m128i absABCD = _mm_packs_epi16(absAB, absCD);
     75        xx_storeu_128(ls + j, absABCD);
     76        j += 16;
     77        cf += 16;
     78      } while (j < height);
     79      *(int32_t *)(ls + height) = 0;
     80      ls += stride;
     81      i += 1;
     82    } while (i < width);
     83  }
     84 }