tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_variance_sse4.c (8238B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <smmintrin.h> /* SSE4.1 */
     13 
     14 #include "config/aom_config.h"
     15 #include "config/aom_dsp_rtcd.h"
     16 
     17 #include "aom_dsp/variance.h"
     18 #include "aom_dsp/aom_filter.h"
     19 
     20 static inline void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
     21                                         const uint8_t *b8, int b_stride,
     22                                         uint64_t *sse, int64_t *sum) {
     23  __m128i u0, u1, u2, u3;
     24  __m128i s0, s1, s2, s3;
     25  __m128i t0, t1, x0, y0;
     26  __m128i a0, a1, a2, a3;
     27  __m128i b0, b1, b2, b3;
     28  __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
     29 
     30  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
     31  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
     32 
     33  a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
     34  a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
     35  a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
     36  a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
     37 
     38  b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
     39  b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
     40  b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
     41  b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
     42 
     43  u0 = _mm_unpacklo_epi16(a0, a1);
     44  u1 = _mm_unpacklo_epi16(a2, a3);
     45  u2 = _mm_unpacklo_epi16(b0, b1);
     46  u3 = _mm_unpacklo_epi16(b2, b3);
     47 
     48  s0 = _mm_sub_epi16(u0, u2);
     49  s1 = _mm_sub_epi16(u1, u3);
     50 
     51  t0 = _mm_madd_epi16(s0, k_one_epi16);
     52  t1 = _mm_madd_epi16(s1, k_one_epi16);
     53 
     54  s2 = _mm_hadd_epi32(t0, t1);
     55  s3 = _mm_hadd_epi32(s2, s2);
     56  y0 = _mm_hadd_epi32(s3, s3);
     57 
     58  t0 = _mm_madd_epi16(s0, s0);
     59  t1 = _mm_madd_epi16(s1, s1);
     60 
     61  s2 = _mm_hadd_epi32(t0, t1);
     62  s3 = _mm_hadd_epi32(s2, s2);
     63  x0 = _mm_hadd_epi32(s3, s3);
     64 
     65  *sse = (uint64_t)_mm_extract_epi32(x0, 0);
     66  *sum = (int64_t)_mm_extract_epi32(y0, 0);
     67 }
     68 
     69 uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride,
     70                                         const uint8_t *b, int b_stride,
     71                                         uint32_t *sse) {
     72  int64_t sum, diff;
     73  uint64_t local_sse;
     74 
     75  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
     76  *sse = (uint32_t)local_sse;
     77 
     78  diff = (int64_t)*sse - ((sum * sum) >> 4);
     79  return (diff >= 0) ? (uint32_t)diff : 0;
     80 }
     81 
     82 uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride,
     83                                          const uint8_t *b, int b_stride,
     84                                          uint32_t *sse) {
     85  int64_t sum, diff;
     86  uint64_t local_sse;
     87 
     88  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
     89  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
     90  sum = ROUND_POWER_OF_TWO(sum, 2);
     91 
     92  diff = (int64_t)*sse - ((sum * sum) >> 4);
     93  return (diff >= 0) ? (uint32_t)diff : 0;
     94 }
     95 
     96 uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride,
     97                                          const uint8_t *b, int b_stride,
     98                                          uint32_t *sse) {
     99  int64_t sum, diff;
    100  uint64_t local_sse;
    101 
    102  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
    103  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
    104  sum = ROUND_POWER_OF_TWO(sum, 4);
    105 
    106  diff = (int64_t)*sse - ((sum * sum) >> 4);
    107  return diff >= 0 ? (uint32_t)diff : 0;
    108 }
    109 
    110 // Sub-pixel
    111 uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1(
    112    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    113    const uint8_t *dst, int dst_stride, uint32_t *sse) {
    114  uint16_t fdata3[(4 + 1) * 4];
    115  uint16_t temp2[4 * 4];
    116 
    117  aom_highbd_var_filter_block2d_bil_first_pass(
    118      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
    119  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
    120                                                bilinear_filters_2t[yoffset]);
    121 
    122  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride,
    123                                  sse);
    124 }
    125 
    126 uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1(
    127    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    128    const uint8_t *dst, int dst_stride, uint32_t *sse) {
    129  uint16_t fdata3[(4 + 1) * 4];
    130  uint16_t temp2[4 * 4];
    131 
    132  aom_highbd_var_filter_block2d_bil_first_pass(
    133      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
    134  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
    135                                                bilinear_filters_2t[yoffset]);
    136 
    137  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
    138                                   dst_stride, sse);
    139 }
    140 
    141 uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1(
    142    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    143    const uint8_t *dst, int dst_stride, uint32_t *sse) {
    144  uint16_t fdata3[(4 + 1) * 4];
    145  uint16_t temp2[4 * 4];
    146 
    147  aom_highbd_var_filter_block2d_bil_first_pass(
    148      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
    149  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
    150                                                bilinear_filters_2t[yoffset]);
    151 
    152  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst,
    153                                   dst_stride, sse);
    154 }
    155 
    156 // Sub-pixel average
    157 
    158 uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
    159    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    160    const uint8_t *dst, int dst_stride, uint32_t *sse,
    161    const uint8_t *second_pred) {
    162  uint16_t fdata3[(4 + 1) * 4];
    163  uint16_t temp2[4 * 4];
    164  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
    165 
    166  aom_highbd_var_filter_block2d_bil_first_pass(
    167      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
    168  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
    169                                                bilinear_filters_2t[yoffset]);
    170 
    171  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
    172                           CONVERT_TO_BYTEPTR(temp2), 4);
    173 
    174  return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride,
    175                                  sse);
    176 }
    177 
    178 uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
    179    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    180    const uint8_t *dst, int dst_stride, uint32_t *sse,
    181    const uint8_t *second_pred) {
    182  uint16_t fdata3[(4 + 1) * 4];
    183  uint16_t temp2[4 * 4];
    184  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
    185 
    186  aom_highbd_var_filter_block2d_bil_first_pass(
    187      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
    188  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
    189                                                bilinear_filters_2t[yoffset]);
    190 
    191  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
    192                           CONVERT_TO_BYTEPTR(temp2), 4);
    193 
    194  return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
    195                                   dst_stride, sse);
    196 }
    197 
    198 uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
    199    const uint8_t *src, int src_stride, int xoffset, int yoffset,
    200    const uint8_t *dst, int dst_stride, uint32_t *sse,
    201    const uint8_t *second_pred) {
    202  uint16_t fdata3[(4 + 1) * 4];
    203  uint16_t temp2[4 * 4];
    204  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
    205 
    206  aom_highbd_var_filter_block2d_bil_first_pass(
    207      src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]);
    208  aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4,
    209                                                bilinear_filters_2t[yoffset]);
    210 
    211  aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4,
    212                           CONVERT_TO_BYTEPTR(temp2), 4);
    213 
    214  return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst,
    215                                   dst_stride, sse);
    216 }