cfl_ppc.c (7128B)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <altivec.h>

#include "config/av1_rtcd.h"

#include "av1/common/cfl.h"

// Byte offsets of the four consecutive 16-byte vectors within one row of the
// CfL buffer. Each vector covers eight 16-bit samples, so OFF_0..OFF_3 span a
// row of up to 32 samples.
#define OFF_0 0
#define OFF_1 16
#define OFF_2 32
#define OFF_3 48
// Byte offsets of rows 1..3 relative to the current row pointer.
// NOTE(review): these assume one buffer row is 64 bytes, i.e.
// CFL_BUF_LINE == 32 int16_t entries — confirm against av1/common/cfl.h.
#define CFL_LINE_1 64
#define CFL_LINE_2 128
#define CFL_LINE_3 192

// Local vector typedefs using ARM-NEON-style names for readability.
typedef vector signed char int8x16_t;          // NOLINT(runtime/int)
typedef vector unsigned char uint8x16_t;       // NOLINT(runtime/int)
typedef vector signed short int16x8_t;         // NOLINT(runtime/int)
typedef vector unsigned short uint16x8_t;      // NOLINT(runtime/int)
typedef vector signed int int32x4_t;           // NOLINT(runtime/int)
typedef vector unsigned int uint32x4_t;        // NOLINT(runtime/int)
typedef vector unsigned long long uint64x2_t;  // NOLINT(runtime/int)

// Computes the rounded average of the width x height block in src_ptr and
// stores (sample - average) for every sample into dst, producing the
// zero-mean "AC" contribution used by CfL prediction.
//
// src_ptr      - input samples, laid out in rows of CFL_BUF_LINE int16_t
//                entries (the buffer is reinterpreted as int16_t below).
// dst          - output, same CFL_BUF_LINE row layout.
// width/height - block dimensions; width is 8, 16 or 32 here (width 4 uses
//                the C fallback declared at the bottom of this file).
// round_offset - rounding bias added to the sum before the shift; the
//                wrapper macros pass num_pels / 2.
// num_pel_log2 - log2(width * height), used as the division shift.
static inline void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
                                        int width, int height, int round_offset,
                                        int num_pel_log2) {
  const int16_t *sum_buf = (const int16_t *)src_ptr;
  const int16_t *end = sum_buf + height * CFL_BUF_LINE;
  const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
  // Permute mask that swaps the two 64-bit halves of a vector; adding the
  // permuted vector to the original yields cross-half partial sums.
  const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
  // Permute mask that swaps 32-bit lanes pairwise (lane0<->lane1,
  // lane2<->lane3); after the following add every lane holds the full sum.
  const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
                               0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };

  // Two accumulators (even/odd rows) to break the dependency chain; the
  // rounding bias is pre-seeded into one lane so it ends up in the total.
  int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
  int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
  // Pass 1: sum all samples. Processes two rows per iteration, so height
  // must be even (all CfL block heights here are >= 4). vec_sum4s with
  // 16-bit inputs accumulates adjacent pairs of shorts into 32-bit lanes.
  do {
    sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
    sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
    if (width >= 16) {
      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
      sum_32x4_1 =
          vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
    }
    if (width == 32) {
      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
      sum_32x4_1 =
          vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
      sum_32x4_1 =
          vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1);
    }
    sum_buf += CFL_BUF_LINE * 2;
  } while (sum_buf < end);
  int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1);

  // Horizontal reduction: two permute+add steps leave the grand total
  // (including round_offset) replicated in all four 32-bit lanes.
  const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64);
  sum_32x4 = vec_add(sum_32x4, perm_64);
  const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32);
  sum_32x4 = vec_add(sum_32x4, perm_32);
  // avg = (sum + round_offset) >> num_pel_log2, then narrowed to eight
  // identical 16-bit lanes for the subtraction pass.
  const int32x4_t avg = vec_sr(sum_32x4, div_shift);
  const int16x8_t vec_avg = vec_pack(avg, avg);
  const int16_t *src = (const int16_t *)src_ptr;
  // Pass 2: store (sample - avg). Processes four rows per iteration, so
  // height must be a multiple of 4.
  do {
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, src), vec_avg), OFF_0, dst);
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, src), vec_avg),
               OFF_0 + CFL_LINE_1, dst);
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, src), vec_avg),
               OFF_0 + CFL_LINE_2, dst);
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, src), vec_avg),
               OFF_0 + CFL_LINE_3, dst);
    if (width >= 16) {
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, src), vec_avg), OFF_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, src), vec_avg),
                 OFF_1 + CFL_LINE_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, src), vec_avg),
                 OFF_1 + CFL_LINE_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, src), vec_avg),
                 OFF_1 + CFL_LINE_3, dst);
    }
    if (width == 32) {
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, src), vec_avg), OFF_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, src), vec_avg),
                 OFF_2 + CFL_LINE_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, src), vec_avg),
                 OFF_2 + CFL_LINE_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, src), vec_avg),
                 OFF_2 + CFL_LINE_3, dst);

      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, src), vec_avg), OFF_3, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, src), vec_avg),
                 OFF_3 + CFL_LINE_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, src), vec_avg),
                 OFF_3 + CFL_LINE_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, src), vec_avg),
                 OFF_3 + CFL_LINE_3, dst);
    }
    src += CFL_BUF_LINE * 4;
    dst += CFL_BUF_LINE * 4;
  } while (src < end);
}

// Declare wrappers for VSX sizes.
// Arguments: (arch, width, height, round_offset = num_pels / 2, num_pel_log2).
CFL_SUB_AVG_X(vsx, 8, 4, 16, 5)
CFL_SUB_AVG_X(vsx, 8, 8, 32, 6)
CFL_SUB_AVG_X(vsx, 8, 16, 64, 7)
CFL_SUB_AVG_X(vsx, 8, 32, 128, 8)
CFL_SUB_AVG_X(vsx, 16, 4, 32, 6)
CFL_SUB_AVG_X(vsx, 16, 8, 64, 7)
CFL_SUB_AVG_X(vsx, 16, 16, 128, 8)
CFL_SUB_AVG_X(vsx, 16, 32, 256, 9)
CFL_SUB_AVG_X(vsx, 32, 8, 128, 8)
CFL_SUB_AVG_X(vsx, 32, 16, 256, 9)
CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)

// Based on observation, for small blocks VSX does not outperform C (no 64bit
// load and store intrinsics). So we call the C code for block widths 4.
128 extern void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst); 129 extern void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst); 130 extern void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst); 131 132 cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) { 133 static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { 134 cfl_subtract_average_4x4_c, /* 4x4 */ 135 cfl_subtract_average_8x8_vsx, /* 8x8 */ 136 cfl_subtract_average_16x16_vsx, /* 16x16 */ 137 cfl_subtract_average_32x32_vsx, /* 32x32 */ 138 NULL, /* 64x64 (invalid CFL size) */ 139 cfl_subtract_average_4x8_c, /* 4x8 */ 140 cfl_subtract_average_8x4_vsx, /* 8x4 */ 141 cfl_subtract_average_8x16_vsx, /* 8x16 */ 142 cfl_subtract_average_16x8_vsx, /* 16x8 */ 143 cfl_subtract_average_16x32_vsx, /* 16x32 */ 144 cfl_subtract_average_32x16_vsx, /* 32x16 */ 145 NULL, /* 32x64 (invalid CFL size) */ 146 NULL, /* 64x32 (invalid CFL size) */ 147 cfl_subtract_average_4x16_c, /* 4x16 */ 148 cfl_subtract_average_16x4_vsx, /* 16x4 */ 149 cfl_subtract_average_8x32_vsx, /* 8x32 */ 150 cfl_subtract_average_32x8_vsx, /* 32x8 */ 151 NULL, /* 16x64 (invalid CFL size) */ 152 NULL, /* 64x16 (invalid CFL size) */ 153 }; 154 // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to 155 // index the function pointer array out of bounds. 156 return sub_avg[tx_size % TX_SIZES_ALL]; 157 }