cfl_ppc.c (7128B)
/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <altivec.h>

#include "config/av1_rtcd.h"

#include "av1/common/cfl.h"

// Byte offsets of the four consecutive 16-byte vectors within one row of the
// CfL buffer. Each vector covers eight 16-bit samples, so OFF_0..OFF_3 span a
// row of up to 32 samples.
#define OFF_0 0
#define OFF_1 16
#define OFF_2 32
#define OFF_3 48
// Byte offsets of rows 1..3 relative to the current row pointer.
// NOTE(review): these assume one buffer row is 64 bytes, i.e.
// CFL_BUF_LINE == 32 int16_t entries — confirm against av1/common/cfl.h.
#define CFL_LINE_1 64
#define CFL_LINE_2 128
#define CFL_LINE_3 192

// Local vector typedefs using ARM-NEON-style names for readability.
typedef vector signed char int8x16_t;          // NOLINT(runtime/int)
typedef vector unsigned char uint8x16_t;       // NOLINT(runtime/int)
typedef vector signed short int16x8_t;         // NOLINT(runtime/int)
typedef vector unsigned short uint16x8_t;      // NOLINT(runtime/int)
typedef vector signed int int32x4_t;           // NOLINT(runtime/int)
typedef vector unsigned int uint32x4_t;        // NOLINT(runtime/int)
typedef vector unsigned long long uint64x2_t;  // NOLINT(runtime/int)

// Computes the rounded average of the width x height block in src_ptr and
// stores (sample - average) for every sample into dst, producing the
// zero-mean "AC" contribution used by CfL prediction.
//
// src_ptr      - input samples, laid out in rows of CFL_BUF_LINE int16_t
//                entries (the buffer is reinterpreted as int16_t below).
// dst          - output, same CFL_BUF_LINE row layout.
// width/height - block dimensions; width is 8, 16 or 32 here (width 4 uses
//                the C fallback declared at the bottom of this file).
// round_offset - rounding bias added to the sum before the shift; the
//                wrapper macros pass num_pels / 2.
// num_pel_log2 - log2(width * height), used as the division shift.
static inline void subtract_average_vsx(const uint16_t *src_ptr, int16_t *dst,
                                        int width, int height, int round_offset,
                                        int num_pel_log2) {
  const int16_t *sum_buf = (const int16_t *)src_ptr;
  const int16_t *end = sum_buf + height * CFL_BUF_LINE;
  const uint32x4_t div_shift = vec_splats((uint32_t)num_pel_log2);
  // Permute mask that swaps the two 64-bit halves of a vector; adding the
  // permuted vector to the original yields cross-half partial sums.
  const uint8x16_t mask_64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                               0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 };
  // Permute mask that swaps 32-bit lanes pairwise (lane0<->lane1,
  // lane2<->lane3); after the following add every lane holds the full sum.
  const uint8x16_t mask_32 = { 0x14, 0x15, 0x16, 0x17, 0x00, 0x01, 0x02, 0x03,
                               0x1C, 0x1D, 0x1E, 0x1F, 0x08, 0x09, 0x0A, 0x0B };

  // Two accumulators (even/odd rows) to break the dependency chain; the
  // rounding bias is pre-seeded into one lane so it ends up in the total.
  int32x4_t sum_32x4_0 = { 0, 0, 0, round_offset };
  int32x4_t sum_32x4_1 = { 0, 0, 0, 0 };
  // Pass 1: sum all samples. Processes two rows per iteration, so height
  // must be even (all CfL block heights here are >= 4). vec_sum4s with
  // 16-bit inputs accumulates adjacent pairs of shorts into 32-bit lanes.
  do {
    sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_0, sum_buf), sum_32x4_0);
    sum_32x4_1 = vec_sum4s(vec_vsx_ld(OFF_0 + CFL_LINE_1, sum_buf), sum_32x4_1);
    if (width >= 16) {
      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_1, sum_buf), sum_32x4_0);
      sum_32x4_1 =
          vec_sum4s(vec_vsx_ld(OFF_1 + CFL_LINE_1, sum_buf), sum_32x4_1);
    }
    if (width == 32) {
      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_2, sum_buf), sum_32x4_0);
      sum_32x4_1 =
          vec_sum4s(vec_vsx_ld(OFF_2 + CFL_LINE_1, sum_buf), sum_32x4_1);
      sum_32x4_0 = vec_sum4s(vec_vsx_ld(OFF_3, sum_buf), sum_32x4_0);
      sum_32x4_1 =
          vec_sum4s(vec_vsx_ld(OFF_3 + CFL_LINE_1, sum_buf), sum_32x4_1);
    }
    sum_buf += CFL_BUF_LINE * 2;
  } while (sum_buf < end);
  int32x4_t sum_32x4 = vec_add(sum_32x4_0, sum_32x4_1);

  // Horizontal reduction: two permute+add steps leave the grand total
  // (including round_offset) replicated in all four 32-bit lanes.
  const int32x4_t perm_64 = vec_perm(sum_32x4, sum_32x4, mask_64);
  sum_32x4 = vec_add(sum_32x4, perm_64);
  const int32x4_t perm_32 = vec_perm(sum_32x4, sum_32x4, mask_32);
  sum_32x4 = vec_add(sum_32x4, perm_32);
  // avg = (sum + round_offset) >> num_pel_log2, then narrowed to eight
  // identical 16-bit lanes for the subtraction pass.
  const int32x4_t avg = vec_sr(sum_32x4, div_shift);
  const int16x8_t vec_avg = vec_pack(avg, avg);
  const int16_t *src = (const int16_t *)src_ptr;
  // Pass 2: store (sample - avg). Processes four rows per iteration, so
  // height must be a multiple of 4.
  do {
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0, src), vec_avg), OFF_0, dst);
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_1, src), vec_avg),
               OFF_0 + CFL_LINE_1, dst);
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_2, src), vec_avg),
               OFF_0 + CFL_LINE_2, dst);
    vec_vsx_st(vec_sub(vec_vsx_ld(OFF_0 + CFL_LINE_3, src), vec_avg),
               OFF_0 + CFL_LINE_3, dst);
    if (width >= 16) {
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1, src), vec_avg), OFF_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_1, src), vec_avg),
                 OFF_1 + CFL_LINE_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_2, src), vec_avg),
                 OFF_1 + CFL_LINE_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_1 + CFL_LINE_3, src), vec_avg),
                 OFF_1 + CFL_LINE_3, dst);
    }
    if (width == 32) {
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2, src), vec_avg), OFF_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_1, src), vec_avg),
                 OFF_2 + CFL_LINE_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_2, src), vec_avg),
                 OFF_2 + CFL_LINE_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_2 + CFL_LINE_3, src), vec_avg),
                 OFF_2 + CFL_LINE_3, dst);

      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3, src), vec_avg), OFF_3, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_1, src), vec_avg),
                 OFF_3 + CFL_LINE_1, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_2, src), vec_avg),
                 OFF_3 + CFL_LINE_2, dst);
      vec_vsx_st(vec_sub(vec_vsx_ld(OFF_3 + CFL_LINE_3, src), vec_avg),
                 OFF_3 + CFL_LINE_3, dst);
    }
    src += CFL_BUF_LINE * 4;
    dst += CFL_BUF_LINE * 4;
  } while (src < end);
}

// Declare wrappers for VSX sizes.
// Arguments: (arch, width, height, round_offset = num_pels / 2, num_pel_log2).
CFL_SUB_AVG_X(vsx, 8, 4, 16, 5)
CFL_SUB_AVG_X(vsx, 8, 8, 32, 6)
CFL_SUB_AVG_X(vsx, 8, 16, 64, 7)
CFL_SUB_AVG_X(vsx, 8, 32, 128, 8)
CFL_SUB_AVG_X(vsx, 16, 4, 32, 6)
CFL_SUB_AVG_X(vsx, 16, 8, 64, 7)
CFL_SUB_AVG_X(vsx, 16, 16, 128, 8)
CFL_SUB_AVG_X(vsx, 16, 32, 256, 9)
CFL_SUB_AVG_X(vsx, 32, 8, 128, 8)
CFL_SUB_AVG_X(vsx, 32, 16, 256, 9)
CFL_SUB_AVG_X(vsx, 32, 32, 512, 10)

// Based on observation, for small blocks VSX does not outperform C (no 64bit
// load and store intrinsics). So we call the C code for block widths 4.
128 extern void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst); 129 extern void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst); 130 extern void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst); 131 132 cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) { 133 static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { 134 cfl_subtract_average_4x4_c, /* 4x4 */ 135 cfl_subtract_average_8x8_vsx, /* 8x8 */ 136 cfl_subtract_average_16x16_vsx, /* 16x16 */ 137 cfl_subtract_average_32x32_vsx, /* 32x32 */ 138 NULL, /* 64x64 (invalid CFL size) */ 139 cfl_subtract_average_4x8_c, /* 4x8 */ 140 cfl_subtract_average_8x4_vsx, /* 8x4 */ 141 cfl_subtract_average_8x16_vsx, /* 8x16 */ 142 cfl_subtract_average_16x8_vsx, /* 16x8 */ 143 cfl_subtract_average_16x32_vsx, /* 16x32 */ 144 cfl_subtract_average_32x16_vsx, /* 32x16 */ 145 NULL, /* 32x64 (invalid CFL size) */ 146 NULL, /* 64x32 (invalid CFL size) */ 147 cfl_subtract_average_4x16_c, /* 4x16 */ 148 cfl_subtract_average_16x4_vsx, /* 16x4 */ 149 cfl_subtract_average_8x32_vsx, /* 8x32 */ 150 cfl_subtract_average_32x8_vsx, /* 32x8 */ 151 NULL, /* 16x64 (invalid CFL size) */ 152 NULL, /* 64x16 (invalid CFL size) */ 153 }; 154 // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to 155 // index the function pointer array out of bounds. 156 return sub_avg[tx_size % TX_SIZES_ALL]; 157 }