highbd_variance_sse4.c (8238B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <smmintrin.h> /* SSE4.1 */ 13 14 #include "config/aom_config.h" 15 #include "config/aom_dsp_rtcd.h" 16 17 #include "aom_dsp/variance.h" 18 #include "aom_dsp/aom_filter.h" 19 20 static inline void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride, 21 const uint8_t *b8, int b_stride, 22 uint64_t *sse, int64_t *sum) { 23 __m128i u0, u1, u2, u3; 24 __m128i s0, s1, s2, s3; 25 __m128i t0, t1, x0, y0; 26 __m128i a0, a1, a2, a3; 27 __m128i b0, b1, b2, b3; 28 __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1); 29 30 uint16_t *a = CONVERT_TO_SHORTPTR(a8); 31 uint16_t *b = CONVERT_TO_SHORTPTR(b8); 32 33 a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride)); 34 a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride)); 35 a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride)); 36 a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride)); 37 38 b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride)); 39 b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride)); 40 b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride)); 41 b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride)); 42 43 u0 = _mm_unpacklo_epi16(a0, a1); 44 u1 = _mm_unpacklo_epi16(a2, a3); 45 u2 = _mm_unpacklo_epi16(b0, b1); 46 u3 = _mm_unpacklo_epi16(b2, b3); 47 48 s0 = _mm_sub_epi16(u0, u2); 49 s1 = _mm_sub_epi16(u1, u3); 50 51 t0 = _mm_madd_epi16(s0, k_one_epi16); 52 t1 = _mm_madd_epi16(s1, k_one_epi16); 53 54 s2 = _mm_hadd_epi32(t0, t1); 55 s3 = _mm_hadd_epi32(s2, s2); 56 y0 = _mm_hadd_epi32(s3, s3); 57 58 t0 = _mm_madd_epi16(s0, s0); 59 t1 = _mm_madd_epi16(s1, s1); 60 61 s2 = _mm_hadd_epi32(t0, t1); 62 s3 = _mm_hadd_epi32(s2, s2); 63 x0 = _mm_hadd_epi32(s3, s3); 64 65 *sse = (uint64_t)_mm_extract_epi32(x0, 0); 66 *sum = (int64_t)_mm_extract_epi32(y0, 0); 67 } 68 69 uint32_t aom_highbd_8_variance4x4_sse4_1(const uint8_t *a, int a_stride, 70 const uint8_t *b, int b_stride, 71 uint32_t *sse) { 72 int64_t sum, diff; 73 uint64_t local_sse; 74 75 variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); 76 *sse = (uint32_t)local_sse; 77 78 diff = (int64_t)*sse - ((sum * sum) >> 4); 79 return (diff >= 0) ? (uint32_t)diff : 0; 80 } 81 82 uint32_t aom_highbd_10_variance4x4_sse4_1(const uint8_t *a, int a_stride, 83 const uint8_t *b, int b_stride, 84 uint32_t *sse) { 85 int64_t sum, diff; 86 uint64_t local_sse; 87 88 variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); 89 *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4); 90 sum = ROUND_POWER_OF_TWO(sum, 2); 91 92 diff = (int64_t)*sse - ((sum * sum) >> 4); 93 return (diff >= 0) ? (uint32_t)diff : 0; 94 } 95 96 uint32_t aom_highbd_12_variance4x4_sse4_1(const uint8_t *a, int a_stride, 97 const uint8_t *b, int b_stride, 98 uint32_t *sse) { 99 int64_t sum, diff; 100 uint64_t local_sse; 101 102 variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum); 103 *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8); 104 sum = ROUND_POWER_OF_TWO(sum, 4); 105 106 diff = (int64_t)*sse - ((sum * sum) >> 4); 107 return diff >= 0 ? (uint32_t)diff : 0; 108 } 109 110 // Sub-pixel 111 uint32_t aom_highbd_8_sub_pixel_variance4x4_sse4_1( 112 const uint8_t *src, int src_stride, int xoffset, int yoffset, 113 const uint8_t *dst, int dst_stride, uint32_t *sse) { 114 uint16_t fdata3[(4 + 1) * 4]; 115 uint16_t temp2[4 * 4]; 116 117 aom_highbd_var_filter_block2d_bil_first_pass( 118 src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); 119 aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, 120 bilinear_filters_2t[yoffset]); 121 122 return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, dst_stride, 123 sse); 124 } 125 126 uint32_t aom_highbd_10_sub_pixel_variance4x4_sse4_1( 127 const uint8_t *src, int src_stride, int xoffset, int yoffset, 128 const uint8_t *dst, int dst_stride, uint32_t *sse) { 129 uint16_t fdata3[(4 + 1) * 4]; 130 uint16_t temp2[4 * 4]; 131 132 aom_highbd_var_filter_block2d_bil_first_pass( 133 src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); 134 aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, 135 bilinear_filters_2t[yoffset]); 136 137 return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, 138 dst_stride, sse); 139 } 140 141 uint32_t aom_highbd_12_sub_pixel_variance4x4_sse4_1( 142 const uint8_t *src, int src_stride, int xoffset, int yoffset, 143 const uint8_t *dst, int dst_stride, uint32_t *sse) { 144 uint16_t fdata3[(4 + 1) * 4]; 145 uint16_t temp2[4 * 4]; 146 147 aom_highbd_var_filter_block2d_bil_first_pass( 148 src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); 149 aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, 150 bilinear_filters_2t[yoffset]); 151 152 return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2), 4, dst, 153 dst_stride, sse); 154 } 155 156 // Sub-pixel average 157 158 uint32_t aom_highbd_8_sub_pixel_avg_variance4x4_sse4_1( 159 const uint8_t *src, int src_stride, int xoffset, int yoffset, 160 const uint8_t *dst, int dst_stride, uint32_t *sse, 161 const uint8_t *second_pred) { 162 uint16_t fdata3[(4 + 1) * 4]; 163 uint16_t temp2[4 * 4]; 164 DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); 165 166 aom_highbd_var_filter_block2d_bil_first_pass( 167 src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); 168 aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, 169 bilinear_filters_2t[yoffset]); 170 171 aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, 172 CONVERT_TO_BYTEPTR(temp2), 4); 173 174 return aom_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, dst_stride, 175 sse); 176 } 177 178 uint32_t aom_highbd_10_sub_pixel_avg_variance4x4_sse4_1( 179 const uint8_t *src, int src_stride, int xoffset, int yoffset, 180 const uint8_t *dst, int dst_stride, uint32_t *sse, 181 const uint8_t *second_pred) { 182 uint16_t fdata3[(4 + 1) * 4]; 183 uint16_t temp2[4 * 4]; 184 DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); 185 186 aom_highbd_var_filter_block2d_bil_first_pass( 187 src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); 188 aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, 189 bilinear_filters_2t[yoffset]); 190 191 aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, 192 CONVERT_TO_BYTEPTR(temp2), 4); 193 194 return aom_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, 195 dst_stride, sse); 196 } 197 198 uint32_t aom_highbd_12_sub_pixel_avg_variance4x4_sse4_1( 199 const uint8_t *src, int src_stride, int xoffset, int yoffset, 200 const uint8_t *dst, int dst_stride, uint32_t *sse, 201 const uint8_t *second_pred) { 202 uint16_t fdata3[(4 + 1) * 4]; 203 uint16_t temp2[4 * 4]; 204 DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]); 205 206 aom_highbd_var_filter_block2d_bil_first_pass( 207 src, fdata3, src_stride, 1, 4 + 1, 4, bilinear_filters_2t[xoffset]); 208 aom_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, 209 bilinear_filters_2t[yoffset]); 210 211 aom_highbd_comp_avg_pred(CONVERT_TO_BYTEPTR(temp3), second_pred, 4, 4, 212 CONVERT_TO_BYTEPTR(temp2), 4); 213 214 return aom_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3), 4, dst, 215 dst_stride, sse); 216 }