wedge_utils_sse2.c
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <assert.h>
#include <immintrin.h>

#include "aom_dsp/x86/synonyms.h"

#include "aom/aom_integer.h"

#include "av1/common/reconinter.h"

#define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS)

/**
 * See av1_wedge_sse_from_residuals_c
 */
uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d,
                                           const uint8_t *m, int N) {
  int n = -N;
  int n8 = n + 8;

  uint64_t csse;

  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
  // 0x00000000ffffffff in each 64-bit lane, used to zero-extend low dwords.
  const __m128i v_zext_q = _mm_set1_epi64x(~0u);

  __m128i v_acc0_q = _mm_setzero_si128();

  assert(N % 64 == 0);

  r1 += N;
  d += N;
  m += N;

  do {
    const __m128i v_r0_w = xx_load_128(r1 + n);
    const __m128i v_r1_w = xx_load_128(r1 + n8);
    const __m128i v_d0_w = xx_load_128(d + n);
    const __m128i v_d1_w = xx_load_128(d + n8);
    const __m128i v_m01_b = xx_load_128(m + n);

    // Interleave d with r1, and the widened mask with MAX_MASK_VALUE, so
    // that _mm_madd_epi16 computes m * d + MAX_MASK_VALUE * r1 per pixel.
    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());

    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);

    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);

    // Saturate the blended values to int16, then square and pairwise-add
    // into 32-bit lanes.
    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);

    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);

    // Widen each 32-bit lane to 64 bits before accumulating.
    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
                                           _mm_srli_epi64(v_sq0_d, 32));
    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
                                           _mm_srli_epi64(v_sq1_d, 32));

    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);

    n8 += 16;
    n += 16;
  } while (n);

  // Fold the two 64-bit halves of the accumulator together.
  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));

#if AOM_ARCH_X86_64
  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
  xx_storel_64(&csse, v_acc0_q);
#endif

  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}
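/*
 * A minimal scalar sketch of what the kernel above computes, kept here for
 * reference only. The function name and the open-coded clamp are
 * illustrative, not libaom API; see av1_wedge_sse_from_residuals_c for the
 * real reference implementation. Each pixel blends the residuals as
 * MAX_MASK_VALUE * r1[i] + m[i] * d[i], saturates to int16 (mirroring
 * _mm_packs_epi32 above), squares, accumulates, and rounds off the mask
 * scaling at the end.
 */
static inline uint64_t wedge_sse_scalar_sketch(const int16_t *r1,
                                               const int16_t *d,
                                               const uint8_t *m, int N) {
  uint64_t csse = 0;
  for (int i = 0; i < N; ++i) {
    int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
    if (t > INT16_MAX) t = INT16_MAX;  // Saturate as _mm_packs_epi32 does.
    if (t < INT16_MIN) t = INT16_MIN;
    csse += (uint64_t)(t * t);
  }
  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}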
/**
 * See av1_wedge_sign_from_residuals_c
 */
int8_t av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m,
                                          int N, int64_t limit) {
  int64_t acc;

  __m128i v_sign_d;
  __m128i v_acc0_d = _mm_setzero_si128();
  __m128i v_acc1_d = _mm_setzero_si128();
  __m128i v_acc_q;

  // Input size limited to 8192 by the use of 32 bit accumulators and m
  // being between [0, 64]. Overflow might happen at larger sizes,
  // though it is practically impossible on real video input.
  assert(N < 8192);
  assert(N % 64 == 0);

  do {
    const __m128i v_m01_b = xx_load_128(m);
    const __m128i v_m23_b = xx_load_128(m + 16);
    const __m128i v_m45_b = xx_load_128(m + 32);
    const __m128i v_m67_b = xx_load_128(m + 48);

    const __m128i v_d0_w = xx_load_128(ds);
    const __m128i v_d1_w = xx_load_128(ds + 8);
    const __m128i v_d2_w = xx_load_128(ds + 16);
    const __m128i v_d3_w = xx_load_128(ds + 24);
    const __m128i v_d4_w = xx_load_128(ds + 32);
    const __m128i v_d5_w = xx_load_128(ds + 40);
    const __m128i v_d6_w = xx_load_128(ds + 48);
    const __m128i v_d7_w = xx_load_128(ds + 56);

    // Widen the mask bytes to 16 bits so they can feed _mm_madd_epi16.
    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
    const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
    const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
    const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
    const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
    const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
    const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());

    const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
    const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
    const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
    const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
    const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
    const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
    const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
    const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);

    const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
    const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
    const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
    const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);

    const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
    const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);

    v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
    v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);

    ds += 64;
    m += 64;

    N -= 64;
  } while (N);

  // Sign-extend the 32-bit partial sums to 64 bits (the compare broadcasts
  // each lane's sign bit) and fold them into 64-bit lanes.
  v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
  v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
                           _mm_unpackhi_epi32(v_acc0_d, v_sign_d));

  v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
  v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
                           _mm_unpackhi_epi32(v_acc1_d, v_sign_d));

  v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);

  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));

#if AOM_ARCH_X86_64
  acc = _mm_cvtsi128_si64(v_acc_q);
#else
  xx_storel_64(&acc, v_acc_q);
#endif

  return acc > limit;
}
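/*
 * Scalar sketch of the reduction above, for reference only; the name is
 * illustrative, not a libaom symbol (see av1_wedge_sign_from_residuals_c for
 * the real reference). It accumulates the mask-weighted sum of ds -- which,
 * in the encoder, holds per-pixel a*a - b*b values such as those produced by
 * av1_wedge_compute_delta_squares_sse2 below -- and compares it against the
 * threshold.
 */
static inline int8_t wedge_sign_scalar_sketch(const int16_t *ds,
                                              const uint8_t *m, int N,
                                              int64_t limit) {
  int64_t acc = 0;
  for (int i = 0; i < N; ++i) acc += (int64_t)ds[i] * m[i];
  return acc > limit;
}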
// Negate under mask
static inline __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
  return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
}

/**
 * See av1_wedge_compute_delta_squares_c
 */
void av1_wedge_compute_delta_squares_sse2(int16_t *d, const int16_t *a,
                                          const int16_t *b, int N) {
  // All-ones in the odd 16-bit lanes, selecting the b values for negation.
  const __m128i v_neg_w = _mm_set_epi16((short)0xffff, 0, (short)0xffff, 0,
                                        (short)0xffff, 0, (short)0xffff, 0);

  assert(N % 64 == 0);

  do {
    const __m128i v_a0_w = xx_load_128(a);
    const __m128i v_b0_w = xx_load_128(b);
    const __m128i v_a1_w = xx_load_128(a + 8);
    const __m128i v_b1_w = xx_load_128(b + 8);
    const __m128i v_a2_w = xx_load_128(a + 16);
    const __m128i v_b2_w = xx_load_128(b + 16);
    const __m128i v_a3_w = xx_load_128(a + 24);
    const __m128i v_b3_w = xx_load_128(b + 24);

    // Interleave a and b into [a, b] pairs.
    const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
    const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
    const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
    const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
    const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
    const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
    const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
    const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);

    // Negate top word of pairs
    const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
    const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
    const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
    const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
    const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
    const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
    const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
    const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);

    // _mm_madd_epi16 of [a, b] with [a, -b] yields a*a - b*b per 32-bit lane.
    const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
    const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
    const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
    const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
    const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
    const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
    const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
    const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);

    // Saturate the 32-bit differences back down to int16.
    const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
    const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
    const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
    const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);

    xx_store_128(d, v_r0_w);
    xx_store_128(d + 8, v_r1_w);
    xx_store_128(d + 16, v_r2_w);
    xx_store_128(d + 24, v_r3_w);

    a += 32;
    b += 32;
    d += 32;
    N -= 32;
  } while (N);
}
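/*
 * Scalar sketch of the kernel above, for reference only; the name is
 * illustrative, not a libaom symbol (see av1_wedge_compute_delta_squares_c
 * for the real reference). Each output is a[i]^2 - b[i]^2, saturated to the
 * int16 range just as _mm_packs_epi32 saturates the 32-bit lane results.
 */
static inline void wedge_delta_squares_scalar_sketch(int16_t *d,
                                                     const int16_t *a,
                                                     const int16_t *b, int N) {
  for (int i = 0; i < N; ++i) {
    const int32_t t = (int32_t)a[i] * a[i] - (int32_t)b[i] * b[i];
    d[i] = (int16_t)(t > INT16_MAX ? INT16_MAX
                                   : (t < INT16_MIN ? INT16_MIN : t));
  }
}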