cdef_block_avx2.c
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include "aom_dsp/aom_simd.h"
#define SIMD_FUNC(name) name##_avx2
#include "av1/common/cdef_block_simd.h"

/* partial A is a 16-bit vector of the form:
   [x8 - - x1 | x16 - - x9] and partial B has the form:
   [0 y1 - y7 | 0 y9 - y15].
   This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
   (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 on each 128-bit lane. Here the C1..C8
   constants are in const1 and const2. */
static inline __m256i fold_mul_and_sum_avx2(__m256i *partiala,
                                            __m256i *partialb,
                                            const __m256i *const1,
                                            const __m256i *const2) {
  // Byte-shuffle mask that reverses the order of the 16-bit elements in each
  // 128-bit lane (the last element stays in place).
  static const int shuffle_reg_256bit[8] = { 0x0b0a0d0c, 0x07060908,
                                             0x03020504, 0x0f0e0100,
                                             0x0b0a0d0c, 0x07060908,
                                             0x03020504, 0x0f0e0100 };
  __m256i tmp;
  /* Reverse partial B. */
  *partialb = _mm256_shuffle_epi8(
      *partialb, _mm256_loadu_si256((const __m256i *)shuffle_reg_256bit));

  /* Interleave the x and y values of identical indices and pair x8 with 0. */
  tmp = *partiala;
  *partiala = _mm256_unpacklo_epi16(*partiala, *partialb);
  *partialb = _mm256_unpackhi_epi16(tmp, *partialb);

  /* Square and add the corresponding x and y values. */
  *partiala = _mm256_madd_epi16(*partiala, *partiala);
  *partialb = _mm256_madd_epi16(*partialb, *partialb);
  /* Multiply by constant. */
  *partiala = _mm256_mullo_epi32(*partiala, *const1);
  *partialb = _mm256_mullo_epi32(*partialb, *const2);
  /* Sum all results. */
  *partiala = _mm256_add_epi32(*partiala, *partialb);
  return *partiala;
}

static inline __m256i hsum4_avx2(__m256i *x0, __m256i *x1, __m256i *x2,
                                 __m256i *x3) {
  const __m256i t0 = _mm256_unpacklo_epi32(*x0, *x1);
  const __m256i t1 = _mm256_unpacklo_epi32(*x2, *x3);
  const __m256i t2 = _mm256_unpackhi_epi32(*x0, *x1);
  const __m256i t3 = _mm256_unpackhi_epi32(*x2, *x3);

  *x0 = _mm256_unpacklo_epi64(t0, t1);
  *x1 = _mm256_unpackhi_epi64(t0, t1);
  *x2 = _mm256_unpacklo_epi64(t2, t3);
  *x3 = _mm256_unpackhi_epi64(t2, t3);
  return _mm256_add_epi32(_mm256_add_epi32(*x0, *x1),
                          _mm256_add_epi32(*x2, *x3));
}
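/* For reference, a minimal scalar sketch of the per-lane quantity that
   fold_mul_and_sum_avx2 feeds into hsum4_avx2. This helper is illustrative
   only (hypothetical, not used by the build): x and y hold the folded pairs
   described above, with y[7] == 0, and c holds the C1..C8 weights. */
static inline int32_t fold_mul_and_sum_scalar_sketch(const int16_t x[8],
                                                     const int16_t y[8],
                                                     const int32_t c[8]) {
  int32_t sum = 0;
  for (int i = 0; i < 8; i++) {
    sum += (x[i] * x[i] + y[i] * y[i]) * c[i];  // (xi^2 + yi^2) * Ci
  }
  return sum;
}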
/* Computes cost for directions 4, 5, 6 and 7. We can call this function again
   to compute the remaining directions. */
static inline __m256i compute_directions_avx2(__m256i *lines,
                                              int32_t cost_first_8x8[4],
                                              int32_t cost_second_8x8[4]) {
  __m256i partial4a, partial4b, partial5a, partial5b, partial7a, partial7b;
  __m256i partial6;
  __m256i tmp;
  /* Partial sums for lines 0 and 1. */
  partial4a = _mm256_slli_si256(lines[0], 14);
  partial4b = _mm256_srli_si256(lines[0], 2);
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[1], 12));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[1], 4));
  tmp = _mm256_add_epi16(lines[0], lines[1]);
  partial5a = _mm256_slli_si256(tmp, 10);
  partial5b = _mm256_srli_si256(tmp, 6);
  partial7a = _mm256_slli_si256(tmp, 4);
  partial7b = _mm256_srli_si256(tmp, 12);
  partial6 = tmp;

  /* Partial sums for lines 2 and 3. */
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[2], 10));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[2], 6));
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[3], 8));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[3], 8));
  tmp = _mm256_add_epi16(lines[2], lines[3]);
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 8));
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 8));
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 6));
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 10));
  partial6 = _mm256_add_epi16(partial6, tmp);

  /* Partial sums for lines 4 and 5. */
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[4], 6));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[4], 10));
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[5], 4));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[5], 12));
  tmp = _mm256_add_epi16(lines[4], lines[5]);
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 6));
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 10));
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 8));
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 8));
  partial6 = _mm256_add_epi16(partial6, tmp);

  /* Partial sums for lines 6 and 7. */
  partial4a = _mm256_add_epi16(partial4a, _mm256_slli_si256(lines[6], 2));
  partial4b = _mm256_add_epi16(partial4b, _mm256_srli_si256(lines[6], 14));
  partial4a = _mm256_add_epi16(partial4a, lines[7]);
  tmp = _mm256_add_epi16(lines[6], lines[7]);
  partial5a = _mm256_add_epi16(partial5a, _mm256_slli_si256(tmp, 4));
  partial5b = _mm256_add_epi16(partial5b, _mm256_srli_si256(tmp, 12));
  partial7a = _mm256_add_epi16(partial7a, _mm256_slli_si256(tmp, 10));
  partial7b = _mm256_add_epi16(partial7b, _mm256_srli_si256(tmp, 6));
  partial6 = _mm256_add_epi16(partial6, tmp);

  const __m256i const_reg_1 =
      _mm256_set_epi32(210, 280, 420, 840, 210, 280, 420, 840);
  const __m256i const_reg_2 =
      _mm256_set_epi32(105, 120, 140, 168, 105, 120, 140, 168);
  const __m256i const_reg_3 = _mm256_set_epi32(210, 420, 0, 0, 210, 420, 0, 0);
  const __m256i const_reg_4 =
      _mm256_set_epi32(105, 105, 105, 140, 105, 105, 105, 140);
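  /* Each nonzero weight above is 840/N, where N is the number of pixels
     summed along the corresponding line: 840 is the least common multiple of
     1..8, so every weight stays an integer. This normalizes partial sums
     taken over lines of different lengths before the costs are compared (the
     840 scale is later divided out only approximately; see the shift at the
     end of cdef_find_dir_dual_avx2). */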
  /* Compute costs in terms of partial sums. */
  partial4a =
      fold_mul_and_sum_avx2(&partial4a, &partial4b, &const_reg_1, &const_reg_2);
  partial7a =
      fold_mul_and_sum_avx2(&partial7a, &partial7b, &const_reg_3, &const_reg_4);
  partial5a =
      fold_mul_and_sum_avx2(&partial5a, &partial5b, &const_reg_3, &const_reg_4);
  partial6 = _mm256_madd_epi16(partial6, partial6);
  partial6 = _mm256_mullo_epi32(partial6, _mm256_set1_epi32(105));

  partial4a = hsum4_avx2(&partial4a, &partial5a, &partial6, &partial7a);
  _mm_storeu_si128((__m128i *)cost_first_8x8,
                   _mm256_castsi256_si128(partial4a));
  _mm_storeu_si128((__m128i *)cost_second_8x8,
                   _mm256_extractf128_si256(partial4a, 1));

  return partial4a;
}

/* Transpose and reverse the order of the lines -- equivalent to a 90-degree
   counter-clockwise rotation of the pixels. */
static inline void array_reverse_transpose_8x8_avx2(__m256i *in, __m256i *res) {
  const __m256i tr0_0 = _mm256_unpacklo_epi16(in[0], in[1]);
  const __m256i tr0_1 = _mm256_unpacklo_epi16(in[2], in[3]);
  const __m256i tr0_2 = _mm256_unpackhi_epi16(in[0], in[1]);
  const __m256i tr0_3 = _mm256_unpackhi_epi16(in[2], in[3]);
  const __m256i tr0_4 = _mm256_unpacklo_epi16(in[4], in[5]);
  const __m256i tr0_5 = _mm256_unpacklo_epi16(in[6], in[7]);
  const __m256i tr0_6 = _mm256_unpackhi_epi16(in[4], in[5]);
  const __m256i tr0_7 = _mm256_unpackhi_epi16(in[6], in[7]);

  const __m256i tr1_0 = _mm256_unpacklo_epi32(tr0_0, tr0_1);
  const __m256i tr1_1 = _mm256_unpacklo_epi32(tr0_4, tr0_5);
  const __m256i tr1_2 = _mm256_unpackhi_epi32(tr0_0, tr0_1);
  const __m256i tr1_3 = _mm256_unpackhi_epi32(tr0_4, tr0_5);
  const __m256i tr1_4 = _mm256_unpacklo_epi32(tr0_2, tr0_3);
  const __m256i tr1_5 = _mm256_unpacklo_epi32(tr0_6, tr0_7);
  const __m256i tr1_6 = _mm256_unpackhi_epi32(tr0_2, tr0_3);
  const __m256i tr1_7 = _mm256_unpackhi_epi32(tr0_6, tr0_7);

  res[7] = _mm256_unpacklo_epi64(tr1_0, tr1_1);
  res[6] = _mm256_unpackhi_epi64(tr1_0, tr1_1);
  res[5] = _mm256_unpacklo_epi64(tr1_2, tr1_3);
  res[4] = _mm256_unpackhi_epi64(tr1_2, tr1_3);
  res[3] = _mm256_unpacklo_epi64(tr1_4, tr1_5);
  res[2] = _mm256_unpackhi_epi64(tr1_4, tr1_5);
  res[1] = _mm256_unpacklo_epi64(tr1_6, tr1_7);
  res[0] = _mm256_unpackhi_epi64(tr1_6, tr1_7);
}
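/* Note: the routine below finds the direction of two 8x8 blocks at once. The
   rows of img1 sit in the low 128-bit lane of each register and the rows of
   img2 in the high lane, so the lane-local AVX2 operations effectively run
   the 128-bit direction search twice in parallel. */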
void cdef_find_dir_dual_avx2(const uint16_t *img1, const uint16_t *img2,
                             int stride, int32_t *var_out_1st,
                             int32_t *var_out_2nd, int coeff_shift,
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  int32_t cost_first_8x8[8];
  int32_t cost_second_8x8[8];
  // Best cost for each of the two 8x8 blocks.
  int32_t best_cost[2] = { 0 };
  // Best direction for each of the two 8x8 blocks.
  int best_dir[2] = { 0 };

  const __m128i const_coeff_shift_reg = _mm_cvtsi32_si128(coeff_shift);
  const __m256i const_128_reg = _mm256_set1_epi16(128);
  __m256i lines[8];
  for (int i = 0; i < 8; i++) {
    const __m128i src_1 = _mm_loadu_si128((const __m128i *)&img1[i * stride]);
    const __m128i src_2 = _mm_loadu_si128((const __m128i *)&img2[i * stride]);

    lines[i] = _mm256_insertf128_si256(_mm256_castsi128_si256(src_1), src_2, 1);
    lines[i] = _mm256_sub_epi16(
        _mm256_sra_epi16(lines[i], const_coeff_shift_reg), const_128_reg);
  }

  /* Compute "mostly vertical" directions. */
  const __m256i dir47 =
      compute_directions_avx2(lines, cost_first_8x8 + 4, cost_second_8x8 + 4);

  /* Transpose and reverse the order of the lines. */
  array_reverse_transpose_8x8_avx2(lines, lines);

  /* Compute "mostly horizontal" directions. */
  const __m256i dir03 =
      compute_directions_avx2(lines, cost_first_8x8, cost_second_8x8);

  __m256i max = _mm256_max_epi32(dir03, dir47);
  max =
      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 8),
                                            _mm256_slli_si256(max, 16 - (8))));
  max =
      _mm256_max_epi32(max, _mm256_or_si256(_mm256_srli_si256(max, 4),
                                            _mm256_slli_si256(max, 16 - (4))));

  const __m128i first_8x8_output = _mm256_castsi256_si128(max);
  const __m128i second_8x8_output = _mm256_extractf128_si256(max, 1);
  const __m128i cmpeq_res_00 =
      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir47));
  const __m128i cmpeq_res_01 =
      _mm_cmpeq_epi32(first_8x8_output, _mm256_castsi256_si128(dir03));
  const __m128i cmpeq_res_10 =
      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir47, 1));
  const __m128i cmpeq_res_11 =
      _mm_cmpeq_epi32(second_8x8_output, _mm256_extractf128_si256(dir03, 1));
  const __m128i t_first_8x8 = _mm_packs_epi32(cmpeq_res_01, cmpeq_res_00);
  const __m128i t_second_8x8 = _mm_packs_epi32(cmpeq_res_11, cmpeq_res_10);

  best_cost[0] = _mm_cvtsi128_si32(_mm256_castsi256_si128(max));
  best_cost[1] = _mm_cvtsi128_si32(second_8x8_output);
  best_dir[0] = _mm_movemask_epi8(_mm_packs_epi16(t_first_8x8, t_first_8x8));
  best_dir[0] =
      get_msb(best_dir[0] ^ (best_dir[0] - 1));  // Count trailing zeros
                                                 // (x ^ (x - 1) keeps the
                                                 // lowest set bit and all
                                                 // bits below it).
  best_dir[1] = _mm_movemask_epi8(_mm_packs_epi16(t_second_8x8, t_second_8x8));
  best_dir[1] =
      get_msb(best_dir[1] ^ (best_dir[1] - 1));  // Count trailing zeros
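  /* Difference between the optimal variance and the variance along the
     orthogonal direction. Again, the sum(x^2) terms cancel out. */
  *var_out_1st = best_cost[0] - cost_first_8x8[(best_dir[0] + 4) & 7];
  *var_out_2nd = best_cost[1] - cost_second_8x8[(best_dir[1] + 4) & 7];

  /* We'd normally divide by 840, but dividing by 1024 is close enough
     for what we're going to do with this. */
  *var_out_1st >>= 10;
  *var_out_2nd >>= 10;
  *out_dir_1st_8x8 = best_dir[0];
  *out_dir_2nd_8x8 = best_dir[1];
}

/* For reference, a minimal scalar sketch (illustrative only; this helper is
   hypothetical and not part of the build) of what the AVX2 routine below
   computes: a widening copy of an 8-bit pixel rectangle into a 16-bit
   buffer. The tiered SIMD paths below are just faster ways of doing this. */
static inline void copy_rect8_8bit_to_16bit_scalar_sketch(
    uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width,
    int height) {
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++) {
      dst[i * dstride + j] = src[i * sstride + j];  // zero-extend 8 -> 16 bit
    }
  }
}

void cdef_copy_rect8_8bit_to_16bit_avx2(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  int j = 0;
  int remaining_width = width;
  assert(height % 2 == 0);
  assert(height > 0);
  assert(width > 0);

  // Process 32 pixels at a time.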
  if (remaining_width > 31) {
    int i = 0;
    do {
      j = 0;
      do {
        __m128i row00 =
            _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + (j + 0)]);
        __m128i row01 = _mm_loadu_si128(
            (const __m128i *)&src[(i + 0) * sstride + (j + 16)]);
        __m128i row10 =
            _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + (j + 0)]);
        __m128i row11 = _mm_loadu_si128(
            (const __m128i *)&src[(i + 1) * sstride + (j + 16)]);
        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 0)],
                            _mm256_cvtepu8_epi16(row00));
        _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + (j + 16)],
                            _mm256_cvtepu8_epi16(row01));
        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 0)],
                            _mm256_cvtepu8_epi16(row10));
        _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + (j + 16)],
                            _mm256_cvtepu8_epi16(row11));
        j += 32;
      } while (j <= width - 32);
      i += 2;
    } while (i < height);
    remaining_width = width & 31;
  }

  // Process 16 pixels at a time.
  if (remaining_width > 15) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadu_si128((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadu_si128((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm256_storeu_si256((__m256i *)&dst[(i + 0) * dstride + j],
                          _mm256_cvtepu8_epi16(row0));
      _mm256_storeu_si256((__m256i *)&dst[(i + 1) * dstride + j],
                          _mm256_cvtepu8_epi16(row1));
      i += 2;
    } while (i < height);
    remaining_width = width & 15;
    j += 16;
  }

  // Process 8 pixels at a time.
  if (remaining_width > 7) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 0) * sstride + j]);
      __m128i row1 =
          _mm_loadl_epi64((const __m128i *)&src[(i + 1) * sstride + j]);
      _mm_storeu_si128((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storeu_si128((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 7;
    j += 8;
  }

  // Process 4 pixels at a time.
  if (remaining_width > 3) {
    int i = 0;
    do {
      __m128i row0 =
          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 0) * sstride + j]));
      __m128i row1 =
          _mm_cvtsi32_si128(*((const int32_t *)&src[(i + 1) * sstride + j]));
      _mm_storel_epi64((__m128i *)&dst[(i + 0) * dstride + j],
                       _mm_unpacklo_epi8(row0, _mm_setzero_si128()));
      _mm_storel_epi64((__m128i *)&dst[(i + 1) * dstride + j],
                       _mm_unpacklo_epi8(row1, _mm_setzero_si128()));
      i += 2;
    } while (i < height);
    remaining_width = width & 3;
    j += 4;
  }

  // Process the remaining pixels.
  if (remaining_width) {
    for (int i = 0; i < height; i++) {
      for (int k = j; k < width; k++) {
        dst[i * dstride + k] = src[i * sstride + k];
      }
    }
  }
}
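#if 0
/* Usage sketch (illustrative only; buffer names and sizes are hypothetical):
   widen two 8-bit 8x8 blocks into 16-bit working buffers, then query their
   dominant directions and directional variances in one dual call. */
static void cdef_avx2_usage_sketch(const uint8_t *src8_1,
                                   const uint8_t *src8_2, int sstride) {
  uint16_t blk1[8 * 8], blk2[8 * 8];
  int dir1, dir2;
  int32_t var1, var2;
  cdef_copy_rect8_8bit_to_16bit_avx2(blk1, 8, src8_1, sstride, 8, 8);
  cdef_copy_rect8_8bit_to_16bit_avx2(blk2, 8, src8_2, sstride, 8, 8);
  cdef_find_dir_dual_avx2(blk1, blk2, 8, &var1, &var2, /*coeff_shift=*/0,
                          &dir1, &dir2);
}
#endif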