cdef_block_neon.c
/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/sum_neon.h"
#include "av1/common/cdef_block.h"

void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride,
                                        const uint8_t *src, int sstride,
                                        int width, int height) {
  do {
    const uint8_t *src_ptr = src;
    uint16_t *dst_ptr = dst;

    int w = 0;
    while (width - w >= 16) {
      uint8x16_t row = vld1q_u8(src_ptr + w);
      uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } };
      vst2q_u8((uint8_t *)(dst_ptr + w), row_u16);

      w += 16;
    }
    if (width - w >= 8) {
      uint8x8_t row = vld1_u8(src_ptr + w);
      vst1q_u16(dst_ptr + w, vmovl_u8(row));
      w += 8;
    }
    if (width - w == 4) {
      for (int i = w; i < w + 4; i++) {
        dst_ptr[i] = src_ptr[i];
      }
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}

#if CONFIG_AV1_HIGHBITDEPTH
void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride,
                                         const uint16_t *src, int sstride,
                                         int width, int height) {
  do {
    const uint16_t *src_ptr = src;
    uint16_t *dst_ptr = dst;

    int w = 0;
    while (width - w >= 8) {
      uint16x8_t row = vld1q_u16(src_ptr + w);
      vst1q_u16(dst_ptr + w, row);

      w += 8;
    }
    if (width - w == 4) {
      uint16x4_t row = vld1_u16(src_ptr + w);
      vst1_u16(dst_ptr + w, row);
    }

    src += sstride;
    dst += dstride;
  } while (--height != 0);
}
#endif  // CONFIG_AV1_HIGHBITDEPTH

// partial A is a 16-bit vector of the form:
// [x8 x7 x6 x5 x4 x3 x2 x1] and partial B has the form:
// [0 y1 y2 y3 y4 y5 y6 y7].
// This function computes (x1^2+y1^2)*C1 + (x2^2+y2^2)*C2 + ...
// (x7^2+y7^2)*C7 + (x8^2+0^2)*C8 where the C1..C8 constants are in const1
// and const2.
static inline uint32x4_t fold_mul_and_sum_neon(int16x8_t partiala,
                                               int16x8_t partialb,
                                               uint32x4_t const1,
                                               uint32x4_t const2) {
  // Reverse partial B.
  // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c),
                   vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504)));

#if AOM_ARCH_AARCH64
  partialb =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialb), pattern));
#else
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialb)),
                     vget_high_s8(vreinterpretq_s8_s16(partialb)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  // Square and add the corresponding x and y values.
  int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala));
  cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb));
  int32x4_t cost_hi =
      vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala));
  cost_hi =
      vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb));

  // Multiply by constant.
  uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2);
  return cost;
}
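
// Illustrative scalar sketch (an assumption for exposition, not used by the
// library): after the caller's final horizontal add, fold_mul_and_sum_neon
// above is equivalent to summing C_i * (x_i^2 + y_i^2) over the eight partial
// sums, with the weights C1..C8 supplied via const1 and const2. The last term
// uses y[7] = 0, matching the zero lane in partial B.
static inline uint32_t fold_mul_and_sum_scalar_sketch(const int16_t x[8],
                                                      const int16_t y[8],
                                                      const uint32_t c[8]) {
  uint32_t cost = 0;
  for (int i = 0; i < 8; i++) {
    cost += c[i] * (uint32_t)(x[i] * x[i] + y[i] * y[i]);
  }
  return cost;
}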
// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal
// down-right, 6 is vertical).
//
// For each direction the lines are shifted so that we can perform a
// basic sum on each vector element. For example, direction 5 is "south by
// southeast", so we need to add the pixels along each line i below:
//
//  0  1  2  3  4  5  6  7
//  0  1  2  3  4  5  6  7
//  8  0  1  2  3  4  5  6
//  8  0  1  2  3  4  5  6
//  9  8  0  1  2  3  4  5
//  9  8  0  1  2  3  4  5
// 10  9  8  0  1  2  3  4
// 10  9  8  0  1  2  3  4
//
// For this to fit nicely in vectors, the lines need to be shifted like so:
//        0  1  2  3  4  5  6  7
//        0  1  2  3  4  5  6  7
//      8  0  1  2  3  4  5  6
//      8  0  1  2  3  4  5  6
//    9  8  0  1  2  3  4  5
//    9  8  0  1  2  3  4  5
// 10  9  8  0  1  2  3  4
// 10  9  8  0  1  2  3  4
//
// In this configuration we can now perform SIMD additions to get the cost
// along direction 5. Since this won't fit into a single 128-bit vector, we use
// two of them to compute each half of the new configuration, and pad the empty
// spaces with zeros. Similar shifting is done for other directions, except
// direction 6 which is straightforward as it's the vertical direction.
static inline uint32x4_t compute_vert_directions_neon(int16x8_t lines[8],
                                                      uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Partial sums for lines 0 and 1.
  int16x8_t partial4a = vextq_s16(zero, lines[0], 1);
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2));
  int16x8_t partial4b = vextq_s16(lines[0], zero, 1);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2));
  int16x8_t tmp = vaddq_s16(lines[0], lines[1]);
  int16x8_t partial5a = vextq_s16(zero, tmp, 3);
  int16x8_t partial5b = vextq_s16(tmp, zero, 3);
  int16x8_t partial7a = vextq_s16(zero, tmp, 6);
  int16x8_t partial7b = vextq_s16(tmp, zero, 6);
  int16x8_t partial6 = tmp;

  // Partial sums for lines 2 and 3.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4));
  tmp = vaddq_s16(lines[2], lines[3]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 4 and 5.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5));
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5));
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6));
  tmp = vaddq_s16(lines[4], lines[5]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4));
  partial6 = vaddq_s16(partial6, tmp);

  // Partial sums for lines 6 and 7.
  partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7));
  partial4a = vaddq_s16(partial4a, lines[7]);
  partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7));
  tmp = vaddq_s16(lines[6], lines[7]);
  partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6));
  partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6));
  partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3));
  partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3));
  partial6 = vaddq_s16(partial6, tmp);

  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64(0), vcreate_u64((uint64_t)210 << 32 | 420)));
  uint32x4_t const3 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)105 << 32 | 140),
                   vcreate_u64((uint64_t)105 << 32 | 105)));

  // Compute costs in terms of partial sums.
  int32x4_t partial6_s32 =
      vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6));
  partial6_s32 =
      vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1);
  costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3);
  costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105);
  costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3);

  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}
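
// Illustrative scalar sketch (for exposition only, not used by the library):
// direction 6 is the simplest case above. Its partial sums are just the eight
// column sums, and the cost is 105 times the sum of their squares. The other
// weights used above (840, 420, 280, 210, 168, 140, 120, 105) appear to be
// 840 / n, where n is the number of pixels contributing to the corresponding
// diagonal partial sum, so that longer lines are not favoured.
static inline uint32_t vertical_cost_scalar_sketch(const int16_t px[8][8]) {
  uint32_t cost = 0;
  for (int col = 0; col < 8; col++) {
    int32_t col_sum = 0;
    for (int row = 0; row < 8; row++) col_sum += px[row][col];
    cost += 105 * (uint32_t)(col_sum * col_sum);
  }
  return cost;
}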
static inline uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala,
                                                        int16x8_t partialb,
                                                        int16x8_t partialc,
                                                        uint32x4_t const0) {
  // Reverse partial c.
  // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }.
  uint8x16_t pattern = vreinterpretq_u8_u64(
      vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a),
                   vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302)));

#if AOM_ARCH_AARCH64
  partialc =
      vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern));
#else
  int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)),
                     vget_high_s8(vreinterpretq_s8_s16(partialc)) } };
  int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern)));
  int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern)));
  partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi));
#endif

  int32x4_t partiala_s32 = vpaddlq_s16(partiala);
  int32x4_t partialb_s32 = vpaddlq_s16(partialb);
  int32x4_t partialc_s32 = vpaddlq_s16(partialc);

  partiala_s32 = vmulq_s32(partiala_s32, partiala_s32);
  partialb_s32 = vmulq_s32(partialb_s32, partialb_s32);
  partialc_s32 = vmulq_s32(partialc_s32, partialc_s32);

  partiala_s32 = vaddq_s32(partiala_s32, partialc_s32);

  uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105);
  cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0);
  return cost;
}

// This function computes the cost along directions 0, 1, 2, 3. (0 means
// 45-degree up-right, 2 is horizontal).
//
// For direction 1 and 3 ("east northeast" and "east southeast") the shifted
// lines need three vectors instead of two. For direction 1 for example, we
// need to compute the sums along each line i below:
// 0 0 1 1 2 2 3 3
// 1 1 2 2 3 3 4 4
// 2 2 3 3 4 4 5 5
// 3 3 4 4 5 5 6 6
// 4 4 5 5 6 6 7 7
// 5 5 6 6 7 7 8 8
// 6 6 7 7 8 8 9 9
// 7 7 8 8 9 9 10 10
//
// Which means we need the following configuration:
// 0 0 1 1 2 2 3 3
//   1 1 2 2 3 3 4 4
//     2 2 3 3 4 4 5 5
//       3 3 4 4 5 5 6 6
//         4 4 5 5 6 6 7 7
//           5 5 6 6 7 7 8 8
//             6 6 7 7 8 8 9 9
//               7 7 8 8 9 9 10 10
//
// Three vectors are needed to compute this, as well as some extra pairwise
// additions.
static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8],
                                                uint32_t cost[4]) {
  const int16x8_t zero = vdupq_n_s16(0);

  // Compute diagonal directions (1, 2, 3).
  // Partial sums for lines 0 and 1.
  int16x8_t partial0a = lines[0];
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7));
  int16x8_t partial0b = vextq_s16(lines[1], zero, 7);
  int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6));
  int16x8_t partial1b = vextq_s16(lines[1], zero, 6);
  int16x8_t partial3a = vextq_s16(lines[0], zero, 2);
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4));
  int16x8_t partial3b = vextq_s16(zero, lines[0], 2);
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4));

  // Partial sums for lines 2 and 3.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4));
  partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2));
  partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6));
  partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6));
  partial3b = vaddq_s16(partial3b, lines[3]);

  // Partial sums for lines 4 and 5.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3));
  partial1b = vaddq_s16(partial1b, lines[4]);
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6));
  int16x8_t partial1c = vextq_s16(lines[5], zero, 6);
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4));
  int16x8_t partial3c = vextq_s16(zero, lines[4], 2);
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4));

  // Partial sums for lines 6 and 7.
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2));
  partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2));
  partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4));
  partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4));
  partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2));
  partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6));
  partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6));
  partial3c = vaddq_s16(partial3c, lines[7]);
  // Special case for direction 2 as it's just a sum along each line.
  int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] };
  int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] };
  int32x4_t partial2a = horizontal_add_4d_s16x8(lines03);
  int32x4_t partial2b = horizontal_add_4d_s16x8(lines47);

  uint32x4_t partial2a_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a));
  uint32x4_t partial2b_u32 =
      vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b));

  uint32x4_t const0 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840),
                   vcreate_u64((uint64_t)210 << 32 | 280)));
  uint32x4_t const1 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168),
                   vcreate_u64((uint64_t)105 << 32 | 120)));
  uint32x4_t const2 = vreinterpretq_u32_u64(
      vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420),
                   vcreate_u64((uint64_t)105 << 32 | 140)));

  uint32x4_t costs[4];
  costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1);
  costs[1] =
      fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2);
  costs[2] = vaddq_u32(partial2a_u32, partial2b_u32);
  costs[2] = vmulq_n_u32(costs[2], 105);
  costs[3] =
      fold_mul_and_sum_pairwise_neon(partial3c, partial3b, partial3a, const2);

  costs[0] = horizontal_add_4d_u32x4(costs);
  vst1q_u32(cost, costs[0]);
  return costs[0];
}
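
// Illustrative scalar sketch of the direction 1 binning shown in the diagram
// above (for exposition only, not used by the library): pixel (row, col)
// contributes to diagonal bin row + (col >> 1), giving the eleven partial
// sums that the three shifted vectors and the extra pairwise additions
// accumulate.
static inline void direction1_partial_sums_scalar_sketch(
    const int16_t px[8][8], int32_t partial[11]) {
  for (int i = 0; i < 11; i++) partial[i] = 0;
  for (int row = 0; row < 8; row++) {
    for (int col = 0; col < 8; col++) {
      partial[row + (col >> 1)] += px[row][col];
    }
  }
}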
int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var,
                       int coeff_shift) {
  uint32_t cost[8];
  uint32_t best_cost = 0;
  int best_dir = 0;
  int16x8_t lines[8];
  for (int i = 0; i < 8; i++) {
    uint16x8_t s = vld1q_u16(&img[i * stride]);
    lines[i] = vreinterpretq_s16_u16(
        vsubq_u16(vshlq_u16(s, vdupq_n_s16(-coeff_shift)), vdupq_n_u16(128)));
  }

  // Compute "mostly vertical" directions.
  uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4);

  // Compute "mostly horizontal" directions.
  uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost);

  // Find max cost as well as its index to get best_dir.
  // The max cost needs to be propagated in the whole vector to find its
  // position in the original cost vectors cost03 and cost47.
  uint32x4_t cost07 = vmaxq_u32(cost03, cost47);
#if AOM_ARCH_AARCH64
  best_cost = vmaxvq_u32(cost07);
  uint32x4_t max_cost = vdupq_n_u32(best_cost);
  uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)),
                           vreinterpretq_u8_u32(
                               vceqq_u32(max_cost, cost47)) } };
  // idx = { 28, 24, 20, 16, 12, 8, 4, 0 };
  uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL));
  // Get the lowest 8 bits of each 32-bit element and reverse them.
  uint8x8_t tbl = vqtbl2_u8(costs, idx);
  uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0);
  best_dir = aom_clzll(a) >> 3;
#else
  uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07));
  cost64 = vpmax_u32(cost64, cost64);
  uint32x4_t max_cost = vcombine_u32(cost64, cost64);
  best_cost = vget_lane_u32(cost64, 0);
  uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)),
                                  vmovn_u32(vceqq_u32(max_cost, cost47)));
  uint8x8_t idx =
      vand_u8(vmovn_u16(costs),
              vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL)));
  int sum = horizontal_add_u8x8(idx);
  best_dir = get_msb(sum ^ (sum - 1));
#endif

  // Difference between the optimal variance and the variance along the
  // orthogonal direction. Again, the sum(x^2) terms cancel out.
  *var = best_cost - cost[(best_dir + 4) & 7];
  // We'd normally divide by 840, but dividing by 1024 is close enough
  // for what we're going to do with this.
  *var >>= 10;
  return best_dir;
}
void cdef_find_dir_dual_neon(const uint16_t *img1, const uint16_t *img2,
                             int stride, int32_t *var_out_1st,
                             int32_t *var_out_2nd, int coeff_shift,
                             int *out_dir_1st_8x8, int *out_dir_2nd_8x8) {
  // Process first 8x8.
  *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift);

  // Process second 8x8.
  *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift);
}

// sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp)))
static inline int16x8_t constrain16(uint16x8_t a, uint16x8_t b,
                                    unsigned int threshold, int adjdamp) {
  uint16x8_t diff = vabdq_u16(a, b);
  const uint16x8_t a_gt_b = vcgtq_u16(a, b);
  const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold),
                                  vshlq_u16(diff, vdupq_n_s16(-adjdamp)));
  const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s));
  return vbslq_s16(a_gt_b, clip, vnegq_s16(clip));
}
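
// Illustrative scalar form of constrain16 above (for exposition only, not
// used by the library): the tap is pulled towards the centre pixel, with the
// pull capped by the strength threshold and damped for large differences.
static inline int constrain_scalar_sketch(int a, int b, int threshold,
                                          int adjdamp) {
  const int diff = a - b;
  const int abs_diff = diff < 0 ? -diff : diff;
  const int damped = threshold - (abs_diff >> adjdamp);
  const int capped = damped > 0 ? damped : 0;               // max(0, ...)
  const int clipped = abs_diff < capped ? abs_diff : capped;  // min(abs, ...)
  return diff < 0 ? -clipped : clipped;
}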
static inline void primary_filter(uint16x8_t s, uint16x8_t tap[4],
                                  const int *pri_taps, int pri_strength,
                                  int pri_damping, int16x8_t *sum) {
  // Near taps
  int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping);
  int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping);
  // sum += pri_taps[0] * (n0 + n1)
  n0 = vaddq_s16(n0, n1);
  *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]);

  // Far taps
  int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping);
  int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping);
  // sum += pri_taps[1] * (f0 + f1)
  f0 = vaddq_s16(f0, f1);
  *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]);
}

static inline void secondary_filter(uint16x8_t s, uint16x8_t tap[8],
                                    const int *sec_taps, int sec_strength,
                                    int sec_damping, int16x8_t *sum) {
  // Near taps
  int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping);
  int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping);
  int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping);
  int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping);

  // sum += sec_taps[0] * (p0 + p1 + p2 + p3)
  s0 = vaddq_s16(s0, s1);
  s2 = vaddq_s16(s2, s3);
  s0 = vaddq_s16(s0, s2);
  *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]);

  // Far taps
  s0 = constrain16(tap[4], s, sec_strength, sec_damping);
  s1 = constrain16(tap[5], s, sec_strength, sec_damping);
  s2 = constrain16(tap[6], s, sec_strength, sec_damping);
  s3 = constrain16(tap[7], s, sec_strength, sec_damping);

  // sum += sec_taps[1] * (p0 + p1 + p2 + p3)
  s0 = vaddq_s16(s0, s1);
  s2 = vaddq_s16(s2, s3);
  s0 = vaddq_s16(s0, s2);
  *sum = vmlaq_n_s16(*sum, s0, sec_taps[1]);
}
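
// Illustrative scalar form of the output step shared by the filter kernels
// below (for exposition only, not used by the library): the weighted tap sum
// is rounded symmetrically around zero (the +8 bias together with the
// (sum < 0) correction), added to the centre pixel and, for the variants that
// track them, clamped to the min/max of the centre pixel and its taps.
static inline int cdef_apply_sum_scalar_sketch(int s, int sum, int vmin,
                                               int vmax) {
  int res = s + ((sum - (sum < 0) + 8) >> 4);
  if (res < vmin) res = vmin;
  if (res > vmax) res = vmax;
  return res;
}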
void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  uint16x8_t max, min;
  const uint16x8_t cdef_large_value_mask =
      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = vld1q_u16(in + po1);
      pri_src[1] = vld1q_u16(in - po1);

      // Primary far taps
      pri_src[2] = vld1q_u16(in + po2);
      pri_src[3] = vld1q_u16(in - po2);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // primary max has been calculated, zero out the upper 8 bits. Use this
      // to find the "16 bit" max.
      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
                                     vreinterpretq_u8_u16(pri_src[1]));
      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
                                     vreinterpretq_u8_u16(pri_src[3]));
      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
                                     cdef_large_value_mask));

      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min0 = vminq_u16(pri_min0, pri_min1);
      min = vminq_u16(min, pri_min0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // secondary max has been calculated, zero out the upper 8 bits. Use
      // this to find the "16 bit" max.
      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
                                     vreinterpretq_u8_u16(sec_src[1]));
      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
                                     vreinterpretq_u8_u16(sec_src[3]));
      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
                                     vreinterpretq_u8_u16(sec_src[5]));
      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
                                     vreinterpretq_u8_u16(sec_src[7]));
      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
                                     cdef_large_value_mask));

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
                          vreinterpretq_s16_u16(max));

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // primary max has been calculated, zero out the upper 8 bits. Use this
      // to find the "16 bit" max.
      uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]),
                                     vreinterpretq_u8_u16(pri_src[1]));
      uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]),
                                     vreinterpretq_u8_u16(pri_src[3]));
      pri_max0 = vmaxq_u8(pri_max0, pri_max1);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0),
                                     cdef_large_value_mask));

      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min1 = vminq_u16(pri_min1, pri_min2);
      min = vminq_u16(min, pri_min1);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // The source is 16 bits, however, we only really care about the lower
      // 8 bits. The upper 8 bits contain the "large" flag. After the final
      // secondary max has been calculated, zero out the upper 8 bits. Use
      // this to find the "16 bit" max.
      uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]),
                                     vreinterpretq_u8_u16(sec_src[1]));
      uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]),
                                     vreinterpretq_u8_u16(sec_src[3]));
      uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]),
                                     vreinterpretq_u8_u16(sec_src[5]));
      uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]),
                                     vreinterpretq_u8_u16(sec_src[7]));
      sec_max0 = vmaxq_u8(sec_max0, sec_max1);
      sec_max2 = vmaxq_u8(sec_max2, sec_max3);
      sec_max0 = vmaxq_u8(sec_max0, sec_max2);
      max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0),
                                     cdef_large_value_mask));

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)),
                          vreinterpretq_s16_u16(max));

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t tap[4];

      // Primary near taps
      tap[0] = vld1q_u16(in + po1);
      tap[1] = vld1q_u16(in - po1);

      // Primary far taps
      tap[2] = vld1q_u16(in + po2);
      tap[3] = vld1q_u16(in - po2);

      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);

  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *sec_taps = cdef_sec_taps;

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      vst1_u8(dst8, res_u8);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      const uint8x8_t res_u8 = vqmovun_s16(res_s16);
      store_u8x4_strided_x2(dst8, dstride, res_u8);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
void cdef_filter_8_3_neon(void *dest, int dstride, const uint16_t *in,
                          int pri_strength, int sec_strength, int dir,
                          int pri_damping, int sec_damping, int coeff_shift,
                          int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  (void)block_width;
  if (block_width == 8) {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      const uint8x8_t res = vqmovn_u16(s);
      vst1_u8(dst8, res);

      in += CDEF_BSTRIDE;
      dst8 += dstride;
    } while (--h != 0);
  } else {
    uint8_t *dst8 = (uint8_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      const uint8x8_t res = vqmovn_u16(s);
      store_u8x4_strided_x2(dst8, dstride, res);

      in += 2 * CDEF_BSTRIDE;
      dst8 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_0_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  uint16x8_t max, min;
  const uint16x8_t cdef_large_value_mask =
      vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE));
  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];
  const int *sec_taps = cdef_sec_taps;

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }
  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = vld1q_u16(in + po1);
      pri_src[1] = vld1q_u16(in - po1);

      // Primary far taps
      pri_src[2] = vld1q_u16(in + po2);
      pri_src[3] = vld1q_u16(in - po2);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min0 = vminq_u16(pri_min0, pri_min1);
      min = vminq_u16(min, pri_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
      pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
      pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
      pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);

      uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
      pri_max0 = vmaxq_u16(pri_max0, pri_max1);
      max = vmaxq_u16(max, pri_max0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);
      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
      sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
      sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
      sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
      sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
      sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
      sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
      sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

      uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
      sec_max0 = vmaxq_u16(sec_max0, sec_max1);
      sec_max2 = vmaxq_u16(sec_max2, sec_max3);
      sec_max0 = vmaxq_u16(sec_max0, sec_max2);
      max = vmaxq_u16(max, sec_max0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                      vreinterpretq_s16_u16(max));

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      max = min = s;

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]);
      pri_min1 = vminq_u16(pri_min1, pri_min2);
      min = vminq_u16(min, pri_min1);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask);
      pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask);
      pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask);
      pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask);
      uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]);
      uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]);
      pri_max0 = vmaxq_u16(pri_max0, pri_max1);
      max = vmaxq_u16(max, pri_max0);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]);
      sec_min0 = vminq_u16(sec_min0, sec_min1);
      sec_min2 = vminq_u16(sec_min2, sec_min3);
      sec_min0 = vminq_u16(sec_min0, sec_min2);
      min = vminq_u16(min, sec_min0);

      /* Convert CDEF_VERY_LARGE to 0 before calculating max. */
      sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask);
      sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask);
      sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask);
      sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask);
      sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask);
      sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask);
      sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask);
      sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask);

      uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]);
      uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]);
      uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]);
      uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]);
      sec_max0 = vmaxq_u16(sec_max0, sec_max1);
      sec_max2 = vmaxq_u16(sec_max2, sec_max3);
      sec_max0 = vmaxq_u16(sec_max0, sec_max2);
      max = vmaxq_u16(max, sec_max0);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)),
                      vreinterpretq_s16_u16(max));

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
void cdef_filter_16_1_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)sec_strength;
  (void)sec_damping;

  const int po1 = cdef_directions[dir][0];
  const int po2 = cdef_directions[dir][1];
  const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1];

  if (pri_strength) {
    pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t tap[4];

      // Primary near taps
      tap[0] = vld1q_u16(in + po1);
      tap[1] = vld1q_u16(in - po1);

      // Primary far taps
      tap[2] = vld1q_u16(in + po2);
      tap[3] = vld1q_u16(in - po2);

      primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t pri_src[4];

      // Primary near taps
      pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE);
      pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE);

      // Primary far taps
      pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE);
      pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE);

      primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
void cdef_filter_16_2_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)pri_strength;
  (void)pri_damping;
  (void)coeff_shift;

  const int s1o1 = cdef_directions[dir + 2][0];
  const int s1o2 = cdef_directions[dir + 2][1];
  const int s2o1 = cdef_directions[dir - 2][0];
  const int s2o2 = cdef_directions[dir - 2][1];
  const int *sec_taps = cdef_sec_taps;

  if (sec_strength) {
    sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength));
  }

  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = vld1q_u16(in);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = vld1q_u16(in + s1o1);
      sec_src[1] = vld1q_u16(in - s1o1);
      sec_src[2] = vld1q_u16(in + s2o1);
      sec_src[3] = vld1q_u16(in - s2o1);

      // Secondary far taps
      sec_src[4] = vld1q_u16(in + s1o2);
      sec_src[5] = vld1q_u16(in - s1o2);
      sec_src[6] = vld1q_u16(in + s2o2);
      sec_src[7] = vld1q_u16(in - s2o2);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      vst1q_u16(dst16, vreinterpretq_u16_s16(res));

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      int16x8_t sum = vdupq_n_s16(0);
      uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);

      uint16x8_t sec_src[8];

      // Secondary near taps
      sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE);
      sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE);
      sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE);
      sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE);

      // Secondary far taps
      sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE);
      sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE);
      sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE);
      sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE);

      secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum);

      // res = s + ((sum - (sum < 0) + 8) >> 4)
      sum =
          vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0))));
      const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4);

      store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res));

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}

void cdef_filter_16_3_neon(void *dest, int dstride, const uint16_t *in,
                           int pri_strength, int sec_strength, int dir,
                           int pri_damping, int sec_damping, int coeff_shift,
                           int block_width, int block_height) {
  (void)pri_strength;
  (void)sec_strength;
  (void)dir;
  (void)pri_damping;
  (void)sec_damping;
  (void)coeff_shift;
  (void)block_width;
  if (block_width == 8) {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = vld1q_u16(in);
      vst1q_u16(dst16, s);

      in += CDEF_BSTRIDE;
      dst16 += dstride;
    } while (--h != 0);
  } else {
    uint16_t *dst16 = (uint16_t *)dest;

    int h = block_height;
    do {
      const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE);
      store_u16x4_strided_x2(dst16, dstride, s);

      in += 2 * CDEF_BSTRIDE;
      dst16 += 2 * dstride;
      h -= 2;
    } while (h != 0);
  }
}
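
// Illustrative usage sketch (assumptions only, not part of the library): a
// caller typically copies the 8-bit source into a padded 16-bit work buffer
// with cdef_copy_rect8_8bit_to_16bit_neon, fills unavailable border pixels
// with CDEF_VERY_LARGE (which makes constrain16 return zero for those taps),
// picks a direction with cdef_find_dir_neon, and then invokes one of the
// kernels above. The buffer size, border width and strength/damping values
// below are illustrative; real values come from the frame header.
static inline void cdef_filter_8x8_usage_sketch(uint8_t *dst, int dstride,
                                                const uint8_t *src,
                                                int sstride) {
  uint16_t buf[CDEF_BSTRIDE * 12];  // Illustrative padded buffer.
  for (int i = 0; i < CDEF_BSTRIDE * 12; i++) buf[i] = CDEF_VERY_LARGE;
  // Interior origin two rows/columns into the padded buffer (illustrative).
  uint16_t *in = buf + 2 * CDEF_BSTRIDE + 2;
  cdef_copy_rect8_8bit_to_16bit_neon(in, CDEF_BSTRIDE, src, sstride, 8, 8);

  int32_t var;
  const int dir = cdef_find_dir_neon(in, CDEF_BSTRIDE, &var, /*coeff_shift=*/0);

  cdef_filter_8_0_neon(dst, dstride, in, /*pri_strength=*/4,
                       /*sec_strength=*/2, dir, /*pri_damping=*/3,
                       /*sec_damping=*/3, /*coeff_shift=*/0,
                       /*block_width=*/8, /*block_height=*/8);
}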