cdef_block.c (18391B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <math.h> 13 #include <stdlib.h> 14 15 #include "config/aom_dsp_rtcd.h" 16 #include "config/av1_rtcd.h" 17 18 #include "av1/common/cdef.h" 19 /* 20 This is Cdef_Directions (section 7.15.3) with 2 padding entries at the 21 beginning and end of the table. The cdef direction range is [0, 7] and the 22 first index is offset +/-2. This removes the need to constrain the first 23 index to the same range using e.g., & 7. 24 */ 25 DECLARE_ALIGNED(16, static const int, cdef_directions_padded[12][2]) = { 26 /* Padding: cdef_directions[6] */ 27 { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 }, 28 /* Padding: cdef_directions[7] */ 29 { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }, 30 31 /* Begin cdef_directions */ 32 { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 }, 33 { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 }, 34 { 0 * CDEF_BSTRIDE + 1, 0 * CDEF_BSTRIDE + 2 }, 35 { 0 * CDEF_BSTRIDE + 1, 1 * CDEF_BSTRIDE + 2 }, 36 { 1 * CDEF_BSTRIDE + 1, 2 * CDEF_BSTRIDE + 2 }, 37 { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 1 }, 38 { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE + 0 }, 39 { 1 * CDEF_BSTRIDE + 0, 2 * CDEF_BSTRIDE - 1 }, 40 /* End cdef_directions */ 41 42 /* Padding: cdef_directions[0] */ 43 { -1 * CDEF_BSTRIDE + 1, -2 * CDEF_BSTRIDE + 2 }, 44 /* Padding: cdef_directions[1] */ 45 { 0 * CDEF_BSTRIDE + 1, -1 * CDEF_BSTRIDE + 2 }, 46 }; 47 48 const int (*const cdef_directions)[2] = cdef_directions_padded + 2; 49 50 /* Detect direction. 0 means 45-degree up-right, 2 is horizontal, and so on. 51 The search minimizes the weighted variance along all the lines in a 52 particular direction, i.e. the squared error between the input and a 53 "predicted" block where each pixel is replaced by the average along a line 54 in a particular direction. Since each direction have the same sum(x^2) term, 55 that term is never computed. See Section 2, step 2, of: 56 http://jmvalin.ca/notes/intra_paint.pdf */ 57 int cdef_find_dir_c(const uint16_t *img, int stride, int32_t *var, 58 int coeff_shift) { 59 int i; 60 int32_t cost[8] = { 0 }; 61 int partial[8][15] = { { 0 } }; 62 int32_t best_cost = 0; 63 int best_dir = 0; 64 /* Instead of dividing by n between 2 and 8, we multiply by 3*5*7*8/n. 65 The output is then 840 times larger, but we don't care for finding 66 the max. */ 67 static const int div_table[] = { 0, 840, 420, 280, 210, 168, 140, 120, 105 }; 68 for (i = 0; i < 8; i++) { 69 int j; 70 for (j = 0; j < 8; j++) { 71 int x; 72 /* We subtract 128 here to reduce the maximum range of the squared 73 partial sums. */ 74 x = (img[i * stride + j] >> coeff_shift) - 128; 75 partial[0][i + j] += x; 76 partial[1][i + j / 2] += x; 77 partial[2][i] += x; 78 partial[3][3 + i - j / 2] += x; 79 partial[4][7 + i - j] += x; 80 partial[5][3 - i / 2 + j] += x; 81 partial[6][j] += x; 82 partial[7][i / 2 + j] += x; 83 } 84 } 85 for (i = 0; i < 8; i++) { 86 cost[2] += partial[2][i] * partial[2][i]; 87 cost[6] += partial[6][i] * partial[6][i]; 88 } 89 cost[2] *= div_table[8]; 90 cost[6] *= div_table[8]; 91 for (i = 0; i < 7; i++) { 92 cost[0] += (partial[0][i] * partial[0][i] + 93 partial[0][14 - i] * partial[0][14 - i]) * 94 div_table[i + 1]; 95 cost[4] += (partial[4][i] * partial[4][i] + 96 partial[4][14 - i] * partial[4][14 - i]) * 97 div_table[i + 1]; 98 } 99 cost[0] += partial[0][7] * partial[0][7] * div_table[8]; 100 cost[4] += partial[4][7] * partial[4][7] * div_table[8]; 101 for (i = 1; i < 8; i += 2) { 102 int j; 103 for (j = 0; j < 4 + 1; j++) { 104 cost[i] += partial[i][3 + j] * partial[i][3 + j]; 105 } 106 cost[i] *= div_table[8]; 107 for (j = 0; j < 4 - 1; j++) { 108 cost[i] += (partial[i][j] * partial[i][j] + 109 partial[i][10 - j] * partial[i][10 - j]) * 110 div_table[2 * j + 2]; 111 } 112 } 113 for (i = 0; i < 8; i++) { 114 if (cost[i] > best_cost) { 115 best_cost = cost[i]; 116 best_dir = i; 117 } 118 } 119 /* Difference between the optimal variance and the variance along the 120 orthogonal direction. Again, the sum(x^2) terms cancel out. */ 121 *var = best_cost - cost[(best_dir + 4) & 7]; 122 /* We'd normally divide by 840, but dividing by 1024 is close enough 123 for what we're going to do with this. */ 124 *var >>= 10; 125 return best_dir; 126 } 127 128 void cdef_find_dir_dual_c(const uint16_t *img1, const uint16_t *img2, 129 int stride, int32_t *var1, int32_t *var2, 130 int coeff_shift, int *out1, int *out2) { 131 *out1 = cdef_find_dir_c(img1, stride, var1, coeff_shift); 132 *out2 = cdef_find_dir_c(img2, stride, var2, coeff_shift); 133 } 134 135 const int cdef_pri_taps[2][2] = { { 4, 2 }, { 3, 3 } }; 136 const int cdef_sec_taps[2] = { 2, 1 }; 137 138 /* Smooth in the direction detected. */ 139 static void cdef_filter_block_internal( 140 uint8_t *dst8, uint16_t *dst16, int dstride, const uint16_t *in, 141 int pri_strength, int sec_strength, int dir, int pri_damping, 142 int sec_damping, int coeff_shift, int block_width, int block_height, 143 int enable_primary, int enable_secondary) { 144 const int clipping_required = (enable_primary && enable_secondary); 145 int i, j, k; 146 const int s = CDEF_BSTRIDE; 147 const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; 148 const int *sec_taps = cdef_sec_taps; 149 for (i = 0; i < block_height; i++) { 150 for (j = 0; j < block_width; j++) { 151 int16_t sum = 0; 152 int16_t y; 153 int16_t x = in[i * s + j]; 154 int max = x; 155 int min = x; 156 for (k = 0; k < 2; k++) { 157 if (enable_primary) { 158 int16_t p0 = in[i * s + j + cdef_directions[dir][k]]; 159 int16_t p1 = in[i * s + j - cdef_directions[dir][k]]; 160 sum += pri_taps[k] * constrain(p0 - x, pri_strength, pri_damping); 161 sum += pri_taps[k] * constrain(p1 - x, pri_strength, pri_damping); 162 if (clipping_required) { 163 if (p0 != CDEF_VERY_LARGE) max = AOMMAX(p0, max); 164 if (p1 != CDEF_VERY_LARGE) max = AOMMAX(p1, max); 165 min = AOMMIN(p0, min); 166 min = AOMMIN(p1, min); 167 } 168 } 169 if (enable_secondary) { 170 int16_t s0 = in[i * s + j + cdef_directions[dir + 2][k]]; 171 int16_t s1 = in[i * s + j - cdef_directions[dir + 2][k]]; 172 int16_t s2 = in[i * s + j + cdef_directions[dir - 2][k]]; 173 int16_t s3 = in[i * s + j - cdef_directions[dir - 2][k]]; 174 if (clipping_required) { 175 if (s0 != CDEF_VERY_LARGE) max = AOMMAX(s0, max); 176 if (s1 != CDEF_VERY_LARGE) max = AOMMAX(s1, max); 177 if (s2 != CDEF_VERY_LARGE) max = AOMMAX(s2, max); 178 if (s3 != CDEF_VERY_LARGE) max = AOMMAX(s3, max); 179 min = AOMMIN(s0, min); 180 min = AOMMIN(s1, min); 181 min = AOMMIN(s2, min); 182 min = AOMMIN(s3, min); 183 } 184 sum += sec_taps[k] * constrain(s0 - x, sec_strength, sec_damping); 185 sum += sec_taps[k] * constrain(s1 - x, sec_strength, sec_damping); 186 sum += sec_taps[k] * constrain(s2 - x, sec_strength, sec_damping); 187 sum += sec_taps[k] * constrain(s3 - x, sec_strength, sec_damping); 188 } 189 } 190 y = ((int16_t)x + ((8 + sum - (sum < 0)) >> 4)); 191 if (clipping_required) { 192 y = clamp(y, min, max); 193 } 194 195 if (dst8) 196 dst8[i * dstride + j] = (uint8_t)y; 197 else 198 dst16[i * dstride + j] = (uint16_t)y; 199 } 200 } 201 } 202 203 void cdef_filter_8_0_c(void *dst8, int dstride, const uint16_t *in, 204 int pri_strength, int sec_strength, int dir, 205 int pri_damping, int sec_damping, int coeff_shift, 206 int block_width, int block_height) { 207 cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, 208 sec_strength, dir, pri_damping, sec_damping, 209 coeff_shift, block_width, block_height, 210 /*enable_primary=*/1, /*enable_secondary=*/1); 211 } 212 213 void cdef_filter_8_1_c(void *dst8, int dstride, const uint16_t *in, 214 int pri_strength, int sec_strength, int dir, 215 int pri_damping, int sec_damping, int coeff_shift, 216 int block_width, int block_height) { 217 cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, 218 sec_strength, dir, pri_damping, sec_damping, 219 coeff_shift, block_width, block_height, 220 /*enable_primary=*/1, /*enable_secondary=*/0); 221 } 222 223 void cdef_filter_8_2_c(void *dst8, int dstride, const uint16_t *in, 224 int pri_strength, int sec_strength, int dir, 225 int pri_damping, int sec_damping, int coeff_shift, 226 int block_width, int block_height) { 227 cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, 228 sec_strength, dir, pri_damping, sec_damping, 229 coeff_shift, block_width, block_height, 230 /*enable_primary=*/0, /*enable_secondary=*/1); 231 } 232 233 void cdef_filter_8_3_c(void *dst8, int dstride, const uint16_t *in, 234 int pri_strength, int sec_strength, int dir, 235 int pri_damping, int sec_damping, int coeff_shift, 236 int block_width, int block_height) { 237 cdef_filter_block_internal((uint8_t *)dst8, NULL, dstride, in, pri_strength, 238 sec_strength, dir, pri_damping, sec_damping, 239 coeff_shift, block_width, block_height, 240 /*enable_primary=*/0, /*enable_secondary=*/0); 241 } 242 243 void cdef_filter_16_0_c(void *dst16, int dstride, const uint16_t *in, 244 int pri_strength, int sec_strength, int dir, 245 int pri_damping, int sec_damping, int coeff_shift, 246 int block_width, int block_height) { 247 cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, 248 sec_strength, dir, pri_damping, sec_damping, 249 coeff_shift, block_width, block_height, 250 /*enable_primary=*/1, /*enable_secondary=*/1); 251 } 252 253 void cdef_filter_16_1_c(void *dst16, int dstride, const uint16_t *in, 254 int pri_strength, int sec_strength, int dir, 255 int pri_damping, int sec_damping, int coeff_shift, 256 int block_width, int block_height) { 257 cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, 258 sec_strength, dir, pri_damping, sec_damping, 259 coeff_shift, block_width, block_height, 260 /*enable_primary=*/1, /*enable_secondary=*/0); 261 } 262 263 void cdef_filter_16_2_c(void *dst16, int dstride, const uint16_t *in, 264 int pri_strength, int sec_strength, int dir, 265 int pri_damping, int sec_damping, int coeff_shift, 266 int block_width, int block_height) { 267 cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, 268 sec_strength, dir, pri_damping, sec_damping, 269 coeff_shift, block_width, block_height, 270 /*enable_primary=*/0, /*enable_secondary=*/1); 271 } 272 273 void cdef_filter_16_3_c(void *dst16, int dstride, const uint16_t *in, 274 int pri_strength, int sec_strength, int dir, 275 int pri_damping, int sec_damping, int coeff_shift, 276 int block_width, int block_height) { 277 cdef_filter_block_internal(NULL, (uint16_t *)dst16, dstride, in, pri_strength, 278 sec_strength, dir, pri_damping, sec_damping, 279 coeff_shift, block_width, block_height, 280 /*enable_primary=*/0, /*enable_secondary=*/0); 281 } 282 283 /* Compute the primary filter strength for an 8x8 block based on the 284 directional variance difference. A high variance difference means 285 that we have a highly directional pattern (e.g. a high contrast 286 edge), so we can apply more deringing. A low variance means that we 287 either have a low contrast edge, or a non-directional texture, so 288 we want to be careful not to blur. */ 289 static inline int adjust_strength(int strength, int32_t var) { 290 const int i = var >> 6 ? AOMMIN(get_msb(var >> 6), 12) : 0; 291 /* We use the variance of 8x8 blocks to adjust the strength. */ 292 return var ? (strength * (4 + i) + 8) >> 4 : 0; 293 } 294 295 static inline void aom_cdef_find_dir(const uint16_t *in, cdef_list *dlist, 296 int var[CDEF_NBLOCKS][CDEF_NBLOCKS], 297 int cdef_count, int coeff_shift, 298 int dir[CDEF_NBLOCKS][CDEF_NBLOCKS]) { 299 int bi; 300 301 // Find direction of two 8x8 blocks together. 302 for (bi = 0; bi < cdef_count - 1; bi += 2) { 303 const int by = dlist[bi].by; 304 const int bx = dlist[bi].bx; 305 const int by2 = dlist[bi + 1].by; 306 const int bx2 = dlist[bi + 1].bx; 307 const int pos1 = 8 * by * CDEF_BSTRIDE + 8 * bx; 308 const int pos2 = 8 * by2 * CDEF_BSTRIDE + 8 * bx2; 309 cdef_find_dir_dual(&in[pos1], &in[pos2], CDEF_BSTRIDE, &var[by][bx], 310 &var[by2][bx2], coeff_shift, &dir[by][bx], 311 &dir[by2][bx2]); 312 } 313 314 // Process remaining 8x8 blocks here. One 8x8 at a time. 315 if (cdef_count % 2) { 316 const int by = dlist[bi].by; 317 const int bx = dlist[bi].bx; 318 dir[by][bx] = cdef_find_dir(&in[8 * by * CDEF_BSTRIDE + 8 * bx], 319 CDEF_BSTRIDE, &var[by][bx], coeff_shift); 320 } 321 } 322 323 void av1_cdef_filter_fb(uint8_t *dst8, uint16_t *dst16, int dstride, 324 const uint16_t *in, int xdec, int ydec, 325 int dir[CDEF_NBLOCKS][CDEF_NBLOCKS], int *dirinit, 326 int var[CDEF_NBLOCKS][CDEF_NBLOCKS], int pli, 327 cdef_list *dlist, int cdef_count, int level, 328 int sec_strength, int damping, int coeff_shift) { 329 int bi; 330 int bx; 331 int by; 332 const int pri_strength = level << coeff_shift; 333 sec_strength <<= coeff_shift; 334 damping += coeff_shift - (pli != AOM_PLANE_Y); 335 const int bw_log2 = 3 - xdec; 336 const int bh_log2 = 3 - ydec; 337 if (dirinit && pri_strength == 0 && sec_strength == 0) { 338 // If we're here, both primary and secondary strengths are 0, and 339 // we still haven't written anything to y[] yet, so we just copy 340 // the input to y[]. This is necessary only for av1_cdef_search() 341 // and only av1_cdef_search() sets dirinit. 342 for (bi = 0; bi < cdef_count; bi++) { 343 by = dlist[bi].by; 344 bx = dlist[bi].bx; 345 // TODO(stemidts/jmvalin): SIMD optimisations 346 for (int iy = 0; iy < 1 << bh_log2; iy++) { 347 memcpy(&dst16[(bi << (bw_log2 + bh_log2)) + (iy << bw_log2)], 348 &in[((by << bh_log2) + iy) * CDEF_BSTRIDE + (bx << bw_log2)], 349 ((size_t)1 << bw_log2) * sizeof(*dst16)); 350 } 351 } 352 return; 353 } 354 355 if (pli == 0) { 356 if (!dirinit || !*dirinit) { 357 aom_cdef_find_dir(in, dlist, var, cdef_count, coeff_shift, dir); 358 if (dirinit) *dirinit = 1; 359 } 360 } 361 if (pli == 1 && xdec != ydec) { 362 for (bi = 0; bi < cdef_count; bi++) { 363 static const int conv422[8] = { 7, 0, 2, 4, 5, 6, 6, 6 }; 364 static const int conv440[8] = { 1, 2, 2, 2, 3, 4, 6, 0 }; 365 by = dlist[bi].by; 366 bx = dlist[bi].bx; 367 dir[by][bx] = (xdec ? conv422 : conv440)[dir[by][bx]]; 368 } 369 } 370 371 if (dst8) { 372 const int block_width = 8 >> xdec; 373 const int block_height = 8 >> ydec; 374 /* 375 * strength_index == 0 : enable_primary = 1, enable_secondary = 1 376 * strength_index == 1 : enable_primary = 1, enable_secondary = 0 377 * strength_index == 2 : enable_primary = 0, enable_secondary = 1 378 * strength_index == 3 : enable_primary = 0, enable_secondary = 0 379 */ 380 const cdef_filter_block_func cdef_filter_fn[4] = { 381 cdef_filter_8_0, cdef_filter_8_1, cdef_filter_8_2, cdef_filter_8_3 382 }; 383 384 for (bi = 0; bi < cdef_count; bi++) { 385 by = dlist[bi].by; 386 bx = dlist[bi].bx; 387 const int t = 388 (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx])); 389 const int strength_index = (sec_strength == 0) | ((t == 0) << 1); 390 391 cdef_filter_fn[strength_index]( 392 &dst8[(by << bh_log2) * dstride + (bx << bw_log2)], dstride, 393 &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t, 394 sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping, 395 coeff_shift, block_width, block_height); 396 } 397 } else { 398 const int block_width = 8 >> xdec; 399 const int block_height = 8 >> ydec; 400 /* 401 * strength_index == 0 : enable_primary = 1, enable_secondary = 1 402 * strength_index == 1 : enable_primary = 1, enable_secondary = 0 403 * strength_index == 2 : enable_primary = 0, enable_secondary = 1 404 * strength_index == 3 : enable_primary = 0, enable_secondary = 0 405 */ 406 const cdef_filter_block_func cdef_filter_fn[4] = { 407 cdef_filter_16_0, cdef_filter_16_1, cdef_filter_16_2, cdef_filter_16_3 408 }; 409 410 for (bi = 0; bi < cdef_count; bi++) { 411 by = dlist[bi].by; 412 bx = dlist[bi].bx; 413 const int t = 414 (pli ? pri_strength : adjust_strength(pri_strength, var[by][bx])); 415 const int strength_index = (sec_strength == 0) | ((t == 0) << 1); 416 417 cdef_filter_fn[strength_index]( 418 &dst16[dirinit ? bi << (bw_log2 + bh_log2) 419 : (by << bh_log2) * dstride + (bx << bw_log2)], 420 dirinit ? 1 << bw_log2 : dstride, 421 &in[(by * CDEF_BSTRIDE << bh_log2) + (bx << bw_log2)], t, 422 sec_strength, pri_strength ? dir[by][bx] : 0, damping, damping, 423 coeff_shift, block_width, block_height); 424 } 425 } 426 }