/* filmgrain_tmpl.c (18884B) */
1 /* 2 * Copyright © 2018, Niklas Haas 3 * Copyright © 2018, VideoLAN and dav1d authors 4 * Copyright © 2018, Two Orioles, LLC 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions are met: 9 * 10 * 1. Redistributions of source code must retain the above copyright notice, this 11 * list of conditions and the following disclaimer. 12 * 13 * 2. Redistributions in binary form must reproduce the above copyright notice, 14 * this list of conditions and the following disclaimer in the documentation 15 * and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 19 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 20 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 21 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 22 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 23 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 24 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 25 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 26 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
27 */ 28 29 #include "common/attributes.h" 30 #include "common/intops.h" 31 32 #include "src/filmgrain.h" 33 #include "src/tables.h" 34 35 #define SUB_GRAIN_WIDTH 44 36 #define SUB_GRAIN_HEIGHT 38 37 38 static inline int get_random_number(const int bits, unsigned *const state) { 39 const int r = *state; 40 unsigned bit = ((r >> 0) ^ (r >> 1) ^ (r >> 3) ^ (r >> 12)) & 1; 41 *state = (r >> 1) | (bit << 15); 42 43 return (*state >> (16 - bits)) & ((1 << bits) - 1); 44 } 45 46 static inline int round2(const int x, const uint64_t shift) { 47 return (x + ((1 << shift) >> 1)) >> shift; 48 } 49 50 static void generate_grain_y_c(entry buf[][GRAIN_WIDTH], 51 const Dav1dFilmGrainData *const data 52 HIGHBD_DECL_SUFFIX) 53 { 54 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; 55 unsigned seed = data->seed; 56 const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; 57 const int grain_ctr = 128 << bitdepth_min_8; 58 const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; 59 60 for (int y = 0; y < GRAIN_HEIGHT; y++) { 61 for (int x = 0; x < GRAIN_WIDTH; x++) { 62 const int value = get_random_number(11, &seed); 63 buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift); 64 } 65 } 66 67 const int ar_pad = 3; 68 const int ar_lag = data->ar_coeff_lag; 69 70 for (int y = ar_pad; y < GRAIN_HEIGHT; y++) { 71 for (int x = ar_pad; x < GRAIN_WIDTH - ar_pad; x++) { 72 const int8_t *coeff = data->ar_coeffs_y; 73 int sum = 0; 74 for (int dy = -ar_lag; dy <= 0; dy++) { 75 for (int dx = -ar_lag; dx <= ar_lag; dx++) { 76 if (!dx && !dy) 77 break; 78 sum += *(coeff++) * buf[y + dy][x + dx]; 79 } 80 } 81 82 const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); 83 buf[y][x] = iclip(grain, grain_min, grain_max); 84 } 85 } 86 } 87 88 static NOINLINE void 89 generate_grain_uv_c(entry buf[][GRAIN_WIDTH], 90 const entry buf_y[][GRAIN_WIDTH], 91 const Dav1dFilmGrainData *const data, const intptr_t uv, 92 const int subx, const int suby HIGHBD_DECL_SUFFIX) 93 { 94 
const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; 95 unsigned seed = data->seed ^ (uv ? 0x49d8 : 0xb524); 96 const int shift = 4 - bitdepth_min_8 + data->grain_scale_shift; 97 const int grain_ctr = 128 << bitdepth_min_8; 98 const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; 99 100 const int chromaW = subx ? SUB_GRAIN_WIDTH : GRAIN_WIDTH; 101 const int chromaH = suby ? SUB_GRAIN_HEIGHT : GRAIN_HEIGHT; 102 103 for (int y = 0; y < chromaH; y++) { 104 for (int x = 0; x < chromaW; x++) { 105 const int value = get_random_number(11, &seed); 106 buf[y][x] = round2(dav1d_gaussian_sequence[ value ], shift); 107 } 108 } 109 110 const int ar_pad = 3; 111 const int ar_lag = data->ar_coeff_lag; 112 113 for (int y = ar_pad; y < chromaH; y++) { 114 for (int x = ar_pad; x < chromaW - ar_pad; x++) { 115 const int8_t *coeff = data->ar_coeffs_uv[uv]; 116 int sum = 0; 117 for (int dy = -ar_lag; dy <= 0; dy++) { 118 for (int dx = -ar_lag; dx <= ar_lag; dx++) { 119 // For the final (current) pixel, we need to add in the 120 // contribution from the luma grain texture 121 if (!dx && !dy) { 122 if (!data->num_y_points) 123 break; 124 int luma = 0; 125 const int lumaX = ((x - ar_pad) << subx) + ar_pad; 126 const int lumaY = ((y - ar_pad) << suby) + ar_pad; 127 for (int i = 0; i <= suby; i++) { 128 for (int j = 0; j <= subx; j++) { 129 luma += buf_y[lumaY + i][lumaX + j]; 130 } 131 } 132 luma = round2(luma, subx + suby); 133 sum += luma * (*coeff); 134 break; 135 } 136 137 sum += *(coeff++) * buf[y + dy][x + dx]; 138 } 139 } 140 141 const int grain = buf[y][x] + round2(sum, data->ar_coeff_shift); 142 buf[y][x] = iclip(grain, grain_min, grain_max); 143 } 144 } 145 } 146 147 #define gnuv_ss_fn(nm, ss_x, ss_y) \ 148 static decl_generate_grain_uv_fn(generate_grain_uv_##nm##_c) { \ 149 generate_grain_uv_c(buf, buf_y, data, uv, ss_x, ss_y HIGHBD_TAIL_SUFFIX); \ 150 } 151 152 gnuv_ss_fn(420, 1, 1); 153 gnuv_ss_fn(422, 1, 0); 154 gnuv_ss_fn(444, 0, 0); 155 156 // samples from 
the correct block of a grain LUT, while taking into account the 157 // offsets provided by the offsets cache 158 static inline entry sample_lut(const entry grain_lut[][GRAIN_WIDTH], 159 const int offsets[2][2], const int subx, const int suby, 160 const int bx, const int by, const int x, const int y) 161 { 162 const int randval = offsets[bx][by]; 163 const int offx = 3 + (2 >> subx) * (3 + (randval >> 4)); 164 const int offy = 3 + (2 >> suby) * (3 + (randval & 0xF)); 165 return grain_lut[offy + y + (FG_BLOCK_SIZE >> suby) * by] 166 [offx + x + (FG_BLOCK_SIZE >> subx) * bx]; 167 } 168 169 static void fgy_32x32xn_c(pixel *const dst_row, const pixel *const src_row, 170 const ptrdiff_t stride, 171 const Dav1dFilmGrainData *const data, const size_t pw, 172 const uint8_t scaling[SCALING_SIZE], 173 const entry grain_lut[][GRAIN_WIDTH], 174 const int bh, const int row_num HIGHBD_DECL_SUFFIX) 175 { 176 const int rows = 1 + (data->overlap_flag && row_num > 0); 177 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; 178 const int grain_ctr = 128 << bitdepth_min_8; 179 const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; 180 181 int min_value, max_value; 182 if (data->clip_to_restricted_range) { 183 min_value = 16 << bitdepth_min_8; 184 max_value = 235 << bitdepth_min_8; 185 } else { 186 min_value = 0; 187 max_value = BITDEPTH_MAX; 188 } 189 190 // seed[0] contains the current row, seed[1] contains the previous 191 unsigned seed[2]; 192 for (int i = 0; i < rows; i++) { 193 seed[i] = data->seed; 194 seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; 195 seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); 196 } 197 198 assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0); 199 200 int offsets[2 /* col offset */][2 /* row offset */]; 201 202 // process this row in FG_BLOCK_SIZE^2 blocks 203 for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE) { 204 const int bw = imin(FG_BLOCK_SIZE, (int) pw - bx); 205 206 if (data->overlap_flag && bx) { 207 // shift previous 
offsets left 208 for (int i = 0; i < rows; i++) 209 offsets[1][i] = offsets[0][i]; 210 } 211 212 // update current offsets 213 for (int i = 0; i < rows; i++) 214 offsets[0][i] = get_random_number(8, &seed[i]); 215 216 // x/y block offsets to compensate for overlapped regions 217 const int ystart = data->overlap_flag && row_num ? imin(2, bh) : 0; 218 const int xstart = data->overlap_flag && bx ? imin(2, bw) : 0; 219 220 static const int w[2][2] = { { 27, 17 }, { 17, 27 } }; 221 222 #define add_noise_y(x, y, grain) \ 223 const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (x) + bx; \ 224 pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (x) + bx; \ 225 const int noise = round2(scaling[ *src ] * (grain), data->scaling_shift); \ 226 *dst = iclip(*src + noise, min_value, max_value); 227 228 for (int y = ystart; y < bh; y++) { 229 // Non-overlapped image region (straightforward) 230 for (int x = xstart; x < bw; x++) { 231 int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); 232 add_noise_y(x, y, grain); 233 } 234 235 // Special case for overlapped column 236 for (int x = 0; x < xstart; x++) { 237 int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); 238 int old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); 239 grain = round2(old * w[x][0] + grain * w[x][1], 5); 240 grain = iclip(grain, grain_min, grain_max); 241 add_noise_y(x, y, grain); 242 } 243 } 244 245 for (int y = 0; y < ystart; y++) { 246 // Special case for overlapped row (sans corner) 247 for (int x = xstart; x < bw; x++) { 248 int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); 249 int old = sample_lut(grain_lut, offsets, 0, 0, 0, 1, x, y); 250 grain = round2(old * w[y][0] + grain * w[y][1], 5); 251 grain = iclip(grain, grain_min, grain_max); 252 add_noise_y(x, y, grain); 253 } 254 255 // Special case for doubly-overlapped corner 256 for (int x = 0; x < xstart; x++) { 257 // Blend the top pixel with the top left block 258 int top = sample_lut(grain_lut, offsets, 0, 
0, 0, 1, x, y); 259 int old = sample_lut(grain_lut, offsets, 0, 0, 1, 1, x, y); 260 top = round2(old * w[x][0] + top * w[x][1], 5); 261 top = iclip(top, grain_min, grain_max); 262 263 // Blend the current pixel with the left block 264 int grain = sample_lut(grain_lut, offsets, 0, 0, 0, 0, x, y); 265 old = sample_lut(grain_lut, offsets, 0, 0, 1, 0, x, y); 266 grain = round2(old * w[x][0] + grain * w[x][1], 5); 267 grain = iclip(grain, grain_min, grain_max); 268 269 // Mix the row rows together and apply grain 270 grain = round2(top * w[y][0] + grain * w[y][1], 5); 271 grain = iclip(grain, grain_min, grain_max); 272 add_noise_y(x, y, grain); 273 } 274 } 275 } 276 } 277 278 static NOINLINE void 279 fguv_32x32xn_c(pixel *const dst_row, const pixel *const src_row, 280 const ptrdiff_t stride, const Dav1dFilmGrainData *const data, 281 const size_t pw, const uint8_t scaling[SCALING_SIZE], 282 const entry grain_lut[][GRAIN_WIDTH], const int bh, 283 const int row_num, const pixel *const luma_row, 284 const ptrdiff_t luma_stride, const int uv, const int is_id, 285 const int sx, const int sy HIGHBD_DECL_SUFFIX) 286 { 287 const int rows = 1 + (data->overlap_flag && row_num > 0); 288 const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8; 289 const int grain_ctr = 128 << bitdepth_min_8; 290 const int grain_min = -grain_ctr, grain_max = grain_ctr - 1; 291 292 int min_value, max_value; 293 if (data->clip_to_restricted_range) { 294 min_value = 16 << bitdepth_min_8; 295 max_value = (is_id ? 
235 : 240) << bitdepth_min_8; 296 } else { 297 min_value = 0; 298 max_value = BITDEPTH_MAX; 299 } 300 301 // seed[0] contains the current row, seed[1] contains the previous 302 unsigned seed[2]; 303 for (int i = 0; i < rows; i++) { 304 seed[i] = data->seed; 305 seed[i] ^= (((row_num - i) * 37 + 178) & 0xFF) << 8; 306 seed[i] ^= (((row_num - i) * 173 + 105) & 0xFF); 307 } 308 309 assert(stride % (FG_BLOCK_SIZE * sizeof(pixel)) == 0); 310 311 int offsets[2 /* col offset */][2 /* row offset */]; 312 313 // process this row in FG_BLOCK_SIZE^2 blocks (subsampled) 314 for (unsigned bx = 0; bx < pw; bx += FG_BLOCK_SIZE >> sx) { 315 const int bw = imin(FG_BLOCK_SIZE >> sx, (int)(pw - bx)); 316 if (data->overlap_flag && bx) { 317 // shift previous offsets left 318 for (int i = 0; i < rows; i++) 319 offsets[1][i] = offsets[0][i]; 320 } 321 322 // update current offsets 323 for (int i = 0; i < rows; i++) 324 offsets[0][i] = get_random_number(8, &seed[i]); 325 326 // x/y block offsets to compensate for overlapped regions 327 const int ystart = data->overlap_flag && row_num ? imin(2 >> sy, bh) : 0; 328 const int xstart = data->overlap_flag && bx ? 
imin(2 >> sx, bw) : 0; 329 330 static const int w[2 /* sub */][2 /* off */][2] = { 331 { { 27, 17 }, { 17, 27 } }, 332 { { 23, 22 } }, 333 }; 334 335 #define add_noise_uv(x, y, grain) \ 336 const int lx = (bx + x) << sx; \ 337 const int ly = y << sy; \ 338 const pixel *const luma = luma_row + ly * PXSTRIDE(luma_stride) + lx; \ 339 pixel avg = luma[0]; \ 340 if (sx) \ 341 avg = (avg + luma[1] + 1) >> 1; \ 342 const pixel *const src = src_row + (y) * PXSTRIDE(stride) + (bx + (x)); \ 343 pixel *const dst = dst_row + (y) * PXSTRIDE(stride) + (bx + (x)); \ 344 int val = avg; \ 345 if (!data->chroma_scaling_from_luma) { \ 346 const int combined = avg * data->uv_luma_mult[uv] + \ 347 *src * data->uv_mult[uv]; \ 348 val = iclip_pixel( (combined >> 6) + \ 349 (data->uv_offset[uv] * (1 << bitdepth_min_8)) ); \ 350 } \ 351 const int noise = round2(scaling[ val ] * (grain), data->scaling_shift); \ 352 *dst = iclip(*src + noise, min_value, max_value); 353 354 for (int y = ystart; y < bh; y++) { 355 // Non-overlapped image region (straightforward) 356 for (int x = xstart; x < bw; x++) { 357 int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); 358 add_noise_uv(x, y, grain); 359 } 360 361 // Special case for overlapped column 362 for (int x = 0; x < xstart; x++) { 363 int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); 364 int old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); 365 grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5); 366 grain = iclip(grain, grain_min, grain_max); 367 add_noise_uv(x, y, grain); 368 } 369 } 370 371 for (int y = 0; y < ystart; y++) { 372 // Special case for overlapped row (sans corner) 373 for (int x = xstart; x < bw; x++) { 374 int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); 375 int old = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); 376 grain = round2(old * w[sy][y][0] + grain * w[sy][y][1], 5); 377 grain = iclip(grain, grain_min, grain_max); 378 add_noise_uv(x, y, grain); 379 } 380 381 // 
Special case for doubly-overlapped corner 382 for (int x = 0; x < xstart; x++) { 383 // Blend the top pixel with the top left block 384 int top = sample_lut(grain_lut, offsets, sx, sy, 0, 1, x, y); 385 int old = sample_lut(grain_lut, offsets, sx, sy, 1, 1, x, y); 386 top = round2(old * w[sx][x][0] + top * w[sx][x][1], 5); 387 top = iclip(top, grain_min, grain_max); 388 389 // Blend the current pixel with the left block 390 int grain = sample_lut(grain_lut, offsets, sx, sy, 0, 0, x, y); 391 old = sample_lut(grain_lut, offsets, sx, sy, 1, 0, x, y); 392 grain = round2(old * w[sx][x][0] + grain * w[sx][x][1], 5); 393 grain = iclip(grain, grain_min, grain_max); 394 395 // Mix the row rows together and apply to image 396 grain = round2(top * w[sy][y][0] + grain * w[sy][y][1], 5); 397 grain = iclip(grain, grain_min, grain_max); 398 add_noise_uv(x, y, grain); 399 } 400 } 401 } 402 } 403 404 #define fguv_ss_fn(nm, ss_x, ss_y) \ 405 static decl_fguv_32x32xn_fn(fguv_32x32xn_##nm##_c) { \ 406 fguv_32x32xn_c(dst_row, src_row, stride, data, pw, scaling, grain_lut, bh, \ 407 row_num, luma_row, luma_stride, uv_pl, is_id, ss_x, ss_y \ 408 HIGHBD_TAIL_SUFFIX); \ 409 } 410 411 fguv_ss_fn(420, 1, 1); 412 fguv_ss_fn(422, 1, 0); 413 fguv_ss_fn(444, 0, 0); 414 415 #if HAVE_ASM 416 #if ARCH_AARCH64 || ARCH_ARM 417 #include "src/arm/filmgrain.h" 418 #elif ARCH_X86 419 #include "src/x86/filmgrain.h" 420 #endif 421 #endif 422 423 COLD void bitfn(dav1d_film_grain_dsp_init)(Dav1dFilmGrainDSPContext *const c) { 424 c->generate_grain_y = generate_grain_y_c; 425 c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I420 - 1] = generate_grain_uv_420_c; 426 c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I422 - 1] = generate_grain_uv_422_c; 427 c->generate_grain_uv[DAV1D_PIXEL_LAYOUT_I444 - 1] = generate_grain_uv_444_c; 428 429 c->fgy_32x32xn = fgy_32x32xn_c; 430 c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I420 - 1] = fguv_32x32xn_420_c; 431 c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I422 - 1] = fguv_32x32xn_422_c; 432 
c->fguv_32x32xn[DAV1D_PIXEL_LAYOUT_I444 - 1] = fguv_32x32xn_444_c; 433 434 #if HAVE_ASM 435 #if ARCH_AARCH64 || ARCH_ARM 436 film_grain_dsp_init_arm(c); 437 #elif ARCH_X86 438 film_grain_dsp_init_x86(c); 439 #endif 440 #endif 441 }