hash_motion.c (18662B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <stdbool.h> 14 15 #include "config/av1_rtcd.h" 16 17 #include "av1/encoder/block.h" 18 #include "av1/encoder/hash.h" 19 #include "av1/encoder/hash_motion.h" 20 21 #define kSrcBits 16 22 // kMaxAddr is the number of hash table buckets in p_hash_table->p_lookup_table. 23 // p_hash_table->p_lookup_table consists of 6 hash tables of 1 << kSrcBits 24 // buckets each. Each of the 6 supported block sizes (4, 8, 16, 32, 64, 128) has 25 // its own hash table, indexed by the return value of 26 // hash_block_size_to_index(). 27 #define kMaxAddr (6 << kSrcBits) 28 #define kMaxCandidatesPerHashBucket 256 29 30 static void get_pixels_in_1D_char_array_by_block_2x2(const uint8_t *y_src, 31 int stride, 32 uint8_t *p_pixels_in1D) { 33 const uint8_t *p_pel = y_src; 34 int index = 0; 35 for (int i = 0; i < 2; i++) { 36 for (int j = 0; j < 2; j++) { 37 p_pixels_in1D[index++] = p_pel[j]; 38 } 39 p_pel += stride; 40 } 41 } 42 43 static void get_pixels_in_1D_short_array_by_block_2x2(const uint16_t *y_src, 44 int stride, 45 uint16_t *p_pixels_in1D) { 46 const uint16_t *p_pel = y_src; 47 int index = 0; 48 for (int i = 0; i < 2; i++) { 49 for (int j = 0; j < 2; j++) { 50 p_pixels_in1D[index++] = p_pel[j]; 51 } 52 p_pel += stride; 53 } 54 } 55 56 // the hash value (hash_value1) consists of two parts, the first 3 bits relate 57 // to the block size and the remaining 16 bits are the crc values. This 58 // function is used to get the first 3 bits. 59 static int hash_block_size_to_index(int block_size) { 60 switch (block_size) { 61 case 4: return 0; 62 case 8: return 1; 63 case 16: return 2; 64 case 32: return 3; 65 case 64: return 4; 66 case 128: return 5; 67 default: return -1; 68 } 69 } 70 71 static uint32_t get_identity_hash_value(const uint8_t a, const uint8_t b, 72 const uint8_t c, const uint8_t d) { 73 // The four input values add up to 32 bits, which is the size of the output. 74 // Just pack those values as is. 75 return ((uint32_t)a << 24) + ((uint32_t)b << 16) + ((uint32_t)c << 8) + 76 ((uint32_t)d); 77 } 78 79 static uint32_t get_xor_hash_value_hbd(const uint16_t a, const uint16_t b, 80 const uint16_t c, const uint16_t d) { 81 uint32_t result; 82 // Pack the lower 8 bits of each input value to the 32 bit output, then xor 83 // with the upper 8 bits of each input value. 84 result = ((uint32_t)(a & 0x00ff) << 24) + ((uint32_t)(b & 0x00ff) << 16) + 85 ((uint32_t)(c & 0x00ff) << 8) + ((uint32_t)(d & 0x00ff)); 86 result ^= ((uint32_t)(a & 0xff00) << 16) + ((uint32_t)(b & 0xff00) << 8) + 87 ((uint32_t)(c & 0xff00)) + ((uint32_t)(d & 0xff00) >> 8); 88 return result; 89 } 90 91 void av1_hash_table_init(IntraBCHashInfo *intrabc_hash_info) { 92 if (!intrabc_hash_info->crc_initialized) { 93 av1_crc32c_calculator_init(&intrabc_hash_info->crc_calculator); 94 intrabc_hash_info->crc_initialized = 1; 95 } 96 intrabc_hash_info->intrabc_hash_table.p_lookup_table = NULL; 97 } 98 99 static void clear_all(hash_table *p_hash_table) { 100 if (p_hash_table->p_lookup_table == NULL) { 101 return; 102 } 103 for (int i = 0; i < kMaxAddr; i++) { 104 if (p_hash_table->p_lookup_table[i] != NULL) { 105 aom_vector_destroy(p_hash_table->p_lookup_table[i]); 106 aom_free(p_hash_table->p_lookup_table[i]); 107 p_hash_table->p_lookup_table[i] = NULL; 108 } 109 } 110 } 111 112 void av1_hash_table_destroy(hash_table *p_hash_table) { 113 clear_all(p_hash_table); 114 aom_free(p_hash_table->p_lookup_table); 115 p_hash_table->p_lookup_table = NULL; 116 } 117 118 bool av1_hash_table_create(hash_table *p_hash_table) { 119 if (p_hash_table->p_lookup_table != NULL) { 120 clear_all(p_hash_table); 121 return true; 122 } 123 p_hash_table->p_lookup_table = 124 (Vector **)aom_calloc(kMaxAddr, sizeof(p_hash_table->p_lookup_table[0])); 125 if (!p_hash_table->p_lookup_table) return false; 126 return true; 127 } 128 129 static bool hash_table_add_to_table(hash_table *p_hash_table, 130 uint32_t hash_value, 131 const block_hash *curr_block_hash) { 132 if (p_hash_table->p_lookup_table[hash_value] == NULL) { 133 p_hash_table->p_lookup_table[hash_value] = 134 aom_malloc(sizeof(*p_hash_table->p_lookup_table[hash_value])); 135 if (p_hash_table->p_lookup_table[hash_value] == NULL) { 136 return false; 137 } 138 if (aom_vector_setup(p_hash_table->p_lookup_table[hash_value], 10, 139 sizeof(*curr_block_hash)) == VECTOR_ERROR) 140 return false; 141 } 142 // Place an upper bound each hash table bucket to up to 256 intrabc 143 // block candidates, and ignore subsequent ones. Considering more can 144 // unnecessarily slow down encoding for virtually no efficiency gain. 145 if (aom_vector_byte_size(p_hash_table->p_lookup_table[hash_value]) < 146 kMaxCandidatesPerHashBucket * sizeof(*curr_block_hash)) { 147 if (aom_vector_push_back(p_hash_table->p_lookup_table[hash_value], 148 (void *)curr_block_hash) == VECTOR_ERROR) 149 return false; 150 } 151 return true; 152 } 153 154 int32_t av1_hash_table_count(const hash_table *p_hash_table, 155 uint32_t hash_value) { 156 if (p_hash_table->p_lookup_table[hash_value] == NULL) { 157 return 0; 158 } else { 159 return (int32_t)(p_hash_table->p_lookup_table[hash_value]->size); 160 } 161 } 162 163 Iterator av1_hash_get_first_iterator(hash_table *p_hash_table, 164 uint32_t hash_value) { 165 assert(av1_hash_table_count(p_hash_table, hash_value) > 0); 166 return aom_vector_begin(p_hash_table->p_lookup_table[hash_value]); 167 } 168 169 void av1_generate_block_2x2_hash_value(const YV12_BUFFER_CONFIG *picture, 170 uint32_t *pic_block_hash) { 171 const int width = 2; 172 const int height = 2; 173 const int x_end = picture->y_crop_width - width + 1; 174 const int y_end = picture->y_crop_height - height + 1; 175 176 if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { 177 uint16_t p[4]; 178 int pos = 0; 179 for (int y_pos = 0; y_pos < y_end; y_pos++) { 180 for (int x_pos = 0; x_pos < x_end; x_pos++) { 181 get_pixels_in_1D_short_array_by_block_2x2( 182 CONVERT_TO_SHORTPTR(picture->y_buffer) + y_pos * picture->y_stride + 183 x_pos, 184 picture->y_stride, p); 185 // For HBD, we either have 40 or 48 bits of input data that the xor hash 186 // reduce to 32 bits. We intentionally don't want to "discard" bits to 187 // avoid any kind of biasing. 188 pic_block_hash[pos] = get_xor_hash_value_hbd(p[0], p[1], p[2], p[3]); 189 pos++; 190 } 191 pos += width - 1; 192 } 193 } else { 194 uint8_t p[4]; 195 int pos = 0; 196 for (int y_pos = 0; y_pos < y_end; y_pos++) { 197 for (int x_pos = 0; x_pos < x_end; x_pos++) { 198 get_pixels_in_1D_char_array_by_block_2x2( 199 picture->y_buffer + y_pos * picture->y_stride + x_pos, 200 picture->y_stride, p); 201 // This 2x2 hash isn't used directly as a "key" for the hash table, so 202 // we can afford to just copy the 4 8-bit pixel values as a single 203 // 32-bit value directly. (i.e. there are no concerns of a lack of 204 // uniform distribution) 205 pic_block_hash[pos] = get_identity_hash_value(p[0], p[1], p[2], p[3]); 206 pos++; 207 } 208 pos += width - 1; 209 } 210 } 211 } 212 213 void av1_generate_block_hash_value(IntraBCHashInfo *intrabc_hash_info, 214 const YV12_BUFFER_CONFIG *picture, 215 int block_size, 216 const uint32_t *src_pic_block_hash, 217 uint32_t *dst_pic_block_hash) { 218 CRC32C *calc = &intrabc_hash_info->crc_calculator; 219 220 const int pic_width = picture->y_crop_width; 221 const int x_end = picture->y_crop_width - block_size + 1; 222 const int y_end = picture->y_crop_height - block_size + 1; 223 const int src_size = block_size >> 1; 224 225 uint32_t p[4]; 226 const int length = sizeof(p); 227 228 int pos = 0; 229 for (int y_pos = 0; y_pos < y_end; y_pos++) { 230 for (int x_pos = 0; x_pos < x_end; x_pos++) { 231 // Build up a bigger block from 4 smaller, non-overlapping source block 232 // hashes, and compute its hash. Note: source blocks at the right and 233 // bottom borders cannot be part of larger blocks, therefore they won't be 234 // considered into the block hash value generation process. 235 p[0] = src_pic_block_hash[pos]; 236 p[1] = src_pic_block_hash[pos + src_size]; 237 p[2] = src_pic_block_hash[pos + src_size * pic_width]; 238 p[3] = src_pic_block_hash[pos + src_size * pic_width + src_size]; 239 // TODO: bug aomedia:433531610 - serialize input values in a way that's 240 // independent of the computer architecture's endianness 241 dst_pic_block_hash[pos] = 242 av1_get_crc32c_value(calc, (uint8_t *)p, length); 243 pos++; 244 } 245 pos += block_size - 1; 246 } 247 } 248 249 bool av1_add_to_hash_map_by_row_with_precal_data(hash_table *p_hash_table, 250 const uint32_t *pic_hash, 251 int pic_width, int pic_height, 252 int block_size) { 253 const int x_end = pic_width - block_size + 1; 254 const int y_end = pic_height - block_size + 1; 255 256 int add_value = hash_block_size_to_index(block_size); 257 assert(add_value >= 0); 258 add_value <<= kSrcBits; 259 const int crc_mask = (1 << kSrcBits) - 1; 260 int step = block_size; 261 int x_offset = 0; 262 int y_offset = 0; 263 264 // Explore the entire frame hierarchically to add intrabc candidate blocks to 265 // the hash table, by starting with coarser steps (the block size), towards 266 // finer-grained steps until every candidate block has been considered. 267 // The nested for loop goes through the pic_hash array column by column. 268 269 // Doing a hierarchical block exploration helps maximize spatial dispersion 270 // of the first and foremost candidate blocks while minimizing overlap between 271 // them. This is helpful because we only keep up to 256 entries of the 272 // same candidate block (located in different places), so we want those 273 // entries to cover the biggest area of the image to encode to maximize coding 274 // efficiency. 275 276 // This is the coordinate exploration order example for an 8x8 region, with 277 // block_size = 4. The top-left corner (x, y) coordinates of each candidate 278 // block are shown below. There are 5 * 5 (25) candidate blocks. 279 // x 0 1 2 3 4 5 6 7 280 // y +------------------------ 281 // 0 | 1 10 5 13 3 282 // 1 | 16 22 18 24 20 283 // 2 | 7 11 9 14 8 284 // 3 | 17 23 19 25 21 285 // 4 | 2 12 6 15 4--------+ 286 // 5 | | 4 x 4 | 287 // 6 | | block | 288 // 7 | +--------+ 289 290 // Please note that due to the way block exploration works, the smallest step 291 // used is 2 (i.e. no two adjacent blocks will be explored consecutively). 292 // Also, the exploration is designed to visit each block candidate only once. 293 while (step > 1) { 294 for (int x_pos = x_offset; x_pos < x_end; x_pos += step) { 295 for (int y_pos = y_offset; y_pos < y_end; y_pos += step) { 296 const int pos = y_pos * pic_width + x_pos; 297 block_hash curr_block_hash; 298 299 curr_block_hash.x = x_pos; 300 curr_block_hash.y = y_pos; 301 302 const uint32_t hash_value1 = (pic_hash[pos] & crc_mask) + add_value; 303 curr_block_hash.hash_value2 = pic_hash[pos]; 304 305 if (!hash_table_add_to_table(p_hash_table, hash_value1, 306 &curr_block_hash)) { 307 return false; 308 } 309 } 310 } 311 312 // Adjust offsets and step sizes with this state machine. 313 // State 0 is needed because no blocks in pic_hash have been explored, 314 // so exploration requires a way to account for blocks with both zero 315 // x_offset and zero y_offset. 316 // State 0 is always meant to be executed first, but the relative order of 317 // states 1, 2 and 3 can be arbitrary, as long as no two adjacent blocks 318 // are explored consecutively. 319 if (x_offset == 0 && y_offset == 0) { 320 // State 0 -> State 1: special case 321 // This state transition will only execute when step == block_size 322 x_offset = step / 2; 323 } else if (x_offset == step / 2 && y_offset == 0) { 324 // State 1 -> State 2 325 x_offset = 0; 326 y_offset = step / 2; 327 } else if (x_offset == 0 && y_offset == step / 2) { 328 // State 2 -> State 3 329 x_offset = step / 2; 330 } else { 331 assert(x_offset == step / 2 && y_offset == step / 2); 332 // State 3 -> State 1: We've fully explored all the coordinates for the 333 // current step size, continue by halving the step size 334 step /= 2; 335 x_offset = step / 2; 336 y_offset = 0; 337 } 338 } 339 340 return true; 341 } 342 343 int av1_hash_is_horizontal_perfect(const YV12_BUFFER_CONFIG *picture, 344 int block_size, int x_start, int y_start) { 345 const int stride = picture->y_stride; 346 const uint8_t *p = picture->y_buffer + y_start * stride + x_start; 347 348 if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { 349 const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); 350 for (int i = 0; i < block_size; i++) { 351 for (int j = 1; j < block_size; j++) { 352 if (p16[j] != p16[0]) { 353 return 0; 354 } 355 } 356 p16 += stride; 357 } 358 } else { 359 for (int i = 0; i < block_size; i++) { 360 for (int j = 1; j < block_size; j++) { 361 if (p[j] != p[0]) { 362 return 0; 363 } 364 } 365 p += stride; 366 } 367 } 368 369 return 1; 370 } 371 372 int av1_hash_is_vertical_perfect(const YV12_BUFFER_CONFIG *picture, 373 int block_size, int x_start, int y_start) { 374 const int stride = picture->y_stride; 375 const uint8_t *p = picture->y_buffer + y_start * stride + x_start; 376 377 if (picture->flags & YV12_FLAG_HIGHBITDEPTH) { 378 const uint16_t *p16 = CONVERT_TO_SHORTPTR(p); 379 for (int i = 0; i < block_size; i++) { 380 for (int j = 1; j < block_size; j++) { 381 if (p16[j * stride + i] != p16[i]) { 382 return 0; 383 } 384 } 385 } 386 } else { 387 for (int i = 0; i < block_size; i++) { 388 for (int j = 1; j < block_size; j++) { 389 if (p[j * stride + i] != p[i]) { 390 return 0; 391 } 392 } 393 } 394 } 395 return 1; 396 } 397 398 void av1_get_block_hash_value(IntraBCHashInfo *intra_bc_hash_info, 399 const uint8_t *y_src, int stride, int block_size, 400 uint32_t *hash_value1, uint32_t *hash_value2, 401 int use_highbitdepth) { 402 int add_value = hash_block_size_to_index(block_size); 403 assert(add_value >= 0); 404 add_value <<= kSrcBits; 405 const int crc_mask = (1 << kSrcBits) - 1; 406 407 CRC32C *calc = &intra_bc_hash_info->crc_calculator; 408 uint32_t **buf = intra_bc_hash_info->hash_value_buffer; 409 410 // 2x2 subblock hash values in current CU 411 int sub_block_in_width = (block_size >> 1); 412 if (use_highbitdepth) { 413 uint16_t pixel_to_hash[4]; 414 uint16_t *y16_src = CONVERT_TO_SHORTPTR(y_src); 415 for (int y_pos = 0; y_pos < block_size; y_pos += 2) { 416 for (int x_pos = 0; x_pos < block_size; x_pos += 2) { 417 int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); 418 get_pixels_in_1D_short_array_by_block_2x2( 419 y16_src + y_pos * stride + x_pos, stride, pixel_to_hash); 420 assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); 421 // For HBD, we either have 40 or 48 bits of input data that the xor hash 422 // reduce to 32 bits. We intentionally don't want to "discard" bits to 423 // avoid any kind of biasing. 424 buf[0][pos] = 425 get_xor_hash_value_hbd(pixel_to_hash[0], pixel_to_hash[1], 426 pixel_to_hash[2], pixel_to_hash[3]); 427 } 428 } 429 } else { 430 uint8_t pixel_to_hash[4]; 431 for (int y_pos = 0; y_pos < block_size; y_pos += 2) { 432 for (int x_pos = 0; x_pos < block_size; x_pos += 2) { 433 int pos = (y_pos >> 1) * sub_block_in_width + (x_pos >> 1); 434 get_pixels_in_1D_char_array_by_block_2x2(y_src + y_pos * stride + x_pos, 435 stride, pixel_to_hash); 436 assert(pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); 437 // This 2x2 hash isn't used directly as a "key" for the hash table, so 438 // we can afford to just copy the 4 8-bit pixel values as a single 439 // 32-bit value directly. (i.e. there are no concerns of a lack of 440 // uniform distribution) 441 buf[0][pos] = 442 get_identity_hash_value(pixel_to_hash[0], pixel_to_hash[1], 443 pixel_to_hash[2], pixel_to_hash[3]); 444 } 445 } 446 } 447 448 int src_sub_block_in_width = sub_block_in_width; 449 sub_block_in_width >>= 1; 450 451 int src_idx = 0; 452 int dst_idx = !src_idx; 453 454 // 4x4 subblock hash values to current block hash values 455 uint32_t to_hash[4]; 456 for (int sub_width = 4; sub_width <= block_size; 457 sub_width *= 2, src_idx = !src_idx) { 458 dst_idx = !src_idx; 459 460 int dst_pos = 0; 461 for (int y_pos = 0; y_pos < sub_block_in_width; y_pos++) { 462 for (int x_pos = 0; x_pos < sub_block_in_width; x_pos++) { 463 int srcPos = (y_pos << 1) * src_sub_block_in_width + (x_pos << 1); 464 465 assert(srcPos + 1 < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); 466 assert(srcPos + src_sub_block_in_width + 1 < 467 AOM_BUFFER_SIZE_FOR_BLOCK_HASH); 468 assert(dst_pos < AOM_BUFFER_SIZE_FOR_BLOCK_HASH); 469 470 to_hash[0] = buf[src_idx][srcPos]; 471 to_hash[1] = buf[src_idx][srcPos + 1]; 472 to_hash[2] = buf[src_idx][srcPos + src_sub_block_in_width]; 473 to_hash[3] = buf[src_idx][srcPos + src_sub_block_in_width + 1]; 474 475 // TODO: bug aomedia:433531610 - serialize input values in a way that's 476 // independent of the computer architecture's endianness 477 buf[dst_idx][dst_pos] = 478 av1_get_crc32c_value(calc, (uint8_t *)to_hash, sizeof(to_hash)); 479 dst_pos++; 480 } 481 } 482 483 src_sub_block_in_width = sub_block_in_width; 484 sub_block_in_width >>= 1; 485 } 486 487 *hash_value1 = (buf[dst_idx][0] & crc_mask) + add_value; 488 *hash_value2 = buf[dst_idx][0]; 489 }