wedge_utils.c (4102B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 14 #include "aom/aom_integer.h" 15 16 #include "aom_ports/mem.h" 17 18 #include "aom_dsp/aom_dsp_common.h" 19 20 #include "av1/common/reconinter.h" 21 22 #define MAX_MASK_VALUE (1 << WEDGE_WEIGHT_BITS) 23 24 /** 25 * Computes SSE of a compound predictor constructed from 2 fundamental 26 * predictors p0 and p1 using blending with mask. 27 * 28 * r1: Residuals of p1. 29 * (source - p1) 30 * d: Difference of p1 and p0. 31 * (p1 - p0) 32 * m: The blending mask 33 * N: Number of pixels 34 * 35 * 'r1', 'd', and 'm' are contiguous. 36 * 37 * Computes: 38 * Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to: 39 * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2), 40 * where r0 is (source - p0), and r1 is (source - p1), which is in turn 41 * is equivalent to: 42 * Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2), 43 * which is the SSE of the residuals of the compound predictor scaled up by 44 * MAX_MASK_VALUE**2. 45 * 46 * Note that we clamp the partial term in the loop to 16 bits signed. This is 47 * to facilitate equivalent SIMD implementation. It should have no effect if 48 * residuals are within 16 - WEDGE_WEIGHT_BITS (=10) signed, which always 49 * holds for 8 bit input, and on real input, it should hold practically always, 50 * as residuals are expected to be small. 51 */ 52 uint64_t av1_wedge_sse_from_residuals_c(const int16_t *r1, const int16_t *d, 53 const uint8_t *m, int N) { 54 uint64_t csse = 0; 55 int i; 56 57 for (i = 0; i < N; i++) { 58 int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i]; 59 t = clamp(t, INT16_MIN, INT16_MAX); 60 csse += t * t; 61 } 62 return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); 63 } 64 65 /** 66 * Choose the mask sign for a compound predictor. 67 * 68 * ds: Difference of the squares of the residuals. 69 * r0**2 - r1**2 70 * m: The blending mask 71 * N: Number of pixels 72 * limit: Pre-computed threshold value. 73 * MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) 74 * 75 * 'ds' and 'm' are contiguous. 76 * 77 * Returns true if the negated mask has lower SSE compared to the positive 78 * mask. Computation is based on: 79 * Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2) 80 * > 81 * Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2) 82 * 83 * which can be simplified to: 84 * 85 * Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2)) 86 * 87 * The right hand side does not depend on the mask, and needs to be passed as 88 * the 'limit' parameter. 89 * 90 * After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left 91 * hand side is simply a scalar product between an int16_t and uint8_t vector. 92 * 93 * Note that for efficiency, ds is stored on 16 bits. Real input residuals 94 * being small, this should not cause a noticeable issue. 95 */ 96 int8_t av1_wedge_sign_from_residuals_c(const int16_t *ds, const uint8_t *m, 97 int N, int64_t limit) { 98 int64_t acc = 0; 99 100 do { 101 acc += *ds++ * *m++; 102 } while (--N); 103 104 return acc > limit; 105 } 106 107 /** 108 * Compute the element-wise difference of the squares of 2 arrays. 109 * 110 * d: Difference of the squares of the inputs: a**2 - b**2 111 * a: First input array 112 * b: Second input array 113 * N: Number of elements 114 * 115 * 'd', 'a', and 'b' are contiguous. 116 * 117 * The result is saturated to signed 16 bits. 118 */ 119 void av1_wedge_compute_delta_squares_c(int16_t *d, const int16_t *a, 120 const int16_t *b, int N) { 121 int i; 122 123 for (i = 0; i < N; i++) 124 d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX); 125 }