tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

avg.c (18015B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <assert.h>
     13 #include <stdlib.h>
     14 
     15 #include "config/aom_dsp_rtcd.h"
     16 #include "aom_ports/mem.h"
     17 
     18 void aom_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
     19                      int *min, int *max) {
     20  int i, j;
     21  *min = 255;
     22  *max = 0;
     23  for (i = 0; i < 8; ++i, s += p, d += dp) {
     24    for (j = 0; j < 8; ++j) {
     25      int diff = abs(s[j] - d[j]);
     26      *min = diff < *min ? diff : *min;
     27      *max = diff > *max ? diff : *max;
     28    }
     29  }
     30 }
     31 
     32 unsigned int aom_avg_4x4_c(const uint8_t *s, int p) {
     33  int i, j;
     34  int sum = 0;
     35  for (i = 0; i < 4; ++i, s += p)
     36    for (j = 0; j < 4; sum += s[j], ++j) {
     37    }
     38 
     39  return (sum + 8) >> 4;
     40 }
     41 
     42 unsigned int aom_avg_8x8_c(const uint8_t *s, int p) {
     43  int i, j;
     44  int sum = 0;
     45  for (i = 0; i < 8; ++i, s += p)
     46    for (j = 0; j < 8; sum += s[j], ++j) {
     47    }
     48 
     49  return (sum + 32) >> 6;
     50 }
     51 
     52 void aom_avg_8x8_quad_c(const uint8_t *s, int p, int x16_idx, int y16_idx,
     53                        int *avg) {
     54  for (int k = 0; k < 4; k++) {
     55    const int x8_idx = x16_idx + ((k & 1) << 3);
     56    const int y8_idx = y16_idx + ((k >> 1) << 3);
     57    const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
     58    avg[k] = aom_avg_8x8_c(s_tmp, p);
     59  }
     60 }
     61 
     62 #if CONFIG_AV1_HIGHBITDEPTH
     63 unsigned int aom_highbd_avg_8x8_c(const uint8_t *s8, int p) {
     64  int i, j;
     65  int sum = 0;
     66  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
     67  for (i = 0; i < 8; ++i, s += p)
     68    for (j = 0; j < 8; sum += s[j], ++j) {
     69    }
     70 
     71  return (sum + 32) >> 6;
     72 }
     73 
     74 unsigned int aom_highbd_avg_4x4_c(const uint8_t *s8, int p) {
     75  int i, j;
     76  int sum = 0;
     77  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
     78  for (i = 0; i < 4; ++i, s += p)
     79    for (j = 0; j < 4; sum += s[j], ++j) {
     80    }
     81 
     82  return (sum + 8) >> 4;
     83 }
     84 
     85 void aom_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
     86                             int dp, int *min, int *max) {
     87  int i, j;
     88  const uint16_t *s = CONVERT_TO_SHORTPTR(s8);
     89  const uint16_t *d = CONVERT_TO_SHORTPTR(d8);
     90  *min = 65535;
     91  *max = 0;
     92  for (i = 0; i < 8; ++i, s += p, d += dp) {
     93    for (j = 0; j < 8; ++j) {
     94      int diff = abs(s[j] - d[j]);
     95      *min = diff < *min ? diff : *min;
     96      *max = diff > *max ? diff : *max;
     97    }
     98  }
     99 }
    100 #endif  // CONFIG_AV1_HIGHBITDEPTH
    101 
    102 static void hadamard_col4(const int16_t *src_diff, ptrdiff_t src_stride,
    103                          int16_t *coeff) {
    104  int16_t b0 = (src_diff[0 * src_stride] + src_diff[1 * src_stride]) >> 1;
    105  int16_t b1 = (src_diff[0 * src_stride] - src_diff[1 * src_stride]) >> 1;
    106  int16_t b2 = (src_diff[2 * src_stride] + src_diff[3 * src_stride]) >> 1;
    107  int16_t b3 = (src_diff[2 * src_stride] - src_diff[3 * src_stride]) >> 1;
    108 
    109  coeff[0] = b0 + b2;
    110  coeff[1] = b1 + b3;
    111  coeff[2] = b0 - b2;
    112  coeff[3] = b1 - b3;
    113 }
    114 
    115 void aom_hadamard_4x4_c(const int16_t *src_diff, ptrdiff_t src_stride,
    116                        tran_low_t *coeff) {
    117  int idx;
    118  int16_t buffer[16];
    119  int16_t buffer2[16];
    120  int16_t *tmp_buf = &buffer[0];
    121  for (idx = 0; idx < 4; ++idx) {
    122    hadamard_col4(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
    123                                                   // dynamic range [-255, 255]
    124    tmp_buf += 4;
    125    ++src_diff;
    126  }
    127 
    128  tmp_buf = &buffer[0];
    129  for (idx = 0; idx < 4; ++idx) {
    130    hadamard_col4(tmp_buf, 4, buffer2 + 4 * idx);  // tmp_buf: 12 bit
    131    // dynamic range [-2040, 2040]
    132    // buffer2: 15 bit
    133    // dynamic range [-16320, 16320]
    134    ++tmp_buf;
    135  }
    136 
    137  // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_4x4_sse2).
    138  for (int i = 0; i < 4; i++) {
    139    for (int j = 0; j < 4; j++) {
    140      coeff[i * 4 + j] = (tran_low_t)buffer2[j * 4 + i];
    141    }
    142  }
    143 }
    144 
    145 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
    146 //           second pass, 12 bit, dynamic range [-2040, 2040]
    147 static void hadamard_col8(const int16_t *src_diff, ptrdiff_t src_stride,
    148                          int16_t *coeff) {
    149  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
    150  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
    151  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
    152  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
    153  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
    154  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
    155  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
    156  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
    157 
    158  int16_t c0 = b0 + b2;
    159  int16_t c1 = b1 + b3;
    160  int16_t c2 = b0 - b2;
    161  int16_t c3 = b1 - b3;
    162  int16_t c4 = b4 + b6;
    163  int16_t c5 = b5 + b7;
    164  int16_t c6 = b4 - b6;
    165  int16_t c7 = b5 - b7;
    166 
    167  coeff[0] = c0 + c4;
    168  coeff[7] = c1 + c5;
    169  coeff[3] = c2 + c6;
    170  coeff[4] = c3 + c7;
    171  coeff[2] = c0 - c4;
    172  coeff[6] = c1 - c5;
    173  coeff[1] = c2 - c6;
    174  coeff[5] = c3 - c7;
    175 }
    176 
    177 void aom_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
    178                        tran_low_t *coeff) {
    179  int idx;
    180  int16_t buffer[64];
    181  int16_t buffer2[64];
    182  int16_t *tmp_buf = &buffer[0];
    183  for (idx = 0; idx < 8; ++idx) {
    184    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
    185                                                   // dynamic range [-255, 255]
    186    tmp_buf += 8;
    187    ++src_diff;
    188  }
    189 
    190  tmp_buf = &buffer[0];
    191  for (idx = 0; idx < 8; ++idx) {
    192    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
    193    // dynamic range [-2040, 2040]
    194    // buffer2: 15 bit
    195    // dynamic range [-16320, 16320]
    196    ++tmp_buf;
    197  }
    198 
    199  // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_8x8_sse2).
    200  for (int i = 0; i < 8; i++) {
    201    for (int j = 0; j < 8; j++) {
    202      coeff[i * 8 + j] = (tran_low_t)buffer2[j * 8 + i];
    203    }
    204  }
    205 }
    206 
    207 void aom_hadamard_lp_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
    208                           int16_t *coeff) {
    209  int16_t buffer[64];
    210  int16_t buffer2[64];
    211  int16_t *tmp_buf = &buffer[0];
    212  for (int idx = 0; idx < 8; ++idx) {
    213    hadamard_col8(src_diff, src_stride, tmp_buf);  // src_diff: 9 bit
    214                                                   // dynamic range [-255, 255]
    215    tmp_buf += 8;
    216    ++src_diff;
    217  }
    218 
    219  tmp_buf = &buffer[0];
    220  for (int idx = 0; idx < 8; ++idx) {
    221    hadamard_col8(tmp_buf, 8, buffer2 + 8 * idx);  // tmp_buf: 12 bit
    222    // dynamic range [-2040, 2040]
    223    // buffer2: 15 bit
    224    // dynamic range [-16320, 16320]
    225    ++tmp_buf;
    226  }
    227 
    228  for (int idx = 0; idx < 64; ++idx) coeff[idx] = buffer2[idx];
    229 
    230  // Extra transpose to match SSE2 behavior(i.e., aom_hadamard_lp_8x8_sse2).
    231  for (int i = 0; i < 8; i++) {
    232    for (int j = 0; j < 8; j++) {
    233      coeff[i * 8 + j] = buffer2[j * 8 + i];
    234    }
    235  }
    236 }
    237 
    238 void aom_hadamard_lp_8x8_dual_c(const int16_t *src_diff, ptrdiff_t src_stride,
    239                                int16_t *coeff) {
    240  for (int i = 0; i < 2; i++) {
    241    aom_hadamard_lp_8x8_c(src_diff + (i * 8), src_stride,
    242                          (int16_t *)coeff + (i * 64));
    243  }
    244 }
    245 
    246 // In place 16x16 2D Hadamard transform
    247 void aom_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
    248                          tran_low_t *coeff) {
    249  int idx;
    250  for (idx = 0; idx < 4; ++idx) {
    251    // src_diff: 9 bit, dynamic range [-255, 255]
    252    const int16_t *src_ptr =
    253        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    254    aom_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
    255  }
    256 
    257  // coeff: 15 bit, dynamic range [-16320, 16320]
    258  for (idx = 0; idx < 64; ++idx) {
    259    tran_low_t a0 = coeff[0];
    260    tran_low_t a1 = coeff[64];
    261    tran_low_t a2 = coeff[128];
    262    tran_low_t a3 = coeff[192];
    263 
    264    tran_low_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
    265    tran_low_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
    266    tran_low_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
    267    tran_low_t b3 = (a2 - a3) >> 1;
    268 
    269    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
    270    coeff[64] = b1 + b3;
    271    coeff[128] = b0 - b2;
    272    coeff[192] = b1 - b3;
    273 
    274    ++coeff;
    275  }
    276 
    277  coeff -= 64;
    278  // Extra shift to match AVX2 output (i.e., aom_hadamard_16x16_avx2).
    279  // Note that to match SSE2 output, it does not need this step.
    280  for (int i = 0; i < 16; i++) {
    281    for (int j = 0; j < 4; j++) {
    282      tran_low_t temp = coeff[i * 16 + 4 + j];
    283      coeff[i * 16 + 4 + j] = coeff[i * 16 + 8 + j];
    284      coeff[i * 16 + 8 + j] = temp;
    285    }
    286  }
    287 }
    288 
    289 void aom_hadamard_lp_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
    290                             int16_t *coeff) {
    291  for (int idx = 0; idx < 4; ++idx) {
    292    // src_diff: 9 bit, dynamic range [-255, 255]
    293    const int16_t *src_ptr =
    294        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    295    aom_hadamard_lp_8x8_c(src_ptr, src_stride, coeff + idx * 64);
    296  }
    297 
    298  for (int idx = 0; idx < 64; ++idx) {
    299    int16_t a0 = coeff[0];
    300    int16_t a1 = coeff[64];
    301    int16_t a2 = coeff[128];
    302    int16_t a3 = coeff[192];
    303 
    304    int16_t b0 = (a0 + a1) >> 1;  // (a0 + a1): 16 bit, [-32640, 32640]
    305    int16_t b1 = (a0 - a1) >> 1;  // b0-b3: 15 bit, dynamic range
    306    int16_t b2 = (a2 + a3) >> 1;  // [-16320, 16320]
    307    int16_t b3 = (a2 - a3) >> 1;
    308 
    309    coeff[0] = b0 + b2;  // 16 bit, [-32640, 32640]
    310    coeff[64] = b1 + b3;
    311    coeff[128] = b0 - b2;
    312    coeff[192] = b1 - b3;
    313 
    314    ++coeff;
    315  }
    316 }
    317 
    318 void aom_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
    319                          tran_low_t *coeff) {
    320  int idx;
    321  for (idx = 0; idx < 4; ++idx) {
    322    // src_diff: 9 bit, dynamic range [-255, 255]
    323    const int16_t *src_ptr =
    324        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    325    aom_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
    326  }
    327 
    328  // coeff: 16 bit, dynamic range [-32768, 32767]
    329  for (idx = 0; idx < 256; ++idx) {
    330    tran_low_t a0 = coeff[0];
    331    tran_low_t a1 = coeff[256];
    332    tran_low_t a2 = coeff[512];
    333    tran_low_t a3 = coeff[768];
    334 
    335    tran_low_t b0 = (a0 + a1) >> 2;  // (a0 + a1): 17 bit, [-65536, 65535]
    336    tran_low_t b1 = (a0 - a1) >> 2;  // b0-b3: 15 bit, dynamic range
    337    tran_low_t b2 = (a2 + a3) >> 2;  // [-16384, 16383]
    338    tran_low_t b3 = (a2 - a3) >> 2;
    339 
    340    coeff[0] = b0 + b2;  // 16 bit, [-32768, 32767]
    341    coeff[256] = b1 + b3;
    342    coeff[512] = b0 - b2;
    343    coeff[768] = b1 - b3;
    344 
    345    ++coeff;
    346  }
    347 }
    348 
    349 #if CONFIG_AV1_HIGHBITDEPTH
    350 static void hadamard_highbd_col8_first_pass(const int16_t *src_diff,
    351                                            ptrdiff_t src_stride,
    352                                            int16_t *coeff) {
    353  int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
    354  int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
    355  int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
    356  int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
    357  int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
    358  int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
    359  int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
    360  int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
    361 
    362  int16_t c0 = b0 + b2;
    363  int16_t c1 = b1 + b3;
    364  int16_t c2 = b0 - b2;
    365  int16_t c3 = b1 - b3;
    366  int16_t c4 = b4 + b6;
    367  int16_t c5 = b5 + b7;
    368  int16_t c6 = b4 - b6;
    369  int16_t c7 = b5 - b7;
    370 
    371  coeff[0] = c0 + c4;
    372  coeff[7] = c1 + c5;
    373  coeff[3] = c2 + c6;
    374  coeff[4] = c3 + c7;
    375  coeff[2] = c0 - c4;
    376  coeff[6] = c1 - c5;
    377  coeff[1] = c2 - c6;
    378  coeff[5] = c3 - c7;
    379 }
    380 
    381 // src_diff: 16 bit, dynamic range [-32760, 32760]
    382 // coeff: 19 bit
    383 static void hadamard_highbd_col8_second_pass(const int16_t *src_diff,
    384                                             ptrdiff_t src_stride,
    385                                             int32_t *coeff) {
    386  int32_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
    387  int32_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
    388  int32_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
    389  int32_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
    390  int32_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
    391  int32_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
    392  int32_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
    393  int32_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
    394 
    395  int32_t c0 = b0 + b2;
    396  int32_t c1 = b1 + b3;
    397  int32_t c2 = b0 - b2;
    398  int32_t c3 = b1 - b3;
    399  int32_t c4 = b4 + b6;
    400  int32_t c5 = b5 + b7;
    401  int32_t c6 = b4 - b6;
    402  int32_t c7 = b5 - b7;
    403 
    404  coeff[0] = c0 + c4;
    405  coeff[7] = c1 + c5;
    406  coeff[3] = c2 + c6;
    407  coeff[4] = c3 + c7;
    408  coeff[2] = c0 - c4;
    409  coeff[6] = c1 - c5;
    410  coeff[1] = c2 - c6;
    411  coeff[5] = c3 - c7;
    412 }
    413 
    414 // The order of the output coeff of the hadamard is not important. For
    415 // optimization purposes the final transpose may be skipped.
    416 void aom_highbd_hadamard_8x8_c(const int16_t *src_diff, ptrdiff_t src_stride,
    417                               tran_low_t *coeff) {
    418  int idx;
    419  int16_t buffer[64];
    420  int32_t buffer2[64];
    421  int16_t *tmp_buf = &buffer[0];
    422  for (idx = 0; idx < 8; ++idx) {
    423    // src_diff: 13 bit
    424    // buffer: 16 bit, dynamic range [-32760, 32760]
    425    hadamard_highbd_col8_first_pass(src_diff, src_stride, tmp_buf);
    426    tmp_buf += 8;
    427    ++src_diff;
    428  }
    429 
    430  tmp_buf = &buffer[0];
    431  for (idx = 0; idx < 8; ++idx) {
    432    // buffer: 16 bit
    433    // buffer2: 19 bit, dynamic range [-262080, 262080]
    434    hadamard_highbd_col8_second_pass(tmp_buf, 8, buffer2 + 8 * idx);
    435    ++tmp_buf;
    436  }
    437 
    438  for (idx = 0; idx < 64; ++idx) coeff[idx] = (tran_low_t)buffer2[idx];
    439 }
    440 
    441 // In place 16x16 2D Hadamard transform
    442 void aom_highbd_hadamard_16x16_c(const int16_t *src_diff, ptrdiff_t src_stride,
    443                                 tran_low_t *coeff) {
    444  int idx;
    445  for (idx = 0; idx < 4; ++idx) {
    446    // src_diff: 13 bit, dynamic range [-4095, 4095]
    447    const int16_t *src_ptr =
    448        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    449    aom_highbd_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
    450  }
    451 
    452  // coeff: 19 bit, dynamic range [-262080, 262080]
    453  for (idx = 0; idx < 64; ++idx) {
    454    tran_low_t a0 = coeff[0];
    455    tran_low_t a1 = coeff[64];
    456    tran_low_t a2 = coeff[128];
    457    tran_low_t a3 = coeff[192];
    458 
    459    tran_low_t b0 = (a0 + a1) >> 1;
    460    tran_low_t b1 = (a0 - a1) >> 1;
    461    tran_low_t b2 = (a2 + a3) >> 1;
    462    tran_low_t b3 = (a2 - a3) >> 1;
    463 
    464    // new coeff dynamic range: 20 bit
    465    coeff[0] = b0 + b2;
    466    coeff[64] = b1 + b3;
    467    coeff[128] = b0 - b2;
    468    coeff[192] = b1 - b3;
    469 
    470    ++coeff;
    471  }
    472 }
    473 
    474 void aom_highbd_hadamard_32x32_c(const int16_t *src_diff, ptrdiff_t src_stride,
    475                                 tran_low_t *coeff) {
    476  int idx;
    477  for (idx = 0; idx < 4; ++idx) {
    478    // src_diff: 13 bit, dynamic range [-4095, 4095]
    479    const int16_t *src_ptr =
    480        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    481    aom_highbd_hadamard_16x16_c(src_ptr, src_stride, coeff + idx * 256);
    482  }
    483 
    484  // coeff: 20 bit
    485  for (idx = 0; idx < 256; ++idx) {
    486    tran_low_t a0 = coeff[0];
    487    tran_low_t a1 = coeff[256];
    488    tran_low_t a2 = coeff[512];
    489    tran_low_t a3 = coeff[768];
    490 
    491    tran_low_t b0 = (a0 + a1) >> 2;
    492    tran_low_t b1 = (a0 - a1) >> 2;
    493    tran_low_t b2 = (a2 + a3) >> 2;
    494    tran_low_t b3 = (a2 - a3) >> 2;
    495 
    496    // new coeff dynamic range: 20 bit
    497    coeff[0] = b0 + b2;
    498    coeff[256] = b1 + b3;
    499    coeff[512] = b0 - b2;
    500    coeff[768] = b1 - b3;
    501 
    502    ++coeff;
    503  }
    504 }
    505 #endif  // CONFIG_AV1_HIGHBITDEPTH
    506 
    507 // coeff: 20 bits, dynamic range [-524287, 524287].
    508 // length: value range {16, 32, 64, 128, 256, 512, 1024}.
    509 int aom_satd_c(const tran_low_t *coeff, int length) {
    510  int i;
    511  int satd = 0;
    512  for (i = 0; i < length; ++i) satd += abs(coeff[i]);
    513 
    514  // satd: 30 bits, dynamic range [-524287 * 1024, 524287 * 1024]
    515  return satd;
    516 }
    517 
    518 int aom_satd_lp_c(const int16_t *coeff, int length) {
    519  int satd = 0;
    520  for (int i = 0; i < length; ++i) satd += abs(coeff[i]);
    521 
    522  // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
    523  return satd;
    524 }
    525 
    526 // Integer projection onto row vectors.
    527 // height: value range {16, 32, 64, 128}.
    528 void aom_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride,
    529                       const int width, const int height, int norm_factor) {
    530  assert(height >= 2);
    531  for (int idx = 0; idx < width; ++idx) {
    532    hbuf[idx] = 0;
    533    // hbuf[idx]: 14 bit, dynamic range [0, 32640].
    534    for (int i = 0; i < height; ++i) hbuf[idx] += ref[i * ref_stride];
    535    // hbuf[idx]: 9 bit, dynamic range [0, 1020].
    536    hbuf[idx] >>= norm_factor;
    537    ++ref;
    538  }
    539 }
    540 
    541 // width: value range {16, 32, 64, 128}.
    542 void aom_int_pro_col_c(int16_t *vbuf, const uint8_t *ref, const int ref_stride,
    543                       const int width, const int height, int norm_factor) {
    544  for (int ht = 0; ht < height; ++ht) {
    545    int16_t sum = 0;
    546    // sum: 14 bit, dynamic range [0, 32640]
    547    for (int idx = 0; idx < width; ++idx) sum += ref[idx];
    548    vbuf[ht] = sum >> norm_factor;
    549    ref += ref_stride;
    550  }
    551 }
    552 
    553 // ref: [0 - 510]
    554 // src: [0 - 510]
    555 // bwl: {2, 3, 4, 5}
    556 int aom_vector_var_c(const int16_t *ref, const int16_t *src, int bwl) {
    557  int i;
    558  int width = 4 << bwl;
    559  int sse = 0, mean = 0, var;
    560 
    561  for (i = 0; i < width; ++i) {
    562    int diff = ref[i] - src[i];  // diff: dynamic range [-510, 510], 10 bits.
    563    mean += diff;                // mean: dynamic range 16 bits.
    564    sse += diff * diff;          // sse:  dynamic range 26 bits.
    565  }
    566 
    567  // (mean * mean): dynamic range 31 bits.
    568  // If width == 128, the mean can be 510 * 128 = 65280, and log2(65280 ** 2) ~=
    569  // 31.99, so it needs to be casted to unsigned int to compute its square.
    570  const unsigned int mean_abs = abs(mean);
    571  var = sse - ((mean_abs * mean_abs) >> (bwl + 2));
    572  return var;
    573 }