tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

warped_motion.c (42348B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <stdio.h>
     13 #include <stdlib.h>
     14 #include <memory.h>
     15 #include <math.h>
     16 #include <assert.h>
     17 
     18 #include "config/av1_rtcd.h"
     19 
     20 #include "av1/common/av1_common_int.h"
     21 #include "av1/common/warped_motion.h"
     22 #include "av1/common/scale.h"
     23 
     24 // For warping, we really use a 6-tap filter, but we do blocks of 8 pixels
     25 // at a time. The zoom/rotation/shear in the model are applied to the
     26 // "fractional" position of each pixel, which therefore varies within
     27 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
     28 // We need an extra 2 taps to fit this in, for a total of 8 taps.
     29 /* clang-format off */
// 8-tap interpolation kernels for the warp filter, indexed by fractional
// pixel position in [-1, 2) at WARPEDPIXEL_PREC_SHIFTS steps per unit,
// plus one dummy row at the end (see note below).
// Every row sums to 128 (unit DC gain at 7-bit filter precision).
const WarpedFilterCoeff av1_warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3 + 1]
                                        [8] = {
  // [-1, 0)
  { 0,   0, 127,   1,   0, 0, 0, 0 }, { 0, - 1, 127,   2,   0, 0, 0, 0 },
  { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
  { 1, - 5, 126,   8, - 3, 1, 0, 0 }, { 1, - 6, 125,  11, - 4, 1, 0, 0 },
  { 1, - 7, 124,  13, - 4, 1, 0, 0 }, { 2, - 8, 123,  15, - 5, 1, 0, 0 },
  { 2, - 9, 122,  18, - 6, 1, 0, 0 }, { 2, -10, 121,  20, - 6, 1, 0, 0 },
  { 2, -11, 120,  22, - 7, 2, 0, 0 }, { 2, -12, 119,  25, - 8, 2, 0, 0 },
  { 3, -13, 117,  27, - 8, 2, 0, 0 }, { 3, -13, 116,  29, - 9, 2, 0, 0 },
  { 3, -14, 114,  32, -10, 3, 0, 0 }, { 3, -15, 113,  35, -10, 2, 0, 0 },
  { 3, -15, 111,  37, -11, 3, 0, 0 }, { 3, -16, 109,  40, -11, 3, 0, 0 },
  { 3, -16, 108,  42, -12, 3, 0, 0 }, { 4, -17, 106,  45, -13, 3, 0, 0 },
  { 4, -17, 104,  47, -13, 3, 0, 0 }, { 4, -17, 102,  50, -14, 3, 0, 0 },
  { 4, -17, 100,  52, -14, 3, 0, 0 }, { 4, -18,  98,  55, -15, 4, 0, 0 },
  { 4, -18,  96,  58, -15, 3, 0, 0 }, { 4, -18,  94,  60, -16, 4, 0, 0 },
  { 4, -18,  91,  63, -16, 4, 0, 0 }, { 4, -18,  89,  65, -16, 4, 0, 0 },
  { 4, -18,  87,  68, -17, 4, 0, 0 }, { 4, -18,  85,  70, -17, 4, 0, 0 },
  { 4, -18,  82,  73, -17, 4, 0, 0 }, { 4, -18,  80,  75, -17, 4, 0, 0 },
  { 4, -18,  78,  78, -18, 4, 0, 0 }, { 4, -17,  75,  80, -18, 4, 0, 0 },
  { 4, -17,  73,  82, -18, 4, 0, 0 }, { 4, -17,  70,  85, -18, 4, 0, 0 },
  { 4, -17,  68,  87, -18, 4, 0, 0 }, { 4, -16,  65,  89, -18, 4, 0, 0 },
  { 4, -16,  63,  91, -18, 4, 0, 0 }, { 4, -16,  60,  94, -18, 4, 0, 0 },
  { 3, -15,  58,  96, -18, 4, 0, 0 }, { 4, -15,  55,  98, -18, 4, 0, 0 },
  { 3, -14,  52, 100, -17, 4, 0, 0 }, { 3, -14,  50, 102, -17, 4, 0, 0 },
  { 3, -13,  47, 104, -17, 4, 0, 0 }, { 3, -13,  45, 106, -17, 4, 0, 0 },
  { 3, -12,  42, 108, -16, 3, 0, 0 }, { 3, -11,  40, 109, -16, 3, 0, 0 },
  { 3, -11,  37, 111, -15, 3, 0, 0 }, { 2, -10,  35, 113, -15, 3, 0, 0 },
  { 3, -10,  32, 114, -14, 3, 0, 0 }, { 2, - 9,  29, 116, -13, 3, 0, 0 },
  { 2, - 8,  27, 117, -13, 3, 0, 0 }, { 2, - 8,  25, 119, -12, 2, 0, 0 },
  { 2, - 7,  22, 120, -11, 2, 0, 0 }, { 1, - 6,  20, 121, -10, 2, 0, 0 },
  { 1, - 6,  18, 122, - 9, 2, 0, 0 }, { 1, - 5,  15, 123, - 8, 2, 0, 0 },
  { 1, - 4,  13, 124, - 7, 1, 0, 0 }, { 1, - 4,  11, 125, - 6, 1, 0, 0 },
  { 1, - 3,   8, 126, - 5, 1, 0, 0 }, { 1, - 2,   6, 126, - 4, 1, 0, 0 },
  { 0, - 1,   4, 127, - 3, 1, 0, 0 }, { 0,   0,   2, 127, - 1, 0, 0, 0 },

  // [0, 1)
  { 0,  0,   0, 127,   1,   0,  0,  0}, { 0,  0,  -1, 127,   2,   0,  0,  0},
  { 0,  1,  -3, 127,   4,  -2,  1,  0}, { 0,  1,  -5, 127,   6,  -2,  1,  0},
  { 0,  2,  -6, 126,   8,  -3,  1,  0}, {-1,  2,  -7, 126,  11,  -4,  2, -1},
  {-1,  3,  -8, 125,  13,  -5,  2, -1}, {-1,  3, -10, 124,  16,  -6,  3, -1},
  {-1,  4, -11, 123,  18,  -7,  3, -1}, {-1,  4, -12, 122,  20,  -7,  3, -1},
  {-1,  4, -13, 121,  23,  -8,  3, -1}, {-2,  5, -14, 120,  25,  -9,  4, -1},
  {-1,  5, -15, 119,  27, -10,  4, -1}, {-1,  5, -16, 118,  30, -11,  4, -1},
  {-2,  6, -17, 116,  33, -12,  5, -1}, {-2,  6, -17, 114,  35, -12,  5, -1},
  {-2,  6, -18, 113,  38, -13,  5, -1}, {-2,  7, -19, 111,  41, -14,  6, -2},
  {-2,  7, -19, 110,  43, -15,  6, -2}, {-2,  7, -20, 108,  46, -15,  6, -2},
  {-2,  7, -20, 106,  49, -16,  6, -2}, {-2,  7, -21, 104,  51, -16,  7, -2},
  {-2,  7, -21, 102,  54, -17,  7, -2}, {-2,  8, -21, 100,  56, -18,  7, -2},
  {-2,  8, -22,  98,  59, -18,  7, -2}, {-2,  8, -22,  96,  62, -19,  7, -2},
  {-2,  8, -22,  94,  64, -19,  7, -2}, {-2,  8, -22,  91,  67, -20,  8, -2},
  {-2,  8, -22,  89,  69, -20,  8, -2}, {-2,  8, -22,  87,  72, -21,  8, -2},
  {-2,  8, -21,  84,  74, -21,  8, -2}, {-2,  8, -22,  82,  77, -21,  8, -2},
  {-2,  8, -21,  79,  79, -21,  8, -2}, {-2,  8, -21,  77,  82, -22,  8, -2},
  {-2,  8, -21,  74,  84, -21,  8, -2}, {-2,  8, -21,  72,  87, -22,  8, -2},
  {-2,  8, -20,  69,  89, -22,  8, -2}, {-2,  8, -20,  67,  91, -22,  8, -2},
  {-2,  7, -19,  64,  94, -22,  8, -2}, {-2,  7, -19,  62,  96, -22,  8, -2},
  {-2,  7, -18,  59,  98, -22,  8, -2}, {-2,  7, -18,  56, 100, -21,  8, -2},
  {-2,  7, -17,  54, 102, -21,  7, -2}, {-2,  7, -16,  51, 104, -21,  7, -2},
  {-2,  6, -16,  49, 106, -20,  7, -2}, {-2,  6, -15,  46, 108, -20,  7, -2},
  {-2,  6, -15,  43, 110, -19,  7, -2}, {-2,  6, -14,  41, 111, -19,  7, -2},
  {-1,  5, -13,  38, 113, -18,  6, -2}, {-1,  5, -12,  35, 114, -17,  6, -2},
  {-1,  5, -12,  33, 116, -17,  6, -2}, {-1,  4, -11,  30, 118, -16,  5, -1},
  {-1,  4, -10,  27, 119, -15,  5, -1}, {-1,  4,  -9,  25, 120, -14,  5, -2},
  {-1,  3,  -8,  23, 121, -13,  4, -1}, {-1,  3,  -7,  20, 122, -12,  4, -1},
  {-1,  3,  -7,  18, 123, -11,  4, -1}, {-1,  3,  -6,  16, 124, -10,  3, -1},
  {-1,  2,  -5,  13, 125,  -8,  3, -1}, {-1,  2,  -4,  11, 126,  -7,  2, -1},
  { 0,  1,  -3,   8, 126,  -6,  2,  0}, { 0,  1,  -2,   6, 127,  -5,  1,  0},
  { 0,  1,  -2,   4, 127,  -3,  1,  0}, { 0,  0,   0,   2, 127,  -1,  0,  0},

  // [1, 2)
  { 0, 0, 0,   1, 127,   0,   0, 0 }, { 0, 0, 0, - 1, 127,   2,   0, 0 },
  { 0, 0, 1, - 3, 127,   4, - 1, 0 }, { 0, 0, 1, - 4, 126,   6, - 2, 1 },
  { 0, 0, 1, - 5, 126,   8, - 3, 1 }, { 0, 0, 1, - 6, 125,  11, - 4, 1 },
  { 0, 0, 1, - 7, 124,  13, - 4, 1 }, { 0, 0, 2, - 8, 123,  15, - 5, 1 },
  { 0, 0, 2, - 9, 122,  18, - 6, 1 }, { 0, 0, 2, -10, 121,  20, - 6, 1 },
  { 0, 0, 2, -11, 120,  22, - 7, 2 }, { 0, 0, 2, -12, 119,  25, - 8, 2 },
  { 0, 0, 3, -13, 117,  27, - 8, 2 }, { 0, 0, 3, -13, 116,  29, - 9, 2 },
  { 0, 0, 3, -14, 114,  32, -10, 3 }, { 0, 0, 3, -15, 113,  35, -10, 2 },
  { 0, 0, 3, -15, 111,  37, -11, 3 }, { 0, 0, 3, -16, 109,  40, -11, 3 },
  { 0, 0, 3, -16, 108,  42, -12, 3 }, { 0, 0, 4, -17, 106,  45, -13, 3 },
  { 0, 0, 4, -17, 104,  47, -13, 3 }, { 0, 0, 4, -17, 102,  50, -14, 3 },
  { 0, 0, 4, -17, 100,  52, -14, 3 }, { 0, 0, 4, -18,  98,  55, -15, 4 },
  { 0, 0, 4, -18,  96,  58, -15, 3 }, { 0, 0, 4, -18,  94,  60, -16, 4 },
  { 0, 0, 4, -18,  91,  63, -16, 4 }, { 0, 0, 4, -18,  89,  65, -16, 4 },
  { 0, 0, 4, -18,  87,  68, -17, 4 }, { 0, 0, 4, -18,  85,  70, -17, 4 },
  { 0, 0, 4, -18,  82,  73, -17, 4 }, { 0, 0, 4, -18,  80,  75, -17, 4 },
  { 0, 0, 4, -18,  78,  78, -18, 4 }, { 0, 0, 4, -17,  75,  80, -18, 4 },
  { 0, 0, 4, -17,  73,  82, -18, 4 }, { 0, 0, 4, -17,  70,  85, -18, 4 },
  { 0, 0, 4, -17,  68,  87, -18, 4 }, { 0, 0, 4, -16,  65,  89, -18, 4 },
  { 0, 0, 4, -16,  63,  91, -18, 4 }, { 0, 0, 4, -16,  60,  94, -18, 4 },
  { 0, 0, 3, -15,  58,  96, -18, 4 }, { 0, 0, 4, -15,  55,  98, -18, 4 },
  { 0, 0, 3, -14,  52, 100, -17, 4 }, { 0, 0, 3, -14,  50, 102, -17, 4 },
  { 0, 0, 3, -13,  47, 104, -17, 4 }, { 0, 0, 3, -13,  45, 106, -17, 4 },
  { 0, 0, 3, -12,  42, 108, -16, 3 }, { 0, 0, 3, -11,  40, 109, -16, 3 },
  { 0, 0, 3, -11,  37, 111, -15, 3 }, { 0, 0, 2, -10,  35, 113, -15, 3 },
  { 0, 0, 3, -10,  32, 114, -14, 3 }, { 0, 0, 2, - 9,  29, 116, -13, 3 },
  { 0, 0, 2, - 8,  27, 117, -13, 3 }, { 0, 0, 2, - 8,  25, 119, -12, 2 },
  { 0, 0, 2, - 7,  22, 120, -11, 2 }, { 0, 0, 1, - 6,  20, 121, -10, 2 },
  { 0, 0, 1, - 6,  18, 122, - 9, 2 }, { 0, 0, 1, - 5,  15, 123, - 8, 2 },
  { 0, 0, 1, - 4,  13, 124, - 7, 1 }, { 0, 0, 1, - 4,  11, 125, - 6, 1 },
  { 0, 0, 1, - 3,   8, 126, - 5, 1 }, { 0, 0, 1, - 2,   6, 126, - 4, 1 },
  { 0, 0, 0, - 1,   4, 127, - 3, 1 }, { 0, 0, 0,   0,   2, 127, - 1, 0 },
  // dummy (replicate row index 191)
  { 0, 0, 0,   0,   2, 127, - 1, 0 },
};
    136 
    137 /* clang-format on */
    138 
    139 #define DIV_LUT_PREC_BITS 14
    140 #define DIV_LUT_BITS 8
    141 #define DIV_LUT_NUM (1 << DIV_LUT_BITS)
    142 
// Reciprocal lookup table used by the resolve_divisor_* helpers below:
// div_lut[f] is the rounded value of
//   2^DIV_LUT_PREC_BITS * 2^DIV_LUT_BITS / (2^DIV_LUT_BITS + f),
// so div_lut[0] = 2^14 = 16384 and div_lut[DIV_LUT_NUM] = 2^13 = 8192.
static const uint16_t div_lut[DIV_LUT_NUM + 1] = {
  16384, 16320, 16257, 16194, 16132, 16070, 16009, 15948, 15888, 15828, 15768,
  15709, 15650, 15592, 15534, 15477, 15420, 15364, 15308, 15252, 15197, 15142,
  15087, 15033, 14980, 14926, 14873, 14821, 14769, 14717, 14665, 14614, 14564,
  14513, 14463, 14413, 14364, 14315, 14266, 14218, 14170, 14122, 14075, 14028,
  13981, 13935, 13888, 13843, 13797, 13752, 13707, 13662, 13618, 13574, 13530,
  13487, 13443, 13400, 13358, 13315, 13273, 13231, 13190, 13148, 13107, 13066,
  13026, 12985, 12945, 12906, 12866, 12827, 12788, 12749, 12710, 12672, 12633,
  12596, 12558, 12520, 12483, 12446, 12409, 12373, 12336, 12300, 12264, 12228,
  12193, 12157, 12122, 12087, 12053, 12018, 11984, 11950, 11916, 11882, 11848,
  11815, 11782, 11749, 11716, 11683, 11651, 11619, 11586, 11555, 11523, 11491,
  11460, 11429, 11398, 11367, 11336, 11305, 11275, 11245, 11215, 11185, 11155,
  11125, 11096, 11067, 11038, 11009, 10980, 10951, 10923, 10894, 10866, 10838,
  10810, 10782, 10755, 10727, 10700, 10673, 10645, 10618, 10592, 10565, 10538,
  10512, 10486, 10460, 10434, 10408, 10382, 10356, 10331, 10305, 10280, 10255,
  10230, 10205, 10180, 10156, 10131, 10107, 10082, 10058, 10034, 10010, 9986,
  9963,  9939,  9916,  9892,  9869,  9846,  9823,  9800,  9777,  9754,  9732,
  9709,  9687,  9664,  9642,  9620,  9598,  9576,  9554,  9533,  9511,  9489,
  9468,  9447,  9425,  9404,  9383,  9362,  9341,  9321,  9300,  9279,  9259,
  9239,  9218,  9198,  9178,  9158,  9138,  9118,  9098,  9079,  9059,  9039,
  9020,  9001,  8981,  8962,  8943,  8924,  8905,  8886,  8867,  8849,  8830,
  8812,  8793,  8775,  8756,  8738,  8720,  8702,  8684,  8666,  8648,  8630,
  8613,  8595,  8577,  8560,  8542,  8525,  8508,  8490,  8473,  8456,  8439,
  8422,  8405,  8389,  8372,  8355,  8339,  8322,  8306,  8289,  8273,  8257,
  8240,  8224,  8208,  8192,
};
    169 
    170 // Decomposes a divisor D such that 1/D = y/2^shift, where y is returned
    171 // at precision of DIV_LUT_PREC_BITS along with the shift.
    172 static int16_t resolve_divisor_64(uint64_t D, int16_t *shift) {
    173  int64_t f;
    174  *shift = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
    175                               : get_msb((unsigned int)D));
    176  // e is obtained from D after resetting the most significant 1 bit.
    177  const int64_t e = D - ((uint64_t)1 << *shift);
    178  // Get the most significant DIV_LUT_BITS (8) bits of e into f
    179  if (*shift > DIV_LUT_BITS)
    180    f = ROUND_POWER_OF_TWO_64(e, *shift - DIV_LUT_BITS);
    181  else
    182    f = e << (DIV_LUT_BITS - *shift);
    183  assert(f <= DIV_LUT_NUM);
    184  *shift += DIV_LUT_PREC_BITS;
    185  // Use f as lookup into the precomputed table of multipliers
    186  return div_lut[f];
    187 }
    188 
    189 static int16_t resolve_divisor_32(uint32_t D, int16_t *shift) {
    190  int32_t f;
    191  *shift = get_msb(D);
    192  // e is obtained from D after resetting the most significant 1 bit.
    193  const int32_t e = D - ((uint32_t)1 << *shift);
    194  // Get the most significant DIV_LUT_BITS (8) bits of e into f
    195  if (*shift > DIV_LUT_BITS)
    196    f = ROUND_POWER_OF_TWO(e, *shift - DIV_LUT_BITS);
    197  else
    198    f = e << (DIV_LUT_BITS - *shift);
    199  assert(f <= DIV_LUT_NUM);
    200  *shift += DIV_LUT_PREC_BITS;
    201  // Use f as lookup into the precomputed table of multipliers
    202  return div_lut[f];
    203 }
    204 
    205 static int is_affine_valid(const WarpedMotionParams *const wm) {
    206  const int32_t *mat = wm->wmmat;
    207  return (mat[2] > 0);
    208 }
    209 
    210 static int is_affine_shear_allowed(int16_t alpha, int16_t beta, int16_t gamma,
    211                                   int16_t delta) {
    212  if ((4 * abs(alpha) + 7 * abs(beta) >= (1 << WARPEDMODEL_PREC_BITS)) ||
    213      (4 * abs(gamma) + 4 * abs(delta) >= (1 << WARPEDMODEL_PREC_BITS)))
    214    return 0;
    215  else
    216    return 1;
    217 }
    218 
#ifndef NDEBUG
// Check that the given warp model satisfies the relevant constraints for
// its stated model type.
// The cases deliberately fall through: a simpler model type must also
// satisfy every constraint of the more general types listed below it.
static void check_model_consistency(WarpedMotionParams *wm) {
  switch (wm->wmtype) {
    case IDENTITY:
      // Identity additionally has no translation component.
      assert(wm->wmmat[0] == 0);
      assert(wm->wmmat[1] == 0);
      AOM_FALLTHROUGH_INTENDED;
    case TRANSLATION:
      // Translation additionally has unit scale and no shear term.
      assert(wm->wmmat[2] == 1 << WARPEDMODEL_PREC_BITS);
      assert(wm->wmmat[3] == 0);
      AOM_FALLTHROUGH_INTENDED;
    case ROTZOOM:
      // ROTZOOM constrains the 2x2 part to a scaled rotation
      // (antisymmetric off-diagonal, equal diagonal).
      assert(wm->wmmat[4] == -wm->wmmat[3]);
      assert(wm->wmmat[5] == wm->wmmat[2]);
      AOM_FALLTHROUGH_INTENDED;
    case AFFINE: break;
    default: assert(0 && "Bad wmtype");
  }
}
#endif  // NDEBUG
    241 
// Derives the shear parameters (alpha, beta, gamma, delta) from the warp
// matrix wm->wmmat and stores them in *wm.
// Returns 1 on success or 0 on an invalid affine set (non-positive scale
// term, or shear too large for the warp filter).
int av1_get_shear_params(WarpedMotionParams *wm) {
#ifndef NDEBUG
  // Check that models have been constructed sensibly
  // This is a good place to check, because this function does not need to
  // be called until after model construction is complete, but must be called
  // before the model can be used for prediction.
  check_model_consistency(wm);
#endif  // NDEBUG

  const int32_t *mat = wm->wmmat;
  if (!is_affine_valid(wm)) return 0;

  // Horizontal shear terms: deviation of the matrix's first row from the
  // identity (mat[2] is ~1.0 and mat[3] ~0.0 in WARPEDMODEL precision).
  wm->alpha =
      clamp(mat[2] - (1 << WARPEDMODEL_PREC_BITS), INT16_MIN, INT16_MAX);
  wm->beta = clamp(mat[3], INT16_MIN, INT16_MAX);
  // Vertical shear terms require dividing by mat[2]; the division is done
  // in fixed point as a multiply by y = 1/|mat[2]| with a final shift,
  // with the sign of mat[2] folded into y.
  int16_t shift;
  int16_t y = resolve_divisor_32(abs(mat[2]), &shift) * (mat[2] < 0 ? -1 : 1);
  int64_t v = ((int64_t)mat[4] * (1 << WARPEDMODEL_PREC_BITS)) * y;
  wm->gamma =
      clamp((int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift), INT16_MIN, INT16_MAX);
  v = ((int64_t)mat[3] * mat[4]) * y;
  wm->delta = clamp(mat[5] - (int)ROUND_POWER_OF_TWO_SIGNED_64(v, shift) -
                        (1 << WARPEDMODEL_PREC_BITS),
                    INT16_MIN, INT16_MAX);

  // Quantize all four shear parameters to multiples of
  // (1 << WARP_PARAM_REDUCE_BITS), rounding to nearest.
  wm->alpha = ROUND_POWER_OF_TWO_SIGNED(wm->alpha, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->beta = ROUND_POWER_OF_TWO_SIGNED(wm->beta, WARP_PARAM_REDUCE_BITS) *
             (1 << WARP_PARAM_REDUCE_BITS);
  wm->gamma = ROUND_POWER_OF_TWO_SIGNED(wm->gamma, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);
  wm->delta = ROUND_POWER_OF_TWO_SIGNED(wm->delta, WARP_PARAM_REDUCE_BITS) *
              (1 << WARP_PARAM_REDUCE_BITS);

  if (!is_affine_shear_allowed(wm->alpha, wm->beta, wm->gamma, wm->delta))
    return 0;

  return 1;
}
    282 
    283 #if CONFIG_AV1_HIGHBITDEPTH
    284 /* Note: For an explanation of the warp algorithm, and some notes on bit widths
    285    for hardware implementations, see the comments above av1_warp_affine_c
    286 */
// High-bitdepth C reference implementation of the affine warp filter.
// Processes the destination in 8x8 blocks: horizontal 8-tap filtering into
// a 15x8 intermediate buffer, then vertical 8-tap filtering into pred (or
// the compound buffer conv_params->dst).
void av1_highbd_warp_affine_c(const int32_t *mat, const uint16_t *ref,
                              int width, int height, int stride, uint16_t *pred,
                              int p_col, int p_row, int p_width, int p_height,
                              int p_stride, int subsampling_x,
                              int subsampling_y, int bd,
                              ConvolveParams *conv_params, int16_t alpha,
                              int16_t beta, int16_t gamma, int16_t delta) {
  // Intermediate buffer: 15 rows x 8 columns of horizontally filtered values.
  int32_t tmp[15 * 8];
  const int reduce_bits_horiz = conv_params->round_0;
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  (void)max_bits_horiz;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  for (int i = p_row; i < p_row + p_height; i += 8) {
    for (int j = p_col; j < p_col + p_width; j += 8) {
      // Calculate the center of this 8x8 block,
      // project to luma coordinates (if in a subsampled chroma plane),
      // apply the affine transformation,
      // then convert back to the original coordinates (if necessary)
      const int32_t src_x = (j + 4) << subsampling_x;
      const int32_t src_y = (i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      // Split the projected position into integer and fractional parts.
      const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Move the fractional offsets from the block center (4, 4) back to
      // the block's top-left corner.
      sx4 += alpha * (-4) + beta * (-4);
      sy4 += gamma * (-4) + delta * (-4);

      // Round the fractional offsets to WARP_PARAM_REDUCE_BITS granularity.
      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      for (int k = -7; k < 8; ++k) {
        // Clamp to the top/bottom edge of the frame.
        const int iy = clamp(iy4 + k, 0, height - 1);

        int sx = sx4 + beta * (k + 4);
        for (int l = -4; l < 4; ++l) {
          int ix = ix4 + l - 3;
          // At this point, sx = sx4 + alpha * l + beta * k.
          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_horiz;
          for (int m = 0; m < 8; ++m) {
            // Clamp to the left/right edge of the frame.
            const int sample_x = clamp(ix + m, 0, width - 1);
            sum += ref[iy * stride + sample_x] * coeffs[m];
          }
          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
          assert(0 <= sum && sum < (1 << max_bits_horiz));
          tmp[(k + 7) * 8 + (l + 4)] = sum;
          sx += alpha;
        }
      }

      // Vertical filter (output is cropped to the destination size)
      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);
        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
          // At this point, sy = sy4 + gamma * l + delta * k.
          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
                           WARPEDPIXEL_PREC_SHIFTS;
          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];

          int32_t sum = 1 << offset_bits_vert;
          for (int m = 0; m < 8; ++m) {
            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
          }

          if (conv_params->is_compound) {
            // Compound prediction: write into (or average with) the
            // intermediate CONV_BUF_TYPE destination buffer.
            CONV_BUF_TYPE *p =
                &conv_params
                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
                           (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            if (conv_params->do_average) {
              uint16_t *dst16 =
                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
              int32_t tmp32 = *p;
              if (conv_params->use_dist_wtd_comp_avg) {
                // Distance-weighted blend of the two predictions.
                tmp32 = tmp32 * conv_params->fwd_offset +
                        sum * conv_params->bck_offset;
                tmp32 = tmp32 >> DIST_PRECISION_BITS;
              } else {
                // Plain average of the two predictions.
                tmp32 += sum;
                tmp32 = tmp32 >> 1;
              }
              // Undo the accumulated filter offsets, then round and clip
              // back to a bd-bit pixel value.
              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
                      (1 << (offset_bits - conv_params->round_1 - 1));
              *dst16 =
                  clip_pixel_highbd(ROUND_POWER_OF_TWO(tmp32, round_bits), bd);
            } else {
              *p = sum;
            }
          } else {
            // Single-reference prediction: round, undo the offset
            // ((1 << (bd - 1)) + (1 << bd)) and clip to a bd-bit pixel.
            uint16_t *p =
                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
            assert(0 <= sum && sum < (1 << (bd + 2)));
            *p = clip_pixel_highbd(sum - (1 << (bd - 1)) - (1 << bd), bd);
          }
          sy += gamma;
        }
      }
    }
  }
}
    414 
    415 void highbd_warp_plane(WarpedMotionParams *wm, const uint16_t *const ref,
    416                       int width, int height, int stride, uint16_t *const pred,
    417                       int p_col, int p_row, int p_width, int p_height,
    418                       int p_stride, int subsampling_x, int subsampling_y,
    419                       int bd, ConvolveParams *conv_params) {
    420  const int32_t *const mat = wm->wmmat;
    421  const int16_t alpha = wm->alpha;
    422  const int16_t beta = wm->beta;
    423  const int16_t gamma = wm->gamma;
    424  const int16_t delta = wm->delta;
    425 
    426  av1_highbd_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row,
    427                         p_width, p_height, p_stride, subsampling_x,
    428                         subsampling_y, bd, conv_params, alpha, beta, gamma,
    429                         delta);
    430 }
    431 #endif  // CONFIG_AV1_HIGHBITDEPTH
    432 
    433 /* The warp filter for ROTZOOM and AFFINE models works as follows:
    434   * Split the input into 8x8 blocks
    435   * For each block, project the point (4, 4) within the block, to get the
    436     overall block position. Split into integer and fractional coordinates,
    437     maintaining full WARPEDMODEL precision
    438   * Filter horizontally: Generate 15 rows of 8 pixels each. Each pixel gets a
    439     variable horizontal offset. This means that, while the rows of the
    440     intermediate buffer align with the rows of the *reference* image, the
    441     columns align with the columns of the *destination* image.
    442   * Filter vertically: Generate the output block (up to 8x8 pixels, but if the
    443     destination is too small we crop the output at this stage). Each pixel has
    444     a variable vertical offset, so that the resulting rows are aligned with
    445     the rows of the destination image.
    446 
    447   To accomplish these alignments, we factor the warp matrix as a
    448   product of two shear / asymmetric zoom matrices:
    449   / a b \  = /   1       0    \ * / 1+alpha  beta \
    450   \ c d /    \ gamma  1+delta /   \    0      1   /
    451   where a, b, c, d are wmmat[2], wmmat[3], wmmat[4], wmmat[5] respectively.
    452   The horizontal shear (with alpha and beta) is applied first,
    453   then the vertical shear (with gamma and delta) is applied second.
    454 
    455   The only limitation is that, to fit this in a fixed 8-tap filter size,
    456   the fractional pixel offsets must be at most +-1. Since the horizontal filter
    457   generates 15 rows of 8 columns, and the initial point we project is at (4, 4)
    458   within the block, the parameters must satisfy
    459   4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 4 * |delta| <= 1
    460   for this filter to be applicable.
    461 
    462   Note: This function assumes that the caller has done all of the relevant
    463   checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
    464   are set appropriately (if using a ROTZOOM model), and that alpha, beta,
    465   gamma, delta are all in range.
    466 
    467   TODO(rachelbarker): Maybe support scaled references?
    468 */
    469 /* A note on hardware implementation:
    470    The warp filter is intended to be implementable using the same hardware as
    471    the high-precision convolve filters from the loop-restoration and
    472    convolve-round experiments.
    473 
    474    For a single filter stage, considering all of the coefficient sets for the
    475    warp filter and the regular convolution filter, an input in the range
    476    [0, 2^k - 1] is mapped into the range [-56 * (2^k - 1), 184 * (2^k - 1)]
    477    before rounding.
    478 
    479    Allowing for some changes to the filter coefficient sets, call the range
    480    [-64 * 2^k, 192 * 2^k]. Then, if we initialize the accumulator to 64 * 2^k,
    481    we can replace this by the range [0, 256 * 2^k], which can be stored in an
    482    unsigned value with 8 + k bits.
    483 
    484    This allows the derivation of the appropriate bit widths and offsets for
    485    the various intermediate values: If
    486 
    487    F := FILTER_BITS = 7 (or else the above ranges need adjusting)
    488         So a *single* filter stage maps a k-bit input to a (k + F + 1)-bit
    489         intermediate value.
    490    H := ROUND0_BITS
    491    V := VERSHEAR_REDUCE_PREC_BITS
    492    (and note that we must have H + V = 2*F for the output to have the same
    493     scale as the input)
    494 
    495    then we end up with the following offsets and ranges:
    496    Horizontal filter: Apply an offset of 1 << (bd + F - 1), sum fits into a
    497                       uint{bd + F + 1}
    498    After rounding: The values stored in 'tmp' fit into a uint{bd + F + 1 - H}.
    499    Vertical filter: Apply an offset of 1 << (bd + 2*F - H), sum fits into a
    500                     uint{bd + 2*F + 2 - H}
    501    After rounding: The final value, before undoing the offset, fits into a
    502                    uint{bd + 2}.
    503 
    504    Then we need to undo the offsets before clamping to a pixel. Note that,
    505    if we do this at the end, the amount to subtract is actually independent
    506    of H and V:
    507 
    508    offset to subtract = (1 << ((bd + F - 1) - H + F - V)) +
    509                         (1 << ((bd + 2*F - H) - V))
    510                      == (1 << (bd - 1)) + (1 << bd)
    511 
    512    This allows us to entirely avoid clamping in both the warp filter and
    513    the convolve-round experiment. As of the time of writing, the Wiener filter
    514    from loop-restoration can encode a central coefficient up to 216, which
    515    leads to a maximum value of about 282 * 2^k after applying the offset.
    516    So in that case we still need to clamp.
    517 */
    518 void av1_warp_affine_c(const int32_t *mat, const uint8_t *ref, int width,
    519                       int height, int stride, uint8_t *pred, int p_col,
    520                       int p_row, int p_width, int p_height, int p_stride,
    521                       int subsampling_x, int subsampling_y,
    522                       ConvolveParams *conv_params, int16_t alpha, int16_t beta,
    523                       int16_t gamma, int16_t delta) {
    524  int32_t tmp[15 * 8];
    525  const int bd = 8;
    526  const int reduce_bits_horiz = conv_params->round_0;
    527  const int reduce_bits_vert = conv_params->is_compound
    528                                   ? conv_params->round_1
    529                                   : 2 * FILTER_BITS - reduce_bits_horiz;
    530  const int max_bits_horiz = bd + FILTER_BITS + 1 - reduce_bits_horiz;
    531  const int offset_bits_horiz = bd + FILTER_BITS - 1;
    532  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
    533  const int round_bits =
    534      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
    535  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
    536  (void)max_bits_horiz;
    537  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
    538  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));
    539 
    540  for (int i = p_row; i < p_row + p_height; i += 8) {
    541    for (int j = p_col; j < p_col + p_width; j += 8) {
    542      // Calculate the center of this 8x8 block,
    543      // project to luma coordinates (if in a subsampled chroma plane),
    544      // apply the affine transformation,
    545      // then convert back to the original coordinates (if necessary)
    546      const int32_t src_x = (j + 4) << subsampling_x;
    547      const int32_t src_y = (i + 4) << subsampling_y;
    548      const int64_t dst_x =
    549          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
    550      const int64_t dst_y =
    551          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
    552      const int64_t x4 = dst_x >> subsampling_x;
    553      const int64_t y4 = dst_y >> subsampling_y;
    554 
    555      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
    556      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
    557      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
    558      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
    559 
    560      sx4 += alpha * (-4) + beta * (-4);
    561      sy4 += gamma * (-4) + delta * (-4);
    562 
    563      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
    564      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
    565 
    566      // Horizontal filter
    567      for (int k = -7; k < 8; ++k) {
    568        // Clamp to top/bottom edge of the frame
    569        const int iy = clamp(iy4 + k, 0, height - 1);
    570 
    571        int sx = sx4 + beta * (k + 4);
    572 
    573        for (int l = -4; l < 4; ++l) {
    574          int ix = ix4 + l - 3;
    575          // At this point, sx = sx4 + alpha * l + beta * k
    576          const int offs = ROUND_POWER_OF_TWO(sx, WARPEDDIFF_PREC_BITS) +
    577                           WARPEDPIXEL_PREC_SHIFTS;
    578          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
    579          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
    580 
    581          int32_t sum = 1 << offset_bits_horiz;
    582          for (int m = 0; m < 8; ++m) {
    583            // Clamp to left/right edge of the frame
    584            const int sample_x = clamp(ix + m, 0, width - 1);
    585 
    586            sum += ref[iy * stride + sample_x] * coeffs[m];
    587          }
    588          sum = ROUND_POWER_OF_TWO(sum, reduce_bits_horiz);
    589          assert(0 <= sum && sum < (1 << max_bits_horiz));
    590          tmp[(k + 7) * 8 + (l + 4)] = sum;
    591          sx += alpha;
    592        }
    593      }
    594 
    595      // Vertical filter
    596      for (int k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
    597        int sy = sy4 + delta * (k + 4);
    598        for (int l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
    599          // At this point, sy = sy4 + gamma * l + delta * k
    600          const int offs = ROUND_POWER_OF_TWO(sy, WARPEDDIFF_PREC_BITS) +
    601                           WARPEDPIXEL_PREC_SHIFTS;
    602          assert(offs >= 0 && offs <= WARPEDPIXEL_PREC_SHIFTS * 3);
    603          const WarpedFilterCoeff *coeffs = av1_warped_filter[offs];
    604 
    605          int32_t sum = 1 << offset_bits_vert;
    606          for (int m = 0; m < 8; ++m) {
    607            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
    608          }
    609 
    610          if (conv_params->is_compound) {
    611            CONV_BUF_TYPE *p =
    612                &conv_params
    613                     ->dst[(i - p_row + k + 4) * conv_params->dst_stride +
    614                           (j - p_col + l + 4)];
    615            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
    616            if (conv_params->do_average) {
    617              uint8_t *dst8 =
    618                  &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
    619              int32_t tmp32 = *p;
    620              if (conv_params->use_dist_wtd_comp_avg) {
    621                tmp32 = tmp32 * conv_params->fwd_offset +
    622                        sum * conv_params->bck_offset;
    623                tmp32 = tmp32 >> DIST_PRECISION_BITS;
    624              } else {
    625                tmp32 += sum;
    626                tmp32 = tmp32 >> 1;
    627              }
    628              tmp32 = tmp32 - (1 << (offset_bits - conv_params->round_1)) -
    629                      (1 << (offset_bits - conv_params->round_1 - 1));
    630              *dst8 = clip_pixel(ROUND_POWER_OF_TWO(tmp32, round_bits));
    631            } else {
    632              *p = sum;
    633            }
    634          } else {
    635            uint8_t *p =
    636                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
    637            sum = ROUND_POWER_OF_TWO(sum, reduce_bits_vert);
    638            assert(0 <= sum && sum < (1 << (bd + 2)));
    639            *p = clip_pixel(sum - (1 << (bd - 1)) - (1 << bd));
    640          }
    641          sy += gamma;
    642        }
    643      }
    644    }
    645  }
    646 }
    647 
    648 void warp_plane(WarpedMotionParams *wm, const uint8_t *const ref, int width,
    649                int height, int stride, uint8_t *pred, int p_col, int p_row,
    650                int p_width, int p_height, int p_stride, int subsampling_x,
    651                int subsampling_y, ConvolveParams *conv_params) {
    652  const int32_t *const mat = wm->wmmat;
    653  const int16_t alpha = wm->alpha;
    654  const int16_t beta = wm->beta;
    655  const int16_t gamma = wm->gamma;
    656  const int16_t delta = wm->delta;
    657  av1_warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
    658                  p_height, p_stride, subsampling_x, subsampling_y, conv_params,
    659                  alpha, beta, gamma, delta);
    660 }
    661 
    662 void av1_warp_plane(WarpedMotionParams *wm, int use_hbd, int bd,
    663                    const uint8_t *ref, int width, int height, int stride,
    664                    uint8_t *pred, int p_col, int p_row, int p_width,
    665                    int p_height, int p_stride, int subsampling_x,
    666                    int subsampling_y, ConvolveParams *conv_params) {
    667 #if CONFIG_AV1_HIGHBITDEPTH
    668  if (use_hbd)
    669    highbd_warp_plane(wm, CONVERT_TO_SHORTPTR(ref), width, height, stride,
    670                      CONVERT_TO_SHORTPTR(pred), p_col, p_row, p_width,
    671                      p_height, p_stride, subsampling_x, subsampling_y, bd,
    672                      conv_params);
    673  else
    674    warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
    675               p_height, p_stride, subsampling_x, subsampling_y, conv_params);
    676 #else
    677  (void)use_hbd;
    678  (void)bd;
    679  warp_plane(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
    680             p_height, p_stride, subsampling_x, subsampling_y, conv_params);
    681 #endif
    682 }
    683 
    684 #define LS_MV_MAX 256  // max mv in 1/8-pel
    685 // Use LS_STEP = 8 so that 2 less bits needed for A, Bx, By.
    686 #define LS_STEP 8
    687 
    688 // Assuming LS_MV_MAX is < MAX_SB_SIZE * 8,
    689 // the precision needed is:
    690 //   (MAX_SB_SIZE_LOG2 + 3) [for sx * sx magnitude] +
    691 //   (MAX_SB_SIZE_LOG2 + 4) [for sx * dx magnitude] +
    692 //   1 [for sign] +
    693 //   LEAST_SQUARES_SAMPLES_MAX_BITS
    694 //        [for adding up to LEAST_SQUARES_SAMPLES_MAX samples]
    695 // The value is 23
    696 #define LS_MAT_RANGE_BITS \
    697  ((MAX_SB_SIZE_LOG2 + 4) * 2 + LEAST_SQUARES_SAMPLES_MAX_BITS)
    698 
    699 // Bit-depth reduction from the full-range
    700 #define LS_MAT_DOWN_BITS 2
    701 
    702 // bits range of A, Bx and By after downshifting
    703 #define LS_MAT_BITS (LS_MAT_RANGE_BITS - LS_MAT_DOWN_BITS)
    704 #define LS_MAT_MIN (-(1 << (LS_MAT_BITS - 1)))
    705 #define LS_MAT_MAX ((1 << (LS_MAT_BITS - 1)) - 1)
    706 
// By setting LS_STEP = 8, the least significant 2 bits of every element in
// A, Bx and By are 0. So we can shift right by 2 extra bits here, on top of
// the LS_MAT_DOWN_BITS reduction.
    709 #define LS_SQUARE(a)                                              \
    710  (((a) * (a) * 4 + (a) * 4 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
    711   (2 + LS_MAT_DOWN_BITS))
    712 #define LS_PRODUCT1(a, b)                                             \
    713  (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP) >> \
    714   (2 + LS_MAT_DOWN_BITS))
    715 #define LS_PRODUCT2(a, b)                                                 \
    716  (((a) * (b) * 4 + ((a) + (b)) * 2 * LS_STEP + LS_STEP * LS_STEP * 2) >> \
    717   (2 + LS_MAT_DOWN_BITS))
    718 
    719 #define USE_LIMITED_PREC_MULT 0
    720 
    721 #if USE_LIMITED_PREC_MULT
    722 
    723 #define MUL_PREC_BITS 16
    724 static uint16_t resolve_multiplier_64(uint64_t D, int16_t *shift) {
    725  int msb = 0;
    726  uint16_t mult = 0;
    727  *shift = 0;
    728  if (D != 0) {
    729    msb = (int16_t)((D >> 32) ? get_msb((unsigned int)(D >> 32)) + 32
    730                              : get_msb((unsigned int)D));
    731    if (msb >= MUL_PREC_BITS) {
    732      mult = (uint16_t)ROUND_POWER_OF_TWO_64(D, msb + 1 - MUL_PREC_BITS);
    733      *shift = msb + 1 - MUL_PREC_BITS;
    734    } else {
    735      mult = (uint16_t)D;
    736      *shift = 0;
    737    }
    738  }
    739  return mult;
    740 }
    741 
    742 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
    743  int32_t ret;
    744  int16_t mshift;
    745  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
    746  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
    747  shift -= mshift;
    748  if (shift > 0) {
    749    return (int32_t)clamp(ROUND_POWER_OF_TWO_SIGNED(v, shift),
    750                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
    751                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    752  } else {
    753    return (int32_t)clamp(v * (1 << (-shift)),
    754                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
    755                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    756  }
    757  return ret;
    758 }
    759 
    760 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
    761  int16_t mshift;
    762  uint16_t Mul = resolve_multiplier_64(llabs(Px), &mshift);
    763  int32_t v = (int32_t)Mul * (int32_t)iDet * (Px < 0 ? -1 : 1);
    764  shift -= mshift;
    765  if (shift > 0) {
    766    return (int32_t)clamp(
    767        ROUND_POWER_OF_TWO_SIGNED(v, shift),
    768        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
    769        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    770  } else {
    771    return (int32_t)clamp(
    772        v * (1 << (-shift)),
    773        (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
    774        (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    775  }
    776 }
    777 
    778 #else
    779 
    780 static int32_t get_mult_shift_ndiag(int64_t Px, int16_t iDet, int shift) {
    781  int64_t v = Px * (int64_t)iDet;
    782  return (int32_t)clamp64(ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
    783                          -WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
    784                          WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    785 }
    786 
    787 static int32_t get_mult_shift_diag(int64_t Px, int16_t iDet, int shift) {
    788  int64_t v = Px * (int64_t)iDet;
    789  return (int32_t)clamp64(
    790      ROUND_POWER_OF_TWO_SIGNED_64(v, shift),
    791      (1 << WARPEDMODEL_PREC_BITS) - WARPEDMODEL_NONDIAGAFFINE_CLAMP + 1,
    792      (1 << WARPEDMODEL_PREC_BITS) + WARPEDMODEL_NONDIAGAFFINE_CLAMP - 1);
    793 }
    794 #endif  // USE_LIMITED_PREC_MULT
    795 
// Fit the affine part of the warp model (wm->wmmat[2..5]) to the sample
// correspondences pts1 -> pts2 by integer least squares, then derive the
// translation (wm->wmmat[0..1]) so that the block center moves exactly by
// the transmitted MV (mvx, mvy, in 1/8-pel units).
// Returns 0 on success, 1 if the normal-equation system is degenerate.
static int find_affine_int(int np, const int *pts1, const int *pts2,
                           BLOCK_SIZE bsize, int mvy, int mvx,
                           WarpedMotionParams *wm, int mi_row, int mi_col) {
  // Normal-equation accumulators: A = P'P (symmetric, A[1][0] never stored),
  // Bx = P'q, By = P'r -- see the derivation in the comment below.
  int32_t A[2][2] = { { 0, 0 }, { 0, 0 } };
  int32_t Bx[2] = { 0, 0 };
  int32_t By[2] = { 0, 0 };

  const int bw = block_size_wide[bsize];
  const int bh = block_size_high[bsize];
  // Block center in full-pel (rsu*) and 1/8-pel (su*) units.
  const int rsuy = bh / 2 - 1;
  const int rsux = bw / 2 - 1;
  const int suy = rsuy * 8;
  const int sux = rsux * 8;
  // Destination-side center: source center displaced by the block MV.
  const int duy = suy + mvy;
  const int dux = sux + mvx;

  // Assume the center pixel of the block has exactly the same motion vector
  // as transmitted for the block. First shift the origin of the source
  // points to the block center, and the origin of the destination points to
  // the block center added to the motion vector transmitted.
  // Let (xi, yi) denote the source points and (xi', yi') denote destination
  // points after origin shifting, for i = 0, 1, 2, .... n-1.
  // Then if  P = [x0, y0,
  //               x1, y1,
  //               x2, y2,
  //                ....
  //              ]
  //          q = [x0', x1', x2', ... ]'
  //          r = [y0', y1', y2', ... ]'
  // the least squares problems that need to be solved are:
  //          [h1, h2]' = inv(P'P)P'q and
  //          [h3, h4]' = inv(P'P)P'r
  // where the affine transformation is given by:
  //          x' = h1.x + h2.y
  //          y' = h3.x + h4.y
  //
  // The loop below computes: A = P'P, Bx = P'q, By = P'r
  // We need to just compute inv(A).Bx and inv(A).By for the solutions.
  // Contribution from neighbor block
  for (int i = 0; i < np; i++) {
    const int dx = pts2[i * 2] - dux;
    const int dy = pts2[i * 2 + 1] - duy;
    const int sx = pts1[i * 2] - sux;
    const int sy = pts1[i * 2 + 1] - suy;
    // (TODO)yunqing: This comparison wouldn't be necessary if the sample
    // selection is done in find_samples(). Also, global offset can be removed
    // while collecting samples.
    if (abs(sx - dx) < LS_MV_MAX && abs(sy - dy) < LS_MV_MAX) {
      // The LS_* macros fold in the LS_STEP half-sample offset and drop
      // 2 + LS_MAT_DOWN_BITS bits to keep the accumulators in range.
      A[0][0] += LS_SQUARE(sx);
      A[0][1] += LS_PRODUCT1(sx, sy);
      A[1][1] += LS_SQUARE(sy);
      Bx[0] += LS_PRODUCT2(sx, dx);
      Bx[1] += LS_PRODUCT1(sy, dx);
      By[0] += LS_PRODUCT1(sx, dy);
      By[1] += LS_PRODUCT2(sy, dy);
    }
  }

  // Just for debugging, and can be removed later.
  assert(A[0][0] >= LS_MAT_MIN && A[0][0] <= LS_MAT_MAX);
  assert(A[0][1] >= LS_MAT_MIN && A[0][1] <= LS_MAT_MAX);
  assert(A[1][1] >= LS_MAT_MIN && A[1][1] <= LS_MAT_MAX);
  assert(Bx[0] >= LS_MAT_MIN && Bx[0] <= LS_MAT_MAX);
  assert(Bx[1] >= LS_MAT_MIN && Bx[1] <= LS_MAT_MAX);
  assert(By[0] >= LS_MAT_MIN && By[0] <= LS_MAT_MAX);
  assert(By[1] >= LS_MAT_MIN && By[1] <= LS_MAT_MAX);

  // Compute Determinant of A
  const int64_t Det = (int64_t)A[0][0] * A[1][1] - (int64_t)A[0][1] * A[0][1];
  if (Det == 0) return 1;

  // Replace the division by Det with a multiplication by its fixed-point
  // reciprocal iDet, to be applied with a right shift of `shift` bits.
  int16_t shift;
  int16_t iDet = resolve_divisor_64(llabs(Det), &shift) * (Det < 0 ? -1 : 1);
  shift -= WARPEDMODEL_PREC_BITS;
  if (shift < 0) {
    iDet <<= (-shift);
    shift = 0;
  }

  int64_t Px[2], Py[2];
  // These divided by the Det, are the least squares solutions
  // (adjugate of A times Bx / By).
  Px[0] = (int64_t)A[1][1] * Bx[0] - (int64_t)A[0][1] * Bx[1];
  Px[1] = -(int64_t)A[0][1] * Bx[0] + (int64_t)A[0][0] * Bx[1];
  Py[0] = (int64_t)A[1][1] * By[0] - (int64_t)A[0][1] * By[1];
  Py[1] = -(int64_t)A[0][1] * By[0] + (int64_t)A[0][0] * By[1];

  // Diagonal terms are clamped around 1.0 (1 << WARPEDMODEL_PREC_BITS),
  // off-diagonal terms around 0.
  wm->wmmat[2] = get_mult_shift_diag(Px[0], iDet, shift);
  wm->wmmat[3] = get_mult_shift_ndiag(Px[1], iDet, shift);
  wm->wmmat[4] = get_mult_shift_ndiag(Py[0], iDet, shift);
  wm->wmmat[5] = get_mult_shift_diag(Py[1], iDet, shift);

  // Block-center position in full-pel frame coordinates.
  const int isuy = (mi_row * MI_SIZE + rsuy);
  const int isux = (mi_col * MI_SIZE + rsux);
  // Note: In the vx, vy expressions below, the max value of each of the
  // 2nd and 3rd terms are (2^16 - 1) * (2^13 - 1). That leaves enough room
  // for the first term so that the overall sum in the worst case fits
  // within 32 bits overall.
  const int32_t vx = mvx * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * (wm->wmmat[2] - (1 << WARPEDMODEL_PREC_BITS)) +
                      isuy * wm->wmmat[3]);
  const int32_t vy = mvy * (1 << (WARPEDMODEL_PREC_BITS - 3)) -
                     (isux * wm->wmmat[4] +
                      isuy * (wm->wmmat[5] - (1 << WARPEDMODEL_PREC_BITS)));
  wm->wmmat[0] =
      clamp(vx, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  wm->wmmat[1] =
      clamp(vy, -WARPEDMODEL_TRANS_CLAMP, WARPEDMODEL_TRANS_CLAMP - 1);
  return 0;
}
    905 
    906 int av1_find_projection(int np, const int *pts1, const int *pts2,
    907                        BLOCK_SIZE bsize, int mvy, int mvx,
    908                        WarpedMotionParams *wm_params, int mi_row, int mi_col) {
    909  assert(wm_params->wmtype == AFFINE);
    910 
    911  if (find_affine_int(np, pts1, pts2, bsize, mvy, mvx, wm_params, mi_row,
    912                      mi_col))
    913    return 1;
    914 
    915  // check compatibility with the fast warp filter
    916  if (!av1_get_shear_params(wm_params)) return 1;
    917 
    918  return 0;
    919 }