tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

vp9dsp_template.c (89045B)


      1 /*
      2 * VP9 compatible video decoder
      3 *
      4 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
      5 * Copyright (C) 2013 Clément Bœsch <u pkh me>
      6 *
      7 * This file is part of FFmpeg.
      8 *
      9 * FFmpeg is free software; you can redistribute it and/or
     10 * modify it under the terms of the GNU Lesser General Public
     11 * License as published by the Free Software Foundation; either
     12 * version 2.1 of the License, or (at your option) any later version.
     13 *
     14 * FFmpeg is distributed in the hope that it will be useful,
     15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     17 * Lesser General Public License for more details.
     18 *
     19 * You should have received a copy of the GNU Lesser General Public
     20 * License along with FFmpeg; if not, write to the Free Software
     21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     22 */
     23 
     24 #include "libavutil/common.h"
     25 #include "bit_depth_template.c"
     26 #include "vp9dsp.h"
     27 
     28 #if BIT_DEPTH != 12
     29 
     30 // FIXME see whether we can merge parts of this (perhaps at least 4x4 and 8x8)
     31 // back with h264pred.[ch]
     32 
     33 static void vert_4x4_c(uint8_t *restrict _dst, ptrdiff_t stride,
     34                       const uint8_t *left, const uint8_t *_top)
     35 {
     36    pixel *dst = (pixel *) _dst;
     37    const pixel *top = (const pixel *) _top;
     38    pixel4 p4 = AV_RN4PA(top);
     39 
     40    stride /= sizeof(pixel);
     41    AV_WN4PA(dst + stride * 0, p4);
     42    AV_WN4PA(dst + stride * 1, p4);
     43    AV_WN4PA(dst + stride * 2, p4);
     44    AV_WN4PA(dst + stride * 3, p4);
     45 }
     46 
// Vertical intra prediction, 8x8: copy the 8-pixel row above the block
// into every row of the block.
static void vert_8x8_c(uint8_t *restrict _dst, ptrdiff_t stride,
                      const uint8_t *left, const uint8_t *_top)
{
    pixel *dst = (pixel *) _dst;
    const pixel *top = (const pixel *) _top;
#if BIT_DEPTH == 8
    uint64_t p8 = AV_RN64A(top);    // whole row fits one aligned 64-bit word
#else
    pixel4 p4a = AV_RN4PA(top + 0); // row cached as two aligned pixel4 words
    pixel4 p4b = AV_RN4PA(top + 4);
#endif
    int y;

    stride /= sizeof(pixel);        // byte stride -> pixel stride
    for (y = 0; y < 8; y++) {
#if BIT_DEPTH == 8
        AV_WN64A(dst, p8);
#else
        AV_WN4PA(dst + 0, p4a);
        AV_WN4PA(dst + 4, p4b);
#endif
        dst += stride;
    }
}
     71 
// Vertical intra prediction, 16x16: copy the 16-pixel row above the block
// into every row of the block.
static void vert_16x16_c(uint8_t *restrict _dst, ptrdiff_t stride,
                        const uint8_t *left, const uint8_t *_top)
{
    pixel *dst = (pixel *) _dst;
    const pixel *top = (const pixel *) _top;
#if BIT_DEPTH == 8
    uint64_t p8a = AV_RN64A(top);       // row cached as two aligned 64-bit words
    uint64_t p8b = AV_RN64A(top + 8);
#else
    pixel4 p4a = AV_RN4PA(top +  0);    // row cached as four aligned pixel4 words
    pixel4 p4b = AV_RN4PA(top +  4);
    pixel4 p4c = AV_RN4PA(top +  8);
    pixel4 p4d = AV_RN4PA(top + 12);
#endif
    int y;

    stride /= sizeof(pixel);            // byte stride -> pixel stride
    for (y = 0; y < 16; y++) {
#if BIT_DEPTH == 8
        AV_WN64A(dst +  0, p8a);
        AV_WN64A(dst +  8, p8b);
#else
        AV_WN4PA(dst +  0, p4a);
        AV_WN4PA(dst +  4, p4b);
        AV_WN4PA(dst +  8, p4c);
        AV_WN4PA(dst + 12, p4d);
#endif
        dst += stride;
    }
}
    102 
// Vertical intra prediction, 32x32: copy the 32-pixel row above the block
// into every row of the block.
static void vert_32x32_c(uint8_t *restrict _dst, ptrdiff_t stride,
                        const uint8_t *left, const uint8_t *_top)
{
    pixel *dst = (pixel *) _dst;
    const pixel *top = (const pixel *) _top;
#if BIT_DEPTH == 8
    uint64_t p8a = AV_RN64A(top);       // row cached as four aligned 64-bit words
    uint64_t p8b = AV_RN64A(top + 8);
    uint64_t p8c = AV_RN64A(top + 16);
    uint64_t p8d = AV_RN64A(top + 24);
#else
    pixel4 p4a = AV_RN4PA(top +  0);    // row cached as eight aligned pixel4 words
    pixel4 p4b = AV_RN4PA(top +  4);
    pixel4 p4c = AV_RN4PA(top +  8);
    pixel4 p4d = AV_RN4PA(top + 12);
    pixel4 p4e = AV_RN4PA(top + 16);
    pixel4 p4f = AV_RN4PA(top + 20);
    pixel4 p4g = AV_RN4PA(top + 24);
    pixel4 p4h = AV_RN4PA(top + 28);
#endif
    int y;

    stride /= sizeof(pixel);            // byte stride -> pixel stride
    for (y = 0; y < 32; y++) {
#if BIT_DEPTH == 8
        AV_WN64A(dst +  0, p8a);
        AV_WN64A(dst +  8, p8b);
        AV_WN64A(dst + 16, p8c);
        AV_WN64A(dst + 24, p8d);
#else
        AV_WN4PA(dst +  0, p4a);
        AV_WN4PA(dst +  4, p4b);
        AV_WN4PA(dst +  8, p4c);
        AV_WN4PA(dst + 12, p4d);
        AV_WN4PA(dst + 16, p4e);
        AV_WN4PA(dst + 20, p4f);
        AV_WN4PA(dst + 24, p4g);
        AV_WN4PA(dst + 28, p4h);
#endif
        dst += stride;
    }
}
    145 
    146 static void hor_4x4_c(uint8_t *_dst, ptrdiff_t stride,
    147                      const uint8_t *_left, const uint8_t *top)
    148 {
    149    pixel *dst = (pixel *) _dst;
    150    const pixel *left = (const pixel *) _left;
    151 
    152    stride /= sizeof(pixel);
    153    AV_WN4PA(dst + stride * 0, PIXEL_SPLAT_X4(left[3]));
    154    AV_WN4PA(dst + stride * 1, PIXEL_SPLAT_X4(left[2]));
    155    AV_WN4PA(dst + stride * 2, PIXEL_SPLAT_X4(left[1]));
    156    AV_WN4PA(dst + stride * 3, PIXEL_SPLAT_X4(left[0]));
    157 }
    158 
    159 static void hor_8x8_c(uint8_t *_dst, ptrdiff_t stride,
    160                      const uint8_t *_left, const uint8_t *top)
    161 {
    162    pixel *dst = (pixel *) _dst;
    163    const pixel *left = (const pixel *) _left;
    164    int y;
    165 
    166    stride /= sizeof(pixel);
    167    for (y = 0; y < 8; y++) {
    168        pixel4 p4 = PIXEL_SPLAT_X4(left[7 - y]);
    169 
    170        AV_WN4PA(dst + 0, p4);
    171        AV_WN4PA(dst + 4, p4);
    172        dst += stride;
    173    }
    174 }
    175 
    176 static void hor_16x16_c(uint8_t *_dst, ptrdiff_t stride,
    177                        const uint8_t *_left, const uint8_t *top)
    178 {
    179    pixel *dst = (pixel *) _dst;
    180    const pixel *left = (const pixel *) _left;
    181    int y;
    182 
    183    stride /= sizeof(pixel);
    184    for (y = 0; y < 16; y++) {
    185        pixel4 p4 = PIXEL_SPLAT_X4(left[15 - y]);
    186 
    187        AV_WN4PA(dst +  0, p4);
    188        AV_WN4PA(dst +  4, p4);
    189        AV_WN4PA(dst +  8, p4);
    190        AV_WN4PA(dst + 12, p4);
    191        dst += stride;
    192    }
    193 }
    194 
    195 static void hor_32x32_c(uint8_t *_dst, ptrdiff_t stride,
    196                        const uint8_t *_left, const uint8_t *top)
    197 {
    198    pixel *dst = (pixel *) _dst;
    199    const pixel *left = (const pixel *) _left;
    200    int y;
    201 
    202    stride /= sizeof(pixel);
    203    for (y = 0; y < 32; y++) {
    204        pixel4 p4 = PIXEL_SPLAT_X4(left[31 - y]);
    205 
    206        AV_WN4PA(dst +  0, p4);
    207        AV_WN4PA(dst +  4, p4);
    208        AV_WN4PA(dst +  8, p4);
    209        AV_WN4PA(dst + 12, p4);
    210        AV_WN4PA(dst + 16, p4);
    211        AV_WN4PA(dst + 20, p4);
    212        AV_WN4PA(dst + 24, p4);
    213        AV_WN4PA(dst + 28, p4);
    214        dst += stride;
    215    }
    216 }
    217 
    218 #endif /* BIT_DEPTH != 12 */
    219 
    220 static void tm_4x4_c(uint8_t *_dst, ptrdiff_t stride,
    221                     const uint8_t *_left, const uint8_t *_top)
    222 {
    223    pixel *dst = (pixel *) _dst;
    224    const pixel *left = (const pixel *) _left;
    225    const pixel *top = (const pixel *) _top;
    226    int y, tl = top[-1];
    227 
    228    stride /= sizeof(pixel);
    229    for (y = 0; y < 4; y++) {
    230        int l_m_tl = left[3 - y] - tl;
    231 
    232        dst[0] = av_clip_pixel(top[0] + l_m_tl);
    233        dst[1] = av_clip_pixel(top[1] + l_m_tl);
    234        dst[2] = av_clip_pixel(top[2] + l_m_tl);
    235        dst[3] = av_clip_pixel(top[3] + l_m_tl);
    236        dst += stride;
    237    }
    238 }
    239 
    240 static void tm_8x8_c(uint8_t *_dst, ptrdiff_t stride,
    241                     const uint8_t *_left, const uint8_t *_top)
    242 {
    243    pixel *dst = (pixel *) _dst;
    244    const pixel *left = (const pixel *) _left;
    245    const pixel *top = (const pixel *) _top;
    246    int y, tl = top[-1];
    247 
    248    stride /= sizeof(pixel);
    249    for (y = 0; y < 8; y++) {
    250        int l_m_tl = left[7 - y] - tl;
    251 
    252        dst[0] = av_clip_pixel(top[0] + l_m_tl);
    253        dst[1] = av_clip_pixel(top[1] + l_m_tl);
    254        dst[2] = av_clip_pixel(top[2] + l_m_tl);
    255        dst[3] = av_clip_pixel(top[3] + l_m_tl);
    256        dst[4] = av_clip_pixel(top[4] + l_m_tl);
    257        dst[5] = av_clip_pixel(top[5] + l_m_tl);
    258        dst[6] = av_clip_pixel(top[6] + l_m_tl);
    259        dst[7] = av_clip_pixel(top[7] + l_m_tl);
    260        dst += stride;
    261    }
    262 }
    263 
    264 static void tm_16x16_c(uint8_t *_dst, ptrdiff_t stride,
    265                       const uint8_t *_left, const uint8_t *_top)
    266 {
    267    pixel *dst = (pixel *) _dst;
    268    const pixel *left = (const pixel *) _left;
    269    const pixel *top = (const pixel *) _top;
    270    int y, tl = top[-1];
    271 
    272    stride /= sizeof(pixel);
    273    for (y = 0; y < 16; y++) {
    274        int l_m_tl = left[15 - y] - tl;
    275 
    276        dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
    277        dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
    278        dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
    279        dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
    280        dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
    281        dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
    282        dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
    283        dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
    284        dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
    285        dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
    286        dst[10] = av_clip_pixel(top[10] + l_m_tl);
    287        dst[11] = av_clip_pixel(top[11] + l_m_tl);
    288        dst[12] = av_clip_pixel(top[12] + l_m_tl);
    289        dst[13] = av_clip_pixel(top[13] + l_m_tl);
    290        dst[14] = av_clip_pixel(top[14] + l_m_tl);
    291        dst[15] = av_clip_pixel(top[15] + l_m_tl);
    292        dst += stride;
    293    }
    294 }
    295 
    296 static void tm_32x32_c(uint8_t *_dst, ptrdiff_t stride,
    297                       const uint8_t *_left, const uint8_t *_top)
    298 {
    299    pixel *dst = (pixel *) _dst;
    300    const pixel *left = (const pixel *) _left;
    301    const pixel *top = (const pixel *) _top;
    302    int y, tl = top[-1];
    303 
    304    stride /= sizeof(pixel);
    305    for (y = 0; y < 32; y++) {
    306        int l_m_tl = left[31 - y] - tl;
    307 
    308        dst[ 0] = av_clip_pixel(top[ 0] + l_m_tl);
    309        dst[ 1] = av_clip_pixel(top[ 1] + l_m_tl);
    310        dst[ 2] = av_clip_pixel(top[ 2] + l_m_tl);
    311        dst[ 3] = av_clip_pixel(top[ 3] + l_m_tl);
    312        dst[ 4] = av_clip_pixel(top[ 4] + l_m_tl);
    313        dst[ 5] = av_clip_pixel(top[ 5] + l_m_tl);
    314        dst[ 6] = av_clip_pixel(top[ 6] + l_m_tl);
    315        dst[ 7] = av_clip_pixel(top[ 7] + l_m_tl);
    316        dst[ 8] = av_clip_pixel(top[ 8] + l_m_tl);
    317        dst[ 9] = av_clip_pixel(top[ 9] + l_m_tl);
    318        dst[10] = av_clip_pixel(top[10] + l_m_tl);
    319        dst[11] = av_clip_pixel(top[11] + l_m_tl);
    320        dst[12] = av_clip_pixel(top[12] + l_m_tl);
    321        dst[13] = av_clip_pixel(top[13] + l_m_tl);
    322        dst[14] = av_clip_pixel(top[14] + l_m_tl);
    323        dst[15] = av_clip_pixel(top[15] + l_m_tl);
    324        dst[16] = av_clip_pixel(top[16] + l_m_tl);
    325        dst[17] = av_clip_pixel(top[17] + l_m_tl);
    326        dst[18] = av_clip_pixel(top[18] + l_m_tl);
    327        dst[19] = av_clip_pixel(top[19] + l_m_tl);
    328        dst[20] = av_clip_pixel(top[20] + l_m_tl);
    329        dst[21] = av_clip_pixel(top[21] + l_m_tl);
    330        dst[22] = av_clip_pixel(top[22] + l_m_tl);
    331        dst[23] = av_clip_pixel(top[23] + l_m_tl);
    332        dst[24] = av_clip_pixel(top[24] + l_m_tl);
    333        dst[25] = av_clip_pixel(top[25] + l_m_tl);
    334        dst[26] = av_clip_pixel(top[26] + l_m_tl);
    335        dst[27] = av_clip_pixel(top[27] + l_m_tl);
    336        dst[28] = av_clip_pixel(top[28] + l_m_tl);
    337        dst[29] = av_clip_pixel(top[29] + l_m_tl);
    338        dst[30] = av_clip_pixel(top[30] + l_m_tl);
    339        dst[31] = av_clip_pixel(top[31] + l_m_tl);
    340        dst += stride;
    341    }
    342 }
    343 
    344 #if BIT_DEPTH != 12
    345 
    346 static void dc_4x4_c(uint8_t *_dst, ptrdiff_t stride,
    347                     const uint8_t *_left, const uint8_t *_top)
    348 {
    349    pixel *dst = (pixel *) _dst;
    350    const pixel *left = (const pixel *) _left;
    351    const pixel *top = (const pixel *) _top;
    352    pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] +
    353                                top[0] + top[1] + top[2] + top[3] + 4) >> 3);
    354 
    355    stride /= sizeof(pixel);
    356    AV_WN4PA(dst + stride * 0, dc);
    357    AV_WN4PA(dst + stride * 1, dc);
    358    AV_WN4PA(dst + stride * 2, dc);
    359    AV_WN4PA(dst + stride * 3, dc);
    360 }
    361 
    362 static void dc_8x8_c(uint8_t *_dst, ptrdiff_t stride,
    363                     const uint8_t *_left, const uint8_t *_top)
    364 {
    365    pixel *dst = (pixel *) _dst;
    366    const pixel *left = (const pixel *) _left;
    367    const pixel *top = (const pixel *) _top;
    368    pixel4 dc = PIXEL_SPLAT_X4
    369        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
    370          left[6] + left[7] + top[0] + top[1] + top[2] + top[3] +
    371          top[4] + top[5] + top[6] + top[7] + 8) >> 4);
    372    int y;
    373 
    374    stride /= sizeof(pixel);
    375    for (y = 0; y < 8; y++) {
    376        AV_WN4PA(dst + 0, dc);
    377        AV_WN4PA(dst + 4, dc);
    378        dst += stride;
    379    }
    380 }
    381 
    382 static void dc_16x16_c(uint8_t *_dst, ptrdiff_t stride,
    383                       const uint8_t *_left, const uint8_t *_top)
    384 {
    385    pixel *dst = (pixel *) _dst;
    386    const pixel *left = (const pixel *) _left;
    387    const pixel *top = (const pixel *) _top;
    388    pixel4 dc = PIXEL_SPLAT_X4
    389        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
    390          left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
    391          left[13] + left[14] + left[15] + top[0] + top[1] + top[2] + top[3] +
    392          top[4] + top[5] + top[6] + top[7] + top[8] + top[9] + top[10] +
    393          top[11] + top[12] + top[13] + top[14] + top[15] + 16) >> 5);
    394    int y;
    395 
    396    stride /= sizeof(pixel);
    397    for (y = 0; y < 16; y++) {
    398        AV_WN4PA(dst +  0, dc);
    399        AV_WN4PA(dst +  4, dc);
    400        AV_WN4PA(dst +  8, dc);
    401        AV_WN4PA(dst + 12, dc);
    402        dst += stride;
    403    }
    404 }
    405 
    406 static void dc_32x32_c(uint8_t *_dst, ptrdiff_t stride,
    407                       const uint8_t *_left, const uint8_t *_top)
    408 {
    409    pixel *dst = (pixel *) _dst;
    410    const pixel *left = (const pixel *) _left;
    411    const pixel *top = (const pixel *) _top;
    412    pixel4 dc = PIXEL_SPLAT_X4
    413        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] + left[6] +
    414          left[7] + left[8] + left[9] + left[10] + left[11] + left[12] +
    415          left[13] + left[14] + left[15] + left[16] + left[17] + left[18] +
    416          left[19] + left[20] + left[21] + left[22] + left[23] + left[24] +
    417          left[25] + left[26] + left[27] + left[28] + left[29] + left[30] +
    418          left[31] + top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
    419          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] + top[12] +
    420          top[13] + top[14] + top[15] + top[16] + top[17] + top[18] + top[19] +
    421          top[20] + top[21] + top[22] + top[23] + top[24] + top[25] + top[26] +
    422          top[27] + top[28] + top[29] + top[30] + top[31] + 32) >> 6);
    423    int y;
    424 
    425    stride /= sizeof(pixel);
    426    for (y = 0; y < 32; y++) {
    427        AV_WN4PA(dst +  0, dc);
    428        AV_WN4PA(dst +  4, dc);
    429        AV_WN4PA(dst +  8, dc);
    430        AV_WN4PA(dst + 12, dc);
    431        AV_WN4PA(dst + 16, dc);
    432        AV_WN4PA(dst + 20, dc);
    433        AV_WN4PA(dst + 24, dc);
    434        AV_WN4PA(dst + 28, dc);
    435        dst += stride;
    436    }
    437 }
    438 
    439 static void dc_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
    440                          const uint8_t *_left, const uint8_t *top)
    441 {
    442    pixel *dst = (pixel *) _dst;
    443    const pixel *left = (const pixel *) _left;
    444    pixel4 dc = PIXEL_SPLAT_X4((left[0] + left[1] + left[2] + left[3] + 2) >> 2);
    445 
    446    stride /= sizeof(pixel);
    447    AV_WN4PA(dst + stride * 0, dc);
    448    AV_WN4PA(dst + stride * 1, dc);
    449    AV_WN4PA(dst + stride * 2, dc);
    450    AV_WN4PA(dst + stride * 3, dc);
    451 }
    452 
    453 static void dc_left_8x8_c(uint8_t *_dst, ptrdiff_t stride,
    454                          const uint8_t *_left, const uint8_t *top)
    455 {
    456    pixel *dst = (pixel *) _dst;
    457    const pixel *left = (const pixel *) _left;
    458    pixel4 dc = PIXEL_SPLAT_X4
    459        ((left[0] + left[1] + left[2] + left[3] +
    460          left[4] + left[5] + left[6] + left[7] + 4) >> 3);
    461    int y;
    462 
    463    stride /= sizeof(pixel);
    464    for (y = 0; y < 8; y++) {
    465        AV_WN4PA(dst + 0, dc);
    466        AV_WN4PA(dst + 4, dc);
    467        dst += stride;
    468    }
    469 }
    470 
    471 static void dc_left_16x16_c(uint8_t *_dst, ptrdiff_t stride,
    472                            const uint8_t *_left, const uint8_t *top)
    473 {
    474    pixel *dst = (pixel *) _dst;
    475    const pixel *left = (const pixel *) _left;
    476    pixel4 dc = PIXEL_SPLAT_X4
    477        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
    478          left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
    479          left[12] + left[13] + left[14] + left[15] + 8) >> 4);
    480    int y;
    481 
    482    stride /= sizeof(pixel);
    483    for (y = 0; y < 16; y++) {
    484        AV_WN4PA(dst +  0, dc);
    485        AV_WN4PA(dst +  4, dc);
    486        AV_WN4PA(dst +  8, dc);
    487        AV_WN4PA(dst + 12, dc);
    488        dst += stride;
    489    }
    490 }
    491 
    492 static void dc_left_32x32_c(uint8_t *_dst, ptrdiff_t stride,
    493                            const uint8_t *_left, const uint8_t *top)
    494 {
    495    pixel *dst = (pixel *) _dst;
    496    const pixel *left = (const pixel *) _left;
    497    pixel4 dc = PIXEL_SPLAT_X4
    498        ((left[0] + left[1] + left[2] + left[3] + left[4] + left[5] +
    499          left[6] + left[7] + left[8] + left[9] + left[10] + left[11] +
    500          left[12] + left[13] + left[14] + left[15] + left[16] + left[17] +
    501          left[18] + left[19] + left[20] + left[21] + left[22] + left[23] +
    502          left[24] + left[25] + left[26] + left[27] + left[28] + left[29] +
    503          left[30] + left[31] + 16) >> 5);
    504    int y;
    505 
    506    stride /= sizeof(pixel);
    507    for (y = 0; y < 32; y++) {
    508        AV_WN4PA(dst +  0, dc);
    509        AV_WN4PA(dst +  4, dc);
    510        AV_WN4PA(dst +  8, dc);
    511        AV_WN4PA(dst + 12, dc);
    512        AV_WN4PA(dst + 16, dc);
    513        AV_WN4PA(dst + 20, dc);
    514        AV_WN4PA(dst + 24, dc);
    515        AV_WN4PA(dst + 28, dc);
    516        dst += stride;
    517    }
    518 }
    519 
    520 static void dc_top_4x4_c(uint8_t *_dst, ptrdiff_t stride,
    521                         const uint8_t *left, const uint8_t *_top)
    522 {
    523    pixel *dst = (pixel *) _dst;
    524    const pixel *top = (const pixel *) _top;
    525    pixel4 dc = PIXEL_SPLAT_X4((top[0] + top[1] + top[2] + top[3] + 2) >> 2);
    526 
    527    stride /= sizeof(pixel);
    528    AV_WN4PA(dst + stride * 0, dc);
    529    AV_WN4PA(dst + stride * 1, dc);
    530    AV_WN4PA(dst + stride * 2, dc);
    531    AV_WN4PA(dst + stride * 3, dc);
    532 }
    533 
    534 static void dc_top_8x8_c(uint8_t *_dst, ptrdiff_t stride,
    535                         const uint8_t *left, const uint8_t *_top)
    536 {
    537    pixel *dst = (pixel *) _dst;
    538    const pixel *top = (const pixel *) _top;
    539    pixel4 dc = PIXEL_SPLAT_X4
    540        ((top[0] + top[1] + top[2] + top[3] +
    541          top[4] + top[5] + top[6] + top[7] + 4) >> 3);
    542    int y;
    543 
    544    stride /= sizeof(pixel);
    545    for (y = 0; y < 8; y++) {
    546        AV_WN4PA(dst + 0, dc);
    547        AV_WN4PA(dst + 4, dc);
    548        dst += stride;
    549    }
    550 }
    551 
    552 static void dc_top_16x16_c(uint8_t *_dst, ptrdiff_t stride,
    553                           const uint8_t *left, const uint8_t *_top)
    554 {
    555    pixel *dst = (pixel *) _dst;
    556    const pixel *top = (const pixel *) _top;
    557    pixel4 dc = PIXEL_SPLAT_X4
    558        ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
    559          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
    560          top[12] + top[13] + top[14] + top[15] + 8) >> 4);
    561    int y;
    562 
    563    stride /= sizeof(pixel);
    564    for (y = 0; y < 16; y++) {
    565        AV_WN4PA(dst +  0, dc);
    566        AV_WN4PA(dst +  4, dc);
    567        AV_WN4PA(dst +  8, dc);
    568        AV_WN4PA(dst + 12, dc);
    569        dst += stride;
    570    }
    571 }
    572 
    573 static void dc_top_32x32_c(uint8_t *_dst, ptrdiff_t stride,
    574                           const uint8_t *left, const uint8_t *_top)
    575 {
    576    pixel *dst = (pixel *) _dst;
    577    const pixel *top = (const pixel *) _top;
    578    pixel4 dc = PIXEL_SPLAT_X4
    579        ((top[0] + top[1] + top[2] + top[3] + top[4] + top[5] +
    580          top[6] + top[7] + top[8] + top[9] + top[10] + top[11] +
    581          top[12] + top[13] + top[14] + top[15] + top[16] + top[17] +
    582          top[18] + top[19] + top[20] + top[21] + top[22] + top[23] +
    583          top[24] + top[25] + top[26] + top[27] + top[28] + top[29] +
    584          top[30] + top[31] + 16) >> 5);
    585    int y;
    586 
    587    stride /= sizeof(pixel);
    588    for (y = 0; y < 32; y++) {
    589        AV_WN4PA(dst +  0, dc);
    590        AV_WN4PA(dst +  4, dc);
    591        AV_WN4PA(dst +  8, dc);
    592        AV_WN4PA(dst + 12, dc);
    593        AV_WN4PA(dst + 16, dc);
    594        AV_WN4PA(dst + 20, dc);
    595        AV_WN4PA(dst + 24, dc);
    596        AV_WN4PA(dst + 28, dc);
    597        dst += stride;
    598    }
    599 }
    600 
    601 #endif /* BIT_DEPTH != 12 */
    602 
    603 static void dc_128_4x4_c(uint8_t *_dst, ptrdiff_t stride,
    604                         const uint8_t *left, const uint8_t *top)
    605 {
    606    pixel *dst = (pixel *) _dst;
    607    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
    608 
    609    stride /= sizeof(pixel);
    610    AV_WN4PA(dst + stride * 0, val);
    611    AV_WN4PA(dst + stride * 1, val);
    612    AV_WN4PA(dst + stride * 2, val);
    613    AV_WN4PA(dst + stride * 3, val);
    614 }
    615 
    616 static void dc_128_8x8_c(uint8_t *_dst, ptrdiff_t stride,
    617                         const uint8_t *left, const uint8_t *top)
    618 {
    619    pixel *dst = (pixel *) _dst;
    620    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
    621    int y;
    622 
    623    stride /= sizeof(pixel);
    624    for (y = 0; y < 8; y++) {
    625        AV_WN4PA(dst + 0, val);
    626        AV_WN4PA(dst + 4, val);
    627        dst += stride;
    628    }
    629 }
    630 
    631 static void dc_128_16x16_c(uint8_t *_dst, ptrdiff_t stride,
    632                           const uint8_t *left, const uint8_t *top)
    633 {
    634    pixel *dst = (pixel *) _dst;
    635    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
    636    int y;
    637 
    638    stride /= sizeof(pixel);
    639    for (y = 0; y < 16; y++) {
    640        AV_WN4PA(dst +  0, val);
    641        AV_WN4PA(dst +  4, val);
    642        AV_WN4PA(dst +  8, val);
    643        AV_WN4PA(dst + 12, val);
    644        dst += stride;
    645    }
    646 }
    647 
    648 static void dc_128_32x32_c(uint8_t *_dst, ptrdiff_t stride,
    649                           const uint8_t *left, const uint8_t *top)
    650 {
    651    pixel *dst = (pixel *) _dst;
    652    pixel4 val = PIXEL_SPLAT_X4(128 << (BIT_DEPTH - 8));
    653    int y;
    654 
    655    stride /= sizeof(pixel);
    656    for (y = 0; y < 32; y++) {
    657        AV_WN4PA(dst +  0, val);
    658        AV_WN4PA(dst +  4, val);
    659        AV_WN4PA(dst +  8, val);
    660        AV_WN4PA(dst + 12, val);
    661        AV_WN4PA(dst + 16, val);
    662        AV_WN4PA(dst + 20, val);
    663        AV_WN4PA(dst + 24, val);
    664        AV_WN4PA(dst + 28, val);
    665        dst += stride;
    666    }
    667 }
    668 
    669 static void dc_127_4x4_c(uint8_t *_dst, ptrdiff_t stride,
    670                         const uint8_t *left, const uint8_t *top)
    671 {
    672    pixel *dst = (pixel *) _dst;
    673    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
    674 
    675    stride /= sizeof(pixel);
    676    AV_WN4PA(dst + stride * 0, val);
    677    AV_WN4PA(dst + stride * 1, val);
    678    AV_WN4PA(dst + stride * 2, val);
    679    AV_WN4PA(dst + stride * 3, val);}
    680 
    681 static void dc_127_8x8_c(uint8_t *_dst, ptrdiff_t stride,
    682                         const uint8_t *left, const uint8_t *top)
    683 {
    684    pixel *dst = (pixel *) _dst;
    685    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
    686    int y;
    687 
    688    stride /= sizeof(pixel);
    689    for (y = 0; y < 8; y++) {
    690        AV_WN4PA(dst + 0, val);
    691        AV_WN4PA(dst + 4, val);
    692        dst += stride;
    693    }
    694 }
    695 
    696 static void dc_127_16x16_c(uint8_t *_dst, ptrdiff_t stride,
    697                           const uint8_t *left, const uint8_t *top)
    698 {
    699    pixel *dst = (pixel *) _dst;
    700    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
    701    int y;
    702 
    703    stride /= sizeof(pixel);
    704    for (y = 0; y < 16; y++) {
    705        AV_WN4PA(dst +  0, val);
    706        AV_WN4PA(dst +  4, val);
    707        AV_WN4PA(dst +  8, val);
    708        AV_WN4PA(dst + 12, val);
    709        dst += stride;
    710    }
    711 }
    712 
    713 static void dc_127_32x32_c(uint8_t *_dst, ptrdiff_t stride,
    714                           const uint8_t *left, const uint8_t *top)
    715 {
    716    pixel *dst = (pixel *) _dst;
    717    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) - 1);
    718    int y;
    719 
    720    stride /= sizeof(pixel);
    721    for (y = 0; y < 32; y++) {
    722        AV_WN4PA(dst +  0, val);
    723        AV_WN4PA(dst +  4, val);
    724        AV_WN4PA(dst +  8, val);
    725        AV_WN4PA(dst + 12, val);
    726        AV_WN4PA(dst + 16, val);
    727        AV_WN4PA(dst + 20, val);
    728        AV_WN4PA(dst + 24, val);
    729        AV_WN4PA(dst + 28, val);
    730        dst += stride;
    731    }
    732 }
    733 
    734 static void dc_129_4x4_c(uint8_t *_dst, ptrdiff_t stride,
    735                         const uint8_t *left, const uint8_t *top)
    736 {
    737    pixel *dst = (pixel *) _dst;
    738    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
    739 
    740    stride /= sizeof(pixel);
    741    AV_WN4PA(dst + stride * 0, val);
    742    AV_WN4PA(dst + stride * 1, val);
    743    AV_WN4PA(dst + stride * 2, val);
    744    AV_WN4PA(dst + stride * 3, val);
    745 }
    746 
    747 static void dc_129_8x8_c(uint8_t *_dst, ptrdiff_t stride,
    748                         const uint8_t *left, const uint8_t *top)
    749 {
    750    pixel *dst = (pixel *) _dst;
    751    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
    752    int y;
    753 
    754    stride /= sizeof(pixel);
    755    for (y = 0; y < 8; y++) {
    756        AV_WN4PA(dst + 0, val);
    757        AV_WN4PA(dst + 4, val);
    758        dst += stride;
    759    }
    760 }
    761 
    762 static void dc_129_16x16_c(uint8_t *_dst, ptrdiff_t stride,
    763                           const uint8_t *left, const uint8_t *top)
    764 {
    765    pixel *dst = (pixel *) _dst;
    766    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
    767    int y;
    768 
    769    stride /= sizeof(pixel);
    770    for (y = 0; y < 16; y++) {
    771        AV_WN4PA(dst +  0, val);
    772        AV_WN4PA(dst +  4, val);
    773        AV_WN4PA(dst +  8, val);
    774        AV_WN4PA(dst + 12, val);
    775        dst += stride;
    776    }
    777 }
    778 
    779 static void dc_129_32x32_c(uint8_t *_dst, ptrdiff_t stride,
    780                           const uint8_t *left, const uint8_t *top)
    781 {
    782    pixel *dst = (pixel *) _dst;
    783    pixel4 val = PIXEL_SPLAT_X4((128 << (BIT_DEPTH - 8)) + 1);
    784    int y;
    785 
    786    stride /= sizeof(pixel);
    787    for (y = 0; y < 32; y++) {
    788        AV_WN4PA(dst +  0, val);
    789        AV_WN4PA(dst +  4, val);
    790        AV_WN4PA(dst +  8, val);
    791        AV_WN4PA(dst + 12, val);
    792        AV_WN4PA(dst + 16, val);
    793        AV_WN4PA(dst + 20, val);
    794        AV_WN4PA(dst + 24, val);
    795        AV_WN4PA(dst + 28, val);
    796        dst += stride;
    797    }
    798 }
    799 
    800 #if BIT_DEPTH != 12
    801 
    802 #if BIT_DEPTH == 8
    803 #define memset_bpc memset
    804 #else
    805 static inline void memset_bpc(uint16_t *dst, int val, int len) {
    806    int n;
    807    for (n = 0; n < len; n++) {
    808        dst[n] = val;
    809    }
    810 }
    811 #endif
    812 
    813 #define DST(x, y) dst[(x) + (y) * stride]
    814 
// Diagonal down-left intra prediction, 4x4: every anti-diagonal (constant
// x + y) shares one value derived from the 8 pixels above the block with a
// (a + 2*b + c + 2) >> 2 smoothing filter.
static void diag_downleft_4x4_c(uint8_t *_dst, ptrdiff_t stride,
                               const uint8_t *left, const uint8_t *_top)
{
    pixel *dst = (pixel *) _dst;
    const pixel *top = (const pixel *) _top;
    int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
        a4 = top[4], a5 = top[5], a6 = top[6], a7 = top[7];

    stride /= sizeof(pixel);
    DST(0,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
    DST(1,0) = DST(0,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
    DST(2,0) = DST(1,1) = DST(0,2) = (a2 + a3 * 2 + a4 + 2) >> 2;
    DST(3,0) = DST(2,1) = DST(1,2) = DST(0,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
    DST(3,1) = DST(2,2) = DST(1,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
    DST(3,2) = DST(2,3) = (a5 + a6 * 2 + a7 + 2) >> 2;
    DST(3,3) = a7;  // note: this is different from vp8 and such
}
    832 
/*
 * def_diag_downleft(size): diagonal down-left prediction for
 * size = 8, 16, 32.  v[] holds the (1,2,1)/4-filtered top edge (the
 * last entry is filtered against a tripled top[size-1]); row j then
 * copies v[j..] and pads the trailing j+1 pixels with the unfiltered
 * corner pixel top[size-1].
 */
#define def_diag_downleft(size) \
static void diag_downleft_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                             const uint8_t *left, const uint8_t *_top) \
{ \
   pixel *dst = (pixel *) _dst; \
   const pixel *top = (const pixel *) _top; \
   int i, j; \
   pixel v[size - 1]; \
\
   stride /= sizeof(pixel); \
   for (i = 0; i < size - 2; i++) \
       v[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
   v[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
\
   for (j = 0; j < size; j++) { \
       memcpy(dst + j*stride, v + j, (size - 1 - j) * sizeof(pixel)); \
       memset_bpc(dst + j*stride + size - 1 - j, top[size - 1], j + 1); \
   } \
}

def_diag_downleft(8)
def_diag_downleft(16)
def_diag_downleft(32)
    856 
/*
 * Diagonal down-right 4x4 intra prediction.
 * Down-right diagonals share one (1,2,1)/4-filtered value taken from
 * the sequence left edge -> top-left corner -> top edge.  Note the
 * left edge is read in reverse index order here (l0 = left[3] is the
 * pixel combined with the top-left corner tl = top[-1]).
 */
static void diag_downright_4x4_c(uint8_t *_dst, ptrdiff_t stride,
                                const uint8_t *_left, const uint8_t *_top)
{
   pixel *dst = (pixel *) _dst;
   const pixel *top = (const pixel *) _top;
   const pixel *left = (const pixel *) _left;
   int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
       l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0];

   stride /= sizeof(pixel); /* byte stride -> pixel stride */
   DST(0,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
   DST(0,2) = DST(1,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
   DST(0,1) = DST(1,2) = DST(2,3) = (tl + l0 * 2 + l1 + 2) >> 2;
   DST(0,0) = DST(1,1) = DST(2,2) = DST(3,3) = (l0 + tl * 2 + a0 + 2) >> 2;
   DST(1,0) = DST(2,1) = DST(3,2) = (tl + a0 * 2 + a1 + 2) >> 2;
   DST(2,0) = DST(3,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
   DST(3,0) = (a1 + a2 * 2 + a3 + 2) >> 2;
}
    875 
/*
 * def_diag_downright(size): diagonal down-right prediction for
 * size = 8, 16, 32.  v[] is the filtered edge sequence: v[0..size-2]
 * from the left edge, v[size-1] centred on the top-left corner, and
 * v[size..] from the top edge.  Row j is a size-pixel window into v[]
 * sliding one position left per row.
 */
#define def_diag_downright(size) \
static void diag_downright_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                              const uint8_t *_left, const uint8_t *_top) \
{ \
   pixel *dst = (pixel *) _dst; \
   const pixel *top = (const pixel *) _top; \
   const pixel *left = (const pixel *) _left; \
   int i, j; \
   pixel v[size + size - 1]; \
\
   stride /= sizeof(pixel); \
   for (i = 0; i < size - 2; i++) { \
       v[i           ] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
       v[size + 1 + i] = (top[i]  + top[i + 1]  * 2 + top[i + 2]  + 2) >> 2; \
   } \
   v[size - 2] = (left[size - 2] + left[size - 1] * 2 + top[-1] + 2) >> 2; \
   v[size - 1] = (left[size - 1] + top[-1] * 2 + top[ 0] + 2) >> 2; \
   v[size    ] = (top[-1] + top[0]  * 2 + top[ 1] + 2) >> 2; \
\
   for (j = 0; j < size; j++) \
       memcpy(dst + j*stride, v + size - 1 - j, size * sizeof(pixel)); \
}

def_diag_downright(8)
def_diag_downright(16)
def_diag_downright(32)
    902 
/*
 * Vertical-right 4x4 intra prediction.
 * Mixes 2-tap rounded averages ((a+b+1)>>1) and 3-tap (1,2,1)/4
 * filters of the top edge, top-left corner and left edge; each value
 * propagates two rows down and one column right.
 */
static void vert_right_4x4_c(uint8_t *_dst, ptrdiff_t stride,
                            const uint8_t *_left, const uint8_t *_top)
{
   pixel *dst = (pixel *) _dst;
   const pixel *top = (const pixel *) _top;
   const pixel *left = (const pixel *) _left;
   int tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
       l0 = left[3], l1 = left[2], l2 = left[1];

   stride /= sizeof(pixel); /* byte stride -> pixel stride */
   DST(0,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
   DST(0,2) = (tl + l0 * 2 + l1 + 2) >> 2;
   DST(0,0) = DST(1,2) = (tl + a0 + 1) >> 1;
   DST(0,1) = DST(1,3) = (l0 + tl * 2 + a0 + 2) >> 2;
   DST(1,0) = DST(2,2) = (a0 + a1 + 1) >> 1;
   DST(1,1) = DST(2,3) = (tl + a0 * 2 + a1 + 2) >> 2;
   DST(2,0) = DST(3,2) = (a1 + a2 + 1) >> 1;
   DST(2,1) = DST(3,3) = (a0 + a1 * 2 + a2 + 2) >> 2;
   DST(3,0) = (a2 + a3 + 1) >> 1;
   DST(3,1) = (a1 + a2 * 2 + a3 + 2) >> 2;
}
    924 
/*
 * def_vert_right(size): vertical-right prediction for size = 8, 16, 32.
 * ve[] feeds the even destination rows (2-tap averages along the top,
 * 3-tap values along the left), vo[] the odd rows; each row pair copies
 * a size-pixel window shifted one position per pair, so the prediction
 * advances one column right for every two rows down.
 */
#define def_vert_right(size) \
static void vert_right_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                          const uint8_t *_left, const uint8_t *_top) \
{ \
   pixel *dst = (pixel *) _dst; \
   const pixel *top = (const pixel *) _top; \
   const pixel *left = (const pixel *) _left; \
   int i, j; \
   pixel ve[size + size/2 - 1], vo[size + size/2 - 1]; \
\
   stride /= sizeof(pixel); \
   for (i = 0; i < size/2 - 2; i++) { \
       vo[i] = (left[i*2 + 3] + left[i*2 + 2] * 2 + left[i*2 + 1] + 2) >> 2; \
       ve[i] = (left[i*2 + 4] + left[i*2 + 3] * 2 + left[i*2 + 2] + 2) >> 2; \
   } \
   vo[size/2 - 2] = (left[size - 1] + left[size - 2] * 2 + left[size - 3] + 2) >> 2; \
   ve[size/2 - 2] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
\
   ve[size/2 - 1] = (top[-1] + top[0] + 1) >> 1; \
   vo[size/2 - 1] = (left[size - 1] + top[-1] * 2 + top[0] + 2) >> 2; \
   for (i = 0; i < size - 1; i++) { \
       ve[size/2 + i] = (top[i] + top[i + 1] + 1) >> 1; \
       vo[size/2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
   } \
\
   for (j = 0; j < size / 2; j++) { \
       memcpy(dst +  j*2     *stride, ve + size/2 - 1 - j, size * sizeof(pixel)); \
       memcpy(dst + (j*2 + 1)*stride, vo + size/2 - 1 - j, size * sizeof(pixel)); \
   } \
}

def_vert_right(8)
def_vert_right(16)
def_vert_right(32)
    959 
/*
 * Horizontal-down 4x4 intra prediction.
 * Transpose-like counterpart of vertical-right: 2-tap averages and
 * 3-tap filters of the left edge, corner and top edge, with each value
 * propagating two columns right and one row down.
 */
static void hor_down_4x4_c(uint8_t *_dst, ptrdiff_t stride,
                          const uint8_t *_left, const uint8_t *_top)
{
   pixel *dst = (pixel *) _dst;
   const pixel *top = (const pixel *) _top;
   const pixel *left = (const pixel *) _left;
   int l0 = left[3], l1 = left[2], l2 = left[1], l3 = left[0],
       tl = top[-1], a0 = top[0], a1 = top[1], a2 = top[2];

   stride /= sizeof(pixel); /* byte stride -> pixel stride */
   DST(2,0) = (tl + a0 * 2 + a1 + 2) >> 2;
   DST(3,0) = (a0 + a1 * 2 + a2 + 2) >> 2;
   DST(0,0) = DST(2,1) = (tl + l0 + 1) >> 1;
   DST(1,0) = DST(3,1) = (a0 + tl * 2 + l0 + 2) >> 2;
   DST(0,1) = DST(2,2) = (l0 + l1 + 1) >> 1;
   DST(1,1) = DST(3,2) = (tl + l0 * 2 + l1 + 2) >> 2;
   DST(0,2) = DST(2,3) = (l1 + l2 + 1) >> 1;
   DST(1,2) = DST(3,3) = (l0 + l1 * 2 + l2 + 2) >> 2;
   DST(0,3) = (l2 + l3 + 1) >> 1;
   DST(1,3) = (l1 + l2 * 2 + l3 + 2) >> 2;
}
    981 
/*
 * def_hor_down(size): horizontal-down prediction for size = 8, 16, 32.
 * v[] is one merged sequence: interleaved 2-tap/3-tap values from the
 * left edge in v[0..2*size-1] (corner terms patched in after the loop)
 * followed by 3-tap filtered top-edge values in v[2*size..].  Row j
 * copies a size-pixel window that slides two positions per row.
 */
#define def_hor_down(size) \
static void hor_down_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                        const uint8_t *_left, const uint8_t *_top) \
{ \
   pixel *dst = (pixel *) _dst; \
   const pixel *top = (const pixel *) _top; \
   const pixel *left = (const pixel *) _left; \
   int i, j; \
   pixel v[size * 3 - 2]; \
\
   stride /= sizeof(pixel); \
   for (i = 0; i < size - 2; i++) { \
       v[i*2       ] = (left[i + 1] + left[i + 0] + 1) >> 1; \
       v[i*2    + 1] = (left[i + 2] + left[i + 1] * 2 + left[i + 0] + 2) >> 2; \
       v[size*2 + i] = (top[i - 1] + top[i] * 2 + top[i + 1] + 2) >> 2; \
   } \
   v[size*2 - 2] = (top[-1] + left[size - 1] + 1) >> 1; \
   v[size*2 - 4] = (left[size - 1] + left[size - 2] + 1) >> 1; \
   v[size*2 - 1] = (top[0]  + top[-1] * 2 + left[size - 1] + 2) >> 2; \
   v[size*2 - 3] = (top[-1] + left[size - 1] * 2 + left[size - 2] + 2) >> 2; \
\
   for (j = 0; j < size; j++) \
       memcpy(dst + j*stride, v + size*2 - 2 - j*2, size * sizeof(pixel)); \
}

def_hor_down(8)
def_hor_down(16)
def_hor_down(32)
   1010 
/*
 * Vertical-left 4x4 intra prediction.
 * Built purely from the top edge top[0..6]: even rows use 2-tap
 * rounded averages, odd rows 3-tap (1,2,1)/4 filters, each row pair
 * shifted one column left relative to the previous pair.  'left' is
 * never used.
 */
static void vert_left_4x4_c(uint8_t *_dst, ptrdiff_t stride,
                           const uint8_t *left, const uint8_t *_top)
{
   pixel *dst = (pixel *) _dst;
   const pixel *top = (const pixel *) _top;
   int a0 = top[0], a1 = top[1], a2 = top[2], a3 = top[3],
       a4 = top[4], a5 = top[5], a6 = top[6];

   stride /= sizeof(pixel); /* byte stride -> pixel stride */
   DST(0,0) = (a0 + a1 + 1) >> 1;
   DST(0,1) = (a0 + a1 * 2 + a2 + 2) >> 2;
   DST(1,0) = DST(0,2) = (a1 + a2 + 1) >> 1;
   DST(1,1) = DST(0,3) = (a1 + a2 * 2 + a3 + 2) >> 2;
   DST(2,0) = DST(1,2) = (a2 + a3 + 1) >> 1;
   DST(2,1) = DST(1,3) = (a2 + a3 * 2 + a4 + 2) >> 2;
   DST(3,0) = DST(2,2) = (a3 + a4 + 1) >> 1;
   DST(3,1) = DST(2,3) = (a3 + a4 * 2 + a5 + 2) >> 2;
   DST(3,2) = (a4 + a5 + 1) >> 1;
   DST(3,3) = (a4 + a5 * 2 + a6 + 2) >> 2;
}
   1031 
/*
 * def_vert_left(size): vertical-left prediction for size = 8, 16, 32.
 * ve[]/vo[] hold the 2-tap/3-tap filtered top edge for even/odd rows
 * (last entry filtered against a tripled top[size-1]); row pair j
 * copies a window starting at offset j and pads the trailing j+1
 * pixels with the unfiltered top[size-1].
 */
#define def_vert_left(size) \
static void vert_left_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                         const uint8_t *left, const uint8_t *_top) \
{ \
   pixel *dst = (pixel *) _dst; \
   const pixel *top = (const pixel *) _top; \
   int i, j; \
   pixel ve[size - 1], vo[size - 1]; \
\
   stride /= sizeof(pixel); \
   for (i = 0; i < size - 2; i++) { \
       ve[i] = (top[i] + top[i + 1] + 1) >> 1; \
       vo[i] = (top[i] + top[i + 1] * 2 + top[i + 2] + 2) >> 2; \
   } \
   ve[size - 2] = (top[size - 2] + top[size - 1] + 1) >> 1; \
   vo[size - 2] = (top[size - 2] + top[size - 1] * 3 + 2) >> 2; \
\
   for (j = 0; j < size / 2; j++) { \
       memcpy(dst +  j*2      * stride, ve + j, (size - j - 1) * sizeof(pixel)); \
       memset_bpc(dst +  j*2      * stride + size - j - 1, top[size - 1], j + 1); \
       memcpy(dst + (j*2 + 1) * stride, vo + j, (size - j - 1) * sizeof(pixel)); \
       memset_bpc(dst + (j*2 + 1) * stride + size - j - 1, top[size - 1], j + 1); \
   } \
}

def_vert_left(8)
def_vert_left(16)
def_vert_left(32)
   1060 
/*
 * Horizontal-up 4x4 intra prediction.
 * Built purely from the left edge (read in ascending index order
 * here): 2-tap averages in even columns, 3-tap filters in odd ones;
 * the bottom-right region saturates to the last left pixel l3.
 * 'top' is never used.
 */
static void hor_up_4x4_c(uint8_t *_dst, ptrdiff_t stride,
                        const uint8_t *_left, const uint8_t *top)
{
   pixel *dst = (pixel *) _dst;
   const pixel *left = (const pixel *) _left;
   int l0 = left[0], l1 = left[1], l2 = left[2], l3 = left[3];

   stride /= sizeof(pixel); /* byte stride -> pixel stride */
   DST(0,0) = (l0 + l1 + 1) >> 1;
   DST(1,0) = (l0 + l1 * 2 + l2 + 2) >> 2;
   DST(0,1) = DST(2,0) = (l1 + l2 + 1) >> 1;
   DST(1,1) = DST(3,0) = (l1 + l2 * 2 + l3 + 2) >> 2;
   DST(0,2) = DST(2,1) = (l2 + l3 + 1) >> 1;
   DST(1,2) = DST(3,1) = (l2 + l3 * 3 + 2) >> 2;
   DST(0,3) = DST(1,3) = DST(2,2) = DST(2,3) = DST(3,2) = DST(3,3) = l3;
}
   1077 
/*
 * def_hor_up(size): horizontal-up prediction for size = 8, 16, 32.
 * v[] interleaves 2-tap and 3-tap filtered left-edge values (the last
 * pair filtered against a tripled left[size-1]); row j copies the
 * window starting at 2*j, and the lower rows pad the tail with the
 * unfiltered left[size-1] once the window runs off the end of v[].
 */
#define def_hor_up(size) \
static void hor_up_##size##x##size##_c(uint8_t *_dst, ptrdiff_t stride, \
                                      const uint8_t *_left, const uint8_t *top) \
{ \
   pixel *dst = (pixel *) _dst; \
   const pixel *left = (const pixel *) _left; \
   int i, j; \
   pixel v[size*2 - 2]; \
\
   stride /= sizeof(pixel); \
   for (i = 0; i < size - 2; i++) { \
       v[i*2    ] = (left[i] + left[i + 1] + 1) >> 1; \
       v[i*2 + 1] = (left[i] + left[i + 1] * 2 + left[i + 2] + 2) >> 2; \
   } \
   v[size*2 - 4] = (left[size - 2] + left[size - 1] + 1) >> 1; \
   v[size*2 - 3] = (left[size - 2] + left[size - 1] * 3 + 2) >> 2; \
\
   for (j = 0; j < size / 2; j++) \
       memcpy(dst + j*stride, v + j*2, size * sizeof(pixel)); \
   for (j = size / 2; j < size; j++) { \
       memcpy(dst + j*stride, v + j*2, (size*2 - 2 - j*2) * sizeof(pixel)); \
       memset_bpc(dst + j*stride + size*2 - 2 - j*2, left[size - 1], \
                  2 + j*2 - size); \
   } \
}

def_hor_up(8)
def_hor_up(16)
def_hor_up(32)
   1107 
   1108 #undef DST
   1109 
   1110 #endif /* BIT_DEPTH != 12 */
   1111 
   1112 #if BIT_DEPTH != 8
   1113 void ff_vp9dsp_intrapred_init_10(VP9DSPContext *dsp);
   1114 #endif
   1115 #if BIT_DEPTH != 10
   1116 static
   1117 #endif
/*
 * Fill the VP9 intra prediction function table for this bit depth.
 * At BIT_DEPTH == 12 the whole table is first populated by the 10 bpp
 * initializer; only the predictors whose output literally depends on
 * BIT_DEPTH (TM and the constant-DC 127/128/129 fills, see
 * init_intra_pred_bd_aware) are overridden afterwards.
 */
av_cold void FUNC(ff_vp9dsp_intrapred_init)(VP9DSPContext *dsp)
{
/* Predictors that must be registered per bit depth in all builds. */
#define init_intra_pred_bd_aware(tx, sz) \
   dsp->intra_pred[tx][TM_VP8_PRED]          = tm_##sz##_c; \
   dsp->intra_pred[tx][DC_128_PRED]          = dc_128_##sz##_c; \
   dsp->intra_pred[tx][DC_127_PRED]          = dc_127_##sz##_c; \
   dsp->intra_pred[tx][DC_129_PRED]          = dc_129_##sz##_c

#if BIT_DEPTH == 12
   /* Reuse the 10 bpp predictors for everything else. */
   ff_vp9dsp_intrapred_init_10(dsp);
#define init_intra_pred(tx, sz) \
   init_intra_pred_bd_aware(tx, sz)
#else
   /* Full table: directional, DC and edge-DC predictors plus the
    * bit-depth-aware set. */
   #define init_intra_pred(tx, sz) \
   dsp->intra_pred[tx][VERT_PRED]            = vert_##sz##_c; \
   dsp->intra_pred[tx][HOR_PRED]             = hor_##sz##_c; \
   dsp->intra_pred[tx][DC_PRED]              = dc_##sz##_c; \
   dsp->intra_pred[tx][DIAG_DOWN_LEFT_PRED]  = diag_downleft_##sz##_c; \
   dsp->intra_pred[tx][DIAG_DOWN_RIGHT_PRED] = diag_downright_##sz##_c; \
   dsp->intra_pred[tx][VERT_RIGHT_PRED]      = vert_right_##sz##_c; \
   dsp->intra_pred[tx][HOR_DOWN_PRED]        = hor_down_##sz##_c; \
   dsp->intra_pred[tx][VERT_LEFT_PRED]       = vert_left_##sz##_c; \
   dsp->intra_pred[tx][HOR_UP_PRED]          = hor_up_##sz##_c; \
   dsp->intra_pred[tx][LEFT_DC_PRED]         = dc_left_##sz##_c; \
   dsp->intra_pred[tx][TOP_DC_PRED]          = dc_top_##sz##_c; \
   init_intra_pred_bd_aware(tx, sz)
#endif

   /* One table row per transform size. */
   init_intra_pred(TX_4X4,   4x4);
   init_intra_pred(TX_8X8,   8x8);
   init_intra_pred(TX_16X16, 16x16);
   init_intra_pred(TX_32X32, 32x32);

#undef init_intra_pred
#undef init_intra_pred_bd_aware
}
   1154 
/*
 * itxfm_wrapper(type_a, type_b, sz, bits, has_dconly)
 * Generates a 2-D inverse-transform-and-add function.  First pass runs
 * the type_a 1-D transform over the columns of the coefficient block
 * (output transposed into tmp[]), second pass runs type_b over tmp[]
 * and adds the result, rounded and right-shifted by 'bits' when
 * nonzero, into the destination with pixel clipping.  The coefficient
 * block is cleared as required by the decoder's reuse of the buffer.
 * When has_dconly is set and eob == 1 only the DC coefficient is live,
 * so a fast path scales it once and adds the same value to every pixel.
 */
#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly) \
static void type_a##_##type_b##_##sz##x##sz##_add_c(uint8_t *_dst, \
                                                   ptrdiff_t stride, \
                                                   int16_t *_block, int eob) \
{ \
   int i, j; \
   pixel *dst = (pixel *) _dst; \
   dctcoef *block = (dctcoef *) _block, tmp[sz * sz], out[sz]; \
\
   stride /= sizeof(pixel); \
   if (has_dconly && eob == 1) { \
       const int t  = ((((dctint) block[0] * 11585 + (1 << 13)) >> 14) \
                                           * 11585 + (1 << 13)) >> 14; \
       block[0] = 0; \
       for (i = 0; i < sz; i++) { \
           for (j = 0; j < sz; j++) \
               dst[j * stride] = av_clip_pixel(dst[j * stride] + \
                                               (bits ? \
                                                (int)(t + (1U << (bits - 1))) >> bits : \
                                                t)); \
           dst++; \
       } \
       return; \
   } \
\
   for (i = 0; i < sz; i++) \
       type_a##sz##_1d(block + i, sz, tmp + i * sz, 0); \
   memset(block, 0, sz * sz * sizeof(*block)); \
   for (i = 0; i < sz; i++) { \
       type_b##sz##_1d(tmp + i, sz, out, 1); \
       for (j = 0; j < sz; j++) \
           dst[j * stride] = av_clip_pixel(dst[j * stride] + \
                                           (bits ? \
                                            (int)(out[j] + (1U << (bits - 1))) >> bits : \
                                            out[j])); \
       dst++; \
   } \
}

/* Instantiate the four DCT/ADST combinations for one transform size;
 * only the pure DCT has a meaningful DC-only fast path. */
#define itxfm_wrap(sz, bits) \
itxfm_wrapper(idct,  idct,  sz, bits, 1) \
itxfm_wrapper(iadst, idct,  sz, bits, 0) \
itxfm_wrapper(idct,  iadst, sz, bits, 0) \
itxfm_wrapper(iadst, iadst, sz, bits, 0)
   1199 
   1200 #define IN(x) ((dctint) in[(x) * stride])
   1201 
   1202 static av_always_inline void idct4_1d(const dctcoef *in, ptrdiff_t stride,
   1203                                      dctcoef *out, int pass)
   1204 {
   1205    dctint t0, t1, t2, t3;
   1206 
   1207    t0 = ((IN(0) + IN(2)) * 11585 + (1 << 13)) >> 14;
   1208    t1 = ((IN(0) - IN(2)) * 11585 + (1 << 13)) >> 14;
   1209    t2 = (IN(1) *  6270 - IN(3) * 15137 + (1 << 13)) >> 14;
   1210    t3 = (IN(1) * 15137 + IN(3) *  6270 + (1 << 13)) >> 14;
   1211 
   1212    out[0] = t0 + t3;
   1213    out[1] = t1 + t2;
   1214    out[2] = t1 - t2;
   1215    out[3] = t0 - t3;
   1216 }
   1217 
   1218 static av_always_inline void iadst4_1d(const dctcoef *in, ptrdiff_t stride,
   1219                                       dctcoef *out, int pass)
   1220 {
   1221    dctint t0, t1, t2, t3;
   1222 
   1223    t0 =  5283 * IN(0) + 15212 * IN(2) +  9929 * IN(3);
   1224    t1 =  9929 * IN(0) -  5283 * IN(2) - 15212 * IN(3);
   1225    t2 = 13377 * (IN(0) - IN(2) + IN(3));
   1226    t3 = 13377 * IN(1);
   1227 
   1228    out[0] = (t0 + t3      + (1 << 13)) >> 14;
   1229    out[1] = (t1 + t3      + (1 << 13)) >> 14;
   1230    out[2] = (t2           + (1 << 13)) >> 14;
   1231    out[3] = (t0 + t1 - t3 + (1 << 13)) >> 14;
   1232 }
   1233 
   1234 itxfm_wrap(4, 4)
   1235 
   1236 static av_always_inline void idct8_1d(const dctcoef *in, ptrdiff_t stride,
   1237                                      dctcoef *out, int pass)
   1238 {
   1239    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
   1240 
   1241    t0a = ((IN(0) + IN(4)) * 11585 + (1 << 13)) >> 14;
   1242    t1a = ((IN(0) - IN(4)) * 11585 + (1 << 13)) >> 14;
   1243    t2a = (IN(2) *  6270 - IN(6) * 15137 + (1 << 13)) >> 14;
   1244    t3a = (IN(2) * 15137 + IN(6) *  6270 + (1 << 13)) >> 14;
   1245    t4a = (IN(1) *  3196 - IN(7) * 16069 + (1 << 13)) >> 14;
   1246    t5a = (IN(5) * 13623 - IN(3) *  9102 + (1 << 13)) >> 14;
   1247    t6a = (IN(5) *  9102 + IN(3) * 13623 + (1 << 13)) >> 14;
   1248    t7a = (IN(1) * 16069 + IN(7) *  3196 + (1 << 13)) >> 14;
   1249 
   1250    t0  = t0a + t3a;
   1251    t1  = t1a + t2a;
   1252    t2  = t1a - t2a;
   1253    t3  = t0a - t3a;
   1254    t4  = t4a + t5a;
   1255    t5a = t4a - t5a;
   1256    t7  = t7a + t6a;
   1257    t6a = t7a - t6a;
   1258 
   1259    t5  = ((t6a - t5a) * 11585 + (1 << 13)) >> 14;
   1260    t6  = ((t6a + t5a) * 11585 + (1 << 13)) >> 14;
   1261 
   1262    out[0] = t0 + t7;
   1263    out[1] = t1 + t6;
   1264    out[2] = t2 + t5;
   1265    out[3] = t3 + t4;
   1266    out[4] = t3 - t4;
   1267    out[5] = t2 - t5;
   1268    out[6] = t1 - t6;
   1269    out[7] = t0 - t7;
   1270 }
   1271 
   1272 static av_always_inline void iadst8_1d(const dctcoef *in, ptrdiff_t stride,
   1273                                       dctcoef *out, int pass)
   1274 {
   1275    dctint t0, t0a, t1, t1a, t2, t2a, t3, t3a, t4, t4a, t5, t5a, t6, t6a, t7, t7a;
   1276 
   1277    t0a = 16305 * IN(7) +  1606 * IN(0);
   1278    t1a =  1606 * IN(7) - 16305 * IN(0);
   1279    t2a = 14449 * IN(5) +  7723 * IN(2);
   1280    t3a =  7723 * IN(5) - 14449 * IN(2);
   1281    t4a = 10394 * IN(3) + 12665 * IN(4);
   1282    t5a = 12665 * IN(3) - 10394 * IN(4);
   1283    t6a =  4756 * IN(1) + 15679 * IN(6);
   1284    t7a = 15679 * IN(1) -  4756 * IN(6);
   1285 
   1286    t0 = (t0a + t4a + (1 << 13)) >> 14;
   1287    t1 = (t1a + t5a + (1 << 13)) >> 14;
   1288    t2 = (t2a + t6a + (1 << 13)) >> 14;
   1289    t3 = (t3a + t7a + (1 << 13)) >> 14;
   1290    t4 = (t0a - t4a + (1 << 13)) >> 14;
   1291    t5 = (t1a - t5a + (1 << 13)) >> 14;
   1292    t6 = (t2a - t6a + (1 << 13)) >> 14;
   1293    t7 = (t3a - t7a + (1 << 13)) >> 14;
   1294 
   1295    t4a = 15137U * t4 +  6270U * t5;
   1296    t5a =  6270U * t4 - 15137U * t5;
   1297    t6a = 15137U * t7 -  6270U * t6;
   1298    t7a =  6270U * t7 + 15137U * t6;
   1299 
   1300    out[0] =   t0 + t2;
   1301    out[7] = -(t1 + t3);
   1302    t2     =   t0 - t2;
   1303    t3     =   t1 - t3;
   1304 
   1305    out[1] = -((dctint)((1U << 13) + t4a + t6a) >> 14);
   1306    out[6] =   (dctint)((1U << 13) + t5a + t7a) >> 14;
   1307    t6     =   (dctint)((1U << 13) + t4a - t6a) >> 14;
   1308    t7     =   (dctint)((1U << 13) + t5a - t7a) >> 14;
   1309 
   1310    out[3] = -((dctint)((t2 + t3) * 11585U + (1 << 13)) >> 14);
   1311    out[4] =   (dctint)((t2 - t3) * 11585U + (1 << 13)) >> 14;
   1312    out[2] =   (dctint)((t6 + t7) * 11585U + (1 << 13)) >> 14;
   1313    out[5] = -((dctint)((t6 - t7) * 11585U + (1 << 13)) >> 14);
   1314 }
   1315 
   1316 itxfm_wrap(8, 5)
   1317 
/*
 * 1-D 16-point inverse DCT pass.
 * Reads 16 coefficients through IN() (strided) and writes 16 contiguous
 * outputs, rounded at 14-bit fixed-point precision.  All multiplies use
 * unsigned arithmetic so corrupt bitstreams with out-of-range
 * coefficients wrap (well defined) rather than causing signed-overflow
 * UB; the result is cast back to dctint before the arithmetic shift.
 */
static av_always_inline void idct16_1d(const dctcoef *in, ptrdiff_t stride,
                                      dctcoef *out, int pass)
{
   dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
   dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
   dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;

   t0a  = (dctint)((IN(0) + IN(8)) * 11585U + (1 << 13)) >> 14;
   t1a  = (dctint)((IN(0) - IN(8)) * 11585U + (1 << 13)) >> 14;
   t2a  = (dctint)(IN(4)  *  6270U - IN(12) * 15137U + (1 << 13)) >> 14;
   t3a  = (dctint)(IN(4)  * 15137U + IN(12) *  6270U + (1 << 13)) >> 14;
   t4a  = (dctint)(IN(2)  *  3196U - IN(14) * 16069U + (1 << 13)) >> 14;
   t7a  = (dctint)(IN(2)  * 16069U + IN(14) *  3196U + (1 << 13)) >> 14;
   t5a  = (dctint)(IN(10) * 13623U - IN(6)  *  9102U + (1 << 13)) >> 14;
   t6a  = (dctint)(IN(10) *  9102U + IN(6)  * 13623U + (1 << 13)) >> 14;
   t8a  = (dctint)(IN(1)  *  1606U - IN(15) * 16305U + (1 << 13)) >> 14;
   t15a = (dctint)(IN(1)  * 16305U + IN(15) *  1606U + (1 << 13)) >> 14;
   t9a  = (dctint)(IN(9)  * 12665U - IN(7)  * 10394U + (1 << 13)) >> 14;
   t14a = (dctint)(IN(9)  * 10394U + IN(7)  * 12665U + (1 << 13)) >> 14;
   t10a = (dctint)(IN(5)  *  7723U - IN(11) * 14449U + (1 << 13)) >> 14;
   t13a = (dctint)(IN(5)  * 14449U + IN(11) *  7723U + (1 << 13)) >> 14;
   t11a = (dctint)(IN(13) * 15679U - IN(3)  *  4756U + (1 << 13)) >> 14;
   t12a = (dctint)(IN(13) *  4756U + IN(3)  * 15679U + (1 << 13)) >> 14;

   /* first butterfly stage */
   t0  = t0a  + t3a;
   t1  = t1a  + t2a;
   t2  = t1a  - t2a;
   t3  = t0a  - t3a;
   t4  = t4a  + t5a;
   t5  = t4a  - t5a;
   t6  = t7a  - t6a;
   t7  = t7a  + t6a;
   t8  = t8a  + t9a;
   t9  = t8a  - t9a;
   t10 = t11a - t10a;
   t11 = t11a + t10a;
   t12 = t12a + t13a;
   t13 = t12a - t13a;
   t14 = t15a - t14a;
   t15 = t15a + t14a;

   t5a  = (dctint)((t6 - t5) * 11585U + (1 << 13)) >> 14;
   t6a  = (dctint)((t6 + t5) * 11585U + (1 << 13)) >> 14;
   t9a  = (dctint)(  t14 *  6270U - t9  * 15137U  + (1 << 13)) >> 14;
   t14a = (dctint)(  t14 * 15137U + t9  *  6270U  + (1 << 13)) >> 14;
   t10a = (dctint)(-(t13 * 15137U + t10 *  6270U) + (1 << 13)) >> 14;
   t13a = (dctint)(  t13 *  6270U - t10 * 15137U  + (1 << 13)) >> 14;

   /* second butterfly stage */
   t0a  = t0   + t7;
   t1a  = t1   + t6a;
   t2a  = t2   + t5a;
   t3a  = t3   + t4;
   t4   = t3   - t4;
   t5   = t2   - t5a;
   t6   = t1   - t6a;
   t7   = t0   - t7;
   t8a  = t8   + t11;
   t9   = t9a  + t10a;
   t10  = t9a  - t10a;
   t11a = t8   - t11;
   t12a = t15  - t12;
   t13  = t14a - t13a;
   t14  = t14a + t13a;
   t15a = t15  + t12;

   t10a = (dctint)((t13  - t10)  * 11585U + (1 << 13)) >> 14;
   t13a = (dctint)((t13  + t10)  * 11585U + (1 << 13)) >> 14;
   t11  = (dctint)((t12a - t11a) * 11585U + (1 << 13)) >> 14;
   t12  = (dctint)((t12a + t11a) * 11585U + (1 << 13)) >> 14;

   /* final stage */
   out[ 0] = t0a + t15a;
   out[ 1] = t1a + t14;
   out[ 2] = t2a + t13a;
   out[ 3] = t3a + t12;
   out[ 4] = t4  + t11;
   out[ 5] = t5  + t10a;
   out[ 6] = t6  + t9;
   out[ 7] = t7  + t8a;
   out[ 8] = t7  - t8a;
   out[ 9] = t6  - t9;
   out[10] = t5  - t10a;
   out[11] = t4  - t11;
   out[12] = t3a - t12;
   out[13] = t2a - t13a;
   out[14] = t1a - t14;
   out[15] = t0a - t15a;
}
   1405 
/*
 * 1-D 16-point inverse ADST pass.
 * Reads 16 coefficients through IN() (strided) and writes 16
 * contiguous outputs, rounded at 14-bit fixed-point precision.  All
 * products and rounding sums are computed in unsigned arithmetic so
 * out-of-range coefficients from corrupt bitstreams wrap instead of
 * causing signed-overflow UB; results are cast back to dctint before
 * the arithmetic shift.
 */
static av_always_inline void iadst16_1d(const dctcoef *in, ptrdiff_t stride,
                                       dctcoef *out, int pass)
{
   dctint t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
   dctint t0a, t1a, t2a, t3a, t4a, t5a, t6a, t7a;
   dctint t8a, t9a, t10a, t11a, t12a, t13a, t14a, t15a;

   t0  = IN(15) * 16364U + IN(0)  *   804U;
   t1  = IN(15) *   804U - IN(0)  * 16364U;
   t2  = IN(13) * 15893U + IN(2)  *  3981U;
   t3  = IN(13) *  3981U - IN(2)  * 15893U;
   t4  = IN(11) * 14811U + IN(4)  *  7005U;
   t5  = IN(11) *  7005U - IN(4)  * 14811U;
   t6  = IN(9)  * 13160U + IN(6)  *  9760U;
   t7  = IN(9)  *  9760U - IN(6)  * 13160U;
   t8  = IN(7)  * 11003U + IN(8)  * 12140U;
   t9  = IN(7)  * 12140U - IN(8)  * 11003U;
   t10 = IN(5)  *  8423U + IN(10) * 14053U;
   t11 = IN(5)  * 14053U - IN(10) *  8423U;
   t12 = IN(3)  *  5520U + IN(12) * 15426U;
   t13 = IN(3)  * 15426U - IN(12) *  5520U;
   t14 = IN(1)  *  2404U + IN(14) * 16207U;
   t15 = IN(1)  * 16207U - IN(14) *  2404U;

   /* first rounding/butterfly stage */
   t0a  = (dctint)((1U << 13) + t0 + t8 ) >> 14;
   t1a  = (dctint)((1U << 13) + t1 + t9 ) >> 14;
   t2a  = (dctint)((1U << 13) + t2 + t10) >> 14;
   t3a  = (dctint)((1U << 13) + t3 + t11) >> 14;
   t4a  = (dctint)((1U << 13) + t4 + t12) >> 14;
   t5a  = (dctint)((1U << 13) + t5 + t13) >> 14;
   t6a  = (dctint)((1U << 13) + t6 + t14) >> 14;
   t7a  = (dctint)((1U << 13) + t7 + t15) >> 14;
   t8a  = (dctint)((1U << 13) + t0 - t8 ) >> 14;
   t9a  = (dctint)((1U << 13) + t1 - t9 ) >> 14;
   t10a = (dctint)((1U << 13) + t2 - t10) >> 14;
   t11a = (dctint)((1U << 13) + t3 - t11) >> 14;
   t12a = (dctint)((1U << 13) + t4 - t12) >> 14;
   t13a = (dctint)((1U << 13) + t5 - t13) >> 14;
   t14a = (dctint)((1U << 13) + t6 - t14) >> 14;
   t15a = (dctint)((1U << 13) + t7 - t15) >> 14;

   t8   = t8a  * 16069U + t9a  *  3196U;
   t9   = t8a  *  3196U - t9a  * 16069U;
   t10  = t10a *  9102U + t11a * 13623U;
   t11  = t10a * 13623U - t11a *  9102U;
   t12  = t13a * 16069U - t12a *  3196U;
   t13  = t13a *  3196U + t12a * 16069U;
   t14  = t15a *  9102U - t14a * 13623U;
   t15  = t15a * 13623U + t14a *  9102U;

   /* second butterfly stage */
   t0   = t0a + t4a;
   t1   = t1a + t5a;
   t2   = t2a + t6a;
   t3   = t3a + t7a;
   t4   = t0a - t4a;
   t5   = t1a - t5a;
   t6   = t2a - t6a;
   t7   = t3a - t7a;
   t8a  = (dctint)((1U << 13) + t8  + t12) >> 14;
   t9a  = (dctint)((1U << 13) + t9  + t13) >> 14;
   t10a = (dctint)((1U << 13) + t10 + t14) >> 14;
   t11a = (dctint)((1U << 13) + t11 + t15) >> 14;
   t12a = (dctint)((1U << 13) + t8  - t12) >> 14;
   t13a = (dctint)((1U << 13) + t9  - t13) >> 14;
   t14a = (dctint)((1U << 13) + t10 - t14) >> 14;
   t15a = (dctint)((1U << 13) + t11 - t15) >> 14;

   t4a  = t4 * 15137U + t5 *  6270U;
   t5a  = t4 *  6270U - t5 * 15137U;
   t6a  = t7 * 15137U - t6 *  6270U;
   t7a  = t7 *  6270U + t6 * 15137U;
   t12  = t12a * 15137U + t13a *  6270U;
   t13  = t12a *  6270U - t13a * 15137U;
   t14  = t15a * 15137U - t14a *  6270U;
   t15  = t15a *  6270U + t14a * 15137U;

   /* output stage (interleaved with the last butterflies) */
   out[ 0] =   t0 + t2;
   out[15] = -(t1 + t3);
   t2a     =   t0 - t2;
   t3a     =   t1 - t3;
   out[ 3] = -((dctint)((1U << 13) + t4a + t6a) >> 14);
   out[12] =   (dctint)((1U << 13) + t5a + t7a) >> 14;
   t6      =   (dctint)((1U << 13) + t4a - t6a) >> 14;
   t7      =   (dctint)((1U << 13) + t5a - t7a) >> 14;
   out[ 1] = -(t8a + t10a);
   out[14] =   t9a + t11a;
   t10     =   t8a - t10a;
   t11     =   t9a - t11a;
   out[ 2] =   (dctint)((1U << 13) + t12 + t14) >> 14;
   out[13] = -((dctint)((1U << 13) + t13 + t15) >> 14);
   t14a    =   (dctint)((1U << 13) + t12 - t14) >> 14;
   t15a    =   (dctint)((1U << 13) + t13 - t15) >> 14;

   out[ 7] = (dctint)(-(t2a  + t3a)  * 11585U  + (1 << 13)) >> 14;
   out[ 8] = (dctint)( (t2a  - t3a)  * 11585U  + (1 << 13)) >> 14;
   out[ 4] = (dctint)( (t7   + t6)   * 11585U  + (1 << 13)) >> 14;
   out[11] = (dctint)( (t7   - t6)   * 11585U  + (1 << 13)) >> 14;
   out[ 6] = (dctint)( (t11  + t10)  * 11585U  + (1 << 13)) >> 14;
   out[ 9] = (dctint)( (t11  - t10)  * 11585U  + (1 << 13)) >> 14;
   out[ 5] = (dctint)(-(t14a + t15a) * 11585U  + (1 << 13)) >> 14;
   out[10] = (dctint)( (t14a - t15a) * 11585U  + (1 << 13)) >> 14;
}

itxfm_wrap(16, 6)
   1510 
/*
 * 1D 32-point inverse DCT, used for both passes of the 32x32 inverse
 * transform (the wrapper calls it once per row, then once per column).
 * Reads 32 coefficients via IN() at the given stride and writes 32
 * results contiguously to out[].
 *
 * The constants (11585, 15137, 6270, ...) are cos/sin rotation factors
 * in Q14 fixed point; every rotation is rounded with +(1 << 13) and
 * rescaled with >> 14.  Multiplications use unsigned arithmetic (the
 * 'U' suffixes) to avoid signed-overflow UB, then cast back to dctint.
 */
static av_always_inline void idct32_1d(const dctcoef *in, ptrdiff_t stride,
                                      dctcoef *out, int pass)
{
   /* Stage 1: initial rotations on the bit-reversed input pairs. */
   dctint t0a  = (dctint)((IN(0) + IN(16)) * 11585U         + (1 << 13)) >> 14;
   dctint t1a  = (dctint)((IN(0) - IN(16)) * 11585U         + (1 << 13)) >> 14;
   dctint t2a  = (dctint)(IN( 8) *  6270U - IN(24) * 15137U + (1 << 13)) >> 14;
   dctint t3a  = (dctint)(IN( 8) * 15137U + IN(24) *  6270U + (1 << 13)) >> 14;
   dctint t4a  = (dctint)(IN( 4) *  3196U - IN(28) * 16069U + (1 << 13)) >> 14;
   dctint t7a  = (dctint)(IN( 4) * 16069U + IN(28) *  3196U + (1 << 13)) >> 14;
   dctint t5a  = (dctint)(IN(20) * 13623U - IN(12) *  9102U + (1 << 13)) >> 14;
   dctint t6a  = (dctint)(IN(20) *  9102U + IN(12) * 13623U + (1 << 13)) >> 14;
   dctint t8a  = (dctint)(IN( 2) *  1606U - IN(30) * 16305U + (1 << 13)) >> 14;
   dctint t15a = (dctint)(IN( 2) * 16305U + IN(30) *  1606U + (1 << 13)) >> 14;
   dctint t9a  = (dctint)(IN(18) * 12665U - IN(14) * 10394U + (1 << 13)) >> 14;
   dctint t14a = (dctint)(IN(18) * 10394U + IN(14) * 12665U + (1 << 13)) >> 14;
   dctint t10a = (dctint)(IN(10) *  7723U - IN(22) * 14449U + (1 << 13)) >> 14;
   dctint t13a = (dctint)(IN(10) * 14449U + IN(22) *  7723U + (1 << 13)) >> 14;
   dctint t11a = (dctint)(IN(26) * 15679U - IN( 6) *  4756U + (1 << 13)) >> 14;
   dctint t12a = (dctint)(IN(26) *  4756U + IN( 6) * 15679U + (1 << 13)) >> 14;
   dctint t16a = (dctint)(IN( 1) *   804U - IN(31) * 16364U + (1 << 13)) >> 14;
   dctint t31a = (dctint)(IN( 1) * 16364U + IN(31) *   804U + (1 << 13)) >> 14;
   dctint t17a = (dctint)(IN(17) * 12140U - IN(15) * 11003U + (1 << 13)) >> 14;
   dctint t30a = (dctint)(IN(17) * 11003U + IN(15) * 12140U + (1 << 13)) >> 14;
   dctint t18a = (dctint)(IN( 9) *  7005U - IN(23) * 14811U + (1 << 13)) >> 14;
   dctint t29a = (dctint)(IN( 9) * 14811U + IN(23) *  7005U + (1 << 13)) >> 14;
   dctint t19a = (dctint)(IN(25) * 15426U - IN( 7) *  5520U + (1 << 13)) >> 14;
   dctint t28a = (dctint)(IN(25) *  5520U + IN( 7) * 15426U + (1 << 13)) >> 14;
   dctint t20a = (dctint)(IN( 5) *  3981U - IN(27) * 15893U + (1 << 13)) >> 14;
   dctint t27a = (dctint)(IN( 5) * 15893U + IN(27) *  3981U + (1 << 13)) >> 14;
   dctint t21a = (dctint)(IN(21) * 14053U - IN(11) *  8423U + (1 << 13)) >> 14;
   dctint t26a = (dctint)(IN(21) *  8423U + IN(11) * 14053U + (1 << 13)) >> 14;
   dctint t22a = (dctint)(IN(13) *  9760U - IN(19) * 13160U + (1 << 13)) >> 14;
   dctint t25a = (dctint)(IN(13) * 13160U + IN(19) *  9760U + (1 << 13)) >> 14;
   dctint t23a = (dctint)(IN(29) * 16207U - IN( 3) *  2404U + (1 << 13)) >> 14;
   dctint t24a = (dctint)(IN(29) *  2404U + IN( 3) * 16207U + (1 << 13)) >> 14;

   /* Stage 2: add/sub butterflies across the stage-1 pairs. */
   dctint t0  = t0a  + t3a;
   dctint t1  = t1a  + t2a;
   dctint t2  = t1a  - t2a;
   dctint t3  = t0a  - t3a;
   dctint t4  = t4a  + t5a;
   dctint t5  = t4a  - t5a;
   dctint t6  = t7a  - t6a;
   dctint t7  = t7a  + t6a;
   dctint t8  = t8a  + t9a;
   dctint t9  = t8a  - t9a;
   dctint t10 = t11a - t10a;
   dctint t11 = t11a + t10a;
   dctint t12 = t12a + t13a;
   dctint t13 = t12a - t13a;
   dctint t14 = t15a - t14a;
   dctint t15 = t15a + t14a;
   dctint t16 = t16a + t17a;
   dctint t17 = t16a - t17a;
   dctint t18 = t19a - t18a;
   dctint t19 = t19a + t18a;
   dctint t20 = t20a + t21a;
   dctint t21 = t20a - t21a;
   dctint t22 = t23a - t22a;
   dctint t23 = t23a + t22a;
   dctint t24 = t24a + t25a;
   dctint t25 = t24a - t25a;
   dctint t26 = t27a - t26a;
   dctint t27 = t27a + t26a;
   dctint t28 = t28a + t29a;
   dctint t29 = t28a - t29a;
   dctint t30 = t31a - t30a;
   dctint t31 = t31a + t30a;

   /* Stage 3: rotations on the middle terms (negated products where the
    * reference transform requires a sign flip). */
   t5a  = (dctint)((t6 - t5) * 11585U             + (1 << 13)) >> 14;
   t6a  = (dctint)((t6 + t5) * 11585U             + (1 << 13)) >> 14;
   t9a  = (dctint)(  t14 *  6270U - t9  * 15137U  + (1 << 13)) >> 14;
   t14a = (dctint)(  t14 * 15137U + t9  *  6270U  + (1 << 13)) >> 14;
   t10a = (dctint)(-(t13 * 15137U + t10 *  6270U) + (1 << 13)) >> 14;
   t13a = (dctint)(  t13 *  6270U - t10 * 15137U  + (1 << 13)) >> 14;
   t17a = (dctint)(  t30 *  3196U - t17 * 16069U  + (1 << 13)) >> 14;
   t30a = (dctint)(  t30 * 16069U + t17 *  3196U  + (1 << 13)) >> 14;
   t18a = (dctint)(-(t29 * 16069U + t18 *  3196U) + (1 << 13)) >> 14;
   t29a = (dctint)(  t29 *  3196U - t18 * 16069U  + (1 << 13)) >> 14;
   t21a = (dctint)(  t26 * 13623U - t21 *  9102U  + (1 << 13)) >> 14;
   t26a = (dctint)(  t26 *  9102U + t21 * 13623U  + (1 << 13)) >> 14;
   t22a = (dctint)(-(t25 *  9102U + t22 * 13623U) + (1 << 13)) >> 14;
   t25a = (dctint)(  t25 * 13623U - t22 *  9102U  + (1 << 13)) >> 14;

   /* Stage 4: second butterfly layer. */
   t0a  = t0   + t7;
   t1a  = t1   + t6a;
   t2a  = t2   + t5a;
   t3a  = t3   + t4;
   t4a  = t3   - t4;
   t5   = t2   - t5a;
   t6   = t1   - t6a;
   t7a  = t0   - t7;
   t8a  = t8   + t11;
   t9   = t9a  + t10a;
   t10  = t9a  - t10a;
   t11a = t8   - t11;
   t12a = t15  - t12;
   t13  = t14a - t13a;
   t14  = t14a + t13a;
   t15a = t15  + t12;
   t16a = t16  + t19;
   t17  = t17a + t18a;
   t18  = t17a - t18a;
   t19a = t16  - t19;
   t20a = t23  - t20;
   t21  = t22a - t21a;
   t22  = t22a + t21a;
   t23a = t23  + t20;
   t24a = t24  + t27;
   t25  = t25a + t26a;
   t26  = t25a - t26a;
   t27a = t24  - t27;
   t28a = t31  - t28;
   t29  = t30a - t29a;
   t30  = t30a + t29a;
   t31a = t31  + t28;

   /* Stage 5: remaining rotations (11585 = sqrt(1/2) in Q14). */
   t10a = (dctint)((t13  - t10)  * 11585U           + (1 << 13)) >> 14;
   t13a = (dctint)((t13  + t10)  * 11585U           + (1 << 13)) >> 14;
   t11  = (dctint)((t12a - t11a) * 11585U           + (1 << 13)) >> 14;
   t12  = (dctint)((t12a + t11a) * 11585U           + (1 << 13)) >> 14;
   t18a = (dctint)(  t29  *  6270U - t18  * 15137U  + (1 << 13)) >> 14;
   t29a = (dctint)(  t29  * 15137U + t18  *  6270U  + (1 << 13)) >> 14;
   t19  = (dctint)(  t28a *  6270U - t19a * 15137U  + (1 << 13)) >> 14;
   t28  = (dctint)(  t28a * 15137U + t19a *  6270U  + (1 << 13)) >> 14;
   t20  = (dctint)(-(t27a * 15137U + t20a *  6270U) + (1 << 13)) >> 14;
   t27  = (dctint)(  t27a *  6270U - t20a * 15137U  + (1 << 13)) >> 14;
   t21a = (dctint)(-(t26  * 15137U + t21  *  6270U) + (1 << 13)) >> 14;
   t26a = (dctint)(  t26  *  6270U - t21  * 15137U  + (1 << 13)) >> 14;

   /* Stage 6: third butterfly layer, merging the 16-point halves. */
   t0   = t0a + t15a;
   t1   = t1a + t14;
   t2   = t2a + t13a;
   t3   = t3a + t12;
   t4   = t4a + t11;
   t5a  = t5  + t10a;
   t6a  = t6  + t9;
   t7   = t7a + t8a;
   t8   = t7a - t8a;
   t9a  = t6  - t9;
   t10  = t5  - t10a;
   t11a = t4a - t11;
   t12a = t3a - t12;
   t13  = t2a - t13a;
   t14a = t1a - t14;
   t15  = t0a - t15a;
   t16  = t16a + t23a;
   t17a = t17  + t22;
   t18  = t18a + t21a;
   t19a = t19  + t20;
   t20a = t19  - t20;
   t21  = t18a - t21a;
   t22a = t17  - t22;
   t23  = t16a - t23a;
   t24  = t31a - t24a;
   t25a = t30  - t25;
   t26  = t29a - t26a;
   t27a = t28  - t27;
   t28a = t28  + t27;
   t29  = t29a + t26a;
   t30a = t30  + t25;
   t31  = t31a + t24a;

   /* Stage 7: last sqrt(1/2) rotations on the odd half. */
   t20  = (dctint)((t27a - t20a) * 11585U + (1 << 13)) >> 14;
   t27  = (dctint)((t27a + t20a) * 11585U + (1 << 13)) >> 14;
   t21a = (dctint)((t26  - t21 ) * 11585U + (1 << 13)) >> 14;
   t26a = (dctint)((t26  + t21 ) * 11585U + (1 << 13)) >> 14;
   t22  = (dctint)((t25a - t22a) * 11585U + (1 << 13)) >> 14;
   t25  = (dctint)((t25a + t22a) * 11585U + (1 << 13)) >> 14;
   t23a = (dctint)((t24  - t23 ) * 11585U + (1 << 13)) >> 14;
   t24a = (dctint)((t24  + t23 ) * 11585U + (1 << 13)) >> 14;

   /* Final butterfly: even half plus/minus the odd half gives out[0..31]. */
   out[ 0] = t0   + t31;
   out[ 1] = t1   + t30a;
   out[ 2] = t2   + t29;
   out[ 3] = t3   + t28a;
   out[ 4] = t4   + t27;
   out[ 5] = t5a  + t26a;
   out[ 6] = t6a  + t25;
   out[ 7] = t7   + t24a;
   out[ 8] = t8   + t23a;
   out[ 9] = t9a  + t22;
   out[10] = t10  + t21a;
   out[11] = t11a + t20;
   out[12] = t12a + t19a;
   out[13] = t13  + t18;
   out[14] = t14a + t17a;
   out[15] = t15  + t16;
   out[16] = t15  - t16;
   out[17] = t14a - t17a;
   out[18] = t13  - t18;
   out[19] = t12a - t19a;
   out[20] = t11a - t20;
   out[21] = t10  - t21a;
   out[22] = t9a  - t22;
   out[23] = t8   - t23a;
   out[24] = t7   - t24a;
   out[25] = t6a  - t25;
   out[26] = t5a  - t26a;
   out[27] = t4   - t27;
   out[28] = t3   - t28a;
   out[29] = t2   - t29;
   out[30] = t1   - t30a;
   out[31] = t0   - t31;
}

/* 32x32 supports only the DCT_DCT mode; downshift 6, pass-1 rounding on. */
itxfm_wrapper(idct, idct, 32, 6, 1)
   1718 
   1719 static av_always_inline void iwht4_1d(const dctcoef *in, ptrdiff_t stride,
   1720                                      dctcoef *out, int pass)
   1721 {
   1722    int t0, t1, t2, t3, t4;
   1723 
   1724    if (pass == 0) {
   1725        t0 = IN(0) >> 2;
   1726        t1 = IN(3) >> 2;
   1727        t2 = IN(1) >> 2;
   1728        t3 = IN(2) >> 2;
   1729    } else {
   1730        t0 = IN(0);
   1731        t1 = IN(3);
   1732        t2 = IN(1);
   1733        t3 = IN(2);
   1734    }
   1735 
   1736    t0 += t2;
   1737    t3 -= t1;
   1738    t4 = (t0 - t3) >> 1;
   1739    t1 = t4 - t1;
   1740    t2 = t4 - t2;
   1741    t0 -= t1;
   1742    t3 += t2;
   1743 
   1744    out[0] = t0;
   1745    out[1] = t1;
   1746    out[2] = t2;
   1747    out[3] = t3;
   1748 }
   1749 
/* Instantiate the lossless 4x4 WHT add function (no rounding, no shift). */
itxfm_wrapper(iwht, iwht, 4, 0, 0)

#undef IN
#undef itxfm_wrapper
#undef itxfm_wrap
   1755 
/* Fill the itxfm_add function table: one entry per transform size and
 * per row/column transform-type combination. */
static av_cold void vp9dsp_itxfm_init(VP9DSPContext *dsp)
{
/* Sizes that support all four DCT/ADST combinations. */
#define init_itxfm(tx, sz) \
   dsp->itxfm_add[tx][DCT_DCT]   = idct_idct_##sz##_add_c; \
   dsp->itxfm_add[tx][DCT_ADST]  = iadst_idct_##sz##_add_c; \
   dsp->itxfm_add[tx][DCT_ADST]  = iadst_idct_##sz##_add_c; \
   dsp->itxfm_add[tx][ADST_DCT]  = idct_iadst_##sz##_add_c; \
   dsp->itxfm_add[tx][ADST_ADST] = iadst_iadst_##sz##_add_c

/* Sizes with a single transform: the same function serves all slots. */
#define init_idct(tx, nm) \
   dsp->itxfm_add[tx][DCT_DCT]   = \
   dsp->itxfm_add[tx][ADST_DCT]  = \
   dsp->itxfm_add[tx][DCT_ADST]  = \
   dsp->itxfm_add[tx][ADST_ADST] = nm##_add_c

   init_itxfm(TX_4X4,   4x4);
   init_itxfm(TX_8X8,   8x8);
   init_itxfm(TX_16X16, 16x16);
   init_idct(TX_32X32,  idct_idct_32x32);
   init_idct(4 /* lossless */, iwht_iwht_4x4);

#undef init_itxfm
#undef init_idct
}
   1779 
/*
 * VP9 in-loop deblocking filter for one edge segment of 8 pixels.
 *
 * E, I and H are the edge/inside/high-edge-variance thresholds from the
 * bitstream, expressed in 8-bit units and scaled up to the template's
 * bit depth on entry.  'stridea' steps along the edge (one iteration per
 * pixel position), 'strideb' steps across it (p3..p0 on one side, q0..q3
 * on the other).  'wd' is the maximum filter width (4, 8 or 16) and is a
 * compile-time constant at every call site, so the unused branches are
 * removed by the inliner.
 */
static av_always_inline void loop_filter(pixel *dst, int E, int I, int H,
                                        ptrdiff_t stridea, ptrdiff_t strideb,
                                        int wd)
{
   /* F is "1" in the current bit depth; used by the flatness tests. */
   int i, F = 1 << (BIT_DEPTH - 8);

   E <<= (BIT_DEPTH - 8);
   I <<= (BIT_DEPTH - 8);
   H <<= (BIT_DEPTH - 8);
   for (i = 0; i < 8; i++, dst += stridea) {
       int p7, p6, p5, p4;
       int p3 = dst[strideb * -4], p2 = dst[strideb * -3];
       int p1 = dst[strideb * -2], p0 = dst[strideb * -1];
       int q0 = dst[strideb * +0], q1 = dst[strideb * +1];
       int q2 = dst[strideb * +2], q3 = dst[strideb * +3];
       int q4, q5, q6, q7;
       /* Filter mask: only filter where neighbouring differences stay
        * below the inside threshold I and the across-edge step stays
        * below the edge threshold E (i.e. a real block artifact, not a
        * natural edge). */
       int fm = FFABS(p3 - p2) <= I && FFABS(p2 - p1) <= I &&
                FFABS(p1 - p0) <= I && FFABS(q1 - q0) <= I &&
                FFABS(q2 - q1) <= I && FFABS(q3 - q2) <= I &&
                FFABS(p0 - q0) * 2 + (FFABS(p1 - q1) >> 1) <= E;
       /* flat8in is only read when wd >= 8 (set below in that case);
        * flat8out only when wd >= 16. */
       int flat8out, flat8in;

       if (!fm)
           continue;

       if (wd >= 16) {
           p7 = dst[strideb * -8];
           p6 = dst[strideb * -7];
           p5 = dst[strideb * -6];
           p4 = dst[strideb * -5];
           q4 = dst[strideb * +4];
           q5 = dst[strideb * +5];
           q6 = dst[strideb * +6];
           q7 = dst[strideb * +7];

           /* Outer flatness: the 8 extra samples deviate from the edge
            * pixels by at most one (in this bit depth). */
           flat8out = FFABS(p7 - p0) <= F && FFABS(p6 - p0) <= F &&
                      FFABS(p5 - p0) <= F && FFABS(p4 - p0) <= F &&
                      FFABS(q4 - q0) <= F && FFABS(q5 - q0) <= F &&
                      FFABS(q6 - q0) <= F && FFABS(q7 - q0) <= F;
       }

       if (wd >= 8)
           flat8in = FFABS(p3 - p0) <= F && FFABS(p2 - p0) <= F &&
                     FFABS(p1 - p0) <= F && FFABS(q1 - q0) <= F &&
                     FFABS(q2 - q0) <= F && FFABS(q3 - q0) <= F;

       if (wd >= 16 && flat8out && flat8in) {
           /* 15-tap smoothing across the full 16-pixel span. */
           dst[strideb * -7] = (p7 + p7 + p7 + p7 + p7 + p7 + p7 + p6 * 2 +
                                p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4;
           dst[strideb * -6] = (p7 + p7 + p7 + p7 + p7 + p7 + p6 + p5 * 2 +
                                p4 + p3 + p2 + p1 + p0 + q0 + q1 + 8) >> 4;
           dst[strideb * -5] = (p7 + p7 + p7 + p7 + p7 + p6 + p5 + p4 * 2 +
                                p3 + p2 + p1 + p0 + q0 + q1 + q2 + 8) >> 4;
           dst[strideb * -4] = (p7 + p7 + p7 + p7 + p6 + p5 + p4 + p3 * 2 +
                                p2 + p1 + p0 + q0 + q1 + q2 + q3 + 8) >> 4;
           dst[strideb * -3] = (p7 + p7 + p7 + p6 + p5 + p4 + p3 + p2 * 2 +
                                p1 + p0 + q0 + q1 + q2 + q3 + q4 + 8) >> 4;
           dst[strideb * -2] = (p7 + p7 + p6 + p5 + p4 + p3 + p2 + p1 * 2 +
                                p0 + q0 + q1 + q2 + q3 + q4 + q5 + 8) >> 4;
           dst[strideb * -1] = (p7 + p6 + p5 + p4 + p3 + p2 + p1 + p0 * 2 +
                                q0 + q1 + q2 + q3 + q4 + q5 + q6 + 8) >> 4;
           dst[strideb * +0] = (p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 * 2 +
                                q1 + q2 + q3 + q4 + q5 + q6 + q7 + 8) >> 4;
           dst[strideb * +1] = (p5 + p4 + p3 + p2 + p1 + p0 + q0 + q1 * 2 +
                                q2 + q3 + q4 + q5 + q6 + q7 + q7 + 8) >> 4;
           dst[strideb * +2] = (p4 + p3 + p2 + p1 + p0 + q0 + q1 + q2 * 2 +
                                q3 + q4 + q5 + q6 + q7 + q7 + q7 + 8) >> 4;
           dst[strideb * +3] = (p3 + p2 + p1 + p0 + q0 + q1 + q2 + q3 * 2 +
                                q4 + q5 + q6 + q7 + q7 + q7 + q7 + 8) >> 4;
           dst[strideb * +4] = (p2 + p1 + p0 + q0 + q1 + q2 + q3 + q4 * 2 +
                                q5 + q6 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
           dst[strideb * +5] = (p1 + p0 + q0 + q1 + q2 + q3 + q4 + q5 * 2 +
                                q6 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
           dst[strideb * +6] = (p0 + q0 + q1 + q2 + q3 + q4 + q5 + q6 * 2 +
                                q7 + q7 + q7 + q7 + q7 + q7 + q7 + 8) >> 4;
       } else if (wd >= 8 && flat8in) {
           /* 7-tap smoothing across the inner 8 pixels. */
           dst[strideb * -3] = (p3 + p3 + p3 + 2 * p2 + p1 + p0 + q0 + 4) >> 3;
           dst[strideb * -2] = (p3 + p3 + p2 + 2 * p1 + p0 + q0 + q1 + 4) >> 3;
           dst[strideb * -1] = (p3 + p2 + p1 + 2 * p0 + q0 + q1 + q2 + 4) >> 3;
           dst[strideb * +0] = (p2 + p1 + p0 + 2 * q0 + q1 + q2 + q3 + 4) >> 3;
           dst[strideb * +1] = (p1 + p0 + q0 + 2 * q1 + q2 + q3 + q3 + 4) >> 3;
           dst[strideb * +2] = (p0 + q0 + q1 + 2 * q2 + q3 + q3 + q3 + 4) >> 3;
       } else {
           /* Narrow filter: hev = high edge variance, in which case only
            * p0/q0 are adjusted; otherwise p1/q1 get half the delta too. */
           int hev = FFABS(p1 - p0) > H || FFABS(q1 - q0) > H;

           if (hev) {
               int f = av_clip_intp2(p1 - q1, BIT_DEPTH - 1), f1, f2;
               f = av_clip_intp2(3 * (q0 - p0) + f, BIT_DEPTH - 1);

               f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
               f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;

               dst[strideb * -1] = av_clip_pixel(p0 + f2);
               dst[strideb * +0] = av_clip_pixel(q0 - f1);
           } else {
               int f = av_clip_intp2(3 * (q0 - p0), BIT_DEPTH - 1), f1, f2;

               f1 = FFMIN(f + 4, (1 << (BIT_DEPTH - 1)) - 1) >> 3;
               f2 = FFMIN(f + 3, (1 << (BIT_DEPTH - 1)) - 1) >> 3;

               dst[strideb * -1] = av_clip_pixel(p0 + f2);
               dst[strideb * +0] = av_clip_pixel(q0 - f1);

               f = (f1 + 1) >> 1;
               dst[strideb * -2] = av_clip_pixel(p1 + f);
               dst[strideb * +1] = av_clip_pixel(q1 - f);
           }
       }
   }
}
   1890 
/* Generate the 8-pixel loop-filter entry points.  'h' variants walk down
 * a vertical edge (stridea = stride, strideb = 1), 'v' variants walk
 * along a horizontal edge (stridea = 1, strideb = stride). */
#define lf_8_fn(dir, wd, stridea, strideb) \
static void loop_filter_##dir##_##wd##_8_c(uint8_t *_dst, \
                                          ptrdiff_t stride, \
                                          int E, int I, int H) \
{ \
   pixel *dst = (pixel *) _dst; \
   stride /= sizeof(pixel); \
   loop_filter(dst, E, I, H, stridea, strideb, wd); \
}

#define lf_8_fns(wd) \
lf_8_fn(h, wd, stride, 1) \
lf_8_fn(v, wd, 1, stride)

lf_8_fns(4)
lf_8_fns(8)
lf_8_fns(16)

#undef lf_8_fn
#undef lf_8_fns

/* 16-pixel wide/tall edge: run the 16-tap 8-pixel filter twice, offset
 * by 8 positions along the edge. */
#define lf_16_fn(dir, stridea) \
static void loop_filter_##dir##_16_16_c(uint8_t *dst, \
                                       ptrdiff_t stride, \
                                       int E, int I, int H) \
{ \
   loop_filter_##dir##_16_8_c(dst, stride, E, I, H); \
   loop_filter_##dir##_16_8_c(dst + 8 * stridea, stride, E, I, H); \
}

lf_16_fn(h, stride)
lf_16_fn(v, sizeof(pixel))

#undef lf_16_fn

/* Mixed-width pair: two adjacent 8-pixel edges with possibly different
 * filter widths; the thresholds for the second edge are packed into the
 * high byte of E/I/H. */
#define lf_mix_fn(dir, wd1, wd2, stridea) \
static void loop_filter_##dir##_##wd1##wd2##_16_c(uint8_t *dst, \
                                                 ptrdiff_t stride, \
                                                 int E, int I, int H) \
{ \
   loop_filter_##dir##_##wd1##_8_c(dst, stride, E & 0xff, I & 0xff, H & 0xff); \
   loop_filter_##dir##_##wd2##_8_c(dst + 8 * stridea, stride, E >> 8, I >> 8, H >> 8); \
}

#define lf_mix_fns(wd1, wd2) \
lf_mix_fn(h, wd1, wd2, stride) \
lf_mix_fn(v, wd1, wd2, sizeof(pixel))

lf_mix_fns(4, 4)
lf_mix_fns(4, 8)
lf_mix_fns(8, 4)
lf_mix_fns(8, 8)

#undef lf_mix_fn
#undef lf_mix_fns
   1946 
/* Fill the loop-filter function tables.  First index selects the filter
 * width (4/8/16 for loop_filter_8, wd1/wd2 for mix2), last index selects
 * the h (vertical-edge) or v (horizontal-edge) variant. */
static av_cold void vp9dsp_loopfilter_init(VP9DSPContext *dsp)
{
   dsp->loop_filter_8[0][0] = loop_filter_h_4_8_c;
   dsp->loop_filter_8[0][1] = loop_filter_v_4_8_c;
   dsp->loop_filter_8[1][0] = loop_filter_h_8_8_c;
   dsp->loop_filter_8[1][1] = loop_filter_v_8_8_c;
   dsp->loop_filter_8[2][0] = loop_filter_h_16_8_c;
   dsp->loop_filter_8[2][1] = loop_filter_v_16_8_c;

   dsp->loop_filter_16[0] = loop_filter_h_16_16_c;
   dsp->loop_filter_16[1] = loop_filter_v_16_16_c;

   dsp->loop_filter_mix2[0][0][0] = loop_filter_h_44_16_c;
   dsp->loop_filter_mix2[0][0][1] = loop_filter_v_44_16_c;
   dsp->loop_filter_mix2[0][1][0] = loop_filter_h_48_16_c;
   dsp->loop_filter_mix2[0][1][1] = loop_filter_v_48_16_c;
   dsp->loop_filter_mix2[1][0][0] = loop_filter_h_84_16_c;
   dsp->loop_filter_mix2[1][0][1] = loop_filter_v_84_16_c;
   dsp->loop_filter_mix2[1][1][0] = loop_filter_h_88_16_c;
   dsp->loop_filter_mix2[1][1][1] = loop_filter_v_88_16_c;
}
   1968 
   1969 #if BIT_DEPTH != 12
   1970 
   1971 static av_always_inline void copy_c(uint8_t *restrict dst, ptrdiff_t dst_stride,
   1972                                    const uint8_t *restrict src,
   1973                                    ptrdiff_t src_stride, int w, int h)
   1974 {
   1975    do {
   1976        memcpy(dst, src, w * sizeof(pixel));
   1977 
   1978        dst += dst_stride;
   1979        src += src_stride;
   1980    } while (--h);
   1981 }
   1982 
   1983 static av_always_inline void avg_c(uint8_t *restrict _dst, ptrdiff_t dst_stride,
   1984                                   const uint8_t *restrict _src,
   1985                                   ptrdiff_t src_stride, int w, int h)
   1986 {
   1987    pixel *dst = (pixel *) _dst;
   1988    const pixel *src = (const pixel *) _src;
   1989 
   1990    dst_stride /= sizeof(pixel);
   1991    src_stride /= sizeof(pixel);
   1992    do {
   1993        int x;
   1994 
   1995        for (x = 0; x < w; x += 4)
   1996            AV_WN4PA(&dst[x], rnd_avg_pixel4(AV_RN4PA(&dst[x]), AV_RN4P(&src[x])));
   1997 
   1998        dst += dst_stride;
   1999        src += src_stride;
   2000    } while (--h);
   2001 }
   2002 
/* Generate the sized full-pel entry points (mx/my are unused at full
 * pel); each simply forwards to copy_c()/avg_c() with a fixed width. */
#define fpel_fn(type, sz) \
static void type##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                        const uint8_t *src, ptrdiff_t src_stride, \
                        int h, int mx, int my) \
{ \
   type##_c(dst, dst_stride, src, src_stride, sz, h); \
}

#define copy_avg_fn(sz) \
fpel_fn(copy, sz) \
fpel_fn(avg,  sz)

copy_avg_fn(64)
copy_avg_fn(32)
copy_avg_fn(16)
copy_avg_fn(8)
copy_avg_fn(4)

#undef fpel_fn
#undef copy_avg_fn

#endif /* BIT_DEPTH != 12 */

/* 8-tap FIR at position x: taps F[0..7] cover src[x-3*stride] through
 * src[x+4*stride]; the sum is rounded (+64) and scaled (>>7, Q7 taps),
 * then clipped to the valid pixel range. */
#define FILTER_8TAP(src, x, F, stride) \
   av_clip_pixel((F[0] * src[x + -3 * stride] + \
                  F[1] * src[x + -2 * stride] + \
                  F[2] * src[x + -1 * stride] + \
                  F[3] * src[x + +0 * stride] + \
                  F[4] * src[x + +1 * stride] + \
                  F[5] * src[x + +2 * stride] + \
                  F[6] * src[x + +3 * stride] + \
                  F[7] * src[x + +4 * stride] + 64) >> 7)
   2035 
   2036 static av_always_inline void do_8tap_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
   2037                                          const uint8_t *_src, ptrdiff_t src_stride,
   2038                                          int w, int h, ptrdiff_t ds,
   2039                                          const int16_t *filter, int avg)
   2040 {
   2041    pixel *dst = (pixel *) _dst;
   2042    const pixel *src = (const pixel *) _src;
   2043 
   2044    dst_stride /= sizeof(pixel);
   2045    src_stride /= sizeof(pixel);
   2046    do {
   2047        int x;
   2048 
   2049        for (x = 0; x < w; x++)
   2050            if (avg) {
   2051                dst[x] = (dst[x] + FILTER_8TAP(src, x, filter, ds) + 1) >> 1;
   2052            } else {
   2053                dst[x] = FILTER_8TAP(src, x, filter, ds);
   2054            }
   2055 
   2056        dst += dst_stride;
   2057        src += src_stride;
   2058    } while (--h);
   2059 }
   2060 
/* Generate put/avg wrappers around do_8tap_1d_c for the horizontal
 * (tap distance 1) and vertical (tap distance = pixel stride) cases. */
#define filter_8tap_1d_fn(opn, opa, dir, ds) \
static av_noinline void opn##_8tap_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                               const uint8_t *src, ptrdiff_t src_stride, \
                                               int w, int h, const int16_t *filter) \
{ \
   do_8tap_1d_c(dst, dst_stride, src, src_stride, w, h, ds, filter, opa); \
}

filter_8tap_1d_fn(put, 0, v, src_stride / sizeof(pixel))
filter_8tap_1d_fn(put, 0, h, 1)
filter_8tap_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
filter_8tap_1d_fn(avg, 1, h, 1)

#undef filter_8tap_1d_fn
   2075 
/*
 * Two-dimensional (horizontal then vertical) 8-tap subpel filter.
 * Pass 1 filters h+7 rows horizontally into a temporary buffer (three
 * rows above and four below the block are needed by the vertical taps);
 * the buffer uses a fixed pitch of 64, the maximum block width, hence
 * the 64 * 71 size.  Pass 2 filters vertically from the buffer, skipping
 * the three lead-in rows.  'avg' behaves as in do_8tap_1d_c.
 */
static av_always_inline void do_8tap_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
                                         const uint8_t *_src, ptrdiff_t src_stride,
                                         int w, int h, const int16_t *filterx,
                                         const int16_t *filtery, int avg)
{
   int tmp_h = h + 7;
   pixel tmp[64 * 71], *tmp_ptr = tmp;
   pixel *dst = (pixel *) _dst;
   const pixel *src = (const pixel *) _src;

   dst_stride /= sizeof(pixel);
   src_stride /= sizeof(pixel);
   src -= src_stride * 3;
   do {
       int x;

       for (x = 0; x < w; x++)
           tmp_ptr[x] = FILTER_8TAP(src, x, filterx, 1);

       tmp_ptr += 64;
       src += src_stride;
   } while (--tmp_h);

   /* Start the vertical pass at the block's first real row. */
   tmp_ptr = tmp + 64 * 3;
   do {
       int x;

       for (x = 0; x < w; x++)
           if (avg) {
               dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filtery, 64) + 1) >> 1;
           } else {
               dst[x] = FILTER_8TAP(tmp_ptr, x, filtery, 64);
           }

       tmp_ptr += 64;
       dst += dst_stride;
   } while (--h);
}
   2114 
/* Generate the put/avg 2D (hv) 8-tap wrappers. */
#define filter_8tap_2d_fn(opn, opa) \
static av_noinline void opn##_8tap_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                          const uint8_t *src, ptrdiff_t src_stride, \
                                          int w, int h, const int16_t *filterx, \
                                          const int16_t *filtery) \
{ \
   do_8tap_2d_c(dst, dst_stride, src, src_stride, w, h, filterx, filtery, opa); \
}

filter_8tap_2d_fn(put, 0)
filter_8tap_2d_fn(avg, 1)

#undef filter_8tap_2d_fn

/* Generate a sized 1D mc function for one filter family; 'dir_m' picks
 * the mx or my subpel position used to index the filter table. */
#define filter_fn_1d(sz, dir, dir_m, type, type_idx, avg) \
static void avg##_8tap_##type##_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                             const uint8_t *src, ptrdiff_t src_stride, \
                                             int h, int mx, int my) \
{ \
   avg##_8tap_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, \
                           ff_vp9_subpel_filters[type_idx][dir_m]); \
}

/* Same, for the combined horizontal+vertical (hv) case. */
#define filter_fn_2d(sz, type, type_idx, avg) \
static void avg##_8tap_##type##_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                          const uint8_t *src, ptrdiff_t src_stride, \
                                          int h, int mx, int my) \
{ \
   avg##_8tap_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, \
                      ff_vp9_subpel_filters[type_idx][mx], \
                      ff_vp9_subpel_filters[type_idx][my]); \
}

#if BIT_DEPTH != 12

/* 2-tap bilinear interpolation at position x with subpel fraction
 * mxy/16, rounded to nearest. */
#define FILTER_BILIN(src, x, mxy, stride) \
   (src[x] + ((mxy * (src[x + stride] - src[x]) + 8) >> 4))
   2152 
   2153 static av_always_inline void do_bilin_1d_c(uint8_t *_dst, ptrdiff_t dst_stride,
   2154                                           const uint8_t *_src, ptrdiff_t src_stride,
   2155                                           int w, int h, ptrdiff_t ds, int mxy, int avg)
   2156 {
   2157    pixel *dst = (pixel *) _dst;
   2158    const pixel *src = (const pixel *) _src;
   2159 
   2160    dst_stride /= sizeof(pixel);
   2161    src_stride /= sizeof(pixel);
   2162    do {
   2163        int x;
   2164 
   2165        for (x = 0; x < w; x++)
   2166            if (avg) {
   2167                dst[x] = (dst[x] + FILTER_BILIN(src, x, mxy, ds) + 1) >> 1;
   2168            } else {
   2169                dst[x] = FILTER_BILIN(src, x, mxy, ds);
   2170            }
   2171 
   2172        dst += dst_stride;
   2173        src += src_stride;
   2174    } while (--h);
   2175 }
   2176 
/* Generate put/avg wrappers around do_bilin_1d_c for the horizontal
 * (tap distance 1) and vertical (tap distance = pixel stride) cases. */
#define bilin_1d_fn(opn, opa, dir, ds) \
static av_noinline void opn##_bilin_1d_##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                                const uint8_t *src, ptrdiff_t src_stride, \
                                                int w, int h, int mxy) \
{ \
   do_bilin_1d_c(dst, dst_stride, src, src_stride, w, h, ds, mxy, opa); \
}

bilin_1d_fn(put, 0, v, src_stride / sizeof(pixel))
bilin_1d_fn(put, 0, h, 1)
bilin_1d_fn(avg, 1, v, src_stride / sizeof(pixel))
bilin_1d_fn(avg, 1, h, 1)

#undef bilin_1d_fn
   2191 
/*
 * Two-dimensional (horizontal then vertical) bilinear subpel filter.
 * Pass 1 filters h+1 rows horizontally into a temporary buffer (one
 * extra row is needed by the vertical tap pair); the buffer uses a
 * fixed pitch of 64, the maximum block width, hence the 64 * 65 size.
 * Pass 2 filters vertically from the buffer.  'avg' behaves as in
 * do_bilin_1d_c.
 */
static av_always_inline void do_bilin_2d_c(uint8_t *_dst, ptrdiff_t dst_stride,
                                          const uint8_t *_src, ptrdiff_t src_stride,
                                          int w, int h, int mx, int my, int avg)
{
   pixel tmp[64 * 65], *tmp_ptr = tmp;
   int tmp_h = h + 1;
   pixel *dst = (pixel *) _dst;
   const pixel *src = (const pixel *) _src;

   dst_stride /= sizeof(pixel);
   src_stride /= sizeof(pixel);
   do {
       int x;

       for (x = 0; x < w; x++)
           tmp_ptr[x] = FILTER_BILIN(src, x, mx, 1);

       tmp_ptr += 64;
       src += src_stride;
   } while (--tmp_h);

   tmp_ptr = tmp;
   do {
       int x;

       for (x = 0; x < w; x++)
           if (avg) {
               dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
           } else {
               dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
           }

       tmp_ptr += 64;
       dst += dst_stride;
   } while (--h);
}
   2228 
/* Generate the put/avg 2D (hv) bilinear wrappers. */
#define bilin_2d_fn(opn, opa) \
static av_noinline void opn##_bilin_2d_hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                           const uint8_t *src, ptrdiff_t src_stride, \
                                           int w, int h, int mx, int my) \
{ \
   do_bilin_2d_c(dst, dst_stride, src, src_stride, w, h, mx, my, opa); \
}

bilin_2d_fn(put, 0)
bilin_2d_fn(avg, 1)

#undef bilin_2d_fn

/* Generate a sized 1D bilinear mc function; 'dir_m' picks the mx or my
 * subpel fraction. */
#define bilinf_fn_1d(sz, dir, dir_m, avg) \
static void avg##_bilin_##sz##dir##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                     const uint8_t *src, ptrdiff_t src_stride, \
                                     int h, int mx, int my) \
{ \
   avg##_bilin_1d_##dir##_c(dst, dst_stride, src, src_stride, sz, h, dir_m); \
}

/* Same, for the combined horizontal+vertical (hv) case. */
#define bilinf_fn_2d(sz, avg) \
static void avg##_bilin_##sz##hv_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                  const uint8_t *src, ptrdiff_t src_stride, \
                                  int h, int mx, int my) \
{ \
   avg##_bilin_2d_hv_c(dst, dst_stride, src, src_stride, sz, h, mx, my); \
}

#else

/* At 12 bits the bilinear functions are shared with the 10-bit build, so
 * expand these instantiation macros to nothing. */
#define bilinf_fn_1d(a, b, c, d)
#define bilinf_fn_2d(a, b)

#endif

/* Instantiate every subpel function for one size and one op (put/avg):
 * h/v/hv variants of the three 8-tap families plus bilinear. */
#define filter_fn(sz, avg) \
filter_fn_1d(sz, h, mx, regular, FILTER_8TAP_REGULAR, avg) \
filter_fn_1d(sz, v, my, regular, FILTER_8TAP_REGULAR, avg) \
filter_fn_2d(sz,        regular, FILTER_8TAP_REGULAR, avg) \
filter_fn_1d(sz, h, mx, smooth,  FILTER_8TAP_SMOOTH,  avg) \
filter_fn_1d(sz, v, my, smooth,  FILTER_8TAP_SMOOTH,  avg) \
filter_fn_2d(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
filter_fn_1d(sz, h, mx, sharp,   FILTER_8TAP_SHARP,   avg) \
filter_fn_1d(sz, v, my, sharp,   FILTER_8TAP_SHARP,   avg) \
filter_fn_2d(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
bilinf_fn_1d(sz, h, mx,                               avg) \
bilinf_fn_1d(sz, v, my,                               avg) \
bilinf_fn_2d(sz,                                      avg)

#define filter_fn_set(avg) \
filter_fn(64, avg) \
filter_fn(32, avg) \
filter_fn(16, avg) \
filter_fn(8,  avg) \
filter_fn(4,  avg)

filter_fn_set(put)
filter_fn_set(avg)

#undef filter_fn
#undef filter_fn_set
#undef filter_fn_1d
#undef filter_fn_2d
#undef bilinf_fn_1d
#undef bilinf_fn_2d
   2295 
/* The 12-bit build reuses the 10-bit MC functions, so it needs this
 * forward declaration. */
#if BIT_DEPTH != 8
void ff_vp9dsp_mc_init_10(VP9DSPContext *dsp);
#endif
/* Only the 10-bit init has external linkage (it is called from the
 * 12-bit template instantiation); the others are file-local. */
#if BIT_DEPTH != 10
static
#endif
/* Fill the dsp->mc[size_idx][filter][avg][idxh][idxv] function table
 * for this template's BIT_DEPTH. */
av_cold void FUNC(ff_vp9dsp_mc_init)(VP9DSPContext *dsp)
{
#if BIT_DEPTH == 12
    /* Full-pel copy/avg is bit-depth agnostic beyond 8 bits; reuse 10-bit. */
    ff_vp9dsp_mc_init_10(dsp);
#else /* BIT_DEPTH == 12 */

/* Full-pel (no sub-pel offset): the same copy/avg is used for all filters. */
#define init_fpel(idx1, idx2, sz, type) \
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = type##sz##_c; \
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = type##sz##_c; \
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = type##sz##_c; \
    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = type##sz##_c

#define init_copy_avg(idx, sz) \
    init_fpel(idx, 0, sz, copy); \
    init_fpel(idx, 1, sz, avg)

    init_copy_avg(0, 64);
    init_copy_avg(1, 32);
    init_copy_avg(2, 16);
    init_copy_avg(3,  8);
    init_copy_avg(4,  4);

#undef init_copy_avg
#undef init_fpel

#endif /* BIT_DEPTH == 12 */

/* Sub-pel: wire up the three 8-tap filter families for one direction. */
#define init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type) \
    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_c; \
    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_c; \
    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_c

#if BIT_DEPTH == 12
/* 12-bit has no dedicated bilinear functions (shared with 10-bit). */
#define init_subpel1 init_subpel1_bd_aware
#else
#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type) \
    init_subpel1_bd_aware(idx1, idx2, idxh, idxv, sz, dir, type); \
    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][idxh][idxv] = type##_bilin_##sz##dir##_c
#endif

/* Expand one (idxh, idxv, dir) combination over all five block sizes. */
#define init_subpel2(idx, idxh, idxv, dir, type) \
    init_subpel1(0, idx, idxh, idxv, 64, dir, type); \
    init_subpel1(1, idx, idxh, idxv, 32, dir, type); \
    init_subpel1(2, idx, idxh, idxv, 16, dir, type); \
    init_subpel1(3, idx, idxh, idxv,  8, dir, type); \
    init_subpel1(4, idx, idxh, idxv,  4, dir, type)

/* Expand the three sub-pel directions: hv, v-only, h-only. */
#define init_subpel3(idx, type) \
    init_subpel2(idx, 1, 1, hv, type); \
    init_subpel2(idx, 0, 1, v, type); \
    init_subpel2(idx, 1, 0, h, type)

    init_subpel3(0, put);
    init_subpel3(1, avg);

#undef init_subpel1
#undef init_subpel2
#undef init_subpel3
#undef init_subpel1_bd_aware
}
   2362 
   2363 static av_always_inline void do_scaled_8tap_c(uint8_t *_dst, ptrdiff_t dst_stride,
   2364                                              const uint8_t *_src, ptrdiff_t src_stride,
   2365                                              int w, int h, int mx, int my,
   2366                                              int dx, int dy, int avg,
   2367                                              const int16_t (*filters)[8])
   2368 {
   2369    int tmp_h = (((h - 1) * dy + my) >> 4) + 8;
   2370    pixel tmp[64 * 135], *tmp_ptr = tmp;
   2371    pixel *dst = (pixel *) _dst;
   2372    const pixel *src = (const pixel *) _src;
   2373 
   2374    dst_stride /= sizeof(pixel);
   2375    src_stride /= sizeof(pixel);
   2376    src -= src_stride * 3;
   2377    do {
   2378        int x;
   2379        int imx = mx, ioff = 0;
   2380 
   2381        for (x = 0; x < w; x++) {
   2382            tmp_ptr[x] = FILTER_8TAP(src, ioff, filters[imx], 1);
   2383            imx += dx;
   2384            ioff += imx >> 4;
   2385            imx &= 0xf;
   2386        }
   2387 
   2388        tmp_ptr += 64;
   2389        src += src_stride;
   2390    } while (--tmp_h);
   2391 
   2392    tmp_ptr = tmp + 64 * 3;
   2393    do {
   2394        int x;
   2395        const int16_t *filter = filters[my];
   2396 
   2397        for (x = 0; x < w; x++)
   2398            if (avg) {
   2399                dst[x] = (dst[x] + FILTER_8TAP(tmp_ptr, x, filter, 64) + 1) >> 1;
   2400            } else {
   2401                dst[x] = FILTER_8TAP(tmp_ptr, x, filter, 64);
   2402            }
   2403 
   2404        my += dy;
   2405        tmp_ptr += (my >> 4) * 64;
   2406        my &= 0xf;
   2407        dst += dst_stride;
   2408    } while (--h);
   2409 }
   2410 
/*
 * Expand to a non-inlined scaled 8-tap MC function for "put" or "avg";
 * opa is the avg flag forwarded to do_scaled_8tap_c().
 */
#define scaled_filter_8tap_fn(opn, opa) \
static av_noinline void opn##_scaled_8tap_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                            const uint8_t *src, ptrdiff_t src_stride, \
                                            int w, int h, int mx, int my, int dx, int dy, \
                                            const int16_t (*filters)[8]) \
{ \
    do_scaled_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
                     opa, filters); \
}

scaled_filter_8tap_fn(put, 0)
scaled_filter_8tap_fn(avg, 1)

#undef scaled_filter_8tap_fn

/* FILTER_8TAP is a per-bit-depth helper; retire it once all users exist. */
#undef FILTER_8TAP
   2427 
/*
 * Wrap the generic scaled 8-tap helper into a fixed-width entry point
 * (block width sz) bound to one filter family via its index into
 * ff_vp9_subpel_filters[].
 */
#define scaled_filter_fn(sz, type, type_idx, avg) \
static void avg##_scaled_##type##_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                           const uint8_t *src, ptrdiff_t src_stride, \
                                           int h, int mx, int my, int dx, int dy) \
{ \
    avg##_scaled_8tap_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy, \
                        ff_vp9_subpel_filters[type_idx]); \
}
   2436 
   2437 #if BIT_DEPTH != 12
   2438 
   2439 static av_always_inline void do_scaled_bilin_c(uint8_t *_dst, ptrdiff_t dst_stride,
   2440                                               const uint8_t *_src, ptrdiff_t src_stride,
   2441                                               int w, int h, int mx, int my,
   2442                                               int dx, int dy, int avg)
   2443 {
   2444    pixel tmp[64 * 129], *tmp_ptr = tmp;
   2445    int tmp_h = (((h - 1) * dy + my) >> 4) + 2;
   2446    pixel *dst = (pixel *) _dst;
   2447    const pixel *src = (const pixel *) _src;
   2448 
   2449    dst_stride /= sizeof(pixel);
   2450    src_stride /= sizeof(pixel);
   2451    do {
   2452        int x;
   2453        int imx = mx, ioff = 0;
   2454 
   2455        for (x = 0; x < w; x++) {
   2456            tmp_ptr[x] = FILTER_BILIN(src, ioff, imx, 1);
   2457            imx += dx;
   2458            ioff += imx >> 4;
   2459            imx &= 0xf;
   2460        }
   2461 
   2462        tmp_ptr += 64;
   2463        src += src_stride;
   2464    } while (--tmp_h);
   2465 
   2466    tmp_ptr = tmp;
   2467    do {
   2468        int x;
   2469 
   2470        for (x = 0; x < w; x++)
   2471            if (avg) {
   2472                dst[x] = (dst[x] + FILTER_BILIN(tmp_ptr, x, my, 64) + 1) >> 1;
   2473            } else {
   2474                dst[x] = FILTER_BILIN(tmp_ptr, x, my, 64);
   2475            }
   2476 
   2477        my += dy;
   2478        tmp_ptr += (my >> 4) * 64;
   2479        my &= 0xf;
   2480        dst += dst_stride;
   2481    } while (--h);
   2482 }
   2483 
/*
 * Expand to a non-inlined scaled bilinear MC function for "put" or "avg";
 * opa is the avg flag forwarded to do_scaled_bilin_c().
 */
#define scaled_bilin_fn(opn, opa) \
static av_noinline void opn##_scaled_bilin_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                             const uint8_t *src, ptrdiff_t src_stride, \
                                             int w, int h, int mx, int my, int dx, int dy) \
{ \
    do_scaled_bilin_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, opa); \
}

scaled_bilin_fn(put, 0)
scaled_bilin_fn(avg, 1)

#undef scaled_bilin_fn

/* FILTER_BILIN is a per-bit-depth helper; retire it once all users exist. */
#undef FILTER_BILIN

/* Fixed-width (sz) scaled bilinear entry points. */
#define scaled_bilinf_fn(sz, avg) \
static void avg##_scaled_bilin_##sz##_c(uint8_t *dst, ptrdiff_t dst_stride, \
                                        const uint8_t *src, ptrdiff_t src_stride, \
                                        int h, int mx, int my, int dx, int dy) \
{ \
    avg##_scaled_bilin_c(dst, dst_stride, src, src_stride, sz, h, mx, my, dx, dy); \
}

#else

/* BIT_DEPTH == 12: scaled bilinear functions are reused from the 10-bit
 * build, so this wrapper expands to nothing. */
#define scaled_bilinf_fn(a, b)

#endif
   2512 
/*
 * Instantiate, for one block size sz, the scaled MC functions for the
 * three 8-tap filter families plus (when compiled in) bilinear.
 */
#define scaled_filter_fns(sz, avg) \
scaled_filter_fn(sz,        regular, FILTER_8TAP_REGULAR, avg) \
scaled_filter_fn(sz,        smooth,  FILTER_8TAP_SMOOTH,  avg) \
scaled_filter_fn(sz,        sharp,   FILTER_8TAP_SHARP,   avg) \
scaled_bilinf_fn(sz,                                      avg)

/* Instantiate all five VP9 block sizes (64 down to 4) for one operation. */
#define scaled_filter_fn_set(avg) \
scaled_filter_fns(64, avg) \
scaled_filter_fns(32, avg) \
scaled_filter_fns(16, avg) \
scaled_filter_fns(8,  avg) \
scaled_filter_fns(4,  avg)

scaled_filter_fn_set(put)
scaled_filter_fn_set(avg)

#undef scaled_filter_fns
#undef scaled_filter_fn_set
#undef scaled_filter_fn
#undef scaled_bilinf_fn
   2533 
/* The 12-bit build reuses the 10-bit scaled-MC functions, so it needs
 * this forward declaration. */
#if BIT_DEPTH != 8
void ff_vp9dsp_scaled_mc_init_10(VP9DSPContext *dsp);
#endif
/* Only the 10-bit init has external linkage; the others are file-local. */
#if BIT_DEPTH != 10
static
#endif
/* Fill the dsp->smc[size_idx][filter][avg] scaled-MC function table
 * for this template's BIT_DEPTH. */
av_cold void FUNC(ff_vp9dsp_scaled_mc_init)(VP9DSPContext *dsp)
{
/* Wire up the three 8-tap filter families for one (size, put/avg) slot. */
#define init_scaled_bd_aware(idx1, idx2, sz, type) \
    dsp->smc[idx1][FILTER_8TAP_SMOOTH ][idx2] = type##_scaled_smooth_##sz##_c; \
    dsp->smc[idx1][FILTER_8TAP_REGULAR][idx2] = type##_scaled_regular_##sz##_c; \
    dsp->smc[idx1][FILTER_8TAP_SHARP  ][idx2] = type##_scaled_sharp_##sz##_c

#if BIT_DEPTH == 12
    /* 12-bit reuses the 10-bit bilinear entries; only 8-tap is set here. */
    ff_vp9dsp_scaled_mc_init_10(dsp);
#define init_scaled(a,b,c,d) init_scaled_bd_aware(a,b,c,d)
#else
#define init_scaled(idx1, idx2, sz, type) \
    init_scaled_bd_aware(idx1, idx2, sz, type); \
    dsp->smc[idx1][FILTER_BILINEAR    ][idx2] = type##_scaled_bilin_##sz##_c
#endif

/* Expand one size slot for both put (idx2 == 0) and avg (idx2 == 1). */
#define init_scaled_put_avg(idx, sz) \
    init_scaled(idx, 0, sz, put); \
    init_scaled(idx, 1, sz, avg)

    init_scaled_put_avg(0, 64);
    init_scaled_put_avg(1, 32);
    init_scaled_put_avg(2, 16);
    init_scaled_put_avg(3,  8);
    init_scaled_put_avg(4,  4);

#undef init_scaled_put_avg
#undef init_scaled
#undef init_scaled_bd_aware
}
   2570 
/* Top-level init for this BIT_DEPTH: fill all DSP function tables
 * (intra prediction, inverse transforms, loop filter, MC, scaled MC). */
av_cold void FUNC(ff_vp9dsp_init)(VP9DSPContext *dsp)
{
    FUNC(ff_vp9dsp_intrapred_init)(dsp);
    vp9dsp_itxfm_init(dsp);
    vp9dsp_loopfilter_init(dsp);
    FUNC(ff_vp9dsp_mc_init)(dsp);
    FUNC(ff_vp9dsp_scaled_mc_init)(dsp);
}