tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

h264pred_template.c (43346B)


      1 /*
      2 * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
      3 * Copyright (c) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
      4 *
      5 * This file is part of FFmpeg.
      6 *
      7 * FFmpeg is free software; you can redistribute it and/or
      8 * modify it under the terms of the GNU Lesser General Public
      9 * License as published by the Free Software Foundation; either
     10 * version 2.1 of the License, or (at your option) any later version.
     11 *
     12 * FFmpeg is distributed in the hope that it will be useful,
     13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     15 * Lesser General Public License for more details.
     16 *
     17 * You should have received a copy of the GNU Lesser General Public
     18 * License along with FFmpeg; if not, write to the Free Software
     19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     20 */
     21 
     22 /**
     23 * @file
     24 * H.264 / AVC / MPEG-4 part10 prediction functions.
     25 * @author Michael Niedermayer <michaelni@gmx.at>
     26 */
     27 
     28 #include "libavutil/intreadwrite.h"
     29 
     30 #include "mathops.h"
     31 
     32 #include "bit_depth_template.c"
     33 
     34 static void FUNCC(pred4x4_vertical)(uint8_t *_src, const uint8_t *topright,
     35                                    ptrdiff_t _stride)
     36 {
     37    pixel *src = (pixel*)_src;
     38    int stride = _stride>>(sizeof(pixel)-1);
     39    const pixel4 a= AV_RN4PA(src-stride);
     40 
     41    AV_WN4PA(src+0*stride, a);
     42    AV_WN4PA(src+1*stride, a);
     43    AV_WN4PA(src+2*stride, a);
     44    AV_WN4PA(src+3*stride, a);
     45 }
     46 
     47 static void FUNCC(pred4x4_horizontal)(uint8_t *_src, const uint8_t *topright,
     48                                      ptrdiff_t _stride)
     49 {
     50    pixel *src = (pixel*)_src;
     51    int stride = _stride>>(sizeof(pixel)-1);
     52    AV_WN4PA(src+0*stride, PIXEL_SPLAT_X4(src[-1+0*stride]));
     53    AV_WN4PA(src+1*stride, PIXEL_SPLAT_X4(src[-1+1*stride]));
     54    AV_WN4PA(src+2*stride, PIXEL_SPLAT_X4(src[-1+2*stride]));
     55    AV_WN4PA(src+3*stride, PIXEL_SPLAT_X4(src[-1+3*stride]));
     56 }
     57 
     58 static void FUNCC(pred4x4_dc)(uint8_t *_src, const uint8_t *topright,
     59                              ptrdiff_t _stride)
     60 {
     61    pixel *src = (pixel*)_src;
     62    int stride = _stride>>(sizeof(pixel)-1);
     63    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
     64                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
     65    const pixel4 a = PIXEL_SPLAT_X4(dc);
     66 
     67    AV_WN4PA(src+0*stride, a);
     68    AV_WN4PA(src+1*stride, a);
     69    AV_WN4PA(src+2*stride, a);
     70    AV_WN4PA(src+3*stride, a);
     71 }
     72 
     73 static void FUNCC(pred4x4_left_dc)(uint8_t *_src, const uint8_t *topright,
     74                                   ptrdiff_t _stride)
     75 {
     76    pixel *src = (pixel*)_src;
     77    int stride = _stride>>(sizeof(pixel)-1);
     78    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
     79    const pixel4 a = PIXEL_SPLAT_X4(dc);
     80 
     81    AV_WN4PA(src+0*stride, a);
     82    AV_WN4PA(src+1*stride, a);
     83    AV_WN4PA(src+2*stride, a);
     84    AV_WN4PA(src+3*stride, a);
     85 }
     86 
     87 static void FUNCC(pred4x4_top_dc)(uint8_t *_src, const uint8_t *topright,
     88                                  ptrdiff_t _stride)
     89 {
     90    pixel *src = (pixel*)_src;
     91    int stride = _stride>>(sizeof(pixel)-1);
     92    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
     93    const pixel4 a = PIXEL_SPLAT_X4(dc);
     94 
     95    AV_WN4PA(src+0*stride, a);
     96    AV_WN4PA(src+1*stride, a);
     97    AV_WN4PA(src+2*stride, a);
     98    AV_WN4PA(src+3*stride, a);
     99 }
    100 
    101 static void FUNCC(pred4x4_128_dc)(uint8_t *_src, const uint8_t *topright,
    102                                  ptrdiff_t _stride)
    103 {
    104    pixel *src = (pixel*)_src;
    105    int stride = _stride>>(sizeof(pixel)-1);
    106    const pixel4 a = PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1));
    107 
    108    AV_WN4PA(src+0*stride, a);
    109    AV_WN4PA(src+1*stride, a);
    110    AV_WN4PA(src+2*stride, a);
    111    AV_WN4PA(src+3*stride, a);
    112 }
    113 
    114 
/* Edge-loading helpers for the directional 4x4 predictors below.
 * Each macro declares local copies of the neighbouring edge samples:
 * t0..t3 = top row, t4..t7 = top-right row (read from `topright`),
 * l0..l3 = left column, l4..l7 = down-left column.  They expect
 * `src` and `stride` (and `topright` where used) to be in scope;
 * av_unused suppresses warnings in predictors that do not consume
 * every sample. */
#define LOAD_TOP_RIGHT_EDGE\
    const unsigned av_unused t4 = topright[0];\
    const unsigned av_unused t5 = topright[1];\
    const unsigned av_unused t6 = topright[2];\
    const unsigned av_unused t7 = topright[3];\

#define LOAD_DOWN_LEFT_EDGE\
    const unsigned av_unused l4 = src[-1+4*stride];\
    const unsigned av_unused l5 = src[-1+5*stride];\
    const unsigned av_unused l6 = src[-1+6*stride];\
    const unsigned av_unused l7 = src[-1+7*stride];\

#define LOAD_LEFT_EDGE\
    const unsigned av_unused l0 = src[-1+0*stride];\
    const unsigned av_unused l1 = src[-1+1*stride];\
    const unsigned av_unused l2 = src[-1+2*stride];\
    const unsigned av_unused l3 = src[-1+3*stride];\

#define LOAD_TOP_EDGE\
    const unsigned av_unused t0 = src[ 0-1*stride];\
    const unsigned av_unused t1 = src[ 1-1*stride];\
    const unsigned av_unused t2 = src[ 2-1*stride];\
    const unsigned av_unused t3 = src[ 3-1*stride];\

/**
 * 4x4 diagonal down-right prediction: each down-right diagonal of the
 * block is filled with a 3-tap smoothed value taken from the left
 * column, the top-left corner (lt) and the top row.  The chained
 * assignments write one diagonal per filtered value.
 */
static void FUNCC(pred4x4_down_right)(uint8_t *_src, const uint8_t *topright,
                                      ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int lt= src[-1-1*stride];  /* top-left corner neighbour */
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
    src[0+2*stride]=
    src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
    src[0+1*stride]=
    src[1+2*stride]=
    src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
    src[0+0*stride]=
    src[1+1*stride]=
    src[2+2*stride]=
    src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;  /* main diagonal */
    src[1+0*stride]=
    src[2+1*stride]=
    src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+0*stride]=
    src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
}
    165 
/**
 * 4x4 diagonal down-left prediction: each down-left diagonal is filled
 * with a 3-tap smoothed value from the top (t0..t3) and top-right
 * (t4..t7) rows; the bottom-right corner uses the clamped 3*t7 form.
 */
static void FUNCC(pred4x4_down_left)(uint8_t *_src, const uint8_t *_topright,
                                     ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    const pixel *topright = (const pixel*)_topright;
    int stride = _stride>>(sizeof(pixel)-1);
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE
//    LOAD_LEFT_EDGE

    src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
    src[1+0*stride]=
    src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
    src[2+0*stride]=
    src[1+1*stride]=
    src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
    src[3+0*stride]=
    src[2+1*stride]=
    src[1+2*stride]=
    src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
    src[3+1*stride]=
    src[2+2*stride]=
    src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
    src[3+2*stride]=
    src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
    src[3+3*stride]=(t6 + 3*t7 + 2)>>2;  /* no t8: replicate t7 */
}
    193 
/**
 * 4x4 vertical-right prediction: even rows use 2-tap half-pel averages
 * of the top edge, odd rows use 3-tap filtered values; the left column
 * of the lower rows comes from the left edge.  Each value is written
 * to all positions that share it (chained assignments).
 */
static void FUNCC(pred4x4_vertical_right)(uint8_t *_src,
                                          const uint8_t *topright,
                                          ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int lt= src[-1-1*stride];  /* top-left corner neighbour */
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+0*stride]=
    src[1+2*stride]=(lt + t0 + 1)>>1;
    src[1+0*stride]=
    src[2+2*stride]=(t0 + t1 + 1)>>1;
    src[2+0*stride]=
    src[3+2*stride]=(t1 + t2 + 1)>>1;
    src[3+0*stride]=(t2 + t3 + 1)>>1;
    src[0+1*stride]=
    src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[1+1*stride]=
    src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[2+1*stride]=
    src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;  /* left column, rows 2-3 */
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
}
    221 
/**
 * 4x4 vertical-left prediction: even rows use 2-tap half-pel averages
 * of the top/top-right edge, odd rows use 3-tap filtered values.
 */
static void FUNCC(pred4x4_vertical_left)(uint8_t *_src,
                                         const uint8_t *_topright,
                                         ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    const pixel *topright = (const pixel*)_topright;
    int stride = _stride>>(sizeof(pixel)-1);
    LOAD_TOP_EDGE
    LOAD_TOP_RIGHT_EDGE

    src[0+0*stride]=(t0 + t1 + 1)>>1;
    src[1+0*stride]=
    src[0+2*stride]=(t1 + t2 + 1)>>1;
    src[2+0*stride]=
    src[1+2*stride]=(t2 + t3 + 1)>>1;
    src[3+0*stride]=
    src[2+2*stride]=(t3 + t4+ 1)>>1;
    src[3+2*stride]=(t4 + t5+ 1)>>1;
    src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[1+1*stride]=
    src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
    src[2+1*stride]=
    src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
    src[3+1*stride]=
    src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
    src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
}
    249 
/**
 * 4x4 horizontal-up prediction: interpolates upward along the left
 * edge with alternating 2-tap and 3-tap filters; positions past the
 * last left sample are flat-filled with l3.
 */
static void FUNCC(pred4x4_horizontal_up)(uint8_t *_src, const uint8_t *topright,
                                         ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    LOAD_LEFT_EDGE

    src[0+0*stride]=(l0 + l1 + 1)>>1;
    src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[2+0*stride]=
    src[0+1*stride]=(l1 + l2 + 1)>>1;
    src[3+0*stride]=
    src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
    src[2+1*stride]=
    src[0+2*stride]=(l2 + l3 + 1)>>1;
    src[3+1*stride]=
    src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;  /* == (l2 + 3*l3 + 2)>>2 */
    src[3+2*stride]=
    src[1+3*stride]=
    src[0+3*stride]=
    src[2+2*stride]=
    src[2+3*stride]=
    src[3+3*stride]=l3;  /* beyond the edge: replicate the last sample */
}
    274 
/**
 * 4x4 horizontal-down prediction: even columns use 2-tap half-pel
 * averages down the left edge, odd columns use 3-tap filtered values;
 * the top row mixes in the top-left corner and top edge.
 */
static void FUNCC(pred4x4_horizontal_down)(uint8_t *_src,
                                           const uint8_t *topright,
                                           ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const int lt= src[-1-1*stride];  /* top-left corner neighbour */
    LOAD_TOP_EDGE
    LOAD_LEFT_EDGE

    src[0+0*stride]=
    src[2+1*stride]=(lt + l0 + 1)>>1;
    src[1+0*stride]=
    src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
    src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
    src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
    src[0+1*stride]=
    src[2+2*stride]=(l0 + l1 + 1)>>1;
    src[1+1*stride]=
    src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+2*stride]=
    src[2+3*stride]=(l1 + l2+ 1)>>1;
    src[1+2*stride]=
    src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    src[0+3*stride]=(l2 + l3 + 1)>>1;
    src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
}
    302 
    303 static void FUNCC(pred16x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
    304 {
    305    int i;
    306    pixel *src = (pixel*)_src;
    307    int stride = _stride>>(sizeof(pixel)-1);
    308    const pixel4 a = AV_RN4PA(((pixel4*)(src-stride))+0);
    309    const pixel4 b = AV_RN4PA(((pixel4*)(src-stride))+1);
    310    const pixel4 c = AV_RN4PA(((pixel4*)(src-stride))+2);
    311    const pixel4 d = AV_RN4PA(((pixel4*)(src-stride))+3);
    312 
    313    for(i=0; i<16; i++){
    314        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
    315        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
    316        AV_WN4PA(((pixel4*)(src+i*stride))+2, c);
    317        AV_WN4PA(((pixel4*)(src+i*stride))+3, d);
    318    }
    319 }
    320 
    321 static void FUNCC(pred16x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
    322 {
    323    int i;
    324    pixel *src = (pixel*)_src;
    325    stride >>= sizeof(pixel)-1;
    326 
    327    for(i=0; i<16; i++){
    328        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
    329 
    330        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
    331        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
    332        AV_WN4PA(((pixel4*)(src+i*stride))+2, a);
    333        AV_WN4PA(((pixel4*)(src+i*stride))+3, a);
    334    }
    335 }
    336 
/* Fill a 16x16 block with the splatted DC value v.  Expects `src`,
 * `stride` and a loop variable `i` in the caller's scope; advances
 * `src` by one row per iteration. */
#define PREDICT_16x16_DC(v)\
    for(i=0; i<16; i++){\
        AV_WN4PA(src+ 0, v);\
        AV_WN4PA(src+ 4, v);\
        AV_WN4PA(src+ 8, v);\
        AV_WN4PA(src+12, v);\
        src += stride;\
    }
    345 
    346 static void FUNCC(pred16x16_dc)(uint8_t *_src, ptrdiff_t stride)
    347 {
    348    int i, dc=0;
    349    pixel *src = (pixel*)_src;
    350    pixel4 dcsplat;
    351    stride >>= sizeof(pixel)-1;
    352 
    353    for(i=0;i<16; i++){
    354        dc+= src[-1+i*stride];
    355    }
    356 
    357    for(i=0;i<16; i++){
    358        dc+= src[i-stride];
    359    }
    360 
    361    dcsplat = PIXEL_SPLAT_X4((dc+16)>>5);
    362    PREDICT_16x16_DC(dcsplat);
    363 }
    364 
    365 static void FUNCC(pred16x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
    366 {
    367    int i, dc=0;
    368    pixel *src = (pixel*)_src;
    369    pixel4 dcsplat;
    370    stride >>= sizeof(pixel)-1;
    371 
    372    for(i=0;i<16; i++){
    373        dc+= src[-1+i*stride];
    374    }
    375 
    376    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
    377    PREDICT_16x16_DC(dcsplat);
    378 }
    379 
    380 static void FUNCC(pred16x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
    381 {
    382    int i, dc=0;
    383    pixel *src = (pixel*)_src;
    384    pixel4 dcsplat;
    385    stride >>= sizeof(pixel)-1;
    386 
    387    for(i=0;i<16; i++){
    388        dc+= src[i-stride];
    389    }
    390 
    391    dcsplat = PIXEL_SPLAT_X4((dc+8)>>4);
    392    PREDICT_16x16_DC(dcsplat);
    393 }
    394 
/* Generator for constant-fill 16x16 DC predictors: pred16x16_<n>_dc
 * fills the block with value v.  The 127/129 variants are only
 * meaningful at 8-bit depth. */
#define PRED16x16_X(n, v) \
static void FUNCC(pred16x16_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
{\
    int i;\
    pixel *src = (pixel*)_src;\
    stride >>= sizeof(pixel)-1;\
    PREDICT_16x16_DC(PIXEL_SPLAT_X4(v));\
}

PRED16x16_X(128, (1<<(BIT_DEPTH-1))+0)
#if BIT_DEPTH == 8
PRED16x16_X(127, (1<<(BIT_DEPTH-1))-1)
PRED16x16_X(129, (1<<(BIT_DEPTH-1))+1)
#endif
    409 
/**
 * 16x16 plane prediction, shared between H.264 and the SVQ3/RV40
 * compatibility modes.  Fits a plane a + x*H + y*V through the edge
 * pixels (fixed-point, 5 fractional bits) and fills the block with the
 * clipped plane values.  The svq3/rv40 flags select the codec-specific
 * gradient scaling; with both zero this is standard H.264 plane mode.
 */
static inline void FUNCC(pred16x16_plane_compat)(uint8_t *_src,
                                                 ptrdiff_t _stride,
                                                 const int svq3,
                                                 const int rv40)
{
    int i, j, k;
    int a;
    INIT_CLIP
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel * const src0 = src +7-stride;      /* centre of the top edge */
    const pixel *       src1 = src +8*stride-1;    /* walks down the left edge */
    const pixel *       src2 = src1-2*stride;      /* == src+6*stride-1; walks up */
    int H = src0[1] - src0[-1];
    int V = src1[0] - src2[ 0];
    /* Weighted sums of symmetric edge differences -> raw gradients. */
    for(k=2; k<=8; ++k) {
        src1 += stride; src2 -= stride;
        H += k*(src0[k] - src0[-k]);
        V += k*(src1[0] - src2[ 0]);
    }
    if(svq3){
        H = ( 5*(H/4) ) / 16;
        V = ( 5*(V/4) ) / 16;

        /* required for 100% accuracy */
        i = H; H = V; V = i;
    }else if(rv40){
        H = ( H + (H>>2) ) >> 4;
        V = ( V + (V>>2) ) >> 4;
    }else{
        H = ( 5*H+32 ) >> 6;
        V = ( 5*V+32 ) >> 6;
    }

    /* Plane offset anchored at the bottom-left/top-right corner pixels. */
    a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
    for(j=16; j>0; --j) {
        int b = a;
        a += V;
        for(i=-16; i<0; i+=4) {
            src[16+i] = CLIP((b    ) >> 5);
            src[17+i] = CLIP((b+  H) >> 5);
            src[18+i] = CLIP((b+2*H) >> 5);
            src[19+i] = CLIP((b+3*H) >> 5);
            b += 4*H;
        }
        src += stride;
    }
}
    458 
/* Standard H.264 16x16 plane prediction: the compat routine with both
 * the SVQ3 and RV40 variants disabled. */
static void FUNCC(pred16x16_plane)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred16x16_plane_compat)(src, stride, 0, 0);
}
    463 
    464 static void FUNCC(pred8x8_vertical)(uint8_t *_src, ptrdiff_t _stride)
    465 {
    466    int i;
    467    pixel *src = (pixel*)_src;
    468    int stride = _stride>>(sizeof(pixel)-1);
    469    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
    470    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
    471 
    472    for(i=0; i<8; i++){
    473        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
    474        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
    475    }
    476 }
    477 
    478 static void FUNCC(pred8x16_vertical)(uint8_t *_src, ptrdiff_t _stride)
    479 {
    480    int i;
    481    pixel *src = (pixel*)_src;
    482    int stride = _stride>>(sizeof(pixel)-1);
    483    const pixel4 a= AV_RN4PA(((pixel4*)(src-stride))+0);
    484    const pixel4 b= AV_RN4PA(((pixel4*)(src-stride))+1);
    485 
    486    for(i=0; i<16; i++){
    487        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
    488        AV_WN4PA(((pixel4*)(src+i*stride))+1, b);
    489    }
    490 }
    491 
    492 static void FUNCC(pred8x8_horizontal)(uint8_t *_src, ptrdiff_t stride)
    493 {
    494    int i;
    495    pixel *src = (pixel*)_src;
    496    stride >>= sizeof(pixel)-1;
    497 
    498    for(i=0; i<8; i++){
    499        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
    500        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
    501        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
    502    }
    503 }
    504 
    505 static void FUNCC(pred8x16_horizontal)(uint8_t *_src, ptrdiff_t stride)
    506 {
    507    int i;
    508    pixel *src = (pixel*)_src;
    509    stride >>= sizeof(pixel)-1;
    510    for(i=0; i<16; i++){
    511        const pixel4 a = PIXEL_SPLAT_X4(src[-1+i*stride]);
    512        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);
    513        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);
    514    }
    515 }
    516 
/* Generator for constant-fill 8x8 DC predictors: pred8x8_<n>_dc fills
 * the block with value v.  The 127/129 variants are only meaningful at
 * 8-bit depth. */
#define PRED8x8_X(n, v)\
static void FUNCC(pred8x8_##n##_dc)(uint8_t *_src, ptrdiff_t stride)\
{\
    int i;\
    const pixel4 a = PIXEL_SPLAT_X4(v);\
    pixel *src = (pixel*)_src;\
    stride >>= sizeof(pixel)-1;\
    for(i=0; i<8; i++){\
        AV_WN4PA(((pixel4*)(src+i*stride))+0, a);\
        AV_WN4PA(((pixel4*)(src+i*stride))+1, a);\
    }\
}

PRED8x8_X(128, (1<<(BIT_DEPTH-1))+0)
#if BIT_DEPTH == 8
PRED8x8_X(127, (1<<(BIT_DEPTH-1))-1)
PRED8x8_X(129, (1<<(BIT_DEPTH-1))+1)
#endif
    535 
/* 8x16 mid-grey fill: apply the 8x8 variant to the top and bottom
 * halves (stride here is still the byte stride, so 8*stride skips
 * 8 rows). */
static void FUNCC(pred8x16_128_dc)(uint8_t *_src, ptrdiff_t stride)
{
    FUNCC(pred8x8_128_dc)(_src, stride);
    FUNCC(pred8x8_128_dc)(_src+8*stride, stride);
}
    541 
    542 static void FUNCC(pred8x8_left_dc)(uint8_t *_src, ptrdiff_t stride)
    543 {
    544    int i;
    545    int dc0, dc2;
    546    pixel4 dc0splat, dc2splat;
    547    pixel *src = (pixel*)_src;
    548    stride >>= sizeof(pixel)-1;
    549 
    550    dc0=dc2=0;
    551    for(i=0;i<4; i++){
    552        dc0+= src[-1+i*stride];
    553        dc2+= src[-1+(i+4)*stride];
    554    }
    555    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
    556    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
    557 
    558    for(i=0; i<4; i++){
    559        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
    560        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc0splat);
    561    }
    562    for(i=4; i<8; i++){
    563        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
    564        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc2splat);
    565    }
    566 }
    567 
/* 8x16 left-edge DC: apply the 8x8 variant to the top and bottom
 * halves (8*stride bytes == 8 rows). */
static void FUNCC(pred8x16_left_dc)(uint8_t *_src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(_src, stride);
    FUNCC(pred8x8_left_dc)(_src+8*stride, stride);
}
    573 
    574 static void FUNCC(pred8x8_top_dc)(uint8_t *_src, ptrdiff_t stride)
    575 {
    576    int i;
    577    int dc0, dc1;
    578    pixel4 dc0splat, dc1splat;
    579    pixel *src = (pixel*)_src;
    580    stride >>= sizeof(pixel)-1;
    581 
    582    dc0=dc1=0;
    583    for(i=0;i<4; i++){
    584        dc0+= src[i-stride];
    585        dc1+= src[4+i-stride];
    586    }
    587    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
    588    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
    589 
    590    for(i=0; i<4; i++){
    591        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
    592        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    593    }
    594    for(i=4; i<8; i++){
    595        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
    596        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    597    }
    598 }
    599 
    600 static void FUNCC(pred8x16_top_dc)(uint8_t *_src, ptrdiff_t stride)
    601 {
    602    int i;
    603    int dc0, dc1;
    604    pixel4 dc0splat, dc1splat;
    605    pixel *src = (pixel*)_src;
    606    stride >>= sizeof(pixel)-1;
    607 
    608    dc0=dc1=0;
    609    for(i=0;i<4; i++){
    610        dc0+= src[i-stride];
    611        dc1+= src[4+i-stride];
    612    }
    613    dc0splat = PIXEL_SPLAT_X4((dc0 + 2)>>2);
    614    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
    615 
    616    for(i=0; i<16; i++){
    617        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
    618        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    619    }
    620 }
    621 
    622 static void FUNCC(pred8x8_dc)(uint8_t *_src, ptrdiff_t stride)
    623 {
    624    int i;
    625    int dc0, dc1, dc2;
    626    pixel4 dc0splat, dc1splat, dc2splat, dc3splat;
    627    pixel *src = (pixel*)_src;
    628    stride >>= sizeof(pixel)-1;
    629 
    630    dc0=dc1=dc2=0;
    631    for(i=0;i<4; i++){
    632        dc0+= src[-1+i*stride] + src[i-stride];
    633        dc1+= src[4+i-stride];
    634        dc2+= src[-1+(i+4)*stride];
    635    }
    636    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
    637    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
    638    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
    639    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
    640 
    641    for(i=0; i<4; i++){
    642        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
    643        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    644    }
    645    for(i=4; i<8; i++){
    646        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
    647        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
    648    }
    649 }
    650 
    651 static void FUNCC(pred8x16_dc)(uint8_t *_src, ptrdiff_t stride)
    652 {
    653    int i;
    654    int dc0, dc1, dc2, dc3, dc4;
    655    pixel4 dc0splat, dc1splat, dc2splat, dc3splat, dc4splat, dc5splat, dc6splat, dc7splat;
    656    pixel *src = (pixel*)_src;
    657    stride >>= sizeof(pixel)-1;
    658 
    659    dc0=dc1=dc2=dc3=dc4=0;
    660    for(i=0;i<4; i++){
    661        dc0+= src[-1+i*stride] + src[i-stride];
    662        dc1+= src[4+i-stride];
    663        dc2+= src[-1+(i+4)*stride];
    664        dc3+= src[-1+(i+8)*stride];
    665        dc4+= src[-1+(i+12)*stride];
    666    }
    667    dc0splat = PIXEL_SPLAT_X4((dc0 + 4)>>3);
    668    dc1splat = PIXEL_SPLAT_X4((dc1 + 2)>>2);
    669    dc2splat = PIXEL_SPLAT_X4((dc2 + 2)>>2);
    670    dc3splat = PIXEL_SPLAT_X4((dc1 + dc2 + 4)>>3);
    671    dc4splat = PIXEL_SPLAT_X4((dc3 + 2)>>2);
    672    dc5splat = PIXEL_SPLAT_X4((dc1 + dc3 + 4)>>3);
    673    dc6splat = PIXEL_SPLAT_X4((dc4 + 2)>>2);
    674    dc7splat = PIXEL_SPLAT_X4((dc1 + dc4 + 4)>>3);
    675 
    676    for(i=0; i<4; i++){
    677        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc0splat);
    678        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc1splat);
    679    }
    680    for(i=4; i<8; i++){
    681        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc2splat);
    682        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc3splat);
    683    }
    684    for(i=8; i<12; i++){
    685        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc4splat);
    686        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc5splat);
    687    }
    688    for(i=12; i<16; i++){
    689        AV_WN4PA(((pixel4*)(src+i*stride))+0, dc6splat);
    690        AV_WN4PA(((pixel4*)(src+i*stride))+1, dc7splat);
    691    }
    692 }
    693 
/* The following functions should not be optimized!
 * Each builds a "mad cow" mixed DC prediction: a full-block DC
 * prediction followed by overwriting one or two 4x4 sub-blocks with a
 * different DC variant.  The suffix encodes which sub-block sources
 * are used (l = left edge, t = top edge, 0 = none/mid-grey).
 * `stride` is the byte stride here, hence the 4*sizeof(pixel) column
 * offsets. */
static void FUNC(pred8x8_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_top_dc)(src, stride);
    FUNCC(pred4x4_dc)(src, NULL, stride);
}

static void FUNC(pred8x16_mad_cow_dc_l0t)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_top_dc)(src, stride);
    FUNCC(pred4x4_dc)(src, NULL, stride);
}

static void FUNC(pred8x8_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_dc)(src, stride);
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
}

static void FUNC(pred8x16_mad_cow_dc_0lt)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_dc)(src, stride);
    FUNCC(pred4x4_top_dc)(src, NULL, stride);
}

static void FUNC(pred8x8_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(src, stride);
    /* Bottom half: mid-grey 4x4 fills. */
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}

static void FUNC(pred8x16_mad_cow_dc_l00)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*stride + 4*sizeof(pixel), NULL, stride);
}

static void FUNC(pred8x8_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x8_left_dc)(src, stride);
    /* Top half: mid-grey 4x4 fills. */
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}

static void FUNC(pred8x16_mad_cow_dc_0l0)(uint8_t *src, ptrdiff_t stride)
{
    FUNCC(pred8x16_left_dc)(src, stride);
    FUNCC(pred4x4_128_dc)(src                  , NULL, stride);
    FUNCC(pred4x4_128_dc)(src + 4*sizeof(pixel), NULL, stride);
}
    746 
/**
 * 8x8 plane prediction: fits a plane a + x*H + y*V through the edge
 * pixels (fixed-point, 5 fractional bits) and fills the block with the
 * clipped plane values.
 */
static void FUNCC(pred8x8_plane)(uint8_t *_src, ptrdiff_t _stride)
{
    int j, k;
    int a;
    INIT_CLIP
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel * const src0 = src +3-stride;    /* centre of the top edge */
    const pixel *       src1 = src +4*stride-1;  /* walks down the left edge */
    const pixel *       src2 = src1-2*stride;    /* == src+2*stride-1; walks up */
    int H = src0[1] - src0[-1];
    int V = src1[0] - src2[ 0];
    /* Weighted sums of symmetric edge differences -> raw gradients. */
    for(k=2; k<=4; ++k) {
        src1 += stride; src2 -= stride;
        H += k*(src0[k] - src0[-k]);
        V += k*(src1[0] - src2[ 0]);
    }
    H = ( 17*H+16 ) >> 5;
    V = ( 17*V+16 ) >> 5;

    /* Plane offset anchored at the corner neighbour pixels. */
    a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
    for(j=8; j>0; --j) {
        int b = a;
        a += V;
        src[0] = CLIP((b    ) >> 5);
        src[1] = CLIP((b+  H) >> 5);
        src[2] = CLIP((b+2*H) >> 5);
        src[3] = CLIP((b+3*H) >> 5);
        src[4] = CLIP((b+4*H) >> 5);
        src[5] = CLIP((b+5*H) >> 5);
        src[6] = CLIP((b+6*H) >> 5);
        src[7] = CLIP((b+7*H) >> 5);
        src += stride;
    }
}
    782 
/**
 * 8x16 plane prediction: like pred8x8_plane but with a 16-row left
 * edge.  H uses the 4-pixel top half-edges; V accumulates over the
 * full 8 symmetric row pairs, hence the different normalisations
 * (17/32 for H, 5/64 for V).
 */
static void FUNCC(pred8x16_plane)(uint8_t *_src, ptrdiff_t _stride)
{
    int j, k;
    int a;
    INIT_CLIP
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    const pixel * const src0 = src +3-stride;    /* centre of the top edge */
    const pixel *       src1 = src +8*stride-1;  /* walks down the left edge */
    const pixel *       src2 = src1-2*stride;    /* == src+6*stride-1; walks up */
    int H = src0[1] - src0[-1];
    int V = src1[0] - src2[ 0];

    for (k = 2; k <= 4; ++k) {
        src1 += stride; src2 -= stride;
        H += k*(src0[k] - src0[-k]);
        V += k*(src1[0] - src2[ 0]);
    }
    /* The top edge is only 8 wide: rows 5..8 contribute to V only. */
    for (; k <= 8; ++k) {
        src1 += stride; src2 -= stride;
        V += k*(src1[0] - src2[0]);
    }

    H = (17*H+16) >> 5;
    V = (5*V+32) >> 6;

    a = 16*(src1[0] + src2[8] + 1) - 7*V - 3*H;
    for(j=16; j>0; --j) {
        int b = a;
        a += V;
        src[0] = CLIP((b    ) >> 5);
        src[1] = CLIP((b+  H) >> 5);
        src[2] = CLIP((b+2*H) >> 5);
        src[3] = CLIP((b+3*H) >> 5);
        src[4] = CLIP((b+4*H) >> 5);
        src[5] = CLIP((b+5*H) >> 5);
        src[6] = CLIP((b+6*H) >> 5);
        src[7] = CLIP((b+7*H) >> 5);
        src += stride;
    }
}
    824 
/* Helpers for the 8x8 luma (pred8x8l) predictors.  H.264 filters the
 * reference samples before 8x8 prediction: the LOAD macros declare
 * locals l0..l7 / t0..t15 / lt holding the 3-tap smoothed edge pixels.
 * They expect `src`, `stride`, `has_topleft` and `has_topright` in the
 * caller's scope; when a corner/edge sample is unavailable the nearest
 * available sample is substituted in the filter. */
#define SRC(x,y) src[(x)+(y)*stride]
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 av_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 av_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* Fill an 8x8 block with the splatted DC value v; declares its own
 * loop variable and advances `src` one row per iteration. */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        AV_WN4PA(((pixel4*)src)+0, v); \
        AV_WN4PA(((pixel4*)src)+1, v); \
        src += stride; \
    }
    862 
    863 static void FUNCC(pred8x8l_128_dc)(uint8_t *_src, int has_topleft,
    864                                   int has_topright, ptrdiff_t _stride)
    865 {
    866    pixel *src = (pixel*)_src;
    867    int stride = _stride>>(sizeof(pixel)-1);
    868 
    869    PREDICT_8x8_DC(PIXEL_SPLAT_X4(1<<(BIT_DEPTH-1)));
    870 }
    871 static void FUNCC(pred8x8l_left_dc)(uint8_t *_src, int has_topleft,
    872                                    int has_topright, ptrdiff_t _stride)
    873 {
    874    pixel *src = (pixel*)_src;
    875    int stride = _stride>>(sizeof(pixel)-1);
    876 
    877    PREDICT_8x8_LOAD_LEFT;
    878    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3);
    879    PREDICT_8x8_DC(dc);
    880 }
    881 static void FUNCC(pred8x8l_top_dc)(uint8_t *_src, int has_topleft,
    882                                   int has_topright, ptrdiff_t _stride)
    883 {
    884    pixel *src = (pixel*)_src;
    885    int stride = _stride>>(sizeof(pixel)-1);
    886 
    887    PREDICT_8x8_LOAD_TOP;
    888    const pixel4 dc = PIXEL_SPLAT_X4((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3);
    889    PREDICT_8x8_DC(dc);
    890 }
    891 static void FUNCC(pred8x8l_dc)(uint8_t *_src, int has_topleft,
    892                               int has_topright, ptrdiff_t _stride)
    893 {
    894    pixel *src = (pixel*)_src;
    895    int stride = _stride>>(sizeof(pixel)-1);
    896 
    897    PREDICT_8x8_LOAD_LEFT;
    898    PREDICT_8x8_LOAD_TOP;
    899    const pixel4 dc = PIXEL_SPLAT_X4((l0+l1+l2+l3+l4+l5+l6+l7
    900                                     +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4);
    901    PREDICT_8x8_DC(dc);
    902 }
    903 static void FUNCC(pred8x8l_horizontal)(uint8_t *_src, int has_topleft,
    904                                       int has_topright, ptrdiff_t _stride)
    905 {
    906    pixel *src = (pixel*)_src;
    907    int stride = _stride>>(sizeof(pixel)-1);
    908    pixel4 a;
    909 
    910    PREDICT_8x8_LOAD_LEFT;
    911 #define ROW(y) a = PIXEL_SPLAT_X4(l##y); \
    912               AV_WN4PA(src+y*stride, a); \
    913               AV_WN4PA(src+y*stride+4, a);
    914    ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
    915 #undef ROW
    916 }
    917 static void FUNCC(pred8x8l_vertical)(uint8_t *_src, int has_topleft,
    918                                     int has_topright, ptrdiff_t _stride)
    919 {
    920    int y;
    921    pixel *src = (pixel*)_src;
    922    int stride = _stride>>(sizeof(pixel)-1);
    923    pixel4 a, b;
    924 
    925    PREDICT_8x8_LOAD_TOP;
    926    src[0] = t0;
    927    src[1] = t1;
    928    src[2] = t2;
    929    src[3] = t3;
    930    src[4] = t4;
    931    src[5] = t5;
    932    src[6] = t6;
    933    src[7] = t7;
    934    a = AV_RN4PA(((pixel4*)src)+0);
    935    b = AV_RN4PA(((pixel4*)src)+1);
    936    for( y = 1; y < 8; y++ ) {
    937        AV_WN4PA(((pixel4*)(src+y*stride))+0, a);
    938        AV_WN4PA(((pixel4*)(src+y*stride))+1, b);
    939    }
    940 }
/**
 * 8x8 luma diagonal-down-left prediction (H.264 8x8 intra mode 3).
 * Every anti-diagonal (constant x+y) of the block is one (1,2,1)-filtered
 * value from the top / top-right samples t0..t15; the unrolled assignments
 * below fill each diagonal in a single statement.
 */
static void FUNCC(pred8x8l_down_left)(uint8_t *_src, int has_topleft,
                                      int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
    /* bottom-right corner: edge-replicated filter tap */
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
}
/**
 * 8x8 luma diagonal-down-right prediction (H.264 8x8 intra mode 4).
 * Each diagonal (constant x-y) is a (1,2,1)-filtered value running through
 * the left column (l7..l0), the top-left corner (lt) and the top row (t0..t7).
 */
static void FUNCC(pred8x8l_down_right)(uint8_t *_src, int has_topleft,
                                       int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
    /* main diagonal crosses the top-left corner sample */
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
}
/**
 * 8x8 luma vertical-right prediction (H.264 8x8 intra mode 5).
 * Alternating half-pel averages ((a+b+1)>>1) and quarter-pel 3-tap filters
 * ((a+2b+c+2)>>2) along a direction two rows down per column left; pixels
 * sharing a prediction value are assigned in one statement.
 */
static void FUNCC(pred8x8l_vertical_right)(uint8_t *_src, int has_topleft,
                                           int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(7,0)= (t6 + t7 + 1) >> 1;
}
/**
 * 8x8 luma horizontal-down prediction (H.264 8x8 intra mode 6).
 * Mirror of vertical-right: alternating 2-tap averages and 3-tap filters
 * along a direction two columns right per row up, drawn from the left column,
 * the top-left corner and the top row.
 */
static void FUNCC(pred8x8l_horizontal_down)(uint8_t *_src, int has_topleft,
                                            int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l6 + l7 + 1) >> 1;
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
/**
 * 8x8 luma vertical-left prediction (H.264 8x8 intra mode 7).
 * Even rows hold 2-tap averages and odd rows 3-tap filtered values of the
 * top / top-right samples t0..t12, shifted one pixel left every two rows.
 */
static void FUNCC(pred8x8l_vertical_left)(uint8_t *_src, int has_topleft,
                                          int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + t1 + 1) >> 1;
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(7,6)= (t10 + t11 + 1) >> 1;
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
}
/**
 * 8x8 luma horizontal-up prediction (H.264 8x8 intra mode 8).
 * Uses only the filtered left column l0..l7; beyond the last usable source
 * sample, the remaining bottom-right pixels are flat-filled with l7 as the
 * spec requires.
 */
static void FUNCC(pred8x8l_horizontal_up)(uint8_t *_src, int has_topleft,
                                          int has_topright, ptrdiff_t _stride)
{
    pixel *src = (pixel*)_src;
    int stride = _stride>>(sizeof(pixel)-1);
    PREDICT_8x8_LOAD_LEFT;
    SRC(0,0)= (l0 + l1 + 1) >> 1;
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
    /* everything past the last left sample is replicated l7 */
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
}
   1105 
   1106 static void FUNCC(pred8x8l_vertical_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
   1107                                     int has_topright, ptrdiff_t _stride)
   1108 {
   1109    int i;
   1110    pixel *src = (pixel*)_src;
   1111    const dctcoef *block = (const dctcoef*)_block;
   1112    pixel pix[8];
   1113    int stride = _stride>>(sizeof(pixel)-1);
   1114    PREDICT_8x8_LOAD_TOP;
   1115 
   1116    pix[0] = t0;
   1117    pix[1] = t1;
   1118    pix[2] = t2;
   1119    pix[3] = t3;
   1120    pix[4] = t4;
   1121    pix[5] = t5;
   1122    pix[6] = t6;
   1123    pix[7] = t7;
   1124 
   1125    for(i=0; i<8; i++){
   1126        pixel v = pix[i];
   1127        src[0*stride]= v += block[0];
   1128        src[1*stride]= v += block[8];
   1129        src[2*stride]= v += block[16];
   1130        src[3*stride]= v += block[24];
   1131        src[4*stride]= v += block[32];
   1132        src[5*stride]= v += block[40];
   1133        src[6*stride]= v += block[48];
   1134        src[7*stride]= v +  block[56];
   1135        src++;
   1136        block++;
   1137    }
   1138 
   1139    memset(_block, 0, sizeof(dctcoef) * 64);
   1140 }
   1141 
   1142 static void FUNCC(pred8x8l_horizontal_filter_add)(uint8_t *_src, int16_t *_block, int has_topleft,
   1143                               int has_topright, ptrdiff_t _stride)
   1144 {
   1145    int i;
   1146    pixel *src = (pixel*)_src;
   1147    const dctcoef *block = (const dctcoef*)_block;
   1148    pixel pix[8];
   1149    int stride = _stride>>(sizeof(pixel)-1);
   1150    PREDICT_8x8_LOAD_LEFT;
   1151 
   1152    pix[0] = l0;
   1153    pix[1] = l1;
   1154    pix[2] = l2;
   1155    pix[3] = l3;
   1156    pix[4] = l4;
   1157    pix[5] = l5;
   1158    pix[6] = l6;
   1159    pix[7] = l7;
   1160 
   1161    for(i=0; i<8; i++){
   1162        pixel v = pix[i];
   1163        src[0]= v += block[0];
   1164        src[1]= v += block[1];
   1165        src[2]= v += block[2];
   1166        src[3]= v += block[3];
   1167        src[4]= v += block[4];
   1168        src[5]= v += block[5];
   1169        src[6]= v += block[6];
   1170        src[7]= v +  block[7];
   1171        src+= stride;
   1172        block+= 8;
   1173    }
   1174 
   1175    memset(_block, 0, sizeof(dctcoef) * 64);
   1176 }
   1177 
/* The 8x8 helper macros above are local to this template file; retire them
 * here so they cannot leak into later code. */
#undef PREDICT_8x8_LOAD_LEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_DC
#undef PTR
#undef PT
#undef PL
#undef SRC
   1187 
   1188 static void FUNCC(pred4x4_vertical_add)(uint8_t *_pix, int16_t *_block,
   1189                                        ptrdiff_t stride)
   1190 {
   1191    int i;
   1192    pixel *pix = (pixel*)_pix;
   1193    const dctcoef *block = (const dctcoef*)_block;
   1194    stride >>= sizeof(pixel)-1;
   1195    pix -= stride;
   1196    for(i=0; i<4; i++){
   1197        pixel v = pix[0];
   1198        pix[1*stride]= v += block[0];
   1199        pix[2*stride]= v += block[4];
   1200        pix[3*stride]= v += block[8];
   1201        pix[4*stride]= v +  block[12];
   1202        pix++;
   1203        block++;
   1204    }
   1205 
   1206    memset(_block, 0, sizeof(dctcoef) * 16);
   1207 }
   1208 
   1209 static void FUNCC(pred4x4_horizontal_add)(uint8_t *_pix, int16_t *_block,
   1210                                          ptrdiff_t stride)
   1211 {
   1212    int i;
   1213    pixel *pix = (pixel*)_pix;
   1214    const dctcoef *block = (const dctcoef*)_block;
   1215    stride >>= sizeof(pixel)-1;
   1216    for(i=0; i<4; i++){
   1217        pixel v = pix[-1];
   1218        pix[0]= v += block[0];
   1219        pix[1]= v += block[1];
   1220        pix[2]= v += block[2];
   1221        pix[3]= v +  block[3];
   1222        pix+= stride;
   1223        block+= 4;
   1224    }
   1225 
   1226    memset(_block, 0, sizeof(dctcoef) * 16);
   1227 }
   1228 
   1229 static void FUNCC(pred8x8l_vertical_add)(uint8_t *_pix, int16_t *_block,
   1230                                         ptrdiff_t stride)
   1231 {
   1232    int i;
   1233    pixel *pix = (pixel*)_pix;
   1234    const dctcoef *block = (const dctcoef*)_block;
   1235    stride >>= sizeof(pixel)-1;
   1236    pix -= stride;
   1237    for(i=0; i<8; i++){
   1238        pixel v = pix[0];
   1239        pix[1*stride]= v += block[0];
   1240        pix[2*stride]= v += block[8];
   1241        pix[3*stride]= v += block[16];
   1242        pix[4*stride]= v += block[24];
   1243        pix[5*stride]= v += block[32];
   1244        pix[6*stride]= v += block[40];
   1245        pix[7*stride]= v += block[48];
   1246        pix[8*stride]= v +  block[56];
   1247        pix++;
   1248        block++;
   1249    }
   1250 
   1251    memset(_block, 0, sizeof(dctcoef) * 64);
   1252 }
   1253 
   1254 static void FUNCC(pred8x8l_horizontal_add)(uint8_t *_pix, int16_t *_block,
   1255                                           ptrdiff_t stride)
   1256 {
   1257    int i;
   1258    pixel *pix = (pixel*)_pix;
   1259    const dctcoef *block = (const dctcoef*)_block;
   1260    stride >>= sizeof(pixel)-1;
   1261    for(i=0; i<8; i++){
   1262        pixel v = pix[-1];
   1263        pix[0]= v += block[0];
   1264        pix[1]= v += block[1];
   1265        pix[2]= v += block[2];
   1266        pix[3]= v += block[3];
   1267        pix[4]= v += block[4];
   1268        pix[5]= v += block[5];
   1269        pix[6]= v += block[6];
   1270        pix[7]= v +  block[7];
   1271        pix+= stride;
   1272        block+= 8;
   1273    }
   1274 
   1275    memset(_block, 0, sizeof(dctcoef) * 64);
   1276 }
   1277 
   1278 static void FUNCC(pred16x16_vertical_add)(uint8_t *pix, const int *block_offset,
   1279                                          int16_t *block,
   1280                                          ptrdiff_t stride)
   1281 {
   1282    int i;
   1283    for(i=0; i<16; i++)
   1284        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
   1285 }
   1286 
   1287 static void FUNCC(pred16x16_horizontal_add)(uint8_t *pix,
   1288                                            const int *block_offset,
   1289                                            int16_t *block,
   1290                                            ptrdiff_t stride)
   1291 {
   1292    int i;
   1293    for(i=0; i<16; i++)
   1294        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
   1295 }
   1296 
   1297 static void FUNCC(pred8x8_vertical_add)(uint8_t *pix, const int *block_offset,
   1298                                        int16_t *block, ptrdiff_t stride)
   1299 {
   1300    int i;
   1301    for(i=0; i<4; i++)
   1302        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
   1303 }
   1304 
   1305 static void FUNCC(pred8x16_vertical_add)(uint8_t *pix, const int *block_offset,
   1306                                         int16_t *block, ptrdiff_t stride)
   1307 {
   1308    int i;
   1309    for(i=0; i<4; i++)
   1310        FUNCC(pred4x4_vertical_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
   1311    for(i=4; i<8; i++)
   1312        FUNCC(pred4x4_vertical_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
   1313 }
   1314 
   1315 static void FUNCC(pred8x8_horizontal_add)(uint8_t *pix, const int *block_offset,
   1316                                          int16_t *block,
   1317                                          ptrdiff_t stride)
   1318 {
   1319    int i;
   1320    for(i=0; i<4; i++)
   1321        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
   1322 }
   1323 
   1324 static void FUNCC(pred8x16_horizontal_add)(uint8_t *pix,
   1325                                           const int *block_offset,
   1326                                           int16_t *block, ptrdiff_t stride)
   1327 {
   1328    int i;
   1329    for(i=0; i<4; i++)
   1330        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i], block + i*16*sizeof(pixel), stride);
   1331    for(i=4; i<8; i++)
   1332        FUNCC(pred4x4_horizontal_add)(pix + block_offset[i+4], block + i*16*sizeof(pixel), stride);
   1333 }