tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ipred.S (214835B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2019, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 
     31 // void ipred_dc_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
     32 //                             const pixel *const topleft,
     33 //                             const int width, const int height, const int a,
     34 //                             const int max_width, const int max_height);
     35 function ipred_dc_128_8bpc_neon, export=1
        // DC_128 prediction: no usable neighbours, so the whole WxH block is
        // filled with the 8bpc mid-grey value 128.
        // In: x0=dst, x1=stride, w3=width (4..64, power of two), w4=height.
        // clz(width) is 25..29 for widths 64..4; subtracting 25 gives the
        // word index into ipred_dc_128_tbl (largest width first).
     36        clz             w3,  w3
     37        movrel          x5,  ipred_dc_128_tbl
     38        sub             w3,  w3,  #25
     39        ldrsw           x3,  [x5, w3, uxtw #2]          // table-relative offset
     40        movi            v0.16b,  #128                   // fill value
     41        add             x5,  x5,  x3                    // absolute branch target
        // Two row pointers (x0, x6) advance by 2*stride each, so every loop
        // iteration below writes 4 rows.
     42        add             x6,  x0,  x1
     43        lsl             x1,  x1,  #1
     44        br              x5
     45 40:
     46        AARCH64_VALID_JUMP_TARGET
     47 4:
        // width 4: store one 32-bit lane per row.
     48        st1             {v0.s}[0],  [x0], x1
     49        st1             {v0.s}[0],  [x6], x1
     50        subs            w4,  w4,  #4                    // 4 rows per iteration
     51        st1             {v0.s}[0],  [x0], x1
     52        st1             {v0.s}[0],  [x6], x1
     53        b.gt            4b
     54        ret
     55 80:
     56        AARCH64_VALID_JUMP_TARGET
     57 8:
        // width 8: 8 bytes per row.
     58        st1             {v0.8b},  [x0], x1
     59        st1             {v0.8b},  [x6], x1
     60        subs            w4,  w4,  #4
     61        st1             {v0.8b},  [x0], x1
     62        st1             {v0.8b},  [x6], x1
     63        b.gt            8b
     64        ret
     65 160:
     66        AARCH64_VALID_JUMP_TARGET
     67 16:
        // width 16: one full q register per row.
     68        st1             {v0.16b}, [x0], x1
     69        st1             {v0.16b}, [x6], x1
     70        subs            w4,  w4,  #4
     71        st1             {v0.16b}, [x0], x1
     72        st1             {v0.16b}, [x6], x1
     73        b.gt            16b
     74        ret
     75 320:
     76        AARCH64_VALID_JUMP_TARGET
     77        movi            v1.16b,  #128                   // second 16-byte half
     78 32:
     79        st1             {v0.16b, v1.16b}, [x0], x1
     80        st1             {v0.16b, v1.16b}, [x6], x1
     81        subs            w4,  w4,  #4
     82        st1             {v0.16b, v1.16b}, [x0], x1
     83        st1             {v0.16b, v1.16b}, [x6], x1
     84        b.gt            32b
     85        ret
     86 640:
     87        AARCH64_VALID_JUMP_TARGET
        // width 64: replicate the fill value into four q registers.
     88        movi            v1.16b,  #128
     89        movi            v2.16b,  #128
     90        movi            v3.16b,  #128
     91 64:
     92        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
     93        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
     94        subs            w4,  w4,  #4
     95        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
     96        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
     97        b.gt            64b
     98        ret
     99 endfunc
    100 
        // Width-dispatch table: 32-bit table-relative offsets, ordered from
        // the largest width (index 0 = w64) so clz(width)-25 indexes directly.
    101 jumptable ipred_dc_128_tbl
    102        .word 640b - ipred_dc_128_tbl
    103        .word 320b - ipred_dc_128_tbl
    104        .word 160b - ipred_dc_128_tbl
    105        .word 80b  - ipred_dc_128_tbl
    106        .word 40b  - ipred_dc_128_tbl
    107 endjumptable
    108 
    109 // void ipred_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
    110 //                        const pixel *const topleft,
    111 //                        const int width, const int height, const int a,
    112 //                        const int max_width, const int max_height);
    113 function ipred_v_8bpc_neon, export=1
        // Vertical prediction: copy the row of W pixels above the block
        // (starting at topleft+1) into every one of the H output rows.
        // In: x0=dst, x1=stride, x2=topleft, w3=width, w4=height.
    114        clz             w3,  w3
    115        movrel          x5,  ipred_v_tbl
    116        sub             w3,  w3,  #25                   // table index from clz(width)
    117        ldrsw           x3,  [x5, w3, uxtw #2]
    118        add             x2,  x2,  #1                    // skip topleft to the top row
    119        add             x5,  x5,  x3
        // Two interleaved row pointers; 4 rows stored per loop iteration.
    120        add             x6,  x0,  x1
    121        lsl             x1,  x1,  #1
    122        br              x5
    123 40:
    124        AARCH64_VALID_JUMP_TARGET
    125        ld1             {v0.s}[0],  [x2]                // 4 top pixels
    126 4:
    127        st1             {v0.s}[0],  [x0], x1
    128        st1             {v0.s}[0],  [x6], x1
    129        subs            w4,  w4,  #4
    130        st1             {v0.s}[0],  [x0], x1
    131        st1             {v0.s}[0],  [x6], x1
    132        b.gt            4b
    133        ret
    134 80:
    135        AARCH64_VALID_JUMP_TARGET
    136        ld1             {v0.8b},  [x2]                  // 8 top pixels
    137 8:
    138        st1             {v0.8b},  [x0], x1
    139        st1             {v0.8b},  [x6], x1
    140        subs            w4,  w4,  #4
    141        st1             {v0.8b},  [x0], x1
    142        st1             {v0.8b},  [x6], x1
    143        b.gt            8b
    144        ret
    145 160:
    146        AARCH64_VALID_JUMP_TARGET
    147        ld1             {v0.16b}, [x2]                  // 16 top pixels
    148 16:
    149        st1             {v0.16b}, [x0], x1
    150        st1             {v0.16b}, [x6], x1
    151        subs            w4,  w4,  #4
    152        st1             {v0.16b}, [x0], x1
    153        st1             {v0.16b}, [x6], x1
    154        b.gt            16b
    155        ret
    156 320:
    157        AARCH64_VALID_JUMP_TARGET
    158        ld1             {v0.16b, v1.16b}, [x2]          // 32 top pixels
    159 32:
    160        st1             {v0.16b, v1.16b}, [x0], x1
    161        st1             {v0.16b, v1.16b}, [x6], x1
    162        subs            w4,  w4,  #4
    163        st1             {v0.16b, v1.16b}, [x0], x1
    164        st1             {v0.16b, v1.16b}, [x6], x1
    165        b.gt            32b
    166        ret
    167 640:
    168        AARCH64_VALID_JUMP_TARGET
    169        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // 64 top pixels
    170 64:
    171        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
    172        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
    173        subs            w4,  w4,  #4
    174        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
    175        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
    176        b.gt            64b
    177        ret
    178 endfunc
    179 
        // Width-dispatch table for ipred_v, largest width first
        // (indexed by clz(width)-25).
    180 jumptable ipred_v_tbl
    181        .word 640b - ipred_v_tbl
    182        .word 320b - ipred_v_tbl
    183        .word 160b - ipred_v_tbl
    184        .word 80b  - ipred_v_tbl
    185        .word 40b  - ipred_v_tbl
    186 endjumptable
    187 
    188 // void ipred_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
    189 //                        const pixel *const topleft,
    190 //                        const int width, const int height, const int a,
    191 //                        const int max_width, const int max_height);
    192 function ipred_h_8bpc_neon, export=1
        // Horizontal prediction: each output row y is filled with the single
        // left-neighbour pixel topleft[-1-y].
        // In: x0=dst, x1=stride, x2=topleft, w3=width, w4=height.
    193        clz             w3,  w3
    194        movrel          x5,  ipred_h_tbl
    195        sub             w3,  w3,  #25                   // table index from clz(width)
    196        ldrsw           x3,  [x5, w3, uxtw #2]
        // Walk the left edge 4 pixels at a time, downwards in the image =
        // backwards in memory: start at topleft-4 and step by -4 (x7).
    197        sub             x2,  x2,  #4
    198        add             x5,  x5,  x3
    199        mov             x7,  #-4
    200        add             x6,  x0,  x1
    201        lsl             x1,  x1,  #1                    // 4 rows per iteration
    202        br              x5
    203 40:
    204        AARCH64_VALID_JUMP_TARGET
    205 4:
        // ld4r broadcasts 4 consecutive left pixels into v0..v3; v3 holds the
        // highest address = topleft[-1] = first row, hence stored first.
    206        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
    207        st1             {v3.s}[0],  [x0], x1
    208        st1             {v2.s}[0],  [x6], x1
    209        subs            w4,  w4,  #4
    210        st1             {v1.s}[0],  [x0], x1
    211        st1             {v0.s}[0],  [x6], x1
    212        b.gt            4b
    213        ret
    214 80:
    215        AARCH64_VALID_JUMP_TARGET
    216 8:
    217        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
    218        st1             {v3.8b},  [x0], x1
    219        st1             {v2.8b},  [x6], x1
    220        subs            w4,  w4,  #4
    221        st1             {v1.8b},  [x0], x1
    222        st1             {v0.8b},  [x6], x1
    223        b.gt            8b
    224        ret
    225 160:
    226        AARCH64_VALID_JUMP_TARGET
    227 16:
    228        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
    229        st1             {v3.16b}, [x0], x1
    230        st1             {v2.16b}, [x6], x1
    231        subs            w4,  w4,  #4
    232        st1             {v1.16b}, [x0], x1
    233        st1             {v0.16b}, [x6], x1
    234        b.gt            16b
    235        ret
    236 320:
    237        AARCH64_VALID_JUMP_TARGET
    238 32:
        // width 32: plain str for bytes 16..31, then st1 with post-increment
        // for bytes 0..15 so the pointer advances by one stride.
    239        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
    240        str             q3,  [x0, #16]
    241        str             q2,  [x6, #16]
    242        st1             {v3.16b}, [x0], x1
    243        st1             {v2.16b}, [x6], x1
    244        subs            w4,  w4,  #4
    245        str             q1,  [x0, #16]
    246        str             q0,  [x6, #16]
    247        st1             {v1.16b}, [x0], x1
    248        st1             {v0.16b}, [x6], x1
    249        b.gt            32b
    250        ret
    251 640:
    252        AARCH64_VALID_JUMP_TARGET
    253 64:
        // width 64: str/stp cover bytes 16..63, st1 stores 0..15 and bumps
        // the row pointer.
    254        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
    255        str             q3,  [x0, #16]
    256        str             q2,  [x6, #16]
    257        stp             q3,  q3,  [x0, #32]
    258        stp             q2,  q2,  [x6, #32]
    259        st1             {v3.16b}, [x0], x1
    260        st1             {v2.16b}, [x6], x1
    261        subs            w4,  w4,  #4
    262        str             q1,  [x0, #16]
    263        str             q0,  [x6, #16]
    264        stp             q1,  q1,  [x0, #32]
    265        stp             q0,  q0,  [x6, #32]
    266        st1             {v1.16b}, [x0], x1
    267        st1             {v0.16b}, [x6], x1
    268        b.gt            64b
    269        ret
    270 endfunc
    271 
        // Width-dispatch table for ipred_h, largest width first
        // (indexed by clz(width)-25).
    272 jumptable ipred_h_tbl
    273        .word 640b - ipred_h_tbl
    274        .word 320b - ipred_h_tbl
    275        .word 160b - ipred_h_tbl
    276        .word 80b  - ipred_h_tbl
    277        .word 40b  - ipred_h_tbl
    278 endjumptable
    279 
    280 // void ipred_dc_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
    281 //                             const pixel *const topleft,
    282 //                             const int width, const int height, const int a,
    283 //                             const int max_width, const int max_height);
    284 function ipred_dc_top_8bpc_neon, export=1
        // DC_TOP prediction: fill the block with the rounded average of the
        // W pixels above it (topleft+1 .. topleft+W).
        // In: x0=dst, x1=stride, x2=topleft, w3=width, w4=height.
    285        clz             w3,  w3
    286        movrel          x5,  ipred_dc_top_tbl
    287        sub             w3,  w3,  #25                   // table index from clz(width)
    288        ldrsw           x3,  [x5, w3, uxtw #2]
    289        add             x2,  x2,  #1                    // point at the top row
    290        add             x5,  x5,  x3
    291        add             x6,  x0,  x1                    // 4 rows per loop iteration
    292        lsl             x1,  x1,  #1
    293        br              x5
    294 40:
    295        AARCH64_VALID_JUMP_TARGET
        // w=4: load the 4 pixels twice (ld1r of one .2s lane pair), so the
        // 8-byte sum is 2*sum(top); rshrn #3 = rounded divide by 8 = /(2*4).
    296        ld1r            {v0.2s},  [x2]
    297        uaddlv          h0,      v0.8b
    298        rshrn           v0.8b,   v0.8h,   #3
    299        dup             v0.8b,   v0.b[0]                // broadcast the DC value
    300 4:
    301        st1             {v0.s}[0],  [x0], x1
    302        st1             {v0.s}[0],  [x6], x1
    303        subs            w4,  w4,  #4
    304        st1             {v0.s}[0],  [x0], x1
    305        st1             {v0.s}[0],  [x6], x1
    306        b.gt            4b
    307        ret
    308 80:
    309        AARCH64_VALID_JUMP_TARGET
    310        ld1             {v0.8b},  [x2]
    311        uaddlv          h0,      v0.8b                  // sum of 8 top pixels
    312        rshrn           v0.8b,   v0.8h,   #3            // rounded /8
    313        dup             v0.8b,   v0.b[0]
    314 8:
    315        st1             {v0.8b},  [x0], x1
    316        st1             {v0.8b},  [x6], x1
    317        subs            w4,  w4,  #4
    318        st1             {v0.8b},  [x0], x1
    319        st1             {v0.8b},  [x6], x1
    320        b.gt            8b
    321        ret
    322 160:
    323        AARCH64_VALID_JUMP_TARGET
    324        ld1             {v0.16b}, [x2]
    325        uaddlv          h0,      v0.16b                 // sum of 16 top pixels
    326        rshrn           v0.8b,   v0.8h,   #4            // rounded /16
    327        dup             v0.16b,  v0.b[0]
    328 16:
    329        st1             {v0.16b}, [x0], x1
    330        st1             {v0.16b}, [x6], x1
    331        subs            w4,  w4,  #4
    332        st1             {v0.16b}, [x0], x1
    333        st1             {v0.16b}, [x6], x1
    334        b.gt            16b
    335        ret
    336 320:
    337        AARCH64_VALID_JUMP_TARGET
    338        ld1             {v0.16b, v1.16b}, [x2]
    339        uaddlv          h0,      v0.16b
    340        uaddlv          h1,      v1.16b
    341        add             v2.4h,   v0.4h,   v1.4h         // sum of 32 top pixels
    342        rshrn           v2.8b,   v2.8h,   #5            // rounded /32
    343        dup             v0.16b,  v2.b[0]
    344        dup             v1.16b,  v2.b[0]
    345 32:
    346        st1             {v0.16b, v1.16b}, [x0], x1
    347        st1             {v0.16b, v1.16b}, [x6], x1
    348        subs            w4,  w4,  #4
    349        st1             {v0.16b, v1.16b}, [x0], x1
    350        st1             {v0.16b, v1.16b}, [x6], x1
    351        b.gt            32b
    352        ret
    353 640:
    354        AARCH64_VALID_JUMP_TARGET
    355        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
    356        uaddlv          h0,      v0.16b
    357        uaddlv          h1,      v1.16b
    358        uaddlv          h2,      v2.16b
    359        uaddlv          h3,      v3.16b
    360        add             v4.4h,   v0.4h,   v1.4h
    361        add             v5.4h,   v2.4h,   v3.4h
    362        add             v4.4h,   v4.4h,   v5.4h         // sum of 64 top pixels
    363        rshrn           v4.8b,   v4.8h,   #6            // rounded /64
    364        dup             v0.16b,  v4.b[0]
    365        dup             v1.16b,  v4.b[0]
    366        dup             v2.16b,  v4.b[0]
    367        dup             v3.16b,  v4.b[0]
    368 64:
    369        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
    370        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
    371        subs            w4,  w4,  #4
    372        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
    373        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
    374        b.gt            64b
    375        ret
    376 endfunc
    377 
        // Width-dispatch table for ipred_dc_top, largest width first
        // (indexed by clz(width)-25).
    378 jumptable ipred_dc_top_tbl
    379        .word 640b - ipred_dc_top_tbl
    380        .word 320b - ipred_dc_top_tbl
    381        .word 160b - ipred_dc_top_tbl
    382        .word 80b  - ipred_dc_top_tbl
    383        .word 40b  - ipred_dc_top_tbl
    384 endjumptable
    385 
    386 // void ipred_dc_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
    387 //                              const pixel *const topleft,
    388 //                              const int width, const int height, const int a,
    389 //                              const int max_width, const int max_height);
    390 function ipred_dc_left_8bpc_neon, export=1
        // DC_LEFT prediction: fill the block with the rounded average of the
        // H left-neighbour pixels. Two-stage dispatch: br x5 jumps to the
        // height-specific summing code, which then br x3 jumps to the
        // width-specific store loop (table holds h* entries before w* entries).
        // In: x0=dst, x1=stride, x2=topleft, w3=width, w4=height.
    391        sub             x2,  x2,  w4, uxtw      // start of the H left pixels below topleft
    392        clz             w3,  w3
    393        clz             w7,  w4
    394        movrel          x5,  ipred_dc_left_tbl
    395        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
    396        sub             w7,  w7,  #25           // height index (h* half of table)
    397        ldrsw           x3,  [x5, w3, uxtw #2]
    398        ldrsw           x7,  [x5, w7, uxtw #2]
    399        add             x3,  x5,  x3            // width store-loop target
    400        add             x5,  x5,  x7            // height sum target
    401        add             x6,  x0,  x1            // 4 rows per loop iteration
    402        lsl             x1,  x1,  #1
    403        br              x5
    404 
    405 L(ipred_dc_left_h4):
    406        AARCH64_VALID_JUMP_TARGET
        // h=4: load the 4 pixels duplicated (2*sum), rounded shift by 3 = /(2*4).
    407        ld1r            {v0.2s},  [x2]
    408        uaddlv          h0,      v0.8b
    409        rshrn           v0.8b,   v0.8h,   #3
    410        dup             v0.16b,  v0.b[0]        // broadcast DC value
    411        br              x3
    412 L(ipred_dc_left_w4):
    413        AARCH64_VALID_JUMP_TARGET
    414 1:
    415        st1             {v0.s}[0],  [x0], x1
    416        st1             {v0.s}[0],  [x6], x1
    417        subs            w4,  w4,  #4
    418        st1             {v0.s}[0],  [x0], x1
    419        st1             {v0.s}[0],  [x6], x1
    420        b.gt            1b
    421        ret
    422 
    423 L(ipred_dc_left_h8):
    424        AARCH64_VALID_JUMP_TARGET
    425        ld1             {v0.8b},  [x2]
    426        uaddlv          h0,      v0.8b          // sum of 8 left pixels
    427        rshrn           v0.8b,   v0.8h,   #3    // rounded /8
    428        dup             v0.16b,  v0.b[0]
    429        br              x3
    430 L(ipred_dc_left_w8):
    431        AARCH64_VALID_JUMP_TARGET
    432 1:
    433        st1             {v0.8b},  [x0], x1
    434        st1             {v0.8b},  [x6], x1
    435        subs            w4,  w4,  #4
    436        st1             {v0.8b},  [x0], x1
    437        st1             {v0.8b},  [x6], x1
    438        b.gt            1b
    439        ret
    440 
    441 L(ipred_dc_left_h16):
    442        AARCH64_VALID_JUMP_TARGET
    443        ld1             {v0.16b}, [x2]
    444        uaddlv          h0,      v0.16b         // sum of 16 left pixels
    445        rshrn           v0.8b,   v0.8h,   #4    // rounded /16
    446        dup             v0.16b,  v0.b[0]
    447        br              x3
    448 L(ipred_dc_left_w16):
    449        AARCH64_VALID_JUMP_TARGET
    450 1:
    451        st1             {v0.16b}, [x0], x1
    452        st1             {v0.16b}, [x6], x1
    453        subs            w4,  w4,  #4
    454        st1             {v0.16b}, [x0], x1
    455        st1             {v0.16b}, [x6], x1
    456        b.gt            1b
    457        ret
    458 
    459 L(ipred_dc_left_h32):
    460        AARCH64_VALID_JUMP_TARGET
    461        ld1             {v0.16b, v1.16b}, [x2]
    462        uaddlv          h0,      v0.16b
    463        uaddlv          h1,      v1.16b
    464        add             v0.4h,   v0.4h,   v1.4h // sum of 32 left pixels
    465        rshrn           v0.8b,   v0.8h,   #5    // rounded /32
    466        dup             v0.16b,  v0.b[0]
    467        br              x3
    468 L(ipred_dc_left_w32):
    469        AARCH64_VALID_JUMP_TARGET
    470        mov             v1.16b,  v0.16b         // second 16-byte half
    471 1:
    472        st1             {v0.16b, v1.16b}, [x0], x1
    473        st1             {v0.16b, v1.16b}, [x6], x1
    474        subs            w4,  w4,  #4
    475        st1             {v0.16b, v1.16b}, [x0], x1
    476        st1             {v0.16b, v1.16b}, [x6], x1
    477        b.gt            1b
    478        ret
    479 
    480 L(ipred_dc_left_h64):
    481        AARCH64_VALID_JUMP_TARGET
    482        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2]
    483        uaddlv          h0,      v0.16b
    484        uaddlv          h1,      v1.16b
    485        uaddlv          h2,      v2.16b
    486        uaddlv          h3,      v3.16b
    487        add             v0.4h,   v0.4h,   v1.4h
    488        add             v2.4h,   v2.4h,   v3.4h
    489        add             v0.4h,   v0.4h,   v2.4h // sum of 64 left pixels
    490        rshrn           v0.8b,   v0.8h,   #6    // rounded /64
    491        dup             v0.16b,  v0.b[0]
    492        br              x3
    493 L(ipred_dc_left_w64):
    494        AARCH64_VALID_JUMP_TARGET
        // Replicate the DC value across four q registers for 64-wide stores.
    495        mov             v1.16b,  v0.16b
    496        mov             v2.16b,  v0.16b
    497        mov             v3.16b,  v0.16b
    498 1:
    499        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
    500        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
    501        subs            w4,  w4,  #4
    502        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
    503        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
    504        b.gt            1b
    505        ret
    506 endfunc
    507 
        // Dual-purpose table: words 0..4 are height (sum) entries indexed by
        // clz(height)-25; words 5..9 are width (store-loop) entries indexed
        // by clz(width)-20. Largest size first in each half.
    508 jumptable ipred_dc_left_tbl
    509        .word L(ipred_dc_left_h64) - ipred_dc_left_tbl
    510        .word L(ipred_dc_left_h32) - ipred_dc_left_tbl
    511        .word L(ipred_dc_left_h16) - ipred_dc_left_tbl
    512        .word L(ipred_dc_left_h8)  - ipred_dc_left_tbl
    513        .word L(ipred_dc_left_h4)  - ipred_dc_left_tbl
    514        .word L(ipred_dc_left_w64) - ipred_dc_left_tbl
    515        .word L(ipred_dc_left_w32) - ipred_dc_left_tbl
    516        .word L(ipred_dc_left_w16) - ipred_dc_left_tbl
    517        .word L(ipred_dc_left_w8)  - ipred_dc_left_tbl
    518        .word L(ipred_dc_left_w4)  - ipred_dc_left_tbl
    519 endjumptable
    520 
    521 // void ipred_dc_8bpc_neon(pixel *dst, const ptrdiff_t stride,
    522 //                         const pixel *const topleft,
    523 //                         const int width, const int height, const int a,
    524 //                         const int max_width, const int max_height);
    525 function ipred_dc_8bpc_neon, export=1
        // Full DC prediction:
        //   dc = (sum(top) + sum(left) + (w+h)/2) / (w+h)
        // Since w and h are powers of two, w+h = 2^n, 3*2^n or 5*2^n: the
        // 2^n part is removed with a ushl by -ctz(w+h) (v17); the leftover
        // /3 or /5 (only when w != h) is done with a sqdmulh fixed-point
        // multiply, sqdmulh(x, c/2) == (x*c) >> 16, using c = 0x5556 (~1/3)
        // or c = 0x3334 (~1/5).
        // Dispatch is two-stage like dc_left: br x5 -> height-specific
        // left-edge sum, then br x3 -> width-specific top sum + store loop.
        // In: x0=dst, x1=stride, x2=topleft, w3=width, w4=height.
    526        sub             x2,  x2,  w4, uxtw       // start of the H left pixels
    527        add             w7,  w3,  w4             // width + height
    528        clz             w3,  w3
    529        clz             w6,  w4
    530        dup             v16.8h, w7               // width + height
    531        movrel          x5,  ipred_dc_tbl
    532        rbit            w7,  w7                  // rbit(width + height)
    533        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
    534        sub             w6,  w6,  #25
    535        clz             w7,  w7                  // ctz(width + height)
    536        ldrsw           x3,  [x5, w3, uxtw #2]
    537        ldrsw           x6,  [x5, w6, uxtw #2]
    538        neg             w7,  w7                  // -ctz(width + height)
    539        add             x3,  x5,  x3             // width entry target
    540        add             x5,  x5,  x6             // height entry target
    541        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
    542        dup             v17.8h,  w7              // -ctz(width + height)
    543        add             x6,  x0,  x1             // 4 rows per store iteration
    544        lsl             x1,  x1,  #1
    545        br              x5
    546 
    547 L(ipred_dc_h4):
    548        AARCH64_VALID_JUMP_TARGET
    549        ld1             {v0.s}[0],  [x2], #4
    550        ins             v0.s[1], wzr             // clear lane so uaddlv sums only 4 bytes
    551        uaddlv          h0,      v0.8b           // v0.h[0] = sum(left)
    552        add             x2,  x2,  #1             // advance past topleft to the top row
    553        br              x3
    554 L(ipred_dc_w4):
    555        AARCH64_VALID_JUMP_TARGET
    556        ld1             {v1.s}[0],  [x2]
    557        ins             v1.s[1], wzr             // sum only the 4 top pixels
    558        add             v0.4h,   v0.4h,   v16.4h // + rounding term (w+h)/2
    559        uaddlv          h1,      v1.8b           // sum(top)
    560        cmp             w4,  #4
    561        add             v0.4h,   v0.4h,   v1.4h
    562        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(w+h)
    563        b.eq            1f
    564        // h = 8/16
        // w+h = 12 -> /3 (h=8) or 20 -> /5 (h=16); pack both constants in
        // one register and shift the right one down (shift 2*h is 16 or
        // 32, the latter wrapping to 0 on a w register).
    565        mov             w16, #(0x3334/2)
    566        movk            w16, #(0x5556/2), lsl #16
    567        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
    568        lsr             w16, w16, w17
    569        dup             v16.4h,  w16
    570        sqdmulh         v0.4h,   v0.4h,   v16.4h
    571 1:
    572        dup             v0.8b,   v0.b[0]         // broadcast DC value
    573 2:
    574        st1             {v0.s}[0],  [x0], x1
    575        st1             {v0.s}[0],  [x6], x1
    576        subs            w4,  w4,  #4
    577        st1             {v0.s}[0],  [x0], x1
    578        st1             {v0.s}[0],  [x6], x1
    579        b.gt            2b
    580        ret
    581 
    582 L(ipred_dc_h8):
    583        AARCH64_VALID_JUMP_TARGET
    584        ld1             {v0.8b},  [x2], #8
    585        uaddlv          h0,      v0.8b           // sum(left), 8 pixels
    586        add             x2,  x2,  #1             // skip topleft
    587        br              x3
    588 L(ipred_dc_w8):
    589        AARCH64_VALID_JUMP_TARGET
    590        ld1             {v1.8b},  [x2]
    591        add             v0.4h,   v0.4h,   v16.4h // + (w+h)/2
    592        uaddlv          h1,      v1.8b           // sum(top), 8 pixels
    593        cmp             w4,  #8
    594        add             v0.4h,   v0.4h,   v1.4h
    595        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(w+h)
    596        b.eq            1f
    597        // h = 4/16/32
        // w+h = 40 -> /5 (h=32), else 12 or 24 -> /3.
    598        cmp             w4,  #32
    599        mov             w16, #(0x3334/2)
    600        mov             w17, #(0x5556/2)
    601        csel            w16, w16, w17, eq
    602        dup             v16.4h,  w16
    603        sqdmulh         v0.4h,   v0.4h,   v16.4h
    604 1:
    605        dup             v0.8b,   v0.b[0]
    606 2:
    607        st1             {v0.8b},  [x0], x1
    608        st1             {v0.8b},  [x6], x1
    609        subs            w4,  w4,  #4
    610        st1             {v0.8b},  [x0], x1
    611        st1             {v0.8b},  [x6], x1
    612        b.gt            2b
    613        ret
    614 
    615 L(ipred_dc_h16):
    616        AARCH64_VALID_JUMP_TARGET
    617        ld1             {v0.16b}, [x2], #16
    618        uaddlv          h0,      v0.16b          // sum(left), 16 pixels
    619        add             x2,  x2,  #1
    620        br              x3
    621 L(ipred_dc_w16):
    622        AARCH64_VALID_JUMP_TARGET
    623        ld1             {v1.16b}, [x2]
    624        add             v0.4h,   v0.4h,   v16.4h // + (w+h)/2
    625        uaddlv          h1,      v1.16b          // sum(top), 16 pixels
    626        cmp             w4,  #16
    627        add             v0.4h,   v0.4h,   v1.4h
    628        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(w+h)
    629        b.eq            1f
    630        // h = 4/8/32/64
        // h=32/64 -> w+h = 48/80 -> /3 and /5; h=4/8 -> 20/24 -> /5 and /3:
        // the tst picks /5 for h=4,64 and /3 for h=8,32.
    631        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
    632        mov             w16, #(0x3334/2)
    633        mov             w17, #(0x5556/2)
    634        csel            w16, w16, w17, eq
    635        dup             v16.4h,  w16
    636        sqdmulh         v0.4h,   v0.4h,   v16.4h
    637 1:
    638        dup             v0.16b,  v0.b[0]
    639 2:
    640        st1             {v0.16b}, [x0], x1
    641        st1             {v0.16b}, [x6], x1
    642        subs            w4,  w4,  #4
    643        st1             {v0.16b}, [x0], x1
    644        st1             {v0.16b}, [x6], x1
    645        b.gt            2b
    646        ret
    647 
    648 L(ipred_dc_h32):
    649        AARCH64_VALID_JUMP_TARGET
    650        ld1             {v0.16b, v1.16b}, [x2], #32
    651        uaddlv          h0,      v0.16b
    652        uaddlv          h1,      v1.16b
    653        add             x2,  x2,  #1
    654        add             v0.4h,   v0.4h,   v1.4h  // sum(left), 32 pixels
    655        br              x3
    656 L(ipred_dc_w32):
    657        AARCH64_VALID_JUMP_TARGET
    658        ld1             {v1.16b, v2.16b}, [x2]
    659        add             v0.4h,   v0.4h,   v16.4h // + (w+h)/2
    660        uaddlv          h1,      v1.16b
    661        uaddlv          h2,      v2.16b
    662        cmp             w4,  #32
    663        add             v0.4h,   v0.4h,   v1.4h
    664        add             v0.4h,   v0.4h,   v2.4h  // + sum(top), 32 pixels
    665        ushl            v4.4h,   v0.4h,   v17.4h // >> ctz(w+h)
    666        b.eq            1f
    667        // h = 8/16/64
        // h=8 -> w+h = 40 -> /5; h=16/64 -> 48/96 -> /3.
    668        cmp             w4,  #8
    669        mov             w16, #(0x3334/2)
    670        mov             w17, #(0x5556/2)
    671        csel            w16, w16, w17, eq
    672        dup             v16.4h,  w16
    673        sqdmulh         v4.4h,   v4.4h,   v16.4h
    674 1:
    675        dup             v0.16b,  v4.b[0]
    676        dup             v1.16b,  v4.b[0]
    677 2:
    678        st1             {v0.16b, v1.16b}, [x0], x1
    679        st1             {v0.16b, v1.16b}, [x6], x1
    680        subs            w4,  w4,  #4
    681        st1             {v0.16b, v1.16b}, [x0], x1
    682        st1             {v0.16b, v1.16b}, [x6], x1
    683        b.gt            2b
    684        ret
    685 
    686 L(ipred_dc_h64):
    687        AARCH64_VALID_JUMP_TARGET
    688        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64
    689        uaddlv          h0,      v0.16b
    690        uaddlv          h1,      v1.16b
    691        uaddlv          h2,      v2.16b
    692        uaddlv          h3,      v3.16b
    693        add             v0.4h,   v0.4h,   v1.4h
    694        add             v2.4h,   v2.4h,   v3.4h
    695        add             x2,  x2,  #1
    696        add             v0.4h,   v0.4h,   v2.4h  // sum(left), 64 pixels
    697        br              x3
    698 L(ipred_dc_w64):
    699        AARCH64_VALID_JUMP_TARGET
    700        ld1             {v1.16b, v2.16b, v3.16b, v4.16b}, [x2]
    701        add             v0.4h,   v0.4h,   v16.4h // + (w+h)/2
    702        uaddlv          h1,      v1.16b
    703        uaddlv          h2,      v2.16b
    704        uaddlv          h3,      v3.16b
    705        uaddlv          h4,      v4.16b
    706        add             v1.4h,   v1.4h,   v2.4h
    707        add             v3.4h,   v3.4h,   v4.4h
    708        cmp             w4,  #64
    709        add             v0.4h,   v0.4h,   v1.4h
    710        add             v0.4h,   v0.4h,   v3.4h  // + sum(top), 64 pixels
    711        ushl            v4.4h,   v0.4h,   v17.4h // >> ctz(w+h)
    712        b.eq            1f
    713        // h = 16/32
        // w+h = 80 -> /5 (h=16, shift 16) or 96 -> /3 (h=32, shift wraps to 0).
    714        mov             w16, #(0x5556/2)
    715        movk            w16, #(0x3334/2), lsl #16
    716        lsr             w16, w16, w4
    717        dup             v16.4h,  w16
    718        sqdmulh         v4.4h,   v4.4h,   v16.4h
    719 1:
        // Broadcast the DC value across four q registers for 64-wide stores.
    720        dup             v0.16b,  v4.b[0]
    721        dup             v1.16b,  v4.b[0]
    722        dup             v2.16b,  v4.b[0]
    723        dup             v3.16b,  v4.b[0]
    724 2:
    725        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
    726        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
    727        subs            w4,  w4,  #4
    728        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
    729        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x6], x1
    730        b.gt            2b
    731        ret
    732 endfunc
    733 
    734 jumptable ipred_dc_tbl
       // Relative offsets into ipred_dc_8bpc_neon: height handlers first,
       // then width handlers, each group ordered 64,32,16,8,4 (largest
       // first). NOTE(review): presumably indexed via clz(size) - 25 like
       // the other tables in this file — the dc setup code is above this
       // excerpt; confirm against the full source.
    735        .word L(ipred_dc_h64) - ipred_dc_tbl
    736        .word L(ipred_dc_h32) - ipred_dc_tbl
    737        .word L(ipred_dc_h16) - ipred_dc_tbl
    738        .word L(ipred_dc_h8)  - ipred_dc_tbl
    739        .word L(ipred_dc_h4)  - ipred_dc_tbl
    740        .word L(ipred_dc_w64) - ipred_dc_tbl
    741        .word L(ipred_dc_w32) - ipred_dc_tbl
    742        .word L(ipred_dc_w16) - ipred_dc_tbl
    743        .word L(ipred_dc_w8)  - ipred_dc_tbl
    744        .word L(ipred_dc_w4)  - ipred_dc_tbl
    745 endjumptable
    746 
    747 // void ipred_paeth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
    748 //                            const pixel *const topleft,
    749 //                            const int width, const int height, const int a,
    750 //                            const int max_width, const int max_height);
    751 function ipred_paeth_8bpc_neon, export=1
       // Paeth intra prediction, 8bpc.
       // x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
       // Per pixel: base = left + top - topleft; the predictor picks
       // whichever of left/top/topleft is closest to base. Dispatches on
       // width through ipred_paeth_tbl; left pixels are read downwards
       // from below the topleft pointer, the top row from topleft + 1.
    752        clz             w9,  w3
    753        movrel          x5,  ipred_paeth_tbl
       // Table is ordered 64..4, so index = clz(width) - 25 (0 for w=64).
    754        sub             w9,  w9,  #25
    755        ldrsw           x9,  [x5, w9, uxtw #2]
    756        ld1r            {v4.16b},  [x2]           // v4 = topleft, replicated
    757        add             x8,  x2,  #1              // x8 = top row pointer
    758        sub             x2,  x2,  #4
    759        add             x5,  x5,  x9
    760        mov             x7,  #-4                  // x2/x7: step back 4 left pixels per iteration
    761        add             x6,  x0,  x1              // x6 = second output row
    762        lsl             x1,  x1,  #1              // x1 = 2*stride (two rows per pointer)
    763        br              x5
    764 40:
    765        AARCH64_VALID_JUMP_TARGET
    766        ld1r            {v5.4s},  [x8]            // v5 = top (4 px, replicated)
    767        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
    768 4:
       // Load 4 left pixels (one per row), each replicated across a vector.
    769        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7
    770        zip1            v0.2s,   v0.2s,   v1.2s
    771        zip1            v2.2s,   v2.2s,   v3.2s
    772        uaddw           v16.8h,  v6.8h,   v0.8b   // left + top - topleft
    773        uaddw           v17.8h,  v6.8h,   v2.8b
    774        sqxtun          v16.8b,  v16.8h           // base
    775        sqxtun2         v16.16b, v17.8h
    776        zip1            v0.2d,   v0.2d,   v2.2d
    777        uabd            v20.16b, v5.16b,  v16.16b // tdiff
    778        uabd            v22.16b, v4.16b,  v16.16b // tldiff
    779        uabd            v16.16b, v0.16b,  v16.16b // ldiff
    780        umin            v18.16b, v20.16b, v22.16b // min(tdiff, tldiff)
    781        cmhs            v20.16b, v22.16b, v20.16b // tldiff >= tdiff
    782        cmhs            v16.16b, v18.16b, v16.16b // min(tdiff, tldiff) >= ldiff
    783        bsl             v20.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
    784        bit             v20.16b, v0.16b,  v16.16b // ldiff <= min ? left : ...
       // Lanes hold the rows in reverse order (left pixels were loaded
       // from descending addresses), so store the highest lane first.
    785        st1             {v20.s}[3], [x0], x1
    786        st1             {v20.s}[2], [x6], x1
    787        subs            w4,  w4,  #4
    788        st1             {v20.s}[1], [x0], x1
    789        st1             {v20.s}[0], [x6], x1
    790        b.gt            4b
    791        ret
    792 80:
    793        AARCH64_VALID_JUMP_TARGET
    794        ld1r            {v5.2d},  [x8]            // v5 = top (8 px, replicated)
    795        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
    796 8:
    797        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left (4 rows)
    798        uaddw           v16.8h,  v6.8h,   v0.8b   // left + top - topleft
    799        uaddw           v17.8h,  v6.8h,   v1.8b
    800        uaddw           v18.8h,  v6.8h,   v2.8b
    801        uaddw           v19.8h,  v6.8h,   v3.8b
    802        sqxtun          v16.8b,  v16.8h           // base
    803        sqxtun2         v16.16b, v17.8h
    804        sqxtun          v18.8b,  v18.8h
    805        sqxtun2         v18.16b, v19.8h
    806        zip1            v2.2d,   v2.2d,   v3.2d
    807        zip1            v0.2d,   v0.2d,   v1.2d
    808        uabd            v21.16b, v5.16b,  v18.16b // tdiff
    809        uabd            v20.16b, v5.16b,  v16.16b
    810        uabd            v23.16b, v4.16b,  v18.16b // tldiff
    811        uabd            v22.16b, v4.16b,  v16.16b
    812        uabd            v17.16b, v2.16b,  v18.16b // ldiff
    813        uabd            v16.16b, v0.16b,  v16.16b
    814        umin            v19.16b, v21.16b, v23.16b // min(tdiff, tldiff)
    815        umin            v18.16b, v20.16b, v22.16b
    816        cmhs            v21.16b, v23.16b, v21.16b // tldiff >= tdiff
    817        cmhs            v20.16b, v22.16b, v20.16b
    818        cmhs            v17.16b, v19.16b, v17.16b // min(tdiff, tldiff) >= ldiff
    819        cmhs            v16.16b, v18.16b, v16.16b
    820        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
    821        bsl             v20.16b, v5.16b,  v4.16b
    822        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
    823        bit             v20.16b, v0.16b,  v16.16b
       // Store in reverse lane order (rows were loaded bottom-up).
    824        st1             {v21.d}[1], [x0], x1
    825        st1             {v21.d}[0], [x6], x1
    826        subs            w4,  w4,  #4
    827        st1             {v20.d}[1], [x0], x1
    828        st1             {v20.d}[0], [x6], x1
    829        b.gt            8b
    830        ret
    831 160:
    832 320:
    833 640:
    834        AARCH64_VALID_JUMP_TARGET
    835        ld1             {v5.16b},  [x8], #16      // first 16 px of top row
    836        mov             w9,  w3                   // w9 = backup of width
    837        // Set up pointers for four rows in parallel; x0, x6, x5, x10
    838        add             x5,  x0,  x1
    839        add             x10, x6,  x1
    840        lsl             x1,  x1,  #1
    841        sub             x1,  x1,  w3, uxtw        // x1 = 4*stride - width (advance 4 rows)
    842 1:
       // Outer loop: one iteration per group of 4 rows; v0-v3 = left pixels.
    843        ld4r            {v0.16b, v1.16b, v2.16b, v3.16b},  [x2], x7
    844 2:
       // Inner loop: 16 columns per iteration.
    845        usubl           v6.8h,   v5.8b,   v4.8b   // top - topleft
    846        usubl2          v7.8h,   v5.16b,  v4.16b
    847        uaddw           v24.8h,  v6.8h,   v0.8b   // left + top - topleft
    848        uaddw           v25.8h,  v7.8h,   v0.8b
    849        uaddw           v26.8h,  v6.8h,   v1.8b
    850        uaddw           v27.8h,  v7.8h,   v1.8b
    851        uaddw           v28.8h,  v6.8h,   v2.8b
    852        uaddw           v29.8h,  v7.8h,   v2.8b
    853        uaddw           v30.8h,  v6.8h,   v3.8b
    854        uaddw           v31.8h,  v7.8h,   v3.8b
    855        sqxtun          v17.8b,  v26.8h           // base
    856        sqxtun2         v17.16b, v27.8h
    857        sqxtun          v16.8b,  v24.8h
    858        sqxtun2         v16.16b, v25.8h
    859        sqxtun          v19.8b,  v30.8h
    860        sqxtun2         v19.16b, v31.8h
    861        sqxtun          v18.8b,  v28.8h
    862        sqxtun2         v18.16b, v29.8h
    863        uabd            v23.16b, v5.16b,  v19.16b // tdiff
    864        uabd            v22.16b, v5.16b,  v18.16b
    865        uabd            v21.16b, v5.16b,  v17.16b
    866        uabd            v20.16b, v5.16b,  v16.16b
    867        uabd            v27.16b, v4.16b,  v19.16b // tldiff
    868        uabd            v26.16b, v4.16b,  v18.16b
    869        uabd            v25.16b, v4.16b,  v17.16b
    870        uabd            v24.16b, v4.16b,  v16.16b
    871        uabd            v19.16b, v3.16b,  v19.16b // ldiff
    872        uabd            v18.16b, v2.16b,  v18.16b
    873        uabd            v17.16b, v1.16b,  v17.16b
    874        uabd            v16.16b, v0.16b,  v16.16b
    875        umin            v31.16b, v23.16b, v27.16b // min(tdiff, tldiff)
    876        umin            v30.16b, v22.16b, v26.16b
    877        umin            v29.16b, v21.16b, v25.16b
    878        umin            v28.16b, v20.16b, v24.16b
    879        cmhs            v23.16b, v27.16b, v23.16b // tldiff >= tdiff
    880        cmhs            v22.16b, v26.16b, v22.16b
    881        cmhs            v21.16b, v25.16b, v21.16b
    882        cmhs            v20.16b, v24.16b, v20.16b
    883        cmhs            v19.16b, v31.16b, v19.16b // min(tdiff, tldiff) >= ldiff
    884        cmhs            v18.16b, v30.16b, v18.16b
    885        cmhs            v17.16b, v29.16b, v17.16b
    886        cmhs            v16.16b, v28.16b, v16.16b
    887        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
    888        bsl             v22.16b, v5.16b,  v4.16b
    889        bsl             v21.16b, v5.16b,  v4.16b
    890        bsl             v20.16b, v5.16b,  v4.16b
    891        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
    892        bit             v22.16b, v2.16b,  v18.16b
    893        bit             v21.16b, v1.16b,  v17.16b
    894        bit             v20.16b, v0.16b,  v16.16b
    895        subs            w3,  w3,  #16
    896        st1             {v23.16b}, [x0],  #16
    897        st1             {v22.16b}, [x6],  #16
    898        st1             {v21.16b}, [x5],  #16
    899        st1             {v20.16b}, [x10], #16
    900        b.le            8f
    901        ld1             {v5.16b},  [x8], #16
    902        b               2b
    903 8:
    904        subs            w4,  w4,  #4
    905        b.le            9f
    906        // End of horizontal loop, move pointers to next four rows
    907        sub             x8,  x8,  w9, uxtw        // rewind top pointer to row start
    908        add             x0,  x0,  x1
    909        add             x6,  x6,  x1
    910        // Load the top row as early as possible
    911        ld1             {v5.16b},  [x8], #16
    912        add             x5,  x5,  x1
    913        add             x10, x10, x1
    914        mov             w3,  w9                   // restore width for next row group
    915        b               1b
    916 9:
    917        ret
    918 endfunc
    919 
    920 jumptable ipred_paeth_tbl
       // Width handlers, ordered 64,32,16,8,4; the function indexes this
       // with clz(width) - 25 (see setup in ipred_paeth_8bpc_neon).
    921        .word 640b - ipred_paeth_tbl
    922        .word 320b - ipred_paeth_tbl
    923        .word 160b - ipred_paeth_tbl
    924        .word 80b  - ipred_paeth_tbl
    925        .word 40b  - ipred_paeth_tbl
    926 endjumptable
    927 
    928 // void ipred_smooth_8bpc_neon(pixel *dst, const ptrdiff_t stride,
    929 //                             const pixel *const topleft,
    930 //                             const int width, const int height, const int a,
    931 //                             const int max_width, const int max_height);
    932 function ipred_smooth_8bpc_neon, export=1
       // Smooth intra prediction, 8bpc: averages a horizontal blend
       // (left..right, weighted by sm_weights[width]) and a vertical blend
       // (top..bottom, weighted by sm_weights[height]).
       // x0 = dst, x1 = stride, x2 = topleft, w3 = width, w4 = height.
    933        movrel          x10, X(sm_weights)
    934        add             x11, x10, w4, uxtw        // x11 = &sm_weights[height] (vertical weights)
    935        add             x10, x10, w3, uxtw        // x10 = &sm_weights[width]  (horizontal weights)
    936        clz             w9,  w3
    937        movrel          x5,  ipred_smooth_tbl
    938        sub             x12, x2,  w4, uxtw        // x12 = &topleft[-height] (bottom pixel)
    939        sub             w9,  w9,  #25             // table index = clz(width) - 25
    940        ldrsw           x9,  [x5, w9, uxtw #2]
    941        ld1r            {v4.16b},  [x12] // bottom
    942        add             x8,  x2,  #1              // x8 = top row pointer
    943        add             x5,  x5,  x9
    944        add             x6,  x0,  x1              // x6 = second output row
    945        lsl             x1,  x1,  #1              // x1 = 2*stride
    946        br              x5
    947 40:
    948        AARCH64_VALID_JUMP_TARGET
    949        ld1r            {v6.2s}, [x8]             // top
    950        ld1r            {v7.2s}, [x10]            // weights_hor
    951        sub             x2,  x2,  #4
    952        mov             x7,  #-4                  // x2/x7: read 4 left pixels per iteration
    953        dup             v5.16b,  v6.b[3]          // right
    954        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
    955        uxtl            v7.8h,   v7.8b            // weights_hor
    956 4:
    957        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
    958        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
    959        shll            v20.8h,  v5.8b,   #8      // right*256
    960        shll            v21.8h,  v5.8b,   #8
    961        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
    962        zip1            v0.2s,   v3.2s,   v2.2s
    963        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
    964        zip1            v18.2s,  v18.2s,  v19.2s
    965        shll            v22.8h,  v4.8b,   #8      // bottom*256
    966        shll            v23.8h,  v4.8b,   #8
    967        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
    968        usubl           v1.8h,   v1.8b,   v5.8b
    969        uxtl            v16.8h,  v16.8b           // weights_ver
    970        uxtl            v18.8h,  v18.8b
    971        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
    972        mla             v21.8h,  v1.8h,   v7.8h
    973        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
    974        mla             v23.8h,  v6.8h,   v18.8h
    975        uhadd           v20.8h,  v20.8h,  v22.8h  // average the two blends
    976        uhadd           v21.8h,  v21.8h,  v23.8h
    977        rshrn           v20.8b,  v20.8h,  #8      // round and scale back to 8 bit
    978        rshrn           v21.8b,  v21.8h,  #8
    979        st1             {v20.s}[0], [x0], x1
    980        st1             {v20.s}[1], [x6], x1
    981        subs            w4,  w4,  #4
    982        st1             {v21.s}[0], [x0], x1
    983        st1             {v21.s}[1], [x6], x1
    984        b.gt            4b
    985        ret
    986 80:
    987        AARCH64_VALID_JUMP_TARGET
    988        ld1             {v6.8b}, [x8]             // top
    989        ld1             {v7.8b}, [x10]            // weights_hor
    990        sub             x2,  x2,  #4
    991        mov             x7,  #-4                  // x2/x7: read 4 left pixels per iteration
    992        dup             v5.16b,  v6.b[7]          // right
    993        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
    994        uxtl            v7.8h,   v7.8b            // weights_hor
    995 8:
    996        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
    997        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
    998        shll            v20.8h,  v5.8b,   #8      // right*256
    999        shll            v21.8h,  v5.8b,   #8
   1000        shll            v22.8h,  v5.8b,   #8
   1001        shll            v23.8h,  v5.8b,   #8
   1002        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
   1003        usubl           v1.8h,   v1.8b,   v5.8b
   1004        usubl           v2.8h,   v2.8b,   v5.8b
   1005        usubl           v3.8h,   v3.8b,   v5.8b
   1006        shll            v24.8h,  v4.8b,   #8      // bottom*256
   1007        shll            v25.8h,  v4.8b,   #8
   1008        shll            v26.8h,  v4.8b,   #8
   1009        shll            v27.8h,  v4.8b,   #8
   1010        uxtl            v16.8h,  v16.8b           // weights_ver
   1011        uxtl            v17.8h,  v17.8b
   1012        uxtl            v18.8h,  v18.8b
   1013        uxtl            v19.8h,  v19.8b
   1014        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
   1015        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
   1016        mla             v22.8h,  v1.8h,   v7.8h
   1017        mla             v23.8h,  v0.8h,   v7.8h
   1018        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
   1019        mla             v25.8h,  v6.8h,   v17.8h
   1020        mla             v26.8h,  v6.8h,   v18.8h
   1021        mla             v27.8h,  v6.8h,   v19.8h
   1022        uhadd           v20.8h,  v20.8h,  v24.8h  // average the two blends
   1023        uhadd           v21.8h,  v21.8h,  v25.8h
   1024        uhadd           v22.8h,  v22.8h,  v26.8h
   1025        uhadd           v23.8h,  v23.8h,  v27.8h
   1026        rshrn           v20.8b,  v20.8h,  #8      // round and scale back to 8 bit
   1027        rshrn           v21.8b,  v21.8h,  #8
   1028        rshrn           v22.8b,  v22.8h,  #8
   1029        rshrn           v23.8b,  v23.8h,  #8
   1030        st1             {v20.8b}, [x0], x1
   1031        st1             {v21.8b}, [x6], x1
   1032        subs            w4,  w4,  #4
   1033        st1             {v22.8b}, [x0], x1
   1034        st1             {v23.8b}, [x6], x1
   1035        b.gt            8b
   1036        ret
   1037 160:
   1038 320:
   1039 640:
   1040        AARCH64_VALID_JUMP_TARGET
   1041        add             x12, x2,  w3, uxtw        // x12 = &topleft[width] (right pixel)
   1042        sub             x2,  x2,  #2
   1043        mov             x7,  #-2                  // left pixels: 2 rows per outer iteration
   1044        ld1r            {v5.16b}, [x12]           // right
   1045        sub             x1,  x1,  w3, uxtw        // x1 = 2*stride - width (advance 2 rows)
   1046        mov             w9,  w3                   // w9 = backup of width
   1047 
   1048 1:
       // Outer loop: two rows at a time; per-row terms computed once.
   1049        ld2r            {v0.8b, v1.8b},   [x2],  x7 // left
   1050        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
   1051        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
   1052        usubl           v1.8h,   v1.8b,   v5.8b
   1053        uxtl            v16.8h,  v16.8b           // weights_ver
   1054        uxtl            v17.8h,  v17.8b
   1055 2:
       // Inner loop: 16 columns per iteration.
   1056        ld1             {v7.16b}, [x10],  #16     // weights_hor
   1057        ld1             {v3.16b}, [x8],   #16     // top
   1058        shll            v20.8h,  v5.8b,   #8      // right*256
   1059        shll            v21.8h,  v5.8b,   #8
   1060        shll            v22.8h,  v5.8b,   #8
   1061        shll            v23.8h,  v5.8b,   #8
   1062        uxtl            v6.8h,   v7.8b            // weights_hor
   1063        uxtl2           v7.8h,   v7.16b
   1064        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
   1065        usubl2          v3.8h,   v3.16b,  v4.16b
   1066        mla             v20.8h,  v1.8h,   v6.8h   // right*256  + (left-right)*weights_hor
   1067        mla             v21.8h,  v1.8h,   v7.8h   // (left flipped)
   1068        mla             v22.8h,  v0.8h,   v6.8h
   1069        mla             v23.8h,  v0.8h,   v7.8h
   1070        shll            v24.8h,  v4.8b,   #8      // bottom*256
   1071        shll            v25.8h,  v4.8b,   #8
   1072        shll            v26.8h,  v4.8b,   #8
   1073        shll            v27.8h,  v4.8b,   #8
   1074        mla             v24.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
   1075        mla             v25.8h,  v3.8h,   v16.8h
   1076        mla             v26.8h,  v2.8h,   v17.8h
   1077        mla             v27.8h,  v3.8h,   v17.8h
   1078        uhadd           v20.8h,  v20.8h,  v24.8h  // average the two blends
   1079        uhadd           v21.8h,  v21.8h,  v25.8h
   1080        uhadd           v22.8h,  v22.8h,  v26.8h
   1081        uhadd           v23.8h,  v23.8h,  v27.8h
   1082        rshrn           v20.8b,  v20.8h,  #8      // round and scale back to 8 bit
   1083        rshrn2          v20.16b, v21.8h,  #8
   1084        rshrn           v22.8b,  v22.8h,  #8
   1085        rshrn2          v22.16b, v23.8h,  #8
   1086        subs            w3,  w3,  #16
   1087        st1             {v20.16b}, [x0],  #16
   1088        st1             {v22.16b}, [x6],  #16
   1089        b.gt            2b
   1090        subs            w4,  w4,  #2
   1091        b.le            9f
   1092        sub             x8,  x8,  w9, uxtw        // rewind top pointer
   1093        sub             x10, x10, w9, uxtw        // rewind weights_hor pointer
   1094        add             x0,  x0,  x1
   1095        add             x6,  x6,  x1
   1096        mov             w3,  w9                   // restore width
   1097        b               1b
   1098 9:
   1099        ret
   1100 endfunc
   1101 
   1102 jumptable ipred_smooth_tbl
       // Width handlers, ordered 64,32,16,8,4; indexed with clz(width) - 25
       // by ipred_smooth_8bpc_neon.
   1103        .word 640b - ipred_smooth_tbl
   1104        .word 320b - ipred_smooth_tbl
   1105        .word 160b - ipred_smooth_tbl
   1106        .word 80b  - ipred_smooth_tbl
   1107        .word 40b  - ipred_smooth_tbl
   1108 endjumptable
   1109 
   1110 // void ipred_smooth_v_8bpc_neon(pixel *dst, const ptrdiff_t stride,
   1111 //                               const pixel *const topleft,
   1112 //                               const int width, const int height, const int a,
   1113 //                               const int max_width, const int max_height);
   1114 function ipred_smooth_v_8bpc_neon, export=1
       // Vertical-only smooth prediction, 8bpc: each output row is a blend
       // of the top row and the bottom pixel, weighted per row by
       // sm_weights[height]. x0=dst, x1=stride, x2=topleft, w3=width,
       // w4=height.
   1115        movrel          x7,  X(sm_weights)
   1116        add             x7,  x7,  w4, uxtw        // x7 = &sm_weights[height] (vertical weights)
   1117        clz             w9,  w3
   1118        movrel          x5,  ipred_smooth_v_tbl
   1119        sub             x8,  x2,  w4, uxtw        // x8 = &topleft[-height] (bottom pixel)
   1120        sub             w9,  w9,  #25             // table index = clz(width) - 25
   1121        ldrsw           x9,  [x5, w9, uxtw #2]
   1122        ld1r            {v4.16b},  [x8] // bottom
   1123        add             x2,  x2,  #1              // x2 = top row pointer
   1124        add             x5,  x5,  x9
   1125        add             x6,  x0,  x1              // x6 = second output row
   1126        lsl             x1,  x1,  #1              // x1 = 2*stride
   1127        br              x5
   1128 40:
   1129        AARCH64_VALID_JUMP_TARGET
   1130        ld1r            {v6.2s}, [x2]             // top
   1131        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
   1132 4:
   1133        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
   1134        shll            v22.8h,  v4.8b,   #8      // bottom*256
   1135        shll            v23.8h,  v4.8b,   #8
   1136        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
   1137        zip1            v18.2s,  v18.2s,  v19.2s
   1138        uxtl            v16.8h,  v16.8b           // weights_ver
   1139        uxtl            v18.8h,  v18.8b
   1140        mla             v22.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
   1141        mla             v23.8h,  v6.8h,   v18.8h
   1142        rshrn           v22.8b,  v22.8h,  #8      // round and scale back to 8 bit
   1143        rshrn           v23.8b,  v23.8h,  #8
   1144        st1             {v22.s}[0], [x0], x1
   1145        st1             {v22.s}[1], [x6], x1
   1146        subs            w4,  w4,  #4
   1147        st1             {v23.s}[0], [x0], x1
   1148        st1             {v23.s}[1], [x6], x1
   1149        b.gt            4b
   1150        ret
   1151 80:
   1152        AARCH64_VALID_JUMP_TARGET
   1153        ld1             {v6.8b}, [x2]             // top
   1154        usubl           v6.8h,   v6.8b,   v4.8b   // top-bottom
   1155 8:
   1156        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
   1157        shll            v24.8h,  v4.8b,   #8      // bottom*256
   1158        shll            v25.8h,  v4.8b,   #8
   1159        shll            v26.8h,  v4.8b,   #8
   1160        shll            v27.8h,  v4.8b,   #8
   1161        uxtl            v16.8h,  v16.8b           // weights_ver
   1162        uxtl            v17.8h,  v17.8b
   1163        uxtl            v18.8h,  v18.8b
   1164        uxtl            v19.8h,  v19.8b
   1165        mla             v24.8h,  v6.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
   1166        mla             v25.8h,  v6.8h,   v17.8h
   1167        mla             v26.8h,  v6.8h,   v18.8h
   1168        mla             v27.8h,  v6.8h,   v19.8h
   1169        rshrn           v24.8b,  v24.8h,  #8      // round and scale back to 8 bit
   1170        rshrn           v25.8b,  v25.8h,  #8
   1171        rshrn           v26.8b,  v26.8h,  #8
   1172        rshrn           v27.8b,  v27.8h,  #8
   1173        st1             {v24.8b}, [x0], x1
   1174        st1             {v25.8b}, [x6], x1
   1175        subs            w4,  w4,  #4
   1176        st1             {v26.8b}, [x0], x1
   1177        st1             {v27.8b}, [x6], x1
   1178        b.gt            8b
   1179        ret
   1180 160:
   1181 320:
   1182 640:
   1183        AARCH64_VALID_JUMP_TARGET
   1184        // Set up pointers for four rows in parallel; x0, x6, x5, x8
   1185        add             x5,  x0,  x1
   1186        add             x8,  x6,  x1
   1187        lsl             x1,  x1,  #1
   1188        sub             x1,  x1,  w3, uxtw        // x1 = 4*stride - width (advance 4 rows)
   1189        mov             w9,  w3                   // w9 = backup of width
   1190 
   1191 1:
       // Outer loop: four rows per iteration, one vertical weight each.
   1192        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
   1193        uxtl            v16.8h,  v16.8b           // weights_ver
   1194        uxtl            v17.8h,  v17.8b
   1195        uxtl            v18.8h,  v18.8b
   1196        uxtl            v19.8h,  v19.8b
   1197 2:
       // Inner loop: 16 columns per iteration.
   1198        ld1             {v3.16b}, [x2],   #16     // top
   1199        shll            v20.8h,  v4.8b,   #8      // bottom*256
   1200        shll            v21.8h,  v4.8b,   #8
   1201        shll            v22.8h,  v4.8b,   #8
   1202        shll            v23.8h,  v4.8b,   #8
   1203        shll            v24.8h,  v4.8b,   #8
   1204        shll            v25.8h,  v4.8b,   #8
   1205        shll            v26.8h,  v4.8b,   #8
   1206        shll            v27.8h,  v4.8b,   #8
   1207        usubl           v2.8h,   v3.8b,   v4.8b   // top-bottom
   1208        usubl2          v3.8h,   v3.16b,  v4.16b
   1209        mla             v20.8h,  v2.8h,   v16.8h  // bottom*256 + (top-bottom)*weights_ver
   1210        mla             v21.8h,  v3.8h,   v16.8h
   1211        mla             v22.8h,  v2.8h,   v17.8h
   1212        mla             v23.8h,  v3.8h,   v17.8h
   1213        mla             v24.8h,  v2.8h,   v18.8h
   1214        mla             v25.8h,  v3.8h,   v18.8h
   1215        mla             v26.8h,  v2.8h,   v19.8h
   1216        mla             v27.8h,  v3.8h,   v19.8h
   1217        rshrn           v20.8b,  v20.8h,  #8      // round and scale back to 8 bit
   1218        rshrn2          v20.16b, v21.8h,  #8
   1219        rshrn           v22.8b,  v22.8h,  #8
   1220        rshrn2          v22.16b, v23.8h,  #8
   1221        rshrn           v24.8b,  v24.8h,  #8
   1222        rshrn2          v24.16b, v25.8h,  #8
   1223        rshrn           v26.8b,  v26.8h,  #8
   1224        rshrn2          v26.16b, v27.8h,  #8
   1225        subs            w3,  w3,  #16
   1226        st1             {v20.16b}, [x0],  #16
   1227        st1             {v22.16b}, [x6],  #16
   1228        st1             {v24.16b}, [x5],  #16
   1229        st1             {v26.16b}, [x8],  #16
   1230        b.gt            2b
   1231        subs            w4,  w4,  #4
   1232        b.le            9f
   1233        sub             x2,  x2,  w9, uxtw        // rewind top pointer
   1234        add             x0,  x0,  x1
   1235        add             x6,  x6,  x1
   1236        add             x5,  x5,  x1
   1237        add             x8,  x8,  x1
   1238        mov             w3,  w9                   // restore width
   1239        b               1b
   1240 9:
   1241        ret
   1242 endfunc
   1243 
   1244 jumptable ipred_smooth_v_tbl
       // Width handlers, ordered 64,32,16,8,4; indexed with clz(width) - 25
       // by ipred_smooth_v_8bpc_neon.
   1245        .word 640b - ipred_smooth_v_tbl
   1246        .word 320b - ipred_smooth_v_tbl
   1247        .word 160b - ipred_smooth_v_tbl
   1248        .word 80b  - ipred_smooth_v_tbl
   1249        .word 40b  - ipred_smooth_v_tbl
   1250 endjumptable
   1251 
   1252 // void ipred_smooth_h_8bpc_neon(pixel *dst, const ptrdiff_t stride,
   1253 //                               const pixel *const topleft,
   1254 //                               const int width, const int height, const int a,
   1255 //                               const int max_width, const int max_height);
   1256 function ipred_smooth_h_8bpc_neon, export=1
       // Horizontal-only smooth prediction, 8bpc: each output column is a
       // blend of the row's left pixel and the right pixel, weighted per
       // column by sm_weights[width]. x0=dst, x1=stride, x2=topleft,
       // w3=width, w4=height.
   1257        movrel          x8,  X(sm_weights)
   1258        add             x8,  x8,  w3, uxtw        // x8 = &sm_weights[width] (horizontal weights)
   1259        clz             w9,  w3
   1260        movrel          x5,  ipred_smooth_h_tbl
   1261        add             x12, x2,  w3, uxtw        // x12 = &topleft[width] (right pixel)
   1262        sub             w9,  w9,  #25             // table index = clz(width) - 25
   1263        ldrsw           x9,  [x5, w9, uxtw #2]
   1264        ld1r            {v5.16b},  [x12] // right
   1265        add             x5,  x5,  x9
   1266        add             x6,  x0,  x1              // x6 = second output row
   1267        lsl             x1,  x1,  #1              // x1 = 2*stride
   1268        br              x5
   1269 40:
   1270        AARCH64_VALID_JUMP_TARGET
   1271        ld1r            {v7.2s}, [x8]             // weights_hor
   1272        sub             x2,  x2,  #4
   1273        mov             x7,  #-4                  // x2/x7: read 4 left pixels per iteration
   1274        uxtl            v7.8h,   v7.8b            // weights_hor
   1275 4:
   1276        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
   1277        shll            v20.8h,  v5.8b,   #8      // right*256
   1278        shll            v21.8h,  v5.8b,   #8
   1279        zip1            v1.2s,   v1.2s,   v0.2s   // left, flipped
   1280        zip1            v0.2s,   v3.2s,   v2.2s
   1281        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
   1282        usubl           v1.8h,   v1.8b,   v5.8b
   1283        mla             v20.8h,  v0.8h,   v7.8h   // right*256  + (left-right)*weights_hor
   1284        mla             v21.8h,  v1.8h,   v7.8h
   1285        rshrn           v20.8b,  v20.8h,  #8      // round and scale back to 8 bit
   1286        rshrn           v21.8b,  v21.8h,  #8
   1287        st1             {v20.s}[0], [x0], x1
   1288        st1             {v20.s}[1], [x6], x1
   1289        subs            w4,  w4,  #4
   1290        st1             {v21.s}[0], [x0], x1
   1291        st1             {v21.s}[1], [x6], x1
   1292        b.gt            4b
   1293        ret
   1294 80:
   1295        AARCH64_VALID_JUMP_TARGET
   1296        ld1             {v7.8b}, [x8]             // weights_hor
   1297        sub             x2,  x2,  #4
   1298        mov             x7,  #-4                  // x2/x7: read 4 left pixels per iteration
   1299        uxtl            v7.8h,   v7.8b            // weights_hor
   1300 8:
   1301        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},  [x2], x7 // left
   1302        shll            v20.8h,  v5.8b,   #8      // right*256
   1303        shll            v21.8h,  v5.8b,   #8
   1304        shll            v22.8h,  v5.8b,   #8
   1305        shll            v23.8h,  v5.8b,   #8
   1306        usubl           v3.8h,   v3.8b,   v5.8b   // left-right
   1307        usubl           v2.8h,   v2.8b,   v5.8b
   1308        usubl           v1.8h,   v1.8b,   v5.8b
   1309        usubl           v0.8h,   v0.8b,   v5.8b
   1310        mla             v20.8h,  v3.8h,   v7.8h   // right*256  + (left-right)*weights_hor
   1311        mla             v21.8h,  v2.8h,   v7.8h   // (left flipped)
   1312        mla             v22.8h,  v1.8h,   v7.8h
   1313        mla             v23.8h,  v0.8h,   v7.8h
   1314        rshrn           v20.8b,  v20.8h,  #8      // round and scale back to 8 bit
   1315        rshrn           v21.8b,  v21.8h,  #8
   1316        rshrn           v22.8b,  v22.8h,  #8
   1317        rshrn           v23.8b,  v23.8h,  #8
   1318        st1             {v20.8b}, [x0], x1
   1319        st1             {v21.8b}, [x6], x1
   1320        subs            w4,  w4,  #4
   1321        st1             {v22.8b}, [x0], x1
   1322        st1             {v23.8b}, [x6], x1
   1323        b.gt            8b
   1324        ret
   1325 160:
   1326 320:
   1327 640:
   1328        AARCH64_VALID_JUMP_TARGET
   1329        sub             x2,  x2,  #4
   1330        mov             x7,  #-4                  // left pixels: 4 rows per outer iteration
   1331        // Set up pointers for four rows in parallel; x0, x6, x5, x10
   1332        add             x5,  x0,  x1
   1333        add             x10, x6,  x1
   1334        lsl             x1,  x1,  #1
   1335        sub             x1,  x1,  w3, uxtw        // x1 = 4*stride - width (advance 4 rows)
   1336        mov             w9,  w3                   // w9 = backup of width
   1337 
   1338 1:
       // Outer loop: four rows per iteration; per-row (left-right) terms.
   1339        ld4r            {v0.8b, v1.8b, v2.8b, v3.8b},   [x2],  x7 // left
   1340        usubl           v0.8h,   v0.8b,   v5.8b   // left-right
   1341        usubl           v1.8h,   v1.8b,   v5.8b
   1342        usubl           v2.8h,   v2.8b,   v5.8b
   1343        usubl           v3.8h,   v3.8b,   v5.8b
   1344 2:
       // Inner loop: 16 columns per iteration.
   1345        ld1             {v7.16b}, [x8],   #16     // weights_hor
   1346        shll            v20.8h,  v5.8b,   #8      // right*256
   1347        shll            v21.8h,  v5.8b,   #8
   1348        shll            v22.8h,  v5.8b,   #8
   1349        shll            v23.8h,  v5.8b,   #8
   1350        shll            v24.8h,  v5.8b,   #8
   1351        shll            v25.8h,  v5.8b,   #8
   1352        shll            v26.8h,  v5.8b,   #8
   1353        shll            v27.8h,  v5.8b,   #8
   1354        uxtl            v6.8h,   v7.8b            // weights_hor
   1355        uxtl2           v7.8h,   v7.16b
   1356        mla             v20.8h,  v3.8h,   v6.8h   // right*256  + (left-right)*weights_hor
   1357        mla             v21.8h,  v3.8h,   v7.8h   // (left flipped)
   1358        mla             v22.8h,  v2.8h,   v6.8h
   1359        mla             v23.8h,  v2.8h,   v7.8h
   1360        mla             v24.8h,  v1.8h,   v6.8h
   1361        mla             v25.8h,  v1.8h,   v7.8h
   1362        mla             v26.8h,  v0.8h,   v6.8h
   1363        mla             v27.8h,  v0.8h,   v7.8h
   1364        rshrn           v20.8b,  v20.8h,  #8      // round and scale back to 8 bit
   1365        rshrn2          v20.16b, v21.8h,  #8
   1366        rshrn           v22.8b,  v22.8h,  #8
   1367        rshrn2          v22.16b, v23.8h,  #8
   1368        rshrn           v24.8b,  v24.8h,  #8
   1369        rshrn2          v24.16b, v25.8h,  #8
   1370        rshrn           v26.8b,  v26.8h,  #8
   1371        rshrn2          v26.16b, v27.8h,  #8
   1372        subs            w3,  w3,  #16
   1373        st1             {v20.16b}, [x0],  #16
   1374        st1             {v22.16b}, [x6],  #16
   1375        st1             {v24.16b}, [x5],  #16
   1376        st1             {v26.16b}, [x10], #16
   1377        b.gt            2b
   1378        subs            w4,  w4,  #4
   1379        b.le            9f
   1380        sub             x8,  x8,  w9, uxtw        // rewind weights_hor pointer
   1381        add             x0,  x0,  x1
   1382        add             x6,  x6,  x1
   1383        add             x5,  x5,  x1
   1384        add             x10, x10, x1
   1385        mov             w3,  w9                   // restore width
   1386        b               1b
   1387 9:
   1388        ret
   1389 endfunc
   1390 
// Jump table for ipred_smooth_h: offsets relative to the table base,
// indexed by clz(width)-25 (i.e. entries for width 64, 32, 16, 8, 4,
// in that order; narrower widths give a larger index).
jumptable ipred_smooth_h_tbl
        .word 640b - ipred_smooth_h_tbl
        .word 320b - ipred_smooth_h_tbl
        .word 160b - ipred_smooth_h_tbl
        .word 80b  - ipred_smooth_h_tbl
        .word 40b  - ipred_smooth_h_tbl
endjumptable
   1398 
// 32 zero bytes followed by 32 0xff bytes. Loading 16 (or 32) bytes
// from (padding_mask - n) yields a mask whose first n bytes are 0x00
// and the rest 0xff; combined with BIT this replaces everything at or
// past in[n] with a replicated padding pixel.
const padding_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
   1410 
// void ipred_z1_upsample_edge_8bpc_neon(pixel *out, const int hsz,
//                                       const pixel *const in, const int end);
// Upsample the edge to twice the density: the output interleaves the
// original pixels with values interpolated by a (-1, 9, 9, -1) filter,
// rounded and saturated with a final >> 4.
// Elements at or past in[end] are first replaced by in[end] (padding).
function ipred_z1_upsample_edge_8bpc_neon, export=1
        movrel          x4,  padding_mask
        ld1             {v0.16b},  [x2]           // in[]
        add             x5,  x2,  w3,  uxtw       // in[end]
        sub             x4,  x4,  w3,  uxtw       // mask with `end` leading 0x00 bytes

        ld1r            {v1.16b},  [x5]           // padding
        ld1             {v3.16b},  [x4]           // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v1.16b,  v3.16b  // padded in[]

        // Shifted copies; v1 (all padding) supplies the bytes shifted in.
        ext             v4.16b,  v0.16b,  v1.16b,  #1
        ext             v5.16b,  v0.16b,  v1.16b,  #2
        ext             v6.16b,  v0.16b,  v1.16b,  #3

        uaddl           v16.8h,  v4.8b,   v5.8b   // in[i+1] + in[i+2]
        uaddl2          v17.8h,  v4.16b,  v5.16b
        uaddl           v18.8h,  v0.8b,   v6.8b   // in[i+0] + in[i+3]
        uaddl2          v19.8h,  v0.16b,  v6.16b
        mul             v16.8h,  v16.8h,  v31.8h  // 9*(in[i+1] + in[i+2])
        mul             v17.8h,  v17.8h,  v31.8h
        sub             v16.8h,  v16.8h,  v18.8h  // - (in[i+0] + in[i+3])
        sub             v17.8h,  v17.8h,  v19.8h

        sqrshrun        v16.8b,  v16.8h,  #4      // round, >> 4, clamp to u8
        sqrshrun2       v16.16b, v17.8h,  #4

        // Interleave original (in[i+1]) and filtered pixels.
        zip1            v0.16b,  v4.16b,  v16.16b
        zip2            v1.16b,  v4.16b,  v16.16b

        st1             {v0.16b, v1.16b}, [x0]

        ret
endfunc
   1449 
// void ipred_z2_upsample_edge_8bpc_neon(pixel *out, const int sz,
//                                       const pixel *const in);
// Upsample the edge with the (-1, 9, 9, -1) filter, interleaving
// original and filtered pixels. in[-1] is padded with in[0], and
// elements at or past in[sz] with in[sz].
function ipred_z2_upsample_edge_8bpc_neon, export=1
        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
        movrel          x4,  padding_mask
        ld1             {v0.16b},  [x2]           // in[]
        add             x5,  x2,  w1,  uxtw       // in[sz]
        sub             x4,  x4,  w1,  uxtw       // mask with `sz` leading 0x00 bytes

        ld1r            {v2.16b},  [x2]           // in[0] for padding
        ld1r            {v1.16b},  [x5]           // padding
        ld1             {v3.16b},  [x4]           // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v1.16b,  v3.16b  // padded in[]

        ext             v4.16b,  v2.16b,  v0.16b,  #15 // in[i-1], with in[-1] = in[0]
        ext             v5.16b,  v0.16b,  v1.16b,  #1  // in[i+1]
        ext             v6.16b,  v0.16b,  v1.16b,  #2  // in[i+2]

        uaddl           v16.8h,  v0.8b,   v5.8b   // in[i+0] + in[i+1]
        uaddl           v18.8h,  v4.8b,   v6.8b   // in[i-1] + in[i+2]
        mul             v16.8h,  v16.8h,  v31.8h  // 9*(in[i+0] + in[i+1])
        sub             v16.8h,  v16.8h,  v18.8h  // - (in[i-1] + in[i+2])

        sqrshrun        v16.8b,  v16.8h,  #4      // round, >> 4, clamp to u8

        add             x5,  x0,  #16

        zip1            v2.16b,  v0.16b,  v16.16b // interleave original/filtered

        st1             {v1.b}[0], [x5]
        // In case sz=8, output one single pixel in out[16].
        st1             {v2.16b}, [x0]

        ret
endfunc
   1488 
// 3-tap edge filter kernels for strengths 1 and 2, stored as
// 0, k0, k1, 0 (4 bytes per strength) so the two taps k0, k1 can be
// loaded as a single halfword from edge_filter + (strength - 1)*4 + 1.
// Each kernel applies as (k0*a + k1*b + k0*c) >> 4 (taps sum to 16).
const edge_filter
        .byte 0, 4, 8, 0
        .byte 0, 5, 6, 0
// Leaving out the coeffs for strength=3
//      .byte 2, 4, 4, 0
endconst
   1495 
// void ipred_z1_filter_edge_8bpc_neon(pixel *out, const int sz,
//                                     const pixel *const in, const int end,
//                                     const int strength);
// Smooth the edge with a symmetric filter selected by `strength`:
// strengths 1-2 use the 3-tap kernels from edge_filter (taps sum to 16,
// normalized by >> 4); strength 3 uses a 5-tap (2,4,4,4,2)/16 kernel.
// Pixels at or past in[end] are padded with the last valid pixel.
function ipred_z1_filter_edge_8bpc_neon, export=1
        cmp             w4, #3
        b.eq            L(fivetap)                // if (strength == 3) goto fivetap

        movrel          x5,  edge_filter, -3
        add             x5,  x5,  w4,  uxtw #2    // edge_filter + (strength - 1)*4 + 1

        ld1             {v31.h}[0], [x5]          // kernel[1-2]

        ld1             {v0.16b}, [x2], #16

        dup             v30.16b, v31.b[0]         // outer tap k0
        dup             v31.16b, v31.b[1]         // center tap k1
1:
        // in[end], is the last valid pixel. We produce 16 pixels out by
        // using 18 pixels in - the last pixel used is [17] of the ones
        // read/buffered.
        cmp             w3,  #17
        ld1             {v1.16b}, [x2], #16
        b.lt            2f
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        umull           v4.8h,   v0.8b,   v30.8b  // k0*in[i] + k1*in[i+1] + k0*in[i+2]
        umlal           v4.8h,   v2.8b,   v31.8b
        umlal           v4.8h,   v3.8b,   v30.8b
        umull2          v5.8h,   v0.16b,  v30.16b
        umlal2          v5.8h,   v2.16b,  v31.16b
        umlal2          v5.8h,   v3.16b,  v30.16b
        subs            w1,  w1,  #16             // sz -= 16
        mov             v0.16b,  v1.16b           // shift buffer down
        rshrn           v4.8b,   v4.8h,   #4      // round, normalize by 16
        rshrn2          v4.16b,  v5.8h,   #4
        sub             w3,  w3,  #16             // end -= 16
        st1             {v4.16b}, [x0], #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5,  padding_mask
        sub             w6,  w3,  #32
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x2,  w6,  sxtw

        ld1             {v2.16b}, [x5]            // padding_mask

        ld1r            {v1.16b}, [x6]
        bit             v0.16b,  v1.16b,  v2.16b  // Pad v0-v1

        // Filter one block
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        umull           v4.8h,   v0.8b,   v30.8b
        umlal           v4.8h,   v2.8b,   v31.8b
        umlal           v4.8h,   v3.8b,   v30.8b
        umull2          v5.8h,   v0.16b,  v30.16b
        umlal2          v5.8h,   v2.16b,  v31.16b
        umlal2          v5.8h,   v3.16b,  v30.16b
        subs            w1,  w1,  #16
        rshrn           v4.8b,   v4.8h,   #4
        rshrn2          v4.16b,  v5.8h,   #4
        st1             {v4.16b}, [x0], #16
        b.le            9f
5:
        // After one block, any remaining output would only be filtering
        // padding - thus just store the padding.
        subs            w1,  w1,  #16
        st1             {v1.16b}, [x0], #16
        b.gt            5b
9:
        ret

L(fivetap):
        sub             x2,  x2,  #1              // topleft -= 1
        movi            v29.16b, #2               // 5-tap kernel (2,4,4,4,2)
        ld1             {v0.16b}, [x2], #16
        movi            v30.16b, #4
        movi            v31.16b, #4
        ins             v0.b[0], v0.b[1]          // left padding: in[-1] = in[0]
1:
        // in[end+1], is the last valid pixel. We produce 16 pixels out by
        // using 20 pixels in - the last pixel used is [19] of the ones
        // read/buffered.
        cmp             w3,  #18
        ld1             {v1.16b}, [x2], #16
        b.lt            2f                        // if (end + 1 < 19)
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v0.16b,  v1.16b,  #3
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        umull           v6.8h,   v0.8b,   v29.8b  // 2*in[i] + 4*in[i+1] + 4*in[i+2]
        umlal           v6.8h,   v2.8b,   v30.8b  // + 4*in[i+3] + 2*in[i+4]
        umlal           v6.8h,   v3.8b,   v31.8b
        umlal           v6.8h,   v4.8b,   v30.8b
        umlal           v6.8h,   v5.8b,   v29.8b
        umull2          v7.8h,   v0.16b,  v29.16b
        umlal2          v7.8h,   v2.16b,  v30.16b
        umlal2          v7.8h,   v3.16b,  v31.16b
        umlal2          v7.8h,   v4.16b,  v30.16b
        umlal2          v7.8h,   v5.16b,  v29.16b
        subs            w1,  w1,  #16             // sz -= 16
        mov             v0.16b,  v1.16b           // shift buffer down
        rshrn           v6.8b,   v6.8h,   #4      // round, normalize by 16
        rshrn2          v6.16b,  v7.8h,   #4
        sub             w3,  w3,  #16             // end -= 16
        st1             {v6.16b}, [x0], #16
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-32] is the padding pixel (x2 points 32 bytes ahead)
        movrel          x5,  padding_mask, -1
        sub             w6,  w3,  #31
        sub             x5,  x5,  w3,  uxtw
        add             x6,  x2,  w6,  sxtw

        ld1             {v2.16b, v3.16b}, [x5]    // padding_mask

        ld1r            {v28.16b}, [x6]
        bit             v0.16b,  v28.16b, v2.16b  // Pad v0-v1
        bit             v1.16b,  v28.16b, v3.16b
4:
        // Filter one block
        ext             v2.16b,  v0.16b,  v1.16b,  #1
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v0.16b,  v1.16b,  #3
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        umull           v6.8h,   v0.8b,   v29.8b
        umlal           v6.8h,   v2.8b,   v30.8b
        umlal           v6.8h,   v3.8b,   v31.8b
        umlal           v6.8h,   v4.8b,   v30.8b
        umlal           v6.8h,   v5.8b,   v29.8b
        umull2          v7.8h,   v0.16b,  v29.16b
        umlal2          v7.8h,   v2.16b,  v30.16b
        umlal2          v7.8h,   v3.16b,  v31.16b
        umlal2          v7.8h,   v4.16b,  v30.16b
        umlal2          v7.8h,   v5.16b,  v29.16b
        subs            w1,  w1,  #16
        mov             v0.16b,  v1.16b
        mov             v1.16b,  v28.16b          // next block is all padding
        rshrn           v6.8b,   v6.8h,   #4
        rshrn2          v6.16b,  v7.8h,   #4
        sub             w3,  w3,  #16
        st1             {v6.16b}, [x0], #16
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3,  #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1,  w1,  #16
        st1             {v1.16b}, [x0], #16
        b.gt            5b
9:
        ret
endfunc
   1659 
   1660 // void ipred_pixel_set_8bpc_neon(pixel *out, const pixel px,
   1661 //                                const int n);
   1662 function ipred_pixel_set_8bpc_neon, export=1
   1663        dup             v0.16b,  w1
   1664 1:
   1665        subs            w2,  w2,  #16
   1666        st1             {v0.16b}, [x0], #16
   1667        b.gt            1b
   1668        ret
   1669 endfunc
   1670 
// void ipred_z1_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
//                               const pixel *const top,
//                               const int width, const int height,
//                               const int dx, const int max_base_x);
// Directional z1 fill: for each row, interpolate between top[base] and
// top[base+1] with a 6-bit fraction of xpos, where xpos starts at dx
// and advances by dx per row. Rows whose base reaches max_base_x are
// filled with the replicated top[max_base_x] pixel.
function ipred_z1_fill1_8bpc_neon, export=1
        clz             w9,  w3
        movrel          x8,  ipred_z1_fill1_tbl
        sub             w9,  w9,  #25             // table index from clz(width)
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x10, x2,  w6,  uxtw       // top[max_base_x]
        add             x8,  x8,  x9
        ld1r            {v31.16b}, [x10]          // padding
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        br              x8
40:     // w == 4, two rows per iteration
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        ldr             d0,  [x2, w8, uxtw]       // top[base]
        ldr             d2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        ext             v1.8b,   v0.8b,   v0.8b,   #1 // top[base+1]
        ext             v3.8b,   v2.8b,   v2.8b,   #1
        usubl           v6.8h,   v1.8b,   v0.8b   // top[base+1]-top[base]
        usubl           v7.8h,   v3.8b,   v2.8b
        ushll           v16.8h,  v0.8b,   #6      // top[base]*64
        ushll           v17.8h,  v2.8b,   #6
        mla             v16.4h,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
        mla             v17.4h,  v7.4h,   v5.4h
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.s}[0], [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.s}[0], [x0], x1
        b.gt            4b
        ret

49:     // Remaining rows are all padding
        st1             {v31.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.s}[0], [x0], x1
        b.gt            49b
        ret

80:     // w == 8, two rows per iteration
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8b,   w9               // frac
        dup             v5.8b,   w11
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b,   w9               // 64 - frac
        dup             v7.8b,   w11
        ext             v1.16b,  v0.16b,  v0.16b,  #1 // top[base+1]
        ext             v3.16b,  v2.16b,  v2.16b,  #1
        umull           v16.8h,  v0.8b,   v6.8b   // top[base]*(64-frac)
        umlal           v16.8h,  v1.8b,   v4.8b   // + top[base+1]*frac
        umull           v17.8h,  v2.8b,   v7.8b
        umlal           v17.8h,  v3.8b,   v5.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.gt            8b
        ret

89:     // Remaining rows are all padding
        st1             {v31.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8b}, [x0], x1
        b.gt            89b
        ret

160:    // w == 16/32/64, two rows per iteration, 16 pixels per inner pass
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov             w12, w3                   // remember width

        add             x13, x0,  x1              // second-row pointer
        lsl             x1,  x1,  #1              // stride *= 2
        sub             x1,  x1,  w3,  uxtw       // compensate for x0 advancing by width
1:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            169f
        add             x8,  x2,  w8,  uxtw
        add             x10, x2,  w10, uxtw
        dup             v4.16b,  w9               // frac
        dup             v5.16b,  w11
        ld1             {v0.16b, v1.16b}, [x8],  #32 // top[base]
        ld1             {v2.16b, v3.16b}, [x10], #32
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.16b,  w9               // 64 - frac
        dup             v7.16b,  w11
        add             w7,  w7,  w5              // xpos += dx
2:
        ext             v16.16b, v0.16b,  v1.16b,  #1 // top[base+1]
        ext             v17.16b, v2.16b,  v3.16b,  #1
        subs            w3,  w3,  #16
        umull           v18.8h,  v0.8b,   v6.8b   // top[base]*(64-frac)
        umlal           v18.8h,  v16.8b,  v4.8b   // + top[base+1]*frac
        umull2          v19.8h,  v0.16b,  v6.16b
        umlal2          v19.8h,  v16.16b, v4.16b
        umull           v20.8h,  v2.8b,   v7.8b
        umlal           v20.8h,  v17.8b,  v5.8b
        umull2          v21.8h,  v2.16b,  v7.16b
        umlal2          v21.8h,  v17.16b, v5.16b
        rshrn           v16.8b,  v18.8h,  #6
        rshrn2          v16.16b, v19.8h,  #6
        rshrn           v17.8b,  v20.8h,  #6
        rshrn2          v17.16b, v21.8h,  #6
        st1             {v16.16b}, [x0],  #16
        st1             {v17.16b}, [x13], #16
        b.le            3f
        mov             v0.16b,  v1.16b           // shift buffers down
        ld1             {v1.16b}, [x8],  #16 // top[base]
        mov             v2.16b,  v3.16b
        ld1             {v3.16b}, [x10], #16
        b               2b

3:
        subs            w4,  w4,  #2
        b.le            9f
        add             x0,  x0,  x1              // advance to next row pair
        add             x13, x13, x1
        mov             w3,  w12                  // reset width counter
        b               1b
9:
        ret

169:    // Remaining rows are all padding
        st1             {v31.16b}, [x0],  #16
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x13], #16
        b.gt            169b
        subs            w4,  w4,  #2
        b.le            9b
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               169b
endfunc
   1840 
// Jump table for ipred_z1_fill1: offsets relative to the table base,
// indexed by clz(width)-25 (entries for width 64, 32, 16, 8, 4).
jumptable ipred_z1_fill1_tbl
        .word 640b - ipred_z1_fill1_tbl
        .word 320b - ipred_z1_fill1_tbl
        .word 160b - ipred_z1_fill1_tbl
        .word 80b  - ipred_z1_fill1_tbl
        .word 40b  - ipred_z1_fill1_tbl
endjumptable
   1848 
// Same arguments as ipred_z1_fill1_8bpc_neon, but for an upsampled
// (interleaved) edge: top[base] pixels sit at even indices and
// top[base+1] at odd indices, hence the uzp1/uzp2 deinterleaving.
// Only handles width 4 and 8.
function ipred_z1_fill2_8bpc_neon, export=1
        cmp             w3,  #8
        add             x10, x2,  w6,  uxtw       // top[max_base_x]
        ld1r            {v31.16b}, [x10]          // padding
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        b.eq            8f

4:      // w == 4
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        ldr             d0,  [x2, w8, uxtw]       // top[base]
        ldr             d2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        uzp2            v1.8b,   v0.8b,   v0.8b   // top[base+1]
        uzp1            v0.8b,   v0.8b,   v0.8b   // top[base]
        uzp2            v3.8b,   v2.8b,   v2.8b
        uzp1            v2.8b,   v2.8b,   v2.8b
        usubl           v6.8h,   v1.8b,   v0.8b   // top[base+1]-top[base]
        usubl           v7.8h,   v3.8b,   v2.8b
        ushll           v16.8h,  v0.8b,   #6      // top[base]*64
        ushll           v17.8h,  v2.8b,   #6
        mla             v16.4h,  v6.4h,   v4.4h   // + (top[base+1]-top[base])*frac
        mla             v17.4h,  v7.4h,   v5.4h
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.s}[0], [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.s}[0], [x0], x1
        b.gt            4b
        ret

49:     // Remaining rows are all padding
        st1             {v31.s}[0], [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.s}[0], [x0], x1
        b.gt            49b
        ret

8:      // w == 8
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8b,   w9               // frac
        dup             v5.8b,   w11
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v6.8b,   w9               // 64 - frac
        dup             v7.8b,   w11
        uzp2            v1.16b,  v0.16b,  v0.16b  // top[base+1]
        uzp1            v0.16b,  v0.16b,  v0.16b  // top[base]
        uzp2            v3.16b,  v2.16b,  v2.16b
        uzp1            v2.16b,  v2.16b,  v2.16b
        umull           v16.8h,  v1.8b,   v4.8b   // top[base+1]*frac
        umlal           v16.8h,  v0.8b,   v6.8b   // + top[base]*(64-frac)
        umull           v17.8h,  v3.8b,   v5.8b
        umlal           v17.8h,  v2.8b,   v7.8b
        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v17.8b,  v17.8h,  #6
        st1             {v16.8b}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8b}, [x0], x1
        b.gt            8b
        ret

89:     // Remaining rows are all padding
        st1             {v31.8b}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8b}, [x0], x1
        b.gt            89b
        ret
endfunc
   1935 
   1936 // void ipred_reverse_8bpc_neon(pixel *dst, const pixel *const src,
   1937 //                              const int n);
   1938 function ipred_reverse_8bpc_neon, export=1
   1939        sub             x1,  x1,  #16
   1940        add             x3,  x0,  #8
   1941        mov             x4,  #16
   1942 1:
   1943        ld1             {v0.16b}, [x1]
   1944        subs            w2,  w2,  #16
   1945        rev64           v0.16b,  v0.16b
   1946        sub             x1,  x1,  #16
   1947        st1             {v0.d}[1], [x0], x4
   1948        st1             {v0.d}[0], [x3], x4
   1949        b.gt            1b
   1950        ret
   1951 endfunc
   1952 
// Constant vector of 16-bit lane indices {0, 1, ..., 15}, used to
// compute per-lane position increments (e.g. multiplied by -dy below).
const increments
        .short          0,  1,  2,  3,  4,  5,  6,  7
        .short          8,  9,  10, 11, 12, 13, 14, 15
endconst
   1957 
   1958 // void ipred_z2_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
   1959 //                               const pixel *const top,
   1960 //                               const pixel *const left,
   1961 //                               const int width, const int height,
   1962 //                               const int dx, const int dy);
   1963 function ipred_z2_fill1_8bpc_neon, export=1
   1964        clz             w10, w4
   1965        movrel          x9,  ipred_z2_fill1_tbl
   1966        sub             w10, w10, #25
   1967        ldrsw           x10, [x9, w10, uxtw #2]
   1968        mov             w8,  #(1 << 6)            // xpos = 1 << 6
   1969        add             x9,  x9,  x10
   1970        sub             w8,  w8,  w6              // xpos -= dx
   1971 
   1972        movrel          x11, increments
   1973        ld1             {v31.8h},  [x11]          // increments
   1974        neg             w7,  w7                   // -dy
   1975 
   1976        br              x9
   1977 40:
   1978        AARCH64_VALID_JUMP_TARGET
   1979 
   1980        dup             v30.4h,  w7               // -dy
   1981        movi            v17.8b,  #1
   1982 
   1983        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
   1984        movi            v25.16b, #0x3e
   1985        add             v30.4h,  v16.4h,  v30.4h  // -= dy
   1986 
   1987        xtn             v31.8b,  v31.8h           // {0,1,2,3}
   1988 
   1989        // Worst case height for w=4 is 16, but we need at least h+1 elements
   1990        ld1             {v0.16b, v1.16b}, [x3]    // left[]
   1991 
   1992        movi            v26.16b, #64
   1993        movi            v19.16b, #2
   1994 
   1995        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
   1996        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
   1997        and             v27.8b,  v27.8b,  v25.8b  // frac_y
   1998 
        // w=4 codepath (tail): base_y indices set up, then blend per 2 rows.
   1999        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
   2000
   2001        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
   2002        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2
   2003
   2004        tbl             v16.8b, {v0.16b}, v29.8b  // left[base_y]
   2005
   2006        trn1            v30.2s,  v30.2s,  v28.2s  // base_y + 1, base_y + 2
   2007
   2008        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
   2009
   2010        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}
   2011
   2012        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
   2013        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y
   2014
   2015        movi            v29.8b,  #2
        // Main loop: two output rows per iteration; each 4-pixel row picks
        // per-pixel between the top[]-based and left[]-based interpolation
        // depending on the sign of its actual base_x.
   2016 4:
   2017        asr             w9,  w8,  #6              // base_x
   2018        dup             v6.4h,   w8               // xpos
   2019        sub             w8,  w8,  w6              // xpos -= dx
   2020        cmp             w9,  #-4                  // base_x <= -4
   2021        asr             w11, w8,  #6              // base_x
   2022        b.le            49f
   2023
   2024        dup             v7.4h,   w8               // xpos
   2025
   2026        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
   2027        ldr             d4,  [x2, w11, sxtw]
   2028
   2029        trn1            v6.2d,   v6.2d,   v7.2d   // xpos
   2030
   2031        // Cut corners here; only doing tbl over v0 here; we only
   2032        // seem to need the last pixel, from v1, after skipping to the
   2033        // left-only codepath below.
   2034        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]
   2035
   2036        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
   2037        xtn             v6.8b,   v6.8h            // (uint8_t)xpos
   2038
   2039        ext             v3.8b,   v2.8b,   v2.8b,   #1 // top[base_x+1]
   2040        ext             v5.8b,   v4.8b,   v4.8b,   #1
   2041
   2042        and             v6.8b,   v6.8b,   v25.8b  // frac_x
   2043
   2044        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
   2045
   2046        trn1            v2.2s,   v2.2s,   v4.2s   // top[base_x]
   2047        trn1            v3.2s,   v3.2s,   v5.2s   // top[base_x+1]
   2048
   2049        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x
   2050
   2051        add             v20.8b,  v20.8b,  v31.8b  // actual base_x
   2052
   2053        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2054        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
   2055
   2056        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]*(64-frac_x)
   2057        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x
   2058
   2059        cmge            v20.8b,  v20.8b,  #0      // per-pixel: base_x >= 0, use top[]
   2060
   2061        rshrn           v16.8b,  v16.8h,  #6
   2062        rshrn           v22.8b,  v22.8h,  #6
   2063
   2064        bit             v16.8b,  v22.8b,  v20.8b  // select top-pred where base_x >= 0
   2065
   2066        st1             {v16.s}[0], [x0], x1
   2067        sub             w8,  w8,  w6              // xpos -= dx
   2068        subs            w5,  w5,  #2
   2069        st1             {v16.s}[1], [x0], x1
   2070        b.le            9f
   2071
   2072        ext             v16.8b,  v17.8b,  v17.8b, #4 // reuse left[base_y+2] as next left[base_y]
   2073        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
   2074        b               4b
   2075
        // base_x <= -4: all remaining rows are predicted from left[] only.
   2076 49:
   2077        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+2]
   2078
   2079        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
   2080
   2081        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2082        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
   2083        rshrn           v18.8b,  v18.8h,  #6
   2084
   2085        st1             {v18.s}[0], [x0], x1
   2086        subs            w5,  w5,  #2
   2087        st1             {v18.s}[1], [x0], x1
   2088        b.le            9f
   2089
   2090        ext             v16.8b,  v17.8b,  v17.8b, #4
   2091        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
   2092        b               49b
   2093
   2094 9:
   2095        ret
   2096 
        // w=8 codepath: same structure as w=4 but processing full 8-pixel
        // rows, two rows per loop iteration.
   2097 80:
   2098        AARCH64_VALID_JUMP_TARGET
   2099
   2100        dup             v30.8h,  w7               // -dy
   2101        movi            v17.8b,  #1
   2102
   2103        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
   2104        movi            v25.16b, #0x3e
   2105        add             v30.8h,  v16.8h,  v30.8h  // -= dy
   2106
   2107        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
   2108
   2109        // Worst case height for w=8 is 32, but we need at least h+1 elements
   2110        ld1             {v0.16b, v1.16b, v2.16b}, [x3]    // left[]
   2111
   2112        movi            v26.16b, #64
   2113        movi            v19.16b, #2
   2114
   2115        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
   2116        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
   2117        and             v27.8b,  v27.8b,  v25.8b  // frac_y
   2118
   2119        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
   2120
   2121        // Cut corners here; for the first row we don't expect to need to
   2122        // read outside of v0.
   2123        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
   2124
   2125        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2
   2126        add             v29.8b,  v29.8b,  v17.8b  // base_y + 1
   2127
   2128        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
   2129
   2130        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
   2131
   2132        movi            v24.8b,  #2               // 2
   2133 8:
   2134        asr             w9,  w8,  #6              // base_x
   2135        dup             v16.8h,   w8              // xpos
   2136        sub             w8,  w8,  w6              // xpos -= dx
   2137        cmp             w9,  #-8                  // base_x <= -8
   2138        asr             w11, w8,  #6              // base_x
   2139        b.le            89f
   2140
   2141        dup             v17.8h,   w8              // xpos
   2142
   2143        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
   2144        ldr             q6,  [x2, w11, sxtw]
   2145
   2146        // Cut corners here; only doing tbl over v0-v1 here; we only
   2147        // seem to need the last pixel, from v2, after skipping to the
   2148        // left-only codepath below.
   2149        tbl             v19.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+1]
   2150
   2151        shrn            v21.8b,  v16.8h,  #6      // first base_x
   2152        shrn2           v21.16b, v17.8h,  #6
   2153        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
   2154        xtn2            v16.16b, v17.8h
   2155
   2156        tbl             v20.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+2]
   2157
   2158        ext             v5.16b,  v4.16b,  v4.16b,  #1 // top[base_x+1]
   2159        ext             v7.16b,  v6.16b,  v6.16b,  #1
   2160
   2161        and             v16.16b, v16.16b, v25.16b // frac_x
   2162
   2163        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
   2164        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
   2165
   2166        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x
   2167
   2168        add             v21.16b, v21.16b, v31.16b // actual base_x
   2169
   2170        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2171        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
   2172        umull           v17.8h,  v19.8b,  v28.8b
   2173        umlal           v17.8h,  v20.8b,  v27.8b
   2174
   2175        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]*(64-frac_x)
   2176        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
   2177        umull2          v23.8h,  v4.16b,  v7.16b
   2178        umlal2          v23.8h,  v5.16b,  v16.16b
   2179
   2180        cmge            v21.16b, v21.16b, #0      // per-pixel: base_x >= 0, use top[]
   2181
   2182        rshrn           v6.8b,   v6.8h,   #6
   2183        rshrn2          v6.16b,  v17.8h,  #6
   2184        rshrn           v22.8b,  v22.8h,  #6
   2185        rshrn2          v22.16b, v23.8h,  #6
   2186
   2187        bit             v6.16b,  v22.16b, v21.16b // select top-pred where base_x >= 0
   2188
   2189        st1             {v6.d}[0], [x0], x1
   2190        sub             w8,  w8,  w6              // xpos -= dx
   2191        subs            w5,  w5,  #2
   2192        st1             {v6.d}[1], [x0], x1
   2193        b.le            9f
   2194
   2195        mov             v18.8b,  v20.8b           // left[base_y+2] becomes next left[base_y]
   2196        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
   2197        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
   2198        b               8b
   2199
        // base_x <= -8: all remaining rows are predicted from left[] only.
   2200 89:
   2201        tbl             v19.8b, {v0.16b, v1.16b, v2.16b}, v29.8b // left[base_y+1]
   2202        tbl             v20.8b, {v0.16b, v1.16b, v2.16b}, v30.8b // left[base_y+2]
   2203
   2204        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2205        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
   2206        umull           v17.8h,  v19.8b,  v28.8b
   2207        umlal           v17.8h,  v20.8b,  v27.8b
   2208
   2209        rshrn           v6.8b,   v6.8h,   #6
   2210        rshrn2          v6.16b,  v17.8h,  #6
   2211
   2212        st1             {v6.d}[0], [x0], x1
   2213        subs            w5,  w5,  #2
   2214        st1             {v6.d}[1], [x0], x1
   2215        b.le            9f
   2216
   2217        mov             v18.8b,  v20.8b
   2218        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
   2219        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
   2220        b               89b
   2221
   2222 9:
   2223        ret
   2224 
        // w=16 codepath: 16-pixel rows, two rows per iteration. Uses
        // d8-d15, which AAPCS64 makes callee-saved, hence the stp/ldp pairs.
        // left[] lookups switch from tbl to tbx with v15 (left[0]) as the
        // fallback for out-of-range (>= 64) indices.
   2225 160:
   2226        AARCH64_VALID_JUMP_TARGET
   2227
   2228        stp             d8,  d9,  [sp, #-0x40]!   // save callee-saved d8-d15
   2229        stp             d10, d11, [sp, #0x10]
   2230        stp             d12, d13, [sp, #0x20]
   2231        stp             d14, d15, [sp, #0x30]
   2232
   2233        add             x11, x11, #16             // increments
   2234
   2235        dup             v18.8h,  w7               // -dy
   2236        movi            v17.16b, #1
   2237        add             x3,  x3,  #1              // Skip past left[0]
   2238
   2239        ld1             {v14.8h}, [x11]           // {8,9,10,11,12,13,14,15}
   2240
   2241        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
   2242        mul             v19.8h,  v14.8h,  v18.8h  // {8,9,10,11,12,13,14,15}* -dy
   2243        movi            v25.16b, #0x3e
   2244        add             v16.8h,  v16.8h,  v18.8h  // -= dy
   2245        add             v18.8h,  v19.8h,  v18.8h
   2246
   2247        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
   2248        xtn2            v31.16b, v14.8h           // {8,9,10,11,12,13,14,15}
   2249
   2250        // Worst case height is 64.
   2251        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
   2252        ld1r            {v15.16b}, [x2]           // left[0] == top[0]
   2253
   2254        movi            v26.16b, #64
   2255        movi            v19.16b, #2
   2256
   2257        xtn             v27.8b,  v16.8h           // (uint8_t)ypos
   2258        xtn2            v27.16b, v18.8h
   2259        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
   2260        shrn2           v29.16b, v18.8h,  #6
   2261        mov             v18.16b, v15.16b          // left[0]
   2262        and             v27.16b, v27.16b, v25.16b // frac_y
   2263
   2264        // Cut corners here; for the first row we don't expect to need to
   2265        // read outside of v0.
   2266        tbx             v18.16b, {v0.16b}, v29.16b // left[base_y]
   2267
   2268        add             v30.16b, v29.16b, v19.16b // base_y + 2
   2269        add             v29.16b, v29.16b, v17.16b // base_y + 1
   2270
   2271        sub             v28.16b, v26.16b, v27.16b // 64 - frac_y
   2272
   2273        movi            v24.16b, #2               // 2
   2274 16:
   2275        asr             w9,  w8,  #6              // base_x
   2276        dup             v16.8h,   w8              // xpos
   2277        sub             w8,  w8,  w6              // xpos -= dx
   2278        cmp             w9,  #-16                 // base_x <= -16
   2279        asr             w11, w8,  #6              // base_x
   2280        b.le            169f
   2281
   2282        dup             v17.8h,   w8              // xpos
   2283
   2284        add             x9,  x2,  w9,  sxtw
   2285        add             x11, x2,  w11, sxtw
   2286
   2287        ld1             {v4.16b, v5.16b}, [x9]    // top[base_x]
   2288        mov             v19.16b, v15.16b          // left[0]
   2289        ld1             {v6.16b, v7.16b}, [x11]
   2290
   2291        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
   2292
   2293        mov             v20.16b, v15.16b          // left[0]
   2294
   2295        shrn            v21.8b,  v16.8h,  #6      // first base_x
   2296        shrn            v22.8b,  v17.8h,  #6
   2297        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
   2298        xtn             v17.8b,  v17.8h
   2299
   2300        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
   2301
   2302        trn1            v21.2d,  v21.2d,  v21.2d  // first base_x
   2303        trn1            v22.2d,  v22.2d,  v22.2d
   2304        trn1            v16.2d,  v16.2d,  v16.2d  // (uint8_t)xpos
   2305        trn1            v17.2d,  v17.2d,  v17.2d
   2306
   2307        ext             v5.16b,  v4.16b,  v5.16b,  #1 // top[base_x+1]
   2308        ext             v7.16b,  v6.16b,  v7.16b,  #1
   2309
   2310        and             v16.16b, v16.16b, v25.16b // frac_x
   2311        and             v17.16b, v17.16b, v25.16b
   2312
   2313        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2314        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
   2315
   2316        sub             v8.16b,  v26.16b, v16.16b // 64 - frac_x
   2317        sub             v9.16b,  v26.16b, v17.16b
   2318
   2319        umull2          v11.8h,  v18.16b, v28.16b
   2320        umlal2          v11.8h,  v19.16b, v27.16b
   2321
   2322        add             v21.16b, v21.16b, v31.16b // actual base_x
   2323        add             v22.16b, v22.16b, v31.16b
   2324
   2325        umull           v12.8h,  v19.8b,  v28.8b
   2326        umlal           v12.8h,  v20.8b,  v27.8b
   2327        umull2          v13.8h,  v19.16b, v28.16b
   2328        umlal2          v13.8h,  v20.16b, v27.16b
   2329
   2330        rshrn           v10.8b,  v10.8h,  #6
   2331        rshrn2          v10.16b, v11.8h,  #6
   2332        rshrn           v11.8b,  v12.8h,  #6
   2333        rshrn2          v11.16b, v13.8h,  #6
   2334
   2335        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]*(64-frac_x)
   2336        umlal           v12.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
   2337        umull2          v13.8h,  v4.16b,  v8.16b
   2338        umlal2          v13.8h,  v5.16b,  v16.16b
   2339        umull           v14.8h,  v6.8b,   v9.8b
   2340        umlal           v14.8h,  v7.8b,   v17.8b
   2341        umull2          v18.8h,  v6.16b,  v9.16b
   2342        umlal2          v18.8h,  v7.16b,  v17.16b
   2343
   2344        cmge            v21.16b, v21.16b, #0      // per-pixel: base_x >= 0, use top[]
   2345        cmge            v22.16b, v22.16b, #0
   2346
   2347        rshrn           v12.8b,  v12.8h,  #6
   2348        rshrn2          v12.16b, v13.8h,  #6
   2349        rshrn           v13.8b,  v14.8h,  #6
   2350        rshrn2          v13.16b, v18.8h,  #6
   2351
   2352        bit             v10.16b, v12.16b, v21.16b // select top-pred where base_x >= 0
   2353        bit             v11.16b, v13.16b, v22.16b
   2354
   2355        st1             {v10.16b}, [x0], x1
   2356        subs            w5,  w5,  #2
   2357        sub             w8,  w8,  w6              // xpos -= dx
   2358        st1             {v11.16b}, [x0], x1
   2359        b.le            9f
   2360
   2361        mov             v18.16b, v20.16b          // left[base_y+2] becomes next left[base_y]
   2362        add             v29.16b, v29.16b, v24.16b // base_y += 2
   2363        add             v30.16b, v30.16b, v24.16b // base_y += 2
   2364        b               16b
   2365
        // base_x <= -16: all remaining rows are predicted from left[] only.
   2366 169:
   2367        mov             v19.16b, v15.16b
   2368        mov             v20.16b, v15.16b
   2369        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
   2370        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
   2371
   2372        umull           v4.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2373        umlal           v4.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
   2374        umull2          v5.8h,   v18.16b, v28.16b
   2375        umlal2          v5.8h,   v19.16b, v27.16b
   2376        umull           v6.8h,   v19.8b,  v28.8b
   2377        umlal           v6.8h,   v20.8b,  v27.8b
   2378        umull2          v7.8h,   v19.16b, v28.16b
   2379        umlal2          v7.8h,   v20.16b, v27.16b
   2380
   2381        rshrn           v4.8b,   v4.8h,   #6
   2382        rshrn2          v4.16b,  v5.8h,   #6
   2383        rshrn           v5.8b,   v6.8h,   #6
   2384        rshrn2          v5.16b,  v7.8h,   #6
   2385
   2386        st1             {v4.16b}, [x0], x1
   2387        subs            w5,  w5,  #2
   2388        st1             {v5.16b}, [x0], x1
   2389        b.le            9f
   2390
   2391        mov             v18.16b, v20.16b
   2392        add             v29.16b, v29.16b, v24.16b // base_y += 2
   2393        add             v30.16b, v30.16b, v24.16b // base_y += 2
   2394        b               169b
   2395
   2396 9:
   2397        ldp             d14, d15, [sp, #0x30]     // restore callee-saved d8-d15
   2398        ldp             d12, d13, [sp, #0x20]
   2399        ldp             d10, d11, [sp, #0x10]
   2400        ldp             d8,  d9,  [sp], 0x40
   2401        ret
   2402 
        // w=32 / w=64 codepath: rows wider than a register pair, so the
        // inner loop (2:) walks the two current rows 16 pixels at a time,
        // with x0/x13 as alternating even/odd row pointers.
   2403 320:
   2404 640:
   2405        AARCH64_VALID_JUMP_TARGET
   2406
   2407        stp             d8,  d9,  [sp, #-0x40]!   // save callee-saved d8-d15
   2408        stp             d10, d11, [sp, #0x10]
   2409        stp             d12, d13, [sp, #0x20]
   2410        stp             d14, d15, [sp, #0x30]
   2411
   2412        add             x11, x11, #16             // increments
   2413
   2414        dup             v25.8h,  w7               // -dy
   2415        add             x3,  x3,  #1              // Skip past left[0]
   2416
   2417        ld1             {v14.8h}, [x11]           // {8,9,10,11,12,13,14,15}
   2418
   2419        add             x13, x0,  x1              // alternating row
   2420        lsl             x1,  x1,  #1              // stride *= 2
   2421        sub             x1,  x1,  w4,  uxtw       // stride -= width
   2422
   2423        movi            v11.8h,  #8
   2424        mul             v26.8h,  v31.8h,  v25.8h  // {0,1,2,3,4,5,6,7}* -dy
   2425        add             v26.8h,  v26.8h,  v25.8h  // -= dy
   2426        mul             v25.8h,  v25.8h,  v11.8h  // -8*dy
   2427
   2428        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
   2429        xtn2            v31.16b, v14.8h           // {8,9,10,11,12,13,14,15}
   2430
   2431        // Worst case height is 64.
   2432        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x3] // left[]
   2433        ld1r            {v15.16b}, [x2]           // left[0] == top[0]
   2434
   2435        mov             w12, w4                   // orig w
   2436        neg             w14, w4                   // -w
   2437
        // Outer loop: one pair of rows per iteration.
   2438 1:
   2439        mov             v23.16b, v26.16b          // reset ypos
   2440
   2441        asr             w9,  w8,  #6              // base_x
   2442        dup             v16.8h,   w8              // xpos
   2443        sub             w8,  w8,  w6              // xpos -= dx
   2444        cmp             w9,  w14                  // base_x <= -w
   2445        asr             w11, w8,  #6              // base_x
   2446        b.le            329f
   2447
   2448        dup             v17.8h,   w8              // xpos
   2449        sub             w8,  w8,  w6              // xpos -= dx
   2450
   2451        add             x9,  x2,  w9,  sxtw
   2452        add             x11, x2,  w11, sxtw
   2453
   2454        sqshrn          v21.8b,  v16.8h,  #6      // first base_x
   2455        sqshrn          v22.8b,  v17.8h,  #6
   2456        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
   2457        xtn             v17.8b,  v17.8h
   2458
   2459        ld1             {v4.16b}, [x9], #16       // top[base_x]
   2460        ld1             {v6.16b}, [x11], #16
   2461
   2462        trn1            v21.2d,  v21.2d,  v21.2d  // first base_x
   2463        trn1            v22.2d,  v22.2d,  v22.2d
   2464        trn1            v16.2d,  v16.2d,  v16.2d  // (uint8_t)xpos
   2465        trn1            v17.2d,  v17.2d,  v17.2d
   2466
   2467        movi            v10.16b, #0x3e
   2468        movi            v11.16b, #64
   2469
   2470        and             v16.16b, v16.16b, v10.16b // frac_x
   2471        and             v17.16b, v17.16b, v10.16b
   2472
   2473        sub             v8.16b,  v11.16b, v16.16b // 64 - frac_x
   2474        sub             v9.16b,  v11.16b, v17.16b
   2475
   2476        add             v21.16b, v21.16b, v31.16b // actual base_x
   2477        add             v22.16b, v22.16b, v31.16b
   2478
        // Inner loop: one 16-pixel chunk of both rows per iteration,
        // blending the top[]- and left[]-based predictions.
   2479 2:
   2480        add             v13.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
   2481        movi            v12.16b, #64
   2482        movi            v20.16b, #2
   2483        movi            v10.16b, #0x3e
   2484
   2485        smov            w10,     v22.b[0]
   2486
   2487        xtn             v27.8b,  v23.8h           // (uint8_t)ypos
   2488        xtn2            v27.16b, v13.8h
   2489        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
   2490        shrn2           v29.16b, v13.8h,  #6
   2491        cmp             w10, #0                   // base_x (bottom left) >= 0
   2492        and             v27.16b, v27.16b, v10.16b // frac_y
   2493
   2494        mov             v18.16b, v15.16b          // left[0]
   2495
   2496        b.ge            4f
   2497
   2498        add             v23.8h,  v13.8h,  v25.8h  // ypos -= 8*dy
   2499        movi            v13.16b, #1
   2500
   2501        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
   2502        add             v29.16b, v29.16b, v13.16b // base_y + 1
   2503        mov             v19.16b, v15.16b          // left[0]
   2504
   2505        sub             v28.16b, v12.16b, v27.16b // 64 - frac_y
   2506
   2507        ld1             {v5.16b}, [x9], #16       // top[base_x]
   2508        ld1             {v7.16b}, [x11], #16
   2509
   2510        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
   2511        add             v29.16b, v29.16b, v13.16b // base_y + 2
   2512
   2513        mov             v20.16b, v15.16b          // left[0]
   2514        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
   2515
   2516        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2517        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
   2518        umull2          v11.8h,  v18.16b, v28.16b
   2519        umlal2          v11.8h,  v19.16b, v27.16b
   2520        umull           v12.8h,  v19.8b,  v28.8b
   2521        umlal           v12.8h,  v20.8b,  v27.8b
   2522        umull2          v13.8h,  v19.16b, v28.16b
   2523        umlal2          v13.8h,  v20.16b, v27.16b
   2524
   2525        ext             v18.16b, v4.16b,  v5.16b,  #1 // top[base_x+1]
   2526        ext             v19.16b, v6.16b,  v7.16b,  #1
   2527
   2528        rshrn           v10.8b,  v10.8h,  #6
   2529        rshrn2          v10.16b, v11.8h,  #6
   2530        rshrn           v11.8b,  v12.8h,  #6
   2531        rshrn2          v11.16b, v13.8h,  #6
   2532
   2533        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]*(64-frac_x)
   2534        umlal           v12.8h,  v18.8b,  v16.8b  // + top[base_x+1]*frac_x
   2535        umull2          v13.8h,  v4.16b,  v8.16b
   2536        umlal2          v13.8h,  v18.16b, v16.16b
   2537        umull           v14.8h,  v6.8b,   v9.8b
   2538        umlal           v14.8h,  v19.8b,  v17.8b
   2539        umull2          v20.8h,  v6.16b,  v9.16b
   2540        umlal2          v20.8h,  v19.16b, v17.16b
   2541
   2542        cmge            v18.16b, v21.16b, #0      // per-pixel: base_x >= 0, use top[]
   2543        cmge            v19.16b, v22.16b, #0
   2544
   2545        rshrn           v12.8b,  v12.8h,  #6
   2546        rshrn2          v12.16b, v13.8h,  #6
   2547        rshrn           v13.8b,  v14.8h,  #6
   2548        rshrn2          v13.16b, v20.8h,  #6
   2549
   2550        bit             v10.16b, v12.16b, v18.16b // select top-pred where base_x >= 0
   2551        bit             v11.16b, v13.16b, v19.16b
   2552
   2553        st1             {v10.16b}, [x0], #16
   2554        subs            w4,  w4,  #16
   2555        st1             {v11.16b}, [x13], #16
   2556        b.le            3f
   2557
   2558        movi            v10.16b, #16
   2559        mov             v4.16b,  v5.16b
   2560        mov             v6.16b,  v7.16b
   2561        add             v21.16b, v21.16b, v10.16b // base_x += 16
   2562        add             v22.16b, v22.16b, v10.16b
   2563        b               2b
   2564
        // End of a row pair: advance to the next pair of rows.
   2565 3:
   2566        subs            w5,  w5,  #2
   2567        b.le            9f
   2568        movi            v10.8h, #128
   2569        add             x0,  x0,  x1
   2570        add             x13, x13, x1
   2571        mov             w4,  w12                  // reset w
   2572        add             v26.8h,  v26.8h,  v10.8h  // ypos += 2*(1<<6)
   2573        b               1b
   2574
   2575 4:      // The rest of the row only predicted from top[]
   2576        ld1             {v5.16b}, [x9], #16       // top[base_x]
   2577        ld1             {v7.16b}, [x11], #16
   2578
   2579        ext             v18.16b, v4.16b,  v5.16b,  #1 // top[base_x+1]
   2580        ext             v19.16b, v6.16b,  v7.16b,  #1
   2581
   2582        umull           v12.8h,  v4.8b,   v8.8b   // top[base_x]*(64-frac_x)
   2583        umlal           v12.8h,  v18.8b,  v16.8b  // + top[base_x+1]*frac_x
   2584        umull2          v13.8h,  v4.16b,  v8.16b
   2585        umlal2          v13.8h,  v18.16b, v16.16b
   2586        umull           v14.8h,  v6.8b,   v9.8b
   2587        umlal           v14.8h,  v19.8b,  v17.8b
   2588        umull2          v20.8h,  v6.16b,  v9.16b
   2589        umlal2          v20.8h,  v19.16b, v17.16b
   2590
   2591        rshrn           v12.8b,  v12.8h,  #6
   2592        rshrn2          v12.16b, v13.8h,  #6
   2593        rshrn           v13.8b,  v14.8h,  #6
   2594        rshrn2          v13.16b, v20.8h,  #6
   2595
   2596        st1             {v12.16b}, [x0], #16
   2597        subs            w4,  w4,  #16
   2598        st1             {v13.16b}, [x13], #16
   2599        b.le            3b
   2600
   2601        mov             v4.16b,  v5.16b
   2602        mov             v6.16b,  v7.16b
   2603        b               4b
   2604
   2605 329:    // The rest of the block only predicted from left[]
   2606        add             x1,  x1,  w4,  uxtw       // restore stride
   2607        mov             w12, w5                   // orig remaining h
        // Outer loop over 16-pixel wide columns; inner loop (2:) over rows.
   2608 1:
   2609        add             v13.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
   2610        movi            v12.16b, #64
   2611        movi            v10.16b, #0x3e
   2612
   2613        xtn             v27.8b,  v23.8h           // (uint8_t)ypos
   2614        xtn2            v27.16b, v13.8h
   2615        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
   2616        shrn2           v29.16b, v13.8h,  #6
   2617        and             v27.16b, v27.16b, v10.16b // frac_y
   2618
   2619        mov             v18.16b, v15.16b          // left[0]
   2620        add             v23.8h,  v13.8h,  v25.8h  // ypos -= 8*dy
   2621        movi            v21.16b, #1
   2622
   2623        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
   2624        add             v29.16b, v29.16b, v21.16b // base_y + 1
   2625
   2626        sub             v28.16b, v12.16b, v27.16b // 64 - frac_y
   2627 2:
   2628        mov             v19.16b, v15.16b          // left[0]
   2629        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
   2630        add             v29.16b, v29.16b, v21.16b // base_y + 2
   2631        mov             v20.16b, v15.16b          // left[0]
   2632        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
   2633        add             v29.16b, v29.16b, v21.16b // next base_y
   2634
   2635        umull           v10.8h,  v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2636        umlal           v10.8h,  v19.8b,  v27.8b  // + left[base_y+1]*frac_y
   2637        umull2          v11.8h,  v18.16b, v28.16b
   2638        umlal2          v11.8h,  v19.16b, v27.16b
   2639        umull           v12.8h,  v19.8b,  v28.8b
   2640        umlal           v12.8h,  v20.8b,  v27.8b
   2641        umull2          v13.8h,  v19.16b, v28.16b
   2642        umlal2          v13.8h,  v20.16b, v27.16b
   2643
   2644        rshrn           v10.8b,  v10.8h,  #6
   2645        rshrn2          v10.16b, v11.8h,  #6
   2646        rshrn           v11.8b,  v12.8h,  #6
   2647        rshrn2          v11.16b, v13.8h,  #6
   2648
   2649        st1             {v10.16b}, [x0], x1
   2650        subs            w5,  w5,  #2
   2651        st1             {v11.16b}, [x13], x1
   2652        b.le            3f
   2653        mov             v18.16b, v20.16b
   2654        b               2b
   2655
        // Column of 16 done: step the pointers back up and right by 16.
   2656 3:
   2657        subs            w4,  w4,  #16
   2658        b.le            9f
   2659
   2660        lsr             x1,  x1,  #1
   2661        msub            x0,  x1,  x12, x0         // ptr -= h * stride
   2662        msub            x13, x1,  x12, x13
   2663        lsl             x1,  x1,  #1
   2664        add             x0,  x0,  #16
   2665        add             x13, x13, #16
   2666        mov             w5,  w12                  // reset h
   2667        b               1b
   2668
   2669 9:
   2670        ldp             d14, d15, [sp, #0x30]     // restore callee-saved d8-d15
   2671        ldp             d12, d13, [sp, #0x20]
   2672        ldp             d10, d11, [sp, #0x10]
   2673        ldp             d8,  d9,  [sp], 0x40
   2674        ret
   2675 endfunc
   2676 
        // Relative jump table for ipred_z2_fill1; entries ordered for
        // widths 64, 32, 16, 8, 4 (matching the 640/320/160/80/40 labels).
   2677 jumptable ipred_z2_fill1_tbl
   2678        .word 640b - ipred_z2_fill1_tbl
   2679        .word 320b - ipred_z2_fill1_tbl
   2680        .word 160b - ipred_z2_fill1_tbl
   2681        .word 80b  - ipred_z2_fill1_tbl
   2682        .word 40b  - ipred_z2_fill1_tbl
   2683 endjumptable
   2684 
        // fill2 variant: like fill1, but the top[] x-positions step by 2
        // (see the doubled {0,2,4,6} offsets and the uzp1/uzp2 deinterleave
        // below) — presumably for the upsampled-top case; confirm against
        // the C caller. Starts with xpos = 2 << 6 rather than 1 << 6.
   2685 function ipred_z2_fill2_8bpc_neon, export=1
   2686        cmp             w4,  #8
   2687        mov             w8,  #(2 << 6)            // xpos = 2 << 6
   2688        sub             w8,  w8,  w6              // xpos -= dx
   2689
   2690        movrel          x11, increments
   2691        ld1             {v31.8h},  [x11]          // increments
   2692        neg             w7,  w7                   // -dy
   2693        b.eq            80f
   2694
        // w=4 codepath.
   2695 40:
   2696        dup             v30.4h,  w7               // -dy
   2697        movi            v17.8b,  #1
   2698
   2699        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
   2700        movi            v25.16b, #0x3e
   2701        add             v30.4h,  v16.4h,  v30.4h  // -= dy
   2702
   2703        xtn             v31.8b,  v31.8h           // {0,1,2,3}
   2704
   2705        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
   2706        // from left.
   2707        ld1             {v0.16b}, [x3]            // left[]
   2708
   2709        movi            v26.16b, #64
   2710        movi            v19.16b, #2
   2711
   2712        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
   2713        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
   2714        and             v27.8b,  v27.8b,  v25.8b  // frac_y
   2715
   2716        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
   2717
   2718        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
   2719        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2
   2720
   2721        tbl             v16.8b, {v0.16b}, v29.8b  // left[base_y]
   2722
   2723        trn1            v30.2s,  v30.2s,  v28.2s  // base_y + 1, base_y + 2
   2724
   2725        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
   2726
   2727        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}
   2728
   2729        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
   2730        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y
   2731
   2732        movi            v29.8b,  #2
   2733        add             v31.8b,  v31.8b,  v31.8b  // {0,2,4,6,0,2,4,6}
   2734 4:
   2735        asr             w9,  w8,  #6              // base_x
   2736        dup             v6.4h,   w8               // xpos
   2737        sub             w8,  w8,  w6              // xpos -= dx
   2738        cmp             w9,  #-8                  // base_x <= -8
   2739        asr             w11, w8,  #6              // base_x
   2740        b.le            49f
   2741
   2742        dup             v7.4h,   w8               // xpos
   2743
   2744        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
   2745        ldr             d4,  [x2, w11, sxtw]
   2746
   2747        trn1            v6.2d,   v6.2d,   v7.2d   // xpos
   2748
   2749        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]
   2750
   2751        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
   2752        xtn             v6.8b,   v6.8h            // (uint8_t)xpos
   2753
   2754        // Deinterleave: even elements are top[base_x], odd are top[base_x+1].
   2755        uzp2            v3.8b,   v2.8b,   v4.8b   // top[base_x+1]
   2756        uzp1            v2.8b,   v2.8b,   v4.8b   // top[base_x]
   2757
   2757        and             v6.8b,   v6.8b,   v25.8b  // frac_x
   2758
   2759        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
   2760
   2761        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x
   2762
   2763        add             v20.8b,  v20.8b,  v31.8b  // actual base_x
   2764
   2765        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2766        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
   2767
   2768        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]*(64-frac_x)
   2769        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x
   2770
   2771        cmge            v20.8b,  v20.8b,  #0      // per-pixel: base_x >= 0, use top[]
   2772 
   2773        rshrn           v16.8b,  v16.8h,  #6
   2774        rshrn           v22.8b,  v22.8h,  #6
   2775 
   2776        bit             v16.8b,  v22.8b,  v20.8b
   2777 
   2778        st1             {v16.s}[0], [x0], x1
   2779        sub             w8,  w8,  w6              // xpos -= dx
   2780        subs            w5,  w5,  #2
   2781        st1             {v16.s}[1], [x0], x1
   2782        b.le            9f
   2783 
   2784        ext             v16.8b,  v17.8b,  v17.8b, #4
   2785        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
   2786        b               4b
   2787 
   2788 49:
   2789        tbl             v17.8b, {v0.16b}, v30.8b  // left[base_y+1], left[base_y+2]
   2790 
   2791        trn1            v16.2s,  v16.2s,  v17.2s  // left[base_y], left[base_y+1]
   2792 
   2793        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_t)
   2794        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
   2795        rshrn           v18.8b,  v18.8h,  #6
   2796 
   2797        st1             {v18.s}[0], [x0], x1
   2798        subs            w5,  w5,  #2
   2799        st1             {v18.s}[1], [x0], x1
   2800        b.le            9f
   2801 
   2802        ext             v16.8b,  v17.8b,  v17.8b, #4
   2803        add             v30.8b,  v30.8b,  v29.8b  // base_y += 2
   2804        b               49b
   2805 
   2806 9:
   2807        ret
   2808 
   2809 80:
   2810        dup             v30.8h,  w7               // -dy
   2811        movi            v17.8b,  #1
   2812 
   2813        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
   2814        movi            v25.16b, #0x3e
   2815        add             v30.8h,  v16.8h,  v30.8h  // -= dy
   2816 
   2817        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}
   2818 
   2819        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
   2820        // from left.
   2821        ld1             {v0.16b}, [x3]    // left[]
   2822 
   2823        movi            v26.16b, #64
   2824        movi            v19.16b, #2
   2825 
   2826        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
   2827        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
   2828        and             v27.8b,  v27.8b,  v25.8b  // frac_y
   2829 
   2830        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
   2831 
   2832        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
   2833 
   2834        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2
   2835        add             v29.8b,  v29.8b,  v17.8b  // base_y + 1
   2836 
   2837        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y
   2838 
   2839        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
   2840 
   2841        movi            v24.8b,  #2               // 2
   2842        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
   2843 8:
   2844        asr             w9,  w8,  #6              // base_x
   2845        dup             v16.8h,   w8              // xpos
   2846        sub             w8,  w8,  w6              // xpos -= dx
   2847        cmp             w9,  #-16                 // base_x <= -16
   2848        asr             w11, w8,  #6              // base_x
   2849        b.le            89f
   2850 
   2851        dup             v17.8h,   w8              // xpos
   2852 
   2853        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
   2854        ldr             q6,  [x2, w11, sxtw]
   2855 
   2856        tbl             v19.8b, {v0.16b}, v29.8b  // left[base_y+1]
   2857 
   2858        shrn            v21.8b,  v16.8h,  #6      // first base_x
   2859        shrn2           v21.16b, v17.8h,  #6
   2860        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
   2861        xtn2            v16.16b, v17.8h
   2862 
   2863        tbl             v20.8b, {v0.16b}, v30.8b  // left[base_y+2]
   2864 
   2865        uzp2            v5.16b,  v4.16b,  v6.16b  // top[base_x+1]
   2866        uzp1            v4.16b,  v4.16b,  v6.16b  // top[base_x]
   2867 
   2868        and             v16.16b, v16.16b, v25.16b // frac_x
   2869 
   2870        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x
   2871 
   2872        add             v21.16b, v21.16b, v31.16b // actual base_x
   2873 
   2874        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2875        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
   2876        umull           v17.8h,  v19.8b,  v28.8b
   2877        umlal           v17.8h,  v20.8b,  v27.8b
   2878 
   2879        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]-*(64-frac_x)
   2880        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
   2881        umull2          v23.8h,  v4.16b,  v7.16b
   2882        umlal2          v23.8h,  v5.16b,  v16.16b
   2883 
   2884        cmge            v21.16b, v21.16b, #0
   2885 
   2886        rshrn           v6.8b,   v6.8h,   #6
   2887        rshrn2          v6.16b,  v17.8h,  #6
   2888        rshrn           v22.8b,  v22.8h,  #6
   2889        rshrn2          v22.16b, v23.8h,  #6
   2890 
   2891        bit             v6.16b,  v22.16b, v21.16b
   2892 
   2893        st1             {v6.d}[0], [x0], x1
   2894        sub             w8,  w8,  w6              // xpos -= dx
   2895        subs            w5,  w5,  #2
   2896        st1             {v6.d}[1], [x0], x1
   2897        b.le            9f
   2898 
   2899        mov             v18.8b,  v20.8b
   2900        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
   2901        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
   2902        b               8b
   2903 
   2904 89:
   2905        tbl             v19.8b, {v0.16b}, v29.8b  // left[base_y+1]
   2906        tbl             v20.8b, {v0.16b}, v30.8b  // left[base_y+2]
   2907 
   2908        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
   2909        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
   2910        umull           v17.8h,  v19.8b,  v28.8b
   2911        umlal           v17.8h,  v20.8b,  v27.8b
   2912 
   2913        rshrn           v6.8b,   v6.8h,   #6
   2914        rshrn2          v6.16b,  v17.8h,  #6
   2915 
   2916        st1             {v6.d}[0], [x0], x1
   2917        subs            w5,  w5,  #2
   2918        st1             {v6.d}[1], [x0], x1
   2919        b.le            9f
   2920 
   2921        mov             v18.8b,  v20.8b
   2922        add             v29.8b,  v29.8b,  v24.8b  // base_y += 2
   2923        add             v30.8b,  v30.8b,  v24.8b  // base_y += 2
   2924        b               89b
   2925 
   2926 9:
   2927        ret
   2928 endfunc
   2929 
// Edge case of the z2 directional prediction with an upsampled left edge,
// for widths 4 and 8:
//   x0 = dst, x1 = stride, x2 = top[], x3 = left[],
//   w4 = width (4 or 8), w5 = height, w6 = dx, w7 = dy.
// Because the left edge is upsampled 2x, vertical positions advance by two
// elements of left[] per output row; base_y therefore starts at
// (ypos >> 6) + 2 and steps by 4 for each pair of output rows.
function ipred_z2_fill3_8bpc_neon, export=1
        cmp             w4,  #8
        mov             w8,  #(1 << 6)            // xpos = 1 << 6
        sub             w8,  w8,  w6              // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h},  [x11]          // increments
        neg             w7,  w7                   // -dy
        b.eq            80f                       // w == 8? use the 8-wide loop

40:     // Width 4.
        dup             v30.4h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
        movi            v25.16b, #0x3e            // mask for the 6 fractional bits
        add             v30.4h,  v16.4h,  v30.4h  // -= dy

        xtn             v31.8b,  v31.8h           // {0,1,2,3}

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.16b, v1.16b}, [x3]    // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v27.8b,  v25.8b  // frac_y

        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2

        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1
        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2

        trn1            v31.2s,  v31.2s,  v31.2s  // {0,1,2,3,0,1,2,3}

        add             v24.8b,  v30.8b,  v19.8b  // base_y + 3

        trn1            v29.2s,  v29.2s,  v28.2s  // base_y + 0, base_y + 2
        trn1            v30.2s,  v30.2s,  v24.2s  // base_y + 1, base_y + 3

        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y

        trn1            v27.2s,  v27.2s,  v27.2s  // frac_y
        trn1            v28.2s,  v28.2s,  v28.2s  // 64 - frac_y

        movi            v24.8b,  #4               // base_y increment per two rows (2x upsampled)
4:      // Main loop; two rows per iteration, blending top and left samples.
        asr             w9,  w8,  #6              // base_x
        dup             v6.4h,   w8               // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-4                  // base_x <= -4
        asr             w11, w8,  #6              // base_x
        b.le            49f                       // fully left of top[]: left-only path

        dup             v7.4h,   w8               // xpos

        ldr             d2,  [x2, w9, sxtw]       // top[base_x]
        ldr             d4,  [x2, w11, sxtw]

        trn1            v6.2d,   v6.2d,   v7.2d   // xpos

        tbl             v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]

        shrn            v20.8b,  v6.8h,   #6      // first base_x for each row
        xtn             v6.8b,   v6.8h            // (uint8_t)xpos

        ext             v3.8b,   v2.8b,   v2.8b,   #1 // top[base_x+1]
        ext             v5.8b,   v4.8b,   v4.8b,   #1

        and             v6.8b,   v6.8b,   v25.8b  // frac_x

        trn1            v2.2s,   v2.2s,   v4.2s   // top[base_x]
        trn1            v3.2s,   v3.2s,   v5.2s   // top[base_x+1]

        sub             v7.8b,   v26.8b,  v6.8b   // 64 - frac_x

        add             v20.8b,  v20.8b,  v31.8b  // actual base_x

        umull           v16.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v16.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y

        umull           v22.8h,  v2.8b,   v7.8b   // top[base_x]*(64-frac_x)
        umlal           v22.8h,  v3.8b,   v6.8b   // + top[base_x+1]*frac_x

        cmge            v20.8b,  v20.8b,  #0      // actual base_x >= 0?

        rshrn           v16.8b,  v16.8h,  #6
        rshrn           v22.8b,  v22.8h,  #6

        bit             v16.8b,  v22.8b,  v20.8b  // take the top result where base_x >= 0

        st1             {v16.s}[0], [x0], x1
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v16.s}[1], [x0], x1
        b.le            9f

        add             v29.8b,  v29.8b,  v24.8b  // base_y += 4
        add             v30.8b,  v30.8b,  v24.8b  // base_y += 4
        b               4b

49:     // Tail: base_x is out of range for all remaining rows; from here on
        // the output depends only on the left edge.
        tbl             v16.8b, {v0.16b, v1.16b}, v29.8b // left[base_y+0], left[base_y+2]
        tbl             v17.8b, {v0.16b, v1.16b}, v30.8b // left[base_y+1], left[base_y+3]

        umull           v18.8h,  v16.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v18.8h,  v17.8b,  v27.8b  // + left[base_y+1]*frac_y
        rshrn           v18.8b,  v18.8h,  #6

        st1             {v18.s}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v18.s}[1], [x0], x1
        b.le            9f

        add             v29.8b,  v29.8b,  v24.8b  // base_y += 4
        add             v30.8b,  v30.8b,  v24.8b  // base_y += 4
        b               49b

9:
        ret

80:     // Width 8.
        dup             v30.8h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.8h,  v31.8h,  v30.8h  // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.16b, #0x3e            // mask for the 6 fractional bits
        add             v30.8h,  v16.8h,  v30.8h  // -= dy

        xtn             v31.8b,  v31.8h           // {0,1,2,3,4,5,6,7}

        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
        ld1             {v0.16b, v1.16b, v2.16b}, [x3]    // left[]

        movi            v26.16b, #64
        movi            v19.16b, #2

        xtn             v27.8b,  v30.8h           // (uint8_t)ypos
        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v27.8b,  v25.8b  // frac_y

        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2

        add             v28.8b,  v29.8b,  v17.8b  // base_y + 1
        add             v30.8b,  v29.8b,  v19.8b  // base_y + 2

        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7}
        add             v24.8b,  v28.8b,  v19.8b  // base_y + 3

        trn1            v29.2d,  v29.2d,  v30.2d  // base_y + 0, base_y + 2
        trn1            v30.2d,  v28.2d,  v24.2d  // base_y + 1, base_y + 3

        sub             v28.8b,  v26.8b,  v27.8b  // 64 - frac_y

        movi            v24.16b, #4               // base_y increment per two rows (2x upsampled)

        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
8:      // Main loop; two rows per iteration, blending top and left samples.
        asr             w9,  w8,  #6              // base_x
        dup             v16.8h,   w8              // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-8                  // base_x <= -8
        asr             w11, w8,  #6              // base_x
        b.le            89f                       // fully left of top[]: left-only path

        dup             v17.8h,   w8              // xpos

        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
        ldr             q6,  [x2, w11, sxtw]

        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        shrn            v21.8b,  v16.8h,  #6      // first base_x
        shrn2           v21.16b, v17.8h,  #6
        xtn             v16.8b,  v16.8h           // (uint8_t)xpos
        xtn2            v16.16b, v17.8h

        ext             v5.16b,  v4.16b,  v4.16b,  #1 // top[base_x+1]
        ext             v7.16b,  v6.16b,  v6.16b,  #1

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]

        sub             v7.16b,  v26.16b, v16.16b // 64 - frac_x

        add             v21.16b, v21.16b, v31.16b // actual base_x

        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
        umull2          v17.8h,  v18.16b, v28.16b
        umlal2          v17.8h,  v19.16b, v27.16b

        umull           v22.8h,  v4.8b,   v7.8b   // top[base_x]*(64-frac_x)
        umlal           v22.8h,  v5.8b,   v16.8b  // + top[base_x+1]*frac_x
        umull2          v23.8h,  v4.16b,  v7.16b
        umlal2          v23.8h,  v5.16b,  v16.16b

        cmge            v21.16b, v21.16b, #0      // actual base_x >= 0?

        rshrn           v6.8b,   v6.8h,   #6
        rshrn2          v6.16b,  v17.8h,  #6
        rshrn           v22.8b,  v22.8h,  #6
        rshrn2          v22.16b, v23.8h,  #6

        bit             v6.16b,  v22.16b, v21.16b // take the top result where base_x >= 0

        st1             {v6.d}[0], [x0], x1
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4
        add             v30.16b, v30.16b, v24.16b // base_y += 4
        b               8b

89:     // Tail: base_x is out of range for all remaining rows; from here on
        // the output depends only on the left edge.
        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]

        umull           v6.8h,   v18.8b,  v28.8b  // left[base_y]*(64-frac_y)
        umlal           v6.8h,   v19.8b,  v27.8b  // + left[base_y+1]*frac_y
        umull2          v17.8h,  v18.16b, v28.16b
        umlal2          v17.8h,  v19.16b, v27.16b

        rshrn           v6.8b,   v6.8h,   #6
        rshrn2          v6.16b,  v17.8h,  #6

        st1             {v6.d}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v6.d}[1], [x0], x1
        b.le            9f

        add             v29.16b, v29.16b, v24.16b // base_y += 4
        add             v30.16b, v30.16b, v24.16b // base_y += 4
        b               89b

9:
        ret
endfunc
   3177 
   3178 
   3179 // void ipred_z3_fill1_8bpc_neon(pixel *dst, const ptrdiff_t stride,
   3180 //                               const pixel *const left,
   3181 //                               const int width, const int height,
   3182 //                               const int dy, const int max_base_y);
   3183 function ipred_z3_fill1_8bpc_neon, export=1
   3184        cmp             w6,  #64
   3185        clz             w9,  w3
   3186        movrel          x8,  ipred_z3_fill1_tbl
   3187        sub             w9,  w9,  #25
   3188        ldrsw           x9,  [x8, w9, uxtw #2]
   3189        add             x10, x2,  w6,  uxtw       // left[max_base_y]
   3190        add             x8,  x8,  x9
   3191        movrel          x11, increments
   3192        ld1r            {v31.16b}, [x10]          // padding
   3193        ld1             {v30.8h},  [x11]          // increments
   3194        mov             w7,  w5
   3195        b.gt            L(ipred_z3_fill1_large_h16)
   3196        br              x8
   3197 
   3198 40:
   3199        AARCH64_VALID_JUMP_TARGET
   3200        dup             v29.4h,  w5               // dy
   3201 
   3202        mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3,4,5,6,7}*dy
   3203        movi            v23.16b, #0x3e
   3204 
   3205        // Worst case max_base_y is width+height-1, for w=4, h=16, <= 32
   3206        ld1             {v0.16b, v1.16b}, [x2] // left[]
   3207        add             v30.4h,  v29.4h,  v30.4h  // ypos
   3208 
   3209        movi            v22.16b, #64
   3210        movi            v20.16b, #1
   3211        movi            v21.16b, #2
   3212 
   3213        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
   3214        uqshrn          v26.8b,  v30.8h,  #6      // base
   3215        and             v24.8b,  v24.8b,  v23.8b  // frac
   3216 
   3217        mov             v4.8b,   v31.8b
   3218        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
   3219        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
   3220        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
   3221 
   3222        tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base]
   3223 
   3224        trn1            v27.2s,  v27.2s,  v28.2s  // base + 1, base + 2
   3225        trn1            v24.2s,  v24.2s,  v24.2s  // frac
   3226        trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac
   3227 1:
   3228        mov             v5.8b,   v31.8b
   3229        tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+2]
   3230 
   3231        trn1            v4.2s,   v4.2s,   v5.2s   // left[base], left[base+1]
   3232 
   3233        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
   3234        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
   3235        rshrn           v16.8b,  v16.8h,  #6
   3236        st1             {v16.s}[0], [x0], x1
   3237        subs            w4,  w4,  #2
   3238        st1             {v16.s}[1], [x0], x1
   3239        b.le            9f
   3240 
   3241        ext             v4.8b,   v5.8b,   v5.8b,  #4
   3242        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 2
   3243        b               1b
   3244 
   3245 9:
   3246        ret
   3247 
   3248 80:
   3249        AARCH64_VALID_JUMP_TARGET
   3250        dup             v29.8h,  w5               // dy
   3251 
   3252        mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy
   3253        movi            v23.16b, #0x3e
   3254 
   3255        // Worst case max_base_y is width+height-1, for w=8, h=32, <= 48
   3256        ld1             {v0.16b, v1.16b, v2.16b}, [x2] // left[]
   3257        add             v30.8h,  v29.8h,  v30.8h  // ypos
   3258 
   3259        movi            v22.16b, #64
   3260        movi            v20.16b, #1
   3261        movi            v21.16b, #2
   3262 
   3263        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
   3264        uqshrn          v26.8b,  v30.8h,  #6      // base
   3265        and             v24.8b,  v24.8b,  v23.8b  // frac
   3266 
   3267        mov             v4.8b,   v31.8b
   3268        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
   3269        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
   3270        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
   3271 
   3272        tbx             v4.8b, {v0.16b, v1.16b, v2.16b}, v26.8b // left[base]
   3273 1:
   3274        mov             v5.8b,   v31.8b
   3275        mov             v6.8b,   v31.8b
   3276        tbx             v5.8b, {v0.16b, v1.16b, v2.16b}, v27.8b // left[base+1]
   3277        tbx             v6.8b, {v0.16b, v1.16b, v2.16b}, v28.8b // left[base+2]
   3278 
   3279        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
   3280        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
   3281        umull           v17.8h,  v5.8b,   v25.8b
   3282        umlal           v17.8h,  v6.8b,   v24.8b
   3283        rshrn           v16.8b,  v16.8h,  #6
   3284        rshrn           v17.8b,  v17.8h,  #6
   3285        st1             {v16.8b}, [x0], x1
   3286        subs            w4,  w4,  #2
   3287        st1             {v17.8b}, [x0], x1
   3288        b.le            9f
   3289 
   3290        mov             v4.8b,   v6.8b
   3291        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 2
   3292        uqadd           v28.8b,  v28.8b,  v21.8b  // base += 2
   3293        b               1b
   3294 
   3295 9:
   3296        ret
   3297 
   3298 160:
   3299        AARCH64_VALID_JUMP_TARGET
   3300        dup             v28.8h,  w5               // dy
   3301 
   3302        shl             v29.8h,  v28.8h,  #3      // 8*dy
   3303        mul             v30.8h,  v30.8h,  v28.8h  // {0,1,2,3,4,5,6,7}*dy
   3304        movi            v23.16b, #0x3e
   3305 
   3306        // This is only executed if we've checked that max_base_y <= 64.
   3307        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
   3308        add             v28.8h,  v28.8h,  v30.8h  // ypos
   3309 
   3310        movi            v22.16b, #64
   3311        movi            v20.16b, #1
   3312        movi            v21.16b, #2
   3313 
   3314        add             v29.8h,  v28.8h,  v29.8h  // ypos + 8*dy
   3315 
   3316        xtn             v24.8b,  v28.8h           // (uint8_t)ypos
   3317        xtn2            v24.16b, v29.8h
   3318        uqshrn          v26.8b,  v28.8h,  #6      // base
   3319        uqshrn2         v26.16b, v29.8h,  #6
   3320        and             v24.16b, v24.16b, v23.16b // frac
   3321 
   3322        mov             v4.16b,  v31.16b
   3323        uqadd           v27.16b, v26.16b, v20.16b // base + 1
   3324        uqadd           v28.16b, v26.16b, v21.16b // base + 2
   3325        sub             v25.16b, v22.16b, v24.16b // 64 - frac
   3326 
   3327        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base]
   3328 1:
   3329        mov             v5.16b,  v31.16b
   3330        mov             v6.16b,  v31.16b
   3331        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1]
   3332        tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v28.16b // left[base+2]
   3333 
   3334        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
   3335        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
   3336        umull2          v17.8h,  v4.16b,  v25.16b
   3337        umlal2          v17.8h,  v5.16b,  v24.16b
   3338        umull           v18.8h,  v5.8b,   v25.8b
   3339        umlal           v18.8h,  v6.8b,   v24.8b
   3340        umull2          v19.8h,  v5.16b,  v25.16b
   3341        umlal2          v19.8h,  v6.16b,  v24.16b
   3342        rshrn           v16.8b,  v16.8h,  #6
   3343        rshrn2          v16.16b, v17.8h,  #6
   3344        rshrn           v17.8b,  v18.8h,  #6
   3345        rshrn2          v17.16b, v19.8h,  #6
   3346        st1             {v16.16b}, [x0], x1
   3347        subs            w4,  w4,  #2
   3348        st1             {v17.16b}, [x0], x1
   3349        b.le            9f
   3350 
   3351        mov             v4.16b,  v6.16b
   3352        uqadd           v27.16b, v27.16b, v21.16b // base += 2
   3353        uqadd           v28.16b, v28.16b, v21.16b // base += 2
   3354        b               1b
   3355 
   3356 9:
   3357        ret
   3358 320:
   3359 640:
   3360        AARCH64_VALID_JUMP_TARGET
   3361        dup             v28.8h,  w5               // dy
   3362        mov             w12, w3
   3363 
   3364        add             x13, x0,  x1
   3365 
   3366        shl             v29.8h,  v28.8h,  #3      // 8*dy
   3367        mul             v30.8h,  v30.8h,  v28.8h  // {0,1,2,3,4,5,6,7}*dy
   3368        movi            v23.16b, #0x3e
   3369 
   3370        lsl             x1,  x1,  #1
   3371        sub             x1,  x1,  w3,  uxtw
   3372        add             v30.8h,  v28.8h,  v30.8h  // ypos
   3373 
   3374        // This is only executed if we've checked that max_base_y <= 64.
   3375        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2] // left[]
   3376 
   3377        movi            v22.16b, #64
   3378        movi            v20.16b, #1
   3379        movi            v21.16b, #2
   3380 
   3381 1:
   3382        mov             v26.16b,  v30.16b         // reset ypos
   3383 
   3384 2:
   3385        add             v27.8h,  v26.8h,  v29.8h  // ypos + 8*dy
   3386        uqshrn          v16.8b,  v26.8h,  #6      // base
   3387        uqshrn2         v16.16b, v27.8h,  #6
   3388        xtn             v24.8b,  v26.8h           // (uint8_t)ypos
   3389        xtn2            v24.16b, v27.8h
   3390        umov            w14,     v16.b[0]
   3391        and             v24.16b, v24.16b, v23.16b // frac
   3392 
   3393        uqadd           v17.16b, v16.16b, v20.16b // base + 1
   3394        cmp             w14, w6                   // base >= max_base_y
   3395        uqadd           v18.16b, v16.16b, v21.16b // base + 2
   3396        sub             v25.16b, v22.16b, v24.16b // 64 - frac
   3397 
   3398        b.ge            4f
   3399 
   3400        mov             v4.16b,  v31.16b
   3401        mov             v5.16b,  v31.16b
   3402        mov             v6.16b,  v31.16b
   3403        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v16.16b // left[base]
   3404        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v17.16b // left[base+1]
   3405        tbx             v6.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v18.16b // left[base+2]
   3406 
   3407        subs            w3,  w3,  #16
   3408        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
   3409        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
   3410        umull2          v17.8h,  v4.16b,  v25.16b
   3411        umlal2          v17.8h,  v5.16b,  v24.16b
   3412        umull           v18.8h,  v5.8b,   v25.8b
   3413        umlal           v18.8h,  v6.8b,   v24.8b
   3414        umull2          v19.8h,  v5.16b,  v25.16b
   3415        umlal2          v19.8h,  v6.16b,  v24.16b
   3416        rshrn           v16.8b,  v16.8h,  #6
   3417        rshrn2          v16.16b, v17.8h,  #6
   3418        rshrn           v17.8b,  v18.8h,  #6
   3419        rshrn2          v17.16b, v19.8h,  #6
   3420        st1             {v16.16b}, [x0],  #16
   3421        st1             {v17.16b}, [x13], #16
   3422        b.le            3f
   3423        add             v26.8h,  v27.8h,  v29.8h  // ypos += 16*dy
   3424        b               2b
   3425 
   3426 3:
   3427        subs            w4,  w4,  #2
   3428        b.le            9f
   3429        movi            v16.8h,  #128
   3430        add             x0,  x0,  x1
   3431        add             x13, x13, x1
   3432        add             v30.8h,  v30.8h,  v16.8h  // ypos = dy + y*(1<<6)*2
   3433        mov             w3,  w12
   3434        b               1b
   3435 
   3436 4:
   3437        subs            w3,  w3,  #16
   3438        st1             {v31.16b}, [x0],  #16
   3439        st1             {v31.16b}, [x13], #16
   3440        b.gt            4b
   3441        b               3b
   3442 
   3443 9:
   3444        ret
   3445 
   3446 L(ipred_z3_fill1_large_h16):
   3447        // Fallback case for max_base_y > 64; similar to the z1
   3448        // implementation. This does the filtering vertically, filling out
   3449        // a 2x pixel column at a time.
   3450        mov             w15, #64
   3451        add             x13, x0,  x1
   3452        lsl             x1,  x1,  #1
   3453 
   3454        mov             w12, w4
   3455 1:
   3456        lsr             w8,  w7,  #6              // base
   3457        and             w9,  w7,  #0x3e           // frac
   3458        add             w7,  w7,  w5              // ypos += dy
   3459        cmp             w8,  w6                   // base >= max_base_y
   3460        lsr             w10, w7,  #6              // base
   3461        and             w11, w7,  #0x3e           // frac
   3462        b.ge            ipred_z3_fill_padding_neon
   3463        add             x8,  x2,  w8,  uxtw
   3464        add             x10, x2,  w10, uxtw
   3465        dup             v4.16b,  w9               // frac
   3466        dup             v5.16b,  w11
   3467        ld1             {v0.16b, v1.16b}, [x8],  #32 // left[base]
   3468        ld1             {v2.16b, v3.16b}, [x10], #32
   3469        sub             w9,  w15, w9              // 64 - frac
   3470        sub             w11, w15, w11
   3471        dup             v6.16b,  w9               // 64 - frac
   3472        dup             v7.16b,  w11
   3473        add             w7,  w7,  w5              // ypos += dy
   3474 2:
   3475        ext             v16.16b, v0.16b,  v1.16b,  #1 // left[base+1]
   3476        ext             v17.16b, v2.16b,  v3.16b,  #1
   3477        subs            w4,  w4,  #16
   3478        umull           v18.8h,  v16.8b,  v4.8b   // left[base+1]*frac
   3479        umlal           v18.8h,  v0.8b,   v6.8b   // + left[base]*(64-frac)
   3480        umull2          v19.8h,  v16.16b, v4.16b
   3481        umlal2          v19.8h,  v0.16b,  v6.16b
   3482        umull           v20.8h,  v17.8b,  v5.8b
   3483        umlal           v20.8h,  v2.8b,   v7.8b
   3484        umull2          v21.8h,  v17.16b, v5.16b
   3485        umlal2          v21.8h,  v2.16b,  v7.16b
   3486        rshrn           v16.8b,  v18.8h,  #6
   3487        rshrn2          v16.16b, v19.8h,  #6
   3488        rshrn           v17.8b,  v20.8h,  #6
   3489        rshrn2          v17.16b, v21.8h,  #6
   3490        zip1            v18.16b, v16.16b, v17.16b
   3491        zip2            v19.16b, v16.16b, v17.16b
   3492        st1             {v18.h}[0], [x0],  x1
   3493        st1             {v18.h}[1], [x13], x1
   3494        st1             {v18.h}[2], [x0],  x1
   3495        st1             {v18.h}[3], [x13], x1
   3496        st1             {v18.h}[4], [x0],  x1
   3497        st1             {v18.h}[5], [x13], x1
   3498        st1             {v18.h}[6], [x0],  x1
   3499        st1             {v18.h}[7], [x13], x1
   3500        st1             {v19.h}[0], [x0],  x1
   3501        st1             {v19.h}[1], [x13], x1
   3502        st1             {v19.h}[2], [x0],  x1
   3503        st1             {v19.h}[3], [x13], x1
   3504        st1             {v19.h}[4], [x0],  x1
   3505        st1             {v19.h}[5], [x13], x1
   3506        st1             {v19.h}[6], [x0],  x1
   3507        st1             {v19.h}[7], [x13], x1
   3508        b.le            3f
   3509        mov             v0.16b,  v1.16b
   3510        ld1             {v1.16b}, [x8],  #16      // left[base]
   3511        mov             v2.16b,  v3.16b
   3512        ld1             {v3.16b}, [x10], #16
   3513        b               2b
   3514 
   3515 3:
   3516        subs            w3,  w3,  #2
   3517        b.le            9f
   3518        lsr             x1,  x1,  #1
   3519        msub            x0,  x1,  x12, x0         // ptr -= h * stride
   3520        msub            x13, x1,  x12, x13
   3521        lsl             x1,  x1,  #1
   3522        add             x0,  x0,  #2
   3523        add             x13, x13, #2
   3524        mov             w4,  w12
   3525        b               1b
   3526 9:
   3527        ret
   3528 endfunc
   3529 
// Jump table for the width-specialized ipred_z3_fill1 paths.
// Entries are offsets relative to the table base (resolved with
// ldrsw + add + br), ordered widest first so they can be indexed
// by clz(width) minus a constant.
jumptable ipred_z3_fill1_tbl
        .word 640b - ipred_z3_fill1_tbl  // w == 64
        .word 320b - ipred_z3_fill1_tbl  // w == 32
        .word 160b - ipred_z3_fill1_tbl  // w == 16
        .word 80b  - ipred_z3_fill1_tbl  // w == 8
        .word 40b  - ipred_z3_fill1_tbl  // w == 4
endjumptable
   3537 
// Fill the remaining w3 x w4 rectangle with the padding pixel that the
// caller has broadcast into all lanes of v31.
// NOTE(review): register roles inferred from the z3 fill paths that
// branch here — x0/x13 = dst pointers for two interleaved rows,
// x1 = 2*stride, w3 = remaining width, w4 = height; confirm against
// all branch sites.
function ipred_z3_fill_padding_neon, export=0
        cmp             w3,  #16
        movrel          x8,  ipred_z3_fill_padding_tbl
        b.gt            ipred_z3_fill_padding_wide
        // w3 = remaining width, w4 = constant height
        mov             w12, w4

1:
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating.
        clz             w9,  w3
        sub             w9,  w9,  #25             // index = clz(w) - clz(64) + 1
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x9,  x8,  x9
        br              x9

20:     // power-of-two chunk width 2
        AARCH64_VALID_JUMP_TARGET
2:
        st1             {v31.h}[0], [x0],  x1
        subs            w4,  w4,  #4              // 4 rows per iteration (2 per pointer)
        st1             {v31.h}[0], [x13], x1
        st1             {v31.h}[0], [x0],  x1
        st1             {v31.h}[0], [x13], x1
        b.gt            2b
        subs            w3,  w3,  #2
        lsr             x1,  x1,  #1              // x1 = single-row stride for the msub
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1              // restore x1 = 2*stride
        add             x0,  x0,  #2              // advance to the next column chunk
        add             x13, x13, #2
        mov             w4,  w12                  // reload the constant height
        b               1b

40:     // power-of-two chunk width 4
        AARCH64_VALID_JUMP_TARGET
4:
        st1             {v31.s}[0], [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
        st1             {v31.s}[0], [x0],  x1
        st1             {v31.s}[0], [x13], x1
        b.gt            4b
        subs            w3,  w3,  #4
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #4
        add             x13, x13, #4
        mov             w4,  w12
        b               1b

80:     // power-of-two chunk width 8
        AARCH64_VALID_JUMP_TARGET
8:
        st1             {v31.8b}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.8b}, [x13], x1
        st1             {v31.8b}, [x0],  x1
        st1             {v31.8b}, [x13], x1
        b.gt            8b
        subs            w3,  w3,  #8
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #8
        add             x13, x13, #8
        mov             w4,  w12
        b               1b

160:    // chunk width 16 (w == 16/32/64 all land here; w > 16 went to _wide)
320:
640:
        AARCH64_VALID_JUMP_TARGET
16:
        st1             {v31.16b}, [x0],  x1
        subs            w4,  w4,  #4
        st1             {v31.16b}, [x13], x1
        st1             {v31.16b}, [x0],  x1
        st1             {v31.16b}, [x13], x1
        b.gt            16b
        subs            w3,  w3,  #16
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #16
        add             x13, x13, #16
        mov             w4,  w12
        b               1b

9:
        ret
endfunc
   3640 
// Jump table for the padding fill chunk sizes; same clz-based indexing
// scheme as ipred_z3_fill1_tbl, with one extra entry for width 2.
jumptable ipred_z3_fill_padding_tbl
        .word 640b - ipred_z3_fill_padding_tbl  // chunk 64 (handled as 16)
        .word 320b - ipred_z3_fill_padding_tbl  // chunk 32 (handled as 16)
        .word 160b - ipred_z3_fill_padding_tbl  // chunk 16
        .word 80b  - ipred_z3_fill_padding_tbl  // chunk 8
        .word 40b  - ipred_z3_fill_padding_tbl  // chunk 4
        .word 20b  - ipred_z3_fill_padding_tbl  // chunk 2
endjumptable
   3649 
// Row-by-row padding fill for wide rectangles (W > 16), using only x0.
// On entry x1 = 2*stride (as set up by the callers of
// ipred_z3_fill_padding_neon); it is halved here and converted into a
// (stride - width) end-of-row increment.
function ipred_z3_fill_padding_wide
        // Fill a WxH rectangle with padding, with W > 16.
        lsr             x1,  x1,  #1              // back to a single-row stride
        mov             w12, w3                   // save constant width
        sub             x1,  x1,  w3,  uxtw       // x1 = stride - width
1:
        ands            w5,  w5,  #15
        b.eq            2f
        // If the width isn't aligned to 16, first do one 16 byte write
        // and align the start pointer.
        sub             w3,  w3,  w5
        st1             {v31.16b}, [x0]           // overlapping head store
        add             x0,  x0,  w5,  uxtw
2:
        // Fill the rest of the line with aligned 16 byte writes.
        subs            w3,  w3,  #16
        st1             {v31.16b}, [x0], #16
        b.gt            2b
        subs            w4,  w4,  #1
        add             x0,  x0,  x1              // step to the start of the next row
        b.le            9f
        mov             w3,  w12                  // reload width for the next row
        b               1b
9:
        ret
endfunc
   3676 
   3677 function ipred_z3_fill2_8bpc_neon, export=1
   3678        cmp             w3,  #8
   3679        add             x10, x2,  w6,  uxtw       // left[max_base_y]
   3680        movrel          x11, increments
   3681        ld1r            {v31.16b}, [x10]          // padding
   3682        ld1             {v30.8h},  [x11]          // increments
   3683        b.eq            80f
   3684 
   3685 40:     // w == 4
   3686        dup             v29.4h,  w5               // dy
   3687 
   3688        mul             v30.4h,  v30.4h,  v29.4h  // {0,1,2,3,4,5,6,7}*dy
   3689        movi            v23.16b, #0x3e
   3690 
   3691        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
   3692        // so max_base_y <= 32.
   3693        ld1             {v0.16b, v1.16b}, [x2] // left[]
   3694        add             v30.4h,  v29.4h,  v30.4h  // ypos
   3695 
   3696        movi            v22.16b, #64
   3697        movi            v20.16b, #1
   3698        movi            v21.16b, #2
   3699 
   3700        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
   3701        uqshrn          v26.8b,  v30.8h,  #6      // base
   3702        and             v24.8b,  v24.8b,  v23.8b  // frac
   3703 
   3704        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
   3705        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
   3706        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
   3707        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3
   3708 
   3709        trn1            v24.2s,  v24.2s,  v24.2s  // frac
   3710        trn1            v26.2s,  v26.2s,  v28.2s  // base + 0, base + 2
   3711        trn1            v27.2s,  v27.2s,  v29.2s  // base + 1, base + 3
   3712        trn1            v25.2s,  v25.2s,  v25.2s  // 64 - frac
   3713 
   3714        movi            v21.16b, #4
   3715 1:
   3716        mov             v4.8b,   v31.8b
   3717        mov             v5.8b,   v31.8b
   3718        tbx             v4.8b, {v0.16b, v1.16b}, v26.8b // left[base], left[base+2]
   3719        tbx             v5.8b, {v0.16b, v1.16b}, v27.8b // left[base+1], left[base+3]
   3720 
   3721        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
   3722        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
   3723        rshrn           v16.8b,  v16.8h,  #6
   3724        st1             {v16.s}[0], [x0], x1
   3725        subs            w4,  w4,  #2
   3726        st1             {v16.s}[1], [x0], x1
   3727        b.le            9f
   3728 
   3729        uqadd           v26.8b,  v26.8b,  v21.8b  // base += 4
   3730        uqadd           v27.8b,  v27.8b,  v21.8b  // base += 4
   3731        b               1b
   3732 
   3733 9:
   3734        ret
   3735 
   3736 80:     // w == 8
   3737        dup             v29.8h,  w5               // dy
   3738 
   3739        mul             v30.8h,  v30.8h,  v29.8h  // {0,1,2,3,4,5,6,7}*dy
   3740        movi            v23.16b, #0x3e
   3741 
   3742        // Worst case max_base_y is 2*(width+height)-2, but width+height <= 16,
   3743        // so max_base_y <= 32.
   3744        ld1             {v0.16b, v1.16b}, [x2] // left[]
   3745        add             v30.8h,  v29.8h,  v30.8h  // ypos
   3746 
   3747        movi            v22.16b, #64
   3748        movi            v20.16b, #1
   3749        movi            v21.16b, #2
   3750 
   3751        xtn             v24.8b,  v30.8h           // (uint8_t)ypos
   3752        uqshrn          v26.8b,  v30.8h,  #6      // base
   3753        and             v24.8b,  v24.8b,  v23.8b  // frac
   3754 
   3755        uqadd           v27.8b,  v26.8b,  v20.8b  // base + 1
   3756        uqadd           v28.8b,  v26.8b,  v21.8b  // base + 2
   3757        sub             v25.8b,  v22.8b,  v24.8b  // 64 - frac
   3758        uqadd           v29.8b,  v27.8b,  v21.8b  // base + 3
   3759 
   3760        trn1            v24.2d,  v24.2d,  v24.2d  // frac
   3761        trn1            v26.2d,  v26.2d,  v28.2d  // base + 0, base + 2
   3762        trn1            v27.2d,  v27.2d,  v29.2d  // base + 1, base + 3
   3763        trn1            v25.2d,  v25.2d,  v25.2d  // 64 - frac
   3764 
   3765        movi            v21.16b, #4
   3766 1:
   3767        mov             v4.16b,  v31.16b
   3768        mov             v5.16b,  v31.16b
   3769        tbx             v4.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v26.16b // left[base], left[base+2]
   3770        tbx             v5.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v27.16b // left[base+1], left[base+3]
   3771 
   3772        umull           v16.8h,  v4.8b,   v25.8b  // left[base]*(64-frac)
   3773        umlal           v16.8h,  v5.8b,   v24.8b  // + left[base+1]*frac
   3774        umull2          v17.8h,  v4.16b,  v25.16b
   3775        umlal2          v17.8h,  v5.16b,  v24.16b
   3776        rshrn           v16.8b,  v16.8h,  #6
   3777        rshrn           v17.8b,  v17.8h,  #6
   3778        st1             {v16.8b}, [x0], x1
   3779        subs            w4,  w4,  #2
   3780        st1             {v17.8b}, [x0], x1
   3781        b.le            9f
   3782 
   3783        uqadd           v26.16b, v26.16b, v21.16b // base += 4
   3784        uqadd           v27.16b, v27.16b, v21.16b // base += 4
   3785        b               1b
   3786 
   3787 9:
   3788        ret
   3789 endfunc
   3790 
   3791 
   3792 // void ipred_filter_8bpc_neon(pixel *dst, const ptrdiff_t stride,
   3793 //                             const pixel *const topleft,
   3794 //                             const int width, const int height, const int filt_idx,
   3795 //                             const int max_width, const int max_height);
// Filter intra prediction: loads the 7 selected 8-tap filter rows from
// filter_intra_taps (widened to 16 bit in v16-v22), then dispatches on
// width. Each 4x2 output block is a weighted sum of 4 top pixels,
// 2 left pixels and the top-left pixel; the produced block feeds back
// as the top/left neighbors of the next block.
function ipred_filter_8bpc_neon, export=1
        and             w5,  w5,  #511            // filt_idx
        movrel          x6,  X(filter_intra_taps)
        lsl             w5,  w5,  #6              // each filter set is 64 bytes
        add             x6,  x6,  w5, uxtw
        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
        clz             w9,  w3
        movrel          x5,  ipred_filter_tbl
        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
        sub             w9,  w9,  #26             // jump table index from log2(width)
        ldrsw           x9,  [x5, w9, uxtw #2]
        sxtl            v16.8h,  v16.8b           // widen the signed taps to 16 bit
        sxtl            v17.8h,  v17.8b
        add             x5,  x5,  x9
        sxtl            v18.8h,  v18.8b
        sxtl            v19.8h,  v19.8b
        add             x6,  x0,  x1              // x6 = second-row dst pointer
        lsl             x1,  x1,  #1              // x1 = 2*stride
        sxtl            v20.8h,  v20.8b
        sxtl            v21.8h,  v21.8b
        sxtl            v22.8h,  v22.8b
        br              x5
40:
        AARCH64_VALID_JUMP_TARGET
        ldur            s0,  [x2, #1]             // top (0-3)
        sub             x2,  x2,  #2
        mov             x7,  #-2                  // step upwards through left[]
        uxtl            v0.8h,   v0.8b            // top (0-3)
4:
        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        sqrshrun        v2.8b,   v2.8h,   #4      // narrow with rounding, clamp to u8
        subs            w4,  w4,  #2
        st1             {v2.s}[0], [x0], x1
        uxtl            v0.8h,   v2.8b            // output becomes next iteration's top
        st1             {v2.s}[1], [x6], x1
        ext             v0.16b,  v0.16b,  v0.16b, #8 // move top from [4-7] to [0-3]
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        ldur            d0,  [x2, #1]             // top (0-7)
        sub             x2,  x2,  #2
        mov             x7,  #-2
        uxtl            v0.8h,   v0.8b            // top (0-7)
8:
        ld1             {v1.s}[0], [x2], x7       // left (0-1) + topleft (2)
        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
        uxtl            v1.8h,   v1.8b            // left (0-1) + topleft (2)
        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
        // Second 4x2 block: tops are top[4-7], left neighbors come from
        // the right edge of the just-computed first block.
        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v2.8b,   v2.8h,   #4
        uxtl            v1.8h,   v2.8b            // first block, in 16 bit
        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v1.h[3] // p5(left[0]) * filter(5)
        mla             v3.8h,   v22.8h,  v1.h[7] // p6(left[1]) * filter(6)
        sqrshrun        v3.8b,   v3.8h,   #4
        subs            w4,  w4,  #2
        st2             {v2.s, v3.s}[0], [x0], x1 // interleave the two 4-wide halves
        zip2            v0.2s,   v2.2s,   v3.2s   // bottom row becomes the next top
        st2             {v2.s, v3.s}[1], [x6], x1
        uxtl            v0.8h,   v0.8b
        b.gt            8b
        ret
160:
320:
        AARCH64_VALID_JUMP_TARGET
        add             x8,  x2,  #1              // x8 = top row read pointer
        sub             x2,  x2,  #2
        mov             x7,  #-2
        sub             x1,  x1,  w3, uxtw        // stride compensated for the #16 stores
        mov             w9,  w3                   // save width for per-row reload
1:
        ld1             {v0.s}[0], [x2], x7       // left (0-1) + topleft (2)
        uxtl            v0.8h,   v0.8b            // left (0-1) + topleft (2)
2:
        // Produce four 4x2 blocks per iteration, each feeding its right
        // edge into the next block's left neighbors via v0.
        ld1             {v2.16b}, [x8],   #16     // top(0-15)
        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
        uxtl            v1.8h,   v2.8b            // top(0-7)
        uxtl2           v2.8h,   v2.16b           // top(8-15)
        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)

        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v3.8b,   v3.8h,   #4
        uxtl            v0.8h,   v3.8b            // first block, in 16 bit
        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
        mla             v4.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v4.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
        sqrshrun        v4.8b,   v4.8h,   #4
        uxtl            v0.8h,   v4.8b            // second block, in 16 bit
        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
        mla             v5.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v5.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
        sqrshrun        v5.8b,   v5.8h,   #4
        uxtl            v0.8h,   v5.8b            // third block, in 16 bit
        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
        mla             v6.8h,   v21.8h,  v0.h[3] // p5(left[0]) * filter(5)
        mla             v6.8h,   v22.8h,  v0.h[7] // p6(left[1]) * filter(6)

        subs            w3,  w3,  #16
        sqrshrun        v6.8b,   v6.8h,   #4

        st4             {v3.s, v4.s, v5.s, v6.s}[0], [x0], #16
        st4             {v3.s, v4.s, v5.s, v6.s}[1], [x6], #16
        b.le            8f
        // Set up topleft/left neighbors for the next 16 columns.
        ins             v0.h[2], v2.h[7]
        ins             v0.b[0], v6.b[7]
        ins             v0.b[2], v6.b[3]
        b               2b
8:
        subs            w4,  w4,  #2
        b.le            9f
        sub             x8,  x6,  w9, uxtw        // next top row = previous bottom row
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9
        b               1b
9:
        ret
endfunc
   3950 
// Jump table for ipred_filter_8bpc_neon, indexed by clz(width) - 26;
// widths 16 and 32 share the 160/320 implementation.
jumptable ipred_filter_tbl
        .word 320b - ipred_filter_tbl  // w == 32
        .word 160b - ipred_filter_tbl  // w == 16
        .word 80b  - ipred_filter_tbl  // w == 8
        .word 40b  - ipred_filter_tbl  // w == 4
endjumptable
   3957 
   3958 // void pal_pred_8bpc_neon(pixel *dst, const ptrdiff_t stride,
   3959 //                         const pixel *const pal, const uint8_t *idx,
   3960 //                         const int w, const int h);
// Palette prediction: idx holds two packed indices per byte (low
// nibble first); they are unpacked with and/ushr + zip and mapped
// through the 8-entry palette (v0) with tbl. The low index is masked
// with 7 since the palette has at most 8 entries.
function pal_pred_8bpc_neon, export=1
        ld1             {v0.8b}, [x2]             // palette (8 entries)
        clz             w9,  w4
        movrel          x6,  pal_pred_tbl
        sub             w9,  w9,  #25             // jump table index from log2(w)
        movi            v31.16b, #7               // low-nibble index mask
        ldrsw           x9,  [x6, w9, uxtw #2]
        add             x6,  x6,  x9
        add             x2,  x0,  x1              // x2 = second-row dst pointer
        lsl             x1,  x1,  #1              // x1 = 2*stride
        br              x6
40:
        AARCH64_VALID_JUMP_TARGET
4:
        ld1             {v1.8b}, [x3], #8         // 16 packed indices = 4 rows of 4
        subs            w5,  w5,  #4
        ushr            v3.8b,   v1.8b,   #4      // odd (high-nibble) indices
        and             v2.8b,   v1.8b,   v31.8b  // even (low-nibble) indices
        zip1            v1.16b,  v2.16b,  v3.16b  // restore original index order
        tbl             v1.16b, {v0.16b}, v1.16b  // palette lookup
        st1             {v1.s}[0], [x0], x1
        st1             {v1.s}[1], [x2], x1
        st1             {v1.s}[2], [x0], x1
        st1             {v1.s}[3], [x2], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
8:
        ld1             {v1.16b}, [x3], #16       // 32 packed indices = 4 rows of 8
        subs            w5,  w5,  #4
        ushr            v4.16b,  v1.16b,  #4
        and             v3.16b,  v1.16b,  v31.16b
        zip1            v1.16b,  v3.16b,  v4.16b
        zip2            v2.16b,  v3.16b,  v4.16b
        tbl             v1.16b, {v0.16b}, v1.16b
        st1             {v1.d}[0], [x0], x1
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.d}[1], [x2], x1
        st1             {v2.d}[0], [x0], x1
        st1             {v2.d}[1], [x2], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
16:
        ld1             {v1.16b, v2.16b}, [x3], #32 // 4 rows of 16
        subs            w5,  w5,  #4
        ushr            v5.16b,  v1.16b,  #4
        and             v4.16b,  v1.16b,  v31.16b
        ushr            v7.16b,  v2.16b,  #4
        and             v6.16b,  v2.16b,  v31.16b
        zip1            v1.16b,  v4.16b,  v5.16b
        zip2            v2.16b,  v4.16b,  v5.16b
        zip1            v3.16b,  v6.16b,  v7.16b
        tbl             v1.16b, {v0.16b}, v1.16b
        zip2            v4.16b,  v6.16b,  v7.16b
        tbl             v2.16b, {v0.16b}, v2.16b
        st1             {v1.16b}, [x0], x1
        tbl             v3.16b, {v0.16b}, v3.16b
        st1             {v2.16b}, [x2], x1
        tbl             v4.16b, {v0.16b}, v4.16b
        st1             {v3.16b}, [x0], x1
        st1             {v4.16b}, [x2], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
32:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // 4 rows of 32
        subs            w5,  w5,  #4
        ushr            v21.16b, v16.16b, #4
        and             v20.16b, v16.16b, v31.16b
        ushr            v23.16b, v17.16b, #4
        and             v22.16b, v17.16b, v31.16b
        ushr            v25.16b, v18.16b, #4
        and             v24.16b, v18.16b, v31.16b
        ushr            v27.16b, v19.16b, #4
        and             v26.16b, v19.16b, v31.16b
        zip1            v16.16b, v20.16b, v21.16b
        zip2            v17.16b, v20.16b, v21.16b
        zip1            v18.16b, v22.16b, v23.16b
        zip2            v19.16b, v22.16b, v23.16b
        zip1            v20.16b, v24.16b, v25.16b
        zip2            v21.16b, v24.16b, v25.16b
        tbl             v16.16b, {v0.16b}, v16.16b
        zip1            v22.16b, v26.16b, v27.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        zip2            v23.16b, v26.16b, v27.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        tbl             v20.16b, {v0.16b}, v20.16b
        st1             {v16.16b, v17.16b}, [x0], x1
        tbl             v21.16b, {v0.16b}, v21.16b
        st1             {v18.16b, v19.16b}, [x2], x1
        tbl             v22.16b, {v0.16b}, v22.16b
        st1             {v20.16b, v21.16b}, [x0], x1
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v22.16b, v23.16b}, [x2], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
64:
        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x3], #64 // 2 rows of 64
        subs            w5,  w5,  #2
        ushr            v21.16b, v16.16b, #4
        and             v20.16b, v16.16b, v31.16b
        ushr            v23.16b, v17.16b, #4
        and             v22.16b, v17.16b, v31.16b
        ushr            v25.16b, v18.16b, #4
        and             v24.16b, v18.16b, v31.16b
        ushr            v27.16b, v19.16b, #4
        and             v26.16b, v19.16b, v31.16b
        zip1            v16.16b, v20.16b, v21.16b
        zip2            v17.16b, v20.16b, v21.16b
        zip1            v18.16b, v22.16b, v23.16b
        zip2            v19.16b, v22.16b, v23.16b
        zip1            v20.16b, v24.16b, v25.16b
        zip2            v21.16b, v24.16b, v25.16b
        tbl             v16.16b, {v0.16b}, v16.16b
        zip1            v22.16b, v26.16b, v27.16b
        tbl             v17.16b, {v0.16b}, v17.16b
        zip2            v23.16b, v26.16b, v27.16b
        tbl             v18.16b, {v0.16b}, v18.16b
        tbl             v19.16b, {v0.16b}, v19.16b
        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
        tbl             v20.16b, {v0.16b}, v20.16b
        tbl             v21.16b, {v0.16b}, v21.16b
        tbl             v22.16b, {v0.16b}, v22.16b
        tbl             v23.16b, {v0.16b}, v23.16b
        st1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x1
        b.gt            64b
        ret
endfunc
   4096 
// Offsets relative to the table base, indexed by clz(width); largest
// width first (64, 32, 16, 8, 4).
jumptable pal_pred_tbl
        .word 640b - pal_pred_tbl
        .word 320b - pal_pred_tbl
        .word 160b - pal_pred_tbl
        .word 80b  - pal_pred_tbl
        .word 40b  - pal_pred_tbl
endjumptable
   4104 
   4105 // void ipred_cfl_128_8bpc_neon(pixel *dst, const ptrdiff_t stride,
   4106 //                              const pixel *const topleft,
   4107 //                              const int width, const int height,
   4108 //                              const int16_t *ac, const int alpha);
// CfL "128" prediction: no usable edge pixels, so dc is the fixed
// mid-value 128 (8 bpc).  The L(ipred_cfl_splat_*) loops below compute
// dst = iclip_pixel(dc + apply_sign((|ac*alpha| + 32) >> 6)) and are
// shared entry points: the top/left/full cfl variants jump here (via
// ipred_cfl_splat_tbl) with their computed dc in v0.8h, alpha in v1.8h.
function ipred_cfl_128_8bpc_neon, export=1
        clz             w9,  w3                  // table index from log2(width)
        movrel          x7,  ipred_cfl_128_tbl
        sub             w9,  w9,  #26            // width 32 has 26 leading zeros
        ldrsw           x9,  [x7, w9, uxtw #2]   // load offset from table base
        movi            v0.8h,   #128 // dc
        dup             v1.8h,   w6   // alpha
        add             x7,  x7,  x9
        add             x6,  x0,  x1             // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1             // step two rows per store
        br              x7
L(ipred_cfl_splat_w4):
        AARCH64_VALID_JUMP_TARGET
1:      // 16 ac coefficients = 4 rows of width 4 per iteration
        ld1             {v2.8h, v3.8h}, [x5], #32
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        cmlt            v4.8h,   v2.8h,   #0     // sign
        cmlt            v5.8h,   v3.8h,   #0
        add             v2.8h,   v2.8h,   v4.8h  // diff + sign
        add             v3.8h,   v3.8h,   v5.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        st1             {v2.s}[0],  [x0], x1
        st1             {v2.s}[1],  [x6], x1
        subs            w4,  w4,  #4
        st1             {v3.s}[0],  [x0], x1
        st1             {v3.s}[1],  [x6], x1
        b.gt            1b
        ret
L(ipred_cfl_splat_w8):
        AARCH64_VALID_JUMP_TARGET
1:      // 32 ac coefficients = 4 rows of width 8 per iteration
        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x5], #64
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        cmlt            v16.8h,  v2.8h,   #0     // sign
        cmlt            v17.8h,  v3.8h,   #0
        cmlt            v18.8h,  v4.8h,   #0
        cmlt            v19.8h,  v5.8h,   #0
        add             v2.8h,   v2.8h,   v16.8h // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        st1             {v2.8b},  [x0], x1
        st1             {v3.8b},  [x6], x1
        subs            w4,  w4,  #4
        st1             {v4.8b},  [x0], x1
        st1             {v5.8b},  [x6], x1
        b.gt            1b
        ret
L(ipred_cfl_splat_w16):
        AARCH64_VALID_JUMP_TARGET
        // Handles widths 16 and 32: the inner loop walks a pair of rows
        // 16 pixels at a time, with x7 (ac) and x6 (dst) on the second row.
        add             x7,  x5,  w3, uxtw #1    // x7 = ac + 2*width (second ac row)
        sub             x1,  x1,  w3, uxtw       // stride minus width (stores advance x0/x6)
        mov             w9,  w3                  // back up width for the outer loop
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        mul             v2.8h,   v2.8h,   v1.8h  // diff = ac * alpha
        mul             v3.8h,   v3.8h,   v1.8h
        mul             v4.8h,   v4.8h,   v1.8h
        mul             v5.8h,   v5.8h,   v1.8h
        cmlt            v16.8h,  v2.8h,   #0     // sign
        cmlt            v17.8h,  v3.8h,   #0
        cmlt            v18.8h,  v4.8h,   #0
        cmlt            v19.8h,  v5.8h,   #0
        add             v2.8h,   v2.8h,   v16.8h // diff + sign
        add             v3.8h,   v3.8h,   v17.8h
        add             v4.8h,   v4.8h,   v18.8h
        add             v5.8h,   v5.8h,   v19.8h
        srshr           v2.8h,   v2.8h,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        srshr           v3.8h,   v3.8h,   #6
        srshr           v4.8h,   v4.8h,   #6
        srshr           v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        add             v4.8h,   v4.8h,   v0.8h
        add             v5.8h,   v5.8h,   v0.8h
        sqxtun          v2.8b,   v2.8h           // iclip_pixel(dc + apply_sign())
        sqxtun          v3.8b,   v3.8h
        sqxtun          v4.8b,   v4.8h
        sqxtun          v5.8b,   v5.8h
        subs            w3,  w3,  #16
        st1             {v2.8b, v3.8b},  [x0], #16
        st1             {v4.8b, v5.8b},  [x6], #16
        b.gt            1b
        subs            w4,  w4,  #2
        add             x5,  x5,  w9, uxtw #1    // skip the ac row consumed via x7
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                  // restore width for the next row pair
        b.gt            1b
        ret
endfunc
   4223 
// Indexed by clz(width)-26: width 32, 16, 8, 4.  Widths 32 and 16 share
// the w16 splat loop.  The inner label aliases the same table so the
// top/left/full cfl variants can pick the splat loop for their width.
jumptable ipred_cfl_128_tbl
ipred_cfl_splat_tbl:
        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w8)  - ipred_cfl_128_tbl
        .word L(ipred_cfl_splat_w4)  - ipred_cfl_128_tbl
endjumptable
   4231 
   4232 // void ipred_cfl_top_8bpc_neon(pixel *dst, const ptrdiff_t stride,
   4233 //                              const pixel *const topleft,
   4234 //                              const int width, const int height,
   4235 //                              const int16_t *ac, const int alpha);
// CfL with dc = rounded average of the `width` pixels above the block;
// the ac scaling itself is done by the shared splat loops.
function ipred_cfl_top_8bpc_neon, export=1
        clz             w9,  w3                  // table index from log2(width)
        movrel          x7,  ipred_cfl_top_tbl
        sub             w9,  w9,  #26
        ldrsw           x9,  [x7, w9, uxtw #2]
        dup             v1.8h,   w6   // alpha
        add             x2,  x2,  #1             // skip topleft; x2 = top row
        add             x7,  x7,  x9
        add             x6,  x0,  x1             // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1
        br              x7
4:
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]           // 4 top pixels, replicated to both halves
        uaddlv          h0,      v0.8b           // sum of 8 bytes = 2*sum(top)
        urshr           v0.4h,   v0.4h,   #3     // (2*sum + 4) >> 3 = rounded avg of 4
        dup             v0.8h,   v0.h[0]         // broadcast dc
        b               L(ipred_cfl_splat_w4)
8:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b           // sum of 8 top pixels
        urshr           v0.4h,   v0.4h,   #3     // rounded average of 8
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w8)
16:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b          // sum of 16 top pixels
        urshr           v0.4h,   v0.4h,   #4     // rounded average of 16
        dup             v0.8h,   v0.h[0]
        b               L(ipred_cfl_splat_w16)
32:
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h  // sum of 32 top pixels
        urshr           v2.4h,   v2.4h,   #5     // rounded average of 32
        dup             v0.8h,   v2.h[0]
        b               L(ipred_cfl_splat_w16)
endfunc
   4278 
// Indexed by clz(width)-26: width 32, 16, 8, 4.
jumptable ipred_cfl_top_tbl
        .word 32b - ipred_cfl_top_tbl
        .word 16b - ipred_cfl_top_tbl
        .word 8b  - ipred_cfl_top_tbl
        .word 4b  - ipred_cfl_top_tbl
endjumptable
   4285 
   4286 // void ipred_cfl_left_8bpc_neon(pixel *dst, const ptrdiff_t stride,
   4287 //                               const pixel *const topleft,
   4288 //                               const int width, const int height,
   4289 //                               const int16_t *ac, const int alpha);
// CfL with dc = rounded average of the `height` pixels left of the
// block.  Two dispatches: x7 picks the averaging routine by height,
// x9 picks the shared splat loop by width.
function ipred_cfl_left_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw       // x2 = topleft - height (left-edge pixels)
        clz             w9,  w3
        clz             w8,  w4
        movrel          x10, ipred_cfl_splat_tbl
        movrel          x7,  ipred_cfl_left_tbl
        sub             w9,  w9,  #26            // width index
        sub             w8,  w8,  #26            // height index
        ldrsw           x9,  [x10, w9, uxtw #2]
        ldrsw           x8,  [x7,  w8, uxtw #2]
        dup             v1.8h,   w6   // alpha
        add             x9,  x10, x9             // x9 = splat loop for this width
        add             x7,  x7,  x8
        add             x6,  x0,  x1             // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1
        br              x7

L(ipred_cfl_left_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v0.2s},  [x2]           // 4 left pixels, replicated to both halves
        uaddlv          h0,      v0.8b           // sum of 8 bytes = 2*sum(left)
        urshr           v0.4h,   v0.4h,   #3     // (2*sum + 4) >> 3 = rounded avg of 4
        dup             v0.8h,   v0.h[0]         // broadcast dc
        br              x9

L(ipred_cfl_left_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2]
        uaddlv          h0,      v0.8b           // sum of 8 left pixels
        urshr           v0.4h,   v0.4h,   #3     // rounded average of 8
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2]
        uaddlv          h0,      v0.16b          // sum of 16 left pixels
        urshr           v0.4h,   v0.4h,   #4     // rounded average of 16
        dup             v0.8h,   v0.h[0]
        br              x9

L(ipred_cfl_left_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             v2.4h,   v2.4h,   v3.4h  // sum of 32 left pixels
        urshr           v2.4h,   v2.4h,   #5     // rounded average of 32
        dup             v0.8h,   v2.h[0]
        br              x9
endfunc
   4341 
// Indexed by clz(height)-26: height 32, 16, 8, 4.
jumptable ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h8)  - ipred_cfl_left_tbl
        .word L(ipred_cfl_left_h4)  - ipred_cfl_left_tbl
endjumptable
   4348 
   4349 // void ipred_cfl_8bpc_neon(pixel *dst, const ptrdiff_t stride,
   4350 //                          const pixel *const topleft,
   4351 //                          const int width, const int height,
   4352 //                          const int16_t *ac, const int alpha);
// Full CfL dc: dc = (sum(top) + sum(left) + (w+h)/2) / (w+h).  The
// L(ipred_cfl_h*) routine (picked by height via x7) sums the left edge,
// then branches via x9 to the L(ipred_cfl_w*) routine (picked by width),
// which adds the top-edge sum and performs the division: a right shift
// by ctz(w+h), plus a fixed-point reciprocal multiply when w != h
// leaves a residual factor of 3 or 5.
function ipred_cfl_8bpc_neon, export=1
        sub             x2,  x2,  w4, uxtw       // x2 = topleft - height (left-edge pixels)
        add             w8,  w3,  w4             // width + height
        dup             v1.8h,   w6              // alpha
        clz             w9,  w3
        clz             w6,  w4
        dup             v16.8h, w8               // width + height
        movrel          x7,  ipred_cfl_tbl
        rbit            w8,  w8                  // rbit(width + height)
        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
        sub             w6,  w6,  #26
        clz             w8,  w8                  // ctz(width + height)
        ldrsw           x9,  [x7, w9, uxtw #2]
        ldrsw           x6,  [x7, w6, uxtw #2]
        neg             w8,  w8                  // -ctz(width + height)
        add             x9,  x7,  x9             // x9 = w routine (second half of table)
        add             x7,  x7,  x6             // x7 = h routine (first half of table)
        ushr            v16.8h,  v16.8h,  #1     // (width + height) >> 1
        dup             v17.8h,  w8              // -ctz(width + height)
        add             x6,  x0,  x1             // x6 = dst + stride (odd rows)
        lsl             x1,  x1,  #1
        br              x7

L(ipred_cfl_h4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.s}[0],  [x2], #4     // 4 left pixels
        ins             v0.s[1], wzr             // zero upper half before the 8-byte sum
        add             x2,  x2,  #1             // skip topleft; x2 = top row
        uaddlv          h0,      v0.8b           // sum(left)
        br              x9
L(ipred_cfl_w4):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.s}[0],  [x2]         // 4 top pixels
        ins             v2.s[1], wzr
        add             v0.4h,   v0.4h,   v16.4h // sum(left) + (w+h)/2
        uaddlv          h2,      v2.8b           // sum(top)
        cmp             w4,  #4
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(w+h)
        b.eq            1f
        // h = 8/16
        // w+h is 3*2^k (h=8) or 5*2^k (h=16); finish the division by
        // multiplying with (1<<16)/3 (0x5556) or (1<<16)/5 (0x3334),
        // halved to match sqdmulh's doubling.  Both constants are packed
        // into w16 and the right one is selected by the shift below.
        mov             w16, #(0x3334/2)
        movk            w16, #(0x5556/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17            // shift by 32 wraps to 0 on a w register
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h // dc = (dc * const) >> 16
1:
        dup             v0.8h,   v0.h[0]         // broadcast dc
        b               L(ipred_cfl_splat_w4)

L(ipred_cfl_h8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.8b},  [x2], #8       // 8 left pixels
        uaddlv          h0,      v0.8b           // sum(left)
        add             x2,  x2,  #1             // skip topleft; x2 = top row
        br              x9
L(ipred_cfl_w8):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.8b},  [x2]           // 8 top pixels
        add             v0.4h,   v0.4h,   v16.4h // sum(left) + (w+h)/2
        uaddlv          h2,      v2.8b           // sum(top)
        cmp             w4,  #8
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(w+h)
        b.eq            1f
        // h = 4/16/32
        // Residual factor is 5 (h=32, w+h=40) or 3 (h=4/16): multiply
        // by the matching fixed-point reciprocal.
        cmp             w4,  #32
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h // dc = (dc * const) >> 16
1:
        dup             v0.8h,   v0.h[0]         // broadcast dc
        b               L(ipred_cfl_splat_w8)

L(ipred_cfl_h16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v0.16b}, [x2], #16      // 16 left pixels
        uaddlv          h0,      v0.16b          // sum(left)
        add             x2,  x2,  #1             // skip topleft; x2 = top row
        br              x9
L(ipred_cfl_w16):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b}, [x2]           // 16 top pixels
        add             v0.4h,   v0.4h,   v16.4h // sum(left) + (w+h)/2
        uaddlv          h2,      v2.16b          // sum(top)
        cmp             w4,  #16
        add             v0.4h,   v0.4h,   v2.4h
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(w+h)
        b.eq            1f
        // h = 4/8/32
        // Residual factor is 5 (h=4, w+h=20) or 3 (h=8/32).
        cmp             w4,  #4
        mov             w16, #(0x3334/2)
        mov             w17, #(0x5556/2)
        csel            w16, w16, w17, eq
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h // dc = (dc * const) >> 16
1:
        dup             v0.8h,   v0.h[0]         // broadcast dc
        b               L(ipred_cfl_splat_w16)

L(ipred_cfl_h32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2], #32 // 32 left pixels
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        add             x2,  x2,  #1             // skip topleft; x2 = top row
        add             v0.4h,   v2.4h,   v3.4h  // sum(left)
        br              x9
L(ipred_cfl_w32):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v2.16b, v3.16b}, [x2]   // 32 top pixels
        add             v0.4h,   v0.4h,   v16.4h // sum(left) + (w+h)/2
        uaddlv          h2,      v2.16b
        uaddlv          h3,      v3.16b
        cmp             w4,  #32
        add             v0.4h,   v0.4h,   v2.4h
        add             v0.4h,   v0.4h,   v3.4h  // + sum(top)
        ushl            v0.4h,   v0.4h,   v17.4h // >> ctz(w+h)
        b.eq            1f
        // h = 8/16
        // Residual factor is 5 (h=8, w+h=40) or 3 (h=16, w+h=48); the
        // packed constants are swapped relative to the w4 case.
        mov             w16, #(0x5556/2)
        movk            w16, #(0x3334/2), lsl #16
        add             w17, w4,  w4  // w17 = 2*h = 16 or 32
        lsr             w16, w16, w17            // shift by 32 wraps to 0 on a w register
        dup             v16.4h,  w16
        sqdmulh         v0.4h,   v0.4h,   v16.4h // dc = (dc * const) >> 16
1:
        dup             v0.8h,   v0.h[0]         // broadcast dc
        b               L(ipred_cfl_splat_w16)
endfunc
   4486 
// First half indexed by clz(height)-26 (h32..h4); second half by
// clz(width)-22, offsetting the width index by 4 entries (w32..w4).
jumptable ipred_cfl_tbl
        .word L(ipred_cfl_h32) - ipred_cfl_tbl
        .word L(ipred_cfl_h16) - ipred_cfl_tbl
        .word L(ipred_cfl_h8)  - ipred_cfl_tbl
        .word L(ipred_cfl_h4)  - ipred_cfl_tbl
        .word L(ipred_cfl_w32) - ipred_cfl_tbl
        .word L(ipred_cfl_w16) - ipred_cfl_tbl
        .word L(ipred_cfl_w8)  - ipred_cfl_tbl
        .word L(ipred_cfl_w4)  - ipred_cfl_tbl
endjumptable
   4497 
// void ipred_cfl_ac_420_8bpc_neon(int16_t *const ac, const pixel *const ypx,
//                                 const ptrdiff_t stride, const int w_pad,
//                                 const int h_pad, const int cw, const int ch);
   4501 function ipred_cfl_ac_420_8bpc_neon, export=1
   4502        clz             w8,  w5
   4503        lsl             w4,  w4,  #2
   4504        movrel          x7,  ipred_cfl_ac_420_tbl
   4505        sub             w8,  w8,  #27
   4506        ldrsw           x8,  [x7, w8, uxtw #2]
   4507        movi            v16.8h,  #0
   4508        movi            v17.8h,  #0
   4509        movi            v18.8h,  #0
   4510        movi            v19.8h,  #0
   4511        add             x7,  x7,  x8
   4512        sub             w8,  w6,  w4         // height - h_pad
   4513        rbit            w9,  w5              // rbit(width)
   4514        rbit            w10, w6              // rbit(height)
   4515        clz             w9,  w9              // ctz(width)
   4516        clz             w10, w10             // ctz(height)
   4517        add             w9,  w9,  w10        // log2sz
   4518        add             x10, x1,  x2
   4519        dup             v31.4s,  w9
   4520        lsl             x2,  x2,  #1
   4521        neg             v31.4s,  v31.4s      // -log2sz
   4522        br              x7
   4523 
   4524 L(ipred_cfl_ac_420_w4):
   4525        AARCH64_VALID_JUMP_TARGET
   4526 1:      // Copy and subsample input
   4527        ld1             {v0.8b},   [x1],  x2
   4528        ld1             {v1.8b},   [x10], x2
   4529        ld1             {v0.d}[1], [x1],  x2
   4530        ld1             {v1.d}[1], [x10], x2
   4531        uaddlp          v0.8h,   v0.16b
   4532        uaddlp          v1.8h,   v1.16b
   4533        add             v0.8h,   v0.8h,   v1.8h
   4534        shl             v0.8h,   v0.8h,   #1
   4535        subs            w8,  w8,  #2
   4536        st1             {v0.8h}, [x0], #16
   4537        add             v16.8h,  v16.8h,  v0.8h
   4538        b.gt            1b
   4539        trn2            v1.2d,   v0.2d,   v0.2d
   4540        trn2            v0.2d,   v0.2d,   v0.2d
   4541 L(ipred_cfl_ac_420_w4_hpad):
   4542        cbz             w4,  3f
   4543 2:      // Vertical padding (h_pad > 0)
   4544        subs            w4,  w4,  #4
   4545        st1             {v0.8h, v1.8h}, [x0], #32
   4546        add             v16.8h,  v16.8h,  v0.8h
   4547        add             v17.8h,  v17.8h,  v1.8h
   4548        b.gt            2b
   4549 3:
   4550        // Aggregate the sums
   4551        add             v0.8h,   v16.8h,  v17.8h
   4552        uaddlv          s0,  v0.8h                // sum
   4553        sub             x0,  x0,  w6, uxtw #3
   4554        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
   4555        dup             v4.8h,   v4.h[0]
   4556 6:      // Subtract dc from ac
   4557        ld1             {v0.8h, v1.8h}, [x0]
   4558        subs            w6,  w6,  #4
   4559        sub             v0.8h,   v0.8h,   v4.8h
   4560        sub             v1.8h,   v1.8h,   v4.8h
   4561        st1             {v0.8h, v1.8h}, [x0], #32
   4562        b.gt            6b
   4563        ret
   4564 
   4565 L(ipred_cfl_ac_420_w8):
   4566        AARCH64_VALID_JUMP_TARGET
   4567        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
   4568 1:      // Copy and subsample input, without padding
   4569        ld1             {v0.16b}, [x1],  x2
   4570        ld1             {v1.16b}, [x10], x2
   4571        ld1             {v2.16b}, [x1],  x2
   4572        uaddlp          v0.8h,   v0.16b
   4573        ld1             {v3.16b}, [x10], x2
   4574        uaddlp          v1.8h,   v1.16b
   4575        uaddlp          v2.8h,   v2.16b
   4576        uaddlp          v3.8h,   v3.16b
   4577        add             v0.8h,   v0.8h,   v1.8h
   4578        add             v2.8h,   v2.8h,   v3.8h
   4579        shl             v0.8h,   v0.8h,   #1
   4580        shl             v1.8h,   v2.8h,   #1
   4581        subs            w8,  w8,  #2
   4582        st1             {v0.8h, v1.8h}, [x0], #32
   4583        add             v16.8h,  v16.8h,  v0.8h
   4584        add             v17.8h,  v17.8h,  v1.8h
   4585        b.gt            1b
   4586        mov             v0.16b,  v1.16b
   4587        b               L(ipred_cfl_ac_420_w8_hpad)
   4588 
   4589 L(ipred_cfl_ac_420_w8_wpad):
   4590 1:      // Copy and subsample input, padding 4
   4591        ld1             {v0.8b},   [x1],  x2
   4592        ld1             {v1.8b},   [x10], x2
   4593        ld1             {v0.d}[1], [x1],  x2
   4594        ld1             {v1.d}[1], [x10], x2
   4595        uaddlp          v0.8h,   v0.16b
   4596        uaddlp          v1.8h,   v1.16b
   4597        add             v0.8h,   v0.8h,   v1.8h
   4598        shl             v0.8h,   v0.8h,   #1
   4599        dup             v1.4h,   v0.h[3]
   4600        dup             v3.4h,   v0.h[7]
   4601        trn2            v2.2d,   v0.2d,   v0.2d
   4602        subs            w8,  w8,  #2
   4603        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
   4604        add             v16.4h,  v16.4h,  v0.4h
   4605        add             v17.4h,  v17.4h,  v1.4h
   4606        add             v18.4h,  v18.4h,  v2.4h
   4607        add             v19.4h,  v19.4h,  v3.4h
   4608        b.gt            1b
   4609        trn1            v0.2d,   v2.2d,   v3.2d
   4610        trn1            v1.2d,   v2.2d,   v3.2d
   4611 
   4612 L(ipred_cfl_ac_420_w8_hpad):
   4613        cbz             w4,  3f
   4614 2:      // Vertical padding (h_pad > 0)
   4615        subs            w4,  w4,  #4
   4616        st1             {v0.8h, v1.8h}, [x0], #32
   4617        add             v16.8h,  v16.8h,  v0.8h
   4618        add             v17.8h,  v17.8h,  v1.8h
   4619        st1             {v0.8h, v1.8h}, [x0], #32
   4620        add             v18.8h,  v18.8h,  v0.8h
   4621        add             v19.8h,  v19.8h,  v1.8h
   4622        b.gt            2b
   4623 3:
   4624 
   4625 L(ipred_cfl_ac_420_w8_calc_subtract_dc):
   4626        // Aggregate the sums
   4627        add             v0.8h,   v16.8h,  v17.8h
   4628        add             v2.8h,   v18.8h,  v19.8h
   4629        uaddlp          v0.4s,   v0.8h
   4630        uaddlp          v2.4s,   v2.8h
   4631        add             v0.4s,   v0.4s,   v2.4s
   4632        addv            s0,  v0.4s                // sum
   4633        sub             x0,  x0,  w6, uxtw #4
   4634        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
   4635        dup             v4.8h,   v4.h[0]
   4636 L(ipred_cfl_ac_420_w8_subtract_dc):
   4637 6:      // Subtract dc from ac
   4638        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]
   4639        subs            w6,  w6,  #4
   4640        sub             v0.8h,   v0.8h,   v4.8h
   4641        sub             v1.8h,   v1.8h,   v4.8h
   4642        sub             v2.8h,   v2.8h,   v4.8h
   4643        sub             v3.8h,   v3.8h,   v4.8h
   4644        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4645        b.gt            6b
   4646        ret
   4647 
   4648 L(ipred_cfl_ac_420_w16):
   4649        AARCH64_VALID_JUMP_TARGET
   4650        movrel          x7,  ipred_cfl_ac_420_w16_tbl
   4651        ldrsw           x3,  [x7, w3, uxtw #2]
   4652        add             x7,  x7,  x3
   4653        br              x7
   4654 
   4655 L(ipred_cfl_ac_420_w16_wpad0):
   4656        AARCH64_VALID_JUMP_TARGET
   4657 1:      // Copy and subsample input, without padding
   4658        ld1             {v0.16b, v1.16b}, [x1],  x2
   4659        ld1             {v2.16b, v3.16b}, [x10], x2
   4660        uaddlp          v0.8h,   v0.16b
   4661        ld1             {v4.16b, v5.16b}, [x1],  x2
   4662        uaddlp          v1.8h,   v1.16b
   4663        ld1             {v6.16b, v7.16b}, [x10], x2
   4664        uaddlp          v2.8h,   v2.16b
   4665        uaddlp          v3.8h,   v3.16b
   4666        uaddlp          v4.8h,   v4.16b
   4667        uaddlp          v5.8h,   v5.16b
   4668        uaddlp          v6.8h,   v6.16b
   4669        uaddlp          v7.8h,   v7.16b
   4670        add             v0.8h,   v0.8h,   v2.8h
   4671        add             v1.8h,   v1.8h,   v3.8h
   4672        add             v4.8h,   v4.8h,   v6.8h
   4673        add             v5.8h,   v5.8h,   v7.8h
   4674        shl             v0.8h,   v0.8h,   #1
   4675        shl             v1.8h,   v1.8h,   #1
   4676        shl             v2.8h,   v4.8h,   #1
   4677        shl             v3.8h,   v5.8h,   #1
   4678        subs            w8,  w8,  #2
   4679        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4680        add             v16.8h,  v16.8h,  v0.8h
   4681        add             v17.8h,  v17.8h,  v1.8h
   4682        add             v18.8h,  v18.8h,  v2.8h
   4683        add             v19.8h,  v19.8h,  v3.8h
   4684        b.gt            1b
   4685        mov             v0.16b,  v2.16b
   4686        mov             v1.16b,  v3.16b
   4687        b               L(ipred_cfl_ac_420_w16_hpad)
   4688 
   4689 L(ipred_cfl_ac_420_w16_wpad1):
   4690        AARCH64_VALID_JUMP_TARGET
   4691 1:      // Copy and subsample input, padding 4
   4692        ldr             d1,  [x1,  #16]
   4693        ld1             {v0.16b}, [x1],  x2
   4694        ldr             d3,  [x10, #16]
   4695        ld1             {v2.16b}, [x10], x2
   4696        uaddlp          v1.4h,   v1.8b
   4697        ldr             d5,  [x1,  #16]
   4698        uaddlp          v0.8h,   v0.16b
   4699        ld1             {v4.16b}, [x1],  x2
   4700        uaddlp          v3.4h,   v3.8b
   4701        ldr             d7,  [x10, #16]
   4702        uaddlp          v2.8h,   v2.16b
   4703        ld1             {v6.16b}, [x10], x2
   4704        uaddlp          v5.4h,   v5.8b
   4705        uaddlp          v4.8h,   v4.16b
   4706        uaddlp          v7.4h,   v7.8b
   4707        uaddlp          v6.8h,   v6.16b
   4708        add             v1.4h,   v1.4h,   v3.4h
   4709        add             v0.8h,   v0.8h,   v2.8h
   4710        add             v5.4h,   v5.4h,   v7.4h
   4711        add             v4.8h,   v4.8h,   v6.8h
   4712        shl             v1.4h,   v1.4h,   #1
   4713        shl             v0.8h,   v0.8h,   #1
   4714        shl             v3.4h,   v5.4h,   #1
   4715        shl             v2.8h,   v4.8h,   #1
   4716        dup             v4.4h,   v1.h[3]
   4717        dup             v5.4h,   v3.h[3]
   4718        trn1            v1.2d,   v1.2d,   v4.2d
   4719        trn1            v3.2d,   v3.2d,   v5.2d
   4720        subs            w8,  w8,  #2
   4721        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4722        add             v16.8h,  v16.8h,  v0.8h
   4723        add             v17.8h,  v17.8h,  v1.8h
   4724        add             v18.8h,  v18.8h,  v2.8h
   4725        add             v19.8h,  v19.8h,  v3.8h
   4726        b.gt            1b
   4727        mov             v0.16b,  v2.16b
   4728        mov             v1.16b,  v3.16b
   4729        b               L(ipred_cfl_ac_420_w16_hpad)
   4730 
   4731 L(ipred_cfl_ac_420_w16_wpad2):
   4732        AARCH64_VALID_JUMP_TARGET
   4733 1:      // Copy and subsample input, padding 8
   4734        ld1             {v0.16b}, [x1],  x2
   4735        ld1             {v2.16b}, [x10], x2
   4736        ld1             {v4.16b}, [x1],  x2
   4737        uaddlp          v0.8h,   v0.16b
   4738        ld1             {v6.16b}, [x10], x2
   4739        uaddlp          v2.8h,   v2.16b
   4740        uaddlp          v4.8h,   v4.16b
   4741        uaddlp          v6.8h,   v6.16b
   4742        add             v0.8h,   v0.8h,   v2.8h
   4743        add             v4.8h,   v4.8h,   v6.8h
   4744        shl             v0.8h,   v0.8h,   #1
   4745        shl             v2.8h,   v4.8h,   #1
   4746        dup             v1.8h,   v0.h[7]
   4747        dup             v3.8h,   v2.h[7]
   4748        subs            w8,  w8,  #2
   4749        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4750        add             v16.8h,  v16.8h,  v0.8h
   4751        add             v17.8h,  v17.8h,  v1.8h
   4752        add             v18.8h,  v18.8h,  v2.8h
   4753        add             v19.8h,  v19.8h,  v3.8h
   4754        b.gt            1b
   4755        mov             v0.16b,  v2.16b
   4756        mov             v1.16b,  v3.16b
   4757        b               L(ipred_cfl_ac_420_w16_hpad)
   4758 
   4759 L(ipred_cfl_ac_420_w16_wpad3):
   4760        AARCH64_VALID_JUMP_TARGET
   4761 1:      // Copy and subsample input, padding 12
   4762        ld1             {v0.8b}, [x1],  x2
   4763        ld1             {v2.8b}, [x10], x2
   4764        ld1             {v4.8b}, [x1],  x2
   4765        uaddlp          v0.4h,   v0.8b
   4766        ld1             {v6.8b}, [x10], x2
   4767        uaddlp          v2.4h,   v2.8b
   4768        uaddlp          v4.4h,   v4.8b
   4769        uaddlp          v6.4h,   v6.8b
   4770        add             v0.4h,   v0.4h,   v2.4h
   4771        add             v4.4h,   v4.4h,   v6.4h
   4772        shl             v0.4h,   v0.4h,   #1
   4773        shl             v2.4h,   v4.4h,   #1
   4774        dup             v1.8h,   v0.h[3]
   4775        dup             v3.8h,   v2.h[3]
   4776        trn1            v0.2d,   v0.2d,   v1.2d
   4777        trn1            v2.2d,   v2.2d,   v3.2d
   4778        subs            w8,  w8,  #2
   4779        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4780        add             v16.8h,  v16.8h,  v0.8h
   4781        add             v17.8h,  v17.8h,  v1.8h
   4782        add             v18.8h,  v18.8h,  v2.8h
   4783        add             v19.8h,  v19.8h,  v3.8h
   4784        b.gt            1b
   4785        mov             v0.16b,  v2.16b
   4786        mov             v1.16b,  v3.16b
   4787 
   4788 L(ipred_cfl_ac_420_w16_hpad):
   4789        cbz             w4,  3f
   4790 2:      // Vertical padding (h_pad > 0)
   4791        subs            w4,  w4,  #4
   4792        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4793        add             v16.8h,  v16.8h,  v0.8h
   4794        add             v17.8h,  v17.8h,  v1.8h
   4795        add             v18.8h,  v18.8h,  v2.8h
   4796        add             v19.8h,  v19.8h,  v3.8h
   4797        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4798        add             v16.8h,  v16.8h,  v0.8h
   4799        add             v17.8h,  v17.8h,  v1.8h
   4800        add             v18.8h,  v18.8h,  v2.8h
   4801        add             v19.8h,  v19.8h,  v3.8h
   4802        b.gt            2b
   4803 3:
   4804 
   4805        // Double the height and reuse the w8 summing/subtracting
   4806        lsl             w6,  w6,  #1
   4807        b               L(ipred_cfl_ac_420_w8_calc_subtract_dc)
   4808 endfunc
   4809 
   4810 jumptable ipred_cfl_ac_420_tbl
        // Signed 32-bit offsets relative to the table base, selecting the
        // width-specific 4:2:0 loop; entries are ordered w16, w8, w4.
   4811        .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl
   4812        .word L(ipred_cfl_ac_420_w8)  - ipred_cfl_ac_420_tbl
   4813        .word L(ipred_cfl_ac_420_w4)  - ipred_cfl_ac_420_tbl
   4814 endjumptable
   4815 
   4816 jumptable ipred_cfl_ac_420_w16_tbl
        // One entry per w_pad value (0-3), i.e. loops padding 0/4/8/12
        // output columns on the right edge.
   4817        .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl
   4818        .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl
   4819        .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl
   4820        .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl
   4821 endjumptable
   4822 
   4823 // void cfl_ac_422_8bpc_neon(int16_t *const ac, const pixel *const ypx,
   4824 //                           const ptrdiff_t stride, const int w_pad,
   4825 //                           const int h_pad, const int cw, const int ch);
        //
        // Fill the CfL AC buffer for 4:2:2 subsampling: horizontal pixel
        // pairs are summed (uaddlp) and shifted left by 2, so each output
        // carries 8x the pixel magnitude, matching the 4:2:0 (pairwise sum
        // of 4, shl #1) and 4:4:4 (ushll #3) variants. v16-v19 accumulate a
        // running sum used for the DC subtraction in the tail code shared
        // with the 4:2:0 function (the L(ipred_cfl_ac_420_*) targets).
        // Register roles after setup:
        //   x0 = ac output, x1/x10 = even/odd source rows, x2 = 2*stride,
        //   w3 = w_pad, w4 = h_pad in rows, w8 = height - h_pad,
        //   v31.4s = -log2(width*height), the shift used by the shared
        //   DC-averaging tail.
   4826 function ipred_cfl_ac_422_8bpc_neon, export=1
   4827        clz             w8,  w5
   4828        lsl             w4,  w4,  #2         // h_pad: units of 4 -> rows
   4829        movrel          x7,  ipred_cfl_ac_422_tbl
   4830        sub             w8,  w8,  #27        // table index: clz(width)-27
   4831        ldrsw           x8,  [x7, w8, uxtw #2]
   4832        movi            v16.8h,  #0          // clear sum accumulators
   4833        movi            v17.8h,  #0
   4834        movi            v18.8h,  #0
   4835        movi            v19.8h,  #0
   4836        add             x7,  x7,  x8
   4837        sub             w8,  w6,  w4         // height - h_pad
   4838        rbit            w9,  w5              // rbit(width)
   4839        rbit            w10, w6              // rbit(height)
   4840        clz             w9,  w9              // ctz(width)
   4841        clz             w10, w10             // ctz(height)
   4842        add             w9,  w9,  w10        // log2sz
   4843        add             x10, x1,  x2         // x10 walks the odd rows
   4844        dup             v31.4s,  w9
   4845        lsl             x2,  x2,  #1         // advance two rows per load
   4846        neg             v31.4s,  v31.4s      // -log2sz
   4847        br              x7
   4848 
   4849 L(ipred_cfl_ac_422_w4):
   4850        AARCH64_VALID_JUMP_TARGET
   4851 1:      // Copy and subsample input
   4852        ld1             {v0.8b},   [x1],  x2
   4853        ld1             {v0.d}[1], [x10], x2
   4854        ld1             {v1.8b},   [x1],  x2
   4855        ld1             {v1.d}[1], [x10], x2
   4856        uaddlp          v0.8h,   v0.16b
   4857        uaddlp          v1.8h,   v1.16b
   4858        shl             v0.8h,   v0.8h,   #2
   4859        shl             v1.8h,   v1.8h,   #2
   4860        subs            w8,  w8,  #4
   4861        add             v16.8h,  v16.8h,  v0.8h
   4862        add             v17.8h,  v17.8h,  v1.8h
   4863        st1             {v0.8h, v1.8h}, [x0], #32
   4864        b.gt            1b
        // Broadcast the last output row (high half of v1) into both halves
        // of v0/v1 so the shared vertical-padding loop can replicate it.
   4865        trn2            v0.2d,   v1.2d,   v1.2d
   4866        trn2            v1.2d,   v1.2d,   v1.2d
   4867        b               L(ipred_cfl_ac_420_w4_hpad)
   4868 
   4869 L(ipred_cfl_ac_422_w8):
   4870        AARCH64_VALID_JUMP_TARGET
   4871        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
   4872 1:      // Copy and subsample input, without padding
   4873        ld1             {v0.16b}, [x1],  x2
   4874        ld1             {v1.16b}, [x10], x2
   4875        ld1             {v2.16b}, [x1],  x2
   4876        uaddlp          v0.8h,   v0.16b
   4877        ld1             {v3.16b}, [x10], x2
   4878        uaddlp          v1.8h,   v1.16b
   4879        uaddlp          v2.8h,   v2.16b
   4880        uaddlp          v3.8h,   v3.16b
   4881        shl             v0.8h,   v0.8h,   #2
   4882        shl             v1.8h,   v1.8h,   #2
   4883        shl             v2.8h,   v2.8h,   #2
   4884        shl             v3.8h,   v3.8h,   #2
   4885        subs            w8,  w8,  #4
   4886        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4887        add             v16.8h,  v16.8h,  v0.8h
   4888        add             v17.8h,  v17.8h,  v1.8h
   4889        add             v18.8h,  v18.8h,  v2.8h
   4890        add             v19.8h,  v19.8h,  v3.8h
   4891        b.gt            1b
        // Keep the last produced row in v0/v1 for the vertical-padding tail.
   4892        mov             v0.16b,  v3.16b
   4893        mov             v1.16b,  v3.16b
   4894        b               L(ipred_cfl_ac_420_w8_hpad)
   4895 
   4896 L(ipred_cfl_ac_422_w8_wpad):
   4897 1:      // Copy and subsample input, padding 4
   4898        ld1             {v0.8b},   [x1],  x2
   4899        ld1             {v0.d}[1], [x10], x2
   4900        ld1             {v2.8b},   [x1],  x2
   4901        ld1             {v2.d}[1], [x10], x2
   4902        uaddlp          v0.8h,   v0.16b
   4903        uaddlp          v2.8h,   v2.16b
   4904        shl             v0.8h,   v0.8h,   #2
   4905        shl             v2.8h,   v2.8h,   #2
        // Replicate each row's last valid subsampled value across its padded
        // right half (4 entries per row), two rows per register.
   4906        dup             v4.4h,   v0.h[3]
   4907        dup             v5.8h,   v0.h[7]
   4908        dup             v6.4h,   v2.h[3]
   4909        dup             v7.8h,   v2.h[7]
   4910        trn2            v1.2d,   v0.2d,   v5.2d
   4911        trn1            v0.2d,   v0.2d,   v4.2d
   4912        trn2            v3.2d,   v2.2d,   v7.2d
   4913        trn1            v2.2d,   v2.2d,   v6.2d
   4914        subs            w8,  w8,  #4
   4915        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4916        add             v16.8h,  v16.8h,  v0.8h
   4917        add             v17.8h,  v17.8h,  v1.8h
   4918        add             v18.8h,  v18.8h,  v2.8h
   4919        add             v19.8h,  v19.8h,  v3.8h
   4920        b.gt            1b
   4921        mov             v0.16b,  v3.16b
   4922        mov             v1.16b,  v3.16b
   4923        b               L(ipred_cfl_ac_420_w8_hpad)
   4924 
   4925 L(ipred_cfl_ac_422_w16):
   4926        AARCH64_VALID_JUMP_TARGET
        // Dispatch on w_pad (0-3) to the loop with the matching amount of
        // right-edge padding.
   4927        movrel          x7,  ipred_cfl_ac_422_w16_tbl
   4928        ldrsw           x3,  [x7, w3, uxtw #2]
   4929        add             x7,  x7,  x3
   4930        br              x7
   4931 
   4932 L(ipred_cfl_ac_422_w16_wpad0):
   4933        AARCH64_VALID_JUMP_TARGET
   4934 1:      // Copy and subsample input, without padding
   4935        ld1             {v0.16b, v1.16b}, [x1],  x2
   4936        ld1             {v2.16b, v3.16b}, [x10], x2
   4937        uaddlp          v0.8h,   v0.16b
   4938        uaddlp          v1.8h,   v1.16b
   4939        uaddlp          v2.8h,   v2.16b
   4940        uaddlp          v3.8h,   v3.16b
   4941        shl             v0.8h,   v0.8h,   #2
   4942        shl             v1.8h,   v1.8h,   #2
   4943        shl             v2.8h,   v2.8h,   #2
   4944        shl             v3.8h,   v3.8h,   #2
   4945        subs            w8,  w8,  #2
   4946        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4947        add             v16.8h,  v16.8h,  v0.8h
   4948        add             v17.8h,  v17.8h,  v1.8h
   4949        add             v18.8h,  v18.8h,  v2.8h
   4950        add             v19.8h,  v19.8h,  v3.8h
   4951        b.gt            1b
   4952        mov             v0.16b,  v2.16b
   4953        mov             v1.16b,  v3.16b
   4954        b               L(ipred_cfl_ac_420_w16_hpad)
   4955 
   4956 L(ipred_cfl_ac_422_w16_wpad1):
   4957        AARCH64_VALID_JUMP_TARGET
   4958 1:      // Copy and subsample input, padding 4
   4959        ldr             d1,  [x1,  #16]
   4960        ld1             {v0.16b}, [x1],  x2
   4961        ldr             d3,  [x10, #16]
   4962        ld1             {v2.16b}, [x10], x2
   4963        uaddlp          v1.4h,   v1.8b
   4964        uaddlp          v0.8h,   v0.16b
   4965        uaddlp          v3.4h,   v3.8b
   4966        uaddlp          v2.8h,   v2.16b
   4967        shl             v1.4h,   v1.4h,   #2
   4968        shl             v0.8h,   v0.8h,   #2
   4969        shl             v3.4h,   v3.4h,   #2
   4970        shl             v2.8h,   v2.8h,   #2
        // Fill the high half of v1/v3 with the last valid value of each row.
   4971        dup             v4.4h,   v1.h[3]
   4972        dup             v5.4h,   v3.h[3]
   4973        trn1            v1.2d,   v1.2d,   v4.2d
   4974        trn1            v3.2d,   v3.2d,   v5.2d
   4975        subs            w8,  w8,  #2
   4976        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4977        add             v16.8h,  v16.8h,  v0.8h
   4978        add             v17.8h,  v17.8h,  v1.8h
   4979        add             v18.8h,  v18.8h,  v2.8h
   4980        add             v19.8h,  v19.8h,  v3.8h
   4981        b.gt            1b
   4982        mov             v0.16b,  v2.16b
   4983        mov             v1.16b,  v3.16b
   4984        b               L(ipred_cfl_ac_420_w16_hpad)
   4985 
   4986 L(ipred_cfl_ac_422_w16_wpad2):
   4987        AARCH64_VALID_JUMP_TARGET
   4988 1:      // Copy and subsample input, padding 8
   4989        ld1             {v0.16b}, [x1],  x2
   4990        ld1             {v2.16b}, [x10], x2
   4991        uaddlp          v0.8h,   v0.16b
   4992        uaddlp          v2.8h,   v2.16b
   4993        shl             v0.8h,   v0.8h,   #2
   4994        shl             v2.8h,   v2.8h,   #2
        // Pad the right half of each row with its last valid value.
   4995        dup             v1.8h,   v0.h[7]
   4996        dup             v3.8h,   v2.h[7]
   4997        subs            w8,  w8,  #2
   4998        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   4999        add             v16.8h,  v16.8h,  v0.8h
   5000        add             v17.8h,  v17.8h,  v1.8h
   5001        add             v18.8h,  v18.8h,  v2.8h
   5002        add             v19.8h,  v19.8h,  v3.8h
   5003        b.gt            1b
   5004        mov             v0.16b,  v2.16b
   5005        mov             v1.16b,  v3.16b
   5006        b               L(ipred_cfl_ac_420_w16_hpad)
   5007 
   5008 L(ipred_cfl_ac_422_w16_wpad3):
   5009        AARCH64_VALID_JUMP_TARGET
   5010 1:      // Copy and subsample input, padding 12
   5011        ld1             {v0.8b}, [x1],  x2
   5012        ld1             {v2.8b}, [x10], x2
   5013        uaddlp          v0.4h,   v0.8b
   5014        uaddlp          v2.4h,   v2.8b
   5015        shl             v0.4h,   v0.4h,   #2
   5016        shl             v2.4h,   v2.4h,   #2
        // Only 4 valid values per row: replicate the last one across the
        // remaining 12 output entries.
   5017        dup             v1.8h,   v0.h[3]
   5018        dup             v3.8h,   v2.h[3]
   5019        trn1            v0.2d,   v0.2d,   v1.2d
   5020        trn1            v2.2d,   v2.2d,   v3.2d
   5021        subs            w8,  w8,  #2
   5022        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5023        add             v16.8h,  v16.8h,  v0.8h
   5024        add             v17.8h,  v17.8h,  v1.8h
   5025        add             v18.8h,  v18.8h,  v2.8h
   5026        add             v19.8h,  v19.8h,  v3.8h
   5027        b.gt            1b
   5028        mov             v0.16b,  v2.16b
   5029        mov             v1.16b,  v3.16b
   5030        b               L(ipred_cfl_ac_420_w16_hpad)
   5031 endfunc
   5032 
   5033 jumptable ipred_cfl_ac_422_tbl
        // Indexed by clz(width)-27: entry 0 = w16, 1 = w8, 2 = w4.
   5034        .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl
   5035        .word L(ipred_cfl_ac_422_w8) - ipred_cfl_ac_422_tbl
   5036        .word L(ipred_cfl_ac_422_w4) - ipred_cfl_ac_422_tbl
   5037 endjumptable
   5038 
   5039 jumptable ipred_cfl_ac_422_w16_tbl
        // Indexed directly by w_pad (0-3): loops padding 0/4/8/12 output
        // columns on the right edge.
   5040        .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl
   5041        .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl
   5042        .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl
   5043        .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl
   5044 endjumptable
   5045 
   5046 // void cfl_ac_444_8bpc_neon(int16_t *const ac, const pixel *const ypx,
   5047 //                           const ptrdiff_t stride, const int w_pad,
   5048 //                           const int h_pad, const int cw, const int ch);
        //
        // Fill the CfL AC buffer for 4:4:4: no subsampling, each pixel is
        // widened and shifted left by 3 (ushll #3), i.e. the same 8x
        // fixed-point weight as the 4:2:0/4:2:2 variants. v16-v19 accumulate
        // a running sum for the later DC subtraction; the w4-w16 paths share
        // the 4:2:0 tail code, while w32 aggregates below with wider (32-bit)
        // intermediates since the 16-bit sums could otherwise overflow.
        // Register roles after setup:
        //   x0 = ac output, x1/x10 = even/odd source rows, x2 = 2*stride,
        //   w3 = w_pad, w4 = h_pad in rows, w8 = height - h_pad,
        //   v31.4s = -log2(width*height).
   5049 function ipred_cfl_ac_444_8bpc_neon, export=1
   5050        clz             w8,  w5
   5051        lsl             w4,  w4,  #2         // h_pad: units of 4 -> rows
   5052        movrel          x7,  ipred_cfl_ac_444_tbl
   5053        sub             w8,  w8,  #26        // index: 0=w32, 1=w16, 2=w8, 3=w4
   5054        ldrsw           x8,  [x7, w8, uxtw #2]
   5055        movi            v16.8h,  #0          // clear sum accumulators
   5056        movi            v17.8h,  #0
   5057        movi            v18.8h,  #0
   5058        movi            v19.8h,  #0
   5059        add             x7,  x7,  x8
   5060        sub             w8,  w6,  w4         // height - h_pad
   5061        rbit            w9,  w5              // rbit(width)
   5062        rbit            w10, w6              // rbit(height)
   5063        clz             w9,  w9              // ctz(width)
   5064        clz             w10, w10             // ctz(height)
   5065        add             w9,  w9,  w10        // log2sz
   5066        add             x10, x1,  x2         // x10 walks the odd rows
   5067        dup             v31.4s,  w9
   5068        lsl             x2,  x2,  #1         // advance two rows per load
   5069        neg             v31.4s,  v31.4s      // -log2sz
   5070        br              x7
   5071 
   5072 L(ipred_cfl_ac_444_w4):
   5073        AARCH64_VALID_JUMP_TARGET
   5074 1:      // Copy and expand input
   5075        ld1             {v0.s}[0], [x1],  x2
   5076        ld1             {v0.s}[1], [x10], x2
   5077        ld1             {v1.s}[0], [x1],  x2
   5078        ld1             {v1.s}[1], [x10], x2
   5079        ushll           v0.8h,   v0.8b,   #3
   5080        ushll           v1.8h,   v1.8b,   #3
   5081        subs            w8,  w8,  #4
   5082        add             v16.8h,  v16.8h,  v0.8h
   5083        add             v17.8h,  v17.8h,  v1.8h
   5084        st1             {v0.8h, v1.8h}, [x0], #32
   5085        b.gt            1b
        // Broadcast the last output row (high half of v1) into both halves
        // of v0/v1 for the shared vertical-padding loop.
   5086        trn2            v0.2d,   v1.2d,   v1.2d
   5087        trn2            v1.2d,   v1.2d,   v1.2d
   5088        b               L(ipred_cfl_ac_420_w4_hpad)
   5089 
   5090 L(ipred_cfl_ac_444_w8):
   5091        AARCH64_VALID_JUMP_TARGET
   5092 1:      // Copy and expand input
   5093        ld1             {v0.8b}, [x1],  x2
   5094        ld1             {v1.8b}, [x10], x2
   5095        ld1             {v2.8b}, [x1],  x2
   5096        ushll           v0.8h,   v0.8b,   #3
   5097        ld1             {v3.8b}, [x10], x2
   5098        ushll           v1.8h,   v1.8b,   #3
   5099        ushll           v2.8h,   v2.8b,   #3
   5100        ushll           v3.8h,   v3.8b,   #3
   5101        subs            w8,  w8,  #4
   5102        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5103        add             v16.8h,  v16.8h,  v0.8h
   5104        add             v17.8h,  v17.8h,  v1.8h
   5105        add             v18.8h,  v18.8h,  v2.8h
   5106        add             v19.8h,  v19.8h,  v3.8h
   5107        b.gt            1b
        // Keep the last produced row in v0/v1 for the vertical-padding tail.
   5108        mov             v0.16b,  v3.16b
   5109        mov             v1.16b,  v3.16b
   5110        b               L(ipred_cfl_ac_420_w8_hpad)
   5111 
   5112 L(ipred_cfl_ac_444_w16):
   5113        AARCH64_VALID_JUMP_TARGET
   5114        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
   5115 1:      // Copy and expand input, without padding
   5116        ld1             {v0.16b}, [x1],  x2
   5117        ld1             {v2.16b}, [x10], x2
   5118        ld1             {v4.16b}, [x1],  x2
   5119        ushll2          v1.8h,   v0.16b,  #3
   5120        ushll           v0.8h,   v0.8b,   #3
   5121        ld1             {v6.16b}, [x10], x2
   5122        ushll2          v3.8h,   v2.16b,  #3
   5123        ushll           v2.8h,   v2.8b,   #3
   5124        ushll2          v5.8h,   v4.16b,  #3
   5125        ushll           v4.8h,   v4.8b,   #3
   5126        ushll2          v7.8h,   v6.16b,  #3
   5127        ushll           v6.8h,   v6.8b,   #3
   5128        subs            w8,  w8,  #4
   5129        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5130        add             v16.8h,  v16.8h,  v0.8h
   5131        add             v17.8h,  v17.8h,  v1.8h
   5132        add             v18.8h,  v18.8h,  v2.8h
   5133        add             v19.8h,  v19.8h,  v3.8h
   5134        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
   5135        add             v16.8h,  v16.8h,  v4.8h
   5136        add             v17.8h,  v17.8h,  v5.8h
   5137        add             v18.8h,  v18.8h,  v6.8h
   5138        add             v19.8h,  v19.8h,  v7.8h
   5139        b.gt            1b
        // Duplicate the last row (v6:v7) into v0-v3 (two padded rows' worth)
        // for the shared vertical-padding tail.
   5140        mov             v0.16b,  v6.16b
   5141        mov             v1.16b,  v7.16b
   5142        mov             v2.16b,  v6.16b
   5143        mov             v3.16b,  v7.16b
   5144        b               L(ipred_cfl_ac_420_w16_hpad)
   5145 
   5146 L(ipred_cfl_ac_444_w16_wpad):
   5147 1:      // Copy and expand input, padding 8
   5148        ld1             {v0.8b}, [x1],  x2
   5149        ld1             {v2.8b}, [x10], x2
   5150        ld1             {v4.8b}, [x1],  x2
   5151        ld1             {v6.8b}, [x10], x2
   5152        ushll           v0.8h,   v0.8b,   #3
   5153        ushll           v2.8h,   v2.8b,   #3
   5154        ushll           v4.8h,   v4.8b,   #3
   5155        ushll           v6.8h,   v6.8b,   #3
        // Pad the right half of each row with its last valid value.
   5156        dup             v1.8h,   v0.h[7]
   5157        dup             v3.8h,   v2.h[7]
   5158        dup             v5.8h,   v4.h[7]
   5159        dup             v7.8h,   v6.h[7]
   5160        subs            w8,  w8,  #4
   5161        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5162        add             v16.8h,  v16.8h,  v0.8h
   5163        add             v17.8h,  v17.8h,  v1.8h
   5164        add             v18.8h,  v18.8h,  v2.8h
   5165        add             v19.8h,  v19.8h,  v3.8h
   5166        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
   5167        add             v16.8h,  v16.8h,  v4.8h
   5168        add             v17.8h,  v17.8h,  v5.8h
   5169        add             v18.8h,  v18.8h,  v6.8h
   5170        add             v19.8h,  v19.8h,  v7.8h
   5171        b.gt            1b
   5172        mov             v0.16b,  v6.16b
   5173        mov             v1.16b,  v7.16b
   5174        mov             v2.16b,  v6.16b
   5175        mov             v3.16b,  v7.16b
   5176        b               L(ipred_cfl_ac_420_w16_hpad)
   5177 
   5178 L(ipred_cfl_ac_444_w32):
   5179        AARCH64_VALID_JUMP_TARGET
        // For w32, w_pad is even (table entries exist for 0/2/4/6 only);
        // halve it to index the 4-entry dispatch table.
   5180        movrel          x7,  ipred_cfl_ac_444_w32_tbl
   5181        lsr             w3,  w3,  #1
   5182        ldrsw           x3,  [x7, w3, uxtw #2]
   5183        add             x7,  x7,  x3
   5184        br              x7
   5185 
   5186 L(ipred_cfl_ac_444_w32_wpad0):
   5187        AARCH64_VALID_JUMP_TARGET
   5188 1:      // Copy and expand input, without padding
   5189        ld1             {v2.16b, v3.16b}, [x1],  x2
   5190        ld1             {v6.16b, v7.16b}, [x10], x2
   5191        ushll           v0.8h,   v2.8b,   #3
   5192        ushll2          v1.8h,   v2.16b,  #3
   5193        ushll           v2.8h,   v3.8b,   #3
   5194        ushll2          v3.8h,   v3.16b,  #3
   5195        ushll           v4.8h,   v6.8b,   #3
   5196        ushll2          v5.8h,   v6.16b,  #3
   5197        ushll           v6.8h,   v7.8b,   #3
   5198        ushll2          v7.8h,   v7.16b,  #3
   5199        subs            w8,  w8,  #2
   5200        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5201        add             v16.8h,  v16.8h,  v0.8h
   5202        add             v17.8h,  v17.8h,  v1.8h
   5203        add             v18.8h,  v18.8h,  v2.8h
   5204        add             v19.8h,  v19.8h,  v3.8h
   5205        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
   5206        add             v16.8h,  v16.8h,  v4.8h
   5207        add             v17.8h,  v17.8h,  v5.8h
   5208        add             v18.8h,  v18.8h,  v6.8h
   5209        add             v19.8h,  v19.8h,  v7.8h
   5210        b.gt            1b
   5211        b               L(ipred_cfl_ac_444_w32_hpad)
   5212 
   5213 L(ipred_cfl_ac_444_w32_wpad2):
   5214        AARCH64_VALID_JUMP_TARGET
   5215 1:      // Copy and expand input, padding 8
   5216        ldr             d2,  [x1,  #16]
   5217        ld1             {v1.16b}, [x1],  x2
   5218        ldr             d6,  [x10, #16]
   5219        ld1             {v5.16b}, [x10], x2
   5220        ushll           v2.8h,   v2.8b,   #3
   5221        ushll           v0.8h,   v1.8b,   #3
   5222        ushll2          v1.8h,   v1.16b,  #3
   5223        ushll           v6.8h,   v6.8b,   #3
   5224        ushll           v4.8h,   v5.8b,   #3
   5225        ushll2          v5.8h,   v5.16b,  #3
        // Pad the last 8 entries of each row with the last valid value.
   5226        dup             v3.8h,   v2.h[7]
   5227        dup             v7.8h,   v6.h[7]
   5228        subs            w8,  w8,  #2
   5229        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5230        add             v16.8h,  v16.8h,  v0.8h
   5231        add             v17.8h,  v17.8h,  v1.8h
   5232        add             v18.8h,  v18.8h,  v2.8h
   5233        add             v19.8h,  v19.8h,  v3.8h
   5234        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
   5235        add             v16.8h,  v16.8h,  v4.8h
   5236        add             v17.8h,  v17.8h,  v5.8h
   5237        add             v18.8h,  v18.8h,  v6.8h
   5238        add             v19.8h,  v19.8h,  v7.8h
   5239        b.gt            1b
   5240        b               L(ipred_cfl_ac_444_w32_hpad)
   5241 
   5242 L(ipred_cfl_ac_444_w32_wpad4):
   5243        AARCH64_VALID_JUMP_TARGET
   5244 1:      // Copy and expand input, padding 16
   5245        ld1             {v1.16b}, [x1],  x2
   5246        ld1             {v5.16b}, [x10], x2
   5247        ushll           v0.8h,   v1.8b,   #3
   5248        ushll2          v1.8h,   v1.16b,  #3
   5249        ushll           v4.8h,   v5.8b,   #3
   5250        ushll2          v5.8h,   v5.16b,  #3
        // Pad the right half (16 entries) of each row with the last value.
   5251        dup             v2.8h,   v1.h[7]
   5252        dup             v3.8h,   v1.h[7]
   5253        dup             v6.8h,   v5.h[7]
   5254        dup             v7.8h,   v5.h[7]
   5255        subs            w8,  w8,  #2
   5256        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5257        add             v16.8h,  v16.8h,  v0.8h
   5258        add             v17.8h,  v17.8h,  v1.8h
   5259        add             v18.8h,  v18.8h,  v2.8h
   5260        add             v19.8h,  v19.8h,  v3.8h
   5261        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
   5262        add             v16.8h,  v16.8h,  v4.8h
   5263        add             v17.8h,  v17.8h,  v5.8h
   5264        add             v18.8h,  v18.8h,  v6.8h
   5265        add             v19.8h,  v19.8h,  v7.8h
   5266        b.gt            1b
   5267        b               L(ipred_cfl_ac_444_w32_hpad)
   5268 
   5269 L(ipred_cfl_ac_444_w32_wpad6):
   5270        AARCH64_VALID_JUMP_TARGET
   5271 1:      // Copy and expand input, padding 24
   5272        ld1             {v0.8b}, [x1],  x2
   5273        ld1             {v4.8b}, [x10], x2
   5274        ushll           v0.8h,   v0.8b,   #3
   5275        ushll           v4.8h,   v4.8b,   #3
        // Only 8 valid entries per row: replicate the last one across the
        // remaining 24.
   5276        dup             v1.8h,   v0.h[7]
   5277        dup             v2.8h,   v0.h[7]
   5278        dup             v3.8h,   v0.h[7]
   5279        dup             v5.8h,   v4.h[7]
   5280        dup             v6.8h,   v4.h[7]
   5281        dup             v7.8h,   v4.h[7]
   5282        subs            w8,  w8,  #2
   5283        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5284        add             v16.8h,  v16.8h,  v0.8h
   5285        add             v17.8h,  v17.8h,  v1.8h
   5286        add             v18.8h,  v18.8h,  v2.8h
   5287        add             v19.8h,  v19.8h,  v3.8h
   5288        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
   5289        add             v16.8h,  v16.8h,  v4.8h
   5290        add             v17.8h,  v17.8h,  v5.8h
   5291        add             v18.8h,  v18.8h,  v6.8h
   5292        add             v19.8h,  v19.8h,  v7.8h
   5293        b.gt            1b
   5294 
   5295 L(ipred_cfl_ac_444_w32_hpad):
   5296        cbz             w4,  3f
        // v4-v7 still hold the last produced row; replicate it downwards for
        // h_pad rows (two rows per iteration), still accumulating the sums.
   5297 2:      // Vertical padding (h_pad > 0)
   5298        subs            w4,  w4,  #2
   5299        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
   5300        add             v16.8h,  v16.8h,  v4.8h
   5301        add             v17.8h,  v17.8h,  v5.8h
   5302        add             v18.8h,  v18.8h,  v6.8h
   5303        add             v19.8h,  v19.8h,  v7.8h
   5304        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], #64
   5305        add             v16.8h,  v16.8h,  v4.8h
   5306        add             v17.8h,  v17.8h,  v5.8h
   5307        add             v18.8h,  v18.8h,  v6.8h
   5308        add             v19.8h,  v19.8h,  v7.8h
   5309        b.gt            2b
   5310 3:
   5311 
   5312        // Quadruple the height and reuse the w8 subtracting
   5313        lsl             w6,  w6,  #2
   5314        // Aggregate the sums, with wider intermediates earlier than in
   5315        // ipred_cfl_ac_420_w8_calc_subtract_dc.
   5316        uaddlp          v0.4s,   v16.8h
   5317        uaddlp          v1.4s,   v17.8h
   5318        uaddlp          v2.4s,   v18.8h
   5319        uaddlp          v3.4s,   v19.8h
   5320        add             v0.4s,   v0.4s,   v1.4s
   5321        add             v2.4s,   v2.4s,   v3.4s
   5322        add             v0.4s,   v0.4s,   v2.4s
   5323        addv            s0,  v0.4s                // sum
   5324        sub             x0,  x0,  w6, uxtw #4     // step x0 back to the buffer start
   5325        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1))) >>= log2sz
   5326        dup             v4.8h,   v4.h[0]
   5327        b               L(ipred_cfl_ac_420_w8_subtract_dc)
endfunc
   5329 
   5330 jumptable ipred_cfl_ac_444_tbl
        // Indexed by clz(width)-26: entry 0 = w32, 1 = w16, 2 = w8, 3 = w4.
   5331        .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl
   5332        .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl
   5333        .word L(ipred_cfl_ac_444_w8)  - ipred_cfl_ac_444_tbl
   5334        .word L(ipred_cfl_ac_444_w4)  - ipred_cfl_ac_444_tbl
   5335 endjumptable
   5336 
   5337 jumptable ipred_cfl_ac_444_w32_tbl
        // Indexed by w_pad/2 (w_pad is even for w32): loops padding
        // 0/8/16/24 output columns on the right edge.
   5338        .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl
   5339        .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl
   5340        .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl
   5341        .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl
   5342 endjumptable