tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ipred16.S (238393B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2019, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 
     31 // void ipred_dc_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
     32 //                              const pixel *const topleft,
     33 //                              const int width, const int height, const int a,
     34 //                              const int max_width, const int max_height,
     35 //                              const int bitdepth_max);
     36 function ipred_dc_128_16bpc_neon, export=1
        // DC_128 ("flat") prediction: fill the width x height block with the
        // mid value ((bitdepth_max + 1) >> 1); top/left neighbours are unused.
     37        ldr             w8,  [sp]                // w8 = bitdepth_max (9th argument, passed on the stack)
     38        clz             w3,  w3                  // leading zeros of width (w=64..4 -> 25..29)
     39        movrel          x5,  ipred_dc_128_tbl
     40        sub             w3,  w3,  #25            // table index: 0 (w=64) .. 4 (w=4)
     41        ldrsw           x3,  [x5, w3, uxtw #2]   // signed 32-bit offset relative to the table
     42        dup             v0.8h,   w8              // broadcast bitdepth_max to all lanes
     43        add             x5,  x5,  x3             // x5 = per-width branch target
     44        add             x6,  x0,  x1             // x6 = dst + stride (second row pointer)
     45        lsl             x1,  x1,  #1             // both pointers advance two rows per store
     46        urshr           v0.8h,   v0.8h,  #1      // (bitdepth_max + 1) >> 1 = mid-grey fill value
     47        br              x5
     48 40:
     49        AARCH64_VALID_JUMP_TARGET
     50 4:     // width 4: write 4 rows per iteration, two via each pointer
     51        st1             {v0.4h},  [x0], x1
     52        st1             {v0.4h},  [x6], x1
     53        subs            w4,  w4,  #4             // height -= 4
     54        st1             {v0.4h},  [x0], x1
     55        st1             {v0.4h},  [x6], x1
     56        b.gt            4b
     57        ret
     58 80:
     59        AARCH64_VALID_JUMP_TARGET
     60 8:     // width 8
     61        st1             {v0.8h},  [x0], x1
     62        st1             {v0.8h},  [x6], x1
     63        subs            w4,  w4,  #4
     64        st1             {v0.8h},  [x0], x1
     65        st1             {v0.8h},  [x6], x1
     66        b.gt            8b
     67        ret
     68 160:
     69        AARCH64_VALID_JUMP_TARGET
     70        mov             v1.16b,  v0.16b          // duplicate fill value for a 32-byte store
     71 16:    // width 16
     72        st1             {v0.8h, v1.8h}, [x0], x1
     73        st1             {v0.8h, v1.8h}, [x6], x1
     74        subs            w4,  w4,  #4
     75        st1             {v0.8h, v1.8h}, [x0], x1
     76        st1             {v0.8h, v1.8h}, [x6], x1
     77        b.gt            16b
     78        ret
     79 320:
     80        AARCH64_VALID_JUMP_TARGET
     81        mov             v1.16b,  v0.16b          // four identical vectors for a 64-byte store
     82        mov             v2.16b,  v0.16b
     83        mov             v3.16b,  v0.16b
     84 32:    // width 32
     85        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
     86        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
     87        subs            w4,  w4,  #4
     88        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
     89        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
     90        b.gt            32b
     91        ret
     92 640:
     93        AARCH64_VALID_JUMP_TARGET
     94        mov             v1.16b,  v0.16b
     95        mov             v2.16b,  v0.16b
     96        mov             v3.16b,  v0.16b
     97        sub             x1,  x1,  #64            // stride adjusted: first store post-increments by 64
     98 64:    // width 64: two 64-byte stores per row
     99        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    100        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    101        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    102        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    103        subs            w4,  w4,  #4
    104        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    105        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    106        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    107        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    108        b.gt            64b
    109        ret
    110 endfunc
    111 
    112 jumptable ipred_dc_128_tbl
        // Entries ordered w=64..4; indexed with clz(width) - 25 (see above).
    113        .word 640b - ipred_dc_128_tbl
    114        .word 320b - ipred_dc_128_tbl
    115        .word 160b - ipred_dc_128_tbl
    116        .word 80b  - ipred_dc_128_tbl
    117        .word 40b  - ipred_dc_128_tbl
    118 endjumptable
    119 
    120 // void ipred_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
    121 //                         const pixel *const topleft,
    122 //                         const int width, const int height, const int a,
    123 //                         const int max_width, const int max_height);
    124 function ipred_v_16bpc_neon, export=1
        // Vertical prediction: replicate the row of top neighbours
        // (topleft + 1 pixel) into every row of the block.
    125        clz             w3,  w3                  // leading zeros of width
    126        movrel          x5,  ipred_v_tbl
    127        sub             w3,  w3,  #25            // table index: 0 (w=64) .. 4 (w=4)
    128        ldrsw           x3,  [x5, w3, uxtw #2]
    129        add             x2,  x2,  #2             // skip the top-left corner pixel (2 bytes)
    130        add             x5,  x5,  x3             // x5 = per-width branch target
    131        add             x6,  x0,  x1             // x6 = dst + stride
    132        lsl             x1,  x1,  #1             // step two rows per pointer
    133        br              x5
    134 40:
    135        AARCH64_VALID_JUMP_TARGET
    136        ld1             {v0.4h},  [x2]           // load the 4 top pixels once
    137 4:     // width 4: write 4 rows per iteration
    138        st1             {v0.4h},  [x0], x1
    139        st1             {v0.4h},  [x6], x1
    140        subs            w4,  w4,  #4             // height -= 4
    141        st1             {v0.4h},  [x0], x1
    142        st1             {v0.4h},  [x6], x1
    143        b.gt            4b
    144        ret
    145 80:
    146        AARCH64_VALID_JUMP_TARGET
    147        ld1             {v0.8h},  [x2]           // 8 top pixels
    148 8:
    149        st1             {v0.8h},  [x0], x1
    150        st1             {v0.8h},  [x6], x1
    151        subs            w4,  w4,  #4
    152        st1             {v0.8h},  [x0], x1
    153        st1             {v0.8h},  [x6], x1
    154        b.gt            8b
    155        ret
    156 160:
    157        AARCH64_VALID_JUMP_TARGET
    158        ld1             {v0.8h, v1.8h}, [x2]     // 16 top pixels
    159 16:
    160        st1             {v0.8h, v1.8h}, [x0], x1
    161        st1             {v0.8h, v1.8h}, [x6], x1
    162        subs            w4,  w4,  #4
    163        st1             {v0.8h, v1.8h}, [x0], x1
    164        st1             {v0.8h, v1.8h}, [x6], x1
    165        b.gt            16b
    166        ret
    167 320:
    168        AARCH64_VALID_JUMP_TARGET
    169        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2] // 32 top pixels
    170 32:
    171        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    172        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    173        subs            w4,  w4,  #4
    174        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    175        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    176        b.gt            32b
    177        ret
    178 640:
    179        AARCH64_VALID_JUMP_TARGET
    180        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64 // first 32 top pixels
    181        sub             x1,  x1,  #64            // stride adjusted: row split into two 64-byte stores
    182        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]      // remaining 32 top pixels
    183 64:
    184        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    185        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    186        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
    187        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
    188        subs            w4,  w4,  #4
    189        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    190        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    191        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x0], x1
    192        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x6], x1
    193        b.gt            64b
    194        ret
    195 endfunc
    196 
    197 jumptable ipred_v_tbl
        // Entries ordered w=64..4; indexed with clz(width) - 25.
    198        .word 640b - ipred_v_tbl
    199        .word 320b - ipred_v_tbl
    200        .word 160b - ipred_v_tbl
    201        .word 80b  - ipred_v_tbl
    202        .word 40b  - ipred_v_tbl
    203 endjumptable
    204 
    205 // void ipred_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
    206 //                         const pixel *const topleft,
    207 //                         const int width, const int height, const int a,
    208 //                         const int max_width, const int max_height);
    209 function ipred_h_16bpc_neon, export=1
        // Horizontal prediction: each output row is filled with its left
        // neighbour pixel. Left pixels are read 4 at a time, walking
        // backwards through the topleft buffer (x7 = -8 bytes per step);
        // ld4r broadcasts each of the 4 loaded pixels into its own vector.
    210        clz             w3,  w3                  // leading zeros of width
    211        movrel          x5,  ipred_h_tbl
    212        sub             w3,  w3,  #25            // table index: 0 (w=64) .. 4 (w=4)
    213        ldrsw           x3,  [x5, w3, uxtw #2]
    214        sub             x2,  x2,  #8             // point at the first group of 4 left pixels
    215        add             x5,  x5,  x3             // x5 = per-width branch target
    216        mov             x7,  #-8                 // post-decrement: next 4 left pixels per iteration
    217        add             x6,  x0,  x1             // x6 = dst + stride
    218        lsl             x1,  x1,  #1             // step two rows per pointer
    219        br              x5
    220 40:
    221        AARCH64_VALID_JUMP_TARGET
    222 4:     // width 4: v3..v0 hold the 4 row values, topmost row first
    223        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
    224        st1             {v3.4h},  [x0], x1
    225        st1             {v2.4h},  [x6], x1
    226        subs            w4,  w4,  #4             // height -= 4
    227        st1             {v1.4h},  [x0], x1
    228        st1             {v0.4h},  [x6], x1
    229        b.gt            4b
    230        ret
    231 80:
    232        AARCH64_VALID_JUMP_TARGET
    233 8:     // width 8
    234        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
    235        st1             {v3.8h},  [x0], x1
    236        st1             {v2.8h},  [x6], x1
    237        subs            w4,  w4,  #4
    238        st1             {v1.8h},  [x0], x1
    239        st1             {v0.8h},  [x6], x1
    240        b.gt            8b
    241        ret
    242 160:
    243        AARCH64_VALID_JUMP_TARGET
    244 16:    // width 16: plain str for the upper half, st1 advances the pointer
    245        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
    246        str             q3,  [x0, #16]
    247        str             q2,  [x6, #16]
    248        st1             {v3.8h}, [x0], x1
    249        st1             {v2.8h}, [x6], x1
    250        subs            w4,  w4,  #4
    251        str             q1,  [x0, #16]
    252        str             q0,  [x6, #16]
    253        st1             {v1.8h}, [x0], x1
    254        st1             {v0.8h}, [x6], x1
    255        b.gt            16b
    256        ret
    257 320:
    258        AARCH64_VALID_JUMP_TARGET
    259 32:    // width 32: str/stp cover bytes 16..63, st1 stores bytes 0..15 and steps
    260        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
    261        str             q3,  [x0, #16]
    262        str             q2,  [x6, #16]
    263        stp             q3,  q3,  [x0, #32]
    264        stp             q2,  q2,  [x6, #32]
    265        st1             {v3.8h}, [x0], x1
    266        st1             {v2.8h}, [x6], x1
    267        subs            w4,  w4,  #4
    268        str             q1,  [x0, #16]
    269        str             q0,  [x6, #16]
    270        stp             q1,  q1,  [x0, #32]
    271        stp             q0,  q0,  [x6, #32]
    272        st1             {v1.8h}, [x0], x1
    273        st1             {v0.8h}, [x6], x1
    274        b.gt            32b
    275        ret
    276 640:
    277        AARCH64_VALID_JUMP_TARGET
    278 64:    // width 64: str/stp cover bytes 16..127, st1 stores bytes 0..15 and steps
    279        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7
    280        str             q3,  [x0, #16]
    281        str             q2,  [x6, #16]
    282        stp             q3,  q3,  [x0, #32]
    283        stp             q2,  q2,  [x6, #32]
    284        stp             q3,  q3,  [x0, #64]
    285        stp             q2,  q2,  [x6, #64]
    286        stp             q3,  q3,  [x0, #96]
    287        stp             q2,  q2,  [x6, #96]
    288        st1             {v3.8h}, [x0], x1
    289        st1             {v2.8h}, [x6], x1
    290        subs            w4,  w4,  #4
    291        str             q1,  [x0, #16]
    292        str             q0,  [x6, #16]
    293        stp             q1,  q1,  [x0, #32]
    294        stp             q0,  q0,  [x6, #32]
    295        stp             q1,  q1,  [x0, #64]
    296        stp             q0,  q0,  [x6, #64]
    297        stp             q1,  q1,  [x0, #96]
    298        stp             q0,  q0,  [x6, #96]
    299        st1             {v1.8h}, [x0], x1
    300        st1             {v0.8h}, [x6], x1
    301        b.gt            64b
    302        ret
    303 endfunc
    304 
    305 jumptable ipred_h_tbl
        // Entries ordered w=64..4; indexed with clz(width) - 25.
    306        .word 640b - ipred_h_tbl
    307        .word 320b - ipred_h_tbl
    308        .word 160b - ipred_h_tbl
    309        .word 80b  - ipred_h_tbl
    310        .word 40b  - ipred_h_tbl
    311 endjumptable
    312 
    313 // void ipred_dc_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
    314 //                              const pixel *const topleft,
    315 //                              const int width, const int height, const int a,
    316 //                              const int max_width, const int max_height);
    317 function ipred_dc_top_16bpc_neon, export=1
        // DC_TOP prediction: fill the block with the rounded average of the
        // `width` top neighbour pixels.
    318        clz             w3,  w3                  // leading zeros of width
    319        movrel          x5,  ipred_dc_top_tbl
    320        sub             w3,  w3,  #25            // table index: 0 (w=64) .. 4 (w=4)
    321        ldrsw           x3,  [x5, w3, uxtw #2]
    322        add             x2,  x2,  #2             // skip the top-left corner pixel
    323        add             x5,  x5,  x3             // x5 = per-width branch target
    324        add             x6,  x0,  x1             // x6 = dst + stride
    325        lsl             x1,  x1,  #1             // step two rows per pointer
    326        br              x5
    327 40:
    328        AARCH64_VALID_JUMP_TARGET
    329        ld1             {v0.4h},  [x2]
    330        addv            h0,      v0.4h           // sum of the 4 top pixels
    331        urshr           v0.4h,   v0.4h,   #2     // rounded average: (sum + 2) >> 2
    332        dup             v0.4h,   v0.h[0]         // broadcast the DC value
    333 4:
    334        st1             {v0.4h},  [x0], x1
    335        st1             {v0.4h},  [x6], x1
    336        subs            w4,  w4,  #4             // height -= 4
    337        st1             {v0.4h},  [x0], x1
    338        st1             {v0.4h},  [x6], x1
    339        b.gt            4b
    340        ret
    341 80:
    342        AARCH64_VALID_JUMP_TARGET
    343        ld1             {v0.8h},  [x2]
    344        addv            h0,      v0.8h           // sum of 8 top pixels
    345        urshr           v0.4h,   v0.4h,   #3     // (sum + 4) >> 3
    346        dup             v0.8h,   v0.h[0]
    347 8:
    348        st1             {v0.8h},  [x0], x1
    349        st1             {v0.8h},  [x6], x1
    350        subs            w4,  w4,  #4
    351        st1             {v0.8h},  [x0], x1
    352        st1             {v0.8h},  [x6], x1
    353        b.gt            8b
    354        ret
    355 160:
    356        AARCH64_VALID_JUMP_TARGET
    357        ld1             {v0.8h, v1.8h}, [x2]
    358        addp            v0.8h,   v0.8h,   v1.8h  // pairwise add to fold 16 pixels into 8
    359        addv            h0,      v0.8h           // sum of 16 top pixels
    360        urshr           v2.4h,   v0.4h,   #4     // (sum + 8) >> 4
    361        dup             v0.8h,   v2.h[0]
    362        dup             v1.8h,   v2.h[0]
    363 16:
    364        st1             {v0.8h, v1.8h}, [x0], x1
    365        st1             {v0.8h, v1.8h}, [x6], x1
    366        subs            w4,  w4,  #4
    367        st1             {v0.8h, v1.8h}, [x0], x1
    368        st1             {v0.8h, v1.8h}, [x6], x1
    369        b.gt            16b
    370        ret
    371 320:
    372        AARCH64_VALID_JUMP_TARGET
    373        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
    374        addp            v0.8h,   v0.8h,   v1.8h
    375        addp            v2.8h,   v2.8h,   v3.8h
    376        addp            v0.8h,   v0.8h,   v2.8h
    377        uaddlv          s0,      v0.8h           // widen to 32 bits: sum can exceed 16 bits
    378        rshrn           v4.4h,   v0.4s,   #5     // (sum + 16) >> 5, narrowed back to 16 bits
    379        dup             v0.8h,   v4.h[0]
    380        dup             v1.8h,   v4.h[0]
    381        dup             v2.8h,   v4.h[0]
    382        dup             v3.8h,   v4.h[0]
    383 32:
    384        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    385        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    386        subs            w4,  w4,  #4
    387        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    388        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    389        b.gt            32b
    390        ret
    391 640:
    392        AARCH64_VALID_JUMP_TARGET
    393        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
    394        addp            v0.8h,   v0.8h,   v1.8h  // fold the 64 top pixels pairwise
    395        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
    396        addp            v2.8h,   v2.8h,   v3.8h
    397        addp            v4.8h,   v4.8h,   v5.8h
    398        addp            v6.8h,   v6.8h,   v7.8h
    399        addp            v0.8h,   v0.8h,   v2.8h
    400        addp            v4.8h,   v4.8h,   v6.8h
    401        addp            v0.8h,   v0.8h,   v4.8h
    402        uaddlv          s0,      v0.8h           // 32-bit total of 64 pixels
    403        rshrn           v4.4h,   v0.4s,   #6     // (sum + 32) >> 6
    404        sub             x1,  x1,  #64            // stride adjusted: row split into two 64-byte stores
    405        dup             v0.8h,   v4.h[0]
    406        dup             v1.8h,   v4.h[0]
    407        dup             v2.8h,   v4.h[0]
    408        dup             v3.8h,   v4.h[0]
    409 64:
    410        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    411        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    412        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    413        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    414        subs            w4,  w4,  #4
    415        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    416        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    417        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    418        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    419        b.gt            64b
    420        ret
    421 endfunc
    422 
    423 jumptable ipred_dc_top_tbl
        // Entries ordered w=64..4; indexed with clz(width) - 25.
    424        .word 640b - ipred_dc_top_tbl
    425        .word 320b - ipred_dc_top_tbl
    426        .word 160b - ipred_dc_top_tbl
    427        .word 80b  - ipred_dc_top_tbl
    428        .word 40b  - ipred_dc_top_tbl
    429 endjumptable
    430 
    431 // void ipred_dc_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
    432 //                               const pixel *const topleft,
    433 //                               const int width, const int height, const int a,
    434 //                               const int max_width, const int max_height);
    435 function ipred_dc_left_16bpc_neon, export=1
        // DC_LEFT prediction: fill the block with the rounded average of the
        // `height` left neighbour pixels. Two-stage dispatch: branch first to
        // the per-height sum code (via x5), which then branches to the
        // per-width store loop (via x3).
    436        sub             x2,  x2,  w4, uxtw #1    // x2 = start of the left pixel column (topleft - 2*height)
    437        clz             w3,  w3                  // leading zeros of width
    438        clz             w7,  w4                  // leading zeros of height
    439        movrel          x5,  ipred_dc_left_tbl
    440        sub             w3,  w3,  #20 // 25 leading bits, minus table offset 5
    441        sub             w7,  w7,  #25            // height index: 0 (h=64) .. 4 (h=4)
    442        ldrsw           x3,  [x5, w3, uxtw #2]   // x3 = offset of the wN store loop
    443        ldrsw           x7,  [x5, w7, uxtw #2]   // x7 = offset of the hN sum code
    444        add             x3,  x5,  x3             // x3 = store-loop target
    445        add             x5,  x5,  x7             // x5 = sum-code target
    446        add             x6,  x0,  x1             // x6 = dst + stride
    447        lsl             x1,  x1,  #1             // step two rows per pointer
    448        br              x5
    449 
    450 L(ipred_dc_left_h4):
    451        AARCH64_VALID_JUMP_TARGET
    452        ld1             {v0.4h},  [x2]
    453        addv            h0,      v0.4h           // sum of 4 left pixels
    454        urshr           v0.4h,   v0.4h,   #2     // rounded average: (sum + 2) >> 2
    455        dup             v0.8h,   v0.h[0]         // broadcast the DC value
    456        br              x3                       // jump to the per-width store loop
    457 L(ipred_dc_left_w4):
    458        AARCH64_VALID_JUMP_TARGET
    459 1:
    460        st1             {v0.4h},  [x0], x1
    461        st1             {v0.4h},  [x6], x1
    462        subs            w4,  w4,  #4             // height -= 4
    463        st1             {v0.4h},  [x0], x1
    464        st1             {v0.4h},  [x6], x1
    465        b.gt            1b
    466        ret
    467 
    468 L(ipred_dc_left_h8):
    469        AARCH64_VALID_JUMP_TARGET
    470        ld1             {v0.8h},  [x2]
    471        addv            h0,      v0.8h           // sum of 8 left pixels
    472        urshr           v0.4h,   v0.4h,   #3     // (sum + 4) >> 3
    473        dup             v0.8h,   v0.h[0]
    474        br              x3
    475 L(ipred_dc_left_w8):
    476        AARCH64_VALID_JUMP_TARGET
    477 1:
    478        st1             {v0.8h},  [x0], x1
    479        st1             {v0.8h},  [x6], x1
    480        subs            w4,  w4,  #4
    481        st1             {v0.8h},  [x0], x1
    482        st1             {v0.8h},  [x6], x1
    483        b.gt            1b
    484        ret
    485 
    486 L(ipred_dc_left_h16):
    487        AARCH64_VALID_JUMP_TARGET
    488        ld1             {v0.8h, v1.8h}, [x2]
    489        addp            v0.8h,   v0.8h,   v1.8h  // fold 16 pixels into 8 lanes
    490        addv            h0,      v0.8h           // sum of 16 left pixels
    491        urshr           v2.4h,   v0.4h,   #4     // (sum + 8) >> 4
    492        dup             v0.8h,   v2.h[0]
    493        dup             v1.8h,   v2.h[0]
    494        br              x3
    495 L(ipred_dc_left_w16):
    496        AARCH64_VALID_JUMP_TARGET
    497        mov             v1.16b,  v0.16b          // pair of vectors for 32-byte stores
    498 1:
    499        st1             {v0.8h, v1.8h}, [x0], x1
    500        st1             {v0.8h, v1.8h}, [x6], x1
    501        subs            w4,  w4,  #4
    502        st1             {v0.8h, v1.8h}, [x0], x1
    503        st1             {v0.8h, v1.8h}, [x6], x1
    504        b.gt            1b
    505        ret
    506 
    507 L(ipred_dc_left_h32):
    508        AARCH64_VALID_JUMP_TARGET
    509        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2]
    510        addp            v0.8h,   v0.8h,   v1.8h
    511        addp            v2.8h,   v2.8h,   v3.8h
    512        addp            v0.8h,   v0.8h,   v2.8h
    513        uaddlp          v0.4s,   v0.8h           // widen to 32 bits: sum can exceed 16 bits
    514        addv            s0,      v0.4s           // 32-bit total of 32 pixels
    515        rshrn           v4.4h,   v0.4s,   #5     // (sum + 16) >> 5, narrowed to 16 bits
    516        dup             v0.8h,   v4.h[0]
    517        br              x3
    518 L(ipred_dc_left_w32):
    519        AARCH64_VALID_JUMP_TARGET
    520        mov             v1.16b,  v0.16b          // four identical vectors for 64-byte stores
    521        mov             v2.16b,  v0.16b
    522        mov             v3.16b,  v0.16b
    523 1:
    524        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    525        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    526        subs            w4,  w4,  #4
    527        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    528        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    529        b.gt            1b
    530        ret
    531 
    532 L(ipred_dc_left_h64):
    533        AARCH64_VALID_JUMP_TARGET
    534        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
    535        addp            v0.8h,   v0.8h,   v1.8h  // fold the 64 left pixels pairwise
    536        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2]
    537        addp            v2.8h,   v2.8h,   v3.8h
    538        addp            v4.8h,   v4.8h,   v5.8h
    539        addp            v6.8h,   v6.8h,   v7.8h
    540        addp            v0.8h,   v0.8h,   v2.8h
    541        addp            v4.8h,   v4.8h,   v6.8h
    542        addp            v0.8h,   v0.8h,   v4.8h
    543        uaddlv          s0,      v0.8h           // 32-bit total of 64 pixels
    544        rshrn           v4.4h,   v0.4s,   #6     // (sum + 32) >> 6
    545        dup             v0.8h,   v4.h[0]
    546        br              x3
    547 L(ipred_dc_left_w64):
    548        AARCH64_VALID_JUMP_TARGET
    549        mov             v1.16b,  v0.16b
    550        mov             v2.16b,  v0.16b
    551        mov             v3.16b,  v0.16b
    552        sub             x1,  x1,  #64            // stride adjusted: row split into two 64-byte stores
    553 1:
    554        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    555        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    556        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    557        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    558        subs            w4,  w4,  #4
    559        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    560        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    561        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    562        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    563        b.gt            1b
    564        ret
    565 endfunc
    566 
    567 jumptable ipred_dc_left_tbl
        // First 5 entries: per-height sum code, indexed clz(height) - 25.
        // Last 5 entries: per-width store loops, indexed clz(width) - 20
        // (i.e. the same index plus the table offset 5).
    568        .word L(ipred_dc_left_h64) - ipred_dc_left_tbl
    569        .word L(ipred_dc_left_h32) - ipred_dc_left_tbl
    570        .word L(ipred_dc_left_h16) - ipred_dc_left_tbl
    571        .word L(ipred_dc_left_h8)  - ipred_dc_left_tbl
    572        .word L(ipred_dc_left_h4)  - ipred_dc_left_tbl
    573        .word L(ipred_dc_left_w64) - ipred_dc_left_tbl
    574        .word L(ipred_dc_left_w32) - ipred_dc_left_tbl
    575        .word L(ipred_dc_left_w16) - ipred_dc_left_tbl
    576        .word L(ipred_dc_left_w8)  - ipred_dc_left_tbl
    577        .word L(ipred_dc_left_w4)  - ipred_dc_left_tbl
    578 endjumptable
    579 
    580 // void ipred_dc_16bpc_neon(pixel *dst, const ptrdiff_t stride,
    581 //                          const pixel *const topleft,
    582 //                          const int width, const int height, const int a,
    583 //                          const int max_width, const int max_height);
    584 function ipred_dc_16bpc_neon, export=1
    585        sub             x2,  x2,  w4, uxtw #1
    586        add             w7,  w3,  w4             // width + height
    587        clz             w3,  w3
    588        clz             w6,  w4
    589        dup             v16.4s, w7               // width + height
    590        movrel          x5,  ipred_dc_tbl
    591        rbit            w7,  w7                  // rbit(width + height)
    592        sub             w3,  w3,  #20            // 25 leading bits, minus table offset 5
    593        sub             w6,  w6,  #25
    594        clz             w7,  w7                  // ctz(width + height)
    595        ldrsw           x3,  [x5, w3, uxtw #2]
    596        ldrsw           x6,  [x5, w6, uxtw #2]
    597        neg             w7,  w7                  // -ctz(width + height)
    598        add             x3,  x5,  x3
    599        add             x5,  x5,  x6
    600        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
    601        dup             v17.4s,  w7              // -ctz(width + height)
    602        add             x6,  x0,  x1
    603        lsl             x1,  x1,  #1
    604        br              x5
    605 
    606 L(ipred_dc_h4):
    607        AARCH64_VALID_JUMP_TARGET
    608        ld1             {v0.4h},  [x2], #8
    609        uaddlv          s0,      v0.4h
    610        add             x2,  x2,  #2
    611        br              x3
    612 L(ipred_dc_w4):
    613        AARCH64_VALID_JUMP_TARGET
    614        ld1             {v1.4h},  [x2]
    615        add             v0.2s,   v0.2s,   v16.2s
    616        uaddlv          s1,      v1.4h
    617        cmp             w4,  #4
    618        add             v0.2s,   v0.2s,   v1.2s
    619        ushl            v0.2s,   v0.2s,   v17.2s
    620        b.eq            1f
    621        // h = 8/16
    622        cmp             w4,  #16
    623        mov             w16, #0x6667
    624        mov             w17, #0xAAAB
    625        csel            w16, w16, w17, eq
    626        dup             v16.2s,  w16
    627        mul             v0.2s,   v0.2s,   v16.2s
    628        ushr            v0.2s,   v0.2s,   #17
    629 1:
    630        dup             v0.4h,   v0.h[0]
    631 2:
    632        st1             {v0.4h},  [x0], x1
    633        st1             {v0.4h},  [x6], x1
    634        subs            w4,  w4,  #4
    635        st1             {v0.4h},  [x0], x1
    636        st1             {v0.4h},  [x6], x1
    637        b.gt            2b
    638        ret
    639 
    640 L(ipred_dc_h8):
    641        AARCH64_VALID_JUMP_TARGET
    642        ld1             {v0.8h},  [x2], #16
    643        uaddlv          s0,      v0.8h
    644        add             x2,  x2,  #2
    645        br              x3
    646 L(ipred_dc_w8):
    647        AARCH64_VALID_JUMP_TARGET
    648        ld1             {v1.8h},  [x2]
    649        add             v0.2s,   v0.2s,   v16.2s
    650        uaddlv          s1,      v1.8h
    651        cmp             w4,  #8
    652        add             v0.2s,   v0.2s,   v1.2s
    653        ushl            v0.2s,   v0.2s,   v17.2s
    654        b.eq            1f
    655        // h = 4/16/32
    656        cmp             w4,  #32
    657        mov             w16, #0x6667
    658        mov             w17, #0xAAAB
    659        csel            w16, w16, w17, eq
    660        dup             v16.2s,  w16
    661        mul             v0.2s,   v0.2s,   v16.2s
    662        ushr            v0.2s,   v0.2s,   #17
    663 1:
    664        dup             v0.8h,   v0.h[0]
    665 2:
    666        st1             {v0.8h},  [x0], x1
    667        st1             {v0.8h},  [x6], x1
    668        subs            w4,  w4,  #4
    669        st1             {v0.8h},  [x0], x1
    670        st1             {v0.8h},  [x6], x1
    671        b.gt            2b
    672        ret
    673 
    674 L(ipred_dc_h16):
    675        AARCH64_VALID_JUMP_TARGET
    676        ld1             {v0.8h, v1.8h}, [x2], #32
    677        addp            v0.8h,   v0.8h,   v1.8h
    678        add             x2,  x2,  #2
    679        uaddlv          s0,      v0.8h
    680        br              x3
    681 L(ipred_dc_w16):
    682        AARCH64_VALID_JUMP_TARGET
    683        ld1             {v1.8h, v2.8h}, [x2]
    684        add             v0.2s,   v0.2s,   v16.2s
    685        addp            v1.8h,   v1.8h,   v2.8h
    686        uaddlv          s1,      v1.8h
    687        cmp             w4,  #16
    688        add             v0.2s,   v0.2s,   v1.2s
    689        ushl            v4.2s,   v0.2s,   v17.2s
    690        b.eq            1f
    691        // h = 4/8/32/64
    692        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
    693        mov             w16, #0x6667
    694        mov             w17, #0xAAAB
    695        csel            w16, w16, w17, eq
    696        dup             v16.2s,  w16
    697        mul             v4.2s,   v4.2s,   v16.2s
    698        ushr            v4.2s,   v4.2s,   #17
    699 1:
    700        dup             v0.8h,   v4.h[0]
    701        dup             v1.8h,   v4.h[0]
    702 2:
    703        st1             {v0.8h, v1.8h}, [x0], x1
    704        st1             {v0.8h, v1.8h}, [x6], x1
    705        subs            w4,  w4,  #4
    706        st1             {v0.8h, v1.8h}, [x0], x1
    707        st1             {v0.8h, v1.8h}, [x6], x1
    708        b.gt            2b
    709        ret
    710 
    711 L(ipred_dc_h32):
    712        AARCH64_VALID_JUMP_TARGET
    713        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
    714        addp            v0.8h,   v0.8h,   v1.8h
    715        addp            v2.8h,   v2.8h,   v3.8h
    716        addp            v0.8h,   v0.8h,   v2.8h
    717        add             x2,  x2,  #2
    718        uaddlv          s0,      v0.8h
    719        br              x3
    720 L(ipred_dc_w32):
    721        AARCH64_VALID_JUMP_TARGET
    722        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2]
    723        add             v0.2s,   v0.2s,   v16.2s
    724        addp            v1.8h,   v1.8h,   v2.8h
    725        addp            v3.8h,   v3.8h,   v4.8h
    726        addp            v1.8h,   v1.8h,   v3.8h
    727        uaddlv          s1,      v1.8h
    728        cmp             w4,  #32
    729        add             v0.2s,   v0.2s,   v1.2s
    730        ushl            v4.2s,   v0.2s,   v17.2s
    731        b.eq            1f
    732        // h = 8/16/64
    733        cmp             w4,  #8
    734        mov             w16, #0x6667
    735        mov             w17, #0xAAAB
    736        csel            w16, w16, w17, eq
    737        dup             v16.2s,  w16
    738        mul             v4.2s,   v4.2s,   v16.2s
    739        ushr            v4.2s,   v4.2s,   #17
    740 1:
    741        dup             v0.8h,   v4.h[0]
    742        dup             v1.8h,   v4.h[0]
    743        dup             v2.8h,   v4.h[0]
    744        dup             v3.8h,   v4.h[0]
    745 2:
    746        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    747        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    748        subs            w4,  w4,  #4
    749        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    750        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    751        b.gt            2b
    752        ret
    753 
    754 L(ipred_dc_h64):
    755        AARCH64_VALID_JUMP_TARGET
    756        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x2], #64
    757        addp            v0.8h,   v0.8h,   v1.8h
    758        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], #64
    759        addp            v2.8h,   v2.8h,   v3.8h
    760        addp            v4.8h,   v4.8h,   v5.8h
    761        addp            v6.8h,   v6.8h,   v7.8h
    762        addp            v0.8h,   v0.8h,   v2.8h
    763        addp            v4.8h,   v4.8h,   v6.8h
    764        addp            v0.8h,   v0.8h,   v4.8h
    765        add             x2,  x2,  #2
    766        uaddlv          s0,      v0.8h
    767        br              x3
    768 L(ipred_dc_w64):
    769        AARCH64_VALID_JUMP_TARGET
    770        ld1             {v1.8h, v2.8h, v3.8h, v4.8h}, [x2], #64
    771        add             v0.2s,   v0.2s,   v16.2s
    772        addp            v1.8h,   v1.8h,   v2.8h
    773        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x2]
    774        addp            v3.8h,   v3.8h,   v4.8h
    775        addp            v20.8h,  v20.8h,  v21.8h
    776        addp            v22.8h,  v22.8h,  v23.8h
    777        addp            v1.8h,   v1.8h,   v3.8h
    778        addp            v20.8h,  v20.8h,  v22.8h
    779        addp            v1.8h,   v1.8h,   v20.8h
    780        uaddlv          s1,      v1.8h
    781        cmp             w4,  #64
    782        add             v0.2s,   v0.2s,   v1.2s
    783        ushl            v4.2s,   v0.2s,   v17.2s
    784        b.eq            1f
    785        // h = 16/32
    786        cmp             w4,  #16
    787        mov             w16, #0x6667
    788        mov             w17, #0xAAAB
    789        csel            w16, w16, w17, eq
    790        dup             v16.2s,  w16
    791        mul             v4.2s,   v4.2s,   v16.2s
    792        ushr            v4.2s,   v4.2s,   #17
    793 1:
    794        sub             x1,  x1,  #64
    795        dup             v0.8h,   v4.h[0]
    796        dup             v1.8h,   v4.h[0]
    797        dup             v2.8h,   v4.h[0]
    798        dup             v3.8h,   v4.h[0]
    799 2:
    800        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    801        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    802        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    803        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    804        subs            w4,  w4,  #4
    805        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
    806        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], #64
    807        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
    808        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x6], x1
    809        b.gt            2b
    810        ret
    811 endfunc
    812 
    813 jumptable ipred_dc_tbl
        // Width/height dispatch table for the DC predictor above.
        // Each entry is a 32-bit offset relative to the table base (note the
        // "- ipred_dc_tbl" terms), resolved with ldrsw + add before "br".
        // The h* entries load and horizontally sum one edge, then "br x3"
        // into a w* entry, which finishes the DC value and stores the block
        // (as seen in the code above for the 16/32/64 cases).
        // Entries are ordered largest size first: 64, 32, 16, 8, 4.
    814        .word L(ipred_dc_h64) - ipred_dc_tbl
    815        .word L(ipred_dc_h32) - ipred_dc_tbl
    816        .word L(ipred_dc_h16) - ipred_dc_tbl
    817        .word L(ipred_dc_h8)  - ipred_dc_tbl
    818        .word L(ipred_dc_h4)  - ipred_dc_tbl
    819        .word L(ipred_dc_w64) - ipred_dc_tbl
    820        .word L(ipred_dc_w32) - ipred_dc_tbl
    821        .word L(ipred_dc_w16) - ipred_dc_tbl
    822        .word L(ipred_dc_w8)  - ipred_dc_tbl
    823        .word L(ipred_dc_w4)  - ipred_dc_tbl
    824 endjumptable
    825 
    826 // void ipred_paeth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
    827 //                             const pixel *const topleft,
    828 //                             const int width, const int height, const int a,
    829 //                             const int max_width, const int max_height);
    830 function ipred_paeth_16bpc_neon, export=1
        // Paeth prediction (per the C prototype above):
        //   x0 = dst, x1 = stride (bytes), x2 = &topleft, w3 = width,
        //   w4 = height.  For each pixel, base = left + top - topleft, and
        //   the output is whichever of left/top/topleft is nearest to base
        //   (ties broken in the order left, top, topleft - see the
        //   cmge/bsl/bit selection sequence below).
    831        clz             w9,  w3                   // jump table index from width
    832        movrel          x5,  ipred_paeth_tbl
    833        sub             w9,  w9,  #25             // clz(64)==25 -> index 0..4 for w=64..4
    834        ldrsw           x9,  [x5, w9, uxtw #2]    // table-relative offset (see ipred_paeth_tbl)
    835        ld1r            {v4.8h},  [x2]            // v4 = topleft, broadcast
    836        add             x8,  x2,  #2              // x8 = &top[0] (one pixel past topleft)
    837        sub             x2,  x2,  #8              // x2 steps backwards over the left edge
    838        add             x5,  x5,  x9              // x5 = dispatch target
    839        mov             x7,  #-8                  // backwards stride for the left edge
    840        add             x6,  x0,  x1              // x6 = second output row
    841        lsl             x1,  x1,  #1              // write two rows per iteration
    842        br              x5
    843 40:
        // w == 4: two 4-pixel rows packed per 128-bit vector, 4 rows/iter.
    844        AARCH64_VALID_JUMP_TARGET
    845        ld1r            {v5.2d},  [x8]            // v5 = 4 top pixels, repeated
    846        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
    847 4:
    848        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // 4 left pixels (reversed order)
    849        zip1            v0.2d,   v0.2d,   v1.2d
    850        zip1            v2.2d,   v2.2d,   v3.2d
    851        add             v16.8h,  v6.8h,   v0.8h   // base
    852        add             v17.8h,  v6.8h,   v2.8h
    853        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
    854        sabd            v21.8h,  v5.8h,   v17.8h
    855        sabd            v22.8h,  v4.8h,   v16.8h  // tldiff
    856        sabd            v23.8h,  v4.8h,   v17.8h
    857        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
    858        sabd            v17.8h,  v2.8h,   v17.8h
    859        umin            v18.8h,  v20.8h,  v22.8h  // min(tdiff, tldiff)
    860        umin            v19.8h,  v21.8h,  v23.8h
    861        cmge            v20.8h,  v22.8h,  v20.8h  // tldiff >= tdiff
    862        cmge            v21.8h,  v23.8h,  v21.8h
    863        cmge            v16.8h,  v18.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
    864        cmge            v17.8h,  v19.8h,  v17.8h
    865        bsl             v21.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
    866        bsl             v20.16b, v5.16b,  v4.16b
    867        bit             v21.16b, v2.16b,  v17.16b // ldiff <= min ? left : ...
    868        bit             v20.16b, v0.16b,  v16.16b
    869        st1             {v21.d}[1], [x0], x1
    870        st1             {v21.d}[0], [x6], x1
    871        subs            w4,  w4,  #4              // 4 rows done per iteration
    872        st1             {v20.d}[1], [x0], x1
    873        st1             {v20.d}[0], [x6], x1
    874        b.gt            4b
    875        ret
    876 80:
    877 160:
    878 320:
    879 640:
        // w >= 8: inner loop covers 8 columns at a time across 4 rows.
    880        AARCH64_VALID_JUMP_TARGET
    881        ld1             {v5.8h},  [x8], #16       // first 8 top pixels
    882        mov             w9,  w3                   // save width for the row loop
    883        // Set up pointers for four rows in parallel; x0, x6, x5, x10
    884        add             x5,  x0,  x1
    885        add             x10, x6,  x1
    886        lsl             x1,  x1,  #1
    887        sub             x1,  x1,  w3, uxtw #1     // stride minus the width already stored
    888 1:
    889        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // 4 left pixels, one per row
    890 2:
    891        sub             v6.8h,   v5.8h,   v4.8h   // top - topleft
    892        add             v16.8h,  v6.8h,   v0.8h   // base
    893        add             v17.8h,  v6.8h,   v1.8h
    894        add             v18.8h,  v6.8h,   v2.8h
    895        add             v19.8h,  v6.8h,   v3.8h
    896        sabd            v20.8h,  v5.8h,   v16.8h  // tdiff
    897        sabd            v21.8h,  v5.8h,   v17.8h
    898        sabd            v22.8h,  v5.8h,   v18.8h
    899        sabd            v23.8h,  v5.8h,   v19.8h
    900        sabd            v24.8h,  v4.8h,   v16.8h  // tldiff
    901        sabd            v25.8h,  v4.8h,   v17.8h
    902        sabd            v26.8h,  v4.8h,   v18.8h
    903        sabd            v27.8h,  v4.8h,   v19.8h
    904        sabd            v16.8h,  v0.8h,   v16.8h  // ldiff
    905        sabd            v17.8h,  v1.8h,   v17.8h
    906        sabd            v18.8h,  v2.8h,   v18.8h
    907        sabd            v19.8h,  v3.8h,   v19.8h
    908        umin            v28.8h,  v20.8h,  v24.8h  // min(tdiff, tldiff)
    909        umin            v29.8h,  v21.8h,  v25.8h
    910        umin            v30.8h,  v22.8h,  v26.8h
    911        umin            v31.8h,  v23.8h,  v27.8h
    912        cmge            v20.8h,  v24.8h,  v20.8h  // tldiff >= tdiff
    913        cmge            v21.8h,  v25.8h,  v21.8h
    914        cmge            v22.8h,  v26.8h,  v22.8h
    915        cmge            v23.8h,  v27.8h,  v23.8h
    916        cmge            v16.8h,  v28.8h,  v16.8h  // min(tdiff, tldiff) >= ldiff
    917        cmge            v17.8h,  v29.8h,  v17.8h
    918        cmge            v18.8h,  v30.8h,  v18.8h
    919        cmge            v19.8h,  v31.8h,  v19.8h
    920        bsl             v23.16b, v5.16b,  v4.16b  // tdiff <= tldiff ? top : topleft
    921        bsl             v22.16b, v5.16b,  v4.16b
    922        bsl             v21.16b, v5.16b,  v4.16b
    923        bsl             v20.16b, v5.16b,  v4.16b
    924        bit             v23.16b, v3.16b,  v19.16b // ldiff <= min ? left : ...
    925        bit             v22.16b, v2.16b,  v18.16b
    926        bit             v21.16b, v1.16b,  v17.16b
    927        bit             v20.16b, v0.16b,  v16.16b
    928        st1             {v23.8h}, [x0], #16
    929        st1             {v22.8h}, [x6], #16
    930        subs            w3,  w3,  #8              // 8 columns done
    931        st1             {v21.8h}, [x5], #16
    932        st1             {v20.8h}, [x10], #16
    933        b.le            8f
    934        ld1             {v5.8h},  [x8], #16       // next 8 top pixels
    935        b               2b
    936 8:
    937        subs            w4,  w4,  #4              // 4 rows done
    938        b.le            9f
    939        // End of horizontal loop, move pointers to next four rows
    940        sub             x8,  x8,  w9, uxtw #1     // rewind top pointer to column 0
    941        add             x0,  x0,  x1
    942        add             x6,  x6,  x1
    943        // Load the top row as early as possible
    944        ld1             {v5.8h},  [x8], #16
    945        add             x5,  x5,  x1
    946        add             x10, x10, x1
    947        mov             w3,  w9                   // restore full width
    948        b               1b
    949 9:
    950        ret
    951 endfunc
    952 
    953 jumptable ipred_paeth_tbl
        // Width dispatch table for ipred_paeth_16bpc_neon: table-relative
        // offsets to the 640/320/160/80/40 labels, i.e. ordered w=64..4
        // to match the clz(width)-25 index computed in the function.
    954        .word 640b - ipred_paeth_tbl
    955        .word 320b - ipred_paeth_tbl
    956        .word 160b - ipred_paeth_tbl
    957        .word 80b  - ipred_paeth_tbl
    958        .word 40b  - ipred_paeth_tbl
    959 endjumptable
    960 
    961 // void ipred_smooth_16bpc_neon(pixel *dst, const ptrdiff_t stride,
    962 //                              const pixel *const topleft,
    963 //                              const int width, const int height, const int a,
    964 //                              const int max_width, const int max_height);
    965 function ipred_smooth_16bpc_neon, export=1
        // SMOOTH prediction: every pixel is the sum of two 256-scale blends,
        //   hor: right*256 + (left-right)*weights_hor[x]
        //   ver: bottom*256 + (top-bottom)*weights_ver[y]
        // accumulated together and rounded with ">> 9" (divide by 512).
        //   x0 = dst, x1 = stride, x2 = &topleft, w3 = width, w4 = height
        // "right"/"bottom" are the last pixels of the top/left edges.
    966        movrel          x10, X(sm_weights)
    967        add             x11, x10, w4, uxtw        // x11 = sm_weights + height (per-row weights)
    968        add             x10, x10, w3, uxtw        // x10 = sm_weights + width (per-column weights)
    969        clz             w9,  w3                   // jump table index from width
    970        movrel          x5,  ipred_smooth_tbl
    971        sub             x12, x2,  w4, uxtw #1     // x12 = bottommost left pixel
    972        sub             w9,  w9,  #25             // clz(64)==25 -> index 0..4 for w=64..4
    973        ldrsw           x9,  [x5, w9, uxtw #2]
    974        ld1r            {v4.8h},  [x12] // bottom
    975        add             x8,  x2,  #2              // x8 = &top[0]
    976        add             x5,  x5,  x9
    977        add             x6,  x0,  x1              // x6 = second output row
    978        lsl             x1,  x1,  #1              // write two rows per iteration
    979        br              x5
    980 40:
        // w == 4: two 4-pixel rows per vector, 4 rows per iteration.
    981        AARCH64_VALID_JUMP_TARGET
    982        ld1r            {v6.2d}, [x8]             // top
    983        ld1r            {v7.2s}, [x10]            // weights_hor
    984        sub             x2,  x2,  #8              // step backwards over the left edge
    985        mov             x7,  #-8
    986        dup             v5.8h,   v6.h[3]          // right
    987        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
    988        uxtl            v7.8h,   v7.8b            // weights_hor
    989        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
    990 4:
    991        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
    992        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
    993        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
    994        ushll           v21.4s,  v31.4h,  #8
    995        ushll           v22.4s,  v31.4h,  #8
    996        ushll           v23.4s,  v31.4h,  #8
    997        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
    998        zip1            v0.2d,   v3.2d,   v2.2d
    999        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
   1000        zip1            v18.2s,  v18.2s,  v19.2s
   1001        sub             v0.8h,   v0.8h,   v5.8h   // left-right
   1002        sub             v1.8h,   v1.8h,   v5.8h
   1003        uxtl            v16.8h,  v16.8b           // weights_ver
   1004        uxtl            v18.8h,  v18.8b
   1005        smlal           v20.4s,  v0.4h,   v7.4h   // += (left-right)*weights_hor
   1006        smlal2          v21.4s,  v0.8h,   v7.8h
   1007        smlal           v22.4s,  v1.4h,   v7.4h
   1008        smlal2          v23.4s,  v1.8h,   v7.8h
   1009        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
   1010        smlal2          v21.4s,  v6.8h,   v16.8h
   1011        smlal           v22.4s,  v6.4h,   v18.4h
   1012        smlal2          v23.4s,  v6.8h,   v18.8h
   1013        rshrn           v20.4h,  v20.4s,  #9      // round the summed 256-scale blends (/512)
   1014        rshrn           v21.4h,  v21.4s,  #9
   1015        rshrn           v22.4h,  v22.4s,  #9
   1016        rshrn           v23.4h,  v23.4s,  #9
   1017        st1             {v20.4h}, [x0], x1
   1018        st1             {v21.4h}, [x6], x1
   1019        subs            w4,  w4,  #4              // 4 rows done per iteration
   1020        st1             {v22.4h}, [x0], x1
   1021        st1             {v23.4h}, [x6], x1
   1022        b.gt            4b
   1023        ret
   1024 80:
        // w == 8: one 8-pixel row per vector, 4 rows per iteration.
   1025        AARCH64_VALID_JUMP_TARGET
   1026        ld1             {v6.8h}, [x8]             // top
   1027        ld1             {v7.8b}, [x10]            // weights_hor
   1028        sub             x2,  x2,  #8              // step backwards over the left edge
   1029        mov             x7,  #-8
   1030        dup             v5.8h,   v6.h[7]          // right
   1031        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
   1032        uxtl            v7.8h,   v7.8b            // weights_hor
   1033        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
   1034 8:
   1035        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
   1036        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x11], #4 // weights_ver
   1037        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
   1038        ushll           v21.4s,  v31.4h,  #8
   1039        ushll           v22.4s,  v31.4h,  #8
   1040        ushll           v23.4s,  v31.4h,  #8
   1041        ushll           v24.4s,  v31.4h,  #8
   1042        ushll           v25.4s,  v31.4h,  #8
   1043        ushll           v26.4s,  v31.4h,  #8
   1044        ushll           v27.4s,  v31.4h,  #8
   1045        sub             v0.8h,   v0.8h,   v5.8h   // left-right
   1046        sub             v1.8h,   v1.8h,   v5.8h
   1047        sub             v2.8h,   v2.8h,   v5.8h
   1048        sub             v3.8h,   v3.8h,   v5.8h
   1049        uxtl            v16.8h,  v16.8b           // weights_ver
   1050        uxtl            v17.8h,  v17.8b
   1051        uxtl            v18.8h,  v18.8b
   1052        uxtl            v19.8h,  v19.8b
   1053        smlal           v20.4s,  v3.4h,   v7.4h   // += (left-right)*weights_hor
   1054        smlal2          v21.4s,  v3.8h,   v7.8h   // (left flipped)
   1055        smlal           v22.4s,  v2.4h,   v7.4h
   1056        smlal2          v23.4s,  v2.8h,   v7.8h
   1057        smlal           v24.4s,  v1.4h,   v7.4h
   1058        smlal2          v25.4s,  v1.8h,   v7.8h
   1059        smlal           v26.4s,  v0.4h,   v7.4h
   1060        smlal2          v27.4s,  v0.8h,   v7.8h
   1061        smlal           v20.4s,  v6.4h,   v16.4h  // += (top-bottom)*weights_ver
   1062        smlal2          v21.4s,  v6.8h,   v16.8h
   1063        smlal           v22.4s,  v6.4h,   v17.4h
   1064        smlal2          v23.4s,  v6.8h,   v17.8h
   1065        smlal           v24.4s,  v6.4h,   v18.4h
   1066        smlal2          v25.4s,  v6.8h,   v18.8h
   1067        smlal           v26.4s,  v6.4h,   v19.4h
   1068        smlal2          v27.4s,  v6.8h,   v19.8h
   1069        rshrn           v20.4h,  v20.4s,  #9      // round the summed 256-scale blends (/512)
   1070        rshrn2          v20.8h,  v21.4s,  #9
   1071        rshrn           v21.4h,  v22.4s,  #9
   1072        rshrn2          v21.8h,  v23.4s,  #9
   1073        rshrn           v22.4h,  v24.4s,  #9
   1074        rshrn2          v22.8h,  v25.4s,  #9
   1075        rshrn           v23.4h,  v26.4s,  #9
   1076        rshrn2          v23.8h,  v27.4s,  #9
   1077        st1             {v20.8h}, [x0], x1
   1078        st1             {v21.8h}, [x6], x1
   1079        subs            w4,  w4,  #4              // 4 rows done per iteration
   1080        st1             {v22.8h}, [x0], x1
   1081        st1             {v23.8h}, [x6], x1
   1082        b.gt            8b
   1083        ret
   1084 160:
   1085 320:
   1086 640:
        // w >= 16: inner loop covers 16 columns at a time across 2 rows.
   1087        AARCH64_VALID_JUMP_TARGET
   1088        add             x12, x2,  w3, uxtw #1     // x12 = one past the last top pixel
   1089        sub             x1,  x1,  w3, uxtw #1     // stride minus the width already stored
   1090        ld1r            {v5.8h}, [x12]            // right
   1091        sub             x2,  x2,  #4              // step backwards over the left edge, 2 at a time
   1092        mov             x7,  #-4
   1093        mov             w9,  w3                   // save width for the row loop
   1094        add             v31.4h,  v4.4h,   v5.4h   // bottom+right
   1095
   1096 1:
   1097        ld2r            {v0.8h, v1.8h},   [x2],  x7 // left
   1098        ld2r            {v16.8b, v17.8b}, [x11], #2 // weights_ver
   1099        sub             v0.8h,   v0.8h,   v5.8h   // left-right
   1100        sub             v1.8h,   v1.8h,   v5.8h
   1101        uxtl            v16.8h,  v16.8b           // weights_ver
   1102        uxtl            v17.8h,  v17.8b
   1103 2:
   1104        ld1             {v7.16b}, [x10],  #16     // weights_hor
   1105        ld1             {v2.8h, v3.8h}, [x8], #32 // top
   1106        ushll           v20.4s,  v31.4h,  #8      // (bottom+right)*256
   1107        ushll           v21.4s,  v31.4h,  #8
   1108        ushll           v22.4s,  v31.4h,  #8
   1109        ushll           v23.4s,  v31.4h,  #8
   1110        ushll           v24.4s,  v31.4h,  #8
   1111        ushll           v25.4s,  v31.4h,  #8
   1112        ushll           v26.4s,  v31.4h,  #8
   1113        ushll           v27.4s,  v31.4h,  #8
   1114        uxtl            v6.8h,   v7.8b            // weights_hor
   1115        uxtl2           v7.8h,   v7.16b
   1116        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
   1117        sub             v3.8h,   v3.8h,   v4.8h
   1118        smlal           v20.4s,  v1.4h,   v6.4h   // += (left-right)*weights_hor
   1119        smlal2          v21.4s,  v1.8h,   v6.8h   // (left flipped)
   1120        smlal           v22.4s,  v1.4h,   v7.4h
   1121        smlal2          v23.4s,  v1.8h,   v7.8h
   1122        smlal           v24.4s,  v0.4h,   v6.4h
   1123        smlal2          v25.4s,  v0.8h,   v6.8h
   1124        smlal           v26.4s,  v0.4h,   v7.4h
   1125        smlal2          v27.4s,  v0.8h,   v7.8h
   1126        smlal           v20.4s,  v2.4h,   v16.4h  // += (top-bottom)*weights_ver
   1127        smlal2          v21.4s,  v2.8h,   v16.8h
   1128        smlal           v22.4s,  v3.4h,   v16.4h
   1129        smlal2          v23.4s,  v3.8h,   v16.8h
   1130        smlal           v24.4s,  v2.4h,   v17.4h
   1131        smlal2          v25.4s,  v2.8h,   v17.8h
   1132        smlal           v26.4s,  v3.4h,   v17.4h
   1133        smlal2          v27.4s,  v3.8h,   v17.8h
   1134        rshrn           v20.4h,  v20.4s,  #9      // round the summed 256-scale blends (/512)
   1135        rshrn2          v20.8h,  v21.4s,  #9
   1136        rshrn           v21.4h,  v22.4s,  #9
   1137        rshrn2          v21.8h,  v23.4s,  #9
   1138        rshrn           v22.4h,  v24.4s,  #9
   1139        rshrn2          v22.8h,  v25.4s,  #9
   1140        rshrn           v23.4h,  v26.4s,  #9
   1141        rshrn2          v23.8h,  v27.4s,  #9
   1142        subs            w3,  w3,  #16             // 16 columns done
   1143        st1             {v20.8h, v21.8h}, [x0], #32
   1144        st1             {v22.8h, v23.8h}, [x6], #32
   1145        b.gt            2b
   1146        subs            w4,  w4,  #2              // 2 rows done
   1147        b.le            9f
   1148        sub             x8,  x8,  w9, uxtw #1     // rewind top pointer to column 0
   1149        sub             x10, x10, w9, uxtw        // rewind weights_hor pointer
   1150        add             x0,  x0,  x1
   1151        add             x6,  x6,  x1
   1152        mov             w3,  w9                   // restore full width
   1153        b               1b
   1154 9:
   1155        ret
   1156 endfunc
   1157 
   1158 jumptable ipred_smooth_tbl
        // Width dispatch table for ipred_smooth_16bpc_neon: table-relative
        // offsets to the 640/320/160/80/40 labels, ordered w=64..4 to match
        // the clz(width)-25 index computed in the function.
   1159        .word 640b - ipred_smooth_tbl
   1160        .word 320b - ipred_smooth_tbl
   1161        .word 160b - ipred_smooth_tbl
   1162        .word 80b  - ipred_smooth_tbl
   1163        .word 40b  - ipred_smooth_tbl
   1164 endjumptable
   1165 
   1166 // void ipred_smooth_v_16bpc_neon(pixel *dst, const ptrdiff_t stride,
   1167 //                                const pixel *const topleft,
   1168 //                                const int width, const int height, const int a,
   1169 //                                const int max_width, const int max_height);
   1170 function ipred_smooth_v_16bpc_neon, export=1
        // SMOOTH_V prediction: blend the top row against the bottommost
        // left pixel with one weight per output row:
        //   dst[y][x] = bottom + (((top[x]-bottom) * w_ver[y] + 128) >> 8)
        // implemented via sqrdmulh on (weight << 7) as noted below.
        //   x0 = dst, x1 = stride, x2 = &topleft, w3 = width, w4 = height
   1171        movrel          x7,  X(sm_weights)
   1172        add             x7,  x7,  w4, uxtw        // x7 = sm_weights + height (per-row weights)
   1173        clz             w9,  w3                   // jump table index from width
   1174        movrel          x5,  ipred_smooth_v_tbl
   1175        sub             x8,  x2,  w4, uxtw #1     // x8 = bottommost left pixel
   1176        sub             w9,  w9,  #25             // clz(64)==25 -> index 0..4 for w=64..4
   1177        ldrsw           x9,  [x5, w9, uxtw #2]
   1178        ld1r            {v4.8h},  [x8] // bottom
   1179        add             x2,  x2,  #2              // x2 = &top[0]
   1180        add             x5,  x5,  x9
   1181        add             x6,  x0,  x1              // x6 = second output row
   1182        lsl             x1,  x1,  #1              // write two rows per iteration
   1183        br              x5
   1184 40:
        // w == 4: two 4-pixel rows per vector, 4 rows per iteration.
   1185        AARCH64_VALID_JUMP_TARGET
   1186        ld1r            {v6.2d}, [x2]             // top
   1187        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
   1188 4:
   1189        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
   1190        zip1            v16.2s,  v16.2s,  v17.2s  // weights_ver
   1191        zip1            v18.2s,  v18.2s,  v19.2s
   1192        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
   1193        ushll           v18.8h,  v18.8b,  #7
   1194        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
   1195        sqrdmulh        v21.8h,  v6.8h,   v18.8h
   1196        add             v20.8h,  v20.8h,  v4.8h   // + bottom
   1197        add             v21.8h,  v21.8h,  v4.8h
   1198        st1             {v20.d}[0], [x0], x1
   1199        st1             {v20.d}[1], [x6], x1
   1200        subs            w4,  w4,  #4              // 4 rows done per iteration
   1201        st1             {v21.d}[0], [x0], x1
   1202        st1             {v21.d}[1], [x6], x1
   1203        b.gt            4b
   1204        ret
   1205 80:
        // w == 8: one 8-pixel row per vector, 4 rows per iteration.
   1206        AARCH64_VALID_JUMP_TARGET
   1207        ld1             {v6.8h}, [x2]             // top
   1208        sub             v6.8h,   v6.8h,   v4.8h   // top-bottom
   1209 8:
   1210        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b},  [x7], #4 // weights_ver
   1211        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
   1212        ushll           v17.8h,  v17.8b,  #7
   1213        ushll           v18.8h,  v18.8b,  #7
   1214        ushll           v19.8h,  v19.8b,  #7
   1215        sqrdmulh        v20.8h,  v6.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
   1216        sqrdmulh        v21.8h,  v6.8h,   v17.8h
   1217        sqrdmulh        v22.8h,  v6.8h,   v18.8h
   1218        sqrdmulh        v23.8h,  v6.8h,   v19.8h
   1219        add             v20.8h,  v20.8h,  v4.8h   // + bottom
   1220        add             v21.8h,  v21.8h,  v4.8h
   1221        add             v22.8h,  v22.8h,  v4.8h
   1222        add             v23.8h,  v23.8h,  v4.8h
   1223        st1             {v20.8h}, [x0], x1
   1224        st1             {v21.8h}, [x6], x1
   1225        subs            w4,  w4,  #4              // 4 rows done per iteration
   1226        st1             {v22.8h}, [x0], x1
   1227        st1             {v23.8h}, [x6], x1
   1228        b.gt            8b
   1229        ret
   1230 160:
   1231 320:
   1232 640:
        // w >= 16: inner loop covers 16 columns at a time across 4 rows.
   1233        AARCH64_VALID_JUMP_TARGET
   1234        // Set up pointers for four rows in parallel; x0, x6, x5, x8
   1235        add             x5,  x0,  x1
   1236        add             x8,  x6,  x1
   1237        lsl             x1,  x1,  #1
   1238        sub             x1,  x1,  w3, uxtw #1     // stride minus the width already stored
   1239        mov             w9,  w3                   // save width for the row loop
   1240
   1241 1:
   1242        ld4r            {v16.8b, v17.8b, v18.8b, v19.8b}, [x7], #4 // weights_ver
   1243        ushll           v16.8h,  v16.8b,  #7      // weights_ver << 7
   1244        ushll           v17.8h,  v17.8b,  #7
   1245        ushll           v18.8h,  v18.8b,  #7
   1246        ushll           v19.8h,  v19.8b,  #7
   1247 2:
   1248        ld1             {v2.8h, v3.8h}, [x2], #32 // top
   1249        sub             v2.8h,   v2.8h,   v4.8h   // top-bottom
   1250        sub             v3.8h,   v3.8h,   v4.8h
   1251        sqrdmulh        v20.8h,  v2.8h,   v16.8h  // ((top-bottom)*weights_ver + 128) >> 8
   1252        sqrdmulh        v21.8h,  v3.8h,   v16.8h
   1253        sqrdmulh        v22.8h,  v2.8h,   v17.8h
   1254        sqrdmulh        v23.8h,  v3.8h,   v17.8h
   1255        sqrdmulh        v24.8h,  v2.8h,   v18.8h
   1256        sqrdmulh        v25.8h,  v3.8h,   v18.8h
   1257        sqrdmulh        v26.8h,  v2.8h,   v19.8h
   1258        sqrdmulh        v27.8h,  v3.8h,   v19.8h
   1259        add             v20.8h,  v20.8h,  v4.8h   // + bottom
   1260        add             v21.8h,  v21.8h,  v4.8h
   1261        add             v22.8h,  v22.8h,  v4.8h
   1262        add             v23.8h,  v23.8h,  v4.8h
   1263        add             v24.8h,  v24.8h,  v4.8h
   1264        add             v25.8h,  v25.8h,  v4.8h
   1265        add             v26.8h,  v26.8h,  v4.8h
   1266        add             v27.8h,  v27.8h,  v4.8h
   1267        subs            w3,  w3,  #16             // 16 columns done
   1268        st1             {v20.8h, v21.8h}, [x0], #32
   1269        st1             {v22.8h, v23.8h}, [x6], #32
   1270        st1             {v24.8h, v25.8h}, [x5], #32
   1271        st1             {v26.8h, v27.8h}, [x8], #32
   1272        b.gt            2b
   1273        subs            w4,  w4,  #4              // 4 rows done
   1274        b.le            9f
   1275        sub             x2,  x2,  w9, uxtw #1     // rewind top pointer to column 0
   1276        add             x0,  x0,  x1
   1277        add             x6,  x6,  x1
   1278        add             x5,  x5,  x1
   1279        add             x8,  x8,  x1
   1280        mov             w3,  w9                   // restore full width
   1281        b               1b
   1282 9:
   1283        ret
   1284 endfunc
   1285 
   1286 jumptable ipred_smooth_v_tbl
        // Width dispatch table for ipred_smooth_v_16bpc_neon: table-relative
        // offsets to the 640/320/160/80/40 labels, ordered w=64..4 to match
        // the clz(width)-25 index computed in the function.
   1287        .word 640b - ipred_smooth_v_tbl
   1288        .word 320b - ipred_smooth_v_tbl
   1289        .word 160b - ipred_smooth_v_tbl
   1290        .word 80b  - ipred_smooth_v_tbl
   1291        .word 40b  - ipred_smooth_v_tbl
   1292 endjumptable
   1293 
   1294 // void ipred_smooth_h_16bpc_neon(pixel *dst, const ptrdiff_t stride,
   1295 //                                const pixel *const topleft,
   1296 //                                const int width, const int height, const int a,
   1297 //                                const int max_width, const int max_height);
   1298 function ipred_smooth_h_16bpc_neon, export=1
   1299        movrel          x8,  X(sm_weights)
   1300        add             x8,  x8,  w3, uxtw
   1301        clz             w9,  w3
   1302        movrel          x5,  ipred_smooth_h_tbl
   1303        add             x12, x2,  w3, uxtw #1
   1304        sub             w9,  w9,  #25
   1305        ldrsw           x9,  [x5, w9, uxtw #2]
   1306        ld1r            {v5.8h},  [x12] // right
   1307        add             x5,  x5,  x9
   1308        add             x6,  x0,  x1
   1309        lsl             x1,  x1,  #1
   1310        br              x5
   1311 40:
   1312        AARCH64_VALID_JUMP_TARGET
   1313        ld1r            {v7.2s}, [x8]             // weights_hor
   1314        sub             x2,  x2,  #8
   1315        mov             x7,  #-8
   1316        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
   1317 4:
   1318        ld4r            {v0.4h, v1.4h, v2.4h, v3.4h},  [x2], x7 // left
   1319        zip1            v1.2d,   v1.2d,   v0.2d   // left, flipped
   1320        zip1            v0.2d,   v3.2d,   v2.2d
   1321        sub             v0.8h,   v0.8h,   v5.8h   // left-right
   1322        sub             v1.8h,   v1.8h,   v5.8h
   1323        sqrdmulh        v20.8h,  v0.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
   1324        sqrdmulh        v21.8h,  v1.8h,   v7.8h
   1325        add             v20.8h,  v20.8h,  v5.8h
   1326        add             v21.8h,  v21.8h,  v5.8h
   1327        st1             {v20.d}[0], [x0], x1
   1328        st1             {v20.d}[1], [x6], x1
   1329        subs            w4,  w4,  #4
   1330        st1             {v21.d}[0], [x0], x1
   1331        st1             {v21.d}[1], [x6], x1
   1332        b.gt            4b
   1333        ret
   1334 80:
   1335        AARCH64_VALID_JUMP_TARGET
   1336        ld1             {v7.8b}, [x8]             // weights_hor
   1337        sub             x2,  x2,  #8
   1338        mov             x7,  #-8
   1339        ushll           v7.8h,   v7.8b,   #7      // weights_hor << 7
   1340 8:
   1341        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},  [x2], x7 // left
   1342        sub             v3.8h,   v3.8h,   v5.8h   // left-right
   1343        sub             v2.8h,   v2.8h,   v5.8h
   1344        sub             v1.8h,   v1.8h,   v5.8h
   1345        sub             v0.8h,   v0.8h,   v5.8h
   1346        sqrdmulh        v20.8h,  v3.8h,   v7.8h   // ((left-right)*weights_hor + 128) >> 8
   1347        sqrdmulh        v21.8h,  v2.8h,   v7.8h   // (left flipped)
   1348        sqrdmulh        v22.8h,  v1.8h,   v7.8h
   1349        sqrdmulh        v23.8h,  v0.8h,   v7.8h
   1350        add             v20.8h,  v20.8h,  v5.8h
   1351        add             v21.8h,  v21.8h,  v5.8h
   1352        add             v22.8h,  v22.8h,  v5.8h
   1353        add             v23.8h,  v23.8h,  v5.8h
   1354        st1             {v20.8h}, [x0], x1
   1355        st1             {v21.8h}, [x6], x1
   1356        subs            w4,  w4,  #4
   1357        st1             {v22.8h}, [x0], x1
   1358        st1             {v23.8h}, [x6], x1
   1359        b.gt            8b
   1360        ret
   1361 160:
   1362 320:
   1363 640:
   1364        AARCH64_VALID_JUMP_TARGET
   1365        sub             x2,  x2,  #8
   1366        mov             x7,  #-8
   1367        // Set up pointers for four rows in parallel; x0, x6, x5, x10
   1368        add             x5,  x0,  x1
   1369        add             x10, x6,  x1
   1370        lsl             x1,  x1,  #1
   1371        sub             x1,  x1,  w3, uxtw #1
   1372        mov             w9,  w3
   1373 
   1374 1:
   1375        ld4r            {v0.8h, v1.8h, v2.8h, v3.8h},   [x2],  x7 // left
   1376        sub             v0.8h,   v0.8h,   v5.8h   // left-right
   1377        sub             v1.8h,   v1.8h,   v5.8h
   1378        sub             v2.8h,   v2.8h,   v5.8h
   1379        sub             v3.8h,   v3.8h,   v5.8h
   1380 2:
   1381        ld1             {v7.16b}, [x8],   #16     // weights_hor
   1382        ushll           v6.8h,   v7.8b,   #7      // weights_hor << 7
   1383        ushll2          v7.8h,   v7.16b,  #7
   1384        sqrdmulh        v20.8h,  v3.8h,   v6.8h   // ((left-right)*weights_hor + 128) >> 8
   1385        sqrdmulh        v21.8h,  v3.8h,   v7.8h   // (left flipped)
   1386        sqrdmulh        v22.8h,  v2.8h,   v6.8h
   1387        sqrdmulh        v23.8h,  v2.8h,   v7.8h
   1388        sqrdmulh        v24.8h,  v1.8h,   v6.8h
   1389        sqrdmulh        v25.8h,  v1.8h,   v7.8h
   1390        sqrdmulh        v26.8h,  v0.8h,   v6.8h
   1391        sqrdmulh        v27.8h,  v0.8h,   v7.8h
   1392        add             v20.8h,  v20.8h,  v5.8h
   1393        add             v21.8h,  v21.8h,  v5.8h
   1394        add             v22.8h,  v22.8h,  v5.8h
   1395        add             v23.8h,  v23.8h,  v5.8h
   1396        add             v24.8h,  v24.8h,  v5.8h
   1397        add             v25.8h,  v25.8h,  v5.8h
   1398        add             v26.8h,  v26.8h,  v5.8h
   1399        add             v27.8h,  v27.8h,  v5.8h
   1400        subs            w3,  w3,  #16
   1401        st1             {v20.8h, v21.8h}, [x0],  #32
   1402        st1             {v22.8h, v23.8h}, [x6],  #32
   1403        st1             {v24.8h, v25.8h}, [x5],  #32
   1404        st1             {v26.8h, v27.8h}, [x10], #32
   1405        b.gt            2b
   1406        subs            w4,  w4,  #4
   1407        b.le            9f
   1408        sub             x8,  x8,  w9, uxtw
   1409        add             x0,  x0,  x1
   1410        add             x6,  x6,  x1
   1411        add             x5,  x5,  x1
   1412        add             x10, x10, x1
   1413        mov             w3,  w9
   1414        b               1b
   1415 9:
   1416        ret
   1417 endfunc
   1418 
// Branch table for ipred_smooth_h_16bpc_neon: signed 32-bit offsets
// relative to the table base. The caller indexes it by clz(width)-25
// (so entries are ordered w=64,32,16,8,4), loads the entry with ldrsw,
// adds it to the table address and does br.
jumptable ipred_smooth_h_tbl
        .word 640b - ipred_smooth_h_tbl
        .word 320b - ipred_smooth_h_tbl
        .word 160b - ipred_smooth_h_tbl
        .word 80b  - ipred_smooth_h_tbl
        .word 40b  - ipred_smooth_h_tbl
endjumptable
   1426 
// 48 bytes of 0x00 followed (at the padding_mask label) by 48 bytes of
// 0xff. Loading a vector from (padding_mask - 2*end) produces a byte mask
// that is 0x00 for the first `end` 16-bit lanes and 0xff for the lanes
// after them; combined with the bit instruction this splats a replicated
// padding pixel over all lanes past the last valid input pixel.
const padding_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
padding_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
   1442 
// void ipred_z1_upsample_edge_16bpc_neon(pixel *out, const int hsz,
//                                        const pixel *const in, const int end,
//                                        const int bitdepth_max);
// 2x upsample of the intra prediction edge: the output interleaves the
// source pixels with filtered half-sample values computed as
// (9*(in[i+1] + in[i+2]) - (in[i] + in[i+3]) + 8) >> 4, clamped to
// [0, bitdepth_max]. Inputs past in[end] are replaced with in[end]
// via the padding_mask trick. Always processes/stores 32 pixels.
function ipred_z1_upsample_edge_16bpc_neon, export=1
        dup             v30.8h,  w4               // bitdepth_max
        movrel          x4,  padding_mask
        ld1             {v0.8h, v1.8h},  [x2]     // in[]
        add             x5,  x2,  w3,  uxtw #1    // in[end]
        sub             x4,  x4,  w3,  uxtw #1    // padding_mask - 2*end

        ld1r            {v2.8h},  [x5]            // padding
        ld1             {v3.8h, v4.8h}, [x4]      // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v2.16b,  v3.16b  // padded in[]
        bit             v1.16b,  v2.16b,  v4.16b

        // Shifted copies of the (padded) input:
        ext             v4.16b,  v0.16b,  v1.16b,  #2  // in[i+1]
        ext             v5.16b,  v1.16b,  v2.16b,  #2
        ext             v6.16b,  v0.16b,  v1.16b,  #4  // in[i+2]
        ext             v7.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6  // in[i+3]
        ext             v17.16b, v1.16b,  v2.16b,  #6

        add             v18.8h,  v4.8h,   v6.8h   // in[i+1] + in[i+2]
        add             v19.8h,  v5.8h,   v7.8h
        add             v20.8h,  v0.8h,   v16.8h  // in[i+0] + in[i+3]
        add             v21.8h,  v1.8h,   v17.8h
        // Widen to 32 bit; the intermediate sum can exceed 16 bits.
        umull           v22.4s,  v18.4h,  v31.4h  // 9*(in[i+1] + in[i+2])
        umull2          v23.4s,  v18.8h,  v31.8h
        umull           v24.4s,  v19.4h,  v31.4h
        umull2          v25.4s,  v19.8h,  v31.8h
        usubw           v22.4s,  v22.4s,  v20.4h  // - (in[i+0] + in[i+3])
        usubw2          v23.4s,  v23.4s,  v20.8h
        usubw           v24.4s,  v24.4s,  v21.4h
        usubw2          v25.4s,  v25.4s,  v21.8h

        // Rounding narrow with unsigned saturation clamps negatives to 0.
        sqrshrun        v16.4h,  v22.4s,  #4
        sqrshrun2       v16.8h,  v23.4s,  #4
        sqrshrun        v17.4h,  v24.4s,  #4
        sqrshrun2       v17.8h,  v25.4s,  #4

        // Clamp the filtered values to bitdepth_max.
        smin            v16.8h,  v16.8h,  v30.8h
        smin            v17.8h,  v17.8h,  v30.8h

        // Interleave source pixels (in[i+1]) with the filtered values.
        zip1            v0.8h,   v4.8h,   v16.8h
        zip2            v1.8h,   v4.8h,   v16.8h
        zip1            v2.8h,   v5.8h,   v17.8h
        zip2            v3.8h,   v5.8h,   v17.8h

        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0]

        ret
endfunc
   1498 
// void ipred_z2_upsample_edge_16bpc_neon(pixel *out, const int sz,
//                                        const pixel *const in,
//                                        const int bitdepth_max);
// 2x upsample of an edge for z2 prediction: interleaves in[i] with
// filtered half-sample values (9*(in[i+0] + in[i+1]) - (in[i-1] +
// in[i+2]) + 8) >> 4, clamped to [0, bitdepth_max]. in[-1] is
// replicated from in[0], inputs past in[sz] are replaced with in[sz].
function ipred_z2_upsample_edge_16bpc_neon, export=1
        dup             v30.8h,  w3               // bitdepth_max
        // Here, sz is 4 or 8, and we produce 2*sz+1 output elements.
        movrel          x4,  padding_mask
        ld1             {v0.8h, v1.8h}, [x2]      // in[]
        add             x5,  x2,  w1,  uxtw #1    // in[sz]
        sub             x4,  x4,  w1,  uxtw #1    // padding_mask - 2*sz

        ld1r            {v3.8h},  [x2]            // in[0] for padding
        ld1r            {v2.8h},  [x5]            // padding
        ld1             {v4.8h, v5.8h}, [x4]      // padding_mask

        movi            v31.8h,  #9

        bit             v0.16b,  v2.16b,  v4.16b  // padded in[]
        bit             v1.16b,  v2.16b,  v5.16b

        ext             v4.16b,  v3.16b,  v0.16b,  #14 // in[i-1] (in[0] on the left)
        ext             v5.16b,  v0.16b,  v1.16b,  #2  // in[i+1]
        ext             v6.16b,  v0.16b,  v1.16b,  #4  // in[i+2]

        add             v16.8h,  v0.8h,   v5.8h   // in[i+0] + in[i+1]
        add             v17.8h,  v4.8h,   v6.8h   // in[i-1] + in[i+2]
        umull           v18.4s,  v16.4h,  v31.4h  // 9*(in[i+0] + in[i+1])
        umull2          v19.4s,  v16.8h,  v31.8h
        usubw           v18.4s,  v18.4s,  v17.4h  // - (in[i-1] + in[i+2])
        usubw2          v19.4s,  v19.4s,  v17.8h

        // Rounding narrow with unsigned saturation clamps negatives to 0.
        sqrshrun        v16.4h,  v18.4s,  #4
        sqrshrun2       v16.8h,  v19.4s,  #4

        add             x5,  x0,  #2*16           // &out[16]

        smin            v16.8h,  v16.8h,  v30.8h  // clamp to bitdepth_max

        // Interleave source pixels with the filtered values.
        zip1            v4.8h,   v0.8h,   v16.8h
        zip2            v5.8h,   v0.8h,   v16.8h

        // In case sz=8, output one single (padding) pixel in out[16].
        st1             {v2.h}[0], [x5]
        st1             {v4.8h, v5.8h}, [x0]

        ret
endfunc
   1546 
// 3-tap edge smoothing kernels for strength 1 and 2, stored as
// .short {0, outer, center, 0} so that loading two shorts from
// edge_filter + 2*((strength - 1)*4 + 1) yields {outer, center}.
// Taps are applied as outer, center, outer and sum to 16
// (4+8+4 resp. 5+6+5), matching the >> 4 normalization.
const edge_filter
        .short 0, 4, 8, 0
        .short 0, 5, 6, 0
// Leaving out the coeffs for strength=3
//      .byte 2, 4, 4, 0
endconst
   1553 
// void ipred_z1_filter_edge_16bpc_neon(pixel *out, const int sz,
//                                      const pixel *const in, const int end,
//                                      const int strength);
// Smooth the prediction edge with a symmetric low-pass filter:
// strength 1-2 use a 3-tap kernel from edge_filter, strength 3 uses a
// 5-tap (2,4,4,4,2)/16 kernel. in[end] is the last valid input pixel;
// lanes past it are replaced by replicating it (padding_mask), and once
// only padding remains, the padding pixel is stored directly.
function ipred_z1_filter_edge_16bpc_neon, export=1
        cmp             w4, #3
        b.eq            L(fivetap)                // if (strength == 3) goto fivetap

        movrel          x5,  edge_filter, -6
        add             x5,  x5,  w4,  uxtw #3    // edge_filter + 2*((strength - 1)*4 + 1)

        ld1             {v31.s}[0], [x5]          // kernel[1-2]

        ld1             {v0.8h}, [x2], #16

        dup             v30.8h, v31.h[0]          // outer tap
        dup             v31.8h, v31.h[1]          // center tap
1:
        // in[end], is the last valid pixel. We produce 16 pixels out by
        // using 18 pixels in - the last pixel used is [17] of the ones
        // read/buffered.
        cmp             w3,  #17
        ld1             {v1.8h, v2.8h}, [x2], #32
        b.lt            2f
        ext             v3.16b,  v0.16b,  v1.16b,  #2  // in[i+1]
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4  // in[i+2]
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        // out[i] = (outer*in[i] + center*in[i+1] + outer*in[i+2] + 8) >> 4
        mul             v16.8h,  v0.8h,   v30.8h
        mla             v16.8h,  v3.8h,   v31.8h
        mla             v16.8h,  v5.8h,   v30.8h
        mul             v17.8h,  v1.8h,   v30.8h
        mla             v17.8h,  v4.8h,   v31.8h
        mla             v17.8h,  v6.8h,   v30.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b           // carry last vector to next iter
        urshr           v16.8h,  v16.8h,  #4
        urshr           v17.8h,  v17.8h,  #4
        sub             w3,  w3,  #16
        st1             {v16.8h, v17.8h}, [x0], #32
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3-24] is the padding pixel (x2 points 24 pixels ahead)
        movrel          x5,  padding_mask
        sub             w6,  w3,  #24
        sub             x5,  x5,  w3,  uxtw #1
        add             x6,  x2,  w6,  sxtw #1

        ld1             {v3.8h, v4.8h}, [x5] // padding_mask

        ld1r            {v2.8h}, [x6]             // padding pixel, replicated
        bit             v0.16b,  v2.16b,  v3.16b  // Pad v0-v1
        bit             v1.16b,  v2.16b,  v4.16b

        // Filter one block
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        mul             v16.8h,  v0.8h,   v30.8h
        mla             v16.8h,  v3.8h,   v31.8h
        mla             v16.8h,  v5.8h,   v30.8h
        mul             v17.8h,  v1.8h,   v30.8h
        mla             v17.8h,  v4.8h,   v31.8h
        mla             v17.8h,  v6.8h,   v30.8h
        subs            w1,  w1,  #16
        urshr           v16.8h,  v16.8h,  #4
        urshr           v17.8h,  v17.8h,  #4
        st1             {v16.8h, v17.8h}, [x0], #32
        b.le            9f
5:
        // After one block, any remaining output would only be filtering
        // padding - thus just store the padding.
        subs            w1,  w1,  #16
        st1             {v2.16b}, [x0], #16
        b.gt            5b
9:
        ret

L(fivetap):
        // strength == 3: 5-tap (2,4,4,4,2)/16 kernel.
        sub             x2,  x2,  #2              // topleft -= 1 pixel
        movi            v29.8h, #2
        ld1             {v0.8h}, [x2], #16
        movi            v30.8h, #4
        movi            v31.8h, #4
        ins             v0.h[0], v0.h[1]          // left-pad: replicate the first pixel
1:
        // in[end+1], is the last valid pixel. We produce 16 pixels out by
        // using 20 pixels in - the last pixel used is [19] of the ones
        // read/buffered.
        cmp             w3,  #18
        ld1             {v1.8h, v2.8h}, [x2], #32
        b.lt            2f                        // if (end + 1 < 19)
        ext             v3.16b,  v0.16b,  v1.16b,  #2  // in[i+1]
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4  // in[i+2]
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6  // in[i+3]
        ext             v17.16b, v1.16b,  v2.16b,  #6
        ext             v18.16b, v0.16b,  v1.16b,  #8  // in[i+4]
        ext             v19.16b, v1.16b,  v2.16b,  #8
        // out[i] = (2*in[i] + 4*in[i+1] + 4*in[i+2] + 4*in[i+3] + 2*in[i+4] + 8) >> 4
        mul             v20.8h,  v0.8h,   v29.8h
        mla             v20.8h,  v3.8h,   v30.8h
        mla             v20.8h,  v5.8h,   v31.8h
        mla             v20.8h,  v16.8h,  v30.8h
        mla             v20.8h,  v18.8h,  v29.8h
        mul             v21.8h,  v1.8h,   v29.8h
        mla             v21.8h,  v4.8h,   v30.8h
        mla             v21.8h,  v6.8h,   v31.8h
        mla             v21.8h,  v17.8h,  v30.8h
        mla             v21.8h,  v19.8h,  v29.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        urshr           v20.8h,  v20.8h,  #4
        urshr           v21.8h,  v21.8h,  #4
        sub             w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        b.gt            1b
        ret
2:
        // Right padding

        // x2[w3+1-24] is the padding pixel (x2 points 24 pixels ahead)
        movrel          x5,  padding_mask, -2
        sub             w6,  w3,  #23
        sub             x5,  x5,  w3,  uxtw #1
        add             x6,  x2,  w6,  sxtw #1

        ld1             {v3.8h, v4.8h, v5.8h}, [x5] // padding_mask

        ld1r            {v28.8h}, [x6]            // padding pixel, replicated
        bit             v0.16b,  v28.16b, v3.16b  // Pad v0-v2
        bit             v1.16b,  v28.16b, v4.16b
        bit             v2.16b,  v28.16b, v5.16b
4:
        // Filter one block
        ext             v3.16b,  v0.16b,  v1.16b,  #2
        ext             v4.16b,  v1.16b,  v2.16b,  #2
        ext             v5.16b,  v0.16b,  v1.16b,  #4
        ext             v6.16b,  v1.16b,  v2.16b,  #4
        ext             v16.16b, v0.16b,  v1.16b,  #6
        ext             v17.16b, v1.16b,  v2.16b,  #6
        ext             v18.16b, v0.16b,  v1.16b,  #8
        ext             v19.16b, v1.16b,  v2.16b,  #8
        mul             v20.8h,  v0.8h,   v29.8h
        mla             v20.8h,  v3.8h,   v30.8h
        mla             v20.8h,  v5.8h,   v31.8h
        mla             v20.8h,  v16.8h,  v30.8h
        mla             v20.8h,  v18.8h,  v29.8h
        mul             v21.8h,  v1.8h,   v29.8h
        mla             v21.8h,  v4.8h,   v30.8h
        mla             v21.8h,  v6.8h,   v31.8h
        mla             v21.8h,  v17.8h,  v30.8h
        mla             v21.8h,  v19.8h,  v29.8h
        subs            w1,  w1,  #16
        mov             v0.16b,  v2.16b
        mov             v1.16b,  v28.16b          // refill with padding
        mov             v2.16b,  v28.16b
        urshr           v20.8h,  v20.8h,  #4
        urshr           v21.8h,  v21.8h,  #4
        sub             w3,  w3,  #16
        st1             {v20.8h, v21.8h}, [x0], #32
        b.le            9f
        // v0-v1[w3+1] is the last valid pixel; if (w3 + 1 > 0) we need to
        // filter properly once more - aka (w3 >= 0).
        cmp             w3,  #0
        b.ge            4b
5:
        // When w3 <= 0, all remaining pixels in v0-v1 are equal to the
        // last valid pixel - thus just output that without filtering.
        subs            w1,  w1,  #8
        st1             {v28.8h}, [x0], #16
        b.gt            5b
9:
        ret
endfunc
   1732 
   1733 // void ipred_pixel_set_16bpc_neon(pixel *out, const pixel px,
   1734 //                                 const int n);
   1735 function ipred_pixel_set_16bpc_neon, export=1
   1736        dup             v0.8h,   w1
   1737 1:
   1738        subs            w2,  w2,  #8
   1739        st1             {v0.8h}, [x0], #16
   1740        b.gt            1b
   1741        ret
   1742 endfunc
   1743 
// void ipred_z1_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
//                                const pixel *const top,
//                                const int width, const int height,
//                                const int dx, const int max_base_x);
// Directional (z1) prediction: xpos starts at dx and advances by dx per
// row; base = xpos >> 6 and frac = xpos & 0x3e select/blend two adjacent
// edge pixels: dst = (top[base]*(64-frac) + top[base+1]*frac + 32) >> 6.
// Once base >= max_base_x, all remaining rows are filled with
// top[max_base_x]. Two rows are produced per loop iteration.
function ipred_z1_fill1_16bpc_neon, export=1
        clz             w9,  w3
        movrel          x8,  ipred_z1_fill1_tbl
        sub             w9,  w9,  #25             // table index from clz(width)
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x10, x2,  w6,  uxtw #1    // top[max_base_x]
        add             x8,  x8,  x9
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5                   // xpos = dx
        mov             w15, #64
        br              x8
40:     // w == 4
        AARCH64_VALID_JUMP_TARGET
4:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            49f
        lsl             w8,  w8,  #1              // byte offsets (2 bytes/pixel)
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // top[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.4h,   w9               // frac
        dup             v5.4h,   w11
        ext             v1.16b,  v0.16b,  v0.16b,  #2 // top[base+1]
        ext             v3.16b,  v2.16b,  v2.16b,  #2
        // Blend via top[base]*64 + (top[base+1]-top[base])*frac; the
        // difference is signed, hence smlal.
        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + top[base+1]*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        st1             {v16.4h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.4h}, [x0], x1
        b.gt            4b
        ret

49:     // Remaining rows are all padding.
        st1             {v31.4h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.4h}, [x0], x1
        b.gt            49b
        ret

80:     // w == 8
        AARCH64_VALID_JUMP_TARGET
8:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            89f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h},  [x8]            // top[base]
        ld1             {v2.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        ldr             h1, [x8, #16]             // 9th pixel, for top[base+1]
        ldr             h3, [x10, #16]
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        ext             v1.16b,  v0.16b,  v1.16b,  #2 // top[base+1]
        ext             v3.16b,  v2.16b,  v3.16b,  #2
        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
        umlal           v16.4s,  v1.4h,   v4.4h   // + top[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v1.8h,   v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v3.4h,   v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v3.8h,   v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        st1             {v16.8h}, [x0], x1
        add             w7,  w7,  w5              // xpos += dx
        subs            w4,  w4,  #2
        st1             {v17.8h}, [x0], x1
        b.gt            8b
        ret

89:     // Remaining rows are all padding.
        st1             {v31.8h}, [x0], x1
        subs            w4,  w4,  #2
        st1             {v31.8h}, [x0], x1
        b.gt            89b
        ret

160:    // w == 16, 32 and 64 share one looped implementation.
320:
640:
        AARCH64_VALID_JUMP_TARGET

        mov             w12, w3                   // save width

        add             x13, x0,  x1              // second-row pointer
        lsl             x1,  x1,  #1
        sub             x1,  x1,  w3,  uxtw #1    // 2*stride - 2*width
1:
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // xpos += dx
        cmp             w8,  w6                   // base >= max_base_x
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            169f
        add             x8,  x2,  w8,  uxtw #1
        add             x10, x2,  w10, uxtw #1
        dup             v6.8h,   w9               // frac
        dup             v7.8h,   w11
        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // top[base]
        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v16.8h,  w9               // 64 - frac
        dup             v17.8h,  w11
        add             w7,  w7,  w5              // xpos += dx
2:      // Inner loop: 16 pixels per iteration for each of the two rows.
        ext             v18.16b, v0.16b,  v1.16b,  #2 // top[base+1]
        ext             v19.16b, v1.16b,  v2.16b,  #2
        ext             v20.16b, v3.16b,  v4.16b,  #2
        ext             v21.16b, v4.16b,  v5.16b,  #2
        subs            w3,  w3,  #16
        umull           v22.4s,  v0.4h,   v16.4h  // top[base]*(64-frac)
        umlal           v22.4s,  v18.4h,  v6.4h   // + top[base+1]*frac
        umull2          v23.4s,  v0.8h,   v16.8h
        umlal2          v23.4s,  v18.8h,  v6.8h
        umull           v24.4s,  v1.4h,   v16.4h
        umlal           v24.4s,  v19.4h,  v6.4h
        umull2          v25.4s,  v1.8h,   v16.8h
        umlal2          v25.4s,  v19.8h,  v6.8h
        umull           v26.4s,  v3.4h,   v17.4h
        umlal           v26.4s,  v20.4h,  v7.4h
        umull2          v27.4s,  v3.8h,   v17.8h
        umlal2          v27.4s,  v20.8h,  v7.8h
        umull           v28.4s,  v4.4h,   v17.4h
        umlal           v28.4s,  v21.4h,  v7.4h
        umull2          v29.4s,  v4.8h,   v17.8h
        umlal2          v29.4s,  v21.8h,  v7.8h
        rshrn           v22.4h,  v22.4s,  #6
        rshrn2          v22.8h,  v23.4s,  #6
        rshrn           v23.4h,  v24.4s,  #6
        rshrn2          v23.8h,  v25.4s,  #6
        rshrn           v24.4h,  v26.4s,  #6
        rshrn2          v24.8h,  v27.4s,  #6
        rshrn           v25.4h,  v28.4s,  #6
        rshrn2          v25.8h,  v29.4s,  #6
        st1             {v22.8h, v23.8h}, [x0],  #32
        st1             {v24.8h, v25.8h}, [x13], #32
        b.le            3f
        // Shift buffered pixels down and refill for the next 16 columns.
        mov             v0.16b,  v2.16b
        ld1             {v1.8h, v2.8h}, [x8],  #32 // top[base]
        mov             v3.16b,  v5.16b
        ld1             {v4.8h, v5.8h}, [x10], #32
        b               2b

3:      // Advance to the next row pair.
        subs            w4,  w4,  #2
        b.le            9f
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12                  // restore width
        b               1b
9:
        ret

169:    // Remaining rows are all padding.
        st1             {v31.8h}, [x0],  #16
        subs            w3,  w3,  #8
        st1             {v31.8h}, [x13], #16
        b.gt            169b
        subs            w4,  w4,  #2
        b.le            9b
        add             x0,  x0,  x1
        add             x13, x13, x1
        mov             w3,  w12
        b               169b
endfunc
   1939 
// Branch table for ipred_z1_fill1_16bpc_neon: signed 32-bit offsets
// relative to the table base, indexed by clz(width)-25
// (entries ordered w=64,32,16,8,4), resolved via ldrsw + add + br.
jumptable ipred_z1_fill1_tbl
        .word 640b - ipred_z1_fill1_tbl
        .word 320b - ipred_z1_fill1_tbl
        .word 160b - ipred_z1_fill1_tbl
        .word 80b  - ipred_z1_fill1_tbl
        .word 40b  - ipred_z1_fill1_tbl
endjumptable
   1947 
   1948 function ipred_z1_fill2_16bpc_neon, export=1
   1949        cmp             w3,  #8
   1950        add             x10, x2,  w6,  uxtw       // top[max_base_x]
   1951        ld1r            {v31.16b}, [x10]          // padding
   1952        mov             w7,  w5
   1953        mov             w15, #64
   1954        b.eq            8f
   1955 
   1956 4:      // w == 4
   1957        lsr             w8,  w7,  #6              // base
   1958        and             w9,  w7,  #0x3e           // frac
   1959        add             w7,  w7,  w5              // xpos += dx
   1960        cmp             w8,  w6                   // base >= max_base_x
   1961        lsr             w10, w7,  #6              // base
   1962        and             w11, w7,  #0x3e           // frac
   1963        b.ge            49f
   1964        lsl             w8,  w8,  #1
   1965        lsl             w10, w10, #1
   1966        ldr             q0,  [x2, w8, uxtw]       // top[base]
   1967        ldr             q2,  [x2, w10, uxtw]
   1968        dup             v4.4h,   w9               // frac
   1969        dup             v5.4h,   w11
   1970        uzp2            v1.8h,   v0.8h,   v0.8h   // top[base+1]
   1971        uzp1            v0.8h,   v0.8h,   v0.8h   // top[base]
   1972        uzp2            v3.8h,   v2.8h,   v2.8h
   1973        uzp1            v2.8h,   v2.8h,   v2.8h
   1974        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
   1975        sub             v7.4h,   v3.4h,   v2.4h
   1976        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
   1977        ushll           v17.4s,  v2.4h,   #6
   1978        smlal           v16.4s,  v6.4h,   v4.4h   // + top[base+1]*frac
   1979        smlal           v17.4s,  v7.4h,   v5.4h
   1980        rshrn           v16.4h,  v16.4s,  #6
   1981        rshrn           v17.4h,  v17.4s,  #6
   1982        st1             {v16.4h}, [x0], x1
   1983        add             w7,  w7,  w5              // xpos += dx
   1984        subs            w4,  w4,  #2
   1985        st1             {v17.4h}, [x0], x1
   1986        b.gt            4b
   1987        ret
   1988 
   1989 49:
   1990        st1             {v31.4h}, [x0], x1
   1991        subs            w4,  w4,  #2
   1992        st1             {v31.4h}, [x0], x1
   1993        b.gt            49b
   1994        ret
   1995 
   1996 8:      // w == 8
   1997        lsr             w8,  w7,  #6              // base
   1998        and             w9,  w7,  #0x3e           // frac
   1999        add             w7,  w7,  w5              // xpos += dx
   2000        cmp             w8,  w6                   // base >= max_base_x
   2001        lsr             w10, w7,  #6              // base
   2002        and             w11, w7,  #0x3e           // frac
   2003        b.ge            89f
   2004        add             x8,  x2,  w8,  uxtw #1
   2005        add             x10, x2,  w10, uxtw #1
   2006        dup             v4.8h,   w9               // frac
   2007        dup             v5.8h,   w11
   2008        ld1             {v0.8h, v1.8h},  [x8]     // top[base]
   2009        ld1             {v2.8h, v3.8h},  [x10]
   2010        sub             w9,  w15, w9              // 64 - frac
   2011        sub             w11, w15, w11
   2012        dup             v6.8h,   w9               // 64 - frac
   2013        dup             v7.8h,   w11
   2014        uzp2            v20.8h,  v0.8h,   v1.8h   // top[base+1]
   2015        uzp1            v0.8h,   v0.8h,   v1.8h   // top[base]
   2016        uzp2            v21.8h,  v2.8h,   v3.8h
   2017        uzp1            v2.8h,   v2.8h,   v3.8h
   2018        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
   2019        umlal           v16.4s,  v20.4h,  v4.4h   // + top[base+1]*frac
   2020        umull2          v17.4s,  v0.8h,   v6.8h
   2021        umlal2          v17.4s,  v20.8h,  v4.8h
   2022        umull           v18.4s,  v2.4h,   v7.4h
   2023        umlal           v18.4s,  v21.4h,  v5.4h
   2024        umull2          v19.4s,  v2.8h,   v7.8h
   2025        umlal2          v19.4s,  v21.8h,  v5.8h
   2026        rshrn           v16.4h,  v16.4s,  #6
   2027        rshrn2          v16.8h,  v17.4s,  #6
   2028        rshrn           v17.4h,  v18.4s,  #6
   2029        rshrn2          v17.8h,  v19.4s,  #6
   2030        st1             {v16.8h}, [x0], x1
   2031        add             w7,  w7,  w5              // xpos += dx
   2032        subs            w4,  w4,  #2
   2033        st1             {v17.8h}, [x0], x1
   2034        b.gt            8b
   2035        ret
   2036 
   2037 89:
   2038        st1             {v31.8h}, [x0], x1
   2039        subs            w4,  w4,  #2
   2040        st1             {v31.8h}, [x0], x1
   2041        b.gt            89b
   2042        ret
   2043 endfunc
   2044 
   2045 // void ipred_reverse_16bpc_neon(pixel *dst, const pixel *const src,
   2046 //                               const int n);
   2047 function ipred_reverse_16bpc_neon, export=1
   2048        sub             x1,  x1,  #16             // x1 = src - 8 pixels; each load window is src[-8..-1]
   2049        add             x3,  x0,  #8              // x3 = dst + 4 pixels; second store pointer
   2050        mov             x4,  #16                  // post-increment stride: 8 pixels (16 bytes) per iteration
   2051 1:
   2052        ld1             {v0.8h}, [x1]             // v0 = src[-8..-1]
   2053        subs            w2,  w2,  #8              // n -= 8; 8 pixels reversed per iteration
   2054        rev64           v0.8h,  v0.8h             // reverse the 4 pixels within each 64-bit half
   2055        sub             x1,  x1,  #16             // step src back by another 8 pixels
   2056        st1             {v0.d}[1], [x0], x4       // dst[0..3] = src[-1..-4]
   2057        st1             {v0.d}[0], [x3], x4       // dst[4..7] = src[-5..-8]
   2058        b.gt            1b                        // loop while n > 0 (n assumed a multiple of 8)
   2059        ret
   2060 endfunc
   2061 
   2062 const increments
   2063        .short          0,  1,  2,  3,  4,  5,  6,  7 // per-lane indices {0..7}; loaded into v31 to compute per-column positions
   2064 endconst
   2065 
   2066 // void ipred_z2_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
   2067 //                                const pixel *const top,
   2068 //                                const pixel *const left,
   2069 //                                const int width, const int height,
   2070 //                                const int dx, const int dy);
   2071 function ipred_z2_fill1_16bpc_neon, export=1
   2072        clz             w10, w4
   2073        movrel          x9,  ipred_z2_fill1_tbl
   2074        sub             w10, w10, #25
   2075        ldrsw           x10, [x9, w10, uxtw #2]
   2076        mov             w8,  #(1 << 6)            // xpos = 1 << 6
   2077        add             x9,  x9,  x10
   2078        sub             w8,  w8,  w6              // xpos -= dx
   2079 
   2080        movrel          x11, increments
   2081        ld1             {v31.8h},  [x11]          // increments
   2082        neg             w7,  w7                   // -dy
   2083 
   2084        br              x9
   2085 40:
   2086        AARCH64_VALID_JUMP_TARGET
   2087 
   2088        dup             v30.4h,  w7               // -dy
   2089        movi            v17.8b,  #1
   2090 
   2091        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
   2092        movi            v25.8h,  #0x3e
   2093        add             v30.4h,  v16.4h,  v30.4h  // -= dy
   2094 
   2095        // Worst case height for w=4 is 16, but we need at least h+1 elements
   2096        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]
   2097 
   2098        movi            v26.8h,  #64
   2099        movi            v19.16b, #4
   2100 
   2101        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
   2102        and             v27.8b,  v30.8b,  v25.8b  // frac_y
   2103 
   2104        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1
   2105 
   2106        movi            v23.4h,  #1, lsl #8
   2107        shl             v29.8b,  v29.8b,  #1      // 2*base_y
   2108        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
   2109        movi            v17.8b,  #2
   2110        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
   2111 
   2112        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
   2113        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
   2114 
   2115        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]
   2116 
   2117        trn1            v30.2d,  v30.2d,  v28.2d  // base_y + 1, base_y + 2
   2118 
   2119        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
   2120 
   2121        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
   2122 
   2123        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
   2124        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
   2125 
   2126        movi            v29.16b, #4
   2127 4:
   2128        asr             w9,  w8,  #6              // base_x
   2129        dup             v16.4h,  w8               // xpos
   2130        sub             w8,  w8,  w6              // xpos -= dx
   2131        cmp             w9,  #-4                  // base_x <= -4
   2132        asr             w11, w8,  #6              // base_x
   2133        b.le            49f
   2134 
   2135        lsl             w9,  w9,  #1
   2136        lsl             w11, w11, #1
   2137 
   2138        dup             v17.4h,  w8               // xpos
   2139 
   2140        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
   2141        ldr             q6,  [x2, w11, sxtw]
   2142 
   2143        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
   2144 
   2145        // Cut corners here; only doing tbl over v0-v1 here; we only
   2146        // seem to need the last pixel, from v2, after skipping to the
   2147        // left-only codepath below.
   2148        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]
   2149 
   2150        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
   2151 
   2152        ext             v5.16b,  v4.16b,  v4.16b,  #2 // top[base_x+1]
   2153        ext             v7.16b,  v6.16b,  v6.16b,  #2
   2154 
   2155        and             v16.16b, v16.16b, v25.16b // frac_x
   2156 
   2157        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
   2158 
   2159        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
   2160        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
   2161 
   2162        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
   2163 
   2164        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
   2165 
   2166        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   2167        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   2168        umull2          v22.4s,  v18.8h,  v28.8h
   2169        umlal2          v22.4s,  v19.8h,  v27.8h
   2170 
   2171        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]-*(64-frac_x)
   2172        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
   2173        umull2          v24.4s,  v4.8h,   v17.8h
   2174        umlal2          v24.4s,  v5.8h,   v16.8h
   2175 
   2176        cmge            v20.8h,  v20.8h,  #0
   2177 
   2178        rshrn           v21.4h,  v21.4s,  #6
   2179        rshrn2          v21.8h,  v22.4s,  #6
   2180        rshrn           v22.4h,  v23.4s,  #6
   2181        rshrn2          v22.8h,  v24.4s,  #6
   2182 
   2183        bit             v21.16b, v22.16b, v20.16b
   2184 
   2185        st1             {v21.d}[0], [x0], x1
   2186        sub             w8,  w8,  w6              // xpos -= dx
   2187        subs            w5,  w5,  #2
   2188        st1             {v21.d}[1], [x0], x1
   2189        b.le            9f
   2190 
   2191        ext             v18.16b, v19.16b, v19.16b, #8
   2192        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
   2193        b               4b
   2194 
   2195 49:
   2196        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+2]
   2197 
   2198        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]
   2199 
   2200        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   2201        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   2202        umull2          v21.4s,  v18.8h,  v28.8h
   2203        umlal2          v21.4s,  v19.8h,  v27.8h
   2204 
   2205        rshrn           v20.4h,  v20.4s,  #6
   2206        rshrn2          v20.8h,  v21.4s,  #6
   2207 
   2208        st1             {v20.d}[0], [x0], x1
   2209        subs            w5,  w5,  #2
   2210        st1             {v20.d}[1], [x0], x1
   2211        b.le            9f
   2212 
   2213        ext             v18.16b, v19.16b, v19.16b, #8
   2214        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
   2215        b               49b
   2216 
   2217 9:
   2218        ret
   2219 
   2220 80:
   2221        AARCH64_VALID_JUMP_TARGET
   2222 
   2223        stp             d8,  d9,  [sp, #-0x40]!
   2224        stp             d10, d11, [sp, #0x10]
   2225        stp             d12, d13, [sp, #0x20]
   2226        stp             d14, d15, [sp, #0x30]
   2227 
   2228        dup             v18.8h,  w7               // -dy
   2229        add             x3,  x3,  #2              // Skip past left[0]
   2230 
   2231        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
   2232        movi            v25.8h,  #0x3e
   2233        add             v16.8h,  v16.8h,  v18.8h  // -= dy
   2234 
   2235        // Worst case height for w=8 is 32.
   2236        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
   2237        ld1r            {v15.8h}, [x2]            // left[0] == top[0]
   2238 
   2239        movi            v26.8h,  #64
   2240        movi            v19.16b, #4
   2241 
   2242        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
   2243        and             v27.16b, v16.16b, v25.16b // frac_y
   2244 
   2245        movi            v23.8h,  #1, lsl #8
   2246        shl             v29.8b,  v29.8b,  #1      // 2*base_y
   2247        mov             v18.16b, v15.16b          // left[0]
   2248        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
   2249        movi            v17.16b, #2
   2250        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
   2251 
   2252        // Cut corners here; for the first row we don't expect to need to
   2253        // read outside of v0.
   2254        tbx             v18.16b, {v0.16b}, v29.16b // left[base_y]
   2255 
   2256        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
   2257        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)
   2258 
   2259        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
   2260 
   2261        movi            v24.16b, #4
   2262 8:
   2263        asr             w9,  w8,  #6              // base_x
   2264        dup             v16.8h,   w8              // xpos
   2265        sub             w8,  w8,  w6              // xpos -= dx
   2266        cmp             w9,  #-16                 // base_x <= -16
   2267        asr             w11, w8,  #6              // base_x
   2268        b.le            89f
   2269 
   2270        dup             v17.8h,   w8              // xpos
   2271 
   2272        add             x9,  x2,  w9,  sxtw #1
   2273        add             x11, x2,  w11, sxtw #1
   2274 
   2275        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
   2276        mov             v19.16b, v15.16b          // left[0]
   2277        ld1             {v6.8h, v7.8h}, [x11]
   2278 
   2279        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
   2280 
   2281        mov             v20.16b, v15.16b          // left[0]
   2282 
   2283        sshr            v21.8h,  v16.8h,  #6      // first base_x
   2284        sshr            v22.8h,  v17.8h,  #6
   2285 
   2286        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
   2287 
   2288        ext             v5.16b,  v4.16b,  v5.16b,  #2 // top[base_x+1]
   2289        ext             v7.16b,  v6.16b,  v7.16b,  #2
   2290 
   2291        and             v16.16b, v16.16b, v25.16b // frac_x
   2292        and             v17.16b, v17.16b, v25.16b
   2293 
   2294        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   2295        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   2296 
   2297        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
   2298        sub             v9.8h,   v26.8h,  v17.8h
   2299 
   2300        umull2          v11.4s,  v18.8h,  v28.8h
   2301        umlal2          v11.4s,  v19.8h,  v27.8h
   2302 
   2303        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
   2304        add             v22.8h,  v22.8h,  v31.8h
   2305 
   2306        umull           v12.4s,  v19.4h,  v28.4h
   2307        umlal           v12.4s,  v20.4h,  v27.4h
   2308        umull2          v13.4s,  v19.8h,  v28.8h
   2309        umlal2          v13.4s,  v20.8h,  v27.8h
   2310 
   2311        rshrn           v10.4h,  v10.4s,  #6
   2312        rshrn2          v10.8h,  v11.4s,  #6
   2313        rshrn           v11.4h,  v12.4s,  #6
   2314        rshrn2          v11.8h,  v13.4s,  #6
   2315 
   2316        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]-*(64-frac_x)
   2317        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
   2318        umull2          v13.4s,  v4.8h,   v8.8h
   2319        umlal2          v13.4s,  v5.8h,   v16.8h
   2320        umull           v14.4s,  v6.4h,   v9.4h
   2321        umlal           v14.4s,  v7.4h,   v17.4h
   2322        umull2          v18.4s,  v6.8h,   v9.8h
   2323        umlal2          v18.4s,  v7.8h,   v17.8h
   2324 
   2325        cmge            v21.8h,  v21.8h,  #0
   2326        cmge            v22.8h,  v22.8h,  #0
   2327 
   2328        rshrn           v12.4h,  v12.4s,  #6
   2329        rshrn2          v12.8h,  v13.4s,  #6
   2330        rshrn           v13.4h,  v14.4s,  #6
   2331        rshrn2          v13.8h,  v18.4s,  #6
   2332 
   2333        bit             v10.16b, v12.16b, v21.16b
   2334        bit             v11.16b, v13.16b, v22.16b
   2335 
   2336        st1             {v10.8h}, [x0], x1
   2337        subs            w5,  w5,  #2
   2338        sub             w8,  w8,  w6              // xpos -= dx
   2339        st1             {v11.8h}, [x0], x1
   2340        b.le            9f
   2341 
   2342        mov             v18.16b, v20.16b
   2343        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
   2344        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
   2345        b               8b
   2346 
   2347 89:
   2348        mov             v19.16b, v15.16b
   2349        mov             v20.16b, v15.16b
   2350        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
   2351        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v30.16b // left[base_y+2]
   2352 
   2353        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   2354        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   2355        umull2          v5.4s,   v18.8h,  v28.8h
   2356        umlal2          v5.4s,   v19.8h,  v27.8h
   2357        umull           v6.4s,   v19.4h,  v28.4h
   2358        umlal           v6.4s,   v20.4h,  v27.4h
   2359        umull2          v7.4s,   v19.8h,  v28.8h
   2360        umlal2          v7.4s,   v20.8h,  v27.8h
   2361 
   2362        rshrn           v4.4h,   v4.4s,   #6
   2363        rshrn2          v4.8h,   v5.4s,   #6
   2364        rshrn           v5.4h,   v6.4s,   #6
   2365        rshrn2          v5.8h,   v7.4s,   #6
   2366 
   2367        st1             {v4.8h}, [x0], x1
   2368        subs            w5,  w5,  #2
   2369        st1             {v5.8h}, [x0], x1
   2370        b.le            9f
   2371 
   2372        mov             v18.16b, v20.16b
   2373        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
   2374        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
   2375        b               89b
   2376 
   2377 9:
   2378        ldp             d14, d15, [sp, #0x30]
   2379        ldp             d12, d13, [sp, #0x20]
   2380        ldp             d10, d11, [sp, #0x10]
   2381        ldp             d8,  d9,  [sp], 0x40
   2382        ret
   2383 
   2384 160:
   2385 320:
   2386 640:
   2387        AARCH64_VALID_JUMP_TARGET
   2388 
   2389        stp             d8,  d9,  [sp, #-0x40]!
   2390        stp             d10, d11, [sp, #0x10]
   2391        stp             d12, d13, [sp, #0x20]
   2392        stp             d14, d15, [sp, #0x30]
   2393 
   2394        dup             v25.8h,  w7               // -dy
   2395        add             x3,  x3,  #2              // Skip past left[0]
   2396 
   2397        add             x13, x0,  x1              // alternating row
   2398        lsl             x1,  x1,  #1              // stride *= 2
   2399        sub             x1,  x1,  w4,  uxtw #1    // stride -= width
   2400 
   2401        movi            v11.8h,  #8
   2402        mul             v26.8h,  v31.8h,  v25.8h  // {0,1,2,3,4,5,6,7}* -dy
   2403        add             v26.8h,  v26.8h,  v25.8h  // -= dy
   2404        mul             v25.8h,  v25.8h,  v11.8h  // -8*dy
   2405 
   2406        // Worst case height is 64, but we can only fit 32 pixels into
   2407        // v0-v3 usable within one tbx instruction. As long as base_y is
   2408        // up to 32, we use tbx.
   2409        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x3] // left[]
   2410        ld1r            {v15.8h}, [x2]            // left[0] == top[0]
   2411 
   2412        mov             w12, w4                   // orig w
   2413        neg             w14, w4                   // -w
   2414 
   2415 1:
   2416        mov             v23.16b, v26.16b          // reset ypos
   2417 
   2418        asr             w9,  w8,  #6              // base_x
   2419        dup             v16.8h,   w8              // xpos
   2420        sub             w8,  w8,  w6              // xpos -= dx
   2421        cmp             w9,  w14                  // base_x <= -2*w
   2422        asr             w11, w8,  #6              // base_x
   2423        b.le            169f
   2424 
   2425        dup             v17.8h,   w8              // xpos
   2426        sub             w8,  w8,  w6              // xpos -= dx
   2427 
   2428        add             x9,  x2,  w9,  sxtw #1
   2429        add             x11, x2,  w11, sxtw #1
   2430 
   2431        sshr            v21.8h,  v16.8h,  #6      // first base_x
   2432        sshr            v22.8h,  v17.8h,  #6
   2433 
   2434        ld1             {v4.8h}, [x9], #16        // top[base_x]
   2435        ld1             {v6.8h}, [x11], #16
   2436 
   2437        movi            v10.8h,  #0x3e
   2438        movi            v11.8h,  #64
   2439 
   2440        and             v16.16b, v16.16b, v10.16b // frac_x
   2441        and             v17.16b, v17.16b, v10.16b
   2442 
   2443        sub             v8.8h,   v11.8h,  v16.8h  // 64 - frac_x
   2444        sub             v9.8h,   v11.8h,  v17.8h
   2445 
   2446        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
   2447        add             v22.8h,  v22.8h,  v31.8h
   2448 
   2449 2:
   2450        smov            w10,     v22.h[0]
   2451 
   2452        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
   2453        movi            v12.8h,  #64
   2454        cmp             w10, #0                   // base_x (bottom left) >= 0
   2455        smov            w10,     v29.b[0]         // base_y[0]
   2456        movi            v10.8h,  #0x3e
   2457 
   2458        b.ge            4f
   2459        and             v27.16b, v23.16b, v10.16b // frac_y
   2460        cmp             w10,     #(32-3)
   2461 
   2462        mov             v18.16b, v15.16b          // left[0]
   2463        sub             v28.8h,  v12.8h,  v27.8h  // 64 - frac_y
   2464        b.gt            22f
   2465 
   2466 21:
   2467        // base_y < 32, using tbx
   2468        shl             v29.8b,  v29.8b,  #1      // 2*base_y
   2469        movi            v11.8h,  #1, lsl #8
   2470        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
   2471        add             v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
   2472 
   2473        movi            v13.16b, #2
   2474 
   2475        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
   2476 
   2477        add             v29.16b, v29.16b, v13.16b // base_y + 1 (*2)
   2478        mov             v19.16b, v15.16b          // left[0]
   2479 
   2480        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
   2481 
   2482        add             v29.16b, v29.16b, v13.16b // base_y + 2 (*2)
   2483        mov             v20.16b, v15.16b          // left[0]
   2484 
   2485        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
   2486 
   2487        b               23f
   2488 
   2489 22:
   2490        // base_y >= 32, using separate loads.
   2491        smov            w15,     v29.b[1]
   2492        smov            w16,     v29.b[2]
   2493        add             x10, x3,  w10, sxtw #1
   2494        smov            w17,     v29.b[3]
   2495        add             x15, x3,  w15, sxtw #1
   2496        ld3             {v18.h, v19.h, v20.h}[0], [x10]
   2497        smov            w10,     v29.b[4]
   2498        add             x16, x3,  w16, sxtw #1
   2499        ld3             {v18.h, v19.h, v20.h}[1], [x15]
   2500        smov            w15,     v29.b[5]
   2501        add             x17, x3,  w17, sxtw #1
   2502        ld3             {v18.h, v19.h, v20.h}[2], [x16]
   2503        smov            w16,     v29.b[6]
   2504        add             x10, x3,  w10, sxtw #1
   2505        ld3             {v18.h, v19.h, v20.h}[3], [x17]
   2506        smov            w17,     v29.b[7]
   2507        add             x15, x3,  w15, sxtw #1
   2508        add             x16, x3,  w16, sxtw #1
   2509        ld3             {v18.h, v19.h, v20.h}[4], [x10]
   2510        add             x17, x3,  w17, sxtw #1
   2511        ld3             {v18.h, v19.h, v20.h}[5], [x15]
   2512        ld3             {v18.h, v19.h, v20.h}[6], [x16]
   2513        ld3             {v18.h, v19.h, v20.h}[7], [x17]
   2514 
   2515 23:
   2516 
   2517        ld1             {v5.8h}, [x9], #16        // top[base_x]
   2518        ld1             {v7.8h}, [x11], #16
   2519 
   2520        add             v23.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
   2521 
   2522        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   2523        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   2524        umull2          v11.4s,  v18.8h,  v28.8h
   2525        umlal2          v11.4s,  v19.8h,  v27.8h
   2526        umull           v12.4s,  v19.4h,  v28.4h
   2527        umlal           v12.4s,  v20.4h,  v27.4h
   2528        umull2          v13.4s,  v19.8h,  v28.8h
   2529        umlal2          v13.4s,  v20.8h,  v27.8h
   2530 
   2531        ext             v18.16b, v4.16b,  v5.16b,  #2 // top[base_x+1]
   2532        ext             v19.16b, v6.16b,  v7.16b,  #2
   2533 
   2534        rshrn           v10.4h,  v10.4s,  #6
   2535        rshrn2          v10.8h,  v11.4s,  #6
   2536        rshrn           v11.4h,  v12.4s,  #6
   2537        rshrn2          v11.8h,  v13.4s,  #6
   2538 
   2539        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]-*(64-frac_x)
   2540        umlal           v12.4s,  v18.4h,  v16.4h  // + top[base_x+1]*frac_x
   2541        umull2          v13.4s,  v4.8h,   v8.8h
   2542        umlal2          v13.4s,  v18.8h,  v16.8h
   2543        umull           v14.4s,  v6.4h,   v9.4h
   2544        umlal           v14.4s,  v19.4h,  v17.4h
   2545        umull2          v20.4s,  v6.8h,   v9.8h
   2546        umlal2          v20.4s,  v19.8h,  v17.8h
   2547 
   2548        cmge            v18.8h,  v21.8h,  #0
   2549        cmge            v19.8h,  v22.8h,  #0
   2550 
   2551        rshrn           v12.4h,  v12.4s,  #6
   2552        rshrn2          v12.8h,  v13.4s,  #6
   2553        rshrn           v13.4h,  v14.4s,  #6
   2554        rshrn2          v13.8h,  v20.4s,  #6
   2555 
   2556        bit             v10.16b, v12.16b, v18.16b
   2557        bit             v11.16b, v13.16b, v19.16b
   2558 
   2559        st1             {v10.8h}, [x0], #16
   2560        subs            w4,  w4,  #8
   2561        st1             {v11.8h}, [x13], #16
   2562        b.le            3f
   2563 
   2564        movi            v10.8h,  #8
   2565        mov             v4.16b,  v5.16b
   2566        mov             v6.16b,  v7.16b
   2567        add             v21.8h,  v21.8h,  v10.8h  // base_x += 8
   2568        add             v22.8h,  v22.8h,  v10.8h
   2569        b               2b
   2570 
   2571 3:
   2572        subs            w5,  w5,  #2
   2573        b.le            9f
   2574        movi            v10.8h, #128
   2575        add             x0,  x0,  x1
   2576        add             x13, x13, x1
   2577        mov             w4,  w12                  // reset w
   2578        add             v26.8h,  v26.8h,  v10.8h  // ypos += 2*(1<<6)
   2579        b               1b
   2580 
   2581 4:      // The rest of the row only predicted from top[]
   2582        ld1             {v5.8h}, [x9], #16        // top[base_x]
   2583        ld1             {v7.8h}, [x11], #16
   2584 
   2585        ext             v18.16b, v4.16b,  v5.16b,  #2 // top[base_x+1]
   2586        ext             v19.16b, v6.16b,  v7.16b,  #2
   2587 
   2588        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]-*(64-frac_x)
   2589        umlal           v12.4s,  v18.4h,  v16.4h  // + top[base_x+1]*frac_x
   2590        umull2          v13.4s,  v4.8h,   v8.8h
   2591        umlal2          v13.4s,  v18.8h,  v16.8h
   2592        umull           v14.4s,  v6.4h,   v9.4h
   2593        umlal           v14.4s,  v19.4h,  v17.4h
   2594        umull2          v20.4s,  v6.8h,   v9.8h
   2595        umlal2          v20.4s,  v19.8h,  v17.8h
   2596 
   2597        rshrn           v12.4h,  v12.4s,  #6
   2598        rshrn2          v12.8h,  v13.4s,  #6
   2599        rshrn           v13.4h,  v14.4s,  #6
   2600        rshrn2          v13.8h,  v20.4s,  #6
   2601 
   2602        st1             {v12.8h}, [x0], #16
   2603        subs            w4,  w4,  #8
   2604        st1             {v13.8h}, [x13], #16
   2605        b.le            3b
   2606 
   2607        mov             v4.16b,  v5.16b
   2608        mov             v6.16b,  v7.16b
   2609        b               4b
   2610 
   2611 169:    // The rest of the block only predicted from left[]
   2612        add             x1,  x1,  w4,  uxtw #1    // restore stride
   2613        mov             w12, w5                   // orig remaining h
   2614 1:
   2615        movi            v12.8h,  #64
   2616        movi            v10.8h,  #0x3e
   2617 
   2618        shrn            v29.8b,  v23.8h,  #6      // ypos >> 6
   2619        and             v27.16b, v23.16b, v10.16b // frac_y
   2620 
   2621        smov            w10,     v29.b[0]         // base_y[0]
   2622 
   2623        shl             v29.8b,  v29.8b,  #1      // 2*base_y
   2624        movi            v11.8h,  #1, lsl #8
   2625        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
   2626        add             v23.8h,  v23.8h,  v25.8h  // ypos -= 8*dy
   2627        add             v29.16b, v29.16b, v11.16b // 2*base, 2*base+1, ...
   2628 
   2629        cmp             w10,     #(32-1)
   2630 
   2631        mov             v18.16b, v15.16b          // left[0]
   2632        movi            v21.16b, #2
   2633 
   2634        sub             v28.8h,  v12.8h,  v27.8h  // 64 - frac_y
   2635 
   2636        b.gt            31f
   2637 
   2638        tbx             v18.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y]
   2639        add             v29.16b, v29.16b, v21.16b // base_y + 1 (*2)
   2640 
   2641 2:
   2642        // base_y < 32, using tbx.
   2643        smov            w10,     v29.b[0]         // base_y[0]
   2644        mov             v19.16b, v15.16b          // left[0]
   2645        cmp             w10,     #(64-4)
   2646        b.gt            32f
   2647        tbx             v19.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+1]
   2648        add             v29.16b, v29.16b, v21.16b // base_y + 2 (*2)
   2649        mov             v20.16b, v15.16b          // left[0]
   2650        tbx             v20.16b, {v0.16b, v1.16b, v2.16b, v3.16b}, v29.16b // left[base_y+2]
   2651        add             v29.16b, v29.16b, v21.16b // next base_y
   2652 
   2653        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   2654        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   2655        umull2          v11.4s,  v18.8h,  v28.8h
   2656        umlal2          v11.4s,  v19.8h,  v27.8h
   2657        umull           v12.4s,  v19.4h,  v28.4h
   2658        umlal           v12.4s,  v20.4h,  v27.4h
   2659        umull2          v13.4s,  v19.8h,  v28.8h
   2660        umlal2          v13.4s,  v20.8h,  v27.8h
   2661 
   2662        rshrn           v10.4h,  v10.4s,  #6
   2663        rshrn2          v10.8h,  v11.4s,  #6
   2664        rshrn           v11.4h,  v12.4s,  #6
   2665        rshrn2          v11.8h,  v13.4s,  #6
   2666 
   2667        st1             {v10.8h}, [x0], x1
   2668        subs            w5,  w5,  #2
   2669        st1             {v11.8h}, [x13], x1
   2670        b.le            4f
   2671        mov             v18.16b, v20.16b
   2672        b               2b
   2673 
   2674 31:     // base_y >= 32, using separate loads, loading v18 if we had to bail
   2675        // in the prologue.
   2676        smov            w10,     v29.b[0]
   2677        smov            w15,     v29.b[2]
   2678        movi            v21.16b, #2
   2679        smov            w16,     v29.b[4]
   2680        add             x10, x3,  w10, sxtw
   2681        smov            w17,     v29.b[6]
   2682        add             x15, x3,  w15, sxtw
   2683        ld1             {v18.h}[0], [x10]
   2684        smov            w10,     v29.b[8]
   2685        add             x16, x3,  w16, sxtw
   2686        ld1             {v18.h}[1], [x15]
   2687        smov            w15,     v29.b[10]
   2688        add             x17, x3,  w17, sxtw
   2689        ld1             {v18.h}[2], [x16]
   2690        smov            w16,     v29.b[12]
   2691        add             x10, x3,  w10, sxtw
   2692        ld1             {v18.h}[3], [x17]
   2693        smov            w17,     v29.b[14]
   2694        add             x15, x3,  w15, sxtw
   2695        add             x16, x3,  w16, sxtw
   2696        ld1             {v18.h}[4], [x10]
   2697        add             x17, x3,  w17, sxtw
   2698        ld1             {v18.h}[5], [x15]
   2699        add             v29.16b, v29.16b, v21.16b // next base_y
   2700        ld1             {v18.h}[6], [x16]
   2701        ld1             {v18.h}[7], [x17]
   2702 
   2703 32:     // base_y >= 32, using separate loads.
   2704        cmp             w5,  #4
   2705        b.lt            34f
   2706 33:     // h >= 4, preserving v18 from the previous round, loading v19-v22.
   2707        smov            w10,     v29.b[0]
   2708        subs            w5,  w5,  #4
   2709        smov            w15,     v29.b[2]
   2710        movi            v10.16b, #8
   2711        smov            w16,     v29.b[4]
   2712        add             x10, x3,  w10, sxtw
   2713        smov            w17,     v29.b[6]
   2714        add             x15, x3,  w15, sxtw
   2715        ld4             {v19.h, v20.h, v21.h, v22.h}[0], [x10]
   2716        smov            w10,     v29.b[8]
   2717        add             x16, x3,  w16, sxtw
   2718        ld4             {v19.h, v20.h, v21.h, v22.h}[1], [x15]
   2719        smov            w15,     v29.b[10]
   2720        add             x17, x3,  w17, sxtw
   2721        ld4             {v19.h, v20.h, v21.h, v22.h}[2], [x16]
   2722        smov            w16,     v29.b[12]
   2723        add             x10, x3,  w10, sxtw
   2724        ld4             {v19.h, v20.h, v21.h, v22.h}[3], [x17]
   2725        smov            w17,     v29.b[14]
   2726        add             x15, x3,  w15, sxtw
   2727        add             x16, x3,  w16, sxtw
   2728        ld4             {v19.h, v20.h, v21.h, v22.h}[4], [x10]
   2729        add             x17, x3,  w17, sxtw
   2730        ld4             {v19.h, v20.h, v21.h, v22.h}[5], [x15]
   2731        ld4             {v19.h, v20.h, v21.h, v22.h}[6], [x16]
   2732        add             v29.16b, v29.16b, v10.16b // next base_y
   2733        ld4             {v19.h, v20.h, v21.h, v22.h}[7], [x17]
   2734 
   2735        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   2736        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   2737        umull2          v11.4s,  v18.8h,  v28.8h
   2738        umlal2          v11.4s,  v19.8h,  v27.8h
   2739        umull           v12.4s,  v19.4h,  v28.4h
   2740        umlal           v12.4s,  v20.4h,  v27.4h
   2741        umull2          v13.4s,  v19.8h,  v28.8h
   2742        umlal2          v13.4s,  v20.8h,  v27.8h
   2743 
   2744        rshrn           v10.4h,  v10.4s,  #6
   2745        rshrn2          v10.8h,  v11.4s,  #6
   2746        rshrn           v11.4h,  v12.4s,  #6
   2747        rshrn2          v11.8h,  v13.4s,  #6
   2748 
   2749        umull           v12.4s,  v20.4h,  v28.4h  // left[base_y]*(64-frac_y)
   2750        umlal           v12.4s,  v21.4h,  v27.4h  // + left[base_y+1]*frac_y
   2751        umull2          v13.4s,  v20.8h,  v28.8h
   2752        umlal2          v13.4s,  v21.8h,  v27.8h
   2753        umull           v14.4s,  v21.4h,  v28.4h
   2754        umlal           v14.4s,  v22.4h,  v27.4h
   2755        umull2          v18.4s,  v21.8h,  v28.8h
   2756        umlal2          v18.4s,  v22.8h,  v27.8h
   2757 
   2758        rshrn           v12.4h,  v12.4s,  #6
   2759        rshrn2          v12.8h,  v13.4s,  #6
   2760        rshrn           v13.4h,  v14.4s,  #6
   2761        rshrn2          v13.8h,  v18.4s,  #6
   2762 
   2763        st1             {v10.8h}, [x0],  x1
   2764        cmp             w5,  #2
   2765        st1             {v11.8h}, [x13], x1
   2766        st1             {v12.8h}, [x0],  x1
   2767        st1             {v13.8h}, [x13], x1
   2768        b.lt            4f
   2769        mov             v18.16b, v22.16b
   2770        b.gt            33b
   2771 
   2772 34:     // h == 2, preserving v18 from the previous round, loading v19-v20.
   2773        smov            w10,     v29.b[0]
   2774        smov            w15,     v29.b[2]
   2775        movi            v21.16b, #4
   2776        smov            w16,     v29.b[4]
   2777        add             x10, x3,  w10, sxtw
   2778        smov            w17,     v29.b[6]
   2779        add             x15, x3,  w15, sxtw
   2780        ld2             {v19.h, v20.h}[0], [x10]
   2781        smov            w10,     v29.b[8]
   2782        add             x16, x3,  w16, sxtw
   2783        ld2             {v19.h, v20.h}[1], [x15]
   2784        smov            w15,     v29.b[10]
   2785        add             x17, x3,  w17, sxtw
   2786        ld2             {v19.h, v20.h}[2], [x16]
   2787        smov            w16,     v29.b[12]
   2788        add             x10, x3,  w10, sxtw
   2789        ld2             {v19.h, v20.h}[3], [x17]
   2790        smov            w17,     v29.b[14]
   2791        add             x15, x3,  w15, sxtw
   2792        add             x16, x3,  w16, sxtw
   2793        ld2             {v19.h, v20.h}[4], [x10]
   2794        add             x17, x3,  w17, sxtw
   2795        ld2             {v19.h, v20.h}[5], [x15]
   2796        ld2             {v19.h, v20.h}[6], [x16]
   2797        add             v29.16b, v29.16b, v21.16b // next base_y
   2798        ld2             {v19.h, v20.h}[7], [x17]
   2799 
   2800        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   2801        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   2802        umull2          v11.4s,  v18.8h,  v28.8h
   2803        umlal2          v11.4s,  v19.8h,  v27.8h
   2804        umull           v12.4s,  v19.4h,  v28.4h
   2805        umlal           v12.4s,  v20.4h,  v27.4h
   2806        umull2          v13.4s,  v19.8h,  v28.8h
   2807        umlal2          v13.4s,  v20.8h,  v27.8h
   2808 
   2809        rshrn           v10.4h,  v10.4s,  #6
   2810        rshrn2          v10.8h,  v11.4s,  #6
   2811        rshrn           v11.4h,  v12.4s,  #6
   2812        rshrn2          v11.8h,  v13.4s,  #6
   2813 
   2814        st1             {v10.8h}, [x0], x1
   2815        st1             {v11.8h}, [x13], x1
   2816        // The h==2 case only happens once at the end, if at all.
   2817 
   2818 4:
   2819        subs            w4,  w4,  #8
   2820        b.le            9f
   2821 
   2822        lsr             x1,  x1,  #1
   2823        msub            x0,  x1,  x12, x0         // ptr -= h * stride
   2824        msub            x13, x1,  x12, x13
   2825        lsl             x1,  x1,  #1
   2826        add             x0,  x0,  #16
   2827        add             x13, x13, #16
   2828        mov             w5,  w12                  // reset h
   2829        b               1b
   2830 
   2831 9:
   2832        ldp             d14, d15, [sp, #0x30]
   2833        ldp             d12, d13, [sp, #0x20]
   2834        ldp             d10, d11, [sp, #0x10]
   2835        ldp             d8,  d9,  [sp], 0x40
   2836        ret
   2837 endfunc
   2838 
// Width-dispatch jump table for ipred_z2_fill1_16bpc_neon. Entries are
// offsets relative to the table base (position-independent), pointing at
// the per-width code paths in the function above — presumably labels
// 640/320/160/80/40 correspond to w=64/32/16/8/4 (labels defined earlier
// in the function; verify against the dispatch code that indexes this
// table).
jumptable ipred_z2_fill1_tbl
        .word 640b - ipred_z2_fill1_tbl
        .word 320b - ipred_z2_fill1_tbl
        .word 160b - ipred_z2_fill1_tbl
        .word 80b  - ipred_z2_fill1_tbl
        .word 40b  - ipred_z2_fill1_tbl
endjumptable
   2846 
// Z2 (top-left diagonal) intra prediction, 16 bpc, for the upsample_top
// case: top[] holds upsampled (interleaved even/odd) samples, hence the
// doubled x step (xpos starts at 2 << 6) and the uzp1/uzp2 deinterleave
// of the top[] loads. Each output pixel blends a fractionally-interpolated
// top[] sample (when base_x >= 0) with a fractionally-interpolated left[]
// sample (when base_x < 0), two rows per loop iteration.
//
// NOTE(review): register/parameter mapping inferred from the code below —
// x0 = dst, x1 = stride (bytes), x2 = top, x3 = left, w4 = width,
// w5 = height, w6 = dx, w7 = dy. Confirm against the C prototype.
// The upsample_top comments below bound w <= 8 and h <= 8, so only the
// w==4 and w==8 paths exist.
function ipred_z2_fill2_16bpc_neon, export=1
        cmp             w4,  #8
        mov             w8,  #(2 << 6)            // xpos = 2 << 6
        sub             w8,  w8,  w6              // xpos -= dx

        movrel          x11, increments
        ld1             {v31.8h},  [x11]          // increments
        neg             w7,  w7                   // -dy
        b.eq            80f                       // w == 8

40:     // w == 4: two 4-pixel rows are packed into one 128-bit vector.
        dup             v30.4h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
        movi            v25.8h,  #0x3e            // 0x3e = 63 >> fractional-bits mask
        add             v30.4h,  v16.4h,  v30.4h  // -= dy

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left.
        ld1             {v0.8h, v1.8h}, [x3]      // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #4

        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
        and             v27.8b,  v30.8b,  v25.8b  // frac_y

        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1

        // Build byte-pair indices for tbl: each 16-bit left[] element is
        // addressed as two consecutive bytes (2*base, 2*base+1).
        movi            v23.4h,  #1, lsl #8
        shl             v29.8b,  v29.8b,  #1      // 2*base_y
        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
        movi            v17.8b,  #2
        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...

        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)

        tbl             v18.8b, {v0.16b}, v29.8b  // left[base_y]

        trn1            v30.2d,  v30.2d,  v28.2d  // base_y + 1, base_y + 2

        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y

        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}

        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y

        movi            v29.16b, #4
        add             v31.8h,  v31.8h,  v31.8h  // {0,2,4,6,0,2,4,6}
4:      // w==4 main loop: 2 rows per iteration, mixing top[] and left[].
        asr             w9,  w8,  #6              // base_x
        dup             v16.4h,  w8               // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-8                  // base_x <= -8
        asr             w11, w8,  #6              // base_x
        b.le            49f                       // all columns come from left[]

        lsl             w9,  w9,  #1
        lsl             w11, w11, #1

        dup             v17.4h,  w8               // xpos

        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
        ldr             q6,  [x2, w11, sxtw]

        trn1            v16.2d,  v16.2d,  v17.2d  // xpos

        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]

        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row

        uzp2            v5.8h,   v4.8h,   v6.8h   // top[base_x+1]
        uzp1            v4.8h,   v4.8h,   v6.8h   // top[base_x]

        and             v16.16b, v16.16b, v25.16b // frac_x

        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]

        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x

        add             v20.8h,  v20.8h,  v31.8h  // actual base_x

        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v22.4s,  v18.8h,  v28.8h
        umlal2          v22.4s,  v19.8h,  v27.8h

        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]*(64-frac_x)
        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
        umull2          v24.4s,  v4.8h,   v17.8h
        umlal2          v24.4s,  v5.8h,   v16.8h

        cmge            v20.8h,  v20.8h,  #0      // per-lane: use top if base_x >= 0

        rshrn           v21.4h,  v21.4s,  #6
        rshrn2          v21.8h,  v22.4s,  #6
        rshrn           v22.4h,  v23.4s,  #6
        rshrn2          v22.8h,  v24.4s,  #6

        bit             v21.16b, v22.16b, v20.16b // select top result where base_x >= 0

        st1             {v21.d}[0], [x0], x1
        sub             w8,  w8,  w6              // xpos -= dx
        subs            w5,  w5,  #2
        st1             {v21.d}[1], [x0], x1
        b.le            9f

        ext             v18.16b, v19.16b, v19.16b, #8
        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
        b               4b

49:     // w==4 left-only loop: base_x <= -8 for all remaining rows, so
        // every output pixel comes from left[]; no top[] loads needed.
        tbl             v19.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+1], left[base_y+2]

        trn1            v18.2d,  v18.2d,  v19.2d  // left[base_y], left[base_y+1]

        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v21.4s,  v18.8h,  v28.8h
        umlal2          v21.4s,  v19.8h,  v27.8h

        rshrn           v20.4h,  v20.4s,  #6
        rshrn2          v20.8h,  v21.4s,  #6

        st1             {v20.d}[0], [x0], x1
        subs            w5,  w5,  #2
        st1             {v20.d}[1], [x0], x1
        b.le            9f

        ext             v18.16b, v19.16b, v19.16b, #8
        add             v30.16b, v30.16b, v29.16b // base_y += 2 (*2)
        b               49b

9:
        ret

80:     // w == 8: one full 8-pixel row per vector; the wider math needs
        // v8-v15 as scratch, which are callee-saved per AAPCS64 (low 64
        // bits), hence the d8-d15 save here and restore at 9: below.
        stp             d8,  d9,  [sp, #-0x40]!
        stp             d10, d11, [sp, #0x10]
        stp             d12, d13, [sp, #0x20]
        stp             d14, d15, [sp, #0x30]

        dup             v18.8h,  w7               // -dy
        movi            v17.8b,  #1

        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
        movi            v25.8h,  #0x3e
        add             v16.8h,  v16.8h,  v18.8h  // -= dy

        // For upsample_top, w <= 8 and h <= 8; we may need up to h+1 elements
        // from left.
        ld1             {v0.8h, v1.8h}, [x3]      // left[]

        movi            v26.8h,  #64
        movi            v19.16b, #4

        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
        and             v27.16b, v16.16b, v25.16b // frac_y

        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 1

        // Byte-pair tbl indices, as in the w==4 path.
        movi            v23.8h,  #1, lsl #8
        shl             v29.8b,  v29.8b,  #1      // 2*base_y
        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
        movi            v17.16b, #2
        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...

        // Cut corners here; for the first row we don't expect to need to
        // read outside of v0.
        tbl             v18.16b, {v0.16b}, v29.16b // left[base_y]

        add             v30.16b, v29.16b, v19.16b // base_y + 2 (*2)
        add             v29.16b, v29.16b, v17.16b // base_y + 1 (*2)

        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y

        movi            v24.16b, #4
        add             v31.16b, v31.16b, v31.16b // {0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14}
8:      // w==8 main loop: 2 rows per iteration, mixing top[] and left[].
        asr             w9,  w8,  #6              // base_x
        dup             v16.8h,   w8              // xpos
        sub             w8,  w8,  w6              // xpos -= dx
        cmp             w9,  #-16                 // base_x <= -16
        asr             w11, w8,  #6              // base_x
        b.le            89f                       // all columns come from left[]

        dup             v17.8h,   w8              // xpos

        add             x9,  x2,  w9,  sxtw #1
        add             x11, x2,  w11, sxtw #1

        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
        ld1             {v6.8h, v7.8h}, [x11]

        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]

        sshr            v21.8h,  v16.8h,  #6      // first base_x
        sshr            v22.8h,  v17.8h,  #6

        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]

        uzp2            v2.8h,   v4.8h,   v5.8h   // top[base_x+1]
        uzp1            v4.8h,   v4.8h,   v5.8h   // top[base_x]
        uzp2            v3.8h,   v6.8h,   v7.8h
        uzp1            v6.8h,   v6.8h,   v7.8h
        mov             v5.16b,  v2.16b
        mov             v7.16b,  v3.16b

        and             v16.16b, v16.16b, v25.16b // frac_x
        and             v17.16b, v17.16b, v25.16b

        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y

        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
        sub             v9.8h,   v26.8h,  v17.8h

        umull2          v11.4s,  v18.8h,  v28.8h
        umlal2          v11.4s,  v19.8h,  v27.8h

        add             v21.8h,  v21.8h,  v31.8h  // actual base_x
        add             v22.8h,  v22.8h,  v31.8h

        umull           v12.4s,  v19.4h,  v28.4h
        umlal           v12.4s,  v20.4h,  v27.4h
        umull2          v13.4s,  v19.8h,  v28.8h
        umlal2          v13.4s,  v20.8h,  v27.8h

        rshrn           v10.4h,  v10.4s,  #6
        rshrn2          v10.8h,  v11.4s,  #6
        rshrn           v11.4h,  v12.4s,  #6
        rshrn2          v11.8h,  v13.4s,  #6

        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]*(64-frac_x)
        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
        umull2          v13.4s,  v4.8h,   v8.8h
        umlal2          v13.4s,  v5.8h,   v16.8h
        umull           v14.4s,  v6.4h,   v9.4h
        umlal           v14.4s,  v7.4h,   v17.4h
        umull2          v18.4s,  v6.8h,   v9.8h
        umlal2          v18.4s,  v7.8h,   v17.8h

        cmge            v21.8h,  v21.8h,  #0      // per-lane: use top if base_x >= 0
        cmge            v22.8h,  v22.8h,  #0

        rshrn           v12.4h,  v12.4s,  #6
        rshrn2          v12.8h,  v13.4s,  #6
        rshrn           v13.4h,  v14.4s,  #6
        rshrn2          v13.8h,  v18.4s,  #6

        bit             v10.16b, v12.16b, v21.16b // select top result where base_x >= 0
        bit             v11.16b, v13.16b, v22.16b

        st1             {v10.8h}, [x0], x1
        subs            w5,  w5,  #2
        sub             w8,  w8,  w6              // xpos -= dx
        st1             {v11.8h}, [x0], x1
        b.le            9f

        mov             v18.16b, v20.16b
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
        b               8b

89:     // w==8 left-only loop: base_x <= -16 for all remaining rows, so
        // every output pixel comes from left[]; no top[] loads needed.
        tbl             v19.16b, {v0.16b, v1.16b}, v29.16b // left[base_y+1]
        tbl             v20.16b, {v0.16b, v1.16b}, v30.16b // left[base_y+2]

        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
        umull2          v5.4s,   v18.8h,  v28.8h
        umlal2          v5.4s,   v19.8h,  v27.8h
        umull           v6.4s,   v19.4h,  v28.4h
        umlal           v6.4s,   v20.4h,  v27.4h
        umull2          v7.4s,   v19.8h,  v28.8h
        umlal2          v7.4s,   v20.8h,  v27.8h

        rshrn           v4.4h,   v4.4s,   #6
        rshrn2          v4.8h,   v5.4s,   #6
        rshrn           v5.4h,   v6.4s,   #6
        rshrn2          v5.8h,   v7.4s,   #6

        st1             {v4.8h}, [x0], x1
        subs            w5,  w5,  #2
        st1             {v5.8h}, [x0], x1
        b.le            9f

        mov             v18.16b, v20.16b
        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
        add             v30.16b, v30.16b, v24.16b // base_y += 2 (*2)
        b               89b

9:      // Restore callee-saved d8-d15 (only pushed on the w==8 path,
        // which is the only path that reaches here after the 80: entry).
        ldp             d14, d15, [sp, #0x30]
        ldp             d12, d13, [sp, #0x20]
        ldp             d10, d11, [sp, #0x10]
        ldp             d8,  d9,  [sp], 0x40
        ret
endfunc
   3149 
   3150 function ipred_z2_fill3_16bpc_neon, export=1
   3151        cmp             w4,  #8
   3152        mov             w8,  #(1 << 6)            // xpos = 1 << 6
   3153        sub             w8,  w8,  w6              // xpos -= dx
   3154 
   3155        movrel          x11, increments
   3156        ld1             {v31.8h},  [x11]          // increments
   3157        neg             w7,  w7                   // -dy
   3158        b.eq            80f
   3159 
   3160 40:
   3161        dup             v30.4h,  w7               // -dy
   3162        movi            v17.8b,  #1
   3163 
   3164        mul             v16.4h,  v31.4h,  v30.4h  // {0,1,2,3}* -dy
   3165        movi            v25.8h,  #0x3e
   3166        add             v30.4h,  v16.4h,  v30.4h  // -= dy
   3167 
   3168        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
   3169        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]
   3170 
   3171        movi            v26.8h,  #64
   3172        movi            v19.16b, #2
   3173 
   3174        shrn            v29.8b,  v30.8h,  #6      // ypos >> 6
   3175        and             v27.8b,  v30.8b,  v25.8b  // frac_y
   3176 
   3177        add             v29.8b,  v29.8b,  v19.8b  // base_y = (ypos >> 6) + 2
   3178 
   3179        movi            v23.4h,  #1, lsl #8
   3180        shl             v29.8b,  v29.8b,  #1      // 2*base_y
   3181        movi            v19.16b, #4
   3182        zip1            v29.8b,  v29.8b,  v29.8b  // duplicate elements
   3183        movi            v17.8b,  #2
   3184        add             v29.8b,  v29.8b,  v23.8b  // 2*base, 2*base+1, ...
   3185 
   3186        add             v30.8b,  v29.8b,  v17.8b  // base_y + 1 (*2)
   3187        add             v28.8b,  v29.8b,  v19.8b  // base_y + 2 (*2)
   3188 
   3189        trn1            v31.2d,  v31.2d,  v31.2d  // {0,1,2,3,0,1,2,3}
   3190 
   3191        add             v24.8b,  v30.8b,  v19.8b  // base_y + 3 (*2)
   3192 
   3193        trn1            v29.2d,  v29.2d,  v28.2d  // base_y + 0, base_y + 2
   3194        trn1            v30.2d,  v30.2d,  v24.2d  // base_y + 1, base_y + 3
   3195 
   3196        sub             v28.4h,  v26.4h,  v27.4h  // 64 - frac_y
   3197 
   3198        trn1            v27.2d,  v27.2d,  v27.2d  // frac_y
   3199        trn1            v28.2d,  v28.2d,  v28.2d  // 64 - frac_y
   3200 
   3201        movi            v24.16b, #8
   3202 4:
   3203        asr             w9,  w8,  #6              // base_x
   3204        dup             v16.4h,  w8               // xpos
   3205        sub             w8,  w8,  w6              // xpos -= dx
   3206        cmp             w9,  #-4                  // base_x <= -4
   3207        asr             w11, w8,  #6              // base_x
   3208        b.le            49f
   3209 
   3210        lsl             w9,  w9,  #1
   3211        lsl             w11, w11, #1
   3212 
   3213        dup             v17.4h,  w8               // xpos
   3214 
   3215        ldr             q4,  [x2, w9, sxtw]       // top[base_x]
   3216        ldr             q6,  [x2, w11, sxtw]
   3217 
   3218        trn1            v16.2d,  v16.2d,  v17.2d  // xpos
   3219 
   3220        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
   3221        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
   3222 
   3223        sshr            v20.8h,  v16.8h,  #6      // first base_x for each row
   3224 
   3225        ext             v5.16b,  v4.16b,  v4.16b,  #2 // top[base_x+1]
   3226        ext             v7.16b,  v6.16b,  v6.16b,  #2
   3227 
   3228        and             v16.16b, v16.16b, v25.16b // frac_x
   3229 
   3230        trn1            v4.2d,   v4.2d,   v6.2d   // top[base_x]
   3231        trn1            v5.2d,   v5.2d,   v7.2d   // top[base_x+1]
   3232 
   3233        sub             v17.8h,  v26.8h,  v16.8h  // 64 - frac_x
   3234 
   3235        add             v20.8h,  v20.8h,  v31.8h  // actual base_x
   3236 
   3237        umull           v21.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   3238        umlal           v21.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   3239        umull2          v22.4s,  v18.8h,  v28.8h
   3240        umlal2          v22.4s,  v19.8h,  v27.8h
   3241 
   3242        umull           v23.4s,  v4.4h,   v17.4h  // top[base_x]-*(64-frac_x)
   3243        umlal           v23.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
   3244        umull2          v24.4s,  v4.8h,   v17.8h
   3245        umlal2          v24.4s,  v5.8h,   v16.8h
   3246 
   3247        cmge            v20.8h,  v20.8h,  #0
   3248 
   3249        rshrn           v21.4h,  v21.4s,  #6
   3250        rshrn2          v21.8h,  v22.4s,  #6
   3251        rshrn           v22.4h,  v23.4s,  #6
   3252        rshrn2          v22.8h,  v24.4s,  #6
   3253 
   3254        movi            v24.16b, #8
   3255 
   3256        bit             v21.16b, v22.16b, v20.16b
   3257 
   3258        st1             {v21.d}[0], [x0], x1
   3259        sub             w8,  w8,  w6              // xpos -= dx
   3260        subs            w5,  w5,  #2
   3261        st1             {v21.d}[1], [x0], x1
   3262        b.le            9f
   3263 
   3264        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
   3265        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
   3266        b               4b
   3267 
   3268 49:
   3269        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0], left[base_y+2]
   3270        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1], left[base_y+3]
   3271 
   3272        umull           v20.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   3273        umlal           v20.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   3274        umull2          v21.4s,  v18.8h,  v28.8h
   3275        umlal2          v21.4s,  v19.8h,  v27.8h
   3276 
   3277        rshrn           v20.4h,  v20.4s,  #6
   3278        rshrn2          v20.8h,  v21.4s,  #6
   3279 
   3280        st1             {v20.d}[0], [x0], x1
   3281        subs            w5,  w5,  #2
   3282        st1             {v20.d}[1], [x0], x1
   3283        b.le            9f
   3284 
   3285        add             v29.16b, v29.16b, v24.16b // base_y += 4 (*2)
   3286        add             v30.16b, v30.16b, v24.16b // base_y += 4 (*2)
   3287        b               49b
   3288 
   3289 9:
   3290        ret
   3291 
   3292 80:
   3293        stp             d8,  d9,  [sp, #-0x40]!
   3294        stp             d10, d11, [sp, #0x10]
   3295        stp             d12, d13, [sp, #0x20]
   3296        stp             d14, d15, [sp, #0x30]
   3297 
   3298        dup             v18.8h,  w7               // -dy
   3299        movi            v17.16b, #2
   3300 
   3301        mul             v16.8h,  v31.8h,  v18.8h  // {0,1,2,3,4,5,6,7}* -dy
   3302        movi            v25.8h,  #0x3e
   3303        add             v16.8h,  v16.8h,  v18.8h  // -= dy
   3304 
   3305        // For upsample_left, w <= 8 and h <= 8; we may need up to 2*h+1 elements.
   3306        ld1             {v0.8h, v1.8h, v2.8h}, [x3]    // left[]
   3307 
   3308        movi            v26.8h,  #64
   3309        movi            v19.16b, #4
   3310 
   3311        shrn            v29.8b,  v16.8h,  #6      // ypos >> 6
   3312        and             v27.16b, v16.16b, v25.16b // frac_y
   3313 
   3314        add             v29.8b,  v29.8b,  v17.8b  // base_y = (ypos >> 6) + 2
   3315 
   3316        movi            v23.8h,  #1, lsl #8
   3317        shl             v29.8b,  v29.8b,  #1      // 2*base_y
   3318        mov             v18.16b, v15.16b          // left[0]
   3319        zip1            v29.16b, v29.16b, v29.16b // duplicate elements
   3320        add             v29.16b, v29.16b, v23.16b // 2*base, 2*base+1, ...
   3321 
   3322        add             v30.16b, v29.16b, v17.16b // base_y + 1 (*2)
   3323 
   3324        sub             v28.8h,  v26.8h,  v27.8h  // 64 - frac_y
   3325 
   3326        movi            v24.16b, #4
   3327 8:
   3328        asr             w9,  w8,  #6              // base_x
   3329        dup             v16.8h,   w8              // xpos
   3330        sub             w8,  w8,  w6              // xpos -= dx
   3331        cmp             w9,  #-16                 // base_x <= -16
   3332        asr             w11, w8,  #6              // base_x
   3333        b.le            89f
   3334 
   3335        dup             v17.8h,   w8              // xpos
   3336 
   3337        add             x9,  x2,  w9,  sxtw #1
   3338        add             x11, x2,  w11, sxtw #1
   3339 
   3340        ld1             {v4.8h, v5.8h}, [x9]      // top[base_x]
   3341        ld1             {v6.8h, v7.8h}, [x11]
   3342 
   3343        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
   3344        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
   3345        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
   3346        add             v30.16b, v30.16b, v24.16b
   3347 
   3348        sshr            v22.8h,  v16.8h,  #6      // first base_x
   3349        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
   3350        sshr            v23.8h,  v17.8h,  #6
   3351        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]
   3352 
   3353        ext             v5.16b,  v4.16b,  v5.16b,  #2 // top[base_x+1]
   3354        ext             v7.16b,  v6.16b,  v7.16b,  #2
   3355 
   3356        and             v16.16b, v16.16b, v25.16b // frac_x
   3357        and             v17.16b, v17.16b, v25.16b
   3358 
   3359        umull           v10.4s,  v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   3360        umlal           v10.4s,  v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   3361 
   3362        sub             v8.8h,   v26.8h,  v16.8h  // 64 - frac_x
   3363        sub             v9.8h,   v26.8h,  v17.8h
   3364 
   3365        umull2          v11.4s,  v18.8h,  v28.8h
   3366        umlal2          v11.4s,  v19.8h,  v27.8h
   3367 
   3368        add             v22.8h,  v22.8h,  v31.8h  // actual base_x
   3369        add             v23.8h,  v23.8h,  v31.8h
   3370 
   3371        umull           v12.4s,  v20.4h,  v28.4h
   3372        umlal           v12.4s,  v21.4h,  v27.4h
   3373        umull2          v13.4s,  v20.8h,  v28.8h
   3374        umlal2          v13.4s,  v21.8h,  v27.8h
   3375 
   3376        rshrn           v10.4h,  v10.4s,  #6
   3377        rshrn2          v10.8h,  v11.4s,  #6
   3378        rshrn           v11.4h,  v12.4s,  #6
   3379        rshrn2          v11.8h,  v13.4s,  #6
   3380 
   3381        umull           v12.4s,  v4.4h,   v8.4h   // top[base_x]-*(64-frac_x)
   3382        umlal           v12.4s,  v5.4h,   v16.4h  // + top[base_x+1]*frac_x
   3383        umull2          v13.4s,  v4.8h,   v8.8h
   3384        umlal2          v13.4s,  v5.8h,   v16.8h
   3385        umull           v14.4s,  v6.4h,   v9.4h
   3386        umlal           v14.4s,  v7.4h,   v17.4h
   3387        umull2          v18.4s,  v6.8h,   v9.8h
   3388        umlal2          v18.4s,  v7.8h,   v17.8h
   3389 
   3390        cmge            v22.8h,  v22.8h,  #0
   3391        cmge            v23.8h,  v23.8h,  #0
   3392 
   3393        rshrn           v12.4h,  v12.4s,  #6
   3394        rshrn2          v12.8h,  v13.4s,  #6
   3395        rshrn           v13.4h,  v14.4s,  #6
   3396        rshrn2          v13.8h,  v18.4s,  #6
   3397 
   3398        bit             v10.16b, v12.16b, v22.16b
   3399        bit             v11.16b, v13.16b, v23.16b
   3400 
   3401        st1             {v10.8h}, [x0], x1
   3402        subs            w5,  w5,  #2
   3403        sub             w8,  w8,  w6              // xpos -= dx
   3404        st1             {v11.8h}, [x0], x1
   3405        b.le            9f
   3406 
   3407        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
   3408        add             v30.16b, v30.16b, v24.16b
   3409        b               8b
   3410 
   3411 89:
   3412        tbl             v18.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+0]
   3413        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
   3414        tbl             v19.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+1]
   3415        add             v30.16b, v30.16b, v24.16b
   3416        tbl             v20.16b, {v0.16b, v1.16b, v2.16b}, v29.16b // left[base_y+2]
   3417        tbl             v21.16b, {v0.16b, v1.16b, v2.16b}, v30.16b // left[base_y+3]
   3418 
   3419        umull           v4.4s,   v18.4h,  v28.4h  // left[base_y]*(64-frac_y)
   3420        umlal           v4.4s,   v19.4h,  v27.4h  // + left[base_y+1]*frac_y
   3421        umull2          v5.4s,   v18.8h,  v28.8h
   3422        umlal2          v5.4s,   v19.8h,  v27.8h
   3423        umull           v6.4s,   v20.4h,  v28.4h
   3424        umlal           v6.4s,   v21.4h,  v27.4h
   3425        umull2          v7.4s,   v20.8h,  v28.8h
   3426        umlal2          v7.4s,   v21.8h,  v27.8h
   3427 
   3428        rshrn           v4.4h,   v4.4s,   #6
   3429        rshrn2          v4.8h,   v5.4s,   #6
   3430        rshrn           v5.4h,   v6.4s,   #6
   3431        rshrn2          v5.8h,   v7.4s,   #6
   3432 
   3433        st1             {v4.8h}, [x0], x1
   3434        subs            w5,  w5,  #2
   3435        st1             {v5.8h}, [x0], x1
   3436        b.le            9f
   3437 
   3438        add             v29.16b, v29.16b, v24.16b // base_y += 2 (*2)
   3439        add             v30.16b, v30.16b, v24.16b
   3440        b               89b
   3441 
   3442 9:
   3443        ldp             d14, d15, [sp, #0x30]
   3444        ldp             d12, d13, [sp, #0x20]
   3445        ldp             d10, d11, [sp, #0x10]
   3446        ldp             d8,  d9,  [sp], 0x40
   3447        ret
   3448 endfunc
   3449 
   3450 // void ipred_z3_fill1_16bpc_neon(pixel *dst, const ptrdiff_t stride,
   3451 //                                const pixel *const left,
   3452 //                                const int width, const int height,
   3453 //                                const int dy, const int max_base_y);
function ipred_z3_fill1_16bpc_neon, export=1
        // z3 (left-edge, bottom-left) directional prediction, 16 bpc,
        // non-upsampled edge. Dispatches on the height argument via
        // ipred_z3_fill1_tbl; once the sample position passes max_base_y,
        // control falls through to ipred_z3_fill_padding_neon which fills
        // the remainder of the block with the last edge pixel (in v31).
        clz             w9,  w4                   // index = clz(height) - 25
        movrel          x8,  ipred_z3_fill1_tbl
        sub             w9,  w9,  #25
        ldrsw           x9,  [x8, w9, uxtw #2]    // load relative table entry
        add             x10, x2,  w6,  uxtw #1    // left[max_base_y]
        add             x8,  x8,  x9
        ld1r            {v31.8h}, [x10]           // padding
        mov             w7,  w5                   // ypos = dy
        mov             w15, #64
        add             x13, x0,  x1              // second output row pointer
        lsl             x1,  x1,  #1              // step two rows at a time
        br              x8

40:
        AARCH64_VALID_JUMP_TARGET
4:
        // Each iteration interpolates two 4-pixel columns and stores them
        // transposed as 2x2-pixel chunks into the destination rows.
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        lsl             w8,  w8,  #1              // pixel index -> byte offset
        lsl             w10, w10, #1
        ldr             q0,  [x2, w8, uxtw]       // left[base]
        ldr             q2,  [x2, w10, uxtw]
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ext             v1.16b,  v0.16b,  v0.16b,  #2 // left[base+1]
        ext             v3.16b,  v2.16b,  v2.16b,  #2
        sub             v6.4h,   v1.4h,   v0.4h   // left[base+1]-left[base]
        sub             v7.4h,   v3.4h,   v2.4h
        ushll           v16.4s,  v0.4h,   #6      // left[base]*64
        ushll           v17.4s,  v2.4h,   #6
        smlal           v16.4s,  v6.4h,   v4.4h   // + (left[base+1]-left[base])*frac
        smlal           v17.4s,  v7.4h,   v5.4h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn           v17.4h,  v17.4s,  #6
        subs            w3,  w3,  #2              // width -= 2 (two columns done)
        zip1            v18.8h,  v16.8h,  v17.8h  // interleave the two columns
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        add             w7,  w7,  w5              // ypos += dy
        st1             {v18.s}[2], [x0]
        st1             {v18.s}[3], [x13]
        b.le            9f
        sub             x0,  x0,  x1              // ptr -= 1 * (2*stride), back to first row
        sub             x13, x13, x1
        add             x0,  x0,  #4              // advance 2 pixels to the right
        add             x13, x13, #4
        b               4b
9:
        ret

80:
        AARCH64_VALID_JUMP_TARGET
8:
        // Same as the h == 4 case, but interpolating two 8-pixel columns.
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1    // &left[base] (2 bytes/pixel)
        add             x10, x2,  w10, uxtw #1
        dup             v4.8h,   w9               // frac
        dup             v5.8h,   w11
        ld1             {v0.8h},  [x8]            // left[base]
        ld1             {v2.8h},  [x10]
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        ldr             h1, [x8, #16]             // left[base+8], shifted in below
        ldr             h3, [x10, #16]
        dup             v6.8h,   w9               // 64 - frac
        dup             v7.8h,   w11
        ext             v1.16b,  v0.16b,  v1.16b,  #2 // left[base+1]
        ext             v3.16b,  v2.16b,  v3.16b,  #2
        umull           v16.4s,  v0.4h,   v6.4h   // left[base]*(64-frac)
        umlal           v16.4s,  v1.4h,   v4.4h   // + left[base+1]*frac
        umull2          v17.4s,  v0.8h,   v6.8h
        umlal2          v17.4s,  v1.8h,   v4.8h
        umull           v18.4s,  v2.4h,   v7.4h
        umlal           v18.4s,  v3.4h,   v5.4h
        umull2          v19.4s,  v2.8h,   v7.8h
        umlal2          v19.4s,  v3.8h,   v5.8h
        rshrn           v16.4h,  v16.4s,  #6
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        subs            w3,  w3,  #2              // width -= 2
        zip1            v18.8h,  v16.8h,  v17.8h  // interleave the two columns
        zip2            v19.8h,  v16.8h,  v17.8h
        add             w7,  w7,  w5              // ypos += dy
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        st1             {v18.s}[2], [x0],  x1
        st1             {v18.s}[3], [x13], x1
        st1             {v19.s}[0], [x0],  x1
        st1             {v19.s}[1], [x13], x1
        st1             {v19.s}[2], [x0],  x1
        st1             {v19.s}[3], [x13], x1
        b.le            9f
        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
        sub             x13, x13, x1, lsl #2
        add             x0,  x0,  #4              // advance 2 pixels to the right
        add             x13, x13, #4
        b               8b
9:
        ret

160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
        mov             w12, w4                   // remember the full height
1:
        // Outer loop: one pair of columns per iteration; the inner loop (2:)
        // processes 16 rows of those columns at a time.
        lsr             w8,  w7,  #6              // base
        and             w9,  w7,  #0x3e           // frac
        add             w7,  w7,  w5              // ypos += dy
        cmp             w8,  w6                   // base >= max_base_y
        lsr             w10, w7,  #6              // base
        and             w11, w7,  #0x3e           // frac
        b.ge            ipred_z3_fill_padding_neon
        add             x8,  x2,  w8,  uxtw #1    // &left[base] (2 bytes/pixel)
        add             x10, x2,  w10, uxtw #1
        dup             v6.8h,   w9               // frac
        dup             v7.8h,   w11
        ld1             {v0.8h, v1.8h, v2.8h}, [x8],  #48 // left[base]
        ld1             {v3.8h, v4.8h, v5.8h}, [x10], #48
        sub             w9,  w15, w9              // 64 - frac
        sub             w11, w15, w11
        dup             v16.8h,  w9               // 64 - frac
        dup             v17.8h,  w11
        add             w7,  w7,  w5              // ypos += dy
2:
        ext             v18.16b, v0.16b,  v1.16b,  #2 // left[base+1]
        ext             v19.16b, v1.16b,  v2.16b,  #2
        ext             v20.16b, v3.16b,  v4.16b,  #2
        ext             v21.16b, v4.16b,  v5.16b,  #2
        subs            w4,  w4,  #16             // 16 rows per inner iteration
        umull           v22.4s,  v0.4h,   v16.4h  // left[base]*(64-frac)
        umlal           v22.4s,  v18.4h,  v6.4h   // + left[base+1]*frac
        umull2          v23.4s,  v0.8h,   v16.8h
        umlal2          v23.4s,  v18.8h,  v6.8h
        umull           v24.4s,  v1.4h,   v16.4h
        umlal           v24.4s,  v19.4h,  v6.4h
        umull2          v25.4s,  v1.8h,   v16.8h
        umlal2          v25.4s,  v19.8h,  v6.8h
        umull           v26.4s,  v3.4h,   v17.4h
        umlal           v26.4s,  v20.4h,  v7.4h
        umull2          v27.4s,  v3.8h,   v17.8h
        umlal2          v27.4s,  v20.8h,  v7.8h
        umull           v28.4s,  v4.4h,   v17.4h
        umlal           v28.4s,  v21.4h,  v7.4h
        umull2          v29.4s,  v4.8h,   v17.8h
        umlal2          v29.4s,  v21.8h,  v7.8h
        rshrn           v22.4h,  v22.4s,  #6
        rshrn2          v22.8h,  v23.4s,  #6
        rshrn           v23.4h,  v24.4s,  #6
        rshrn2          v23.8h,  v25.4s,  #6
        rshrn           v24.4h,  v26.4s,  #6
        rshrn2          v24.8h,  v27.4s,  #6
        rshrn           v25.4h,  v28.4s,  #6
        rshrn2          v25.8h,  v29.4s,  #6
        zip1            v18.8h,  v22.8h,  v24.8h  // interleave the two columns
        zip2            v19.8h,  v22.8h,  v24.8h
        zip1            v20.8h,  v23.8h,  v25.8h
        zip2            v21.8h,  v23.8h,  v25.8h
        st1             {v18.s}[0], [x0],  x1
        st1             {v18.s}[1], [x13], x1
        st1             {v18.s}[2], [x0],  x1
        st1             {v18.s}[3], [x13], x1
        st1             {v19.s}[0], [x0],  x1
        st1             {v19.s}[1], [x13], x1
        st1             {v19.s}[2], [x0],  x1
        st1             {v19.s}[3], [x13], x1
        st1             {v20.s}[0], [x0],  x1
        st1             {v20.s}[1], [x13], x1
        st1             {v20.s}[2], [x0],  x1
        st1             {v20.s}[3], [x13], x1
        st1             {v21.s}[0], [x0],  x1
        st1             {v21.s}[1], [x13], x1
        st1             {v21.s}[2], [x0],  x1
        st1             {v21.s}[3], [x13], x1
        b.le            3f
        mov             v0.16b,  v2.16b           // shift the sliding window down
        ld1             {v1.8h, v2.8h}, [x8],  #32      // left[base]
        mov             v3.16b,  v5.16b
        ld1             {v4.8h, v5.8h}, [x10], #32
        b               2b

3:
        subs            w3,  w3,  #2              // width -= 2
        b.le            9f
        lsr             x1,  x1,  #1              // single-row stride for the rewind
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        lsl             x1,  x1,  #1              // back to 2*stride
        add             x0,  x0,  #4              // advance 2 pixels to the right
        add             x13, x13, #4
        mov             w4,  w12                  // restore the full height
        b               1b
9:
        ret
endfunc
   3662 
// Entry points for ipred_z3_fill1_16bpc_neon; 32-bit offsets relative to
// the table base, indexed by clz(height)-25 (heights 64, 32, 16, 8, 4).
jumptable ipred_z3_fill1_tbl
        .word 640b - ipred_z3_fill1_tbl
        .word 320b - ipred_z3_fill1_tbl
        .word 160b - ipred_z3_fill1_tbl
        .word 80b  - ipred_z3_fill1_tbl
        .word 40b  - ipred_z3_fill1_tbl
endjumptable
   3670 
function ipred_z3_fill_padding_neon, export=0
        // Shared tail helper for the z3 fill functions: fills the rest of
        // the block with the replicated padding value held in v31.
        // Inherited register state (set up by the callers before branching
        // here): x0/x13 = even/odd dst row pointers, x1 = 2*stride,
        // w3 = remaining width, w4 = remaining height, v31 = padding value.
        cmp             w3,  #8
        movrel          x8,  ipred_z3_fill_padding_tbl
        b.gt            ipred_z3_fill_padding_wide
        // w3 = remaining width, w4 = constant height
        mov             w12, w4

1:
        // Fill a WxH rectangle with padding. W can be any number;
        // this fills the exact width by filling in the largest
        // power of two in the remaining width, and repeating.
        clz             w9,  w3                   // index = clz(width) - 25
        sub             w9,  w9,  #25
        ldrsw           x9,  [x8, w9, uxtw #2]
        add             x9,  x8,  x9
        br              x9

20:
        AARCH64_VALID_JUMP_TARGET
2:
        st1             {v31.s}[0], [x0],  x1     // 2 pixels (4 bytes) per row
        subs            w4,  w4,  #4
        st1             {v31.s}[0], [x13], x1
        st1             {v31.s}[0], [x0],  x1
        st1             {v31.s}[0], [x13], x1
        b.gt            2b
        subs            w3,  w3,  #2
        lsr             x1,  x1,  #1              // single-row stride for the rewind
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1              // back to 2*stride
        add             x0,  x0,  #4              // advance 2 pixels to the right
        add             x13, x13, #4
        mov             w4,  w12                  // restore the full height
        b               1b

40:
        AARCH64_VALID_JUMP_TARGET
4:
        st1             {v31.4h}, [x0],  x1       // 4 pixels (8 bytes) per row
        subs            w4,  w4,  #4
        st1             {v31.4h}, [x13], x1
        st1             {v31.4h}, [x0],  x1
        st1             {v31.4h}, [x13], x1
        b.gt            4b
        subs            w3,  w3,  #4
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #8              // advance 4 pixels to the right
        add             x13, x13, #8
        mov             w4,  w12
        b               1b

80:
160:
320:
640:
        AARCH64_VALID_JUMP_TARGET
8:
        st1             {v31.8h}, [x0],  x1       // 8 pixels (16 bytes) per row
        subs            w4,  w4,  #4
        st1             {v31.8h}, [x13], x1
        st1             {v31.8h}, [x0],  x1
        st1             {v31.8h}, [x13], x1
        b.gt            8b
        subs            w3,  w3,  #8
        lsr             x1,  x1,  #1
        msub            x0,  x1,  x12, x0         // ptr -= h * stride
        msub            x13, x1,  x12, x13
        b.le            9f
        lsl             x1,  x1,  #1
        add             x0,  x0,  #16             // advance 8 pixels to the right
        add             x13, x13, #16
        mov             w4,  w12
        b               1b

9:
        ret
endfunc
   3754 
// Entry points for ipred_z3_fill_padding_neon; 32-bit offsets relative to
// the table base, indexed by clz(width)-25 (widths 64, 32, 16, 8, 4, 2).
jumptable ipred_z3_fill_padding_tbl
        .word 640b - ipred_z3_fill_padding_tbl
        .word 320b - ipred_z3_fill_padding_tbl
        .word 160b - ipred_z3_fill_padding_tbl
        .word 80b  - ipred_z3_fill_padding_tbl
        .word 40b  - ipred_z3_fill_padding_tbl
        .word 20b  - ipred_z3_fill_padding_tbl
endjumptable
   3763 
function ipred_z3_fill_padding_wide
        // Fill a WxH rectangle with padding, with W > 8.
        // Inherited state: x0 = dst, x1 = 2*stride, w3 = width,
        // w4 = height, v31 = padding value (replicated).
        lsr             x1,  x1,  #1              // restore the single-row stride
        mov             w12, w3                   // remember the full width
        sub             x1,  x1,  w3,  uxtw #1    // stride -= width*2 (bytes written/row)
1:
        ands            w5,  w3,  #7              // w5 = width % 8
        b.eq            2f
        // If the width isn't aligned to 8, first do one 8 pixel write
        // and align the start pointer.
        sub             w3,  w3,  w5
        st1             {v31.8h}, [x0]            // overlap is fine: constant fill
        add             x0,  x0,  w5,  uxtw #1
2:
        // Fill the rest of the line with aligned 8 pixel writes.
        subs            w3,  w3,  #8
        st1             {v31.8h}, [x0], #16
        b.gt            2b
        subs            w4,  w4,  #1
        add             x0,  x0,  x1              // advance to the next row
        b.le            9f
        mov             w3,  w12                  // restore the full width
        b               1b
9:
        ret
endfunc
   3790 
   3791 function ipred_z3_fill2_16bpc_neon, export=1
   3792        cmp             w4,  #8
   3793        add             x10, x2,  w6,  uxtw       // left[max_base_y]
   3794        ld1r            {v31.16b}, [x10]          // padding
   3795        mov             w7,  w5
   3796        mov             w15, #64
   3797        add             x13, x0,  x1
   3798        lsl             x1,  x1,  #1
   3799        b.eq            8f
   3800 
   3801 4:      // h == 4
   3802        lsr             w8,  w7,  #6              // base
   3803        and             w9,  w7,  #0x3e           // frac
   3804        add             w7,  w7,  w5              // xpos += dx
   3805        cmp             w8,  w6                   // base >= max_base_x
   3806        lsr             w10, w7,  #6              // base
   3807        and             w11, w7,  #0x3e           // frac
   3808        b.ge            ipred_z3_fill_padding_neon
   3809        lsl             w8,  w8,  #1
   3810        lsl             w10, w10, #1
   3811        ldr             q0,  [x2, w8, uxtw]       // top[base]
   3812        ldr             q2,  [x2, w10, uxtw]
   3813        dup             v4.4h,   w9               // frac
   3814        dup             v5.4h,   w11
   3815        uzp2            v1.8h,   v0.8h,   v0.8h   // top[base+1]
   3816        uzp1            v0.8h,   v0.8h,   v0.8h   // top[base]
   3817        uzp2            v3.8h,   v2.8h,   v2.8h
   3818        uzp1            v2.8h,   v2.8h,   v2.8h
   3819        sub             v6.4h,   v1.4h,   v0.4h   // top[base+1]-top[base]
   3820        sub             v7.4h,   v3.4h,   v2.4h
   3821        ushll           v16.4s,  v0.4h,   #6      // top[base]*64
   3822        ushll           v17.4s,  v2.4h,   #6
   3823        smlal           v16.4s,  v6.4h,   v4.4h   // + top[base+1]*frac
   3824        smlal           v17.4s,  v7.4h,   v5.4h
   3825        rshrn           v16.4h,  v16.4s,  #6
   3826        rshrn           v17.4h,  v17.4s,  #6
   3827        subs            w3,  w3,  #2
   3828        zip1            v18.8h,  v16.8h,  v17.8h
   3829        st1             {v18.s}[0], [x0],  x1
   3830        st1             {v18.s}[1], [x13], x1
   3831        add             w7,  w7,  w5              // xpos += dx
   3832        st1             {v18.s}[2], [x0]
   3833        st1             {v18.s}[3], [x13]
   3834        b.le            9f
   3835        sub             x0,  x0,  x1              // ptr -= 4 * (2*stride)
   3836        sub             x13, x13, x1
   3837        add             x0,  x0,  #4
   3838        add             x13, x13, #4
   3839        b               4b
   3840 9:
   3841        ret
   3842 
   3843 8:      // h == 8
   3844        lsr             w8,  w7,  #6              // base
   3845        and             w9,  w7,  #0x3e           // frac
   3846        add             w7,  w7,  w5              // xpos += dx
   3847        cmp             w8,  w6                   // base >= max_base_x
   3848        lsr             w10, w7,  #6              // base
   3849        and             w11, w7,  #0x3e           // frac
   3850        b.ge            ipred_z3_fill_padding_neon
   3851        add             x8,  x2,  w8,  uxtw #1
   3852        add             x10, x2,  w10, uxtw #1
   3853        dup             v4.8h,   w9               // frac
   3854        dup             v5.8h,   w11
   3855        ld1             {v0.8h, v1.8h},  [x8]     // top[base]
   3856        ld1             {v2.8h, v3.8h},  [x10]
   3857        sub             w9,  w15, w9              // 64 - frac
   3858        sub             w11, w15, w11
   3859        dup             v6.8h,   w9               // 64 - frac
   3860        dup             v7.8h,   w11
   3861        uzp2            v20.8h,  v0.8h,   v1.8h   // top[base+1]
   3862        uzp1            v0.8h,   v0.8h,   v1.8h   // top[base]
   3863        uzp2            v21.8h,  v2.8h,   v3.8h
   3864        uzp1            v2.8h,   v2.8h,   v3.8h
   3865        umull           v16.4s,  v0.4h,   v6.4h   // top[base]*(64-frac)
   3866        umlal           v16.4s,  v20.4h,  v4.4h   // + top[base+1]*frac
   3867        umull2          v17.4s,  v0.8h,   v6.8h
   3868        umlal2          v17.4s,  v20.8h,  v4.8h
   3869        umull           v18.4s,  v2.4h,   v7.4h
   3870        umlal           v18.4s,  v21.4h,  v5.4h
   3871        umull2          v19.4s,  v2.8h,   v7.8h
   3872        umlal2          v19.4s,  v21.8h,  v5.8h
   3873        rshrn           v16.4h,  v16.4s,  #6
   3874        rshrn2          v16.8h,  v17.4s,  #6
   3875        rshrn           v17.4h,  v18.4s,  #6
   3876        rshrn2          v17.8h,  v19.4s,  #6
   3877        subs            w3,  w3,  #2
   3878        zip1            v18.8h,  v16.8h,  v17.8h
   3879        zip2            v19.8h,  v16.8h,  v17.8h
   3880        add             w7,  w7,  w5              // xpos += dx
   3881        st1             {v18.s}[0], [x0],  x1
   3882        st1             {v18.s}[1], [x13], x1
   3883        st1             {v18.s}[2], [x0],  x1
   3884        st1             {v18.s}[3], [x13], x1
   3885        st1             {v19.s}[0], [x0],  x1
   3886        st1             {v19.s}[1], [x13], x1
   3887        st1             {v19.s}[2], [x0],  x1
   3888        st1             {v19.s}[3], [x13], x1
   3889        b.le            9f
   3890        sub             x0,  x0,  x1, lsl #2      // ptr -= 4 * (2*stride)
   3891        sub             x13, x13, x1, lsl #2
   3892        add             x0,  x0,  #4
   3893        add             x13, x13, #4
   3894        b               8b
   3895 9:
   3896        ret
   3897 endfunc
   3898 
   3899 
   3900 // void ipred_filter_16bpc_neon(pixel *dst, const ptrdiff_t stride,
   3901 //                              const pixel *const topleft,
   3902 //                              const int width, const int height, const int filt_idx,
   3903 //                              const int max_width, const int max_height,
   3904 //                              const int bitdepth_max);
   3905 .macro filter_fn bpc
   3906 function ipred_filter_\bpc\()bpc_neon
   3907        and             w5,  w5,  #511
   3908        movrel          x6,  X(filter_intra_taps)
   3909        lsl             w5,  w5,  #6
   3910        add             x6,  x6,  w5, uxtw
   3911        ld1             {v16.8b, v17.8b, v18.8b, v19.8b}, [x6], #32
   3912        clz             w9,  w3
   3913        movrel          x5,  ipred_filter\bpc\()_tbl
   3914        ld1             {v20.8b, v21.8b, v22.8b}, [x6]
   3915        sub             w9,  w9,  #26
   3916        ldrsw           x9,  [x5, w9, uxtw #2]
   3917        sxtl            v16.8h,  v16.8b
   3918        sxtl            v17.8h,  v17.8b
   3919        add             x5,  x5,  x9
   3920        sxtl            v18.8h,  v18.8b
   3921        sxtl            v19.8h,  v19.8b
   3922        add             x6,  x0,  x1
   3923        lsl             x1,  x1,  #1
   3924        sxtl            v20.8h,  v20.8b
   3925        sxtl            v21.8h,  v21.8b
   3926        sxtl            v22.8h,  v22.8b
   3927        dup             v31.8h,  w8
   3928 .if \bpc == 10
   3929        movi            v30.8h,  #0
   3930 .endif
   3931        br              x5
   3932 40:
   3933        AARCH64_VALID_JUMP_TARGET
   3934        ldur            d0,  [x2, #2]             // top (0-3)
   3935        sub             x2,  x2,  #4
   3936        mov             x7,  #-4
   3937 4:
   3938        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
   3939 .if \bpc == 10
   3940        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
   3941        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
   3942        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
   3943        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
   3944        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
   3945        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
   3946        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
   3947        srshr           v2.8h,   v2.8h,   #4
   3948        smax            v2.8h,   v2.8h,   v30.8h
   3949 .else
   3950        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
   3951        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
   3952        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
   3953        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
   3954        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
   3955        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
   3956        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
   3957        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
   3958        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
   3959        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
   3960        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
   3961        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
   3962        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
   3963        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
   3964        sqrshrun        v2.4h,   v2.4s,   #4
   3965        sqrshrun2       v2.8h,   v3.4s,   #4
   3966 .endif
   3967        smin            v2.8h,   v2.8h,   v31.8h
   3968        subs            w4,  w4,  #2
   3969        st1             {v2.d}[0], [x0], x1
   3970        ext             v0.16b,  v2.16b,  v2.16b, #8 // move top from [4-7] to [0-3]
   3971        st1             {v2.d}[1], [x6], x1
   3972        b.gt            4b
   3973        ret
   3974 80:
   3975        AARCH64_VALID_JUMP_TARGET
   3976        ldur            q0,  [x2, #2]             // top (0-7)
   3977        sub             x2,  x2,  #4
   3978        mov             x7,  #-4
   3979 8:
   3980        ld1             {v1.4h}, [x2], x7         // left (0-1) + topleft (2)
   3981 .if \bpc == 10
   3982        mul             v2.8h,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
   3983        mla             v2.8h,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
   3984        mla             v2.8h,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
   3985        mla             v2.8h,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
   3986        mla             v2.8h,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
   3987        mla             v2.8h,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
   3988        mla             v2.8h,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
   3989        mul             v3.8h,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
   3990        mla             v3.8h,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
   3991        mla             v3.8h,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
   3992        srshr           v2.8h,   v2.8h,   #4
   3993        smax            v2.8h,   v2.8h,   v30.8h
   3994        smin            v2.8h,   v2.8h,   v31.8h
   3995        mla             v3.8h,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
   3996        mla             v3.8h,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
   3997        mla             v3.8h,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
   3998        mla             v3.8h,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
   3999        srshr           v3.8h,   v3.8h,   #4
   4000        smax            v3.8h,   v3.8h,   v30.8h
   4001 .else
   4002        smull           v2.4s,   v17.4h,  v0.h[0] // p1(top[0]) * filter(1)
   4003        smlal           v2.4s,   v18.4h,  v0.h[1] // p2(top[1]) * filter(2)
   4004        smlal           v2.4s,   v19.4h,  v0.h[2] // p3(top[2]) * filter(3)
   4005        smlal           v2.4s,   v20.4h,  v0.h[3] // p4(top[3]) * filter(4)
   4006        smlal           v2.4s,   v16.4h,  v1.h[2] // p0(topleft) * filter(0)
   4007        smlal           v2.4s,   v21.4h,  v1.h[1] // p5(left[0]) * filter(5)
   4008        smlal           v2.4s,   v22.4h,  v1.h[0] // p6(left[1]) * filter(6)
   4009        smull2          v3.4s,   v17.8h,  v0.h[0] // p1(top[0]) * filter(1)
   4010        smlal2          v3.4s,   v18.8h,  v0.h[1] // p2(top[1]) * filter(2)
   4011        smlal2          v3.4s,   v19.8h,  v0.h[2] // p3(top[2]) * filter(3)
   4012        smlal2          v3.4s,   v20.8h,  v0.h[3] // p4(top[3]) * filter(4)
   4013        smlal2          v3.4s,   v16.8h,  v1.h[2] // p0(topleft) * filter(0)
   4014        smlal2          v3.4s,   v21.8h,  v1.h[1] // p5(left[0]) * filter(5)
   4015        smlal2          v3.4s,   v22.8h,  v1.h[0] // p6(left[1]) * filter(6)
   4016        smull           v4.4s,   v17.4h,  v0.h[4] // p1(top[0]) * filter(1)
   4017        smlal           v4.4s,   v18.4h,  v0.h[5] // p2(top[1]) * filter(2)
   4018        smlal           v4.4s,   v19.4h,  v0.h[6] // p3(top[2]) * filter(3)
   4019        sqrshrun        v2.4h,   v2.4s,   #4
   4020        sqrshrun2       v2.8h,   v3.4s,   #4
   4021        smin            v2.8h,   v2.8h,   v31.8h
   4022        smlal           v4.4s,   v20.4h,  v0.h[7] // p4(top[3]) * filter(4)
   4023        smlal           v4.4s,   v16.4h,  v0.h[3] // p0(topleft) * filter(0)
   4024        smlal           v4.4s,   v21.4h,  v2.h[3] // p5(left[0]) * filter(5)
   4025        smlal           v4.4s,   v22.4h,  v2.h[7] // p6(left[1]) * filter(6)
   4026        smull2          v5.4s,   v17.8h,  v0.h[4] // p1(top[0]) * filter(1)
   4027        smlal2          v5.4s,   v18.8h,  v0.h[5] // p2(top[1]) * filter(2)
   4028        smlal2          v5.4s,   v19.8h,  v0.h[6] // p3(top[2]) * filter(3)
   4029        smlal2          v5.4s,   v20.8h,  v0.h[7] // p4(top[3]) * filter(4)
   4030        smlal2          v5.4s,   v16.8h,  v0.h[3] // p0(topleft) * filter(0)
   4031        smlal2          v5.4s,   v21.8h,  v2.h[3] // p5(left[0]) * filter(5)
   4032        smlal2          v5.4s,   v22.8h,  v2.h[7] // p6(left[1]) * filter(6)
   4033        sqrshrun        v3.4h,   v4.4s,   #4
   4034        sqrshrun2       v3.8h,   v5.4s,   #4
   4035 .endif
   4036        smin            v3.8h,   v3.8h,   v31.8h
   4037        subs            w4,  w4,  #2
   4038        st2             {v2.d, v3.d}[0], [x0], x1
   4039        zip2            v0.2d,   v2.2d,   v3.2d
   4040        st2             {v2.d, v3.d}[1], [x6], x1
   4041        b.gt            8b
   4042        ret
   4043 160:
   4044 320:
   4045        AARCH64_VALID_JUMP_TARGET
   4046        add             x8,  x2,  #2
   4047        sub             x2,  x2,  #4
   4048        mov             x7,  #-4
   4049        sub             x1,  x1,  w3, uxtw #1
   4050        mov             w9,  w3
   4051 
   4052 1:
   4053        ld1             {v0.4h}, [x2], x7         // left (0-1) + topleft (2)
   4054 2:
   4055        ld1             {v1.8h, v2.8h}, [x8], #32 // top(0-15)
   4056 .if \bpc == 10
   4057        mul             v3.8h,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
   4058        mla             v3.8h,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
   4059        mla             v3.8h,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
   4060        mla             v3.8h,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
   4061        mla             v3.8h,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
   4062        mla             v3.8h,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
   4063        mla             v3.8h,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
   4064 
   4065        mul             v4.8h,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
   4066        mla             v4.8h,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
   4067        mla             v4.8h,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
   4068        srshr           v3.8h,   v3.8h,   #4
   4069        smax            v3.8h,   v3.8h,   v30.8h
   4070        smin            v3.8h,   v3.8h,   v31.8h
   4071        mla             v4.8h,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
   4072        mla             v4.8h,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
   4073        mla             v4.8h,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
   4074        mla             v4.8h,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
   4075 
   4076        mul             v5.8h,   v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
   4077        mla             v5.8h,   v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
   4078        mla             v5.8h,   v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
   4079        srshr           v4.8h,   v4.8h,   #4
   4080        smax            v4.8h,   v4.8h,   v30.8h
   4081        smin            v4.8h,   v4.8h,   v31.8h
   4082        mla             v5.8h,   v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
   4083        mla             v5.8h,   v16.8h,  v1.h[7] // p0(topleft) * filter(0)
   4084        mla             v5.8h,   v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
   4085        mla             v5.8h,   v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
   4086 
   4087        mul             v6.8h,   v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
   4088        mla             v6.8h,   v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
   4089        mla             v6.8h,   v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
   4090        srshr           v5.8h,   v5.8h,   #4
   4091        smax            v5.8h,   v5.8h,   v30.8h
   4092        smin            v5.8h,   v5.8h,   v31.8h
   4093        mla             v6.8h,   v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
   4094        mla             v6.8h,   v16.8h,  v2.h[3] // p0(topleft) * filter(0)
   4095        mla             v6.8h,   v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
   4096        mla             v6.8h,   v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
   4097 
   4098        subs            w3,  w3,  #16
   4099        srshr           v6.8h,   v6.8h,   #4
   4100        smax            v6.8h,   v6.8h,   v30.8h
   4101 .else
   4102        smull           v3.4s,   v16.4h,  v0.h[2] // p0(topleft) * filter(0)
   4103        smlal           v3.4s,   v21.4h,  v0.h[1] // p5(left[0]) * filter(5)
   4104        smlal           v3.4s,   v22.4h,  v0.h[0] // p6(left[1]) * filter(6)
   4105        smlal           v3.4s,   v17.4h,  v1.h[0] // p1(top[0]) * filter(1)
   4106        smlal           v3.4s,   v18.4h,  v1.h[1] // p2(top[1]) * filter(2)
   4107        smlal           v3.4s,   v19.4h,  v1.h[2] // p3(top[2]) * filter(3)
   4108        smlal           v3.4s,   v20.4h,  v1.h[3] // p4(top[3]) * filter(4)
   4109        smull2          v4.4s,   v16.8h,  v0.h[2] // p0(topleft) * filter(0)
   4110        smlal2          v4.4s,   v21.8h,  v0.h[1] // p5(left[0]) * filter(5)
   4111        smlal2          v4.4s,   v22.8h,  v0.h[0] // p6(left[1]) * filter(6)
   4112        smlal2          v4.4s,   v17.8h,  v1.h[0] // p1(top[0]) * filter(1)
   4113        smlal2          v4.4s,   v18.8h,  v1.h[1] // p2(top[1]) * filter(2)
   4114        smlal2          v4.4s,   v19.8h,  v1.h[2] // p3(top[2]) * filter(3)
   4115        smlal2          v4.4s,   v20.8h,  v1.h[3] // p4(top[3]) * filter(4)
   4116 
   4117        smull           v5.4s,   v17.4h,  v1.h[4] // p1(top[0]) * filter(1)
   4118        smlal           v5.4s,   v18.4h,  v1.h[5] // p2(top[1]) * filter(2)
   4119        smlal           v5.4s,   v19.4h,  v1.h[6] // p3(top[2]) * filter(3)
   4120        sqrshrun        v3.4h,   v3.4s,   #4
   4121        sqrshrun2       v3.8h,   v4.4s,   #4
   4122        smin            v3.8h,   v3.8h,   v31.8h
   4123        smlal           v5.4s,   v20.4h,  v1.h[7] // p4(top[3]) * filter(4)
   4124        smlal           v5.4s,   v16.4h,  v1.h[3] // p0(topleft) * filter(0)
   4125        smlal           v5.4s,   v21.4h,  v3.h[3] // p5(left[0]) * filter(5)
   4126        smlal           v5.4s,   v22.4h,  v3.h[7] // p6(left[1]) * filter(6)
   4127        smull2          v6.4s,   v17.8h,  v1.h[4] // p1(top[0]) * filter(1)
   4128        smlal2          v6.4s,   v18.8h,  v1.h[5] // p2(top[1]) * filter(2)
   4129        smlal2          v6.4s,   v19.8h,  v1.h[6] // p3(top[2]) * filter(3)
   4130        smlal2          v6.4s,   v20.8h,  v1.h[7] // p4(top[3]) * filter(4)
   4131        smlal2          v6.4s,   v16.8h,  v1.h[3] // p0(topleft) * filter(0)
   4132        smlal2          v6.4s,   v21.8h,  v3.h[3] // p5(left[0]) * filter(5)
   4133        smlal2          v6.4s,   v22.8h,  v3.h[7] // p6(left[1]) * filter(6)
   4134 
   4135        smull           v24.4s,  v17.4h,  v2.h[0] // p1(top[0]) * filter(1)
   4136        smlal           v24.4s,  v18.4h,  v2.h[1] // p2(top[1]) * filter(2)
   4137        smlal           v24.4s,  v19.4h,  v2.h[2] // p3(top[2]) * filter(3)
   4138        sqrshrun        v4.4h,   v5.4s,   #4
   4139        sqrshrun2       v4.8h,   v6.4s,   #4
   4140        smin            v4.8h,   v4.8h,   v31.8h
   4141        smlal           v24.4s,  v20.4h,  v2.h[3] // p4(top[3]) * filter(4)
   4142        smlal           v24.4s,  v16.4h,  v1.h[7] // p0(topleft) * filter(0)
   4143        smlal           v24.4s,  v21.4h,  v4.h[3] // p5(left[0]) * filter(5)
   4144        smlal           v24.4s,  v22.4h,  v4.h[7] // p6(left[1]) * filter(6)
   4145        smull2          v25.4s,  v17.8h,  v2.h[0] // p1(top[0]) * filter(1)
   4146        smlal2          v25.4s,  v18.8h,  v2.h[1] // p2(top[1]) * filter(2)
   4147        smlal2          v25.4s,  v19.8h,  v2.h[2] // p3(top[2]) * filter(3)
   4148        smlal2          v25.4s,  v20.8h,  v2.h[3] // p4(top[3]) * filter(4)
   4149        smlal2          v25.4s,  v16.8h,  v1.h[7] // p0(topleft) * filter(0)
   4150        smlal2          v25.4s,  v21.8h,  v4.h[3] // p5(left[0]) * filter(5)
   4151        smlal2          v25.4s,  v22.8h,  v4.h[7] // p6(left[1]) * filter(6)
   4152 
   4153        smull           v26.4s,  v17.4h,  v2.h[4] // p1(top[0]) * filter(1)
   4154        smlal           v26.4s,  v18.4h,  v2.h[5] // p2(top[1]) * filter(2)
   4155        smlal           v26.4s,  v19.4h,  v2.h[6] // p3(top[2]) * filter(3)
   4156        sqrshrun        v5.4h,   v24.4s,  #4
   4157        sqrshrun2       v5.8h,   v25.4s,  #4
   4158        smin            v5.8h,   v5.8h,   v31.8h
   4159        smlal           v26.4s,  v20.4h,  v2.h[7] // p4(top[3]) * filter(4)
   4160        smlal           v26.4s,  v16.4h,  v2.h[3] // p0(topleft) * filter(0)
   4161        smlal           v26.4s,  v21.4h,  v5.h[3] // p5(left[0]) * filter(5)
   4162        smlal           v26.4s,  v22.4h,  v5.h[7] // p6(left[1]) * filter(6)
   4163        smull2          v27.4s,  v17.8h,  v2.h[4] // p1(top[0]) * filter(1)
   4164        smlal2          v27.4s,  v18.8h,  v2.h[5] // p2(top[1]) * filter(2)
   4165        smlal2          v27.4s,  v19.8h,  v2.h[6] // p3(top[2]) * filter(3)
   4166        smlal2          v27.4s,  v20.8h,  v2.h[7] // p4(top[3]) * filter(4)
   4167        smlal2          v27.4s,  v16.8h,  v2.h[3] // p0(topleft) * filter(0)
   4168        smlal2          v27.4s,  v21.8h,  v5.h[3] // p5(left[0]) * filter(5)
   4169        smlal2          v27.4s,  v22.8h,  v5.h[7] // p6(left[1]) * filter(6)
   4170 
   4171        subs            w3,  w3,  #16
   4172        sqrshrun        v6.4h,   v26.4s,  #4
   4173        sqrshrun2       v6.8h,   v27.4s,  #4
   4174 .endif
   4175        smin            v6.8h,   v6.8h,   v31.8h
   4176 
   4177        ins             v0.h[2], v2.h[7]
   4178        st4             {v3.d, v4.d, v5.d, v6.d}[0], [x0], #32
   4179        ins             v0.h[0], v6.h[7]
   4180        st4             {v3.d, v4.d, v5.d, v6.d}[1], [x6], #32
   4181        ins             v0.h[1], v6.h[3]
   4182        b.gt            2b
   4183        subs            w4,  w4,  #2
   4184        b.le            9f
   4185        sub             x8,  x6,  w9, uxtw #1
   4186        add             x0,  x0,  x1
   4187        add             x6,  x6,  x1
   4188        mov             w3,  w9
   4189        b               1b
   4190 9:
   4191        ret
   4192 endfunc
   4193 
   4194 jumptable ipred_filter\bpc\()_tbl
   4195        .word 320b - ipred_filter\bpc\()_tbl
   4196        .word 160b - ipred_filter\bpc\()_tbl
   4197        .word 80b  - ipred_filter\bpc\()_tbl
   4198        .word 40b  - ipred_filter\bpc\()_tbl
   4199 endjumptable
   4200 .endm
   4201 
// Instantiate both 16bpc sub-depth variants of the filter predictor:
// at 10 bpc the products fit in 16-bit lanes (mul/mla path in the macro),
// while 12 bpc needs the widening smull/smlal path (see the \bpc == 10
// conditionals inside the macro above).
filter_fn 10
filter_fn 12
   4204 
// Runtime dispatcher for the 16 bpc filter predictor: selects the
// sub-depth specialization by inspecting the bitdepth_max argument,
// which is passed on the stack (beyond the x0-x7 register arguments).
function ipred_filter_16bpc_neon, export=1
        ldr             w8,  [sp]                 // bitdepth_max (first stack argument)
        cmp             w8,  0x3ff                // 0x3ff == (1 << 10) - 1, i.e. 10 bpc
        b.le            ipred_filter_10bpc_neon   // 10 bpc: non-widening 16-bit arithmetic
        b               ipred_filter_12bpc_neon   // 12 bpc: widening 32-bit arithmetic
endfunc
   4211 
   4212 // void pal_pred_16bpc_neon(pixel *dst, const ptrdiff_t stride,
   4213 //                          const pixel *const pal, const uint8_t *idx,
   4214 //                          const int w, const int h);
// Palette prediction for 16 bpc: each pixel is one of 8 16-bit palette
// entries, selected by a 4-bit index (two indices packed per byte in *idx).
// The 16-bit lookup is done bytewise with TBL: every index i is turned
// into the pair of byte offsets 2*i (low byte) and 2*i + 1 (high byte)
// into the palette register v30.
function pal_pred_16bpc_neon, export=1
        ld1             {v30.8h}, [x2]            // palette: 8 x 16-bit entries
        clz             w9,  w4
        movrel          x6,  pal_pred_tbl
        sub             w9,  w9,  #25             // clz(64) == 25 -> table entry 0
        movi            v29.16b, #7               // 3-bit mask (indices are 0..7)
        ldrsw           x9,  [x6, w9, uxtw #2]
        movi            v31.8h,  #1, lsl #8       // 0x0100 per lane: +1 on every odd byte
        add             x6,  x6,  x9
        br              x6
40:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1              // pointer for the odd rows
        lsl             x1,  x1,  #1              // advance two rows at a time
4:
        ld1             {v1.8b}, [x3], #8         // 16 packed indices = 4 rows of w=4
        subs            w5,  w5,  #4
        ushr            v3.8b,   v1.8b,   #4      // odd elements (high nibbles)
        and             v2.8b,   v1.8b,   v29.8b  // even elements (low nibbles)
        zip1            v1.16b,  v2.16b,  v3.16b  // re-interleave into source order
        // Restructure v1 from a, b, c, ... into 2*a, 2*a+1, 2*b, 2*b+1, 2*c, 2*c+1, ...
        add             v1.16b,  v1.16b,  v1.16b
        zip1            v0.16b,  v1.16b,  v1.16b
        zip2            v1.16b,  v1.16b,  v1.16b
        add             v0.8h,   v0.8h,   v31.8h  // now 2*i, 2*i+1 byte offsets per pixel
        add             v1.8h,   v1.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b // bytewise palette lookup
        st1             {v0.d}[0], [x0], x1
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.d}[1], [x2], x1
        st1             {v1.d}[0], [x0], x1
        st1             {v1.d}[1], [x2], x1
        b.gt            4b
        ret
80:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1              // pointer for the odd rows
        lsl             x1,  x1,  #1              // advance two rows at a time
8:
        // Same nibble-unpack + TBL scheme as w=4; 32 indices = 4 rows of w=8.
        ld1             {v2.16b}, [x3], #16
        subs            w5,  w5,  #4
        ushr            v4.16b,  v2.16b,  #4
        and             v3.16b,  v2.16b,  v29.16b
        zip1            v2.16b,  v3.16b,  v4.16b
        zip2            v3.16b,  v3.16b,  v4.16b
        add             v2.16b,  v2.16b,  v2.16b
        add             v3.16b,  v3.16b,  v3.16b
        zip1            v0.16b,  v2.16b,  v2.16b
        zip2            v1.16b,  v2.16b,  v2.16b
        zip1            v2.16b,  v3.16b,  v3.16b
        zip2            v3.16b,  v3.16b,  v3.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        tbl             v1.16b, {v30.16b}, v1.16b
        st1             {v0.8h}, [x0], x1
        tbl             v2.16b, {v30.16b}, v2.16b
        st1             {v1.8h}, [x2], x1
        tbl             v3.16b, {v30.16b}, v3.16b
        st1             {v2.8h}, [x0], x1
        st1             {v3.8h}, [x2], x1
        b.gt            8b
        ret
160:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1              // pointer for the odd rows
        lsl             x1,  x1,  #1              // advance two rows at a time
16:
        // 64 indices per load = 4 rows of w=16.
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #4
        ushr            v7.16b,  v4.16b,  #4
        and             v6.16b,  v4.16b,  v29.16b
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        st1             {v2.8h, v3.8h}, [x2], x1
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h}, [x0], x1
        st1             {v6.8h, v7.8h}, [x2], x1
        b.gt            16b
        ret
320:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  x1              // pointer for the odd rows
        lsl             x1,  x1,  #1              // advance two rows at a time
32:
        // 64 indices per load = 2 rows of w=32.
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #2
        ushr            v7.16b,  v4.16b,  #4
        and             v6.16b,  v4.16b,  v29.16b
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            32b
        ret
640:
        AARCH64_VALID_JUMP_TARGET
        add             x2,  x0,  #64             // second half of the same row
64:
        // 64 indices per load = 1 row of w=64; x0/x2 cover the two 64-byte halves.
        ld1             {v4.16b, v5.16b}, [x3], #32
        subs            w5,  w5,  #1
        ushr            v7.16b,  v4.16b,  #4
        and             v6.16b,  v4.16b,  v29.16b
        ushr            v3.16b,  v5.16b,  #4
        and             v2.16b,  v5.16b,  v29.16b
        zip1            v4.16b,  v6.16b,  v7.16b
        zip2            v5.16b,  v6.16b,  v7.16b
        zip1            v6.16b,  v2.16b,  v3.16b
        zip2            v7.16b,  v2.16b,  v3.16b
        add             v4.16b,  v4.16b,  v4.16b
        add             v5.16b,  v5.16b,  v5.16b
        add             v6.16b,  v6.16b,  v6.16b
        add             v7.16b,  v7.16b,  v7.16b
        zip1            v0.16b,  v4.16b,  v4.16b
        zip2            v1.16b,  v4.16b,  v4.16b
        zip1            v2.16b,  v5.16b,  v5.16b
        zip2            v3.16b,  v5.16b,  v5.16b
        zip1            v4.16b,  v6.16b,  v6.16b
        zip2            v5.16b,  v6.16b,  v6.16b
        zip1            v6.16b,  v7.16b,  v7.16b
        zip2            v7.16b,  v7.16b,  v7.16b
        add             v0.8h,   v0.8h,   v31.8h
        add             v1.8h,   v1.8h,   v31.8h
        add             v2.8h,   v2.8h,   v31.8h
        add             v3.8h,   v3.8h,   v31.8h
        add             v4.8h,   v4.8h,   v31.8h
        tbl             v0.16b, {v30.16b}, v0.16b
        add             v5.8h,   v5.8h,   v31.8h
        tbl             v1.16b, {v30.16b}, v1.16b
        add             v6.8h,   v6.8h,   v31.8h
        tbl             v2.16b, {v30.16b}, v2.16b
        add             v7.8h,   v7.8h,   v31.8h
        tbl             v3.16b, {v30.16b}, v3.16b
        tbl             v4.16b, {v30.16b}, v4.16b
        tbl             v5.16b, {v30.16b}, v5.16b
        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], x1
        tbl             v6.16b, {v30.16b}, v6.16b
        tbl             v7.16b, {v30.16b}, v7.16b
        st1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x2], x1
        b.gt            64b
        ret
endfunc
   4423 
// Width dispatch table for pal_pred; indexed by clz(width) - 25,
// so the widest case comes first.
jumptable pal_pred_tbl
        .word 640b - pal_pred_tbl   // w = 64
        .word 320b - pal_pred_tbl   // w = 32
        .word 160b - pal_pred_tbl   // w = 16
        .word 80b  - pal_pred_tbl   // w = 8
        .word 40b  - pal_pred_tbl   // w = 4
endjumptable
   4431 
   4432 // void ipred_cfl_128_16bpc_neon(pixel *dst, const ptrdiff_t stride,
   4433 //                               const pixel *const topleft,
   4434 //                               const int width, const int height,
   4435 //                               const int16_t *ac, const int alpha,
   4436 //                               const int bitdepth_max);
// CfL "128" variant: the DC prediction is a constant mid-grey value,
// (bitdepth_max + 1) / 2, with no reference to the top/left neighbours.
// The shared L(ipred_cfl_splat_w*) tails then add the alpha-scaled ac
// contribution and clamp each pixel to [0, bitdepth_max].
function ipred_cfl_128_16bpc_neon, export=1
        dup             v31.8h,  w7   // bitdepth_max (upper clamp bound)
        clz             w9,  w3
        movrel          x7,  ipred_cfl_128_tbl
        sub             w9,  w9,  #26             // clz(32) == 26 -> table entry 0
        ldrsw           x9,  [x7, w9, uxtw #2]
        urshr           v0.8h,   v31.8h,  #1      // dc = (bitdepth_max + 1) >> 1
        dup             v1.8h,   w6   // alpha
        add             x7,  x7,  x9
        add             x6,  x0,  x1              // pointer for the odd rows
        lsl             x1,  x1,  #1              // advance two rows at a time
        movi            v30.8h,  #0               // lower clamp bound
        br              x7
// Shared splat tails (reused by the other cfl entry points):
//   dst = clamp(dc + apply_sign((ac * alpha + 32) >> 6), 0, bitdepth_max)
// with the dc value splat in v0, alpha in v1, clamp bounds in v30/v31.
L(ipred_cfl_splat_w4):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v4.8h, v5.8h}, [x5], #32 // 16 ac coefficients = 4 rows of w=4
        subs            w4,  w4,  #4
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v16.4s,  v2.4s,   #0     // sign
        cmlt            v17.4s,  v3.4s,   #0
        cmlt            v18.4s,  v4.4s,   #0
        cmlt            v19.4s,  v5.4s,   #0
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.d}[0],  [x0], x1
        st1             {v2.d}[1],  [x6], x1
        st1             {v3.d}[0],  [x0], x1
        st1             {v3.d}[1],  [x6], x1
        b.gt            1b
        ret
L(ipred_cfl_splat_w8):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v4.8h, v5.8h}, [x5], #32 // 16 ac coefficients = 2 rows of w=8
        subs            w4,  w4,  #2
        smull           v2.4s,   v4.4h,   v1.4h  // diff = ac * alpha
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v16.4s,  v2.4s,   #0     // sign
        cmlt            v17.4s,  v3.4s,   #0
        cmlt            v18.4s,  v4.4s,   #0
        cmlt            v19.4s,  v5.4s,   #0
        add             v2.4s,   v2.4s,   v16.4s // diff + sign
        add             v3.4s,   v3.4s,   v17.4s
        add             v4.4s,   v4.4s,   v18.4s
        add             v5.4s,   v5.4s,   v19.4s
        rshrn           v2.4h,   v2.4s,   #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v2.8h,   v3.4s,   #6
        rshrn           v3.4h,   v4.4s,   #6
        rshrn2          v3.8h,   v5.4s,   #6
        add             v2.8h,   v2.8h,   v0.8h  // dc + apply_sign()
        add             v3.8h,   v3.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        st1             {v2.8h},  [x0], x1
        st1             {v3.8h},  [x6], x1
        b.gt            1b
        ret
L(ipred_cfl_splat_w16):
        AARCH64_VALID_JUMP_TARGET
        // Handles w >= 16: inner loop walks 16 columns at a time across two
        // rows (x5 = ac row 0, x7 = ac row 1), outer loop steps row pairs.
        add             x7,  x5,  w3, uxtw #1     // ac pointer for the second row
        sub             x1,  x1,  w3, uxtw #1     // stride minus bytes stored per row
        mov             w9,  w3                   // remember width for the outer loop
1:
        ld1             {v2.8h, v3.8h}, [x5], #32
        ld1             {v4.8h, v5.8h}, [x7], #32
        subs            w3,  w3,  #16
        smull           v16.4s,  v2.4h,   v1.4h  // diff = ac * alpha
        smull2          v17.4s,  v2.8h,   v1.8h
        smull           v18.4s,  v3.4h,   v1.4h
        smull2          v19.4s,  v3.8h,   v1.8h
        smull           v2.4s,   v4.4h,   v1.4h
        smull2          v3.4s,   v4.8h,   v1.8h
        smull           v4.4s,   v5.4h,   v1.4h
        smull2          v5.4s,   v5.8h,   v1.8h
        cmlt            v20.4s,  v16.4s,  #0     // sign
        cmlt            v21.4s,  v17.4s,  #0
        cmlt            v22.4s,  v18.4s,  #0
        cmlt            v23.4s,  v19.4s,  #0
        cmlt            v24.4s,  v2.4s,   #0
        cmlt            v25.4s,  v3.4s,   #0
        cmlt            v26.4s,  v4.4s,   #0
        cmlt            v27.4s,  v5.4s,   #0
        add             v16.4s,  v16.4s,  v20.4s // diff + sign
        add             v17.4s,  v17.4s,  v21.4s
        add             v18.4s,  v18.4s,  v22.4s
        add             v19.4s,  v19.4s,  v23.4s
        add             v2.4s,   v2.4s,   v24.4s
        add             v3.4s,   v3.4s,   v25.4s
        add             v4.4s,   v4.4s,   v26.4s
        add             v5.4s,   v5.4s,   v27.4s
        rshrn           v16.4h,  v16.4s,  #6     // (diff + sign + 32) >> 6 = apply_sign()
        rshrn2          v16.8h,  v17.4s,  #6
        rshrn           v17.4h,  v18.4s,  #6
        rshrn2          v17.8h,  v19.4s,  #6
        rshrn           v6.4h,   v2.4s,   #6
        rshrn2          v6.8h,   v3.4s,   #6
        rshrn           v7.4h,   v4.4s,   #6
        rshrn2          v7.8h,   v5.4s,   #6
        add             v2.8h,   v16.8h,  v0.8h  // dc + apply_sign()
        add             v3.8h,   v17.8h,  v0.8h
        add             v4.8h,   v6.8h,   v0.8h
        add             v5.8h,   v7.8h,   v0.8h
        smax            v2.8h,   v2.8h,   v30.8h // clamp to [0, bitdepth_max]
        smax            v3.8h,   v3.8h,   v30.8h
        smax            v4.8h,   v4.8h,   v30.8h
        smax            v5.8h,   v5.8h,   v30.8h
        smin            v2.8h,   v2.8h,   v31.8h
        smin            v3.8h,   v3.8h,   v31.8h
        smin            v4.8h,   v4.8h,   v31.8h
        smin            v5.8h,   v5.8h,   v31.8h
        st1             {v2.8h, v3.8h},  [x0], #32
        st1             {v4.8h, v5.8h},  [x6], #32
        b.gt            1b                        // inner loop: remaining columns
        subs            w4,  w4,  #2
        add             x5,  x5,  w9, uxtw #1     // skip the ac row already read via x7
        add             x7,  x7,  w9, uxtw #1
        add             x0,  x0,  x1
        add             x6,  x6,  x1
        mov             w3,  w9                   // reset column counter
        b.gt            1b                        // outer loop: next row pair
        ret
endfunc
   4579 
// Width dispatch table, indexed by clz(width) - 26 (w = 32 first).
// The w=32 and w=16 cases share the column-looping splat_w16 tail.
jumptable ipred_cfl_128_tbl
ipred_cfl_splat_tbl:
        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl  // w = 32
        .word L(ipred_cfl_splat_w16) - ipred_cfl_128_tbl  // w = 16
        .word L(ipred_cfl_splat_w8) - ipred_cfl_128_tbl   // w = 8
        .word L(ipred_cfl_splat_w4) - ipred_cfl_128_tbl   // w = 4
endjumptable
   4587 
   4588 // void ipred_cfl_top_16bpc_neon(pixel *dst, const ptrdiff_t stride,
   4589 //                               const pixel *const topleft,
   4590 //                               const int width, const int height,
   4591 //                               const int16_t *ac, const int alpha,
   4592 //                               const int bitdepth_max);
   4593 function ipred_cfl_top_16bpc_neon, export=1
// dc = average of the `width` pixels above the block, then tail-jump into
// the shared splat loops (L(ipred_cfl_splat_w*), defined earlier in this
// file) which write dst = clamp(dc + alpha*ac, 0, bitdepth_max) two rows
// at a time.
// In: x0=dst, x1=stride, x2=topleft, w3=width, w4=height, x5=ac,
//     w6=alpha, w7=bitdepth_max.
   4594        dup             v31.8h,  w7   // bitdepth_max (upper clamp bound)
   4595        clz             w9,  w3
   4596        movrel          x7,  ipred_cfl_top_tbl
   4597        sub             w9,  w9,  #26 // table index 0..3 for w=32..4
   4598        ldrsw           x9,  [x7, w9, uxtw #2]
   4599        dup             v1.8h,   w6   // alpha
   4600        add             x2,  x2,  #2  // step past topleft to the top row
   4601        add             x7,  x7,  x9
   4602        add             x6,  x0,  x1  // x6 = dst pointer for the second row
   4603        lsl             x1,  x1,  #1  // two rows are written per iteration
   4604        movi            v30.8h,  #0   // lower clamp bound (zero)
   4605        br              x7
   4606 4:
   4607        AARCH64_VALID_JUMP_TARGET
// w=4: dc = (sum of 4 top pixels + 2) >> 2
   4608        ld1             {v0.4h},  [x2]
   4609        addv            h0,      v0.4h
   4610        urshr           v0.4h,   v0.4h,   #2
   4611        dup             v0.8h,   v0.h[0]
   4612        b               L(ipred_cfl_splat_w4)
   4613 8:
   4614        AARCH64_VALID_JUMP_TARGET
// w=8: dc = (sum of 8 top pixels + 4) >> 3
   4615        ld1             {v0.8h},  [x2]
   4616        addv            h0,      v0.8h
   4617        urshr           v0.4h,   v0.4h,   #3
   4618        dup             v0.8h,   v0.h[0]
   4619        b               L(ipred_cfl_splat_w8)
   4620 16:
   4621        AARCH64_VALID_JUMP_TARGET
// w=16: dc = (sum of 16 top pixels + 8) >> 4; 16 * 4095 (12bpc max)
// still fits in 16 bits, so a 16-bit addv is safe here.
   4622        ld1             {v2.8h, v3.8h}, [x2]
   4623        addp            v0.8h,   v2.8h,   v3.8h
   4624        addv            h0,      v0.8h
   4625        urshr           v0.4h,   v0.4h,   #4
   4626        dup             v0.8h,   v0.h[0]
   4627        b               L(ipred_cfl_splat_w16)
   4628 32:
   4629        AARCH64_VALID_JUMP_TARGET
// w=32: 32 * 4095 can exceed 16 bits, so widen to 32 bits (uaddlv)
// before the rounding shift, then narrow back.
   4630        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
   4631        addp            v2.8h,   v2.8h,   v3.8h
   4632        addp            v4.8h,   v4.8h,   v5.8h
   4633        addp            v0.8h,   v2.8h,   v4.8h
   4634        uaddlv          s0,      v0.8h
   4635        rshrn           v0.4h,   v0.4s,   #5
   4636        dup             v0.8h,   v0.h[0]
   4637        b               L(ipred_cfl_splat_w16)
   4638 endfunc
   4639 
// Indexed by clz(width)-26: entries ordered w=32,16,8,4, matching the
// numeric labels in ipred_cfl_top_16bpc_neon above.
   4640 jumptable ipred_cfl_top_tbl
   4641        .word 32b - ipred_cfl_top_tbl
   4642        .word 16b - ipred_cfl_top_tbl
   4643        .word 8b  - ipred_cfl_top_tbl
   4644        .word 4b  - ipred_cfl_top_tbl
   4645 endjumptable
   4646 
   4647 // void ipred_cfl_left_16bpc_neon(pixel *dst, const ptrdiff_t stride,
   4648 //                                const pixel *const topleft,
   4649 //                                const int width, const int height,
   4650 //                                const int16_t *ac, const int alpha,
   4651 //                                const int bitdepth_max);
   4652 function ipred_cfl_left_16bpc_neon, export=1
// dc = average of the `height` pixels left of the block. Dispatches twice:
// on height (x7, via ipred_cfl_left_tbl) to compute the average, then on
// width (x9, via ipred_cfl_splat_tbl) into the shared splat loops.
// Args as in ipred_cfl_top_16bpc_neon.
   4653        dup             v31.8h,  w7   // bitdepth_max
   4654        sub             x2,  x2,  w4, uxtw #1    // rewind to start of left column
   4655        clz             w9,  w3
   4656        clz             w8,  w4
   4657        movrel          x10, ipred_cfl_splat_tbl
   4658        movrel          x7,  ipred_cfl_left_tbl
   4659        sub             w9,  w9,  #26
   4660        sub             w8,  w8,  #26
   4661        ldrsw           x9,  [x10, w9, uxtw #2]
   4662        ldrsw           x8,  [x7,  w8, uxtw #2]
   4663        dup             v1.8h,   w6   // alpha
   4664        add             x9,  x10, x9  // x9 = splat entry for this width
   4665        add             x7,  x7,  x8  // x7 = averaging entry for this height
   4666        add             x6,  x0,  x1  // x6 = dst pointer for the second row
   4667        lsl             x1,  x1,  #1  // two rows are written per iteration
   4668        movi            v30.8h,  #0   // lower clamp bound (zero)
   4669        br              x7
   4670 
   4671 L(ipred_cfl_left_h4):
   4672        AARCH64_VALID_JUMP_TARGET
// h=4: dc = (sum of 4 left pixels + 2) >> 2
   4673        ld1             {v0.4h},  [x2]
   4674        addv            h0,      v0.4h
   4675        urshr           v0.4h,   v0.4h,   #2
   4676        dup             v0.8h,   v0.h[0]
   4677        br              x9
   4678 
   4679 L(ipred_cfl_left_h8):
   4680        AARCH64_VALID_JUMP_TARGET
// h=8: dc = (sum of 8 left pixels + 4) >> 3
   4681        ld1             {v0.8h},  [x2]
   4682        addv            h0,      v0.8h
   4683        urshr           v0.4h,   v0.4h,   #3
   4684        dup             v0.8h,   v0.h[0]
   4685        br              x9
   4686 
   4687 L(ipred_cfl_left_h16):
   4688        AARCH64_VALID_JUMP_TARGET
// h=16: dc = (sum of 16 left pixels + 8) >> 4 (sum still fits 16 bits)
   4689        ld1             {v2.8h, v3.8h}, [x2]
   4690        addp            v0.8h,   v2.8h,   v3.8h
   4691        addv            h0,      v0.8h
   4692        urshr           v0.4h,   v0.4h,   #4
   4693        dup             v0.8h,   v0.h[0]
   4694        br              x9
   4695 
   4696 L(ipred_cfl_left_h32):
   4697        AARCH64_VALID_JUMP_TARGET
// h=32: widen with uaddlv so 32 * 4095 (12bpc max) cannot wrap 16 bits
   4698        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
   4699        addp            v2.8h,   v2.8h,   v3.8h
   4700        addp            v4.8h,   v4.8h,   v5.8h
   4701        addp            v0.8h,   v2.8h,   v4.8h
   4702        uaddlv          s0,      v0.8h
   4703        rshrn           v0.4h,   v0.4s,   #5
   4704        dup             v0.8h,   v0.h[0]
   4705        br              x9
   4706 endfunc
   4707 
   4708 jumptable ipred_cfl_left_tbl
   4709        .word L(ipred_cfl_left_h32) - ipred_cfl_left_tbl
   4710        .word L(ipred_cfl_left_h16) - ipred_cfl_left_tbl
   4711        .word L(ipred_cfl_left_h8)  - ipred_cfl_left_tbl
   4712        .word L(ipred_cfl_left_h4)  - ipred_cfl_left_tbl
   4713 endjumptable
   4714 
   4715 // void ipred_cfl_16bpc_neon(pixel *dst, const ptrdiff_t stride,
   4716 //                           const pixel *const topleft,
   4717 //                           const int width, const int height,
   4718 //                           const int16_t *ac, const int alpha,
   4719 //                           const int bitdepth_max);
   4720 function ipred_cfl_16bpc_neon, export=1
// dc = (sum of width top pixels + height left pixels) / (width+height).
// Flow: L(ipred_cfl_h*) sums the left column (x2 was rewound to it),
// then branches via x9 into L(ipred_cfl_w*) which adds the top-row sum
// and divides: the power-of-two part of w+h is a shift (v17 holds
// -ctz(w+h)); any remaining factor of 5 or 3 is a fixed-point multiply
// (0x6667 ~= 2^17/5, 0xAAAB ~= 2^17/3, then >> 17). Finally jump to the
// shared L(ipred_cfl_splat_w*) loops.
// In: x0=dst, x1=stride, x2=topleft, w3=width, w4=height, x5=ac,
//     w6=alpha, w7=bitdepth_max.
   4721        dup             v31.8h,  w7              // bitdepth_max
   4722        sub             x2,  x2,  w4, uxtw #1    // rewind to start of left column
   4723        add             w8,  w3,  w4             // width + height
   4724        dup             v1.8h,   w6              // alpha
   4725        clz             w9,  w3
   4726        clz             w6,  w4
   4727        dup             v16.4s, w8               // width + height
   4728        movrel          x7,  ipred_cfl_tbl
   4729        rbit            w8,  w8                  // rbit(width + height)
   4730        sub             w9,  w9,  #22            // 26 leading bits, minus table offset 4
   4731        sub             w6,  w6,  #26
   4732        clz             w8,  w8                  // ctz(width + height)
   4733        ldrsw           x9,  [x7, w9, uxtw #2]
   4734        ldrsw           x6,  [x7, w6, uxtw #2]
   4735        neg             w8,  w8                  // -ctz(width + height)
   4736        add             x9,  x7,  x9             // x9 = width handler (L(ipred_cfl_w*))
   4737        add             x7,  x7,  x6             // x7 = height handler (L(ipred_cfl_h*))
   4738        ushr            v16.4s,  v16.4s,  #1     // (width + height) >> 1
   4739        dup             v17.4s,  w8              // -ctz(width + height)
   4740        add             x6,  x0,  x1             // x6 = dst pointer for second row
   4741        lsl             x1,  x1,  #1             // two rows per splat iteration
   4742        movi            v30.8h,  #0              // lower clamp bound (zero)
   4743        br              x7
   4744 
   4745 L(ipred_cfl_h4):
   4746        AARCH64_VALID_JUMP_TARGET
// s0 = sum of the 4 left pixels; advance x2 past topleft to the top row
   4747        ld1             {v0.4h},  [x2], #8
   4748        uaddlv          s0,      v0.4h
   4749        add             x2,  x2,  #2
   4750        br              x9
   4751 L(ipred_cfl_w4):
   4752        AARCH64_VALID_JUMP_TARGET
   4753        ld1             {v2.4h},  [x2]
   4754        add             v0.2s,   v0.2s,   v16.2s // + (w+h)/2 rounding bias
   4755        uaddlv          s2,      v2.4h           // sum of the 4 top pixels
   4756        cmp             w4,  #4
   4757        add             v0.2s,   v0.2s,   v2.2s
   4758        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w+h)
   4759        b.eq            1f
   4760        // h = 8/16
// remaining divisor: w+h=12 -> /3, w+h=20 -> /5 (fixed-point multiply)
   4761        cmp             w4,  #16
   4762        mov             w16, #0x6667
   4763        mov             w17, #0xAAAB
   4764        csel            w16, w16, w17, eq
   4765        dup             v16.2s,  w16
   4766        mul             v0.2s,   v0.2s,   v16.2s
   4767        ushr            v0.2s,   v0.2s,   #17
   4768 1:
   4769        dup             v0.8h,   v0.h[0]
   4770        b               L(ipred_cfl_splat_w4)
   4771 
   4772 L(ipred_cfl_h8):
   4773        AARCH64_VALID_JUMP_TARGET
// s0 = sum of the 8 left pixels; advance x2 past topleft to the top row
   4774        ld1             {v0.8h},  [x2], #16
   4775        uaddlv          s0,      v0.8h
   4776        add             x2,  x2,  #2
   4777        br              x9
   4778 L(ipred_cfl_w8):
   4779        AARCH64_VALID_JUMP_TARGET
   4780        ld1             {v2.8h},  [x2]
   4781        add             v0.2s,   v0.2s,   v16.2s // + (w+h)/2 rounding bias
   4782        uaddlv          s2,      v2.8h           // sum of the 8 top pixels
   4783        cmp             w4,  #8
   4784        add             v0.2s,   v0.2s,   v2.2s
   4785        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w+h)
   4786        b.eq            1f
   4787        // h = 4/16/32
// remaining divisor: h=32 -> /5, h=4/16 -> /3
   4788        cmp             w4,  #32
   4789        mov             w16, #0x6667
   4790        mov             w17, #0xAAAB
   4791        csel            w16, w16, w17, eq
   4792        dup             v16.2s,  w16
   4793        mul             v0.2s,   v0.2s,   v16.2s
   4794        ushr            v0.2s,   v0.2s,   #17
   4795 1:
   4796        dup             v0.8h,   v0.h[0]
   4797        b               L(ipred_cfl_splat_w8)
   4798 
   4799 L(ipred_cfl_h16):
   4800        AARCH64_VALID_JUMP_TARGET
// s0 = sum of the 16 left pixels; advance x2 past topleft to the top row
   4801        ld1             {v2.8h, v3.8h}, [x2], #32
   4802        addp            v0.8h,   v2.8h,   v3.8h
   4803        add             x2,  x2,  #2
   4804        uaddlv          s0,      v0.8h
   4805        br              x9
   4806 L(ipred_cfl_w16):
   4807        AARCH64_VALID_JUMP_TARGET
   4808        ld1             {v2.8h, v3.8h}, [x2]
   4809        add             v0.2s,   v0.2s,   v16.2s // + (w+h)/2 rounding bias
   4810        addp            v2.8h,   v2.8h,   v3.8h
   4811        uaddlv          s2,      v2.8h           // sum of the 16 top pixels
   4812        cmp             w4,  #16
   4813        add             v0.2s,   v0.2s,   v2.2s
   4814        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w+h)
   4815        b.eq            1f
   4816        // h = 4/8/32
// eq (no bit of 8/16/32 set) only for h=4: w+h=20 -> /5; h=8/32 -> /3
   4817        tst             w4,  #(32+16+8) // 16 added to make a consecutive bitmask
   4818        mov             w16, #0x6667
   4819        mov             w17, #0xAAAB
   4820        csel            w16, w16, w17, eq
   4821        dup             v16.2s,  w16
   4822        mul             v0.2s,   v0.2s,   v16.2s
   4823        ushr            v0.2s,   v0.2s,   #17
   4824 1:
   4825        dup             v0.8h,   v0.h[0]
   4826        b               L(ipred_cfl_splat_w16)
   4827 
   4828 L(ipred_cfl_h32):
   4829        AARCH64_VALID_JUMP_TARGET
// s0 = sum of the 32 left pixels; advance x2 past topleft to the top row
   4830        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2], #64
   4831        addp            v2.8h,   v2.8h,   v3.8h
   4832        addp            v4.8h,   v4.8h,   v5.8h
   4833        addp            v0.8h,   v2.8h,   v4.8h
   4834        add             x2,  x2,  #2
   4835        uaddlv          s0,      v0.8h
   4836        br              x9
   4837 L(ipred_cfl_w32):
   4838        AARCH64_VALID_JUMP_TARGET
   4839        ld1             {v2.8h, v3.8h, v4.8h, v5.8h}, [x2]
   4840        add             v0.4s,   v0.4s,   v16.4s // + (w+h)/2 rounding bias
   4841        addp            v2.8h,   v2.8h,   v3.8h
   4842        addp            v4.8h,   v4.8h,   v5.8h
   4843        addp            v2.8h,   v2.8h,   v4.8h
   4844        cmp             w4,  #32
   4845        uaddlv          s2,      v2.8h           // sum of the 32 top pixels
   4846        add             v0.2s,   v0.2s,   v2.2s
   4847        ushl            v0.2s,   v0.2s,   v17.2s // >> ctz(w+h)
   4848        b.eq            1f
   4849        // h = 8/16
// remaining divisor: h=8 -> w+h=40 -> /5; h=16 -> w+h=48 -> /3
   4850        cmp             w4,  #8
   4851        mov             w16, #0x6667
   4852        mov             w17, #0xAAAB
   4853        csel            w16, w16, w17, eq
   4854        dup             v16.2s,  w16
   4855        mul             v0.2s,   v0.2s,   v16.2s
   4856        ushr            v0.2s,   v0.2s,   #17
   4857 1:
   4858        dup             v0.8h,   v0.h[0]
   4859        b               L(ipred_cfl_splat_w16)
   4860 endfunc
   4861 
   4862 jumptable ipred_cfl_tbl
   4863        .word L(ipred_cfl_h32) - ipred_cfl_tbl
   4864        .word L(ipred_cfl_h16) - ipred_cfl_tbl
   4865        .word L(ipred_cfl_h8)  - ipred_cfl_tbl
   4866        .word L(ipred_cfl_h4)  - ipred_cfl_tbl
   4867        .word L(ipred_cfl_w32) - ipred_cfl_tbl
   4868        .word L(ipred_cfl_w16) - ipred_cfl_tbl
   4869        .word L(ipred_cfl_w8)  - ipred_cfl_tbl
   4870        .word L(ipred_cfl_w4)  - ipred_cfl_tbl
   4871 endjumptable
   4872 
   4873 // void cfl_ac_420_16bpc_neon(int16_t *const ac, const pixel *const ypx,
   4874 //                            const ptrdiff_t stride, const int w_pad,
   4875 //                            const int h_pad, const int cw, const int ch);
   4876 function ipred_cfl_ac_420_16bpc_neon, export=1
// Build the CfL AC buffer from 4:2:0 luma: each output is a 2x2 luma box
// sum << 1 (the 2x2 average << 3), stored as int16. v24-v27 accumulate
// the running sum of everything written; right padding (w_pad) is done
// by replicating the last real output column group, bottom padding
// (h_pad) by repeating the last output row. At the end the rounded
// average dc = (sum + (1 << (log2sz-1))) >> log2sz is subtracted from
// every entry.
// In: x0=ac out, x1=ypx, x2=stride (bytes), w3=w_pad, w4=h_pad,
//     w5=cw (output width), w6=ch (output height).
   4877        clz             w8,  w5
   4878        lsl             w4,  w4,  #2         // h_pad *= 4: padding row count
   4879        movrel          x7,  ipred_cfl_ac_420_tbl
   4880        sub             w8,  w8,  #27        // index 0..2 for cw=16/8/4
   4881        ldrsw           x8,  [x7, w8, uxtw #2]
   4882        movi            v24.4s,  #0          // v24-v27: running sum of outputs
   4883        movi            v25.4s,  #0
   4884        movi            v26.4s,  #0
   4885        movi            v27.4s,  #0
   4886        add             x7,  x7,  x8
   4887        sub             w8,  w6,  w4         // height - h_pad
   4888        rbit            w9,  w5              // rbit(width)
   4889        rbit            w10, w6              // rbit(height)
   4890        clz             w9,  w9              // ctz(width)
   4891        clz             w10, w10             // ctz(height)
   4892        add             w9,  w9,  w10        // log2sz
   4893        add             x10, x1,  x2         // x10 = second luma row pointer
   4894        dup             v31.4s,  w9
   4895        lsl             x2,  x2,  #1         // advance two luma rows per step
   4896        neg             v31.4s,  v31.4s      // -log2sz
   4897        br              x7
   4898 
   4899 L(ipred_cfl_ac_420_w4):
   4900        AARCH64_VALID_JUMP_TARGET
   4901 1:      // Copy and subsample input
   4902        ld1             {v0.8h}, [x1],  x2
   4903        ld1             {v1.8h}, [x10], x2
   4904        ld1             {v2.8h}, [x1],  x2
   4905        ld1             {v3.8h}, [x10], x2
   4906        addp            v0.8h,   v0.8h,   v2.8h // horizontal pairwise sums
   4907        addp            v1.8h,   v1.8h,   v3.8h
   4908        add             v0.8h,   v0.8h,   v1.8h // + vertical neighbour = 2x2 sum
   4909        shl             v0.8h,   v0.8h,   #1    // scale: 2x2 sum << 1
   4910        subs            w8,  w8,  #2
   4911        st1             {v0.8h}, [x0], #16
   4912        uaddw           v24.4s,  v24.4s,  v0.4h
   4913        uaddw2          v25.4s,  v25.4s,  v0.8h
// Replicate the last output row (high half of v0) into v0/v1 for hpad
   4914        b.gt            1b
   4915        trn2            v1.2d,   v0.2d,   v0.2d
   4916        trn2            v0.2d,   v0.2d,   v0.2d
   4917 L(ipred_cfl_ac_420_w4_hpad):
   4918        cbz             w4,  3f
   4919 2:      // Vertical padding (h_pad > 0)
   4920        subs            w4,  w4,  #4
   4921        st1             {v0.8h, v1.8h}, [x0], #32
   4922        uaddw           v24.4s,  v24.4s,  v0.4h
   4923        uaddw2          v25.4s,  v25.4s,  v0.8h
   4924        uaddw           v26.4s,  v26.4s,  v1.4h
   4925        uaddw2          v27.4s,  v27.4s,  v1.8h
   4926        b.gt            2b
   4927 3:
// Shared tail, also used by the w8/w16 paths (and by 422/444 code) with
// w6 pre-scaled so that w6*8 entries equals the total buffer size.
   4928 L(ipred_cfl_ac_420_w4_calc_subtract_dc):
   4929        // Aggregate the sums
   4930        add             v24.4s,  v24.4s,  v25.4s
   4931        add             v26.4s,  v26.4s,  v27.4s
   4932        add             v0.4s,   v24.4s,  v26.4s
   4933        addv            s0,  v0.4s                // sum
   4934        sub             x0,  x0,  w6, uxtw #3     // rewind x0 to buffer start
   4935        urshl           v4.2s,   v0.2s,   v31.2s  // (sum + (1 << (log2sz - 1)))  >>= log2sz
   4936        dup             v4.8h,   v4.h[0]
   4937 6:      // Subtract dc from ac
   4938        ld1             {v0.8h, v1.8h}, [x0]
   4939        subs            w6,  w6,  #4
   4940        sub             v0.8h,   v0.8h,   v4.8h
   4941        sub             v1.8h,   v1.8h,   v4.8h
   4942        st1             {v0.8h, v1.8h}, [x0], #32
   4943        b.gt            6b
   4944        ret
   4945 
   4946 L(ipred_cfl_ac_420_w8):
   4947        AARCH64_VALID_JUMP_TARGET
   4948        cbnz            w3,  L(ipred_cfl_ac_420_w8_wpad)
   4949 1:      // Copy and subsample input, without padding
   4950        ld1             {v0.8h, v1.8h}, [x1],  x2
   4951        ld1             {v2.8h, v3.8h}, [x10], x2
   4952        ld1             {v4.8h, v5.8h}, [x1],  x2
   4953        addp            v0.8h,   v0.8h,   v1.8h
   4954        ld1             {v6.8h, v7.8h}, [x10], x2
   4955        addp            v2.8h,   v2.8h,   v3.8h
   4956        addp            v4.8h,   v4.8h,   v5.8h
   4957        addp            v6.8h,   v6.8h,   v7.8h
   4958        add             v0.8h,   v0.8h,   v2.8h // 2x2 sums, first row pair
   4959        add             v4.8h,   v4.8h,   v6.8h // 2x2 sums, second row pair
   4960        shl             v0.8h,   v0.8h,   #1
   4961        shl             v1.8h,   v4.8h,   #1
   4962        subs            w8,  w8,  #2
   4963        st1             {v0.8h, v1.8h}, [x0], #32
   4964        uaddw           v24.4s,  v24.4s,  v0.4h
   4965        uaddw2          v25.4s,  v25.4s,  v0.8h
   4966        uaddw           v26.4s,  v26.4s,  v1.4h
   4967        uaddw2          v27.4s,  v27.4s,  v1.8h
   4968        b.gt            1b
   4969        mov             v0.16b,  v1.16b // replicate last row for hpad
   4970        b               L(ipred_cfl_ac_420_w8_hpad)
   4971 
   4972 L(ipred_cfl_ac_420_w8_wpad):
   4973 1:      // Copy and subsample input, padding 4
   4974        ld1             {v0.8h}, [x1],  x2
   4975        ld1             {v1.8h}, [x10], x2
   4976        ld1             {v2.8h}, [x1],  x2
   4977        ld1             {v3.8h}, [x10], x2
   4978        addp            v0.8h,   v0.8h,   v2.8h
   4979        addp            v1.8h,   v1.8h,   v3.8h
   4980        add             v0.8h,   v0.8h,   v1.8h
   4981        shl             v0.8h,   v0.8h,   #1
// v1/v3 = right padding: repeat the last real column of each row
   4982        dup             v1.4h,   v0.h[3]
   4983        dup             v3.4h,   v0.h[7]
   4984        trn2            v2.2d,   v0.2d,   v0.2d
   4985        subs            w8,  w8,  #2
   4986        st1             {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32
   4987        uaddw           v24.4s,  v24.4s,  v0.4h
   4988        uaddw           v25.4s,  v25.4s,  v1.4h
   4989        uaddw           v26.4s,  v26.4s,  v2.4h
   4990        uaddw           v27.4s,  v27.4s,  v3.4h
   4991        b.gt            1b
   4992        trn1            v0.2d,   v2.2d,   v3.2d // last row (data+pad) for hpad
   4993        trn1            v1.2d,   v2.2d,   v3.2d
   4994 
   4995 L(ipred_cfl_ac_420_w8_hpad):
   4996        cbz             w4,  3f
   4997 2:      // Vertical padding (h_pad > 0)
   4998        subs            w4,  w4,  #4
   4999        st1             {v0.8h, v1.8h}, [x0], #32
   5000        uaddw           v24.4s,  v24.4s,  v0.4h
   5001        uaddw2          v25.4s,  v25.4s,  v0.8h
   5002        uaddw           v26.4s,  v26.4s,  v1.4h
   5003        uaddw2          v27.4s,  v27.4s,  v1.8h
   5004        st1             {v0.8h, v1.8h}, [x0], #32
   5005        uaddw           v24.4s,  v24.4s,  v0.4h
   5006        uaddw2          v25.4s,  v25.4s,  v0.8h
   5007        uaddw           v26.4s,  v26.4s,  v1.4h
   5008        uaddw2          v27.4s,  v27.4s,  v1.8h
   5009        b.gt            2b
   5010 3:
   5011 
   5012        // Double the height and reuse the w4 summing/subtracting
   5013        lsl             w6,  w6,  #1
   5014        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
   5015 
   5016 L(ipred_cfl_ac_420_w16):
   5017        AARCH64_VALID_JUMP_TARGET
// Second-level dispatch on w_pad (0-3 = pad 0/4/8/12 output columns)
   5018        movrel          x7,  ipred_cfl_ac_420_w16_tbl
   5019        ldrsw           x3,  [x7, w3, uxtw #2]
   5020        add             x7,  x7,  x3
   5021        br              x7
   5022 
   5023 L(ipred_cfl_ac_420_w16_wpad0):
   5024        AARCH64_VALID_JUMP_TARGET
   5025 1:      // Copy and subsample input, without padding
   5026        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
   5027        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
   5028        addp            v0.8h,   v0.8h,   v1.8h
   5029        addp            v2.8h,   v2.8h,   v3.8h
   5030        addp            v4.8h,   v4.8h,   v5.8h
   5031        addp            v6.8h,   v6.8h,   v7.8h
   5032        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x1],  x2
   5033        add             v0.8h,   v0.8h,   v4.8h
   5034        ld1             {v20.8h, v21.8h, v22.8h, v23.8h}, [x10], x2
   5035        add             v2.8h,   v2.8h,   v6.8h
   5036        addp            v16.8h,  v16.8h,  v17.8h
   5037        addp            v18.8h,  v18.8h,  v19.8h
   5038        addp            v20.8h,  v20.8h,  v21.8h
   5039        addp            v22.8h,  v22.8h,  v23.8h
   5040        add             v16.8h,  v16.8h,  v20.8h
   5041        add             v18.8h,  v18.8h,  v22.8h
   5042        shl             v0.8h,   v0.8h,   #1
   5043        shl             v1.8h,   v2.8h,   #1
   5044        shl             v2.8h,   v16.8h,  #1
   5045        shl             v3.8h,   v18.8h,  #1
   5046        subs            w8,  w8,  #2
   5047        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5048        uaddw           v24.4s,  v24.4s,  v0.4h
   5049        uaddw2          v25.4s,  v25.4s,  v0.8h
   5050        uaddw           v26.4s,  v26.4s,  v1.4h
   5051        uaddw2          v27.4s,  v27.4s,  v1.8h
   5052        uaddw           v24.4s,  v24.4s,  v2.4h
   5053        uaddw2          v25.4s,  v25.4s,  v2.8h
   5054        uaddw           v26.4s,  v26.4s,  v3.4h
   5055        uaddw2          v27.4s,  v27.4s,  v3.8h
   5056        b.gt            1b
   5057        mov             v0.16b,  v2.16b // replicate last row for hpad
   5058        mov             v1.16b,  v3.16b
   5059        b               L(ipred_cfl_ac_420_w16_hpad)
   5060 
   5061 L(ipred_cfl_ac_420_w16_wpad1):
   5062        AARCH64_VALID_JUMP_TARGET
   5063 1:      // Copy and subsample input, padding 4
   5064        ldr             q2,  [x1,  #32]
   5065        ld1             {v0.8h, v1.8h}, [x1],  x2
   5066        ldr             q5,  [x10, #32]
   5067        ld1             {v3.8h, v4.8h}, [x10], x2
   5068        addp            v2.8h,   v2.8h,   v2.8h
   5069        addp            v0.8h,   v0.8h,   v1.8h
   5070        addp            v5.8h,   v5.8h,   v5.8h
   5071        addp            v3.8h,   v3.8h,   v4.8h
   5072        ldr             q18, [x1,  #32]
   5073        add             v2.4h,   v2.4h,   v5.4h
   5074        ld1             {v16.8h, v17.8h}, [x1],  x2
   5075        add             v0.8h,   v0.8h,   v3.8h
   5076        ldr             q21, [x10, #32]
   5077        ld1             {v19.8h, v20.8h}, [x10], x2
   5078        addp            v18.8h,  v18.8h,  v18.8h
   5079        addp            v16.8h,  v16.8h,  v17.8h
   5080        addp            v21.8h,  v21.8h,  v21.8h
   5081        addp            v19.8h,  v19.8h,  v20.8h
   5082        add             v18.4h,  v18.4h,  v21.4h
   5083        add             v16.8h,  v16.8h,  v19.8h
   5084        shl             v1.4h,   v2.4h,   #1
   5085        shl             v0.8h,   v0.8h,   #1
   5086        shl             v3.4h,   v18.4h,  #1
   5087        shl             v2.8h,   v16.8h,  #1
// Pad the last 4 output columns with the last real column
   5088        dup             v4.4h,   v1.h[3]
   5089        dup             v5.4h,   v3.h[3]
   5090        trn1            v1.2d,   v1.2d,   v4.2d
   5091        trn1            v3.2d,   v3.2d,   v5.2d
   5092        subs            w8,  w8,  #2
   5093        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5094        uaddw           v24.4s,  v24.4s,  v0.4h
   5095        uaddw2          v25.4s,  v25.4s,  v0.8h
   5096        uaddw           v26.4s,  v26.4s,  v1.4h
   5097        uaddw2          v27.4s,  v27.4s,  v1.8h
   5098        uaddw           v24.4s,  v24.4s,  v2.4h
   5099        uaddw2          v25.4s,  v25.4s,  v2.8h
   5100        uaddw           v26.4s,  v26.4s,  v3.4h
   5101        uaddw2          v27.4s,  v27.4s,  v3.8h
   5102        b.gt            1b
   5103        mov             v0.16b,  v2.16b // replicate last row for hpad
   5104        mov             v1.16b,  v3.16b
   5105        b               L(ipred_cfl_ac_420_w16_hpad)
   5106 
   5107 L(ipred_cfl_ac_420_w16_wpad2):
   5108        AARCH64_VALID_JUMP_TARGET
   5109 1:      // Copy and subsample input, padding 8
   5110        ld1             {v0.8h, v1.8h}, [x1],  x2
   5111        ld1             {v2.8h, v3.8h}, [x10], x2
   5112        ld1             {v4.8h, v5.8h}, [x1],  x2
   5113        addp            v0.8h,   v0.8h,   v1.8h
   5114        ld1             {v6.8h, v7.8h}, [x10], x2
   5115        addp            v2.8h,   v2.8h,   v3.8h
   5116        addp            v4.8h,   v4.8h,   v5.8h
   5117        addp            v6.8h,   v6.8h,   v7.8h
   5118        add             v0.8h,   v0.8h,   v2.8h
   5119        add             v4.8h,   v4.8h,   v6.8h
   5120        shl             v0.8h,   v0.8h,   #1
   5121        shl             v2.8h,   v4.8h,   #1
// Pad the right 8 output columns with the last real column
   5122        dup             v1.8h,   v0.h[7]
   5123        dup             v3.8h,   v2.h[7]
   5124        subs            w8,  w8,  #2
   5125        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5126        uaddw           v24.4s,  v24.4s,  v0.4h
   5127        uaddw2          v25.4s,  v25.4s,  v0.8h
   5128        uaddw           v26.4s,  v26.4s,  v1.4h
   5129        uaddw2          v27.4s,  v27.4s,  v1.8h
   5130        uaddw           v24.4s,  v24.4s,  v2.4h
   5131        uaddw2          v25.4s,  v25.4s,  v2.8h
   5132        uaddw           v26.4s,  v26.4s,  v3.4h
   5133        uaddw2          v27.4s,  v27.4s,  v3.8h
   5134        b.gt            1b
   5135        mov             v0.16b,  v2.16b // replicate last row for hpad
   5136        mov             v1.16b,  v3.16b
   5137        b               L(ipred_cfl_ac_420_w16_hpad)
   5138 
   5139 L(ipred_cfl_ac_420_w16_wpad3):
   5140        AARCH64_VALID_JUMP_TARGET
   5141 1:      // Copy and subsample input, padding 12
   5142        ld1             {v0.8h}, [x1],  x2
   5143        ld1             {v2.8h}, [x10], x2
   5144        ld1             {v4.8h}, [x1],  x2
   5145        ld1             {v6.8h}, [x10], x2
   5146        addp            v0.8h,   v0.8h,   v4.8h
   5147        addp            v2.8h,   v2.8h,   v6.8h
   5148        add             v0.8h,   v0.8h,   v2.8h
   5149        shl             v0.8h,   v0.8h,   #1
// 4 real columns per row; pad the remaining 12 with the last column
   5150        dup             v1.8h,   v0.h[3]
   5151        dup             v3.8h,   v0.h[7]
   5152        trn2            v2.2d,   v0.2d,   v3.2d
   5153        trn1            v0.2d,   v0.2d,   v1.2d
   5154        subs            w8,  w8,  #2
   5155        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5156        uaddw           v24.4s,  v24.4s,  v0.4h
   5157        uaddw2          v25.4s,  v25.4s,  v0.8h
   5158        uaddw           v26.4s,  v26.4s,  v1.4h
   5159        uaddw2          v27.4s,  v27.4s,  v1.8h
   5160        uaddw           v24.4s,  v24.4s,  v2.4h
   5161        uaddw2          v25.4s,  v25.4s,  v2.8h
   5162        uaddw           v26.4s,  v26.4s,  v3.4h
   5163        uaddw2          v27.4s,  v27.4s,  v3.8h
   5164        b.gt            1b
   5165        mov             v0.16b,  v2.16b // replicate last row for hpad
   5166        mov             v1.16b,  v3.16b
   5167 
   5168 L(ipred_cfl_ac_420_w16_hpad):
   5169        cbz             w4,  3f
   5170 2:      // Vertical padding (h_pad > 0)
   5171        subs            w4,  w4,  #4
   5172        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5173        uaddw           v24.4s,  v24.4s,  v0.4h
   5174        uaddw2          v25.4s,  v25.4s,  v0.8h
   5175        uaddw           v26.4s,  v26.4s,  v1.4h
   5176        uaddw2          v27.4s,  v27.4s,  v1.8h
   5177        uaddw           v24.4s,  v24.4s,  v2.4h
   5178        uaddw2          v25.4s,  v25.4s,  v2.8h
   5179        uaddw           v26.4s,  v26.4s,  v3.4h
   5180        uaddw2          v27.4s,  v27.4s,  v3.8h
   5181        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5182        uaddw           v24.4s,  v24.4s,  v0.4h
   5183        uaddw2          v25.4s,  v25.4s,  v0.8h
   5184        uaddw           v26.4s,  v26.4s,  v1.4h
   5185        uaddw2          v27.4s,  v27.4s,  v1.8h
   5186        uaddw           v24.4s,  v24.4s,  v2.4h
   5187        uaddw2          v25.4s,  v25.4s,  v2.8h
   5188        uaddw           v26.4s,  v26.4s,  v3.4h
   5189        uaddw2          v27.4s,  v27.4s,  v3.8h
   5190        b.gt            2b
   5191 3:
   5192 
   5193        // Quadruple the height and reuse the w4 summing/subtracting
   5194        lsl             w6,  w6,  #2
   5195        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
   5196 endfunc
   5197 
   5198 jumptable ipred_cfl_ac_420_tbl
   5199        .word L(ipred_cfl_ac_420_w16) - ipred_cfl_ac_420_tbl
   5200        .word L(ipred_cfl_ac_420_w8)  - ipred_cfl_ac_420_tbl
   5201        .word L(ipred_cfl_ac_420_w4)  - ipred_cfl_ac_420_tbl
   5202 endjumptable
   5203 
   5204 jumptable ipred_cfl_ac_420_w16_tbl
   5205        .word L(ipred_cfl_ac_420_w16_wpad0) - ipred_cfl_ac_420_w16_tbl
   5206        .word L(ipred_cfl_ac_420_w16_wpad1) - ipred_cfl_ac_420_w16_tbl
   5207        .word L(ipred_cfl_ac_420_w16_wpad2) - ipred_cfl_ac_420_w16_tbl
   5208        .word L(ipred_cfl_ac_420_w16_wpad3) - ipred_cfl_ac_420_w16_tbl
   5209 endjumptable
   5210 
   5211 // void cfl_ac_422_16bpc_neon(int16_t *const ac, const pixel *const ypx,
   5212 //                            const ptrdiff_t stride, const int w_pad,
   5213 //                            const int h_pad, const int cw, const int ch);
   5214 function ipred_cfl_ac_422_16bpc_neon, export=1
   5215        clz             w8,  w5
   5216        lsl             w4,  w4,  #2
   5217        movrel          x7,  ipred_cfl_ac_422_tbl
   5218        sub             w8,  w8,  #27
   5219        ldrsw           x8,  [x7, w8, uxtw #2]
   5220        movi            v24.4s,  #0
   5221        movi            v25.4s,  #0
   5222        movi            v26.4s,  #0
   5223        movi            v27.4s,  #0
   5224        add             x7,  x7,  x8
   5225        sub             w8,  w6,  w4         // height - h_pad
   5226        rbit            w9,  w5              // rbit(width)
   5227        rbit            w10, w6              // rbit(height)
   5228        clz             w9,  w9              // ctz(width)
   5229        clz             w10, w10             // ctz(height)
   5230        add             w9,  w9,  w10        // log2sz
   5231        add             x10, x1,  x2
   5232        dup             v31.4s,  w9
   5233        lsl             x2,  x2,  #1
   5234        neg             v31.4s,  v31.4s      // -log2sz
   5235        br              x7
   5236 
   5237 L(ipred_cfl_ac_422_w4):
   5238        AARCH64_VALID_JUMP_TARGET
   5239 1:      // Copy and subsample input
   5240        ld1             {v0.8h}, [x1],  x2
   5241        ld1             {v1.8h}, [x10], x2
   5242        ld1             {v2.8h}, [x1],  x2
   5243        ld1             {v3.8h}, [x10], x2
   5244        addp            v0.8h,   v0.8h,   v1.8h
   5245        addp            v2.8h,   v2.8h,   v3.8h
   5246        shl             v0.8h,   v0.8h,   #2
   5247        shl             v1.8h,   v2.8h,   #2
   5248        subs            w8,  w8,  #4
   5249        st1             {v0.8h, v1.8h}, [x0], #32
   5250        uaddw           v24.4s,  v24.4s,  v0.4h
   5251        uaddw2          v25.4s,  v25.4s,  v0.8h
   5252        uaddw           v26.4s,  v26.4s,  v1.4h
   5253        uaddw2          v27.4s,  v27.4s,  v1.8h
   5254        b.gt            1b
   5255        trn2            v0.2d,   v1.2d,   v1.2d
   5256        trn2            v1.2d,   v1.2d,   v1.2d
   5257        b               L(ipred_cfl_ac_420_w4_hpad)
   5258 
   5259 L(ipred_cfl_ac_422_w8):
   5260        AARCH64_VALID_JUMP_TARGET
   5261        cbnz            w3,  L(ipred_cfl_ac_422_w8_wpad)
   5262 1:      // Copy and subsample input, without padding
   5263        ld1             {v0.8h, v1.8h}, [x1],  x2
   5264        ld1             {v2.8h, v3.8h}, [x10], x2
   5265        ld1             {v4.8h, v5.8h}, [x1],  x2
   5266        addp            v0.8h,   v0.8h,   v1.8h
   5267        ld1             {v6.8h, v7.8h}, [x10], x2
   5268        addp            v2.8h,   v2.8h,   v3.8h
   5269        addp            v4.8h,   v4.8h,   v5.8h
   5270        addp            v6.8h,   v6.8h,   v7.8h
   5271        shl             v0.8h,   v0.8h,   #2
   5272        shl             v1.8h,   v2.8h,   #2
   5273        shl             v2.8h,   v4.8h,   #2
   5274        shl             v3.8h,   v6.8h,   #2
   5275        subs            w8,  w8,  #4
   5276        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5277        uaddw           v24.4s,  v24.4s,  v0.4h
   5278        uaddw2          v25.4s,  v25.4s,  v0.8h
   5279        uaddw           v26.4s,  v26.4s,  v1.4h
   5280        uaddw2          v27.4s,  v27.4s,  v1.8h
   5281        uaddw           v24.4s,  v24.4s,  v2.4h
   5282        uaddw2          v25.4s,  v25.4s,  v2.8h
   5283        uaddw           v26.4s,  v26.4s,  v3.4h
   5284        uaddw2          v27.4s,  v27.4s,  v3.8h
   5285        b.gt            1b
   5286        mov             v0.16b,  v3.16b
   5287        mov             v1.16b,  v3.16b
   5288        b               L(ipred_cfl_ac_420_w8_hpad)
   5289 
   5290 L(ipred_cfl_ac_422_w8_wpad):
   5291 1:      // Copy and subsample input, padding 4
   5292        ld1             {v0.8h}, [x1],  x2
   5293        ld1             {v1.8h}, [x10], x2
   5294        ld1             {v2.8h}, [x1],  x2
   5295        ld1             {v3.8h}, [x10], x2
   5296        addp            v0.8h,   v0.8h,   v1.8h
   5297        addp            v2.8h,   v2.8h,   v3.8h
   5298        shl             v0.8h,   v0.8h,   #2
   5299        shl             v2.8h,   v2.8h,   #2
   5300        dup             v4.4h,   v0.h[3]
   5301        dup             v5.8h,   v0.h[7]
   5302        dup             v6.4h,   v2.h[3]
   5303        dup             v7.8h,   v2.h[7]
   5304        trn2            v1.2d,   v0.2d,   v5.2d
   5305        trn1            v0.2d,   v0.2d,   v4.2d
   5306        trn2            v3.2d,   v2.2d,   v7.2d
   5307        trn1            v2.2d,   v2.2d,   v6.2d
   5308        subs            w8,  w8,  #4
   5309        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5310        uaddw           v24.4s,  v24.4s,  v0.4h
   5311        uaddw2          v25.4s,  v25.4s,  v0.8h
   5312        uaddw           v26.4s,  v26.4s,  v1.4h
   5313        uaddw2          v27.4s,  v27.4s,  v1.8h
   5314        uaddw           v24.4s,  v24.4s,  v2.4h
   5315        uaddw2          v25.4s,  v25.4s,  v2.8h
   5316        uaddw           v26.4s,  v26.4s,  v3.4h
   5317        uaddw2          v27.4s,  v27.4s,  v3.8h
   5318        b.gt            1b
   5319        mov             v0.16b,  v3.16b
   5320        mov             v1.16b,  v3.16b
   5321        b               L(ipred_cfl_ac_420_w8_hpad)
   5322 
   5323 L(ipred_cfl_ac_422_w16):
   5324        AARCH64_VALID_JUMP_TARGET
   5325        movrel          x7,  ipred_cfl_ac_422_w16_tbl
   5326        ldrsw           x3,  [x7, w3, uxtw #2]
   5327        add             x7,  x7,  x3
   5328        br              x7
   5329 
   5330 L(ipred_cfl_ac_422_w16_wpad0):
   5331        AARCH64_VALID_JUMP_TARGET
   5332 1:      // Copy and subsample input, without padding
   5333        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
   5334        ld1             {v4.8h, v5.8h, v6.8h, v7.8h}, [x10], x2
   5335        addp            v0.8h,   v0.8h,   v1.8h
   5336        addp            v2.8h,   v2.8h,   v3.8h
   5337        addp            v4.8h,   v4.8h,   v5.8h
   5338        addp            v6.8h,   v6.8h,   v7.8h
   5339        shl             v0.8h,   v0.8h,   #2
   5340        shl             v1.8h,   v2.8h,   #2
   5341        shl             v2.8h,   v4.8h,   #2
   5342        shl             v3.8h,   v6.8h,   #2
   5343        subs            w8,  w8,  #2
   5344        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5345        uaddw           v24.4s,  v24.4s,  v0.4h
   5346        uaddw2          v25.4s,  v25.4s,  v0.8h
   5347        uaddw           v26.4s,  v26.4s,  v1.4h
   5348        uaddw2          v27.4s,  v27.4s,  v1.8h
   5349        uaddw           v24.4s,  v24.4s,  v2.4h
   5350        uaddw2          v25.4s,  v25.4s,  v2.8h
   5351        uaddw           v26.4s,  v26.4s,  v3.4h
   5352        uaddw2          v27.4s,  v27.4s,  v3.8h
   5353        b.gt            1b
   5354        mov             v0.16b,  v2.16b
   5355        mov             v1.16b,  v3.16b
   5356        b               L(ipred_cfl_ac_420_w16_hpad)
   5357 
   5358 L(ipred_cfl_ac_422_w16_wpad1):
   5359        AARCH64_VALID_JUMP_TARGET
   5360 1:      // Copy and subsample input, padding 4
   5361        ldr             q2,  [x1,  #32]
   5362        ld1             {v0.8h, v1.8h}, [x1],  x2
   5363        ldr             q6,  [x10, #32]
   5364        ld1             {v4.8h, v5.8h}, [x10], x2
   5365        addp            v2.8h,   v2.8h,   v2.8h
   5366        addp            v0.8h,   v0.8h,   v1.8h
   5367        addp            v6.8h,   v6.8h,   v6.8h
   5368        addp            v4.8h,   v4.8h,   v5.8h
   5369        shl             v1.4h,   v2.4h,   #2
   5370        shl             v0.8h,   v0.8h,   #2
   5371        shl             v3.4h,   v6.4h,   #2
   5372        shl             v2.8h,   v4.8h,   #2
   5373        dup             v4.4h,   v1.h[3]
   5374        dup             v5.4h,   v3.h[3]
   5375        trn1            v1.2d,   v1.2d,   v4.2d
   5376        trn1            v3.2d,   v3.2d,   v5.2d
   5377        subs            w8,  w8,  #2
   5378        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5379        uaddw           v24.4s,  v24.4s,  v0.4h
   5380        uaddw2          v25.4s,  v25.4s,  v0.8h
   5381        uaddw           v26.4s,  v26.4s,  v1.4h
   5382        uaddw2          v27.4s,  v27.4s,  v1.8h
   5383        uaddw           v24.4s,  v24.4s,  v2.4h
   5384        uaddw2          v25.4s,  v25.4s,  v2.8h
   5385        uaddw           v26.4s,  v26.4s,  v3.4h
   5386        uaddw2          v27.4s,  v27.4s,  v3.8h
   5387        b.gt            1b
   5388        mov             v0.16b,  v2.16b
   5389        mov             v1.16b,  v3.16b
   5390        b               L(ipred_cfl_ac_420_w16_hpad)
   5391 
   5392 L(ipred_cfl_ac_422_w16_wpad2):
   5393        AARCH64_VALID_JUMP_TARGET
   5394 1:      // Copy and subsample input, padding 8
   5395        ld1             {v0.8h, v1.8h}, [x1],  x2
   5396        ld1             {v2.8h, v3.8h}, [x10], x2
   5397        addp            v0.8h,   v0.8h,   v1.8h
   5398        addp            v2.8h,   v2.8h,   v3.8h
   5399        shl             v0.8h,   v0.8h,   #2
   5400        shl             v2.8h,   v2.8h,   #2
   5401        dup             v1.8h,   v0.h[7]
   5402        dup             v3.8h,   v2.h[7]
   5403        subs            w8,  w8,  #2
   5404        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5405        uaddw           v24.4s,  v24.4s,  v0.4h
   5406        uaddw2          v25.4s,  v25.4s,  v0.8h
   5407        uaddw           v26.4s,  v26.4s,  v1.4h
   5408        uaddw2          v27.4s,  v27.4s,  v1.8h
   5409        uaddw           v24.4s,  v24.4s,  v2.4h
   5410        uaddw2          v25.4s,  v25.4s,  v2.8h
   5411        uaddw           v26.4s,  v26.4s,  v3.4h
   5412        uaddw2          v27.4s,  v27.4s,  v3.8h
   5413        b.gt            1b
   5414        mov             v0.16b,  v2.16b
   5415        mov             v1.16b,  v3.16b
   5416        b               L(ipred_cfl_ac_420_w16_hpad)
   5417 
   5418 L(ipred_cfl_ac_422_w16_wpad3):
   5419        AARCH64_VALID_JUMP_TARGET
   5420 1:      // Copy and subsample input, padding 12
   5421        ld1             {v0.8h}, [x1],  x2
   5422        ld1             {v2.8h}, [x10], x2
   5423        addp            v0.8h,   v0.8h,   v0.8h
   5424        addp            v2.8h,   v2.8h,   v2.8h
   5425        shl             v0.4h,   v0.4h,   #2
   5426        shl             v2.4h,   v2.4h,   #2
   5427        dup             v1.8h,   v0.h[3]
   5428        dup             v3.8h,   v2.h[3]
   5429        trn1            v0.2d,   v0.2d,   v1.2d
   5430        trn1            v2.2d,   v2.2d,   v3.2d
   5431        subs            w8,  w8,  #2
   5432        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5433        uaddw           v24.4s,  v24.4s,  v0.4h
   5434        uaddw2          v25.4s,  v25.4s,  v0.8h
   5435        uaddw           v26.4s,  v26.4s,  v1.4h
   5436        uaddw2          v27.4s,  v27.4s,  v1.8h
   5437        uaddw           v24.4s,  v24.4s,  v2.4h
   5438        uaddw2          v25.4s,  v25.4s,  v2.8h
   5439        uaddw           v26.4s,  v26.4s,  v3.4h
   5440        uaddw2          v27.4s,  v27.4s,  v3.8h
   5441        b.gt            1b
   5442        mov             v0.16b,  v2.16b
   5443        mov             v1.16b,  v3.16b
   5444        b               L(ipred_cfl_ac_420_w16_hpad)
   5445 endfunc
   5446 
   5447 jumptable ipred_cfl_ac_422_tbl
   5448        .word L(ipred_cfl_ac_422_w16) - ipred_cfl_ac_422_tbl
   5449        .word L(ipred_cfl_ac_422_w8)  - ipred_cfl_ac_422_tbl
   5450        .word L(ipred_cfl_ac_422_w4)  - ipred_cfl_ac_422_tbl
   5451 endjumptable
   5452 
   5453 jumptable ipred_cfl_ac_422_w16_tbl
   5454        .word L(ipred_cfl_ac_422_w16_wpad0) - ipred_cfl_ac_422_w16_tbl
   5455        .word L(ipred_cfl_ac_422_w16_wpad1) - ipred_cfl_ac_422_w16_tbl
   5456        .word L(ipred_cfl_ac_422_w16_wpad2) - ipred_cfl_ac_422_w16_tbl
   5457        .word L(ipred_cfl_ac_422_w16_wpad3) - ipred_cfl_ac_422_w16_tbl
   5458 endjumptable
   5459 
   5460 // void cfl_ac_444_16bpc_neon(int16_t *const ac, const pixel *const ypx,
   5461 //                            const ptrdiff_t stride, const int w_pad,
   5462 //                            const int h_pad, const int cw, const int ch);
   5463 function ipred_cfl_ac_444_16bpc_neon, export=1
   5464        clz             w8,  w5
   5465        lsl             w4,  w4,  #2
   5466        movrel          x7,  ipred_cfl_ac_444_tbl
   5467        sub             w8,  w8,  #26
   5468        ldrsw           x8,  [x7, w8, uxtw #2]
   5469        movi            v24.4s,  #0
   5470        movi            v25.4s,  #0
   5471        movi            v26.4s,  #0
   5472        movi            v27.4s,  #0
   5473        add             x7,  x7,  x8
   5474        sub             w8,  w6,  w4         // height - h_pad
   5475        rbit            w9,  w5              // rbit(width)
   5476        rbit            w10, w6              // rbit(height)
   5477        clz             w9,  w9              // ctz(width)
   5478        clz             w10, w10             // ctz(height)
   5479        add             w9,  w9,  w10        // log2sz
   5480        add             x10, x1,  x2
   5481        dup             v31.4s,  w9
   5482        lsl             x2,  x2,  #1
   5483        neg             v31.4s,  v31.4s      // -log2sz
   5484        br              x7
   5485 
   5486 L(ipred_cfl_ac_444_w4):
   5487        AARCH64_VALID_JUMP_TARGET
   5488 1:      // Copy and expand input
   5489        ld1             {v0.4h},   [x1],  x2
   5490        ld1             {v0.d}[1], [x10], x2
   5491        ld1             {v1.4h},   [x1],  x2
   5492        ld1             {v1.d}[1], [x10], x2
   5493        shl             v0.8h,   v0.8h,   #3
   5494        shl             v1.8h,   v1.8h,   #3
   5495        subs            w8,  w8,  #4
   5496        st1             {v0.8h, v1.8h}, [x0], #32
   5497        uaddw           v24.4s,  v24.4s,  v0.4h
   5498        uaddw2          v25.4s,  v25.4s,  v0.8h
   5499        uaddw           v26.4s,  v26.4s,  v1.4h
   5500        uaddw2          v27.4s,  v27.4s,  v1.8h
   5501        b.gt            1b
   5502        trn2            v0.2d,   v1.2d,   v1.2d
   5503        trn2            v1.2d,   v1.2d,   v1.2d
   5504        b               L(ipred_cfl_ac_420_w4_hpad)
   5505 
   5506 L(ipred_cfl_ac_444_w8):
   5507        AARCH64_VALID_JUMP_TARGET
   5508 1:      // Copy and expand input
   5509        ld1             {v0.8h}, [x1],  x2
   5510        ld1             {v1.8h}, [x10], x2
   5511        ld1             {v2.8h}, [x1],  x2
   5512        shl             v0.8h,   v0.8h,   #3
   5513        ld1             {v3.8h}, [x10], x2
   5514        shl             v1.8h,   v1.8h,   #3
   5515        shl             v2.8h,   v2.8h,   #3
   5516        shl             v3.8h,   v3.8h,   #3
   5517        subs            w8,  w8,  #4
   5518        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5519        uaddw           v24.4s,  v24.4s,  v0.4h
   5520        uaddw2          v25.4s,  v25.4s,  v0.8h
   5521        uaddw           v26.4s,  v26.4s,  v1.4h
   5522        uaddw2          v27.4s,  v27.4s,  v1.8h
   5523        uaddw           v24.4s,  v24.4s,  v2.4h
   5524        uaddw2          v25.4s,  v25.4s,  v2.8h
   5525        uaddw           v26.4s,  v26.4s,  v3.4h
   5526        uaddw2          v27.4s,  v27.4s,  v3.8h
   5527        b.gt            1b
   5528        mov             v0.16b,  v3.16b
   5529        mov             v1.16b,  v3.16b
   5530        b               L(ipred_cfl_ac_420_w8_hpad)
   5531 
   5532 L(ipred_cfl_ac_444_w16):
   5533        AARCH64_VALID_JUMP_TARGET
   5534        cbnz            w3,  L(ipred_cfl_ac_444_w16_wpad)
   5535 1:      // Copy and expand input, without padding
   5536        ld1             {v0.8h, v1.8h}, [x1],  x2
   5537        ld1             {v2.8h, v3.8h}, [x10], x2
   5538        shl             v0.8h,   v0.8h,   #3
   5539        shl             v1.8h,   v1.8h,   #3
   5540        shl             v2.8h,   v2.8h,   #3
   5541        shl             v3.8h,   v3.8h,   #3
   5542        subs            w8,  w8,  #2
   5543        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5544        uaddw           v24.4s,  v24.4s,  v0.4h
   5545        uaddw2          v25.4s,  v25.4s,  v0.8h
   5546        uaddw           v26.4s,  v26.4s,  v1.4h
   5547        uaddw2          v27.4s,  v27.4s,  v1.8h
   5548        uaddw           v24.4s,  v24.4s,  v2.4h
   5549        uaddw2          v25.4s,  v25.4s,  v2.8h
   5550        uaddw           v26.4s,  v26.4s,  v3.4h
   5551        uaddw2          v27.4s,  v27.4s,  v3.8h
   5552        b.gt            1b
   5553        mov             v0.16b,  v2.16b
   5554        mov             v1.16b,  v3.16b
   5555        b               L(ipred_cfl_ac_420_w16_hpad)
   5556 
   5557 L(ipred_cfl_ac_444_w16_wpad):
   5558 1:      // Copy and expand input, padding 8
   5559        ld1             {v0.8h}, [x1],  x2
   5560        ld1             {v2.8h}, [x10], x2
   5561        shl             v0.8h,   v0.8h,   #3
   5562        shl             v2.8h,   v2.8h,   #3
   5563        dup             v1.8h,   v0.h[7]
   5564        dup             v3.8h,   v2.h[7]
   5565        subs            w8,  w8,  #2
   5566        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5567        uaddw           v24.4s,  v24.4s,  v0.4h
   5568        uaddw2          v25.4s,  v25.4s,  v0.8h
   5569        uaddw           v26.4s,  v26.4s,  v1.4h
   5570        uaddw2          v27.4s,  v27.4s,  v1.8h
   5571        uaddw           v24.4s,  v24.4s,  v2.4h
   5572        uaddw2          v25.4s,  v25.4s,  v2.8h
   5573        uaddw           v26.4s,  v26.4s,  v3.4h
   5574        uaddw2          v27.4s,  v27.4s,  v3.8h
   5575        b.gt            1b
   5576        mov             v0.16b,  v2.16b
   5577        mov             v1.16b,  v3.16b
   5578        b               L(ipred_cfl_ac_420_w16_hpad)
   5579 
   5580 L(ipred_cfl_ac_444_w32):
   5581        AARCH64_VALID_JUMP_TARGET
   5582        movrel          x7,  ipred_cfl_ac_444_w32_tbl
   5583        lsr             w3,  w3,  #1
   5584        ldrsw           x3,  [x7, w3, uxtw #2]
   5585        lsr             x2,  x2,  #1 // Restore the stride to one line increments
   5586        add             x7,  x7,  x3
   5587        br              x7
   5588 
   5589 L(ipred_cfl_ac_444_w32_wpad0):
   5590        AARCH64_VALID_JUMP_TARGET
   5591 1:      // Copy and expand input, without padding
   5592        ld1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x1],  x2
   5593        shl             v0.8h,   v0.8h,   #3
   5594        shl             v1.8h,   v1.8h,   #3
   5595        shl             v2.8h,   v2.8h,   #3
   5596        shl             v3.8h,   v3.8h,   #3
   5597        subs            w8,  w8,  #1
   5598        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5599        uaddw           v24.4s,  v24.4s,  v0.4h
   5600        uaddw2          v25.4s,  v25.4s,  v0.8h
   5601        uaddw           v26.4s,  v26.4s,  v1.4h
   5602        uaddw2          v27.4s,  v27.4s,  v1.8h
   5603        uaddw           v24.4s,  v24.4s,  v2.4h
   5604        uaddw2          v25.4s,  v25.4s,  v2.8h
   5605        uaddw           v26.4s,  v26.4s,  v3.4h
   5606        uaddw2          v27.4s,  v27.4s,  v3.8h
   5607        b.gt            1b
   5608        b               L(ipred_cfl_ac_444_w32_hpad)
   5609 
   5610 L(ipred_cfl_ac_444_w32_wpad2):
   5611        AARCH64_VALID_JUMP_TARGET
   5612 1:      // Copy and expand input, padding 8
   5613        ld1             {v0.8h, v1.8h, v2.8h}, [x1],  x2
   5614        shl             v2.8h,   v2.8h,   #3
   5615        shl             v0.8h,   v0.8h,   #3
   5616        shl             v1.8h,   v1.8h,   #3
   5617        dup             v3.8h,   v2.h[7]
   5618        subs            w8,  w8,  #1
   5619        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5620        uaddw           v24.4s,  v24.4s,  v0.4h
   5621        uaddw2          v25.4s,  v25.4s,  v0.8h
   5622        uaddw           v26.4s,  v26.4s,  v1.4h
   5623        uaddw2          v27.4s,  v27.4s,  v1.8h
   5624        uaddw           v24.4s,  v24.4s,  v2.4h
   5625        uaddw2          v25.4s,  v25.4s,  v2.8h
   5626        uaddw           v26.4s,  v26.4s,  v3.4h
   5627        uaddw2          v27.4s,  v27.4s,  v3.8h
   5628        b.gt            1b
   5629        b               L(ipred_cfl_ac_444_w32_hpad)
   5630 
   5631 L(ipred_cfl_ac_444_w32_wpad4):
   5632        AARCH64_VALID_JUMP_TARGET
   5633 1:      // Copy and expand input, padding 16
   5634        ld1             {v0.8h, v1.8h}, [x1],  x2
   5635        shl             v1.8h,   v1.8h,   #3
   5636        shl             v0.8h,   v0.8h,   #3
   5637        dup             v2.8h,   v1.h[7]
   5638        dup             v3.8h,   v1.h[7]
   5639        subs            w8,  w8,  #1
   5640        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5641        uaddw           v24.4s,  v24.4s,  v0.4h
   5642        uaddw2          v25.4s,  v25.4s,  v0.8h
   5643        uaddw           v26.4s,  v26.4s,  v1.4h
   5644        uaddw2          v27.4s,  v27.4s,  v1.8h
   5645        uaddw           v24.4s,  v24.4s,  v2.4h
   5646        uaddw2          v25.4s,  v25.4s,  v2.8h
   5647        uaddw           v26.4s,  v26.4s,  v3.4h
   5648        uaddw2          v27.4s,  v27.4s,  v3.8h
   5649        b.gt            1b
   5650        b               L(ipred_cfl_ac_444_w32_hpad)
   5651 
   5652 L(ipred_cfl_ac_444_w32_wpad6):
   5653        AARCH64_VALID_JUMP_TARGET
   5654 1:      // Copy and expand input, padding 24
   5655        ld1             {v0.8h}, [x1],  x2
   5656        shl             v0.8h,   v0.8h,   #3
   5657        dup             v1.8h,   v0.h[7]
   5658        dup             v2.8h,   v0.h[7]
   5659        dup             v3.8h,   v0.h[7]
   5660        subs            w8,  w8,  #1
   5661        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5662        uaddw           v24.4s,  v24.4s,  v0.4h
   5663        uaddw2          v25.4s,  v25.4s,  v0.8h
   5664        uaddw           v26.4s,  v26.4s,  v1.4h
   5665        uaddw2          v27.4s,  v27.4s,  v1.8h
   5666        uaddw           v24.4s,  v24.4s,  v2.4h
   5667        uaddw2          v25.4s,  v25.4s,  v2.8h
   5668        uaddw           v26.4s,  v26.4s,  v3.4h
   5669        uaddw2          v27.4s,  v27.4s,  v3.8h
   5670        b.gt            1b
   5671 
   5672 L(ipred_cfl_ac_444_w32_hpad):
   5673        cbz             w4,  3f
   5674 2:      // Vertical padding (h_pad > 0)
   5675        subs            w4,  w4,  #2
   5676        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5677        uaddw           v24.4s,  v24.4s,  v0.4h
   5678        uaddw2          v25.4s,  v25.4s,  v0.8h
   5679        uaddw           v26.4s,  v26.4s,  v1.4h
   5680        uaddw2          v27.4s,  v27.4s,  v1.8h
   5681        uaddw           v24.4s,  v24.4s,  v2.4h
   5682        uaddw2          v25.4s,  v25.4s,  v2.8h
   5683        uaddw           v26.4s,  v26.4s,  v3.4h
   5684        uaddw2          v27.4s,  v27.4s,  v3.8h
   5685        st1             {v0.8h, v1.8h, v2.8h, v3.8h}, [x0], #64
   5686        uaddw           v24.4s,  v24.4s,  v0.4h
   5687        uaddw2          v25.4s,  v25.4s,  v0.8h
   5688        uaddw           v26.4s,  v26.4s,  v1.4h
   5689        uaddw2          v27.4s,  v27.4s,  v1.8h
   5690        uaddw           v24.4s,  v24.4s,  v2.4h
   5691        uaddw2          v25.4s,  v25.4s,  v2.8h
   5692        uaddw           v26.4s,  v26.4s,  v3.4h
   5693        uaddw2          v27.4s,  v27.4s,  v3.8h
   5694        b.gt            2b
   5695 3:
   5696 
   5697        //  Multiply the height by eight and reuse the w4 subtracting
   5698        lsl             w6,  w6,  #3
   5699        b               L(ipred_cfl_ac_420_w4_calc_subtract_dc)
   5700 endfunc
   5701 
   5702 jumptable ipred_cfl_ac_444_tbl
   5703        .word L(ipred_cfl_ac_444_w32) - ipred_cfl_ac_444_tbl
   5704        .word L(ipred_cfl_ac_444_w16) - ipred_cfl_ac_444_tbl
   5705        .word L(ipred_cfl_ac_444_w8)  - ipred_cfl_ac_444_tbl
   5706        .word L(ipred_cfl_ac_444_w4)  - ipred_cfl_ac_444_tbl
   5707 endjumptable
   5708 
   5709 jumptable ipred_cfl_ac_444_w32_tbl
   5710        .word L(ipred_cfl_ac_444_w32_wpad0) - ipred_cfl_ac_444_w32_tbl
   5711        .word L(ipred_cfl_ac_444_w32_wpad2) - ipred_cfl_ac_444_w32_tbl
   5712        .word L(ipred_cfl_ac_444_w32_wpad4) - ipred_cfl_ac_444_w32_tbl
   5713        .word L(ipred_cfl_ac_444_w32_wpad6) - ipred_cfl_ac_444_w32_tbl
   5714 endjumptable