tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

h264dsp_neon.S (40316B)


      1 /*
      2 * Copyright (c) 2008 Mans Rullgard <mans@mansr.com>
      3 * Copyright (c) 2013 Janne Grunau <janne-libav@jannau.net>
      4 * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
      5 *
      6 * This file is part of FFmpeg.
      7 *
      8 * FFmpeg is free software; you can redistribute it and/or
      9 * modify it under the terms of the GNU Lesser General Public
     10 * License as published by the Free Software Foundation; either
     11 * version 2.1 of the License, or (at your option) any later version.
     12 *
     13 * FFmpeg is distributed in the hope that it will be useful,
     14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     16 * Lesser General Public License for more details.
     17 *
     18 * You should have received a copy of the GNU Lesser General Public
     19 * License along with FFmpeg; if not, write to the Free Software
     20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     21 */
     22 
     23 #include "libavutil/aarch64/asm.S"
     24 #include "neon.S"
     25 
     26 .macro  h264_loop_filter_start
               // Common entry check shared by the non-intra loop filter functions.
               // Reads:  w2, w3 (used as alpha and beta by the filter macros) and
               //         x4 = address of a 32-bit word (four signed bytes,
               //         presumably the tc0 values - confirm against the caller).
               // Writes: w6 = the loaded word, v24.s[0] = the same word, w8 = scratch.
               // Executes `ret` in the enclosing function when
               //   - w2 != 0 and w3 == 0, or
               //   - all four loaded bytes have their top bit set (are negative).
               // NOTE(review): when w2 == 0 the ccmp below sets NZCV to 0, so Z is
               // clear and this case is not an early return here; confirm whether
               // that is intended.
     27        cmp             w2,  #0
     28        ldr             w6,  [x4]
     29        ccmp            w3,  #0, #0, ne                 // w2 != 0: flags = cmp(w3, 0); else NZCV = 0
     30        mov             v24.s[0], w6
     31        and             w8,  w6,  w6,  lsl #16          // does not touch the flags
     32        b.eq            1f                              // Z set only if w2 != 0 && w3 == 0
     33        ands            w8,  w8,  w8,  lsl #8           // bit 31 = AND of the top bits of all four bytes
     34        b.ge            2f                              // N clear: at least one byte is non-negative
     35 1:
     36        ret
     37 2:
     38 .endm
     39 
     40 .macro  h264_loop_filter_luma
               // Normal (non-intra) luma edge filter on 16 byte lanes.
               // In:   v20 = p2, v18 = p1, v16 = p0, v0 = q0, v2 = q1, v4 = q2,
               //       w2 = alpha, w3 = beta, v24.s[0] = four packed tc0 bytes.
               // Out:  v17 = new p1, v16 = new p0, v0 = new q0, v19 = new q1.
               // Branches to the local label 9: of the enclosing function when
               // no lane passes the filter conditions.
               // Clobbers: x7, v4, v20-v24, v28, v30.
     41        dup             v22.16b, w2                     // alpha
     42        uxtl            v24.8h,  v24.8b
     43        uabd            v21.16b, v16.16b, v0.16b        // abs(p0 - q0)
     44        uxtl            v24.4s,  v24.4h
     45        uabd            v28.16b, v18.16b, v16.16b       // abs(p1 - p0)
     46        sli             v24.8h,  v24.8h,  #8
     47        uabd            v30.16b, v2.16b,  v0.16b        // abs(q1 - q0)
     48        sli             v24.4s,  v24.4s,  #16           // each tc0 byte now fills 4 adjacent byte lanes
     49        cmhi            v21.16b, v22.16b, v21.16b       // < alpha
     50        dup             v22.16b, w3                     // beta
     51        cmlt            v23.16b, v24.16b, #0            // lanes whose tc0 is negative
     52        cmhi            v28.16b, v22.16b, v28.16b       // < beta
     53        cmhi            v30.16b, v22.16b, v30.16b       // < beta
     54        bic             v21.16b, v21.16b, v23.16b       // drop lanes with negative tc0
     55        uabd            v17.16b, v20.16b, v16.16b       // abs(p2 - p0)
     56        and             v21.16b, v21.16b, v28.16b
     57        uabd            v19.16b,  v4.16b,  v0.16b       // abs(q2 - q0)
     58        and             v21.16b, v21.16b, v30.16b      // < beta
     59        shrn            v30.8b,  v21.8h,  #4            // squeeze the 128-bit mask into 64 bits
     60        mov             x7, v30.d[0]
     61        cmhi            v17.16b, v22.16b, v17.16b       // < beta
     62        cmhi            v19.16b, v22.16b, v19.16b       // < beta
     63        cbz             x7,  9f                         // nothing to filter in any lane
     64        and             v17.16b, v17.16b, v21.16b       // abs(p2 - p0) < beta, filtered lanes only
     65        and             v19.16b, v19.16b, v21.16b       // abs(q2 - q0) < beta, filtered lanes only
     66        and             v24.16b, v24.16b, v21.16b       // tc0, zero in lanes that are not filtered
     67        urhadd          v28.16b, v16.16b,  v0.16b       // (p0 + q0 + 1) >> 1
     68        sub             v21.16b, v24.16b, v17.16b       // set mask lanes are -1, so this adds 1
     69        uqadd           v23.16b, v18.16b, v24.16b       // p1 + tc0 (saturating)
     70        uhadd           v20.16b, v20.16b, v28.16b       // (p2 + v28) >> 1
     71        sub             v21.16b, v21.16b, v19.16b       // v21 = tc0 + (v17 set) + (v19 set)
     72        uhadd           v28.16b,  v4.16b, v28.16b       // (q2 + v28) >> 1
     73        umin            v23.16b, v23.16b, v20.16b
     74        uqsub           v22.16b, v18.16b, v24.16b       // p1 - tc0 (saturating)
     75        uqadd           v4.16b,   v2.16b, v24.16b       // q1 + tc0 (saturating)
     76        umax            v23.16b, v23.16b, v22.16b       // p1 candidate, kept within p1 +/- tc0
     77        uqsub           v22.16b,  v2.16b, v24.16b       // q1 - tc0 (saturating)
     78        umin            v28.16b,  v4.16b, v28.16b
     79        uxtl            v4.8h,    v0.8b
     80        umax            v28.16b, v28.16b, v22.16b       // q1 candidate, kept within q1 +/- tc0
     81        uxtl2           v20.8h,   v0.16b
     82        usubw           v4.8h,    v4.8h,  v16.8b        // q0 - p0
     83        usubw2          v20.8h,  v20.8h,  v16.16b
     84        shl             v4.8h,    v4.8h,  #2
     85        shl             v20.8h,  v20.8h,  #2
     86        uaddw           v4.8h,    v4.8h,  v18.8b
     87        uaddw2          v20.8h,  v20.8h,  v18.16b
     88        usubw           v4.8h,    v4.8h,   v2.8b
     89        usubw2          v20.8h,  v20.8h,   v2.16b
     90        rshrn           v4.8b,    v4.8h,  #3            // (4*(q0 - p0) + p1 - q1 + 4) >> 3
     91        rshrn2          v4.16b,  v20.8h,  #3
     92        bsl             v17.16b, v23.16b, v18.16b       // v17 = mask ? p1 candidate : p1
     93        bsl             v19.16b, v28.16b,  v2.16b       // v19 = mask ? q1 candidate : q1
     94        neg             v23.16b, v21.16b                // negated clamp bound
     95        uxtl            v28.8h,  v16.8b
     96        smin            v4.16b,   v4.16b, v21.16b       // delta = min(delta, v21)
     97        uxtl2           v21.8h,  v16.16b
     98        smax            v4.16b,   v4.16b, v23.16b       // delta = max(delta, -bound)
     99        uxtl            v22.8h,   v0.8b
    100        uxtl2           v24.8h,   v0.16b
    101        saddw           v28.8h,  v28.8h,  v4.8b         // p0 + delta
    102        saddw2          v21.8h,  v21.8h,  v4.16b
    103        ssubw           v22.8h,  v22.8h,  v4.8b         // q0 - delta
    104        ssubw2          v24.8h,  v24.8h,  v4.16b
    105        sqxtun          v16.8b,  v28.8h                 // saturate back to unsigned bytes
    106        sqxtun2         v16.16b, v21.8h
    107        sqxtun          v0.8b,   v22.8h
    108        sqxtun2         v0.16b,  v24.8h
    109 .endm
    110 
    111 function ff_h264_v_loop_filter_luma_neon, export=1
               // Filters 16 bytes per row across a boundary between rows.
               // In: x0 = address of the q0 row, x1 = byte offset between rows,
               //     w2 = alpha, w3 = beta, x4 = address of four tc0 bytes.
               // (Presumably matches the C loop-filter prototype - confirm.)
               // Loads rows x0 - 3*x1 .. x0 + 2*x1 and rewrites the four rows
               // x0 - 2*x1 .. x0 + x1.
    112        h264_loop_filter_start
    113 
    114        ld1             {v0.16b},  [x0], x1                // q0
    115        ld1             {v2.16b},  [x0], x1                // q1
    116        ld1             {v4.16b},  [x0], x1                // q2
    117        sub             x0,  x0,  x1, lsl #2
    118        sub             x0,  x0,  x1, lsl #1               // x0 is now 3 rows above the q0 row
    119        ld1             {v20.16b},  [x0], x1               // p2
    120        ld1             {v18.16b},  [x0], x1               // p1
    121        ld1             {v16.16b},  [x0], x1               // p0
    122 
    123        h264_loop_filter_luma
    124 
    125        sub             x0,  x0,  x1, lsl #1               // back to the p1 row
    126        st1             {v17.16b},  [x0], x1               // new p1
    127        st1             {v16.16b}, [x0], x1                // new p0
    128        st1             {v0.16b},  [x0], x1                // new q0
    129        st1             {v19.16b}, [x0]                    // new q1
    130 9:
    131        ret
    132 endfunc
    133 
    134 function ff_h264_h_loop_filter_luma_neon, export=1
               // Filters a boundary between columns over 16 rows.
               // In: x0 = address of the first byte right of the edge in row 0,
               //     x1 = byte offset between rows, w2 = alpha, w3 = beta,
               //     x4 = address of four tc0 bytes.
               // Loads 8 bytes (x0 - 4 .. x0 + 3) from each of 16 rows and
               // rewrites the 4 bytes x0 - 2 .. x0 + 1 of each row.
               // transpose_8x16B / transpose_4x16B are not defined in this
               // part of the file (presumably neon.S).
    135        h264_loop_filter_start
    136 
    137        sub             x0,  x0,  #4
    138        ld1             {v6.8b},  [x0], x1                 // rows 0-7 go to the low halves
    139        ld1             {v20.8b}, [x0], x1
    140        ld1             {v18.8b}, [x0], x1
    141        ld1             {v16.8b}, [x0], x1
    142        ld1             {v0.8b},  [x0], x1
    143        ld1             {v2.8b},  [x0], x1
    144        ld1             {v4.8b},  [x0], x1
    145        ld1             {v26.8b}, [x0], x1
    146        ld1             {v6.d}[1],  [x0], x1               // rows 8-15 go to the high halves
    147        ld1             {v20.d}[1], [x0], x1
    148        ld1             {v18.d}[1], [x0], x1
    149        ld1             {v16.d}[1], [x0], x1
    150        ld1             {v0.d}[1],  [x0], x1
    151        ld1             {v2.d}[1],  [x0], x1
    152        ld1             {v4.d}[1],  [x0], x1
    153        ld1             {v26.d}[1], [x0], x1
    154 
    155        transpose_8x16B v6, v20, v18, v16, v0, v2, v4, v26, v21, v23
    156 
    157        h264_loop_filter_luma
    158 
    159        transpose_4x16B v17, v16, v0, v19, v21, v23, v25, v27
    160 
    161        sub             x0,  x0,  x1, lsl #4               // rewind the 16 rows that were loaded
    162        add             x0,  x0,  #2                       // two bytes left of the edge
    163        st1             {v17.s}[0],  [x0], x1              // each st1 writes 4 bytes of one row
    164        st1             {v16.s}[0], [x0], x1
    165        st1             {v0.s}[0],  [x0], x1
    166        st1             {v19.s}[0], [x0], x1
    167        st1             {v17.s}[1],  [x0], x1
    168        st1             {v16.s}[1], [x0], x1
    169        st1             {v0.s}[1],  [x0], x1
    170        st1             {v19.s}[1], [x0], x1
    171        st1             {v17.s}[2],  [x0], x1
    172        st1             {v16.s}[2], [x0], x1
    173        st1             {v0.s}[2],  [x0], x1
    174        st1             {v19.s}[2], [x0], x1
    175        st1             {v17.s}[3],  [x0], x1
    176        st1             {v16.s}[3], [x0], x1
    177        st1             {v0.s}[3],  [x0], x1
    178        st1             {v19.s}[3], [x0], x1
    179 9:
    180        ret
    181 endfunc
    182 
    183 
    184 .macro h264_loop_filter_start_intra
               // Common entry for the intra loop filter functions.
               // Executes `ret` in the enclosing function when w2 and w3 are
               // both zero; otherwise broadcasts them to all byte lanes.
               // Out: v30 = alpha (w2), v31 = beta (w3).  Clobbers: w4.
    185        orr             w4,  w2,  w3
    186        cbnz            w4,  1f                    // continue if either value is non-zero
    187        ret
    188 1:
    189        dup             v30.16b, w2                // alpha
    190        dup             v31.16b, w3                // beta
    191 .endm
    192 
    193 .macro h264_loop_filter_luma_intra
               // Intra luma edge filter on 16 byte lanes.
               // In:  v4 = p3, v5 = p2, v6 = p1, v7 = p0,
               //      v0 = q0, v1 = q1, v2 = q2, v3 = q3,
               //      v30 = alpha, v31 = beta (set by h264_loop_filter_start_intra).
               // Out: v5, v6, v7 (p2, p1, p0) and v0, v1, v2 (q0, q1, q2) are
               //      updated in place in the lanes that pass the conditions.
               // Branches to the local label 9: of the enclosing function when
               // no lane passes the basic alpha/beta test.
               // Clobbers: x4, v16-v31.
    194        uabd            v16.16b, v7.16b,  v0.16b        // abs(p0 - q0)
    195        uabd            v17.16b, v6.16b,  v7.16b        // abs(p1 - p0)
    196        uabd            v18.16b, v1.16b,  v0.16b        // abs(q1 - q0)
    197        cmhi            v19.16b, v30.16b, v16.16b       // < alpha
    198        cmhi            v17.16b, v31.16b, v17.16b       // < beta
    199        cmhi            v18.16b, v31.16b, v18.16b       // < beta
    200 
    201        movi            v29.16b, #2
    202        ushr            v30.16b, v30.16b, #2            // alpha >> 2
    203        add             v30.16b, v30.16b, v29.16b       // (alpha >> 2) + 2
    204        cmhi            v16.16b, v30.16b, v16.16b       // < (alpha >> 2) + 2
    205 
    206        and             v19.16b, v19.16b, v17.16b
    207        and             v19.16b, v19.16b, v18.16b       // v19 = if_1: all three basic tests pass
    208        shrn            v20.8b,  v19.8h,  #4            // squeeze the 128-bit mask into 64 bits
    209        mov             x4, v20.d[0]
    210        cbz             x4, 9f                          // nothing to filter in any lane
    211 
    212        ushll           v20.8h,  v6.8b,   #1            // v20/v21 accumulate 2*p1 + p0 + q1
    213        ushll           v22.8h,  v1.8b,   #1            // v22/v23 accumulate 2*q1 + q0 + p1
    214        ushll2          v21.8h,  v6.16b,  #1
    215        ushll2          v23.8h,  v1.16b,  #1
    216        uaddw           v20.8h,  v20.8h,  v7.8b
    217        uaddw           v22.8h,  v22.8h,  v0.8b
    218        uaddw2          v21.8h,  v21.8h,  v7.16b
    219        uaddw2          v23.8h,  v23.8h,  v0.16b
    220        uaddw           v20.8h,  v20.8h,  v1.8b
    221        uaddw           v22.8h,  v22.8h,  v6.8b
    222        uaddw2          v21.8h,  v21.8h,  v1.16b
    223        uaddw2          v23.8h,  v23.8h,  v6.16b
    224 
    225        rshrn           v24.8b,  v20.8h,  #2 // p0'_1
    226        rshrn           v25.8b,  v22.8h,  #2 // q0'_1
    227        rshrn2          v24.16b, v21.8h,  #2 // p0'_1
    228        rshrn2          v25.16b, v23.8h,  #2 // q0'_1
    229 
    230        uabd            v17.16b, v5.16b,  v7.16b        // abs(p2 - p0)
    231        uabd            v18.16b, v2.16b,  v0.16b        // abs(q2 - q0)
    232        cmhi            v17.16b, v31.16b, v17.16b       // < beta
    233        cmhi            v18.16b, v31.16b, v18.16b       // < beta
    234 
    235        and             v17.16b, v16.16b, v17.16b  // if_2 && if_3
    236        and             v18.16b, v16.16b, v18.16b  // if_2 && if_4
    237 
    238        not             v30.16b, v17.16b
    239        not             v31.16b, v18.16b
    240 
    241        and             v30.16b, v30.16b, v19.16b  // if_1 && !(if_2 && if_3)
    242        and             v31.16b, v31.16b, v19.16b  // if_1 && !(if_2 && if_4)
    243 
    244        and             v17.16b, v19.16b, v17.16b  // if_1 && if_2 && if_3
    245        and             v18.16b, v19.16b, v18.16b  // if_1 && if_2 && if_4
    246 
    247        //calc            p, v7, v6, v5, v4, v17, v7, v6, v5, v4
    248        uaddl           v26.8h,  v5.8b,   v7.8b         // p2 + p0
    249        uaddl2          v27.8h,  v5.16b,  v7.16b
    250        uaddw           v26.8h,  v26.8h,  v0.8b         // p2 + p0 + q0
    251        uaddw2          v27.8h,  v27.8h,  v0.16b
    252        add             v20.8h,  v20.8h,  v26.8h
    253        add             v21.8h,  v21.8h,  v27.8h
    254        uaddw           v20.8h,  v20.8h,  v0.8b         // p2 + 2*p1 + 2*p0 + 2*q0 + q1
    255        uaddw2          v21.8h,  v21.8h,  v0.16b
    256        rshrn           v20.8b,  v20.8h,  #3 // p0'_2
    257        rshrn2          v20.16b, v21.8h,  #3 // p0'_2
    258        uaddw           v26.8h,  v26.8h,  v6.8b         // p2 + p1 + p0 + q0
    259        uaddw2          v27.8h,  v27.8h,  v6.16b
    260        rshrn           v21.8b,  v26.8h,  #2 // p1'_2
    261        rshrn2          v21.16b, v27.8h,  #2 // p1'_2
    262        uaddl           v28.8h,  v4.8b,   v5.8b         // p3 + p2
    263        uaddl2          v29.8h,  v4.16b,  v5.16b
    264        shl             v28.8h,  v28.8h,  #1
    265        shl             v29.8h,  v29.8h,  #1
    266        add             v28.8h,  v28.8h,  v26.8h        // 2*p3 + 3*p2 + p1 + p0 + q0
    267        add             v29.8h,  v29.8h,  v27.8h
    268        rshrn           v19.8b,  v28.8h,  #3 // p2'_2
    269        rshrn2          v19.16b, v29.8h,  #3 // p2'_2
    270 
    271        //calc            q, v0, v1, v2, v3, v18, v0, v1, v2, v3
    272        uaddl           v26.8h,  v2.8b,   v0.8b         // q2 + q0
    273        uaddl2          v27.8h,  v2.16b,  v0.16b
    274        uaddw           v26.8h,  v26.8h,  v7.8b         // q2 + q0 + p0
    275        uaddw2          v27.8h,  v27.8h,  v7.16b
    276        add             v22.8h,  v22.8h,  v26.8h
    277        add             v23.8h,  v23.8h,  v27.8h
    278        uaddw           v22.8h,  v22.8h,  v7.8b         // q2 + 2*q1 + 2*q0 + 2*p0 + p1
    279        uaddw2          v23.8h,  v23.8h,  v7.16b
    280        rshrn           v22.8b,  v22.8h,  #3 // q0'_2
    281        rshrn2          v22.16b, v23.8h,  #3 // q0'_2
    282        uaddw           v26.8h,  v26.8h,  v1.8b         // q2 + q1 + q0 + p0
    283        uaddw2          v27.8h,  v27.8h,  v1.16b
    284        rshrn           v23.8b,  v26.8h,  #2 // q1'_2
    285        rshrn2          v23.16b, v27.8h,  #2 // q1'_2
    286        uaddl           v28.8h,  v2.8b,   v3.8b         // q2 + q3
    287        uaddl2          v29.8h,  v2.16b,  v3.16b
    288        shl             v28.8h,  v28.8h,  #1
    289        shl             v29.8h,  v29.8h,  #1
    290        add             v28.8h,  v28.8h,  v26.8h        // 2*q3 + 3*q2 + q1 + q0 + p0
    291        add             v29.8h,  v29.8h,  v27.8h
    292        rshrn           v26.8b,  v28.8h,  #3 // q2'_2
    293        rshrn2          v26.16b, v29.8h,  #3 // q2'_2
    294 
    295        bit             v7.16b,  v24.16b, v30.16b  // p0'_1
    296        bit             v0.16b,  v25.16b, v31.16b  // q0'_1
    297        bit             v7.16b,  v20.16b, v17.16b  // p0'_2
    298        bit             v6.16b,  v21.16b, v17.16b  // p1'_2
    299        bit             v5.16b,  v19.16b, v17.16b  // p2'_2
    300        bit             v0.16b,  v22.16b, v18.16b  // q0'_2
    301        bit             v1.16b,  v23.16b, v18.16b  // q1'_2
    302        bit             v2.16b,  v26.16b, v18.16b  // q2'_2
    303 .endm
    304 
    305 function ff_h264_v_loop_filter_luma_intra_neon, export=1
               // Intra filter across a boundary between rows, 16 bytes per row.
               // In: x0 = address of the q0 row, x1 = byte offset between rows,
               //     w2 = alpha, w3 = beta.
               // Loads rows x0 - 4*x1 .. x0 + 3*x1 and rewrites the six rows
               // x0 - 3*x1 .. x0 + 2*x1.
    306        h264_loop_filter_start_intra
    307 
    308        ld1             {v0.16b},  [x0], x1 // q0
    309        ld1             {v1.16b},  [x0], x1 // q1
    310        ld1             {v2.16b},  [x0], x1 // q2
    311        ld1             {v3.16b},  [x0], x1 // q3
    312        sub             x0,  x0,  x1, lsl #3            // 4 rows above the q0 row
    313        ld1             {v4.16b},  [x0], x1 // p3
    314        ld1             {v5.16b},  [x0], x1 // p2
    315        ld1             {v6.16b},  [x0], x1 // p1
    316        ld1             {v7.16b},  [x0]     // p0
    317 
    318        h264_loop_filter_luma_intra
    319 
    320        sub             x0,  x0,  x1, lsl #1            // from the p0 row back to the p2 row
    321        st1             {v5.16b}, [x0], x1  // p2
    322        st1             {v6.16b}, [x0], x1  // p1
    323        st1             {v7.16b}, [x0], x1  // p0
    324        st1             {v0.16b}, [x0], x1  // q0
    325        st1             {v1.16b}, [x0], x1  // q1
    326        st1             {v2.16b}, [x0]      // q2
    327 9:
    328        ret
    329 endfunc
    330 
    331 function ff_h264_h_loop_filter_luma_intra_neon, export=1
               // Intra filter across a boundary between columns over 16 rows.
               // In: x0 = address of the first byte right of the edge in row 0,
               //     x1 = byte offset between rows, w2 = alpha, w3 = beta.
               // Loads 8 bytes (x0 - 4 .. x0 + 3) from each of 16 rows, filters
               // them and writes all 8 bytes of every row back.
               // transpose_8x16B is not defined in this part of the file
               // (presumably neon.S).
    332        h264_loop_filter_start_intra
    333 
    334        sub             x0,  x0,  #4
    335        ld1             {v4.8b},  [x0], x1              // rows 0-7 go to the low halves
    336        ld1             {v5.8b},  [x0], x1
    337        ld1             {v6.8b},  [x0], x1
    338        ld1             {v7.8b},  [x0], x1
    339        ld1             {v0.8b},  [x0], x1
    340        ld1             {v1.8b},  [x0], x1
    341        ld1             {v2.8b},  [x0], x1
    342        ld1             {v3.8b},  [x0], x1
    343        ld1             {v4.d}[1],  [x0], x1            // rows 8-15 go to the high halves
    344        ld1             {v5.d}[1],  [x0], x1
    345        ld1             {v6.d}[1],  [x0], x1
    346        ld1             {v7.d}[1],  [x0], x1
    347        ld1             {v0.d}[1],  [x0], x1
    348        ld1             {v1.d}[1],  [x0], x1
    349        ld1             {v2.d}[1],  [x0], x1
    350        ld1             {v3.d}[1],  [x0], x1
    351 
    352        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
    353 
    354        h264_loop_filter_luma_intra
    355 
    356        transpose_8x16B v4, v5, v6, v7, v0, v1, v2, v3, v21, v23
    357 
    358        sub             x0,  x0,  x1, lsl #4            // rewind the 16 rows that were loaded
    359        st1             {v4.8b},  [x0], x1
    360        st1             {v5.8b},  [x0], x1
    361        st1             {v6.8b},  [x0], x1
    362        st1             {v7.8b},  [x0], x1
    363        st1             {v0.8b},  [x0], x1
    364        st1             {v1.8b},  [x0], x1
    365        st1             {v2.8b},  [x0], x1
    366        st1             {v3.8b},  [x0], x1
    367        st1             {v4.d}[1],  [x0], x1
    368        st1             {v5.d}[1],  [x0], x1
    369        st1             {v6.d}[1],  [x0], x1
    370        st1             {v7.d}[1],  [x0], x1
    371        st1             {v0.d}[1],  [x0], x1
    372        st1             {v1.d}[1],  [x0], x1
    373        st1             {v2.d}[1],  [x0], x1
    374        st1             {v3.d}[1],  [x0], x1
    375 9:
    376        ret
    377 endfunc
    378 
    379 .macro  h264_loop_filter_chroma
               // Normal (non-intra) chroma edge filter on 8 byte lanes.
               // In:  v18 = p1, v16 = p0, v0 = q0, v2 = q1,
               //      w2 = alpha, w3 = beta, v24.s[0] = four packed tc0 bytes.
               // Out: v16 = new p0, v0 = new q0.
               // Branches to the local label 9: of the enclosing function when
               // no lane passes the alpha/beta tests.
               // Clobbers: x8, v4, v22-v26, v28, v30.
    380        dup             v22.8b, w2              // alpha
    381        dup             v23.8b, w3              // beta
    382        uxtl            v24.8h, v24.8b
    383        uabd            v26.8b, v16.8b, v0.8b   // abs(p0 - q0)
    384        uabd            v28.8b, v18.8b, v16.8b  // abs(p1 - p0)
    385        uabd            v30.8b, v2.8b,  v0.8b   // abs(q1 - q0)
    386        cmhi            v26.8b, v22.8b, v26.8b  // < alpha
    387        cmhi            v28.8b, v23.8b, v28.8b  // < beta
    388        cmhi            v30.8b, v23.8b, v30.8b  // < beta
    389        uxtl            v4.8h,  v0.8b
    390        and             v26.8b, v26.8b, v28.8b
    391        usubw           v4.8h,  v4.8h,  v16.8b  // q0 - p0
    392        and             v26.8b, v26.8b, v30.8b  // v26 = filter mask
    393        shl             v4.8h,  v4.8h,  #2
    394        mov             x8,  v26.d[0]
    395        sli             v24.8h, v24.8h, #8      // each tc0 byte now fills 2 adjacent byte lanes
    396        uaddw           v4.8h,  v4.8h,  v18.8b
    397        cbz             x8,  9f                 // nothing to filter in any lane
    398        usubw           v4.8h,  v4.8h,  v2.8b
    399        rshrn           v4.8b,  v4.8h,  #3      // (4*(q0 - p0) + p1 - q1 + 4) >> 3
    400        smin            v4.8b,  v4.8b,  v24.8b  // delta = min(delta, v24)
    401        neg             v25.8b, v24.8b
    402        smax            v4.8b,  v4.8b,  v25.8b  // delta = max(delta, -v24)
    403        uxtl            v22.8h, v0.8b
    404        and             v4.8b,  v4.8b,  v26.8b  // delta = 0 in lanes that are not filtered
    405        uxtl            v28.8h, v16.8b
    406        saddw           v28.8h, v28.8h, v4.8b   // p0 + delta
    407        ssubw           v22.8h, v22.8h, v4.8b   // q0 - delta
    408        sqxtun          v16.8b, v28.8h          // saturate back to unsigned bytes
    409        sqxtun          v0.8b,  v22.8h
    410 .endm
    411 
    412 function ff_h264_v_loop_filter_chroma_neon, export=1
               // Chroma filter across a boundary between rows, 8 bytes per row.
               // In: x0 = address of the q0 row, x1 = byte offset between rows,
               //     w2 = alpha, w3 = beta, x4 = address of four tc0 bytes.
               // Loads rows x0 - 2*x1 .. x0 + x1 and rewrites rows x0 - x1 and x0.
    413        h264_loop_filter_start
    414 
    415        sub             x0,  x0,  x1, lsl #1
    416        ld1             {v18.8b}, [x0], x1      // p1
    417        ld1             {v16.8b}, [x0], x1      // p0
    418        ld1             {v0.8b},  [x0], x1      // q0
    419        ld1             {v2.8b},  [x0]          // q1
    420 
    421        h264_loop_filter_chroma
    422 
    423        sub             x0,  x0,  x1, lsl #1    // from the q1 row back to the p0 row
    424        st1             {v16.8b}, [x0], x1      // new p0
    425        st1             {v0.8b},  [x0], x1      // new q0
    426 9:
    427        ret
    428 endfunc
    429 
    430 function ff_h264_h_loop_filter_chroma_neon, export=1
               // Chroma filter across a boundary between columns over 8 rows.
               // In: x0 = address of the first byte right of the edge in row 0,
               //     x1 = byte offset between rows, w2 = alpha, w3 = beta,
               //     x4 = address of four tc0 bytes.
               // Loads 4 bytes (x0 - 2 .. x0 + 1) from each of 8 rows and writes
               // the same 4 bytes of every row back.
               // The label h_loop_filter_chroma420 is a second entry point used
               // by ff_h264_h_loop_filter_chroma422_neon; it expects x0 to be
               // already moved 2 bytes left and v24.s[0] to hold the tc0 word.
               // transpose_4x8B is not defined in this part of the file
               // (presumably neon.S).
    431        h264_loop_filter_start
    432 
    433        sub             x0,  x0,  #2
    434 h_loop_filter_chroma420:
    435        ld1             {v18.s}[0], [x0], x1
    436        ld1             {v16.s}[0], [x0], x1
    437        ld1             {v0.s}[0],  [x0], x1
    438        ld1             {v2.s}[0],  [x0], x1
    439        ld1             {v18.s}[1], [x0], x1
    440        ld1             {v16.s}[1], [x0], x1
    441        ld1             {v0.s}[1],  [x0], x1
    442        ld1             {v2.s}[1],  [x0], x1
    443 
    444        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
    445 
    446        h264_loop_filter_chroma
    447 
    448        transpose_4x8B  v18, v16, v0, v2, v28, v29, v30, v31
    449 
    450        sub             x0,  x0,  x1, lsl #3    // rewind the 8 rows that were loaded
    451        st1             {v18.s}[0], [x0], x1
    452        st1             {v16.s}[0], [x0], x1
    453        st1             {v0.s}[0],  [x0], x1
    454        st1             {v2.s}[0],  [x0], x1
    455        st1             {v18.s}[1], [x0], x1
    456        st1             {v16.s}[1], [x0], x1
    457        st1             {v0.s}[1],  [x0], x1
    458        st1             {v2.s}[1],  [x0], x1
    459 9:
    460        ret
    461 endfunc
    462 
    463 function ff_h264_h_loop_filter_chroma422_neon, export=1
               // Filters 16 rows by running the 8-row body at
               // h_loop_filter_chroma420 twice with the row offset doubled:
               // first on rows 0, 2, 4, ... and then on rows 1, 3, 5, ...
               // In: same registers as ff_h264_h_loop_filter_chroma_neon.
               // x7 keeps the return address across the bl; x5 keeps the start
               // of the second pass.  The called body uses x8 as scratch and
               // touches neither of them.
    464        h264_loop_filter_start
    465        add             x5,  x0,  x1            // x5 = row 1
    466        sub             x0,  x0,  #2
    467        add             x1,  x1,  x1            // step two rows at a time
    468        mov             x7,  x30                // bl overwrites x30
    469        bl              h_loop_filter_chroma420
    470        mov             x30, x7
    471        sub             x0,  x5,  #2
    472        mov             v24.s[0], w6            // reload tc0: the filter macro modified v24
    473        b               h_loop_filter_chroma420 // tail call; its ret returns to our caller
    474 endfunc
    475 
    476 .macro h264_loop_filter_chroma_intra
               // Intra chroma edge filter on 8 byte lanes.
               // In:  v18 = p1, v16 = p0, v17 = q0, v19 = q1,
               //      v30 = alpha, v31 = beta (set by h264_loop_filter_start_intra).
               // Out: v16 = new p0, v17 = new q0 in the lanes that pass the tests.
               // Branches to the local label 9: of the enclosing function when
               // no lane passes.
               // Clobbers: x2, v4, v6, v20, v22, v24-v28.
    477        uabd            v26.8b,  v16.8b,  v17.8b  // abs(p0 - q0)
    478        uabd            v27.8b,  v18.8b,  v16.8b  // abs(p1 - p0)
    479        uabd            v28.8b,  v19.8b,  v17.8b  // abs(q1 - q0)
    480        cmhi            v26.8b,  v30.8b,  v26.8b  // < alpha
    481        cmhi            v27.8b,  v31.8b,  v27.8b  // < beta
    482        cmhi            v28.8b,  v31.8b,  v28.8b  // < beta
    483        and             v26.8b,  v26.8b,  v27.8b
    484        and             v26.8b,  v26.8b,  v28.8b  // v26 = filter mask
    485        mov             x2, v26.d[0]
    486 
    487        ushll           v4.8h,   v18.8b,  #1      // 2*p1
    488        ushll           v6.8h,   v19.8b,  #1      // 2*q1
    489        cbz             x2, 9f                    // nothing to filter in any lane
    490        uaddl           v20.8h,  v16.8b,  v19.8b  // p0 + q1
    491        uaddl           v22.8h,  v17.8b,  v18.8b  // q0 + p1
    492        add             v20.8h,  v20.8h,  v4.8h
    493        add             v22.8h,  v22.8h,  v6.8h
    494        uqrshrn         v24.8b,  v20.8h,  #2      // (2*p1 + p0 + q1 + 2) >> 2
    495        uqrshrn         v25.8b,  v22.8h,  #2      // (2*q1 + q0 + p1 + 2) >> 2
    496        bit             v16.8b,  v24.8b,  v26.8b  // replace p0 where the mask is set
    497        bit             v17.8b,  v25.8b,  v26.8b  // replace q0 where the mask is set
    498 .endm
    499 
    500 function ff_h264_v_loop_filter_chroma_intra_neon, export=1
               // Intra chroma filter across a boundary between rows, 8 bytes
               // per row.
               // In: x0 = address of the q0 row, x1 = byte offset between rows,
               //     w2 = alpha, w3 = beta.
               // Loads rows x0 - 2*x1 .. x0 + x1 and rewrites rows x0 - x1 and x0.
    501        h264_loop_filter_start_intra
    502 
    503        sub             x0,  x0,  x1, lsl #1
    504        ld1             {v18.8b}, [x0], x1      // p1
    505        ld1             {v16.8b}, [x0], x1      // p0
    506        ld1             {v17.8b}, [x0], x1      // q0
    507        ld1             {v19.8b}, [x0]          // q1
    508 
    509        h264_loop_filter_chroma_intra
    510 
    511        sub             x0,  x0,  x1, lsl #1    // from the q1 row back to the p0 row
    512        st1             {v16.8b}, [x0], x1      // new p0
    513        st1             {v17.8b}, [x0], x1      // new q0
    514 
    515 9:
    516        ret
    517 endfunc
    518 
    519 function ff_h264_h_loop_filter_chroma_mbaff_intra_neon, export=1
               // Intra chroma filter across a boundary between columns over
               // 4 rows.
               // In: x0 = address of the first byte right of the edge in row 0,
               //     x1 = byte offset between rows, w2 = alpha, w3 = beta.
               // x4 walks the rows for loading (starting at x0 - 2); x0 walks
               // them for storing (starting at x0 - 1).  Each st2 writes two
               // bytes, one lane of v16 and one lane of v17, to a single row.
               // transpose_4x8B is not defined in this part of the file
               // (presumably neon.S).
    520        h264_loop_filter_start_intra
    521 
    522        sub             x4,  x0,  #2
    523        sub             x0,  x0,  #1
    524        ld1             {v18.8b}, [x4], x1
    525        ld1             {v16.8b}, [x4], x1
    526        ld1             {v17.8b}, [x4], x1
    527        ld1             {v19.8b}, [x4], x1
    528 
    529        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29
    530 
    531        h264_loop_filter_chroma_intra
    532 
    533        st2             {v16.b,v17.b}[0], [x0], x1
    534        st2             {v16.b,v17.b}[1], [x0], x1
    535        st2             {v16.b,v17.b}[2], [x0], x1
    536        st2             {v16.b,v17.b}[3], [x0], x1
    537 
    538 9:
    539        ret
    540 endfunc
    541 
    542 function ff_h264_h_loop_filter_chroma_intra_neon, export=1
               // Intra chroma filter across a boundary between columns over
               // 8 rows.
               // In: x0 = address of the first byte right of the edge in row 0,
               //     x1 = byte offset between rows, w2 = alpha, w3 = beta.
               // x4 walks the rows for loading (starting at x0 - 2); x0 walks
               // them for storing (starting at x0 - 1).  Each st2 writes two
               // bytes, one lane of v16 and one lane of v17, to a single row.
               // The label h_loop_filter_chroma420_intra is a second entry
               // point used by ff_h264_h_loop_filter_chroma422_intra_neon; it
               // expects x4, x0, v30 and v31 to be set up already.
               // transpose_4x8B is not defined in this part of the file
               // (presumably neon.S).
    543        h264_loop_filter_start_intra
    544 
    545        sub             x4,  x0,  #2
    546        sub             x0,  x0,  #1
    547 h_loop_filter_chroma420_intra:
    548        ld1             {v18.8b}, [x4], x1
    549        ld1             {v16.8b}, [x4], x1
    550        ld1             {v17.8b}, [x4], x1
    551        ld1             {v19.8b}, [x4], x1
    552        ld1             {v18.s}[1], [x4], x1    // rows 4-7 replace the upper 4 bytes
    553        ld1             {v16.s}[1], [x4], x1
    554        ld1             {v17.s}[1], [x4], x1
    555        ld1             {v19.s}[1], [x4], x1
    556 
    557        transpose_4x8B  v18, v16, v17, v19, v26, v27, v28, v29
    558 
    559        h264_loop_filter_chroma_intra
    560 
    561        st2             {v16.b,v17.b}[0], [x0], x1
    562        st2             {v16.b,v17.b}[1], [x0], x1
    563        st2             {v16.b,v17.b}[2], [x0], x1
    564        st2             {v16.b,v17.b}[3], [x0], x1
    565        st2             {v16.b,v17.b}[4], [x0], x1
    566        st2             {v16.b,v17.b}[5], [x0], x1
    567        st2             {v16.b,v17.b}[6], [x0], x1
    568        st2             {v16.b,v17.b}[7], [x0], x1
    569 
    570 9:
    571        ret
    572 endfunc
    573 
    574 function ff_h264_h_loop_filter_chroma422_intra_neon, export=1
               // Filters 16 rows by running the 8-row body at
               // h_loop_filter_chroma420_intra twice: rows 0-7, then rows 8-15.
               // In: same registers as ff_h264_h_loop_filter_chroma_intra_neon.
               // x7 keeps the return address across the bl; x5 keeps the
               // address of row 8.  x4 needs no reset for the second pass
               // because the eight post-incremented loads of the first pass
               // leave it at row 8.
    575        h264_loop_filter_start_intra
    576        sub             x4,  x0,  #2
    577        add             x5,  x0,  x1, lsl #3    // x5 = row 8
    578        sub             x0,  x0,  #1
    579        mov             x7,  x30                // bl overwrites x30
    580        bl              h_loop_filter_chroma420_intra
    581        sub             x0,  x5,  #1
    582        mov             x30, x7
    583        b               h_loop_filter_chroma420_intra   // tail call; its ret returns to our caller
    584 endfunc
    585 
    586 .macro  biweight_16     macs, macd
               // Loop body for a 16-byte-wide bi-weighted sum, two rows per
               // iteration.  Ends with `ret`, so it finishes the enclosing
               // function.
               // Macro arguments: macd is the multiply-accumulate instruction
               // applied to rows read from x0 (weight byte from w5); macs is
               // the one applied to rows read from x1 (weight byte from w6).
               // In (set up by biweight_func): x0, x1 = the two source rows,
               //     x2 = byte offset between rows, w3 = rows left,
               //     x7 = destination row, v16 = initial accumulator value,
               //     v18 = per-lane shift counts for sshl.
               // Clobbers: v0, v1, v4, v6, v20, v22, v24, v26, v28, v30.
    587        dup             v0.16b,  w5
    588        dup             v1.16b,  w6
    589        mov             v4.16b,  v16.16b
    590        mov             v6.16b,  v16.16b
    591 1:      subs            w3,  w3,  #2
    592        ld1             {v20.16b}, [x0], x2
    593        \macd           v4.8h,   v0.8b,  v20.8b
    594        \macd\()2       v6.8H,   v0.16B, v20.16B
    595        ld1             {v22.16b}, [x1], x2
    596        \macs           v4.8h,   v1.8b,  v22.8b
    597        \macs\()2       v6.8H,   v1.16B, v22.16B
    598        mov             v24.16b, v16.16b        // fresh accumulators for the second row
    599        ld1             {v28.16b}, [x0], x2
    600        mov             v26.16b, v16.16b
    601        \macd           v24.8h,  v0.8b,  v28.8b
    602        \macd\()2       v26.8H,  v0.16B, v28.16B
    603        ld1             {v30.16b}, [x1], x2
    604        \macs           v24.8h,  v1.8b,  v30.8b
    605        \macs\()2       v26.8H,  v1.16B, v30.16B
    606        sshl            v4.8h,   v4.8h,  v18.8h // negative counts in v18 shift right
    607        sshl            v6.8h,   v6.8h,  v18.8h
    608        sqxtun          v4.8b,   v4.8h          // saturate to unsigned bytes
    609        sqxtun2         v4.16b,  v6.8h
    610        sshl            v24.8h,  v24.8h, v18.8h
    611        sshl            v26.8h,  v26.8h, v18.8h
    612        sqxtun          v24.8b,  v24.8h
    613        sqxtun2         v24.16b, v26.8h
    614        mov             v6.16b,  v16.16b        // reset accumulators for the next iteration
    615        st1             {v4.16b},  [x7], x2
    616        mov             v4.16b,  v16.16b
    617        st1             {v24.16b}, [x7], x2
    618        b.ne            1b                      // flags still come from the subs at 1:
    619        ret
    620 .endm
    621 
    622 .macro  biweight_8      macs, macd
               // Loop body for an 8-byte-wide bi-weighted sum, two rows per
               // iteration.  Ends with `ret`, so it finishes the enclosing
               // function.
               // Macro arguments: macd is the multiply-accumulate instruction
               // applied to rows read from x0 (weight byte from w5); macs is
               // the one applied to rows read from x1 (weight byte from w6).
               // In (set up by biweight_func): x0, x1 = the two source rows,
               //     x2 = byte offset between rows, w3 = rows left,
               //     x7 = destination row, v16 = initial accumulator value,
               //     v18 = per-lane shift counts for sshl.
               // Clobbers: v0, v1, v2, v4-v7, v20.
    623        dup             v0.8b,  w5
    624        dup             v1.8b,  w6
    625        mov             v2.16b,  v16.16b
    626        mov             v20.16b, v16.16b
    627 1:      subs            w3,  w3,  #2
    628        ld1             {v4.8b}, [x0], x2
    629        \macd           v2.8h,  v0.8b,  v4.8b
    630        ld1             {v5.8b}, [x1], x2
    631        \macs           v2.8h,  v1.8b,  v5.8b
    632        ld1             {v6.8b}, [x0], x2
    633        \macd           v20.8h, v0.8b,  v6.8b
    634        ld1             {v7.8b}, [x1], x2
    635        \macs           v20.8h, v1.8b,  v7.8b
    636        sshl            v2.8h,  v2.8h,  v18.8h  // negative counts in v18 shift right
    637        sqxtun          v2.8b,  v2.8h           // saturate to unsigned bytes
    638        sshl            v20.8h, v20.8h, v18.8h
    639        sqxtun          v4.8b,  v20.8h
    640        mov             v20.16b, v16.16b        // reset accumulators for the next iteration
    641        st1             {v2.8b}, [x7], x2
    642        mov             v2.16b,  v16.16b
    643        st1             {v4.8b}, [x7], x2
    644        b.ne            1b                      // flags still come from the subs at 1:
    645        ret
    646 .endm
    647 
    648 .macro  biweight_4      macs, macd
               // Loop body for a 4-byte-wide bi-weighted sum.  Each vector
               // holds two 4-byte rows; a full iteration handles four rows.
               // When the row counter goes negative after the first pair, the
               // code at 2: stores just that pair and returns.  Every path
               // ends with `ret`, so the macro finishes the enclosing function.
               // Macro arguments: macd is the multiply-accumulate instruction
               // applied to rows read from x0 (weight byte from w5); macs is
               // the one applied to rows read from x1 (weight byte from w6).
               // In (set up by biweight_func): x0, x1 = the two source rows,
               //     x2 = byte offset between rows, w3 = rows left,
               //     x7 = destination row, v16 = initial accumulator value,
               //     v18 = per-lane shift counts for sshl.
               // Clobbers: v0, v1, v2, v4-v7, v20.
    649        dup             v0.8b,  w5
    650        dup             v1.8b,  w6
    651        mov             v2.16b, v16.16b
    652        mov             v20.16b,v16.16b
    653 1:      subs            w3,  w3,  #4
    654        ld1             {v4.s}[0], [x0], x2
    655        ld1             {v4.s}[1], [x0], x2
    656        \macd           v2.8h,  v0.8b,  v4.8b
    657        ld1             {v5.s}[0], [x1], x2
    658        ld1             {v5.s}[1], [x1], x2
    659        \macs           v2.8h,  v1.8b,  v5.8b
    660        b.lt            2f                      // fewer than 4 rows were left: finish with 2
    661        ld1             {v6.s}[0], [x0], x2
    662        ld1             {v6.s}[1], [x0], x2
    663        \macd           v20.8h, v0.8b,  v6.8b
    664        ld1             {v7.s}[0], [x1], x2
    665        ld1             {v7.s}[1], [x1], x2
    666        \macs           v20.8h, v1.8b,  v7.8b
    667        sshl            v2.8h,  v2.8h,  v18.8h  // negative counts in v18 shift right
    668        sqxtun          v2.8b,  v2.8h           // saturate to unsigned bytes
    669        sshl            v20.8h, v20.8h, v18.8h
    670        sqxtun          v4.8b,  v20.8h
    671        mov             v20.16b, v16.16b        // reset accumulators for the next iteration
    672        st1             {v2.s}[0], [x7], x2
    673        st1             {v2.s}[1], [x7], x2
    674        mov             v2.16b,  v16.16b
    675        st1             {v4.s}[0], [x7], x2
    676        st1             {v4.s}[1], [x7], x2
    677        b.ne            1b                      // flags still come from the subs at 1:
    678        ret
    679 2:      sshl            v2.8h,  v2.8h,  v18.8h
    680        sqxtun          v2.8b,  v2.8h
    681        st1             {v2.s}[0], [x7], x2
    682        st1             {v2.s}[1], [x7], x2
    683        ret
    684 .endm
    685 
    686 .macro  biweight_func   w
               // Defines the exported function ff_biweight_h264_pixels_<w>_neon.
               // In: x0 = first source and also the destination (copied to x7),
               //     x1 = second source, x2 = byte offset between rows,
               //     w3 = row count, w4 = shift amount, w5 / w6 = weights
               //     applied to the x0 / x1 rows, w7 = offset.
               // (Roles are read off the register use here and in biweight_<w>;
               //  presumably height, log2_denom, weightd, weights and offset of
               //  the C prototype - confirm.)
               // The multiply-accumulate instructions are unsigned, so the
               // signs of the weights pick one of four variants of the loop
               // body, and negative weights are negated before use.  Each
               // biweight_<w> expansion ends in ret, so the four variants do
               // not fall through into one another.
    687 function ff_biweight_h264_pixels_\w\()_neon, export=1
    688        lsr             w8,  w5,  #31           // sign bit of w5
    689        add             w7,  w7,  #1
    690        eor             w8,  w8,  w6,  lsr #30  // xor with the top two bits of w6
    691        orr             w7,  w7,  #1
    692        dup             v18.8h,   w4
    693        lsl             w7,  w7,  w4            // ((w7 + 1) | 1) << w4
    694        not             v18.16b,  v18.16b       // ~w4 = -(w4 + 1): sshl then shifts right by w4 + 1
    695        dup             v16.8h,   w7            // initial accumulator value
    696        mov             x7,  x0                 // results are written over the x0 rows
    697        cbz             w8,  10f
    698        subs            w8,  w8,  #1
    699        b.eq            20f
    700        subs            w8,  w8,  #1
    701        b.eq            30f
    702        b               40f
    703 10:     biweight_\w     umlal, umlal
    704 20:     neg             w5, w5
    705        biweight_\w     umlal, umlsl
    706 30:     neg             w5, w5
    707        neg             w6, w6
    708        biweight_\w     umlsl, umlsl
    709 40:     neg             w6, w6
    710        biweight_\w     umlsl, umlal
    711 endfunc
    712 .endm
    713 
    714        biweight_func   16                          // defines ff_biweight_h264_pixels_16_neon
    715        biweight_func   8                           // defines ff_biweight_h264_pixels_8_neon
    716        biweight_func   4                           // defines ff_biweight_h264_pixels_4_neon
    717 
    718 .macro  weight_16       add                         // scale 16-byte rows by w4, two rows per pass. The add argument is the op that combines v16 with each product
    719        dup             v0.16b,  w4                 // v0 = low byte of w4 in every lane
    720 1:      subs            w2,  w2,  #2                // w2 -= 2. Flags stay untouched until the b.ne at the bottom
    721        ld1             {v20.16b}, [x0], x1         // row 0 (16 bytes), then x0 += x1
    722        umull           v4.8h,   v0.8b,  v20.8b     // low 8 bytes times w4, widened to 16 bit
    723        umull2          v6.8h,   v0.16b, v20.16b    // high 8 bytes times w4
    724        ld1             {v28.16b}, [x0], x1         // row 1
    725        umull           v24.8h,  v0.8b,  v28.8b
    726        umull2          v26.8h,  v0.16b, v28.16b
    727        \add            v4.8h,   v16.8h, v4.8h      // v4 = v16 (op) product, with v16 set up by weight_func
    728        srshl           v4.8h,   v4.8h,  v18.8h     // rounding shift by the signed count in v18 (a negative count shifts right)
    729        \add            v6.8h,   v16.8h, v6.8h
    730        srshl           v6.8h,   v6.8h,  v18.8h
    731        sqxtun          v4.8b,   v4.8h              // saturate to unsigned 8 bit, low half of row 0
    732        sqxtun2         v4.16b,  v6.8h              // high half of row 0
    733        \add            v24.8h,  v16.8h, v24.8h     // same sequence for row 1
    734        srshl           v24.8h,  v24.8h, v18.8h
    735        \add            v26.8h,  v16.8h, v26.8h
    736        srshl           v26.8h,  v26.8h, v18.8h
    737        sqxtun          v24.8b,  v24.8h
    738        sqxtun2         v24.16b, v26.8h
    739        st1             {v4.16b},  [x5], x1         // store row 0 via x5, then x5 += x1
    740        st1             {v24.16b}, [x5], x1         // store row 1
    741        b.ne            1b                          // loop while w2 != 0
    742        ret
    743 .endm
    744 
    745 .macro  weight_8        add                         // scale 8-byte rows by w4, two rows per pass. The add argument is the op that combines v16 with each product
    746        dup             v0.8b,  w4                  // v0 = low byte of w4 in every lane
    747 1:      subs            w2,  w2,  #2                // w2 -= 2. Flags stay untouched until the b.ne at the bottom
    748        ld1             {v4.8b}, [x0], x1           // row 0 (8 bytes), then x0 += x1
    749        umull           v2.8h,  v0.8b,  v4.8b       // row 0 times w4, widened to 16 bit
    750        ld1             {v6.8b}, [x0], x1           // row 1
    751        umull           v20.8h, v0.8b,  v6.8b
    752        \add            v2.8h,  v16.8h,  v2.8h      // v2 = v16 (op) product
    753        srshl           v2.8h,  v2.8h,  v18.8h      // rounding shift by the signed count in v18 (a negative count shifts right)
    754        sqxtun          v2.8b,  v2.8h               // saturate to unsigned 8 bit
    755        \add            v20.8h, v16.8h,  v20.8h     // same sequence for row 1
    756        srshl           v20.8h, v20.8h, v18.8h
    757        sqxtun          v4.8b,  v20.8h              // row 1 result reuses v4
    758        st1             {v2.8b}, [x5], x1           // store row 0 via x5, then x5 += x1
    759        st1             {v4.8b}, [x5], x1           // store row 1
    760        b.ne            1b                          // loop while w2 != 0
    761        ret
    762 .endm
    763 
    764 .macro  weight_4        add                         // scale 4-byte rows by w4, four rows per pass with a two-row tail. The add argument is the op that combines v16 with each product
    765        dup             v0.8b,  w4                  // v0 = low byte of w4 in every lane
    766 1:      subs            w2,  w2,  #4                // w2 -= 4. Flags stay untouched until b.lt / b.ne
    767        ld1             {v4.s}[0], [x0], x1         // row 0 (4 bytes) into lane 0, then x0 += x1
    768        ld1             {v4.s}[1], [x0], x1         // row 1 into lane 1
    769        umull           v2.8h,  v0.8b,  v4.8b       // rows 0-1 times w4, widened to 16 bit
    770        b.lt            2f                          // w2 was below 4 before the subs: finish with rows 0-1 only
    771        ld1             {v6.s}[0], [x0], x1         // rows 2-3
    772        ld1             {v6.s}[1], [x0], x1
    773        umull           v20.8h, v0.8b,  v6.8b
    774        \add            v2.8h,  v16.8h,  v2.8h      // v2 = v16 (op) product
    775        srshl           v2.8h,  v2.8h,  v18.8h      // rounding shift by the signed count in v18 (a negative count shifts right)
    776        sqxtun          v2.8b,  v2.8h               // saturate to unsigned 8 bit
    777        \add            v20.8h, v16.8h,  v20.8h     // same sequence for rows 2-3
    778        srshl           v20.8h, v20.8h, v18.8h
    779        sqxtun          v4.8b,  v20.8h              // rows 2-3 result reuses v4
    780        st1             {v2.s}[0], [x5], x1         // store rows 0-3 via x5, stepping by x1
    781        st1             {v2.s}[1], [x5], x1
    782        st1             {v4.s}[0], [x5], x1
    783        st1             {v4.s}[1], [x5], x1
    784        b.ne            1b                          // loop while w2 != 0
    785        ret
    786 2:      \add            v2.8h,  v16.8h,  v2.8h      // two-row tail: combine, shift, narrow and store rows 0-1
    787        srshl           v2.8h,  v2.8h,  v18.8h
    788        sqxtun          v2.8b,  v2.8h
    789        st1             {v2.s}[0], [x5], x1
    790        st1             {v2.s}[1], [x5], x1
    791        ret
    792 .endm
    793 
    794 .macro  weight_func     w                           // emits ff_weight_h264_pixels_<w>_neon, which dispatches to one of four copies of weight_<w>
    795 function ff_weight_h264_pixels_\w\()_neon, export=1 // in (roles as used by the weight_<w> macros): x0 = rows, x1 = row step, w2 = row count, w3 = shift, w4 = multiplier, w5 = additive term
    796        cmp             w3,  #1                     // flags consumed by the b.le below (nothing in between sets flags)
    797        mov             w6,  #1
    798        lsl             w5,  w5,  w3                // pre-scale the additive term by the shift amount
    799        dup             v16.8h,  w5                 // v16 = additive term in every 16-bit lane
    800        mov             x5,  x0                     // x5 = store pointer, initially equal to the x0 load pointer
    801        b.le            20f                         // w3 <= 1: use the plain add/sub variants
    802        sub             w6,  w6,  w3                // w6 = 1 - w3
    803        dup             v18.8h,  w6                 // srshl count 1 - w3: right shift by w3 - 1, the halving op supplies the remaining bit
    804        cmp             w4, #0
    805        b.lt            10f                         // negative multiplier
    806        weight_\w       shadd                       // (v16 + product) / 2. Every weight_<w> macro ends in ret, so no fall-through
    807 10:     neg             w4,  w4                     // use the magnitude of w4 and subtract instead
    808        weight_\w       shsub                       // (v16 - product) / 2
    809 20:     neg             w6,  w3                     // w6 = -w3
    810        dup             v18.8h,  w6                 // srshl count -w3: rounding right shift by w3
    811        cmp             w4,  #0
    812        b.lt            10f                         // negative multiplier: goes to the second 10 label below
    813        weight_\w       add                         // v16 + product
    814 10:     neg             w4,  w4                     // use the magnitude of w4 and subtract instead
    815        weight_\w       sub                         // v16 - product
    816 endfunc
    817 .endm
    818 
    819        weight_func     16                          // defines ff_weight_h264_pixels_16_neon
    820        weight_func     8                           // defines ff_weight_h264_pixels_8_neon
    821        weight_func     4                           // defines ff_weight_h264_pixels_4_neon
    822 
    823 .macro  h264_loop_filter_start_10                   // common entry: early-return checks, threshold scaling by 4, and loading of the 32-bit word at [x4] into w6 and v24.s[0]
    824        cmp             w2,  #0
    825        ldr             w6,  [x4]                   // w6 = four packed bytes (used as tc0 by the chroma macro)
    826        ccmp            w3,  #0,  #0,  ne           // if w2 != 0 compare w3 with 0, else force NZCV = 0. Z ends up set only when w2 != 0 and w3 == 0
    827        lsl             w2,  w2,  #2                // w2 *= 4 (presumably the 8-bit to 10-bit threshold scaling, confirm against caller)
    828        mov             v24.s[0], w6
    829        lsl             w3,  w3,  #2                // w3 *= 4
    830        and             w8,  w6,  w6,  lsl #16      // plain and: the flags from the ccmp are preserved
    831        b.eq            1f                          // return early when w2 != 0 and w3 == 0. NOTE(review): w2 == 0 does not take this exit
    832        ands            w8,  w8,  w8,  lsl #8       // bit 31 of w8 = AND of the top bits of all four bytes of w6
    833        b.ge            2f                          // N clear: at least one byte has its top bit clear, so continue
    834 1:
    835        ret                                         // returns from the function that expanded this macro
    836 2:
    837 .endm
    838 
    839 .macro h264_loop_filter_start_intra_10              // intra entry: return if both thresholds are zero, else scale them by 4 and broadcast into v30 / v31
    840        orr             w4,  w2,  w3                // w4 = w2 | w3 (overwrites w4)
    841        cbnz            w4,  1f                     // continue if either threshold is non-zero
    842        ret                                         // returns from the function that expanded this macro
    843 1:
    844        lsl             w2,  w2,  #2                // w2 *= 4 (presumably the 8-bit to 10-bit threshold scaling, confirm against caller)
    845        lsl             w3,  w3,  #2                // w3 *= 4
    846        dup             v30.8h,   w2              // alpha
    847        dup             v31.8h,   w3              // beta
    848 .endm
    849 
    850 .macro  h264_loop_filter_chroma_10                  // in: v18 = p1, v16 = p0, v0 = q0, v2 = q1, w2 / w3 = scaled alpha / beta, v24 = packed tc0 bytes. out: v16, v0. Branches to a 9 label in the expanding function when no lane needs filtering
    851        dup             v22.8h,  w2               // alpha
    852        dup             v23.8h,  w3               // beta
    853        uxtl            v24.8h,  v24.8b           // tc0
    854 
    855        uabd            v26.8h,  v16.8h,  v0.8h   // abs(p0 - q0)
    856        uabd            v28.8h,  v18.8h,  v16.8h  // abs(p1 - p0)
    857        uabd            v30.8h,  v2.8h,   v0.8h   // abs(q1 - q0)
    858        cmhi            v26.8h,  v22.8h,  v26.8h  // < alpha
    859        cmhi            v28.8h,  v23.8h,  v28.8h  // < beta
    860        cmhi            v30.8h,  v23.8h,  v30.8h  // < beta
    861 
    862        and             v26.16b, v26.16b, v28.16b   // v26 accumulates the per-lane filter mask
    863        mov             v4.16b,  v0.16b
    864        sub             v4.8h,   v4.8h,   v16.8h    // v4 = q0 - p0
    865        and             v26.16b, v26.16b, v30.16b   // mask = all three threshold tests passed
    866        shl             v4.8h,   v4.8h,   #2        // v4 = (q0 - p0) * 4
    867        mov             x8, v26.d[0]                // move the mask to x8 / x9 for the all-zero test
    868        mov             x9, v26.d[1]
    869        sli             v24.8h,  v24.8h,  #8        // copy each tc0 byte into the high byte of its 16-bit lane
    870        uxtl            v24.8h,  v24.8b             // widen again: every tc0 value now fills two adjacent lanes
    871        add             v4.8h,   v4.8h,   v18.8h    // v4 += p1
    872        adds            x8,  x8,  x9                // Z set when both mask halves are zero
    873        shl             v24.8h,  v24.8h,  #2        // tc0 * 4
    874 
    875        b.eq            9f                          // no lane passed the tests: skip the filter and the caller's stores
    876 
    877        movi            v31.8h, #3                // the uqsub below yields ((tc0 - 1) << (BIT_DEPTH - 8)) + 1, BIT_DEPTH = 10
    878        uqsub           v24.8h,  v24.8h,  v31.8h    // tc = tc0 * 4 - 3, saturating at 0
    879        sub             v4.8h,   v4.8h,   v2.8h     // v4 -= q1
    880        srshr           v4.8h,   v4.8h,   #3        // delta = ((q0 - p0) * 4 + p1 - q1 + 4) >> 3
    881        smin            v4.8h,   v4.8h,   v24.8h    // clamp delta to at most tc
    882        neg             v25.8h,  v24.8h
    883        smax            v4.8h,   v4.8h,   v25.8h    // clamp delta to at least -tc
    884        and             v4.16b,  v4.16b,  v26.16b   // zero delta in lanes that failed the tests
    885        add             v16.8h,  v16.8h,  v4.8h     // p0 += delta
    886        sub             v0.8h,   v0.8h,   v4.8h     // q0 -= delta
    887 
    888        mvni            v4.8h,   #0xFC, lsl #8    // 1023 for clipping
    889        movi            v5.8h,   #0
    890        smin            v0.8h,   v0.8h,   v4.8h     // clip q0 and p0 to the range 0..1023
    891        smin            v16.8h,  v16.8h,  v4.8h
    892        smax            v0.8h,   v0.8h,   v5.8h
    893        smax            v16.8h,  v16.8h,  v5.8h
    894 .endm
    895 
    896 function ff_h264_v_loop_filter_chroma_neon_10, export=1 // filter across a horizontal edge. in: x0 = q0 row, x1 = row stride in bytes, w2 / w3 = alpha / beta, x4 = pointer to the packed tc0 word
    897        h264_loop_filter_start_10
    898 
    899        mov             x10,  x0                    // x10 walks the rows at and below the edge (q0, q1)
    900        sub             x0,  x0,  x1, lsl #1        // x0 walks the two rows above the edge (p1, p0)
    901        ld1             {v18.8h}, [x0 ], x1         // p1
    902        ld1             {v0.8h},  [x10], x1         // q0
    903        ld1             {v16.8h}, [x0 ], x1         // p0
    904        ld1             {v2.8h},  [x10]             // q1, no post-increment so x10 stays on the q1 row
    905 
    906        h264_loop_filter_chroma_10
    907 
    908        sub             x0,  x10,  x1, lsl #1       // back to the p0 row (q1 row minus two strides)
    909        st1             {v16.8h}, [x0], x1          // store filtered p0
    910        st1             {v0.8h},  [x0], x1          // store filtered q0
    911 9:
    912        ret
    913 endfunc
    914 
    915 function ff_h264_h_loop_filter_chroma_neon_10, export=1 // filter across a vertical edge over 8 rows. in: x0 = q0 of the first row, x1 = row stride in bytes, w2 / w3 = alpha / beta, x4 = pointer to the packed tc0 word
    916        h264_loop_filter_start_10
    917 
    918        sub             x0,  x0,  #4 // access the 2nd left pixel
    919 h_loop_filter_chroma420_10:                         // internal entry, also reached from ff_h264_h_loop_filter_chroma422_neon_10 with x0, x1, w2, w3 and v24 already set up
    920        add             x10,  x0,  x1,  lsl #2      // x10 = row 4, so rows 0-3 and 4-7 load in parallel
    921        ld1             {v18.d}[0], [x0 ], x1       // each load is 4 pixels (8 bytes) of one row: p1 p0 q0 q1
    922        ld1             {v18.d}[1], [x10], x1
    923        ld1             {v16.d}[0], [x0 ], x1
    924        ld1             {v16.d}[1], [x10], x1
    925        ld1             {v0.d}[0],  [x0 ], x1
    926        ld1             {v0.d}[1],  [x10], x1
    927        ld1             {v2.d}[0],  [x0 ], x1
    928        ld1             {v2.d}[1],  [x10], x1
    929 
    930        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31 // macro from neon.S (not shown here). Presumably leaves one pixel column per register, v28-v31 as scratch
    931 
    932        h264_loop_filter_chroma_10
    933 
    934        transpose_4x8H  v18, v16, v0, v2, v28, v29, v30, v31 // presumably back to the row layout used by the loads
    935 
    936        sub             x0,  x10,  x1, lsl #3       // x10 ended 8 rows past the first row, so this is row 0 again
    937        st1             {v18.d}[0], [x0], x1        // write all four pixels of rows 0-3 back
    938        st1             {v16.d}[0], [x0], x1
    939        st1             {v0.d}[0],  [x0], x1
    940        st1             {v2.d}[0],  [x0], x1
    941        st1             {v18.d}[1], [x0], x1        // rows 4-7
    942        st1             {v16.d}[1], [x0], x1
    943        st1             {v0.d}[1],  [x0], x1
    944        st1             {v2.d}[1],  [x0], x1
    945 9:
    946        ret
    947 endfunc
    948 
    949 function ff_h264_h_loop_filter_chroma422_neon_10, export=1 // vertical-edge filter over 16 rows: two passes of the 8-row body, each on every other row
    950        h264_loop_filter_start_10
    951        add             x5,  x0,  x1                // x5 = second row, start of the second pass
    952        sub             x0,  x0,  #4                // two pixels (4 bytes) left of the edge
    953        add             x1,  x1,  x1                // double the stride so one pass covers every other row
    954        mov             x7,  x30                    // keep the return address in x7 (the callee body does not write x7) instead of using the stack
    955        bl              h_loop_filter_chroma420_10  // first pass, starting at the first row
    956        mov             x30, x7
    957        sub             x0,  x5,  #4                // second pass starts at the second row
    958        mov             v24.s[0], w6                // reload the tc0 word from w6: the first pass overwrote v24
    959        b               h_loop_filter_chroma420_10  // tail branch: the callee ret returns to our caller
    960 endfunc
    961 
    962 .macro h264_loop_filter_chroma_intra_10             // in: v18 = p1, v16 = p0, v17 = q0, v19 = q1, v30 = alpha, v31 = beta. out: v16, v17. Branches to a 9 label in the expanding function when no lane needs filtering
    963        uabd            v26.8h,  v16.8h,  v17.8h  // abs(p0 - q0)
    964        uabd            v27.8h,  v18.8h,  v16.8h  // abs(p1 - p0)
    965        uabd            v28.8h,  v19.8h,  v17.8h  // abs(q1 - q0)
    966        cmhi            v26.8h,  v30.8h,  v26.8h  // < alpha
    967        cmhi            v27.8h,  v31.8h,  v27.8h  // < beta
    968        cmhi            v28.8h,  v31.8h,  v28.8h  // < beta
    969        and             v26.16b, v26.16b, v27.16b
    970        and             v26.16b, v26.16b, v28.16b   // v26 = mask of lanes that passed all three tests
    971        mov             x2, v26.d[0]                // move the mask to x2 / x3 for the all-zero test (overwrites the scaled thresholds, already copied to v30 / v31)
    972        mov             x3, v26.d[1]
    973 
    974        shl             v4.8h,  v18.8h,  #1         // v4 = 2 * p1
    975        shl             v6.8h,  v19.8h,  #1         // v6 = 2 * q1
    976 
    977        adds            x2,  x2,  x3                // Z set when both mask halves are zero
    978        b.eq            9f                          // no lane passed the tests: skip the filter and the caller's stores
    979 
    980        add             v20.8h,  v16.8h,  v19.8h    // p0 + q1
    981        add             v22.8h,  v17.8h,  v18.8h    // q0 + p1
    982        add             v20.8h,  v20.8h,  v4.8h     // 2 * p1 + p0 + q1
    983        add             v22.8h,  v22.8h,  v6.8h     // 2 * q1 + q0 + p1
    984        urshr           v24.8h,  v20.8h,  #2        // new p0 = (2 * p1 + p0 + q1 + 2) >> 2
    985        urshr           v25.8h,  v22.8h,  #2        // new q0 = (2 * q1 + q0 + p1 + 2) >> 2
    986        bit             v16.16b, v24.16b, v26.16b   // take the new value only where the mask is set
    987        bit             v17.16b, v25.16b, v26.16b
    988 .endm
    989 
    990 function ff_h264_v_loop_filter_chroma_intra_neon_10, export=1 // intra filter across a horizontal edge. in: x0 = q0 row, x1 = row stride in bytes, w2 / w3 = alpha / beta
    991        h264_loop_filter_start_intra_10
    992        mov             x9,  x0                     // x9 walks the rows at and below the edge (q0, q1)
    993        sub             x0,  x0,  x1, lsl #1        // x0 walks the two rows above the edge (p1, p0)
    994        ld1             {v18.8h}, [x0], x1          // p1
    995        ld1             {v17.8h}, [x9], x1          // q0
    996        ld1             {v16.8h}, [x0], x1          // p0
    997        ld1             {v19.8h}, [x9]              // q1, no post-increment so x9 stays on the q1 row
    998 
    999        h264_loop_filter_chroma_intra_10
   1000 
   1001        sub             x0,  x9,  x1, lsl #1        // back to the p0 row (q1 row minus two strides)
   1002        st1             {v16.8h}, [x0], x1          // store filtered p0
   1003        st1             {v17.8h}, [x0], x1          // store filtered q0
   1004 
   1005 9:
   1006        ret
   1007 endfunc
   1008 
   1009 function ff_h264_h_loop_filter_chroma_mbaff_intra_neon_10, export=1 // intra filter across a vertical edge over 4 rows. in: x0 = q0 of the first row, x1 = row stride in bytes, w2 / w3 = alpha / beta
   1010        h264_loop_filter_start_intra_10
   1011 
   1012        sub             x4,  x0,  #4                // x4 = load pointer, two pixels (4 bytes) left of the edge
   1013        sub             x0,  x0,  #2                // x0 = store pointer, one pixel left of the edge (p0)
   1014        add             x9,  x4,  x1, lsl #1        // x9 = row 2, so rows 0-1 and 2-3 load in parallel
   1015        ld1             {v18.8h}, [x4], x1          // row 0. NOTE(review): 8 halfwords are loaded per row but only lanes 0-3 are stored below, confirm the wider read is safe
   1016        ld1             {v17.8h}, [x9], x1          // row 2
   1017        ld1             {v16.8h}, [x4], x1          // row 1
   1018        ld1             {v19.8h}, [x9], x1          // row 3
   1019 
   1020        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29 // macro from neon.S (not shown here). Presumably leaves p1, p0, q0, q1 columns in v18, v16, v17, v19
   1021 
   1022        h264_loop_filter_chroma_intra_10
   1023 
   1024        st2             {v16.h,v17.h}[0], [x0], x1  // store lane n of v16 and v17 as an adjacent (p0, q0) pair, one row per lane
   1025        st2             {v16.h,v17.h}[1], [x0], x1
   1026        st2             {v16.h,v17.h}[2], [x0], x1
   1027        st2             {v16.h,v17.h}[3], [x0], x1
   1028 
   1029 9:
   1030        ret
   1031 endfunc
   1032 
   1033 function ff_h264_h_loop_filter_chroma_intra_neon_10, export=1 // intra filter across a vertical edge over 8 rows. in: x0 = q0 of the first row, x1 = row stride in bytes, w2 / w3 = alpha / beta
   1034        h264_loop_filter_start_intra_10
   1035        sub             x4,  x0,  #4                // x4 = load pointer, two pixels (4 bytes) left of the edge
   1036        sub             x0,  x0,  #2                // x0 = store pointer, one pixel left of the edge (p0)
   1037 h_loop_filter_chroma420_intra_10:                   // internal entry, also reached from ff_h264_h_loop_filter_chroma422_intra_neon_10 with x0, x4, x1, v30 and v31 already set up
   1038        add             x9,  x4,  x1, lsl #2        // x9 = row 4, so rows 0-3 and 4-7 load in parallel
   1039        ld1             {v18.4h},   [x4], x1        // row 0 into the low half (4 pixels)
   1040        ld1             {v18.d}[1], [x9], x1        // row 4 into the high half
   1041        ld1             {v16.4h},   [x4], x1        // rows 1 and 5
   1042        ld1             {v16.d}[1], [x9], x1
   1043        ld1             {v17.4h},   [x4], x1        // rows 2 and 6
   1044        ld1             {v17.d}[1], [x9], x1
   1045        ld1             {v19.4h},   [x4], x1        // rows 3 and 7
   1046        ld1             {v19.d}[1], [x9], x1
   1047 
   1048        transpose_4x8H  v18, v16, v17, v19, v26, v27, v28, v29 // macro from neon.S (not shown here). Presumably leaves p1, p0, q0, q1 columns in v18, v16, v17, v19
   1049 
   1050        h264_loop_filter_chroma_intra_10
   1051 
   1052        st2             {v16.h,v17.h}[0], [x0], x1  // store lane n of v16 and v17 as an adjacent (p0, q0) pair, one row per lane
   1053        st2             {v16.h,v17.h}[1], [x0], x1
   1054        st2             {v16.h,v17.h}[2], [x0], x1
   1055        st2             {v16.h,v17.h}[3], [x0], x1
   1056        st2             {v16.h,v17.h}[4], [x0], x1
   1057        st2             {v16.h,v17.h}[5], [x0], x1
   1058        st2             {v16.h,v17.h}[6], [x0], x1
   1059        st2             {v16.h,v17.h}[7], [x0], x1
   1060 
   1061 9:
   1062        ret
   1063 endfunc
   1064 
   1065 function ff_h264_h_loop_filter_chroma422_intra_neon_10, export=1 // intra vertical-edge filter over 16 rows: two passes of the 8-row body, rows 0-7 then rows 8-15
   1066        h264_loop_filter_start_intra_10
   1067        sub             x4,  x0,  #4                // x4 = load pointer, two pixels (4 bytes) left of the edge
   1068        add             x5,  x0,  x1, lsl #3        // x5 = q0 of row 8, kept for the second pass
   1069        sub             x0,  x0,  #2                // x0 = store pointer, one pixel left of the edge (p0)
   1070        mov             x7,  x30                    // keep the return address in x7 (the callee body does not write x7) instead of using the stack
   1071        bl              h_loop_filter_chroma420_intra_10 // first pass: rows 0-7
   1072        mov             x4,  x9                     // after the first pass x9 has advanced 8 rows from the original x4, which is the load pointer for row 8
   1073        sub             x0,  x5,  #2                // store pointer for row 8
   1074        mov             x30, x7
   1075        b               h_loop_filter_chroma420_intra_10 // tail branch: the callee ret returns to our caller
   1076 endfunc