tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

filmgrain16.S (75477B)


      1 /*
      2 * Copyright © 2021, VideoLAN and dav1d authors
      3 * Copyright © 2021, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 #include "src/arm/asm-offsets.h"
     31 
     32 #define GRAIN_WIDTH 82
     33 #define GRAIN_HEIGHT 73
     34 
     35 #define SUB_GRAIN_WIDTH 44
     36 #define SUB_GRAIN_HEIGHT 38
     37 
      38 .macro increment_seed steps, shift=1
         // Advance the pseudo-random state kept in r2 by \steps bits.
         // Feedback is (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1), masked to
         // \steps bits. With shift=1 the state is shifted down and refilled at
         // bit (16 - \steps); with shift=0 the fresh bits are parked at bit 16
         // and the caller shifts the state afterwards. Clobbers r11, r12, lr.
      39        lsr             r11, r2,  #3
      40        lsr             r12, r2,  #12
      41        lsr             lr,  r2,  #1
      42        eor             r11, r2,  r11                     // (r >> 0) ^ (r >> 3)
      43        eor             r12, r12, lr                      // (r >> 12) ^ (r >> 1)
      44        eor             r11, r11, r12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
      45 .if \shift
      46        lsr             r2,  r2,  #\steps
      47 .endif
      48        and             r11, r11, #((1 << \steps) - 1)    // bit
      49 .if \shift
      50        orr             r2,  r2,  r11, lsl #(16 - \steps) // *state
      51 .else
      52        orr             r2,  r2,  r11, lsl #16            // *state
      53 .endif
      54 .endm
     55 
      56 .macro read_rand dest, bits, age
         // Extract \bits random bits from the state in r2 without consuming
         // them; \age selects which previously generated group to read
         // (higher \age = bits produced earlier). r2 is left unchanged.
      57        ubfx            \dest,  r2,   #16 - \bits - \age, #\bits
      58 .endm
     59 
      60 .macro read_shift_rand dest, bits
         // Extract \bits random bits from r2 and consume one bit of state
         // (the state in r2 is shifted right by 1).
      61        ubfx            \dest,  r2,   #17 - \bits, #\bits
      62        lsr             r2,  r2,  #1
      63 .endm
     64 
      65 // special calling convention:
      66 // r2 holds seed
      67 // r3 holds dav1d_gaussian_sequence
      68 // clobbers r11-r12
      69 // returns in d0-d1
         // Produces 8 int16 samples: two increment_seed(4) steps, each yielding
         // four 11-bit indices used as lane lookups into the table at r3.
         // r5/r6 are used as scratch but preserved via the push/pop.
      70 function get_gaussian_neon
      71        push            {r5-r6,lr}
      72        increment_seed  4
      73        read_rand       r5,  11,  3
      74        read_rand       r6,  11,  2
      75        add             r5,  r3,  r5,  lsl #1
      76        add             r6,  r3,  r6,  lsl #1
      77        vld1.16         {d0[0]}, [r5]
      78        read_rand       r5,  11,  1
      79        vld1.16         {d0[1]}, [r6]
      80        add             r5,  r3,  r5,  lsl #1
      81        read_rand       r6, 11,  0
      82        increment_seed  4
      83        add             r6,  r3,  r6,  lsl #1
      84        vld1.16         {d0[2]}, [r5]
      85        read_rand       r5,  11,  3
      86        vld1.16         {d0[3]}, [r6]
      87        add             r5,  r3,  r5,  lsl #1
      88        read_rand       r6,  11,  2
      89        vld1.16         {d1[0]}, [r5]
      90        add             r6,  r3,  r6,  lsl #1
      91        read_rand       r5,  11,  1
      92        vld1.16         {d1[1]}, [r6]
      93        read_rand       r6,  11,  0
      94        add             r5,  r3,  r5,  lsl #1
      95        add             r6,  r3,  r6,  lsl #1
      96        vld1.16         {d1[2]}, [r5]
      97        vld1.16         {d1[3]}, [r6]
      98        pop             {r5-r6,pc}
      99 endfunc
    100 
     101 function get_grain_2_neon
         // Produce 2 grain values in d0[0..1]: two 11-bit table lookups at r3,
         // then a rounding shift by the (negated) shift amount in d30.
         // Consumes 2 bits of the seed in r2; clobbers r11-r12.
     102        push            {r11,lr}
     103        increment_seed  2
     104        read_rand       r11, 11,  1
     105        read_rand       r12, 11,  0
     106        add             r11, r3,  r11, lsl #1
     107        add             r12, r3,  r12, lsl #1
     108        vld1.16         {d0[0]}, [r11]
     109        vld1.16         {d0[1]}, [r12]
     110        vrshl.s16       d0,  d0,  d30
     111        pop             {r11,pc}
     112 endfunc
    113 
     114 .macro get_grain_2 dst
         // Call get_grain_2_neon and move the result into \dst (skipping the
         // vmov when \dst already is d0).
     115        bl              get_grain_2_neon
     116 .ifnc \dst, d0
     117        vmov            \dst, d0
     118 .endif
     119 .endm
    120 
     121 function get_grain_4_neon
         // Produce 4 grain values in d0: four 11-bit table lookups at r3,
         // then a rounding shift by the (negated) shift amount in d30.
         // Consumes 4 bits of the seed in r2; clobbers r11-r12.
     122        push            {r11,lr}
     123        increment_seed  4
     124        read_rand       r11, 11,  3
     125        read_rand       r12, 11,  2
     126        add             r11, r3,  r11, lsl #1
     127        add             r12, r3,  r12, lsl #1
     128        vld1.16         {d0[0]}, [r11]
     129        read_rand       r11, 11,  1
     130        vld1.16         {d0[1]}, [r12]
     131        read_rand       r12, 11,  0
     132        add             r11, r3,  r11, lsl #1
     133        add             r12, r3,  r12, lsl #1
     134        vld1.16         {d0[2]}, [r11]
     135        vld1.16         {d0[3]}, [r12]
     136        vrshl.s16       d0,  d0,  d30
     137        pop             {r11,pc}
     138 endfunc
    139 
     140 .macro get_grain_4 dst
         // Call get_grain_4_neon and move the result into \dst (skipping the
         // vmov when \dst already is d0).
     141        bl              get_grain_4_neon
     142 .ifnc \dst, d0
     143        vmov            \dst, d0
     144 .endif
     145 .endm
    146 
     147 // r1 holds the number of entries to produce
     148 // r6, r8 and r10 hold the previous output entries
     149 // q0 holds the vector of produced entries
     150 // q1 holds the input vector of sums from above
         // Scalar tail of the AR(\n) filter: for each of the r1 entries it
         // combines the "above" sum from q1 with \n previous outputs (and a
         // fresh table lookup via r3), applies the rounding shifts by r7
         // (ar_coeff_shift) and r9, clamps to [~r5, r5] (grain_min/grain_max)
         // and rotates the result into q0. For \n == 1 the per-lane
         // coefficient lives in r4; for \n == 3 three int8 coefficients are
         // packed into r4 and unpacked with sbfx each iteration.
     151 .macro output_lag n
     152 function output_lag\n\()_neon
     153        push            {r0, lr}
     154 .if \n == 1
     155        mvn             lr,  r5                   // grain_min = ~grain_max
     156 .else
         // For lag2/lag3 the rounding constants are rebuilt here:
         // r0 = 1 << (r7 - 1), lr = 1 << (r9 - 1).
     157        mov             r0,  #1
     158        mov             lr,  #1
     159        sub             r7,  r7,  #1
     160        sub             r9,  r9,  #1
     161        lsl             r0,  r0,  r7
     162        lsl             lr,  lr,  r9
     163        add             r7,  r7,  #1
     164        add             r9,  r9,  #1
     165 .endif
     166 1:
     167        read_shift_rand r12, 11
     168        vmov.32         r11, d2[0]
     169        lsl             r12, r12, #1
     170        vext.8          q0,  q0,  q0,  #2
     171        ldrsh           r12, [r3, r12]
     172 .if \n == 1
     173        mla             r11, r6,  r4,  r11        // sum (above) + *coeff * prev output
     174        add             r6,  r11, r8              // 1 << (ar_coeff_shift - 1)
     175        add             r12, r12, r10
     176        asr             r6,  r6,  r7              // >> ar_coeff_shift
     177        asr             r12, r12, r9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
     178        add             r6,  r6,  r12
     179        cmp             r6,  r5
     180 .elseif \n == 2
     181        mla             r11, r8,  r4,  r11        // sum (above) + *coeff * prev output 1
     182        mla             r11, r6,  r10, r11        // += *coeff * prev output 2
     183        mov             r8,  r6
     184        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
     185        add             r12, r12, lr              // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
     186        asr             r6,  r6,  r7              // >> ar_coeff_shift
     187        asr             r12, r12, r9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
     188        add             r6,  r6,  r12
     189        push            {lr}
     190        cmp             r6,  r5
     191        mvn             lr,  r5                   // grain_min = ~grain_max
     192 .else
     193        push            {r1-r3}
     194        sbfx            r1,  r4,  #0,  #8
     195        sbfx            r2,  r4,  #8,  #8
     196        sbfx            r3,  r4,  #16, #8
     197        mla             r11, r10, r1,  r11        // sum (above) + *coeff * prev output 1
     198        mla             r11, r8,  r2,  r11        // sum (above) + *coeff * prev output 2
     199        mla             r11, r6,  r3,  r11        // += *coeff * prev output 3
     200        pop             {r1-r3}
     201        mov             r10, r8
     202        mov             r8,  r6
     203 
     204        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
     205        add             r12, r12, lr              // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
     206        asr             r6,  r6,  r7              // >> ar_coeff_shift
     207        asr             r12, r12, r9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
     208        add             r6,  r6,  r12
     209        push            {lr}
     210        cmp             r6,  r5
     211        mvn             lr,  r5                   // grain_min = ~grain_max
     212 .endif
         // Clamp the new output in r6 to [grain_min, grain_max].
     213        it              gt
     214        movgt           r6,  r5
     215        cmp             r6,  lr
     216        it              lt
     217        movlt           r6,  lr
     218 .if \n >= 2
     219        pop             {lr}
     220 .endif
     221        subs            r1,  r1,  #1
     222        vext.8          q1,  q1,  q1,  #4
     223        vmov.16         d1[3], r6
     224        bgt             1b
     225        pop             {r0, pc}
     226 endfunc
     227 .endm
    228 
         // Instantiate output_lag1_neon, output_lag2_neon and output_lag3_neon.
     229 output_lag 1
     230 output_lag 2
     231 output_lag 3
    232 
    233 
     234 function sum_lag1_above_neon
         // Lag1 "above" contribution: multiply-accumulate the top-left
         // (q0, coeff d27), top-mid (q9, coeff d28) and top-right (q1, coeff
         // d29) neighbours into the int32 sums q2/q3, then slide the q8/q9
         // row window one 8-entry block to the right (q10 holds the newly
         // loaded block above the next output position r0).
     235        sub             r12, r0,  #1*GRAIN_WIDTH*2 - 16
     236        vld1.16         {q10}, [r12] // load top right
     237 
     238        vext.8          q0,  q8,  q9,  #14 // top left, top mid
     239        vext.8          q1,  q9,  q10, #2  // top mid, top right
     240 
     241        vmull.s16       q2,  d18, d28
     242        vmlal.s16       q2,  d0,  d27
     243        vmlal.s16       q2,  d2,  d29
     244        vmull.s16       q3,  d19, d28
     245        vmlal.s16       q3,  d1,  d27
     246        vmlal.s16       q3,  d3,  d29
     247 
     248        vmov            q8,  q9
     249        vmov            q9,  q10
     250 
     251        bx              lr
     252 endfunc
    253 
     254 .macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
         // Shared body of the sum_<type>_lagN_<edge>_neon functions.
         // 1. Calls the matching sum_*_above_neon helper to build the vertical
         //    sums in q2/q3.
         // 2. For chroma (\uv_layout != 0), adds the contribution of the
         //    collocated luma at r11 (box-averaged per \type: 2x2 for uv_420,
         //    2x1 for uv_422, as-is for uv_444), weighted by d13 / \uv_coeff.
         // 3. Runs the scalar AR filter via output_\lag\()_neon over \elems
         //    entries and stores one 8-entry chunk to [r0]. The left edge
         //    seeds the filter with fresh lookups; the right edge handles the
         //    1- or 7-element tails.
         // Some tail cases branch into the _start label of a sibling
         // instantiation to share code, so instantiation order matters.
     255 .ifc \lag\()_\edge, lag3_left
     256        bl              sum_lag3_left_above_neon
     257 .else
     258        bl              sum_\lag\()_above_neon
     259 .endif
     260 .ifc \type, uv_420
     261        vpush           {q6-q7}
     262        add             r12, r11, #GRAIN_WIDTH*2
     263        vld1.16         {q0, q1}, [r11]!
     264        vld1.16         {q6, q7}, [r12]!
     265        vpadd.i16       d0,  d0,  d1
     266        vpadd.i16       d1,  d2,  d3
     267        vpadd.i16       d12, d12, d13
     268        vpadd.i16       d13, d14, d15
     269        vadd.i16        q0,  q0,  q6
     270        vpop            {q6-q7}
     271        vrshr.s16       q0,  q0,  #2
     272 .endif
     273 .ifc \type, uv_422
     274        vld1.16         {q0, q1}, [r11]!
     275        vpadd.i16       d0,  d0,  d1
     276        vpadd.i16       d1,  d2,  d3
     277        vrshr.s16       q0,  q0,  #1
     278 .endif
     279 .ifc \type, uv_444
     280        vld1.16         {q0}, [r11]!
     281 .endif
     282 .if \uv_layout
     283 .ifnb \uv_coeff
     284        vdup.8          d13, \uv_coeff
     285        vmovl.s8        q6,  d13
     286 .endif
     287        vmlal.s16       q2,  d0,  d13
     288        vmlal.s16       q3,  d1,  d13
     289 .endif
     290 .if \uv_layout && \elems == 8
     291        b               sum_\lag\()_y_\edge\()_start
     292 .elseif \uv_layout == 444 && \elems == 7
     293        b               sum_\lag\()_y_\edge\()_start
     294 .elseif \uv_layout == 422 && \elems == 1
     295        b               sum_\lag\()_uv_420_\edge\()_start
     296 .else
     297 sum_\lag\()_\type\()_\edge\()_start:
     298        push            {r11}
     299 .if \elems > 4
     300 .ifc \edge, left
         // Left edge: generate 3 fresh lookups into d1[1..3], prime the
         // previous-output registers (r6/r8/r10) from them, and let the
         // scalar filter produce just one entry for the first position.
     301        increment_seed  4
     302        read_rand       r11, 11,  3
     303        read_rand       r12, 11,  2
     304        add             r11, r3,  r11, lsl #1
     305        add             r12, r3,  r12, lsl #1
     306        vld1.16         {d1[1]}, [r11]
     307        read_rand       r11, 11,  1
     308        vld1.16         {d1[2]}, [r12]
     309        add             r11, r3,  r11, lsl #1
     310        vld1.16         {d1[3]}, [r11]
     311        lsl             r2,  r2,  #1             // shift back the state as if we'd done increment_seed with shift=0
     312        vrshl.s16       d1,  d1,  d30
     313        vext.8          q2,  q2,  q2,  #12
     314 .ifc \lag, lag3
     315        vmov.s16        r10, d1[1]
     316 .endif
     317 .ifnc \lag, lag1
     318        vmov.s16        r8,  d1[2]
     319 .endif
     320        vmov.s16        r6,  d1[3]
     321 
     322        vmov            q1,  q2
     323        mov             r1,  #1
     324        bl              output_\lag\()_neon
     325 .else
     326        increment_seed  4, shift=0
     327        vmov            q1,  q2
     328        mov             r1,  #4
     329        bl              output_\lag\()_neon
     330 .endif
     331 
     332        increment_seed  4, shift=0
     333        vmov            q1,  q3
     334 .ifc \edge, right
         // Right edge: only 3 filtered entries; the last lane is a plain
         // scaled table lookup.
     335        mov             r1,  #3
     336        bl              output_\lag\()_neon
     337        read_shift_rand r12, 11
     338        add             r12, r3,  r12, lsl #1
     339        vld1.16         {d2[0]}, [r12]
     340        vrshl.s16       d2,  d2,  d30
     341        vext.8          q0,  q0,  q1,  #2
     342 .else
     343        mov             r1,  #4
     344        bl              output_\lag\()_neon
     345 .endif
     346 .else
     347        // elems == 1
     348        increment_seed  4, shift=0
     349        vmov            q1,  q2
     350        mov             r1,  #1
     351        bl              output_\lag\()_neon
     352        lsr             r2,  r2,  #3
     353 
     354        read_rand       r11, 11,  2
     355        read_rand       r12, 11,  1
     356        add             r11, r3,  r11, lsl #1
     357        add             r12, r3,  r12, lsl #1
     358        vld1.16         {d2[0]}, [r11]
     359        read_rand       r11, 11,  0
     360        vld1.16         {d2[1]}, [r12]
     361        add             r11, r3,  r11, lsl #1
     362        vld1.16         {d2[2]}, [r11]
     363        vrshl.s16       d2,  d2,  d30
     364        vext.8          q0,  q0,  q1,  #14
     365 .endif
     366        vst1.16         {q0}, [r0]!
     367        pop             {r11}
     368        pop             {r1, pc}
     369 .endif
     370 .endm
    371 
     372 .macro sum_lag1_func type, uv_layout, edge, elems=8
         // Instantiate sum_\type\()_lag1_\edge\()_neon. For the left edge the
         // row above the current position is (re)loaded into the q9 window
         // before running the shared lag body.
     373 function sum_\type\()_lag1_\edge\()_neon
     374        push            {r1, lr}
     375 .ifc \edge, left
     376        sub             r12, r0,  #1*GRAIN_WIDTH*2
     377        vld1.8          {q9},  [r12] // load the previous block right above
     378 .endif
     379        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
     380 endfunc
     381 .endm
    382 
         // Instantiate the lag1 variants for luma and each chroma layout.
         // Right-edge element counts: 7 for y/444, 1 for 422/420.
     383 sum_lag1_func y,      0,   left
     384 sum_lag1_func y,      0,   mid
     385 sum_lag1_func y,      0,   right, 7
     386 sum_lag1_func uv_444, 444, left
     387 sum_lag1_func uv_444, 444, mid
     388 sum_lag1_func uv_444, 444, right, 7
     389 sum_lag1_func uv_422, 422, left
     390 sum_lag1_func uv_422, 422, mid
     391 sum_lag1_func uv_422, 422, right, 1
     392 sum_lag1_func uv_420, 420, left
     393 sum_lag1_func uv_420, 420, mid
     394 sum_lag1_func uv_420, 420, right, 1
    395 
    396 
     397 function sum_lag2_above_neon
         // Lag2 "above" contribution: accumulate the two rows above (windows
         // q8/q9 and q11/q12, plus freshly loaded right neighbours q10/q13)
         // into the int32 sums q2/q3. The per-tap int8 coefficients are taken
         // from d28/d29 lanes and widened to int16 with vmovl.s8 before each
         // multiply-accumulate round. Finally both row windows slide one
         // 8-entry block to the right.
     398        push            {lr}
     399        sub             r12, r0,  #2*GRAIN_WIDTH*2 - 16
     400        sub             lr,  r0,  #1*GRAIN_WIDTH*2 - 16
     401        vld1.16         {q10}, [r12] // load top right
     402        vld1.16         {q13}, [lr]
     403 
     404        vdup.8          d10, d28[0]
     405        vext.8          q0,  q8,  q9,  #12 // top left, top mid
     406        vdup.8          d12, d28[1]
     407        vext.8          q1,  q8,  q9,  #14
     408        vdup.8          d14, d28[3]
     409        vext.8          q4,  q9,  q10, #2  // top mid, top right
     410        vmovl.s8        q5,  d10
     411        vmovl.s8        q6,  d12
     412        vmovl.s8        q7,  d14
     413 
     414        vmull.s16       q2,  d0,  d10
     415        vmlal.s16       q2,  d2,  d12
     416        vmlal.s16       q2,  d8,  d14
     417        vmull.s16       q3,  d1,  d10
     418        vmlal.s16       q3,  d3,  d12
     419        vmlal.s16       q3,  d9,  d14
     420 
     421        vdup.8          d10, d28[4]
     422        vext.8          q0,  q9,  q10, #4  // top mid, top right
     423        vdup.8          d12, d28[5]
     424        vext.8          q1,  q11, q12, #12 // top left, top mid
     425        vdup.8          d14, d28[6]
     426        vext.8          q4,  q11, q12, #14
     427        vmovl.s8        q5,  d10
     428        vmovl.s8        q6,  d12
     429        vmovl.s8        q7,  d14
     430 
     431        vmlal.s16       q2,  d0,  d10
     432        vmlal.s16       q2,  d2,  d12
     433        vmlal.s16       q2,  d8,  d14
     434        vmlal.s16       q3,  d1,  d10
     435        vmlal.s16       q3,  d3,  d12
     436        vmlal.s16       q3,  d9,  d14
     437 
     438        vdup.8          d10, d29[0]
     439        vext.8          q0,  q12, q13, #2  // top mid, top right
     440        vdup.8          d12, d29[1]
     441        vext.8          q1,  q12, q13, #4
     442 
     443        vdup.8          d14, d28[2]
     444        vdup.8          d8,  d28[7]
     445 
     446        vmovl.s8        q5,  d10
     447        vmovl.s8        q6,  d12
     448        vmovl.s8        q7,  d14
     449        vmovl.s8        q4,  d8
     450 
     451        vmlal.s16       q2,  d0,  d10
     452        vmlal.s16       q2,  d2,  d12
     453        vmlal.s16       q2,  d18, d14
     454        vmlal.s16       q2,  d24, d8
     455        vmlal.s16       q3,  d1,  d10
     456        vmlal.s16       q3,  d3,  d12
     457        vmlal.s16       q3,  d19, d14
     458        vmlal.s16       q3,  d25, d8
     459 
         // Slide both row windows right by one block.
     460        vmov            q8,  q9
     461        vmov            q9,  q10
     462 
     463        vmov            q11, q12
     464        vmov            q12, q13
     465 
     466        pop             {pc}
     467 endfunc
    468 
     469 .macro sum_lag2_func type, uv_layout, edge, elems=8
         // Instantiate sum_\type\()_lag2_\edge\()_neon. For the left edge both
         // rows above are (re)loaded into the q9/q12 windows first. The chroma
         // luma-coefficient lane for lag2 is d29[4].
     470 function sum_\type\()_lag2_\edge\()_neon
     471        push            {r1, lr}
     472 .ifc \edge, left
     473        sub             r12, r0,  #2*GRAIN_WIDTH*2
     474        sub             lr,  r0,  #1*GRAIN_WIDTH*2
     475        vld1.16         {q9},  [r12] // load the previous block right above
     476        vld1.16         {q12}, [lr]
     477 .endif
     478        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, uv_coeff=d29[4]
     479 endfunc
     480 .endm
    481 
         // Instantiate the lag2 variants for luma and each chroma layout.
         // Right-edge element counts: 7 for y/444, 1 for 422/420.
     482 sum_lag2_func y,      0,   left
     483 sum_lag2_func y,      0,   mid
     484 sum_lag2_func y,      0,   right, 7
     485 sum_lag2_func uv_444, 444, left
     486 sum_lag2_func uv_444, 444, mid
     487 sum_lag2_func uv_444, 444, right, 7
     488 sum_lag2_func uv_422, 422, left
     489 sum_lag2_func uv_422, 422, mid
     490 sum_lag2_func uv_422, 422, right, 1
     491 sum_lag2_func uv_420, 420, left
     492 sum_lag2_func uv_420, 420, mid
     493 sum_lag2_func uv_420, 420, right, 1
    494 
    495 
     496 function sum_lag3_left_above_neon
     497        // A separate codepath for the left edge, to avoid reading outside
     498        // of the edge of the buffer.
         // Loads at the row start (no -3 element offset like
         // sum_lag3_above_neon) and instead shifts the data down by 5 int16
         // lanes (10 bytes) in registers before joining the shared path.
     499        sub             r12, r0,  #3*GRAIN_WIDTH*2
     500        vld1.8          {q11, q12}, [r12]
     501        vext.8          q12, q11, q12, #10
     502        vext.8          q11, q11, q11, #10
     503        b               sum_lag3_above_start
     504 endfunc
    505 
     506 function sum_lag3_above_neon
         // Lag3 "above" contribution: three rows above the output position r0
         // (loaded at -3/-2/-1 * GRAIN_WIDTH, starting 3 elements to the
         // left), seven taps per row. The int8 coefficients are taken lane by
         // lane from d26/d27/d28, widened to int16, and multiply-accumulated
         // into the int32 sums q2/q3. The next row is loaded while the
         // current row's products are still in flight to hide latency.
     507        movw            r12, #(3*GRAIN_WIDTH + 3)*2
     508        sub             r12, r0,  r12
     509        vld1.8          {q11, q12}, [r12]
     510 
     511 sum_lag3_above_start:
     512        vdup.8          d12, d26[0]
     513        vext.8          q1,  q11, q12, #2
     514        vdup.8          d14, d26[1]
     515        vext.8          q4,  q11, q12, #4
     516        vdup.8          d16, d26[2]
     517        vext.8          q5,  q11, q12, #6
     518        vdup.8          d18, d26[3]
     519        vmovl.s8        q6,  d12
     520        vmovl.s8        q7,  d14
     521        vmovl.s8        q8,  d16
     522        vmovl.s8        q9,  d18
     523 
     524        movw            r12, #(2*GRAIN_WIDTH + 3)*2
     525        sub             r12, r0,  r12
     526 
     527        vmull.s16       q2,  d22, d12
     528        vmlal.s16       q2,  d2,  d14
     529        vmlal.s16       q2,  d8,  d16
     530        vmlal.s16       q2,  d10, d18
     531        vmull.s16       q3,  d23, d12
     532        vmlal.s16       q3,  d3,  d14
     533        vmlal.s16       q3,  d9,  d16
     534        vmlal.s16       q3,  d11, d18
     535 
     536        vdup.8          d12, d26[4]
     537        vext.8          q0,  q11, q12, #8
     538        vdup.8          d14, d26[5]
     539        vext.8          q1,  q11, q12, #10
     540        vdup.8          d16, d26[6]
     541        vext.8          q4,  q11, q12, #12
     542        vld1.8          {q11, q12}, [r12]
     543        vdup.8          d18, d26[7]
     544        vmovl.s8        q6,  d12
     545        vmovl.s8        q7,  d14
     546        vmovl.s8        q8,  d16
     547        vmovl.s8        q9,  d18
     548 
     549        vmlal.s16       q2,  d0,  d12
     550        vmlal.s16       q2,  d2,  d14
     551        vmlal.s16       q2,  d8,  d16
     552        vmlal.s16       q2,  d22, d18
     553        vmlal.s16       q3,  d1,  d12
     554        vmlal.s16       q3,  d3,  d14
     555        vmlal.s16       q3,  d9,  d16
     556        vmlal.s16       q3,  d23, d18
     557 
     558        vdup.8          d12, d27[0]
     559        vext.8          q0,  q11, q12, #2
     560        vdup.8          d14, d27[1]
     561        vext.8          q1,  q11, q12, #4
     562        vdup.8          d16, d27[2]
     563        vext.8          q4,  q11, q12, #6
     564        vdup.8          d18, d27[3]
     565        vext.8          q5,  q11, q12, #8
     566        vmovl.s8        q6,  d12
     567        vmovl.s8        q7,  d14
     568        vmovl.s8        q8,  d16
     569        vmovl.s8        q9,  d18
     570 
     571        sub             r12, r0,  #(1*GRAIN_WIDTH + 3)*2
     572 
     573        vmlal.s16       q2,  d0,  d12
     574        vmlal.s16       q2,  d2,  d14
     575        vmlal.s16       q2,  d8,  d16
     576        vmlal.s16       q2,  d10, d18
     577        vmlal.s16       q3,  d1,  d12
     578        vmlal.s16       q3,  d3,  d14
     579        vmlal.s16       q3,  d9,  d16
     580        vmlal.s16       q3,  d11, d18
     581 
     582        vdup.8          d12, d27[4]
     583        vext.8          q0,  q11, q12, #10
     584        vdup.8          d14, d27[5]
     585        vext.8          q1,  q11, q12, #12
     586        vld1.8          {q11, q12}, [r12]
     587        vdup.8          d16, d27[6]
     588        vdup.8          d18, d27[7]
     589        vmovl.s8        q6,  d12
     590        vmovl.s8        q7,  d14
     591        vext.8          q5,  q11, q12, #2
     592        vmovl.s8        q8,  d16
     593        vmovl.s8        q9,  d18
     594 
     595        vmlal.s16       q2,  d0,  d12
     596        vmlal.s16       q2,  d2,  d14
     597        vmlal.s16       q2,  d22, d16
     598        vmlal.s16       q2,  d10, d18
     599        vmlal.s16       q3,  d1,  d12
     600        vmlal.s16       q3,  d3,  d14
     601        vmlal.s16       q3,  d23, d16
     602        vmlal.s16       q3,  d11, d18
     603 
     604        vdup.8          d12, d28[0]
     605        vext.8          q0,  q11, q12, #4
     606        vdup.8          d14, d28[1]
     607        vext.8          q1,  q11, q12, #6
     608        vdup.8          d16, d28[2]
     609        vext.8          q4,  q11, q12, #8
     610        vdup.8          d18, d28[3]
     611        vext.8          q5,  q11, q12, #10
     612        vmovl.s8        q6,  d12
     613        vmovl.s8        q7,  d14
     614        vmovl.s8        q8,  d16
     615        vmovl.s8        q9,  d18
     616 
     617        vmlal.s16       q2,  d0,  d12
     618        vmlal.s16       q2,  d2,  d14
     619        vmlal.s16       q2,  d8,  d16
     620        vmlal.s16       q2,  d10, d18
     621        vmlal.s16       q3,  d1,  d12
     622        vmlal.s16       q3,  d3,  d14
     623        vmlal.s16       q3,  d9,  d16
     624        vmlal.s16       q3,  d11, d18
     625 
     626        vdup.8          d12, d28[4]
     627        vext.8          q0,  q11, q12, #12
     628        vmovl.s8        q6,  d12
     629 
     630        vmlal.s16       q2,  d0,  d12
     631        vmlal.s16       q3,  d1,  d12
     632 
     633        bx              lr
     634 endfunc
    635 
     636 .macro sum_lag3_func type, uv_layout, edge, elems=8
         // Instantiate sum_\type\()_lag3_\edge\()_neon. The left-edge special
         // case is handled inside sum_lag_n_body (sum_lag3_left_above_neon).
         // The chroma luma-coefficient lane for lag3 is d29[0].
     637 function sum_\type\()_lag3_\edge\()_neon
     638        push            {r1, lr}
     639        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, uv_coeff=d29[0]
     640 endfunc
     641 .endm
    642 
         // Instantiate the lag3 variants for luma and each chroma layout.
         // Right-edge element counts: 7 for y/444, 1 for 422/420.
     643 sum_lag3_func y,      0,   left
     644 sum_lag3_func y,      0,   mid
     645 sum_lag3_func y,      0,   right, 7
     646 sum_lag3_func uv_444, 444, left
     647 sum_lag3_func uv_444, 444, mid
     648 sum_lag3_func uv_444, 444, right, 7
     649 sum_lag3_func uv_422, 422, left
     650 sum_lag3_func uv_422, 422, mid
     651 sum_lag3_func uv_422, 422, right, 1
     652 sum_lag3_func uv_420, 420, left
     653 sum_lag3_func uv_420, 420, mid
     654 sum_lag3_func uv_420, 420, right, 1
    655 
     656 function generate_grain_rows_neon
         // Write r1 rows of GRAIN_WIDTH (80 + 2) int16 grain values at r0:
         // ten 8-sample gaussian batches scaled by q15 (the negated shift),
         // then 2 extra samples per row via get_grain_2. r0 advances past all
         // rows; r2 (seed) and r3 (table) follow the helpers' convention.
     657        push            {r10-r11,lr}
     658 1:
     659        mov             r10, #80
     660 2:
     661        bl              get_gaussian_neon
     662        vrshl.s16       q0,  q0,  q15
     663        subs            r10, r10, #8
     664        vst1.16         {q0}, [r0]!
     665        bgt             2b
     666        get_grain_2     d0
     667        subs            r1,  r1,  #1
     668        vst1.32         {d0[0]}, [r0]!
     669        bgt             1b
     670        pop             {r10-r11,pc}
     671 endfunc
    672 
     673 function generate_grain_rows_44_neon
         // Same as generate_grain_rows_neon but for the subsampled 44-wide
         // grain buffer: 40 + 4 samples per row, then r0 skips ahead by
         // GRAIN_WIDTH*2 - 80 bytes to the start of the next row.
     674        push            {r10-r11,lr}
     675 1:
     676        mov             r10, #40
     677 2:
     678        bl              get_gaussian_neon
     679        vrshl.s16       q0,  q0,  q15
     680        subs            r10, r10, #8
     681        vst1.16         {q0}, [r0]!
     682        bgt             2b
     683        get_grain_4     d0
     684        subs            r1,  r1,  #1
     685        vst1.16         {d0}, [r0]
     686        add             r0,  r0,  #GRAIN_WIDTH*2-80
     687        bgt             1b
     688        pop             {r10-r11,pc}
     689 endfunc
    690 
     691 function gen_grain_uv_444_lag0_neon
         // Lag0 chroma, 8 samples: take the collocated luma in q3 (from r11),
         // mask it with q1, scale by the coefficient in d22, shift by q12,
         // narrow with saturation, add fresh gaussian noise, clamp to
         // [q10, q9] and store at r0. q1/d22/q12/q9/q10 are preset by the
         // caller — NOTE(review): presumably the bitdepth mask, luma
         // coefficient, rounding shift and grain max/min; confirm against the
         // gen_grain_82 body.
     692        vld1.16         {q3}, [r11]!
     693 gen_grain_uv_lag0_8_start:
     694        push            {r11,lr}
     695        bl              get_gaussian_neon
     696        vrshl.s16       q0,  q0,  q15
     697 gen_grain_uv_lag0_8_add:
     698        vand            q3,  q3,  q1
     699        vmull.s16       q2,  d6,  d22
     700        vmull.s16       q3,  d7,  d22
     701        vrshl.s32       q2,  q2,  q12
     702        vrshl.s32       q3,  q3,  q12
     703        vqmovn.s32      d4,  q2
     704        vqmovn.s32      d5,  q3
     705        vqadd.s16       q2,  q2,  q0
     706        vmin.s16        q2,  q2,  q9
     707        vmax.s16        q2,  q2,  q10
     708        vst1.16         {q2}, [r0]!
     709        pop             {r11,pc}
     710 endfunc
    711 
     712 function gen_grain_uv_420_lag0_8_neon
         // Lag0 chroma for 4:2:0, 8 samples: 2x2 box-average two luma rows at
         // r11 (pairwise add horizontally, add vertically, rounding shift by
         // 2) into q3, then continue on the shared uv_444 path.
     713        add             r12, r11, #GRAIN_WIDTH*2
     714        vld1.16         {q2,q3}, [r11]!
     715        vld1.16         {q4,q5}, [r12]
     716        vpadd.i16       d4,  d4,  d5
     717        vpadd.i16       d5,  d6,  d7
     718        vpadd.i16       d8,  d8,  d9
     719        vpadd.i16       d9,  d10, d11
     720        vadd.i16        q2,  q2,  q4
     721        vrshr.s16       q3,  q2,  #2
     722        b               gen_grain_uv_lag0_8_start
     723 endfunc
    724 
     725 function gen_grain_uv_422_lag0_8_neon
         // Lag0 chroma for 4:2:2, 8 samples: 2x1 horizontal average of one
         // luma row at r11 (pairwise add, rounding shift by 1) into q3, then
         // continue on the shared uv_444 path.
     726        vld1.16         {q2,q3}, [r11]!
     727        vpadd.i16       d4,  d4,  d5
     728        vpadd.i16       d5,  d6,  d7
     729        vrshr.s16       q3,  q2,  #1
     730        b               gen_grain_uv_lag0_8_start
     731 endfunc
    732 
     733 function gen_grain_uv_420_lag0_4_neon
         // Lag0 chroma for 4:2:0, 4-sample tail: 2x2 box-average 8 luma
         // values from two rows at r11 into d6, advance r11 by 16 samples,
         // fetch 4 fresh grain values and jump into the shared add path.
     734        add             r12, r11, #GRAIN_WIDTH*2
     735        vld1.16         {q2}, [r11]
     736        vld1.16         {q0}, [r12]
     737        add             r11, r11, #32
     738        vpadd.i16       d4,  d4,  d5
     739        vpadd.i16       d0,  d0,  d1
     740        vadd.i16        d4,  d4,  d0
     741        vrshr.s16       d6,  d4,  #2
     742        push            {r11,lr}
     743        get_grain_4     d0
     744        b               gen_grain_uv_lag0_8_add
     745 endfunc
    746 
     747 function gen_grain_uv_422_lag0_4_neon
         // Lag0 chroma for 4:2:2, 4-sample tail: 2x1 horizontal average of 8
         // luma values at r11 into d6, advance r11 by 16 samples, fetch 4
         // fresh grain values and jump into the shared add path.
     748        vld1.16         {q2}, [r11]
     749        add             r11, r11, #32
     750        vpadd.i16       d4,  d4,  d5
     751        vrshr.s16       d6,  d4,  #1
     752        push            {r11,lr}
     753        get_grain_4     d0
     754        b               gen_grain_uv_lag0_8_add
     755 endfunc
    756 
// Generate a full-width (82 column, GRAIN_HEIGHT row) grain LUT.
// Instantiated for \type = y and \type = uv_444; exports
// generate_grain_{y,uv_444}_16bpc_neon.
//   y:      r0=buf, r1=data (Dav1dFilmGrainData), r2=bitdepth_max
//   uv_444: r0=buf, r1=buf_y, r2=data, r3=uv plane index, [sp]=bitdepth_max
// Dispatches on data->ar_coeff_lag (0-3) through the jump table below.
// Each row is produced as 10 blocks of 8 pixels plus a final 2 (82 total).
    757 .macro gen_grain_82 type
    758 function generate_grain_\type\()_16bpc_neon, export=1
    759        push            {r4-r11,lr}
    760 
    761 .ifc \type, uv_444
        // r11 = luma grain, skipping the first 3 (already generated) rows;
        // r12 = uv * 28 = byte offset from ar_coeffs_uv[0] to this plane's
        // coefficients; then normalize r1=data as in the luma path.
    762        ldr             r4,  [sp, #36]
    763        mov             r12, r3
    764        mov             lr,  #28
    765        add             r11, r1,  #3*GRAIN_WIDTH*2
    766        mov             r1,  r2
    767        mul             r12, r12, lr
    768        clz             lr,  r4
    769 .else
    770        clz             lr,  r2
    771 .endif
    772        movrel          r3,  X(gaussian_sequence)
    773        sub             lr,  lr,  #24 // -bitdepth_min_8
    774        ldr             r2,  [r1, #FGD_SEED]
    775        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
    776 .ifc \type, y
    777        add             r4,  r1,  #FGD_AR_COEFFS_Y
    778 .else
    779        add             r4,  r1,  #FGD_AR_COEFFS_UV
    780 .endif
    781        add             r9,  r9,  lr // grain_scale_shift - bitdepth_min_8
    782        adr             r5,  L(gen_grain_\type\()_tbl)
    783        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
    784        add             r9,  r9,  #4
    785        ldr             r6,  [r5, r6, lsl #2]
    786        vdup.16         q15, r9    // 4 - bitdepth_min_8 + data->grain_scale_shift
    787        add             r5,  r5,  r6
    788        vneg.s16        q15, q15
    789 
    790 .ifc \type, uv_444
        // Derive a per-plane chroma seed by XORing with a plane constant,
        // and advance r4 to this plane's AR coefficients.
    791        push            {lr}
    792        cmp             r12, #0
    793        movw            r10, #0x49d8
    794        movw            lr,  #0xb524
    795        // Intentionally using a separate register instead of moveq with an
    796        // immediate constant, to avoid armv8 deprecated it instruction forms.
    797        it              eq
    798        moveq           r10, lr
    799        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
    800        eor             r2,  r2,  r10
    801        pop             {lr}
    802 .endif
    803 
    804        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
    805        neg             lr,  lr             // bitdepth_min_8
    806        mov             r8,  #1
    807        mov             r10, #1
    808        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
    809        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
    810        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
    811        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
    812 
    813        bx              r5
    814 
        // Per-lag jump table; CONFIG_THUMB sets the Thumb bit on the target.
    815        .align 2
    816 L(gen_grain_\type\()_tbl):
    817        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
    818        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
    819        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
    820        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
    821 
        // lag 0: luma needs no AR filtering at all; chroma still blends in
        // the co-located luma grain (via r11) scaled by ar_coeffs_uv[0].
    822 L(generate_grain_\type\()_lag0):
    823 .ifc \type, y
    824        mov             r1,  #GRAIN_HEIGHT
    825        bl              generate_grain_rows_neon
    826 .else
    827        mov             r5,  #128
    828        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
    829        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
    830        mvn             r6,  r5             // grain_min = ~grain_max
    831 
    832        mov             r1,  #3
    833        bl              generate_grain_rows_neon
    834        mov             r1,  #GRAIN_HEIGHT-3
    835 
        // q13/q14 are lane masks (built with vext) that disable the blend
        // for the columns at the left/right edges of the 82-wide row.
    836        vdup.32         q12, r7
    837        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
    838        vmov.i8         q0,  #0
    839        vmov.i8         q1,  #255
    840        vdup.16         q9,  r5
    841        vdup.16         q10, r6
    842        vext.8          q13, q0,  q1,  #10
    843        vext.8          q14, q1,  q0,  #2
    844        vneg.s32        q12, q12
    845        vmovl.s8        q11, d22
    846 
    847 1:
    848        vmov            q1,  q13
    849        bl              gen_grain_uv_444_lag0_neon // 8
    850        vmov.i8         q1,  #255
    851        bl              gen_grain_uv_444_lag0_neon // 16
    852        bl              gen_grain_uv_444_lag0_neon // 24
    853        bl              gen_grain_uv_444_lag0_neon // 32
    854        bl              gen_grain_uv_444_lag0_neon // 40
    855        bl              gen_grain_uv_444_lag0_neon // 48
    856        bl              gen_grain_uv_444_lag0_neon // 56
    857        bl              gen_grain_uv_444_lag0_neon // 64
    858        bl              gen_grain_uv_444_lag0_neon // 72
    859        vmov            q1,  q14
    860        bl              gen_grain_uv_444_lag0_neon // 80
        // Final 2 pixels to complete the 82-wide row.
    861        get_grain_2     d16
    862        subs            r1,  r1,  #1
    863        add             r11, r11, #4
    864        vst1.32         {d16[0]}, [r0]!
    865        bgt             1b
    866 .endif
    867        pop             {r4-r11,pc}
    868 
        // lag 1: coefficients 0-2 widened into q12-q14, coefficient 3 kept
        // in r4; for uv, the luma weight (coefficient 4) is widened into q6.
    869 L(generate_grain_\type\()_lag1):
    870        vpush           {q4-q7}
    871        mov             r5,  #128
    872        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
    873        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
    874        vld1.8          {d27[]}, [r4]!      // ar_coeffs_y[0]
    875        vld1.8          {d28[]}, [r4]!      // ar_coeffs_y[1]
    876        vld1.8          {d29[]}, [r4]       // ar_coeffs_y[2]
    877 .ifc \type, y
    878        ldrsb           r4,  [r4, #1]       // ar_coeffs_y[3]
    879 .else
    880        add             r4,  r4,  #2
    881 .endif
    882 
    883        mov             r1,  #3
    884 .ifc \type, uv_444
    885        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
    886        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
    887 .endif
    888        bl              generate_grain_rows_neon
    889        vmovl.s8        q13, d27
    890        vmovl.s8        q12, d29
    891        vmovl.s8        q14, d28
    892        vmov            d29, d24
    893 .ifc \type, uv_444
    894        vmovl.s8        q6,  d13
    895 .endif
    896 
    897        mov             r1,  #GRAIN_HEIGHT - 3
    898 1:
    899        bl              sum_\type\()_lag1_left_neon  // 8
    900        bl              sum_\type\()_lag1_mid_neon   // 16
    901        bl              sum_\type\()_lag1_mid_neon   // 24
    902        bl              sum_\type\()_lag1_mid_neon   // 32
    903        bl              sum_\type\()_lag1_mid_neon   // 40
    904        bl              sum_\type\()_lag1_mid_neon   // 48
    905        bl              sum_\type\()_lag1_mid_neon   // 56
    906        bl              sum_\type\()_lag1_mid_neon   // 64
    907        bl              sum_\type\()_lag1_mid_neon   // 72
    908        bl              sum_\type\()_lag1_right_neon // 80
    909        get_grain_2     d16
    910        subs            r1,  r1,  #1
    911 .ifc \type, uv_444
    912        add             r11, r11, #4
    913 .endif
    914        vst1.32         {d16[0]}, [r0]!
    915        bgt             1b
    916 
    917        vpop            {q4-q7}
    918        pop             {r4-r11,pc}
    919 
        // lag 2: all 12/13 coefficients loaded as one vector; the last two
        // are also moved to r4/r10 for the scalar part of the filter.
    920 L(generate_grain_\type\()_lag2):
    921        vpush           {q4-q7}
    922        mov             r5,  #128
    923        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
    924        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
    925        vld1.8          {d28,d29}, [r4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
    926 
    927        vmov.s8         r4,  d29[2]
    928        vmov.s8         r10, d29[3]
    929 
    930        mov             r1,  #3
    931        bl              generate_grain_rows_neon
    932 
    933        mov             r1,  #GRAIN_HEIGHT - 3
    934 1:
    935        bl              sum_\type\()_lag2_left_neon  // 8
    936        bl              sum_\type\()_lag2_mid_neon   // 16
    937        bl              sum_\type\()_lag2_mid_neon   // 24
    938        bl              sum_\type\()_lag2_mid_neon   // 32
    939        bl              sum_\type\()_lag2_mid_neon   // 40
    940        bl              sum_\type\()_lag2_mid_neon   // 48
    941        bl              sum_\type\()_lag2_mid_neon   // 56
    942        bl              sum_\type\()_lag2_mid_neon   // 64
    943        bl              sum_\type\()_lag2_mid_neon   // 72
    944        bl              sum_\type\()_lag2_right_neon // 80
    945        get_grain_2     d16
    946        subs            r1,  r1,  #1
    947 .ifc \type, uv_444
    948        add             r11, r11, #4
    949 .endif
    950        vst1.32         {d16[0]}, [r0]!
    951        bgt             1b
    952 
    953        vpop            {q4-q7}
    954        pop             {r4-r11,pc}
    955 
        // lag 3: 24/25 coefficients; three of them are packed into r4 as
        // bytes 0/8/16 for the scalar tail of the filter.
    956 L(generate_grain_\type\()_lag3):
    957        vpush           {q4-q7}
    958        mov             r5,  #128
    959        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
    960        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
    961        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
    962 
    963        vmov.u8         r4,  d28[5]
    964        vmov.u8         r10, d28[6]
    965        vmov.u8         r12, d28[7]
    966 
    967        orr             r4,  r4,  r10, lsl #8
    968        orr             r4,  r4,  r12, lsl #16
    969 
    970        mov             r1,  #3
        // generate_grain_rows_neon clobbers d26; preserve it across the call.
    971        vpush           {d26}
    972        bl              generate_grain_rows_neon
    973        vpop            {d26}
    974 
    975        mov             r1,  #GRAIN_HEIGHT - 3
    976 1:
    977        bl              sum_\type\()_lag3_left_neon  // 8
    978        bl              sum_\type\()_lag3_mid_neon   // 16
    979        bl              sum_\type\()_lag3_mid_neon   // 24
    980        bl              sum_\type\()_lag3_mid_neon   // 32
    981        bl              sum_\type\()_lag3_mid_neon   // 40
    982        bl              sum_\type\()_lag3_mid_neon   // 48
    983        bl              sum_\type\()_lag3_mid_neon   // 56
    984        bl              sum_\type\()_lag3_mid_neon   // 64
    985        bl              sum_\type\()_lag3_mid_neon   // 72
    986        bl              sum_\type\()_lag3_right_neon // 80
    987        get_grain_2     d16
    988        subs            r1,  r1,  #1
    989 .ifc \type, uv_444
    990        add             r11, r11, #4
    991 .endif
    992        vst1.32         {d16[0]}, [r0]!
    993        bgt             1b
    994 
    995        vpop            {q4-q7}
    996        pop             {r4-r11,pc}
    997 endfunc
    998 .endm
    999 
// Instantiate the full-width (82 column) generators: luma and 4:4:4 chroma.
   1000 gen_grain_82 y
   1001 gen_grain_82 uv_444
   1002 
// Set \dst to the number of grain rows left to generate after the first 3:
// 4:2:0 uses the vertically subsampled height, 4:2:2 the full height.
   1003 .macro set_height dst, type
   1004 .ifc \type, uv_420
   1005        mov             \dst,  #SUB_GRAIN_HEIGHT-3
   1006 .else
   1007        mov             \dst,  #GRAIN_HEIGHT-3
   1008 .endif
   1009 .endm
   1010 
// Advance the luma grain pointer to the next source row, undoing the
// 6*32 bytes consumed while generating one 44-wide chroma row:
// 4:2:0 steps over two luma rows, 4:2:2 over one.
   1011 .macro increment_y_ptr reg, type
   1012 .ifc \type, uv_420
   1013        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32)
   1014 .else
   1015        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2
   1016 .endif
   1017 .endm
   1018 
// Generate a subsampled-width (44 column) chroma grain LUT.
// Instantiated for \type = uv_420 and uv_422; exports
// generate_grain_{uv_420,uv_422}_16bpc_neon.
//   r0=buf, r1=buf_y (luma grain), r2=data, r3=uv plane index,
//   [sp]=bitdepth_max.
// Same overall structure as gen_grain_82 above, but each row is
// 5 blocks of 8 pixels plus a final 4 (44 total), and rows are stored
// with a GRAIN_WIDTH*2 stride (the row loop fixes up r0 accordingly).
   1019 .macro gen_grain_44 type
   1020 function generate_grain_\type\()_16bpc_neon, export=1
   1021        push            {r4-r11,lr}
   1022 
        // r11 = luma grain + 3 rows - 3 columns; r12 = uv * 28 = offset to
        // this plane's AR coefficients.
   1023        ldr             r4,  [sp, #36]
   1024        mov             r12, r3
   1025        movw            r11, #(3*GRAIN_WIDTH-3)*2
   1026        mov             lr,  #28
   1027        add             r11, r1,  r11
   1028        mov             r1,  r2
   1029        mul             r12, r12, lr
   1030        clz             lr,  r4
   1031 
   1032        movrel          r3,  X(gaussian_sequence)
   1033        sub             lr,  lr,  #24 // -bitdepth_min_8
   1034        ldr             r2,  [r1, #FGD_SEED]
   1035        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
   1036        add             r4,  r1,  #FGD_AR_COEFFS_UV
   1037        add             r9,  r9,  lr // grain_scale_shift - bitdepth_min_8
   1038        adr             r5,  L(gen_grain_\type\()_tbl)
   1039        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
   1040        add             r9,  r9,  #4
   1041        ldr             r6,  [r5, r6, lsl #2]
   1042        vdup.16         q15, r9    // 4 - bitdepth_min_8 + data->grain_scale_shift
   1043        add             r5,  r5,  r6
   1044        vneg.s16        q15, q15
   1045 
        // Derive the per-plane chroma seed (XOR with a plane constant).
   1046        push            {lr}
   1047        cmp             r12, #0
   1048        movw            r10, #0x49d8
   1049        movw            lr,  #0xb524
   1050        // Intentionally using a separate register instead of moveq with an
   1051        // immediate constant, to avoid armv8 deprecated it instruction forms.
   1052        it              eq
   1053        moveq           r10, lr
   1054        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
   1055        eor             r2,  r2,  r10
   1056        pop             {lr}
   1057 
   1058        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
   1059        neg             lr,  lr
   1060        mov             r8,  #1
   1061        mov             r10, #1
   1062        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
   1063        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
   1064        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
   1065        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
   1066        bx              r5
   1067 
        // Per-lag jump table; CONFIG_THUMB sets the Thumb bit on the target.
   1068        .align 2
   1069 L(gen_grain_\type\()_tbl):
   1070        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1071        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1072        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1073        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1074 
        // lag 0: no AR filtering, but the subsampled luma grain (via r11)
        // is still blended in, scaled by ar_coeffs_uv[0].
   1075 L(generate_grain_\type\()_lag0):
   1076 .ifc \type, uv_420
   1077        vpush           {q4-q5}
   1078 .endif
   1079        mov             r5,  #128
   1080        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
   1081        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
   1082        mvn             r6,  r5             // grain_min = ~grain_max
   1083 
   1084        mov             r1,  #3
   1085        bl              generate_grain_rows_44_neon
   1086        set_height      r1,  \type
   1087 
        // q13/q14 are lane masks (built with vext) that disable the blend
        // for the columns at the left/right edges of the 44-wide row.
   1088        vdup.32         q12, r7
   1089        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
   1090        vmov.i8         q0,  #0
   1091        vmov.i8         q1,  #255
   1092        vdup.16         q9,  r5
   1093        vdup.16         q10, r6
   1094        vext.8          q13, q0,  q1,  #10
   1095        vext.8          q14, q1,  q0,  #14
   1096        vneg.s32        q12, q12
   1097        vmovl.s8        q11, d22
   1098 
   1099 1:
   1100        vmov            q1,  q13
   1101        bl              gen_grain_\type\()_lag0_8_neon // 8
   1102        vmov.i8         q1,  #255
   1103        bl              gen_grain_\type\()_lag0_8_neon // 16
   1104        bl              gen_grain_\type\()_lag0_8_neon // 24
   1105        bl              gen_grain_\type\()_lag0_8_neon // 32
   1106        bl              gen_grain_\type\()_lag0_8_neon // 40
   1107        vmov            q1,  q14
   1108        bl              gen_grain_\type\()_lag0_4_neon // 44
   1109        subs            r1,  r1,  #1
   1110        increment_y_ptr r11, \type
   1111        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
   1112        bgt             1b
   1113 
   1114 .ifc \type, uv_420
   1115        vpop            {q4-q5}
   1116 .endif
   1117        pop             {r4-r11,pc}
   1118 
        // lag 1: coefficients 0-2 widened into q12-q14, coefficient 3 in r4,
        // the luma weight (coefficient 4) widened into q6.
   1119 L(generate_grain_\type\()_lag1):
   1120        vpush           {q4-q7}
   1121        mov             r5,  #128
   1122        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
   1123        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
   1124        vld1.8          {d27[]}, [r4]!      // ar_coeffs_uv[0]
   1125        vld1.8          {d28[]}, [r4]!      // ar_coeffs_uv[1]
   1126        vld1.8          {d29[]}, [r4]       // ar_coeffs_uv[2]
   1127        add             r4,  r4,  #2
   1128 
   1129        mov             r1,  #3
   1130        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
   1131        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
   1132        bl              generate_grain_rows_44_neon
   1133        vmovl.s8        q13, d27
   1134        vmovl.s8        q12, d29
   1135        vmovl.s8        q14, d28
   1136        vmov            d29, d24
   1137        vmovl.s8        q6,  d13
   1138 
   1139        set_height      r1,  \type
   1140 1:
   1141        bl              sum_\type\()_lag1_left_neon  // 8
   1142        bl              sum_\type\()_lag1_mid_neon   // 16
   1143        bl              sum_\type\()_lag1_mid_neon   // 24
   1144        bl              sum_\type\()_lag1_mid_neon   // 32
   1145        bl              sum_\type\()_lag1_mid_neon   // 40
   1146        bl              sum_\type\()_lag1_right_neon // 44
   1147        subs            r1,  r1,  #1
   1148        increment_y_ptr r11, \type
   1149        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
   1150        bgt             1b
   1151 
   1152        vpop            {q4-q7}
   1153        pop             {r4-r11,pc}
   1154 
        // lag 2: all 13 coefficients loaded as one vector; the last two are
        // also moved to r4/r10 for the scalar part of the filter.
   1155 L(generate_grain_\type\()_lag2):
   1156        vpush           {q4-q7}
   1157        mov             r5,  #128
   1158        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
   1159        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
   1160        vld1.8          {d28,d29}, [r4]     // ar_coeffs_uv[0-12]
   1161 
   1162        vmov.s8         r4,  d29[2]
   1163        vmov.s8         r10, d29[3]
   1164 
   1165        mov             r1,  #3
   1166        bl              generate_grain_rows_44_neon
   1167 
   1168        set_height      r1,  \type
   1169 1:
   1170        bl              sum_\type\()_lag2_left_neon  // 8
   1171        bl              sum_\type\()_lag2_mid_neon   // 16
   1172        bl              sum_\type\()_lag2_mid_neon   // 24
   1173        bl              sum_\type\()_lag2_mid_neon   // 32
   1174        bl              sum_\type\()_lag2_mid_neon   // 40
   1175        bl              sum_\type\()_lag2_right_neon // 44
   1176        subs            r1,  r1,  #1
   1177        increment_y_ptr r11, \type
   1178        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
   1179        bgt             1b
   1180 
   1181        vpop            {q4-q7}
   1182        pop             {r4-r11,pc}
   1183 
        // lag 3: 25 coefficients; three of them are packed into r4 as bytes
        // 0/8/16 for the scalar tail of the filter.
   1184 L(generate_grain_\type\()_lag3):
   1185        vpush           {q4-q7}
   1186        mov             r5,  #128
   1187        lsl             r5,  r5,  lr        //  128 << bitdepth_min_8
   1188        sub             r5,  r5,  #1        // (128 << bitdepth_min_8) - 1
   1189        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
   1190 
   1191        vmov.u8         r4,  d28[5]
   1192        vmov.u8         r10, d28[6]
   1193        vmov.u8         r12, d28[7]
   1194 
   1195        orr             r4,  r4,  r10, lsl #8
   1196        orr             r4,  r4,  r12, lsl #16
   1197 
   1198        mov             r1,  #3
   1199        bl              generate_grain_rows_44_neon
   1200 
   1201        set_height      r1,  \type
   1202 1:
   1203        bl              sum_\type\()_lag3_left_neon  // 8
   1204        bl              sum_\type\()_lag3_mid_neon   // 16
   1205        bl              sum_\type\()_lag3_mid_neon   // 24
   1206        bl              sum_\type\()_lag3_mid_neon   // 32
   1207        bl              sum_\type\()_lag3_mid_neon   // 40
   1208        bl              sum_\type\()_lag3_right_neon // 44
   1209        subs            r1,  r1,  #1
   1210        increment_y_ptr r11, \type
   1211        add             r0,  r0,  #GRAIN_WIDTH*2-6*16
   1212        bgt             1b
   1213 
   1214        vpop            {q4-q7}
   1215        pop             {r4-r11,pc}
   1216 endfunc
   1217 .endm
   1218 
// Instantiate the 44 column subsampled chroma generators.
   1219 gen_grain_44 uv_420
   1220 gen_grain_44 uv_422
   1221 
// Gather 8 scaling-LUT bytes: extract 16-bit pixel values from the even
// lanes of \src1-\src4 into r11/r12/lr, add the LUT base (r3), and load
// one byte per pixel into lanes of \dst1/\dst2. The moves and loads are
// interleaved to hide the NEON-to-core transfer latency.
// \off selects even (0) or odd (1) lanes. Clobbers r11, r12, lr.
   1222 .macro gather_interleaved dst1, dst2, src1, src2, src3, src4, off
   1223        vmov.u16        r11, \src1[0+\off]
   1224        vmov.u16        r12, \src3[0+\off]
   1225        add             r11, r11, r3
   1226        vmov.u16        lr,  \src1[2+\off]
   1227        add             r12, r12, r3
   1228        vld1.8          {\dst1[0+\off]}, [r11]
   1229        vmov.u16        r11, \src3[2+\off]
   1230        add             lr,  lr,  r3
   1231        vld1.8          {\dst2[0+\off]}, [r12]
   1232        vmov.u16        r12, \src2[0+\off]
   1233        add             r11, r11, r3
   1234        vld1.8          {\dst1[2+\off]}, [lr]
   1235        vmov.u16        lr,  \src4[0+\off]
   1236        add             r12, r12, r3
   1237        vld1.8          {\dst2[2+\off]}, [r11]
   1238        vmov.u16        r11, \src2[2+\off]
   1239        add             lr,  lr,  r3
   1240        vld1.8          {\dst1[4+\off]}, [r12]
   1241        vmov.u16        r12, \src4[2+\off]
   1242        add             r11, r11, r3
   1243        vld1.8          {\dst2[4+\off]}, [lr]
   1244        add             r12, r12, r3
   1245        vld1.8          {\dst1[6+\off]}, [r11]
   1246        vld1.8          {\dst2[6+\off]}, [r12]
   1247 .endm
   1248 
// Gather 32 scaling bytes (4 d-regs worth) from 32 pixel values in
// \src1-\src8, by running gather_interleaved over even then odd lanes.
   1249 .macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, src7, src8
   1250        gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 0
   1251        gather_interleaved \dst1, \dst3, \src1, \src2, \src5, \src6, 1
   1252        gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 0
   1253        gather_interleaved \dst2, \dst4, \src3, \src4, \src7, \src8, 1
   1254 .endm
   1255 
// Look up scaling bytes for the 32 pixels in q0-q3 into d8-d11 (q4/q5).
// r3 = scaling LUT base. Saves/restores the r11/r12/lr scratch registers
// the gather macro clobbers.
   1256 function gather32_neon
   1257        push            {r11-r12,lr}
   1258        gather          d8,  d9,  d10, d11, d0,  d1,  d2,  d3,  d4,  d5,  d6,  d7
   1259        pop             {r11-r12,pc}
   1260 endfunc
   1261 
// Look up scaling bytes for the 16 pixels in q0/q1 into d8/d9 (q4).
// r3 = scaling LUT base.
   1262 function gather16_neon
   1263        push            {r11-r12,lr}
   1264        gather_interleaved d8,  d9,  d0,  d1,  d2,  d3,  0
   1265        gather_interleaved d8,  d9,  d0,  d1,  d2,  d3,  1
   1266        pop             {r11-r12,pc}
   1267 endfunc
   1268 
// Grain overlap blend weights, consumed in pairs as
// (old * row0[i] + new * row1[i] + 16) >> 5 (vmull/vmlal + vqrshrn #5).
// First .short row = old-grain weights, second row = new-grain weights.
// overlap_coeffs_0 is used for the full-size (luma/4:4:4) overlaps;
// overlap_coeffs_1 presumably serves the subsampled chroma paths
// (their users are outside this section).
   1269 const overlap_coeffs_0, align=4
   1270        .short 27, 17, 0,  0
   1271        .short 17, 27, 32, 32
   1272 endconst
   1273 
   1274 const overlap_coeffs_1, align=4
   1275        .short 23, 0,  0,  0
   1276        .short 22, 32, 32, 32
   1277 endconst
   1278 
// Split an offset seed into grain x/y offsets: low nibble = offy,
// high bits = offx. When a dimension is not subsampled (\sx/\sy == 0)
// the offset is doubled.
   1280 .macro calc_offset offx, offy, src, sx, sy
   1281        and             \offy, \src,  #0xF     // randval & 0xF
   1282        lsr             \offx, \src,  #4       // randval >> 4
   1283 .if \sy == 0
   1284        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
   1285 .endif
   1286 .if \sx == 0
   1287        add             \offx, \offx, \offx    // 2 * (randval >> 4)
   1288 .endif
   1289 .endm
   1289 
// \dst = \src + \stride * \offy + 2 * \offx (grain entries are 16-bit).
   1291 .macro add_offset dst, offx, offy, src, stride
   1292        mla             \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
   1293        add             \dst, \dst, \offx, lsl #1  // grain_lut += offx
   1294 .endm
   1294 
   1295 // void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
   1296 //                                 const ptrdiff_t stride,
   1297 //                                 const uint8_t scaling[SCALING_SIZE],
   1298 //                                 const int scaling_shift,
   1299 //                                 const entry grain_lut[][GRAIN_WIDTH],
   1300 //                                 const int offsets[][2],
   1301 //                                 const int h, const ptrdiff_t clip,
   1302 //                                 const ptrdiff_t type,
   1303 //                                 const int bitdepth_max);
// Entry point: sets up clamping constants, the four grain_lut pointers
// and the overlap weights, then jumps into the matching (ox, oy) loop
// in fgy_loop_neon below.
   1304 function fgy_32x32_16bpc_neon, export=1
   1305        push            {r4-r11,lr}
   1306        vpush           {q4-q7}
   1307        ldrd            r4,  r5,  [sp, #100]   // scaling_shift, grain_lut
   1308        ldrd            r6,  r7,  [sp, #108]   // offsets, h
   1309        ldr             r8,       [sp, #116]   // clip
   1310        mov             r9,  #GRAIN_WIDTH*2    // grain_lut stride
   1311        ldr             r10,      [sp, #124]   // bitdepth_max
   1312 
        // scaling_shift <= 15, so xor with 15 computes 15 - scaling_shift.
   1313        eor             r4,  r4,  #15          // 15 - scaling_shift
   1314        vdup.16         q6,  r10               // bitdepth_max
   1315        clz             r10, r10
   1316        vdup.16         q13, r4                // 15 - scaling_shift
   1317        rsb             r10, r10, #24          // bitdepth_min_8
   1318        cmp             r8,  #0
   1319        vdup.16         q12, r10               // bitdepth_min_8
   1320 
   1321        movrel_local    r12, overlap_coeffs_0
   1322 
        // q14/q15 = output clamp: video range 16<<bd..235<<bd when clipping,
        // otherwise 0..bitdepth_max.
   1323        beq             1f
   1324        // clip
   1325        vmov.i16        q14, #16
   1326        vmov.i16        q15, #235
   1327        vshl.s16        q14, q14, q12
   1328        vshl.s16        q15, q15, q12
   1329        b               2f
   1330 1:
   1331        // no clip
   1332        vmov.i16        q14, #0
   1333        vmov            q15, q6
   1334 2:
   1335        vshr.u16        q6,  q6,  #1           // grain_max
   1336 
   1337        vld1.16         {d24, d25}, [r12, :128] // overlap_coeffs
   1338 
        // Skip the grain_lut border: 9 columns and 9 rows of 16-bit entries.
   1339        add             r5,  r5,  #18          // grain_lut += 9
   1340        add             r5,  r5,  r9,  lsl #3  // grain_lut += 8 * grain_stride
   1341        add             r5,  r5,  r9           // grain_lut += grain_stride
   1342 
        // Resolve the four offsets[][] seeds into absolute grain pointers:
        // r5 = this block, r4 = old (left), r6 = top, r8 = top old.
   1343        ldr             r10, [r6, #8]          // offsets[1][0]
   1344        calc_offset     r10, r4,  r10, 0,   0
   1345        add_offset      r4,  r10, r4,  r5,  r9
   1346        ldr             r10, [r6, #4]          // offsets[0][1]
   1347        calc_offset     r10, r11, r10, 0,   0
   1348        add_offset      r11, r10, r11, r5,  r9
   1349        ldr             r10, [r6, #12]         // offsets[1][1]
   1350        calc_offset     r10, r8,  r10, 0,   0
   1351        add_offset      r8,  r10, r8,  r5,  r9
   1352        ldr             r6,  [r6]              // offsets[0][0]
   1353        calc_offset     r6,  lr,  r6,  0,   0
   1354        add_offset      r5,  r6,  lr,  r5,  r9
   1355 
   1356        add             r4,  r4,  #32*2        // grain_lut += FG_BLOCK_SIZE * bx
   1357        add             r6,  r11, r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
   1358 
        // Select the (ox, oy) loop variant from the low two bits of type.
   1359        ldr             r10, [sp, #120]        // type
   1360        adr             r11, L(fgy_loop_tbl)
   1361 
   1362        tst             r10, #1
   1363        ldr             r10, [r11, r10, lsl #2]
   1364 
   1365        add             r8,  r8,  r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
   1366        add             r8,  r8,  #32*2        // grain_lut += FG_BLOCK_SIZE * bx
   1367 
   1368        add             r11, r11, r10
   1369 
        // With vertical overlap, only the first 2 rows are blended; the loop
        // re-enters the no-oy variant afterwards for the remaining rows.
   1370        beq             1f
   1371        // y overlap
   1372        vdup.16         d14, d24[0]
   1373        vdup.16         d15, d24[1]
   1374        mov             r10, r7                // backup actual h
   1375        mov             r7,  #2
   1376 1:
        // Strides are pre-decremented by 32 bytes because the loop loads
        // each row as two 32-byte halves (post-increment + stride step).
   1377        sub             r2,  r2,  #32          // src_stride   -= 32
   1378        sub             r9,  r9,  #32          // grain_stride -= 32
   1379        bx              r11
   1380 endfunc
   1381 
// Core 32-wide luma loop, one variant per (ox, oy) overlap combination,
// entered via the jump table below from fgy_32x32_16bpc_neon.
// Register contract on entry:
//   r0=dst, r1=src, r2=src_stride-32, r4=left ("old") grain,
//   r5=grain, r6=top grain, r8=top-left ("top old") grain,
//   r9=grain_stride-32, r7=rows to process, r3=scaling LUT,
//   q6=grain_max, d24/d25=overlap coeffs, d14/d15=vertical overlap
//   weights, q13=15-scaling_shift, q14/q15=output clamp min/max.
   1382 function fgy_loop_neon
   1383 L(fgy_loop_tbl):
   1384        .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
   1385        .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
   1386        .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
   1387        .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
   1388 
// Emit one loop body; \ox/\oy select horizontal/vertical overlap blending.
   1389 .macro fgy ox, oy
   1390 L(loop_\ox\oy):
   1391 1:
   1392 .if \ox
   1393        vld1.16         {d0},       [r4],       r9 // grain_lut old
   1394 .endif
   1395 .if \oy
   1396        vld1.16         {q2,  q3},  [r6]!          // grain_lut top
   1397 .endif
   1398 .if \ox && \oy
   1399        vld1.16         {d2},       [r8],       r9 // grain_lut top old
   1400 .endif
   1401 .if \oy
   1402        vld1.16         {q4,  q5},  [r6],       r9 // grain_lut top
   1403 .endif
   1404 .if !\ox && !\oy
   1405        vld1.16         {q0,  q1},  [r1, :128]!    // src
   1406 .endif
   1407        vld1.16         {q8,  q9},  [r5]!          // grain_lut
   1408 .if !\ox && !\oy
   1409        vld1.16         {q2,  q3},  [r1, :128], r2 // src
   1410 .endif
   1411 .if !\oy
   1412        vmvn.i16        q5,  #0xf000               // 0x0fff
   1413 .endif
   1414        vld1.16         {q10, q11}, [r5],       r9 // grain_lut
   1415 
        // Horizontal overlap: blend old/new first column pair,
        // (old*27 + new*17 + 16) >> 5 via d24/d25.
   1416 .if \ox
   1417        add             r4,  r4,  #32
   1418        vmull.s16       q0,  d0,  d24
   1419        vmlal.s16       q0,  d16, d25
   1420 .endif
   1421 
   1422 .if \oy
   1423 .if \ox
        // Both overlaps: also blend the top row's old/new columns, then
        // clamp each blended result to +/- grain_max before the vertical mix.
   1424        add             r8,  r8,  #32
   1425        vmull.s16       q1,  d2,  d24
   1426        vmlal.s16       q1,  d4,  d25
   1427        vqrshrn.s32     d16, q0,  #5
   1428        vmvn            d0,  d12                   // grain_min
   1429        vqrshrn.s32     d4,  q1,  #5
   1430        vmin.s16        d16, d16, d12
   1431        vmin.s16        d4,  d4,  d12
   1432        vmax.s16        d16, d16, d0
   1433        vmax.s16        d4,  d4,  d0
   1434 .endif
   1435 
        // Vertical overlap: top*d14 + cur*d15, rounded >> 5, clamped to
        // +/- grain_max; src is loaded in the gaps to hide latency.
   1436        vmull.s16       q0,  d4,  d14
   1437        vmull.s16       q1,  d5,  d14
   1438        vmull.s16       q2,  d6,  d14
   1439        vmull.s16       q3,  d7,  d14
   1440        vmlal.s16       q0,  d16, d15
   1441        vmlal.s16       q1,  d17, d15
   1442        vmlal.s16       q2,  d18, d15
   1443        vmlal.s16       q3,  d19, d15
   1444        vmull.s16       q8,  d20, d15
   1445        vmull.s16       q9,  d21, d15
   1446        vmull.s16       q10, d22, d15
   1447        vmull.s16       q11, d23, d15
   1448        vmlal.s16       q8,  d8,  d14
   1449        vmlal.s16       q9,  d9,  d14
   1450        vmlal.s16       q10, d10, d14
   1451        vmlal.s16       q11, d11, d14
   1452        vmvn            q4,  q6                   // grain_min
   1453        vqrshrn.s32     d0,  q0,  #5
   1454        vqrshrn.s32     d1,  q1,  #5
   1455        vqrshrn.s32     d2,  q2,  #5
   1456        vqrshrn.s32     d3,  q3,  #5
   1457        vqrshrn.s32     d4,  q8,  #5
   1458        vqrshrn.s32     d5,  q9,  #5
   1459        vqrshrn.s32     d6,  q10, #5
   1460        vqrshrn.s32     d7,  q11, #5
   1461        vmin.s16        q8,  q0,  q6
   1462        vmin.s16        q9,  q1,  q6
   1463        vld1.16         {q0,  q1},  [r1, :128]!    // src
   1464        vmin.s16        q10, q2,  q6
   1465        vmin.s16        q11, q3,  q6
   1466        vmax.s16        q8,  q8,  q4
   1467        vmax.s16        q9,  q9,  q4
   1468        vld1.16         {q2,  q3},  [r1, :128], r2 // src
   1469        vmvn.i16        q5,  #0xf000               // 0x0fff
   1470        vmax.s16        q10, q10, q4
   1471        vmax.s16        q11, q11, q4
   1472 .elseif \ox
   1473        vmvn            d4,  d12                   // grain_min
   1474        vqrshrn.s32     d16, q0,  #5
   1475        vld1.16         {q0,  q1},  [r1, :128]!    // src
   1476        vmin.s16        d16, d16, d12
   1477        vmax.s16        d16, d16, d4
   1478        vld1.16         {q2,  q3},  [r1, :128], r2 // src
   1479 .endif
   1480 
   1481        // Make sure that uninitialized pixels out of range past the right
   1482        // edge are in range; their actual values shouldn't matter.
   1483        vand            q0,  q0,  q5
   1484        vand            q1,  q1,  q5
   1485        vand            q2,  q2,  q5
   1486        vand            q3,  q3,  q5
   1487 
        // d8-d11 = scaling[src] for all 32 pixels.
   1488        bl              gather32_neon
   1489 
   1490 .if \ox || \oy
   1491        vpush           {q6-q7}
   1492 .endif
   1493 
   1494        vmovl.u8        q6,  d8        // scaling
   1495        vmovl.u8        q7,  d9
   1496        vmovl.u8        q4,  d10
   1497        vmovl.u8        q5,  d11
   1498 
   1499        vshl.u16        q6,  q6,  q13  // scaling << (15 - scaling_shift)
   1500        vshl.u16        q7,  q7,  q13
   1501        vshl.u16        q4,  q4,  q13
   1502        vshl.u16        q5,  q5,  q13
   1503 
   1504        vqrdmulh.s16    q8,  q8,  q6   // round2((scaling << (15 - scaling_shift) * grain, 15)
   1505        vqrdmulh.s16    q9,  q9,  q7
   1506        vqrdmulh.s16    q10, q10, q4
   1507        vqrdmulh.s16    q11, q11, q5
   1508 
   1509 .if \ox || \oy
   1510        vpop            {q6-q7}
   1511 .endif
   1512 
   1513        vqadd.s16       q0,  q0,  q8   // *src + noise
   1514        vqadd.s16       q1,  q1,  q9
   1515        vqadd.s16       q2,  q2,  q10
   1516        vqadd.s16       q3,  q3,  q11
   1517 
        // Clamp the result to [q14, q15] (video range or full range).
   1518        vmax.s16        q0,  q0,  q14
   1519        vmax.s16        q1,  q1,  q14
   1520        vmax.s16        q2,  q2,  q14
   1521        vmax.s16        q3,  q3,  q14
   1522        vmin.s16        q0,  q0,  q15
   1523        vmin.s16        q1,  q1,  q15
   1524        vmin.s16        q2,  q2,  q15
   1525        vmin.s16        q3,  q3,  q15
   1526 
   1527        vst1.16         {q0, q1}, [r0, :128]!    // dst
   1528        subs            r7,  r7,  #1
   1529 .if \oy
        // Switch to the second-row vertical overlap weights.
   1530        vdup.16         d14, d25[0]
   1531        vdup.16         d15, d25[1]
   1532 .endif
   1533        vst1.16         {q2, q3}, [r0, :128], r2 // dst
   1534        bgt             1b
   1535 
   1536 .if \oy
        // After the 2 blended rows, continue with the non-oy variant for
        // the rest of the block.
   1537        cmp             r10, #2
   1538        sub             r7,  r10, #2           // restore actual remaining h
   1539        bgt             L(loop_\ox\()0)
   1540 .endif
   1541        vpop            {q4-q7}
   1542        pop             {r4-r11,pc}
   1543 .endm
   1544 
   1545        fgy             0, 0
   1546        fgy             0, 1
   1547        fgy             1, 0
   1548        fgy             1, 1
   1549 endfunc
   1550 
   1551 // void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
   1552 //                                      const pixel *const src,
   1553 //                                      const ptrdiff_t stride,
   1554 //                                      const uint8_t scaling[SCALING_SIZE],
   1555 //                                      const Dav1dFilmGrainData *const data,
   1556 //                                      const entry grain_lut[][GRAIN_WIDTH],
   1557 //                                      const pixel *const luma_row,
   1558 //                                      const ptrdiff_t luma_stride,
   1559 //                                      const int offsets[][2],
   1560 //                                      const ptrdiff_t h, const ptrdiff_t uv,
   1561 //                                      const ptrdiff_t is_id,
   1562 //                                      const ptrdiff_t type,
   1563 //                                      const int bitdepth_max);
// Entry-point generator for the chroma film grain functions (see the C
// prototype comment above).
//   \layout : name suffix (420/422/444)
//   \sx,\sy : horizontal/vertical chroma subsampling (1 or 0)
//
// Loads the stack arguments and Dav1dFilmGrainData fields, derives the
// output clip range, and packs the scalar parameters into NEON lanes:
//   d30 = { uv_luma_mult, uv_mult, uv_offset << bitdepth_min_8, bitdepth_max }
//   d31 = { clip_min, clip_max, overlap y coeff [0], overlap y coeff [1] }
//   q13 = 15 - scaling_shift (broadcast)
//   q12 = overlap_coeffs_\sx
// It then computes the four grain_lut block pointers (r5 = current,
// r4 = left "old" column, r8 = top, r11 = top-left "old") and tail-jumps
// (bx r12) into the matching fguv_loop_sx{0,1} entry selected by the
// "type" argument; the loop code pops the registers pushed here.
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]   // data, grain_lut
        ldrd            r10, r11, [sp, #124]   // uv, is_id
        ldr             r6,       [sp, #136]   // bitdepth_max

        clz             r7,  r6
        rsb             r7,  r7,  #24          // bitdepth_min_8

        // Per-plane parameters, used by the !csfl (chroma scaled from
        // its own uv values) code paths.
        // !csfl
        add             r10, r4,  r10, lsl #2  // + 4*uv
        add             r12, r10, #FGD_UV_LUMA_MULT
        add             lr,  r10, #FGD_UV_MULT
        ldrh            r10, [r10, #FGD_UV_OFFSET] // uv_offset
        vld1.16         {d30[]},  [r12]        // uv_luma_mult
        lsl             r10, r10, r7           // uv_offset << bitdepth_min_8
        vld1.16         {d30[1]}, [lr]         // uv_mult

        ldr             lr,  [r4, #FGD_SCALING_SHIFT]
        ldr             r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        eor             lr,  lr,  #15          // 15 - scaling_shift

        vmov.16         d30[2], r10            // uv_offset << bitdepth_min_8

        cmp             r12, #0
        vdup.16         q13, lr                // 15 - scaling_shift

        beq             1f
        // clip: restricted (video) range, 16..240 for chroma,
        // 16..235 when is_id, scaled up by bitdepth_min_8
        cmp             r11, #0
        mov             r8,  #16
        mov             r9,  #240
        lsl             r8,  r8,  r7
        lsl             r9,  r9,  r7
        beq             2f
        // is_id
        mov             r9,  #235
        lsl             r9,  r9,  r7
        b               2f
1:
        // no clip: full range 0..bitdepth_max
        mov             r8,  #0
        mov             r9,  r6                // bitdepth_max
2:
        vmov.16         d30[3], r6             // bitdepth_max
        vdup.16         d31, r8                // clip_min

        mov             r10, #GRAIN_WIDTH*2    // grain_lut stride

        // Vertical overlap blend weights for the first 1-2 rows;
        // different weights are used with vertical subsampling.
.if \sy
        mov             r6,  #23
        mov             r7,  #22
.else
        mov             r6,  #27
        mov             r7,  #17
.endif
        vmov.16         d31[1], r9             // clip_max

        ldrd            r8,  r9,  [sp, #116]   // offsets, h

        // Skip the out-of-frame border of the grain LUT; the offsets
        // differ per subsampling.
        add             r5,  r5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
.if \sy
        add             r5,  r5,  r10, lsl #2  // grain_lut += 4 * grain_stride
        add             r5,  r5,  r10, lsl #1  // grain_lut += 2 * grain_stride
.else
        add             r5,  r5,  r10, lsl #3  // grain_lut += 8 * grain_stride
        add             r5,  r5,  r10          // grain_lut += grain_stride
.endif
        vmov.16         d31[2], r6             // overlap y [0]

        // Derive the four grain_lut base pointers from the seeded offsets:
        // r4 = left "old" column, lr -> r8 = top row, r11 = top-left "old",
        // r5 = current block.
        ldr             r12, [r8, #8]          // offsets[1][0]
        calc_offset     r12, r4,  r12, \sx, \sy
        add_offset      r4,  r12, r4,  r5,  r10

        ldr             r12, [r8, #4]          // offsets[0][1]
        calc_offset     r12, lr,  r12, \sx, \sy
        add_offset      lr,  r12, lr,  r5,  r10

        ldr             r12, [r8, #12]         // offsets[1][1]
        calc_offset     r12, r11, r12, \sx, \sy
        add_offset      r11, r12, r11, r5,  r10

        ldr             r8,  [r8]              // offsets[0][0]
        calc_offset     r8,  r12, r8,  \sx, \sy
        add_offset      r5,  r8,  r12, r5,  r10

        vmov.16         d31[3], r7             // overlap y [1]

        add             r4,  r4,  #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx
        add             r8,  lr,  r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             r11, r11, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx

        movrel_local    r12, overlap_coeffs_\sx
        ldr             lr,       [sp, #132]   // type
        ldrd            r6,  r7,  [sp, #108]   // luma_row, luma_stride

        vld1.16         {d24, d25}, [r12, :128] // overlap_coeffs

        movrel_local    r12, L(fguv_loop_sx\sx\()_tbl)
#if CONFIG_THUMB
        // This uses movrel_local instead of adr above, because the target
        // can be out of range for adr. But movrel_local leaves the thumb bit
        // set on COFF (but probably wouldn't if building for thumb on ELF),
        // thus try to clear the bit for robustness.
        bic             r12, r12, #1
#endif

        // type = csfl*4 + ox*2 + oy indexes the 8-entry jump table;
        // bit 0 (tst below) is the vertical-overlap flag.
        tst             lr,  #1
        ldr             lr,  [r12, lr,  lsl #2]

        add             r12, r12, lr

        beq             1f
        // y overlap: process the 2 (or 1 when vertically subsampled)
        // blended rows first; stash the rest of h in lr for the loop code.
        sub             lr,  r9,  #(2 >> \sy)  // backup remaining h
        mov             r9,  #(2 >> \sy)

1:
.if \sy
        add             r7,  r7,  r7           // luma_stride *= 2
.endif
        sub             r7,  r7,  #32          // luma_stride -= 32
        // luma_stride is pre-decremented because the loops advance the luma
        // pointer by 32 bytes with post-increment loads before applying it.

        bx              r12
endfunc
.endm
   1693 
   1694 fguv 420, 1, 1
   1695 fguv 422, 1, 0
   1696 fguv 444, 0, 0
   1697 
// Row loops for the non-horizontally-subsampled (444) chroma layout,
// entered via a tail-call from fguv_32x32_444_16bpc_neon.
// Register state on entry (set up by the fguv macro above):
//   r0=dst, r1=src, r2=src_stride, r4=grain_lut left "old" column,
//   r5=grain_lut, r6=luma_row, r7=luma_stride-32, r8=grain_lut top,
//   r9=rows to process now, r10=grain_lut stride, r11=grain_lut top "old",
//   lr=remaining h after the y-overlap rows (only when y overlap is used),
//   q12=overlap coeffs, q13=15-scaling_shift, d30/d31=packed parameters.
function fguv_loop_sx0_neon
L(fguv_loop_sx0_tbl):
        // Indexed by "type": bit 0 = y overlap, bit 1 = x overlap, bit 2 = csfl.
        .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB

// One loop specialization per (csfl, x-overlap, y-overlap) combination.
// Each iteration produces one row of 32 pixels (two 16-pixel halves).
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
        // The 32-pixel rows are loaded as two post-incremented halves, so
        // the strides are pre-decremented by the auto-incremented 32 bytes.
        sub             r2,  r2,  #32          // src_stride   -= 32
        sub             r10, r10, #32          // grain_stride -= 32
.if \oy
        mov             r12, lr                // r12 = remaining h after overlap rows
.endif
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy\()_loopstart):
1:
.if \ox
        vld1.16         {d0},       [r4],       r10 // grain_lut old
.endif
.if \oy
        vld1.16         {q2,  q3},  [r8]!           // grain_lut top
.endif
.if \ox && \oy
        vld1.16         {d2},       [r11],      r10 // grain_lut top old
.endif
.if !\ox && !\oy
        vld1.16         {q0,  q1},  [r6, :128]!     // luma
.endif
        vld1.16         {q8,  q9},  [r5]!           // grain_lut
.if \oy
        vld1.16         {q4,  q5},  [r8],       r10 // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q2,  q3},  [r6, :128], r7  // luma
.endif
.if \oy
        vdup.16         d28, d31[2]                 // overlap y coeff
        vdup.16         d29, d31[3]                 // overlap y coeff
.endif
        vld1.16         {q10, q11}, [r5],       r10 // grain_lut

.if \ox
        // Horizontal overlap: blend the "old" left column into the first
        // 4 grain values, q0 = round2(old*w0 + cur*w1, 5) (narrowed below).
        vdup.16         q7,  d30[3]                // bitdepth_max
        add             r4,  r4,  #32
        vmull.s16       q0,  d0,  d24
        vshr.u16        q7,  q7,  #1               // grain_max
        vmlal.s16       q0,  d16, d25
        vmvn            q6,  q7                    // grain_min
.endif

.if \oy
.if \ox
        // Blend the top-left "old" column as well, then clamp both
        // blended columns to [grain_min, grain_max].
        add             r11, r11, #32
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d4,  d4,  d14
        vmin.s16        d16, d16, d14
        vmax.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d12
.endif

        // Vertical overlap: grain = round2(top*coeff0 + cur*coeff1, 5),
        // then clamp to [grain_min, grain_max]. Luma loads are interleaved
        // to hide load latency.
        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
.if !\ox
        vdup.16         q7,  d30[3]                // bitdepth_max
.endif
        vmlal.s16       q0,  d16, d29
        vmlal.s16       q1,  d17, d29
        vmlal.s16       q2,  d18, d29
        vmlal.s16       q3,  d19, d29
.if !\ox
        vshr.u16        q7,  q7,  #1               // grain_max
.endif
        vmull.s16       q8,  d20, d29
        vmull.s16       q9,  d21, d29
        vmull.s16       q10, d22, d29
        vmull.s16       q11, d23, d29
.if !\ox
        vmvn            q6,  q7                    // grain_min
.endif
        vmlal.s16       q8,  d8,  d28
        vmlal.s16       q9,  d9,  d28
        vmlal.s16       q10, d10, d28
        vmlal.s16       q11, d11, d28
        vqrshrn.s32     d0,  q0,  #5
        vqrshrn.s32     d1,  q1,  #5
        vqrshrn.s32     d2,  q2,  #5
        vqrshrn.s32     d3,  q3,  #5
        vqrshrn.s32     d4,  q8,  #5
        vqrshrn.s32     d5,  q9,  #5
        vqrshrn.s32     d6,  q10, #5
        vqrshrn.s32     d7,  q11, #5
        vmin.s16        q8,  q0,  q7
        vmin.s16        q9,  q1,  q7
        vld1.16         {q0,  q1},  [r6, :128]!    // luma
        vmin.s16        q10, q2,  q7
        vmin.s16        q11, q3,  q7
        vmax.s16        q8,  q8,  q6
        vmax.s16        q9,  q9,  q6
        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
        vmax.s16        q10, q10, q6
        vmax.s16        q11, q11, q6
.elseif \ox
        // x overlap only: finish narrowing/clamping the blended column.
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0,  q1},  [r6, :128]!    // luma
        vmin.s16        d16, d16, d14
        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
        vmax.s16        d16, d16, d12
.endif

.if !\csfl
        // Chroma not scaled from luma: the scaling-table index is
        // clip(((luma*uv_luma_mult + src*uv_mult) >> 6) + uv_offset,
        //      0, bitdepth_max), computed here into q0-q3.
        vdup.16         d28, d30[0]   // uv_luma_mult
        vld1.16         {q4,  q5},  [r1, :128]! // src
        vdup.16         d29, d30[1]   // uv_mult
        vmull.s16       q6,  d0,  d28
        vmull.s16       q7,  d1,  d28
        vmull.s16       q0,  d2,  d28
        vmull.s16       q1,  d3,  d28
        vmlal.s16       q6,  d8,  d29
        vmlal.s16       q7,  d9,  d29
        vmlal.s16       q0,  d10, d29
        vmlal.s16       q1,  d11, d29
        vld1.16         {q4,  q5},  [r1, :128]  // src
        sub             r1,  r1,  #32           // rewind; src is reloaded after the gather
        vshrn.s32       d12, q6,  #6
        vshrn.s32       d13, q7,  #6
        vshrn.s32       d14, q0,  #6
        vshrn.s32       d15, q1,  #6
        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
        vmlal.s16       q0,  d8,  d29
        vmlal.s16       q1,  d9,  d29
        vmlal.s16       q2,  d10, d29
        vmlal.s16       q3,  d11, d29
        vdup.16         q14, d30[2]   // uv_offset
        vshrn.s32       d0,  q0,  #6
        vshrn.s32       d1,  q1,  #6
        vshrn.s32       d2,  q2,  #6
        vshrn.s32       d3,  q3,  #6
        vdup.16         q4,  d30[3]   // bitdepth_max
        vmov.i16        q5,  #0
        vadd.i16        q6,  q6,  q14
        vadd.i16        q7,  q7,  q14
        vadd.i16        q2,  q0,  q14
        vadd.i16        q3,  q1,  q14
        vmin.s16        q0,  q6,  q4
        vmin.s16        q1,  q7,  q4
        vmin.s16        q2,  q2,  q4
        vmin.s16        q3,  q3,  q4
        vmax.s16        q0,  q0,  q5
        vmax.s16        q1,  q1,  q5
        vmax.s16        q2,  q2,  q5
        vmax.s16        q3,  q3,  q5
.else
        // csfl: index the scaling table directly with the luma values.
        vdup.16         q14, d30[3]  // bitdepth_max
        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q14
        vand            q1,  q1,  q14
        vand            q2,  q2,  q14
        vand            q3,  q3,  q14
.endif

        // Gather scaling[] entries for the 32 index values in q0-q3;
        // the bytes come back in d8-d11.
        bl              gather32_neon

        vld1.16         {q0,  q1},  [r1, :128]!    // src

        vmovl.u8        q6,  d8        // scaling
        vmovl.u8        q7,  d9
        vmovl.u8        q4,  d10
        vmovl.u8        q5,  d11

        vld1.16         {q2,  q3},  [r1, :128], r2 // src

        vshl.u16        q6,  q6,  q13  // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13
        vshl.u16        q4,  q4,  q13
        vshl.u16        q5,  q5,  q13

        vqrdmulh.s16    q8,  q8,  q6   // round2((scaling << (15 - scaling_shift) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7
        vqrdmulh.s16    q10, q10, q4
        vqrdmulh.s16    q11, q11, q5


        vdup.16         q4,  d31[0]    // clip_min
        vdup.16         q5,  d31[1]    // clip_max

        vqadd.s16       q0,  q0,  q8   // *src + noise
        vqadd.s16       q1,  q1,  q9
        vqadd.s16       q2,  q2,  q10
        vqadd.s16       q3,  q3,  q11

.if \oy
        vmov.32         lr,  d25[0] // 2 first 16 bit coeffs from overlap x
.endif

        vmax.s16        q0,  q0,  q4
        vmax.s16        q1,  q1,  q4
        vmax.s16        q2,  q2,  q4
        vmax.s16        q3,  q3,  q4
        vmin.s16        q0,  q0,  q5
        vmin.s16        q1,  q1,  q5
        vmin.s16        q2,  q2,  q5
        vmin.s16        q3,  q3,  q5

        vst1.16         {q0, q1}, [r0, :128]! // dst

        subs            r9,  r9,  #1
.if \oy
        // Overwrite the y-overlap coeffs in d31[2..3] with the first two
        // x-overlap coeffs, used for the second overlapped row.
        vmov.32         d31[1], lr  // new coeffs for overlap y
.endif

        vst1.16         {q2, q3}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        // Overlapped rows done; continue with the non-y-overlap variant
        // of the same loop for the remaining rows (if any).
        cmp             r12, #0
        mov             r9,  r12               // restore actual remaining h
        bgt             L(fguv_loop_sx0_csfl\csfl\()_\ox\()0_loopstart)
.endif
        b               9f                     // shared epilogue below
.endm
        fguv_loop_sx0   0, 0, 0
        fguv_loop_sx0   0, 0, 1
        fguv_loop_sx0   0, 1, 0
        fguv_loop_sx0   0, 1, 1
        fguv_loop_sx0   1, 0, 0
        fguv_loop_sx0   1, 0, 1
        fguv_loop_sx0   1, 1, 0
        fguv_loop_sx0   1, 1, 1

// Shared epilogue for all variants above; restores what the fguv
// entry point pushed.
9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
   1944 
// Row loops for the horizontally subsampled chroma layouts (420/422),
// entered via a tail-call from fguv_32x32_4{20,22}_16bpc_neon.
// Register state on entry is the same as for fguv_loop_sx0_neon, but each
// iteration produces one row of 16 chroma pixels from 32 luma pixels
// (pairwise-averaged below).
function fguv_loop_sx1_neon
L(fguv_loop_sx1_tbl):
        // Indexed by "type": bit 0 = y overlap, bit 1 = x overlap, bit 2 = csfl.
        .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
        .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB

// One loop specialization per (csfl, x-overlap, y-overlap) combination.
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
.if \oy
        mov             r12, lr                // r12 = remaining h after overlap rows
.endif
1:
.if \ox
        vld1.16         {d0},       [r4],       r10 // grain_lut old
.endif
.if \ox && \oy
        vld1.16         {d2},       [r11],      r10 // grain_lut top old
.endif
.if \oy
        vld1.16         {q2,  q3},  [r8],       r10 // grain_lut top
.endif
.if !\ox && !\oy
        vld1.16         {q0,  q1},  [r6, :128]!     // luma
.endif
        vld1.16         {q8,  q9},  [r5],       r10 // grain_lut
.if \oy
        vdup.16         d28, d31[2]                 // overlap y coeff
        vdup.16         d29, d31[3]                 // overlap y coeff
.endif
.if !\ox && !\oy
        vld1.16         {q2,  q3},  [r6, :128], r7  // luma
.endif

.if \ox
        // Horizontal overlap: blend the "old" left column into the first
        // 4 grain values, q0 = old*w0 + cur*w1 (rounded/narrowed below).
        vdup.16         q7,  d30[3]                // bitdepth_max
        vmull.s16       q0,  d0,  d24
        vshr.u16        q7,  q7,  #1               // grain_max
        vmlal.s16       q0,  d16, d25
        vmvn            q6,  q7                    // grain_min
.endif

.if \oy
.if \ox
        // Blend the top-left "old" column as well, then clamp both
        // blended columns to [grain_min, grain_max].
        vmull.s16       q1,  d2,  d24
        vmlal.s16       q1,  d4,  d25
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d4,  q1,  #5
        vmin.s16        d4,  d4,  d14
        vmin.s16        d16, d16, d14
        vmax.s16        d4,  d4,  d12
        vmax.s16        d16, d16, d12
.endif

        // Vertical overlap: grain = round2(top*coeff0 + cur*coeff1, 5),
        // then clamp to [grain_min, grain_max].
        vmull.s16       q0,  d4,  d28
        vmull.s16       q1,  d5,  d28
        vmull.s16       q2,  d6,  d28
        vmull.s16       q3,  d7,  d28
.if !\ox
        vdup.16         q7,  d30[3]                // bitdepth_max
.endif
        vmlal.s16       q0,  d16, d29
        vmlal.s16       q1,  d17, d29
        vmlal.s16       q2,  d18, d29
        vmlal.s16       q3,  d19, d29
.if !\ox
        vshr.u16        q7,  q7,  #1               // grain_max
.endif
        vqrshrn.s32     d16, q0,  #5
        vqrshrn.s32     d17, q1,  #5
        vqrshrn.s32     d18, q2,  #5
        vqrshrn.s32     d19, q3,  #5
.if !\ox
        vmvn            q6,  q7                    // grain_min
.endif
        vld1.16         {q0,  q1},  [r6, :128]!    // luma
        vmin.s16        q8,  q8,  q7
        vmin.s16        q9,  q9,  q7
        vmax.s16        q8,  q8,  q6
        vmax.s16        q9,  q9,  q6
        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
.elseif \ox
        // x overlap only: finish narrowing/clamping the blended column.
        vqrshrn.s32     d16, q0,  #5
        vld1.16         {q0,  q1},  [r6, :128]!    // luma
        vmin.s16        d16, d16, d14
        vld1.16         {q2,  q3},  [r6, :128], r7 // luma
        vmax.s16        d16, d16, d12
.endif

        // Pairwise-average 32 luma pixels down to 16 (rounded >> 1),
        // for the horizontal 2x subsampling.
        vpadd.i16       d0,  d0,  d1
        vpadd.i16       d1,  d2,  d3
        vpadd.i16       d2,  d4,  d5
        vpadd.i16       d3,  d6,  d7
        vrshr.u16       q0,  q0,  #1
        vrshr.u16       q1,  q1,  #1
.if !\csfl
        // Chroma not scaled from luma: the scaling-table index is
        // clip(((luma*uv_luma_mult + src*uv_mult) >> 6) + uv_offset,
        //      0, bitdepth_max), computed here into q0/q1.
        vdup.16         d28, d30[0]   // uv_luma_mult
        vld1.16         {q2,  q3},  [r1, :128], r2 // src
        vdup.16         d29, d30[1]   // uv_mult
        vmull.s16       q6,  d0,  d28
        vmull.s16       q7,  d1,  d28
        vmull.s16       q0,  d2,  d28
        vmull.s16       q1,  d3,  d28
        vmlal.s16       q6,  d4,  d29
        vmlal.s16       q7,  d5,  d29
        vmlal.s16       q0,  d6,  d29
        vmlal.s16       q1,  d7,  d29
        vshrn.s32       d12, q6,  #6
        vshrn.s32       d13, q7,  #6
        vshrn.s32       d14, q0,  #6
        vshrn.s32       d15, q1,  #6
        vdup.16         q14, d30[2]   // uv_offset
        vdup.16         q4,  d30[3]   // bitdepth_max
        vmov.i16        q5,  #0
        vadd.i16        q6,  q6,  q14
        vadd.i16        q7,  q7,  q14
        vmin.s16        q0,  q6,  q4
        vmin.s16        q1,  q7,  q4
        vmax.s16        q0,  q0,  q5
        vmax.s16        q1,  q1,  q5
.else
        // csfl: index the scaling table directly with the averaged luma.
        vdup.16         q14, d30[3]  // bitdepth_max
        vld1.16         {q2,  q3},  [r1, :128], r2 // src

        // Make sure that uninitialized pixels out of range past the right
        // edge are in range; their actual values shouldn't matter.
        vand            q0,  q0,  q14
        vand            q1,  q1,  q14
.endif

        // Gather scaling[] entries for the 16 index values in q0/q1;
        // the bytes come back in d8/d9.
        bl              gather16_neon

        vmovl.u8        q6,  d8        // scaling
        vmovl.u8        q7,  d9

        vshl.u16        q6,  q6,  q13  // scaling << (15 - scaling_shift)
        vshl.u16        q7,  q7,  q13

        vqrdmulh.s16    q8,  q8,  q6   // round2((scaling << (15 - scaling_shift) * grain, 15)
        vqrdmulh.s16    q9,  q9,  q7


        vdup.16         q4,  d31[0]    // clip_min
        vdup.16         q5,  d31[1]    // clip_max

        vqadd.s16       q0,  q2,  q8   // *src + noise
        vqadd.s16       q1,  q3,  q9

.if \oy
        // Swap the two last coefficients of d31, place them first in d28
        vrev64.16       d28, d31
.endif

        vmax.s16        q0,  q0,  q4
        vmax.s16        q1,  q1,  q4
        vmin.s16        q0,  q0,  q5
        vmin.s16        q1,  q1,  q5

        subs            r9,  r9,  #1
.if \oy
        // Take the first two 16 bit coefficients of d28 and place them at the
        // end of d31
        vtrn.32         d31, d28
.endif

        vst1.16         {q0, q1}, [r0, :128], r2 // dst
        bgt             1b

.if \oy
        // Overlapped rows done; continue with the non-y-overlap variant
        // of the same loop for the remaining rows (if any).
        cmp             r12, #0
        mov             r9,  r12               // restore actual remaining h
        bgt             L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f                     // shared epilogue below
.endm
        fguv_loop_sx1   0, 0, 0
        fguv_loop_sx1   0, 0, 1
        fguv_loop_sx1   0, 1, 0
        fguv_loop_sx1   0, 1, 1
        fguv_loop_sx1   1, 0, 0
        fguv_loop_sx1   1, 0, 1
        fguv_loop_sx1   1, 1, 0
        fguv_loop_sx1   1, 1, 1

// Shared epilogue for all variants above; restores what the fguv
// entry point pushed.
9:
        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc