tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

filmgrain.S (67567B)


      1 /*
      2 * Copyright © 2021, VideoLAN and dav1d authors
      3 * Copyright © 2021, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 #include "src/arm/asm-offsets.h"
     31 
     32 #define GRAIN_WIDTH 82
     33 #define GRAIN_HEIGHT 73
     34 
     35 #define SUB_GRAIN_WIDTH 44
     36 #define SUB_GRAIN_HEIGHT 38
     37 
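         // increment_seed advances the 16 bit film grain LFSR state held in r2
         // by \steps bits at once (feedback taps at bits 0, 1, 3 and 12);
         // read_rand/read_shift_rand extract \bits wide values from that state,
         // used below as indices into the Gaussian sequence table.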
     38 .macro increment_seed steps, shift=1
     39        lsr             r11, r2,  #3
     40        lsr             r12, r2,  #12
     41        lsr             lr,  r2,  #1
     42        eor             r11, r2,  r11                     // (r >> 0) ^ (r >> 3)
     43        eor             r12, r12, lr                      // (r >> 12) ^ (r >> 1)
     44        eor             r11, r11, r12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
     45 .if \shift
     46        lsr             r2,  r2,  #\steps
     47 .endif
     48        and             r11, r11, #((1 << \steps) - 1)    // bit
     49 .if \shift
     50        orr             r2,  r2,  r11, lsl #(16 - \steps) // *state
     51 .else
     52        orr             r2,  r2,  r11, lsl #16            // *state
     53 .endif
     54 .endm
     55 
     56 .macro read_rand dest, bits, age
     57        ubfx            \dest,  r2,   #16 - \bits - \age, #\bits
     58 .endm
     59 
     60 .macro read_shift_rand dest, bits
     61        ubfx            \dest,  r2,   #17 - \bits, #\bits
     62        lsr             r2,  r2,  #1
     63 .endm
     64 
     65 // special calling convention:
     66 // r2 holds seed
     67 // r3 holds dav1d_gaussian_sequence
     68 // clobbers r11-r12
     69 // returns in d0-d1
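         // Gathers eight new grain samples: two 4-step LFSR advances yield eight
         // 11 bit indices, and the corresponding int16 entries of
         // dav1d_gaussian_sequence are loaded into d0-d1.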
     70 function get_gaussian_neon
     71        push            {r5-r6,lr}
     72        increment_seed  4
     73        read_rand       r5,  11,  3
     74        read_rand       r6,  11,  2
     75        add             r5,  r3,  r5,  lsl #1
     76        add             r6,  r3,  r6,  lsl #1
     77        vld1.16         {d0[0]}, [r5]
     78        read_rand       r5,  11,  1
     79        vld1.16         {d0[1]}, [r6]
     80        add             r5,  r3,  r5,  lsl #1
     81        read_rand       r6, 11,  0
     82        increment_seed  4
     83        add             r6,  r3,  r6,  lsl #1
     84        vld1.16         {d0[2]}, [r5]
     85        read_rand       r5,  11,  3
     86        vld1.16         {d0[3]}, [r6]
     87        add             r5,  r3,  r5,  lsl #1
     88        read_rand       r6,  11,  2
     89        vld1.16         {d1[0]}, [r5]
     90        add             r6,  r3,  r6,  lsl #1
     91        read_rand       r5,  11,  1
     92        vld1.16         {d1[1]}, [r6]
     93        read_rand       r6,  11,  0
     94        add             r5,  r3,  r5,  lsl #1
     95        add             r6,  r3,  r6,  lsl #1
     96        vld1.16         {d1[2]}, [r5]
     97        vld1.16         {d1[3]}, [r6]
     98        pop             {r5-r6,pc}
     99 endfunc
    100 
    101 .macro get_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
    102        bl              get_gaussian_neon
    103        vrshl.s16       q0,  q0,  q15
    104        vmovn.i16       \r0, q0
    105        bl              get_gaussian_neon
    106        vrshl.s16       q0,  q0,  q15
    107        vmovn.i16       \r1, q0
    108        bl              get_gaussian_neon
    109        vrshl.s16       q0,  q0,  q15
    110        vmovn.i16       \r2, q0
    111        bl              get_gaussian_neon
    112        vrshl.s16       q0,  q0,  q15
    113        vmovn.i16       \r3, q0
    114        bl              get_gaussian_neon
    115        vrshl.s16       q0,  q0,  q15
    116        vmovn.i16       \r4, q0
    117        bl              get_gaussian_neon
    118        vrshl.s16       q0,  q0,  q15
    119        vmovn.i16       \r5, q0
    120        bl              get_gaussian_neon
    121        vrshl.s16       q0,  q0,  q15
    122        vmovn.i16       \r6, q0
    123        bl              get_gaussian_neon
    124        vrshl.s16       q0,  q0,  q15
    125        vmovn.i16       \r7, q0
    126        bl              get_gaussian_neon
    127        vrshl.s16       q0,  q0,  q15
    128        vmovn.i16       \r8, q0
    129        bl              get_gaussian_neon
    130        vrshl.s16       q0,  q0,  q15
    131        vmovn.i16       \r9, q0
    132        increment_seed  2
    133        read_rand       r11, 11,  1
    134        read_rand       r12, 11,  0
    135        add             r11, r3,  r11, lsl #1
    136        add             r12, r3,  r12, lsl #1
    137        vld1.16         {d0[0]}, [r11]
    138        vld1.16         {d0[1]}, [r12]
    139        vrshl.s16       d0,  d0,  d30
    140        vmovn.i16       \r10, q0
    141 .endm
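         // get_grain_row fills one full GRAIN_WIDTH (82) entry row: ten calls to
         // get_gaussian_neon provide 80 samples and a final 2-step LFSR advance
         // supplies the remaining two.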
    142 
    143 .macro store_grain_row r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10
    144        vst1.16         {\r0, \r1, \r2, \r3}, [r0]!
    145        vst1.16         {\r4, \r5, \r6, \r7}, [r0]!
    146        vst1.16         {\r8, \r9},           [r0]!
    147        vst1.16         {\r10[0]},            [r0]!
    148 .endm
    149 
    150 .macro get_grain_row_44 r0, r1, r2, r3, r4, r5
    151        bl              get_gaussian_neon
    152        vrshl.s16       q0,  q0,  q15
    153        vmovn.i16       \r0, q0
    154        bl              get_gaussian_neon
    155        vrshl.s16       q0,  q0,  q15
    156        vmovn.i16       \r1, q0
    157        bl              get_gaussian_neon
    158        vrshl.s16       q0,  q0,  q15
    159        vmovn.i16       \r2, q0
    160        bl              get_gaussian_neon
    161        vrshl.s16       q0,  q0,  q15
    162        vmovn.i16       \r3, q0
    163        bl              get_gaussian_neon
    164        vrshl.s16       q0,  q0,  q15
    165        vmovn.i16       \r4, q0
    166        increment_seed  4
    167        read_rand       r11, 11,  3
    168        read_rand       r12, 11,  2
    169        add             r11, r3,  r11, lsl #1
    170        add             r12, r3,  r12, lsl #1
    171        vld1.16         {d0[]}, [r11]
    172        read_rand       r11, 11,  1
    173        vld1.16         {d0[1]}, [r12]
    174        add             r11, r3,  r11, lsl #1
    175        read_rand       r12, 11,  0
    176        vld1.16         {d0[2]}, [r11]
    177        add             r12, r3,  r12, lsl #1
    178        vld1.16         {d0[3]}, [r12]
    179        vrshl.s16       d0,  d0,  d30
    180        vmovn.i16       \r5, q0
    181 .endm
    182 
    183 .macro store_grain_row_44 r0, r1, r2, r3, r4, r5
    184        vst1.16         {\r0, \r1, \r2, \r3}, [r0]!
    185        vst1.16         {\r4, \r5},           [r0]
    186        add             r0,  r0,  #GRAIN_WIDTH-32
    187 .endm
    188 
    189 function get_grain_2_neon
    190        push            {r11,lr}
    191        increment_seed  2
    192        read_rand       r11, 11,  1
    193        read_rand       r12, 11,  0
    194        add             r11, r3,  r11, lsl #1
    195        add             r12, r3,  r12, lsl #1
    196        vld1.16         {d0[0]}, [r11]
    197        vld1.16         {d0[1]}, [r12]
    198        vrshl.s16       d0,  d0,  d30
    199        vmovn.i16       d0,  q0
    200        pop             {r11,pc}
    201 endfunc
    202 
    203 .macro get_grain_2 dst
    204        bl              get_grain_2_neon
    205 .ifnc \dst, d0
    206        vmov            \dst, d0
    207 .endif
    208 .endm
    209 
    210 // r1 holds the number of entries to produce
    211 // r6, r8 and r10 hold the previous output entries
    212 // q0 holds the vector of produced entries
    213 // q1 holds the input vector of sums from above
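         // Each loop iteration below produces one grain sample: the 32 bit sum from
         // the rows above (bottom word of q1) plus the AR coefficients times the
         // previous outputs is rounded and shifted right by ar_coeff_shift (r7),
         // the fresh Gaussian sample is rounded and shifted by 4 + grain_scale_shift
         // (r9), and their sum is clamped to [-128, 127] and shifted into q0.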
    214 .macro output_lag n
    215 function output_lag\n\()_neon
    216        push            {r0, lr}
    217 .if \n == 1
    218        mov             lr,  #-128
    219 .else
    220        mov             r0,  #1
    221        mov             lr,  #1
    222        sub             r7,  r7,  #1
    223        sub             r9,  r9,  #1
    224        lsl             r0,  r0,  r7
    225        lsl             lr,  lr,  r9
    226        add             r7,  r7,  #1
    227        add             r9,  r9,  #1
    228 .endif
    229 1:
    230        read_shift_rand r12, 11
    231        vmov.32         r11, d2[0]
    232        lsl             r12, r12, #1
    233        vext.8          q0,  q0,  q0,  #1
    234        ldrsh           r12, [r3, r12]
    235 .if \n == 1
    236        mla             r11, r6,  r4,  r11        // sum (above) + *coeff * prev output
    237        add             r6,  r11, r8              // 1 << (ar_coeff_shift - 1)
    238        add             r12, r12, r10
    239        asr             r6,  r6,  r7              // >> ar_coeff_shift
    240        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
    241        add             r6,  r6,  r12
    242        cmp             r6,  r5
    243 .elseif \n == 2
    244        mla             r11, r8,  r4,  r11        // sum (above) + *coeff * prev output 1
    245        mla             r11, r6,  r10, r11        // += *coeff * prev output 2
    246        mov             r8,  r6
    247        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
    248        add             r12, r12, lr              // 1 << (4 + grain_scale_shift - 1)
    249        asr             r6,  r6,  r7              // >> ar_coeff_shift
    250        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
    251        add             r6,  r6,  r12
    252        push            {lr}
    253        cmp             r6,  r5
    254        mov             lr,  #-128
    255 .else
    256        push            {r1-r3}
    257        sbfx            r1,  r4,  #0,  #8
    258        sbfx            r2,  r4,  #8,  #8
    259        sbfx            r3,  r4,  #16, #8
    260        mla             r11, r10, r1,  r11        // sum (above) + *coeff * prev output 1
    261        mla             r11, r8,  r2,  r11        // sum (above) + *coeff * prev output 2
    262        mla             r11, r6,  r3,  r11        // += *coeff * prev output 3
    263        pop             {r1-r3}
    264        mov             r10, r8
    265        mov             r8,  r6
    266 
    267        add             r6,  r11, r0              // 1 << (ar_coeff_shift - 1)
    268        add             r12, r12, lr              // 1 << (4 + grain_scale_shift - 1)
    269        asr             r6,  r6,  r7              // >> ar_coeff_shift
    270        asr             r12, r12, r9              // >> (4 + grain_scale_shift)
    271        add             r6,  r6,  r12
    272        push            {lr}
    273        cmp             r6,  r5
    274        mov             lr,  #-128
    275 .endif
    276        it              gt
    277        movgt           r6,  r5
    278        cmp             r6,  lr
    279        it              lt
    280        movlt           r6,  lr
    281 .if \n >= 2
    282        pop             {lr}
    283 .endif
    284        subs            r1,  r1,  #1
    285        vext.8          q1,  q1,  q1,  #4
    286        vmov.8          d1[7], r6
    287        bgt             1b
    288        pop             {r0, pc}
    289 endfunc
    290 .endm
    291 
    292 output_lag 1
    293 output_lag 2
    294 output_lag 3
    295 
    296 
    297 function sum_lag1_above_neon
    298        vmull.s8        q2,  d6,  d28
    299        vmull.s8        q3,  d7,  d28
    300        vmull.s8        q4,  d0,  d27
    301        vmull.s8        q5,  d1,  d27
    302 
    303        vaddl.s16       q0,  d4,  d8
    304        vaddl.s16       q2,  d5,  d9
    305        vaddl.s16       q4,  d6,  d10
    306        vaddl.s16       q5,  d7,  d11
    307 
    308        vmull.s8        q3,  d3,  d29
    309        vmull.s8        q1,  d2,  d29
    310 
    311        vaddw.s16       q4,  q4,  d6
    312        vaddw.s16       q5,  q5,  d7
    313        vaddw.s16       q3,  q2,  d3
    314        vaddw.s16       q2,  q0,  d2
    315        bx              lr
    316 endfunc
    317 
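         // sum_lag_n_body expands to the body of one 16 sample chunk (15 or 9 at
         // the right edge) of an AR filtered grain row: the sum_*_above helpers
         // accumulate the weighted samples of the rows above, for chroma the
         // collocated luma grain (averaged 2x2 for 4:2:0 and 2x1 for 4:2:2) is
         // folded in with its own coefficient, and output_lag*_neon then produces
         // the new samples one at a time.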
    318 .macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
    319 .ifc \lag\()_\edge, lag3_left
    320        bl              sum_lag3_left_above_neon
    321 .else
    322        bl              sum_\lag\()_above_neon
    323 .endif
    324 .ifc \type, uv_420
    325        vpush           {q6-q7}
    326        add             r12, r11, #GRAIN_WIDTH
    327        vld1.16         {q0, q1}, [r11]!
    328        vld1.16         {q6, q7}, [r12]!
    329        vpaddl.s8       q0,  q0
    330        vpaddl.s8       q1,  q1
    331        vpaddl.s8       q6,  q6
    332        vpaddl.s8       q7,  q7
    333        vadd.i16        q0,  q0,  q6
    334        vadd.i16        q1,  q1,  q7
    335        vpop            {q6-q7}
    336        vrshrn.s16      d0,  q0,  #2
    337        vrshrn.s16      d1,  q1,  #2
    338 .endif
    339 .ifc \type, uv_422
    340        vld1.8          {q0, q1}, [r11]!
    341        vpaddl.s8       q0,  q0
    342        vpaddl.s8       q1,  q1
    343        vrshrn.s16      d0,  q0,  #1
    344        vrshrn.s16      d1,  q1,  #1
    345 .endif
    346 .ifc \type, uv_444
    347        vld1.8          {q0}, [r11]!
    348 .endif
    349 .if \uv_layout
    350 .ifnb \uv_coeff
    351        vdup.8          d13, \uv_coeff
    352 .endif
    353        vmull.s8        q1,  d0,  d13
    354        vmull.s8        q0,  d1,  d13
    355        vaddw.s16       q2,  q2,  d2
    356        vaddw.s16       q3,  q3,  d3
    357        vaddw.s16       q4,  q4,  d0
    358        vaddw.s16       q5,  q5,  d1
    359 .endif
    360 .if \uv_layout && \elems == 16
    361        b               sum_\lag\()_y_\edge\()_start
    362 .elseif \uv_layout == 444 && \elems == 15
    363        b               sum_\lag\()_y_\edge\()_start
    364 .elseif \uv_layout == 422 && \elems == 9
    365        b               sum_\lag\()_uv_420_\edge\()_start
    366 .else
    367 sum_\lag\()_\type\()_\edge\()_start:
    368        push            {r11}
    369 .ifc \edge, left
    370        increment_seed  4
    371        read_rand       r11, 11,  3
    372        read_rand       r12, 11,  2
    373        add             r11, r3,  r11, lsl #1
    374        add             r12, r3,  r12, lsl #1
    375        vld1.16         {d1[1]}, [r11]
    376        read_rand       r11, 11,  1
    377        vld1.16         {d1[2]}, [r12]
    378        add             r11, r3,  r11, lsl #1
    379        vld1.16         {d1[3]}, [r11]
    380        lsl             r2,  r2,  #1             // shift back the state as if we'd done increment_seed with shift=0
    381        vrshl.s16       d1,  d1,  d30
    382        vmovn.i16       d1,  q0
    383        vext.8          q2,  q2,  q2,  #12
    384 .ifc \lag, lag3
    385        vmov.s8         r10, d1[5]
    386 .endif
    387 .ifnc \lag, lag1
    388        vmov.s8         r8,  d1[6]
    389 .endif
    390        vmov.s8         r6,  d1[7]
    391 
    392        vmov            q1,  q2
    393        mov             r1,  #1
    394        bl              output_\lag\()_neon
    395 .else
    396        increment_seed  4, shift=0
    397        vmov            q1,  q2
    398        mov             r1,  #4
    399        bl              output_\lag\()_neon
    400 .endif
    401 
    402        increment_seed  4, shift=0
    403        vmov            q1,  q3
    404        mov             r1,  #4
    405        bl              output_\lag\()_neon
    406 
    407        increment_seed  4, shift=0
    408        vmov            q1,  q4
    409 .if \elems == 9
    410        mov             r1,  #1
    411        bl              output_\lag\()_neon
    412        lsr             r2,  r2,  #3
    413 
    414        read_rand       r11, 11,  2
    415        read_rand       r12, 11,  1
    416        add             r11, r3,  r11, lsl #1
    417        add             r12, r3,  r12, lsl #1
    418        vld1.16         {d2[0]}, [r11]
    419        read_rand       r11, 11,  0
    420        vld1.16         {d2[1]}, [r12]
    421        add             r11, r3,  r11, lsl #1
    422        vld1.16         {d2[2]}, [r11]
    423        vrshl.s16       d2,  d2,  d30
    424        vmovn.i16       d2,  q1
    425        vext.8          q0,  q0,  q1,  #7
    426 .else
    427        mov             r1,  #4
    428        bl              output_\lag\()_neon
    429 
    430        increment_seed  4, shift=0
    431        vmov            q1,  q5
    432 
    433 .ifc \edge, right
    434        mov             r1,  #3
    435        bl              output_\lag\()_neon
    436        read_shift_rand r11, 11
    437        add             r11, r3,  r11, lsl #1
    438        vld1.16         {d2[0]}, [r11]
    439        vrshl.s16       d2,  d2,  d30
    440        vext.8          q0,  q0,  q1,  #1
    441 .else
    442        mov             r1,  #4
    443        bl              output_\lag\()_neon
    444 .endif
    445 .endif
    446 .if \store
    447        vst1.8          {q0}, [r0]!
    448 .endif
    449        pop             {r11}
    450        pop             {r1, pc}
    451 .endif
    452 .endm
    453 
    454 .macro sum_lag1_func type, uv_layout, edge, elems=16
    455 function sum_\type\()_lag1_\edge\()_neon
    456        push            {r1, lr}
    457        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems, store=0
    458 endfunc
    459 .endm
    460 
    461 sum_lag1_func y,      0,   left
    462 sum_lag1_func y,      0,   mid
    463 sum_lag1_func y,      0,   right, 15
    464 sum_lag1_func uv_444, 444, left
    465 sum_lag1_func uv_444, 444, mid
    466 sum_lag1_func uv_444, 444, right, 15
    467 sum_lag1_func uv_422, 422, left
    468 sum_lag1_func uv_422, 422, mid
    469 sum_lag1_func uv_422, 422, right, 9
    470 sum_lag1_func uv_420, 420, left
    471 sum_lag1_func uv_420, 420, mid
    472 sum_lag1_func uv_420, 420, right, 9
    473 
    474 .macro sum_lag1 type, dst, left, mid, right, edge=mid
    475        vmov            q3,  \mid
    476        vext.8          q0,  \left, \mid,   #15
    477        vext.8          q1,  \mid,  \right, #1
    478        bl              sum_\type\()_lag1_\edge\()_neon
    479        vmov            \dst, q0
    480 .endm
    481 
    482 .macro sum_y_lag1 dst, left, mid, right, edge=mid
    483        sum_lag1        y, \dst, \left, \mid, \right, \edge
    484 .endm
    485 
    486 .macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
    487        sum_lag1        uv_444, \dst, \left, \mid, \right, \edge
    488 .endm
    489 
    490 .macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
    491        sum_lag1        uv_422, \dst, \left, \mid, \right, \edge
    492 .endm
    493 
    494 .macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
    495        sum_lag1        uv_420, \dst, \left, \mid, \right, \edge
    496 .endm
    497 
    498 
    499 function sum_lag2_above_neon
    500        push            {lr}
    501        sub             r12, r0,  #2*GRAIN_WIDTH - 16
    502        sub             lr,  r0,  #1*GRAIN_WIDTH - 16
    503        vld1.8          {q10}, [r12] // load top right
    504        vld1.8          {q13}, [lr]
    505 
    506        vext.8          q6,  q8,  q9,  #14 // top left, top mid
    507        vdup.8          d14, d28[0]
    508        vext.8          q8,  q8,  q9,  #15
    509        vdup.8          d15, d28[1]
    510 
    511        vmull.s8        q0,  d12, d14
    512        vmull.s8        q1,  d13, d14
    513        vmull.s8        q6,  d16, d15
    514        vmull.s8        q8,  d17, d15
    515 
    516        vaddl.s16       q2,  d0,  d12
    517        vaddl.s16       q3,  d1,  d13
    518        vaddl.s16       q4,  d2,  d16
    519        vaddl.s16       q5,  d3,  d17
    520 
    521        vext.8          q6,  q9,  q10, #1  // top mid, top right
    522        vdup.8          d14, d28[3]
    523        vext.8          q8,  q9,  q10, #2
    524        vdup.8          d15, d28[4]
    525 
    526        vmull.s8        q0,  d12, d14
    527        vmull.s8        q1,  d13, d14
    528        vmull.s8        q6,  d16, d15
    529        vmull.s8        q8,  d17, d15
    530 
    531        vaddl.s16       q7,  d0,  d12
    532        vaddl.s16       q0,  d1,  d13
    533        vaddl.s16       q6,  d2,  d16
    534        vaddl.s16       q1,  d3,  d17
    535 
    536        vadd.i32        q2,  q2,  q7
    537        vadd.i32        q3,  q3,  q0
    538        vadd.i32        q4,  q4,  q6
    539        vadd.i32        q5,  q5,  q1
    540 
    541        vext.8          q6,  q11, q12, #14 // top left, top mid
    542        vdup.8          d14, d28[5]
    543        vext.8          q8,  q11, q12, #15
    544        vdup.8          d15, d28[6]
    545 
    546        vmull.s8        q0,  d12, d14
    547        vmull.s8        q1,  d13, d14
    548        vmull.s8        q6,  d16, d15
    549        vmull.s8        q8,  d17, d15
    550 
    551        vaddl.s16       q7,  d0,  d12
    552        vaddl.s16       q0,  d1,  d13
    553        vaddl.s16       q6,  d2,  d16
    554        vaddl.s16       q1,  d3,  d17
    555 
    556        vadd.i32        q2,  q2,  q7
    557        vadd.i32        q3,  q3,  q0
    558        vadd.i32        q4,  q4,  q6
    559        vadd.i32        q5,  q5,  q1
    560 
    561        vext.8          q6,  q12, q13, #1  // top mid, top right
    562        vdup.8          d14, d29[0]
    563        vext.8          q8,  q12, q13, #2
    564        vdup.8          d15, d29[1]
    565 
    566        vmull.s8        q0,  d12, d14
    567        vmull.s8        q1,  d13, d14
    568        vmull.s8        q6,  d16, d15
    569        vmull.s8        q8,  d17, d15
    570 
    571        vaddl.s16       q7,  d0,  d12
    572        vaddl.s16       q0,  d1,  d13
    573        vaddl.s16       q6,  d2,  d16
    574        vaddl.s16       q1,  d3,  d17
    575 
    576        vadd.i32        q2,  q2,  q7
    577        vadd.i32        q3,  q3,  q0
    578        vadd.i32        q4,  q4,  q6
    579        vadd.i32        q5,  q5,  q1
    580 
    581        vdup.8          d14, d28[2]
    582        vdup.8          d15, d28[7]
    583 
    584        vmull.s8        q0,  d18, d14
    585        vmull.s8        q1,  d19, d14
    586        vmull.s8        q6,  d24, d15
    587        vmull.s8        q8,  d25, d15
    588 
    589        vaddl.s16       q7,  d0,  d12
    590        vaddl.s16       q0,  d1,  d13
    591        vaddl.s16       q6,  d2,  d16
    592        vaddl.s16       q1,  d3,  d17
    593 
    594        vmov            q8,  q9
    595        vmov            q9,  q10
    596 
    597        vadd.i32        q2,  q2,  q7
    598        vadd.i32        q3,  q3,  q0
    599        vadd.i32        q4,  q4,  q6
    600        vadd.i32        q5,  q5,  q1
    601 
    602        vmov            q11, q12
    603        vmov            q12, q13
    604 
    605        pop             {pc}
    606 endfunc
    607 
    608 .macro sum_lag2_func type, uv_layout, edge, elems=16
    609 function sum_\type\()_lag2_\edge\()_neon
    610        push            {r1, lr}
    611 .ifc \edge, left
    612        sub             r12, r0,  #2*GRAIN_WIDTH
    613        sub             lr,  r0,  #1*GRAIN_WIDTH
    614        vld1.8          {q9},  [r12] // load the previous block right above
    615        vld1.8          {q12}, [lr]
    616 .endif
    617        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[4]
    618 endfunc
    619 .endm
    620 
    621 sum_lag2_func y,      0,   left
    622 sum_lag2_func y,      0,   mid
    623 sum_lag2_func y,      0,   right, 15
    624 sum_lag2_func uv_444, 444, left
    625 sum_lag2_func uv_444, 444, mid
    626 sum_lag2_func uv_444, 444, right, 15
    627 sum_lag2_func uv_422, 422, left
    628 sum_lag2_func uv_422, 422, mid
    629 sum_lag2_func uv_422, 422, right, 9
    630 sum_lag2_func uv_420, 420, left
    631 sum_lag2_func uv_420, 420, mid
    632 sum_lag2_func uv_420, 420, right, 9
    633 
    634 
    635 function sum_lag3_left_above_neon
    636        // A separate codepath for the left edge, to avoid reading outside
    637        // of the edge of the buffer.
    638        sub             r12, r0,  #3*GRAIN_WIDTH
    639        vld1.8          {q11, q12}, [r12]
    640        vext.8          q12, q11, q12, #13
    641        vext.8          q11, q11, q11, #13
    642        b               sum_lag3_above_start
    643 endfunc
    644 
    645 function sum_lag3_above_neon
    646        sub             r12, r0,  #3*GRAIN_WIDTH + 3
    647        vld1.8          {q11, q12}, [r12]
    648 
    649 sum_lag3_above_start:
    650        vdup.8          d20, d26[0]
    651        vext.8          q9,  q11, q12, #1
    652        vdup.8          d21, d26[1]
    653 
    654        vmull.s8        q0,  d22, d20
    655        vmull.s8        q1,  d23, d20
    656        vmull.s8        q6,  d18, d21
    657        vmull.s8        q7,  d19, d21
    658 
    659        vext.8          q8,  q11, q12, #2
    660        vdup.8          d20, d26[2]
    661        vext.8          q9,  q11, q12, #3
    662        vdup.8          d21, d26[3]
    663 
    664        vaddl.s16       q2,  d0,  d12
    665        vaddl.s16       q3,  d1,  d13
    666        vaddl.s16       q4,  d2,  d14
    667        vaddl.s16       q5,  d3,  d15
    668 
    669        vmull.s8        q0,  d16, d20
    670        vmull.s8        q1,  d17, d20
    671        vmull.s8        q6,  d18, d21
    672        vmull.s8        q7,  d19, d21
    673 
    674        vaddl.s16       q8,  d0,  d12
    675        vaddl.s16       q9,  d1,  d13
    676        vaddl.s16       q0,  d2,  d14
    677        vaddl.s16       q1,  d3,  d15
    678 
    679        vext.8          q6,  q11, q12, #4
    680        vdup.8          d20, d26[4]
    681        vext.8          q7,  q11, q12, #5
    682        vdup.8          d21, d26[5]
    683 
    684        vadd.i32        q2,  q2,  q8
    685        vadd.i32        q3,  q3,  q9
    686        vadd.i32        q4,  q4,  q0
    687        vadd.i32        q5,  q5,  q1
    688 
    689        vmull.s8        q0,  d12, d20
    690        vmull.s8        q1,  d13, d20
    691        vmull.s8        q8,  d14, d21
    692        vmull.s8        q9,  d15, d21
    693 
    694        sub             r12, r0,  #2*GRAIN_WIDTH + 3
    695 
    696        vaddl.s16       q6,  d0,  d16
    697        vaddl.s16       q7,  d1,  d17
    698        vaddl.s16       q0,  d2,  d18
    699        vaddl.s16       q1,  d3,  d19
    700 
    701        vext.8          q8,  q11, q12, #6
    702        vld1.8          {q11, q12}, [r12]
    703        vdup.8          d20, d26[6]
    704        vdup.8          d21, d26[7]
    705 
    706        vadd.i32        q2,  q2,  q6
    707        vadd.i32        q3,  q3,  q7
    708        vadd.i32        q4,  q4,  q0
    709        vadd.i32        q5,  q5,  q1
    710 
    711        vmull.s8        q0,  d16, d20
    712        vmull.s8        q1,  d17, d20
    713        vmull.s8        q6,  d22, d21
    714        vmull.s8        q7,  d23, d21
    715 
    716        vaddl.s16       q8,  d0,  d12
    717        vaddl.s16       q9,  d1,  d13
    718        vaddl.s16       q0,  d2,  d14
    719        vaddl.s16       q1,  d3,  d15
    720 
    721        vext.8          q6,  q11, q12, #1
    722        vdup.8          d20, d27[0]
    723        vext.8          q7,  q11, q12, #2
    724        vdup.8          d21, d27[1]
    725 
    726        vadd.i32        q2,  q2,  q8
    727        vadd.i32        q3,  q3,  q9
    728        vadd.i32        q4,  q4,  q0
    729        vadd.i32        q5,  q5,  q1
    730 
    731        vmull.s8        q0,  d12, d20
    732        vmull.s8        q1,  d13, d20
    733        vmull.s8        q8,  d14, d21
    734        vmull.s8        q9,  d15, d21
    735 
    736        vaddl.s16       q6,  d0,  d16
    737        vaddl.s16       q7,  d1,  d17
    738        vaddl.s16       q0,  d2,  d18
    739        vaddl.s16       q1,  d3,  d19
    740 
    741        vext.8          q8,  q11, q12, #3
    742        vdup.8          d20, d27[2]
    743        vext.8          q9,  q11, q12, #4
    744        vdup.8          d21, d27[3]
    745 
    746        vadd.i32        q2,  q2,  q6
    747        vadd.i32        q3,  q3,  q7
    748        vadd.i32        q4,  q4,  q0
    749        vadd.i32        q5,  q5,  q1
    750 
    751        vmull.s8        q0,  d16, d20
    752        vmull.s8        q1,  d17, d20
    753        vmull.s8        q6,  d18, d21
    754        vmull.s8        q7,  d19, d21
    755 
    756        sub             r12, r0,  #1*GRAIN_WIDTH + 3
    757 
    758        vaddl.s16       q8,  d0,  d12
    759        vaddl.s16       q9,  d1,  d13
    760        vaddl.s16       q0,  d2,  d14
    761        vaddl.s16       q1,  d3,  d15
    762 
    763        vext.8          q6,  q11, q12, #5
    764        vdup.8          d20, d27[4]
    765        vext.8          q7,  q11, q12, #6
    766        vdup.8          d21, d27[5]
    767 
    768        vld1.8          {q11, q12}, [r12]
    769 
    770        vadd.i32        q2,  q2,  q8
    771        vadd.i32        q3,  q3,  q9
    772        vadd.i32        q4,  q4,  q0
    773        vadd.i32        q5,  q5,  q1
    774 
    775        vmull.s8        q0,  d12, d20
    776        vmull.s8        q1,  d13, d20
    777        vmull.s8        q8,  d14, d21
    778        vmull.s8        q9,  d15, d21
    779 
    780        vaddl.s16       q6,  d0,  d16
    781        vaddl.s16       q7,  d1,  d17
    782        vaddl.s16       q0,  d2,  d18
    783        vaddl.s16       q1,  d3,  d19
    784 
    785        vdup.8          d20, d27[6]
    786        vext.8          q9,  q11, q12, #1
    787        vdup.8          d21, d27[7]
    788 
    789        vadd.i32        q2,  q2,  q6
    790        vadd.i32        q3,  q3,  q7
    791        vadd.i32        q4,  q4,  q0
    792        vadd.i32        q5,  q5,  q1
    793 
    794        vmull.s8        q0,  d22, d20
    795        vmull.s8        q1,  d23, d20
    796        vmull.s8        q6,  d18, d21
    797        vmull.s8        q7,  d19, d21
    798 
    799        vaddl.s16       q8,  d0,  d12
    800        vaddl.s16       q9,  d1,  d13
    801        vaddl.s16       q0,  d2,  d14
    802        vaddl.s16       q1,  d3,  d15
    803 
    804        vext.8          q6,  q11, q12, #2
    805        vdup.8          d20, d28[0]
    806        vext.8          q7,  q11, q12, #3
    807        vdup.8          d21, d28[1]
    808 
    809        vadd.i32        q2,  q2,  q8
    810        vadd.i32        q3,  q3,  q9
    811        vadd.i32        q4,  q4,  q0
    812        vadd.i32        q5,  q5,  q1
    813 
    814        vmull.s8        q0,  d12, d20
    815        vmull.s8        q1,  d13, d20
    816        vmull.s8        q8,  d14, d21
    817        vmull.s8        q9,  d15, d21
    818 
    819        vaddl.s16       q6,  d0,  d16
    820        vaddl.s16       q7,  d1,  d17
    821        vaddl.s16       q0,  d2,  d18
    822        vaddl.s16       q1,  d3,  d19
    823 
    824        vext.8          q8,  q11, q12, #4
    825        vdup.8          d20, d28[2]
    826        vext.8          q9,  q11, q12, #5
    827        vdup.8          d21, d28[3]
    828 
    829        vadd.i32        q2,  q2,  q6
    830        vadd.i32        q3,  q3,  q7
    831        vadd.i32        q4,  q4,  q0
    832        vadd.i32        q5,  q5,  q1
    833 
    834        vmull.s8        q0,  d16, d20
    835        vmull.s8        q1,  d17, d20
    836        vmull.s8        q6,  d18, d21
    837        vmull.s8        q7,  d19, d21
    838 
    839        vaddl.s16       q8,  d0,  d12
    840        vaddl.s16       q9,  d1,  d13
    841        vaddl.s16       q0,  d2,  d14
    842        vaddl.s16       q1,  d3,  d15
    843 
    844        vext.8          q6,  q11, q12, #6
    845        vdup.8          d20, d28[4]
    846 
    847        vadd.i32        q2,  q2,  q8
    848        vadd.i32        q3,  q3,  q9
    849        vadd.i32        q4,  q4,  q0
    850        vadd.i32        q5,  q5,  q1
    851 
    852        vmull.s8        q0,  d12, d20
    853        vmull.s8        q1,  d13, d20
    854 
    855        vaddw.s16       q2,  q2,  d0
    856        vaddw.s16       q3,  q3,  d1
    857        vaddw.s16       q4,  q4,  d2
    858        vaddw.s16       q5,  q5,  d3
    859 
    860        bx              lr
    861 endfunc
    862 
    863 .macro sum_lag3_func type, uv_layout, edge, elems=16
    864 function sum_\type\()_lag3_\edge\()_neon
    865        push            {r1, lr}
    866        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=d29[0]
    867 endfunc
    868 .endm
    869 
    870 sum_lag3_func y,      0,   left
    871 sum_lag3_func y,      0,   mid
    872 sum_lag3_func y,      0,   right, 15
    873 sum_lag3_func uv_444, 444, left
    874 sum_lag3_func uv_444, 444, mid
    875 sum_lag3_func uv_444, 444, right, 15
    876 sum_lag3_func uv_422, 422, left
    877 sum_lag3_func uv_422, 422, mid
    878 sum_lag3_func uv_422, 422, right, 9
    879 sum_lag3_func uv_420, 420, left
    880 sum_lag3_func uv_420, 420, mid
    881 sum_lag3_func uv_420, 420, right, 9
    882 
    883 function generate_grain_rows_neon
    884        push            {r11,lr}
    885 1:
    886        get_grain_row   d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
    887        subs            r1,  r1,  #1
    888        store_grain_row d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26
    889        bgt             1b
    890        pop             {r11,pc}
    891 endfunc
    892 
    893 function generate_grain_rows_44_neon
    894        push            {r11,lr}
    895 1:
    896        get_grain_row_44 d16, d17, d18, d19, d20, d21
    897        subs            r1,  r1,  #1
    898        store_grain_row_44 d16, d17, d18, d19, d20, d21
    899        bgt             1b
    900        pop             {r11,pc}
    901 endfunc
    902 
    903 function gen_grain_uv_444_lag0_neon
    904        vld1.8          {q3}, [r11]!
    905        push            {r11,lr}
    906        bl              get_gaussian_neon
    907        vrshl.s16       q8,  q0,  q15
    908        bl              get_gaussian_neon
    909        vrshl.s16       q9,  q0,  q15
    910        vqmovn.s16      d0,  q8
    911        vqmovn.s16      d1,  q9
    912 
    913        vand            q3,  q3,  q1
    914        vmull.s8        q2,  d6,  d22
    915        vmull.s8        q3,  d7,  d22
    916        vrshl.s16       q2,  q2,  q12
    917        vrshl.s16       q3,  q3,  q12
    918        vaddw.s8        q2,  q2,  d0
    919        vaddw.s8        q3,  q3,  d1
    920        vqmovn.s16      d4,  q2
    921        vqmovn.s16      d5,  q3
    922        vst1.8          {q2}, [r0]!
    923        pop             {r11,pc}
    924 endfunc
    925 
    926 function get_grain_row_44_neon
    927        push            {r11,lr}
    928        get_grain_row_44 d16, d17, d18, d19, d20, d21
    929        pop             {r11,pc}
    930 endfunc
    931 
    932 function add_uv_420_coeff_lag0_neon
    933        vld1.16         {q2, q3}, [r11]!
    934        vld1.16         {q4, q5}, [r12]!
    935        vpaddl.s8       q2,  q2
    936        vpaddl.s8       q3,  q3
    937        vpaddl.s8       q4,  q4
    938        vpaddl.s8       q5,  q5
    939        vadd.i16        q2,  q2,  q4
    940        vadd.i16        q3,  q3,  q5
    941        vrshrn.s16      d4,  q2,  #2
    942        vrshrn.s16      d5,  q3,  #2
    943        b               add_coeff_lag0_start
    944 endfunc
    945 
    946 function add_uv_422_coeff_lag0_neon
    947        vld1.16         {q2, q3}, [r11]!
    948        vpaddl.s8       q2,  q2
    949        vpaddl.s8       q3,  q3
    950        vrshrn.s16      d4,  q2,  #1
    951        vrshrn.s16      d5,  q3,  #1
    952 
    953 add_coeff_lag0_start:
    954        vand            q3,  q2,  q1
    955        vmull.s8        q2,  d6,  d22
    956        vmull.s8        q3,  d7,  d22
    957        vrshl.s16       q2,  q2,  q12
    958        vrshl.s16       q3,  q3,  q12
    959        vaddw.s8        q2,  q2,  d0
    960        vaddw.s8        q3,  q3,  d1
    961        vqmovn.s16      d4,  q2
    962        vqmovn.s16      d5,  q3
    963        bx              lr
    964 endfunc
    965 
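         // generate_grain_y/uv_444: build the 82x73 grain LUT. Three rows of plain
         // scaled Gaussian noise are emitted first (with lag 0 for luma, every row
         // is plain noise), then GRAIN_HEIGHT-3 rows are produced with the selected
         // AR lag. For uv_444 the seed is XORed with a plane specific constant and
         // r11 tracks the collocated luma grain.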
    966 .macro gen_grain_82 type
    967 function generate_grain_\type\()_8bpc_neon, export=1
    968        push            {r4-r11,lr}
    969 
    970 .ifc \type, uv_444
    971        mov             r12, r3
    972        mov             lr,  #28
    973        add             r11, r1,  #3*GRAIN_WIDTH
    974        mov             r1,  r2
    975        mul             r12, r12, lr
    976 .endif
    977        movrel          r3,  X(gaussian_sequence)
    978        ldr             r2,  [r1, #FGD_SEED]
    979        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
    980 .ifc \type, y
    981        add             r4,  r1,  #FGD_AR_COEFFS_Y
    982 .else
    983        add             r4,  r1,  #FGD_AR_COEFFS_UV
    984 .endif
    985        adr             r5,  L(gen_grain_\type\()_tbl)
    986        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
    987        add             r9,  r9,  #4
    988        ldr             r6,  [r5, r6, lsl #2]
    989        vdup.16         q15, r9    // 4 + data->grain_scale_shift
    990        add             r5,  r5,  r6
    991        vneg.s16        q15, q15
    992 
    993 .ifc \type, uv_444
    994        cmp             r12, #0
    995        movw            r10, #0x49d8
    996        movw            lr,  #0xb524
     997        // Intentionally using a separate register instead of moveq with an
     998        // immediate constant, to avoid IT instruction forms deprecated in armv8.
    999        it              eq
   1000        moveq           r10, lr
   1001        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
   1002        eor             r2,  r2,  r10
   1003 .endif
   1004 
   1005        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
   1006        mov             r8,  #1
   1007        mov             r10, #1
   1008        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
   1009        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
   1010        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
   1011        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
   1012 
   1013        bx              r5
   1014 
   1015        .align 2
   1016 L(gen_grain_\type\()_tbl):
   1017        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1018        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1019        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1020        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1021 
   1022 L(generate_grain_\type\()_lag0):
   1023 .ifc \type, y
   1024        mov             r1,  #GRAIN_HEIGHT
   1025        bl              generate_grain_rows_neon
   1026 .else
   1027 
   1028        mov             r1,  #3
   1029        bl              generate_grain_rows_neon
   1030        mov             r1,  #GRAIN_HEIGHT-3
   1031 
   1032        vdup.16         q12, r7
   1033        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
   1034        vmov.i8         q0,  #0
   1035        vmov.i8         q1,  #255
   1036        vext.8          q13, q0,  q1,  #13
   1037        vext.8          q14, q1,  q0,  #1
   1038        vneg.s16        q12, q12
   1039 
   1040 1:
   1041        vmov            q1,  q13
   1042        bl              gen_grain_uv_444_lag0_neon // 16
   1043        vmov.i8         q1,  #255
   1044        bl              gen_grain_uv_444_lag0_neon // 32
   1045        bl              gen_grain_uv_444_lag0_neon // 48
   1046        bl              gen_grain_uv_444_lag0_neon // 64
   1047        vmov            q1,  q14
   1048        bl              gen_grain_uv_444_lag0_neon // 80
   1049        get_grain_2     d16
   1050        subs            r1,  r1,  #1
   1051        add             r11, r11, #2
   1052        vst1.16         {d16[0]}, [r0]!
   1053        bgt             1b
   1054 .endif
   1055        pop             {r4-r11,pc}
   1056 
   1057 L(generate_grain_\type\()_lag1):
   1058        vpush           {q4-q7}
   1059        mov             r5,  #127
   1060        vld1.8          {d27[]}, [r4]!      // ar_coeffs_y[0]
   1061        vld1.8          {d28[]}, [r4]!      // ar_coeffs_y[1]
   1062        vld1.8          {d29[]}, [r4]       // ar_coeffs_y[2]
   1063 .ifc \type, y
   1064        ldrsb           r4,  [r4, #1]       // ar_coeffs_y[3]
   1065 .else
   1066        add             r4,  r4,  #2
   1067 .endif
   1068 
   1069        mov             r1,  #3
   1070 .ifc \type, uv_444
   1071        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
   1072        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
   1073 .endif
   1074        bl              generate_grain_rows_neon
   1075 
   1076        mov             r1,  #GRAIN_HEIGHT - 3
   1077 1:
   1078        sum_\type\()_lag1 q7,  q8,  q8,  q9,  left
   1079        sum_\type\()_lag1 q8,  q8,  q9,  q10
   1080        sum_\type\()_lag1 q9,  q9,  q10, q11
   1081        sum_\type\()_lag1 q10, q10, q11, q12
   1082        sum_\type\()_lag1 q12, q11, q12, q13, right
   1083        get_grain_2     d26
   1084        subs            r1,  r1,  #1
   1085 .ifc \type, uv_444
   1086        add             r11, r11, #2
   1087 .endif
   1088        store_grain_row d14, d15, d16, d17, d18, d19, d20, d21, d24, d25, d26
   1089        vmov            q11, q10
   1090        vmov            q10, q9
   1091        vmov            q9,  q8
   1092        vmov            q8,  q7
   1093        bgt             1b
   1094 
   1095        vpop            {q4-q7}
   1096        pop             {r4-r11,pc}
   1097 
   1098 L(generate_grain_\type\()_lag2):
   1099        vpush           {q4-q7}
   1100        mov             r5,  #127
   1101        vld1.8          {d28,d29}, [r4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
   1102 
   1103        vmov.s8         r4,  d29[2]
   1104        vmov.s8         r10, d29[3]
   1105 
   1106        mov             r1,  #3
   1107        bl              generate_grain_rows_neon
   1108 
   1109        mov             r1,  #GRAIN_HEIGHT - 3
   1110 1:
   1111        bl              sum_\type\()_lag2_left_neon
   1112        bl              sum_\type\()_lag2_mid_neon
   1113        bl              sum_\type\()_lag2_mid_neon
   1114        bl              sum_\type\()_lag2_mid_neon
   1115        bl              sum_\type\()_lag2_right_neon
   1116        get_grain_2     d16
   1117        subs            r1,  r1,  #1
   1118 .ifc \type, uv_444
   1119        add             r11, r11, #2
   1120 .endif
   1121        vst1.16         {d16[0]}, [r0]!
   1122        bgt             1b
   1123 
   1124        vpop            {q4-q7}
   1125        pop             {r4-r11,pc}
   1126 
   1127 L(generate_grain_\type\()_lag3):
   1128        vpush           {q4-q7}
   1129        mov             r5,  #127
   1130        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
   1131 
   1132        vmov.u8         r4,  d28[5]
   1133        vmov.u8         r10, d28[6]
   1134        vmov.u8         r12, d28[7]
   1135 
   1136        orr             r4,  r4,  r10, lsl #8
   1137        orr             r4,  r4,  r12, lsl #16
   1138 
   1139        mov             r1,  #3
   1140        vpush           {d26}
   1141        bl              generate_grain_rows_neon
   1142        vpop            {d26}
   1143 
   1144        mov             r1,  #GRAIN_HEIGHT - 3
   1145 1:
   1146        bl              sum_\type\()_lag3_left_neon
   1147        bl              sum_\type\()_lag3_mid_neon
   1148        bl              sum_\type\()_lag3_mid_neon
   1149        bl              sum_\type\()_lag3_mid_neon
   1150        bl              sum_\type\()_lag3_right_neon
   1151        get_grain_2     d16
   1152        subs            r1,  r1,  #1
   1153 .ifc \type, uv_444
   1154        add             r11, r11, #2
   1155 .endif
   1156        vst1.16         {d16[0]}, [r0]!
   1157        bgt             1b
   1158 
   1159        vpop            {q4-q7}
   1160        pop             {r4-r11,pc}
   1161 endfunc
   1162 .endm
   1163 
   1164 gen_grain_82 y
   1165 gen_grain_82 uv_444
   1166 
   1167 .macro set_height dst, type
   1168 .ifc \type, uv_420
   1169        mov             \dst,  #SUB_GRAIN_HEIGHT-3
   1170 .else
   1171        mov             \dst,  #GRAIN_HEIGHT-3
   1172 .endif
   1173 .endm
   1174 
   1175 .macro increment_y_ptr reg, type
   1176 .ifc \type, uv_420
   1177        add             \reg, \reg, #2*GRAIN_WIDTH-(3*32)
   1178 .else
   1179        sub             \reg, \reg, #3*32-GRAIN_WIDTH
   1180 .endif
   1181 .endm
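         // Each produced chroma row consumes 3*32 bytes of luma grain through r11;
         // increment_y_ptr re-aligns that pointer to the next source luma row:
         // two rows further down for 4:2:0, one row for 4:2:2.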
   1182 
   1183 .macro gen_grain_44 type
   1184 function generate_grain_\type\()_8bpc_neon, export=1
   1185        push            {r4-r11,lr}
   1186 
   1187        mov             r12, r3
   1188        mov             lr,  #28
   1189        add             r11, r1,  #3*GRAIN_WIDTH-3
   1190        mov             r1,  r2
   1191        mul             r12, r12, lr
   1192 
   1193        movrel          r3,  X(gaussian_sequence)
   1194        ldr             r2,  [r1, #FGD_SEED]
   1195        ldr             r9,  [r1, #FGD_GRAIN_SCALE_SHIFT]
   1196        add             r4,  r1,  #FGD_AR_COEFFS_UV
   1197        adr             r5,  L(gen_grain_\type\()_tbl)
   1198        ldr             r6,  [r1, #FGD_AR_COEFF_LAG]
   1199        add             r9,  r9,  #4
   1200        ldr             r6,  [r5, r6, lsl #2]
   1201        vdup.16         q15, r9    // 4 + data->grain_scale_shift
   1202        add             r5,  r5,  r6
   1203        vneg.s16        q15, q15
   1204 
   1205        cmp             r12, #0
   1206        movw            r10, #0x49d8
   1207        movw            lr,  #0xb524
    1208        // Intentionally using a separate register instead of moveq with an
    1209        // immediate constant, to avoid IT instruction forms deprecated in armv8.
   1210        it              eq
   1211        moveq           r10, lr
   1212        add             r4,  r4,  r12       // Add offset to ar_coeffs_uv[1]
   1213        eor             r2,  r2,  r10
   1214 
   1215        ldr             r7,  [r1, #FGD_AR_COEFF_SHIFT]
   1216        mov             r8,  #1
   1217        mov             r10, #1
   1218        lsl             r8,  r8,  r7        // 1 << ar_coeff_shift
   1219        lsl             r10, r10, r9        // 1 << (4 + data->grain_scale_shift)
   1220        lsr             r8,  r8,  #1        // 1 << (ar_coeff_shift - 1)
   1221        lsr             r10, r10, #1        // 1 << (4 + data->grain_scale_shift - 1)
   1222        bx              r5
   1223 
   1224        .align 2
   1225 L(gen_grain_\type\()_tbl):
   1226        .word L(generate_grain_\type\()_lag0) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1227        .word L(generate_grain_\type\()_lag1) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1228        .word L(generate_grain_\type\()_lag2) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1229        .word L(generate_grain_\type\()_lag3) - L(gen_grain_\type\()_tbl) + CONFIG_THUMB
   1230 
   1231 L(generate_grain_\type\()_lag0):
   1232 .ifc \type, uv_420
   1233        vpush           {q4-q5}
   1234 .endif
   1235        mov             r1,  #3
   1236        bl              generate_grain_rows_44_neon
   1237        set_height      r1,  \type
   1238 
   1239        vdup.16         q12, r7
   1240        vld1.8          {d22[]}, [r4]       // ar_coeffs_uv[0]
   1241        vmov.i8         q0,  #0
   1242        vmov.i8         q1,  #255
   1243        vext.8          q13, q0,  q1,  #13
   1244        vext.8          q14, q1,  q0,  #7
   1245        vneg.s16        q12, q12
   1246 
   1247 1:
   1248        bl              get_grain_row_44_neon
   1249 .ifc \type, uv_420
   1250        add             r12, r11, #GRAIN_WIDTH
   1251 .endif
   1252        vmov            q1,  q13
   1253        vmov            q0,  q8
   1254        bl              add_\type\()_coeff_lag0_neon
   1255        vmov.i8         q1,  #255
   1256        vmov            q0,  q9
   1257        vmov            q8,  q2
   1258        bl              add_\type\()_coeff_lag0_neon
    1259        vmov            q1,  q14
   1260        vmov            q0,  q10
   1261        vmov            q9,  q2
   1262        bl              add_\type\()_coeff_lag0_neon
   1263        vmov            q10, q2
   1264        subs            r1,  r1,  #1
   1265        increment_y_ptr r11, \type
   1266        store_grain_row_44 d16, d17, d18, d19, d20, d21
   1267        bgt             1b
   1268 
   1269 .ifc \type, uv_420
   1270        vpop            {q4-q5}
   1271 .endif
   1272        pop             {r4-r11,pc}
   1273 
   1274 L(generate_grain_\type\()_lag1):
   1275        vpush           {q4-q7}
   1276        mov             r5,  #127
   1277        vld1.8          {d27[]}, [r4]!      // ar_coeffs_uv[0]
   1278        vld1.8          {d28[]}, [r4]!      // ar_coeffs_uv[1]
   1279        vld1.8          {d29[]}, [r4]       // ar_coeffs_uv[2]
   1280        add             r4,  r4,  #2
   1281 
   1282        mov             r1,  #3
   1283        vld1.8          {d13[]}, [r4]       // ar_coeffs_uv[4]
   1284        ldrsb           r4,  [r4, #-1]      // ar_coeffs_uv[3]
   1285        bl              generate_grain_rows_44_neon
   1286 
   1287        set_height      r1,  \type
   1288 1:
   1289        sum_\type\()_lag1 q7,  q8,  q8,  q9,  left
   1290        sum_\type\()_lag1 q8,  q8,  q9,  q10
   1291        sum_\type\()_lag1 q10, q9,  q10, q11, right
   1292        subs            r1,  r1,  #1
   1293        increment_y_ptr r11, \type
   1294        store_grain_row_44 d14, d15, d16, d17, d20, d21
   1295        vmov            q9,  q8
   1296        vmov            q8,  q7
   1297        bgt             1b
   1298 
   1299        vpop            {q4-q7}
   1300        pop             {r4-r11,pc}
   1301 
   1302 L(generate_grain_\type\()_lag2):
   1303        vpush           {q4-q7}
   1304        mov             r5,  #127
   1305        vld1.8          {d28,d29}, [r4]     // ar_coeffs_uv[0-12]
   1306 
   1307        vmov.s8         r4,  d29[2]
   1308        vmov.s8         r10, d29[3]
   1309 
   1310        mov             r1,  #3
   1311        bl              generate_grain_rows_44_neon
   1312 
   1313        set_height      r1,  \type
   1314 1:
   1315        bl              sum_\type\()_lag2_left_neon
   1316        bl              sum_\type\()_lag2_mid_neon
   1317        bl              sum_\type\()_lag2_right_neon
   1318        subs            r1,  r1,  #1
   1319        increment_y_ptr r11, \type
   1320        add             r0,  r0,  #GRAIN_WIDTH-48
   1321        bgt             1b
   1322 
   1323        vpop            {q4-q7}
   1324        pop             {r4-r11,pc}
   1325 
   1326 L(generate_grain_\type\()_lag3):
   1327        vpush           {q4-q7}
   1328        mov             r5,  #127
   1329        vld1.8          {q13, q14}, [r4]    // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
   1330 
   1331        vmov.u8         r4,  d28[5]
   1332        vmov.u8         r10, d28[6]
   1333        vmov.u8         r12, d28[7]
   1334 
   1335        orr             r4,  r4,  r10, lsl #8
   1336        orr             r4,  r4,  r12, lsl #16
   1337 
   1338        mov             r1,  #3
   1339        bl              generate_grain_rows_44_neon
   1340 
   1341        set_height      r1,  \type
   1342 1:
   1343        bl              sum_\type\()_lag3_left_neon
   1344        bl              sum_\type\()_lag3_mid_neon
   1345        bl              sum_\type\()_lag3_right_neon
   1346        subs            r1,  r1,  #1
   1347        increment_y_ptr r11, \type
   1348        add             r0,  r0,  #GRAIN_WIDTH-48
   1349        bgt             1b
   1350 
   1351        vpop            {q4-q7}
   1352        pop             {r4-r11,pc}
   1353 endfunc
   1354 .endm
   1355 
   1356 gen_grain_44 uv_420
   1357 gen_grain_44 uv_422
   1358 
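         // NEON has no byte gather, so the gather macros look up scaling[src] one
         // lane at a time: each source byte is moved to a GP register, added to the
         // scaling LUT base in r3, and the loaded byte is inserted back into the
         // destination lane, interleaving two vectors to hide the load latency.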
   1359 .macro gather_interleaved dst1, dst2, src1, src2, off
   1360        vmov.u8         r11, \src1[0+\off]
   1361        vmov.u8         r12, \src2[0+\off]
   1362        add             r11, r11, r3
   1363        vmov.u8         lr,  \src1[2+\off]
   1364        add             r12, r12, r3
   1365        vld1.8          {\dst1[0+\off]}, [r11]
   1366        vmov.u8         r11, \src2[2+\off]
   1367        add             lr,  lr,  r3
   1368        vld1.8          {\dst2[0+\off]}, [r12]
   1369        vmov.u8         r12, \src1[4+\off]
   1370        add             r11, r11, r3
   1371        vld1.8          {\dst1[2+\off]}, [lr]
   1372        vmov.u8         lr,  \src2[4+\off]
   1373        add             r12, r12, r3
   1374        vld1.8          {\dst2[2+\off]}, [r11]
   1375        vmov.u8         r11, \src1[6+\off]
   1376        add             lr,  lr,  r3
   1377        vld1.8          {\dst1[4+\off]}, [r12]
   1378        vmov.u8         r12, \src2[6+\off]
   1379        add             r11, r11, r3
   1380        vld1.8          {\dst2[4+\off]}, [lr]
   1381        add             r12, r12, r3
   1382        vld1.8          {\dst1[6+\off]}, [r11]
   1383        vld1.8          {\dst2[6+\off]}, [r12]
   1384 .endm
   1385 
   1386 .macro gather dst1, dst2, dst3, dst4, src1, src2, src3, src4
   1387        gather_interleaved \dst1, \dst3, \src1, \src3, 0
   1388        gather_interleaved \dst1, \dst3, \src1, \src3, 1
   1389        gather_interleaved \dst2, \dst4, \src2, \src4, 0
   1390        gather_interleaved \dst2, \dst4, \src2, \src4, 1
   1391 .endm
   1392 
   1393 function gather32_neon
   1394        push            {r11-r12,lr}
   1395        gather          d8,  d9,  d10, d11, d0,  d1,  d2,  d3
   1396        pop             {r11-r12,pc}
   1397 endfunc
   1398 
   1399 function gather16_neon
   1400        push            {r11-r12,lr}
   1401        gather_interleaved d8,  d9,  d0,  d1,  0
   1402        gather_interleaved d8,  d9,  d0,  d1,  1
   1403        pop             {r11-r12,pc}
   1404 endfunc
   1405 
   1406 const overlap_coeffs_0, align=4
   1407        .byte 27, 17, 0,  0,  0,  0,  0,  0
   1408        .byte 17, 27, 32, 32, 32, 32, 32, 32
   1409 endconst
   1410 
   1411 const overlap_coeffs_1, align=4
   1412        .byte 23, 0,  0,  0,  0,  0,  0,  0
   1413        .byte 22, 32, 32, 32, 32, 32, 32, 32
   1414 endconst
   1415 
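         // Scalar equivalent of calc_offset + add_offset, where sx/sy are the
         // horizontal/vertical chroma subsampling flags of the plane:
         //   offy      = (randval & 0xF) << (sy ? 0 : 1)
         //   offx      = (randval >> 4)  << (sx ? 0 : 1)
         //   grain_ptr = grain_lut + offy * grain_stride + offx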
   1416 .macro calc_offset offx, offy, src, sx, sy
   1417        and             \offy, \src,  #0xF     // randval & 0xF
   1418        lsr             \offx, \src,  #4       // randval >> 4
   1419 .if \sy == 0
   1420        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
   1421 .endif
   1422 .if \sx == 0
   1423        add             \offx, \offx, \offx    // 2 * (randval >> 4)
   1424 .endif
   1425 .endm
   1426 
   1427 .macro add_offset dst, offx, offy, src, stride
   1428        mla             \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
   1429        add             \dst, \dst, \offx          // grain_lut += offx
   1430 .endm
   1431 
   1432 // void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
   1433 //                                const ptrdiff_t stride,
   1434 //                                const uint8_t scaling[SCALING_SIZE],
   1435 //                                const int scaling_shift,
   1436 //                                const entry grain_lut[][GRAIN_WIDTH],
   1437 //                                const int offsets[][2],
   1438 //                                const int h, const ptrdiff_t clip,
   1439 //                                const ptrdiff_t type);
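         // Per pixel, with optional overlap blending of the grain against the
         // blocks to the left and above (see the loop macro below):
         //   noise = round2(scaling[src] * grain, scaling_shift)
         //   dst   = clip(src + noise)      // 16..235 if clip, else 0..255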
   1440 function fgy_32x32_8bpc_neon, export=1
   1441        push            {r4-r11,lr}
   1442        vpush           {q4-q7}
   1443        ldrd            r4,  r5,  [sp, #100]   // scaling_shift, grain_lut
   1444        ldrd            r6,  r7,  [sp, #108]   // offsets, h
   1445        ldr             r8,       [sp, #116]   // clip
   1446        mov             r9,  #GRAIN_WIDTH      // grain_lut stride
   1447 
   1448        neg             r4,  r4
   1449        vdup.16         q13, r4                // -scaling_shift
   1450        cmp             r8,  #0
   1451 
   1452        movrel_local    r12, overlap_coeffs_0
   1453 
   1454        beq             1f
   1455        // clip
   1456        vmov.i8         q14, #16
   1457        vmov.i8         q15, #235
   1458        b               2f
   1459 1:
   1460        // no clip
   1461        vmov.i8         q14, #0
   1462        vmov.i8         q15, #255
   1463 2:
   1464 
   1465        vld1.8          {d24, d25}, [r12, :128] // overlap_coeffs
   1466 
   1467        add             r5,  r5,  #9           // grain_lut += 9
   1468        add             r5,  r5,  r9,  lsl #3  // grain_lut += 8 * grain_stride
   1469        add             r5,  r5,  r9           // grain_lut += grain_stride
   1470 
   1471        ldr             r10, [r6, #8]          // offsets[1][0]
   1472        calc_offset     r10, r4,  r10, 0,   0
   1473        add_offset      r4,  r10, r4,  r5,  r9
   1474        ldr             r10, [r6, #4]          // offsets[0][1]
   1475        calc_offset     r10, r11, r10, 0,   0
   1476        add_offset      r11, r10, r11, r5,  r9
   1477        ldr             r10, [r6, #12]         // offsets[1][1]
   1478        calc_offset     r10, r8,  r10, 0,   0
   1479        add_offset      r8,  r10, r8,  r5,  r9
   1480        ldr             r6,  [r6]              // offsets[0][0]
   1481        calc_offset     r6,  lr,  r6,  0,   0
   1482        add_offset      r5,  r6,  lr,  r5,  r9
   1483 
   1484        add             r4,  r4,  #32          // grain_lut += FG_BLOCK_SIZE * bx
   1485        add             r6,  r11, r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
   1486 
   1487        ldr             r10, [sp, #120]        // type
   1488        adr             r11, L(fgy_loop_tbl)
   1489 
   1490        tst             r10, #1
   1491        ldr             r10, [r11, r10, lsl #2]
   1492 
   1493        add             r8,  r8,  r9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
   1494        add             r8,  r8,  #32          // grain_lut += FG_BLOCK_SIZE * bx
   1495 
   1496        add             r11, r11, r10
   1497 
   1498        beq             1f
   1499        // y overlap
   1500        vdup.8          d14, d24[0]
   1501        vdup.8          d15, d24[1]
   1502        mov             r10, r7                // backup actual h
   1503        mov             r7,  #2
   1504 1:
   1505        bx              r11
   1506 endfunc
   1507 
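         // The table below is indexed by 'type': bit 0 selects the y-overlap
         // variants, bit 1 the x-overlap variants.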
   1508 function fgy_loop_neon
   1509 L(fgy_loop_tbl):
   1510        .word L(loop_00) - L(fgy_loop_tbl) + CONFIG_THUMB
   1511        .word L(loop_01) - L(fgy_loop_tbl) + CONFIG_THUMB
   1512        .word L(loop_10) - L(fgy_loop_tbl) + CONFIG_THUMB
   1513        .word L(loop_11) - L(fgy_loop_tbl) + CONFIG_THUMB
   1514 
   1515 .macro fgy ox, oy
   1516 L(loop_\ox\oy):
   1517 1:
   1518 .if \ox
   1519        vld1.8          {d8},       [r4],       r9 // grain_lut old
   1520 .endif
   1521 .if \oy
   1522        vld1.8          {q2, q3},   [r6],       r9 // grain_lut top
   1523 .endif
   1524 .if \ox && \oy
   1525        vld1.8          {d10},      [r8],       r9 // grain_lut top old
   1526 .endif
   1527        vld1.8          {q0,  q1},  [r1, :128], r2 // src
   1528        vld1.8          {q10, q11}, [r5],       r9 // grain_lut
   1529 
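                // Overlap blending where requested: blend the leftmost 8 grain bytes
                // against the block to the left (likewise for the top row), then blend
                // the whole 32-byte row against the block above; each blend is
                // round2(old * w_old + new * w_new, 5).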
   1530 .if \ox
   1531        vmull.s8        q4,  d8,  d24
   1532        vmlal.s8        q4,  d20, d25
   1533 .endif
   1534 
   1535 .if \oy
   1536 .if \ox
   1537        vmull.s8        q5,  d10, d24
   1538        vmlal.s8        q5,  d4,  d25
   1539        vqrshrn.s16     d20, q4,  #5
   1540        vqrshrn.s16     d4,  q5,  #5
   1541 .endif
   1542 
   1543        vmull.s8        q4,  d20, d15
   1544        vmull.s8        q5,  d21, d15
   1545        vmull.s8        q8,  d22, d15
   1546        vmull.s8        q9,  d23, d15
   1547        vmlal.s8        q4,  d4,  d14
   1548        vmlal.s8        q5,  d5,  d14
   1549        vmlal.s8        q8,  d6,  d14
   1550        vmlal.s8        q9,  d7,  d14
   1551        vqrshrn.s16     d20, q4,  #5
   1552        vqrshrn.s16     d21, q5,  #5
   1553        vqrshrn.s16     d22, q8,  #5
   1554        vqrshrn.s16     d23, q9,  #5
   1555 .elseif \ox
   1556        vqrshrn.s16     d20, q4,  #5
   1557 .endif
   1558 
   1559        bl              gather32_neon
   1560 
   1561        vmovl.s8        q8,  d20       // grain
   1562        vmovl.s8        q9,  d21
   1563        vmovl.s8        q10, d22
   1564        vmovl.s8        q11, d23
   1565 
   1566        vmovl.u8        q2,  d8        // scaling
   1567        vmovl.u8        q3,  d9
   1568        vmovl.u8        q4,  d10
   1569        vmovl.u8        q5,  d11
   1570 
   1571        vmul.i16        q8,  q8,  q2   // scaling * grain
   1572        vmul.i16        q9,  q9,  q3
   1573        vmul.i16        q10, q10, q4
   1574        vmul.i16        q11, q11, q5
   1575 
   1576        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
   1577        vrshl.s16       q9,  q9,  q13
   1578        vrshl.s16       q10, q10, q13
   1579        vrshl.s16       q11, q11, q13
   1580 
   1581        vaddw.u8        q8,  q8,  d0   // *src + noise
   1582        vaddw.u8        q9,  q9,  d1
   1583        vaddw.u8        q10, q10, d2
   1584        vaddw.u8        q11, q11, d3
   1585 
   1586        vqmovun.s16     d0,  q8
   1587        vqmovun.s16     d1,  q9
   1588        vqmovun.s16     d2,  q10
   1589        vqmovun.s16     d3,  q11
   1590 
   1591        vmax.u8         q0,  q0,  q14
   1592        vmax.u8         q1,  q1,  q14
   1593        vmin.u8         q0,  q0,  q15
   1594        vmin.u8         q1,  q1,  q15
   1595 
   1596        subs            r7,  r7,  #1
   1597 .if \oy
   1598        vdup.8          d14, d25[0]
   1599        vdup.8          d15, d25[1]
   1600 .endif
   1601        vst1.8          {q0, q1}, [r0, :128], r2 // dst
   1602        bgt             1b
   1603 
   1604 .if \oy
   1605        cmp             r10, #2
   1606        sub             r7,  r10, #2           // restore actual remaining h
   1607        bgt             L(loop_\ox\()0)
   1608 .endif
   1609        vpop            {q4-q7}
   1610        pop             {r4-r11,pc}
   1611 .endm
   1612 
   1613        fgy             0, 0
   1614        fgy             0, 1
   1615        fgy             1, 0
   1616        fgy             1, 1
   1617 endfunc
   1618 
   1619 // void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
   1620 //                                     const pixel *const src,
   1621 //                                     const ptrdiff_t stride,
   1622 //                                     const uint8_t scaling[SCALING_SIZE],
   1623 //                                     const Dav1dFilmGrainData *const data,
   1624 //                                     const entry grain_lut[][GRAIN_WIDTH],
   1625 //                                     const pixel *const luma_row,
   1626 //                                     const ptrdiff_t luma_stride,
   1627 //                                     const int offsets[][2],
   1628 //                                     const ptrdiff_t h, const ptrdiff_t uv,
   1629 //                                     const ptrdiff_t is_id,
   1630 //                                     const ptrdiff_t type);
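         // Noise is applied to the chroma src exactly as in the luma path, but the
         // scaling index comes from the luma values (pairwise-averaged when the
         // layout is horizontally subsampled) when chroma is scaled from luma
         // (csfl), and otherwise from
         //   clip(((luma * uv_luma_mult + src * uv_mult) >> 6) + uv_offset)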
   1631 .macro fguv layout, sx, sy
   1632 function fguv_32x32_\layout\()_8bpc_neon, export=1
   1633        push            {r4-r11,lr}
   1634        vpush           {q4-q7}
   1635        ldrd            r4,  r5,  [sp, #100]   // data, grain_lut
   1636        ldrd            r6,  r7,  [sp, #108]   // luma_row, luma_stride
   1637        ldrd            r8,  r9,  [sp, #116]   // offsets, h
   1638        ldrd            r10, r11, [sp, #124]   // uv, is_id
   1639 
    1640        // !csfl: load uv_luma_mult/uv_mult/uv_offset (used only when chroma is not scaled from luma)
   1641        add             r10, r4,  r10, lsl #2  // + 4*uv
   1642        add             r12, r10, #FGD_UV_LUMA_MULT
   1643        add             lr,  r10, #FGD_UV_MULT
   1644        add             r10, r10, #FGD_UV_OFFSET
   1645        vld1.16         {d4[]},  [r12]         // uv_luma_mult
   1646        vld1.16         {d4[2]}, [r10]         // uv_offset
   1647        vld1.16         {d4[1]}, [lr]          // uv_mult
   1648 
   1649        ldr             lr,  [r4, #FGD_SCALING_SHIFT]
   1650        ldr             r12, [r4, #FGD_CLIP_TO_RESTRICTED_RANGE]
   1651        neg             lr,  lr                // -scaling_shift
   1652 
   1653        cmp             r12, #0
   1654        vdup.16         q13, lr                // -scaling_shift
   1655 
   1656        beq             1f
   1657        // clip
   1658        cmp             r11, #0
   1659        vmov.i8         q14, #16
   1660        vmov.i8         q15, #240
   1661        beq             2f
   1662        // is_id
   1663        vmov.i8         q15, #235
   1664        b               2f
   1665 1:
   1666        // no clip
   1667        vmov.i8         q14, #0
   1668        vmov.i8         q15, #255
   1669 2:
   1670 
   1671        mov             r10, #GRAIN_WIDTH      // grain_lut stride
   1672 
   1673        add             r5,  r5,  #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
   1674 .if \sy
   1675        add             r5,  r5,  r10, lsl #2  // grain_lut += 4 * grain_stride
   1676        add             r5,  r5,  r10, lsl #1  // grain_lut += 2 * grain_stride
   1677 .else
   1678        add             r5,  r5,  r10, lsl #3  // grain_lut += 8 * grain_stride
   1679        add             r5,  r5,  r10          // grain_lut += grain_stride
   1680 .endif
   1681 
   1682        ldr             r12, [r8, #8]          // offsets[1][0]
   1683        calc_offset     r12, r4,  r12, \sx, \sy
   1684        add_offset      r4,  r12, r4,  r5,  r10
   1685 
   1686        ldr             r12, [r8, #4]          // offsets[0][1]
   1687        calc_offset     r12, lr,  r12, \sx, \sy
   1688        add_offset      lr,  r12, lr,  r5,  r10
   1689 
   1690        ldr             r12, [r8, #12]         // offsets[1][1]
   1691        calc_offset     r12, r11, r12, \sx, \sy
   1692        add_offset      r11, r12, r11, r5,  r10
   1693 
   1694        ldr             r8,  [r8]              // offsets[0][0]
   1695        calc_offset     r8,  r12, r8,  \sx, \sy
   1696        add_offset      r5,  r8,  r12, r5,  r10
   1697 
   1698        add             r4,  r4,  #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
   1699        add             r8,  lr,  r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
   1700        add             r11, r11, r10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
   1701        add             r11, r11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
   1702 
   1703        movrel_local    r12, overlap_coeffs_\sx
   1704        ldr             lr,  [sp, #132]        // type
   1705 
   1706        vld1.8          {d24, d25}, [r12, :128] // overlap_coeffs
   1707 
   1708        movrel_local    r12, L(fguv_loop_sx\sx\()_tbl)
   1709 #if CONFIG_THUMB
    1710        // movrel_local is used instead of adr above because the target can
    1711        // be out of range for adr. However, movrel_local leaves the thumb bit
    1712        // set on COFF (though probably not when building for thumb on ELF),
    1713        // so clear the bit here for robustness.
   1714        bic             r12, r12, #1
   1715 #endif
   1716 
   1717        tst             lr,  #1
   1718        ldr             lr,  [r12, lr,  lsl #2]
   1719 
   1720        add             r12, r12, lr
   1721 
   1722        beq             1f
   1723        // y overlap
   1724        sub             lr,  r9,  #(2 >> \sy)  // backup remaining h
   1725        mov             r9,  #(2 >> \sy)
   1726 
   1727 1:
   1728 
   1729 .if \sy
   1730        vmov.i8         d6,  #23
   1731        vmov.i8         d7,  #22
   1732 .else
   1733        vmov.i8         d6,  #27
   1734        vmov.i8         d7,  #17
   1735 .endif
   1736 
   1737 .if \sy
   1738        add             r7,  r7,  r7           // luma_stride *= 2
   1739 .endif
   1740 
   1741        bx              r12
   1742 endfunc
   1743 .endm
   1744 
   1745 fguv 420, 1, 1
   1746 fguv 422, 1, 0
   1747 fguv 444, 0, 0
   1748 
   1749 function fguv_loop_sx0_neon
   1750 L(fguv_loop_sx0_tbl):
   1751        .word L(fguv_loop_sx0_csfl0_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
   1752        .word L(fguv_loop_sx0_csfl0_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
   1753        .word L(fguv_loop_sx0_csfl0_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
   1754        .word L(fguv_loop_sx0_csfl0_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
   1755        .word L(fguv_loop_sx0_csfl1_00) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
   1756        .word L(fguv_loop_sx0_csfl1_01) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
   1757        .word L(fguv_loop_sx0_csfl1_10) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
   1758        .word L(fguv_loop_sx0_csfl1_11) - L(fguv_loop_sx0_tbl) + CONFIG_THUMB
   1759 
   1760 .macro fguv_loop_sx0 csfl, ox, oy
   1761 L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
   1762 .if \oy
   1763        mov             r12, lr
   1764 .endif
   1765 1:
   1766 .if \ox
   1767        vld1.8          {d8},       [r4],        r10 // grain_lut old
   1768 .endif
   1769 .if \oy
   1770        vld1.8          {q8, q9},   [r8],        r10 // grain_lut top
   1771 .endif
   1772 .if \ox && \oy
   1773        vld1.8          {d10},      [r11],       r10 // grain_lut top old
   1774 .endif
   1775        vld1.8          {q0,  q1},  [r6, :128],  r7  // luma
   1776        vld1.8          {q10, q11}, [r5],        r10 // grain_lut
   1777 
   1778 .if \ox
   1779        vmull.s8        q4,  d8,  d24
   1780        vmlal.s8        q4,  d20, d25
   1781 .endif
   1782 
   1783 .if \oy
   1784 .if \ox
   1785        vmull.s8        q5,  d10, d24
   1786        vmlal.s8        q5,  d16, d25
   1787        vqrshrn.s16     d20, q4,  #5
   1788        vqrshrn.s16     d16, q5,  #5
   1789 .endif
   1790 
   1791        vmull.s8        q4,  d20, d7
   1792        vmull.s8        q5,  d21, d7
   1793        vmull.s8        q6,  d22, d7
   1794        vmull.s8        q7,  d23, d7
   1795        vmlal.s8        q4,  d16, d6
   1796        vmlal.s8        q5,  d17, d6
   1797        vmlal.s8        q6,  d18, d6
   1798        vmlal.s8        q7,  d19, d6
   1799        vqrshrn.s16     d20, q4,  #5
   1800        vqrshrn.s16     d21, q5,  #5
   1801        vqrshrn.s16     d22, q6,  #5
   1802        vqrshrn.s16     d23, q7,  #5
   1803 .elseif \ox
   1804        vqrshrn.s16     d20, q4,  #5
   1805 .endif
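                // Without csfl, derive the scaling index from both luma and chroma src:
                // clip(((luma * uv_luma_mult + src * uv_mult) >> 6) + uv_offset);
                // with csfl, the luma values in q0-q1 are used as the index directly.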
   1806 .if !\csfl
   1807        vld1.8          {q8,  q9},  [r1, :128] // src
   1808        vmovl.u8        q4,  d0
   1809        vmovl.u8        q5,  d1
   1810        vmovl.u8        q6,  d2
   1811        vmovl.u8        q7,  d3
   1812        vmovl.u8        q0,  d16
   1813        vmovl.u8        q1,  d17
   1814        vmovl.u8        q8,  d18
   1815        vmovl.u8        q9,  d19
   1816        vmul.i16        q4,  q4,  d4[0]
   1817        vmul.i16        q5,  q5,  d4[0]
   1818        vmul.i16        q6,  q6,  d4[0]
   1819        vmul.i16        q7,  q7,  d4[0]
   1820        vmul.i16        q0,  q0,  d4[1]
   1821        vmul.i16        q1,  q1,  d4[1]
   1822        vmul.i16        q8,  q8,  d4[1]
   1823        vmul.i16        q9,  q9,  d4[1]
   1824        vqadd.s16       q4,  q4,  q0
   1825        vqadd.s16       q5,  q5,  q1
   1826        vqadd.s16       q6,  q6,  q8
   1827        vqadd.s16       q7,  q7,  q9
   1828        vdup.16         q0,  d4[2]
   1829        vshr.s16        q4,  q4,  #6
   1830        vshr.s16        q5,  q5,  #6
   1831        vshr.s16        q6,  q6,  #6
   1832        vshr.s16        q7,  q7,  #6
   1833        vadd.i16        q4,  q4,  q0
   1834        vadd.i16        q5,  q5,  q0
   1835        vadd.i16        q6,  q6,  q0
   1836        vadd.i16        q7,  q7,  q0
   1837        vqmovun.s16     d0,  q4
   1838        vqmovun.s16     d1,  q5
   1839        vqmovun.s16     d2,  q6
   1840        vqmovun.s16     d3,  q7
   1841 .endif
   1842 
   1843        bl              gather32_neon
   1844 
   1845        vld1.8          {q0,  q1},  [r1, :128], r2 // src
   1846 
   1847        vmovl.s8        q8,  d20       // grain
   1848        vmovl.s8        q9,  d21
   1849        vmovl.s8        q10, d22
   1850        vmovl.s8        q11, d23
   1851 
   1852        vmovl.u8        q6,  d8        // scaling
   1853        vmovl.u8        q7,  d9
   1854        vmovl.u8        q4,  d10
   1855        vmovl.u8        q5,  d11
   1856 
   1857        vmul.i16        q8,  q8,  q6   // scaling * grain
   1858        vmul.i16        q9,  q9,  q7
   1859        vmul.i16        q10, q10, q4
   1860        vmul.i16        q11, q11, q5
   1861 
   1862        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
   1863        vrshl.s16       q9,  q9,  q13
   1864        vrshl.s16       q10, q10, q13
   1865        vrshl.s16       q11, q11, q13
   1866 
   1867        vaddw.u8        q8,  q8,  d0   // *src + noise
   1868        vaddw.u8        q9,  q9,  d1
   1869        vaddw.u8        q10, q10, d2
   1870        vaddw.u8        q11, q11, d3
   1871 
   1872        vqmovun.s16     d0,  q8
   1873        vqmovun.s16     d1,  q9
   1874        vqmovun.s16     d2,  q10
   1875        vqmovun.s16     d3,  q11
   1876 
   1877        vmax.u8         q0,  q0,  q14
   1878        vmax.u8         q1,  q1,  q14
   1879        vmin.u8         q0,  q0,  q15
   1880        vmin.u8         q1,  q1,  q15
   1881 
   1882        subs            r9,  r9,  #1
   1883 .if \oy
   1884        vdup.8          d6,  d25[0]
   1885        vdup.8          d7,  d25[1]
   1886 .endif
   1887 
   1888        vst1.8          {q0, q1}, [r0, :128], r2 // dst
   1889        bgt             1b
   1890 
   1891 .if \oy
   1892        cmp             r12, #0
   1893        mov             r9,  r12               // restore actual remaining h
   1894        bgt             L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
   1895 .endif
   1896        b               9f
   1897 .endm
   1898        fguv_loop_sx0   0, 0, 0
   1899        fguv_loop_sx0   0, 0, 1
   1900        fguv_loop_sx0   0, 1, 0
   1901        fguv_loop_sx0   0, 1, 1
   1902        fguv_loop_sx0   1, 0, 0
   1903        fguv_loop_sx0   1, 0, 1
   1904        fguv_loop_sx0   1, 1, 0
   1905        fguv_loop_sx0   1, 1, 1
   1906 
   1907 9:
   1908        vpop            {q4-q7}
   1909        pop             {r4-r11,pc}
   1910 endfunc
   1911 
   1912 function fguv_loop_sx1_neon
   1913 L(fguv_loop_sx1_tbl):
   1914        .word L(fguv_loop_sx1_csfl0_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
   1915        .word L(fguv_loop_sx1_csfl0_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
   1916        .word L(fguv_loop_sx1_csfl0_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
   1917        .word L(fguv_loop_sx1_csfl0_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
   1918        .word L(fguv_loop_sx1_csfl1_00) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
   1919        .word L(fguv_loop_sx1_csfl1_01) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
   1920        .word L(fguv_loop_sx1_csfl1_10) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
   1921        .word L(fguv_loop_sx1_csfl1_11) - L(fguv_loop_sx1_tbl) + CONFIG_THUMB
   1922 
   1923 .macro fguv_loop_sx1 csfl, ox, oy
   1924 L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
   1925 .if \oy
   1926        mov             r12, lr
   1927 .endif
   1928 1:
   1929 .if \ox
   1930        vld1.8          {d8},       [r4],        r10 // grain_lut old
   1931 .endif
   1932 .if \oy
   1933        vld1.8          {q8},       [r8],        r10 // grain_lut top
   1934 .endif
   1935 .if \ox && \oy
   1936        vld1.8          {d10},      [r11],       r10 // grain_lut top old
   1937 .endif
   1938        vld1.8          {q0,  q1},  [r6, :128],  r7  // luma
   1939        vld1.8          {q10},      [r5],        r10 // grain_lut
   1940        vld1.8          {q11},      [r1, :128],  r2  // src
   1941 
   1942 .if \ox
   1943        vmull.s8        q4,  d8,  d24
   1944        vmlal.s8        q4,  d20, d25
   1945 .endif
   1946 
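                // Pairwise-sum the luma bytes; the rounded averages (after the >> 1
                // below) feed the scaling-index computation for the horizontally
                // subsampled chroma.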
   1947        vpaddl.u8       q0,  q0
   1948        vpaddl.u8       q1,  q1
   1949 .if \oy
   1950 .if \ox
   1951        vmull.s8        q5,  d10, d24
   1952        vmlal.s8        q5,  d16, d25
   1953        vqrshrn.s16     d20, q4,  #5
   1954        vqrshrn.s16     d16, q5,  #5
   1955 .endif
   1956 
   1957        vmull.s8        q4,  d20, d7
   1958        vmull.s8        q5,  d21, d7
   1959        vmlal.s8        q4,  d16, d6
   1960        vmlal.s8        q5,  d17, d6
   1961        vqrshrn.s16     d20, q4,  #5
   1962        vqrshrn.s16     d21, q5,  #5
   1963 .elseif \ox
   1964        vqrshrn.s16     d20, q4,  #5
   1965 .endif
   1966 .if \csfl
   1967        vrshrn.u16      d0,  q0,  #1
   1968        vrshrn.u16      d1,  q1,  #1
   1969 .else
   1970        vrshr.u16       q4,  q0,  #1
   1971        vrshr.u16       q5,  q1,  #1
   1972        vmovl.u8        q0,  d22
   1973        vmovl.u8        q1,  d23
   1974        vmul.i16        q4,  q4,  d4[0]
   1975        vmul.i16        q5,  q5,  d4[0]
   1976        vmul.i16        q0,  q0,  d4[1]
   1977        vmul.i16        q1,  q1,  d4[1]
   1978        vqadd.s16       q4,  q4,  q0
   1979        vqadd.s16       q5,  q5,  q1
   1980        vdup.16         q0,  d4[2]
   1981        vshr.s16        q4,  q4,  #6
   1982        vshr.s16        q5,  q5,  #6
   1983        vadd.i16        q4,  q4,  q0
   1984        vadd.i16        q5,  q5,  q0
   1985        vqmovun.s16     d0,  q4
   1986        vqmovun.s16     d1,  q5
   1987 .endif
   1988 
   1989        bl              gather16_neon
   1990 
   1991        vmovl.s8        q8,  d20       // grain
   1992        vmovl.s8        q9,  d21
   1993 
   1994        vmovl.u8        q6,  d8        // scaling
   1995        vmovl.u8        q7,  d9
   1996 
   1997        vmul.i16        q8,  q8,  q6   // scaling * grain
   1998        vmul.i16        q9,  q9,  q7
   1999 
   2000        vrshl.s16       q8,  q8,  q13  // round2(scaling * grain, scaling_shift)
   2001        vrshl.s16       q9,  q9,  q13
   2002 
   2003        vaddw.u8        q8,  q8,  d22  // *src + noise
   2004        vaddw.u8        q9,  q9,  d23
   2005 
   2006        vqmovun.s16     d0,  q8
   2007        vqmovun.s16     d1,  q9
   2008 
   2009        vmax.u8         q0,  q0,  q14
   2010        vmin.u8         q0,  q0,  q15
   2011 
   2012        subs            r9,  r9,  #1
   2013 .if \oy
   2014        vswp            d6,  d7
   2015 .endif
   2016        vst1.8          {q0}, [r0, :128], r2 // dst
   2017        bgt             1b
   2018 
   2019 .if \oy
   2020        cmp             r12, #0
   2021        mov             r9,  r12               // restore actual remaining h
   2022        bgt             L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
   2023 .endif
   2024 
   2025        b               9f
   2026 .endm
   2027        fguv_loop_sx1   0, 0, 0
   2028        fguv_loop_sx1   0, 0, 1
   2029        fguv_loop_sx1   0, 1, 0
   2030        fguv_loop_sx1   0, 1, 1
   2031        fguv_loop_sx1   1, 0, 0
   2032        fguv_loop_sx1   1, 0, 1
   2033        fguv_loop_sx1   1, 1, 0
   2034        fguv_loop_sx1   1, 1, 1
   2035 
   2036 9:
   2037        vpop            {q4-q7}
   2038        pop             {r4-r11,pc}
   2039 endfunc