tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

filmgrain.S (73466B)


      1 /*
      2 * Copyright © 2021, VideoLAN and dav1d authors
      3 * Copyright © 2021, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 #include "src/arm/asm-offsets.h"
     31 
     32 #define GRAIN_WIDTH 82
     33 #define GRAIN_HEIGHT 73
     34 
     35 #define SUB_GRAIN_WIDTH 44
     36 #define SUB_GRAIN_HEIGHT 38
     37 
      // Advance the film grain LFSR seed kept in w2 by \steps bits.
      // The feedback value is (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1);
      // its low \steps bits become the new top bits of the 16-bit state.
      // With shift=0 the state is left unshifted, so the caller must account
      // for the extra age itself (see read_rand's \age parameter).
      // Clobbers w11-w13.
      38 .macro increment_seed steps, shift=1
      39        lsr             w11, w2,  #3
      40        lsr             w12, w2,  #12
      41        lsr             w13, w2,  #1
      42        eor             w11, w2,  w11                     // (r >> 0) ^ (r >> 3)
      43        eor             w12, w12, w13                     // (r >> 12) ^ (r >> 1)
      44        eor             w11, w11, w12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
      45 .if \shift
      46        lsr             w2,  w2,  #\steps
      47 .endif
      48        and             w11, w11, #((1 << \steps) - 1)    // bit
      49 .if \shift
      50        orr             w2,  w2,  w11, lsl #(16 - \steps) // *state
      51 .else
      52        orr             w2,  w2,  w11, lsl #16            // *state
      53 .endif
      54 .endm
     55 
      // Extract \bits random bits from the seed in x2 without consuming
      // state.  \age selects which of the bits produced by a preceding
      // increment_seed (with shift=0) to read: higher \age reads older,
      // higher-positioned bits.
      56 .macro read_rand dest, bits, age
      57        ubfx            \dest,  x2,   #16 - \bits - \age, #\bits
      58 .endm
     59 
      // Extract \bits random bits from the seed in x2, then consume one
      // state bit (shift the seed down by 1).
      60 .macro read_shift_rand dest, bits
      61        ubfx            \dest,  x2,   #17 - \bits, #\bits
      62        lsr             w2,  w2,  #1
      63 .endm
     64 
     65 // special calling convention:
     66 // w2 holds seed
     67 // x3 holds dav1d_gaussian_sequence
     68 // clobbers x11-x15
     69 // returns in v0.8h
      // Fill v0.8h with 8 entries of dav1d_gaussian_sequence (x3), selected
      // by 8 fresh 11-bit random indices (two increment_seed-by-4 batches,
      // 4 indices read per batch via the \age parameter).
      70 function get_gaussian_neon
      71        increment_seed  4
      72        read_rand       x14, 11,  3            // four 11-bit indices per batch
      73        read_rand       x15, 11,  2
      74        add             x14, x3,  x14, lsl #1  // table entries are 2 bytes wide
      75        add             x15, x3,  x15, lsl #1
      76        ld1             {v0.h}[0], [x14]
      77        read_rand       x14, 11,  1
      78        ld1             {v0.h}[1], [x15]
      79        add             x14, x3,  x14, lsl #1
      80        read_rand       x15, 11,  0
      81        increment_seed  4                      // second batch of four indices
      82        add             x15, x3,  x15, lsl #1
      83        ld1             {v0.h}[2], [x14]
      84        read_rand       x14, 11,  3
      85        ld1             {v0.h}[3], [x15]
      86        add             x14, x3,  x14, lsl #1
      87        read_rand       x15, 11,  2
      88        ld1             {v0.h}[4], [x14]
      89        add             x15, x3,  x15, lsl #1
      90        read_rand       x14, 11,  1
      91        ld1             {v0.h}[5], [x15]
      92        read_rand       x15, 11,  0
      93        add             x14, x3,  x14, lsl #1
      94        add             x15, x3,  x15, lsl #1
      95        ld1             {v0.h}[6], [x14]
      96        ld1             {v0.h}[7], [x15]
      97        ret
      98 endfunc
     99 
     // Generate one full GRAIN_WIDTH (82) entry grain row: ten calls of
     // get_gaussian_neon yield 80 values packed 16-per-register into
     // \r0-\r4, and a final increment_seed-by-2 produces the last 2 values
     // in the low bytes of \r5.  v31 is presumably preloaded by the caller
     // with the negated grain scale shift so srshl performs a rounding
     // right shift (setup not visible in this chunk — confirm at call site).
     100 .macro get_grain_row r0, r1, r2, r3, r4, r5
     101        bl              get_gaussian_neon
     102        srshl           \r5\().8h,  v0.8h,  v31.8h
     103        xtn             \r0\().8b,  \r5\().8h       // narrow to 8-bit grain
     104        bl              get_gaussian_neon
     105        srshl           \r5\().8h,  v0.8h,  v31.8h
     106        xtn2            \r0\().16b, \r5\().8h
     107        bl              get_gaussian_neon
     108        srshl           \r5\().8h,  v0.8h,  v31.8h
     109        xtn             \r1\().8b,  \r5\().8h
     110        bl              get_gaussian_neon
     111        srshl           \r5\().8h,  v0.8h,  v31.8h
     112        xtn2            \r1\().16b, \r5\().8h
     113        bl              get_gaussian_neon
     114        srshl           \r5\().8h,  v0.8h,  v31.8h
     115        xtn             \r2\().8b,  \r5\().8h
     116        bl              get_gaussian_neon
     117        srshl           \r5\().8h,  v0.8h,  v31.8h
     118        xtn2            \r2\().16b, \r5\().8h
     119        bl              get_gaussian_neon
     120        srshl           \r5\().8h,  v0.8h,  v31.8h
     121        xtn             \r3\().8b,  \r5\().8h
     122        bl              get_gaussian_neon
     123        srshl           \r5\().8h,  v0.8h,  v31.8h
     124        xtn2            \r3\().16b, \r5\().8h
     125        bl              get_gaussian_neon
     126        srshl           \r5\().8h,  v0.8h,  v31.8h
     127        xtn             \r4\().8b,  \r5\().8h
     128        bl              get_gaussian_neon
     129        srshl           \r5\().8h,  v0.8h,  v31.8h
     130        xtn2            \r4\().16b, \r5\().8h
     131        increment_seed  2                           // entries 81 and 82 of the row
     132        read_rand       x14, 11,  1
     133        read_rand       x15, 11,  0
     134        add             x14, x3,  x14, lsl #1
     135        add             x15, x3,  x15, lsl #1
     136        ld1             {\r5\().h}[0], [x14]
     137        ld1             {\r5\().h}[1], [x15]
     138        srshl           v0.4h,      \r5\().4h,  v31.4h
     139        xtn             \r5\().8b,  v0.8h
     140 .endm
    141 
     // Store one 82-entry grain row (32 + 32 + 16 + 2 bytes), advancing x0
     // past the row.
     142 .macro store_grain_row r0, r1, r2, r3, r4, r5
     143        st1             {\r0\().16b,\r1\().16b}, [x0], #32
     144        st1             {\r2\().16b,\r3\().16b}, [x0], #32
     145        st1             {\r4\().16b},  [x0], #16
     146        st1             {\r5\().h}[0], [x0], #2
     147 .endm
    148 
     // Generate one SUB_GRAIN_WIDTH (44) entry grain row for subsampled
     // chroma: five calls of get_gaussian_neon give 40 values (\r0, \r1 and
     // the low half of \r2); four more values are fetched inline into the
     // upper half of \r2 (its top 4 bytes are unused padding).
     // As in get_grain_row, v31 is presumably the negated grain scale shift.
     150 .macro get_grain_row_44 r0, r1, r2
     151        bl              get_gaussian_neon
     152        srshl           \r2\().8h,  v0.8h,  v31.8h
     153        xtn             \r0\().8b,  \r2\().8h
     154        bl              get_gaussian_neon
     155        srshl           \r2\().8h,  v0.8h,  v31.8h
     156        xtn2            \r0\().16b, \r2\().8h
     157        bl              get_gaussian_neon
     158        srshl           \r2\().8h,  v0.8h,  v31.8h
     159        xtn             \r1\().8b,  \r2\().8h
     160        bl              get_gaussian_neon
     161        srshl           \r2\().8h,  v0.8h,  v31.8h
     162        xtn2            \r1\().16b, \r2\().8h
     163        bl              get_gaussian_neon
     164        srshl           \r2\().8h,  v0.8h,  v31.8h
     165        xtn             \r2\().8b,  \r2\().8h
     166 
     167        increment_seed  4                  // final 4 of the 44 entries
     168        read_rand       x14, 11,  3
     169        read_rand       x15, 11,  2
     170        add             x14, x3,  x14, lsl #1
     171        add             x15, x3,  x15, lsl #1
     172        ld1             {v0.h}[0], [x14]
     173        read_rand       x14, 11,  1
     174        ld1             {v0.h}[1], [x15]
     175        read_rand       x15, 11,  0
     176        add             x14, x3,  x14, lsl #1
     177        add             x15, x3,  x15, lsl #1
     178        ld1             {v0.h}[2], [x14]
     179        ld1             {v0.h}[3], [x15]
     180        srshl           v0.4h,      v0.4h,  v31.4h
     181        xtn2            \r2\().16b, v0.8h
     182 .endm
    182 
     // Store one 44-entry subsampled grain row (48 bytes incl. padding),
     // then advance x0 to the next row: rows are laid out with a full
     // GRAIN_WIDTH stride even in the subsampled buffer.
     183 .macro store_grain_row_44 r0, r1, r2
     184        st1             {\r0\().16b,\r1\().16b}, [x0], #32
     185        st1             {\r2\().16b},  [x0]
     186        add             x0,  x0,  #GRAIN_WIDTH-32
     187 .endm
    188 
     // Produce 2 grain values in v0.b[0..1]: two fresh 11-bit indices into
     // dav1d_gaussian_sequence (x3), rounding-shifted by v31 and narrowed.
     189 function get_grain_2_neon
     190        increment_seed  2
     191        read_rand       x14, 11,  1
     192        read_rand       x15, 11,  0
     193        add             x14, x3,  x14, lsl #1
     194        add             x15, x3,  x15, lsl #1
     195        ld1             {v0.h}[0], [x14]
     196        ld1             {v0.h}[1], [x15]
     197        srshl           v0.4h,   v0.4h,   v31.4h
     198        xtn             v0.8b,   v0.8h
     199        ret
     200 endfunc
    201 
     // Convenience wrapper around get_grain_2_neon that moves the 2-value
     // result into \dst (skipping the mov when \dst is already v0).
     202 .macro get_grain_2 dst
     203        bl              get_grain_2_neon
     204 .ifnc \dst, v0
     205        mov             \dst\().8b, v0.8b
     206 .endif
     207 .endm
    208 
    209 // w15 holds the number of entries to produce
    210 // w14, w16 and w17 hold the previous output entries
    211 // v0 holds the vector of produced entries
    212 // v1 holds the input vector of sums from above
     // Generate w15 grain entries through the lag-\n AR filter.
     // Per iteration: start from the precomputed sum of the above-row taps
     // (v1.s[0], rotated through v1), add coefficient*previous-output terms
     // for the horizontal taps, round and shift by ar_coeff_shift, add a
     // fresh rounded/scaled gaussian noise sample, clamp to [w6, w5]
     // (lower/upper grain bounds), and shift the result into v0.b[15].
     213 .macro output_lag n
     214 function output_lag\n\()_neon
     215 1:
     216        read_shift_rand x13, 11
     217        mov             w11, v1.s[0]              // sum of above-row taps
     218        ldrsh           w12, [x3, x13, lsl #1]    // gaussian noise sample
     219        ext             v0.16b,  v0.16b,  v0.16b,  #1
     220 .if \n == 1
     221        madd            w11, w14, w4,  w11        // sum (above) + *coeff * prev output
     222 .elseif \n == 2
     223        madd            w11, w16, w4,  w11        // sum (above) + *coeff * prev output 1
     224        madd            w11, w14, w17, w11        // += *coeff * prev output 2
     225        mov             w16, w14                  // age the output history
     226 .else
     227        madd            w11, w17, w4,  w11        // sum (above) + *coeff * prev output 1
     228        madd            w11, w16, w20, w11        // sum (above) + *coeff * prev output 2
     229        madd            w11, w14, w21, w11        // += *coeff * prev output 3
     230        mov             w17, w16                  // age the output history
     231        mov             w16, w14
     232 .endif
     233        add             w14, w11, w8              // 1 << (ar_coeff_shift - 1)
     234        add             w12, w12, w10             // 1 << (4 + grain_scale_shift - 1)
     235        asr             w14, w14, w7              // >> ar_coeff_shift
     236        asr             w12, w12, w9              // >> (4 + grain_scale_shift)
     237        add             w14, w14, w12
     238        cmp             w14, w5
     239        csel            w14, w14, w5,  le         // min(w14, w5): clamp above
     240        cmp             w14, w6
     241        csel            w14, w14, w6,  ge         // max(w14, w6): clamp below
     242        subs            w15, w15, #1
     243        ext             v1.16b,  v1.16b,  v1.16b,  #4  // next above-sum into v1.s[0]
     244        ins             v0.b[15], w14
     245        b.gt            1b
     246        ret
     247 endfunc
     248 .endm
    249 
     // Instantiate output_lag1_neon, output_lag2_neon and output_lag3_neon.
     250 output_lag 1
     251 output_lag 2
     252 output_lag 3
    253 
    254 
     // For 16 output columns, sum the three lag-1 taps from the row above:
     // v0 = above row shifted towards top-left, v3 = directly above,
     // v1 = shifted towards top-right, with the per-tap coefficients
     // broadcast in v27/v28/v29.  Results are widened to 32 bit and
     // returned in v4-v7 (4 columns per register).
     255 function sum_lag1_above_neon
     256        smull           v2.8h,   v3.8b,   v28.8b
     257        smull2          v3.8h,   v3.16b,  v28.16b
     258        smull           v4.8h,   v0.8b,   v27.8b
     259        smull2          v5.8h,   v0.16b,  v27.16b
     260        smull           v6.8h,   v1.8b,   v29.8b
     261        smull2          v7.8h,   v1.16b,  v29.16b
     262        saddl           v0.4s,   v2.4h,   v4.4h
     263        saddl2          v1.4s,   v2.8h,   v4.8h
     264        saddl           v2.4s,   v3.4h,   v5.4h
     265        saddl2          v3.4s,   v3.8h,   v5.8h
     266        saddw           v4.4s,   v0.4s,   v6.4h
     267        saddw2          v5.4s,   v1.4s,   v6.8h
     268        saddw           v6.4s,   v2.4s,   v7.4h
     269        saddw2          v7.4s,   v3.4s,   v7.8h
     270        ret
     271 endfunc
    272 
     // Shared body for all sum_{y,uv_*}_lag{1,2,3}_{left,mid,right} funcs.
     // 1. Sum the above-row AR taps (sum_\lag\()_above_neon -> v4-v7).
     // 2. For chroma, load the co-located luma grain from x19 and collapse
     //    it to the chroma resolution, then add the luma->chroma tap.
     // 3. Run the AR filter over \elems columns via output_\lag\()_neon,
     //    with special seed/entry handling at the left and right edges.
     // \store selects whether v0 is written out to [x0] at the end.
     273 .macro sum_lag_n_body lag, type, uv_layout, edge, elems, store, uv_coeff
     274        bl              sum_\lag\()_above_neon
     275 .ifc \type, uv_420
     276        add             x12, x19, #GRAIN_WIDTH    // second luma row of the 2x2 block
     277        ld1             {v22.16b, v23.16b}, [x19], #32
     278        ld1             {v24.16b, v25.16b}, [x12]
     279        saddlp          v22.8h,  v22.16b          // sum horizontal pairs
     280        saddlp          v23.8h,  v23.16b
     281        saddlp          v24.8h,  v24.16b
     282        saddlp          v25.8h,  v25.16b
     283        add             v22.8h,  v22.8h,  v24.8h  // add the two rows
     284        add             v23.8h,  v23.8h,  v25.8h
     285        rshrn           v0.8b,   v22.8h,  #2      // rounded average of 4 luma values
     286        rshrn2          v0.16b,  v23.8h,  #2
     287 .endif
     288 .ifc \type, uv_422
     289        ld1             {v22.16b, v23.16b}, [x19], #32
     290        saddlp          v22.8h,  v22.16b          // sum horizontal pairs
     291        saddlp          v23.8h,  v23.16b
     292        rshrn           v0.8b,   v22.8h,  #1      // rounded average of 2 luma values
     293        rshrn2          v0.16b,  v23.8h,  #1
     294 .endif
     295 .ifc \type, uv_444
     296        ld1             {v0.16b}, [x19], #16      // luma grain used as-is
     297 .endif
     298 .if \uv_layout
     299 .ifnb \uv_coeff
     300        dup             v1.16b,  \uv_coeff        // explicit luma->chroma coefficient
     301        smull           v2.8h,   v0.8b,   v1.8b
     302        smull2          v3.8h,   v0.16b,  v1.16b
     303 .else
     304        smull           v2.8h,   v0.8b,   v30.8b  // default coefficient in v30
     305        smull2          v3.8h,   v0.16b,  v30.16b
     306 .endif
     307        saddw           v4.4s,   v4.4s,   v2.4h   // add luma tap to the above sums
     308        saddw2          v5.4s,   v5.4s,   v2.8h
     309        saddw           v6.4s,   v6.4s,   v3.4h
     310        saddw2          v7.4s,   v7.4s,   v3.8h
     311 .endif
     312 // The output stage below is identical for several type/elems combos, so
     313 // those variants tail-branch into an already-emitted copy of it.
     314 .if \uv_layout && \elems == 16
     315        b               sum_\lag\()_y_\edge\()_start
     316 .elseif \uv_layout == 444 && \elems == 15
     317        b               sum_\lag\()_y_\edge\()_start
     318 .elseif \uv_layout == 422 && \elems == 9
     319        b               sum_\lag\()_uv_420_\edge\()_start
     320 .else
     321 sum_\lag\()_\type\()_\edge\()_start:
     322 .ifc \edge, left
     323        increment_seed  4
     324        read_rand       x12, 11,  3               // 3 seed entries for the leftmost cols
     325        read_rand       x13, 11,  2
     326        read_rand       x14, 11,  1
     327        add             x12, x3,  x12, lsl #1
     328        add             x13, x3,  x13, lsl #1
     329        add             x14, x3,  x14, lsl #1
     330        ld1             {v0.h}[5], [x12]
     331        ld1             {v0.h}[6], [x13]
     332        ld1             {v0.h}[7], [x14]
     333        lsl             x2,  x2,  #1             // shift back the state as if we'd done increment_seed with shift=0
     334        srshl           v0.8h,   v0.8h,   v31.8h
     335        xtn2            v0.16b,  v0.8h
     336        ext             v4.16b,  v4.16b,  v4.16b,  #12  // first above-sum into v4.s[3]
     337 .ifc \lag, lag3
     338        smov            w17, v0.b[13]             // seed the 3-deep output history
     339 .endif
     340 .ifnc \lag, lag1
     341        smov            w16, v0.b[14]
     342 .endif
     343        smov            w14, v0.b[15]
     344 
     345        mov             v1.16b,  v4.16b
     346        mov             w15, #1                   // only 1 AR output in the first group
     347        bl              output_\lag\()_neon
     348 .else
     349        increment_seed  4, shift=0
     350        mov             v1.16b,  v4.16b
     351        mov             w15, #4                   // 4 AR outputs per group of sums
     352        bl              output_\lag\()_neon
     353 .endif
     354 
     355        increment_seed  4, shift=0
     356        mov             v1.16b,  v5.16b
     356        mov             w15, #4
     357        bl              output_\lag\()_neon
     358 
     359        increment_seed  4, shift=0
     360        mov             v1.16b,  v6.16b
     361 .if \elems == 9
     362        mov             w15, #1                   // subsampled right edge: 9th entry
     363        bl              output_\lag\()_neon
     364        lsr             w2,  w2,  #3              // drop the 3 unused state bits
     365 
     366        read_rand       x12, 11,  2               // 3 plain grain values pad the vector
     367        read_rand       x13, 11,  1
     368        read_rand       x14, 11,  0
     369        add             x12, x3,  x12, lsl #1
     370        add             x13, x3,  x13, lsl #1
     371        add             x14, x3,  x14, lsl #1
     372        ld1             {v1.h}[0], [x12]
     373        ld1             {v1.h}[1], [x13]
     374        ld1             {v1.h}[2], [x14]
     375        srshl           v1.4h,   v1.4h,   v31.4h
     376        xtn             v1.8b,   v1.8h
     377        ext             v0.16b,  v0.16b,  v1.16b,  #7
     378 .else
     379        mov             w15, #4
     380        bl              output_\lag\()_neon
     381 
     382        increment_seed  4, shift=0
     383        mov             v1.16b,  v7.16b
     384 
     385 .ifc \edge, right
     386        mov             w15, #3                   // 15 elems: last group is 3 + 1 plain
     387        bl              output_\lag\()_neon
     388        read_shift_rand x15, 11
     389        add             x15, x3,  x15, lsl #1
     390        ld1             {v1.h}[0], [x15]
     391        srshl           v1.4h,   v1.4h,   v31.4h
     392        ext             v0.16b,  v0.16b,  v1.16b,  #1
     393 .else
     394        mov             w15, #4
     395        bl              output_\lag\()_neon
     396 .endif
     397 .endif
     398 .if \store
     399        st1             {v0.16b}, [x0], #16
     400 .endif
     401        ldr             x30, [sp], #16            // x30 was pushed by the enclosing func
     402        AARCH64_VALIDATE_LINK_REGISTER
     403        ret
     404 .endif
     405 .endm
    405 
     // Emit function sum_\type\()_lag1_\edge\()_neon.  x30 is saved because
     // sum_lag_n_body issues bl calls; store=0 leaves the 16 outputs in v0
     // for the caller instead of writing them to memory.
     406 .macro sum_lag1_func type, uv_layout, edge, elems=16
     407 function sum_\type\()_lag1_\edge\()_neon
     408        AARCH64_SIGN_LINK_REGISTER
     409        str             x30, [sp, #-16]!
     410        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems, store=0
     411 endfunc
     412 .endm
    413 
     // Instantiate all lag1 edge/type variants; right-edge blocks generate
     // only 15 elements (luma/444) or 9 (horizontally subsampled chroma).
     414 sum_lag1_func y,      0,   left
     415 sum_lag1_func y,      0,   mid
     416 sum_lag1_func y,      0,   right, 15
     417 sum_lag1_func uv_444, 444, left
     418 sum_lag1_func uv_444, 444, mid
     419 sum_lag1_func uv_444, 444, right, 15
     420 sum_lag1_func uv_422, 422, left
     421 sum_lag1_func uv_422, 422, mid
     422 sum_lag1_func uv_422, 422, right, 9
     423 sum_lag1_func uv_420, 420, left
     424 sum_lag1_func uv_420, 420, mid
     425 sum_lag1_func uv_420, 420, right, 9
    426 
     // Run lag1 filtering for one 16-column group: build the above-row
     // neighbour vectors expected by sum_lag1_above_neon (v0 = shifted so
     // each lane sees its top-left neighbour, v3 = directly above,
     // v1 = top-right neighbour) from the \left/\mid/\right row registers,
     // then call the matching edge variant and collect the result in \dst.
     427 .macro sum_lag1 type, dst, left, mid, right, edge=mid
     428        mov             v3.16b,  \mid\().16b
     429        ext             v0.16b,  \left\().16b, \mid\().16b,   #15
     430        ext             v1.16b,  \mid\().16b,  \right\().16b, #1
     431        bl              sum_\type\()_lag1_\edge\()_neon
     432        mov             \dst\().16b, v0.16b
     433 .endm
    434 
     // Per-plane convenience aliases around sum_lag1.
     435 .macro sum_y_lag1 dst, left, mid, right, edge=mid
     436        sum_lag1        y, \dst, \left, \mid, \right, \edge
     437 .endm
     438 
     439 .macro sum_uv_444_lag1 dst, left, mid, right, edge=mid
     440        sum_lag1        uv_444, \dst, \left, \mid, \right, \edge
     441 .endm
     442 
     443 .macro sum_uv_422_lag1 dst, left, mid, right, edge=mid
     444        sum_lag1        uv_422, \dst, \left, \mid, \right, \edge
     445 .endm
     446 
     447 .macro sum_uv_420_lag1 dst, left, mid, right, edge=mid
     448        sum_lag1        uv_420, \dst, \left, \mid, \right, \edge
     449 .endm
    450 
    451 
     // Sum the 12 above-row lag-2 taps for 16 output columns.
     // Sliding window: v16/v17 = (left, mid) 16-byte chunks of the row two
     // above, v19/v20 = same for the row directly above; the chunks to the
     // right are loaded here from the output buffer at x0.  Coefficients
     // live in v30: b[0..4] row -2 at offsets -2,-1,0,+1,+2 (b[2] = centre),
     // b[5..9] row -1 likewise (b[7] = centre).  Widened 32-bit sums are
     // returned in v4-v7, and the window registers are slid one chunk left.
     452 function sum_lag2_above_neon
     453        sub             x12, x0,  #2*GRAIN_WIDTH - 16
     454        sub             x13, x0,  #1*GRAIN_WIDTH - 16
     455        ld1             {v18.16b}, [x12] // load top right
     456        ld1             {v21.16b}, [x13]
     457 
     458        ext             v22.16b, v16.16b, v17.16b, #14 // top left, top mid
     459        dup             v26.16b, v30.b[0]
     460        ext             v23.16b, v16.16b, v17.16b, #15
     461        dup             v27.16b, v30.b[1]
     462        ext             v0.16b,  v17.16b, v18.16b, #1  // top mid, top right
     463        dup             v28.16b, v30.b[3]
     464        ext             v1.16b,  v17.16b, v18.16b, #2
     465        dup             v29.16b, v30.b[4]
     466 
     467        smull           v2.8h,   v22.8b,  v26.8b       // row -2, offsets -2,-1,+1,+2
     468        smull2          v3.8h,   v22.16b, v26.16b
     469        smull           v4.8h,   v23.8b,  v27.8b
     470        smull2          v5.8h,   v23.16b, v27.16b
     471        smull           v6.8h,   v0.8b,   v28.8b
     472        smull2          v7.8h,   v0.16b,  v28.16b
     473        smull           v0.8h,   v1.8b,   v29.8b
     474        smull2          v1.8h,   v1.16b,  v29.16b
     475        saddl           v22.4s,  v2.4h,   v4.4h
     476        saddl2          v23.4s,  v2.8h,   v4.8h
     477        saddl           v26.4s,  v3.4h,   v5.4h
     478        saddl2          v27.4s,  v3.8h,   v5.8h
     479        saddl           v2.4s,   v0.4h,   v6.4h
     480        saddl2          v3.4s,   v0.8h,   v6.8h
     481        saddl           v6.4s,   v1.4h,   v7.4h
     482        saddl2          v7.4s,   v1.8h,   v7.8h
     483        add             v4.4s,   v22.4s,  v2.4s
     484        add             v5.4s,   v23.4s,  v3.4s
     485        add             v6.4s,   v26.4s,  v6.4s
     486        add             v7.4s,   v27.4s,  v7.4s
     487 
     488        ext             v22.16b, v19.16b, v20.16b, #14 // top left, top mid
     489        dup             v26.16b, v30.b[5]
     490        ext             v23.16b, v19.16b, v20.16b, #15
     491        dup             v27.16b, v30.b[6]
     492        ext             v0.16b,  v20.16b, v21.16b, #1  // top mid, top right
     493        dup             v28.16b, v30.b[8]
     494        ext             v1.16b,  v20.16b, v21.16b, #2
     495        dup             v29.16b, v30.b[9]
     496 
     497        smull           v2.8h,   v22.8b,  v26.8b       // row -1, offsets -2,-1,+1,+2
     498        smull2          v3.8h,   v22.16b, v26.16b
     499        smull           v22.8h,  v23.8b,  v27.8b
     500        smull2          v23.8h,  v23.16b, v27.16b
     501        smull           v26.8h,  v0.8b,   v28.8b
     502        smull2          v27.8h,  v0.16b,  v28.16b
     503        smull           v28.8h,  v1.8b,   v29.8b
     504        smull2          v29.8h,  v1.16b,  v29.16b
     505        saddl           v0.4s,   v2.4h,   v22.4h
     506        saddl2          v1.4s,   v2.8h,   v22.8h
     507        saddl           v2.4s,   v3.4h,   v23.4h
     508        saddl2          v3.4s,   v3.8h,   v23.8h
     509        saddl           v22.4s,  v26.4h,  v28.4h
     510        saddl2          v23.4s,  v26.8h,  v28.8h
     511        saddl           v26.4s,  v27.4h,  v29.4h
     512        saddl2          v27.4s,  v27.8h,  v29.8h
     513        add             v0.4s,   v0.4s,   v22.4s
     514        add             v1.4s,   v1.4s,   v23.4s
     515        add             v2.4s,   v2.4s,   v26.4s
     516        add             v3.4s,   v3.4s,   v27.4s
     517        dup             v26.16b, v30.b[2]              // centre taps of both rows
     518        dup             v27.16b, v30.b[7]
     519        smull           v22.8h,  v17.8b,  v26.8b
     520        smull2          v23.8h,  v17.16b, v26.16b
     521        smull           v24.8h,  v20.8b,  v27.8b
     522        smull2          v25.8h,  v20.16b, v27.16b
     523        add             v4.4s,   v4.4s,   v0.4s
     524        add             v5.4s,   v5.4s,   v1.4s
     525        add             v6.4s,   v6.4s,   v2.4s
     526        add             v7.4s,   v7.4s,   v3.4s
     527 
     528        mov             v16.16b, v17.16b               // slide the window one chunk left
     529        mov             v17.16b, v18.16b
     530 
     531        saddl           v0.4s,   v22.4h,  v24.4h
     532        saddl2          v1.4s,   v22.8h,  v24.8h
     533        saddl           v2.4s,   v23.4h,  v25.4h
     534        saddl2          v3.4s,   v23.8h,  v25.8h
     535        mov             v19.16b, v20.16b
     536        mov             v20.16b, v21.16b
     537        add             v4.4s,   v4.4s,   v0.4s
     538        add             v5.4s,   v5.4s,   v1.4s
     539        add             v6.4s,   v6.4s,   v2.4s
     540        add             v7.4s,   v7.4s,   v3.4s
     541        ret
     542 endfunc
    543 
     // Emit function sum_\type\()_lag2_\edge\()_neon.  At the left edge the
     // sliding window of sum_lag2_above_neon is primed with the 16 columns
     // directly above the block; store=1 writes the 16 outputs to [x0], and
     // the luma->chroma coefficient is taken from v30.b[12].
     544 .macro sum_lag2_func type, uv_layout, edge, elems=16
     545 function sum_\type\()_lag2_\edge\()_neon
     546        AARCH64_SIGN_LINK_REGISTER
     547        str             x30, [sp, #-16]!
     548 .ifc \edge, left
     549        sub             x12, x0,  #2*GRAIN_WIDTH
     550        sub             x13, x0,  #1*GRAIN_WIDTH
     551        ld1             {v17.16b}, [x12] // load the previous block right above
     552        ld1             {v20.16b}, [x13]
     553 .endif
     554        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[12]
     555 endfunc
     556 .endm
    557 
     // Instantiate all lag2 edge/type variants (same elems rules as lag1).
     558 sum_lag2_func y,      0,   left
     559 sum_lag2_func y,      0,   mid
     560 sum_lag2_func y,      0,   right, 15
     561 sum_lag2_func uv_444, 444, left
     562 sum_lag2_func uv_444, 444, mid
     563 sum_lag2_func uv_444, 444, right, 15
     564 sum_lag2_func uv_422, 422, left
     565 sum_lag2_func uv_422, 422, mid
     566 sum_lag2_func uv_422, 422, right, 9
     567 sum_lag2_func uv_420, 420, left
     568 sum_lag2_func uv_420, 420, mid
     569 sum_lag2_func uv_420, 420, right, 9
    570 
    571 
    572 function sum_lag3_above_neon
    573        sub             x11, x0,  #3*GRAIN_WIDTH - 16
    574        sub             x12, x0,  #2*GRAIN_WIDTH - 16
    575        sub             x13, x0,  #1*GRAIN_WIDTH - 16
    576        ld1             {v15.16b}, [x11] // load top right
    577        ld1             {v18.16b}, [x12]
    578        ld1             {v21.16b}, [x13]
    579 
    580        ext             v8.16b,  v13.16b, v14.16b, #13 // top left, top mid
    581        dup             v22.16b, v29.b[0]
    582        ext             v9.16b,  v13.16b, v14.16b, #14
    583        dup             v23.16b, v29.b[1]
    584        ext             v10.16b, v13.16b, v14.16b, #15
    585        dup             v24.16b, v29.b[2]
    586        dup             v25.16b, v29.b[3]
    587        ext             v11.16b, v14.16b, v15.16b, #1  // top mid, top right
    588        dup             v26.16b, v29.b[4]
    589        ext             v12.16b, v14.16b, v15.16b, #2
    590        dup             v27.16b, v29.b[5]
    591        ext             v13.16b, v14.16b, v15.16b, #3
    592        dup             v28.16b, v29.b[6]
    593 
    594        smull           v0.8h,   v8.8b,   v22.8b
    595        smull2          v1.8h,   v8.16b,  v22.16b
    596        smull           v2.8h,   v9.8b,   v23.8b
    597        smull2          v3.8h,   v9.16b,  v23.16b
    598        smull           v8.8h,   v10.8b,  v24.8b
    599        smull2          v9.8h,   v10.16b, v24.16b
    600        smull           v10.8h,  v11.8b,  v26.8b
    601        smull2          v11.8h,  v11.16b, v26.16b
    602        saddl           v22.4s,  v0.4h,   v2.4h
    603        saddl2          v23.4s,  v0.8h,   v2.8h
    604        saddl           v24.4s,  v1.4h,   v3.4h
    605        saddl2          v26.4s,  v1.8h,   v3.8h
    606        saddl           v0.4s,   v8.4h,   v10.4h
    607        saddl2          v1.4s,   v8.8h,   v10.8h
    608        saddl           v2.4s,   v9.4h,   v11.4h
    609        saddl2          v3.4s,   v9.8h,   v11.8h
    610        smull           v8.8h,   v12.8b,  v27.8b
    611        smull2          v9.8h,   v12.16b, v27.16b
    612        smull           v10.8h,  v13.8b,  v28.8b
    613        smull2          v11.8h,  v13.16b, v28.16b
    614        smull           v12.8h,  v14.8b,  v25.8b
    615        smull2          v13.8h,  v14.16b, v25.16b
    616        add             v4.4s,   v22.4s,  v0.4s
    617        add             v5.4s,   v23.4s,  v1.4s
    618        add             v6.4s,   v24.4s,  v2.4s
    619        add             v7.4s,   v26.4s,  v3.4s
    620        saddl           v0.4s,   v8.4h,   v10.4h
    621        saddl2          v1.4s,   v8.8h,   v10.8h
    622        saddl           v2.4s,   v9.4h,   v11.4h
    623        saddl2          v3.4s,   v9.8h,   v11.8h
    624        add             v4.4s,   v4.4s,   v0.4s
    625        add             v5.4s,   v5.4s,   v1.4s
    626        add             v6.4s,   v6.4s,   v2.4s
    627        add             v7.4s,   v7.4s,   v3.4s
    628        saddw           v4.4s,   v4.4s,   v12.4h
    629        saddw2          v5.4s,   v5.4s,   v12.8h
    630        saddw           v6.4s,   v6.4s,   v13.4h
    631        saddw2          v7.4s,   v7.4s,   v13.8h
    632 
    633        ext             v8.16b,  v16.16b, v17.16b, #13 // top left, top mid
    634        dup             v22.16b, v29.b[7]
    635        ext             v9.16b,  v16.16b, v17.16b, #14
    636        dup             v23.16b, v29.b[8]
    637        ext             v10.16b, v16.16b, v17.16b, #15
    638        dup             v24.16b, v29.b[9]
    639        dup             v25.16b, v29.b[10]
    640        ext             v11.16b, v17.16b, v18.16b, #1  // top mid, top right
    641        dup             v26.16b, v29.b[11]
    642        ext             v12.16b, v17.16b, v18.16b, #2
    643        dup             v27.16b, v29.b[12]
    644        ext             v13.16b, v17.16b, v18.16b, #3
    645        dup             v28.16b, v29.b[13]
    646 
    647        smull           v0.8h,   v8.8b,   v22.8b
    648        smull2          v1.8h,   v8.16b,  v22.16b
    649        smull           v2.8h,   v9.8b,   v23.8b
    650        smull2          v3.8h,   v9.16b,  v23.16b
    651        smull           v8.8h,   v10.8b,  v24.8b
    652        smull2          v9.8h,   v10.16b, v24.16b
    653        smull           v10.8h,  v11.8b,  v26.8b
    654        smull2          v11.8h,  v11.16b, v26.16b
    655        saddl           v22.4s,  v0.4h,   v2.4h
    656        saddl2          v23.4s,  v0.8h,   v2.8h
    657        saddl           v24.4s,  v1.4h,   v3.4h
    658        saddl2          v26.4s,  v1.8h,   v3.8h
    659        saddl           v0.4s,   v8.4h,   v10.4h
    660        saddl2          v1.4s,   v8.8h,   v10.8h
    661        saddl           v2.4s,   v9.4h,   v11.4h
    662        saddl2          v3.4s,   v9.8h,   v11.8h
    663        smull           v8.8h,   v12.8b,  v27.8b
    664        smull2          v9.8h,   v12.16b, v27.16b
    665        smull           v10.8h,  v13.8b,  v28.8b
    666        smull2          v11.8h,  v13.16b, v28.16b
    667        smull           v12.8h,  v17.8b,  v25.8b
    668        smull2          v13.8h,  v17.16b, v25.16b
    669        add             v22.4s,  v22.4s,  v0.4s
    670        add             v23.4s,  v23.4s,  v1.4s
    671        add             v24.4s,  v24.4s,  v2.4s
    672        add             v26.4s,  v26.4s,  v3.4s
    673        saddl           v0.4s,   v8.4h,   v10.4h
    674        saddl2          v1.4s,   v8.8h,   v10.8h
    675        saddl           v2.4s,   v9.4h,   v11.4h
    676        saddl2          v3.4s,   v9.8h,   v11.8h
    677        add             v4.4s,   v4.4s,   v22.4s
    678        add             v5.4s,   v5.4s,   v23.4s
    679        add             v6.4s,   v6.4s,   v24.4s
    680        add             v7.4s,   v7.4s,   v26.4s
    681        add             v4.4s,   v4.4s,   v0.4s
    682        add             v5.4s,   v5.4s,   v1.4s
    683        add             v6.4s,   v6.4s,   v2.4s
    684        add             v7.4s,   v7.4s,   v3.4s
    685        saddw           v4.4s,   v4.4s,   v12.4h
    686        saddw2          v5.4s,   v5.4s,   v12.8h
    687        saddw           v6.4s,   v6.4s,   v13.4h
    688        saddw2          v7.4s,   v7.4s,   v13.8h
    689 
    690        ext             v8.16b,  v19.16b, v20.16b, #13 // top left, top mid
    691        dup             v22.16b, v29.b[14]
    692        ext             v9.16b,  v19.16b, v20.16b, #14
    693        dup             v23.16b, v29.b[15]
    694        ext             v10.16b, v19.16b, v20.16b, #15
    695        dup             v24.16b, v30.b[0]
    696        dup             v25.16b, v30.b[1]
    697        ext             v11.16b, v20.16b, v21.16b, #1  // top mid, top right
    698        dup             v26.16b, v30.b[2]
    699        ext             v12.16b, v20.16b, v21.16b, #2
    700        dup             v27.16b, v30.b[3]
    701        ext             v13.16b, v20.16b, v21.16b, #3
    702        dup             v28.16b, v30.b[4]
    703 
    704        smull           v0.8h,   v8.8b,   v22.8b
    705        smull2          v1.8h,   v8.16b,  v22.16b
    706        smull           v2.8h,   v9.8b,   v23.8b
    707        smull2          v3.8h,   v9.16b,  v23.16b
    708        smull           v8.8h,   v10.8b,  v24.8b
    709        smull2          v9.8h,   v10.16b, v24.16b
    710        smull           v10.8h,  v11.8b,  v26.8b
    711        smull2          v11.8h,  v11.16b, v26.16b
    712        saddl           v22.4s,  v0.4h,   v2.4h
    713        saddl2          v23.4s,  v0.8h,   v2.8h
    714        saddl           v24.4s,  v1.4h,   v3.4h
    715        saddl2          v26.4s,  v1.8h,   v3.8h
    716        saddl           v0.4s,   v8.4h,   v10.4h
    717        saddl2          v1.4s,   v8.8h,   v10.8h
    718        saddl           v2.4s,   v9.4h,   v11.4h
    719        saddl2          v3.4s,   v9.8h,   v11.8h
    720        smull           v8.8h,   v12.8b,  v27.8b
    721        smull2          v9.8h,   v12.16b, v27.16b
    722        smull           v10.8h,  v13.8b,  v28.8b
    723        smull2          v11.8h,  v13.16b, v28.16b
    724        smull           v12.8h,  v20.8b,  v25.8b
    725        smull2          v19.8h,  v20.16b, v25.16b
    726        add             v22.4s,  v22.4s,  v0.4s
    727        add             v23.4s,  v23.4s,  v1.4s
    728        add             v24.4s,  v24.4s,  v2.4s
    729        add             v26.4s,  v26.4s,  v3.4s
    730        saddl           v0.4s,   v8.4h,   v10.4h
    731        saddl2          v1.4s,   v8.8h,   v10.8h
    732        saddl           v2.4s,   v9.4h,   v11.4h
    733        saddl2          v3.4s,   v9.8h,   v11.8h
    734        add             v4.4s,   v4.4s,   v22.4s
    735        add             v5.4s,   v5.4s,   v23.4s
    736        add             v6.4s,   v6.4s,   v24.4s
    737        add             v7.4s,   v7.4s,   v26.4s
    738        mov             v13.16b, v14.16b
    739        mov             v14.16b, v15.16b
    740        add             v4.4s,   v4.4s,   v0.4s
    741        add             v5.4s,   v5.4s,   v1.4s
    742        add             v6.4s,   v6.4s,   v2.4s
    743        add             v7.4s,   v7.4s,   v3.4s
    744        mov             v16.16b, v17.16b
    745        mov             v17.16b, v18.16b
    746        saddw           v4.4s,   v4.4s,   v12.4h
    747        saddw2          v5.4s,   v5.4s,   v12.8h
    748        saddw           v6.4s,   v6.4s,   v19.4h
    749        saddw2          v7.4s,   v7.4s,   v19.8h
    750 
    751        mov             v19.16b, v20.16b
    752        mov             v20.16b, v21.16b
    753        ret
    754 endfunc
    755 
// Instantiate one lag==3 AR-filter step function per plane type and
// horizontal position (left/mid/right edge of a grain row).
// \elems limits how many output elements are produced (default 16).
// x30 is spilled here because sum_lag_n_body (defined earlier in the
// file) presumably issues bl/ret internally — note there is no ret
// before endfunc, so sum_lag_n_body is assumed to end with its own
// return path (store=1 variant).
.macro sum_lag3_func type, uv_layout, edge, elems=16
function sum_\type\()_lag3_\edge\()_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
.ifc \edge, left
        // At the left edge, reload the 16 columns right above the current
        // position from the three previous grain rows (x0 points into the
        // current row; GRAIN_WIDTH is the grain_lut row stride).
        sub             x11, x0,  #3*GRAIN_WIDTH
        sub             x12, x0,  #2*GRAIN_WIDTH
        sub             x13, x0,  #1*GRAIN_WIDTH
        ld1             {v14.16b}, [x11] // load the previous block right above
        ld1             {v17.16b}, [x12]
        ld1             {v20.16b}, [x13]
.endif
        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, store=1, uv_coeff=v30.b[8]
endfunc
.endm
    771 
// Instantiate the lag3 helpers for luma and each chroma layout.
// The right-edge variants produce fewer elements (15, or 9 for the
// subsampled 422/420 layouts).
sum_lag3_func y,      0,   left
sum_lag3_func y,      0,   mid
sum_lag3_func y,      0,   right, 15
sum_lag3_func uv_444, 444, left
sum_lag3_func uv_444, 444, mid
sum_lag3_func uv_444, 444, right, 15
sum_lag3_func uv_422, 422, left
sum_lag3_func uv_422, 422, mid
sum_lag3_func uv_422, 422, right, 9
sum_lag3_func uv_420, 420, left
sum_lag3_func uv_420, 420, mid
sum_lag3_func uv_420, 420, right, 9
    784 
// Generate and store w1 full-width grain rows (v16-v21 per row).
// In:  w1 = row count; other state (seed etc.) per the caller's setup.
// x30 is spilled because get_grain_row presumably uses bl internally.
function generate_grain_rows_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
1:
        get_grain_row   v16, v17, v18, v19, v20, v21 // one grain row into v16-v21
        subs            w1,  w1,  #1
        store_grain_row v16, v17, v18, v19, v20, v21
        b.gt            1b
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc
    797 
// Same as generate_grain_rows_neon but for the narrower 44-wide
// (subsampled chroma) grain rows, held in v16-v18.
function generate_grain_rows_44_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
1:
        get_grain_row_44 v16, v17, v18  // one 44-wide grain row into v16-v18
        subs            w1,  w1,  #1
        store_grain_row_44 v16, v17, v18
        b.gt            1b
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc
    810 
// Produce a single full-width grain row in v16-v21 without storing it.
// x30 spill needed since the get_grain_row macro presumably branches
// with bl internally — confirm against the macro definition.
function get_grain_row_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        get_grain_row   v16, v17, v18, v19, v20, v21
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc
    819 
// Produce a single 44-wide grain row in v16-v18 without storing it.
function get_grain_row_44_neon
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        get_grain_row_44 v16, v17, v18
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc
    828 
// Lag0 chroma AR step for one 16-byte chunk:
//   v2 = sat8( ((v0 * v27) + rounding >> ar_coeff_shift) + v1 )
// In:  v0  = (masked) luma grain bytes
//      v1  = chroma grain bytes generated so far
//      v27 = ar_coeffs_uv[0] splatted to all lanes
//      v28 = -ar_coeff_shift (negative, for srshl = rounding right shift)
// Out: v2  = combined, saturated chroma grain
// The 420/422 variants below fall through into add_coeff_lag0_start.
function add_uv_444_coeff_lag0_neon
add_coeff_lag0_start:
        smull           v2.8h,   v0.8b,   v27.8b
        smull2          v3.8h,   v0.16b,  v27.16b
        srshl           v2.8h,   v2.8h,   v28.8h   // rounding >> ar_coeff_shift
        srshl           v3.8h,   v3.8h,   v28.8h
        saddw           v2.8h,   v2.8h,   v1.8b    // + existing chroma grain
        saddw2          v3.8h,   v3.8h,   v1.16b
        sqxtn           v2.8b,   v2.8h             // saturate back to 8 bit
        sqxtn2          v2.16b,  v3.8h
        ret
endfunc
    841 
// 420 variant: average a 2x2 luma box (two rows via x19 and x12, then
// horizontal pairwise add and rounding >>2), mask the edges with v0,
// then run the shared lag0 tail above.
function add_uv_420_coeff_lag0_neon
        ld1             {v4.16b, v5.16b}, [x19], #32 // luma row 0
        ld1             {v6.16b, v7.16b}, [x12], #32 // luma row 1
        saddlp          v4.8h,   v4.16b              // horizontal pair sums
        saddlp          v5.8h,   v5.16b
        saddlp          v6.8h,   v6.16b
        saddlp          v7.8h,   v7.16b
        add             v4.8h,   v4.8h,   v6.8h      // + vertical neighbor
        add             v5.8h,   v5.8h,   v7.8h
        rshrn           v4.8b,   v4.8h,   #2         // rounded average of 4
        rshrn2          v4.16b,  v5.8h,   #2
        and             v0.16b,  v4.16b,  v0.16b     // apply edge mask
        b               add_coeff_lag0_start
endfunc
    856 
// 422 variant: average horizontal luma pairs only (rounded >>1), mask
// the edges with v0, then run the shared lag0 tail above.
function add_uv_422_coeff_lag0_neon
        ld1             {v4.16b, v5.16b}, [x19], #32
        saddlp          v4.8h,   v4.16b              // horizontal pair sums
        saddlp          v5.8h,   v5.16b
        rshrn           v4.8b,   v4.8h,   #1         // rounded average of 2
        rshrn2          v4.16b,  v5.8h,   #1
        and             v0.16b,  v4.16b,  v0.16b     // apply edge mask
        b               add_coeff_lag0_start
endfunc
    866 
// Generate a full 82-wide grain_lut plane (luma, or 444 chroma).
// Instantiated below as generate_grain_y_8bpc_neon and
// generate_grain_uv_444_8bpc_neon.
// Register roles set up here (consumed by the get_grain_*/sum_* macros):
//   x0  = output grain_lut pointer (advanced by the store macros)
//   w2  = PRNG seed, x3 = gaussian_sequence table
//   x4  = ar_coeffs pointer, w7 = ar_coeff_shift
//   w8  = 1 << (ar_coeff_shift - 1)              (rounding bias)
//   w10 = 1 << (4 + grain_scale_shift - 1)       (rounding bias)
//   w5/w6 = 127/-128 clamp bounds, v31 = -(4 + grain_scale_shift)
//   x19 (uv_444 only) = pointer into the luma grain plane
.macro gen_grain_82 type
function generate_grain_\type\()_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!  // 96-byte frame; lag3 also uses sp+16..sp+88

.ifc \type, uv_444
        // Args for uv: x1 = luma grain, x2 = data, w3 = uv plane index
        // (presumably — confirm against the C prototype).
        mov             w13, w3
        mov             w14, #28
        add             x19, x1,  #3*GRAIN_WIDTH
        mov             x1,  x2
        mul             w13, w13, w14       // byte offset of this plane's ar_coeffs_uv
.endif
        movrel          x3,  X(gaussian_sequence)
        ldr             w2,  [x1, #FGD_SEED]
        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
.ifc \type, y
        add             x4,  x1,  #FGD_AR_COEFFS_Y
.else
        add             x4,  x1,  #FGD_AR_COEFFS_UV
.endif
        movrel          x16, gen_grain_\type\()_tbl
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9,  w9,  #4
        ldrsw           x17, [x16, w17, uxtw #2] // jumptable entry for this lag
        dup             v31.8h,  w9    // 4 + data->grain_scale_shift
        add             x16, x16, x17
        neg             v31.8h,  v31.8h

.ifc \type, uv_444
        // Per-plane seed perturbation; constant chosen by plane index.
        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne
.endif

        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
        mov             w8,  #1
        mov             w10, #1
        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
        mov             w5,  #127           // clamp max
        mov             w6,  #-128          // clamp min

.ifc \type, uv_444
        eor             w2,  w2,  w11       // seed ^= plane constant
.endif

        br              x16                 // dispatch on ar_coeff_lag

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
.ifc \type, y
        // Luma lag0: every row is independent random grain.
        mov             w1,  #GRAIN_HEIGHT
        bl              generate_grain_rows_neon
.else
        // Chroma lag0: grain = random + scaled collocated luma grain.
        dup             v28.8h,  w7
        ld1r            {v27.16b}, [x4]     // ar_coeffs_uv[0]
        movi            v0.16b,  #0
        movi            v1.16b,  #255
        // Edge masks: v29 zeroes the first 3 bytes of the first luma
        // chunk, v30 zeroes the last byte of the final chunk.
        ext             v29.16b, v0.16b,  v1.16b,  #13
        ext             v30.16b, v1.16b,  v0.16b,  #1
        neg             v28.8h,  v28.8h     // -ar_coeff_shift for srshl

        mov             w1,  #3
        bl              generate_grain_rows_neon
        mov             w1,  #GRAIN_HEIGHT-3
1:
        ld1             {v22.16b, v23.16b, v24.16b, v25.16b}, [x19], #64 // luma row
        bl              get_grain_row_neon
        and             v0.16b,  v22.16b, v29.16b // mask left edge
        mov             v1.16b,  v16.16b
        bl              add_uv_444_coeff_lag0_neon
        mov             v0.16b,  v23.16b
        mov             v1.16b,  v17.16b
        mov             v16.16b, v2.16b
        bl              add_uv_444_coeff_lag0_neon
        ld1             {v26.16b}, [x19], #16
        mov             v0.16b,  v24.16b
        mov             v1.16b,  v18.16b
        mov             v17.16b, v2.16b
        bl              add_uv_444_coeff_lag0_neon
        add             x19, x19, #2        // skip luma row padding
        mov             v0.16b,  v25.16b
        mov             v1.16b,  v19.16b
        mov             v18.16b, v2.16b
        bl              add_uv_444_coeff_lag0_neon
        and             v0.16b,  v26.16b, v30.16b // mask right edge
        mov             v1.16b,  v20.16b
        mov             v19.16b, v2.16b
        bl              add_uv_444_coeff_lag0_neon
        mov             v20.16b, v2.16b
        subs            w1,  w1,  #1
        store_grain_row v16, v17, v18, v19, v20, v21
        b.gt            1b
.endif
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag1):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v27.16b}, [x4], #1 // ar_coeffs_y[0]
        ld1r            {v28.16b}, [x4], #1 // ar_coeffs_y[1]
        ld1r            {v29.16b}, [x4]     // ar_coeffs_y[2]
.ifc \type, y
        ldrsb           w4,  [x4, #1]       // ar_coeffs_y[3]
.else
        add             x4,  x4,  #2
.endif

        mov             w1,  #3
.ifc \type, uv_444
        ld1r            {v30.16b}, [x4]     // ar_coeffs_uv[4]
        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
.endif
        bl              generate_grain_rows_neon

        mov             w1,  #GRAIN_HEIGHT - 3
1:
        // Filter one row in five 16-wide steps; v16-v21 hold the
        // previous row, the results become the new previous row.
        sum_\type\()_lag1 v22, v16, v16, v17, left
        sum_\type\()_lag1 v23, v16, v17, v18
        sum_\type\()_lag1 v24, v17, v18, v19
        sum_\type\()_lag1 v25, v18, v19, v20
        sum_\type\()_lag1 v20, v19, v20, v21, right
        get_grain_2     v21
        subs            w1,  w1,  #1
.ifc \type, uv_444
        add             x19, x19, #2        // skip luma row padding
.endif
        store_grain_row v22, v23, v24, v25, v20, v21
        mov             v16.16b, v22.16b
        mov             v17.16b, v23.16b
        mov             v18.16b, v24.16b
        mov             v19.16b, v25.16b
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag2):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v30.16b}, [x4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]

        smov            w4,  v30.b[10]      // scalar coeffs for the sum_* helpers
        smov            w17, v30.b[11]

        mov             w1,  #3
        bl              generate_grain_rows_neon

        mov             w1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag2_left_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_right_neon
        get_grain_2     v16
        subs            w1,  w1,  #1
.ifc \type, uv_444
        add             x19, x19, #2        // skip luma row padding
.endif
        st1             {v16.h}[0], [x0], #2 // final 2 columns of the row
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
        // lag3 uses v8-v15 and x20/x21: save the callee-saved halves.
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        smov            w4,  v30.b[5]       // scalar coeffs for the sum_* helpers
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1,  #3
        bl              generate_grain_rows_neon

        mov             w1,  #GRAIN_HEIGHT - 3
1:
        bl              sum_\type\()_lag3_left_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_right_neon
        get_grain_2     v16
        subs            w1,  w1,  #1
.ifc \type, uv_444
        add             x19, x19, #2        // skip luma row padding
.endif
        st1             {v16.h}[0], [x0], #2 // final 2 columns of the row
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

jumptable gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
endjumptable
.endm
   1087 
// Emit the full-width generators: luma and non-subsampled chroma.
gen_grain_82 y
gen_grain_82 uv_444
   1090 
// Load the per-layout row-loop count: 420 chroma planes are vertically
// subsampled (SUB_GRAIN_HEIGHT), 422 uses the full grain height.
.macro set_height dst, type
.ifc \type, uv_420
        mov             \dst,  #SUB_GRAIN_HEIGHT-3
.else
        mov             \dst,  #GRAIN_HEIGHT-3
.endif
.endm
   1098 
// Advance the luma grain pointer to the next source row after a loop
// iteration consumed 3*32 bytes: 420 skips two luma rows (vertical
// subsampling), 422 advances by one row.
.macro increment_y_ptr reg, type
.ifc \type, uv_420
        add             \reg, \reg, #2*GRAIN_WIDTH-(3*32)
.else
        sub             \reg, \reg, #3*32-GRAIN_WIDTH
.endif
.endm
   1106 
// Generate a 44-wide (horizontally subsampled) chroma grain_lut plane.
// Instantiated below as generate_grain_uv_420_8bpc_neon and
// generate_grain_uv_422_8bpc_neon. Register setup mirrors gen_grain_82:
// w2 = seed, x3 = gaussian_sequence, x4 = ar_coeffs_uv, w7 = shift,
// w8/w10 = rounding biases, w5/w6 = clamp, x19 = luma grain pointer.
.macro gen_grain_44 type
function generate_grain_\type\()_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        stp             x30, x19, [sp, #-96]!  // 96-byte frame; lag3 uses sp+16..sp+88

        // x1 = luma grain, x2 = data, w3 = uv plane index (presumably —
        // confirm against the C prototype).
        mov             w13, w3
        mov             w14, #28
        add             x19, x1,  #3*GRAIN_WIDTH-3
        mov             x1,  x2
        mul             w13, w13, w14       // byte offset of this plane's ar_coeffs_uv

        movrel          x3,  X(gaussian_sequence)
        ldr             w2,  [x1, #FGD_SEED]
        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
        add             x4,  x1,  #FGD_AR_COEFFS_UV
        movrel          x16, gen_grain_\type\()_tbl
        ldr             w17, [x1, #FGD_AR_COEFF_LAG]
        add             w9,  w9,  #4
        ldrsw           x17, [x16, w17, uxtw #2] // jumptable entry for this lag
        dup             v31.8h,  w9    // 4 + data->grain_scale_shift
        add             x16, x16, x17
        neg             v31.8h,  v31.8h

        // Per-plane seed perturbation; constant chosen by plane index.
        cmp             w13, #0
        mov             w11, #0x49d8
        mov             w14, #0xb524
        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
        csel            w11, w11, w14, ne

        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
        mov             w8,  #1
        mov             w10, #1
        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
        mov             w5,  #127           // clamp max
        mov             w6,  #-128          // clamp min

        eor             w2,  w2,  w11       // seed ^= plane constant

        br              x16                 // dispatch on ar_coeff_lag

L(generate_grain_\type\()_lag0):
        AARCH64_VALID_JUMP_TARGET
        // Chroma lag0: grain = random + scaled (box-averaged) luma grain.
        dup             v28.8h,  w7
        ld1r            {v27.16b}, [x4]     // ar_coeffs_uv[0]
        movi            v0.16b,  #0
        movi            v1.16b,  #255
        // Edge masks: v29 zeroes the first 3 bytes of the first chunk,
        // v30 keeps only the first 9 bytes of the last chunk.
        ext             v29.16b, v0.16b,  v1.16b,  #13
        ext             v30.16b, v1.16b,  v0.16b,  #7
        neg             v28.8h,  v28.8h     // -ar_coeff_shift for srshl

        mov             w1,  #3
        bl              generate_grain_rows_44_neon
        set_height      w1,  \type
1:
        bl              get_grain_row_44_neon
.ifc \type, uv_420
        add             x12, x19, #GRAIN_WIDTH // second luma row for 2x2 averaging
.endif
        mov             v0.16b,  v29.16b    // left-edge mask
        mov             v1.16b,  v16.16b
        bl              add_\type\()_coeff_lag0_neon
        movi            v0.16b,  #255       // middle chunk: no masking
        mov             v1.16b,  v17.16b
        mov             v16.16b, v2.16b
        bl              add_\type\()_coeff_lag0_neon
        mov             v0.16b,  v30.16b    // right-edge mask
        mov             v1.16b,  v18.16b
        mov             v17.16b, v2.16b
        bl              add_\type\()_coeff_lag0_neon
        mov             v18.16b, v2.16b
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        store_grain_row_44 v16, v17, v18
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag1):
        AARCH64_VALID_JUMP_TARGET
        ld1r            {v27.16b}, [x4], #1 // ar_coeffs_uv[0]
        ld1r            {v28.16b}, [x4], #1 // ar_coeffs_uv[1]
        ld1r            {v29.16b}, [x4]     // ar_coeffs_uv[2]
        add             x4,  x4,  #2

        mov             w1,  #3
        ld1r            {v30.16b}, [x4]     // ar_coeffs_uv[4]
        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
        bl              generate_grain_rows_44_neon

        set_height      w1,  \type
1:
        // Filter one 44-wide row in three 16-wide steps.
        sum_\type\()_lag1 v20, v16, v16, v17, left
        sum_\type\()_lag1 v21, v16, v17, v18
        sum_\type\()_lag1 v18, v17, v18, v18, right
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        store_grain_row_44 v20, v21, v18
        mov             v16.16b, v20.16b
        mov             v17.16b, v21.16b
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag2):
        AARCH64_VALID_JUMP_TARGET
        ld1             {v30.16b}, [x4]     // ar_coeffs_uv[0-12]

        smov            w4,  v30.b[10]      // scalar coeffs for the sum_* helpers
        smov            w17, v30.b[11]

        mov             w1,  #3
        bl              generate_grain_rows_44_neon

        set_height      w1,  \type
1:
        bl              sum_\type\()_lag2_left_neon
        bl              sum_\type\()_lag2_mid_neon
        bl              sum_\type\()_lag2_right_neon
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH-48 // step output to next row
        b.gt            1b

        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret

L(generate_grain_\type\()_lag3):
        AARCH64_VALID_JUMP_TARGET
        ldr             q29,      [x4]      // ar_coeffs_uv[0-15]
        ldr             q30,      [x4, #16] // ar_coeffs_uv[16-24]
        // lag3 uses v8-v15 and x20/x21: save the callee-saved halves.
        stp             d8,  d9,  [sp, #16]
        stp             d10, d11, [sp, #32]
        stp             d12, d13, [sp, #48]
        stp             d14, d15, [sp, #64]
        stp             x20, x21, [sp, #80]

        smov            w4,  v30.b[5]       // scalar coeffs for the sum_* helpers
        smov            w20, v30.b[6]
        smov            w21, v30.b[7]

        mov             w1,  #3
        bl              generate_grain_rows_44_neon

        set_height      w1,  \type
1:
        bl              sum_\type\()_lag3_left_neon
        bl              sum_\type\()_lag3_mid_neon
        bl              sum_\type\()_lag3_right_neon
        subs            w1,  w1,  #1
        increment_y_ptr x19, \type
        add             x0,  x0,  #GRAIN_WIDTH-48 // step output to next row
        b.gt            1b

        ldp             x20, x21, [sp, #80]
        ldp             d14, d15, [sp, #64]
        ldp             d12, d13, [sp, #48]
        ldp             d10, d11, [sp, #32]
        ldp             d8,  d9,  [sp, #16]
        ldp             x30, x19, [sp], #96
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc

jumptable gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
endjumptable
.endm
   1285 
// Emit the subsampled-chroma generators.
gen_grain_44 uv_420
gen_grain_44 uv_422
   1288 
// Table lookup of 8+8 bytes: for each even lane i of \src1/\src2, load
// scaling[src[i]] (x3 = scaling table base) into the matching lane of
// \dst1/\dst2. Address computation (umov+add) is interleaved with the
// single-lane ld1 loads to hide load latency.
.macro gather_interleaved dst1, dst2, src1, src2, off
        umov            w14, \src1[0+\off]
        umov            w15, \src2[8+\off]
        umov            w16, \src1[2+\off]
        add             x14, x14, x3
        umov            w17, \src2[10+\off]
        add             x15, x15, x3
        ld1             {\dst1}[0+\off],  [x14]
        umov            w14, \src1[4+\off]
        add             x16, x16, x3
        ld1             {\dst2}[8+\off],  [x15]
        umov            w15, \src2[12+\off]
        add             x17, x17, x3
        ld1             {\dst1}[2+\off],  [x16]
        umov            w16, \src1[6+\off]
        add             x14, x14, x3
        ld1             {\dst2}[10+\off], [x17]
        umov            w17, \src2[14+\off]
        add             x15, x15, x3
        ld1             {\dst1}[4+\off],  [x14]
        add             x16, x16, x3
        ld1             {\dst2}[12+\off], [x15]
        add             x17, x17, x3
        ld1             {\dst1}[6+\off],  [x16]
        ld1             {\dst2}[14+\off], [x17]
.endm
   1315 
// Full 32-lane scaling lookup: four interleaved passes cover all even
// and odd lanes of both source vectors.
.macro gather dst1, dst2, src1, src2
        gather_interleaved \dst1, \dst2, \src1, \src2, 0
        gather_interleaved \dst2, \dst1, \src2, \src1, 0
        gather_interleaved \dst1, \dst2, \src1, \src2, 1
        gather_interleaved \dst2, \dst1, \src2, \src1, 1
.endm
   1322 
// v4/v5 = scaling[v0]/scaling[v1] (32 bytes), with x3 = scaling table.
function gather32_neon
        gather          v4.b, v5.b, v0.b, v1.b
        ret
endfunc
   1327 
// v4 = scaling[v0] (16 bytes): the two 8-lane halves land in v4.d[0]
// and v5.d[1], so merge the high half into v4.
function gather16_neon
        gather_interleaved v4.b, v5.b, v0.b, v0.b, 0
        gather_interleaved v4.b, v5.b, v0.b, v0.b, 1
        ins             v4.d[1], v5.d[1]
        ret
endfunc
   1334 
// Block-overlap blend weights (old-grain row, then new-grain row);
// applied via smull/smlal and rounded with a #5 shift in the fg loops.
const overlap_coeffs_0, align=4
        .byte 27, 17, 0,  0,  0,  0,  0,  0
        .byte 17, 27, 32, 32, 32, 32, 32, 32
endconst

// Single-column variant for subsampled (chroma) overlap.
const overlap_coeffs_1, align=4
        .byte 23, 0,  0,  0,  0,  0,  0,  0
        .byte 22, 32, 32, 32, 32, 32, 32, 32
endconst
   1344 
// Split a random value into grain_lut x/y offsets; when the axis is not
// subsampled (\sx/\sy == 0) the offset is doubled.
.macro calc_offset offx, offy, src, sx, sy
        and             \offy, \src,  #0xF     // randval & 0xF
        lsr             \offx, \src,  #4       // randval >> 4
.if \sy == 0
        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
.endif
.if \sx == 0
        add             \offx, \offx, \offx    // 2 * (randval >> 4)
.endif
.endm
   1355 
// \dst = \src + \stride * \offy + \offx  (pointer into grain_lut)
.macro add_offset dst, offx, offy, src, stride
        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
        add             \dst, \dst, \offx, uxtw // grain_lut += offx
.endm
   1360 
   1361 // void dav1d_fgy_32x32_8bpc_neon(pixel *const dst, const pixel *const src,
   1362 //                                const ptrdiff_t stride,
   1363 //                                const uint8_t scaling[SCALING_SIZE],
   1364 //                                const int scaling_shift,
   1365 //                                const entry grain_lut[][GRAIN_WIDTH],
   1366 //                                const int offsets[][2],
   1367 //                                const int h, const ptrdiff_t clip,
   1368 //                                const ptrdiff_t type);
// Entry point for luma film-grain application (prototype in the comment
// above). Sets up clamp range, scaling shift, overlap coefficients and
// the four grain_lut pointers, then dispatches into fgy_loop_neon via
// the type-indexed jumptable.
function fgy_32x32_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30, [sp, #-16]!
        ldr             w11, [x6, #8]          // offsets[1][0]
        ldr             w13, [x6, #4]          // offsets[0][1]
        ldr             w15, [x6, #12]         // offsets[1][1]
        ldr             w6,  [x6]              // offsets[0][0]
        ldr             w8,  [sp, #16]         // clip
        mov             x9,  #GRAIN_WIDTH      // grain_lut stride

        neg             w4,  w4
        dup             v29.8h,  w4            // -scaling_shift (for srshl in the loop)

        movrel          x16, overlap_coeffs_0

        cbz             w8,  1f
        // clip: limited-range output bounds
        movi            v30.16b, #16
        movi            v31.16b, #235
        b               2f
1:
        // no clip: full-range output bounds
        movi            v30.16b, #0
        movi            v31.16b, #255
2:

        ld1             {v27.8b, v28.8b}, [x16] // overlap_coeffs

        add             x5,  x5,  #9           // grain_lut += 9
        add             x5,  x5,  x9,  lsl #3  // grain_lut += 8 * grain_stride
        add             x5,  x5,  x9           // grain_lut += grain_stride

        // Turn the four random offsets into grain_lut pointers:
        // x5 = current, x4 = left (old x), x6 = top, x8 = top-left.
        calc_offset     w11, w12, w11, 0,  0
        calc_offset     w13, w14, w13, 0,  0
        calc_offset     w15, w16, w15, 0,  0
        calc_offset     w6,  w10, w6,  0,  0

        add_offset      x12, w11, x12, x5,  x9
        add_offset      x14, w13, x14, x5,  x9
        add_offset      x16, w15, x16, x5,  x9
        add_offset      x5,  w6,  x10, x5,  x9

        ldr             w11, [sp, #24]         // type
        movrel          x13, fgy_loop_tbl

        add             x4,  x12, #32          // grain_lut += FG_BLOCK_SIZE * bx
        add             x6,  x14, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by

        tst             w11, #1                // type bit 0 = y overlap
        ldrsw           x11, [x13, w11, uxtw #2]

        add             x8,  x16, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             x8,  x8,  #32          // grain_lut += FG_BLOCK_SIZE * bx

        add             x11, x13, x11

        b.eq            1f
        // y overlap: splat the two blend weights, process 2 overlap rows
        // first (w10 keeps the real height for the remainder).
        dup             v6.16b,  v27.b[0]
        dup             v7.16b,  v27.b[1]
        mov             w10, w7                // backup actual h
        mov             w7,  #2
1:
        br              x11
endfunc
   1434 
// Inner loop for luma (Y) grain application, 32 pixels per row.
// Four entry points L(loop_\ox\oy) are generated below, one per
// (x-overlap, y-overlap) combination, dispatched via fgy_loop_tbl.
//
// Register setup (established by the caller before branching here):
//   x0  = dst,  x1 = src,  x2 = stride
//   x4  = grain_lut old (left block),      x5 = grain_lut (current block)
//   x6  = grain_lut top block,             x8 = grain_lut top-left block
//   x9  = grain_lut stride,                w7 = rows to process
//   w10 = backup of full h (only when oy)
//   v6/v7   = vertical overlap coefficients (only when oy)
//   v27/v28 = horizontal overlap coefficients
//   v29     = -scaling_shift (for srshl)
//   v30/v31 = output clamp min/max
function fgy_loop_neon
.macro fgy ox, oy
L(loop_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v0.16b,  v1.16b},  [x1],  x2 // src
.if \ox
        ld1             {v20.8b},           [x4],  x9 // grain_lut old
.endif
.if \oy
        ld1             {v22.16b, v23.16b}, [x6],  x9 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v21.8b},           [x8],  x9 // grain_lut top old
.endif
        ld1             {v18.16b, v19.16b}, [x5],  x9 // grain_lut

        // Scaling LUT lookup for the 32 src pixels; results land in
        // v4/v5 (see the "scaling" uxtl below). Defined elsewhere in
        // this file.
        bl              gather32_neon

.if \ox
        // Horizontal overlap: blend old (left) and new grain columns.
        smull           v20.8h,  v20.8b,  v27.8b
        smlal           v20.8h,  v18.8b,  v28.8b
.endif

.if \oy
.if \ox
        // Same horizontal blend for the top row's grain.
        smull           v21.8h,  v21.8b,  v27.8b
        smlal           v21.8h,  v22.8b,  v28.8b
        sqrshrn         v20.8b,  v20.8h,  #5
        sqrshrn         v21.8b,  v21.8h,  #5
.endif

        // Vertical overlap: blend top-block grain (coeff v6) with the
        // current block's grain (coeff v7), then narrow with round2(x, 5).
.if \ox
        smull           v16.8h,  v20.8b,  v7.8b
.else
        smull           v16.8h,  v18.8b,  v7.8b
.endif
        smull2          v17.8h,  v18.16b, v7.16b
        smull           v18.8h,  v19.8b,  v7.8b
        smull2          v19.8h,  v19.16b, v7.16b
.if \ox
        smlal           v16.8h,  v21.8b,  v6.8b
.else
        smlal           v16.8h,  v22.8b,  v6.8b
.endif
        smlal2          v17.8h,  v22.16b, v6.16b
        smlal           v18.8h,  v23.8b,  v6.8b
        smlal2          v19.8h,  v23.16b, v6.16b
        sqrshrn         v22.8b,  v16.8h,  #5
        sqrshrn2        v22.16b, v17.8h,  #5
        sqrshrn         v23.8b,  v18.8h,  #5
        sqrshrn2        v23.16b, v19.8h,  #5
.endif

        // sxtl of grain
.if \oy
        sxtl            v16.8h,  v22.8b
        sxtl2           v17.8h,  v22.16b
        sxtl            v18.8h,  v23.8b
        sxtl2           v19.8h,  v23.16b
.elseif \ox
        // Narrow the horizontally-blended left edge here (no oy path
        // did it above), then widen all grain to 16 bit.
        sqrshrn         v20.8b,  v20.8h,  #5
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
        sxtl            v16.8h,  v20.8b
.else
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
.endif

        uxtl            v2.8h,   v4.8b   // scaling
        uxtl2           v3.8h,   v4.16b
        uxtl            v4.8h,   v5.8b
        uxtl2           v5.8h,   v5.16b

        mul             v16.8h,  v16.8h,  v2.8h   // scaling * grain
        mul             v17.8h,  v17.8h,  v3.8h
        mul             v18.8h,  v18.8h,  v4.8h
        mul             v19.8h,  v19.8h,  v5.8h

        srshl           v16.8h,  v16.8h,  v29.8h  // round2(scaling * grain, scaling_shift)
        srshl           v17.8h,  v17.8h,  v29.8h
        srshl           v18.8h,  v18.8h,  v29.8h
        srshl           v19.8h,  v19.8h,  v29.8h

        uaddw           v16.8h,  v16.8h,  v0.8b   // *src + noise
        uaddw2          v17.8h,  v17.8h,  v0.16b
        uaddw           v18.8h,  v18.8h,  v1.8b
        uaddw2          v19.8h,  v19.8h,  v1.16b

        // Saturating narrow back to u8, then clamp to the configured
        // pixel range (v30/v31 = full or video range, set by caller).
        sqxtun          v0.8b,   v16.8h
        sqxtun2         v0.16b,  v17.8h
        sqxtun          v1.8b,   v18.8h
        sqxtun2         v1.16b,  v19.8h

        umax            v0.16b,  v0.16b,  v30.16b
        umax            v1.16b,  v1.16b,  v30.16b
        umin            v0.16b,  v0.16b,  v31.16b
        umin            v1.16b,  v1.16b,  v31.16b

        subs            w7,  w7,  #1
.if \oy
        // Switch to the second row's vertical overlap coefficients
        // (taken from v28) for subsequent iterations.
        dup             v6.16b,  v28.b[0]
        dup             v7.16b,  v28.b[1]
.endif
        st1             {v0.16b,  v1.16b},  [x0], x2 // dst
        b.gt            1b

.if \oy
        // Overlap rows done; if any rows remain, continue in the
        // corresponding no-y-overlap loop.
        cmp             w10, #2
        sub             w7,  w10, #2           // restore actual remaining h
        b.gt            L(loop_\ox\()0)
.endif
        ldr             x30, [sp], #16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.endm

        fgy             0, 0
        fgy             0, 1
        fgy             1, 0
        fgy             1, 1
endfunc
   1561 
// Dispatch table for fgy_loop_neon. Entries are 32-bit offsets relative
// to the table base, loaded with ldrsw and added back to the table
// address by the caller. Ordered so that index bit 0 selects y overlap
// and bit 1 selects x overlap.
jumptable fgy_loop_tbl
        .word L(loop_00) - fgy_loop_tbl
        .word L(loop_01) - fgy_loop_tbl
        .word L(loop_10) - fgy_loop_tbl
        .word L(loop_11) - fgy_loop_tbl
endjumptable
   1568 
   1569 // void dav1d_fguv_32x32_420_8bpc_neon(pixel *const dst,
   1570 //                                     const pixel *const src,
   1571 //                                     const ptrdiff_t stride,
   1572 //                                     const uint8_t scaling[SCALING_SIZE],
   1573 //                                     const Dav1dFilmGrainData *const data,
   1574 //                                     const entry grain_lut[][GRAIN_WIDTH],
   1575 //                                     const pixel *const luma_row,
   1576 //                                     const ptrdiff_t luma_stride,
   1577 //                                     const int offsets[][2],
   1578 //                                     const ptrdiff_t h, const ptrdiff_t uv,
   1579 //                                     const ptrdiff_t is_id,
   1580 //                                     const ptrdiff_t type);
// Entry-point generator for the chroma (UV) film-grain functions.
//   \layout = 420/422/444 (function name suffix)
//   \sx     = 1 if chroma is subsampled horizontally, else 0
//   \sy     = 1 if chroma is subsampled vertically, else 0
// Sets up clamp limits, grain_lut pointers for the current/left/top/
// top-left blocks, overlap coefficients, and dispatches into the
// fguv_loop_sx\sx jump table. x30 and d8 are saved on the stack and
// restored by the shared epilogue in the loop functions.
.macro fguv layout, sx, sy
function fguv_32x32_\layout\()_8bpc_neon, export=1
        AARCH64_SIGN_LINK_REGISTER
        str             x30,      [sp, #-32]!
        str             d8,       [sp, #16]
        ldp             x8,  x9,  [sp, #32]    // offsets, h
        ldp             x10, x11, [sp, #48]    // uv, is_id

        ldr             w13, [x4, #FGD_SCALING_SHIFT]
        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
        neg             w13, w13               // -scaling_shift

        // !csfl
        // Load the per-plane luma->chroma mapping parameters; the
        // csfl=0 loop variants use these, csfl=1 variants ignore them.
        add             x10, x4,  x10, lsl #2  // + 4*uv
        add             x14, x10, #FGD_UV_LUMA_MULT
        add             x15, x10, #FGD_UV_MULT
        add             x10, x10, #FGD_UV_OFFSET
        ld1             {v8.h}[0], [x14]       // uv_luma_mult
        ld1r            {v24.8h},  [x10]       // uv_offset
        ld1             {v8.h}[1], [x15]       // uv_mult

        dup             v29.8h,  w13           // -scaling_shift

        // Output clamp: [16, 240] for restricted-range chroma,
        // [16, 235] when is_id, [0, 255] otherwise.
        cbz             w12, 1f
        // clip
        movi            v30.16b, #16
        movi            v31.16b, #240
        cbz             w11, 2f
        // is_id
        movi            v31.16b, #235
        b               2f
1:
        // no clip
        movi            v30.16b, #0
        movi            v31.16b, #255
2:

        ldr             w12, [x8, #8]          // offsets[1][0]
        ldr             w14, [x8, #4]          // offsets[0][1]
        ldr             w16, [x8, #12]         // offsets[1][1]
        ldr             w8,  [x8]              // offsets[0][0]

        mov             x10, #GRAIN_WIDTH      // grain_lut stride

        // Skip the grain_lut border; halved in each subsampled dimension.
        add             x5,  x5,  #(3 + (2 >> \sx)*3) // grain_lut += 9 or 6
.if \sy
        add             x5,  x5,  x10, lsl #2  // grain_lut += 4 * grain_stride
        add             x5,  x5,  x10, lsl #1  // grain_lut += 2 * grain_stride
.else
        add             x5,  x5,  x10, lsl #3  // grain_lut += 8 * grain_stride
        add             x5,  x5,  x10          // grain_lut += grain_stride
.endif

        // Turn the four seed offsets into grain_lut pointers:
        // x13 = left, x15 = top, x17 = top-left, x5 = current block.
        calc_offset     w12, w13, w12, \sx, \sy
        calc_offset     w14, w15, w14, \sx, \sy
        calc_offset     w16, w17, w16, \sx, \sy
        calc_offset     w8,  w11, w8,  \sx, \sy

        add_offset      x13, w12, x13, x5,  x10
        add_offset      x15, w14, x15, x5,  x10
        add_offset      x17, w16, x17, x5,  x10
        add_offset      x5,  w8,  x11, x5,  x10

        add             x4,  x13, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx
        add             x8,  x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
        add             x11, x11, #(32 >> \sx) // grain_lut += FG_BLOCK_SIZE * bx

        ldr             w13, [sp, #64]         // type

        movrel          x16, overlap_coeffs_\sx
        movrel          x14, fguv_loop_sx\sx\()_tbl

        ld1             {v27.8b, v28.8b}, [x16] // overlap_coeffs
        tst             w13, #1                // type bit 0 = y overlap
        ldrsw           x13, [x14, w13, uxtw #2]

        b.eq            1f
        // y overlap
        // Process only the overlap rows first (2, or 1 when \sy);
        // w12 keeps the rest for the loop's continuation.
        sub             w12, w9,  #(2 >> \sy)  // backup remaining h
        mov             w9,  #(2 >> \sy)

1:
        add             x13, x14, x13

        // Initial vertical overlap coefficients: 23/22 when the chroma
        // rows are subsampled, 27/17 otherwise.
.if \sy
        movi            v25.16b, #23
        movi            v26.16b, #22
.else
        movi            v25.16b, #27
        movi            v26.16b, #17
.endif

.if \sy
        // Each chroma row maps to two luma rows.
        add             x7,  x7,  x7           // luma_stride *= 2
.endif

        br              x13
endfunc
.endm

fguv 420, 1, 1
fguv 422, 1, 0
fguv 444, 0, 0
   1685 
// Inner loop for chroma grain without horizontal subsampling (422/444),
// 32 pixels per row. Eight entry points are generated, one per
// (csfl, x-overlap, y-overlap) combination, dispatched via
// fguv_loop_sx0_tbl.
//
// Register setup (established by the fguv entry function):
//   x0  = dst,  x1 = src,  x2 = stride
//   x6  = luma_row,  x7 = luma_stride
//   x4  = grain_lut old (left block),      x5 = grain_lut (current block)
//   x8  = grain_lut top block,             x11 = grain_lut top-left block
//   x10 = grain_lut stride,                w9 = rows to process
//   w12 = remaining h after overlap rows (only when oy)
//   v8.h[0]/v8.h[1] = uv_luma_mult/uv_mult, v24 = uv_offset (csfl=0 only)
//   v25/v26 = vertical overlap coefficients
//   v27/v28 = horizontal overlap coefficients
//   v29     = -scaling_shift, v30/v31 = output clamp min/max
function fguv_loop_sx0_neon
.macro fguv_loop_sx0 csfl, ox, oy
L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v0.16b,  v1.16b},  [x6],  x7  // luma
        ld1             {v6.16b,  v7.16b},  [x1],  x2  // src
.if \ox
        ld1             {v20.8b},           [x4],  x10 // grain_lut old
.endif
.if \oy
        ld1             {v22.16b, v23.16b}, [x8],  x10 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v21.8b},           [x11], x10 // grain_lut top old
.endif
        ld1             {v18.16b, v19.16b}, [x5],  x10 // grain_lut

.if !\csfl
        // csfl=0: the scaling input is not luma directly but
        // clip((luma * uv_luma_mult + src * uv_mult) >> 6 + uv_offset).
        uxtl            v2.8h,   v0.8b
        uxtl2           v3.8h,   v0.16b
        uxtl            v4.8h,   v1.8b
        uxtl2           v5.8h,   v1.16b
        uxtl            v0.8h,   v6.8b
        uxtl2           v1.8h,   v6.16b
        uxtl            v16.8h,  v7.8b
        uxtl2           v17.8h,  v7.16b
        mul             v2.8h,   v2.8h,   v8.h[0]
        mul             v3.8h,   v3.8h,   v8.h[0]
        mul             v4.8h,   v4.8h,   v8.h[0]
        mul             v5.8h,   v5.8h,   v8.h[0]
        mul             v0.8h,   v0.8h,   v8.h[1]
        mul             v1.8h,   v1.8h,   v8.h[1]
        mul             v16.8h,  v16.8h,  v8.h[1]
        mul             v17.8h,  v17.8h,  v8.h[1]
        sqadd           v2.8h,   v2.8h,   v0.8h
        sqadd           v3.8h,   v3.8h,   v1.8h
        sqadd           v4.8h,   v4.8h,   v16.8h
        sqadd           v5.8h,   v5.8h,   v17.8h
        sshr            v2.8h,   v2.8h,   #6
        sshr            v3.8h,   v3.8h,   #6
        sshr            v4.8h,   v4.8h,   #6
        sshr            v5.8h,   v5.8h,   #6
        add             v2.8h,   v2.8h,   v24.8h
        add             v3.8h,   v3.8h,   v24.8h
        add             v4.8h,   v4.8h,   v24.8h
        add             v5.8h,   v5.8h,   v24.8h
        sqxtun          v0.8b,   v2.8h
        sqxtun2         v0.16b,  v3.8h
        sqxtun          v1.8b,   v4.8h
        sqxtun2         v1.16b,  v5.8h
.endif

        // Scaling LUT lookup for 32 pixels (input v0/v1); results in
        // v4/v5 (see the "scaling" uxtl below).
        bl              gather32_neon

.if \ox
        // Horizontal overlap: blend old (left) and new grain columns.
        smull           v20.8h,  v20.8b,  v27.8b
        smlal           v20.8h,  v18.8b,  v28.8b
.endif

.if \oy
.if \ox
        // Same horizontal blend for the top row's grain.
        smull           v21.8h,  v21.8b,  v27.8b
        smlal           v21.8h,  v22.8b,  v28.8b
        sqrshrn         v20.8b,  v20.8h,  #5
        sqrshrn         v21.8b,  v21.8h,  #5
.endif

        // Vertical overlap: blend top-block grain (coeff v25) with the
        // current block's grain (coeff v26), then narrow with round2(x, 5).
.if \ox
        smull           v16.8h,  v20.8b,  v26.8b
.else
        smull           v16.8h,  v18.8b,  v26.8b
.endif
        smull2          v17.8h,  v18.16b, v26.16b
        smull           v18.8h,  v19.8b,  v26.8b
        smull2          v19.8h,  v19.16b, v26.16b
.if \ox
        smlal           v16.8h,  v21.8b,  v25.8b
.else
        smlal           v16.8h,  v22.8b,  v25.8b
.endif
        smlal2          v17.8h,  v22.16b, v25.16b
        smlal           v18.8h,  v23.8b,  v25.8b
        smlal2          v19.8h,  v23.16b, v25.16b
        sqrshrn         v22.8b,  v16.8h,  #5
        sqrshrn2        v22.16b, v17.8h,  #5
        sqrshrn         v23.8b,  v18.8h,  #5
        sqrshrn2        v23.16b, v19.8h,  #5
.endif

        // sxtl of grain
.if \oy
        sxtl            v16.8h,  v22.8b
        sxtl2           v17.8h,  v22.16b
        sxtl            v18.8h,  v23.8b
        sxtl2           v19.8h,  v23.16b
.elseif \ox
        // Narrow the horizontally-blended left edge here (no oy path
        // did it above), then widen all grain to 16 bit.
        sqrshrn         v20.8b,  v20.8h,  #5
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
        sxtl            v16.8h,  v20.8b
.else
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
        sxtl            v18.8h,  v19.8b
        sxtl2           v19.8h,  v19.16b
.endif

        uxtl            v2.8h,   v4.8b   // scaling
        uxtl2           v3.8h,   v4.16b
        uxtl            v4.8h,   v5.8b
        uxtl2           v5.8h,   v5.16b

        mul             v16.8h,  v16.8h,  v2.8h   // scaling * grain
        mul             v17.8h,  v17.8h,  v3.8h
        mul             v18.8h,  v18.8h,  v4.8h
        mul             v19.8h,  v19.8h,  v5.8h

        srshl           v16.8h,  v16.8h,  v29.8h  // round2(scaling * grain, scaling_shift)
        srshl           v17.8h,  v17.8h,  v29.8h
        srshl           v18.8h,  v18.8h,  v29.8h
        srshl           v19.8h,  v19.8h,  v29.8h

        uaddw           v16.8h,  v16.8h,  v6.8b   // *src + noise
        uaddw2          v17.8h,  v17.8h,  v6.16b
        uaddw           v18.8h,  v18.8h,  v7.8b
        uaddw2          v19.8h,  v19.8h,  v7.16b

        // Saturating narrow back to u8 and clamp to the configured range.
        sqxtun          v0.8b,   v16.8h
        sqxtun2         v0.16b,  v17.8h
        sqxtun          v1.8b,   v18.8h
        sqxtun2         v1.16b,  v19.8h

        umax            v0.16b,  v0.16b,  v30.16b
        umax            v1.16b,  v1.16b,  v30.16b
        umin            v0.16b,  v0.16b,  v31.16b
        umin            v1.16b,  v1.16b,  v31.16b

        subs            w9,  w9,  #1
.if \oy
        // Switch to the second row's vertical overlap coefficients
        // (taken from v28) for subsequent iterations.
        dup             v25.16b, v28.b[0]
        dup             v26.16b, v28.b[1]
.endif
        st1             {v0.16b,  v1.16b},  [x0], x2 // dst
        b.gt            1b

.if \oy
        // Overlap rows done; if any rows remain, continue in the
        // corresponding no-y-overlap loop.
        cmp             w12, #0
        mov             w9,  w12               // restore actual remaining h
        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
.endif
        b               9f
.endm
        fguv_loop_sx0   0, 0, 0
        fguv_loop_sx0   0, 0, 1
        fguv_loop_sx0   0, 1, 0
        fguv_loop_sx0   0, 1, 1
        fguv_loop_sx0   1, 0, 0
        fguv_loop_sx0   1, 0, 1
        fguv_loop_sx0   1, 1, 0
        fguv_loop_sx0   1, 1, 1

// Shared epilogue: restore d8 and x30 saved by the fguv entry function.
9:
        ldr             d8,       [sp, #16]
        ldr             x30,      [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc
   1855 
// Dispatch table for fguv_loop_sx0_neon. Entries are 32-bit offsets
// relative to the table base, loaded with ldrsw by the fguv entry
// function. Ordered so that index bit 0 selects y overlap, bit 1
// selects x overlap, and bit 2 selects csfl.
jumptable fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
        .word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
endjumptable
   1866 
// Inner loop for chroma grain with horizontal subsampling (420/422),
// 16 chroma pixels per row (32 luma pixels). Eight entry points are
// generated, one per (csfl, x-overlap, y-overlap) combination,
// dispatched via fguv_loop_sx1_tbl.
//
// Register setup matches fguv_loop_sx0_neon:
//   x0 = dst, x1 = src, x2 = stride, x6 = luma_row, x7 = luma_stride,
//   x4/x5/x8/x11 = grain_lut left/current/top/top-left blocks,
//   x10 = grain_lut stride, w9 = rows to process, w12 = remaining h,
//   v8/v24 = csfl=0 mapping params, v25/v26 = vertical overlap coeffs,
//   v27/v28 = horizontal overlap coeffs, v29 = -scaling_shift,
//   v30/v31 = output clamp min/max.
function fguv_loop_sx1_neon
.macro fguv_loop_sx1 csfl, ox, oy
L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
        AARCH64_VALID_JUMP_TARGET
1:
        ld1             {v0.16b, v1.16b},  [x6],  x7  // luma
        ld1             {v6.16b},          [x1],  x2  // src
.if \ox
        ld1             {v20.8b},          [x4],  x10 // grain_lut old
.endif
.if \oy
        ld1             {v22.16b},         [x8],  x10 // grain_lut top
.endif
.if \ox && \oy
        ld1             {v21.8b},          [x11], x10 // grain_lut top old
.endif
        ld1             {v18.16b},         [x5],  x10 // grain_lut

        // Horizontal subsampling: pairwise-sum 32 luma pixels to 16 sums.
        uaddlp          v2.8h,   v0.16b
        uaddlp          v3.8h,   v1.16b
.if \csfl
        // csfl=1: scaling input is the rounded luma pair average.
        rshrn           v0.8b,   v2.8h,   #1
        rshrn2          v0.16b,  v3.8h,   #1
.else
        // csfl=0: map the averaged luma through
        // clip((luma * uv_luma_mult + src * uv_mult) >> 6 + uv_offset).
        urshr           v2.8h,   v2.8h,   #1
        urshr           v3.8h,   v3.8h,   #1
        uxtl            v0.8h,   v6.8b
        uxtl2           v1.8h,   v6.16b
        mul             v2.8h,   v2.8h,   v8.h[0]
        mul             v3.8h,   v3.8h,   v8.h[0]
        mul             v0.8h,   v0.8h,   v8.h[1]
        mul             v1.8h,   v1.8h,   v8.h[1]
        sqadd           v2.8h,   v2.8h,   v0.8h
        sqadd           v3.8h,   v3.8h,   v1.8h
        sshr            v2.8h,   v2.8h,   #6
        sshr            v3.8h,   v3.8h,   #6
        add             v2.8h,   v2.8h,   v24.8h
        add             v3.8h,   v3.8h,   v24.8h
        sqxtun          v0.8b,   v2.8h
        sqxtun2         v0.16b,  v3.8h
.endif

        // Scaling LUT lookup for 16 pixels (input v0); result in v4
        // (see the "scaling" uxtl below).
        bl              gather16_neon

.if \ox
        // Horizontal overlap: blend old (left) and new grain columns.
        smull           v20.8h,  v20.8b,  v27.8b
        smlal           v20.8h,  v18.8b,  v28.8b
.endif

.if \oy
.if \ox
        // Same horizontal blend for the top row's grain.
        smull           v21.8h,  v21.8b,  v27.8b
        smlal           v21.8h,  v22.8b,  v28.8b
        sqrshrn         v20.8b,  v20.8h,  #5
        sqrshrn         v21.8b,  v21.8h,  #5
.endif

        // Vertical overlap: blend top-block grain (coeff v25) with the
        // current block's grain (coeff v26), then narrow with round2(x, 5).
.if \ox
        smull           v16.8h,  v20.8b,  v26.8b
.else
        smull           v16.8h,  v18.8b,  v26.8b
.endif
        smull2          v17.8h,  v18.16b, v26.16b
.if \ox
        smlal           v16.8h,  v21.8b,  v25.8b
.else
        smlal           v16.8h,  v22.8b,  v25.8b
.endif
        smlal2          v17.8h,  v22.16b, v25.16b
        sqrshrn         v22.8b,  v16.8h,  #5
        sqrshrn2        v22.16b, v17.8h,  #5
.endif

        // sxtl of grain
.if \oy
        sxtl            v16.8h,  v22.8b
        sxtl2           v17.8h,  v22.16b
.elseif \ox
        // Narrow the horizontally-blended left edge here (no oy path
        // did it above), then widen all grain to 16 bit.
        sqrshrn         v20.8b,  v20.8h,  #5
        sxtl2           v17.8h,  v18.16b
        sxtl            v16.8h,  v20.8b
.else
        sxtl            v16.8h,  v18.8b
        sxtl2           v17.8h,  v18.16b
.endif

        uxtl            v2.8h,   v4.8b   // scaling
        uxtl2           v3.8h,   v4.16b

        mul             v16.8h,  v16.8h,  v2.8h   // scaling * grain
        mul             v17.8h,  v17.8h,  v3.8h

        srshl           v16.8h,  v16.8h,  v29.8h  // round2(scaling * grain, scaling_shift)
        srshl           v17.8h,  v17.8h,  v29.8h

        uaddw           v16.8h,  v16.8h,  v6.8b   // *src + noise
        uaddw2          v17.8h,  v17.8h,  v6.16b

        // Saturating narrow back to u8 and clamp to the configured range.
        sqxtun          v0.8b,   v16.8h
        sqxtun2         v0.16b,  v17.8h

        umax            v0.16b,  v0.16b,  v30.16b
        umin            v0.16b,  v0.16b,  v31.16b

        // Swap v25/v26 each row (via v16 as scratch), alternating the
        // vertical overlap coefficients between rows.
.if \oy
        mov             v16.16b, v25.16b
.endif
        subs            w9,  w9,  #1
.if \oy
        mov             v25.16b, v26.16b
        mov             v26.16b, v16.16b
.endif
        st1             {v0.16b},  [x0], x2 // dst
        b.gt            1b

.if \oy
        // Overlap rows done; if any rows remain, continue in the
        // corresponding no-y-overlap loop.
        cmp             w12, #0
        mov             w9,  w12               // restore actual remaining h
        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
.endif

        b               9f
.endm
        fguv_loop_sx1   0, 0, 0
        fguv_loop_sx1   0, 0, 1
        fguv_loop_sx1   0, 1, 0
        fguv_loop_sx1   0, 1, 1
        fguv_loop_sx1   1, 0, 0
        fguv_loop_sx1   1, 0, 1
        fguv_loop_sx1   1, 1, 0
        fguv_loop_sx1   1, 1, 1

// Shared epilogue: restore d8 and x30 saved by the fguv entry function.
9:
        ldr             d8,       [sp, #16]
        ldr             x30,      [sp], #32
        AARCH64_VALIDATE_LINK_REGISTER
        ret
endfunc
   2005 
// Dispatch table for fguv_loop_sx1_neon. Entries are 32-bit offsets
// relative to the table base, loaded with ldrsw by the fguv entry
// function. Ordered so that index bit 0 selects y overlap, bit 1
// selects x overlap, and bit 2 selects csfl.
jumptable fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
        .word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
endjumptable