tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

filmgrain16.S (74703B)


      1 /*
      2 * Copyright © 2021, VideoLAN and dav1d authors
      3 * Copyright © 2021, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 #include "src/arm/asm-offsets.h"
     31 
     32 #define GRAIN_WIDTH 82
     33 #define GRAIN_HEIGHT 73
     34 
     35 #define SUB_GRAIN_WIDTH 44
     36 #define SUB_GRAIN_HEIGHT 38
     37 
     38 .macro increment_seed steps, shift=1
               // Advance the pseudo-random state held in w2 by "steps" steps in
               // one go. The feedback word is r ^ (r >> 3) ^ (r >> 12) ^ (r >> 1);
               // its low "steps" bits are the newly generated bits.
               // shift=1: the state is shifted right by "steps" and the new bits
               //          are inserted at bit (16 - steps), keeping a 16 bit state.
               // shift=0: the old state is left in place and the new bits are
               //          inserted at bit 16 and up; the caller is responsible for
               //          shifting them in afterwards (see read_shift_rand).
               // Clobbers w11-w13.
     39        lsr             w11, w2,  #3
     40        lsr             w12, w2,  #12
     41        lsr             w13, w2,  #1
     42        eor             w11, w2,  w11                     // (r >> 0) ^ (r >> 3)
     43        eor             w12, w12, w13                     // (r >> 12) ^ (r >> 1)
     44        eor             w11, w11, w12                     // (r >> 0) ^ (r >> 3) ^ (r >> 12) ^ (r >> 1)
     45 .if \shift
     46        lsr             w2,  w2,  #\steps
     47 .endif
               // Keep only the "steps" freshly generated feedback bits.
     48        and             w11, w11, #((1 << \steps) - 1)    // bit
     49 .if \shift
     50        orr             w2,  w2,  w11, lsl #(16 - \steps) // *state
     51 .else
     52        orr             w2,  w2,  w11, lsl #16            // *state
     53 .endif
     54 .endm
     55 
     56 .macro read_rand dest, bits, age
               // Extract a "bits" wide unsigned field from the state in x2 into
               // dest without modifying the state. The field starts at bit
               // (16 - bits - age), so age 0 selects the top "bits" bits of the
               // 16 bit state and larger ages select fields further down.
     57        ubfx            \dest,  x2,   #16 - \bits - \age, #\bits
     58 .endm
     59 
     60 .macro read_shift_rand dest, bits
               // Extract a "bits" wide unsigned field starting at bit (17 - bits)
               // of x2 into dest, then shift the state in w2 right by one bit.
               // The field reaches up to bit 16, i.e. it includes the lowest of
               // the bits that increment_seed with shift=0 places above the
               // 16 bit state.
     61        ubfx            \dest,  x2,   #17 - \bits, #\bits
     62        lsr             w2,  w2,  #1
     63 .endm
     64 
     65 // special calling convention:
     66 // w2 holds seed
     67 // x3 holds dav1d_gaussian_sequence
     68 // clobbers x11-x15
     69 // returns in v0.8h
        //
        // Produces 8 entries: the seed is advanced by 4 steps twice, and each
        // advance yields four 11 bit indices (ages 3, 2, 1, 0). Every index
        // selects one 16 bit element at x3 + 2*index, which is loaded into the
        // next lane of v0. The values are returned as loaded; no rounding shift
        // is applied here.
     70 function get_gaussian_neon
     71        increment_seed  4
     72        read_rand       x14, 11,  3
     73        read_rand       x15, 11,  2
     74        add             x14, x3,  x14, lsl #1
     75        add             x15, x3,  x15, lsl #1
     76        ld1             {v0.h}[0], [x14]
     77        read_rand       x14, 11,  1
     78        ld1             {v0.h}[1], [x15]
     79        add             x14, x3,  x14, lsl #1
               // The last index of the first group must be extracted before the
               // seed is advanced again; x14/x15 already hold everything needed
               // for lanes 2 and 3.
     80        read_rand       x15, 11,  0
     81        increment_seed  4
     82        add             x15, x3,  x15, lsl #1
     83        ld1             {v0.h}[2], [x14]
     84        read_rand       x14, 11,  3
     85        ld1             {v0.h}[3], [x15]
     86        add             x14, x3,  x14, lsl #1
     87        read_rand       x15, 11,  2
     88        ld1             {v0.h}[4], [x14]
     89        add             x15, x3,  x15, lsl #1
     90        read_rand       x14, 11,  1
     91        ld1             {v0.h}[5], [x15]
     92        read_rand       x15, 11,  0
     93        add             x14, x3,  x14, lsl #1
     94        add             x15, x3,  x15, lsl #1
     95        ld1             {v0.h}[6], [x14]
     96        ld1             {v0.h}[7], [x15]
     97        ret
     98 endfunc
     99 
    100 .macro store_grain_row r0, r1, r2, r3, r4, r5
               // Store r0-r4 in full (5 * 16 bytes) followed by the lowest
               // halfword of r5 to [x0], post-incrementing x0 by 82 bytes in
               // total.
               // NOTE(review): 82 bytes is less than the GRAIN_WIDTH*2 byte row
               // stride used by the 16 bit code in this file, and no use of this
               // macro is visible in this part of the file - confirm whether it
               // is still needed.
    101        st1             {\r0\().16b,\r1\().16b}, [x0], #32
    102        st1             {\r2\().16b,\r3\().16b}, [x0], #32
    103        st1             {\r4\().16b},  [x0], #16
    104        st1             {\r5\().h}[0], [x0], #2
    105 .endm
    106 
    107 function get_grain_2_neon
               // Produce two grain entries in v0.h[0] and v0.h[1].
               // In:  w2 = seed, x3 = gaussian sequence table,
               //      v31 = per-lane shift passed to srshl.
               // Out: v0.4h (lanes 2-3 are whatever v0 held before, shifted).
               // Clobbers x11-x15 (x11-x13 via increment_seed).
    108        increment_seed  2
               // Same extract -> address -> load pipeline for both entries.
    109        read_rand       x14, 11,  1
    110        add             x14, x3,  x14, lsl #1
    111        read_rand       x15, 11,  0
    112        ld1             {v0.h}[0], [x14]
    113        add             x15, x3,  x15, lsl #1
    114        ld1             {v0.h}[1], [x15]
    115        srshl           v0.4h,   v0.4h,   v31.4h
    116        ret
    117 endfunc
    118 
    119 .macro get_grain_2 dst
               // Call get_grain_2_neon and, unless dst is v0 itself, copy the low
               // 8 bytes of the result from v0 into dst. Overwrites x30 (bl), so
               // the surrounding function must have saved its return address.
    120        bl              get_grain_2_neon
    121 .ifnc \dst, v0
    122        mov             \dst\().8b, v0.8b
    123 .endif
    124 .endm
    125 
    126 function get_grain_4_neon
               // Produce four grain entries in v0.4h.
               // In:  w2 = seed, x3 = gaussian sequence table,
               //      v31 = per-lane shift passed to srshl.
               // Out: v0.4h.
               // Clobbers x11-x15 (x11-x13 via increment_seed).
    127        increment_seed  4
               // x14 serves lanes 0 and 2, x15 serves lanes 1 and 3; each pointer
               // is consumed by its load before being reused for the next index.
    128        read_rand       x14, 11,  3
    129        add             x14, x3,  x14, lsl #1
    130        read_rand       x15, 11,  2
    131        ld1             {v0.h}[0], [x14]
    132        add             x15, x3,  x15, lsl #1
    133        read_rand       x14, 11,  1
    134        ld1             {v0.h}[1], [x15]
    135        add             x14, x3,  x14, lsl #1
    136        read_rand       x15, 11,  0
    137        ld1             {v0.h}[2], [x14]
    138        add             x15, x3,  x15, lsl #1
    139        ld1             {v0.h}[3], [x15]
    140        srshl           v0.4h,   v0.4h,   v31.4h
    141        ret
    142 endfunc
    143 
    144 .macro get_grain_4 dst
               // Call get_grain_4_neon and, unless dst is v0 itself, copy the low
               // 8 bytes of the result from v0 into dst. Overwrites x30 (bl), so
               // the surrounding function must have saved its return address.
    145        bl              get_grain_4_neon
    146 .ifnc \dst, v0
    147        mov             \dst\().8b, v0.8b
    148 .endif
    149 .endm
    150 
    151 // w15 holds the number of entries to produce
    152 // w14, w16 and w17 hold the previous output entries
    153 // v0 holds the vector of produced entries
    154 // v1 holds the input vector of sums from above
        //
        // Register roles as used by the code below:
        //   lag 1: w14 = previous output;                   coefficient w4
        //   lag 2: w16, w14 = previous outputs (old..new);  coefficients w4, w17
        //          (w17 is a coefficient here, not a previous output)
        //   lag 3: w17, w16, w14 = previous outputs;        coefficients w4, w20, w21
        //   w7 = ar_coeff_shift, w8 = rounding term for it
        //   w9 = grain shift,    w10 = rounding term for it
        //   w5 = upper clamp bound, w6 = lower clamp bound
        //   w2 = seed (consumed through read_shift_rand), x3 = gaussian table
        // Clobbers w11-w13. The loop body always runs at least once.
    155 .macro output_lag n
    156 function output_lag\n\()_neon
    157 1:
    158        read_shift_rand x13, 11
               // w11 = sum of the contributions from the rows above for this entry
    159        mov             w11, v1.s[0]
               // w12 = signed gaussian table entry selected by the random index
    160        ldrsh           w12, [x3, x13, lsl #1]
               // Rotate v0 down by one 16 bit lane; the new entry goes into lane 7
               // at the end of the iteration.
    161        ext             v0.16b,  v0.16b,  v0.16b,  #2
    162 .if \n == 1
    163        madd            w11, w14, w4,  w11        // sum (above) + *coeff * prev output
    164 .elseif \n == 2
    165        madd            w11, w16, w4,  w11        // sum (above) + *coeff * prev output 1
    166        madd            w11, w14, w17, w11        // += *coeff * prev output 2
    167        mov             w16, w14
    168 .else
    169        madd            w11, w17, w4,  w11        // sum (above) + *coeff * prev output 1
    170        madd            w11, w16, w20, w11        // sum (above) + *coeff * prev output 2
    171        madd            w11, w14, w21, w11        // += *coeff * prev output 3
    172        mov             w17, w16
    173        mov             w16, w14
    174 .endif
    175        add             w14, w11, w8              // 1 << (ar_coeff_shift - 1)
    176        add             w12, w12, w10             // 1 << (4 - bitdepth_min_8 + grain_scale_shift - 1)
    177        asr             w14, w14, w7              // >> ar_coeff_shift
    178        asr             w12, w12, w9              // >> (4 - bitdepth_min_8 + grain_scale_shift)
    179        add             w14, w14, w12
               // Clamp: w14 = max(min(w14, w5), w6)
    180        cmp             w14, w5
    181        csel            w14, w14, w5,  le
    182        cmp             w14, w6
    183        csel            w14, w14, w6,  ge
               // The flags set here are consumed by b.gt below; ext and ins do not
               // modify them.
    184        subs            w15, w15, #1
               // Rotate v1 down by one 32 bit lane so the next sum is in lane 0.
    185        ext             v1.16b,  v1.16b,  v1.16b,  #4
    186        ins             v0.h[7], w14
    187        b.gt            1b
    188        ret
    189 endfunc
    190 .endm
    191 
    192 output_lag 1
    193 output_lag 2
    194 output_lag 3
    195 
    196 
    197 function sum_lag1_above_neon
               // Compute, for 8 output entries, the weighted sum of the three
               // neighbours in the row above (left, mid, right).
               // In:  x0  = current output position
               //      v16 = previous 8 entries of the row above
               //      v17 = the 8 entries right above
               //      v27, v28, v29 = multipliers for left, mid, right
               // Out: v4.4s = sums for entries 0-3, v5.4s = sums for entries 4-7;
               //      v16/v17 advanced by one block for the next call.
               // Clobbers x12, v0, v1, v18.
    198        sub             x12, x0,  #1*GRAIN_WIDTH*2 - 16
    199        ld1             {v18.8h}, [x12] // load top right
    200 
    201        ext             v0.16b,  v16.16b, v17.16b, #14 // top left, top mid
    202        ext             v1.16b,  v17.16b, v18.16b, #2  // top mid, top right
    203 
    204        smull           v4.4s,   v17.4h,  v28.4h
    205        smlal           v4.4s,   v0.4h,   v27.4h
    206        smlal           v4.4s,   v1.4h,   v29.4h
    207        smull2          v5.4s,   v17.8h,  v28.8h
    208        smlal2          v5.4s,   v0.8h,   v27.8h
    209        smlal2          v5.4s,   v1.8h,   v29.8h
    210 
               // Slide the window: the block above becomes the left block, the
               // freshly loaded right block becomes the block above.
    211        mov             v16.16b, v17.16b
    212        mov             v17.16b, v18.16b
    213 
    214        ret
    215 endfunc
    216 
    217 .macro sum_lag_n_body lag, type, uv_layout, edge, elems, uv_coeff
               // Body shared by all sum_<type>_<lag>_<edge>_neon functions.
               //   lag       lag1/lag2/lag3, selects the helper functions to call
               //   type      y, uv_444, uv_422 or uv_420
               //   uv_layout 0 for luma, otherwise 444/422/420
               //   edge      left, mid or right
               //   elems     number of filtered entries in this block (8, 7 or 1)
               //   uv_coeff  optional byte lane holding the luma multiplier; when
               //             blank, v30 is used as the multiplier vector
               // Expects the caller to have pushed x30; the tail below pops it and
               // returns. Writes 8 entries (16 bytes) to [x0] and advances x0.
    218        bl              sum_\lag\()_above_neon
               // For chroma, fetch the co-located luma grain via x19 into v0.
    219 .ifc \type, uv_420
               // Rounded average of a 2x2 luma neighbourhood: pairwise add within
               // each of two rows, add the rows, then rounding shift right by 2.
    220        add             x12, x19, #GRAIN_WIDTH*2
    221        ld1             {v22.8h, v23.8h}, [x19], #32
    222        ld1             {v24.8h, v25.8h}, [x12]
    223        addp            v22.8h,  v22.8h,  v23.8h
    224        addp            v23.8h,  v24.8h,  v25.8h
    225        add             v22.8h,  v22.8h,  v23.8h
    226        srshr           v0.8h,   v22.8h,  #2
    227 .endif
    228 .ifc \type, uv_422
               // Rounded average of each horizontal pair of luma entries.
    229        ld1             {v22.8h, v23.8h}, [x19], #32
    230        addp            v22.8h,  v22.8h,  v23.8h
    231        srshr           v0.8h,   v22.8h,  #1
    232 .endif
    233 .ifc \type, uv_444
    234        ld1             {v0.8h}, [x19], #16
    235 .endif
    236 .if \uv_layout
               // Accumulate luma * multiplier on top of the sums from above.
    237 .ifnb \uv_coeff
    238        dup             v1.8b,   \uv_coeff
    239        sxtl            v1.8h,   v1.8b
    240        smlal           v4.4s,   v0.4h,   v1.4h
    241        smlal2          v5.4s,   v0.8h,   v1.8h
    242 .else
    243        smlal           v4.4s,   v0.4h,   v30.4h
    244        smlal2          v5.4s,   v0.8h,   v30.8h
    245 .endif
    246 .endif
               // Several variants share the same tail; those branch to the label
               // emitted by another instantiation instead of duplicating the code.
    247 .if \uv_layout && \elems == 8
    248        b               sum_\lag\()_y_\edge\()_start
    249 .elseif \uv_layout == 444 && \elems == 7
    250        b               sum_\lag\()_y_\edge\()_start
    251 .elseif \uv_layout == 422 && \elems == 1
    252        b               sum_\lag\()_uv_420_\edge\()_start
    253 .else
    254 sum_\lag\()_\type\()_\edge\()_start:
    255 .if \elems > 4
    256 .ifc \edge, left
               // Left edge: three entries are taken from the gaussian table
               // without filtering and placed in lanes 5-7 of v0; they also seed
               // the "previous output" registers used by output_lag.
    257        increment_seed  4
    258        read_rand       x12, 11,  3
    259        read_rand       x13, 11,  2
    260        read_rand       x14, 11,  1
    261        add             x12, x3,  x12, lsl #1
    262        add             x13, x3,  x13, lsl #1
    263        add             x14, x3,  x14, lsl #1
    264        ld1             {v0.h}[5], [x12]
    265        ld1             {v0.h}[6], [x13]
    266        ld1             {v0.h}[7], [x14]
    267        lsl             x2,  x2,  #1             // shift back the state as if we'd done increment_seed with shift=0
    268        srshl           v0.8h,   v0.8h,   v31.8h
               // Rotate the sums so that the one for the fourth entry is in lane 0.
    269        ext             v4.16b,  v4.16b,  v4.16b,  #12
    270 .ifc \lag, lag3
    271        smov            w17, v0.h[5]
    272 .endif
    273 .ifnc \lag, lag1
    274        smov            w16, v0.h[6]
    275 .endif
    276        smov            w14, v0.h[7]
    277 
               // Produce the one remaining entry of the first half.
    278        mov             v1.16b,  v4.16b
    279        mov             w15, #1
    280        bl              output_\lag\()_neon
    281 .else
               // First half: four filtered entries.
    282        increment_seed  4, shift=0
    283        mov             v1.16b,  v4.16b
    284        mov             w15, #4
    285        bl              output_\lag\()_neon
    286 .endif
    287 
               // Second half, using the sums in v5.
    288        increment_seed  4, shift=0
    289        mov             v1.16b,  v5.16b
    290 .ifc \edge, right
               // Right edge: three filtered entries, then one unfiltered entry
               // appended as the last lane.
    291        mov             w15, #3
    292        bl              output_\lag\()_neon
    293        read_shift_rand x15, 11
    294        add             x15, x3,  x15, lsl #1
    295        ld1             {v1.h}[0], [x15]
    296        srshl           v1.4h,   v1.4h,   v31.4h
    297        ext             v0.16b,  v0.16b,  v1.16b,  #2
    298 .else
    299        mov             w15, #4
    300        bl              output_\lag\()_neon
    301 .endif
    302 .else
    303        // elems == 1
               // One filtered entry followed by three unfiltered ones.
    304        increment_seed  4, shift=0
    305        mov             v1.16b,  v4.16b
    306        mov             w15, #1
    307        bl              output_\lag\()_neon
               // output_lag shifted the state by one bit; shift by the remaining
               // three so that read_rand can address the other three indices.
    308        lsr             w2,  w2,  #3
    309 
    310        read_rand       x12, 11,  2
    311        read_rand       x13, 11,  1
    312        read_rand       x14, 11,  0
    313        add             x12, x3,  x12, lsl #1
    314        add             x13, x3,  x13, lsl #1
    315        add             x14, x3,  x14, lsl #1
    316        ld1             {v1.h}[0], [x12]
    317        ld1             {v1.h}[1], [x13]
    318        ld1             {v1.h}[2], [x14]
    319        srshl           v1.4h,   v1.4h,   v31.4h
               // Result: lane 0 = the filtered entry (lane 7 of v0), lanes 1-3 =
               // the unfiltered entries. Lanes 4-7 come from the upper lanes of v1
               // and are presumably not used by the caller - TODO confirm.
    320        ext             v0.16b,  v0.16b,  v1.16b,  #14
    321 .endif
    322        st1             {v0.8h}, [x0], #16
               // Restore the return address pushed by the enclosing function.
    323        ldr             x30,     [sp], #16
    324        AARCH64_VALIDATE_LINK_REGISTER
    325        ret
    326 .endif
    327 .endm
    328 
    329 .macro sum_lag1_func type, uv_layout, edge, elems=8
               // Define sum_<type>_lag1_<edge>_neon. The function saves x30 on the
               // stack; the matching restore and ret are emitted by (or reached
               // through a branch from) sum_lag_n_body. No uv_coeff is passed, so
               // the body uses v30 as the luma multiplier for chroma.
    330 function sum_\type\()_lag1_\edge\()_neon
    331        AARCH64_SIGN_LINK_REGISTER
    332        str             x30, [sp, #-16]!
    333 .ifc \edge, left
               // At the left edge there is no previous block, so prime v17 with
               // the 8 entries directly above the current position.
    334        sub             x12, x0,  #1*GRAIN_WIDTH*2
    335        ld1             {v17.8h}, [x12] // load the previous block right above
    336 .endif
    337        sum_lag_n_body  lag1, \type, \uv_layout, \edge, \elems
    338 endfunc
    339 .endm
    340 
        // The y variants must be instantiated, since several uv variants branch
        // to the sum_lag1_y_<edge>_start labels they define.
    341 sum_lag1_func y,      0,   left
    342 sum_lag1_func y,      0,   mid
    343 sum_lag1_func y,      0,   right, 7
    344 sum_lag1_func uv_444, 444, left
    345 sum_lag1_func uv_444, 444, mid
    346 sum_lag1_func uv_444, 444, right, 7
    347 sum_lag1_func uv_422, 422, left
    348 sum_lag1_func uv_422, 422, mid
    349 sum_lag1_func uv_422, 422, right, 1
    350 sum_lag1_func uv_420, 420, left
    351 sum_lag1_func uv_420, 420, mid
    352 sum_lag1_func uv_420, 420, right, 1
    353 
    354 
    355 function sum_lag2_above_neon
               // Compute, for 8 output entries, the weighted sum of a 5 wide
               // window in each of the two rows above.
               // In:  x0 = current output position
               //      v16/v17 = previous/current block, two rows above
               //      v19/v20 = previous/current block, one row above
               //      v30.b[0..9] = signed 8 bit multipliers; bytes 0-4 apply to
               //      the row two above, bytes 5-9 to the row one above, each
               //      ordered left to right
               // Out: v4.4s / v5.4s = sums for entries 0-3 / 4-7;
               //      v16/v17 and v19/v20 advanced by one block.
               // Clobbers x12, x13, v0, v1, v18, v21-v23, v26-v29.
    356        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
    357        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
    358        ld1             {v18.8h}, [x12] // load top right
    359        ld1             {v21.8h}, [x13]
    360 
               // Row two above: offsets -2, -1, +1, +2 (the centre tap is added at
               // the end together with the centre tap of the other row).
    361        dup             v26.8b,  v30.b[0]
    362        ext             v22.16b, v16.16b, v17.16b, #12 // top left, top mid
    363        dup             v27.8b,  v30.b[1]
    364        ext             v23.16b, v16.16b, v17.16b, #14
    365        sxtl            v26.8h,  v26.8b
    366        dup             v28.8b,  v30.b[3]
    367        ext             v0.16b,  v17.16b, v18.16b, #2  // top mid, top right
    368        sxtl            v27.8h,  v27.8b
    369        dup             v29.8b,  v30.b[4]
    370        ext             v1.16b,  v17.16b, v18.16b, #4
    371        sxtl            v28.8h,  v28.8b
    372        sxtl            v29.8h,  v29.8b
    373 
    374        smull           v4.4s,   v22.4h,  v26.4h
    375        smlal           v4.4s,   v23.4h,  v27.4h
    376        smlal           v4.4s,   v0.4h,   v28.4h
    377        smlal           v4.4s,   v1.4h,   v29.4h
    378        smull2          v5.4s,   v22.8h,  v26.8h
    379        smlal2          v5.4s,   v23.8h,  v27.8h
    380        smlal2          v5.4s,   v0.8h,   v28.8h
    381        smlal2          v5.4s,   v1.8h,   v29.8h
    382 
               // Row one above: offsets -2, -1, +1, +2.
               // The dups below use the .16b form while the ones above use .8b;
               // only the low 8 bytes are read by the following sxtl, so the
               // result is the same.
    383        dup             v26.16b, v30.b[5]
    384        ext             v22.16b, v19.16b, v20.16b, #12 // top left, top mid
    385        dup             v27.16b, v30.b[6]
    386        ext             v23.16b, v19.16b, v20.16b, #14
    387        sxtl            v26.8h,  v26.8b
    388        dup             v28.16b, v30.b[8]
    389        ext             v0.16b,  v20.16b, v21.16b, #2  // top mid, top right
    390        sxtl            v27.8h,  v27.8b
    391        dup             v29.16b, v30.b[9]
    392        ext             v1.16b,  v20.16b, v21.16b, #4
    393        sxtl            v28.8h,  v28.8b
    394        sxtl            v29.8h,  v29.8b
    395 
    396        smlal           v4.4s,   v22.4h,  v26.4h
    397        smlal           v4.4s,   v23.4h,  v27.4h
    398        smlal           v4.4s,   v0.4h,   v28.4h
    399        smlal           v4.4s,   v1.4h,   v29.4h
    400        smlal2          v5.4s,   v22.8h,  v26.8h
    401        smlal2          v5.4s,   v23.8h,  v27.8h
    402        smlal2          v5.4s,   v0.8h,   v28.8h
    403        smlal2          v5.4s,   v1.8h,   v29.8h
    404 
               // Centre taps of both rows (entries directly above).
    405        dup             v26.16b, v30.b[2]
    406        dup             v27.16b, v30.b[7]
    407        sxtl            v26.8h,  v26.8b
    408        sxtl            v27.8h,  v27.8b
    409 
    410        smlal           v4.4s,   v17.4h,  v26.4h
    411        smlal           v4.4s,   v20.4h,  v27.4h
    412        smlal2          v5.4s,   v17.8h,  v26.8h
    413        smlal2          v5.4s,   v20.8h,  v27.8h
               // Slide both row windows one block to the right.
    414        mov             v16.16b, v17.16b
    415        mov             v17.16b, v18.16b
    416 
    417        mov             v19.16b, v20.16b
    418        mov             v20.16b, v21.16b
    419        ret
    420 endfunc
    421 
    422 .macro sum_lag2_func type, uv_layout, edge, elems=8
               // Define sum_<type>_lag2_<edge>_neon. The function saves x30 on the
               // stack; the matching restore and ret are emitted by (or reached
               // through a branch from) sum_lag_n_body. For chroma, v30.b[12] is
               // passed as the luma multiplier.
    423 function sum_\type\()_lag2_\edge\()_neon
    424        AARCH64_SIGN_LINK_REGISTER
    425        str             x30, [sp, #-16]!
    426 .ifc \edge, left
               // At the left edge there is no previous block, so prime v17/v20
               // with the 8 entries directly above in each of the two rows.
    427        sub             x12, x0,  #2*GRAIN_WIDTH*2
    428        sub             x13, x0,  #1*GRAIN_WIDTH*2
    429        ld1             {v17.8h}, [x12] // load the previous block right above
    430        ld1             {v20.8h}, [x13]
    431 .endif
    432        sum_lag_n_body  lag2, \type, \uv_layout, \edge, \elems, v30.b[12]
    433 endfunc
    434 .endm
    435 
        // The y variants must be instantiated, since several uv variants branch
        // to the sum_lag2_y_<edge>_start labels they define.
    436 sum_lag2_func y,      0,   left
    437 sum_lag2_func y,      0,   mid
    438 sum_lag2_func y,      0,   right, 7
    439 sum_lag2_func uv_444, 444, left
    440 sum_lag2_func uv_444, 444, mid
    441 sum_lag2_func uv_444, 444, right, 7
    442 sum_lag2_func uv_422, 422, left
    443 sum_lag2_func uv_422, 422, mid
    444 sum_lag2_func uv_422, 422, right, 1
    445 sum_lag2_func uv_420, 420, left
    446 sum_lag2_func uv_420, 420, mid
    447 sum_lag2_func uv_420, 420, right, 1
    448 
    449 
    450 function sum_lag3_above_neon
               // Compute, for 8 output entries, the weighted sum of a 7 wide
               // window in each of the three rows above.
               // In:  x0 = current output position
               //      v13/v14 = previous/current block, three rows above
               //      v16/v17 = previous/current block, two rows above
               //      v19/v20 = previous/current block, one row above
               //      v29.b[0..15], v30.b[0..4] = 21 signed 8 bit multipliers,
               //      seven per row, ordered left to right, top row first
               // Out: v4.4s / v5.4s = sums for entries 0-3 / 4-7; all three row
               //      windows advanced by one block.
               // Clobbers x11-x13, v8-v12, v15, v18, v21-v28.
               // NOTE(review): v8-v15 are callee-saved (low halves) under
               // AAPCS64; the exported entry point is presumably responsible for
               // preserving them - confirm against the caller.
    451        sub             x11, x0,  #3*GRAIN_WIDTH*2 - 16
    452        sub             x12, x0,  #2*GRAIN_WIDTH*2 - 16
    453        sub             x13, x0,  #1*GRAIN_WIDTH*2 - 16
    454        ld1             {v15.8h}, [x11] // load top right
    455        ld1             {v18.8h}, [x12]
    456        ld1             {v21.8h}, [x13]
    457 
               // Row three above. v13 is read as the left block here and then
               // reused as a temporary for the +3 offset; it is reloaded from v14
               // at the end of the function.
    458        dup             v22.8b,  v29.b[0]
    459        ext             v8.16b,  v13.16b, v14.16b, #10 // top left, top mid
    460        dup             v23.8b,  v29.b[1]
    461        ext             v9.16b,  v13.16b, v14.16b, #12
    462        sxtl            v22.8h,  v22.8b
    463        dup             v24.8b,  v29.b[2]
    464        sxtl            v23.8h,  v23.8b
    465        dup             v25.8b,  v29.b[3]
    466        ext             v10.16b, v13.16b, v14.16b, #14
    467        sxtl            v24.8h,  v24.8b
    468        dup             v26.8b,  v29.b[4]
    469        ext             v11.16b, v14.16b, v15.16b, #2  // top mid, top right
    470        sxtl            v25.8h,  v25.8b
    471        dup             v27.8b,  v29.b[5]
    472        ext             v12.16b, v14.16b, v15.16b, #4
    473        sxtl            v26.8h,  v26.8b
    474        dup             v28.8b,  v29.b[6]
    475        ext             v13.16b, v14.16b, v15.16b, #6
    476        sxtl            v27.8h,  v27.8b
    477        sxtl            v28.8h,  v28.8b
    478 
               // Taps in order: -3, -2, -1, +1, +2, +3, then the centre (v25).
    479        smull           v4.4s,   v8.4h,   v22.4h
    480        smlal           v4.4s,   v9.4h,   v23.4h
    481        smlal           v4.4s,   v10.4h,  v24.4h
    482        smlal           v4.4s,   v11.4h,  v26.4h
    483        smlal           v4.4s,   v12.4h,  v27.4h
    484        smlal           v4.4s,   v13.4h,  v28.4h
    485        smlal           v4.4s,   v14.4h,  v25.4h
    486        smull2          v5.4s,   v8.8h,   v22.8h
    487        smlal2          v5.4s,   v9.8h,   v23.8h
    488        smlal2          v5.4s,   v10.8h,  v24.8h
    489        smlal2          v5.4s,   v11.8h,  v26.8h
    490        smlal2          v5.4s,   v12.8h,  v27.8h
    491        smlal2          v5.4s,   v13.8h,  v28.8h
    492        smlal2          v5.4s,   v14.8h,  v25.8h
    493 
               // Row two above, multipliers v29.b[7..13].
    494        dup             v22.8b,  v29.b[7]
    495        ext             v8.16b,  v16.16b, v17.16b, #10 // top left, top mid
    496        dup             v23.8b,  v29.b[8]
    497        ext             v9.16b,  v16.16b, v17.16b, #12
    498        sxtl            v22.8h,  v22.8b
    499        dup             v24.8b,  v29.b[9]
    500        sxtl            v23.8h,  v23.8b
    501        dup             v25.8b,  v29.b[10]
    502        ext             v10.16b, v16.16b, v17.16b, #14
    503        sxtl            v24.8h,  v24.8b
    504        dup             v26.8b,  v29.b[11]
    505        ext             v11.16b, v17.16b, v18.16b, #2  // top mid, top right
    506        sxtl            v25.8h,  v25.8b
    507        dup             v27.8b,  v29.b[12]
    508        ext             v12.16b, v17.16b, v18.16b, #4
    509        sxtl            v26.8h,  v26.8b
    510        dup             v28.8b,  v29.b[13]
    511        ext             v13.16b, v17.16b, v18.16b, #6
    512        sxtl            v27.8h,  v27.8b
    513        sxtl            v28.8h,  v28.8b
    514 
    515        smlal           v4.4s,   v8.4h,   v22.4h
    516        smlal           v4.4s,   v9.4h,   v23.4h
    517        smlal           v4.4s,   v10.4h,  v24.4h
    518        smlal           v4.4s,   v11.4h,  v26.4h
    519        smlal           v4.4s,   v12.4h,  v27.4h
    520        smlal           v4.4s,   v13.4h,  v28.4h
    521        smlal           v4.4s,   v17.4h,  v25.4h
    522        smlal2          v5.4s,   v8.8h,   v22.8h
    523        smlal2          v5.4s,   v9.8h,   v23.8h
    524        smlal2          v5.4s,   v10.8h,  v24.8h
    525        smlal2          v5.4s,   v11.8h,  v26.8h
    526        smlal2          v5.4s,   v12.8h,  v27.8h
    527        smlal2          v5.4s,   v13.8h,  v28.8h
    528        smlal2          v5.4s,   v17.8h,  v25.8h
    529 
               // Row one above, multipliers v29.b[14..15] and v30.b[0..4].
    530        dup             v22.8b,  v29.b[14]
    531        ext             v8.16b,  v19.16b, v20.16b, #10 // top left, top mid
    532        dup             v23.8b,  v29.b[15]
    533        ext             v9.16b,  v19.16b, v20.16b, #12
    534        sxtl            v22.8h,  v22.8b
    535        dup             v24.8b,  v30.b[0]
    536        sxtl            v23.8h,  v23.8b
    537        dup             v25.8b,  v30.b[1]
    538        ext             v10.16b, v19.16b, v20.16b, #14
    539        sxtl            v24.8h,  v24.8b
    540        dup             v26.8b,  v30.b[2]
    541        ext             v11.16b, v20.16b, v21.16b, #2  // top mid, top right
    542        sxtl            v25.8h,  v25.8b
    543        dup             v27.8b,  v30.b[3]
    544        ext             v12.16b, v20.16b, v21.16b, #4
    545        sxtl            v26.8h,  v26.8b
    546        dup             v28.8b,  v30.b[4]
    547        ext             v13.16b, v20.16b, v21.16b, #6
    548        sxtl            v27.8h,  v27.8b
    549        sxtl            v28.8h,  v28.8b
    550 
    551        smlal           v4.4s,   v8.4h,   v22.4h
    552        smlal           v4.4s,   v9.4h,   v23.4h
    553        smlal           v4.4s,   v10.4h,  v24.4h
    554        smlal           v4.4s,   v11.4h,  v26.4h
    555        smlal           v4.4s,   v12.4h,  v27.4h
    556        smlal           v4.4s,   v13.4h,  v28.4h
    557        smlal           v4.4s,   v20.4h,  v25.4h
               // Window slide for the row two above, interleaved with the
               // remaining multiply-accumulates (v16-v18 are no longer read).
    558        mov             v16.16b, v17.16b
    559        mov             v17.16b, v18.16b
    560        smlal2          v5.4s,   v8.8h,   v22.8h
    561        smlal2          v5.4s,   v9.8h,   v23.8h
    562        smlal2          v5.4s,   v10.8h,  v24.8h
    563        smlal2          v5.4s,   v11.8h,  v26.8h
    564        smlal2          v5.4s,   v12.8h,  v27.8h
    565        smlal2          v5.4s,   v13.8h,  v28.8h
    566        smlal2          v5.4s,   v20.8h,  v25.8h
    567 
               // Slide the remaining two row windows one block to the right.
    568        mov             v13.16b, v14.16b
    569        mov             v14.16b, v15.16b
    570 
    571        mov             v19.16b, v20.16b
    572        mov             v20.16b, v21.16b
    573        ret
    574 endfunc
    575 
    576 .macro sum_lag3_func type, uv_layout, edge, elems=8
               // Define sum_<type>_lag3_<edge>_neon. The function saves x30 on the
               // stack; the matching restore and ret are emitted by (or reached
               // through a branch from) sum_lag_n_body. For chroma, v30.b[8] is
               // passed as the luma multiplier.
    577 function sum_\type\()_lag3_\edge\()_neon
    578        AARCH64_SIGN_LINK_REGISTER
    579        str             x30, [sp, #-16]!
    580 .ifc \edge, left
               // At the left edge there is no previous block, so prime
               // v14/v17/v20 with the 8 entries directly above in each of the
               // three rows.
    581        sub             x11, x0,  #3*GRAIN_WIDTH*2
    582        sub             x12, x0,  #2*GRAIN_WIDTH*2
    583        sub             x13, x0,  #1*GRAIN_WIDTH*2
    584        ld1             {v14.8h}, [x11] // load the previous block right above
    585        ld1             {v17.8h}, [x12]
    586        ld1             {v20.8h}, [x13]
    587 .endif
    588        sum_lag_n_body  lag3, \type, \uv_layout, \edge, \elems, v30.b[8]
    589 endfunc
    590 .endm
    591 
        // The y variants must be instantiated, since several uv variants branch
        // to the sum_lag3_y_<edge>_start labels they define.
    592 sum_lag3_func y,      0,   left
    593 sum_lag3_func y,      0,   mid
    594 sum_lag3_func y,      0,   right, 7
    595 sum_lag3_func uv_444, 444, left
    596 sum_lag3_func uv_444, 444, mid
    597 sum_lag3_func uv_444, 444, right, 7
    598 sum_lag3_func uv_422, 422, left
    599 sum_lag3_func uv_422, 422, mid
    600 sum_lag3_func uv_422, 422, right, 1
    601 sum_lag3_func uv_420, 420, left
    602 sum_lag3_func uv_420, 420, mid
    603 sum_lag3_func uv_420, 420, right, 1
    604 
    605 function generate_grain_rows_neon
               // Fill w1 full-width rows with unfiltered grain.
               // In:  x0 = output pointer, w1 = number of rows (at least one row
               //      is always written), w2 = seed, x3 = gaussian sequence
               //      table, v31 = per-lane shift passed to srshl.
               // Each row is 80 entries from ten get_gaussian_neon calls plus 2
               // entries from get_grain_2, i.e. 82 entries (164 bytes).
               // Clobbers w16, x11-x15, v0; x0 and w1 are updated.
    606        AARCH64_SIGN_LINK_REGISTER
    607        str             x30, [sp, #-16]!
    608 1:
    609        mov             w16, #80
    610 2:
    611        bl              get_gaussian_neon
    612        srshl           v0.8h,   v0.8h,   v31.8h
               // st1 does not touch the flags, so b.gt tests the subs result.
    613        subs            w16, w16, #8
    614        st1             {v0.8h}, [x0], #16
    615        b.gt            2b
    616        get_grain_2     v0
    617        subs            w1,  w1,  #1
               // The two remaining entries are stored as one 32 bit element.
    618        st1             {v0.s}[0], [x0], #4
    619        b.gt            1b
    620        ldr             x30, [sp], #16
    621        AARCH64_VALIDATE_LINK_REGISTER
    622        ret
    623 endfunc
    624 
    625 function generate_grain_rows_44_neon
               // Fill w1 rows of 44 entries with unfiltered grain, keeping the
               // full GRAIN_WIDTH*2 byte row stride.
               // In:  x0 = output pointer, w1 = number of rows (at least one row
               //      is always written), w2 = seed, x3 = gaussian sequence
               //      table, v31 = per-lane shift passed to srshl.
               // Each row is 40 entries from five get_gaussian_neon calls plus 4
               // entries from get_grain_4.
               // Clobbers w16, x11-x15, v0; x0 and w1 are updated.
    626        AARCH64_SIGN_LINK_REGISTER
    627        str             x30, [sp, #-16]!
    628 1:
    629        mov             w16, #40
    630 2:
    631        bl              get_gaussian_neon
    632        srshl           v0.8h,   v0.8h,   v31.8h
    633        subs            w16, w16, #8
    634        st1             {v0.8h}, [x0], #16
    635        b.gt            2b
    636        get_grain_4     v0
    637        subs            w1,  w1,  #1
               // This store does not post-increment; x0 has advanced by 80 bytes
               // in the loop above, so the add below moves it to the next row.
               // Neither st1 nor add modifies the flags tested by b.gt.
    638        st1             {v0.4h}, [x0]
    639        add             x0,  x0,  #GRAIN_WIDTH*2-80
    640        b.gt            1b
    641        ldr             x30, [sp], #16
    642        AARCH64_VALIDATE_LINK_REGISTER
    643        ret
    644 endfunc
    645 
    646 function gen_grain_uv_444_lag0_neon
               // Produce 8 chroma grain entries for the lag 0 case: new grain plus
               // a scaled contribution from the co-located luma grain.
               // In:  x0 = output pointer, x19 = luma grain pointer,
               //      w2 = seed, x3 = gaussian sequence table,
               //      v1 = mask ANDed with the luma values,
               //      v27 = luma multiplier, v28 = per-lane shift for the product,
               //      v25 = upper clamp bound, v26 = lower clamp bound,
               //      v31 = per-lane shift for the new grain.
               // Clobbers x11-x15, v0, v2-v4; x0 and x19 are advanced by 16.
               // The two labels below are entry points for the subsampled
               // variants, which arrive with x30 already pushed and the luma
               // values in v4 (and, for the _add label, the grain in v0).
    647        AARCH64_SIGN_LINK_REGISTER
    648        str             x30, [sp, #-16]!
    649        ld1             {v4.8h}, [x19], #16
    650 gen_grain_uv_lag0_8_start:
    651        bl              get_gaussian_neon
    652        srshl           v0.8h,   v0.8h,   v31.8h
    653 gen_grain_uv_lag0_8_add:
    654        and             v4.16b,  v4.16b,  v1.16b
    655        smull           v2.4s,   v4.4h,   v27.4h
    656        smull2          v3.4s,   v4.8h,   v27.8h
    657        srshl           v2.4s,   v2.4s,   v28.4s
    658        srshl           v3.4s,   v3.4s,   v28.4s
               // Narrow back to 16 bit with signed saturation, add the new grain
               // with saturation, then clamp to [v26, v25].
    659        sqxtn           v2.4h,   v2.4s
    660        sqxtn2          v2.8h,   v3.4s
    661        sqadd           v2.8h,   v2.8h,   v0.8h
    662        smin            v2.8h,   v2.8h,   v25.8h
    663        smax            v2.8h,   v2.8h,   v26.8h
    664        st1             {v2.8h}, [x0], #16
    665        ldr             x30, [sp], #16
    666        AARCH64_VALIDATE_LINK_REGISTER
    667        ret
    668 endfunc
    669 
    670 function gen_grain_uv_420_lag0_8_neon
               // 4:2:0 front end for the 8 entry lag 0 path: reduce a 16x2 block
               // of luma grain at x19 to 8 values in v4, then continue at
               // gen_grain_uv_lag0_8_start, which also pops x30 and returns.
               // Clobbers x12, v16-v19; x19 is advanced by 32 bytes.
    671        AARCH64_SIGN_LINK_REGISTER
    672        add             x12, x19, #GRAIN_WIDTH*2
    673        str             x30, [sp, #-16]!
    674        ld1             {v16.8h, v17.8h}, [x19], #32
    675        ld1             {v18.8h, v19.8h}, [x12]
               // Pairwise add within each row, add the two rows, then a rounding
               // shift right by 2 gives the rounded 2x2 average.
    676        addp            v16.8h,  v16.8h,  v17.8h
    677        addp            v17.8h,  v18.8h,  v19.8h
    678        add             v16.8h,  v16.8h,  v17.8h
    679        srshr           v4.8h,   v16.8h,  #2
    680        b               gen_grain_uv_lag0_8_start
    681 endfunc
    682 
    683 function gen_grain_uv_422_lag0_8_neon
               // 4:2:2 front end for the 8 entry lag 0 path: reduce 16 luma grain
               // entries at x19 to 8 values in v4 (rounded average of each
               // horizontal pair), then continue at gen_grain_uv_lag0_8_start,
               // which also pops x30 and returns.
               // Clobbers v16, v17; x19 is advanced by 32 bytes.
    684        AARCH64_SIGN_LINK_REGISTER
    685        str             x30, [sp, #-16]!
    686        ld1             {v16.8h, v17.8h}, [x19], #32
    687        addp            v16.8h,  v16.8h,  v17.8h
    688        srshr           v4.8h,   v16.8h,  #1
    689        b               gen_grain_uv_lag0_8_start
    690 endfunc
    691 
    692 function gen_grain_uv_420_lag0_4_neon
               // 4:2:0 front end for the 4 entry lag 0 path: reduce an 8x2 block
               // of luma grain at x19 to 4 values in v4.4h, fetch 4 new grain
               // entries into v0, then continue at gen_grain_uv_lag0_8_add, which
               // also pops x30 and returns.
               // Clobbers x11-x15, v0, v16-v19; x19 is advanced by 32 bytes.
               // The link register is signed and saved first, in the same order
               // as the other lag 0 helpers.
    693        AARCH64_SIGN_LINK_REGISTER
    694        str             x30, [sp, #-16]!
    695        add             x12, x19, #GRAIN_WIDTH*2
               // Second row first; the two loads are independent of each other.
    696        ld1             {v18.4h, v19.4h}, [x12]
    697        ld1             {v16.4h, v17.4h}, [x19]
               // Only 16 bytes were read from the first row, but the pointer
               // moves on by 32 bytes.
    698        add             x19,  x19,  #32
               // Pairwise add within each row, add the two rows, then a rounding
               // shift right by 2 gives the rounded 2x2 average.
    699        addp            v16.4h,  v16.4h,  v17.4h
    700        addp            v17.4h,  v18.4h,  v19.4h
    701        add             v16.4h,  v16.4h,  v17.4h
    702        srshr           v4.4h,   v16.4h,  #2
    703        get_grain_4     v0
    704        b               gen_grain_uv_lag0_8_add
    705 endfunc
    706 
    707 function gen_grain_uv_422_lag0_4_neon
               // 4:2:2 front end for the 4 entry lag 0 path: reduce 8 luma grain
               // entries at x19 to 4 values in v4.4h (rounded average of each
               // horizontal pair), fetch 4 new grain entries into v0, then
               // continue at gen_grain_uv_lag0_8_add, which also pops x30 and
               // returns.
               // Clobbers x11-x15, v0, v16, v17; x19 is advanced by 32 bytes.
    708        AARCH64_SIGN_LINK_REGISTER
    709        str             x30, [sp, #-16]!
    710        ld1             {v16.4h, v17.4h}, [x19]
               // Only 16 bytes were read, but the pointer moves on by 32 bytes.
    711        add             x19,  x19,  #32
    712        addp            v16.4h,  v16.4h,  v17.4h
    713        srshr           v4.4h,   v16.4h,  #1
    714        get_grain_4     v0
    715        b               gen_grain_uv_lag0_8_add
    716 endfunc
    717 
    718 .macro gen_grain_82 type // Emits generate_grain_<type>_16bpc_neon (82-entry rows) and its lag jump table; instantiated for y and uv_444
    719 function generate_grain_\type\()_16bpc_neon, export=1
    720        AARCH64_SIGN_LINK_REGISTER
    721        stp             x30, x19, [sp, #-96]! // 96-byte frame; offsets 16..95 are only written by the lag3 path
    722 
    723 .ifc \type, uv_444
    724        mov             w13, w3 // w13 = 4th argument
    725        mov             w14, #28
    726        add             x19, x1,  #3*GRAIN_WIDTH*2 // x19 = 2nd argument + 3 rows of 2-byte entries
    727        mov             x1,  x2 // x1 = 3rd argument; the FGD_* offsets below are applied to it
    728        mul             w13, w13, w14 // w13 = 4th argument * 28, byte offset added to the coefficient pointer below
    729        clz             w15, w4 // leading zeros of the 5th argument
    730 .else
    731        clz             w15, w2 // leading zeros of the 3rd argument
    732 .endif
    733        movrel          x3,  X(gaussian_sequence) // x3 = address of the gaussian_sequence table
    734        sub             w15, w15, #24 // -bitdepth_min_8
    735        ldr             w2,  [x1, #FGD_SEED] // w2 = seed
    736        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
    737 .ifc \type, y
    738        add             x4,  x1,  #FGD_AR_COEFFS_Y // x4 = pointer to the AR coefficients
    739 .else
    740        add             x4,  x1,  #FGD_AR_COEFFS_UV // x4 = pointer to the AR coefficients
    741 .endif
    742        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
    743        movrel          x16, gen_grain_\type\()_tbl // x16 = jump table base
    744        ldr             w17, [x1, #FGD_AR_COEFF_LAG] // the lag value indexes the jump table
    745        add             w9,  w9,  #4
    746        ldrsw           x17, [x16, w17, uxtw #2] // signed 32-bit table entry: handler offset relative to the table
    747        dup             v31.8h,  w9    // 4 - bitdepth_min_8 + data->grain_scale_shift
    748        add             x16, x16, x17 // x16 = address of the lag handler
    749        neg             v31.8h,  v31.8h // v31 = negated shift amount in every lane
    750 
    751 .ifc \type, uv_444
    752        cmp             w13, #0
    753        mov             w11, #0x49d8
    754        mov             w14, #0xb524
    755        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
    756        csel            w11, w11, w14, ne // w11 = 0x49d8 if w13 != 0, else 0xb524
    757 .endif
    758 
    759        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
    760        neg             w15, w15            // bitdepth_min_8
    761        mov             w8,  #1
    762        mov             w10, #1
    763        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
    764        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
    765        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
    766        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
    767        mov             w5,  #128
    768        lsl             w5,  w5,  w15       //   128 << bitdepth_min_8
    769        neg             w6,  w5             // -(128 << bitdepth_min_8)
    770        sub             w5,  w5,  #1        //  (128 << bitdepth_min_8) - 1
    771 
    772 .ifc \type, uv_444
    773        eor             w2,  w2,  w11 // seed ^= the constant selected above
    774 .endif
    775 
    776        br              x16 // dispatch to the handler for this lag
    777 
    778 L(generate_grain_\type\()_lag0):
    779        AARCH64_VALID_JUMP_TARGET
    780 .ifc \type, y
    781        mov             w1,  #GRAIN_HEIGHT // NOTE(review): w1 is presumably the row count for the helper - confirm
    782        bl              generate_grain_rows_neon
    783 .else
    784        dup             v28.4s,  w7 // ar_coeff_shift in every lane (negated below)
    785        ld1r            {v27.8b}, [x4]      // ar_coeffs_uv[0]
    786        movi            v0.16b,  #0
    787        movi            v1.16b,  #255
    788        dup             v25.8h,  w5 // v25 = (128 << bitdepth_min_8) - 1
    789        dup             v26.8h,  w6 // v26 = -(128 << bitdepth_min_8)
    790        ext             v29.16b, v0.16b,  v1.16b,  #10 // v29 = 3 zero halfwords, then 5 all-ones halfwords
    791        ext             v30.16b, v1.16b,  v0.16b,  #2 // v30 = 7 all-ones halfwords, then 1 zero halfword
    792        neg             v28.4s,  v28.4s // v28 = -ar_coeff_shift
    793        sxtl            v27.8h,  v27.8b // sign-extend the coefficient to 16 bits
    794 
    795        mov             w1,  #3
    796        bl              generate_grain_rows_neon
    797        mov             w1,  #GRAIN_HEIGHT-3 // remaining rows, counted down by the loop below
    798 1:
    799        mov             v1.16b,  v29.16b // v1 for the first group; NOTE(review): presumably a lane mask read by the helper - confirm
    800        bl              gen_grain_uv_444_lag0_neon // 8
    801        movi            v1.16b,  #255 // v1 = all ones for the middle groups
    802        bl              gen_grain_uv_444_lag0_neon // 16
    803        bl              gen_grain_uv_444_lag0_neon // 24
    804        bl              gen_grain_uv_444_lag0_neon // 32
    805        bl              gen_grain_uv_444_lag0_neon // 40
    806        bl              gen_grain_uv_444_lag0_neon // 48
    807        bl              gen_grain_uv_444_lag0_neon // 56
    808        bl              gen_grain_uv_444_lag0_neon // 64
    809        bl              gen_grain_uv_444_lag0_neon // 72
    810        mov             v1.16b,  v30.16b // v1 for the last full group
    811        bl              gen_grain_uv_444_lag0_neon // 80
    812        get_grain_2     v16 // macro defined elsewhere; its result in v16 supplies the last 2 entries of the row
    813        subs            w1,  w1,  #1
    814        add             x19, x19, #4 // step x19 past the last 2 entries (4 bytes)
    815        st1             {v16.s}[0], [x0], #4 // store 4 bytes (2 entries) and advance x0
    816        b.gt            1b
    817 .endif
    818        ldp             x30, x19, [sp], #96
    819        AARCH64_VALIDATE_LINK_REGISTER
    820        ret
    821 
    822 L(generate_grain_\type\()_lag1):
    823        AARCH64_VALID_JUMP_TARGET
    824        ld1r            {v27.8b}, [x4], #1  // ar_coeffs_y[0]
    825        ld1r            {v28.8b}, [x4], #1  // ar_coeffs_y[1]
    826        ld1r            {v29.8b}, [x4]      // ar_coeffs_y[2]
    827 .ifc \type, y
    828        ldrsb           w4,  [x4, #1]       // ar_coeffs_y[3]
    829 .else
    830        add             x4,  x4,  #2 // x4 now points at coefficient index 4
    831 .endif
    832 
    833        mov             w1,  #3
    834 .ifc \type, uv_444
    835        ld1r            {v30.8b}, [x4]      // ar_coeffs_uv[4]
    836        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
    837 .endif
    838        bl              generate_grain_rows_neon
    839        sxtl            v27.8h,  v27.8b // sign-extend the coefficients to 16 bits
    840        sxtl            v28.8h,  v28.8b
    841        sxtl            v29.8h,  v29.8b
    842 .ifc \type, uv_444
    843        sxtl            v30.8h,  v30.8b
    844 .endif
    845 
    846        mov             w1,  #GRAIN_HEIGHT - 3 // remaining rows, counted down by the loop below
    847 1:
    848        bl              sum_\type\()_lag1_left_neon  // 8
    849        bl              sum_\type\()_lag1_mid_neon   // 16
    850        bl              sum_\type\()_lag1_mid_neon   // 24
    851        bl              sum_\type\()_lag1_mid_neon   // 32
    852        bl              sum_\type\()_lag1_mid_neon   // 40
    853        bl              sum_\type\()_lag1_mid_neon   // 48
    854        bl              sum_\type\()_lag1_mid_neon   // 56
    855        bl              sum_\type\()_lag1_mid_neon   // 64
    856        bl              sum_\type\()_lag1_mid_neon   // 72
    857        bl              sum_\type\()_lag1_right_neon // 80
    858        get_grain_2     v16 // macro defined elsewhere; its result in v16 supplies the last 2 entries of the row
    859        subs            w1,  w1,  #1
    860 .ifc \type, uv_444
    861        add             x19, x19, #4 // step x19 past the last 2 entries (4 bytes)
    862 .endif
    863        st1             {v16.s}[0], [x0], #4 // store 4 bytes (2 entries) and advance x0
    864        b.gt            1b
    865 
    866        ldp             x30, x19, [sp], #96
    867        AARCH64_VALIDATE_LINK_REGISTER
    868        ret
    869 
    870 L(generate_grain_\type\()_lag2):
    871        AARCH64_VALID_JUMP_TARGET
    872        ld1             {v30.16b}, [x4]     // ar_coeffs_y[0-11], ar_coeffs_uv[0-12]
    873 
    874        smov            w4,  v30.b[10] // w4 = sign-extended coefficient byte 10
    875        smov            w17, v30.b[11] // w17 = sign-extended coefficient byte 11
    876 
    877        mov             w1,  #3
    878        bl              generate_grain_rows_neon
    879 
    880        mov             w1,  #GRAIN_HEIGHT - 3 // remaining rows, counted down by the loop below
    881 1:
    882        bl              sum_\type\()_lag2_left_neon  // 8
    883        bl              sum_\type\()_lag2_mid_neon   // 16
    884        bl              sum_\type\()_lag2_mid_neon   // 24
    885        bl              sum_\type\()_lag2_mid_neon   // 32
    886        bl              sum_\type\()_lag2_mid_neon   // 40
    887        bl              sum_\type\()_lag2_mid_neon   // 48
    888        bl              sum_\type\()_lag2_mid_neon   // 56
    889        bl              sum_\type\()_lag2_mid_neon   // 64
    890        bl              sum_\type\()_lag2_mid_neon   // 72
    891        bl              sum_\type\()_lag2_right_neon // 80
    892        get_grain_2     v16 // macro defined elsewhere; its result in v16 supplies the last 2 entries of the row
    893        subs            w1,  w1,  #1
    894 .ifc \type, uv_444
    895        add             x19, x19, #4 // step x19 past the last 2 entries (4 bytes)
    896 .endif
    897        st1             {v16.s}[0], [x0], #4 // store 4 bytes (2 entries) and advance x0
    898        b.gt            1b
    899 
    900        ldp             x30, x19, [sp], #96
    901        AARCH64_VALIDATE_LINK_REGISTER
    902        ret
    903 
    904 L(generate_grain_\type\()_lag3):
    905        AARCH64_VALID_JUMP_TARGET
    906        ld1             {v29.16b, v30.16b}, [x4] // ar_coeffs_y[0-23], ar_coeffs_uv[0-24]
    907        stp             d8,  d9,  [sp, #16] // save d8-d15 and x20/x21 in the frame reserved at entry
    908        stp             d10, d11, [sp, #32]
    909        stp             d12, d13, [sp, #48]
    910        stp             d14, d15, [sp, #64]
    911        stp             x20, x21, [sp, #80]
    912 
    913        smov            w4,  v30.b[5] // sign-extended coefficient bytes 21, 22 and 23 (v30 holds bytes 16-31)
    914        smov            w20, v30.b[6]
    915        smov            w21, v30.b[7]
    916 
    917        mov             w1,  #3
    918        bl              generate_grain_rows_neon
    919 
    920        mov             w1,  #GRAIN_HEIGHT - 3 // remaining rows, counted down by the loop below
    921 1:
    922        bl              sum_\type\()_lag3_left_neon  // 8
    923        bl              sum_\type\()_lag3_mid_neon   // 16
    924        bl              sum_\type\()_lag3_mid_neon   // 24
    925        bl              sum_\type\()_lag3_mid_neon   // 32
    926        bl              sum_\type\()_lag3_mid_neon   // 40
    927        bl              sum_\type\()_lag3_mid_neon   // 48
    928        bl              sum_\type\()_lag3_mid_neon   // 56
    929        bl              sum_\type\()_lag3_mid_neon   // 64
    930        bl              sum_\type\()_lag3_mid_neon   // 72
    931        bl              sum_\type\()_lag3_right_neon // 80
    932        get_grain_2     v16 // macro defined elsewhere; its result in v16 supplies the last 2 entries of the row
    933        subs            w1,  w1,  #1
    934 .ifc \type, uv_444
    935        add             x19, x19, #4 // step x19 past the last 2 entries (4 bytes)
    936 .endif
    937        st1             {v16.s}[0], [x0], #4 // store 4 bytes (2 entries) and advance x0
    938        b.gt            1b
    939 
    940        ldp             x20, x21, [sp, #80] // restore the registers saved at the top of the lag3 path
    941        ldp             d14, d15, [sp, #64]
    942        ldp             d12, d13, [sp, #48]
    943        ldp             d10, d11, [sp, #32]
    944        ldp             d8,  d9,  [sp, #16]
    945        ldp             x30, x19, [sp], #96
    946        AARCH64_VALIDATE_LINK_REGISTER
    947        ret
    948 endfunc
    949 
    950 jumptable gen_grain_\type\()_tbl // 32-bit offsets of the four lag handlers, relative to the table itself
    951        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
    952        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
    953        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
    954        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
    955 endjumptable
    956 .endm
    957 
    958 gen_grain_82 y // defines generate_grain_y_16bpc_neon and gen_grain_y_tbl
    959 gen_grain_82 uv_444 // defines generate_grain_uv_444_16bpc_neon and gen_grain_uv_444_tbl
    960 
    961 .macro set_height dst, type // Loads the loop row count into dst: grain height minus the 3 rows generated before the loop
    962 .ifc \type, uv_420
    963        mov             \dst,  #SUB_GRAIN_HEIGHT-3 // uv_420 uses the subsampled height (38 - 3)
    964 .else
    965        mov             \dst,  #GRAIN_HEIGHT-3 // every other type uses the full height (73 - 3)
    966 .endif
    967 .endm
    968 
    969 .macro increment_y_ptr reg, type // Moves reg from the end of one processed row to the start of the next source row; 6*32 is the per-row advance of the lag0 helpers (NOTE(review): presumably also of the sum_* helpers - confirm)
    970 .ifc \type, uv_420
    971        add             \reg, \reg, #2*GRAIN_WIDTH*2-(6*32) // uv_420: forward two rows, minus the 6*32 bytes already advanced
    972 .else
    973        sub             \reg, \reg, #6*32-GRAIN_WIDTH*2 // otherwise: forward one row, minus the 6*32 bytes already advanced (net -28 bytes)
    974 .endif
    975 .endm
    976 
    977 .macro gen_grain_44 type // Emits generate_grain_<type>_16bpc_neon (44-entry rows) and its lag jump table; instantiated for uv_420 and uv_422
    978 function generate_grain_\type\()_16bpc_neon, export=1
    979        AARCH64_SIGN_LINK_REGISTER
    980        stp             x30, x19, [sp, #-96]! // 96-byte frame; offsets 16..95 are only written by the lag3 path
    981 
    982        mov             w13, w3 // w13 = 4th argument
    983        mov             w14, #28
    984        add             x19, x1,  #(3*GRAIN_WIDTH-3)*2 // x19 = 2nd argument + 3 rows - 3 entries (2 bytes each)
    985        mov             x1,  x2 // x1 = 3rd argument; the FGD_* offsets below are applied to it
    986        mul             w13, w13, w14 // w13 = 4th argument * 28, byte offset added to the coefficient pointer below
    987        clz             w15, w4 // leading zeros of the 5th argument
    988 
    989        movrel          x3,  X(gaussian_sequence) // x3 = address of the gaussian_sequence table
    990        sub             w15, w15, #24 // -bitdepth_min_8
    991        ldr             w2,  [x1, #FGD_SEED] // w2 = seed
    992        ldr             w9,  [x1, #FGD_GRAIN_SCALE_SHIFT]
    993        add             x4,  x1,  #FGD_AR_COEFFS_UV // x4 = pointer to the AR coefficients
    994        add             w9,  w9,  w15 // grain_scale_shift - bitdepth_min_8
    995        movrel          x16, gen_grain_\type\()_tbl // x16 = jump table base
    996        ldr             w17, [x1, #FGD_AR_COEFF_LAG] // the lag value indexes the jump table
    997        add             w9,  w9,  #4
    998        ldrsw           x17, [x16, w17, uxtw #2] // signed 32-bit table entry: handler offset relative to the table
    999        dup             v31.8h,  w9    // 4 - bitdepth_min_8 + data->grain_scale_shift
   1000        add             x16, x16, x17 // x16 = address of the lag handler
   1001        neg             v31.8h,  v31.8h // v31 = negated shift amount in every lane
   1002 
   1003        cmp             w13, #0
   1004        mov             w11, #0x49d8
   1005        mov             w14, #0xb524
   1006        add             x4,  x4,  w13, uxtw // Add offset to ar_coeffs_uv[1]
   1007        csel            w11, w11, w14, ne // w11 = 0x49d8 if w13 != 0, else 0xb524
   1008 
   1009        ldr             w7,  [x1, #FGD_AR_COEFF_SHIFT]
   1010        neg             w15, w15            // bitdepth_min_8
   1011        mov             w8,  #1
   1012        mov             w10, #1
   1013        lsl             w8,  w8,  w7        // 1 << ar_coeff_shift
   1014        lsl             w10, w10, w9        // 1 << (4 + data->grain_scale_shift)
   1015        lsr             w8,  w8,  #1        // 1 << (ar_coeff_shift - 1)
   1016        lsr             w10, w10, #1        // 1 << (4 + data->grain_scale_shift - 1)
   1017        mov             w5,  #128
   1018        lsl             w5,  w5,  w15       //   128 << bitdepth_min_8
   1019        neg             w6,  w5             // -(128 << bitdepth_min_8)
   1020        sub             w5,  w5,  #1        //  (128 << bitdepth_min_8) - 1
   1021 
   1022        eor             w2,  w2,  w11 // seed ^= the constant selected above
   1023 
   1024        br              x16 // dispatch to the handler for this lag
   1025 
   1026 L(generate_grain_\type\()_lag0):
   1027        AARCH64_VALID_JUMP_TARGET
   1028        dup             v28.4s,  w7 // ar_coeff_shift in every lane (negated below)
   1029        ld1r            {v27.8b}, [x4]      // ar_coeffs_uv[0]
   1030        movi            v0.16b,  #0
   1031        movi            v1.16b,  #255
   1032        dup             v25.8h,  w5 // v25 = (128 << bitdepth_min_8) - 1
   1033        dup             v26.8h,  w6 // v26 = -(128 << bitdepth_min_8)
   1034        ext             v29.16b, v0.16b,  v1.16b,  #10 // v29 = 3 zero halfwords, then 5 all-ones halfwords
   1035        ext             v30.16b, v1.16b,  v0.16b,  #14 // v30 = 1 all-ones halfword, then 7 zero halfwords
   1036        neg             v28.4s,  v28.4s // v28 = -ar_coeff_shift
   1037        sxtl            v27.8h,  v27.8b // sign-extend the coefficient to 16 bits
   1038 
   1039        mov             w1,  #3
   1040        bl              generate_grain_rows_44_neon
   1041        set_height      w1,  \type
   1042 1:
   1043        mov             v1.16b,  v29.16b // v1 for the first group; NOTE(review): presumably a lane mask read by the helper - confirm
   1044        bl              gen_grain_\type\()_lag0_8_neon // 8
   1045        movi            v1.16b,  #255 // v1 = all ones for the middle groups
   1046        bl              gen_grain_\type\()_lag0_8_neon // 16
   1047        bl              gen_grain_\type\()_lag0_8_neon // 24
   1048        bl              gen_grain_\type\()_lag0_8_neon // 32
   1049        bl              gen_grain_\type\()_lag0_8_neon // 40
   1050        mov             v1.16b,  v30.16b // v1 for the final 4-entry group
   1051        bl              gen_grain_\type\()_lag0_4_neon // 44
   1052        subs            w1,  w1,  #1
   1053        increment_y_ptr x19, \type
   1054        add             x0,  x0,  #GRAIN_WIDTH*2-6*16 // advance x0 to the next row; NOTE(review): presumably each helper call moved it 16 bytes - confirm
   1055        b.gt            1b
   1056 
   1057        ldp             x30, x19, [sp], #96
   1058        AARCH64_VALIDATE_LINK_REGISTER
   1059        ret
   1060 
   1061 L(generate_grain_\type\()_lag1):
   1062        AARCH64_VALID_JUMP_TARGET
   1063        ld1r            {v27.8b}, [x4], #1  // ar_coeffs_uv[0]
   1064        ld1r            {v28.8b}, [x4], #1  // ar_coeffs_uv[1]
   1065        ld1r            {v29.8b}, [x4]      // ar_coeffs_uv[2]
   1066        add             x4,  x4,  #2 // x4 now points at coefficient index 4
   1067 
   1068        mov             w1,  #3
   1069        ld1r            {v30.8b}, [x4]      // ar_coeffs_uv[4]
   1070        ldursb          w4,  [x4, #-1]      // ar_coeffs_uv[3]
   1071        bl              generate_grain_rows_44_neon
   1072 
   1073        sxtl            v27.8h,  v27.8b // sign-extend the coefficients to 16 bits
   1074        sxtl            v28.8h,  v28.8b
   1075        sxtl            v29.8h,  v29.8b
   1076        sxtl            v30.8h,  v30.8b
   1077        set_height      w1,  \type
   1078 1:
   1079        bl              sum_\type\()_lag1_left_neon  // 8
   1080        bl              sum_\type\()_lag1_mid_neon   // 16
   1081        bl              sum_\type\()_lag1_mid_neon   // 24
   1082        bl              sum_\type\()_lag1_mid_neon   // 32
   1083        bl              sum_\type\()_lag1_mid_neon   // 40
   1084        bl              sum_\type\()_lag1_right_neon // 44
   1085        subs            w1,  w1,  #1
   1086        increment_y_ptr x19, \type
   1087        add             x0,  x0,  #GRAIN_WIDTH*2-6*16 // advance x0 to the next row; NOTE(review): presumably each helper call moved it 16 bytes - confirm
   1088        b.gt            1b
   1089 
   1090        ldp             x30, x19, [sp], #96
   1091        AARCH64_VALIDATE_LINK_REGISTER
   1092        ret
   1093 
   1094 L(generate_grain_\type\()_lag2):
   1095        AARCH64_VALID_JUMP_TARGET
   1096        ld1             {v30.16b}, [x4]     // ar_coeffs_uv[0-12]
   1097 
   1098        smov            w4,  v30.b[10] // w4 = sign-extended coefficient byte 10
   1099        smov            w17, v30.b[11] // w17 = sign-extended coefficient byte 11
   1100 
   1101        mov             w1,  #3
   1102        bl              generate_grain_rows_44_neon
   1103 
   1104        set_height      w1,  \type
   1105 1:
   1106        bl              sum_\type\()_lag2_left_neon  // 8
   1107        bl              sum_\type\()_lag2_mid_neon   // 16
   1108        bl              sum_\type\()_lag2_mid_neon   // 24
   1109        bl              sum_\type\()_lag2_mid_neon   // 32
   1110        bl              sum_\type\()_lag2_mid_neon   // 40
   1111        bl              sum_\type\()_lag2_right_neon // 44
   1112        subs            w1,  w1,  #1
   1113        increment_y_ptr x19, \type
   1114        add             x0,  x0,  #GRAIN_WIDTH*2-6*16 // advance x0 to the next row; NOTE(review): presumably each helper call moved it 16 bytes - confirm
   1115        b.gt            1b
   1116 
   1117        ldp             x30, x19, [sp], #96
   1118        AARCH64_VALIDATE_LINK_REGISTER
   1119        ret
   1120 
   1121 L(generate_grain_\type\()_lag3):
   1122        AARCH64_VALID_JUMP_TARGET
   1123        ldr             q29,      [x4]      // ar_coeffs_uv[0-15]
   1124        ldr             q30,      [x4, #16] // ar_coeffs_uv[16-24]
   1125        stp             d8,  d9,  [sp, #16] // save d8-d15 and x20/x21 in the frame reserved at entry
   1126        stp             d10, d11, [sp, #32]
   1127        stp             d12, d13, [sp, #48]
   1128        stp             d14, d15, [sp, #64]
   1129        stp             x20, x21, [sp, #80]
   1130 
   1131        smov            w4,  v30.b[5] // sign-extended coefficient bytes 21, 22 and 23 (v30 holds bytes 16-31)
   1132        smov            w20, v30.b[6]
   1133        smov            w21, v30.b[7]
   1134 
   1135        mov             w1,  #3
   1136        bl              generate_grain_rows_44_neon
   1137 
   1138        set_height      w1,  \type
   1139 1:
   1140        bl              sum_\type\()_lag3_left_neon  // 8
   1141        bl              sum_\type\()_lag3_mid_neon   // 16
   1142        bl              sum_\type\()_lag3_mid_neon   // 24
   1143        bl              sum_\type\()_lag3_mid_neon   // 32
   1144        bl              sum_\type\()_lag3_mid_neon   // 40
   1145        bl              sum_\type\()_lag3_right_neon // 44
   1146        subs            w1,  w1,  #1
   1147        increment_y_ptr x19, \type
   1148        add             x0,  x0,  #GRAIN_WIDTH*2-6*16 // advance x0 to the next row; NOTE(review): presumably each helper call moved it 16 bytes - confirm
   1149        b.gt            1b
   1150 
   1151        ldp             x20, x21, [sp, #80] // restore the registers saved at the top of the lag3 path
   1152        ldp             d14, d15, [sp, #64]
   1153        ldp             d12, d13, [sp, #48]
   1154        ldp             d10, d11, [sp, #32]
   1155        ldp             d8,  d9,  [sp, #16]
   1156        ldp             x30, x19, [sp], #96
   1157        AARCH64_VALIDATE_LINK_REGISTER
   1158        ret
   1159 endfunc
   1160 
   1161 jumptable gen_grain_\type\()_tbl // 32-bit offsets of the four lag handlers, relative to the table itself
   1162        .word L(generate_grain_\type\()_lag0) - gen_grain_\type\()_tbl
   1163        .word L(generate_grain_\type\()_lag1) - gen_grain_\type\()_tbl
   1164        .word L(generate_grain_\type\()_lag2) - gen_grain_\type\()_tbl
   1165        .word L(generate_grain_\type\()_lag3) - gen_grain_\type\()_tbl
   1166 endjumptable
   1167 .endm
   1168 
   1169 gen_grain_44 uv_420 // defines generate_grain_uv_420_16bpc_neon and gen_grain_uv_420_tbl
   1170 gen_grain_44 uv_422 // defines generate_grain_uv_422_16bpc_neon and gen_grain_uv_422_tbl
   1171 
   1172 .macro gather_interleaved dst1, dst2, src1, src2, off // Byte table lookup at base x3: even lanes index from src1 into dst1, odd lanes from src2 into dst2, written to byte lanes off+0..off+7; clobbers x14-x17
   1173        umov            w14, \src1[0] // extract index lane 0 (zero-extended)
   1174        umov            w15, \src2[1]
   1175        umov            w16, \src1[2]
   1176        add             x14, x14, x3 // address = table base + index
   1177        umov            w17, \src2[3]
   1178        add             x15, x15, x3
   1179        ld1             {\dst1}[0+\off], [x14] // load one table byte into the matching destination lane
   1180        umov            w14, \src1[4] // x14-x17 are reused for lanes 4-7, interleaved with the loads
   1181        add             x16, x16, x3
   1182        ld1             {\dst2}[1+\off], [x15]
   1183        umov            w15, \src2[5]
   1184        add             x17, x17, x3
   1185        ld1             {\dst1}[2+\off], [x16]
   1186        umov            w16, \src1[6]
   1187        add             x14, x14, x3
   1188        ld1             {\dst2}[3+\off], [x17]
   1189        umov            w17, \src2[7]
   1190        add             x15, x15, x3
   1191        ld1             {\dst1}[4+\off], [x14]
   1192        add             x16, x16, x3
   1193        ld1             {\dst2}[5+\off], [x15]
   1194        add             x17, x17, x3
   1195        ld1             {\dst1}[6+\off], [x16]
   1196        ld1             {\dst2}[7+\off], [x17]
   1197 .endm
   1198 
   1199 .macro gather dst1, dst2, src1, src2, src3, src4 // 32 byte lookups at base x3: dst1 = table[src1 lanes, src2 lanes], dst2 = table[src3 lanes, src4 lanes]
   1200        gather_interleaved \dst1, \dst2, \src1, \src3, 0 // even lanes of src1 -> dst1, odd lanes of src3 -> dst2
   1201        gather_interleaved \dst2, \dst1, \src3, \src1, 0 // even lanes of src3 -> dst2, odd lanes of src1 -> dst1
   1202        gather_interleaved \dst1, \dst2, \src2, \src4, 8 // same pattern for src2/src4 into byte lanes 8-15
   1203        gather_interleaved \dst2, \dst1, \src4, \src2, 8
   1204 .endm
   1205 
   1206 function gather32_neon // Looks up 32 table bytes at base x3: indices in v0-v3 (halfwords) -> v6 (for v0, v1) and v7 (for v2, v3); clobbers x14-x17
   1207        gather          v6.b, v7.b, v0.h, v1.h, v2.h, v3.h
   1208        ret
   1209 endfunc
   1210 
   1211 function gather16_neon // Looks up 16 table bytes at base x3: indices in v0, v1 (halfwords) -> v6; clobbers x14-x17 and the low half of v7
   1212        gather_interleaved v6.b, v7.b, v0.h, v1.h, 0 // even lanes of v0 -> v6, odd lanes of v1 -> v7
   1213        gather_interleaved v7.b, v6.b, v1.h, v0.h, 0 // even lanes of v1 -> v7, odd lanes of v0 -> v6
   1214        ins             v6.d[1], v7.d[0] // append the 8 bytes for v1 to the upper half of v6
   1215        ret
   1216 endfunc
   1217 
   1218 const overlap_coeffs_0, align=4 // Overlap blend weights loaded into v27/v28 by fgy_32x32_16bpc_neon
   1219        .short 27, 17, 0,  0 // first 4 halfwords (v27): weights applied to the old/top grain
   1220        .short 17, 27, 32, 32 // next 4 halfwords (v28): weights applied to the current grain
   1221 endconst
   1222 
   1223 const overlap_coeffs_1, align=4 // Second overlap weight table, same 2 x 4 halfword layout; NOTE(review): presumably for the horizontally subsampled chroma path - confirm
   1224        .short 23, 0,  0,  0
   1225        .short 22, 32, 32, 32
   1226 endconst
   1227 
   1228 .macro calc_offset offx, offy, src, sx, sy // Splits the packed random value src into x and y offsets; each is doubled when the matching sx/sy argument is 0
   1229        and             \offy, \src,  #0xF     // randval & 0xF
   1230        lsr             \offx, \src,  #4       // randval >> 4
   1231 .if \sy == 0
   1232        add             \offy, \offy, \offy    // 2 * (randval & 0xF)
   1233 .endif
   1234 .if \sx == 0
   1235        add             \offx, \offx, \offx    // 2 * (randval >> 4)
   1236 .endif
   1237 .endm
   1238 
   1239 .macro add_offset dst, offx, offy, src, stride // dst = src + stride * offy + 2 * offx (offx is a 32-bit register, scaled for 2-byte entries)
   1240        madd            \dst, \stride, \offy, \src // grain_lut += grain_stride * offy
   1241        add             \dst, \dst, \offx, uxtw #1 // grain_lut += offx
   1242 .endm
   1243 
   1244 // void dav1d_fgy_32x32_16bpc_neon(pixel *const dst, const pixel *const src,
   1245 //                                 const ptrdiff_t stride,
   1246 //                                 const uint8_t scaling[SCALING_SIZE],
   1247 //                                 const int scaling_shift,
   1248 //                                 const entry grain_lut[][GRAIN_WIDTH],
   1249 //                                 const int offsets[][2],
   1250 //                                 const int h, const ptrdiff_t clip,
   1251 //                                 const ptrdiff_t type,
   1252 //                                 const int bitdepth_max);
   1253 function fgy_32x32_16bpc_neon, export=1 // Setup for the luma row loops: x0-x7 hold dst..h, the last three arguments are read from the stack; ends by jumping into fgy_loop_tbl
   1254        AARCH64_SIGN_LINK_REGISTER
   1255        str             x30, [sp, #-80]! // 80-byte frame: x30 plus d8-d14; released by the loop code this function jumps to
   1256        stp             d8,  d9,  [sp, #16]
   1257        stp             d10, d11, [sp, #32]
   1258        stp             d12, d13, [sp, #48]
   1259        str             d14,      [sp, #64]
   1260        eor             w4,  w4,  #15          // 15 - scaling_shift
   1261        ldr             w11, [x6, #8]          // offsets[1][0]
   1262        ldr             w13, [x6, #4]          // offsets[0][1]
   1263        ldr             w15, [x6, #12]         // offsets[1][1]
   1264        ldr             w10, [sp, #96]         // bitdepth_max
   1265        ldr             w6,  [x6]              // offsets[0][0]
   1266        dup             v26.8h,  w10           // bitdepth_max
   1267        clz             w10, w10
   1268        ldr             w8,  [sp, #80]         // clip
   1269        sub             w10, w10, #24          // -bitdepth_min_8
   1270        mov             x9,  #GRAIN_WIDTH*2    // grain_lut stride
   1271        neg             w10, w10               // bitdepth_min_8
   1272 
   1273        dup             v29.8h,  w4            // 15 - scaling_shift
   1274        dup             v27.8h,  w10           // bitdepth_min_8
   1275 
   1276        movrel          x16, overlap_coeffs_0
   1277 
   1278        cbz             w8,  1f
   1279        // clip
   1280        movi            v30.8h,  #16
   1281        movi            v31.8h,  #235
   1282        sshl            v30.8h,  v30.8h,  v27.8h // lower bound = 16 << bitdepth_min_8
   1283        sshl            v31.8h,  v31.8h,  v27.8h // upper bound = 235 << bitdepth_min_8
   1284        b               2f
   1285 1:
   1286        // no clip
   1287        movi            v30.8h,  #0
   1288        mov             v31.16b, v26.16b       // bitdepth_max
   1289 2:
   1290 
   1291        ushr            v26.8h,  v26.8h,  #1   // grain_max
   1292        not             v25.16b, v26.16b       // grain_min
   1293 
   1294        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
   1295 
   1296        add             x5,  x5,  #18          // grain_lut += 9
   1297        add             x5,  x5,  x9,  lsl #3  // grain_lut += 8 * grain_stride
   1298        add             x5,  x5,  x9           // grain_lut += grain_stride
   1299 
   1300        calc_offset     w11, w12, w11, 0,  0 // (offx, offy) pairs for offsets[1][0], [0][1], [1][1] and [0][0]
   1301        calc_offset     w13, w14, w13, 0,  0
   1302        calc_offset     w15, w16, w15, 0,  0
   1303        calc_offset     w6,  w10, w6,  0,  0
   1304 
   1305        add_offset      x12, w11, x12, x5,  x9 // turn each pair into a pointer relative to the adjusted grain_lut in x5
   1306        add_offset      x14, w13, x14, x5,  x9
   1307        add_offset      x16, w15, x16, x5,  x9
   1308        add_offset      x5,  w6,  x10, x5,  x9 // x5 itself is updated last since the other three read it
   1309 
   1310        ldr             w11, [sp, #88]         // type
   1311        movrel          x13, fgy_loop_tbl
   1312 
   1313        add             x4,  x12, #32*2        // grain_lut += FG_BLOCK_SIZE * bx
   1314        add             x6,  x14, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
   1315 
   1316        tst             w11, #1 // bit 0 of type selects the y overlap setup; tested by the b.eq below
   1317        ldrsw           x11, [x13, w11, uxtw #2] // signed 32-bit table entry for this type
   1318 
   1319        add             x8,  x16, x9,  lsl #5  // grain_lut += grain_stride * FG_BLOCK_SIZE * by
   1320        add             x8,  x8,  #32*2        // grain_lut += FG_BLOCK_SIZE * bx
   1321 
   1322        add             x11, x13, x11 // x11 = address of the selected loop
   1323 
   1324        b.eq            1f
   1325        // y overlap
   1326        dup             v8.8h,   v27.h[0] // weight for the top grain on the first overlap row
   1327        dup             v9.8h,   v27.h[1] // weight for the current grain on the first overlap row
   1328        mov             w10, w7                // backup actual h
   1329        mov             w7,  #2 // the y overlap loop runs for 2 rows
   1330 1:
   1331        br              x11 // jump to the selected loop
   1332 endfunc
   1333 
   1334 function fgy_loop_neon
   1335 .macro fgy ox, oy // Emits one row loop for fgy_32x32: ox enables the blend with the old grain block, oy the blend with the top grain block
   1336 L(loop_\ox\oy):
   1337        AARCH64_VALID_JUMP_TARGET
   1338 1:
   1339        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x1],  x2 // src
   1340 .if \ox
   1341        ld1             {v20.4h},                         [x4],  x9 // grain_lut old
   1342 .endif
   1343 .if \oy
   1344        ld1             {v21.8h, v22.8h, v23.8h, v24.8h}, [x6],  x9 // grain_lut top
   1345 .endif
   1346 .if \ox && \oy
   1347        ld1             {v14.4h},                         [x8],  x9 // grain_lut top old
   1348 .endif
   1349        mvni            v4.8h,   #0xf0, lsl #8 // 0x0fff
   1350        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x9 // grain_lut
   1351 
   1352        // Make sure that uninitialized pixels out of range past the right
   1353        // edge are in range; their actual values shouldn't matter.
   1354        and             v0.16b,  v0.16b,  v4.16b
   1355        and             v1.16b,  v1.16b,  v4.16b
   1356        and             v2.16b,  v2.16b,  v4.16b
   1357        and             v3.16b,  v3.16b,  v4.16b
   1358        bl              gather32_neon // v6/v7 = scaling bytes (base x3) indexed by the 32 masked pixels; overwrites x30, which is reloaded from the stack before ret
   1359 
   1360 .if \ox
   1361        smull           v20.4s,  v20.4h,  v27.4h // old grain * v27 weights
   1362        smlal           v20.4s,  v16.4h,  v28.4h // + current grain * v28 weights (first 4 lanes only)
   1363 .endif
   1364 
   1365 .if \oy
   1366 .if \ox
   1367        smull           v14.4s,  v14.4h,  v27.4h // the same x blend applied to the top row: top old * v27
   1368        smlal           v14.4s,  v21.4h,  v28.4h // + top * v28
   1369        sqrshrn         v20.4h,  v20.4s,  #5 // rounding shift right by 5 with saturating narrow
   1370        sqrshrn         v14.4h,  v14.4s,  #5
   1371        smin            v20.4h,  v20.4h,  v26.4h // clamp to [grain_min, grain_max]
   1372        smin            v14.4h,  v14.4h,  v26.4h
   1373        smax            v20.4h,  v20.4h,  v25.4h
   1374        smax            v14.4h,  v14.4h,  v25.4h
   1375 .endif
   1376 
   1377 .if \ox
   1378        smull           v10.4s,  v20.4h,  v9.4h // first 4 lanes use the x-blended current grain
   1379 .else
   1380        smull           v10.4s,  v16.4h,  v9.4h
   1381 .endif
   1382        smull2          v11.4s,  v16.8h,  v9.8h // current grain * v9 for the remaining lanes
   1383        smull           v12.4s,  v17.4h,  v9.4h
   1384        smull2          v13.4s,  v17.8h,  v9.8h
   1385        smull           v16.4s,  v18.4h,  v9.4h
   1386        smull2          v17.4s,  v18.8h,  v9.8h
   1387        smull           v18.4s,  v19.4h,  v9.4h
   1388        smull2          v19.4s,  v19.8h,  v9.8h
   1389 .if \ox
   1390        smlal           v10.4s,  v14.4h,  v8.4h // first 4 lanes use the x-blended top grain
   1391 .else
   1392        smlal           v10.4s,  v21.4h,  v8.4h
   1393 .endif
   1394        smlal2          v11.4s,  v21.8h,  v8.8h // + top grain * v8 for the remaining lanes
   1395        smlal           v12.4s,  v22.4h,  v8.4h
   1396        smlal2          v13.4s,  v22.8h,  v8.8h
   1397        smlal           v16.4s,  v23.4h,  v8.4h
   1398        smlal2          v17.4s,  v23.8h,  v8.8h
   1399        smlal           v18.4s,  v24.4h,  v8.4h
   1400        smlal2          v19.4s,  v24.8h,  v8.8h
   1401        sqrshrn         v10.4h,  v10.4s,  #5 // rounding shift right by 5 with saturating narrow
   1402        sqrshrn2        v10.8h,  v11.4s,  #5
   1403        sqrshrn         v11.4h,  v12.4s,  #5
   1404        sqrshrn2        v11.8h,  v13.4s,  #5
   1405        sqrshrn         v12.4h,  v16.4s,  #5
   1406        sqrshrn2        v12.8h,  v17.4s,  #5
   1407        sqrshrn         v13.4h,  v18.4s,  #5
   1408        sqrshrn2        v13.8h,  v19.4s,  #5
   1409        smin            v16.8h,  v10.8h,  v26.8h // clamp to [grain_min, grain_max]; v16-v19 hold the final grain again
   1410        smin            v17.8h,  v11.8h,  v26.8h
   1411        smin            v18.8h,  v12.8h,  v26.8h
   1412        smin            v19.8h,  v13.8h,  v26.8h
   1413        smax            v16.8h,  v16.8h,  v25.8h
   1414        smax            v17.8h,  v17.8h,  v25.8h
   1415        smax            v18.8h,  v18.8h,  v25.8h
   1416        smax            v19.8h,  v19.8h,  v25.8h
   1417 .endif
   1418 
   1419        uxtl            v4.8h,   v6.8b            // scaling
   1420 .if \ox && !\oy
   1421        sqrshrn         v20.4h,  v20.4s,  #5 // x-only blend: narrow, clamp and insert, interleaved with the scaling widening
   1422 .endif
   1423        uxtl2           v5.8h,   v6.16b
   1424 .if \ox && !\oy
   1425        smin            v20.4h,  v20.4h,  v26.4h
   1426 .endif
   1427        uxtl            v6.8h,   v7.8b
   1428 .if \ox && !\oy
   1429        smax            v20.4h,  v20.4h,  v25.4h
   1430 .endif
   1431        uxtl2           v7.8h,   v7.16b
   1432 .if \ox && !\oy
   1433        ins             v16.d[0], v20.d[0] // replace the first 4 grain values with the blended ones
   1434 .endif
   1435        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
   1436        ushl            v5.8h,   v5.8h,   v29.8h
   1437        ushl            v6.8h,   v6.8h,   v29.8h
   1438        ushl            v7.8h,   v7.8h,   v29.8h
   1439 
   1440        sqrdmulh        v20.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift) * grain, 15)
   1441        sqrdmulh        v21.8h,  v17.8h,  v5.8h
   1442        sqrdmulh        v22.8h,  v18.8h,  v6.8h
   1443        sqrdmulh        v23.8h,  v19.8h,  v7.8h
   1444 
   1445        usqadd          v0.8h,   v20.8h           // *src + noise
   1446        usqadd          v1.8h,   v21.8h
   1447        usqadd          v2.8h,   v22.8h
   1448        usqadd          v3.8h,   v23.8h
   1449 
   1450        umax            v0.8h,   v0.8h,   v30.8h // clamp to the [v30, v31] pixel range set up by the caller
   1451        umax            v1.8h,   v1.8h,   v30.8h
   1452        umax            v2.8h,   v2.8h,   v30.8h
   1453        umax            v3.8h,   v3.8h,   v30.8h
   1454        umin            v0.8h,   v0.8h,   v31.8h
   1455        umin            v1.8h,   v1.8h,   v31.8h
   1456        umin            v2.8h,   v2.8h,   v31.8h
   1457        umin            v3.8h,   v3.8h,   v31.8h
   1458 
   1459        subs            w7,  w7,  #1
   1460 .if \oy
   1461        dup             v8.8h,   v28.h[0] // switch to the second-row y weights for the next iteration
   1462        dup             v9.8h,   v28.h[1]
   1463 .endif
   1464        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x2 // dst
   1465        b.gt            1b
   1466 
   1467 .if \oy
   1468        cmp             w10, #2 // w10 = the actual h backed up by the caller
   1469        sub             w7,  w10, #2           // restore actual remaining h
   1470        b.gt            L(loop_\ox\()0) // more than 2 rows: continue in the loop variant without y overlap
   1471 .endif
   1472        ldr             d14,      [sp, #64] // restore the registers saved by fgy_32x32_16bpc_neon and release its frame
   1473        ldp             d12, d13, [sp, #48]
   1474        ldp             d10, d11, [sp, #32]
   1475        ldp             d8,  d9,  [sp, #16]
   1476        ldr             x30, [sp], #80
   1477        AARCH64_VALIDATE_LINK_REGISTER
   1478        ret
   1479 .endm
   1480 
   1481        fgy             0, 0
   1482        fgy             0, 1
   1483        fgy             1, 0
   1484        fgy             1, 1
   1485 endfunc
   1486 
   1487 jumptable fgy_loop_tbl
        // Jump table for the luma (fgy) row loops. Each entry is the 32-bit
        // offset of one loop variant relative to the start of this table.
        // The two digits in the label are the overlap flags: the y-overlap
        // tail of the fgy macro branches to L(loop_\ox\()0), i.e. same first
        // digit (ox) with the second digit cleared.
        // NOTE(review): the code that indexes this table lies outside this
        // chunk; presumably the index is the overlap type -- confirm there.
   1488        .word L(loop_00) - fgy_loop_tbl
   1489        .word L(loop_01) - fgy_loop_tbl
   1490        .word L(loop_10) - fgy_loop_tbl
   1491        .word L(loop_11) - fgy_loop_tbl
   1492 endjumptable
   1493 
   1494 // void dav1d_fguv_32x32_420_16bpc_neon(pixel *const dst,
   1495 //                                      const pixel *const src,
   1496 //                                      const ptrdiff_t stride,
   1497 //                                      const uint8_t scaling[SCALING_SIZE],
   1498 //                                      const Dav1dFilmGrainData *const data,
   1499 //                                      const entry grain_lut[][GRAIN_WIDTH],
   1500 //                                      const pixel *const luma_row,
   1501 //                                      const ptrdiff_t luma_stride,
   1502 //                                      const int offsets[][2],
   1503 //                                      const ptrdiff_t h, const ptrdiff_t uv,
   1504 //                                      const ptrdiff_t is_id,
   1505 //                                      const ptrdiff_t type,
   1506 //                                      const int bitdepth_max);
        //
        // The macro below emits one exported entry point per chroma layout
        // (instantiated for 420, 422 and 444 after the macro); \sx and \sy are
        // the horizontal and vertical subsampling flags of that layout.
        //
        // The entry point only sets up constants and grain_lut pointers, then
        // tail-jumps (br) into one variant of fguv_loop_sx\sx selected by
        // `type`. The loop code restores the frame pushed here and returns to
        // the caller, so nothing after the final br is executed.
        //
        // Arguments: x0-x7 hold dst, src, stride, scaling, data, grain_lut,
        // luma_row and luma_stride; offsets, h, uv, is_id, type and
        // bitdepth_max are read from the stack.
        //
        // Register state handed to the loops:
        //   x0-x3, x6   left as passed in (dst, src, stride, scaling, luma_row)
        //   x7          luma_stride, doubled when \sy is set
        //   x5/x4       grain_lut for this block / for the "old" block
        //   x8/x11      grain_lut for the top block / for the "top old" block
        //   x10         grain_lut stride in bytes
        //   w9          number of rows for the first loop pass
        //   w12         rows remaining after the y overlap rows (only set
        //               when bit 0 of type is set)
        //   v8, v9      uv_luma_mult, uv_mult
        //   v24         uv_offset << bitdepth_min_8
        //   v14, v15    grain_min, grain_max
        //   v23         bitdepth_max
        //   v25, v26    vertical overlap coefficients
        //   v27, v28    overlap_coeffs_\sx
        //   v29         15 - scaling_shift
        //   v30, v31    lower / upper clamp for the output pixels
   1507 .macro fguv layout, sx, sy
   1508 function fguv_32x32_\layout\()_16bpc_neon, export=1
   1509        AARCH64_SIGN_LINK_REGISTER
               // 80-byte frame: x30 at [sp], d8-d15 at [sp, #16] .. [sp, #79].
               // It is released by the epilogue in the fguv_loop_sx* functions.
   1510        str             x30,      [sp, #-80]!
   1511        stp             d8,  d9,  [sp, #16]
   1512        stp             d10, d11, [sp, #32]
   1513        stp             d12, d13, [sp, #48]
   1514        stp             d14, d15, [sp, #64]
   1515 
               // The stack-passed arguments now start right above the frame.
   1516        ldp             x8,  x9,  [sp, #80]    // offsets, h
   1517        ldp             x10, x11, [sp, #96]    // uv, is_id
   1518        ldr             w16,      [sp, #120]   // bitdepth_max
   1519 
   1520        ldr             w13, [x4, #FGD_SCALING_SHIFT]
   1521        ldr             w12, [x4, #FGD_CLIP_TO_RESTRICTED_RANGE]
   1522        dup             v23.8h,  w16           // bitdepth_max
               // clz(bitdepth_max) - 24 equals 8 - bitdepth when bitdepth_max
               // is (1 << bitdepth) - 1 (assumed; not checked here).
   1523        clz             w16, w16
               // eor with 15 equals 15 - x as long as x is within 0..15.
   1524        eor             w13, w13, #15          // 15 - scaling_shift
   1525        sub             w16, w16, #24          // -bitdepth_min_8
   1526 
               // Per-plane parameters, indexed by uv with a 4 byte element
               // stride; the loops only use them in the !csfl variants.
   1527        // !csfl
   1528        add             x10, x4,  x10, lsl #2  // + 4*uv
   1529        add             x14, x10, #FGD_UV_LUMA_MULT
   1530        add             x15, x10, #FGD_UV_MULT
   1531        add             x10, x10, #FGD_UV_OFFSET
   1532        neg             w16, w16               // bitdepth_min_8
   1533        ld1r            {v8.8h},  [x14]        // uv_luma_mult
   1534        ld1r            {v24.8h}, [x10]        // uv_offset
   1535        ld1r            {v9.8h},  [x15]        // uv_mult
   1536 
   1537        dup             v29.8h,  w13           // 15 - scaling_shift
               // v27 is only a temporary shift count here; it is reloaded with
               // overlap coefficients further down.
   1538        dup             v27.8h,  w16           // bitdepth_min_8
   1539 
               // Output clamp range: v30 = min, v31 = max.
   1540        cbz             w12, 1f
   1541        // clip
   1542        movi            v30.8h,  #16
   1543        movi            v31.8h,  #240
   1544        sshl            v30.8h,  v30.8h,  v27.8h
   1545        sshl            v31.8h,  v31.8h,  v27.8h
   1546        cbz             w11, 2f
   1547        // is_id
   1548        movi            v31.8h,  #235
   1549        sshl            v31.8h,  v31.8h,  v27.8h
   1550        b               2f
   1551 1:
   1552        // no clip
   1553        movi            v30.8h,  #0
   1554        mov             v31.16b, v23.16b       // bitdepth_max
   1555 2:
   1556 
   1557        ushr            v15.8h,  v23.8h,  #1   // grain_max
   1558        sshl            v24.8h,  v24.8h,  v27.8h // uv_offset << bitdepth_min_8
               // Bitwise not gives -grain_max - 1.
   1559        not             v14.16b, v15.16b       // grain_min
   1560 
               // x8 is overwritten by the last load, so offsets[0][0] goes last.
   1561        ldr             w12, [x8, #8]          // offsets[1][0]
   1562        ldr             w14, [x8, #4]          // offsets[0][1]
   1563        ldr             w16, [x8, #12]         // offsets[1][1]
   1564        ldr             w8,  [x8]              // offsets[0][0]
   1565 
   1566        mov             x10, #GRAIN_WIDTH*2    // grain_lut stride
   1567 
               // The immediates below are byte offsets; grain entries are 2 bytes.
   1568        add             x5,  x5,  #(2*(3 + (2 >> \sx)*3)) // grain_lut += 9 or 6
   1569 .if \sy
   1570        add             x5,  x5,  x10, lsl #2  // grain_lut += 4 * grain_stride
   1571        add             x5,  x5,  x10, lsl #1  // grain_lut += 2 * grain_stride
   1572 .else
   1573        add             x5,  x5,  x10, lsl #3  // grain_lut += 8 * grain_stride
   1574        add             x5,  x5,  x10          // grain_lut += grain_stride
   1575 .endif
   1576 
               // calc_offset and add_offset are macros defined outside this
               // block. NOTE(review): presumably they turn each offsets[][]
               // value into a pointer into grain_lut -- confirm at their
               // definitions.
   1577        calc_offset     w12, w13, w12, \sx, \sy
   1578        calc_offset     w14, w15, w14, \sx, \sy
   1579        calc_offset     w16, w17, w16, \sx, \sy
   1580        calc_offset     w8,  w11, w8,  \sx, \sy
   1581 
   1582        add_offset      x13, w12, x13, x5,  x10
   1583        add_offset      x15, w14, x15, x5,  x10
   1584        add_offset      x17, w16, x17, x5,  x10
   1585        add_offset      x5,  w8,  x11, x5,  x10
   1586 
               // x4 = old, x8 = top, x11 = top old grain pointers for the loops.
   1587        add             x4,  x13, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx
   1588        add             x8,  x15, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
   1589        add             x11, x17, x10, lsl #(5 - \sy) // grain_lut += grain_stride * FG_BLOCK_SIZE * by
   1590        add             x11, x11, #2*(32 >> \sx)      // grain_lut += FG_BLOCK_SIZE * bx
   1591 
   1592        ldr             w13, [sp, #112]        // type
   1593 
   1594        movrel          x16, overlap_coeffs_\sx
   1595        movrel          x14, fguv_loop_sx\sx\()_tbl
   1596 
   1597        ld1             {v27.4h, v28.4h}, [x16] // overlap_coeffs
               // Bit 0 of type selects the y overlap variants. The flags set by
               // tst are consumed by b.eq below; ldrsw does not modify them.
   1598        tst             w13, #1
               // Fetch the signed 32-bit table entry for this type.
   1599        ldrsw           x13, [x14, w13, uxtw #2]
   1600 
   1601        b.eq            1f
   1602        // y overlap
               // Run the overlap variant for the first (2 >> \sy) rows only and
               // keep the remaining row count in w12 for the loop to pick up.
   1603        sub             w12, w9,  #(2 >> \sy)  // backup remaining h
   1604        mov             w9,  #(2 >> \sy)
   1605 
   1606 1:
               // Table entries are relative to the table start.
   1607        add             x13, x14, x13
   1608 
               // Vertical overlap coefficients for the first overlap row:
               // v25 is applied to the top grain, v26 to the current grain.
   1609 .if \sy
   1610        movi            v25.8h,  #23
   1611        movi            v26.8h,  #22
   1612 .else
   1613        movi            v25.8h,  #27
   1614        movi            v26.8h,  #17
   1615 .endif
   1616 
   1617 .if \sy
   1618        add             x7,  x7,  x7           // luma_stride *= 2
   1619 .endif
   1620 
               // Tail jump into the selected loop; control does not come back.
   1621        br              x13
   1622 endfunc
   1623 .endm
   1624 
        // Instantiate the entry points: layout, sx, sy.
   1625 fguv 420, 1, 1
   1626 fguv 422, 1, 0
   1627 fguv 444, 0, 0
   1628 
   1629 function fguv_loop_sx0_neon
        // Row loops for chroma without horizontal subsampling (\sx = 0, i.e.
        // the 444 entry point): 32 pixels per row.
        //
        // This code is not called; it is entered through the `br x13` at the
        // end of fguv_32x32_444_16bpc_neon with the register state prepared
        // there, and it finishes by popping the frame pushed by that entry
        // point (label 9 below).
        //
        // The macro is expanded once per (csfl, ox, oy) combination:
        //   csfl  0: the scaling lookup value is computed from luma and src
        //            with uv_luma_mult/uv_mult/uv_offset; 1: luma is used
        //            directly
        //   ox    blend the first 4 grain values with the "old" block
        //   oy    blend the grain rows with the "top" block
   1630 .macro fguv_loop_sx0 csfl, ox, oy
   1631 L(fguv_loop_sx0_csfl\csfl\()_\ox\oy):
               // Target of the indirect branch from the entry point (macro
               // comes from asm.S; presumably a BTI landing pad -- confirm).
   1632        AARCH64_VALID_JUMP_TARGET
               // One iteration per output row; w9 holds the rows left.
   1633 1:
   1634 .if \ox
   1635        ld1             {v4.4h}, [x4],  x10  // grain_lut old
   1636 .endif
   1637 .if \oy
   1638        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x8],  x10 // grain_lut top
   1639 .endif
   1640 .if \ox && \oy
   1641        ld1             {v5.4h}, [x11], x10  // grain_lut top old
   1642 .endif
   1643        ld1             {v16.8h, v17.8h, v18.8h, v19.8h}, [x5],  x10 // grain_lut
   1644 
   1645 .if \ox
               // Horizontal overlap: old * v27 + current * v28 for 4 pixels.
   1646        smull           v4.4s,   v4.4h,   v27.4h
   1647        smlal           v4.4s,   v16.4h,  v28.4h
   1648 .endif
   1649 
   1650 .if \oy
   1651 .if \ox
               // Same horizontal blend for the top row, then round (>> 5),
               // clamp both to [grain_min, grain_max] and insert them into
               // the low 4 lanes of the current and top grain rows.
   1652        smull           v5.4s,   v5.4h,   v27.4h
   1653        smlal           v5.4s,   v0.4h,   v28.4h
   1654        sqrshrn         v4.4h,   v4.4s,   #5
   1655        sqrshrn         v5.4h,   v5.4s,   #5
   1656        smin            v4.4h,   v4.4h,   v15.4h
   1657        smin            v5.4h,   v5.4h,   v15.4h
   1658        smax            v4.4h,   v4.4h,   v14.4h
   1659        smax            v5.4h,   v5.4h,   v14.4h
   1660        ins             v16.d[0], v4.d[0]
   1661        ins             v0.d[0],  v5.d[0]
   1662 .endif
   1663 
               // Vertical overlap: current * v26 + top * v25, rounded >> 5.
               // v16-v19 are reused as accumulators once their source halves
               // have been consumed; results end up in v6, v7, v10, v11.
   1664        smull           v6.4s,   v16.4h,  v26.4h
   1665        smull2          v7.4s,   v16.8h,  v26.8h
   1666        smull           v10.4s,  v17.4h,  v26.4h
   1667        smull2          v11.4s,  v17.8h,  v26.8h
   1668        smull           v16.4s,  v18.4h,  v26.4h
   1669        smull2          v17.4s,  v18.8h,  v26.8h
   1670        smull           v18.4s,  v19.4h,  v26.4h
   1671        smull2          v19.4s,  v19.8h,  v26.8h
   1672        smlal           v6.4s,   v0.4h,   v25.4h
   1673        smlal2          v7.4s,   v0.8h,   v25.8h
   1674        smlal           v10.4s,  v1.4h,   v25.4h
   1675        smlal2          v11.4s,  v1.8h,   v25.8h
   1676        smlal           v16.4s,  v2.4h,   v25.4h
   1677        smlal2          v17.4s,  v2.8h,   v25.8h
   1678        smlal           v18.4s,  v3.4h,   v25.4h
   1679        smlal2          v19.4s,  v3.8h,   v25.8h
   1680        sqrshrn         v6.4h,   v6.4s,   #5
   1681        sqrshrn2        v6.8h,   v7.4s,   #5
   1682        sqrshrn         v7.4h,   v10.4s,  #5
   1683        sqrshrn2        v7.8h,   v11.4s,  #5
   1684        sqrshrn         v10.4h,  v16.4s,  #5
   1685        sqrshrn2        v10.8h,  v17.4s,  #5
   1686        sqrshrn         v11.4h,  v18.4s,  #5
   1687        sqrshrn2        v11.8h,  v19.4s,  #5
   1688 .endif
   1689 
               // From here on the grain post-processing is interleaved with
               // the luma and src loads. v0-v3 (top grain) are free by now
               // and get reused for luma.
   1690 .if \ox && !\oy
   1691        sqrshrn         v4.4h,   v4.4s,   #5
   1692        smin            v4.4h,   v4.4h,   v15.4h
   1693 .endif
   1694        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x6],  x7 // luma
   1695 .if \oy
               // Clamp the vertically blended grain into v16-v19.
   1696        smin            v16.8h,  v6.8h,   v15.8h
   1697        smin            v17.8h,  v7.8h,   v15.8h
   1698        smin            v18.8h,  v10.8h,  v15.8h
   1699        smin            v19.8h,  v11.8h,  v15.8h
   1700        smax            v16.8h,  v16.8h,  v14.8h
   1701        smax            v17.8h,  v17.8h,  v14.8h
   1702        smax            v18.8h,  v18.8h,  v14.8h
   1703        smax            v19.8h,  v19.8h,  v14.8h
   1704 .endif
   1705 
   1706 .if \ox && !\oy
   1707        smax            v4.4h,   v4.4h,   v14.4h
   1708 .endif
               // v10-v13 were temporaries above; they hold src from here on.
   1709        ld1             {v10.8h, v11.8h, v12.8h, v13.8h}, [x1],  x2 // src
   1710 .if \ox && !\oy
   1711        ins             v16.d[0], v4.d[0]
   1712 .endif
   1713 
   1714 .if !\csfl
               // v0-v3 = clamp(((luma * uv_luma_mult + src * uv_mult) >> 6)
               //               + uv_offset, 0, bitdepth_max)
   1715        smull           v4.4s,   v0.4h,   v8.4h
   1716        smull2          v5.4s,   v0.8h,   v8.8h
   1717        smull           v6.4s,   v1.4h,   v8.4h
   1718        smull2          v7.4s,   v1.8h,   v8.8h
   1719        smull           v0.4s,   v2.4h,   v8.4h
   1720        smull2          v1.4s,   v2.8h,   v8.8h
   1721        smull           v2.4s,   v3.4h,   v8.4h
   1722        smull2          v3.4s,   v3.8h,   v8.8h
   1723        smlal           v4.4s,   v10.4h,  v9.4h
   1724        smlal2          v5.4s,   v10.8h,  v9.8h
   1725        smlal           v6.4s,   v11.4h,  v9.4h
   1726        smlal2          v7.4s,   v11.8h,  v9.8h
   1727        smlal           v0.4s,   v12.4h,  v9.4h
   1728        smlal2          v1.4s,   v12.8h,  v9.8h
   1729        smlal           v2.4s,   v13.4h,  v9.4h
   1730        smlal2          v3.4s,   v13.8h,  v9.8h
   1731        shrn            v4.4h,   v4.4s,   #6
   1732        shrn2           v4.8h,   v5.4s,   #6
   1733        shrn            v5.4h,   v6.4s,   #6
   1734        shrn2           v5.8h,   v7.4s,   #6
   1735        shrn            v6.4h,   v0.4s,   #6
   1736        shrn2           v6.8h,   v1.4s,   #6
   1737        shrn            v7.4h,   v2.4s,   #6
   1738        shrn2           v7.8h,   v3.4s,   #6
   1739        add             v0.8h,   v4.8h,   v24.8h
   1740        add             v1.8h,   v5.8h,   v24.8h
   1741        add             v2.8h,   v6.8h,   v24.8h
   1742        add             v3.8h,   v7.8h,   v24.8h
   1743        movi            v20.8h,  #0
   1744        smin            v0.8h,   v0.8h,   v23.8h
   1745        smin            v1.8h,   v1.8h,   v23.8h
   1746        smin            v2.8h,   v2.8h,   v23.8h
   1747        smin            v3.8h,   v3.8h,   v23.8h
   1748        smax            v0.8h,   v0.8h,   v20.8h
   1749        smax            v1.8h,   v1.8h,   v20.8h
   1750        smax            v2.8h,   v2.8h,   v20.8h
   1751        smax            v3.8h,   v3.8h,   v20.8h
   1752 .else
   1753        // Make sure that uninitialized pixels out of range past the right
   1754        // edge are in range; their actual values shouldn't matter.
   1755        and             v0.16b,  v0.16b,  v23.16b
   1756        and             v1.16b,  v1.16b,  v23.16b
   1757        and             v2.16b,  v2.16b,  v23.16b
   1758        and             v3.16b,  v3.16b,  v23.16b
   1759 .endif
   1760 
               // gather32_neon is defined outside this block; its result is
               // consumed below as 32 bytes in v6/v7. NOTE(review): presumably
               // it looks up scaling[] (x3) with the values in v0-v3 -- confirm
               // at its definition. bl overwrites x30, which is why the entry
               // point saved it in the frame.
   1761        bl              gather32_neon
   1762 
   1763        uxtl            v4.8h,   v6.8b            // scaling
   1764        uxtl2           v5.8h,   v6.16b
   1765        uxtl            v6.8h,   v7.8b
   1766        uxtl2           v7.8h,   v7.16b
   1767 
   1768        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
   1769        ushl            v5.8h,   v5.8h,   v29.8h
   1770        ushl            v6.8h,   v6.8h,   v29.8h
   1771        ushl            v7.8h,   v7.8h,   v29.8h
   1772 
   1773        sqrdmulh        v16.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift)) * grain, 15)
   1774        sqrdmulh        v17.8h,  v17.8h,  v5.8h
   1775        sqrdmulh        v18.8h,  v18.8h,  v6.8h
   1776        sqrdmulh        v19.8h,  v19.8h,  v7.8h
   1777 
               // Unsigned src plus signed noise, saturating to the u16 range.
   1778        usqadd          v10.8h,  v16.8h           // *src + noise
   1779        usqadd          v11.8h,  v17.8h
   1780        usqadd          v12.8h,  v18.8h
   1781        usqadd          v13.8h,  v19.8h
   1782 
               // Clamp to [v30, v31], the range chosen by the entry point.
   1783        umax            v0.8h,   v10.8h,  v30.8h
   1784        umax            v1.8h,   v11.8h,  v30.8h
   1785        umax            v2.8h,   v12.8h,  v30.8h
   1786        umax            v3.8h,   v13.8h,  v30.8h
   1787        umin            v0.8h,   v0.8h,   v31.8h
   1788        umin            v1.8h,   v1.8h,   v31.8h
   1789        umin            v2.8h,   v2.8h,   v31.8h
   1790        umin            v3.8h,   v3.8h,   v31.8h
   1791 
               // The flags from subs are consumed by b.gt after the store;
               // dup and st1 do not modify them.
   1792        subs            w9,  w9,  #1
   1793 .if \oy
               // Vertical coefficients for the next overlap row, taken from
               // lanes 0 and 1 of v28.
   1794        dup             v25.8h,  v28.h[0]
   1795        dup             v26.8h,  v28.h[1]
   1796 .endif
   1797        st1             {v0.8h, v1.8h, v2.8h, v3.8h},  [x0], x2 // dst
   1798        b.gt            1b
   1799 
   1800 .if \oy
               // The overlap rows are done; if rows remain, continue in the
               // variant without y overlap (same csfl and ox).
   1801        cmp             w12, #0
   1802        mov             w9,  w12               // restore actual remaining h
   1803        b.gt            L(fguv_loop_sx0_csfl\csfl\()_\ox\()0)
   1804 .endif
               // Jump to the shared epilogue behind the last expansion.
   1805        b               9f
   1806 .endm
   1807        fguv_loop_sx0   0, 0, 0
   1808        fguv_loop_sx0   0, 0, 1
   1809        fguv_loop_sx0   0, 1, 0
   1810        fguv_loop_sx0   0, 1, 1
   1811        fguv_loop_sx0   1, 0, 0
   1812        fguv_loop_sx0   1, 0, 1
   1813        fguv_loop_sx0   1, 1, 0
   1814        fguv_loop_sx0   1, 1, 1
   1815 
        // Shared epilogue: restore d8-d15 and x30 from the 80-byte frame pushed
        // by the fguv entry point, release it and return to the original caller.
   1816 9:
   1817        ldp             d14, d15, [sp, #64]
   1818        ldp             d12, d13, [sp, #48]
   1819        ldp             d10, d11, [sp, #32]
   1820        ldp             d8,  d9,  [sp, #16]
   1821        ldr             x30,      [sp], #80
   1822        AARCH64_VALIDATE_LINK_REGISTER
   1823        ret
   1824 endfunc
   1825 
   1826 jumptable fguv_loop_sx0_tbl
        // Offsets of the fguv_loop_sx0 variants relative to the start of this
        // table. The fguv entry point loads an entry with
        // `ldrsw x13, [x14, w13, uxtw #2]` using `type` as the index and adds
        // the table address to it. Given the label naming (_<ox><oy>), the
        // entry order makes bit 0 of the index oy, bit 1 ox and bit 2 csfl.
   1827        .word L(fguv_loop_sx0_csfl0_00) - fguv_loop_sx0_tbl
   1828        .word L(fguv_loop_sx0_csfl0_01) - fguv_loop_sx0_tbl
   1829        .word L(fguv_loop_sx0_csfl0_10) - fguv_loop_sx0_tbl
   1830        .word L(fguv_loop_sx0_csfl0_11) - fguv_loop_sx0_tbl
   1831        .word L(fguv_loop_sx0_csfl1_00) - fguv_loop_sx0_tbl
   1832        .word L(fguv_loop_sx0_csfl1_01) - fguv_loop_sx0_tbl
   1833        .word L(fguv_loop_sx0_csfl1_10) - fguv_loop_sx0_tbl
   1834        .word L(fguv_loop_sx0_csfl1_11) - fguv_loop_sx0_tbl
   1835 endjumptable
   1836 
   1837 function fguv_loop_sx1_neon
        // Row loops for horizontally subsampled chroma (\sx = 1, i.e. the 420
        // and 422 entry points): 16 chroma pixels per row, with 32 luma pixels
        // loaded per row and averaged pairwise.
        //
        // This code is not called; it is entered through the `br x13` at the
        // end of the fguv_32x32_{420,422}_16bpc_neon entry points with the
        // register state prepared there, and it finishes by popping the frame
        // pushed by that entry point (label 9 below).
        //
        // The macro is expanded once per (csfl, ox, oy) combination:
        //   csfl  0: the scaling lookup value is computed from luma and src
        //            with uv_luma_mult/uv_mult/uv_offset; 1: the averaged luma
        //            is used directly
        //   ox    blend the first 4 grain values with the "old" block
        //   oy    blend the grain rows with the "top" block
   1838 .macro fguv_loop_sx1 csfl, ox, oy
   1839 L(fguv_loop_sx1_csfl\csfl\()_\ox\oy):
               // Target of the indirect branch from the entry point (macro
               // comes from asm.S; presumably a BTI landing pad -- confirm).
   1840        AARCH64_VALID_JUMP_TARGET
               // One iteration per output row; w9 holds the rows left.
   1841 1:
   1842 .if \ox
   1843        ld1             {v18.4h}, [x4],  x10  // grain_lut old
   1844 .endif
   1845 .if \oy
   1846        ld1             {v20.8h, v21.8h},  [x8],  x10 // grain_lut top
   1847 .endif
   1848 .if \ox && \oy
   1849        ld1             {v19.4h}, [x11], x10  // grain_lut top old
   1850 .endif
   1851        ld1             {v16.8h, v17.8h}, [x5],  x10 // grain_lut
   1852 
   1853 .if \ox
               // Horizontal overlap: old * v27 + current * v28 for 4 pixels.
   1854        smull           v18.4s,  v18.4h,  v27.4h
   1855        smlal           v18.4s,  v16.4h,  v28.4h
   1856 .endif
   1857 
   1858 .if \oy
   1859 .if \ox
               // Same horizontal blend for the top row, then round (>> 5),
               // clamp both to [grain_min, grain_max] and insert them into
               // the low 4 lanes of the current and top grain rows.
   1860        smull           v19.4s,  v19.4h,  v27.4h
   1861        smlal           v19.4s,  v20.4h,  v28.4h
   1862        sqrshrn         v18.4h,  v18.4s,  #5
   1863        sqrshrn         v19.4h,  v19.4s,  #5
   1864        smin            v18.4h,  v18.4h,  v15.4h
   1865        smin            v19.4h,  v19.4h,  v15.4h
   1866        smax            v18.4h,  v18.4h,  v14.4h
   1867        smax            v19.4h,  v19.4h,  v14.4h
   1868        ins             v16.d[0], v18.d[0]
   1869        ins             v20.d[0], v19.d[0]
   1870 .endif
   1871 
               // Vertical overlap: current * v26 + top * v25, rounded >> 5,
               // written back to v16/v17.
   1872        smull           v0.4s,   v16.4h,  v26.4h
   1873        smull2          v1.4s,   v16.8h,  v26.8h
   1874        smull           v2.4s,   v17.4h,  v26.4h
   1875        smull2          v3.4s,   v17.8h,  v26.8h
   1876        smlal           v0.4s,   v20.4h,  v25.4h
   1877        smlal2          v1.4s,   v20.8h,  v25.8h
   1878        smlal           v2.4s,   v21.4h,  v25.4h
   1879        smlal2          v3.4s,   v21.8h,  v25.8h
   1880        sqrshrn         v16.4h,  v0.4s,   #5
   1881        sqrshrn2        v16.8h,  v1.4s,   #5
   1882        sqrshrn         v17.4h,  v2.4s,   #5
   1883        sqrshrn2        v17.8h,  v3.4s,   #5
   1884 .endif
   1885 
               // From here on the grain post-processing is interleaved with
               // the luma and src loads. v0-v3 were accumulators above and
               // are reused for luma.
   1886 .if \ox && !\oy
   1887        sqrshrn         v18.4h,  v18.4s,  #5
   1888        smin            v18.4h,  v18.4h,  v15.4h
   1889 .endif
               // A single luma row is read per chroma row; for \sy layouts the
               // entry point has doubled x7, so every other luma row is skipped.
   1890        ld1             {v0.8h,  v1.8h,  v2.8h,  v3.8h},  [x6],  x7 // luma
   1891 .if \oy
   1892        smin            v16.8h,  v16.8h,  v15.8h
   1893        smin            v17.8h,  v17.8h,  v15.8h
   1894        smax            v16.8h,  v16.8h,  v14.8h
   1895        smax            v17.8h,  v17.8h,  v14.8h
   1896 .endif
   1897 
   1898 .if \ox && !\oy
   1899        smax            v18.4h,  v18.4h,  v14.4h
   1900 .endif
   1901        ld1             {v10.8h, v11.8h},  [x1],  x2 // src
   1902 .if \ox && !\oy
   1903        ins             v16.d[0], v18.d[0]
   1904 .endif
               // Average horizontally adjacent luma pixels:
               // (l[2*i] + l[2*i + 1] + 1) >> 1, giving 16 values in v0/v1.
   1905        addp            v0.8h,   v0.8h,   v1.8h
   1906        addp            v1.8h,   v2.8h,   v3.8h
   1907        urshr           v0.8h,   v0.8h,   #1
   1908        urshr           v1.8h,   v1.8h,   #1
   1909 .if !\csfl
               // v0/v1 = clamp(((luma * uv_luma_mult + src * uv_mult) >> 6)
               //               + uv_offset, 0, bitdepth_max)
   1910        smull           v2.4s,   v0.4h,   v8.4h
   1911        smull2          v3.4s,   v0.8h,   v8.8h
   1912        smull           v0.4s,   v1.4h,   v8.4h
   1913        smull2          v1.4s,   v1.8h,   v8.8h
   1914        smlal           v2.4s,   v10.4h,  v9.4h
   1915        smlal2          v3.4s,   v10.8h,  v9.8h
   1916        smlal           v0.4s,   v11.4h,  v9.4h
   1917        smlal2          v1.4s,   v11.8h,  v9.8h
   1918        shrn            v2.4h,   v2.4s,   #6
   1919        shrn2           v2.8h,   v3.4s,   #6
   1920        shrn            v3.4h,   v0.4s,   #6
   1921        shrn2           v3.8h,   v1.4s,   #6
   1922        add             v0.8h,   v2.8h,   v24.8h
   1923        add             v1.8h,   v3.8h,   v24.8h
   1924        movi            v2.8h,   #0
   1925        smin            v0.8h,   v0.8h,   v23.8h
   1926        smin            v1.8h,   v1.8h,   v23.8h
   1927        smax            v0.8h,   v0.8h,   v2.8h
   1928        smax            v1.8h,   v1.8h,   v2.8h
   1929 .else
   1930        // Make sure that uninitialized pixels out of range past the right
   1931        // edge are in range; their actual values shouldn't matter.
   1932        and             v0.16b,  v0.16b,  v23.16b
   1933        and             v1.16b,  v1.16b,  v23.16b
   1934 .endif
   1935 
               // gather16_neon is defined outside this block; its result is
               // consumed below as 16 bytes in v6. NOTE(review): presumably it
               // looks up scaling[] (x3) with the values in v0/v1 -- confirm
               // at its definition. bl overwrites x30, which is why the entry
               // point saved it in the frame.
   1936        bl              gather16_neon
   1937 
   1938        uxtl            v4.8h,   v6.8b            // scaling
   1939        uxtl2           v5.8h,   v6.16b
   1940 
   1941        ushl            v4.8h,   v4.8h,   v29.8h  // scaling << (15 - scaling_shift)
   1942        ushl            v5.8h,   v5.8h,   v29.8h
   1943 
   1944        sqrdmulh        v16.8h,  v16.8h,  v4.8h   // round2((scaling << (15 - scaling_shift)) * grain, 15)
   1945        sqrdmulh        v17.8h,  v17.8h,  v5.8h
   1946 
               // Unsigned src plus signed noise, saturating to the u16 range.
   1947        usqadd          v10.8h,  v16.8h           // *src + noise
   1948        usqadd          v11.8h,  v17.8h
   1949 
               // Clamp to [v30, v31], the range chosen by the entry point.
   1950        umax            v0.8h,   v10.8h,  v30.8h
   1951        umax            v1.8h,   v11.8h,  v30.8h
   1952        umin            v0.8h,   v0.8h,   v31.8h
   1953        umin            v1.8h,   v1.8h,   v31.8h
   1954 
               // Swap v25 and v26 for the next overlap row, using v16 as the
               // temporary (the noise in v16 has already been consumed). The
               // flags from subs are consumed by b.gt after the store; mov and
               // st1 do not modify them.
   1955 .if \oy
   1956        mov             v16.16b, v25.16b
   1957 .endif
   1958        subs            w9,  w9,  #1
   1959 .if \oy
   1960        mov             v25.16b, v26.16b
   1961        mov             v26.16b, v16.16b
   1962 .endif
   1963        st1             {v0.8h, v1.8h},  [x0], x2 // dst
   1964        b.gt            1b
   1965 
   1966 .if \oy
               // The overlap rows are done; if rows remain, continue in the
               // variant without y overlap (same csfl and ox).
   1967        cmp             w12, #0
   1968        mov             w9,  w12               // restore actual remaining h
   1969        b.gt            L(fguv_loop_sx1_csfl\csfl\()_\ox\()0)
   1970 .endif
   1971 
               // Jump to the shared epilogue behind the last expansion.
   1972        b               9f
   1973 .endm
   1974        fguv_loop_sx1   0, 0, 0
   1975        fguv_loop_sx1   0, 0, 1
   1976        fguv_loop_sx1   0, 1, 0
   1977        fguv_loop_sx1   0, 1, 1
   1978        fguv_loop_sx1   1, 0, 0
   1979        fguv_loop_sx1   1, 0, 1
   1980        fguv_loop_sx1   1, 1, 0
   1981        fguv_loop_sx1   1, 1, 1
   1982 
        // Shared epilogue: restore d8-d15 and x30 from the 80-byte frame pushed
        // by the fguv entry point, release it and return to the original caller.
   1983 9:
   1984        ldp             d14, d15, [sp, #64]
   1985        ldp             d12, d13, [sp, #48]
   1986        ldp             d10, d11, [sp, #32]
   1987        ldp             d8,  d9,  [sp, #16]
   1988        ldr             x30,      [sp], #80
   1989        AARCH64_VALIDATE_LINK_REGISTER
   1990        ret
   1991 endfunc
   1992 
   1993 jumptable fguv_loop_sx1_tbl
        // Offsets of the fguv_loop_sx1 variants relative to the start of this
        // table. The fguv entry point loads an entry with
        // `ldrsw x13, [x14, w13, uxtw #2]` using `type` as the index and adds
        // the table address to it. Given the label naming (_<ox><oy>), the
        // entry order makes bit 0 of the index oy, bit 1 ox and bit 2 csfl.
   1994        .word L(fguv_loop_sx1_csfl0_00) - fguv_loop_sx1_tbl
   1995        .word L(fguv_loop_sx1_csfl0_01) - fguv_loop_sx1_tbl
   1996        .word L(fguv_loop_sx1_csfl0_10) - fguv_loop_sx1_tbl
   1997        .word L(fguv_loop_sx1_csfl0_11) - fguv_loop_sx1_tbl
   1998        .word L(fguv_loop_sx1_csfl1_00) - fguv_loop_sx1_tbl
   1999        .word L(fguv_loop_sx1_csfl1_01) - fguv_loop_sx1_tbl
   2000        .word L(fguv_loop_sx1_csfl1_10) - fguv_loop_sx1_tbl
   2001        .word L(fguv_loop_sx1_csfl1_11) - fguv_loop_sx1_tbl
   2002 endjumptable
   2002 endjumptable