tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

looprestoration16.S (28845B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2020, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 
// Mask table for right-edge padding. Loading 32 bytes ending relative to
// right_ext_mask (i.e. from right_ext_mask - 2*w, possibly with an extra
// fixed offset) yields 0x00 in the lanes holding valid pixels and 0xff in
// the lanes past the edge; vbit then replaces only the 0xff lanes with the
// splatted padding pixel.
const right_ext_mask_buf
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
right_ext_mask:
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
endconst
     42 
// void dav1d_wiener_filter_h_16bpc_neon(int16_t *dst, const pixel (*left)[4],
//                                       const pixel *src, const int16_t fh[8],
//                                       const int w,
//                                       enum LrEdgeFlags edges,
//                                       const int bitdepth_max);
// Horizontal 7-tap Wiener filter for one row: edge-replicates per `edges`,
// filters, rounds by round_bits_h, clamps to [0, 0x7fff] and stores the
// result with an 8192 bias subtracted (int16 intermediate format).
function wiener_filter_h_16bpc_neon, export=1
        push            {r4-r6,lr}
        ldrd            r4,  r5,  [sp, #16]     // r4 = w, r5 = edges
        ldr             r6,       [sp, #24] // bitdepth_max
        vld1.16         {q0}, [r3, :128]        // q0 = filter taps fh[8]
        clz             r6,  r6                 // 32 - bitdepth
        vmov.i32        q14, #1
        sub             r12, r6,  #38  // -(bitdepth + 6)
        sub             r6,  r6,  #25  // -round_bits_h
        neg             r12, r12       // bitdepth + 6
        vdup.32         q1,  r12
        vdup.32         q13, r6        // -round_bits_h
        vmov.i16        q15, #8192     // bias subtracted before storing
        vshl.u32        q14, q14, q1   // 1 << (bitdepth + 6)
        vmvn.i16        q12, #0x8000   // 0x7fff = (1 << 15) - 1

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL: the 3 edge pixels are readable in memory directly
        // before src, so just back the pointer up and load them in one go.
        sub             r2,  r2,  #6
        vld1.16         {q2, q3}, [r2]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {q2, q3}, [r2]!
        vld1.16         {d3},  [r1]!   // d3 = 4 pixels from the left buffer
        // Move r2 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #6
        // Prepend the last 3 left-buffer pixels (q1 bytes 10..15) to q2/q3.
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q1,  q2,  #10
        b               2f
1:
        vld1.16         {q2, q3}, [r2]!
        // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
        // and shift q2/q3 to have 3x the first pixel at the front.
        vdup.16         q1,  d4[0]
        // Move r2 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r2,  r2,  #6
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q1,  q2,  #10

2:
        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is q2/3.h[w+2]. r2 points at the next input, ie
        // q2/3.h[16]. Thus read from r2[w-14] to find the padding pixel.
        sub             r12, r4,  #14
        lsl             r12, r12, #1
        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r3,  right_ext_mask, -6
        ldrh            r12, [r2, r12]
        sub             r3,  r3,  r4,  lsl #1
        vdup.16         q11, r12       // q11 = splatted right padding pixel
        vld1.8          {q9, q10}, [r3]    // 0x00 = keep, 0xff = replace

        vbit            q2,  q11, q9
        vbit            q3,  q11, q10

4:      // Loop horizontally
        // Build the 7 tap inputs x[-3..3]; pairs equidistant from the
        // center are pre-added since they share a filter coefficient.
        vext.8          q9,  q2,  q3,  #4
        vext.8          q10, q2,  q3,  #8
        vext.8          q8,  q2,  q3,  #2
        vext.8          q11, q2,  q3,  #10
        vadd.i16        q10, q10, q9   // x[-1] + x[1]
        vadd.i16        q11, q11, q8   // x[-2] + x[2]
        vext.8          q8,  q2,  q3,  #12
        vext.8          q9,  q2,  q3,  #6
        vadd.i16        q2,  q2,  q8   // x[-3] + x[3]
        vmull.s16       q8,  d18, d0[3]   // x[0]          * fh[3]
        vmlal.s16       q8,  d20, d1[0]   // (x[-1]+x[1])  * fh[4]
        vmlal.s16       q8,  d22, d1[1]   // (x[-2]+x[2])  * fh[5]
        vmlal.s16       q8,  d4,  d1[2]   // (x[-3]+x[3])  * fh[6]
        vmull.s16       q9,  d19, d0[3]
        vmlal.s16       q9,  d21, d1[0]
        vmlal.s16       q9,  d23, d1[1]
        vmlal.s16       q9,  d5,  d1[2]

        vadd.i32        q8,  q8,  q14  // + (1 << (bitdepth + 6))
        vadd.i32        q9,  q9,  q14
        vrshl.s32       q8,  q8,  q13  // rounding shift by round_bits_h
        vrshl.s32       q9,  q9,  q13
        vqmovun.s32     d16, q8        // saturating narrow to u16
        vqmovun.s32     d17, q9
        vmin.u16        q8,  q8,  q12  // clamp to 0x7fff
        vsub.i16        q8,  q8,  q15  // subtract the 8192 bias
        subs            r4,  r4,  #8
        vst1.16         {q8}, [r0,  :128]!

        ble             9f
        vmov            q2,  q3
        tst             r5,  #2 // LR_HAVE_RIGHT
        vld1.16         {q3}, [r2]!
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r6,pc}
endfunc
    165 
// void dav1d_wiener_filter_v_16bpc_neon(pixel *dst, int16_t **ptrs,
//                                       const int16_t fv[8], const int w,
//                                       const int bitdepth_max);
// Vertical 7-tap Wiener filter over a sliding window of row pointers.
// Only 6 rows are loaded; ptrs[5] is deliberately used for both of the two
// topmost taps (the 7th row pointer is implicitly identical to the 6th at
// this stage — see the pointer rotation at the end).
function wiener_filter_v_16bpc_neon, export=1
        push            {r4-r9,lr}
        vpush           {q4-q7}

        ldr             lr,  [sp, #92]  // bitdepth_max
        vld1.16         {q0},  [r2, :128]   // q0 = filter taps fv[8]
        vdup.16         q2,  lr             // q2 = bitdepth_max, for clamping
        clz             lr,  lr
        sub             lr,  lr,  #11   // round_bits_v

        vdup.32         q1,  lr

        // r4-r9 = ptrs[0..5], the 6 input row pointers.
        ldrd            r4,  r5,  [r1]
        ldrd            r6,  r7,  [r1, #8]
        ldrd            r8,  r9,  [r1, #16]

        vneg.s32        q1,  q1         // -round_bits_v

1:
        vld1.16         {q4,  q5},  [r4, :128]!   // row 0
        vld1.16         {q6,  q7},  [r5, :128]!   // row 1
        vld1.16         {q8,  q9},  [r6, :128]!   // row 2
        vld1.16         {q10, q11}, [r7, :128]!   // row 3
        vld1.16         {q12, q13}, [r8, :128]!   // row 4
        vld1.16         {q14, q15}, [r9, :128]!   // row 5 (feeds taps 5 and 6)

        subs            r3,  r3,  #16   // 16 output pixels per iteration

        vmull.s16       q3,  d8,  d0[0]
        vmlal.s16       q3,  d12, d0[1]
        vmlal.s16       q3,  d16, d0[2]
        vmlal.s16       q3,  d20, d0[3]
        vmlal.s16       q3,  d24, d1[0]
        vmlal.s16       q3,  d28, d1[1]
        vmlal.s16       q3,  d28, d1[2]   // intentional: row 5 reused for tap 6
        vmull.s16       q4,  d9,  d0[0]
        vmlal.s16       q4,  d13, d0[1]
        vmlal.s16       q4,  d17, d0[2]
        vmlal.s16       q4,  d21, d0[3]
        vmlal.s16       q4,  d25, d1[0]
        vmlal.s16       q4,  d29, d1[1]
        vmlal.s16       q4,  d29, d1[2]   // intentional, see above

        vmull.s16       q6,  d10, d0[0]
        vmlal.s16       q6,  d14, d0[1]
        vmlal.s16       q6,  d18, d0[2]
        vmlal.s16       q6,  d22, d0[3]
        vmlal.s16       q6,  d26, d1[0]
        vmlal.s16       q6,  d30, d1[1]
        vmlal.s16       q6,  d30, d1[2]   // intentional, see above
        vmull.s16       q5,  d11, d0[0]
        vmlal.s16       q5,  d15, d0[1]
        vmlal.s16       q5,  d19, d0[2]
        vmlal.s16       q5,  d23, d0[3]
        vmlal.s16       q5,  d27, d1[0]
        vmlal.s16       q5,  d31, d1[1]
        vmlal.s16       q5,  d31, d1[2]   // intentional, see above

        vrshl.s32       q3,  q3,  q1    // round_bits_v
        vrshl.s32       q4,  q4,  q1
        vrshl.s32       q6,  q6,  q1
        vrshl.s32       q5,  q5,  q1
        vqmovun.s32     d6,  q3         // saturating narrow to u16
        vqmovun.s32     d7,  q4
        vqmovun.s32     d8,  q6
        vqmovun.s32     d9,  q5
        vmin.u16        q3,  q3,  q2    // bitdepth_max
        vmin.u16        q4,  q4,  q2
        vst1.16         {q3, q4}, [r0, :128]!
        bgt             1b

        // Shift the pointers, but only update the first 5; the 6th pointer is
        // kept as it was before (and the 7th is implicitly identical to the
        // 6th).
        ldrd            r4,  r5,  [r1, #4]
        ldrd            r6,  r7,  [r1, #12]
        ldr             r8,       [r1, #20]
        strd            r4,  r5,  [r1]
        strd            r6,  r7,  [r1, #8]
        str             r8,       [r1, #16]

        vpop            {q4-q7}
        pop             {r4-r9,pc}
endfunc
    253 
// void dav1d_wiener_filter_hv_16bpc_neon(pixel *dst, const pixel (*left)[4],
//                                        const pixel *src,
//                                        const int16_t filter[2][8],
//                                        const int w,
//                                        const enum LrEdgeFlags edges,
//                                        int16_t **ptrs,
//                                        const int bitdepth_max);
// Fused horizontal+vertical Wiener filter: horizontally filters one new row
// (stored to ptrs[6]), applies the vertical 7-tap filter across the row
// window with the fresh row as the topmost tap, writes the result to dst,
// and finally rotates the window of row pointers.
function wiener_filter_hv_16bpc_neon, export=1
        push            {r4-r11,lr}
        vpush           {q4-q7}
        ldrd            r4,  r5,  [sp, #100]   // r4 = w, r5 = edges
        ldrd            r6,  r7,  [sp, #108]   // r6 = ptrs, r7 = bitdepth_max
        vld1.16         {q0, q1}, [r3, :128]   // q0 = horizontal taps, q1 = vertical taps
        vdup.16         q11, r7        // bitdepth_max
        clz             r7,  r7
        vmov.i32        q14, #1
        sub             r12, r7,  #38  // -(bitdepth + 6)
        sub             lr,  r7,  #11  // round_bits_v
        sub             r7,  r7,  #25  // -round_bits_h
        neg             r12, r12       // bitdepth + 6
        vdup.32         q2,  r12
        vdup.32         q13, r7        // -round_bits_h
        vdup.32         q10, lr        // round_bits_v
        mov             lr,  r6
        vmov.i16        q15, #8192
        vshl.u32        q14, q14, q2   // 1 << (bitdepth + 6)
        vneg.s32        q10, q10       // -round_bits_v

        // r6-r11 = ptrs[0..5] (inputs), r12 = ptrs[6] (output of the h pass).
        ldrd            r6,  r7,  [lr]
        ldrd            r8,  r9,  [lr, #8]
        ldrd            r10, r11, [lr, #16]
        ldr             r12,      [lr, #24]

        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        // LR_HAVE_LEFT
        cmp             r1,  #0
        bne             0f
        // left == NULL: the 3 edge pixels are readable directly before src.
        sub             r2,  r2,  #6
        vld1.16         {q2, q3}, [r2]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.16         {q2, q3}, [r2]!
        vld1.16         {d9},  [r1]!   // d9 = 4 pixels from the left buffer
        // Move r2 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r2,  r2,  #6
        // Prepend the last 3 left-buffer pixels (q4 bytes 10..15) to q2/q3.
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q4,  q2,  #10
        b               2f
1:
        vld1.16         {q2, q3}, [r2]!
        // !LR_HAVE_LEFT, fill q4 with the leftmost pixel
        // and shift q2/q3 to have 3x the first pixel at the front.
        vdup.16         q4,  d4[0]
        // Move r2 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r2,  r2,  #6
        vext.8          q3,  q2,  q3,  #10
        vext.8          q2,  q4,  q2,  #10

2:
        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f

3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // The padding pixel is q2/3.h[w+2]. r2 points at the next input, ie
        // q2/3.h[16]. Thus read from r2[w-14] to find the padding pixel.
        sub             lr,  r4,  #14
        lsl             lr,  lr,  #1
        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
        // buffer pointer.
        movrel_local    r3,  right_ext_mask, -6
        ldrh            lr,  [r2, lr]
        sub             r3,  r3,  r4,  lsl #1
        vdup.16         q4,  lr        // q4 = splatted right padding pixel
        vld1.8          {q8, q9}, [r3]     // 0x00 = keep, 0xff = replace

        vbit            q2,  q4,  q8
        vbit            q3,  q4,  q9

4:      // Loop horizontally
        // Horizontal pass, identical structure to wiener_filter_h; row loads
        // for the vertical pass are interleaved to hide load latency.
        vext.8          q5,  q2,  q3,  #4
        vext.8          q6,  q2,  q3,  #8
        vext.8          q4,  q2,  q3,  #2
        vext.8          q7,  q2,  q3,  #10
        vadd.i16        q6,  q6,  q5   // x[-1] + x[1]
        vadd.i16        q7,  q7,  q4   // x[-2] + x[2]
        vext.8          q4,  q2,  q3,  #12
        vext.8          q5,  q2,  q3,  #6
        vadd.i16        q2,  q2,  q4   // x[-3] + x[3]
        vld1.16         {q4},  [r6,  :128]!   // row 0
        vmull.s16       q8,  d10, d0[3]
        vmlal.s16       q8,  d12, d1[0]
        vmlal.s16       q8,  d14, d1[1]
        vmlal.s16       q8,  d4,  d1[2]
        vmull.s16       q9,  d11, d0[3]
        vmlal.s16       q9,  d13, d1[0]
        vmlal.s16       q9,  d15, d1[1]
        vmlal.s16       q9,  d5,  d1[2]
        vld1.16         {q5},  [r7,  :128]!   // row 1

        vmvn.i16        q12, #0x8000   // 0x7fff = (1 << 15) - 1

        vadd.i32        q8,  q8,  q14
        vadd.i32        q9,  q9,  q14
        vld1.16         {q6},  [r8,  :128]!   // row 2
        vrshl.s32       q8,  q8,  q13
        vrshl.s32       q9,  q9,  q13
        vqmovun.s32     d16, q8
        vqmovun.s32     d17, q9
        vld1.16         {q7},  [r9,  :128]!   // row 3
        vmin.u16        q8,  q8,  q12  // q8 = the freshly h-filtered row
        vld1.16         {q9},  [r10, :128]!   // row 4
        vsub.i16        q8,  q8,  q15

        vld1.16         {q2},  [r11, :128]!   // row 5

        // Vertical 7-tap filter; the fresh row q8 provides the topmost tap.
        vmull.s16       q12, d8,  d2[0]
        vmlal.s16       q12, d10, d2[1]
        vmlal.s16       q12, d12, d2[2]
        vmlal.s16       q12, d14, d2[3]
        vmlal.s16       q12, d18, d3[0]
        vmlal.s16       q12, d4,  d3[1]
        vmlal.s16       q12, d16, d3[2]
        vmull.s16       q4,  d9,  d2[0]
        vmlal.s16       q4,  d11, d2[1]
        vmlal.s16       q4,  d13, d2[2]
        vmlal.s16       q4,  d15, d2[3]
        vmlal.s16       q4,  d19, d3[0]
        vmlal.s16       q4,  d5,  d3[1]
        vmlal.s16       q4,  d17, d3[2]

        vrshl.s32       q12, q12, q10   // round_bits_v
        vrshl.s32       q4,  q4,  q10
        vqmovun.s32     d24, q12
        vqmovun.s32     d25, q4
        vst1.16         {q8},  [r12, :128]!   // store the h output row to ptrs[6]
        vmin.u16        q12, q12, q11   // bitdepth_max
        subs            r4,  r4,  #8
        vst1.16         {q12}, [r0, :128]!

        ble             9f
        vmov            q2,  q3
        tst             r5,  #2 // LR_HAVE_RIGHT
        vld1.16         {q3}, [r2]!
        bne             4b // If we don't need to pad, just keep filtering.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        // Reload ptrs from arguments on the stack
        ldr             lr,       [sp, #108]
        // Rotate the window of pointers. Shift the 6 pointers downwards one step.
        ldrd            r6,  r7,  [lr, #4]
        ldrd            r8,  r9,  [lr, #12]
        ldrd            r10, r11, [lr, #20]

        strd            r6,  r7,  [lr]
        strd            r8,  r9,  [lr, #8]
        strd            r10, r11, [lr, #16]
        // The topmost pointer, ptrs[6], which isn't used as input, is set to
        // ptrs[0], which will be used as output for the next _hv call.
        // (Aliasing is safe: each input chunk is loaded before the output
        // chunk at the same position is stored.)
        // At the start of the filtering, the caller may set ptrs[6] to the
        // right next buffer to fill in, instead.
        str             r6,       [lr, #24]

        vpop            {q4-q7}
        pop             {r4-r11,pc}
endfunc
    436 
    437 #include "looprestoration_tmpl.S"
    438 
// void dav1d_sgr_box3_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
//                                      const pixel (*left)[4],
//                                      const pixel *src, const int w,
//                                      const enum LrEdgeFlags edges);
// One row of horizontal 3-tap box sums for SGR: for each x,
// sum[x] = src[x] + src[x+1] + src[x+2] and sumsq[x] = the corresponding
// sum of squares, with edge replication controlled by `edges`.
function sgr_box3_row_h_16bpc_neon, export=1
        push            {r4-r5,lr}
        ldrd            r4,  r5,  [sp, #12]   // r4 = w, r5 = edges
        add             r4,  r4,  #2 // w += 2

        tst             r5,  #1 // LR_HAVE_LEFT
        beq             1f
        cmp             r2,  #0
        bne             0f

        // LR_HAVE_LEFT && left == NULL: the 2 edge pixels are readable
        // directly before src.
        sub             r3,  r3,  #4
        vld1.8          {q0, q1}, [r3]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0, q1}, [r3]!
        vld1.16         {d5},     [r2]    // d5 = 4 pixels from the left buffer
        // Move r3 back to account for the last 2 pixels we loaded earlier,
        // which we'll shift out.
        sub             r3,  r3,  #4
        // Prepend the last 2 left-buffer pixels (q2 bytes 12..15).
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12
        b               2f

1:
        vld1.8          {q0, q1}, [r3]!
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0/q1 to have 2x the first pixel at the front.
        vdup.16         q2,  d0[0]
        // Move r3 back to account for the last 2 pixels we loaded before,
        // which we shifted out.
        sub             r3,  r3,  #4
        vext.8          q1,  q0,  q1,  #12
        vext.8          q0,  q2,  q0,  #12

2:
        tst             r5,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r4,  #(2 + 16 - 2 + 1)
        lsl             lr,  lr,  #1
        ldrh            lr,  [r3,  lr]
        // Fill q14 with the right padding pixel
        vdup.16         q14, lr
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r4,  #10
        bge             4f   // If w >= 10, all used input pixels are valid

        // 1 <= w < 10, w pixels valid in q0-q1. For w=9, this ends up called
        // again; it's not strictly needed in those cases (we pad enough here),
        // but keeping the code as simple as possible.

        // Insert padding in q0.h[w] onwards
        movrel_local    lr,  right_ext_mask
        sub             lr,  lr,  r4,  lsl #1
        vld1.8          {q12, q13}, [lr]   // 0x00 = keep, 0xff = replace

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13

4:      // Loop horizontally
        // q8/q9 = pixels at offsets +1/+2 relative to q0.
        vext.8          q8,  q0,  q1,  #2
        vext.8          q9,  q0,  q1,  #4

        vadd.i16        q2,  q0,  q8       // x0 + x1
        vmull.u16       q12, d0,  d0       // low lanes: x0^2
        vmlal.u16       q12, d16, d16      //          + x1^2
        vmlal.u16       q12, d18, d18      //          + x2^2
        vadd.i16        q2,  q2,  q9       // + x2
        vmull.u16       q13, d1,  d1       // high lanes, same accumulation
        vmlal.u16       q13, d17, d17
        vmlal.u16       q13, d19, d19
        subs            r4,  r4,  #8
        vst1.16         {q2},       [r1,  :128]!
        vst1.32         {q12, q13}, [r0,  :128]!

        ble             9f
        tst             r5,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vld1.16         {q1}, [r3]!

        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r5,pc}
endfunc
    535 
    536 // void dav1d_sgr_box5_row_h_16bpc_neon(int32_t *sumsq, int16_t *sum,
    537 //                                      const pixel (*left)[4],
    538 //                                      const pixel *src, const int w,
    539 //                                      const enum LrEdgeFlags edges);
    540 function sgr_box5_row_h_16bpc_neon, export=1
    541        push            {r4-r5,lr}
    542        ldrd            r4,  r5,  [sp, #12]
    543        add             r4,  r4,  #2 // w += 2
    544 
    545        tst             r5,  #1 // LR_HAVE_LEFT
    546        beq             1f
    547        cmp             r2,  #0
    548        bne             0f
    549 
    550        // LR_HAVE_LEFT && left == NULL
    551        sub             r3,  r3,  #6
    552        vld1.8          {q0, q1}, [r3]!
    553        b               2f
    554 
    555 0:
    556        // LR_HAVE_LEFT, left != NULL
    557        vld1.8          {q0, q1}, [r3]!
    558        vld1.16         {d5},     [r2]
    559        // Move r3 back to account for the last 2 pixels we loaded earlier,
    560        // which we'll shift out.
    561        sub             r3,  r3,  #6
    562        vext.8          q1,  q0,  q1,  #10
    563        vext.8          q0,  q2,  q0,  #10
    564        b               2f
    565 
    566 1:
    567        vld1.8          {q0, q1}, [r3]!
    568        // !LR_HAVE_LEFT, fill q1 with the leftmost pixel
    569        // and shift q0/q1 to have 3x the first pixel at the front.
    570        vdup.16         q2,  d0[0]
    571        // Move r3 back to account for the last 3 pixels we loaded before,
    572        // which we shifted out.
    573        sub             r3,  r3,  #6
    574        vext.8          q1,  q0,  q1,  #10
    575        vext.8          q0,  q2,  q0,  #10
    576 
    577 2:
    578        tst             r5,  #2 // LR_HAVE_RIGHT
    579        bne             4f
    580        // If we'll need to pad the right edge, load that pixel to pad with
    581        // here since we can find it pretty easily from here.
    582        sub             lr,  r4,  #(2 + 16 - 3 + 1)
    583        lsl             lr,  lr,  #1
    584        ldrh            lr,  [r3,  lr]
    585        // Fill q14 with the right padding pixel
    586        vdup.16         q14, lr
    587 3:      // !LR_HAVE_RIGHT
    588 
    589        // Check whether we need to pad the right edge
    590        cmp             r4,  #11
    591        bge             4f   // If w >= 11, all used input pixels are valid
    592 
    593        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
    594        // this ends up called again; it's not strictly needed in those
    595        // cases (we pad enough here), but keeping the code as simple as possible.
    596 
    597        // Insert padding in q0.h[w+1] onwards; fuse the +1 into the
    598        // buffer pointer.
    599        movrel_local    lr,  right_ext_mask, -2
    600        sub             lr,  lr,  r4,  lsl #1
    601        vld1.8          {q12, q13}, [lr]
    602 
    603        vbit            q0,  q14, q12
    604        vbit            q1,  q14, q13
    605 
    606 4:      // Loop horizontally
    607        vext.8          q8,  q0,  q1,  #2
    608        vext.8          q9,  q0,  q1,  #4
    609 
    610        vadd.i16        q2,  q0,  q8
    611        vmull.u16       q12, d0,  d0
    612        vmlal.u16       q12, d16, d16
    613        vmlal.u16       q12, d18, d18
    614        vadd.i16        q2,  q2,  q9
    615        vmull.u16       q13, d1,  d1
    616        vmlal.u16       q13, d17, d17
    617        vmlal.u16       q13, d19, d19
    618 
    619        vext.8          q8,  q0,  q1,  #6
    620        vext.8          q9,  q0,  q1,  #8
    621 
    622        vadd.i16        q2,  q2,  q8
    623        vmlal.u16       q12, d16, d16
    624        vmlal.u16       q12, d1,  d1
    625        vadd.i16        q2,  q2,  q9
    626        vmlal.u16       q13, d17, d17
    627        vmlal.u16       q13, d19, d19
    628 
    629        subs            r4,  r4,  #8
    630        vst1.16         {q2},       [r1,  :128]!
    631        vst1.32         {q12, q13}, [r0,  :128]!
    632 
    633        ble             9f
    634        tst             r5,  #2 // LR_HAVE_RIGHT
    635        vmov            q0,  q1
    636        vld1.16         {q1}, [r3]!
    637        bne             4b // If we don't need to pad, just keep summing.
    638        b               3b // If we need to pad, check how many pixels we have left.
    639 
    640 9:
    641        pop             {r4-r5,pc}
    642 endfunc
    643 
// void dav1d_sgr_box35_row_h_16bpc_neon(int32_t *sumsq3, int16_t *sum3,
//                                       int32_t *sumsq5, int16_t *sum5,
//                                       const pixel (*left)[4],
//                                       const pixel *src, const int w,
//                                       const enum LrEdgeFlags edges);
// Computes one row of both 3-tap (r0/r1) and 5-tap (r2/r3) horizontal box
// sums and sums of squares in a single pass; the 5-tap results reuse the
// middle 3-tap accumulation.
function sgr_box35_row_h_16bpc_neon, export=1
        push            {r4-r7,lr}
        ldrd            r4,  r5,  [sp, #20]   // r4 = left, r5 = src
        ldrd            r6,  r7,  [sp, #28]   // r6 = w, r7 = edges
        add             r6,  r6,  #2 // w += 2

        tst             r7,  #1 // LR_HAVE_LEFT
        beq             1f
        cmp             r4,  #0
        bne             0f

        // LR_HAVE_LEFT && left == NULL: the 3 edge pixels are readable
        // directly before src.
        sub             r5,  r5,  #6
        vld1.8          {q0, q1}, [r5]!
        b               2f

0:
        // LR_HAVE_LEFT, left != NULL
        vld1.8          {q0, q1}, [r5]!
        vld1.16         {d5},     [r4]    // d5 = 4 pixels from the left buffer
        // Move r5 back to account for the last 3 pixels we loaded earlier,
        // which we'll shift out.
        sub             r5,  r5,  #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10
        b               2f

1:
        vld1.8          {q0, q1}, [r5]!
        // !LR_HAVE_LEFT, fill q2 with the leftmost pixel
        // and shift q0/q1 to have 3x the first pixel at the front.
        vdup.16         q2,  d0[0]
        // Move r5 back to account for the last 3 pixels we loaded before,
        // which we shifted out.
        sub             r5,  r5,  #6
        vext.8          q1,  q0,  q1,  #10
        vext.8          q0,  q2,  q0,  #10

2:
        tst             r7,  #2 // LR_HAVE_RIGHT
        bne             4f
        // If we'll need to pad the right edge, load that pixel to pad with
        // here since we can find it pretty easily from here.
        sub             lr,  r6,  #(2 + 16 - 3 + 1)
        lsl             lr,  lr,  #1
        ldrh            lr,  [r5,  lr]
        // Fill q14 with the right padding pixel
        vdup.16         q14, lr
3:      // !LR_HAVE_RIGHT

        // Check whether we need to pad the right edge
        cmp             r6,  #11
        bge             4f   // If w >= 11, all used input pixels are valid

        // 1 <= w < 11, w+1 pixels valid in q0-q1. For w=9 or w=10,
        // this ends up called again; it's not strictly needed in those
        // cases (we pad enough here), but keeping the code as simple as possible.

        // Insert padding in q0.h[w+1] onwards; fuse the +1 into the
        // buffer pointer.
        movrel_local    lr,  right_ext_mask, -2
        sub             lr,  lr,  r6,  lsl #1
        vld1.8          {q12, q13}, [lr]   // 0x00 = keep, 0xff = replace

        vbit            q0,  q14, q12
        vbit            q1,  q14, q13

4:      // Loop horizontally
        // q8..q11 = pixels at offsets +1..+4 relative to q0.
        vext.8          q8,  q0,  q1,  #2
        vext.8          q9,  q0,  q1,  #4
        vext.8          q10, q0,  q1,  #6
        vext.8          q11, q0,  q1,  #8

        vadd.i16        q2,  q8,  q9   // box3 sum: x1 + x2 (+ x3 below)
        vadd.i16        q3,  q0,  q11  // box5 extra taps: x0 + x4
        vadd.i16        q2,  q2,  q10

        // box3 sum of squares: x1^2 + x2^2 + x3^2.
        vmull.u16       q12, d16, d16
        vmlal.u16       q12, d18, d18
        vmlal.u16       q12, d20, d20
        vmull.u16       q13, d17, d17
        vmlal.u16       q13, d19, d19
        vmlal.u16       q13, d21, d21

        vadd.i16        q3,  q3,  q2   // box5 sum = box3 sum + x0 + x4
        vst1.16         {q2},       [r1,  :128]!
        vst1.32         {q12, q13}, [r0,  :128]!

        // Extend to box5 sum of squares: add x0^2 and x4^2.
        vmlal.u16       q12, d0,  d0
        vmlal.u16       q12, d22, d22
        vmlal.u16       q13, d1,  d1
        vmlal.u16       q13, d23, d23

        subs            r6,  r6,  #8
        vst1.16         {q3},       [r3,  :128]!
        vst1.32         {q12, q13}, [r2,  :128]!

        ble             9f
        tst             r7,  #2 // LR_HAVE_RIGHT
        vmov            q0,  q1
        vld1.16         {q1}, [r5]!
        bne             4b // If we don't need to pad, just keep summing.
        b               3b // If we need to pad, check how many pixels we have left.

9:
        pop             {r4-r7,pc}
endfunc
    756 
// Instantiate the bitdepth-templated SGR functions from
// looprestoration_tmpl.S for 16 bpc.
sgr_funcs 16