tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

looprestoration.S (26504B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2019, Martin Storsjo
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "src/arm/asm.S"
     29 #include "util.S"
     30 
     31 const right_ext_mask_buf
               // 32 zero bytes followed by 32 0xff bytes. Loading a vector
               // starting at (right_ext_mask - N) yields a mask whose first N
               // bytes are 0x00 (vbit keeps the original lane) and whose
               // remaining bytes are 0xff (vbit inserts the padding pixel).
     32        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
     33        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
     34        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
     35        .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
     36 right_ext_mask:
     37        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
     38        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
     39        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
     40        .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
     41 endconst
     42 
     43 // void dav1d_wiener_filter_h_8bpc_neon(int16_t *dst, const pixel (*left)[4],
     44 //                                      const pixel *src, const int16_t fh[8],
     45 //                                      const int w,
     46 //                                      const enum LrEdgeFlags edges);
     47 function wiener_filter_h_8bpc_neon, export=1
               // Horizontal 7-tap Wiener filter pass, 8 bpc: produces one row
               // of 16-bit intermediates for the vertical pass.
               // Register roles after the prologue:
               //   r0 = dst, r1 = left (may be NULL), r2 = src,
               //   r4 = w, r5 = edges, q0 = filter taps fh[0..7].
     48        push            {r4-r5,lr}
     49        ldrd            r4,  r5,  [sp, #12]   // r4 = w, r5 = edges
     50        vld1.16         {q0},  [r3, :128]
               // q14/q15 = bias constants used below to squeeze the filtered
               // sum into the intermediate 16-bit range.
     51        movw            r12, #(1 << 14) - (1 << 2)
     52        vdup.16         q14, r12
     53        vmov.s16        q15, #2048
     54 
     55        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
     56        tst             r5,  #1 // LR_HAVE_LEFT
     57        beq             1f
     58        // LR_HAVE_LEFT
     59        cmp             r1,  #0
     60        bne             0f
     61        // left == NULL
               // The 3 edge pixels are readable directly in front of src.
     62        sub             r2,  r2,  #3
     63        vld1.8          {q2},  [r2]!
     64        b               2f
     65 
     66 0:
     67        // LR_HAVE_LEFT, left != NULL
     68        vld1.8          {q2},  [r2]!
               // Load the 4 left-edge pixels into the top half of d3.
     69        vld1.32         {d3[1]},  [r1]
     70        // Move r2 back to account for the last 3 bytes we loaded earlier,
     71        // which we'll shift out.
     72        sub             r2,  r2,  #3
               // Prepend the last 3 left-edge bytes to the 16 src bytes.
     73        vext.8          q2,  q1,  q2,  #13
     74        b               2f
     75 
     76 1:
     77        vld1.8          {q2},  [r2]!
     78        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
     79        // and shift q2 to have 3x the first byte at the front.
     80        vdup.8          q1,  d4[0]
     81        // Move r2 back to account for the last 3 bytes we loaded before,
     82        // which we shifted out.
     83        sub             r2,  r2,  #3
     84        vext.8          q2,  q1,  q2,  #13
     85 
     86 2:
               // Widen the 16 pixels to 16 bits: q1 = low half, q2 = high half.
     87        vmovl.u8        q1,  d4
     88        vmovl.u8        q2,  d5
     89 
     90        tst             r5,  #2 // LR_HAVE_RIGHT
     91        bne             4f
     92 
     93 3:      // !LR_HAVE_RIGHT
     94 
     95        // Check whether we need to pad the right edge
     96        cmp             r4,  #11
     97        bge             4f   // If w >= 11, all used input pixels are valid
     98 
     99        // 1 <= w < 11, w+3 pixels valid in q1-q2. For w=9 or w=10,
    100        // this ends up called again; it's not strictly needed in those
    101        // cases (we pad enough here), but keeping the code as simple as possible.
    102 
    103        // The padding pixel is q1/2.h[w+2]. r2 points at the next input, ie
    104        // q1/2.h[16]. Thus read from r2[w-14] to find the padding pixel.
    105        sub             r12, r4,  #14
    106        // Insert padding in q1/2.h[w+3] onwards; fuse the +3 (*2) into the
    107        // buffer pointer.
    108        movrel_local    r3,  right_ext_mask, -6
    109        ldrb            r12, [r2, r12]
    110        sub             r3,  r3,  r4,  lsl #1
    111        vdup.16         q13, r12
    112        vld1.8          {q10, q11}, [r3]
    113 
               // vbit: replace the masked lanes with the padding pixel.
    114        vbit            q1,  q13, q10
    115        vbit            q2,  q13, q11
    116 
    117 4:      // Loop horizontally
               // Build the 7 shifted input windows from the q1:q2 pair.
    118        vext.8          q10, q1,  q2,  #4
    119        vext.8          q11, q1,  q2,  #8
    120        vext.8          q9,  q1,  q2,  #2
    121        vext.8          q12, q1,  q2,  #10
    122        vext.8          q13, q1,  q2,  #12
    123        vext.8          q8,  q1,  q2,  #6
               // The filter is symmetric; sum taps equidistant from center
               // so only 4 distinct multiplies are needed.
    124        vadd.i16        q10, q10, q11
    125        vadd.i16        q9,  q9,  q12
    126        vadd.i16        q13, q13, q1
               // q1 = 128 * center pixel (the 1 << 7 term of the filter).
    127        vshl.s16        q1,  q8,  #7
    128        vmul.s16        q3,  q8,  d0[3]
    129        vmla.s16        q3,  q10, d1[0]
    130        vmla.s16        q3,  q9,  d1[1]
    131        vmla.s16        q3,  q13, d1[2]
    132 
               // Bias, saturating add, shift down by 3, re-center around 2048.
    133        vsub.s16        q1,  q1,  q14
    134        vqadd.s16       q3,  q3,  q1
    135        vshr.s16        q3,  q3,  #3
    136        vadd.s16        q3,  q3,  q15
    137        subs            r4,  r4,  #8
    138        vst1.16         {q3},  [r0,  :128]!
    139 
    140        ble             9f
               // Slide the window: keep the high 8 pixels, load 8 new ones.
    141        vmov            q1,  q2
    142        vld1.8          {d4},  [r2]!
    143        tst             r5,  #2 // LR_HAVE_RIGHT
    144        vmovl.u8        q2,  d4
    145        bne             4b // If we don't need to pad, just keep filtering.
    146        b               3b // If we need to pad, check how many pixels we have left.
    147 
    148 9:
    149        pop             {r4-r5,pc}
    150 endfunc
    151 
    152 // void dav1d_wiener_filter_v_8bpc_neon(pixel *dst, int16_t **ptrs,
    153 //                                      const int16_t fv[8], const int w);
    154 function wiener_filter_v_8bpc_neon, export=1
               // Vertical 7-tap Wiener filter over 6 buffered rows of 16-bit
               // intermediates; the implicit 7th row equals the 6th.
               // r0 = dst, r1 = ptrs (6 row pointers), r2 = fv (loaded into
               // q0), r3 = w.
    155        push            {r4-r9,lr}
    156        vpush           {q4-q6}
    157 
    158        vld1.16         {q0},  [r2, :128]
    159 
               // r4..r9 = ptrs[0..5], top to bottom row.
    160        ldrd            r4,  r5,  [r1]
    161        ldrd            r6,  r7,  [r1, #8]
    162        ldrd            r8,  r9,  [r1, #16]
    163 
    164 1:
    165        vld1.16         {q1,  q2},  [r4, :128]!
    166        vld1.16         {q8,  q9},  [r9, :128]!
    167 
    168        vld1.16         {q5,  q6},  [r5, :128]!
    169 
    170        vld1.16         {q10, q11}, [r6, :128]!
    171        vld1.16         {q12, q13}, [r8, :128]!
    172 
    173        vld1.16         {q14, q15}, [r7, :128]!
    174 
    175        subs            r3,  r3,  #16
    176 
               // Symmetric tap pairs. Both row0 and row1 are paired with the
               // bottom row (q8/q9), since the implicit row6 equals row5.
    177        vadd.i16        q1,  q1,  q8
    178        vadd.i16        q2,  q2,  q9
    179 
    180        vadd.i16        q5,  q5,  q8
    181        vadd.i16        q6,  q6,  q9
    182 
    183        vadd.i16        q10, q10, q12
    184        vadd.i16        q11, q11, q13
    185 
               // Widening multiply-accumulate of the 4 distinct taps;
               // q14/q15 hold the center row, 4 pixels per accumulator.
    186        vmull.s16       q3,  d28, d0[3]
    187        vmlal.s16       q3,  d2,  d0[0]
    188        vmlal.s16       q3,  d10, d0[1]
    189        vmlal.s16       q3,  d20, d0[2]
    190 
    191        vmull.s16       q4,  d29, d0[3]
    192        vmlal.s16       q4,  d3,  d0[0]
    193        vmlal.s16       q4,  d11, d0[1]
    194        vmlal.s16       q4,  d21, d0[2]
    195 
    196        vmull.s16       q8,  d30, d0[3]
    197        vmlal.s16       q8,  d4,  d0[0]
    198        vmlal.s16       q8,  d12, d0[1]
    199        vmlal.s16       q8,  d22, d0[2]
    200 
    201        vmull.s16       q9,  d31, d0[3]
    202        vmlal.s16       q9,  d5,  d0[0]
    203        vmlal.s16       q9,  d13, d0[1]
    204        vmlal.s16       q9,  d23, d0[2]
    205 
               // Round/shift down by 11, then narrow to 8 bits with unsigned
               // saturation, and store 16 output pixels.
    206        vqrshrun.s32    d6,  q3,  #11
    207        vqrshrun.s32    d7,  q4,  #11
    208        vqrshrun.s32    d16, q8,  #11
    209        vqrshrun.s32    d17, q9,  #11
    210        vqmovun.s16     d6,  q3
    211        vqmovun.s16     d7,  q8
    212        vst1.8          {q3}, [r0, :128]!
    213        bgt             1b
    214 
    215        // Shift the pointers, but only update the first 5; the 6th pointer is
    216        // kept as it was before (and the 7th is implicitly identical to the
    217        // 6th).
    218        ldrd            r4,  r5,  [r1, #4]
    219        ldrd            r6,  r7,  [r1, #12]
    220        ldr             r8,       [r1, #20]
    221        strd            r4,  r5,  [r1]
    222        strd            r6,  r7,  [r1, #8]
    223        str             r8,       [r1, #16]
    224 
    225        vpop            {q4-q6}
    226        pop             {r4-r9,pc}
    227 endfunc
    228 
    229 // void dav1d_wiener_filter_hv_8bpc_neon(pixel *dst, const pixel (*left)[4],
    230 //                                       const pixel *src,
    231 //                                       const int16_t filter[2][8],
    232 //                                       const int w,
    233 //                                       const enum LrEdgeFlags edges,
    234 //                                       int16_t **ptrs);
    235 function wiener_filter_hv_8bpc_neon, export=1
               // Fused horizontal+vertical Wiener pass: filters one new row
               // horizontally (same scheme as wiener_filter_h_8bpc_neon),
               // stores it into the row buffer ptrs[6], and simultaneously
               // runs the vertical filter over the buffered rows into dst.
               // q0 = horizontal taps, q1 = vertical taps; r4 = w,
               // r5 = edges, lr = ptrs (all from the stack).
    236        push            {r4-r11,lr}
    237        vpush           {q4-q7}
    238        ldrd            r4,  r5,  [sp, #100]
    239        ldr             lr,       [sp, #108]
    240        vld1.16         {q0, q1}, [r3, :128]
    241        movw            r12, #(1 << 14) - (1 << 2)
    242        vdup.16         q14, r12
    243        vmov.s16        q15, #2048
    244 
               // r6..r11 = ptrs[0..5] (input rows); r12 = ptrs[6], the
               // buffer that receives the newly filtered row.
    245        ldrd            r6,  r7,  [lr]
    246        ldrd            r8,  r9,  [lr, #8]
    247        ldrd            r10, r11, [lr, #16]
    248        ldr             r12,      [lr, #24]
    249 
    250        // Set up the src pointers to include the left edge, for LR_HAVE_LEFT, left == NULL
    251        tst             r5,  #1 // LR_HAVE_LEFT
    252        beq             1f
    253        // LR_HAVE_LEFT
    254        cmp             r1,  #0
    255        bne             0f
    256        // left == NULL
    257        sub             r2,  r2,  #3
    258        vld1.8          {q2},  [r2]!
    259        b               2f
    260 
    261 0:
    262        // LR_HAVE_LEFT, left != NULL
    263        vld1.8          {q2},  [r2]!
               // This clobbers d3 (fv[4..7] in q1); that's safe, as the
               // vertical MACs below only read d2 (the filter is symmetric).
    264        vld1.32         {d3[1]},  [r1]
    265        // Move r2 back to account for the last 3 bytes we loaded earlier,
    266        // which we'll shift out.
    267        sub             r2,  r2,  #3
    268        vext.8          q2,  q1,  q2,  #13
    269        b               2f
    270 
    271 1:
    272        vld1.8          {q2},  [r2]!
    273        // !LR_HAVE_LEFT, fill q3 with the leftmost byte
    274        // and shift q2 to have 3x the first byte at the front.
    275        vdup.8          q3,  d4[0]
    276        // Move r2 back to account for the last 3 bytes we loaded before,
    277        // which we shifted out.
    278        sub             r2,  r2,  #3
    279        vext.8          q2,  q3,  q2,  #13
    280 
    281 2:
               // Widen the 16 pixels to 16 bits: q2 = low half, q3 = high half.
    282        vmovl.u8        q3,  d5
    283        vmovl.u8        q2,  d4
    284 
    285        tst             r5,  #2 // LR_HAVE_RIGHT
    286        bne             4f
    287 
    288 3:      // !LR_HAVE_RIGHT
    289 
    290        // Check whether we need to pad the right edge
    291        cmp             r4,  #11
    292        bge             4f   // If w >= 11, all used input pixels are valid
    293 
    294        // 1 <= w < 11, w+3 pixels valid in q2-q3. For w=9 or w=10,
    295        // this ends up called again; it's not strictly needed in those
    296        // cases (we pad enough here), but keeping the code as simple as possible.
    297 
    298        // The padding pixel is q2/3.h[w+2]. r2 points at the next input, ie
    299        // q2/3.h[16]. Thus read from r2[w-14] to find the padding pixel.
    300        sub             lr,  r4,  #14
    301        // Insert padding in q2/3.h[w+3] onwards; fuse the +3 (*2) into the
    302        // buffer pointer.
    303        movrel_local    r3,  right_ext_mask, -6
    304        ldrb            lr,  [r2, lr]
    305        sub             r3,  r3,  r4,  lsl #1
    306        vdup.16         q13, lr
    307        vld1.8          {q10, q11}, [r3]
    308 
    309        vbit            q2,  q13, q10
    310        vbit            q3,  q13, q11
    311 
    312 4:      // Loop horizontally
               // Horizontal 7-tap filter with the 6 buffered-row loads
               // interleaved into the dependency chain for scheduling.
    313        vext.8          q10, q2,  q3,  #4
    314        vext.8          q11, q2,  q3,  #8
    315        vext.8          q9,  q2,  q3,  #2
    316        vext.8          q12, q2,  q3,  #10
    317        vext.8          q13, q2,  q3,  #12
    318        vext.8          q8,  q2,  q3,  #6
    319        vadd.i16        q10, q10, q11
    320        vadd.i16        q9,  q9,  q12
    321        vadd.i16        q13, q13, q2
    322        vld1.16         {q6},   [r7,  :128]!
    323        vshl.s16        q2,  q8,  #7
    324        vld1.16         {q11},  [r11, :128]!
    325        vsub.s16        q2,  q2,  q14
    326        vld1.16         {q7},   [r8,  :128]!
    327        vmul.s16        q4,  q8,  d0[3]
    328        vmla.s16        q4,  q10, d1[0]
    329        vmla.s16        q4,  q9,  d1[1]
    330        vmla.s16        q4,  q13, d1[2]
    331 
    332        vld1.16         {q10},  [r10, :128]!
    333        vqadd.s16       q4,  q4,  q2
    334 
    335        vld1.16         {q9},   [r9,  :128]!
    336        vshr.s16        q4,  q4,  #3
    337        vld1.16         {q5},   [r6,  :128]!
               // q4 is now the horizontally filtered row, in the same
               // intermediate format as wiener_filter_h_8bpc_neon's output.
    338        vadd.s16        q4,  q4,  q15
    339 
               // Vertical tap pairs: row1+row5, row2+row4, and row0+q4 (the
               // new row also serves as the implicit row6).
    340        vadd.s16        q6,  q6,  q11
    341        vadd.s16        q7,  q7,  q10
    342        vadd.s16        q5,  q5,  q4
    343 
               // Only d2 (fv[0..3]) is read; q9 holds the center row.
    344        vmull.s16       q8,  d18, d2[3]
    345        vmlal.s16       q8,  d12, d2[1]
    346        vmlal.s16       q8,  d14, d2[2]
    347        vmlal.s16       q8,  d10, d2[0]
    348 
    349        vmull.s16       q9,  d19, d2[3]
    350        vmlal.s16       q9,  d13, d2[1]
    351        vmlal.s16       q9,  d15, d2[2]
    352        vmlal.s16       q9,  d11, d2[0]
    353 
    354        vqrshrun.s32    d16, q8,  #11
    355        vqrshrun.s32    d17, q9,  #11
               // Store the filtered row into ptrs[6] for later vertical use.
    356        vst1.16         {q4},  [r12, :128]!
    357        vqmovun.s16     d16, q8
    358        subs            r4,  r4,  #8
    359        vst1.8          {d16}, [r0, :64]!
    360 
    361        ble             9f
               // Slide the window: keep the high 8 pixels, load 8 new ones.
    362        vmov            q2,  q3
    363        vld1.8          {d6},  [r2]!
    364        tst             r5,  #2 // LR_HAVE_RIGHT
    365        vmovl.u8        q3,  d6
    366        bne             4b // If we don't need to pad, just keep filtering.
    367        b               3b // If we need to pad, check how many pixels we have left.
    368 
    369 9:
    370        // Reload ptrs from arguments on the stack
    371        ldr             lr,       [sp, #108]
    372        // Rotate the window of pointers. Shift the 6 pointers downwards one step.
    373        ldrd            r6,  r7,  [lr, #4]
    374        ldrd            r8,  r9,  [lr, #12]
    375        ldrd            r10, r11, [lr, #20]
    376 
    377        strd            r6,  r7,  [lr]
    378        strd            r8,  r9,  [lr, #8]
    379        strd            r10, r11, [lr, #16]
    380        // The topmost pointer, ptrs[6], which isn't used as input, is set to
    381        // ptrs[0], which will be used as output for the next _hv call.
    382        // At the start of the filtering, the caller may set ptrs[6] to the
    383        // right next buffer to fill in, instead.
    384        str             r6,       [lr, #24]
    385 
    386        vpop            {q4-q7}
    387        pop             {r4-r11,pc}
    388 endfunc
    389 
    390 #include "looprestoration_tmpl.S"
    391 
    392 // void dav1d_sgr_box3_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
    393 //                                     const pixel (*left)[4],
    394 //                                     const pixel *src, const int w,
    395 //                                     const enum LrEdgeFlags edges);
    396 function sgr_box3_row_h_8bpc_neon, export=1
               // Horizontal 3-wide box sums for SGR: per output position,
               // sum = x0+x1+x2 (16 bit) and sumsq = x0^2+x1^2+x2^2 (32 bit).
    397        push            {r4-r5,lr}
    398        ldrd            r4,  r5,  [sp, #12]   // r4 = w, r5 = edges
    399        add             r4,  r4,  #2 // w += 2
    400 
    401        tst             r5,  #1 // LR_HAVE_LEFT
    402        beq             1f
    403        cmp             r2,  #0
    404        bne             0f
    405 
    406        // LR_HAVE_LEFT && left == NULL
    407        sub             r3,  r3,  #2
    408        vld1.8          {q0}, [r3]!
    409        b               2f
    410 
    411 0:
    412        // LR_HAVE_LEFT, left != NULL
    413        vld1.8          {q0},   [r3]!
    414        vld1.32         {d3[]}, [r2]
    415        // Move r3 back to account for the last 2 bytes we loaded earlier,
    416        // which we'll shift out.
    417        sub             r3,  r3,  #2
    418        vext.8          q0,  q1,  q0,  #14
    419        b               2f
    420 
    421 1:
    422        vld1.8          {q0},   [r3]!
    423        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
    424        // and shift q0 to have 2x the first byte at the front.
    425        vdup.8          q1,  d0[0]
    426        // Move r3 back to account for the last 2 bytes we loaded before,
    427        // which we shifted out.
    428        sub             r3,  r3,  #2
    429        vext.8          q0,  q1,  q0,  #14
    430 
    431 2:
               // Precompute the squared pixels, widened to 16 bit (q1:q2).
    432        vmull.u8        q1,  d0,  d0
    433        vmull.u8        q2,  d1,  d1
    434 
    435        tst             r5,  #2 // LR_HAVE_RIGHT
    436        bne             4f
    437        // If we'll need to pad the right edge, load that byte to pad with
    438        // here since we can find it pretty easily from here.
    439        sub             lr,  r4,  #(2 + 16 - 2 + 1)
    440        ldrb            lr,  [r3,  lr]
    441        // Fill q14 with the right padding pixel
    442        vdup.8          q14, lr
    443 3:      // !LR_HAVE_RIGHT
    444 
    445        // Check whether we need to pad the right edge
    446        cmp             r4,  #10
    447        bge             4f   // If w >= 10, all used input pixels are valid
    448 
    449        // 1 <= w < 10, w pixels valid in q0. For w=9, this ends up called
    450        // again; it's not strictly needed in those cases (we pad enough here),
    451        // but keeping the code as simple as possible.
    452 
    453        // Insert padding in q0.b[w] onwards
    454        movrel_local    lr,  right_ext_mask
    455        sub             lr,  lr,  r4
    456        vld1.8          {q13}, [lr]
    457 
    458        vbit            q0,  q14, q13
    459 
    460        // Update the precalculated squares
    461        vmull.u8        q1,  d0,  d0
    462        vmull.u8        q2,  d1,  d1
    463 
    464 4:      // Loop horizontally
               // Sum 3 adjacent pixels via byte-shifted windows, widening
               // while adding: q3 = x0 + x1 + x2 per lane.
    465        vext.8          d16, d0,  d1,  #1
    466        vext.8          d17, d0,  d1,  #2
    467        vaddl.u8        q3,  d0,  d16
    468        vext.8          q9,  q1,  q2,  #2
    469        vaddw.u8        q3,  q3,  d17
    470 
    471        vext.8          q10, q1,  q2,  #4
    472 
               // Same 3-wide sum over the 16-bit squares, into 32 bit.
    473        vaddl.u16       q12, d2,  d18
    474        vaddl.u16       q13, d3,  d19
    475        vaddw.u16       q12, q12, d20
    476        vaddw.u16       q13, q13, d21
    477 
    478        subs            r4,  r4,  #8
    479        vst1.16         {q3},       [r1,  :128]!
    480        vst1.32         {q12, q13}, [r0,  :128]!
    481 
    482        ble             9f
    483        tst             r5,  #2 // LR_HAVE_RIGHT
               // Slide the 16-pixel window by 8 and refresh the squares.
    484        vld1.8          {d6},  [r3]!
    485        vmov            q1,  q2
    486        vext.8          q0,  q0,  q3,  #8
    487        vmull.u8        q2,  d6,  d6
    488 
    489        bne             4b // If we don't need to pad, just keep summing.
    490        b               3b // If we need to pad, check how many pixels we have left.
    491 
    492 9:
    493        pop             {r4-r5,pc}
    494 endfunc
    495 
    496 // void dav1d_sgr_box5_row_h_8bpc_neon(int32_t *sumsq, int16_t *sum,
    497 //                                     const pixel (*left)[4],
    498 //                                     const pixel *src, const int w,
    499 //                                     const enum LrEdgeFlags edges);
    500 function sgr_box5_row_h_8bpc_neon, export=1
               // Horizontal 5-wide box sums for SGR: per output position,
               // sum = x0+..+x4 (16 bit) and sumsq = x0^2+..+x4^2 (32 bit).
    501        push            {r4-r5,lr}
    502        ldrd            r4,  r5,  [sp, #12]   // r4 = w, r5 = edges
    503        add             r4,  r4,  #2 // w += 2
    504 
    505        tst             r5,  #1 // LR_HAVE_LEFT
    506        beq             1f
    507        cmp             r2,  #0
    508        bne             0f
    509 
    510        // LR_HAVE_LEFT && left == NULL
    511        sub             r3,  r3,  #3
    512        vld1.8          {q0}, [r3]!
    513        b               2f
    514 
    515 0:
    516        // LR_HAVE_LEFT, left != NULL
    517        vld1.8          {q0},   [r3]!
    518        vld1.32         {d3[]}, [r2]
    519        // Move r3 back to account for the last 3 bytes we loaded earlier,
    520        // which we'll shift out.
    521        sub             r3,  r3,  #3
    522        vext.8          q0,  q1,  q0,  #13
    523        b               2f
    524 
    525 1:
    526        vld1.8          {q0},   [r3]!
    527        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
    528        // and shift q0 to have 3x the first byte at the front.
    529        vdup.8          q1,  d0[0]
    530        // Move r3 back to account for the last 3 bytes we loaded before,
    531        // which we shifted out.
    532        sub             r3,  r3,  #3
    533        vext.8          q0,  q1,  q0,  #13
    534 
    535 2:
               // Precompute the squared pixels, widened to 16 bit (q1:q2).
    536        vmull.u8        q1,  d0,  d0
    537        vmull.u8        q2,  d1,  d1
    538 
    539        tst             r5,  #2 // LR_HAVE_RIGHT
    540        bne             4f
    541        // If we'll need to pad the right edge, load that byte to pad with
    542        // here since we can find it pretty easily from here.
    543        sub             lr,  r4,  #(2 + 16 - 3 + 1)
    544        ldrb            lr,  [r3,  lr]
    545        // Fill q14 with the right padding pixel
    546        vdup.8          q14, lr
    547 3:      // !LR_HAVE_RIGHT
    548 
    549        // Check whether we need to pad the right edge
    550        cmp             r4,  #11
    551        bge             4f   // If w >= 11, all used input pixels are valid
    552 
    553        // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
    554        // this ends up called again; it's not strictly needed in those
    555        // cases (we pad enough here), but keeping the code as simple as possible.
    556 
    557        // Insert padding in q0.b[w+1] onwards; fuse the +1 into the
    558        // buffer pointer.
    559        movrel_local    lr,  right_ext_mask, -1
    560        sub             lr,  lr,  r4
    561        vld1.8          {q13}, [lr]
    562 
    563        vbit            q0,  q14, q13
    564 
    565        // Update the precalculated squares
    566        vmull.u8        q1,  d0,  d0
    567        vmull.u8        q2,  d1,  d1
    568 
    569 4:      // Loop horizontally
               // 5-wide pixel sum from byte-shifted windows of q0.
    570        vext.8          d16, d0,  d1,  #1
    571        vext.8          d17, d0,  d1,  #2
    572        vext.8          d18, d0,  d1,  #3
    573        vext.8          d19, d0,  d1,  #4
    574        vaddl.u8        q3,  d0,  d16
    575        vaddl.u8        q12, d17, d18
    576        vaddw.u8        q3,  q3,  d19
    577        vadd.u16        q3,  q3,  q12
    578 
               // 5-wide sum of squares from element-shifted windows of q1:q2.
    579        vext.8          q8,  q1,  q2,  #2
    580        vext.8          q9,  q1,  q2,  #4
    581        vext.8          q10, q1,  q2,  #6
    582        vext.8          q11, q1,  q2,  #8
    583        vaddl.u16       q12, d2,  d16
    584        vaddl.u16       q13, d3,  d17
    585        vaddl.u16       q8,  d18, d20
    586        vaddl.u16       q9,  d19, d21
    587        vaddw.u16       q12, q12, d22
    588        vaddw.u16       q13, q13, d23
    589        vadd.i32        q12, q12, q8
    590        vadd.i32        q13, q13, q9
    591 
    592        subs            r4,  r4,  #8
    593        vst1.16         {q3},       [r1,  :128]!
    594        vst1.32         {q12, q13}, [r0,  :128]!
    595 
    596        ble             9f
    597        tst             r5,  #2 // LR_HAVE_RIGHT
               // Slide the 16-pixel window by 8 and refresh the squares.
    598        vld1.8          {d6},  [r3]!
    599        vmov            q1,  q2
    600        vext.8          q0,  q0,  q3,  #8
    601        vmull.u8        q2,  d6,  d6
    602        bne             4b // If we don't need to pad, just keep summing.
    603        b               3b // If we need to pad, check how many pixels we have left.
    604 
    605 9:
    606        pop             {r4-r5,pc}
    607 endfunc
    608 
    609 // void dav1d_sgr_box35_row_h_8bpc_neon(int32_t *sumsq3, int16_t *sum3,
    610 //                                      int32_t *sumsq5, int16_t *sum5,
    611 //                                      const pixel (*left)[4],
    612 //                                      const pixel *src, const int w,
    613 //                                      const enum LrEdgeFlags edges);
    614 function sgr_box35_row_h_8bpc_neon, export=1
               // Combined 3-wide and 5-wide horizontal box sums for SGR,
               // sharing one pass over the input.
               // r0/r1 = sumsq3/sum3, r2/r3 = sumsq5/sum5,
               // r4 = left, r5 = src, r6 = w, r7 = edges (from the stack).
    615        push            {r4-r7,lr}
    616        ldrd            r4,  r5,  [sp, #20]
    617        ldrd            r6,  r7,  [sp, #28]
    618        add             r6,  r6,  #2 // w += 2
    619 
    620        tst             r7,  #1 // LR_HAVE_LEFT
    621        beq             1f
    622        cmp             r4,  #0
    623        bne             0f
    624 
    625        // LR_HAVE_LEFT && left == NULL
    626        sub             r5,  r5,  #3
    627        vld1.8          {q0}, [r5]!
    628        b               2f
    629 
    630 0:
    631        // LR_HAVE_LEFT, left != NULL
    632        vld1.8          {q0},   [r5]!
    633        vld1.32         {d3[]}, [r4]
    634        // Move r5 back to account for the last 3 bytes we loaded earlier,
    635        // which we'll shift out.
    636        sub             r5,  r5,  #3
    637        vext.8          q0,  q1,  q0,  #13
    638        b               2f
    639 
    640 1:
    641        vld1.8          {q0},   [r5]!
    642        // !LR_HAVE_LEFT, fill q1 with the leftmost byte
    643        // and shift q0 to have 3x the first byte at the front.
    644        vdup.8          q1,  d0[0]
    645        // Move r5 back to account for the last 3 bytes we loaded before,
    646        // which we shifted out.
    647        sub             r5,  r5,  #3
    648        vext.8          q0,  q1,  q0,  #13
    649 
    650 2:
               // Precompute the squared pixels, widened to 16 bit (q1:q2).
    651        vmull.u8        q1,  d0,  d0
    652        vmull.u8        q2,  d1,  d1
    653 
    654        tst             r7,  #2 // LR_HAVE_RIGHT
    655        bne             4f
    656        // If we'll need to pad the right edge, load that byte to pad with
    657        // here since we can find it pretty easily from here.
    658        sub             lr,  r6,  #(2 + 16 - 3 + 1)
    659        ldrb            lr,  [r5,  lr]
    660        // Fill q14 with the right padding pixel
    661        vdup.8          q14, lr
    662 3:      // !LR_HAVE_RIGHT
    663 
    664        // Check whether we need to pad the right edge
    665        cmp             r6,  #11
    666        bge             4f   // If w >= 11, all used input pixels are valid
    667 
    668        // 1 <= w < 11, w+1 pixels valid in q0. For w=9 or w=10,
    669        // this ends up called again; it's not strictly needed in those
    670        // cases (we pad enough here), but keeping the code as simple as possible.
    671 
    672        // Insert padding in q0.b[w+1] onwards; fuse the +1 into the
    673        // buffer pointer.
    674        movrel_local    lr,  right_ext_mask, -1
    675        sub             lr,  lr,  r6
    676        vld1.8          {q13}, [lr]
    677 
    678        vbit            q0,  q14, q13
    679 
    680        // Update the precalculated squares
    681        vmull.u8        q1,  d0,  d0
    682        vmull.u8        q2,  d1,  d1
    683 
    684 4:      // Loop horizontally
    685        vext.8          d16, d0,  d1,  #1
    686        vext.8          d17, d0,  d1,  #2
    687        vext.8          d18, d0,  d1,  #3
    688        vext.8          d19, d0,  d1,  #4
               // q3 = x1+x2+x3 (the 3-wide sum); q12 = x0+x4, the extra
               // pixels needed to extend it to the 5-wide sum.
    689        vaddl.u8        q3,  d16, d17
    690        vaddl.u8        q12, d0,  d19
    691        vaddw.u8        q3,  q3,  d18
    692 
    693        vext.8          q8,  q1,  q2,  #2
    694        vext.8          q9,  q1,  q2,  #4
    695        vext.8          q10, q1,  q2,  #6
    696        vext.8          q11, q1,  q2,  #8
    697 
               // Store the 3-wide sum, then extend it to the 5-wide sum.
    698        vst1.16         {q3},       [r1,  :128]!
    699        vadd.u16        q3,  q3,  q12
    700 
               // q12/q13 = x1^2+x2^2+x3^2 (3-wide); q8/q9 = x0^2+x4^2.
    701        vaddl.u16       q12, d16, d18
    702        vaddl.u16       q13, d17, d19
    703        vaddl.u16       q8,  d2,  d22
    704        vaddl.u16       q9,  d3,  d23
    705        vaddw.u16       q12, q12, d20
    706        vaddw.u16       q13, q13, d21
    707 
               // Store the 3-wide sumsq, then extend to the 5-wide sumsq.
    708        vst1.32         {q12, q13}, [r0,  :128]!
    709        vadd.i32        q12, q12, q8
    710        vadd.i32        q13, q13, q9
    711 
    712        subs            r6,  r6,  #8
    713        vst1.16         {q3},       [r3,  :128]!
    714        vst1.32         {q12, q13}, [r2,  :128]!
    715 
    716        ble             9f
    717        tst             r7,  #2 // LR_HAVE_RIGHT
               // Slide the 16-pixel window by 8 and refresh the squares.
    718        vld1.8          {d6},  [r5]!
    719        vmov            q1,  q2
    720        vext.8          q0,  q0,  q3,  #8
    721        vmull.u8        q2,  d6,  d6
    722        bne             4b // If we don't need to pad, just keep summing.
    723        b               3b // If we need to pad, check how many pixels we have left.
    724 
    725 9:
    726        pop             {r4-r7,pc}
    727 endfunc
    728 
               // Instantiate the remaining 8 bpc SGR functions from the
               // template included above (looprestoration_tmpl.S).
    729 sgr_funcs 8