tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pixman-arma64-neon-asm.h (45160B)


      1 /*
      2 * Copyright © 2009 Nokia Corporation
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21 * DEALINGS IN THE SOFTWARE.
     22 *
     23 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
     24 */
     25 
     26 /*
     27 * This file contains a macro ('generate_composite_function') which can
     28 * construct 2D image processing functions, based on a common template.
     29 * Any combinations of source, destination and mask images with 8bpp,
     30 * 16bpp, 24bpp, 32bpp color formats are supported.
     31 *
     32 * This macro takes care of:
     33 *  - handling of leading and trailing unaligned pixels
     34 *  - doing most of the work related to L2 cache preload
     35 *  - encourages the use of software pipelining for better instructions
     36 *    scheduling
     37 *
     38 * The user of this macro has to provide some configuration parameters
     39 * (bit depths for the images, prefetch distance, etc.) and a set of
     40 * macros, which should implement basic code chunks responsible for
     41 * pixels processing. See 'pixman-armv8-neon-asm.S' file for the usage
     42 * examples.
     43 *
     44 * TODO:
     45 *  - try overlapped pixel method (from Ian Rickards) when processing
     46 *    exactly two blocks of pixels
     47 *  - maybe add an option to do reverse scanline processing
     48 */
     49 
     50 #include "pixman-arm-asm.h"
     51 
     52 /*
     53 * Bit flags for 'generate_composite_function' macro which are used
     54 * to tune generated functions behavior.
     55 */
.set FLAG_DST_WRITEONLY,       0 /* destination pixels are only written */
.set FLAG_DST_READWRITE,       1 /* destination pixels are read back and combined */
.set FLAG_DEINTERLEAVE_32BPP,  2 /* split 32bpp pixels into per-channel planes (ld4/st4) */
     59 
     60 /*
     61 * Constants for selecting preferable prefetch type.
     62 */
     63 .set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
     64 .set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
     65 .set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
     66 
     67 /*
     68 * prefetch mode
     69 * available modes are:
     70 * pldl1keep
     71 * pldl1strm
     72 * pldl2keep
     73 * pldl2strm
     74 * pldl3keep
     75 * pldl3strm
     76 */
     77 #define PREFETCH_MODE pldl1keep
     78 
     79 /*
     80 * Definitions of supplementary pixld/pixst macros (for partial load/store of
     81 * pixel data).
     82 */
     83 
/*
 * Load or store one 64-bit vector register of pixel data and
 * post-increment the address register by 8 bytes.
 * \abits is accepted for interface uniformity with the other
 * pixldst* helpers but is unused here.
 */
.macro pixldst1 op, elem_size, reg1, mem_operand, abits
   \op {v\()\reg1\().\()\elem_size}, [\()\mem_operand\()], #8
.endm
     87 
/*
 * Load or store two consecutive 64-bit vector registers of pixel data
 * and post-increment the address register by 16 bytes.
 * \abits is accepted for interface uniformity but unused here.
 */
.macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
   \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size}, [\()\mem_operand\()], #16
.endm
     91 
/*
 * Load or store four consecutive 64-bit vector registers of pixel data
 * and post-increment the address register by 32 bytes.
 * \abits is accepted for interface uniformity but unused here.
 */
.macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
   \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size, v\()\reg4\().\()\elem_size}, [\()\mem_operand\()], #32
.endm
     95 
/*
 * Load or store a single element (lane \idx of v\reg1) and
 * post-increment the address register by \bytes (the element size in
 * bytes). Used for partial, possibly unaligned, pixel transfers.
 * \abits is accepted for interface uniformity but unused here.
 */
.macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits, bytes
   \op {v\()\reg1\().\()\elem_size}[\idx], [\()\mem_operand\()], #\()\bytes\()
.endm
     99 
/*
 * Load or store three 64-bit vector registers (for ld3/st3 style
 * 3-channel transfers, i.e. eight 24bpp pixels) and post-increment
 * the address register by 24 bytes.
 */
.macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
   \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}, [\()\mem_operand\()], #24
.endm
    103 
/*
 * Load or store lane \idx of three vector registers (ld3/st3 single
 * structure form) and post-increment the address register by 3 bytes,
 * i.e. exactly one 24bpp pixel.
 */
.macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
   \op {v\()\reg1\().\()\elem_size, v\()\reg2\().\()\elem_size, v\()\reg3\().\()\elem_size}[\idx], [\()\mem_operand\()], #3
.endm
    107 
/*
 * Generic partial load/store dispatcher: transfers \numbytes bytes of
 * pixel data starting at register v(\basereg + offset), selecting the
 * widest transfer form available for the requested size.
 *
 * Register slot convention (matches the other pixld/pixst macros):
 *   32 bytes -> v(\basereg+4)..v(\basereg+7)
 *   16 bytes -> v(\basereg+2)..v(\basereg+3)
 *    8 bytes -> v(\basereg+1)
 *   <8 bytes -> individual lanes of v(\basereg+0)
 *
 * For sub-8-byte transfers the element size used depends on
 * RESPECT_STRICT_ALIGNMENT: when strict alignment must be respected
 * and \elem_size is smaller than the transfer, the transfer is split
 * into multiple narrower single-lane accesses.
 */
.macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
.if \numbytes == 32
   .if \elem_size==32
       pixldst4 \op, 2s, %(\basereg+4), %(\basereg+5), \
                             %(\basereg+6), %(\basereg+7), \mem_operand, \abits
   .elseif \elem_size==16
       pixldst4 \op, 4h, %(\basereg+4), %(\basereg+5), \
                             %(\basereg+6), %(\basereg+7), \mem_operand, \abits
   .else
       pixldst4 \op, 8b, %(\basereg+4), %(\basereg+5), \
                             %(\basereg+6), %(\basereg+7), \mem_operand, \abits
   .endif
.elseif \numbytes == 16
   .if \elem_size==32
         pixldst2 \op, 2s, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
   .elseif \elem_size==16
         pixldst2 \op, 4h, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
   .else
         pixldst2 \op, 8b, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
   .endif
.elseif \numbytes == 8
   .if \elem_size==32
       pixldst1 \op, 2s, %(\basereg+1), \mem_operand, \abits
   .elseif \elem_size==16
       pixldst1 \op, 4h, %(\basereg+1), \mem_operand, \abits
   .else
       pixldst1 \op, 8b, %(\basereg+1), \mem_operand, \abits
   .endif
.elseif \numbytes == 4
   .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
       pixldst0 \op, s, %(\basereg+0), 1, \mem_operand, \abits, 4
   .elseif \elem_size == 16
       pixldst0 \op, h, %(\basereg+0), 2, \mem_operand, \abits, 2
       pixldst0 \op, h, %(\basereg+0), 3, \mem_operand, \abits, 2
   .else
       pixldst0 \op, b, %(\basereg+0), 4, \mem_operand, \abits, 1
       pixldst0 \op, b, %(\basereg+0), 5, \mem_operand, \abits, 1
       pixldst0 \op, b, %(\basereg+0), 6, \mem_operand, \abits, 1
       pixldst0 \op, b, %(\basereg+0), 7, \mem_operand, \abits, 1
   .endif
.elseif \numbytes == 2
   .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
       pixldst0 \op, h, %(\basereg+0), 1, \mem_operand, \abits, 2
   .else
       pixldst0 \op, b, %(\basereg+0), 2, \mem_operand, \abits, 1
       pixldst0 \op, b, %(\basereg+0), 3, \mem_operand, \abits, 1
   .endif
.elseif \numbytes == 1
       pixldst0 \op, b, %(\basereg+0), 1, \mem_operand, \abits, 1
.else
   .error "unsupported size: \numbytes"
.endif
.endm
    161 
/*
 * Load \numpix pixels of \bpp bits each from \mem_operand into the
 * register block starting at \basereg (post-incrementing the pointer).
 *
 * Special cases:
 *  - 8 x 32bpp with DEINTERLEAVE_32BPP_ENABLED uses ld4 so the four
 *    channels land in separate (planar) registers;
 *  - 24bpp pixels always use ld3 forms (whole-register for 8 pixels,
 *    single-lane for 4/2/1) since 3-byte pixels cannot use plain ld1;
 *  - everything else falls through to the generic pixldst dispatcher.
 * A bpp of 0 means "no such image channel" and emits nothing.
 */
.macro pixld numpix, bpp, basereg, mem_operand, abits=0
.if \bpp > 0
.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   pixldst4 ld4, 8b, %(\basereg+4), %(\basereg+5), \
                     %(\basereg+6), %(\basereg+7), \mem_operand, \abits
.elseif (\bpp == 24) && (\numpix == 8)
   pixldst3 ld3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
.elseif (\bpp == 24) && (\numpix == 4)
   pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
   pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
   pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
   pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
.elseif (\bpp == 24) && (\numpix == 2)
   pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
   pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
.elseif (\bpp == 24) && (\numpix == 1)
   pixldst30 ld3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
.else
   pixldst %(\numpix * \bpp / 8), ld1, %(\bpp), \basereg, \mem_operand, \abits
.endif
.endif
.endm
    184 
/*
 * Store \numpix pixels of \bpp bits each to \mem_operand from the
 * register block starting at \basereg (post-incrementing the pointer).
 * Mirror of 'pixld'; the same 32bpp-deinterleave (st4) and 24bpp (st3)
 * special cases apply. The extra \abits == 32/16 branches force a
 * fixed-width st1 form for small aligned stores. A bpp of 0 emits
 * nothing.
 */
.macro pixst numpix, bpp, basereg, mem_operand, abits=0
.if \bpp > 0
.if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
   pixldst4 st4, 8b, %(\basereg+4), %(\basereg+5), \
                     %(\basereg+6), %(\basereg+7), \mem_operand, \abits
.elseif (\bpp == 24) && (\numpix == 8)
   pixldst3 st3, 8b, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
.elseif (\bpp == 24) && (\numpix == 4)
   pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
   pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
   pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
   pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
.elseif (\bpp == 24) && (\numpix == 2)
   pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
   pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
.elseif (\bpp == 24) && (\numpix == 1)
   pixldst30 st3, b, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
.elseif \numpix * \bpp == 32 && \abits == 32
   pixldst 4, st1, 32, \basereg, \mem_operand, \abits
.elseif \numpix * \bpp == 16 && \abits == 16
   pixldst 2, st1, 16, \basereg, \mem_operand, \abits
.else
   pixldst %(\numpix * \bpp / 8), st1, %(\bpp), \basereg, \mem_operand, \abits
.endif
.endif
.endm
    211 
/*
 * Aligned variant of 'pixld': passes the total transfer size in bits
 * as the alignment hint (\abits), capped at 128. Use only when the
 * source pointer is known to be suitably aligned.
 */
.macro pixld_a numpix, bpp, basereg, mem_operand
.if (\bpp * \numpix) <= 128
   pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
.else
   pixld \numpix, \bpp, \basereg, \mem_operand, 128
.endif
.endm
    219 
/*
 * Aligned variant of 'pixst': passes the total transfer size in bits
 * as the alignment hint (\abits), capped at 128. Use only when the
 * destination pointer is known to be suitably aligned.
 */
.macro pixst_a numpix, bpp, basereg, mem_operand
.if (\bpp * \numpix) <= 128
   pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
.else
   pixst \numpix, \bpp, \basereg, \mem_operand, 128
.endif
.endm
    227 
    228 /*
    229 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
    230 * aliases to be defined)
    231 */
/*
 * Nearest-neighbour scaled fetch of one 64-bit register worth of
 * pixels (four 16-bit or two 32-bit elements) from \mem_operand.
 *
 * For each element: the pixel index is VX >> 16 (16.16 fixed point),
 * then VX is advanced by UNIT_X. The 5:/55: mini-loop wraps VX back
 * into range by repeatedly subtracting SRC_WIDTH_FIXED whenever the
 * add overflows past the source width (presumably implementing the
 * NORMAL repeat mode -- confirm against callers).
 *
 * Address computation and the dependent ld1 loads are interleaved
 * (TMP1/TMP2 alternate) to hide load latency; do not reorder.
 * Requires TMP1, TMP2, VX, UNIT_X and SRC_WIDTH_FIXED register
 * aliases to be defined by the caller.
 */
.macro pixld1_s elem_size, reg1, mem_operand
.if \elem_size == 16
   asr     TMP1, VX, #16
   adds    VX, VX, UNIT_X
   bmi     55f
5:
   subs    VX, VX, SRC_WIDTH_FIXED
   bpl     5b
55:
   add     TMP1, \mem_operand, TMP1, lsl #1    /* lsl #1: 2 bytes per 16bpp pixel */
   asr     TMP2, VX, #16
   adds    VX, VX, UNIT_X
   bmi     55f
5:
   subs    VX, VX, SRC_WIDTH_FIXED
   bpl     5b
55:
   add     TMP2, \mem_operand, TMP2, lsl #1
   ld1     {v\()\reg1\().h}[0], [TMP1]
   asr     TMP1, VX, #16
   adds    VX, VX, UNIT_X
   bmi     55f
5:
   subs    VX, VX, SRC_WIDTH_FIXED
   bpl     5b
55:
   add     TMP1, \mem_operand, TMP1, lsl #1
   ld1     {v\()\reg1\().h}[1], [TMP2]
   asr     TMP2, VX, #16
   adds    VX, VX, UNIT_X
   bmi     55f
5:
   subs    VX, VX, SRC_WIDTH_FIXED
   bpl     5b
55:
   add     TMP2, \mem_operand, TMP2, lsl #1
   ld1     {v\()\reg1\().h}[2], [TMP1]
   ld1     {v\()\reg1\().h}[3], [TMP2]
.elseif \elem_size == 32
   asr     TMP1, VX, #16
   adds    VX, VX, UNIT_X
   bmi     55f
5:
   subs    VX, VX, SRC_WIDTH_FIXED
   bpl     5b
55:
   add     TMP1, \mem_operand, TMP1, lsl #2    /* lsl #2: 4 bytes per 32bpp pixel */
   asr     TMP2, VX, #16
   adds    VX, VX, UNIT_X
   bmi     55f
5:
   subs    VX, VX, SRC_WIDTH_FIXED
   bpl     5b
55:
   add     TMP2, \mem_operand, TMP2, lsl #2
   ld1     {v\()\reg1\().s}[0], [TMP1]
   ld1     {v\()\reg1\().s}[1], [TMP2]
.else
   .error "unsupported"
.endif
.endm
    293 
/*
 * Nearest-neighbour scaled fetch of two registers worth of pixels.
 * The '.if 0' branch is deliberately disabled legacy code (a 32-bit
 * ARM style variant using 'mov ..., asr' operand syntax, kept for
 * reference only); the live path simply invokes pixld1_s twice.
 */
.macro pixld2_s elem_size, reg1, reg2, mem_operand
.if 0 /* \elem_size == 32 */
   mov     TMP1, VX, asr #16
   add     VX, VX, UNIT_X, asl #1
   add     TMP1, \mem_operand, TMP1, asl #2
   mov     TMP2, VX, asr #16
   sub     VX, VX, UNIT_X
   add     TMP2, \mem_operand, TMP2, asl #2
   ld1     {v\()\reg1\().s}[0], [TMP1]
   mov     TMP1, VX, asr #16
   add     VX, VX, UNIT_X, asl #1
   add     TMP1, \mem_operand, TMP1, asl #2
   ld1     {v\()\reg2\().s}[0], [TMP2, :32]
   mov     TMP2, VX, asr #16
   add     VX, VX, UNIT_X
   add     TMP2, \mem_operand, TMP2, asl #2
   ld1     {v\()\reg1\().s}[1], [TMP1]
   ld1     {v\()\reg2\().s}[1], [TMP2]
.else
   pixld1_s \elem_size, \reg1, \mem_operand
   pixld1_s \elem_size, \reg2, \mem_operand
.endif
.endm
    317 
/*
 * Nearest-neighbour scaled fetch of a single element into lane \idx
 * of v\reg1. Same VX stepping/wrapping scheme as pixld1_s.
 * NOTE(review): the 32-bit path shifts into DUMMY and then moves to
 * TMP1 rather than asr-ing into TMP1 directly as the 16-bit path
 * does -- presumably a register-width/encoding constraint; confirm
 * before changing.
 */
.macro pixld0_s elem_size, reg1, idx, mem_operand
.if \elem_size == 16
   asr     TMP1, VX, #16
   adds    VX, VX, UNIT_X
   bmi     55f
5:
   subs    VX, VX, SRC_WIDTH_FIXED
   bpl     5b
55:
   add     TMP1, \mem_operand, TMP1, lsl #1    /* 2 bytes per 16bpp pixel */
   ld1     {v\()\reg1\().h}[\idx], [TMP1]
.elseif \elem_size == 32
   asr     DUMMY, VX, #16
   mov     TMP1, DUMMY
   adds    VX, VX, UNIT_X
   bmi     55f
5:
   subs    VX, VX, SRC_WIDTH_FIXED
   bpl     5b
55:
   add     TMP1, \mem_operand, TMP1, lsl #2    /* 4 bytes per 32bpp pixel */
   ld1     {v\()\reg1\().s}[\idx], [TMP1]
.endif
.endm
    342 
/*
 * Scaled-fetch dispatcher: transfers \numbytes bytes of pixel data
 * using the pixld*_s helpers, with the same register slot layout as
 * the generic 'pixldst' macro. Only the 32-byte case additionally
 * deinterleaves the fetched pixels (starting at \basereg+4).
 */
.macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
.if \numbytes == 32
   pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
   pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
   pixdeinterleave \elem_size, %(\basereg+4)
.elseif \numbytes == 16
   pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
.elseif \numbytes == 8
   pixld1_s \elem_size, %(\basereg+1), \mem_operand
.elseif \numbytes == 4
   .if \elem_size == 32
       pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
   .elseif \elem_size == 16
       pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
       pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
   .else
       pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
       pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
       pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
       pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
   .endif
.elseif \numbytes == 2
   .if \elem_size == 16
       pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
   .else
       pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
       pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
   .endif
.elseif \numbytes == 1
   pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
.else
   .error "unsupported size: \numbytes"
.endif
.endm
    377 
/*
 * Scaled-fetch entry point: converts a pixel count + bpp into a byte
 * count and delegates to pixld_s_internal. A bpp of 0 emits nothing.
 */
.macro pixld_s numpix, bpp, basereg, mem_operand
.if \bpp > 0
   pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
.endif
.endm
    383 
/*
 * In-place 8-bit unzip of two D registers (even elements -> \reg1,
 * odd elements -> \reg2), emulating the ARM32 VUZP two-operand form.
 * v16 is borrowed as scratch: its low 64 bits are spilled to DUMMY
 * first and restored afterwards, so v16 is preserved.
 */
.macro vuzp8 reg1, reg2
   umov DUMMY, v16.d[0]
   uzp1 v16.8b,          v\()\reg1\().8b, v\()\reg2\().8b
   uzp2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
   mov  v\()\reg1\().8b, v16.8b
   mov  v16.d[0], DUMMY
.endm
    391 
/*
 * In-place 8-bit zip (interleave) of two D registers, emulating the
 * ARM32 VZIP two-operand form. As in vuzp8, v16 is used as scratch
 * and preserved via DUMMY.
 */
.macro vzip8 reg1, reg2
   umov DUMMY, v16.d[0]
   zip1 v16.8b,          v\()\reg1\().8b, v\()\reg2\().8b
   zip2 v\()\reg2\().8b, v\()\reg1\().8b, v\()\reg2\().8b
   mov  v\()\reg1\().8b, v16.8b
   mov  v16.d[0], DUMMY
.endm
    399 
    400 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
    401 .macro pixdeinterleave bpp, basereg
    402 .if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    403    vuzp8 %(\basereg+0), %(\basereg+1)
    404    vuzp8 %(\basereg+2), %(\basereg+3)
    405    vuzp8 %(\basereg+1), %(\basereg+3)
    406    vuzp8 %(\basereg+0), %(\basereg+2)
    407 .endif
    408 .endm
    409 
    410 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
    411 .macro pixinterleave bpp, basereg
    412 .if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    413    vzip8 %(\basereg+0), %(\basereg+2)
    414    vzip8 %(\basereg+1), %(\basereg+3)
    415    vzip8 %(\basereg+2), %(\basereg+3)
    416    vzip8 %(\basereg+0), %(\basereg+1)
    417 .endif
    418 .endm
    419 
    420 /*
    421 * This is a macro for implementing cache preload. The main idea is that
    422 * cache preload logic is mostly independent from the rest of pixels
    423 * processing code. It starts at the top left pixel and moves forward
    424 * across pixels and can jump across scanlines. Prefetch distance is
    425 * handled in an 'incremental' way: it starts from 0 and advances to the
    426 * optimal distance over time. After reaching optimal prefetch distance,
    427 * it is kept constant. There are some checks which prevent prefetching
    428 * unneeded pixel lines below the image (but it still can prefetch a bit
    429 * more data on the right side of the image - not a big issue and may
    430 * be actually helpful when rendering text glyphs). Additional trick is
    431 * the use of LDR instruction for prefetch instead of PLD when moving to
    432 * the next line, the point is that we have a high chance of getting TLB
    433 * miss in this case, and PLD would be useless.
    434 *
    435 * This sounds like it may introduce a noticeable overhead (when working with
    436 * fully cached data). But in reality, due to having a separate pipeline and
    437 * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
    438 * execute simultaneously with NEON and be completely shadowed by it. Thus
    439 * we get no performance overhead at all (*). This looks like a very nice
    440 * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
    441 * but still can implement some rather advanced prefetch logic in software
    442 * for almost zero cost!
    443 *
    444 * (*) The overhead of the prefetcher is visible when running some trivial
    445 * pixels processing like simple copy. Anyway, having prefetch is a must
    446 * when working with the graphics data.
    447 */
/*
 * Conditional-emit wrapper: assembles instruction '\a \x' only when
 * the function being generated uses the ADVANCED prefetch type,
 * otherwise emits nothing. Lets the prefetch logic be written inline
 * without cluttering non-prefetching builds.
 */
.macro PF a, x:vararg
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
   \a \x
.endif
.endm
    453 
/*
 * Advanced prefetch step (see the large comment above for the overall
 * design). Advances the prefetch pixel index PF_X by \std_increment,
 * plus \boost_increment while the low nibble of PF_CTL is non-zero
 * (the "ramp-up" phase that grows the prefetch distance), then issues
 * prfm hints for the source, destination and mask scanlines at PF_X.
 *
 * When PF_X passes the scanline width (ORIG_W), PF_X wraps and the
 * per-scanline pointers advance by one stride; the ldrsb loads touch
 * the new scanline so its TLB entry is populated (a prfm would be
 * dropped on a TLB miss -- see the note above). The high bits of
 * PF_CTL count remaining scanlines; prefetching stops when exhausted.
 * All emitted instructions go through PF, so this whole macro is a
 * no-op unless ADVANCED prefetch is selected.
 */
.macro cache_preload std_increment, boost_increment
.if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
.if \std_increment != 0
   PF add, PF_X, PF_X, #\std_increment
.endif
   PF tst, PF_CTL, #0xF
   PF beq, 71f
   PF add, PF_X, PF_X, #\boost_increment
   PF sub, PF_CTL, PF_CTL, #1
71:
   PF cmp, PF_X, ORIG_W
.if src_bpp_shift >= 0
   PF lsl, DUMMY, PF_X, #src_bpp_shift
   PF prfm, PREFETCH_MODE, [PF_SRC, DUMMY]
.endif
.if dst_r_bpp != 0
   PF lsl, DUMMY, PF_X, #dst_bpp_shift
   PF prfm, PREFETCH_MODE, [PF_DST, DUMMY]
.endif
.if mask_bpp_shift >= 0
   PF lsl, DUMMY, PF_X, #mask_bpp_shift
   PF prfm, PREFETCH_MODE, [PF_MASK, DUMMY]
.endif
   PF ble, 72f
   PF sub, PF_X, PF_X, ORIG_W
   PF subs, PF_CTL, PF_CTL, #0x10
   PF ble, 72f
.if src_bpp_shift >= 0
   PF add, PF_SRC, PF_SRC, SRC_STRIDE, lsl #src_bpp_shift
   PF ldrsb, DUMMY, [PF_SRC]
.endif
.if dst_r_bpp != 0
   PF add, PF_DST, PF_DST, DST_STRIDE, lsl #dst_bpp_shift
   PF ldrsb, DUMMY, [PF_DST]
.endif
.if mask_bpp_shift >= 0
   PF add, PF_MASK, PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift
   PF ldrsb, DUMMY, [PF_MASK]
.endif
72:
.endif
.endm
    496 
/*
 * Simple prefetch: issues one prfm hint per image channel at a fixed
 * distance (PREFETCH_DISTANCE_SIMPLE pixels) ahead of the current
 * position. Emits nothing unless SIMPLE prefetch is selected.
 */
.macro cache_preload_simple
.if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
.if src_bpp > 0
   prfm PREFETCH_MODE, [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
.endif
.if dst_r_bpp > 0
   prfm PREFETCH_MODE, [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
.endif
.if mask_bpp > 0
   prfm PREFETCH_MODE, [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
.endif
.endif
.endm
    510 
/*
 * Load one full pixel block of mask data from MASK. The base register
 * is offset downwards so that the loaded data ends exactly at
 * mask_basereg (mirrors fetch_src_pixblock defined further below).
 */
.macro fetch_mask_pixblock
   pixld       pixblock_size, mask_bpp, \
               (mask_basereg - pixblock_size * mask_bpp / 64), MASK
.endm
    515 
    516 /*
    517 * Macro which is used to process leading pixels until destination
    518 * pointer is properly aligned (at 16 bytes boundary). When destination
    519 * buffer uses 16bpp format, this is unnecessary, or even pointless.
    520 */
/*
 * Process leading pixels one power-of-two chunk at a time until the
 * destination pointer reaches a 16-byte boundary (skipped entirely
 * for 24bpp destinations, where alignment cannot be reached this way).
 *
 * First pass: for each alignment bit of DST_R that is set, load the
 * corresponding number of src/mask/dst pixels (or just advance DST_R
 * for write-only destinations), bump the prefetch index and shrink W.
 * The pixels are then composited once via head/tail, and a second,
 * symmetric pass stores them while aligning DST_W. When there are no
 * input channels at all (pure fill), W is adjusted in the store pass
 * instead.
 */
.macro ensure_destination_ptr_alignment process_pixblock_head, \
                                       process_pixblock_tail, \
                                       process_pixblock_tail_head
.if dst_w_bpp != 24
   tst         DST_R, #0xF
   beq         52f                      /* already 16-byte aligned */

.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
.irp lowbit, 1, 2, 4, 8, 16

.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
.if \lowbit < 16 /* we don't need more than 16-byte alignment */
   tst         DST_R, #\lowbit
   beq         51f
.endif
   pixld_src   (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
   pixld       (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
.if dst_r_bpp > 0
   pixld_a     (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
.else
   add         DST_R, DST_R, #\lowbit   /* write-only dst: just skip ahead */
.endif
   PF add,     PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
   sub         W, W, #(\lowbit * 8 / dst_w_bpp)
51:
.endif
.endr
.endif
   pixdeinterleave src_bpp, src_basereg
   pixdeinterleave mask_bpp, mask_basereg
   pixdeinterleave dst_r_bpp, dst_r_basereg

   \process_pixblock_head
   cache_preload 0, pixblock_size
   cache_preload_simple
   \process_pixblock_tail

   pixinterleave dst_w_bpp, dst_w_basereg

.irp lowbit, 1, 2, 4, 8, 16
.if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
.if \lowbit < 16 /* we don't need more than 16-byte alignment */
   tst         DST_W, #\lowbit
   beq         51f
.endif
.if src_bpp == 0 && mask_bpp == 0 && dst_r_bpp == 0
   sub         W, W, #(\lowbit * 8 / dst_w_bpp)
.endif
   pixst_a     (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
51:
.endif
.endr
.endif
52:
.endm
    576 
    577 /*
    578 * Special code for processing up to (pixblock_size - 1) remaining
    579 * trailing pixels. As SIMD processing performs operation on
    580 * pixblock_size pixels, anything smaller than this has to be loaded
    581 * and stored in a special way. Loading and storing of pixel data is
    582 * performed in such a way that we fill some 'slots' in the NEON
    583 * registers (some slots naturally are unused), then perform compositing
    584 * operation as usual. In the end, the data is taken from these 'slots'
    585 * and saved to memory.
    586 *
    587 * cache_preload_flag - allows to suppress prefetch if
    588 *                      set to 0
    589 * dst_aligned_flag   - selects whether destination buffer
    590 *                      is aligned
    591 */
/*
 * Process the up-to-(pixblock_size - 1) trailing pixels of a scanline
 * (see the comment above for the slot-filling scheme).
 *
 * Loads the remaining pixels into register lanes by walking the set
 * bits of W from the largest chunk (16) down to 1, runs one head/tail
 * composite pass over the partially filled block, then stores the
 * same chunks back out. \cache_preload_flag gates PF_X updates and
 * the cache_preload call; \dst_aligned_flag selects the aligned
 * (pixld_a/pixst_a) or unaligned destination access forms.
 */
.macro process_trailing_pixels cache_preload_flag, \
                              dst_aligned_flag, \
                              process_pixblock_head, \
                              process_pixblock_tail, \
                              process_pixblock_tail_head
   tst         W, #(pixblock_size - 1)
   beq         52f                      /* no trailing pixels */
.if src_bpp > 0 || mask_bpp > 0 || dst_r_bpp > 0
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > \chunk_size
   tst         W, #\chunk_size
   beq         51f
   pixld_src   \chunk_size, src_bpp, src_basereg, SRC
   pixld       \chunk_size, mask_bpp, mask_basereg, MASK
.if \dst_aligned_flag != 0
   pixld_a     \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.else
   pixld       \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
.endif
.if \cache_preload_flag != 0
   PF add,     PF_X, PF_X, #\chunk_size
.endif
51:
.endif
.endr
.endif
   pixdeinterleave src_bpp, src_basereg
   pixdeinterleave mask_bpp, mask_basereg
   pixdeinterleave dst_r_bpp, dst_r_basereg

   \process_pixblock_head
.if \cache_preload_flag != 0
   cache_preload 0, pixblock_size
   cache_preload_simple
.endif
   \process_pixblock_tail
   pixinterleave dst_w_bpp, dst_w_basereg
.irp chunk_size, 16, 8, 4, 2, 1
.if pixblock_size > \chunk_size
   tst         W, #\chunk_size
   beq         51f
.if \dst_aligned_flag != 0
   pixst_a     \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.else
   pixst       \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
.endif
51:
.endif
.endr
52:
.endm
    643 
    644 /*
    645 * Macro, which performs all the needed operations to switch to the next
    646 * scanline and start the next loop iteration unless all the scanlines
    647 * are already processed.
    648 */
/*
 * Advance all image pointers to the start of the next scanline,
 * restore the pixel counter W, decrement the scanline counter H and
 * branch back to \start_of_loop_label while rows remain.
 *
 * The pointers were advanced by W pixels during processing, so each
 * is rewound by W << bpp_shift before the stride is applied.
 * NOTE(review): the rewind is skipped for 24bpp channels, where a
 * power-of-two shift cannot express the 3-byte pixel size -- the
 * 24bpp pointer bookkeeping is presumably handled by the caller;
 * confirm against the generated function bodies.
 */
.macro advance_to_next_scanline start_of_loop_label
   mov         W, ORIG_W
   add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
.if src_bpp != 0
   add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
.endif
.if mask_bpp != 0
   add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
.endif
.if (dst_w_bpp != 24)
   sub         DST_W, DST_W, W, lsl #dst_bpp_shift
.endif
.if (src_bpp != 24) && (src_bpp != 0)
   sub         SRC, SRC, W, lsl #src_bpp_shift
.endif
.if (mask_bpp != 24) && (mask_bpp != 0)
   sub         MASK, MASK, W, lsl #mask_bpp_shift
.endif
   subs        H, H, #1
   mov         DST_R, DST_W             /* read pointer follows write pointer */
   bge         \start_of_loop_label
.endm
    671 
    672 /*
    673 * Registers are allocated in the following way by default:
    674 * v0, v1, v2, v3     - reserved for loading source pixel data
    675 * v4, v5, v6, v7     - reserved for loading destination pixel data
    676 * v24, v25, v26, v27 - reserved for loading mask pixel data
    677 * v28, v29, v30, v31 - final destination pixel data for writeback to memory
    678 */
    679 .macro generate_composite_function fname, \
    680                                   src_bpp_, \
    681                                   mask_bpp_, \
    682                                   dst_w_bpp_, \
    683                                   flags, \
    684                                   pixblock_size_, \
    685                                   prefetch_distance, \
    686                                   init, \
    687                                   cleanup, \
    688                                   process_pixblock_head, \
    689                                   process_pixblock_tail, \
    690                                   process_pixblock_tail_head, \
    691                                   dst_w_basereg_ = 28, \
    692                                   dst_r_basereg_ = 4, \
    693                                   src_basereg_   = 0, \
    694                                   mask_basereg_  = 24
    695 
    696    pixman_asm_function \fname
    697    stp         x29, x30, [sp, -16]!
    698    mov         x29, sp
    699    sub         sp,   sp, 232  /* push all registers */
    700    sub         x29, x29, 64
    701    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], #32
    702    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], #32
    703    stp          x8,   x9, [x29, -80]
    704    stp         x10,  x11, [x29, -96]
    705    stp         x12,  x13, [x29, -112]
    706    stp         x14,  x15, [x29, -128]
    707    stp         x16,  x17, [x29, -144]
    708    stp         x18,  x19, [x29, -160]
    709    stp         x20,  x21, [x29, -176]
    710    stp         x22,  x23, [x29, -192]
    711    stp         x24,  x25, [x29, -208]
    712    stp         x26,  x27, [x29, -224]
    713    str         x28, [x29, -232]
    714 
    715 /*
    716 * Select prefetch type for this function. If prefetch distance is
    717 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
    718 * has to be used instead of ADVANCED.
    719 */
    720    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
    721 .if \prefetch_distance == 0
    722    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
    723 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
    724        ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
    725    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
    726 .endif
    727 
    728 /*
    729 * Make some macro arguments globally visible and accessible
    730 * from other macros
    731 */
    732    .set src_bpp, \src_bpp_
    733    .set mask_bpp, \mask_bpp_
    734    .set dst_w_bpp, \dst_w_bpp_
    735    .set pixblock_size, \pixblock_size_
    736    .set dst_w_basereg, \dst_w_basereg_
    737    .set dst_r_basereg, \dst_r_basereg_
    738    .set src_basereg, \src_basereg_
    739    .set mask_basereg, \mask_basereg_
    740 
    741    .macro pixld_src x:vararg
    742        pixld \x
    743    .endm
    744    .macro fetch_src_pixblock
    745        pixld_src   pixblock_size, src_bpp, \
    746                    (src_basereg - pixblock_size * src_bpp / 64), SRC
    747    .endm
    748 /*
    749 * Assign symbolic names to registers
    750 */
    751    W           .req       x0      /* width (is updated during processing) */
    752    H           .req       x1      /* height (is updated during processing) */
    753    DST_W       .req       x2      /* destination buffer pointer for writes */
    754    DST_STRIDE  .req       x3      /* destination image stride */
    755    SRC         .req       x4      /* source buffer pointer */
    756    SRC_STRIDE  .req       x5      /* source image stride */
    757    MASK        .req       x6      /* mask pointer */
    758    MASK_STRIDE .req       x7      /* mask stride */
    759 
    760    DST_R       .req       x8      /* destination buffer pointer for reads */
    761 
    762    PF_CTL      .req       x9      /* combined lines counter and prefetch */
    763                                    /* distance increment counter */
    764    PF_X        .req       x10     /* pixel index in a scanline for current */
                                   /* prefetch position */
    766    PF_SRC      .req       x11     /* pointer to source scanline start */
    767                                    /* for prefetch purposes */
    768    PF_DST      .req       x12     /* pointer to destination scanline start */
    769                                    /* for prefetch purposes */
    770    PF_MASK     .req       x13     /* pointer to mask scanline start */
    771                                    /* for prefetch purposes */
    772 
    773    ORIG_W      .req       x14     /* saved original width */
    774    DUMMY       .req       x15     /* temporary register */
    775 
    776    sxtw        x0, w0
    777    sxtw        x1, w1
    778    sxtw        x3, w3
    779    sxtw        x5, w5
    780    sxtw        x7, w7
    781 
    782    .set mask_bpp_shift, -1
    783 .if src_bpp == 32
    784    .set src_bpp_shift, 2
    785 .elseif src_bpp == 24
    786    .set src_bpp_shift, 0
    787 .elseif src_bpp == 16
    788    .set src_bpp_shift, 1
    789 .elseif src_bpp == 8
    790    .set src_bpp_shift, 0
    791 .elseif src_bpp == 0
    792    .set src_bpp_shift, -1
    793 .else
    794    .error "requested src bpp (src_bpp) is not supported"
    795 .endif
    796 .if mask_bpp == 32
    797    .set mask_bpp_shift, 2
    798 .elseif mask_bpp == 24
    799    .set mask_bpp_shift, 0
    800 .elseif mask_bpp == 8
    801    .set mask_bpp_shift, 0
    802 .elseif mask_bpp == 0
    803    .set mask_bpp_shift, -1
    804 .else
    805    .error "requested mask bpp (mask_bpp) is not supported"
    806 .endif
    807 .if dst_w_bpp == 32
    808    .set dst_bpp_shift, 2
    809 .elseif dst_w_bpp == 24
    810    .set dst_bpp_shift, 0
    811 .elseif dst_w_bpp == 16
    812    .set dst_bpp_shift, 1
    813 .elseif dst_w_bpp == 8
    814    .set dst_bpp_shift, 0
    815 .else
    816    .error "requested dst bpp (dst_w_bpp) is not supported"
    817 .endif
    818 
    819 .if (((\flags) & FLAG_DST_READWRITE) != 0)
    820    .set dst_r_bpp, dst_w_bpp
    821 .else
    822    .set dst_r_bpp, 0
    823 .endif
    824 .if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    825    .set DEINTERLEAVE_32BPP_ENABLED, 1
    826 .else
    827    .set DEINTERLEAVE_32BPP_ENABLED, 0
    828 .endif
    829 
    830 .if \prefetch_distance < 0 || \prefetch_distance > 15
    831    .error "invalid prefetch distance (\prefetch_distance)"
    832 .endif
    833 
    834    PF mov,     PF_X, #0
    835    mov         DST_R, DST_W
    836 
    837 .if src_bpp == 24
    838    sub         SRC_STRIDE, SRC_STRIDE, W
    839    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
    840 .endif
    841 .if mask_bpp == 24
    842    sub         MASK_STRIDE, MASK_STRIDE, W
    843    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
    844 .endif
    845 .if dst_w_bpp == 24
    846    sub         DST_STRIDE, DST_STRIDE, W
    847    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
    848 .endif
    849 
    850 /*
    851 * Setup advanced prefetcher initial state
    852 */
    853    PF mov,     PF_SRC, SRC
    854    PF mov,     PF_DST, DST_R
    855    PF mov,     PF_MASK, MASK
    856    /* PF_CTL = \prefetch_distance | ((h - 1) << 4) */
    857    PF lsl,     PF_CTL, H, #4
    858    PF add,     PF_CTL, PF_CTL, #(\prefetch_distance - 0x10)
    859 
    860    \init
    861    subs        H, H, #1
    862    mov         ORIG_W, W
    863    blt         9f
    864    cmp         W, #(pixblock_size * 2)
    865    blt         800f
    866 /*
* This is the start of the pipelined loop, which is optimized for
    868 * long scanlines
    869 */
    870 0:
    871    ensure_destination_ptr_alignment \process_pixblock_head, \
    872                                     \process_pixblock_tail, \
    873                                     \process_pixblock_tail_head
    874 
    875    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    876    pixld_a     pixblock_size, dst_r_bpp, \
    877                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    878    fetch_src_pixblock
    879    pixld       pixblock_size, mask_bpp, \
    880                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    881    PF add,     PF_X, PF_X, #pixblock_size
    882    \process_pixblock_head
    883    cache_preload 0, pixblock_size
    884    cache_preload_simple
    885    subs        W, W, #(pixblock_size * 2)
    886    blt         200f
    887 
    888 100:
    889    \process_pixblock_tail_head
    890    cache_preload_simple
    891    subs        W, W, #pixblock_size
    892    bge         100b
    893 
    894 200:
    895    \process_pixblock_tail
    896    pixst_a     pixblock_size, dst_w_bpp, \
    897                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
    898 
    899    /* Process the remaining trailing pixels in the scanline */
    900    process_trailing_pixels 1, 1, \
    901                            \process_pixblock_head, \
    902                            \process_pixblock_tail, \
    903                            \process_pixblock_tail_head
    904    advance_to_next_scanline 0b
    905 
    906    \cleanup
    907 1000:
    908    /* pop all registers */
    909    sub         x29, x29, 64
    910    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    911    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    912    ldp          x8,   x9, [x29, -80]
    913    ldp         x10,  x11, [x29, -96]
    914    ldp         x12,  x13, [x29, -112]
    915    ldp         x14,  x15, [x29, -128]
    916    ldp         x16,  x17, [x29, -144]
    917    ldp         x18,  x19, [x29, -160]
    918    ldp         x20,  x21, [x29, -176]
    919    ldp         x22,  x23, [x29, -192]
    920    ldp         x24,  x25, [x29, -208]
    921    ldp         x26,  x27, [x29, -224]
    922    ldr         x28, [x29, -232]
    923    mov         sp, x29
    924    ldp         x29, x30, [sp], 16
    925    VERIFY_LR
    926    ret  /* exit */
    927 /*
    928 * This is the start of the loop, designed to process images with small width
    929 * (less than pixblock_size * 2 pixels). In this case neither pipelining
    930 * nor prefetch are used.
    931 */
    932 800:
    933 .if src_bpp_shift >= 0
    934    PF lsl, DUMMY, SRC_STRIDE, #src_bpp_shift
    935    PF prfm, PREFETCH_MODE, [SRC, DUMMY]
    936 .endif
    937 .if dst_r_bpp != 0
    938    PF lsl,  DUMMY, DST_STRIDE, #dst_bpp_shift
    939    PF prfm, PREFETCH_MODE, [DST_R, DUMMY]
    940 .endif
    941 .if mask_bpp_shift >= 0
    942    PF lsl,  DUMMY, MASK_STRIDE, #mask_bpp_shift
    943    PF prfm, PREFETCH_MODE, [MASK, DUMMY]
    944 .endif
    945    /* Process exactly pixblock_size pixels if needed */
    946    tst         W, #pixblock_size
    947    beq         100f
    948    pixld       pixblock_size, dst_r_bpp, \
    949                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    950    fetch_src_pixblock
    951    pixld       pixblock_size, mask_bpp, \
    952                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    953    \process_pixblock_head
    954    \process_pixblock_tail
    955    pixst       pixblock_size, dst_w_bpp, \
    956                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
    957 100:
    958    /* Process the remaining trailing pixels in the scanline */
    959    process_trailing_pixels 0, 0, \
    960                            \process_pixblock_head, \
    961                            \process_pixblock_tail, \
    962                            \process_pixblock_tail_head
    963    advance_to_next_scanline 800b
    964 9:
    965    \cleanup
    966    /* pop all registers */
    967    sub         x29, x29, 64
    968    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
    969    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
    970    ldp          x8,   x9, [x29, -80]
    971    ldp         x10,  x11, [x29, -96]
    972    ldp         x12,  x13, [x29, -112]
    973    ldp         x14,  x15, [x29, -128]
    974    ldp         x16,  x17, [x29, -144]
    975    ldp         x18,  x19, [x29, -160]
    976    ldp         x20,  x21, [x29, -176]
    977    ldp         x22,  x23, [x29, -192]
    978    ldp         x24,  x25, [x29, -208]
    979    ldp         x26,  x27, [x29, -224]
    980    ldr         x28, [x29, -232]
    981    mov         sp, x29
    982    ldp         x29, x30, [sp], 16
    983    VERIFY_LR
    984    ret  /* exit */
    985 
    986    .purgem     fetch_src_pixblock
    987    .purgem     pixld_src
    988 
    989    .unreq      SRC
    990    .unreq      MASK
    991    .unreq      DST_R
    992    .unreq      DST_W
    993    .unreq      ORIG_W
    994    .unreq      W
    995    .unreq      H
    996    .unreq      SRC_STRIDE
    997    .unreq      DST_STRIDE
    998    .unreq      MASK_STRIDE
    999    .unreq      PF_CTL
   1000    .unreq      PF_X
   1001    .unreq      PF_SRC
   1002    .unreq      PF_DST
   1003    .unreq      PF_MASK
   1004    .unreq      DUMMY
   1005    pixman_end_asm_function
   1006 .endm
   1007 
   1008 /*
   1009 * A simplified variant of function generation template for a single
   1010 * scanline processing (for implementing pixman combine functions)
   1011 */
   1012 .macro generate_composite_function_scanline        use_nearest_scaling, \
   1013                                                   fname, \
   1014                                                   src_bpp_, \
   1015                                                   mask_bpp_, \
   1016                                                   dst_w_bpp_, \
   1017                                                   flags, \
   1018                                                   pixblock_size_, \
   1019                                                   init, \
   1020                                                   cleanup, \
   1021                                                   process_pixblock_head, \
   1022                                                   process_pixblock_tail, \
   1023                                                   process_pixblock_tail_head, \
   1024                                                   dst_w_basereg_ = 28, \
   1025                                                   dst_r_basereg_ = 4, \
   1026                                                   src_basereg_   = 0, \
   1027                                                   mask_basereg_  = 24
   1028 
   1029    pixman_asm_function \fname
   1030    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
   1031 
   1032 /*
   1033 * Make some macro arguments globally visible and accessible
   1034 * from other macros
   1035 */
   1036    .set src_bpp, \src_bpp_
   1037    .set mask_bpp, \mask_bpp_
   1038    .set dst_w_bpp, \dst_w_bpp_
   1039    .set pixblock_size, \pixblock_size_
   1040    .set dst_w_basereg, \dst_w_basereg_
   1041    .set dst_r_basereg, \dst_r_basereg_
   1042    .set src_basereg, \src_basereg_
   1043    .set mask_basereg, \mask_basereg_
   1044 
   1045 .if \use_nearest_scaling != 0
   1046    /*
   1047     * Assign symbolic names to registers for nearest scaling
   1048     */
   1049    W           .req        x0
   1050    DST_W       .req        x1
   1051    SRC         .req        x2
   1052    VX          .req        x3
   1053    UNIT_X      .req        x4
   1054    SRC_WIDTH_FIXED .req    x5
   1055    MASK        .req        x6
   1056    TMP1        .req        x8
   1057    TMP2        .req        x9
   1058    DST_R       .req        x10
   1059    DUMMY       .req        x30
   1060 
   1061    .macro pixld_src x:vararg
   1062        pixld_s \x
   1063    .endm
   1064 
   1065    sxtw        x0, w0
   1066    sxtw        x3, w3
   1067    sxtw        x4, w4
   1068    sxtw        x5, w5
   1069 
   1070    stp         x29, x30, [sp, -16]!
   1071    mov         x29, sp
   1072    sub         sp, sp, 88
   1073    sub         x29, x29, 64
   1074    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
   1075    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
   1076    stp         x8, x9, [x29, -80]
   1077    str         x10, [x29, -88]
   1078 .else
   1079    /*
   1080     * Assign symbolic names to registers
   1081     */
   1082    W           .req        x0      /* width (is updated during processing) */
   1083    DST_W       .req        x1      /* destination buffer pointer for writes */
   1084    SRC         .req        x2      /* source buffer pointer */
   1085    MASK        .req        x3      /* mask pointer */
   1086    DST_R       .req        x4      /* destination buffer pointer for reads */
   1087    DUMMY       .req        x30
   1088 
   1089    .macro pixld_src x:vararg
   1090        pixld \x
   1091    .endm
   1092 
   1093    sxtw        x0, w0
   1094 
   1095    stp         x29, x30, [sp, -16]!
   1096    mov         x29, sp
   1097    sub         sp, sp, 64
   1098    sub         x29, x29, 64
   1099    st1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
   1100    st1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
   1101 .endif
   1102 
   1103 .if (((\flags) & FLAG_DST_READWRITE) != 0)
   1104    .set dst_r_bpp, dst_w_bpp
   1105 .else
   1106    .set dst_r_bpp, 0
   1107 .endif
   1108 .if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
   1109    .set DEINTERLEAVE_32BPP_ENABLED, 1
   1110 .else
   1111    .set DEINTERLEAVE_32BPP_ENABLED, 0
   1112 .endif
   1113 
   1114    .macro fetch_src_pixblock
   1115        pixld_src   pixblock_size, src_bpp, \
   1116                    (src_basereg - pixblock_size * src_bpp / 64), SRC
   1117    .endm
   1118 
   1119    \init
   1120    mov         DST_R, DST_W
   1121 
   1122    cmp         W, #pixblock_size
   1123    blt         800f
   1124 
   1125    ensure_destination_ptr_alignment \process_pixblock_head, \
   1126                                     \process_pixblock_tail, \
   1127                                     \process_pixblock_tail_head
   1128 
   1129    subs        W, W, #pixblock_size
   1130    blt         700f
   1131 
   1132    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
   1133    pixld_a     pixblock_size, dst_r_bpp, \
   1134                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
   1135    fetch_src_pixblock
   1136    pixld       pixblock_size, mask_bpp, \
   1137                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   1138    \process_pixblock_head
   1139    subs        W, W, #pixblock_size
   1140    blt         200f
   1141 100:
   1142    \process_pixblock_tail_head
   1143    subs        W, W, #pixblock_size
   1144    bge         100b
   1145 200:
   1146    \process_pixblock_tail
   1147    pixst_a     pixblock_size, dst_w_bpp, \
   1148                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
   1149 700:
   1150    /* Process the remaining trailing pixels in the scanline (dst aligned) */
   1151    process_trailing_pixels 0, 1, \
   1152                            \process_pixblock_head, \
   1153                            \process_pixblock_tail, \
   1154                            \process_pixblock_tail_head
   1155 
   1156    \cleanup
   1157 .if \use_nearest_scaling != 0
   1158    sub         x29, x29, 64
   1159    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
   1160    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
   1161    ldp         x8, x9, [x29, -80]
   1162    ldr         x10, [x29, -96]
   1163    mov         sp, x29
   1164    ldp         x29, x30, [sp], 16
   1165    VERIFY_LR
   1166    ret  /* exit */
   1167 .else
   1168    sub         x29, x29, 64
   1169    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
   1170    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
   1171    mov         sp, x29
   1172    ldp         x29, x30, [sp], 16
   1173    VERIFY_LR
   1174    ret  /* exit */
   1175 .endif
   1176 800:
   1177    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
   1178    process_trailing_pixels 0, 0, \
   1179                            \process_pixblock_head, \
   1180                            \process_pixblock_tail, \
   1181                            \process_pixblock_tail_head
   1182 
   1183    \cleanup
   1184 .if \use_nearest_scaling != 0
   1185    sub         x29, x29, 64
   1186    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
   1187    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
   1188    ldp         x8, x9, [x29, -80]
   1189    ldr         x10, [x29, -88]
   1190    mov         sp, x29
   1191    ldp         x29, x30, [sp], 16
   1192    VERIFY_LR
   1193    ret  /* exit */
   1194 
   1195    .unreq      DUMMY
   1196    .unreq      DST_R
   1197    .unreq      SRC
   1198    .unreq      W
   1199    .unreq      VX
   1200    .unreq      UNIT_X
   1201    .unreq      TMP1
   1202    .unreq      TMP2
   1203    .unreq      DST_W
   1204    .unreq      MASK
   1205    .unreq      SRC_WIDTH_FIXED
   1206 
   1207 .else
   1208    sub         x29, x29, 64
   1209    ld1         {v8.8b, v9.8b, v10.8b, v11.8b}, [x29], 32
   1210    ld1         {v12.8b, v13.8b, v14.8b, v15.8b}, [x29], 32
   1211    mov          sp, x29
   1212    ldp          x29, x30, [sp], 16
   1213    VERIFY_LR
   1214    ret  /* exit */
   1215 
   1216    .unreq      DUMMY
   1217    .unreq      SRC
   1218    .unreq      MASK
   1219    .unreq      DST_R
   1220    .unreq      DST_W
   1221    .unreq      W
   1222 .endif
   1223 
   1224    .purgem     fetch_src_pixblock
   1225    .purgem     pixld_src
   1226 
   1227    pixman_end_asm_function
   1228 .endm
   1229 
   1230 .macro generate_composite_function_single_scanline x:vararg
   1231    generate_composite_function_scanline 0, \x
   1232 .endm
   1233 
   1234 .macro generate_composite_function_nearest_scanline x:vararg
   1235    generate_composite_function_scanline 1, \x
   1236 .endm
   1237 
   1238 /* Default prologue/epilogue, nothing special needs to be done */
   1239 
.macro default_init
   /* Intentionally empty: no extra prologue work needed */
.endm
   1242 
.macro default_cleanup
   /* Intentionally empty: no extra epilogue work needed */
.endm
   1245 
   1246 /*
   1247 * Prologue/epilogue variant which additionally saves/restores v8-v15
   1248 * registers (they need to be saved/restored by callee according to ABI).
   1249 * This is required if the code needs to use all the NEON registers.
   1250 */
   1251 
.macro default_init_need_all_regs
   /* Empty body: the generate_composite_function* templates already save
    * v8-v15 unconditionally in their prologues, so no extra work remains */
.endm
   1254 
.macro default_cleanup_need_all_regs
   /* Empty body: the generate_composite_function* templates already restore
    * v8-v15 unconditionally in their epilogues, so no extra work remains */
.endm
   1257 
   1258 /******************************************************************************/
   1259 
   1260 /*
* Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
   1262 * into a planar a8r8g8b8 format (with a, r, g, b color components
   1263 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
   1264 *
   1265 * Warning: the conversion is destructive and the original
   1266 *          value (in) is lost.
   1267 */
.macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
   shrn        \()\out_r\().8b, \()\in\().8h,    #8  /* red (bits 15..11) into top of each byte */
   shrn        \()\out_g\().8b, \()\in\().8h,    #3  /* green (bits 10..5) into top of each byte */
   sli         \()\in\().8h,    \()\in\().8h,    #5  /* copy blue (bits 4..0) up to bits 9..5; destroys original value */
   movi        \()\out_a\().8b, #255                 /* fully opaque alpha */
   sri         \()\out_r\().8b, \()\out_r\().8b, #5  /* replicate red top bits into low bits (5 -> 8 bit expand) */
   sri         \()\out_g\().8b, \()\out_g\().8b, #6  /* replicate green top bits into low bits (6 -> 8 bit expand) */
   shrn        \()\out_b\().8b, \()\in\().8h,    #2  /* blue with its top bits replicated below (5 -> 8 bit expand) */
.endm
   1277 
/* Same as convert_0565_to_8888, but without producing the alpha component */
.macro convert_0565_to_x888 in, out_r, out_g, out_b
   shrn        \()\out_r\().8b, \()\in\().8h,    #8  /* red (bits 15..11) into top of each byte */
   shrn        \()\out_g\().8b, \()\in\().8h,    #3  /* green (bits 10..5) into top of each byte */
   sli         \()\in\().8h,    \()\in\().8h,    #5  /* copy blue (bits 4..0) up to bits 9..5; destroys original value */
   sri         \()\out_r\().8b, \()\out_r\().8b, #5  /* replicate red top bits into low bits (5 -> 8 bit expand) */
   sri         \()\out_g\().8b, \()\out_g\().8b, #6  /* replicate green top bits into low bits (6 -> 8 bit expand) */
   shrn        \()\out_b\().8b, \()\in\().8h,    #2  /* blue with its top bits replicated below (5 -> 8 bit expand) */
.endm
   1286 
   1287 /*
* Conversion from planar x8r8g8b8 format (with r, g, b color components
* in 64-bit registers in_r, in_g, in_b respectively) into 8 r5g6b5
   1290 * pixels packed in 128-bit register (out). Requires two temporary 128-bit
   1291 * registers (tmp1, tmp2)
   1292 */
.macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
   ushll       \()\tmp1\().8h, \()\in_g\().8b, #7   /* widen green; ushll shift is limited to 7, */
   shl         \()\tmp1\().8h, \()\tmp1\().8h, #1   /*   so shift once more to get green << 8 */
   ushll       \()\out\().8h,  \()\in_r\().8b, #7   /* red << 8, in the same two steps */
   shl         \()\out\().8h,  \()\out\().8h,  #1
   ushll       \()\tmp2\().8h, \()\in_b\().8b, #7   /* blue << 8, in the same two steps */
   shl         \()\tmp2\().8h, \()\tmp2\().8h, #1
   sri         \()\out\().8h, \()\tmp1\().8h, #5    /* insert green top 6 bits below the red 5 (bits 10..5) */
   sri         \()\out\().8h, \()\tmp2\().8h, #11   /* insert blue top 5 bits at the bottom (bits 4..0) */
.endm
   1303 
   1304 /*
   1305 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
   1306 * returned in (out0, out1) registers pair. Requires one temporary
   1307 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
   1308 * value from 'in' is lost
   1309 */
.macro convert_four_0565_to_x888_packed in, out0, out1, tmp
   shl         \()\out0\().4h, \()\in\().4h,   #5  /* G top 6 bits */
   shl         \()\tmp\().4h,  \()\in\().4h,   #11 /* B top 5 bits */
   sri         \()\in\().4h,   \()\in\().4h,   #5  /* R expanded to 8 bits in top half */
   sri         \()\out0\().4h, \()\out0\().4h, #6  /* G expanded to 8 bits in top half */
   sri         \()\tmp\().4h,  \()\tmp\().4h,  #5  /* B expanded to 8 bits in top half */
   ushr        \()\out1\().4h, \()\in\().4h,   #8  /* R moved down into its byte lane */
   sri         \()\out0\().4h, \()\tmp\().4h,  #8  /* combine G (high byte) and B (low byte) */
   zip1        \()\tmp\().4h,  \()\out0\().4h, \()\out1\().4h  /* interleave GB/0R halfwords: pixels 0-1 */
   zip2        \()\out1\().4h, \()\out0\().4h, \()\out1\().4h  /* pixels 2-3 */
   mov         \()\out0\().d[0], \()\tmp\().d[0]   /* move pixels 0-1 into out0 */
.endm