tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pixman-arm-neon-asm.h (40489B)


      1 /*
      2 * Copyright © 2009 Nokia Corporation
      3 *
      4 * Permission is hereby granted, free of charge, to any person obtaining a
      5 * copy of this software and associated documentation files (the "Software"),
      6 * to deal in the Software without restriction, including without limitation
      7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      8 * and/or sell copies of the Software, and to permit persons to whom the
      9 * Software is furnished to do so, subject to the following conditions:
     10 *
     11 * The above copyright notice and this permission notice (including the next
     12 * paragraph) shall be included in all copies or substantial portions of the
     13 * Software.
     14 *
     15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     21 * DEALINGS IN THE SOFTWARE.
     22 *
     23 * Author:  Siarhei Siamashka (siarhei.siamashka@nokia.com)
     24 */
     25 
     26 /*
     27 * This file contains a macro ('generate_composite_function') which can
     28 * construct 2D image processing functions, based on a common template.
     29 * Any combinations of source, destination and mask images with 8bpp,
     30 * 16bpp, 24bpp, 32bpp color formats are supported.
     31 *
     32 * This macro takes care of:
     33 *  - handling of leading and trailing unaligned pixels
     34 *  - doing most of the work related to L2 cache preload
     35 *  - encouraging the use of software pipelining for better instruction
     36 *    scheduling
     37 *
     38 * The user of this macro has to provide some configuration parameters
     39 * (bit depths for the images, prefetch distance, etc.) and a set of
     40 * macros, which should implement basic code chunks responsible for
     41 * pixels processing. See 'pixman-arm-neon-asm.S' file for the usage
     42 * examples.
     43 *
     44 * TODO:
     45 *  - try overlapped pixel method (from Ian Rickards) when processing
     46 *    exactly two blocks of pixels
     47 *  - maybe add an option to do reverse scanline processing
     48 */
     49 
     50 /*
     51 * Bit flags for 'generate_composite_function' macro which are used
     52 * to tune generated functions behavior.
     53 */
     54 .set FLAG_DST_WRITEONLY,       0 /* destination pixels are never read back */
     55 .set FLAG_DST_READWRITE,       1 /* destination is read, combined, written */
     56 .set FLAG_DEINTERLEAVE_32BPP,  2 /* use vld4/vst4 planar B,G,R,A layout for 32bpp */
     57 
     58 /*
     59 * Offset in stack where mask and source pointer/stride can be accessed
     60 * from 'init' macro. This is useful for doing special handling for solid mask.
     61 */
        /* 40 = 10 registers * 4 bytes saved by 'push {r4-r12, lr}' in the
           function prologue (see generate_composite_function). */
     62 .set ARGS_STACK_OFFSET,        40
     63 
     64 /*
     65 * Constants for selecting preferable prefetch type.
     66 */
     67 .set PREFETCH_TYPE_NONE,       0 /* No prefetch at all */
     68 .set PREFETCH_TYPE_SIMPLE,     1 /* A simple, fixed-distance-ahead prefetch */
     69 .set PREFETCH_TYPE_ADVANCED,   2 /* Advanced fine-grained prefetch */
     70 
     71 /*
     72 * Definitions of supplementary pixld/pixst macros (for partial load/store of
     73 * pixel data).
     74 */
     75 
        /* Load/store one 64-bit d-register with post-increment of the address.
           op          - vld1 or vst1
           elem_size   - element size suffix (8/16/32)
           reg1        - numeric d-register index
           abits       - if > 0, emit an ":abits" address alignment hint */
     76 .macro pixldst1 op, elem_size, reg1, mem_operand, abits
     77 .if \abits > 0
     78    \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\(), :\()\abits\()]!
     79 .else
     80    \op\().\()\elem_size {d\()\reg1}, [\()\mem_operand\()]!
     81 .endif
     82 .endm
     83 
        /* Same as pixldst1, but transfers two d-registers (128 bits) in one
           post-incrementing vld1/vst1, with optional alignment hint. */
     84 .macro pixldst2 op, elem_size, reg1, reg2, mem_operand, abits
     85 .if \abits > 0
     86    \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\(), :\()\abits\()]!
     87 .else
     88    \op\().\()\elem_size {d\()\reg1, d\()\reg2}, [\()\mem_operand\()]!
     89 .endif
     90 .endm
     91 
        /* Same as pixldst1, but transfers four d-registers (256 bits) in one
           post-incrementing instruction; 'op' may also be vld4/vst4 for the
           deinterleaved 32bpp case (see pixld/pixst below). */
     92 .macro pixldst4 op, elem_size, reg1, reg2, reg3, reg4, mem_operand, abits
     93 .if \abits > 0
     94    \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\(), :\()\abits\()]!
     95 .else
     96    \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3, d\()\reg4}, [\()\mem_operand\()]!
     97 .endif
     98 .endm
     99 
        /* Load/store a single lane 'idx' of d-register 'reg1' with
           post-increment. Note: 'abits' is accepted for interface
           uniformity with pixldst1/2/4 but no alignment hint is emitted
           for single-lane transfers. */
    100 .macro pixldst0 op, elem_size, reg1, idx, mem_operand, abits
    101    \op\().\()\elem_size {d\()\reg1[\idx]}, [\()\mem_operand\()]!
    102 .endm
    103 
        /* Three-register post-incrementing transfer (vld3/vst3), used for
           8 pixels of 24bpp data; no alignment-hint variant exists. */
    104 .macro pixldst3 op, elem_size, reg1, reg2, reg3, mem_operand
    105    \op\().\()\elem_size {d\()\reg1, d\()\reg2, d\()\reg3}, [\()\mem_operand\()]!
    106 .endm
    107 
        /* Transfer lane 'idx' of three d-registers at once (one 24bpp pixel:
           one byte of each of the B, G, R planes), with post-increment. */
    108 .macro pixldst30 op, elem_size, reg1, reg2, reg3, idx, mem_operand
    109    \op\().\()\elem_size {d\()\reg1[\idx], d\()\reg2[\idx], d\()\reg3[\idx]}, [\()\mem_operand\()]!
    110 .endm
    111 
        /* Dispatch a partial load/store of 'numbytes' bytes (32/16/8/4/2/1)
           starting at d-register 'basereg'. Register slots are chosen so
           that smaller chunks land in the low registers of the block
           (matching the slot layout used by process_trailing_pixels).
           For sub-4-byte chunks, RESPECT_STRICT_ALIGNMENT selects between
           one wide lane access and several byte/halfword lane accesses. */
    112 .macro pixldst numbytes, op, elem_size, basereg, mem_operand, abits
    113 .if \numbytes == 32
    114    pixldst4 \op, \elem_size, %(\basereg+4), %(\basereg+5), \
    115                              %(\basereg+6), %(\basereg+7), \mem_operand, \abits
    116 .elseif \numbytes == 16
    117    pixldst2 \op, \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand, \abits
    118 .elseif \numbytes == 8
    119    pixldst1 \op, \elem_size, %(\basereg+1), \mem_operand, \abits
    120 .elseif \numbytes == 4
        /* 4 bytes: one 32-bit lane if allowed, else 2x16-bit or 4x8-bit lanes */
    121    .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 32)
    122        pixldst0 \op, 32, %(\basereg+0), 1, \mem_operand, \abits
    123    .elseif \elem_size == 16
    124        pixldst0 \op, 16, %(\basereg+0), 2, \mem_operand, \abits
    125        pixldst0 \op, 16, %(\basereg+0), 3, \mem_operand, \abits
    126    .else
    127        pixldst0 \op, 8, %(\basereg+0), 4, \mem_operand, \abits
    128        pixldst0 \op, 8, %(\basereg+0), 5, \mem_operand, \abits
    129        pixldst0 \op, 8, %(\basereg+0), 6, \mem_operand, \abits
    130        pixldst0 \op, 8, %(\basereg+0), 7, \mem_operand, \abits
    131    .endif
    132 .elseif \numbytes == 2
    133    .if !RESPECT_STRICT_ALIGNMENT || (\elem_size == 16)
    134        pixldst0 \op, 16, %(\basereg+0), 1, \mem_operand, \abits
    135    .else
    136        pixldst0 \op, 8, %(\basereg+0), 2, \mem_operand, \abits
    137        pixldst0 \op, 8, %(\basereg+0), 3, \mem_operand, \abits
    138    .endif
    139 .elseif \numbytes == 1
    140    pixldst0 \op, 8, %(\basereg+0), 1, \mem_operand, \abits
    141 .else
    142    .error "unsupported size: \numbytes"
    143 .endif
    144 .endm
    145 
        /* Load 'numpix' pixels of 'bpp' bits each from 'mem_operand' (with
           post-increment) into the register block starting at 'basereg'.
           A no-op when bpp == 0 (image not used). Special cases:
           - 8 x 32bpp with deinterleaving enabled: vld4 planar load
           - 24bpp: vld3 full/lane loads (no alignment hints possible)
           Everything else falls through to a plain vld1 via pixldst. */
    146 .macro pixld numpix, bpp, basereg, mem_operand, abits=0
    147 .if \bpp > 0
    148 .if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    149    pixldst4 vld4, 8, %(\basereg+4), %(\basereg+5), \
    150                      %(\basereg+6), %(\basereg+7), \mem_operand, \abits
    151 .elseif (\bpp == 24) && (\numpix == 8)
    152    pixldst3 vld3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
    153 .elseif (\bpp == 24) && (\numpix == 4)
    154    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
    155    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
    156    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
    157    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
    158 .elseif (\bpp == 24) && (\numpix == 2)
    159    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
    160    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
    161 .elseif (\bpp == 24) && (\numpix == 1)
    162    pixldst30 vld3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
    163 .else
    164    pixldst %(\numpix * \bpp / 8), vld1, %(\bpp), \basereg, \mem_operand, \abits
    165 .endif
    166 .endif
    167 .endm
    168 
        /* Store counterpart of pixld: writes 'numpix' pixels of 'bpp' bits
           from the register block at 'basereg' to 'mem_operand' (with
           post-increment), mirroring pixld's 32bpp-deinterleave and 24bpp
           special cases with vst4/vst3. No-op when bpp == 0. */
    169 .macro pixst numpix, bpp, basereg, mem_operand, abits=0
    170 .if \bpp > 0
    171 .if (\bpp == 32) && (\numpix == 8) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    172    pixldst4 vst4, 8, %(\basereg+4), %(\basereg+5), \
    173                      %(\basereg+6), %(\basereg+7), \mem_operand, \abits
    174 .elseif (\bpp == 24) && (\numpix == 8)
    175    pixldst3 vst3, 8, %(\basereg+3), %(\basereg+4), %(\basereg+5), \mem_operand
    176 .elseif (\bpp == 24) && (\numpix == 4)
    177    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 4, \mem_operand
    178    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 5, \mem_operand
    179    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 6, \mem_operand
    180    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 7, \mem_operand
    181 .elseif (\bpp == 24) && (\numpix == 2)
    182    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 2, \mem_operand
    183    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 3, \mem_operand
    184 .elseif (\bpp == 24) && (\numpix == 1)
    185    pixldst30 vst3, 8, %(\basereg+0), %(\basereg+1), %(\basereg+2), 1, \mem_operand
    186 .else
    187    pixldst %(\numpix * \bpp / 8), vst1, %(\bpp), \basereg, \mem_operand, \abits
    188 .endif
    189 .endif
    190 .endm
    191 
        /* Aligned-buffer variant of pixld: adds an alignment hint equal to
           the total transfer size in bits, capped at 128 (the maximum
           alignment hint NEON load/store instructions accept). */
    192 .macro pixld_a numpix, bpp, basereg, mem_operand
    193 .if (\bpp * \numpix) <= 128
    194    pixld \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
    195 .else
    196    pixld \numpix, \bpp, \basereg, \mem_operand, 128
    197 .endif
    198 .endm
    199 
        /* Aligned-buffer variant of pixst; see pixld_a for the hint rule. */
    200 .macro pixst_a numpix, bpp, basereg, mem_operand
    201 .if (\bpp * \numpix) <= 128
    202    pixst \numpix, \bpp, \basereg, \mem_operand, %(\bpp * \numpix)
    203 .else
    204    pixst \numpix, \bpp, \basereg, \mem_operand, 128
    205 .endif
    206 .endm
    207 
    208 /*
    209 * Pixel fetcher for nearest scaling (needs TMP1, TMP2, VX, UNIT_X register
    210 * aliases to be defined)
    211 */
        /* Gather one d-register worth of source pixels for nearest-neighbor
           scaling: four 16bpp pixels or two 32bpp pixels, one lane at a time.
           VX is the 16.16 fixed-point source x coordinate and UNIT_X the
           per-destination-pixel step. After each advance, the local '5:'
           loop repeatedly subtracts SRC_WIDTH_FIXED while the result stays
           non-negative, wrapping VX back into [0, SRC_WIDTH_FIXED)
           (source repeat handling - verify against callers).
           Address computations and loads are interleaved (TMP1/TMP2
           alternate) to hide load latency. */
    212 .macro pixld1_s elem_size, reg1, mem_operand
    213 .if \elem_size == 16
    214    mov     TMP1, VX, asr #16
    215    adds    VX, VX, UNIT_X
    216 5:  subspl  VX, VX, SRC_WIDTH_FIXED
    217    bpl     5b
    218    add     TMP1, \mem_operand, TMP1, asl #1
    219    mov     TMP2, VX, asr #16
    220    adds    VX, VX, UNIT_X
    221 5:  subspl  VX, VX, SRC_WIDTH_FIXED
    222    bpl     5b
    223    add     TMP2, \mem_operand, TMP2, asl #1
    224    vld1.16 {d\()\reg1\()[0]}, [TMP1, :16]
    225    mov     TMP1, VX, asr #16
    226    adds    VX, VX, UNIT_X
    227 5:  subspl  VX, VX, SRC_WIDTH_FIXED
    228    bpl     5b
    229    add     TMP1, \mem_operand, TMP1, asl #1
    230    vld1.16 {d\()\reg1\()[1]}, [TMP2, :16]
    231    mov     TMP2, VX, asr #16
    232    adds    VX, VX, UNIT_X
    233 5:  subspl  VX, VX, SRC_WIDTH_FIXED
    234    bpl     5b
    235    add     TMP2, \mem_operand, TMP2, asl #1
    236    vld1.16 {d\()\reg1\()[2]}, [TMP1, :16]
    237    vld1.16 {d\()\reg1\()[3]}, [TMP2, :16]
    238 .elseif \elem_size == 32
    239    mov     TMP1, VX, asr #16
    240    adds    VX, VX, UNIT_X
    241 5:  subspl  VX, VX, SRC_WIDTH_FIXED
    242    bpl     5b
    243    add     TMP1, \mem_operand, TMP1, asl #2
    244    mov     TMP2, VX, asr #16
    245    adds    VX, VX, UNIT_X
    246 5:  subspl  VX, VX, SRC_WIDTH_FIXED
    247    bpl     5b
    248    add     TMP2, \mem_operand, TMP2, asl #2
    249    vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
    250    vld1.32 {d\()\reg1\()[1]}, [TMP2, :32]
    251 .else
    252    .error "unsupported"
    253 .endif
    254 .endm
    255 
        /* Gather two d-registers worth of scaled source pixels. The first
           branch is deliberately disabled (.if 0): it is an older unrolled
           32bpp path kept for reference, and it lacks the SRC_WIDTH_FIXED
           wrap-around handling present in pixld1_s. The live path simply
           invokes pixld1_s twice. */
    256 .macro pixld2_s elem_size, reg1, reg2, mem_operand
    257 .if 0 /* elem_size == 32 */
    258    mov     TMP1, VX, asr #16
    259    add     VX, VX, UNIT_X, asl #1
    260    add     TMP1, \mem_operand, TMP1, asl #2
    261    mov     TMP2, VX, asr #16
    262    sub     VX, VX, UNIT_X
    263    add     TMP2, \mem_operand, TMP2, asl #2
    264    vld1.32 {d\()\reg1\()[0]}, [TMP1, :32]
    265    mov     TMP1, VX, asr #16
    266    add     VX, VX, UNIT_X, asl #1
    267    add     TMP1, \mem_operand, TMP1, asl #2
    268    vld1.32 {d\()\reg2\()[0]}, [TMP2, :32]
    269    mov     TMP2, VX, asr #16
    270    add     VX, VX, UNIT_X
    271    add     TMP2, \mem_operand, TMP2, asl #2
    272    vld1.32 {d\()\reg1\()[1]}, [TMP1, :32]
    273    vld1.32 {d\()\reg2\()[1]}, [TMP2, :32]
    274 .else
    275    pixld1_s \elem_size, \reg1, \mem_operand
    276    pixld1_s \elem_size, \reg2, \mem_operand
    277 .endif
    278 .endm
    279 
        /* Fetch a single scaled source pixel (16bpp or 32bpp) into lane
           'idx' of d-register 'reg1', advancing VX by UNIT_X and wrapping
           it by SRC_WIDTH_FIXED exactly as pixld1_s does. */
    280 .macro pixld0_s elem_size, reg1, idx, mem_operand
    281 .if \elem_size == 16
    282    mov     TMP1, VX, asr #16
    283    adds    VX, VX, UNIT_X
    284 5:  subspl  VX, VX, SRC_WIDTH_FIXED
    285    bpl     5b
    286    add     TMP1, \mem_operand, TMP1, asl #1
    287    vld1.16 {d\()\reg1\()[\idx]}, [TMP1, :16]
    288 .elseif \elem_size == 32
    289    mov     TMP1, VX, asr #16
    290    adds    VX, VX, UNIT_X
    291 5:  subspl  VX, VX, SRC_WIDTH_FIXED
    292    bpl     5b
    293    add     TMP1, \mem_operand, TMP1, asl #2
    294    vld1.32 {d\()\reg1\()[\idx]}, [TMP1, :32]
    295 .endif
    296 .endm
    297 
        /* Scaled-fetch analogue of pixldst: dispatch by byte count to the
           pixld2_s/pixld1_s/pixld0_s gatherers, using the same register-slot
           layout. For the 32-byte case the gathered 32bpp data is also
           deinterleaved (only when DEINTERLEAVE_32BPP_ENABLED - see
           pixdeinterleave) so it matches the layout of an unscaled vld4. */
    298 .macro pixld_s_internal numbytes, elem_size, basereg, mem_operand
    299 .if \numbytes == 32
    300    pixld2_s \elem_size, %(\basereg+4), %(\basereg+5), \mem_operand
    301    pixld2_s \elem_size, %(\basereg+6), %(\basereg+7), \mem_operand
    302    pixdeinterleave \elem_size, %(\basereg+4)
    303 .elseif \numbytes == 16
    304    pixld2_s \elem_size, %(\basereg+2), %(\basereg+3), \mem_operand
    305 .elseif \numbytes == 8
    306    pixld1_s \elem_size, %(\basereg+1), \mem_operand
    307 .elseif \numbytes == 4
    308    .if \elem_size == 32
    309        pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
    310    .elseif \elem_size == 16
    311        pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
    312        pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
    313    .else
    314        pixld0_s \elem_size, %(\basereg+0), 4, \mem_operand
    315        pixld0_s \elem_size, %(\basereg+0), 5, \mem_operand
    316        pixld0_s \elem_size, %(\basereg+0), 6, \mem_operand
    317        pixld0_s \elem_size, %(\basereg+0), 7, \mem_operand
    318    .endif
    319 .elseif \numbytes == 2
    320    .if \elem_size == 16
    321        pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
    322    .else
    323        pixld0_s \elem_size, %(\basereg+0), 2, \mem_operand
    324        pixld0_s \elem_size, %(\basereg+0), 3, \mem_operand
    325    .endif
    326 .elseif \numbytes == 1
    327    pixld0_s \elem_size, %(\basereg+0), 1, \mem_operand
    328 .else
    329    .error "unsupported size: \numbytes"
    330 .endif
    331 .endm
    332 
        /* Public entry for scaled source fetch: converts (numpix, bpp) to a
           byte count and forwards to pixld_s_internal; no-op when bpp == 0. */
    333 .macro pixld_s numpix, bpp, basereg, mem_operand
    334 .if \bpp > 0
    335    pixld_s_internal %(\numpix * \bpp / 8), %(\bpp), \basereg, \mem_operand
    336 .endif
    337 .endm
    338 
        /* vuzp.8 wrapper taking numeric register indices, so callers can
           pass computed values like %(basereg+1). */
    339 .macro vuzp8 reg1, reg2
    340    vuzp.8 d\()\reg1, d\()\reg2
    341 .endm
    342 
        /* vzip.8 wrapper taking numeric register indices (inverse of vuzp8). */
    343 .macro vzip8 reg1, reg2
    344    vzip.8 d\()\reg1, d\()\reg2
    345 .endm
    346 
    347 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
 /* deinterleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
        /* Expands to nothing unless bpp == 32 and the deinterleave flag is
           set; the four vuzp8 steps separate packed BGRA bytes into one
           plane per d-register (basereg+0 .. basereg+3). */
    348 .macro pixdeinterleave bpp, basereg
    349 .if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    350    vuzp8 %(\basereg+0), %(\basereg+1)
    351    vuzp8 %(\basereg+2), %(\basereg+3)
    352    vuzp8 %(\basereg+1), %(\basereg+3)
    353    vuzp8 %(\basereg+0), %(\basereg+2)
    354 .endif
    355 .endm
    356 
    357 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
 /* interleave B, G, R, A channels for eight 32bpp pixels in 4 registers */
        /* Exact inverse of pixdeinterleave (vzip8 steps in reverse order);
           likewise a no-op unless 32bpp with deinterleaving enabled. */
    358 .macro pixinterleave bpp, basereg
    359 .if (\bpp == 32) && (DEINTERLEAVE_32BPP_ENABLED != 0)
    360    vzip8 %(\basereg+0), %(\basereg+2)
    361    vzip8 %(\basereg+1), %(\basereg+3)
    362    vzip8 %(\basereg+2), %(\basereg+3)
    363    vzip8 %(\basereg+0), %(\basereg+1)
    364 .endif
    365 .endm
    366 
    367 /*
    368 * This is a macro for implementing cache preload. The main idea is that
    369 * cache preload logic is mostly independent from the rest of pixels
    370 * processing code. It starts at the top left pixel and moves forward
    371 * across pixels and can jump across scanlines. Prefetch distance is
    372 * handled in an 'incremental' way: it starts from 0 and advances to the
    373 * optimal distance over time. After reaching optimal prefetch distance,
    374 * it is kept constant. There are some checks which prevent prefetching
    375 * unneeded pixel lines below the image (but it still can prefetch a bit
    376 * more data on the right side of the image - not a big issue and may
    377 * be actually helpful when rendering text glyphs). Additional trick is
    378 * the use of LDR instruction for prefetch instead of PLD when moving to
    379 * the next line, the point is that we have a high chance of getting TLB
    380 * miss in this case, and PLD would be useless.
    381 *
    382 * This sounds like it may introduce a noticeable overhead (when working with
    383 * fully cached data). But in reality, due to having a separate pipeline and
    384 * instruction queue for NEON unit in ARM Cortex-A8, normal ARM code can
    385 * execute simultaneously with NEON and be completely shadowed by it. Thus
    386 * we get no performance overhead at all (*). This looks like a very nice
    387 * feature of Cortex-A8, if used wisely. We don't have a hardware prefetcher,
    388 * but still can implement some rather advanced prefetch logic in software
    389 * for almost zero cost!
    390 *
    391 * (*) The overhead of the prefetcher is visible when running some trivial
    392 * pixels processing like simple copy. Anyway, having prefetch is a must
    393 * when working with the graphics data.
    394 */
        /* Conditionally emit one instruction: expands '\a \x' only when the
           ADVANCED prefetcher is selected, so all prefetch bookkeeping code
           vanishes entirely for NONE/SIMPLE builds. */
    395 .macro PF a, x:vararg
    396 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_ADVANCED)
    397    \a \x
    398 .endif
    399 .endm
    400 
        /* One step of the advanced prefetcher (all instructions are emitted
           through PF, so this is a no-op unless ADVANCED is selected).
           PF_X is the prefetch x position in pixels, PF_CTL packs a
           scanline counter (high nibbles) with a prefetch-distance boost
           counter (low nibble). Each step advances PF_X, issues pld for
           src/dst/mask at the current position, and when PF_X passes the
           end of the scanline (ORIG_W) wraps it and bumps PF_SRC/PF_DST/
           PF_MASK to the next line using an LDRB (not PLD) to force the
           TLB walk - see the block comment above. */
    401 .macro cache_preload std_increment, boost_increment
    402 .if (src_bpp_shift >= 0) || (dst_r_bpp != 0) || (mask_bpp_shift >= 0)
    403 .if regs_shortage
    404    PF ldr, ORIG_W, [sp] /* If we are short on regs, ORIG_W is kept on stack */
    405 .endif
    406 .if \std_increment != 0
    407    PF add, PF_X, PF_X, #\std_increment
    408 .endif
        /* while the boost counter (low nibble of PF_CTL) is nonzero, grow
           the prefetch distance by an extra boost_increment per step */
    409    PF tst, PF_CTL, #0xF
    410    PF addne, PF_X, PF_X, #\boost_increment
    411    PF subne, PF_CTL, PF_CTL, #1
    412    PF cmp, PF_X, ORIG_W
    413 .if src_bpp_shift >= 0
    414    PF pld, [PF_SRC, PF_X, lsl #src_bpp_shift]
    415 .endif
    416 .if dst_r_bpp != 0
    417    PF pld, [PF_DST, PF_X, lsl #dst_bpp_shift]
    418 .endif
    419 .if mask_bpp_shift >= 0
    420    PF pld, [PF_MASK, PF_X, lsl #mask_bpp_shift]
    421 .endif
        /* end of scanline reached: wrap PF_X and advance the line pointers
           (writeback '!' updates PF_SRC/PF_DST/PF_MASK; DUMMY discards the
           loaded byte). PF_CTL's line counter is decremented by 0x10. */
    422    PF subge, PF_X, PF_X, ORIG_W
    423    PF subsge, PF_CTL, PF_CTL, #0x10
    424 .if src_bpp_shift >= 0
    425    PF ldrbge, DUMMY, [PF_SRC, SRC_STRIDE, lsl #src_bpp_shift]!
    426 .endif
    427 .if dst_r_bpp != 0
    428    PF ldrbge, DUMMY, [PF_DST, DST_STRIDE, lsl #dst_bpp_shift]!
    429 .endif
    430 .if mask_bpp_shift >= 0
    431    PF ldrbge, DUMMY, [PF_MASK, MASK_STRIDE, lsl #mask_bpp_shift]!
    432 .endif
    433 .endif
    434 .endm
    435 
        /* SIMPLE prefetch: a fixed-distance pld ahead of the current
           src/dst/mask pointers. Expands to nothing unless the SIMPLE
           prefetch type is selected for this function. */
    436 .macro cache_preload_simple
    437 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_SIMPLE)
    438 .if src_bpp > 0
    439    pld [SRC, #(PREFETCH_DISTANCE_SIMPLE * src_bpp / 8)]
    440 .endif
    441 .if dst_r_bpp > 0
    442    pld [DST_R, #(PREFETCH_DISTANCE_SIMPLE * dst_r_bpp / 8)]
    443 .endif
    444 .if mask_bpp > 0
    445    pld [MASK, #(PREFETCH_DISTANCE_SIMPLE * mask_bpp / 8)]
    446 .endif
    447 .endif
    448 .endm
    449 
        /* Load one full pixblock of mask pixels from MASK. The base register
           is offset downwards so that the block's *last* registers end at
           mask_basereg (matching pixld's slot layout for full blocks). */
    450 .macro fetch_mask_pixblock
    451    pixld       pixblock_size, mask_bpp, \
    452                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    453 .endm
    454 
    455 /*
    456 * Macro which is used to process leading pixels until destination
    457 * pointer is properly aligned (at 16 bytes boundary). When destination
    458 * buffer uses 16bpp format, this is unnecessary, or even pointless.
    459 */
        /* Process leading pixels one power-of-two chunk at a time until
           DST_R reaches a 16-byte boundary; skipped entirely for 24bpp
           destinations. The .irp unrolls chunk sizes 1..16 bytes, each
           guarded by a bit test on the pointer. Loaded pixels go through
           one head+tail composite pass, then are stored with matching
           alignment tests on DST_W. Decrements W by the pixels consumed.
           Local labels: '1:' per-chunk skip target, '2:' overall exit. */
    460 .macro ensure_destination_ptr_alignment process_pixblock_head, \
    461                                        process_pixblock_tail, \
    462                                        process_pixblock_tail_head
    463 .if dst_w_bpp != 24
    464    tst         DST_R, #0xF
    465    beq         2f
    466 
    467 .irp lowbit, 1, 2, 4, 8, 16
    468 .if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
    469 .if \lowbit < 16 /* we don't need more than 16-byte alignment */
    470    tst         DST_R, #\lowbit
    471    beq         1f
    472 .endif
    473    pixld_src   (\lowbit * 8 / dst_w_bpp), src_bpp, src_basereg, SRC
    474    pixld       (\lowbit * 8 / dst_w_bpp), mask_bpp, mask_basereg, MASK
    475 .if dst_r_bpp > 0
    476    pixld_a     (\lowbit * 8 / dst_r_bpp), dst_r_bpp, dst_r_basereg, DST_R
    477 .else
        /* write-only destination: just advance the read pointer */
    478    add         DST_R, DST_R, #\lowbit
    479 .endif
    480    PF add,     PF_X, PF_X, #(\lowbit * 8 / dst_w_bpp)
    481    sub         W, W, #(\lowbit * 8 / dst_w_bpp)
    482 1:
    483 .endif
    484 .endr
    485    pixdeinterleave src_bpp, src_basereg
    486    pixdeinterleave mask_bpp, mask_basereg
    487    pixdeinterleave dst_r_bpp, dst_r_basereg
    488 
    489    \process_pixblock_head
    490    cache_preload 0, pixblock_size
    491    cache_preload_simple
    492    \process_pixblock_tail
    493 
    494    pixinterleave dst_w_bpp, dst_w_basereg
    495 .irp lowbit, 1, 2, 4, 8, 16
    496 .if (dst_w_bpp <= (\lowbit * 8)) && ((\lowbit * 8) < (pixblock_size * dst_w_bpp))
    497 .if \lowbit < 16 /* we don't need more than 16-byte alignment */
    498    tst         DST_W, #\lowbit
    499    beq         1f
    500 .endif
    501    pixst_a     (\lowbit * 8 / dst_w_bpp), dst_w_bpp, dst_w_basereg, DST_W
    502 1:
    503 .endif
    504 .endr
    505 .endif
    506 2:
    507 .endm
    508 
    509 /*
    510 * Special code for processing up to (pixblock_size - 1) remaining
    511 * trailing pixels. As SIMD processing performs operation on
    512 * pixblock_size pixels, anything smaller than this has to be loaded
    513 * and stored in a special way. Loading and storing of pixel data is
    514 * performed in such a way that we fill some 'slots' in the NEON
    515 * registers (some slots naturally are unused), then perform compositing
    516 * operation as usual. In the end, the data is taken from these 'slots'
    517 * and saved to memory.
    518 *
    519 * cache_preload_flag - allows to suppress prefetch if
    520 *                      set to 0
    521 * dst_aligned_flag   - selects whether destination buffer
    522 *                      is aligned
    523 */
    524 .macro process_trailing_pixels cache_preload_flag, \
    525                               dst_aligned_flag, \
    526                               process_pixblock_head, \
    527                               process_pixblock_tail, \
    528                               process_pixblock_tail_head
        /* nothing to do if W is already a multiple of pixblock_size */
    529    tst         W, #(pixblock_size - 1)
    530    beq         2f
        /* load phase: gather the remaining (W mod pixblock_size) pixels
           into their designated register 'slots', largest chunk first */
    531 .irp chunk_size, 16, 8, 4, 2, 1
    532 .if pixblock_size > \chunk_size
    533    tst         W, #\chunk_size
    534    beq         1f
    535    pixld_src   \chunk_size, src_bpp, src_basereg, SRC
    536    pixld       \chunk_size, mask_bpp, mask_basereg, MASK
    537 .if \dst_aligned_flag != 0
    538    pixld_a     \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
    539 .else
    540    pixld       \chunk_size, dst_r_bpp, dst_r_basereg, DST_R
    541 .endif
    542 .if \cache_preload_flag != 0
    543    PF add,     PF_X, PF_X, #\chunk_size
    544 .endif
    545 1:
    546 .endif
    547 .endr
    548    pixdeinterleave src_bpp, src_basereg
    549    pixdeinterleave mask_bpp, mask_basereg
    550    pixdeinterleave dst_r_bpp, dst_r_basereg
    551 
        /* composite one full (partially populated) pixblock */
    552    \process_pixblock_head
    553 .if \cache_preload_flag != 0
    554    cache_preload 0, pixblock_size
    555    cache_preload_simple
    556 .endif
    557    \process_pixblock_tail
    558    pixinterleave dst_w_bpp, dst_w_basereg
        /* store phase: write the same chunk pattern back from the result
           registers, aligned or unaligned per dst_aligned_flag */
    559 .irp chunk_size, 16, 8, 4, 2, 1
    560 .if pixblock_size > \chunk_size
    561    tst         W, #\chunk_size
    562    beq         1f
    563 .if \dst_aligned_flag != 0
    564    pixst_a     \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
    565 .else
    566    pixst       \chunk_size, dst_w_bpp, dst_w_basereg, DST_W
    567 .endif
    568 1:
    569 .endif
    570 .endr
    571 2:
    572 .endm
    573 
    574 /*
    575 * Macro, which performs all the needed operations to switch to the next
    576 * scanline and start the next loop iteration unless all the scanlines
    577 * are already processed.
    578 */
        /* Reload W, step DST_W/SRC/MASK forward by one stride and back by
           the row just processed (net: start of the next scanline; the
           'back' step is skipped for 24bpp where the stride shift trick
           does not apply and for unused images), decrement H, reset DST_R,
           and branch back to the loop while H >= 0. */
    579 .macro advance_to_next_scanline start_of_loop_label
    580 .if regs_shortage
    581    ldrd        W, [sp] /* load W and H (width and height) from stack */
    582 .else
    583    mov         W, ORIG_W
    584 .endif
    585    add         DST_W, DST_W, DST_STRIDE, lsl #dst_bpp_shift
    586 .if src_bpp != 0
    587    add         SRC, SRC, SRC_STRIDE, lsl #src_bpp_shift
    588 .endif
    589 .if mask_bpp != 0
    590    add         MASK, MASK, MASK_STRIDE, lsl #mask_bpp_shift
    591 .endif
    592 .if (dst_w_bpp != 24)
    593    sub         DST_W, DST_W, W, lsl #dst_bpp_shift
    594 .endif
    595 .if (src_bpp != 24) && (src_bpp != 0)
    596    sub         SRC, SRC, W, lsl #src_bpp_shift
    597 .endif
    598 .if (mask_bpp != 24) && (mask_bpp != 0)
    599    sub         MASK, MASK, W, lsl #mask_bpp_shift
    600 .endif
    601    subs        H, H, #1
    602    mov         DST_R, DST_W
    603 .if regs_shortage
    604    str         H, [sp, #4] /* save updated height to stack */
    605 .endif
    606    bge         \start_of_loop_label
    607 .endm
    608 
    609 /*
    610 * Registers are allocated in the following way by default:
    611 * d0, d1, d2, d3     - reserved for loading source pixel data
    612 * d4, d5, d6, d7     - reserved for loading destination pixel data
    613 * d24, d25, d26, d27 - reserved for loading mask pixel data
    614 * d28, d29, d30, d31 - final destination pixel data for writeback to memory
    615 */
    616 .macro generate_composite_function fname, \
    617                                   src_bpp_, \
    618                                   mask_bpp_, \
    619                                   dst_w_bpp_, \
    620                                   flags, \
    621                                   pixblock_size_, \
    622                                   prefetch_distance, \
    623                                   init, \
    624                                   cleanup, \
    625                                   process_pixblock_head, \
    626                                   process_pixblock_tail, \
    627                                   process_pixblock_tail_head, \
    628                                   dst_w_basereg_ = 28, \
    629                                   dst_r_basereg_ = 4, \
    630                                   src_basereg_   = 0, \
    631                                   mask_basereg_  = 24
    632 
    633    pixman_asm_function \fname
    634 
    635    push        {r4-r12, lr}        /* save all registers */
    636 
    637 /*
    638 * Select prefetch type for this function. If prefetch distance is
    639 * set to 0 or one of the color formats is 24bpp, SIMPLE prefetch
    640 * has to be used instead of ADVANCED.
    641 */
    642    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_DEFAULT
    643 .if \prefetch_distance == 0
    644    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
    645 .elseif (PREFETCH_TYPE_CURRENT > PREFETCH_TYPE_SIMPLE) && \
    646        ((\src_bpp_ == 24) || (\mask_bpp_ == 24) || (\dst_w_bpp_ == 24))
    647    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_SIMPLE
    648 .endif
    649 
    650 /*
    651 * Make some macro arguments globally visible and accessible
    652 * from other macros
    653 */
    654    .set src_bpp, \src_bpp_
    655    .set mask_bpp, \mask_bpp_
    656    .set dst_w_bpp, \dst_w_bpp_
    657    .set pixblock_size, \pixblock_size_
    658    .set dst_w_basereg, \dst_w_basereg_
    659    .set dst_r_basereg, \dst_r_basereg_
    660    .set src_basereg, \src_basereg_
    661    .set mask_basereg, \mask_basereg_
    662 
    663    .macro pixld_src x:vararg
    664        pixld \x
    665    .endm
    666    .macro fetch_src_pixblock
    667        pixld_src   pixblock_size, src_bpp, \
    668                    (src_basereg - pixblock_size * src_bpp / 64), SRC
    669    .endm
    670 /*
    671 * Assign symbolic names to registers
    672 */
    673    W           .req        r0      /* width (is updated during processing) */
    674    H           .req        r1      /* height (is updated during processing) */
    675    DST_W       .req        r2      /* destination buffer pointer for writes */
    676    DST_STRIDE  .req        r3      /* destination image stride */
    677    SRC         .req        r4      /* source buffer pointer */
    678    SRC_STRIDE  .req        r5      /* source image stride */
    679    DST_R       .req        r6      /* destination buffer pointer for reads */
    680 
    681    MASK        .req        r7      /* mask pointer */
    682    MASK_STRIDE .req        r8      /* mask stride */
    683 
    684    PF_CTL      .req        r9      /* combined lines counter and prefetch */
    685                                    /* distance increment counter */
    686    PF_X        .req        r10     /* pixel index in a scanline for current */
    687                                    /* prefetch position */
    688    PF_SRC      .req        r11     /* pointer to source scanline start */
    689                                    /* for prefetch purposes */
    690    PF_DST      .req        r12     /* pointer to destination scanline start */
    691                                    /* for prefetch purposes */
    692    PF_MASK     .req        r14     /* pointer to mask scanline start */
    693                                    /* for prefetch purposes */
    694 /*
    695 * Check whether we have enough registers for all the local variables.
    696 * If we don't have enough registers, original width and height are
    697 * kept on top of stack (and 'regs_shortage' variable is set to indicate
    698 * this for the rest of code). Even if there are enough registers, the
    699 * allocation scheme may be a bit different depending on whether source
    700 * or mask is not used.
    701 */
    702 .if (PREFETCH_TYPE_CURRENT < PREFETCH_TYPE_ADVANCED)
    703    ORIG_W      .req        r10     /* saved original width */
    704    DUMMY       .req        r12     /* temporary register */
    705    .set        regs_shortage, 0
    706 .elseif mask_bpp == 0
    707    ORIG_W      .req        r7      /* saved original width */
    708    DUMMY       .req        r8      /* temporary register */
    709    .set        regs_shortage, 0
    710 .elseif src_bpp == 0
    711    ORIG_W      .req        r4      /* saved original width */
    712    DUMMY       .req        r5      /* temporary register */
    713    .set        regs_shortage, 0
    714 .else
    715    ORIG_W      .req        r1      /* saved original width */
    716    DUMMY       .req        r1      /* temporary register */
    717    .set        regs_shortage, 1
    718 .endif
    719 
    720    .set mask_bpp_shift, -1
    721 .if src_bpp == 32
    722    .set src_bpp_shift, 2
    723 .elseif src_bpp == 24
    724    .set src_bpp_shift, 0
    725 .elseif src_bpp == 16
    726    .set src_bpp_shift, 1
    727 .elseif src_bpp == 8
    728    .set src_bpp_shift, 0
    729 .elseif src_bpp == 0
    730    .set src_bpp_shift, -1
    731 .else
    732    .error "requested src bpp (src_bpp) is not supported"
    733 .endif
    734 .if mask_bpp == 32
    735    .set mask_bpp_shift, 2
    736 .elseif mask_bpp == 24
    737    .set mask_bpp_shift, 0
    738 .elseif mask_bpp == 8
    739    .set mask_bpp_shift, 0
    740 .elseif mask_bpp == 0
    741    .set mask_bpp_shift, -1
    742 .else
    743    .error "requested mask bpp (mask_bpp) is not supported"
    744 .endif
    745 .if dst_w_bpp == 32
    746    .set dst_bpp_shift, 2
    747 .elseif dst_w_bpp == 24
    748    .set dst_bpp_shift, 0
    749 .elseif dst_w_bpp == 16
    750    .set dst_bpp_shift, 1
    751 .elseif dst_w_bpp == 8
    752    .set dst_bpp_shift, 0
    753 .else
    754    .error "requested dst bpp (dst_w_bpp) is not supported"
    755 .endif
    756 
    757 .if (((\flags) & FLAG_DST_READWRITE) != 0)
    758    .set dst_r_bpp, dst_w_bpp
    759 .else
    760    .set dst_r_bpp, 0
    761 .endif
    762 .if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
    763    .set DEINTERLEAVE_32BPP_ENABLED, 1
    764 .else
    765    .set DEINTERLEAVE_32BPP_ENABLED, 0
    766 .endif
    767 
    768 .if \prefetch_distance < 0 || \prefetch_distance > 15
    769    .error "invalid prefetch distance (\prefetch_distance)"
    770 .endif
    771 
    772 .if src_bpp > 0
    773    ldr         SRC, [sp, #40]
    774 .endif
    775 .if mask_bpp > 0
    776    ldr         MASK, [sp, #48]
    777 .endif
    778    PF mov,     PF_X, #0
    779 .if src_bpp > 0
    780    ldr         SRC_STRIDE, [sp, #44]
    781 .endif
    782 .if mask_bpp > 0
    783    ldr         MASK_STRIDE, [sp, #52]
    784 .endif
    785    mov         DST_R, DST_W
    786 
    787 .if src_bpp == 24
    788    sub         SRC_STRIDE, SRC_STRIDE, W
    789    sub         SRC_STRIDE, SRC_STRIDE, W, lsl #1
    790 .endif
    791 .if mask_bpp == 24
    792    sub         MASK_STRIDE, MASK_STRIDE, W
    793    sub         MASK_STRIDE, MASK_STRIDE, W, lsl #1
    794 .endif
    795 .if dst_w_bpp == 24
    796    sub         DST_STRIDE, DST_STRIDE, W
    797    sub         DST_STRIDE, DST_STRIDE, W, lsl #1
    798 .endif
    799 
    800 /*
    801 * Setup advanced prefetcher initial state
    802 */
    803    PF mov,     PF_SRC, SRC
    804    PF mov,     PF_DST, DST_R
    805    PF mov,     PF_MASK, MASK
    806    /* PF_CTL = prefetch_distance | ((h - 1) << 4) */
    807    PF mov,     PF_CTL, H, lsl #4
    808    PF add,     PF_CTL, #(\prefetch_distance - 0x10)
    809 
    810    \init
    811 .if regs_shortage
    812    push        {r0, r1}
    813 .endif
    814    subs        H, H, #1
    815 .if regs_shortage
    816    str         H, [sp, #4] /* save updated height to stack */
    817 .else
    818    mov         ORIG_W, W
    819 .endif
    820    blt         9f
    821    cmp         W, #(pixblock_size * 2)
    822    blt         8f
    823 /*
    824 * This is the start of the pipelined loop, which is optimized for
    825 * long scanlines
    826 */
    827 0:
    828    ensure_destination_ptr_alignment \process_pixblock_head, \
    829                                     \process_pixblock_tail, \
    830                                     \process_pixblock_tail_head
    831 
    832    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
    833    pixld_a     pixblock_size, dst_r_bpp, \
    834                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    835    fetch_src_pixblock
    836    pixld       pixblock_size, mask_bpp, \
    837                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    838    PF add,     PF_X, PF_X, #pixblock_size
    839    \process_pixblock_head
    840    cache_preload 0, pixblock_size
    841    cache_preload_simple
    842    subs        W, W, #(pixblock_size * 2)
    843    blt         2f
    844 1:
    845    \process_pixblock_tail_head
    846    cache_preload_simple
    847    subs        W, W, #pixblock_size
    848    bge         1b
    849 2:
    850    \process_pixblock_tail
    851    pixst_a     pixblock_size, dst_w_bpp, \
    852                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
    853 
    854    /* Process the remaining trailing pixels in the scanline */
    855    process_trailing_pixels 1, 1, \
    856                            \process_pixblock_head, \
    857                            \process_pixblock_tail, \
    858                            \process_pixblock_tail_head
    859    advance_to_next_scanline 0b
    860 
    861 .if regs_shortage
    862    pop         {r0, r1}
    863 .endif
    864    \cleanup
    865    pop         {r4-r12, pc}  /* exit */
    866 /*
    867 * This is the start of the loop, designed to process images with small width
    868 * (less than pixblock_size * 2 pixels). In this case neither pipelining
    869 * nor prefetch are used.
    870 */
    871 8:
    872    /* Process exactly pixblock_size pixels if needed */
    873    tst         W, #pixblock_size
    874    beq         1f
    875    pixld       pixblock_size, dst_r_bpp, \
    876                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
    877    fetch_src_pixblock
    878    pixld       pixblock_size, mask_bpp, \
    879                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
    880    \process_pixblock_head
    881    \process_pixblock_tail
    882    pixst       pixblock_size, dst_w_bpp, \
    883                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
    884 1:
    885    /* Process the remaining trailing pixels in the scanline */
    886    process_trailing_pixels 0, 0, \
    887                            \process_pixblock_head, \
    888                            \process_pixblock_tail, \
    889                            \process_pixblock_tail_head
    890    advance_to_next_scanline 8b
    891 9:
    892 .if regs_shortage
    893    pop         {r0, r1}
    894 .endif
    895    \cleanup
    896    pop         {r4-r12, pc}  /* exit */
    897 
    898    .purgem     fetch_src_pixblock
    899    .purgem     pixld_src
    900 
    901    .unreq      SRC
    902    .unreq      MASK
    903    .unreq      DST_R
    904    .unreq      DST_W
    905    .unreq      ORIG_W
    906    .unreq      W
    907    .unreq      H
    908    .unreq      SRC_STRIDE
    909    .unreq      DST_STRIDE
    910    .unreq      MASK_STRIDE
    911    .unreq      PF_CTL
    912    .unreq      PF_X
    913    .unreq      PF_SRC
    914    .unreq      PF_DST
    915    .unreq      PF_MASK
    916    .unreq      DUMMY
    917    pixman_end_asm_function
    918 .endm
    919 
    920 /*
    921 * A simplified variant of function generation template for a single
    922 * scanline processing (for implementing pixman combine functions)
    923 */
    924 .macro generate_composite_function_scanline        use_nearest_scaling, \
    925                                                   fname, \
    926                                                   src_bpp_, \
    927                                                   mask_bpp_, \
    928                                                   dst_w_bpp_, \
    929                                                   flags, \
    930                                                   pixblock_size_, \
    931                                                   init, \
    932                                                   cleanup, \
    933                                                   process_pixblock_head, \
    934                                                   process_pixblock_tail, \
    935                                                   process_pixblock_tail_head, \
    936                                                   dst_w_basereg_ = 28, \
    937                                                   dst_r_basereg_ = 4, \
    938                                                   src_basereg_   = 0, \
    939                                                   mask_basereg_  = 24
    940 
    941    pixman_asm_function \fname
    942 
        /* Single scanline processing: the advanced prefetcher is never used */
    943    .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
    944 /*
    945 * Make some macro arguments globally visible and accessible
    946 * from other macros
    947 */
    948    .set src_bpp, \src_bpp_
    949    .set mask_bpp, \mask_bpp_
    950    .set dst_w_bpp, \dst_w_bpp_
    951    .set pixblock_size, \pixblock_size_
    952    .set dst_w_basereg, \dst_w_basereg_
    953    .set dst_r_basereg, \dst_r_basereg_
    954    .set src_basereg, \src_basereg_
    955    .set mask_basereg, \mask_basereg_
    956 
    957 .if \use_nearest_scaling != 0
    958    /*
    959     * Assign symbolic names to registers for nearest scaling
    960     */
    961    W           .req        r0
    962    DST_W       .req        r1
    963    SRC         .req        r2
    964    VX          .req        r3
    965    UNIT_X      .req        ip
    966    MASK        .req        lr
    967    TMP1        .req        r4
    968    TMP2        .req        r5
    969    DST_R       .req        r6
    970    SRC_WIDTH_FIXED .req        r7
    971 
    972    .macro pixld_src x:vararg
    973        pixld_s \x
    974    .endm
    975 
        /* first stack-passed argument; loaded before 'push' below moves sp */
    976    ldr         UNIT_X, [sp]
    977    push        {r4-r8, lr}
        /* 24 = size of the {r4-r8, lr} frame pushed just above (6 regs * 4) */
    978    ldr         SRC_WIDTH_FIXED, [sp, #(24 + 4)]
    979    .if mask_bpp != 0
    980    ldr         MASK, [sp, #(24 + 8)]
    981    .endif
    982 .else
    983    /*
    984     * Assign symbolic names to registers
    985     */
    986    W           .req        r0      /* width (is updated during processing) */
    987    DST_W       .req        r1      /* destination buffer pointer for writes */
    988    SRC         .req        r2      /* source buffer pointer */
    989    DST_R       .req        ip      /* destination buffer pointer for reads */
    990    MASK        .req        r3      /* mask pointer */
    991 
    992    .macro pixld_src x:vararg
    993        pixld \x
    994    .endm
    995 .endif
    996 
    997 .if (((\flags) & FLAG_DST_READWRITE) != 0)
    998    .set dst_r_bpp, dst_w_bpp
    999 .else
   1000    .set dst_r_bpp, 0
   1001 .endif
   1002 .if (((\flags) & FLAG_DEINTERLEAVE_32BPP) != 0)
   1003    .set DEINTERLEAVE_32BPP_ENABLED, 1
   1004 .else
   1005    .set DEINTERLEAVE_32BPP_ENABLED, 0
   1006 .endif
   1007 
   1008    .macro fetch_src_pixblock
   1009        pixld_src   pixblock_size, src_bpp, \
   1010                    (src_basereg - pixblock_size * src_bpp / 64), SRC
   1011    .endm
   1012 
   1013    \init
   1014    mov         DST_R, DST_W
   1015 
        /* scanline narrower than one pixel block: unaligned tail path at 8: */
   1016    cmp         W, #pixblock_size
   1017    blt         8f
   1018 
   1019    ensure_destination_ptr_alignment \process_pixblock_head, \
   1020                                     \process_pixblock_tail, \
   1021                                     \process_pixblock_tail_head
   1022 
        /* after dst alignment, check whether a full pixel block still remains */
   1023    subs        W, W, #pixblock_size
   1024    blt         7f
   1025 
   1026    /* Implement "head (tail_head) ... (tail_head) tail" loop pattern */
   1027    pixld_a     pixblock_size, dst_r_bpp, \
   1028                (dst_r_basereg - pixblock_size * dst_r_bpp / 64), DST_R
   1029    fetch_src_pixblock
   1030    pixld       pixblock_size, mask_bpp, \
   1031                (mask_basereg - pixblock_size * mask_bpp / 64), MASK
   1032    \process_pixblock_head
   1033    subs        W, W, #pixblock_size
   1034    blt         2f
   1035 1:
        /* pipelined inner loop: one pixel block per iteration */
   1036    \process_pixblock_tail_head
   1037    subs        W, W, #pixblock_size
   1038    bge         1b
   1039 2:
   1040    \process_pixblock_tail
   1041    pixst_a     pixblock_size, dst_w_bpp, \
   1042                (dst_w_basereg - pixblock_size * dst_w_bpp / 64), DST_W
   1043 7:
   1044    /* Process the remaining trailing pixels in the scanline (dst aligned) */
   1045    process_trailing_pixels 0, 1, \
   1046                            \process_pixblock_head, \
   1047                            \process_pixblock_tail, \
   1048                            \process_pixblock_tail_head
   1049 
   1050    \cleanup
   1051 .if \use_nearest_scaling != 0
   1052    pop         {r4-r8, pc}  /* exit */
   1053 .else
   1054    bx          lr  /* exit */
   1055 .endif
   1056 8:
   1057    /* Process the remaining trailing pixels in the scanline (dst unaligned) */
   1058    process_trailing_pixels 0, 0, \
   1059                            \process_pixblock_head, \
   1060                            \process_pixblock_tail, \
   1061                            \process_pixblock_tail_head
   1062 
   1063    \cleanup
   1064 
   1065 .if \use_nearest_scaling != 0
   1066    pop         {r4-r8, pc}  /* exit */
   1067 
        /* release the register aliases of the nearest-scaling variant */
   1068    .unreq      DST_R
   1069    .unreq      SRC
   1070    .unreq      W
   1071    .unreq      VX
   1072    .unreq      UNIT_X
   1073    .unreq      TMP1
   1074    .unreq      TMP2
   1075    .unreq      DST_W
   1076    .unreq      MASK
   1077    .unreq      SRC_WIDTH_FIXED
   1078 
   1079 .else
   1080    bx          lr  /* exit */
   1081 
        /* release the register aliases of the plain variant */
   1082    .unreq      SRC
   1083    .unreq      MASK
   1084    .unreq      DST_R
   1085    .unreq      DST_W
   1086    .unreq      W
   1087 .endif
   1088 
   1089    .purgem     fetch_src_pixblock
   1090    .purgem     pixld_src
   1091 
   1092    pixman_end_asm_function
   1093 .endm
   1094 
   1095 .macro generate_composite_function_single_scanline x:vararg
        /* use_nearest_scaling = 0: plain single scanline variant */
   1096    generate_composite_function_scanline 0, \x
   1097 .endm
   1098 
   1099 .macro generate_composite_function_nearest_scanline x:vararg
        /* use_nearest_scaling = 1: nearest-neighbor scaling scanline variant */
   1100    generate_composite_function_scanline 1, \x
   1101 .endm
   1102 
   1103 /* Default prologue/epilogue, nothing special needs to be done */
   1104 
   1105 .macro default_init
        /* intentionally empty: no extra prologue is required */
   1106 .endm
   1107 
   1108 .macro default_cleanup
        /* intentionally empty: no extra epilogue is required */
   1109 .endm
   1110 
   1111 /*
   1112 * Prologue/epilogue variant which additionally saves/restores d8-d15
   1113 * registers (they need to be saved/restored by callee according to ABI).
   1114 * This is required if the code needs to use all the NEON registers.
   1115 */
   1116 
   1117 .macro default_init_need_all_regs
        /* save callee-saved NEON registers d8-d15 (required by the ABI) */
   1118    vpush       {d8-d15}
   1119 .endm
   1120 
   1121 .macro default_cleanup_need_all_regs
        /* restore callee-saved NEON registers saved by the matching init */
   1122    vpop        {d8-d15}
   1123 .endm
   1124 
   1125 /******************************************************************************/
   1126 
   1127 /*
   1128 * Conversion of 8 r5g6b5 pixels packed in 128-bit register (in)
   1129 * into a planar a8r8g8b8 format (with a, r, g, b color components
   1130 * stored into 64-bit registers out_a, out_r, out_g, out_b respectively).
   1131 *
   1132 * Warning: the conversion is destructive and the original
   1133 *          value (in) is lost.
   1134 */
   1135 .macro convert_0565_to_8888 in, out_a, out_r, out_g, out_b
        /* narrow each 16-bit pixel keeping bits 15:8 -> rrrrrggg          */
   1136    vshrn.u16   \out_r, \in,    #8
        /* narrow keeping bits 10:3 -> ggggggbb                           */
   1137    vshrn.u16   \out_g, \in,    #3
        /* shift-left-insert: B (bits 4:0) also appears at bits 9:5       */
   1138    vsli.u16    \in,    \in,    #5
        /* alpha is fully opaque                                          */
   1139    vmov.u8     \out_a, #255
        /* replicate the top 5 R bits into the low 3 bits                 */
   1140    vsri.u8     \out_r, \out_r, #5
        /* replicate the top 6 G bits into the low 2 bits (drops B bits)  */
   1141    vsri.u8     \out_g, \out_g, #6
        /* narrow keeping bits 9:2 -> B with its top bits replicated      */
   1142    vshrn.u16   \out_b, \in,    #2
   1143 .endm
   1144 
   1145 .macro convert_0565_to_x888 in, out_r, out_g, out_b
        /* same as convert_0565_to_8888 but without producing an alpha channel */
        /* narrow each 16-bit pixel keeping bits 15:8 -> rrrrrggg          */
   1146    vshrn.u16   \out_r, \in,    #8
        /* narrow keeping bits 10:3 -> ggggggbb                           */
   1147    vshrn.u16   \out_g, \in,    #3
        /* shift-left-insert: B (bits 4:0) also appears at bits 9:5       */
   1148    vsli.u16    \in,    \in,    #5
        /* replicate the top 5 R bits into the low 3 bits                 */
   1149    vsri.u8     \out_r, \out_r, #5
        /* replicate the top 6 G bits into the low 2 bits (drops B bits)  */
   1150    vsri.u8     \out_g, \out_g, #6
        /* narrow keeping bits 9:2 -> B with its top bits replicated      */
   1151    vshrn.u16   \out_b, \in,    #2
   1152 .endm
   1153 
   1154 /*
   1155 * Conversion from planar a8r8g8b8 format (with r, g, b color components
   1156 * in 64-bit registers in_r, in_g, in_b respectively) into 8 r5g6b5
   1157 * pixels packed in 128-bit register (out). Requires two temporary 128-bit
   1158 * registers (tmp1, tmp2)
   1159 */
   1160 .macro convert_8888_to_0565 in_r, in_g, in_b, out, tmp1, tmp2
        /* widen each 8-bit component into the top byte of a 16-bit lane  */
   1161    vshll.u8    \tmp1, \in_g, #8
   1162    vshll.u8    \out, \in_r, #8
   1163    vshll.u8    \tmp2, \in_b, #8
        /* shift-right-insert G below the top 5 R bits                    */
   1164    vsri.u16    \out, \tmp1, #5
        /* insert the top 5 B bits into bits 4:0 -> rrrrrggggggbbbbb      */
   1165    vsri.u16    \out, \tmp2, #11
   1166 .endm
   1167 
   1168 /*
   1169 * Conversion of four r5g6b5 pixels (in) to four x8r8g8b8 pixels
   1170 * returned in (out0, out1) registers pair. Requires one temporary
   1171 * 64-bit register (tmp). 'out1' and 'in' may overlap, the original
   1172 * value from 'in' is lost
   1173 */
   1174 .macro convert_four_0565_to_x888_packed in, out0, out1, tmp
        /* each vsri below replicates the component's top bits to fill 8 bits */
   1175    vshl.u16    \out0, \in,   #5  /* G top 6 bits */
   1176    vshl.u16    \tmp,  \in,   #11 /* B top 5 bits */
   1177    vsri.u16    \in,   \in,   #5  /* R is ready in top bits */
   1178    vsri.u16    \out0, \out0, #6  /* G is ready in top bits */
   1179    vsri.u16    \tmp,  \tmp,  #5  /* B is ready in top bits */
   1180    vshr.u16    \out1, \in,   #8  /* R is in place */
   1181    vsri.u16    \out0, \tmp,  #8  /* G & B are in place */
   1182    vzip.u16    \out0, \out1      /* everything is in place */