tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pixman-arm-simd-asm.h (34714B)


      1 /*
      2 * Copyright © 2012 Raspberry Pi Foundation
      3 * Copyright © 2012 RISC OS Open Ltd
      4 *
      5 * Permission to use, copy, modify, distribute, and sell this software and its
      6 * documentation for any purpose is hereby granted without fee, provided that
      7 * the above copyright notice appear in all copies and that both that
      8 * copyright notice and this permission notice appear in supporting
      9 * documentation, and that the name of the copyright holders not be used in
     10 * advertising or publicity pertaining to distribution of the software without
     11 * specific, written prior permission.  The copyright holders make no
     12 * representations about the suitability of this software for any purpose.  It
     13 * is provided "as is" without express or implied warranty.
     14 *
     15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
     16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
     18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
     20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
     21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
     22 * SOFTWARE.
     23 *
     24 * Author:  Ben Avison (bavison@riscosopen.org)
     25 *
     26 */
     27 
     28 /*
     29 * Because the alignment of pixel data to cachelines, and even the number of
     30 * cachelines per row can vary from row to row, and because of the need to
     31 * preload each scanline once and only once, this prefetch strategy treats
     32 * each row of pixels independently. When a pixel row is long enough, there
     33 * are three distinct phases of prefetch:
     34 * * an inner loop section, where each time a cacheline of data is
     35 *    processed, another cacheline is preloaded (the exact distance ahead is
     36 *    determined empirically using profiling results from lowlevel-blt-bench)
     37 * * a leading section, where enough cachelines are preloaded to ensure no
     38 *    cachelines escape being preloaded when the inner loop starts
     39 * * a trailing section, where a limited number (0 or more) of cachelines
     40 *    are preloaded to deal with data (if any) that hangs off the end of the
     41 *    last iteration of the inner loop, plus any trailing bytes that were not
     42 *    enough to make up one whole iteration of the inner loop
     43 * 
     44 * There are (in general) three distinct code paths, selected between
     45 * depending upon how long the pixel row is. If it is long enough that there
     46 * is at least one iteration of the inner loop (as described above) then
     47 * this is described as the "wide" case. If it is shorter than that, but
     48 * there are still enough bytes output that there is at least one 16-byte-
     49 * long, 16-byte-aligned write to the destination (the optimum type of
     50 * write), then this is the "medium" case. If it is not even this long, then
     51 * this is the "narrow" case, and there is no attempt to align writes to
     52 * 16-byte boundaries. In the "medium" and "narrow" cases, all the
     53 * cachelines containing data from the pixel row are prefetched up-front.
     54 */
     55 
     56 /*
     57 * Determine whether we put the arguments on the stack for debugging.
     58 */
     59 #undef DEBUG_PARAMS
     60 
     61 /*
     62 * Bit flags for 'generate_composite_function' macro which are used
     63 * to tune generated functions behavior.
     64 */
     65 .set FLAG_DST_WRITEONLY,         0
     66 .set FLAG_DST_READWRITE,         1
     67 .set FLAG_COND_EXEC,             0
     68 .set FLAG_BRANCH_OVER,           2
     69 .set FLAG_PROCESS_PRESERVES_PSR, 0
     70 .set FLAG_PROCESS_CORRUPTS_PSR,  4
     71 .set FLAG_PROCESS_DOESNT_STORE,  0
     72 .set FLAG_PROCESS_DOES_STORE,    8 /* usually because it needs to conditionally skip it */
     73 .set FLAG_NO_SPILL_LINE_VARS,        0
     74 .set FLAG_SPILL_LINE_VARS_WIDE,      16
     75 .set FLAG_SPILL_LINE_VARS_NON_WIDE,  32
     76 .set FLAG_SPILL_LINE_VARS,           48
     77 .set FLAG_PROCESS_CORRUPTS_SCRATCH,  0
     78 .set FLAG_PROCESS_PRESERVES_SCRATCH, 64
     79 .set FLAG_PROCESS_PRESERVES_WK0,     0
     80 .set FLAG_PROCESS_CORRUPTS_WK0,      128 /* if possible, use the specified register(s) instead so WK0 can hold number of leading pixels */
     81 .set FLAG_PRELOAD_DST,               0
     82 .set FLAG_NO_PRELOAD_DST,            256
     83 
     84 /*
     85 * Number of bytes by which to adjust preload offset of destination
     86 * buffer (allows preload instruction to be moved before the load(s))
     87 */
     88 .set DST_PRELOAD_BIAS, 0
     89 
     90 /*
     91 * Offset into stack where mask and source pointer/stride can be accessed.
     92 */
     93 #ifdef DEBUG_PARAMS
     94 .set ARGS_STACK_OFFSET,        (9*4+9*4)
     95 #else
     96 .set ARGS_STACK_OFFSET,        (9*4)
     97 #endif
     98 
     99 /*
    100 * Offset into stack where space allocated during init macro can be accessed.
    101 */
    102 .set LOCALS_STACK_OFFSET,     0
    103 
    104 /*
    105 * Constants for selecting preferable prefetch type.
    106 */
    107 .set PREFETCH_TYPE_NONE,       0
    108 .set PREFETCH_TYPE_STANDARD,   1
    109 
    110 /*
    111 * Definitions of macros for load/store of pixel data.
    112 */
    113 
        /* Emit a load ("ld") or store ("st") of \numbytes bytes of pixel data
         * held in up to four consecutive WK registers (WK\reg0..WK\reg3),
         * post-incrementing \base. With unaligned=1 the 16- and 8-byte cases
         * use individual LDR/STR (which tolerate non-word-aligned addresses on
         * ARMv6+) instead of LDM/STM. \cond is an optional condition code. */
    114 .macro pixldst op, cond=al, numbytes, reg0, reg1, reg2, reg3, base, unaligned=0
    115 .if \numbytes == 16
    116  .if \unaligned == 1
    117        \op\()r\()\cond    WK\()\reg0, [\base], #4
    118        \op\()r\()\cond    WK\()\reg1, [\base], #4
    119        \op\()r\()\cond    WK\()\reg2, [\base], #4
    120        \op\()r\()\cond    WK\()\reg3, [\base], #4
    121  .else
    122        \op\()mia\()\cond  \base!, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
    123  .endif
    124 .elseif \numbytes == 8
    125  .if \unaligned == 1
    126        \op\()r\()\cond    WK\()\reg0, [\base], #4
    127        \op\()r\()\cond    WK\()\reg1, [\base], #4
    128  .else
    129        \op\()mia\()\cond  \base!, {WK\()\reg0,WK\()\reg1}
    130  .endif
    131 .elseif \numbytes == 4
    132        \op\()r\()\cond    WK\()\reg0, [\base], #4
    133 .elseif \numbytes == 2
    134        \op\()rh\()\cond   WK\()\reg0, [\base], #2
    135 .elseif \numbytes == 1
    136        \op\()rb\()\cond   WK\()\reg0, [\base], #1
    137 .else
    138  .error "unsupported size: \numbytes"
    139 .endif
    140 .endm
    141 
        /* Store \numbytes bytes from WK\reg0.. to the \numbytes bytes ending
         * just below \base, without writeback. Used when \base has already
         * been advanced past the data (e.g. by the read in the read-write
         * case), hence the negative offsets / STMDB addressing.
         * NOTE(review): the 16-byte case spells the condition pre-UAL style
         * (stm<cond>db) while the 8-byte case uses UAL style (stmdb<cond>);
         * presumably both are accepted by the assembler mode in use, but the
         * inconsistency looks accidental — confirm before unifying. */
    142 .macro pixst_baseupdated cond, numbytes, reg0, reg1, reg2, reg3, base
    143 .if \numbytes == 16
    144        stm\()\cond\()db \base, {WK\()\reg0,WK\()\reg1,WK\()\reg2,WK\()\reg3}
    145 .elseif \numbytes == 8
    146        stmdb\()\cond \base, {WK\()\reg0,WK\()\reg1}
    147 .elseif \numbytes == 4
    148        str\()\cond    WK\()\reg0, [\base, #-4]
    149 .elseif \numbytes == 2
    150        strh\()\cond   WK\()\reg0, [\base, #-2]
    151 .elseif \numbytes == 1
    152        strb\()\cond   WK\()\reg0, [\base, #-1]
    153 .else
    154  .error "unsupported size: \numbytes"
    155 .endif
    156 .endm
    157 
        /* Load \numbytes bytes into consecutive registers starting at
         * WK\firstreg, post-incrementing \base (thin wrapper over pixldst). */
    158 .macro pixld cond, numbytes, firstreg, base, unaligned
    159        pixldst ld, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base, \unaligned
    160 .endm
    161 
        /* Store \numbytes bytes from WK\firstreg.. to the destination. In
         * read-write mode the pointer was already advanced by the earlier
         * pixld, so store behind it (pixst_baseupdated); otherwise store with
         * post-increment via pixldst. */
    162 .macro pixst cond, numbytes, firstreg, base
    163 .if (flags) & FLAG_DST_READWRITE
    164        pixst_baseupdated \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
    165 .else
    166        pixldst st, \cond, \numbytes, %(\firstreg+0), %(\firstreg+1), %(\firstreg+2), %(\firstreg+3), \base
    167 .endif
    168 .endm
    169 
        /* Emit the instruction "\a \x" only when prefetching is enabled for
         * the current function (PREFETCH_TYPE_CURRENT is STANDARD when
         * prefetch_distance != 0); otherwise emit nothing. */
    170 .macro PF a, x:vararg
    171 .if (PREFETCH_TYPE_CURRENT == PREFETCH_TYPE_STANDARD)
    172        \a \x
    173 .endif
    174 .endm
    175 
    176 
    177 .macro preload_leading_step1  bpp, ptr, base
    178 /* If the destination is already 16-byte aligned, then we need to preload
    179 * between 0 and prefetch_distance (inclusive) cache lines ahead so there
    180 * are no gaps when the inner loop starts.
    181 */
    182 .if \bpp > 0
        /* \ptr = \base rounded down to its 32-byte cacheline */
    183        PF  bic,    \ptr, \base, #31
    184  .set OFFSET, 0
        /* PLD each of the next prefetch_distance+1 cachelines */
    185  .rept prefetch_distance+1
    186        PF  pld,    [\ptr, #OFFSET]
    187   .set OFFSET, OFFSET+32
    188  .endr
    189 .endif
    190 .endm
    191 
    192 .macro preload_leading_step2  bpp, bpp_shift, ptr, base
    193 /* However, if the destination is not 16-byte aligned, we may need to
    194 * preload more cache lines than that. The question we need to ask is:
    195 * are the bytes corresponding to the leading pixels more than the amount
    196 * by which the source pointer will be rounded down for preloading, and if
    197 * so, by how many cache lines? Effectively, we want to calculate
    198 *     leading_bytes = ((-dst)&15)*src_bpp/dst_bpp
    199 *     inner_loop_offset = (src+leading_bytes)&31
    200 *     extra_needed = leading_bytes - inner_loop_offset
    201 * and test if extra_needed is <= 0, <= 32, or > 32 (where > 32 is only
    202 * possible when there are 4 src bytes for every 1 dst byte).
    203 */
    204 .if \bpp > 0
    205  .ifc \base,DST
    206        /* The test can be simplified further when preloading the destination */
    207        PF  tst,    \base, #16
    208        PF  beq,    61f
    209  .else
    210   .if \bpp/dst_w_bpp == 4
        /* WK0 holds the number of leading pixels here (set by the caller) */
    211        PF  add,    SCRATCH, \base, WK0, lsl #\bpp_shift-dst_bpp_shift
    212        PF  and,    SCRATCH, SCRATCH, #31
    213        PF  rsb,    SCRATCH, SCRATCH, WK0, lsl #\bpp_shift-dst_bpp_shift
    214        PF  sub,    SCRATCH, SCRATCH, #1        /* so now ranges are -16..-1 / 0..31 / 32..63 */
    215        PF  movs,   SCRATCH, SCRATCH, lsl #32-6 /* so this sets         NC   /  nc   /   Nc   */
    216        PF  bcs,    61f
    217        PF  bpl,    60f
        /* extra_needed > 32: preload two lines ahead of the step1 window.
         * BUGFIX: was "[ptr, ...]" — the unescaped macro argument would be
         * treated as a literal symbol named "ptr" rather than the pointer
         * register, so this path failed to assemble; all other references in
         * this macro use the escaped form "\ptr". */
    218        PF  pld,    [\ptr, #32*(prefetch_distance+2)]
    219   .else
    220        PF  mov,    SCRATCH, \base, lsl #32-5
    221        PF  add,    SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
    222        PF  rsbs,   SCRATCH, SCRATCH, WK0, lsl #32-5+\bpp_shift-dst_bpp_shift
    223        PF  bls,    61f
    224   .endif
    225  .endif
        /* extra_needed in (0,32]: one more line beyond the step1 window */
    226 60:     PF  pld,    [\ptr, #32*(prefetch_distance+1)]
    227 61:
    228 .endif
    229 .endm
    230 
        /* True when INDEX is the last member of its group of SIZE subblocks
         * (or trivially true when SIZE < 2); used to emit one preload per
         * group of STM-sized subblocks. */
    231 #define IS_END_OF_GROUP(INDEX,SIZE) ((SIZE) < 2 || ((INDEX) & ~((INDEX)+1)) & ((SIZE)/2))
        /* Steady-state preload for one channel inside the wide inner loop.
         * When \scratch_holds_offset, SCRATCH already holds the preload
         * offset; otherwise recompute the cacheline-aligned address each time. */
    232 .macro preload_middle   bpp, base, scratch_holds_offset
    233 .if \bpp > 0
    234        /* prefetch distance = 256/bpp, stm distance = 128/dst_w_bpp */
    235  .if IS_END_OF_GROUP(SUBBLOCK,256/128*dst_w_bpp/\bpp)
    236   .if \scratch_holds_offset
    237        PF  pld,    [\base, SCRATCH]
    238   .else
    239        PF  bic,    SCRATCH, \base, #31
    240        PF  pld,    [SCRATCH, #32*prefetch_distance]
    241   .endif
    242  .endif
    243 .endif
    244 .endm
    245 
        /* Issue the final preloads for one channel, covering the cachelines
         * that the last (prefetch_distance+1) blocks of the line will touch.
         * The caller sets WK0 to the residual pixel count before invoking
         * this in the multi-fetch-per-block case. */
    246 .macro preload_trailing  bpp, bpp_shift, base
    247 .if \bpp > 0
    248  .if \bpp*pix_per_block > 256
    249        /* Calculations are more complex if more than one fetch per block */
        /* WK1 = number of bytes still needing preload; loop a PLD per line */
    250        PF  and,    WK1, \base, #31
    251        PF  add,    WK1, WK1, WK0, lsl #\bpp_shift
    252        PF  add,    WK1, WK1, #32*(\bpp*pix_per_block/256-1)*(prefetch_distance+1)
    253        PF  bic,    SCRATCH, \base, #31
    254 80:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
    255        PF  add,    SCRATCH, SCRATCH, #32
    256        PF  subs,   WK1, WK1, #32
    257        PF  bhi,    80b
    258  .else
    259        /* If exactly one fetch per block, then we need either 0, 1 or 2 extra preloads */
    260        PF  mov,    SCRATCH, \base, lsl #32-5
    261        PF  adds,   SCRATCH, SCRATCH, X, lsl #32-5+\bpp_shift
    262        PF  adcseq, SCRATCH, SCRATCH, #0
    263        /* The instruction above has two effects: ensures Z is only
    264         * set if C was clear (so Z indicates that both shifted quantities
    265         * were 0), and clears C if Z was set (so C indicates that the sum
    266         * of the shifted quantities was greater and not equal to 32) */
    267        PF  beq,    82f
    268        PF  bic,    SCRATCH, \base, #31
    269        PF  bcc,    81f
    270        PF  pld,    [SCRATCH, #32*(prefetch_distance+2)]
    271 81:     PF  pld,    [SCRATCH, #32*(prefetch_distance+1)]
    272 82:
    273  .endif
    274 .endif
    275 .endm
    276 
    277 
    278 .macro preload_line    narrow_case, bpp, bpp_shift, base
    279 /* "narrow_case" - just means that the macro was invoked from the "narrow"
    280 *    code path rather than the "medium" one - because in the narrow case,
    281 *    the row of pixels is known to output no more than 30 bytes, then
    282 *    (assuming the source pixels are no wider than the destination
    283 *    pixels) they cannot possibly straddle more than 2 32-byte cachelines,
    284 *    meaning there's no need for a loop.
    285 * "bpp" - number of bits per pixel in the channel (source, mask or
    286 *    destination) that's being preloaded, or 0 if this channel is not used
    287 *    for reading
    288 * "bpp_shift" - log2 of ("bpp"/8) (except if "bpp"=0 of course)
    289 * "base" - base address register of channel to preload (SRC, MASK or DST)
    290 */
    291 .if \bpp > 0
    292  .if \narrow_case && (\bpp <= dst_w_bpp)
    293        /* In these cases, each line for each channel is in either 1 or 2 cache lines */
        /* WK0 = first cacheline, WK1 = last cacheline of the row's data */
    294        PF  bic,    WK0, \base, #31
    295        PF  pld,    [WK0]
    296        PF  add,    WK1, \base, X, LSL #\bpp_shift
    297        PF  sub,    WK1, WK1, #1
    298        PF  bic,    WK1, WK1, #31
    299        PF  cmp,    WK1, WK0
    300        PF  beq,    90f
    301        PF  pld,    [WK1]
    302 90:
    303  .else
        /* General case: loop PLDing every cacheline from first to last */
    304        PF  bic,    WK0, \base, #31
    305        PF  pld,    [WK0]
    306        PF  add,    WK1, \base, X, lsl #\bpp_shift
    307        PF  sub,    WK1, WK1, #1
    308        PF  bic,    WK1, WK1, #31
    309        PF  cmp,    WK1, WK0
    310        PF  beq,    92f
    311 91:     PF  add,    WK0, WK0, #32
    312        PF  cmp,    WK0, WK1
    313        PF  pld,    [WK0]
    314        PF  bne,    91b
    315 92:
    316  .endif
    317 .endif
    318 .endm
    319 
    320 
        /* Emit one head/tail/store sequence under condition \cond (may be
         * empty for unconditional). If \decrementx, also subtract the pixels
         * just handled from X. The store is skipped when the process macros
         * do their own storing (FLAG_PROCESS_DOES_STORE). */
    321 .macro conditional_process1_helper  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
    322        \process_head  \cond, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, 0
    323 .if \decrementx
    324        sub\()\cond X, X, #8*\numbytes/dst_w_bpp
    325 .endif
    326        \process_tail  \cond, \numbytes, \firstreg
    327 .if !((flags) & FLAG_PROCESS_DOES_STORE)
    328        pixst   \cond, \numbytes, \firstreg, DST
    329 .endif
    330 .endm
    331 
        /* Process \numbytes bytes only when \cond holds. With
         * FLAG_BRANCH_OVER, implement the condition as a branch around an
         * unconditional body (only mi/cs/ne are handled, matching the
         * conditions produced by the test_bits_* macros); otherwise emit
         * conditionally-executed instructions. */
    332 .macro conditional_process1  cond, process_head, process_tail, numbytes, firstreg, unaligned_src, unaligned_mask, decrementx
    333 .if (flags) & FLAG_BRANCH_OVER
    334  .ifc \cond,mi
    335        bpl     100f
    336  .endif
    337  .ifc \cond,cs
    338        bcc     100f
    339  .endif
    340  .ifc \cond,ne
    341        beq     100f
    342  .endif
    343        conditional_process1_helper  , \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
    344 100:
    345 .else
    346        conditional_process1_helper  \cond, \process_head, \process_tail, \numbytes, \firstreg, \unaligned_src, \unaligned_mask, \decrementx
    347 .endif
    348 .endm
    349 
        /* Process two byte-groups under two conditions established by the
         * \test macro. When the flags permit, interleave the heads, tails and
         * stores of both groups for better scheduling; otherwise run them
         * strictly one after the other, re-running \test in between if the
         * processing corrupts the PSR. */
    350 .macro conditional_process2  test, cond1, cond2, process_head, process_tail, numbytes1, numbytes2, firstreg1, firstreg2, unaligned_src, unaligned_mask, decrementx
    351 .if (flags) & (FLAG_DST_READWRITE | FLAG_BRANCH_OVER | FLAG_PROCESS_CORRUPTS_PSR | FLAG_PROCESS_DOES_STORE)
    352        /* Can't interleave reads and writes */
    353        \test
    354        conditional_process1  \cond1, \process_head, \process_tail, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, \decrementx
    355  .if (flags) & FLAG_PROCESS_CORRUPTS_PSR
    356        \test
    357  .endif
    358        conditional_process1  \cond2, \process_head, \process_tail, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, \decrementx
    359 .else
    360        /* Can interleave reads and writes for better scheduling */
    361        \test
    362        \process_head  \cond1, \numbytes1, \firstreg1, \unaligned_src, \unaligned_mask, 0
    363        \process_head  \cond2, \numbytes2, \firstreg2, \unaligned_src, \unaligned_mask, 0
    364  .if \decrementx
    365        sub\()\cond1 X, X, #8*\numbytes1/dst_w_bpp
    366        sub\()\cond2 X, X, #8*\numbytes2/dst_w_bpp
    367  .endif
    368        \process_tail  \cond1, \numbytes1, \firstreg1
    369        \process_tail  \cond2, \numbytes2, \firstreg2
    370        pixst   \cond1, \numbytes1, \firstreg1, DST
    371        pixst   \cond2, \numbytes2, \firstreg2, DST
    372 .endif
    373 .endm
    374 
    375 
        /* Shift the leading-bytes count so its bits 1,0 land in C,N. The
         * count lives in X when FLAG_PROCESS_CORRUPTS_WK0 (leading_15bytes
         * moved it there), otherwise in WK0. */
    376 .macro test_bits_1_0_ptr
    377 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
    378        movs    SCRATCH, X, lsl #32-1  /* C,N = bits 1,0 of DST */
    379 .else
    380        movs    SCRATCH, WK0, lsl #32-1  /* C,N = bits 1,0 of DST */
    381 .endif
    382 .endm
    383 
        /* As test_bits_1_0_ptr, but exposes bits 3,2 of the leading-bytes
         * count in C,N (for the 4- and 8-byte leading groups). */
    384 .macro test_bits_3_2_ptr
    385 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
    386        movs    SCRATCH, X, lsl #32-3  /* C,N = bits 3, 2 of DST */
    387 .else
    388        movs    SCRATCH, WK0, lsl #32-3  /* C,N = bits 3, 2 of DST */
    389 .endif
    390 .endm
    391 
        /* Process the 0-15 bytes needed to bring DST up to 16-byte alignment,
         * in 1/2/4/8-byte groups selected by the bits of the alignment count. */
    392 .macro leading_15bytes  process_head, process_tail
    393        /* On entry, WK0 bits 0-3 = number of bytes until destination is 16-byte aligned */
    394 .set DECREMENT_X, 1
    395 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        /* Processing clobbers WK0: deduct all leading pixels from X up front,
         * spill X to the stack slot above the saved line registers, and use X
         * itself to carry the alignment count through the tests. */
    396  .set DECREMENT_X, 0
    397        sub     X, X, WK0, lsr #dst_bpp_shift
    398        str     X, [sp, #LINE_SAVED_REG_COUNT*4]
    399        mov     X, WK0
    400 .endif
    401        /* Use unaligned loads in all cases for simplicity */
    402 .if dst_w_bpp == 8
    403        conditional_process2  test_bits_1_0_ptr, mi, cs, \process_head, \process_tail, 1, 2, 1, 2, 1, 1, DECREMENT_X
    404 .elseif dst_w_bpp == 16
    405        test_bits_1_0_ptr
    406        conditional_process1  cs, \process_head, \process_tail, 2, 2, 1, 1, DECREMENT_X
    407 .endif
    408        conditional_process2  test_bits_3_2_ptr, mi, cs, \process_head, \process_tail, 4, 8, 1, 2, 1, 1, DECREMENT_X
    409 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        /* Restore the (already-decremented) X spilled above */
    410        ldr     X, [sp, #LINE_SAVED_REG_COUNT*4]
    411 .endif
    412 .endm
    413 
        /* Expose bits 3,2 of the remaining output byte count (X pixels scaled
         * by dst_bpp_shift) in C,N for the trailing 8/4-byte groups. */
    414 .macro test_bits_3_2_pix
    415        movs    SCRATCH, X, lsl #dst_bpp_shift+32-3
    416 .endm
    417 
        /* Expose bits 1,0 of the remaining output byte count in C,N (8bpp),
         * or just bit 0 of the pixel count in C (16bpp, via lsr #1). */
    418 .macro test_bits_1_0_pix
    419 .if dst_w_bpp == 8
    420        movs    SCRATCH, X, lsl #dst_bpp_shift+32-1
    421 .else
    422        movs    SCRATCH, X, lsr #1
    423 .endif
    424 .endm
    425 
        /* Process the final sub-16-byte remainder of a line in 8/4/2/1-byte
         * groups, selected by the low bits of the remaining count in X. */
    426 .macro trailing_15bytes  process_head, process_tail, unaligned_src, unaligned_mask
    427        conditional_process2  test_bits_3_2_pix, cs, mi, \process_head, \process_tail, 8, 4, 0, 2, \unaligned_src, \unaligned_mask, 0
    428 .if dst_w_bpp == 16
    429        test_bits_1_0_pix
    430        conditional_process1  cs, \process_head, \process_tail, 2, 0, \unaligned_src, \unaligned_mask, 0
    431 .elseif dst_w_bpp == 8
    432        conditional_process2  test_bits_1_0_pix, cs, mi, \process_head, \process_tail, 2, 1, 0, 1, \unaligned_src, \unaligned_mask, 0
    433 .endif
    434 .endm
    435 
    436 
        /* The wide-case inner loop: per iteration, process one whole block of
         * pix_per_block pixels as a sequence of 16-byte subblocks, issuing
         * interleaved preloads for SRC/MASK (and DST on alternate subblocks).
         * Loops while X (biased by the caller) stays non-negative. */
    437 .macro wide_case_inner_loop  process_head, process_tail, unaligned_src, unaligned_mask, dst_alignment
    438 110:
    439 .set SUBBLOCK, 0 /* this is a count of STMs; there can be up to 8 STMs per block */
    440 .rept pix_per_block*dst_w_bpp/128
    441        \process_head  , 16, 0, \unaligned_src, \unaligned_mask, 1
    442  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
    443        preload_middle  src_bpp, SRC, 1
    444  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
    445        preload_middle  mask_bpp, MASK, 1
    446  .else
    447        preload_middle  src_bpp, SRC, 0
    448        preload_middle  mask_bpp, MASK, 0
    449  .endif
    450  .if (dst_r_bpp > 0) && ((SUBBLOCK % 2) == 0) && (((flags) & FLAG_NO_PRELOAD_DST) == 0)
    451        /* Because we know that writes are 16-byte aligned, it's relatively easy to ensure that
    452         * destination prefetches are 32-byte aligned. It's also the easiest channel to offset
    453         * preloads for, to achieve staggered prefetches for multiple channels, because there are
    454         * always two STMs per prefetch, so there is always an opposite STM on which to put the
    455         * preload. Note, no need to BIC the base register here */
    456        PF  pld,    [DST, #32*prefetch_distance - \dst_alignment]
    457  .endif
    458        \process_tail  , 16, 0
    459  .if !((flags) & FLAG_PROCESS_DOES_STORE)
    460        pixst   , 16, 0, DST
    461  .endif
    462  .set SUBBLOCK, SUBBLOCK+1
    463 .endr
    464        subs    X, X, #pix_per_block
    465        bhs     110b
    466 .endm
    467 
        /* Wide-case driver: pick the inner-loop variant whose DST preload
         * offset matches the current 32-byte phase of DST, then issue the
         * trailing preloads, un-bias X and finish via the medium-case path. */
    468 .macro wide_case_inner_loop_and_trailing_pixels  process_head, process_tail, process_inner_loop, exit_label, unaligned_src, unaligned_mask
    469        /* Destination now 16-byte aligned; we have at least one block before we have to stop preloading */
    470 .if dst_r_bpp > 0
    471        tst     DST, #16
    472        bne     111f
    473        \process_inner_loop  \process_head, \process_tail, \unaligned_src, \unaligned_mask, 16 + DST_PRELOAD_BIAS
    474        b       112f
    475 111:
    476 .endif
    477        \process_inner_loop  \process_head, \process_tail, \unaligned_src, \unaligned_mask, 0 + DST_PRELOAD_BIAS
    478 112:
    479        /* Just before the final (prefetch_distance+1) 32-byte blocks, deal with final preloads */
    480 .if (src_bpp*pix_per_block > 256) || (mask_bpp*pix_per_block > 256) || (dst_r_bpp*pix_per_block > 256)
        /* WK0 = residual pixels within the last block, for preload_trailing */
    481        PF  and,    WK0, X, #pix_per_block-1
    482 .endif
    483        preload_trailing  src_bpp, src_bpp_shift, SRC
    484        preload_trailing  mask_bpp, mask_bpp_shift, MASK
    485 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
    486        preload_trailing  dst_r_bpp, dst_bpp_shift, DST
    487 .endif
    488        add     X, X, #(prefetch_distance+2)*pix_per_block - 128/dst_w_bpp
    489        /* The remainder of the line is handled identically to the medium case */
    490        medium_case_inner_loop_and_trailing_pixels  \process_head, \process_tail,, \exit_label, \unaligned_src, \unaligned_mask
    491 .endm
    492 
        /* Medium-case loop: process aligned 16-byte groups while X permits,
         * then handle any remaining sub-16-byte tail (skipping it entirely
         * when the line ends exactly on a group boundary). */
    493 .macro medium_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
    494 120:
    495        \process_head  , 16, 0, \unaligned_src, \unaligned_mask, 0
    496        \process_tail  , 16, 0
    497 .if !((flags) & FLAG_PROCESS_DOES_STORE)
    498        pixst   , 16, 0, DST
    499 .endif
    500        subs    X, X, #128/dst_w_bpp
    501        bhs     120b
    502        /* Trailing pixels */
    503        tst     X, #128/dst_w_bpp - 1
    504        beq     \exit_label
    505        trailing_15bytes  \process_head, \process_tail, \unaligned_src, \unaligned_mask
    506 .endm
    507 
        /* Narrow-case path (fewer than two 16-byte groups): at most one
         * unaligned 16-byte group, then the sub-16-byte tail unconditionally. */
    508 .macro narrow_case_inner_loop_and_trailing_pixels  process_head, process_tail, unused, exit_label, unaligned_src, unaligned_mask
    509        tst     X, #16*8/dst_w_bpp
    510        conditional_process1  ne, \process_head, \process_tail, 16, 0, \unaligned_src, \unaligned_mask, 0
    511        /* Trailing pixels */
    512        /* In narrow case, it's relatively unlikely to be aligned, so let's do without a branch here */
    513        trailing_15bytes  \process_head, \process_tail, \unaligned_src, \unaligned_mask
    514 .endm
    515 
        /* Instantiate \action once for each applicable combination of
         * (unaligned_src, unaligned_mask), dispatching at run time on the
         * word-alignment of SRC and MASK; only 8bpp/16bpp channels can be
         * word-unaligned, so other depths get only the aligned variant. */
    516 .macro switch_on_alignment  action, process_head, process_tail, process_inner_loop, exit_label
    517 /* Note that if we're reading the destination, it's already guaranteed to be aligned at this point */
    518 .if mask_bpp == 8 || mask_bpp == 16
    519        tst     MASK, #3
    520        bne     141f
    521 .endif
    522  .if src_bpp == 8 || src_bpp == 16
    523        tst     SRC, #3
    524        bne     140f
    525  .endif
    526        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 0
    527  .if src_bpp == 8 || src_bpp == 16
    528        b       \exit_label
    529 140:
    530        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 0
    531  .endif
    532 .if mask_bpp == 8 || mask_bpp == 16
    533        b       \exit_label
    534 141:
    535  .if src_bpp == 8 || src_bpp == 16
    536        tst     SRC, #3
    537        bne     142f
    538  .endif
    539        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 0, 1
    540  .if src_bpp == 8 || src_bpp == 16
    541        b       \exit_label
    542 142:
    543        \action  \process_head, \process_tail, \process_inner_loop, \exit_label, 1, 1
    544  .endif
    545 .endif
    546 .endm
    547 
    548 
        /* Per-line epilogue: reload any spilled line variables, decrement the
         * row counter Y (re-spilling it if it lives in the spill area),
         * advance all channel pointers by their strides, optionally restore X
         * from ORIG_W, and loop back while rows remain. On fall-through,
         * branch to the common exit (197f unwinds the spill area first). */
    549 .macro end_of_line      restore_x, vars_spilled, loop_label, last_one
    550 .if \vars_spilled
    551        /* Sadly, GAS doesn't seem have an equivalent of the DCI directive? */
    552        /* This is ldmia sp,{} */
        /* Hand-encoded because the register list LINE_SAVED_REGS is only
         * known as an assembly-time expression */
    553        .word   0xE89D0000 | LINE_SAVED_REGS
    554 .endif
    555        subs    Y, Y, #1
    556 .if \vars_spilled
    557  .if (LINE_SAVED_REGS) & (1<<1)
        /* Y (r1) lives in the spill area: write back the decremented value */
    558        str     Y, [sp]
    559  .endif
    560 .endif
    561        add     DST, DST, STRIDE_D
    562 .if src_bpp > 0
    563        add     SRC, SRC, STRIDE_S
    564 .endif
    565 .if mask_bpp > 0
    566        add     MASK, MASK, STRIDE_M
    567 .endif
    568 .if \restore_x
    569        mov     X, ORIG_W
    570 .endif
    571        bhs     \loop_label
    572 .ifc "\last_one",""
    573  .if \vars_spilled
    574        b       197f
    575  .else
    576        b       198f
    577  .endif
    578 .else
    579  .if (!\vars_spilled) && ((flags) & FLAG_SPILL_LINE_VARS)
    580        b       198f
    581  .endif
    582 .endif
    583 .endm
    584 
    585 
    586 .macro generate_composite_function fname, \
    587                                   src_bpp_, \
    588                                   mask_bpp_, \
    589                                   dst_w_bpp_, \
    590                                   flags_, \
    591                                   prefetch_distance_, \
    592                                   init, \
    593                                   newline, \
    594                                   cleanup, \
    595                                   process_head, \
    596                                   process_tail, \
    597                                   process_inner_loop
    598 
    599    pixman_asm_function \fname
    600 
    601 /*
    602 * Make some macro arguments globally visible and accessible
    603 * from other macros
    604 */
    605 .set src_bpp, \src_bpp_
    606 .set mask_bpp, \mask_bpp_
    607 .set dst_w_bpp, \dst_w_bpp_
    608 .set flags, \flags_
    609 .set prefetch_distance, \prefetch_distance_
    610 
    611 /*
    612 * Select prefetch type for this function.
    613 */
    614 .if prefetch_distance == 0
    615  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_NONE
    616 .else
    617  .set PREFETCH_TYPE_CURRENT, PREFETCH_TYPE_STANDARD
    618 .endif
    619 
    620 .if src_bpp == 32
    621  .set src_bpp_shift, 2
    622 .elseif src_bpp == 24
    623  .set src_bpp_shift, 0
    624 .elseif src_bpp == 16
    625  .set src_bpp_shift, 1
    626 .elseif src_bpp == 8
    627  .set src_bpp_shift, 0
    628 .elseif src_bpp == 0
    629  .set src_bpp_shift, -1
    630 .else
    631  .error "requested src bpp (src_bpp) is not supported"
    632 .endif
    633 
    634 .if mask_bpp == 32
    635  .set mask_bpp_shift, 2
    636 .elseif mask_bpp == 24
    637  .set mask_bpp_shift, 0
    638 .elseif mask_bpp == 8
    639  .set mask_bpp_shift, 0
    640 .elseif mask_bpp == 0
    641  .set mask_bpp_shift, -1
    642 .else
    643  .error "requested mask bpp (mask_bpp) is not supported"
    644 .endif
    645 
    646 .if dst_w_bpp == 32
    647  .set dst_bpp_shift, 2
    648 .elseif dst_w_bpp == 24
    649  .set dst_bpp_shift, 0
    650 .elseif dst_w_bpp == 16
    651  .set dst_bpp_shift, 1
    652 .elseif dst_w_bpp == 8
    653  .set dst_bpp_shift, 0
    654 .else
    655  .error "requested dst bpp (dst_w_bpp) is not supported"
    656 .endif
    657 
    658 .if (((flags) & FLAG_DST_READWRITE) != 0)
    659  .set dst_r_bpp, dst_w_bpp
    660 .else
    661  .set dst_r_bpp, 0
    662 .endif
    663 
    664 .set pix_per_block, 16*8/dst_w_bpp
    665 .if src_bpp != 0
    666  .if 32*8/src_bpp > pix_per_block
    667   .set pix_per_block, 32*8/src_bpp
    668  .endif
    669 .endif
    670 .if mask_bpp != 0
    671  .if 32*8/mask_bpp > pix_per_block
    672   .set pix_per_block, 32*8/mask_bpp
    673  .endif
    674 .endif
    675 .if dst_r_bpp != 0
    676  .if 32*8/dst_r_bpp > pix_per_block
    677   .set pix_per_block, 32*8/dst_r_bpp
    678  .endif
    679 .endif
    680 
    681 /* The standard entry conditions set up by pixman-arm-common.h are:
    682 * r0 = width (pixels)
    683 * r1 = height (rows)
    684 * r2 = pointer to top-left pixel of destination
    685 * r3 = destination stride (pixels)
    686 * [sp] = source pixel value, or pointer to top-left pixel of source
    687 * [sp,#4] = 0 or source stride (pixels)
    688 * The following arguments are unused for non-mask operations
    689 * [sp,#8] = mask pixel value, or pointer to top-left pixel of mask
    690 * [sp,#12] = 0 or mask stride (pixels)
    691 */
    692 
    693 /*
    694 * Assign symbolic names to registers
    695 */
    696    X           .req    r0  /* pixels to go on this line */
    697    Y           .req    r1  /* lines to go */
    698    DST         .req    r2  /* destination pixel pointer */
    699    STRIDE_D    .req    r3  /* destination stride (bytes, minus width) */
    700    SRC         .req    r4  /* source pixel pointer */
    701    STRIDE_S    .req    r5  /* source stride (bytes, minus width) */
    702    MASK        .req    r6  /* mask pixel pointer (if applicable) */
    703    STRIDE_M    .req    r7  /* mask stride (bytes, minus width) */
    704    WK0         .req    r8  /* pixel data registers */
    705    WK1         .req    r9
    706    WK2         .req    r10
    707    WK3         .req    r11
    708    SCRATCH     .req    r12
    709    ORIG_W      .req    r14 /* width (pixels) */
    710 
        /* Save r4-r11 (AAPCS callee-saved) plus lr; matched by the
         * "pop {r4-r11, pc}" at label 199 below, which also returns. */
    711        push    {r4-r11, lr}        /* save all registers */
    712 
        /* Y becomes "rows remaining minus one"; if the caller passed zero
         * rows the subtraction borrows and we exit immediately via 199. */
    713        subs    Y, Y, #1
    714        blo     199f
    715 
    716 #ifdef DEBUG_PARAMS
    717        sub     sp, sp, #9*4
    718 #endif
    719 
        /* Load source/mask pointer and stride from the stacked arguments.
         * ARGS_STACK_OFFSET is an assembly-time symbol (defined earlier in
         * the file, outside this excerpt) tracking how far sp currently sits
         * below the caller's [sp]-relative argument area. */
    720 .if src_bpp > 0
    721        ldr     SRC, [sp, #ARGS_STACK_OFFSET]
    722        ldr     STRIDE_S, [sp, #ARGS_STACK_OFFSET+4]
    723 .endif
    724 .if mask_bpp > 0
    725        ldr     MASK, [sp, #ARGS_STACK_OFFSET+8]
    726        ldr     STRIDE_M, [sp, #ARGS_STACK_OFFSET+12]
    727 .endif
    728        
    729 #ifdef DEBUG_PARAMS
        /* Debug aid: momentarily restore Y to the caller's row count, then
         * dump r0-r7 and pc into the 9-word area reserved above. */
    730        add     Y, Y, #1
    731        stmia   sp, {r0-r7,pc}
    732        sub     Y, Y, #1
    733 #endif
    734 
        /* Caller-supplied one-time setup hook (macro argument). */
    735        \init
    736 
    737 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
    738        /* Reserve a word in which to store X during leading pixels */
    739        sub     sp, sp, #4
        /* sp moved down one word, so both assembly-time offset trackers must
         * grow by 4; they are wound back symmetrically at label 198 below. */
    740  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET+4
    741  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET+4
    742 .endif
    743        
        /* Convert each stride from pixels to bytes, then subtract the row
         * length in bytes — per the .req comments above, the strides are kept
         * as "bytes, minus width" so that adding STRIDE_* when a row finishes
         * advances the pointer to the start of the next row. */
    744        lsl     STRIDE_D, #dst_bpp_shift /* stride in bytes */
    745        sub     STRIDE_D, STRIDE_D, X, lsl #dst_bpp_shift
    746 .if src_bpp > 0
    747        lsl     STRIDE_S, #src_bpp_shift
    748        sub     STRIDE_S, STRIDE_S, X, lsl #src_bpp_shift
    749 .endif
    750 .if mask_bpp > 0
    751        lsl     STRIDE_M, #mask_bpp_shift
    752        sub     STRIDE_M, STRIDE_M, X, lsl #mask_bpp_shift
    753 .endif
    754 
    755        /* Are we not even wide enough to have one 16-byte aligned 16-byte block write? */
        /* 16*8/dst_w_bpp = pixels per 16 bytes; rows shorter than two such
         * blocks take the narrow path at label 170. */
    756        cmp     X, #2*16*8/dst_w_bpp - 1
    757        blo     170f
    758 .if src_bpp || mask_bpp || dst_r_bpp /* Wide and medium cases are the same for fill */
    759        /* To preload ahead on the current line, we need at least (prefetch_distance+2) 32-byte blocks on all prefetch channels */
    760        cmp     X, #(prefetch_distance+3)*pix_per_block - 1
    761        blo     160f
    762 
    763        /* Wide case */
    764        /* Adjust X so that the decrement instruction can also test for
    765         * inner loop termination. We want it to stop when there are
    766         * (prefetch_distance+1) complete blocks to go. */
    767        sub     X, X, #(prefetch_distance+2)*pix_per_block
    768        mov     ORIG_W, X
    769  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
    770        /* This is stmdb sp!,{} */
        /* Hand-encoded because the register list (LINE_SAVED_REGS, built by
         * the line_saved_regs macro at the end of this file) is an
         * assembly-time computed bitmask; 0xE92D0000 is STMDB sp! with an
         * empty list field. */
    771        .word   0xE92D0000 | LINE_SAVED_REGS
    772   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
    773   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
    774  .endif
    775 151:    /* New line */
    776        \newline
    777        preload_leading_step1  src_bpp, WK1, SRC
    778        preload_leading_step1  mask_bpp, WK2, MASK
    779  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
    780        preload_leading_step1  dst_r_bpp, WK3, DST
    781  .endif
    782        
        /* WK0 = DST & 15 = destination misalignment; skip the leading-pixel
         * fix-up entirely if DST is already 16-byte aligned. */
    783        ands    WK0, DST, #15
    784        beq     154f
    785        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
    786 
    787        preload_leading_step2  src_bpp, src_bpp_shift, WK1, SRC
    788        preload_leading_step2  mask_bpp, mask_bpp_shift, WK2, MASK
    789  .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
    790        preload_leading_step2  dst_r_bpp, dst_bpp_shift, WK3, DST
    791  .endif
    792 
    793        leading_15bytes  \process_head, \process_tail
    794        
    795 154:    /* Destination now 16-byte aligned; we have at least one prefetch on each channel as well as at least one 16-byte output block */
        /* With exactly one non-destination channel, and a process that leaves
         * SCRATCH untouched, precompute in SCRATCH the byte distance covered
         * by the prefetch lead-in (presumably consumed by the inner-loop
         * macros, which are defined outside this excerpt — verify there). */
    796  .if (src_bpp > 0) && (mask_bpp == 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
    797        and     SCRATCH, SRC, #31
    798        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
    799  .elseif (src_bpp == 0) && (mask_bpp > 0) && ((flags) & FLAG_PROCESS_PRESERVES_SCRATCH)
    800        and     SCRATCH, MASK, #31
    801        rsb     SCRATCH, SCRATCH, #32*prefetch_distance
    802  .endif
        /* Use the caller's custom inner loop when supplied, otherwise the
         * default wide_case_inner_loop. */
    803  .ifc "\process_inner_loop",""
    804        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, wide_case_inner_loop, 157f
    805  .else
    806        switch_on_alignment  wide_case_inner_loop_and_trailing_pixels, \process_head, \process_tail, \process_inner_loop, 157f
    807  .endif
    808 
    809 157:    /* Check for another line */
    810        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_WIDE), 151b
        /* Rebalance the assembly-time offset trackers for the code assembled
         * after this block; the spill space itself is released at label 197. */
    811  .if (flags) & FLAG_SPILL_LINE_VARS_WIDE
    812   .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
    813   .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
    814  .endif
    815 .endif
    816 
        /* Flush any pending literal-pool entries between the width cases. */
    817 .ltorg
    818 
    819 160:    /* Medium case */
    820        mov     ORIG_W, X
    821 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
    822        /* This is stmdb sp!,{} */
        /* Hand-encoded STMDB sp! with the assembly-time register bitmask
         * LINE_SAVED_REGS OR-ed into the (empty) register-list field. */
    823        .word   0xE92D0000 | LINE_SAVED_REGS
    824  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
    825  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET + LINE_SAVED_REG_COUNT*4
    826 .endif
    827 161:    /* New line */
    828        \newline
        /* Medium rows: the whole line's preload is issued up front. */
    829        preload_line 0, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
    830        preload_line 0, mask_bpp, mask_bpp_shift, MASK
    831 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
    832        preload_line 0, dst_r_bpp, dst_bpp_shift, DST
    833 .endif
    834        
        /* Bias X by one 16-byte (128-bit) block so the inner-loop decrement
         * doubles as the termination test. */
    835        sub     X, X, #128/dst_w_bpp     /* simplifies inner loop termination */
    836        ands    WK0, DST, #15
    837        beq     164f
    838        rsb     WK0, WK0, #16 /* number of leading bytes until destination aligned */
    839        
    840        leading_15bytes  \process_head, \process_tail
    841        
    842 164:    /* Destination now 16-byte aligned; we have at least one 16-byte output block */
    843        switch_on_alignment  medium_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 167f
    844        
    845 167:    /* Check for another line */
    846        end_of_line 1, %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 161b
    847 
    848 .ltorg
    849 
    850 170:    /* Narrow case, less than 31 bytes, so no guarantee of at least one 16-byte block */
        /* ORIG_W is only kept for sub-32bpp destinations, matching the
         * %(dst_w_bpp < 32) argument passed to end_of_line at label 177. */
    851 .if dst_w_bpp < 32
    852        mov     ORIG_W, X
    853 .endif
    854 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
    855        /* This is stmdb sp!,{} */
        /* No .set adjustments here: the trackers were already advanced while
         * the medium case (labels 160-161) was assembled, and are wound back
         * exactly once, after label 177 — covering both non-wide spill paths. */
    856        .word   0xE92D0000 | LINE_SAVED_REGS
    857 .endif
    858 171:    /* New line */
    859        \newline
    860        preload_line 1, src_bpp, src_bpp_shift, SRC  /* in: X, corrupts: WK0-WK1 */
    861        preload_line 1, mask_bpp, mask_bpp_shift, MASK
    862 .if ((flags) & FLAG_NO_PRELOAD_DST) == 0
    863        preload_line 1, dst_r_bpp, dst_bpp_shift, DST
    864 .endif
    865        
        /* Process pixels one at a time until DST is word-aligned: up to 3
         * pixels at 8bpp, at most 1 at 16bpp. Each step decrements X and
         * bails to 177 if the row runs out first. */
    866 .if dst_w_bpp == 8
    867        tst     DST, #3
    868        beq     174f
    869 172:    subs    X, X, #1
    870        blo     177f
    871        \process_head  , 1, 0, 1, 1, 0
    872        \process_tail  , 1, 0
    873  .if !((flags) & FLAG_PROCESS_DOES_STORE)
    874        pixst   , 1, 0, DST
    875  .endif
    876        tst     DST, #3
    877        bne     172b
    878 .elseif dst_w_bpp == 16
    879        tst     DST, #2
    880        beq     174f
    881        subs    X, X, #1
    882        blo     177f
    883        \process_head  , 2, 0, 1, 1, 0
    884        \process_tail  , 2, 0
    885  .if !((flags) & FLAG_PROCESS_DOES_STORE)
    886        pixst   , 2, 0, DST
    887  .endif
    888 .endif
    889 
    890 174:    /* Destination now 4-byte aligned; we have 0 or more output bytes to go */
    891        switch_on_alignment  narrow_case_inner_loop_and_trailing_pixels, \process_head, \process_tail,, 177f
    892 
    893 177:    /* Check for another line */
    894        end_of_line %(dst_w_bpp < 32), %((flags) & FLAG_SPILL_LINE_VARS_NON_WIDE), 171b, last_one
        /* Rebalance the assembly-time offset trackers (see note above:
         * one wind-back for both the medium and narrow spill paths). */
    895 .if (flags) & FLAG_SPILL_LINE_VARS_NON_WIDE
    896  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
    897  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET - LINE_SAVED_REG_COUNT*4
    898 .endif
    899 
    900 197:
        /* Common exit: release the line-variable spill area (pushed by the
         * hand-encoded STMDBs in the wide/medium/narrow cases) if any. */
    901 .if (flags) & FLAG_SPILL_LINE_VARS
    902        add     sp, sp, #LINE_SAVED_REG_COUNT*4
    903 .endif
    904 198:
    905 .if (flags) & FLAG_PROCESS_CORRUPTS_WK0
        /* Release the word reserved for stashing X and rebalance the offset
         * trackers (mirrors the reservation made just after \init above). */
    906  .set ARGS_STACK_OFFSET, ARGS_STACK_OFFSET-4
    907  .set LOCALS_STACK_OFFSET, LOCALS_STACK_OFFSET-4
    908        add     sp, sp, #4
    909 .endif
    910 
        /* Caller-supplied teardown hook (macro argument). */
    911        \cleanup
    912 
    913 #ifdef DEBUG_PARAMS
    914        add     sp, sp, #9*4 /* junk the debug copy of arguments */
    915 #endif
    916 199:
        /* Restore callee-saved registers; popping lr into pc returns. */
    917        pop     {r4-r11, pc}  /* exit */
    918 
    919 .ltorg
    920 
        /* Drop the symbolic register names so later code can redefine them. */
    921    .unreq  X
    922    .unreq  Y
    923    .unreq  DST
    924    .unreq  STRIDE_D
    925    .unreq  SRC
    926    .unreq  STRIDE_S
    927    .unreq  MASK
    928    .unreq  STRIDE_M
    929    .unreq  WK0
    930    .unreq  WK1
    931    .unreq  WK2
    932    .unreq  WK3
    933    .unreq  SCRATCH
    934    .unreq  ORIG_W
    935    pixman_end_asm_function
    936 .endm
    937 
    938 .macro line_saved_regs  x:vararg
    939 .set LINE_SAVED_REGS, 0
    940 .set LINE_SAVED_REG_COUNT, 0
    941 .irp SAVED_REG,\x
    942  .ifc "SAVED_REG","Y"
    943   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<1)
    944   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
    945  .endif
    946  .ifc "SAVED_REG","STRIDE_D"
    947   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<3)
    948   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
    949  .endif
    950  .ifc "SAVED_REG","STRIDE_S"
    951   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<5)
    952   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
    953  .endif
    954  .ifc "SAVED_REG","STRIDE_M"
    955   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<7)
    956   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
    957  .endif
    958  .ifc "SAVED_REG","ORIG_W"
    959   .set LINE_SAVED_REGS, LINE_SAVED_REGS | (1<<14)
    960   .set LINE_SAVED_REG_COUNT, LINE_SAVED_REG_COUNT + 1
    961  .endif
    962 .endr
    963 .endm
    964 
        /* Deliberately empty macro: a do-nothing stand-in that accepts and
         * discards any arguments — presumably passed where a caller leaves an
         * optional hook (init/newline/cleanup style) unused; confirm at the
         * call sites in the corresponding .S files. */
    965 .macro nop_macro x:vararg
    966 .endm