[ tor-browser ].git.dasho

pixman-arm-simd-asm-scaled.S (4737B)
      1 /*
      2 * Copyright © 2008 Mozilla Corporation
      3 * Copyright © 2010 Nokia Corporation
      4 *
      5 * Permission to use, copy, modify, distribute, and sell this software and its
      6 * documentation for any purpose is hereby granted without fee, provided that
      7 * the above copyright notice appear in all copies and that both that
      8 * copyright notice and this permission notice appear in supporting
      9 * documentation, and that the name of Mozilla Corporation not be used in
     10 * advertising or publicity pertaining to distribution of the software without
     11 * specific, written prior permission.  Mozilla Corporation makes no
     12 * representations about the suitability of this software for any purpose.  It
     13 * is provided "as is" without express or implied warranty.
     14 *
     15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
     16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
     17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
     18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
     19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
     20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
     21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
     22 * SOFTWARE.
     23 *
     24 * Author:  Jeff Muizelaar (jeff@infidigm.net)
     25 *
     26 */
     27 
     28 /* Prevent the stack from becoming executable */
     29 #if defined(__linux__) && defined(__ELF__)
     30 .section .note.GNU-stack,"",%progbits	/* empty section with no flags, emitted only for Linux ELF builds */
     31 #endif
     32 
     33 .text	/* switch back to the code section for everything below */
     34 .arch armv6	/* let the assembler accept instructions up to ARMv6 */
     35 .object_arch armv4	/* but record ARMv4 as the architecture in the object file attributes */
     36 .arm	/* assemble ARM (not Thumb) encodings */
     37 .altmacro	/* enable the alternate macro syntax of GNU as */
     38 .p2align 2	/* align to 2^2 = 4 bytes */
     39 
     40 #include "pixman-arm-asm.h"
     41 
     42 pixman_syntax_unified	/* NOTE(review): defined outside this file, presumably in pixman-arm-asm.h, and presumably selects unified ARM syntax - confirm */
     43 
     44 /*
     45 * Note: This code is only using armv5te instructions (not even armv6),
     46 *       but is scheduled for ARM Cortex-A8 pipeline. So it might need to
     47 *       be split into a few variants, tuned for each microarchitecture.
     48 *
     49 * TODO: In order to get good performance on ARM9/ARM11 cores (which don't
     50 * have efficient write combining), it needs to be changed to use 16-byte
     51 * aligned writes using STM instruction.
     52 *
     53 * Nearest scanline scaler macro template uses the following arguments:
     54 *  fname                     - name of the function to generate
     55 *  bpp_shift                 - (1 << bpp_shift) is the size of pixel in bytes
     56 *  t                         - type suffix for LDR/STR instructions
     57 *  prefetch_distance         - prefetch in the source image by that many
     58 *                              pixels ahead
     59 *  prefetch_braking_distance - stop prefetching when that many pixels are
     60 *                              remaining before the end of scanline
     61 */
     62 
     63 .macro generate_nearest_scanline_func fname, bpp_shift, t,      \
     64                                      prefetch_distance,        \
     65                                      prefetch_braking_distance
     66 /* Emits the function named by fname: for each of W pixels it copies the source pixel at SRC + ((VX >> 16) << bpp_shift) to DST and steps VX by UNIT_X. */
     67 pixman_asm_function \fname	/* NOTE(review): function-entry helper defined outside this file, presumably in pixman-arm-asm.h */
     68 W		.req	r0	/* number of pixels still to write; biased by constants further down */
     69 DST		.req	r1	/* destination pointer, post-incremented by every pixel store */
     70 SRC		.req	r2	/* base address that the source byte offsets are added to */
     71 VX		.req	r3	/* fixed-point source position; bits 16 and up select the pixel */
     72 UNIT_X		.req	ip	/* amount added to VX for every destination pixel */
     73 TMP1		.req	r4	/* alternately a source byte offset and the pixel loaded from it */
     74 TMP2		.req	r5	/* same role as TMP1, used for every second pixel */
     75 VXMASK		.req	r6	/* mask that clears the low bpp_shift bits of a byte offset */
     76 PF_OFFS		.req	r7	/* fixed-point source position used for prefetching */
     77 SRC_WIDTH_FIXED	.req	r8	/* subtracted from VX for as long as VX is non-negative */
     78 
     79 ldr	UNIT_X, [sp]	/* first stack word, read before the push below moves sp */
     80 push	{r4, r5, r6, r7, r8, r10}	/* 6 registers = 24 bytes; r10 is saved and restored but not referenced anywhere else in this macro */
     81 mvn	VXMASK, #((1 << \bpp_shift) - 1)	/* VXMASK = ~(pixel size in bytes - 1) */
     82 ldr	SRC_WIDTH_FIXED, [sp, #28]	/* 24 bytes were pushed, so this is the word that was at sp + 4 on entry */
     83 
     84 /* define helper macro: copies two pixels; on entry TMP1 holds the byte offset of the next source pixel, and on exit TMP1 holds the offset of the one after these two */
     85 .macro	scale_2_pixels
     86 	ldr\()\t	TMP1, [SRC, TMP1]	/* fetch pixel 1; TMP1 turns from byte offset into pixel value */
     87 	and	TMP2, VXMASK, VX, asr #(16 - \bpp_shift)	/* TMP2 = (VX >> 16) << bpp_shift, the byte offset of pixel 2 */
     88 	adds	VX, VX, UNIT_X	/* step the source position; sets N for the conditional subtract below */
     89 	str\()\t	TMP1, [DST], #(1 << \bpp_shift)	/* store pixel 1 and advance DST by one pixel; flags are left untouched */
     90 9:		subspl	VX, VX, SRC_WIDTH_FIXED	/* executed only while VX >= 0: VX -= SRC_WIDTH_FIXED, flags updated */
     91 	bpl	9b	/* repeat until VX is negative */
     92 /* second pixel: the same sequence with the roles of TMP1 and TMP2 swapped */
     93 	ldr\()\t	TMP2, [SRC, TMP2]	/* fetch pixel 2 */
     94 	and	TMP1, VXMASK, VX, asr #(16 - \bpp_shift)	/* TMP1 = byte offset of the pixel that follows this pair */
     95 	adds	VX, VX, UNIT_X	/* step the source position again */
     96 	str\()\t	TMP2, [DST], #(1 << \bpp_shift)	/* store pixel 2 and advance DST */
     97 9:		subspl	VX, VX, SRC_WIDTH_FIXED	/* same wrap loop as above */
     98 	bpl	9b	/* VX is negative when this falls through, so offsets derived from it address memory below SRC */
     99 .endm
    100 
    101 /* now do the scaling */
    102 and	TMP1, VXMASK, VX, asr #(16 - \bpp_shift)	/* byte offset of the first source pixel, taken from the incoming VX */
    103 adds	VX, VX, UNIT_X	/* move VX on to the second pixel; sets N for the wrap loop */
    104 9:	subspl	VX, VX, SRC_WIDTH_FIXED	/* while VX >= 0: VX -= SRC_WIDTH_FIXED */
    105 bpl	9b	/* leaves VX negative */
    106 subs	W, W, #(8 + \prefetch_braking_distance)	/* bias W: a negative result means fewer than 8 + braking distance pixels */
    107 blt	2f	/* too few pixels for the prefetching loop */
    108 /* calculate prefetch offset */
    109 mov	PF_OFFS, #\prefetch_distance
    110 mla	PF_OFFS, UNIT_X, PF_OFFS, VX	/* PF_OFFS = UNIT_X * prefetch_distance + VX */
    111 1:	/* main loop, process 8 pixels per iteration with prefetch */
    112 pld	[SRC, PF_OFFS, asr #(16 - \bpp_shift)]	/* preload hint for the source data at the prefetch position */
    113 add	PF_OFFS, PF_OFFS, UNIT_X, lsl #3	/* prefetch position += 8 * UNIT_X, matching the 8 pixels per iteration */
    114 scale_2_pixels
    115 scale_2_pixels
    116 scale_2_pixels
    117 scale_2_pixels
    118 subs	W, W, #8
    119 bge	1b	/* continue while at least 8 + braking distance pixels were left before this subtract */
    120 2:
    121 subs	W, W, #(4 - 8 - \prefetch_braking_distance)	/* remove the bias and subtract 4: W = pixels left - 4 */
    122 blt	2f	/* fewer than 4 pixels left */
    123 1:	/* process the remaining pixels */
    124 scale_2_pixels
    125 scale_2_pixels
    126 subs	W, W, #4
    127 bge	1b
    128 2:
    129 tst	W, #2	/* W = pixels left - 4 here; subtracting 4 does not change bits 0 and 1, so they still describe the pixels left */
    130 beq	2f
    131 scale_2_pixels
    132 2:
    133 tst	W, #1	/* is there one last odd pixel? */
    134 ldr\()\t\()ne	TMP1, [SRC, TMP1]	/* TMP1 already holds its byte offset */
    135 str\()\t\()ne	TMP1, [DST]	/* final store, so DST is not advanced */
    136 /* cleanup helper macro so that the next expansion of this template can define it again */
    137 .purgem	scale_2_pixels
    138 .unreq	DST	/* release the register aliases created at the top of the template */
    139 .unreq	SRC
    140 .unreq	W
    141 .unreq	VX
    142 .unreq	UNIT_X
    143 .unreq	TMP1
    144 .unreq	TMP2
    145 .unreq	VXMASK
    146 .unreq	PF_OFFS
    147 .unreq  SRC_WIDTH_FIXED
    148 /* return */
    149 pop	{r4, r5, r6, r7, r8, r10}	/* restore exactly the registers pushed on entry */
    150 bx	lr
    151 pixman_end_asm_function	/* NOTE(review): function-end helper defined outside this file, presumably in pixman-arm-asm.h */
    152 .endm
    153 
    154 generate_nearest_scanline_func \
    155    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32	/* bpp_shift 1 = 2-byte pixels, halfword loads and stores, prefetch 80 pixels ahead, stop prefetching 32 pixels before the end */
    156 
    157 generate_nearest_scanline_func \
    158    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2,  , 48, 32	/* bpp_shift 2 = 4-byte pixels, empty type suffix = word loads and stores, prefetch 48 pixels ahead, stop prefetching 32 pixels before the end */
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE