[ tor-browser ].git.dasho

pixman-mips-memcpy-asm.S (10556B)
      1 /*
      2 * Copyright (c) 2012
      3 *      MIPS Technologies, Inc., California.
      4 *
      5 * Redistribution and use in source and binary forms, with or without
      6 * modification, are permitted provided that the following conditions
      7 * are met:
      8 * 1. Redistributions of source code must retain the above copyright
      9 *    notice, this list of conditions and the following disclaimer.
     10 * 2. Redistributions in binary form must reproduce the above copyright
     11 *    notice, this list of conditions and the following disclaimer in the
     12 *    documentation and/or other materials provided with the distribution.
     13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
     14 *    contributors may be used to endorse or promote products derived from
     15 *    this software without specific prior written permission.
     16 *
     17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
     18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
     19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
     20 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
     21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
     22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
     23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
     24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
     25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
     26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
     27 * SUCH DAMAGE.
     28 */
     29 
     30 #include "pixman-mips-dspr2-asm.h"
     31 
     32 /*
     33 * This routine could be optimized for MIPS64. The current code only
     34 * uses MIPS32 instructions.
     35 */
     36 
     37 #ifdef EB	/* EB selects the big-endian mapping of the partial-word ops */
     38 #  define LWHI	lwl		/* high part is left in big-endian */
     39 #  define SWHI	swl		/* high part is left in big-endian */
     40 #  define LWLO	lwr		/* low part is right in big-endian */
     41 #  define SWLO	swr		/* low part is right in big-endian */
     42 #else	/* otherwise use the little-endian mapping */
     43 #  define LWHI	lwr		/* high part is right in little-endian */
     44 #  define SWHI	swr		/* high part is right in little-endian */
     45 #  define LWLO	lwl		/* low part is left in little-endian */
     46 #  define SWLO	swl		/* low part is left in little-endian */
     47 #endif
     48 
     49 LEAF_MIPS32R2(pixman_mips_fast_memcpy)
     50 
        /*
         * Copies a2 bytes from a1 (src) to a0 (dst) and returns the original
         * dst in v0.
         *
         * Register roles, as used by the code below:
         *   a0  current dst pointer      a1  current src pointer
         *   a2  bytes still to copy      v0  original dst (return value)
         *   t0-t7  data words in flight  AT, a3, t8, t9, v1  scratch
         *
         * Path selection:
         *   - fewer than 8 bytes: byte loop at $last8
         *   - src and dst share the same low 2 address bits: aligned path
         *     (lw/sw after dst is brought to a word boundary)
         *   - otherwise: $unaligned path (LWHI/LWLO pairs for src, sw for dst)
         *
         * The instruction following each branch/jump is written as a branch
         * delay slot (see the explicit nops). This assumes noreorder mode,
         * presumably selected by LEAF_MIPS32R2 -- confirm in
         * pixman-mips-dspr2-asm.h.
         */
     51 slti	AT, a2, 8	/* AT = 1 when fewer than 8 bytes are requested */
     52 bne	AT, zero, $last8	/* short copy: byte loop only */
     53 move	v0, a0	/* memcpy returns the dst pointer */
     54 
     55 /* Test if the src and dst are word-aligned, or can be made word-aligned */
     56 xor	t8, a1, a0
     57 andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */
     58 
     59 bne	t8, zero, $unaligned
     60 negu	a3, a0	/* delay slot: $unaligned also relies on a3 = -a0 */
     61 
     62 andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
     63 beq	a3, zero, $chk16w	/* when a3=0 then the dst (a0) is word-aligned */
     64 subu	a2, a2, a3	/* now a2 is the remaining bytes count */
     65 
     66 LWHI	t8, 0(a1)	/* partial-word load of the leading src bytes */
     67 addu	a1, a1, a3
     68 SWHI	t8, 0(a0)	/* partial-word store up to dst's word boundary */
     69 addu	a0, a0, a3
     70 
     71 /* Now the dst/src are mutually word-aligned with word-aligned addresses */
     72 $chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
     73 			/* t8 is the byte count after 64-byte chunks */
     74 
     75 beq	a2, t8, $chk8w	/* if a2==t8, no 64-byte chunks */
     76 			/* There will be at most 1 32-byte chunk after it */
     77 subu	a3, a2, t8	/* subtract from a2 the remainder */
     78                                /* Here a3 counts bytes in 16w chunks */
     79 addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
     80 
     81 addu	t0, a0, a2	/* t0 is the "past the end" address */
     82 
     83 /*
     84 * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
     85 * the "t0-32" address
     86 * This means: for x=128 the last "safe" a0 address is "t0-160"
     87 * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
     88 * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
     89 */
     90 subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
     91 
     92 pref    0, 0(a1)		/* bring the first line of src, addr 0 */
     93 pref    0, 32(a1)	/* bring the second line of src, addr 32 */
     94 pref    0, 64(a1)	/* bring the third line of src, addr 64 */
     95 pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
     96 /* In case the a0 > t9 don't use "pref 30" at all */
     97 sgtu	v1, a0, t9	/* v1 = 1 once a0 is past the last safe address */
     98 bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
     99 nop
    100 /* otherwise, start with using pref30 */
    101 pref	30, 64(a0)
    102 $loop16w:
    103 pref	0, 96(a1)
    104 lw	t0, 0(a1)
    105 bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
    106 lw	t1, 4(a1)
    107 pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
    108 $skip_pref30_96:
    109 lw	t2, 8(a1)
    110 lw	t3, 12(a1)
    111 lw	t4, 16(a1)
    112 lw	t5, 20(a1)
    113 lw	t6, 24(a1)
    114 lw	t7, 28(a1)
    115        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
    116 
    117 sw	t0, 0(a0)
    118 sw	t1, 4(a0)
    119 sw	t2, 8(a0)
    120 sw	t3, 12(a0)
    121 sw	t4, 16(a0)
    122 sw	t5, 20(a0)
    123 sw	t6, 24(a0)
    124 sw	t7, 28(a0)
    125 
    126 lw	t0, 32(a1)
    127 bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
    128 lw	t1, 36(a1)
    129 pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
    130 $skip_pref30_128:
    131 lw	t2, 40(a1)
    132 lw	t3, 44(a1)
    133 lw	t4, 48(a1)
    134 lw	t5, 52(a1)
    135 lw	t6, 56(a1)
    136 lw	t7, 60(a1)
    137        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
    138 
    139 sw	t0, 32(a0)
    140 sw	t1, 36(a0)
    141 sw	t2, 40(a0)
    142 sw	t3, 44(a0)
    143 sw	t4, 48(a0)
    144 sw	t5, 52(a0)
    145 sw	t6, 56(a0)
    146 sw	t7, 60(a0)
    147 
    148 addiu	a0, a0, 64	/* adding 64 to dest */
    149 sgtu	v1, a0, t9	/* re-evaluate the "pref 30" guard for the next pass */
    150 bne	a0, a3, $loop16w
    151 addiu	a1, a1, 64	/* adding 64 to src */
    152 move	a2, t8	/* a2 = bytes left after the 64-byte chunks */
    153 
    154 /* Here we have src and dest word-aligned but less than 64-bytes to go */
    155 
    156 $chk8w:
    157 pref 0, 0x0(a1)
    158 andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
    159 			/* the t8 is the remainder count past 32-bytes */
    160 beq	a2, t8, $chk1w	/* when a2=t8, no 32-byte chunk */
    161  nop
    162 
    163 lw	t0, 0(a1)
    164 lw	t1, 4(a1)
    165 lw	t2, 8(a1)
    166 lw	t3, 12(a1)
    167 lw	t4, 16(a1)
    168 lw	t5, 20(a1)
    169 lw	t6, 24(a1)
    170 lw	t7, 28(a1)
    171 addiu	a1, a1, 32
    172 
    173 sw	t0, 0(a0)
    174 sw	t1, 4(a0)
    175 sw	t2, 8(a0)
    176 sw	t3, 12(a0)
    177 sw	t4, 16(a0)
    178 sw	t5, 20(a0)
    179 sw	t6, 24(a0)
    180 sw	t7, 28(a0)
    181 addiu	a0, a0, 32
    182 
    183 $chk1w:
    184 andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
    185 beq	a2, t8, $last8
    186 subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
    187 addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
    188 
    189 /* copying in words (4-byte chunks) */
    190 $wordCopy_loop:
    191 lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */
    192 addiu	a1, a1, 4
    193 addiu	a0, a0, 4
    194 bne	a0, a3, $wordCopy_loop
    195 sw	t3, -4(a0)	/* delay slot: store the word just loaded */
    196 
    197 /* For the last (<8) bytes */
    198 $last8:
    199 blez	a2, leave	/* nothing left to copy */
    200 addu	a3, a0, a2	/* a3 is the dst address just past the last byte */
    201 $last8loop:
    202 lb	v1, 0(a1)
    203 addiu	a1, a1, 1
    204 addiu	a0, a0, 1
    205 bne	a0, a3, $last8loop
    206 sb	v1, -1(a0)	/* delay slot: store the byte just loaded */
    207 
    208 leave:	j	ra	/* return; v0 still holds the original dst */
    209 nop
    210 
    211 /*
    212 * UNALIGNED case
    213 */
    214 
    215 $unaligned:
    216 /* got here with a3="negu a0" */
    217 andi	a3, a3, 0x3	/* test if the a0 is word aligned */
    218 beqz	a3, $ua_chk16w
    219 subu	a2, a2, a3	/* bytes left after initial a3 bytes */
    220 
    221 LWHI	v1, 0(a1)
    222 LWLO	v1, 3(a1)
    223 addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
    224 SWHI	v1, 0(a0)
    225 addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */
    226 
    227 $ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
    228 			/* t8 is the byte count after 64-byte chunks */
    229 beq	a2, t8, $ua_chk8w	/* if a2==t8, no 64-byte chunks */
    230 			/* There will be at most 1 32-byte chunk after it */
    231 subu	a3, a2, t8	/* subtract from a2 the remainder */
    232                                /* Here a3 counts bytes in 16w chunks */
    233 addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
    234 
    235 addu	t0, a0, a2	/* t0 is the "past the end" address */
    236 
    237 subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
    238 
    239 pref    0, 0(a1)		/* bring the first line of src, addr 0 */
    240 pref    0, 32(a1)	/* bring the second line of src, addr 32 */
    241 pref    0, 64(a1)	/* bring the third line of src, addr 64 */
    242 pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
    243 /* In case the a0 > t9 don't use "pref 30" at all */
    244 sgtu	v1, a0, t9	/* v1 = 1 once a0 is past the last safe address */
    245 bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
    246 nop
    247 /* otherwise,  start with using pref30 */
    248 pref	30, 64(a0)
    249 $ua_loop16w:
    250 pref	0, 96(a1)
    251 LWHI	t0, 0(a1)
    252 LWLO	t0, 3(a1)
    253 LWHI	t1, 4(a1)
    254 bgtz	v1, $ua_skip_pref30_96
    255 LWLO	t1, 7(a1)
    256 pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
    257 $ua_skip_pref30_96:
    258 LWHI	t2, 8(a1)
    259 LWLO	t2, 11(a1)
    260 LWHI	t3, 12(a1)
    261 LWLO	t3, 15(a1)
    262 LWHI	t4, 16(a1)
    263 LWLO	t4, 19(a1)
    264 LWHI	t5, 20(a1)
    265 LWLO	t5, 23(a1)
    266 LWHI	t6, 24(a1)
    267 LWLO	t6, 27(a1)
    268 LWHI	t7, 28(a1)
    269 LWLO	t7, 31(a1)
    270        pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
    271 
    272 sw	t0, 0(a0)
    273 sw	t1, 4(a0)
    274 sw	t2, 8(a0)
    275 sw	t3, 12(a0)
    276 sw	t4, 16(a0)
    277 sw	t5, 20(a0)
    278 sw	t6, 24(a0)
    279 sw	t7, 28(a0)
    280 
    281 LWHI	t0, 32(a1)
    282 LWLO	t0, 35(a1)
    283 LWHI	t1, 36(a1)
    284 bgtz	v1, $ua_skip_pref30_128
    285 LWLO	t1, 39(a1)
    286 pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
    287 $ua_skip_pref30_128:
    288 LWHI	t2, 40(a1)
    289 LWLO	t2, 43(a1)
    290 LWHI	t3, 44(a1)
    291 LWLO	t3, 47(a1)
    292 LWHI	t4, 48(a1)
    293 LWLO	t4, 51(a1)
    294 LWHI	t5, 52(a1)
    295 LWLO	t5, 55(a1)
    296 LWHI	t6, 56(a1)
    297 LWLO	t6, 59(a1)
    298 LWHI	t7, 60(a1)
    299 LWLO	t7, 63(a1)
    300        pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
    301 
    302 sw	t0, 32(a0)
    303 sw	t1, 36(a0)
    304 sw	t2, 40(a0)
    305 sw	t3, 44(a0)
    306 sw	t4, 48(a0)
    307 sw	t5, 52(a0)
    308 sw	t6, 56(a0)
    309 sw	t7, 60(a0)
    310 
    311 addiu	a0, a0, 64	/* adding 64 to dest */
    312 sgtu	v1, a0, t9	/* re-evaluate the "pref 30" guard for the next pass */
    313 bne	a0, a3, $ua_loop16w
    314 addiu	a1, a1, 64	/* adding 64 to src */
    315 move	a2, t8	/* a2 = bytes left after the 64-byte chunks */
    316 
    317 /* Here dst is word-aligned (src is not, see NOTE1) and less than 64 bytes remain */
    318 
    319 $ua_chk8w:
    320 pref 0, 0x0(a1)
    321 andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
    322 			/* the t8 is the remainder count */
    323 beq	a2, t8, $ua_chk1w	/* when a2=t8, no 32-byte chunk */
        /*
         * No nop follows this branch (unlike $chk8w). Assuming noreorder,
         * the LWHI below occupies the delay slot and therefore also runs
         * when the branch is taken. t0 is not read again on the taken path,
         * so this looks like a deliberate delay-slot fill.
         * NOTE(review): confirm it is intentional.
         */
    324 
    325 LWHI	t0, 0(a1)
    326 LWLO	t0, 3(a1)
    327 LWHI	t1, 4(a1)
    328 LWLO	t1, 7(a1)
    329 LWHI	t2, 8(a1)
    330 LWLO	t2, 11(a1)
    331 LWHI	t3, 12(a1)
    332 LWLO	t3, 15(a1)
    333 LWHI	t4, 16(a1)
    334 LWLO	t4, 19(a1)
    335 LWHI	t5, 20(a1)
    336 LWLO	t5, 23(a1)
    337 LWHI	t6, 24(a1)
    338 LWLO	t6, 27(a1)
    339 LWHI	t7, 28(a1)
    340 LWLO	t7, 31(a1)
    341 addiu	a1, a1, 32
    342 
    343 sw	t0, 0(a0)
    344 sw	t1, 4(a0)
    345 sw	t2, 8(a0)
    346 sw	t3, 12(a0)
    347 sw	t4, 16(a0)
    348 sw	t5, 20(a0)
    349 sw	t6, 24(a0)
    350 sw	t7, 28(a0)
    351 addiu	a0, a0, 32
    352 
    353 $ua_chk1w:
    354 andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
    355 beq	a2, t8, $ua_smallCopy
    356 subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
    357 addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
    358 
    359 /* copying in words (4-byte chunks) */
    360 $ua_wordCopy_loop:
    361 LWHI	v1, 0(a1)
    362 LWLO	v1, 3(a1)
    363 addiu	a1, a1, 4
    364 addiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */
    365 bne	a0, a3, $ua_wordCopy_loop
    366 sw	v1, -4(a0)	/* delay slot: store the word just assembled */
    367 
    368 /* Now less than 4 bytes (value in a2) left to copy */
    369 $ua_smallCopy:
    370 beqz	a2, leave	/* nothing left to copy */
    371 addu	a3, a0, a2	/* a3 is the dst address just past the last byte */
    372 $ua_smallCopy_loop:
    373 lb	v1, 0(a1)
    374 addiu	a1, a1, 1
    375 addiu	a0, a0, 1
    376 bne	a0, a3, $ua_smallCopy_loop
    377 sb	v1, -1(a0)	/* delay slot: store the byte just loaded */
    378 
    379 j	ra	/* return; v0 still holds the original dst */
    380 nop
    381 
    382 END(pixman_mips_fast_memcpy)
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE