pixman-arm-simd-asm-scaled.S (4737B)
/*
 * Copyright © 2008 Mozilla Corporation
 * Copyright © 2010 Nokia Corporation
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Mozilla Corporation not be used in
 * advertising or publicity pertaining to distribution of the software without
 * specific, written prior permission.  Mozilla Corporation makes no
 * representations about the suitability of this software for any purpose.  It
 * is provided "as is" without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Jeff Muizelaar (jeff@infidigm.net)
 *
 */

/* Mark the stack as non-executable on ELF/Linux targets */
#if defined(__linux__) && defined(__ELF__)
.section .note.GNU-stack,"",%progbits
#endif

.text
.arch armv6
.object_arch armv4
.arm
.altmacro
.p2align 2

#include "pixman-arm-asm.h"

pixman_syntax_unified

/*
 * Nearest scanline scalers, generated from a single macro template.
 *
 * Instruction set / tuning:
 *   Only armv5te instructions are used (nothing armv6 specific), but the
 *   instruction order is scheduled for the ARM Cortex-A8 pipeline.  Separate
 *   variants tuned per microarchitecture may be needed eventually.
 *
 * TODO: ARM9/ARM11 cores have no efficient write combining; to perform well
 * there, the stores should become 16-byte aligned writes done with STM.
 *
 * Template arguments:
 *   fname                     - name of the function to generate
 *   bpp_shift                 - log2 of the pixel size in bytes, i.e. a pixel
 *                               occupies (1 << bpp_shift) bytes
 *   t                         - type suffix appended to LDR/STR (for example
 *                               "h" for halfword, empty for word)
 *   prefetch_distance         - how many pixels ahead of the current source
 *                               position the PLD is issued
 *   prefetch_braking_distance - prefetching stops once this many pixels are
 *                               left before the end of the scanline
 *
 * Generated function, as seen from its register/stack usage:
 *   r0        W               - number of pixels to produce
 *   r1        DST             - destination pointer, advanced one pixel per
 *                               store
 *   r2        SRC             - source base pointer
 *   r3        VX              - source x position; the integer pixel index is
 *                               VX >> 16
 *   [sp]      UNIT_X          - increment added to VX for each output pixel
 *   [sp, #4]  SRC_WIDTH_FIXED - amount subtracted from VX while VX is not
 *                               negative
 *
 * After every step VX is brought back below zero, so the byte offsets applied
 * to SRC are negative.
 * NOTE(review): this presumably means the caller passes SRC biased to the end
 * of the source row and an initially negative VX - confirm against the C
 * caller, which is not part of this file.
 */

.macro generate_nearest_scanline_func fname, bpp_shift, t, \
                                      prefetch_distance, \
                                      prefetch_braking_distance

pixman_asm_function \fname
    /* incoming arguments */
    W                .req r0
    DST              .req r1
    SRC              .req r2
    VX               .req r3
    UNIT_X           .req ip
    SRC_WIDTH_FIXED  .req r8
    /* working registers */
    TMP_EVEN         .req r4    /* byte offset, then pixel: 1st of a pair */
    TMP_ODD          .req r5    /* byte offset, then pixel: 2nd of a pair */
    OFFS_MASK        .req r6
    PF_POS           .req r7    /* x position used for prefetching */

    /* UNIT_X sits at the entry sp, so fetch it before the push moves sp */
    ldr     UNIT_X, [sp]
    /*
     * Six words (24 bytes) are saved.  r10 is not referenced anywhere else
     * in this function.
     * NOTE(review): it looks like padding that keeps the pushed size a
     * multiple of 8 bytes - confirm.
     */
    push    {r4-r8, r10}
    /*
     * OFFS_MASK = ~(pixel size - 1).  Shifting VX right by (16 - bpp_shift)
     * leaves bpp_shift fractional bits at the bottom; this mask removes them,
     * giving (VX >> 16) * pixel size.
     */
    mvn     OFFS_MASK, #((1 << \bpp_shift) - 1)
    /* 24 bytes pushed + 4 = the word that was at entry sp + 4 */
    ldr     SRC_WIDTH_FIXED, [sp, #28]

    /*
     * Helper: must directly follow the flag-setting "adds VX, VX, UNIT_X"
     * (instructions that leave the flags alone may sit in between).  While
     * VX is non-negative, SRC_WIDTH_FIXED is subtracted from it.
     */
    .macro wrap_vx
9:      subspl  VX, VX, SRC_WIDTH_FIXED
        bpl     9b
    .endm

    /*
     * Helper: emit two pixels.
     *   on entry: TMP_EVEN = byte offset of the next source pixel,
     *             VX       = position of the pixel after that one
     *   on exit:  the same relationship holds, two pixels further on
     * The offset for the following pixel is computed between each load and
     * its store.
     */
    .macro scale_2_pixels
        ldr\()\t    TMP_EVEN, [SRC, TMP_EVEN]
        and         TMP_ODD, OFFS_MASK, VX, asr #(16 - \bpp_shift)
        adds        VX, VX, UNIT_X
        str\()\t    TMP_EVEN, [DST], #(1 << \bpp_shift)
        wrap_vx

        ldr\()\t    TMP_ODD, [SRC, TMP_ODD]
        and         TMP_EVEN, OFFS_MASK, VX, asr #(16 - \bpp_shift)
        adds        VX, VX, UNIT_X
        str\()\t    TMP_ODD, [DST], #(1 << \bpp_shift)
        wrap_vx
    .endm

    /* prime the pipeline: offset of pixel 0, then step VX to pixel 1 */
    and     TMP_EVEN, OFFS_MASK, VX, asr #(16 - \bpp_shift)
    adds    VX, VX, UNIT_X
    wrap_vx

    /*
     * Bias W so that it stays non-negative only while at least
     * (8 + braking distance) pixels remain; skip the prefetching loop
     * altogether for shorter runs.
     */
    subs    W, W, #(8 + \prefetch_braking_distance)
    blt     2f
    /* PF_POS = UNIT_X * prefetch_distance + VX */
    mov     PF_POS, #\prefetch_distance
    mla     PF_POS, UNIT_X, PF_POS, VX
1:  /* main loop: 8 pixels per iteration, one prefetch per iteration */
    pld     [SRC, PF_POS, asr #(16 - \bpp_shift)]
    /* move the prefetch position 8 pixels on; it is never reduced by
     * SRC_WIDTH_FIXED the way VX is */
    add     PF_POS, PF_POS, UNIT_X, lsl #3
    scale_2_pixels
    scale_2_pixels
    scale_2_pixels
    scale_2_pixels
    subs    W, W, #8
    bge     1b
2:
    /* undo the bias above and apply a new one: W = pixels remaining - 4 */
    subs    W, W, #(4 - 8 - \prefetch_braking_distance)
    blt     2f
1:  /* no prefetch any more: 4 pixels per iteration */
    scale_2_pixels
    scale_2_pixels
    subs    W, W, #4
    bge     1b
2:
    /* W is now (pixels remaining - 4) with 0..3 pixels remaining, so its
     * two lowest bits equal the remaining count */
    tst     W, #2
    beq     2f
    scale_2_pixels
2:
    /* last odd pixel: TMP_EVEN already holds its byte offset */
    tst     W, #1
    ldr\()\t\()ne   TMP_EVEN, [SRC, TMP_EVEN]
    str\()\t\()ne   TMP_EVEN, [DST]

    /* restore saved registers and return */
    pop     {r4-r8, r10}
    bx      lr

    /* drop the helper macros and register aliases so that the template can
     * be expanded again */
    .purgem scale_2_pixels
    .purgem wrap_vx
    .unreq  W
    .unreq  DST
    .unreq  SRC
    .unreq  VX
    .unreq  UNIT_X
    .unreq  SRC_WIDTH_FIXED
    .unreq  TMP_EVEN
    .unreq  TMP_ODD
    .unreq  OFFS_MASK
    .unreq  PF_POS
pixman_end_asm_function
.endm

/* 2 bytes per pixel, halfword loads/stores, prefetch 80 pixels ahead */
generate_nearest_scanline_func \
    pixman_scaled_nearest_scanline_0565_0565_SRC_asm_armv6, 1, h, 80, 32

/* 4 bytes per pixel, word loads/stores, prefetch 48 pixels ahead */
generate_nearest_scanline_func \
    pixman_scaled_nearest_scanline_8888_8888_SRC_asm_armv6, 2, , 48, 32