yuv_convert_arm.cpp (8303B)
1 // Copyright (c) 2010 The Chromium Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 // contributor Siarhei Siamashka <siarhei.siamashka@gmail.com> 6 7 #include "yuv_convert.h" 8 #include "ycbcr_to_rgb565.h" 9 10 11 12 #ifdef HAVE_YCBCR_TO_RGB565 13 14 namespace mozilla { 15 16 namespace gfx { 17 18 # if defined(MOZILLA_MAY_SUPPORT_NEON) 19 # if defined(__clang__) 20 void __attribute((noinline)) 21 # else 22 void __attribute((noinline,optimize("-fomit-frame-pointer"))) 23 # endif 24 yuv42x_to_rgb565_row_neon(uint16_t *dst, 25 const uint8_t *y, 26 const uint8_t *u, 27 const uint8_t *v, 28 int n, 29 int oddflag) 30 { 31 static __attribute__((aligned(16))) uint16_t acc_r[8] = { 32 22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840, 33 }; 34 static __attribute__((aligned(16))) uint16_t acc_g[8] = { 35 17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312, 36 }; 37 static __attribute__((aligned(16))) uint16_t acc_b[8] = { 38 28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832, 39 }; 40 /* 41 * Registers: 42 * q0, q1 : d0, d1, d2, d3 - are used for initial loading of YUV data 43 * q2 : d4, d5 - are used for storing converted RGB data 44 * q3 : d6, d7 - are used for temporary storage 45 * 46 * q4-q7 - reserved 47 * 48 * q8, q9 : d16, d17, d18, d19 - are used for expanded Y data 49 * q10 : d20, d21 50 * q11 : d22, d23 51 * q12 : d24, d25 52 * q13 : d26, d27 53 * q13, q14, q15 - various constants (#16, #149, #204, #50, #104, #154) 54 */ 55 asm volatile ( 56 /* Allow to build on targets not supporting neon, and force the object file 57 * target to avoid bumping the final binary target */ 58 ".arch armv7-a\n" 59 ".object_arch armv4t\n" 60 ".fpu neon\n" 61 ".macro convert_macroblock size\n" 62 /* load up to 16 source pixels */ 63 ".if \\size == 16\n" 64 "pld [%[y], #64]\n" 65 "pld [%[u], #64]\n" 66 "pld [%[v], #64]\n" 67 "vld1.8 {d1}, [%[y]]!\n" 68 "vld1.8 {d3}, [%[y]]!\n" 69 "vld1.8 {d0}, [%[u]]!\n" 70 "vld1.8 {d2}, [%[v]]!\n" 71 ".elseif \\size == 8\n" 72 "vld1.8 {d1}, [%[y]]!\n" 73 "vld1.8 {d0[0]}, [%[u]]!\n" 74 "vld1.8 {d0[1]}, [%[u]]!\n" 75 "vld1.8 {d0[2]}, [%[u]]!\n" 76 "vld1.8 {d0[3]}, [%[u]]!\n" 77 "vld1.8 {d2[0]}, [%[v]]!\n" 78 "vld1.8 {d2[1]}, [%[v]]!\n" 79 "vld1.8 {d2[2]}, [%[v]]!\n" 80 "vld1.8 {d2[3]}, [%[v]]!\n" 81 ".elseif \\size == 4\n" 82 "vld1.8 {d1[0]}, [%[y]]!\n" 83 "vld1.8 {d1[1]}, [%[y]]!\n" 84 "vld1.8 {d1[2]}, [%[y]]!\n" 85 "vld1.8 {d1[3]}, [%[y]]!\n" 86 "vld1.8 {d0[0]}, [%[u]]!\n" 87 "vld1.8 {d0[1]}, [%[u]]!\n" 88 "vld1.8 {d2[0]}, [%[v]]!\n" 89 "vld1.8 {d2[1]}, [%[v]]!\n" 90 ".elseif \\size == 2\n" 91 "vld1.8 {d1[0]}, [%[y]]!\n" 92 "vld1.8 {d1[1]}, [%[y]]!\n" 93 "vld1.8 {d0[0]}, [%[u]]!\n" 94 "vld1.8 {d2[0]}, [%[v]]!\n" 95 ".elseif \\size == 1\n" 96 "vld1.8 {d1[0]}, [%[y]]!\n" 97 "vld1.8 {d0[0]}, [%[u]]!\n" 98 "vld1.8 {d2[0]}, [%[v]]!\n" 99 ".else\n" 100 ".error \"unsupported macroblock size\"\n" 101 ".endif\n" 102 103 /* d1 - Y data (first 8 bytes) */ 104 /* d3 - Y data (next 8 bytes) */ 105 /* d0 - U data, d2 - V data */ 106 107 /* split even and odd Y color components */ 108 "vuzp.8 d1, d3\n" /* d1 - evenY, d3 - oddY */ 109 /* clip upper and lower boundaries */ 110 "vqadd.u8 q0, q0, q4\n" 111 "vqadd.u8 q1, q1, q4\n" 112 "vqsub.u8 q0, q0, q5\n" 113 "vqsub.u8 q1, q1, q5\n" 114 115 "vshr.u8 d4, d2, #1\n" /* d4 = V >> 1 */ 116 117 "vmull.u8 q8, d1, d27\n" /* q8 = evenY * 149 */ 118 "vmull.u8 q9, d3, d27\n" /* q9 = oddY * 149 */ 119 120 "vld1.16 {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */ 121 "vsubw.u8 q10, q10, d4\n" /* red acc -= (V >> 1) */ 122 "vmlsl.u8 q10, d2, d28\n" /* red acc -= V * 204 */ 123 "vld1.16 {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */ 124 "vmlsl.u8 q11, d2, d30\n" /* green acc -= V * 104 */ 125 "vmlsl.u8 q11, d0, d29\n" /* green acc -= U * 50 */ 126 "vld1.16 {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */ 127 "vmlsl.u8 q12, d0, d30\n" /* blue acc -= U * 104 */ 128 "vmlsl.u8 q12, d0, d31\n" /* blue acc -= U * 154 */ 129 130 "vhsub.s16 q3, q8, q10\n" /* calculate even red components */ 131 "vhsub.s16 q10, q9, q10\n" /* calculate odd red components */ 132 "vqshrun.s16 d0, q3, #6\n" /* right shift, narrow and saturate even red components */ 133 "vqshrun.s16 d3, q10, #6\n" /* right shift, narrow and saturate odd red components */ 134 135 "vhadd.s16 q3, q8, q11\n" /* calculate even green components */ 136 "vhadd.s16 q11, q9, q11\n" /* calculate odd green components */ 137 "vqshrun.s16 d1, q3, #6\n" /* right shift, narrow and saturate even green components */ 138 "vqshrun.s16 d4, q11, #6\n" /* right shift, narrow and saturate odd green components */ 139 140 "vhsub.s16 q3, q8, q12\n" /* calculate even blue components */ 141 "vhsub.s16 q12, q9, q12\n" /* calculate odd blue components */ 142 "vqshrun.s16 d2, q3, #6\n" /* right shift, narrow and saturate even blue components */ 143 "vqshrun.s16 d5, q12, #6\n" /* right shift, narrow and saturate odd blue components */ 144 145 "vzip.8 d0, d3\n" /* join even and odd red components */ 146 "vzip.8 d1, d4\n" /* join even and odd green components */ 147 "vzip.8 d2, d5\n" /* join even and odd blue components */ 148 149 "vshll.u8 q3, d0, #8\n\t" 150 "vshll.u8 q8, d1, #8\n\t" 151 "vshll.u8 q9, d2, #8\n\t" 152 "vsri.u16 q3, q8, #5\t\n" 153 "vsri.u16 q3, q9, #11\t\n" 154 /* store pixel data to memory */ 155 ".if \\size == 16\n" 156 " vst1.16 {d6, d7}, [%[dst]]!\n" 157 " vshll.u8 q3, d3, #8\n\t" 158 " vshll.u8 q8, d4, #8\n\t" 159 " vshll.u8 q9, d5, #8\n\t" 160 " vsri.u16 q3, q8, #5\t\n" 161 " vsri.u16 q3, q9, #11\t\n" 162 " vst1.16 {d6, d7}, [%[dst]]!\n" 163 ".elseif \\size == 8\n" 164 " vst1.16 {d6, d7}, [%[dst]]!\n" 165 ".elseif \\size == 4\n" 166 " vst1.16 {d6}, [%[dst]]!\n" 167 ".elseif \\size == 2\n" 168 " vst1.16 {d6[0]}, [%[dst]]!\n" 169 " vst1.16 {d6[1]}, [%[dst]]!\n" 170 ".elseif \\size == 1\n" 171 " vst1.16 {d6[0]}, [%[dst]]!\n" 172 ".endif\n" 173 ".endm\n" 174 175 "vmov.u8 d8, #15\n" /* add this to U/V to saturate upper boundary */ 176 "vmov.u8 d9, #20\n" /* add this to Y to saturate upper boundary */ 177 "vmov.u8 d10, #31\n" /* sub this from U/V to saturate lower boundary */ 178 "vmov.u8 d11, #36\n" /* sub this from Y to saturate lower boundary */ 179 180 "vmov.u8 d26, #16\n" 181 "vmov.u8 d27, #149\n" 182 "vmov.u8 d28, #204\n" 183 "vmov.u8 d29, #50\n" 184 "vmov.u8 d30, #104\n" 185 "vmov.u8 d31, #154\n" 186 187 "cmp %[oddflag], #0\n" 188 "beq 1f\n" 189 "convert_macroblock 1\n" 190 "sub %[n], %[n], #1\n" 191 "1:\n" 192 "subs %[n], %[n], #16\n" 193 "blt 2f\n" 194 "1:\n" 195 "convert_macroblock 16\n" 196 "subs %[n], %[n], #16\n" 197 "bge 1b\n" 198 "2:\n" 199 "tst %[n], #8\n" 200 "beq 3f\n" 201 "convert_macroblock 8\n" 202 "3:\n" 203 "tst %[n], #4\n" 204 "beq 4f\n" 205 "convert_macroblock 4\n" 206 "4:\n" 207 "tst %[n], #2\n" 208 "beq 5f\n" 209 "convert_macroblock 2\n" 210 "5:\n" 211 "tst %[n], #1\n" 212 "beq 6f\n" 213 "convert_macroblock 1\n" 214 "6:\n" 215 ".purgem convert_macroblock\n" 216 : [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n) 217 : [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]), 218 [oddflag] "r" (oddflag) 219 : "cc", "memory", 220 "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 221 "d8", "d9", "d10", "d11", /* "d12", "d13", "d14", "d15", */ 222 "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", 223 "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31" 224 ); 225 } 226 # endif // MOZILLA_MAY_SUPPORT_NEON 227 228 } // namespace gfx 229 230 } // namespace mozilla 231 232 #endif // HAVE_YCBCR_TO_RGB565