tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

yuv_convert_arm.cpp (8303B)


      1 // Copyright (c) 2010 The Chromium Authors. All rights reserved.
      2 // Use of this source code is governed by a BSD-style license that can be
      3 // found in the LICENSE file.
      4 
      5 // contributor Siarhei Siamashka <siarhei.siamashka@gmail.com>
      6 
      7 #include "yuv_convert.h"
      8 #include "ycbcr_to_rgb565.h"
      9 
     10 
     11 
     12 #ifdef HAVE_YCBCR_TO_RGB565
     13 
     14 namespace mozilla {
     15 
     16 namespace gfx {
     17 
     18 #  if defined(MOZILLA_MAY_SUPPORT_NEON) // NEON path: converts a run of Y/U/V bytes into 16-bit RGB565 pixels
     19 #  if defined(__clang__) // clang gets noinline only; other compilers also get optimize("-fomit-frame-pointer")
     20 void __attribute((noinline))
     21 #  else
     22 void __attribute((noinline,optimize("-fomit-frame-pointer")))
     23 #  endif
     24    yuv42x_to_rgb565_row_neon(uint16_t *dst, /* out: one 16-bit pixel per converted sample; advanced by the asm stores */
     25                              const uint8_t *y, /* in: Y bytes, one per output pixel */
     26                              const uint8_t *u, /* in: U bytes, one per pixel pair in the 2/4/8/16-pixel blocks */
     27                              const uint8_t *v, /* in: V bytes, consumed in step with u */
     28                              int n, /* pixels to convert, including the leading single pixel when oddflag is set */
     29                              int oddflag) /* non-zero: convert one pixel on its own first (it consumes one Y, one U and one V byte) */
     30 { /* NOTE(review): oddflag presumably marks a row that starts on the second pixel of a chroma pair - confirm against the caller */
     31    static __attribute__((aligned(16))) uint16_t acc_r[8] = { /* red accumulator seed; read with a :128-aligned vld1.16 */
     32        22840, 22840, 22840, 22840, 22840, 22840, 22840, 22840, /* 22840 == 204.5 * 112 - 64 */
     33    };
     34    static __attribute__((aligned(16))) uint16_t acc_g[8] = { /* green accumulator seed */
     35        17312, 17312, 17312, 17312, 17312, 17312, 17312, 17312, /* 17312 == (104 + 50) * 112 + 64 */
     36    };
     37    static __attribute__((aligned(16))) uint16_t acc_b[8] = { /* blue accumulator seed */
     38        28832, 28832, 28832, 28832, 28832, 28832, 28832, 28832, /* 28832 == (104 + 154) * 112 - 64 */
     39    };
     40    /*
     41     * Registers:
     42     * q0, q1 : d0, d1, d2, d3  - initial YUV load (d0 = U, d1/d3 = Y, d2 = V); reused for narrowed R/G/B
     43     * q2     : d4, d5          - V >> 1 scratch, then odd green (d4) and odd blue (d5)
     44     * q3     : d6, d7          - temporary storage; holds the packed RGB565 pixels before each store
     45     *
     46     * q4, q5 : d8 - d11        - clipping constants (#15, #20, #31, #36); q6, q7 are not used
     47     *
     48     * q8, q9 : d16, d17, d18, d19  - expanded Y data (even / odd Y * 149); reused while packing
     49     * q10    : d20, d21        - red accumulator
     50     * q11    : d22, d23        - green accumulator
     51     * q12    : d24, d25        - blue accumulator
     52     * q13    : d26, d27        - #16 (d26, not referenced by the macro) and #149 (d27)
     53     * q14, q15 : d28 - d31     - constants #204, #50, #104, #154
     54     */
     55    asm volatile (
     56 /* Allow building on targets that do not support NEON, and force the object
     57 * file's architecture so that the final binary's target is not bumped */
     58 ".arch armv7-a\n"
     59 ".object_arch armv4t\n"
     60 ".fpu neon\n"
     61 ".macro convert_macroblock size\n" /* converts and stores \size pixels; \size must be 1, 2, 4, 8 or 16 */
     62 /* load up to 16 source pixels: \size Y bytes plus the U/V bytes that go with them */
     63 ".if \\size == 16\n"
     64     "pld [%[y], #64]\n" /* preload hints 64 bytes ahead of each input pointer */
     65     "pld [%[u], #64]\n"
     66     "pld [%[v], #64]\n"
     67     "vld1.8 {d1}, [%[y]]!\n" /* d1 = Y bytes 0..7 */
     68     "vld1.8 {d3}, [%[y]]!\n" /* d3 = Y bytes 8..15 */
     69     "vld1.8 {d0}, [%[u]]!\n" /* d0 = 8 U bytes */
     70     "vld1.8 {d2}, [%[v]]!\n" /* d2 = 8 V bytes */
     71 ".elseif \\size == 8\n"
     72     "vld1.8 {d1}, [%[y]]!\n" /* d1 = 8 Y bytes */
     73     "vld1.8 {d0[0]}, [%[u]]!\n" /* 4 U bytes, one byte lane per load */
     74     "vld1.8 {d0[1]}, [%[u]]!\n"
     75     "vld1.8 {d0[2]}, [%[u]]!\n"
     76     "vld1.8 {d0[3]}, [%[u]]!\n"
     77     "vld1.8 {d2[0]}, [%[v]]!\n" /* 4 V bytes, one byte lane per load */
     78     "vld1.8 {d2[1]}, [%[v]]!\n"
     79     "vld1.8 {d2[2]}, [%[v]]!\n"
     80     "vld1.8 {d2[3]}, [%[v]]!\n"
     81 ".elseif \\size == 4\n"
     82     "vld1.8 {d1[0]}, [%[y]]!\n" /* 4 Y bytes, one byte lane per load */
     83     "vld1.8 {d1[1]}, [%[y]]!\n"
     84     "vld1.8 {d1[2]}, [%[y]]!\n"
     85     "vld1.8 {d1[3]}, [%[y]]!\n"
     86     "vld1.8 {d0[0]}, [%[u]]!\n" /* 2 U bytes */
     87     "vld1.8 {d0[1]}, [%[u]]!\n"
     88     "vld1.8 {d2[0]}, [%[v]]!\n" /* 2 V bytes */
     89     "vld1.8 {d2[1]}, [%[v]]!\n"
     90 ".elseif \\size == 2\n"
     91     "vld1.8 {d1[0]}, [%[y]]!\n" /* 2 Y bytes sharing one U and one V byte */
     92     "vld1.8 {d1[1]}, [%[y]]!\n"
     93     "vld1.8 {d0[0]}, [%[u]]!\n"
     94     "vld1.8 {d2[0]}, [%[v]]!\n"
     95 ".elseif \\size == 1\n"
     96     "vld1.8 {d1[0]}, [%[y]]!\n" /* a single pixel still consumes one U and one V byte */
     97     "vld1.8 {d0[0]}, [%[u]]!\n"
     98     "vld1.8 {d2[0]}, [%[v]]!\n"
     99 ".else\n"
    100     ".error \"unsupported macroblock size\"\n"
    101 ".endif\n"
    102 
    103        /* d1 - Y data (first 8 bytes) */
    104        /* d3 - Y data (next 8 bytes; only loaded for size 16, otherwise it holds leftover contents) */
    105        /* d0 - U data, d2 - V data; lanes that were not loaded are still computed, but only \size pixels get stored */
    106 
    107 /* split even and odd Y color components */
    108 "vuzp.8      d1, d3\n"                       /* d1 - evenY, d3 - oddY */
    109 /* clip upper and lower boundaries: the saturating add then subtract equals clamp(x, 16, 240) - 16 for U/V and clamp(x, 16, 235) - 16 for Y */
    110 "vqadd.u8    q0, q0, q4\n"                   /* d0 (U) += 15, d1 (evenY) += 20, saturating at 255 */
    111 "vqadd.u8    q1, q1, q4\n"                   /* d2 (V) += 15, d3 (oddY) += 20, saturating at 255 */
    112 "vqsub.u8    q0, q0, q5\n"                   /* d0 (U) -= 31, d1 (evenY) -= 36, saturating at 0 */
    113 "vqsub.u8    q1, q1, q5\n"                   /* d2 (V) -= 31, d3 (oddY) -= 36, saturating at 0 */
    114 
    115 "vshr.u8     d4, d2, #1\n"                   /* d4 = V >> 1 */
    116 
    117 "vmull.u8    q8, d1, d27\n"                  /* q8 = evenY * 149 */
    118 "vmull.u8    q9, d3, d27\n"                  /* q9 = oddY * 149 */
    119 
    120 "vld1.16     {d20, d21}, [%[acc_r], :128]\n" /* q10 - initialize accumulator for red */
    121 "vsubw.u8    q10, q10, d4\n"                 /* red acc -= (V >> 1) */
    122 "vmlsl.u8    q10, d2, d28\n"                 /* red acc -= V * 204 */
    123 "vld1.16     {d22, d23}, [%[acc_g], :128]\n" /* q11 - initialize accumulator for green */
    124 "vmlsl.u8    q11, d2, d30\n"                 /* green acc -= V * 104 */
    125 "vmlsl.u8    q11, d0, d29\n"                 /* green acc -= U * 50 */
    126 "vld1.16     {d24, d25}, [%[acc_b], :128]\n" /* q12 - initialize accumulator for blue */
    127 "vmlsl.u8    q12, d0, d30\n"                 /* blue acc -= U * 104 */
    128 "vmlsl.u8    q12, d0, d31\n"                 /* blue acc -= U * 154 (104 + 154 = 258 in total, which would not fit in one u8 constant) */
    129 
    130 "vhsub.s16   q3, q8, q10\n"                  /* calculate even red components: (evenY * 149 - red acc) / 2 */
    131 "vhsub.s16   q10, q9, q10\n"                 /* calculate odd red components */
    132 "vqshrun.s16 d0, q3, #6\n"                   /* right shift, narrow and saturate even red components */
    133 "vqshrun.s16 d3, q10, #6\n"                  /* right shift, narrow and saturate odd red components */
    134 
    135 "vhadd.s16   q3, q8, q11\n"                  /* calculate even green components: (evenY * 149 + green acc) / 2 */
    136 "vhadd.s16   q11, q9, q11\n"                 /* calculate odd green components */
    137 "vqshrun.s16 d1, q3, #6\n"                   /* right shift, narrow and saturate even green components */
    138 "vqshrun.s16 d4, q11, #6\n"                  /* right shift, narrow and saturate odd green components */
    139 
    140 "vhsub.s16   q3, q8, q12\n"                  /* calculate even blue components: (evenY * 149 - blue acc) / 2 */
    141 "vhsub.s16   q12, q9, q12\n"                 /* calculate odd blue components */
    142 "vqshrun.s16 d2, q3, #6\n"                   /* right shift, narrow and saturate even blue components */
    143 "vqshrun.s16 d5, q12, #6\n"                  /* right shift, narrow and saturate odd blue components */
    144 
    145 "vzip.8      d0, d3\n"                       /* join even and odd red components */
    146 "vzip.8      d1, d4\n"                       /* join even and odd green components */
    147 "vzip.8      d2, d5\n"                       /* join even and odd blue components */
    148 
    149 "vshll.u8    q3, d0, #8\n\t"                 /* q3 = red << 8 for pixels 0..7 */
    150 "vshll.u8    q8, d1, #8\n\t"                 /* q8 = green << 8 */
    151 "vshll.u8    q9, d2, #8\n\t"                 /* q9 = blue << 8 */
    152 "vsri.u16    q3, q8, #5\t\n"                 /* keep the top 5 red bits, insert green >> 5 below them */
    153 "vsri.u16    q3, q9, #11\t\n"                /* keep the top 11 bits, insert blue >> 11: result is R5 G6 B5 */
    154 /* store pixel data to memory */
    155 ".if \\size == 16\n"
    156 "    vst1.16 {d6, d7}, [%[dst]]!\n"          /* store pixels 0..7 */
    157 "    vshll.u8    q3, d3, #8\n\t"             /* repeat the packing for pixels 8..15 held in d3/d4/d5 */
    158 "    vshll.u8    q8, d4, #8\n\t"
    159 "    vshll.u8    q9, d5, #8\n\t"
    160 "    vsri.u16    q3, q8, #5\t\n"
    161 "    vsri.u16    q3, q9, #11\t\n"
    162 "    vst1.16 {d6, d7}, [%[dst]]!\n"          /* store pixels 8..15 */
    163 ".elseif \\size == 8\n"
    164 "    vst1.16 {d6, d7}, [%[dst]]!\n"          /* store 8 pixels */
    165 ".elseif \\size == 4\n"
    166 "    vst1.16 {d6}, [%[dst]]!\n"              /* store 4 pixels */
    167 ".elseif \\size == 2\n"
    168 "    vst1.16 {d6[0]}, [%[dst]]!\n"           /* store 2 pixels, one 16-bit lane per store */
    169 "    vst1.16 {d6[1]}, [%[dst]]!\n"
    170 ".elseif \\size == 1\n"
    171 "    vst1.16 {d6[0]}, [%[dst]]!\n"           /* store 1 pixel */
    172 ".endif\n"
    173 ".endm\n"
    174 
    175 "vmov.u8     d8, #15\n" /* add this to U/V to saturate upper boundary */
    176 "vmov.u8     d9, #20\n" /* add this to Y to saturate upper boundary */
    177 "vmov.u8     d10, #31\n" /* sub this from U/V to saturate lower boundary */
    178 "vmov.u8     d11, #36\n" /* sub this from Y to saturate lower boundary */
    179 
    180 "vmov.u8     d26, #16\n" /* loaded here but not referenced anywhere else in this asm block */
    181 "vmov.u8     d27, #149\n" /* Y multiplier */
    182 "vmov.u8     d28, #204\n" /* V multiplier for red */
    183 "vmov.u8     d29, #50\n" /* U multiplier for green */
    184 "vmov.u8     d30, #104\n" /* V multiplier for green, and first part of the U multiplier for blue */
    185 "vmov.u8     d31, #154\n" /* second part of the U multiplier for blue */
    186 
    187 "cmp         %[oddflag], #0\n" /* is a leading single pixel requested? */
    188 "beq         1f\n"
    189 "convert_macroblock 1\n" /* convert that pixel on its own */
    190 "sub         %[n], %[n], #1\n" /* and count it */
    191    "1:\n"
    192 "subs        %[n], %[n], #16\n" /* n -= 16; a negative result means no full 16-pixel block remains */
    193 "blt         2f\n"
    194    "1:\n" /* main loop: 16 pixels per iteration */
    195 "convert_macroblock 16\n"
    196 "subs        %[n], %[n], #16\n"
    197 "bge         1b\n"
    198    "2:\n" /* tail: n is now (remaining - 16), so its low four bits still equal the remaining pixel count */
    199 "tst         %[n], #8\n" /* 8 pixels left over? */
    200 "beq         3f\n"
    201 "convert_macroblock 8\n"
    202    "3:\n"
    203 "tst         %[n], #4\n" /* 4 pixels left over? */
    204 "beq         4f\n"
    205 "convert_macroblock 4\n"
    206    "4:\n"
    207 "tst         %[n], #2\n" /* 2 pixels left over? */
    208 "beq         5f\n"
    209 "convert_macroblock 2\n"
    210    "5:\n"
    211 "tst         %[n], #1\n" /* 1 pixel left over? */
    212 "beq         6f\n"
    213 "convert_macroblock 1\n"
    214    "6:\n"
    215 ".purgem convert_macroblock\n" /* drop the macro definition again */
    216 : [y] "+&r" (y), [u] "+&r" (u), [v] "+&r" (v), [dst] "+&r" (dst), [n] "+&r" (n) /* read-write operands: the asm advances the pointers and counts n down */
    217 : [acc_r] "r" (&acc_r[0]), [acc_g] "r" (&acc_g[0]), [acc_b] "r" (&acc_b[0]), /* read-only operands: accumulator seed addresses */
    218   [oddflag] "r" (oddflag)
    219 : "cc", "memory", /* clobbers: condition flags, memory, and every d-register the asm writes */
    220   "d0",  "d1",  "d2",  "d3",  "d4",  "d5",  "d6",  "d7",
    221   "d8",  "d9",  "d10", "d11", /* "d12", "d13", "d14", "d15", */
    222   "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23",
    223   "d24", "d25", "d26", "d27", "d28", "d29", "d30", "d31"
    224    );
    225 }
    226 #  endif // MOZILLA_MAY_SUPPORT_NEON
    227 
    228 } // namespace gfx
    229 
    230 } // namespace mozilla
    231 
    232 #endif // HAVE_YCBCR_TO_RGB565