tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

pixman-ssse3.c (9649B)


      1 /*
      2 * Copyright © 2013 Soren Sandmann Pedersen
      3 * Copyright © 2013 Red Hat, Inc.
      4 *
      5 * Permission is hereby granted, free of charge, to any person obtaining a
      6 * copy of this software and associated documentation files (the "Software"),
      7 * to deal in the Software without restriction, including without limitation
      8 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
      9 * and/or sell copies of the Software, and to permit persons to whom the
     10 * Software is furnished to do so, subject to the following conditions:
     11 *
     12 * The above copyright notice and this permission notice (including the next
     13 * paragraph) shall be included in all copies or substantial portions of the
     14 * Software.
     15 * 
     16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
     19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
     20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
     21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
     22 * DEALINGS IN THE SOFTWARE.
     23 *
     24 * Author: Soren Sandmann (soren.sandmann@gmail.com)
     25 */
     26 #ifdef HAVE_CONFIG_H
     27 #include <pixman-config.h>
     28 #endif
     29 
     30 #include <stdlib.h>
     31 #include <mmintrin.h>
     32 #include <xmmintrin.h>
     33 #include <emmintrin.h>
     34 #include <tmmintrin.h>
     35 #include "pixman-private.h"
     36 #include "pixman-inlines.h"
     37 
     /*
      * One horizontally interpolated source row kept in the two-entry
      * line cache of bilinear_info_t.
      */
     typedef struct
     {
         /* Source row currently held in buffer; -1 until a row is fetched. */
         int y;

         /*
          * Interpolated pixels, one uint64_t (four 16-bit channels) per
          * pixel.  Accessed with aligned 128-bit loads and stores, so it
          * has to be 16-byte aligned.
          */
         uint64_t *buffer;
     } line_t;
     43 
     44 typedef struct
     45 {
     46    line_t		lines[2];
     47    pixman_fixed_t	y;
     48    pixman_fixed_t	x;
     49    uint64_t		data[1];
     50 } bilinear_info_t;
     51 
     52 static void
     53 ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
     54 		int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
     55 {
     56    uint32_t *bits = image->bits + y * image->rowstride;
     57    __m128i vx = _mm_set_epi16 (
     58 - (x + 1), x, - (x + 1), x,
     59 - (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
     60    __m128i vux = _mm_set_epi16 (
     61 - 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
     62 - 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
     63    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
     64    __m128i *b = (__m128i *)line->buffer;
     65    __m128i vrl0, vrl1;
     66 
     67    while ((n -= 2) >= 0)
     68    {
     69 __m128i vw, vr, s;
     70 
     71 vrl1 = _mm_loadl_epi64 (
     72     (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
     73 /* vrl1: R1, L1 */
     74 
     75    final_pixel:
     76 vrl0 = _mm_loadl_epi64 (
     77     (__m128i *)(bits + pixman_fixed_to_int (x)));
     78 /* vrl0: R0, L0 */
     79 
     80 /* The weights are based on vx which is a vector of 
     81  *
     82  *    - (x + 1), x, - (x + 1), x,
     83  *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
     84  *
     85  * so the 16 bit weights end up like this:
     86  *
     87  *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
     88  *
     89  * and after shifting and packing, we get these bytes:
     90  *
     91  *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
     92  *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
     93  *
     94  * which means the first and the second input pixel 
     95  * have to be interleaved like this:
     96  *
     97  *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
     98  *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
     99  *
    100  * before maddubsw can be used.
    101  */
    102 
    103 vw = _mm_add_epi16 (
    104     vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
    105 /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
    106  */
    107 
    108 vw = _mm_packus_epi16 (vw, vw);
    109 /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
    110  *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
    111  */
    112 vx = _mm_add_epi16 (vx, vux);
    113 
    114 x += 2 * ux;
    115 
    116 vr = _mm_unpacklo_epi16 (vrl1, vrl0);
    117 /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */
    118 
    119 s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
    120 /* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */
    121 
    122 vr = _mm_unpackhi_epi8 (vr, s);
    123 /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
    124  *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
    125  */
    126 
    127 vr = _mm_maddubs_epi16 (vr, vw);
    128 
    129 /* When the weight is 0, the inverse weight is
    130  * 128 which can't be represented in a signed byte.
    131  * As a result maddubsw computes the following:
    132  *
    133  *     r = l * -128 + r * 0
    134  *
    135  * rather than the desired
    136  *
    137  *     r = l * 128 + r * 0
    138  *
    139  * We fix this by taking the absolute value of the
    140  * result.
    141  */
    142 vr = _mm_abs_epi16 (vr);
    143 
    144 /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
    145 _mm_store_si128 (b++, vr);
    146    }
    147 
    148    if (n == -1)
    149    {
    150 vrl1 = _mm_setzero_si128();
    151 goto final_pixel;
    152    }
    153 
    154    line->y = y;
    155 }
    156 
    157 static uint32_t *
    158 ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
    159 {
    160    pixman_fixed_t fx, ux;
    161    bilinear_info_t *info = iter->data;
    162    line_t *line0, *line1;
    163    int y0, y1;
    164    int32_t dist_y;
    165    __m128i vw;
    166    int i;
    167 
    168    fx = info->x;
    169    ux = iter->image->common.transform->matrix[0][0];
    170 
    171    y0 = pixman_fixed_to_int (info->y);
    172    y1 = y0 + 1;
    173 
    174    line0 = &info->lines[y0 & 0x01];
    175    line1 = &info->lines[y1 & 0x01];
    176 
    177    if (line0->y != y0)
    178    {
    179 ssse3_fetch_horizontal (
    180     &iter->image->bits, line0, y0, fx, ux, iter->width);
    181    }
    182 
    183    if (line1->y != y1)
    184    {
    185 ssse3_fetch_horizontal (
    186     &iter->image->bits, line1, y1, fx, ux, iter->width);
    187    }
    188 
    189    dist_y = pixman_fixed_to_bilinear_weight (info->y);
    190    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);
    191 
    192    vw = _mm_set_epi16 (
    193 dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);
    194 
    195    for (i = 0; i + 3 < iter->width; i += 4)
    196    {
    197 __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
    198 __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
    199 __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
    200 __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
    201 __m128i r0, r1, tmp, p;
    202 
    203 r0 = _mm_mulhi_epu16 (
    204     _mm_sub_epi16 (bot0, top0), vw);
    205 tmp = _mm_cmplt_epi16 (bot0, top0);
    206 tmp = _mm_and_si128 (tmp, vw);
    207 r0 = _mm_sub_epi16 (r0, tmp);
    208 r0 = _mm_add_epi16 (r0, top0);
    209 r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
    210 /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
    211 r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
    212 /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
    213 
    214 r1 = _mm_mulhi_epu16 (
    215     _mm_sub_epi16 (bot1, top1), vw);
    216 tmp = _mm_cmplt_epi16 (bot1, top1);
    217 tmp = _mm_and_si128 (tmp, vw);
    218 r1 = _mm_sub_epi16 (r1, tmp);
    219 r1 = _mm_add_epi16 (r1, top1);
    220 r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
    221 r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
    222 /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */
    223 
    224 p = _mm_packus_epi16 (r0, r1);
    225 
    226 _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
    227    }
    228 
    229    while (i < iter->width)
    230    {
    231 __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
    232 __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
    233 __m128i r0, tmp, p;
    234 
    235 r0 = _mm_mulhi_epu16 (
    236     _mm_sub_epi16 (bot0, top0), vw);
    237 tmp = _mm_cmplt_epi16 (bot0, top0);
    238 tmp = _mm_and_si128 (tmp, vw);
    239 r0 = _mm_sub_epi16 (r0, tmp);
    240 r0 = _mm_add_epi16 (r0, top0);
    241 r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
    242 /* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
    243 r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
    244 /* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */
    245 
    246 p = _mm_packus_epi16 (r0, r0);
    247 
    248 if (iter->width - i == 1)
    249 {
    250     *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
    251     i++;
    252 }
    253 else
    254 {
    255     _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
    256     i += 2;
    257 }
    258    }
    259    
    260    info->y += iter->image->common.transform->matrix[1][1];
    261 
    262    return iter->buffer;
    263 }
    264 
    265 static void
    266 ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter)
    267 {
    268    free (iter->data);
    269 }
    270 
    271 static void
    272 ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info)
    273 {
    274    int width = iter->width;
    275    bilinear_info_t *info;
    276    pixman_vector_t v;
    277 
    278    /* Reference point is the center of the pixel */
    279    v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2;
    280    v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2;
    281    v.vector[2] = pixman_fixed_1;
    282 
    283    if (!pixman_transform_point_3d (iter->image->common.transform, &v))
    284 goto fail;
    285 
    286    info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64);
    287    if (!info)
    288 goto fail;
    289 
    290    info->x = v.vector[0] - pixman_fixed_1 / 2;
    291    info->y = v.vector[1] - pixman_fixed_1 / 2;
    292 
    293 #define ALIGN(addr)							\
    294    ((void *)((((uintptr_t)(addr)) + 15) & (~15)))
    295 
    296    /* It is safe to set the y coordinates to -1 initially
    297     * because COVER_CLIP_BILINEAR ensures that we will only
    298     * be asked to fetch lines in the [0, height) interval
    299     */
    300    info->lines[0].y = -1;
    301    info->lines[0].buffer = ALIGN (&(info->data[0]));
    302    info->lines[1].y = -1;
    303    info->lines[1].buffer = ALIGN (info->lines[0].buffer + width);
    304 
    305    iter->get_scanline = ssse3_fetch_bilinear_cover;
    306    iter->fini = ssse3_bilinear_cover_iter_fini;
    307 
    308    iter->data = info;
    309    return;
    310 
    311 fail:
    312    /* Something went wrong, either a bad matrix or OOM; in such cases,
    313     * we don't guarantee any particular rendering.
    314     */
    315    _pixman_log_error (
    316 FUNC, "Allocation failure or bad matrix, skipping rendering\n");
    317    
    318    iter->get_scanline = _pixman_iter_get_scanline_noop;
    319    iter->fini = NULL;
    320 }
    321 
    322 static const pixman_iter_info_t ssse3_iters[] = 
    323 {
    324    { PIXMAN_a8r8g8b8,
    325      (FAST_PATH_STANDARD_FLAGS			|
    326       FAST_PATH_SCALE_TRANSFORM		|
    327       FAST_PATH_BILINEAR_FILTER		|
    328       FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),
    329      ITER_NARROW | ITER_SRC,
    330      ssse3_bilinear_cover_iter_init,
    331      NULL, NULL
    332    },
    333 
    334    { PIXMAN_null },
    335 };
    336 
    337 static const pixman_fast_path_t ssse3_fast_paths[] =
    338 {
    339    { PIXMAN_OP_NONE },
    340 };
    341 
    342 pixman_implementation_t *
    343 _pixman_implementation_create_ssse3 (pixman_implementation_t *fallback)
    344 {
    345    pixman_implementation_t *imp =
    346 _pixman_implementation_create (fallback, ssse3_fast_paths);
    347 
    348    imp->iter_info = ssse3_iters;
    349 
    350    return imp;
    351 }