/* pixman-ssse3.c (9649B) */
1 /* 2 * Copyright © 2013 Soren Sandmann Pedersen 3 * Copyright © 2013 Red Hat, Inc. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * the rights to use, copy, modify, merge, publish, distribute, sublicense, 9 * and/or sell copies of the Software, and to permit persons to whom the 10 * Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 21 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER 22 * DEALINGS IN THE SOFTWARE. 
23 * 24 * Author: Soren Sandmann (soren.sandmann@gmail.com) 25 */ 26 #ifdef HAVE_CONFIG_H 27 #include <pixman-config.h> 28 #endif 29 30 #include <stdlib.h> 31 #include <mmintrin.h> 32 #include <xmmintrin.h> 33 #include <emmintrin.h> 34 #include <tmmintrin.h> 35 #include "pixman-private.h" 36 #include "pixman-inlines.h" 37 38 typedef struct 39 { 40 int y; 41 uint64_t * buffer; 42 } line_t; 43 44 typedef struct 45 { 46 line_t lines[2]; 47 pixman_fixed_t y; 48 pixman_fixed_t x; 49 uint64_t data[1]; 50 } bilinear_info_t; 51 52 static void 53 ssse3_fetch_horizontal (bits_image_t *image, line_t *line, 54 int y, pixman_fixed_t x, pixman_fixed_t ux, int n) 55 { 56 uint32_t *bits = image->bits + y * image->rowstride; 57 __m128i vx = _mm_set_epi16 ( 58 - (x + 1), x, - (x + 1), x, 59 - (x + ux + 1), x + ux, - (x + ux + 1), x + ux); 60 __m128i vux = _mm_set_epi16 ( 61 - 2 * ux, 2 * ux, - 2 * ux, 2 * ux, 62 - 2 * ux, 2 * ux, - 2 * ux, 2 * ux); 63 __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0); 64 __m128i *b = (__m128i *)line->buffer; 65 __m128i vrl0, vrl1; 66 67 while ((n -= 2) >= 0) 68 { 69 __m128i vw, vr, s; 70 71 vrl1 = _mm_loadl_epi64 ( 72 (__m128i *)(bits + pixman_fixed_to_int (x + ux))); 73 /* vrl1: R1, L1 */ 74 75 final_pixel: 76 vrl0 = _mm_loadl_epi64 ( 77 (__m128i *)(bits + pixman_fixed_to_int (x))); 78 /* vrl0: R0, L0 */ 79 80 /* The weights are based on vx which is a vector of 81 * 82 * - (x + 1), x, - (x + 1), x, 83 * - (x + ux + 1), x + ux, - (x + ux + 1), x + ux 84 * 85 * so the 16 bit weights end up like this: 86 * 87 * iw0, w0, iw0, w0, iw1, w1, iw1, w1 88 * 89 * and after shifting and packing, we get these bytes: 90 * 91 * iw0, w0, iw0, w0, iw1, w1, iw1, w1, 92 * iw0, w0, iw0, w0, iw1, w1, iw1, w1, 93 * 94 * which means the first and the second input pixel 95 * have to be interleaved like this: 96 * 97 * la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, 98 * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 99 * 100 * before maddubsw can be used. 
101 */ 102 103 vw = _mm_add_epi16 ( 104 vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS)); 105 /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1 106 */ 107 108 vw = _mm_packus_epi16 (vw, vw); 109 /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1, 110 * iw0, w0, iw0, w0, iw1, w1, iw1, w1 111 */ 112 vx = _mm_add_epi16 (vx, vux); 113 114 x += 2 * ux; 115 116 vr = _mm_unpacklo_epi16 (vrl1, vrl0); 117 /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */ 118 119 s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2)); 120 /* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */ 121 122 vr = _mm_unpackhi_epi8 (vr, s); 123 /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1, 124 * lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1 125 */ 126 127 vr = _mm_maddubs_epi16 (vr, vw); 128 129 /* When the weight is 0, the inverse weight is 130 * 128 which can't be represented in a signed byte. 131 * As a result maddubsw computes the following: 132 * 133 * r = l * -128 + r * 0 134 * 135 * rather than the desired 136 * 137 * r = l * 128 + r * 0 138 * 139 * We fix this by taking the absolute value of the 140 * result. 
141 */ 142 vr = _mm_abs_epi16 (vr); 143 144 /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */ 145 _mm_store_si128 (b++, vr); 146 } 147 148 if (n == -1) 149 { 150 vrl1 = _mm_setzero_si128(); 151 goto final_pixel; 152 } 153 154 line->y = y; 155 } 156 157 static uint32_t * 158 ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask) 159 { 160 pixman_fixed_t fx, ux; 161 bilinear_info_t *info = iter->data; 162 line_t *line0, *line1; 163 int y0, y1; 164 int32_t dist_y; 165 __m128i vw; 166 int i; 167 168 fx = info->x; 169 ux = iter->image->common.transform->matrix[0][0]; 170 171 y0 = pixman_fixed_to_int (info->y); 172 y1 = y0 + 1; 173 174 line0 = &info->lines[y0 & 0x01]; 175 line1 = &info->lines[y1 & 0x01]; 176 177 if (line0->y != y0) 178 { 179 ssse3_fetch_horizontal ( 180 &iter->image->bits, line0, y0, fx, ux, iter->width); 181 } 182 183 if (line1->y != y1) 184 { 185 ssse3_fetch_horizontal ( 186 &iter->image->bits, line1, y1, fx, ux, iter->width); 187 } 188 189 dist_y = pixman_fixed_to_bilinear_weight (info->y); 190 dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS); 191 192 vw = _mm_set_epi16 ( 193 dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y); 194 195 for (i = 0; i + 3 < iter->width; i += 4) 196 { 197 __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); 198 __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); 199 __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2)); 200 __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2)); 201 __m128i r0, r1, tmp, p; 202 203 r0 = _mm_mulhi_epu16 ( 204 _mm_sub_epi16 (bot0, top0), vw); 205 tmp = _mm_cmplt_epi16 (bot0, top0); 206 tmp = _mm_and_si128 (tmp, vw); 207 r0 = _mm_sub_epi16 (r0, tmp); 208 r0 = _mm_add_epi16 (r0, top0); 209 r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); 210 /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ 211 r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); 212 /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ 213 214 r1 = _mm_mulhi_epu16 ( 215 _mm_sub_epi16 
(bot1, top1), vw); 216 tmp = _mm_cmplt_epi16 (bot1, top1); 217 tmp = _mm_and_si128 (tmp, vw); 218 r1 = _mm_sub_epi16 (r1, tmp); 219 r1 = _mm_add_epi16 (r1, top1); 220 r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS); 221 r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1)); 222 /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */ 223 224 p = _mm_packus_epi16 (r0, r1); 225 226 _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p); 227 } 228 229 while (i < iter->width) 230 { 231 __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); 232 __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); 233 __m128i r0, tmp, p; 234 235 r0 = _mm_mulhi_epu16 ( 236 _mm_sub_epi16 (bot0, top0), vw); 237 tmp = _mm_cmplt_epi16 (bot0, top0); 238 tmp = _mm_and_si128 (tmp, vw); 239 r0 = _mm_sub_epi16 (r0, tmp); 240 r0 = _mm_add_epi16 (r0, top0); 241 r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); 242 /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ 243 r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); 244 /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ 245 246 p = _mm_packus_epi16 (r0, r0); 247 248 if (iter->width - i == 1) 249 { 250 *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p); 251 i++; 252 } 253 else 254 { 255 _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p); 256 i += 2; 257 } 258 } 259 260 info->y += iter->image->common.transform->matrix[1][1]; 261 262 return iter->buffer; 263 } 264 265 static void 266 ssse3_bilinear_cover_iter_fini (pixman_iter_t *iter) 267 { 268 free (iter->data); 269 } 270 271 static void 272 ssse3_bilinear_cover_iter_init (pixman_iter_t *iter, const pixman_iter_info_t *iter_info) 273 { 274 int width = iter->width; 275 bilinear_info_t *info; 276 pixman_vector_t v; 277 278 /* Reference point is the center of the pixel */ 279 v.vector[0] = pixman_int_to_fixed (iter->x) + pixman_fixed_1 / 2; 280 v.vector[1] = pixman_int_to_fixed (iter->y) + pixman_fixed_1 / 2; 281 v.vector[2] = pixman_fixed_1; 282 283 if (!pixman_transform_point_3d (iter->image->common.transform, &v)) 
284 goto fail; 285 286 info = malloc (sizeof (*info) + (2 * width - 1) * sizeof (uint64_t) + 64); 287 if (!info) 288 goto fail; 289 290 info->x = v.vector[0] - pixman_fixed_1 / 2; 291 info->y = v.vector[1] - pixman_fixed_1 / 2; 292 293 #define ALIGN(addr) \ 294 ((void *)((((uintptr_t)(addr)) + 15) & (~15))) 295 296 /* It is safe to set the y coordinates to -1 initially 297 * because COVER_CLIP_BILINEAR ensures that we will only 298 * be asked to fetch lines in the [0, height) interval 299 */ 300 info->lines[0].y = -1; 301 info->lines[0].buffer = ALIGN (&(info->data[0])); 302 info->lines[1].y = -1; 303 info->lines[1].buffer = ALIGN (info->lines[0].buffer + width); 304 305 iter->get_scanline = ssse3_fetch_bilinear_cover; 306 iter->fini = ssse3_bilinear_cover_iter_fini; 307 308 iter->data = info; 309 return; 310 311 fail: 312 /* Something went wrong, either a bad matrix or OOM; in such cases, 313 * we don't guarantee any particular rendering. 314 */ 315 _pixman_log_error ( 316 FUNC, "Allocation failure or bad matrix, skipping rendering\n"); 317 318 iter->get_scanline = _pixman_iter_get_scanline_noop; 319 iter->fini = NULL; 320 } 321 322 static const pixman_iter_info_t ssse3_iters[] = 323 { 324 { PIXMAN_a8r8g8b8, 325 (FAST_PATH_STANDARD_FLAGS | 326 FAST_PATH_SCALE_TRANSFORM | 327 FAST_PATH_BILINEAR_FILTER | 328 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR), 329 ITER_NARROW | ITER_SRC, 330 ssse3_bilinear_cover_iter_init, 331 NULL, NULL 332 }, 333 334 { PIXMAN_null }, 335 }; 336 337 static const pixman_fast_path_t ssse3_fast_paths[] = 338 { 339 { PIXMAN_OP_NONE }, 340 }; 341 342 pixman_implementation_t * 343 _pixman_implementation_create_ssse3 (pixman_implementation_t *fallback) 344 { 345 pixman_implementation_t *imp = 346 _pixman_implementation_create (fallback, ssse3_fast_paths); 347 348 imp->iter_info = ssse3_iters; 349 350 return imp; 351 }