blend_a64_vmask_sse4.c (10831B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <smmintrin.h> // SSE4.1 13 14 #include <assert.h> 15 16 #include "aom/aom_integer.h" 17 #include "aom_ports/mem.h" 18 #include "aom_dsp/aom_dsp_common.h" 19 #include "aom_dsp/blend.h" 20 21 #include "aom_dsp/x86/synonyms.h" 22 #include "aom_dsp/x86/blend_sse4.h" 23 24 #include "config/aom_dsp_rtcd.h" 25 26 ////////////////////////////////////////////////////////////////////////////// 27 // Implementation - No sub-sampling 28 ////////////////////////////////////////////////////////////////////////////// 29 30 static void blend_a64_vmask_w4_sse4_1(uint8_t *dst, uint32_t dst_stride, 31 const uint8_t *src0, uint32_t src0_stride, 32 const uint8_t *src1, uint32_t src1_stride, 33 const uint8_t *mask, int w, int h) { 34 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 35 36 (void)w; 37 38 do { 39 const __m128i v_m0_w = _mm_set1_epi16(*mask); 40 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 41 42 const __m128i v_res_w = blend_4(src0, src1, &v_m0_w, &v_m1_w); 43 44 const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); 45 46 xx_storel_32(dst, v_res_b); 47 48 dst += dst_stride; 49 src0 += src0_stride; 50 src1 += src1_stride; 51 mask += 1; 52 } while (--h); 53 } 54 55 static void blend_a64_vmask_w8_sse4_1(uint8_t *dst, uint32_t dst_stride, 56 const uint8_t *src0, uint32_t src0_stride, 57 const uint8_t *src1, uint32_t src1_stride, 58 const uint8_t *mask, int w, int h) { 59 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 60 61 (void)w; 62 63 do { 64 const __m128i v_m0_w = _mm_set1_epi16(*mask); 65 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 66 67 const __m128i v_res_w = blend_8(src0, src1, &v_m0_w, &v_m1_w); 68 69 const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); 70 71 xx_storel_64(dst, v_res_b); 72 73 dst += dst_stride; 74 src0 += src0_stride; 75 src1 += src1_stride; 76 mask += 1; 77 } while (--h); 78 } 79 80 static void blend_a64_vmask_w16n_sse4_1(uint8_t *dst, uint32_t dst_stride, 81 const uint8_t *src0, 82 uint32_t src0_stride, 83 const uint8_t *src1, 84 uint32_t src1_stride, 85 const uint8_t *mask, int w, int h) { 86 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 87 88 do { 89 int c; 90 const __m128i v_m0_w = _mm_set1_epi16(*mask); 91 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 92 for (c = 0; c < w; c += 16) { 93 const __m128i v_resl_w = blend_8(src0 + c, src1 + c, &v_m0_w, &v_m1_w); 94 const __m128i v_resh_w = 95 blend_8(src0 + c + 8, src1 + c + 8, &v_m0_w, &v_m1_w); 96 97 const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); 98 99 xx_storeu_128(dst + c, v_res_b); 100 } 101 dst += dst_stride; 102 src0 += src0_stride; 103 src1 += src1_stride; 104 mask += 1; 105 } while (--h); 106 } 107 108 ////////////////////////////////////////////////////////////////////////////// 109 // Dispatch 110 ////////////////////////////////////////////////////////////////////////////// 111 112 void aom_blend_a64_vmask_sse4_1(uint8_t *dst, uint32_t dst_stride, 113 const uint8_t *src0, uint32_t src0_stride, 114 const uint8_t *src1, uint32_t src1_stride, 115 const uint8_t *mask, int w, int h) { 116 typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride, 117 const uint8_t *src0, uint32_t src0_stride, 118 const uint8_t *src1, uint32_t src1_stride, 119 const uint8_t *mask, int w, int h); 120 121 // Dimension: width_index 122 static const blend_fn blend[9] = { 123 blend_a64_vmask_w16n_sse4_1, // w % 16 == 0 124 aom_blend_a64_vmask_c, // w == 1 125 aom_blend_a64_vmask_c, // w == 2 126 NULL, // INVALID 127 blend_a64_vmask_w4_sse4_1, // w == 4 128 NULL, // INVALID 129 NULL, // INVALID 130 NULL, // INVALID 131 blend_a64_vmask_w8_sse4_1, // w == 8 132 }; 133 134 assert(IMPLIES(src0 == dst, src0_stride == dst_stride)); 135 assert(IMPLIES(src1 == dst, src1_stride == dst_stride)); 136 137 assert(h >= 1); 138 assert(w >= 1); 139 assert(IS_POWER_OF_TWO(h)); 140 assert(IS_POWER_OF_TWO(w)); 141 142 blend[w & 0xf](dst, dst_stride, src0, src0_stride, src1, src1_stride, mask, w, 143 h); 144 } 145 146 #if CONFIG_AV1_HIGHBITDEPTH 147 ////////////////////////////////////////////////////////////////////////////// 148 // Implementation - No sub-sampling 149 ////////////////////////////////////////////////////////////////////////////// 150 151 static inline void blend_a64_vmask_bn_w4_sse4_1( 152 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 153 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 154 const uint8_t *mask, int h, blend_unit_fn blend) { 155 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 156 157 do { 158 const __m128i v_m0_w = _mm_set1_epi16(*mask); 159 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 160 161 const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w); 162 163 xx_storel_64(dst, v_res_w); 164 165 dst += dst_stride; 166 src0 += src0_stride; 167 src1 += src1_stride; 168 mask += 1; 169 } while (--h); 170 } 171 172 static void blend_a64_vmask_b10_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, 173 const uint16_t *src0, 174 uint32_t src0_stride, 175 const uint16_t *src1, 176 uint32_t src1_stride, 177 const uint8_t *mask, int w, int h) { 178 (void)w; 179 blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, 180 src1_stride, mask, h, blend_4_b10); 181 } 182 183 static void blend_a64_vmask_b12_w4_sse4_1(uint16_t *dst, uint32_t dst_stride, 184 const uint16_t *src0, 185 uint32_t src0_stride, 186 const uint16_t *src1, 187 uint32_t src1_stride, 188 const uint8_t *mask, int w, int h) { 189 (void)w; 190 blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1, 191 src1_stride, mask, h, blend_4_b12); 192 } 193 194 static inline void blend_a64_vmask_bn_w8n_sse4_1( 195 uint16_t *dst, uint32_t dst_stride, const uint16_t *src0, 196 uint32_t src0_stride, const uint16_t *src1, uint32_t src1_stride, 197 const uint8_t *mask, int w, int h, blend_unit_fn blend) { 198 const __m128i v_maxval_w = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 199 200 do { 201 int c; 202 const __m128i v_m0_w = _mm_set1_epi16(*mask); 203 const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); 204 for (c = 0; c < w; c += 8) { 205 const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w); 206 207 xx_storeu_128(dst + c, v_res_w); 208 } 209 dst += dst_stride; 210 src0 += src0_stride; 211 src1 += src1_stride; 212 mask += 1; 213 } while (--h); 214 } 215 216 static void blend_a64_vmask_b10_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, 217 const uint16_t *src0, 218 uint32_t src0_stride, 219 const uint16_t *src1, 220 uint32_t src1_stride, 221 const uint8_t *mask, int w, int h) { 222 blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, 223 src1_stride, mask, w, h, blend_8_b10); 224 } 225 226 static void blend_a64_vmask_b12_w8n_sse4_1(uint16_t *dst, uint32_t dst_stride, 227 const uint16_t *src0, 228 uint32_t src0_stride, 229 const uint16_t *src1, 230 uint32_t src1_stride, 231 const uint8_t *mask, int w, int h) { 232 blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1, 233 src1_stride, mask, w, h, blend_8_b12); 234 } 235 236 ////////////////////////////////////////////////////////////////////////////// 237 // Dispatch 238 ////////////////////////////////////////////////////////////////////////////// 239 240 void aom_highbd_blend_a64_vmask_sse4_1( 241 uint8_t *dst_8, uint32_t dst_stride, const uint8_t *src0_8, 242 uint32_t src0_stride, const uint8_t *src1_8, uint32_t src1_stride, 243 const uint8_t *mask, int w, int h, int bd) { 244 typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride, 245 const uint16_t *src0, uint32_t src0_stride, 246 const uint16_t *src1, uint32_t src1_stride, 247 const uint8_t *mask, int w, int h); 248 249 // Dimensions are: bd_index X width_index 250 static const blend_fn blend[2][2] = { 251 { 252 // bd == 8 or 10 253 blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0 254 blend_a64_vmask_b10_w4_sse4_1, // w == 4 255 }, 256 { 257 // bd == 12 258 blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0 259 blend_a64_vmask_b12_w4_sse4_1, // w == 4 260 } 261 }; 262 263 assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride)); 264 assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride)); 265 266 assert(h >= 1); 267 assert(w >= 1); 268 assert(IS_POWER_OF_TWO(h)); 269 assert(IS_POWER_OF_TWO(w)); 270 271 assert(bd == 8 || bd == 10 || bd == 12); 272 273 if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2) 274 aom_highbd_blend_a64_vmask_c(dst_8, dst_stride, src0_8, src0_stride, src1_8, 275 src1_stride, mask, w, h, bd); 276 } else { 277 uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8); 278 const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8); 279 const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8); 280 281 blend[bd == 12][(w >> 2) & 1](dst, dst_stride, src0, src0_stride, src1, 282 src1_stride, mask, w, h); 283 } 284 } 285 #endif // CONFIG_AV1_HIGHBITDEPTH