blend_mask_sse4.h (9741B)
1 /* 2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #ifndef AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ 13 #define AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_ 14 #include <smmintrin.h> // SSE4.1 15 16 #include <assert.h> 17 18 #include "aom/aom_integer.h" 19 #include "aom_ports/mem.h" 20 #include "aom_dsp/aom_dsp_common.h" 21 #include "aom_dsp/blend.h" 22 23 #include "aom_dsp/x86/synonyms.h" 24 25 #include "config/aom_dsp_rtcd.h" 26 27 static inline void blend_a64_d16_mask_w4_sse41( 28 uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, 29 const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, 30 int shift) { 31 const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); 32 const __m128i s0 = xx_loadl_64(src0); 33 const __m128i s1 = xx_loadl_64(src1); 34 const __m128i s0_s1 = _mm_unpacklo_epi16(s0, s1); 35 const __m128i m_max_minus_m = _mm_unpacklo_epi16(*m, max_minus_m); 36 const __m128i res_a = _mm_madd_epi16(s0_s1, m_max_minus_m); 37 const __m128i res_c = _mm_sub_epi32(res_a, *v_round_offset); 38 const __m128i res_d = _mm_srai_epi32(res_c, shift); 39 const __m128i res_e = _mm_packs_epi32(res_d, res_d); 40 const __m128i res = _mm_packus_epi16(res_e, res_e); 41 42 xx_storel_32(dst, res); 43 } 44 45 static inline void blend_a64_d16_mask_w8_sse41( 46 uint8_t *dst, const CONV_BUF_TYPE *src0, const CONV_BUF_TYPE *src1, 47 const __m128i *m, const __m128i *v_round_offset, const __m128i *v_maxval, 48 int shift) { 49 const __m128i max_minus_m = _mm_sub_epi16(*v_maxval, *m); 50 const __m128i s0 = xx_loadu_128(src0); 51 const __m128i s1 = xx_loadu_128(src1); 52 __m128i res_lo = _mm_madd_epi16(_mm_unpacklo_epi16(s0, s1), 53 _mm_unpacklo_epi16(*m, max_minus_m)); 54 __m128i res_hi = _mm_madd_epi16(_mm_unpackhi_epi16(s0, s1), 55 _mm_unpackhi_epi16(*m, max_minus_m)); 56 res_lo = _mm_srai_epi32(_mm_sub_epi32(res_lo, *v_round_offset), shift); 57 res_hi = _mm_srai_epi32(_mm_sub_epi32(res_hi, *v_round_offset), shift); 58 const __m128i res_e = _mm_packs_epi32(res_lo, res_hi); 59 const __m128i res = _mm_packus_epi16(res_e, res_e); 60 61 _mm_storel_epi64((__m128i *)(dst), res); 62 } 63 64 static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w4_sse4_1( 65 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 66 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 67 const uint8_t *mask, uint32_t mask_stride, int h, 68 const __m128i *round_offset, int shift) { 69 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 70 for (int i = 0; i < h; ++i) { 71 const __m128i m0 = xx_loadl_32(mask); 72 const __m128i m = _mm_cvtepu8_epi16(m0); 73 74 blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, 75 shift); 76 mask += mask_stride; 77 dst += dst_stride; 78 src0 += src0_stride; 79 src1 += src1_stride; 80 } 81 } 82 83 static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh0_w8_sse4_1( 84 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 85 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 86 const uint8_t *mask, uint32_t mask_stride, int h, 87 const __m128i *round_offset, int shift) { 88 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 89 for (int i = 0; i < h; ++i) { 90 const __m128i m0 = xx_loadl_64(mask); 91 const __m128i m = _mm_cvtepu8_epi16(m0); 92 blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, 93 shift); 94 mask += mask_stride; 95 dst += dst_stride; 96 src0 += src0_stride; 97 src1 += src1_stride; 98 } 99 } 100 101 static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w4_sse4_1( 102 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 103 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 104 const uint8_t *mask, uint32_t mask_stride, int h, 105 const __m128i *round_offset, int shift) { 106 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 107 const __m128i one_b = _mm_set1_epi8(1); 108 const __m128i two_w = _mm_set1_epi16(2); 109 for (int i = 0; i < h; ++i) { 110 const __m128i m_i0 = xx_loadl_64(mask); 111 const __m128i m_i1 = xx_loadl_64(mask + mask_stride); 112 const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); 113 const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); 114 const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); 115 const __m128i m = _mm_srli_epi16(m_acbd_2, 2); 116 117 blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, 118 shift); 119 mask += mask_stride << 1; 120 dst += dst_stride; 121 src0 += src0_stride; 122 src1 += src1_stride; 123 } 124 } 125 126 static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh1_w8_sse4_1( 127 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 128 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 129 const uint8_t *mask, uint32_t mask_stride, int h, 130 const __m128i *round_offset, int shift) { 131 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 132 const __m128i one_b = _mm_set1_epi8(1); 133 const __m128i two_w = _mm_set1_epi16(2); 134 for (int i = 0; i < h; ++i) { 135 const __m128i m_i0 = xx_loadu_128(mask); 136 const __m128i m_i1 = xx_loadu_128(mask + mask_stride); 137 const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); 138 const __m128i m_acbd = _mm_maddubs_epi16(m_ac, one_b); 139 const __m128i m_acbd_2 = _mm_add_epi16(m_acbd, two_w); 140 const __m128i m = _mm_srli_epi16(m_acbd_2, 2); 141 142 blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, 143 shift); 144 mask += mask_stride << 1; 145 dst += dst_stride; 146 src0 += src0_stride; 147 src1 += src1_stride; 148 } 149 } 150 151 static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w4_sse4_1( 152 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 153 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 154 const uint8_t *mask, uint32_t mask_stride, int h, 155 const __m128i *round_offset, int shift) { 156 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 157 const __m128i one_b = _mm_set1_epi8(1); 158 const __m128i zeros = _mm_setzero_si128(); 159 for (int i = 0; i < h; ++i) { 160 const __m128i m_i0 = xx_loadl_64(mask); 161 const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); 162 const __m128i m = _mm_avg_epu16(m_ac, zeros); 163 164 blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, 165 shift); 166 mask += mask_stride; 167 dst += dst_stride; 168 src0 += src0_stride; 169 src1 += src1_stride; 170 } 171 } 172 173 static inline void aom_lowbd_blend_a64_d16_mask_subw1_subh0_w8_sse4_1( 174 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 175 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 176 const uint8_t *mask, uint32_t mask_stride, int h, 177 const __m128i *round_offset, int shift) { 178 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 179 const __m128i one_b = _mm_set1_epi8(1); 180 const __m128i zeros = _mm_setzero_si128(); 181 for (int i = 0; i < h; ++i) { 182 const __m128i m_i0 = xx_loadu_128(mask); 183 const __m128i m_ac = _mm_maddubs_epi16(m_i0, one_b); 184 const __m128i m = _mm_avg_epu16(m_ac, zeros); 185 186 blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, 187 shift); 188 mask += mask_stride; 189 dst += dst_stride; 190 src0 += src0_stride; 191 src1 += src1_stride; 192 } 193 } 194 static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w4_sse4_1( 195 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 196 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 197 const uint8_t *mask, uint32_t mask_stride, int h, 198 const __m128i *round_offset, int shift) { 199 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 200 const __m128i zeros = _mm_setzero_si128(); 201 for (int i = 0; i < h; ++i) { 202 const __m128i m_i0 = xx_loadl_64(mask); 203 const __m128i m_i1 = xx_loadl_64(mask + mask_stride); 204 const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); 205 const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); 206 207 blend_a64_d16_mask_w4_sse41(dst, src0, src1, &m, round_offset, &v_maxval, 208 shift); 209 mask += mask_stride << 1; 210 dst += dst_stride; 211 src0 += src0_stride; 212 src1 += src1_stride; 213 } 214 } 215 216 static inline void aom_lowbd_blend_a64_d16_mask_subw0_subh1_w8_sse4_1( 217 uint8_t *dst, uint32_t dst_stride, const CONV_BUF_TYPE *src0, 218 uint32_t src0_stride, const CONV_BUF_TYPE *src1, uint32_t src1_stride, 219 const uint8_t *mask, uint32_t mask_stride, int h, 220 const __m128i *round_offset, int shift) { 221 const __m128i v_maxval = _mm_set1_epi16(AOM_BLEND_A64_MAX_ALPHA); 222 const __m128i zeros = _mm_setzero_si128(); 223 for (int i = 0; i < h; ++i) { 224 const __m128i m_i0 = xx_loadl_64(mask); 225 const __m128i m_i1 = xx_loadl_64(mask + mask_stride); 226 const __m128i m_ac = _mm_adds_epu8(m_i0, m_i1); 227 const __m128i m = _mm_cvtepu8_epi16(_mm_avg_epu8(m_ac, zeros)); 228 229 blend_a64_d16_mask_w8_sse41(dst, src0, src1, &m, round_offset, &v_maxval, 230 shift); 231 mask += mask_stride << 1; 232 dst += dst_stride; 233 src0 += src0_stride; 234 src1 += src1_stride; 235 } 236 } 237 #endif // AOM_AOM_DSP_X86_BLEND_MASK_SSE4_H_