intra_edge_sse4.c (11606B)
1 /* 2 * Copyright (c) 2017, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <assert.h> 13 #include <smmintrin.h> 14 15 #include "config/aom_config.h" 16 #include "config/av1_rtcd.h" 17 18 void av1_filter_intra_edge_sse4_1(uint8_t *p, int sz, int strength) { 19 if (!strength) return; 20 21 DECLARE_ALIGNED(16, static const int8_t, kern[3][16]) = { 22 { 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0, 4, 8, 4, 0 }, // strength 1: 4,8,4 23 { 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0, 5, 6, 5, 0 }, // strength 2: 5,6,5 24 { 2, 4, 4, 4, 2, 0, 0, 0, 2, 4, 4, 4, 2, 0, 0, 0 } // strength 3: 2,4,4,4,2 25 }; 26 27 DECLARE_ALIGNED(16, static const int8_t, v_const[5][16]) = { 28 { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, 29 { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 }, 30 { 0, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8 }, 31 { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, 32 }; 33 34 // Extend the first and last samples to simplify the loop for the 5-tap case 35 p[-1] = p[0]; 36 __m128i last = _mm_set1_epi8((char)p[sz - 1]); 37 _mm_storeu_si128((__m128i *)&p[sz], last); 38 39 // Adjust input pointer for filter support area 40 uint8_t *in = (strength == 3) ? p - 1 : p; 41 42 // Avoid modifying first sample 43 uint8_t *out = p + 1; 44 int len = sz - 1; 45 46 const int use_3tap_filter = (strength < 3); 47 48 if (use_3tap_filter) { 49 __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); 50 __m128i shuf0 = _mm_lddqu_si128((__m128i const *)v_const[0]); 51 __m128i shuf1 = _mm_lddqu_si128((__m128i const *)v_const[1]); 52 __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); 53 __m128i in0 = _mm_lddqu_si128((__m128i *)in); 54 while (len > 0) { 55 int n_out = (len < 8) ? len : 8; 56 __m128i d0 = _mm_shuffle_epi8(in0, shuf0); 57 __m128i d1 = _mm_shuffle_epi8(in0, shuf1); 58 d0 = _mm_maddubs_epi16(d0, coef0); 59 d1 = _mm_maddubs_epi16(d1, coef0); 60 d0 = _mm_hadd_epi16(d0, d1); 61 __m128i eight = _mm_set1_epi16(8); 62 d0 = _mm_add_epi16(d0, eight); 63 d0 = _mm_srai_epi16(d0, 4); 64 d0 = _mm_packus_epi16(d0, d0); 65 __m128i out0 = _mm_lddqu_si128((__m128i *)out); 66 __m128i n0 = _mm_set1_epi8(n_out); 67 __m128i mask = _mm_cmpgt_epi8(n0, iden); 68 out0 = _mm_blendv_epi8(out0, d0, mask); 69 _mm_storel_epi64((__m128i *)out, out0); 70 __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); 71 in0 = _mm_alignr_epi8(in1, in0, 8); 72 in += 8; 73 out += 8; 74 len -= n_out; 75 } 76 } else { // 5-tap filter 77 __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); 78 __m128i two = _mm_set1_epi8(2); 79 __m128i shuf_a = _mm_lddqu_si128((__m128i const *)v_const[2]); 80 __m128i shuf_b = _mm_add_epi8(shuf_a, two); 81 __m128i shuf_c = _mm_add_epi8(shuf_b, two); 82 __m128i shuf_d = _mm_add_epi8(shuf_c, two); 83 __m128i iden = _mm_lddqu_si128((__m128i *)v_const[3]); 84 __m128i in0 = _mm_lddqu_si128((__m128i *)in); 85 while (len > 0) { 86 int n_out = (len < 8) ? len : 8; 87 __m128i d0 = _mm_shuffle_epi8(in0, shuf_a); 88 __m128i d1 = _mm_shuffle_epi8(in0, shuf_b); 89 __m128i d2 = _mm_shuffle_epi8(in0, shuf_c); 90 __m128i d3 = _mm_shuffle_epi8(in0, shuf_d); 91 d0 = _mm_maddubs_epi16(d0, coef0); 92 d1 = _mm_maddubs_epi16(d1, coef0); 93 d2 = _mm_maddubs_epi16(d2, coef0); 94 d3 = _mm_maddubs_epi16(d3, coef0); 95 d0 = _mm_hadd_epi16(d0, d1); 96 d2 = _mm_hadd_epi16(d2, d3); 97 d0 = _mm_hadd_epi16(d0, d2); 98 __m128i eight = _mm_set1_epi16(8); 99 d0 = _mm_add_epi16(d0, eight); 100 d0 = _mm_srai_epi16(d0, 4); 101 d0 = _mm_packus_epi16(d0, d0); 102 __m128i out0 = _mm_lddqu_si128((__m128i *)out); 103 __m128i n0 = _mm_set1_epi8(n_out); 104 __m128i mask = _mm_cmpgt_epi8(n0, iden); 105 out0 = _mm_blendv_epi8(out0, d0, mask); 106 _mm_storel_epi64((__m128i *)out, out0); 107 __m128i in1 = _mm_lddqu_si128((__m128i *)(in + 16)); 108 in0 = _mm_alignr_epi8(in1, in0, 8); 109 in += 8; 110 out += 8; 111 len -= n_out; 112 } 113 } 114 } 115 116 void av1_upsample_intra_edge_sse4_1(uint8_t *p, int sz) { 117 // interpolate half-sample positions 118 assert(sz <= 24); 119 120 DECLARE_ALIGNED(16, static const int8_t, kernel[1][16]) = { 121 { -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1, -1, 9, 9, -1 } 122 }; 123 124 DECLARE_ALIGNED( 125 16, static const int8_t, 126 v_const[2][16]) = { { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6 }, 127 { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } }; 128 129 // Extend first/last samples (upper-left p[-1], last p[sz-1]) 130 // to support 4-tap filter 131 p[-2] = p[-1]; 132 p[sz] = p[sz - 1]; 133 134 uint8_t *in = &p[-2]; 135 uint8_t *out = &p[-2]; 136 137 int n = sz + 1; // Input length including upper-left sample 138 139 __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); 140 __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); 141 142 __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); 143 __m128i shuf0 = _mm_lddqu_si128((__m128i *)v_const[0]); 144 __m128i shuf1 = _mm_lddqu_si128((__m128i *)v_const[1]); 145 146 while (n > 0) { 147 __m128i in8 = _mm_alignr_epi8(in16, in0, 8); 148 __m128i d0 = _mm_shuffle_epi8(in0, shuf0); 149 __m128i d1 = _mm_shuffle_epi8(in0, shuf1); 150 __m128i d2 = _mm_shuffle_epi8(in8, shuf0); 151 __m128i d3 = _mm_shuffle_epi8(in8, shuf1); 152 d0 = _mm_maddubs_epi16(d0, coef0); 153 d1 = _mm_maddubs_epi16(d1, coef0); 154 d2 = _mm_maddubs_epi16(d2, coef0); 155 d3 = _mm_maddubs_epi16(d3, coef0); 156 d0 = _mm_hadd_epi16(d0, d1); 157 d2 = _mm_hadd_epi16(d2, d3); 158 __m128i eight = _mm_set1_epi16(8); 159 d0 = _mm_add_epi16(d0, eight); 160 d2 = _mm_add_epi16(d2, eight); 161 d0 = _mm_srai_epi16(d0, 4); 162 d2 = _mm_srai_epi16(d2, 4); 163 d0 = _mm_packus_epi16(d0, d2); 164 __m128i in1 = _mm_alignr_epi8(in16, in0, 1); 165 __m128i out0 = _mm_unpacklo_epi8(in1, d0); 166 __m128i out1 = _mm_unpackhi_epi8(in1, d0); 167 _mm_storeu_si128((__m128i *)&out[0], out0); 168 _mm_storeu_si128((__m128i *)&out[16], out1); 169 in0 = in16; 170 in16 = _mm_setzero_si128(); 171 out += 32; 172 n -= 16; 173 } 174 } 175 176 #if CONFIG_AV1_HIGHBITDEPTH 177 178 void av1_highbd_filter_intra_edge_sse4_1(uint16_t *p, int sz, int strength) { 179 if (!strength) return; 180 181 DECLARE_ALIGNED(16, static const int16_t, kern[3][8]) = { 182 { 4, 8, 4, 8, 4, 8, 4, 8 }, // strength 1: 4,8,4 183 { 5, 6, 5, 6, 5, 6, 5, 6 }, // strength 2: 5,6,5 184 { 2, 4, 2, 4, 2, 4, 2, 4 } // strength 3: 2,4,4,4,2 185 }; 186 187 DECLARE_ALIGNED(16, static const int16_t, 188 v_const[1][8]) = { { 0, 1, 2, 3, 4, 5, 6, 7 } }; 189 190 // Extend the first and last samples to simplify the loop for the 5-tap case 191 p[-1] = p[0]; 192 __m128i last = _mm_set1_epi16(p[sz - 1]); 193 _mm_storeu_si128((__m128i *)&p[sz], last); 194 195 // Adjust input pointer for filter support area 196 uint16_t *in = (strength == 3) ? p - 1 : p; 197 198 // Avoid modifying first sample 199 uint16_t *out = p + 1; 200 int len = sz - 1; 201 202 const int use_3tap_filter = (strength < 3); 203 204 if (use_3tap_filter) { 205 __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); 206 __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); 207 __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); 208 __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); 209 while (len > 0) { 210 int n_out = (len < 8) ? len : 8; 211 __m128i in1 = _mm_alignr_epi8(in8, in0, 2); 212 __m128i in2 = _mm_alignr_epi8(in8, in0, 4); 213 __m128i in02 = _mm_add_epi16(in0, in2); 214 __m128i d0 = _mm_unpacklo_epi16(in02, in1); 215 __m128i d1 = _mm_unpackhi_epi16(in02, in1); 216 d0 = _mm_mullo_epi16(d0, coef0); 217 d1 = _mm_mullo_epi16(d1, coef0); 218 d0 = _mm_hadd_epi16(d0, d1); 219 __m128i eight = _mm_set1_epi16(8); 220 d0 = _mm_add_epi16(d0, eight); 221 d0 = _mm_srli_epi16(d0, 4); 222 __m128i out0 = _mm_lddqu_si128((__m128i *)out); 223 __m128i n0 = _mm_set1_epi16(n_out); 224 __m128i mask = _mm_cmpgt_epi16(n0, iden); 225 out0 = _mm_blendv_epi8(out0, d0, mask); 226 _mm_storeu_si128((__m128i *)out, out0); 227 in += 8; 228 in0 = in8; 229 in8 = _mm_lddqu_si128((__m128i *)&in[8]); 230 out += 8; 231 len -= n_out; 232 } 233 } else { // 5-tap filter 234 __m128i coef0 = _mm_lddqu_si128((__m128i const *)kern[strength - 1]); 235 __m128i iden = _mm_lddqu_si128((__m128i *)v_const[0]); 236 __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); 237 __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); 238 while (len > 0) { 239 int n_out = (len < 8) ? len : 8; 240 __m128i in1 = _mm_alignr_epi8(in8, in0, 2); 241 __m128i in2 = _mm_alignr_epi8(in8, in0, 4); 242 __m128i in3 = _mm_alignr_epi8(in8, in0, 6); 243 __m128i in4 = _mm_alignr_epi8(in8, in0, 8); 244 __m128i in04 = _mm_add_epi16(in0, in4); 245 __m128i in123 = _mm_add_epi16(in1, in2); 246 in123 = _mm_add_epi16(in123, in3); 247 __m128i d0 = _mm_unpacklo_epi16(in04, in123); 248 __m128i d1 = _mm_unpackhi_epi16(in04, in123); 249 d0 = _mm_mullo_epi16(d0, coef0); 250 d1 = _mm_mullo_epi16(d1, coef0); 251 d0 = _mm_hadd_epi16(d0, d1); 252 __m128i eight = _mm_set1_epi16(8); 253 d0 = _mm_add_epi16(d0, eight); 254 d0 = _mm_srli_epi16(d0, 4); 255 __m128i out0 = _mm_lddqu_si128((__m128i *)out); 256 __m128i n0 = _mm_set1_epi16(n_out); 257 __m128i mask = _mm_cmpgt_epi16(n0, iden); 258 out0 = _mm_blendv_epi8(out0, d0, mask); 259 _mm_storeu_si128((__m128i *)out, out0); 260 in += 8; 261 in0 = in8; 262 in8 = _mm_lddqu_si128((__m128i *)&in[8]); 263 out += 8; 264 len -= n_out; 265 } 266 } 267 } 268 269 void av1_highbd_upsample_intra_edge_sse4_1(uint16_t *p, int sz, int bd) { 270 // interpolate half-sample positions 271 assert(sz <= 24); 272 273 DECLARE_ALIGNED(16, static const int16_t, 274 kernel[1][8]) = { { -1, 9, -1, 9, -1, 9, -1, 9 } }; 275 276 // Extend first/last samples (upper-left p[-1], last p[sz-1]) 277 // to support 4-tap filter 278 p[-2] = p[-1]; 279 p[sz] = p[sz - 1]; 280 281 uint16_t *in = &p[-2]; 282 uint16_t *out = in; 283 int n = sz + 1; 284 285 __m128i in0 = _mm_lddqu_si128((__m128i *)&in[0]); 286 __m128i in8 = _mm_lddqu_si128((__m128i *)&in[8]); 287 __m128i in16 = _mm_lddqu_si128((__m128i *)&in[16]); 288 __m128i in24 = _mm_lddqu_si128((__m128i *)&in[24]); 289 290 while (n > 0) { 291 __m128i in1 = _mm_alignr_epi8(in8, in0, 2); 292 __m128i in2 = _mm_alignr_epi8(in8, in0, 4); 293 __m128i in3 = _mm_alignr_epi8(in8, in0, 6); 294 __m128i sum0 = _mm_add_epi16(in0, in3); 295 __m128i sum1 = _mm_add_epi16(in1, in2); 296 __m128i d0 = _mm_unpacklo_epi16(sum0, sum1); 297 __m128i d1 = _mm_unpackhi_epi16(sum0, sum1); 298 __m128i coef0 = _mm_lddqu_si128((__m128i *)kernel[0]); 299 d0 = _mm_madd_epi16(d0, coef0); 300 d1 = _mm_madd_epi16(d1, coef0); 301 __m128i eight = _mm_set1_epi32(8); 302 d0 = _mm_add_epi32(d0, eight); 303 d1 = _mm_add_epi32(d1, eight); 304 d0 = _mm_srai_epi32(d0, 4); 305 d1 = _mm_srai_epi32(d1, 4); 306 d0 = _mm_packus_epi32(d0, d1); 307 __m128i max0 = _mm_set1_epi16((1 << bd) - 1); 308 d0 = _mm_min_epi16(d0, max0); 309 __m128i out0 = _mm_unpacklo_epi16(in1, d0); 310 __m128i out1 = _mm_unpackhi_epi16(in1, d0); 311 _mm_storeu_si128((__m128i *)&out[0], out0); 312 _mm_storeu_si128((__m128i *)&out[8], out1); 313 in0 = in8; 314 in8 = in16; 315 in16 = in24; 316 in24 = _mm_setzero_si128(); 317 out += 16; 318 n -= 8; 319 } 320 } 321 322 #endif // CONFIG_AV1_HIGHBITDEPTH