highbd_wiener_convolve_ssse3.c (8616B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <tmmintrin.h> 13 #include <assert.h> 14 15 #include "config/av1_rtcd.h" 16 17 #include "av1/common/convolve.h" 18 #include "aom_dsp/aom_dsp_common.h" 19 #include "aom_dsp/aom_filter.h" 20 21 void av1_highbd_wiener_convolve_add_src_ssse3( 22 const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8, 23 ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, 24 const int16_t *filter_y, int y_step_q4, int w, int h, 25 const WienerConvolveParams *conv_params, int bd) { 26 assert(x_step_q4 == 16 && y_step_q4 == 16); 27 assert(!(w & 7)); 28 assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16); 29 (void)x_step_q4; 30 (void)y_step_q4; 31 32 const uint16_t *const src = CONVERT_TO_SHORTPTR(src8); 33 uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8); 34 35 DECLARE_ALIGNED(16, uint16_t, 36 temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]); 37 int intermediate_height = h + SUBPEL_TAPS - 1; 38 int i, j; 39 const int center_tap = ((SUBPEL_TAPS - 1) / 2); 40 const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap; 41 42 const __m128i zero = _mm_setzero_si128(); 43 // Add an offset to account for the "add_src" part of the convolve function. 44 const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); 45 46 /* Horizontal filter */ 47 { 48 const __m128i coeffs_x = 49 _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); 50 51 // coeffs 0 1 0 1 2 3 2 3 52 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); 53 // coeffs 4 5 4 5 6 7 6 7 54 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); 55 56 // coeffs 0 1 0 1 0 1 0 1 57 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); 58 // coeffs 2 3 2 3 2 3 2 3 59 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); 60 // coeffs 4 5 4 5 4 5 4 5 61 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); 62 // coeffs 6 7 6 7 6 7 6 7 63 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); 64 65 const __m128i round_const = _mm_set1_epi32( 66 (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1))); 67 68 for (i = 0; i < intermediate_height; ++i) { 69 for (j = 0; j < w; j += 8) { 70 const __m128i data = 71 _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); 72 const __m128i data2 = 73 _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]); 74 75 // Filter even-index pixels 76 const __m128i res_0 = _mm_madd_epi16(data, coeff_01); 77 const __m128i res_2 = 78 _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23); 79 const __m128i res_4 = 80 _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45); 81 const __m128i res_6 = 82 _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67); 83 84 __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), 85 _mm_add_epi32(res_2, res_6)); 86 res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), 87 conv_params->round_0); 88 89 // Filter odd-index pixels 90 const __m128i res_1 = 91 _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01); 92 const __m128i res_3 = 93 _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23); 94 const __m128i res_5 = 95 _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45); 96 const __m128i res_7 = 97 _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67); 98 99 __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), 100 _mm_add_epi32(res_3, res_7)); 101 res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), 102 conv_params->round_0); 103 104 // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 105 const __m128i maxval = 106 _mm_set1_epi16(WIENER_CLAMP_LIMIT(conv_params->round_0, bd) - 1); 107 __m128i res = _mm_packs_epi32(res_even, res_odd); 108 res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval); 109 _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); 110 } 111 } 112 } 113 114 /* Vertical filter */ 115 { 116 const __m128i coeffs_y = 117 _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); 118 119 // coeffs 0 1 0 1 2 3 2 3 120 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); 121 // coeffs 4 5 4 5 6 7 6 7 122 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); 123 124 // coeffs 0 1 0 1 0 1 0 1 125 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); 126 // coeffs 2 3 2 3 2 3 2 3 127 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); 128 // coeffs 4 5 4 5 4 5 4 5 129 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); 130 // coeffs 6 7 6 7 6 7 6 7 131 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); 132 133 const __m128i round_const = 134 _mm_set1_epi32((1 << (conv_params->round_1 - 1)) - 135 (1 << (bd + conv_params->round_1 - 1))); 136 137 for (i = 0; i < h; ++i) { 138 for (j = 0; j < w; j += 8) { 139 // Filter even-index pixels 140 const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; 141 const __m128i src_0 = 142 _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), 143 *(__m128i *)(data + 1 * MAX_SB_SIZE)); 144 const __m128i src_2 = 145 _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), 146 *(__m128i *)(data + 3 * MAX_SB_SIZE)); 147 const __m128i src_4 = 148 _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), 149 *(__m128i *)(data + 5 * MAX_SB_SIZE)); 150 const __m128i src_6 = 151 _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), 152 *(__m128i *)(data + 7 * MAX_SB_SIZE)); 153 154 const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); 155 const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); 156 const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); 157 const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); 158 159 const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), 160 _mm_add_epi32(res_4, res_6)); 161 162 // Filter odd-index pixels 163 const __m128i src_1 = 164 _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), 165 *(__m128i *)(data + 1 * MAX_SB_SIZE)); 166 const __m128i src_3 = 167 _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), 168 *(__m128i *)(data + 3 * MAX_SB_SIZE)); 169 const __m128i src_5 = 170 _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), 171 *(__m128i *)(data + 5 * MAX_SB_SIZE)); 172 const __m128i src_7 = 173 _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), 174 *(__m128i *)(data + 7 * MAX_SB_SIZE)); 175 176 const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); 177 const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); 178 const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); 179 const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); 180 181 const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), 182 _mm_add_epi32(res_5, res_7)); 183 184 // Rearrange pixels back into the order 0 ... 7 185 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); 186 const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); 187 188 const __m128i res_lo_round = _mm_srai_epi32( 189 _mm_add_epi32(res_lo, round_const), conv_params->round_1); 190 const __m128i res_hi_round = _mm_srai_epi32( 191 _mm_add_epi32(res_hi, round_const), conv_params->round_1); 192 193 const __m128i maxval = _mm_set1_epi16((1 << bd) - 1); 194 __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); 195 res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval); 196 197 __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; 198 _mm_storeu_si128(p, res_16bit); 199 } 200 } 201 } 202 }