aom_scaled_convolve8_neon.c (12913B)
1 /* 2 * Copyright (c) 2020, Alliance for Open Media. All rights reserved 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 #include <assert.h> 14 15 #include "aom_dsp/arm/aom_convolve8_neon.h" 16 #include "aom_dsp/arm/mem_neon.h" 17 #include "aom_dsp/arm/transpose_neon.h" 18 #include "config/aom_dsp_rtcd.h" 19 20 static inline void scaled_convolve_horiz_neon( 21 const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, 22 const ptrdiff_t dst_stride, const InterpKernel *const x_filter, 23 const int x0_q4, const int x_step_q4, int w, int h) { 24 DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]); 25 26 if (w == 4) { 27 do { 28 int x_q4 = x0_q4; 29 30 // Process a 4x4 tile. 31 for (int r = 0; r < 4; ++r) { 32 const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; 33 34 if (x_q4 & SUBPEL_MASK) { 35 // Halve filter values (all even) to avoid the need for saturating 36 // arithmetic in convolution kernels. 37 const int16x8_t filter = 38 vshrq_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); 39 40 uint8x8_t t0, t1, t2, t3; 41 load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); 42 transpose_elems_inplace_u8_8x4(&t0, &t1, &t2, &t3); 43 44 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); 45 int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); 46 int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); 47 int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); 48 int16x4_t s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); 49 int16x4_t s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); 50 int16x4_t s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); 51 int16x4_t s7 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); 52 53 int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); 54 // We halved the filter values so -1 from right shift. 55 uint8x8_t d0 = 56 vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS - 1); 57 58 store_u8_4x1(&temp[4 * r], d0); 59 } else { 60 // Memcpy for non-subpel locations. 61 s += SUBPEL_TAPS / 2 - 1; 62 63 for (int c = 0; c < 4; ++c) { 64 temp[r * 4 + c] = s[c * src_stride]; 65 } 66 } 67 x_q4 += x_step_q4; 68 } 69 70 // Transpose the 4x4 result tile and store. 71 uint8x8_t d01 = vld1_u8(temp + 0); 72 uint8x8_t d23 = vld1_u8(temp + 8); 73 74 transpose_elems_inplace_u8_4x4(&d01, &d23); 75 76 store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); 77 store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); 78 79 src += 4 * src_stride; 80 dst += 4 * dst_stride; 81 h -= 4; 82 } while (h > 0); 83 return; 84 } 85 86 // w >= 8 87 do { 88 int x_q4 = x0_q4; 89 uint8_t *d = dst; 90 int width = w; 91 92 do { 93 // Process an 8x8 tile. 94 for (int r = 0; r < 8; ++r) { 95 const uint8_t *s = &src[x_q4 >> SUBPEL_BITS]; 96 97 if (x_q4 & SUBPEL_MASK) { 98 // Halve filter values (all even) to avoid the need for saturating 99 // arithmetic in convolution kernels. 100 const int16x8_t filter = 101 vshrq_n_s16(vld1q_s16(x_filter[x_q4 & SUBPEL_MASK]), 1); 102 103 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; 104 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); 105 transpose_elems_inplace_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, 106 &t7); 107 108 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); 109 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); 110 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); 111 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); 112 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); 113 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); 114 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); 115 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); 116 117 uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); 118 119 vst1_u8(&temp[r * 8], d0); 120 } else { 121 // Memcpy for non-subpel locations. 122 s += SUBPEL_TAPS / 2 - 1; 123 124 for (int c = 0; c < 8; ++c) { 125 temp[r * 8 + c] = s[c * src_stride]; 126 } 127 } 128 x_q4 += x_step_q4; 129 } 130 131 // Transpose the 8x8 result tile and store. 132 uint8x8_t d0, d1, d2, d3, d4, d5, d6, d7; 133 load_u8_8x8(temp, 8, &d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); 134 135 transpose_elems_inplace_u8_8x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7); 136 137 store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7); 138 139 d += 8; 140 width -= 8; 141 } while (width != 0); 142 143 src += 8 * src_stride; 144 dst += 8 * dst_stride; 145 h -= 8; 146 } while (h > 0); 147 } 148 149 static inline void scaled_convolve_vert_neon( 150 const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst, 151 const ptrdiff_t dst_stride, const InterpKernel *const y_filter, 152 const int y0_q4, const int y_step_q4, int w, int h) { 153 int y_q4 = y0_q4; 154 155 if (w == 4) { 156 do { 157 const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 158 159 if (y_q4 & SUBPEL_MASK) { 160 // Halve filter values (all even) to avoid the need for saturating 161 // arithmetic in convolution kernels. 162 const int16x8_t filter = 163 vshrq_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); 164 165 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; 166 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); 167 168 int16x4_t s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0))); 169 int16x4_t s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1))); 170 int16x4_t s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2))); 171 int16x4_t s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3))); 172 int16x4_t s4 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t4))); 173 int16x4_t s5 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t5))); 174 int16x4_t s6 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t6))); 175 int16x4_t s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t7))); 176 177 int16x4_t dd0 = convolve8_4(s0, s1, s2, s3, s4, s5, s6, s7, filter); 178 // We halved the filter values so -1 from right shift. 179 uint8x8_t d0 = 180 vqrshrun_n_s16(vcombine_s16(dd0, vdup_n_s16(0)), FILTER_BITS - 1); 181 182 store_u8_4x1(dst, d0); 183 } else { 184 // Memcpy for non-subpel locations. 185 memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 4); 186 } 187 188 y_q4 += y_step_q4; 189 dst += dst_stride; 190 } while (--h != 0); 191 return; 192 } 193 194 if (w == 8) { 195 do { 196 const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 197 198 if (y_q4 & SUBPEL_MASK) { 199 // Halve filter values (all even) to avoid the need for saturating 200 // arithmetic in convolution kernels. 201 const int16x8_t filter = 202 vshrq_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); 203 204 uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7; 205 load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); 206 207 int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0)); 208 int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1)); 209 int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2)); 210 int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3)); 211 int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4)); 212 int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5)); 213 int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6)); 214 int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7)); 215 216 uint8x8_t d0 = convolve8_8(s0, s1, s2, s3, s4, s5, s6, s7, filter); 217 218 vst1_u8(dst, d0); 219 } else { 220 // Memcpy for non-subpel locations. 221 memcpy(dst, &s[(SUBPEL_TAPS / 2 - 1) * src_stride], 8); 222 } 223 224 y_q4 += y_step_q4; 225 dst += dst_stride; 226 } while (--h != 0); 227 return; 228 } 229 230 // w >= 16 231 do { 232 const uint8_t *s = &src[(y_q4 >> SUBPEL_BITS) * src_stride]; 233 uint8_t *d = dst; 234 int width = w; 235 236 if (y_q4 & SUBPEL_MASK) { 237 do { 238 // Halve filter values (all even) to avoid the need for saturating 239 // arithmetic in convolution kernels. 240 const int16x8_t filter = 241 vshrq_n_s16(vld1q_s16(y_filter[y_q4 & SUBPEL_MASK]), 1); 242 243 uint8x16_t t0, t1, t2, t3, t4, t5, t6, t7; 244 load_u8_16x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7); 245 246 int16x8_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2]; 247 s0[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0))); 248 s1[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1))); 249 s2[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2))); 250 s3[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3))); 251 s4[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t4))); 252 s5[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t5))); 253 s6[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t6))); 254 s7[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t7))); 255 256 s0[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0))); 257 s1[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1))); 258 s2[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2))); 259 s3[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3))); 260 s4[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t4))); 261 s5[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t5))); 262 s6[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t6))); 263 s7[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t7))); 264 265 uint8x8_t d0 = convolve8_8(s0[0], s1[0], s2[0], s3[0], s4[0], s5[0], 266 s6[0], s7[0], filter); 267 uint8x8_t d1 = convolve8_8(s0[1], s1[1], s2[1], s3[1], s4[1], s5[1], 268 s6[1], s7[1], filter); 269 270 vst1q_u8(d, vcombine_u8(d0, d1)); 271 272 s += 16; 273 d += 16; 274 width -= 16; 275 } while (width != 0); 276 } else { 277 // Memcpy for non-subpel locations. 278 s += (SUBPEL_TAPS / 2 - 1) * src_stride; 279 280 do { 281 uint8x16_t s0 = vld1q_u8(s); 282 vst1q_u8(d, s0); 283 s += 16; 284 d += 16; 285 width -= 16; 286 } while (width != 0); 287 } 288 289 y_q4 += y_step_q4; 290 dst += dst_stride; 291 } while (--h != 0); 292 } 293 294 void aom_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, 295 ptrdiff_t dst_stride, const InterpKernel *filter, 296 int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, 297 int w, int h) { 298 // Fixed size intermediate buffer, im_block, places limits on parameters. 299 // 2d filtering proceeds in 2 steps: 300 // (1) Interpolate horizontally into an intermediate buffer, temp. 301 // (2) Interpolate temp vertically to derive the sub-pixel result. 302 // Deriving the maximum number of rows in the im_block buffer (135): 303 // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative). 304 // --Largest block size is 64x64 pixels. 305 // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the 306 // original frame (in 1/16th pixel units). 307 // --Must round-up because block may be located at sub-pixel position. 308 // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails. 309 // --((64 - 1) * 32 + 15) >> 4 + 8 = 135. 310 // --Require an additional 8 rows for the horiz_w8 transpose tail. 311 // When calling in frame scaling function, the smallest scaling factor is x1/4 312 // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still 313 // big enough. 314 DECLARE_ALIGNED(16, uint8_t, im_block[(135 + 8) * 64]); 315 const int im_height = 316 (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS; 317 const ptrdiff_t im_stride = 64; 318 319 assert(w <= 64); 320 assert(h <= 64); 321 assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32)); 322 assert(x_step_q4 <= 64); 323 324 // Account for needing SUBPEL_TAPS / 2 - 1 lines prior and SUBPEL_TAPS / 2 325 // lines post both horizontally and vertically. 326 const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1; 327 const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride; 328 329 scaled_convolve_horiz_neon(src - horiz_offset - vert_offset, src_stride, 330 im_block, im_stride, filter, x0_q4, x_step_q4, w, 331 im_height); 332 333 scaled_convolve_vert_neon(im_block, im_stride, dst, dst_stride, filter, y0_q4, 334 y_step_q4, w, h); 335 }