hadamard_neon.c (12060B)
/*
 * Copyright (c) 2019, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"

static inline void hadamard_4x4_one_pass(int16x4_t *a0, int16x4_t *a1,
                                         int16x4_t *a2, int16x4_t *a3) {
  const int16x4_t b0 = vhadd_s16(*a0, *a1);
  const int16x4_t b1 = vhsub_s16(*a0, *a1);
  const int16x4_t b2 = vhadd_s16(*a2, *a3);
  const int16x4_t b3 = vhsub_s16(*a2, *a3);

  *a0 = vadd_s16(b0, b2);
  *a1 = vadd_s16(b1, b3);
  *a2 = vsub_s16(b0, b2);
  *a3 = vsub_s16(b1, b3);
}

void aom_hadamard_4x4_neon(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  int16x4_t a0 = vld1_s16(src_diff);
  int16x4_t a1 = vld1_s16(src_diff + src_stride);
  int16x4_t a2 = vld1_s16(src_diff + 2 * src_stride);
  int16x4_t a3 = vld1_s16(src_diff + 3 * src_stride);

  hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);

  transpose_elems_inplace_s16_4x4(&a0, &a1, &a2, &a3);

  hadamard_4x4_one_pass(&a0, &a1, &a2, &a3);

  store_s16_to_tran_low(coeff, a0);
  store_s16_to_tran_low(coeff + 4, a1);
  store_s16_to_tran_low(coeff + 8, a2);
  store_s16_to_tran_low(coeff + 12, a3);
}

static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2,
                                 int16x8_t *a3, int16x8_t *a4, int16x8_t *a5,
                                 int16x8_t *a6, int16x8_t *a7) {
  const int16x8_t b0 = vaddq_s16(*a0, *a1);
  const int16x8_t b1 = vsubq_s16(*a0, *a1);
  const int16x8_t b2 = vaddq_s16(*a2, *a3);
  const int16x8_t b3 = vsubq_s16(*a2, *a3);
  const int16x8_t b4 = vaddq_s16(*a4, *a5);
  const int16x8_t b5 = vsubq_s16(*a4, *a5);
  const int16x8_t b6 = vaddq_s16(*a6, *a7);
  const int16x8_t b7 = vsubq_s16(*a6, *a7);

  const int16x8_t c0 = vaddq_s16(b0, b2);
  const int16x8_t c1 = vaddq_s16(b1, b3);
  const int16x8_t c2 = vsubq_s16(b0, b2);
  const int16x8_t c3 = vsubq_s16(b1, b3);
  const int16x8_t c4 = vaddq_s16(b4, b6);
  const int16x8_t c5 = vaddq_s16(b5, b7);
  const int16x8_t c6 = vsubq_s16(b4, b6);
  const int16x8_t c7 = vsubq_s16(b5, b7);

  *a0 = vaddq_s16(c0, c4);
  *a1 = vsubq_s16(c2, c6);
  *a2 = vsubq_s16(c0, c4);
  *a3 = vaddq_s16(c2, c6);
  *a4 = vaddq_s16(c3, c7);
  *a5 = vsubq_s16(c3, c7);
  *a6 = vsubq_s16(c1, c5);
  *a7 = vaddq_s16(c1, c5);
}
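/* hadamard8x8_one_pass() above performs one direction of the 8-point
 * Hadamard transform across eight rows at a time: each butterfly maps a
 * pair (x, y) to (x + y, x - y), and three stages of such sums and
 * differences (the b*, c*, and final writes) complete the 8-point
 * transform. The final stage stores its results in a deliberately permuted
 * row order; together with running the helper once per direction around a
 * transpose, this lets the callers below omit the final transpose while
 * still producing the expected coefficient order. */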
void aom_hadamard_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  int16x8_t a0 = vld1q_s16(src_diff);
  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);

  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  // Skip the second transpose because it is not required.

  store_s16q_to_tran_low(coeff + 0, a0);
  store_s16q_to_tran_low(coeff + 8, a1);
  store_s16q_to_tran_low(coeff + 16, a2);
  store_s16q_to_tran_low(coeff + 24, a3);
  store_s16q_to_tran_low(coeff + 32, a4);
  store_s16q_to_tran_low(coeff + 40, a5);
  store_s16q_to_tran_low(coeff + 48, a6);
  store_s16q_to_tran_low(coeff + 56, a7);
}

void aom_hadamard_lp_8x8_neon(const int16_t *src_diff, ptrdiff_t src_stride,
                              int16_t *coeff) {
  int16x8_t a0 = vld1q_s16(src_diff);
  int16x8_t a1 = vld1q_s16(src_diff + src_stride);
  int16x8_t a2 = vld1q_s16(src_diff + 2 * src_stride);
  int16x8_t a3 = vld1q_s16(src_diff + 3 * src_stride);
  int16x8_t a4 = vld1q_s16(src_diff + 4 * src_stride);
  int16x8_t a5 = vld1q_s16(src_diff + 5 * src_stride);
  int16x8_t a6 = vld1q_s16(src_diff + 6 * src_stride);
  int16x8_t a7 = vld1q_s16(src_diff + 7 * src_stride);

  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  transpose_elems_inplace_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7);

  // Skip the second transpose because it is not required.

  vst1q_s16(coeff + 0, a0);
  vst1q_s16(coeff + 8, a1);
  vst1q_s16(coeff + 16, a2);
  vst1q_s16(coeff + 24, a3);
  vst1q_s16(coeff + 32, a4);
  vst1q_s16(coeff + 40, a5);
  vst1q_s16(coeff + 48, a6);
  vst1q_s16(coeff + 56, a7);
}

void aom_hadamard_lp_8x8_dual_neon(const int16_t *src_diff,
                                   ptrdiff_t src_stride, int16_t *coeff) {
  for (int i = 0; i < 2; i++) {
    aom_hadamard_lp_8x8_neon(src_diff + (i * 8), src_stride, coeff + (i * 64));
  }
}
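/* The 16x16 transforms below are built by divide and conquer: an 8x8
 * Hadamard transform of each quadrant, followed by one extra butterfly
 * stage that combines corresponding coefficients of the four quadrant
 * outputs. In the combining loops the halving intrinsics vhaddq/vhsubq
 * compute (a +/- b) >> 1 in a single instruction, mirroring the >> 1
 * normalization of the C reference and keeping the low-precision (int16)
 * path within range. */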
void aom_hadamard_lp_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
                                int16_t *coeff) {
  /* Rearrange 16x16 to 8x32 and remove stride.
   * Top left first. */
  aom_hadamard_lp_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride,
                           coeff + 0);
  /* Top right. */
  aom_hadamard_lp_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride,
                           coeff + 64);
  /* Bottom left. */
  aom_hadamard_lp_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride,
                           coeff + 128);
  /* Bottom right. */
  aom_hadamard_lp_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride,
                           coeff + 192);

  for (int i = 0; i < 64; i += 8) {
    const int16x8_t a0 = vld1q_s16(coeff + 0);
    const int16x8_t a1 = vld1q_s16(coeff + 64);
    const int16x8_t a2 = vld1q_s16(coeff + 128);
    const int16x8_t a3 = vld1q_s16(coeff + 192);

    const int16x8_t b0 = vhaddq_s16(a0, a1);
    const int16x8_t b1 = vhsubq_s16(a0, a1);
    const int16x8_t b2 = vhaddq_s16(a2, a3);
    const int16x8_t b3 = vhsubq_s16(a2, a3);

    const int16x8_t c0 = vaddq_s16(b0, b2);
    const int16x8_t c1 = vaddq_s16(b1, b3);
    const int16x8_t c2 = vsubq_s16(b0, b2);
    const int16x8_t c3 = vsubq_s16(b1, b3);

    vst1q_s16(coeff + 0, c0);
    vst1q_s16(coeff + 64, c1);
    vst1q_s16(coeff + 128, c2);
    vst1q_s16(coeff + 192, c3);

    coeff += 8;
  }
}

void aom_hadamard_16x16_neon(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  /* Rearrange 16x16 to 8x32 and remove stride.
   * Top left first. */
  aom_hadamard_8x8_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
  /* Top right. */
  aom_hadamard_8x8_neon(src_diff + 8 + 0 * src_stride, src_stride, coeff + 64);
  /* Bottom left. */
  aom_hadamard_8x8_neon(src_diff + 0 + 8 * src_stride, src_stride, coeff + 128);
  /* Bottom right. */
  aom_hadamard_8x8_neon(src_diff + 8 + 8 * src_stride, src_stride, coeff + 192);

  // Each iteration of the loop operates on entire rows (16 samples each)
  // because we need to swap the second and third quarters of every row in the
  // output to match AVX2 output (i.e., aom_hadamard_16x16_avx2). See the for
  // loop at the end of aom_hadamard_16x16_c.
  for (int i = 0; i < 64; i += 16) {
    const int32x4_t a00 = vld1q_s32(coeff + 0);
    const int32x4_t a01 = vld1q_s32(coeff + 64);
    const int32x4_t a02 = vld1q_s32(coeff + 128);
    const int32x4_t a03 = vld1q_s32(coeff + 192);

    const int32x4_t b00 = vhaddq_s32(a00, a01);
    const int32x4_t b01 = vhsubq_s32(a00, a01);
    const int32x4_t b02 = vhaddq_s32(a02, a03);
    const int32x4_t b03 = vhsubq_s32(a02, a03);

    const int32x4_t c00 = vaddq_s32(b00, b02);
    const int32x4_t c01 = vaddq_s32(b01, b03);
    const int32x4_t c02 = vsubq_s32(b00, b02);
    const int32x4_t c03 = vsubq_s32(b01, b03);

    const int32x4_t a10 = vld1q_s32(coeff + 4 + 0);
    const int32x4_t a11 = vld1q_s32(coeff + 4 + 64);
    const int32x4_t a12 = vld1q_s32(coeff + 4 + 128);
    const int32x4_t a13 = vld1q_s32(coeff + 4 + 192);

    const int32x4_t b10 = vhaddq_s32(a10, a11);
    const int32x4_t b11 = vhsubq_s32(a10, a11);
    const int32x4_t b12 = vhaddq_s32(a12, a13);
    const int32x4_t b13 = vhsubq_s32(a12, a13);

    const int32x4_t c10 = vaddq_s32(b10, b12);
    const int32x4_t c11 = vaddq_s32(b11, b13);
    const int32x4_t c12 = vsubq_s32(b10, b12);
    const int32x4_t c13 = vsubq_s32(b11, b13);

    const int32x4_t a20 = vld1q_s32(coeff + 8 + 0);
    const int32x4_t a21 = vld1q_s32(coeff + 8 + 64);
    const int32x4_t a22 = vld1q_s32(coeff + 8 + 128);
    const int32x4_t a23 = vld1q_s32(coeff + 8 + 192);

    const int32x4_t b20 = vhaddq_s32(a20, a21);
    const int32x4_t b21 = vhsubq_s32(a20, a21);
    const int32x4_t b22 = vhaddq_s32(a22, a23);
    const int32x4_t b23 = vhsubq_s32(a22, a23);

    const int32x4_t c20 = vaddq_s32(b20, b22);
    const int32x4_t c21 = vaddq_s32(b21, b23);
    const int32x4_t c22 = vsubq_s32(b20, b22);
    const int32x4_t c23 = vsubq_s32(b21, b23);

    const int32x4_t a30 = vld1q_s32(coeff + 12 + 0);
    const int32x4_t a31 = vld1q_s32(coeff + 12 + 64);
    const int32x4_t a32 = vld1q_s32(coeff + 12 + 128);
    const int32x4_t a33 = vld1q_s32(coeff + 12 + 192);

    const int32x4_t b30 = vhaddq_s32(a30, a31);
    const int32x4_t b31 = vhsubq_s32(a30, a31);
    const int32x4_t b32 = vhaddq_s32(a32, a33);
    const int32x4_t b33 = vhsubq_s32(a32, a33);

    const int32x4_t c30 = vaddq_s32(b30, b32);
    const int32x4_t c31 = vaddq_s32(b31, b33);
    const int32x4_t c32 = vsubq_s32(b30, b32);
    const int32x4_t c33 = vsubq_s32(b31, b33);

    vst1q_s32(coeff + 0 + 0, c00);
    vst1q_s32(coeff + 0 + 4, c20);
    vst1q_s32(coeff + 0 + 8, c10);
    vst1q_s32(coeff + 0 + 12, c30);

    vst1q_s32(coeff + 64 + 0, c01);
    vst1q_s32(coeff + 64 + 4, c21);
    vst1q_s32(coeff + 64 + 8, c11);
    vst1q_s32(coeff + 64 + 12, c31);

    vst1q_s32(coeff + 128 + 0, c02);
    vst1q_s32(coeff + 128 + 4, c22);
    vst1q_s32(coeff + 128 + 8, c12);
    vst1q_s32(coeff + 128 + 12, c32);

    vst1q_s32(coeff + 192 + 0, c03);
    vst1q_s32(coeff + 192 + 4, c23);
    vst1q_s32(coeff + 192 + 8, c13);
    vst1q_s32(coeff + 192 + 12, c33);

    coeff += 16;
  }
}
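/* The 32x32 transform repeats the quadrant decomposition once more. Here
 * the combining stage shifts the sums and differences right by 2 (NEON has
 * no halving add that shifts by two, so vshrq_n_s32 is applied after the
 * add/subtract), matching the >> 2 scaling of aom_hadamard_32x32_c and
 * bounding the dynamic range of the final coefficients. */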
void aom_hadamard_32x32_neon(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  /* Top left first. */
  aom_hadamard_16x16_neon(src_diff + 0 + 0 * src_stride, src_stride, coeff + 0);
  /* Top right. */
  aom_hadamard_16x16_neon(src_diff + 16 + 0 * src_stride, src_stride,
                          coeff + 256);
  /* Bottom left. */
  aom_hadamard_16x16_neon(src_diff + 0 + 16 * src_stride, src_stride,
                          coeff + 512);
  /* Bottom right. */
  aom_hadamard_16x16_neon(src_diff + 16 + 16 * src_stride, src_stride,
                          coeff + 768);

  for (int i = 0; i < 256; i += 4) {
    const int32x4_t a0 = vld1q_s32(coeff);
    const int32x4_t a1 = vld1q_s32(coeff + 256);
    const int32x4_t a2 = vld1q_s32(coeff + 512);
    const int32x4_t a3 = vld1q_s32(coeff + 768);

    const int32x4_t b0 = vshrq_n_s32(vaddq_s32(a0, a1), 2);
    const int32x4_t b1 = vshrq_n_s32(vsubq_s32(a0, a1), 2);
    const int32x4_t b2 = vshrq_n_s32(vaddq_s32(a2, a3), 2);
    const int32x4_t b3 = vshrq_n_s32(vsubq_s32(a2, a3), 2);

    const int32x4_t c0 = vaddq_s32(b0, b2);
    const int32x4_t c1 = vaddq_s32(b1, b3);
    const int32x4_t c2 = vsubq_s32(b0, b2);
    const int32x4_t c3 = vsubq_s32(b1, b3);

    vst1q_s32(coeff + 0, c0);
    vst1q_s32(coeff + 256, c1);
    vst1q_s32(coeff + 512, c2);
    vst1q_s32(coeff + 768, c3);

    coeff += 4;
  }
}
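/* Illustrative usage sketch, not part of this file: the Hadamard outputs
 * are normally consumed by a sum of absolute coefficients (SATD); the
 * helper aom_satd_lp() from aom_dsp is assumed here.
 *
 *   int16_t coeff[8 * 8];
 *   aom_hadamard_lp_8x8_neon(src_diff, src_stride, coeff);
 *   const int satd = aom_satd_lp(coeff, 8 * 8);  // sum of |coeff[i]|
 *
 * Because SATD reduces over every coefficient, it is insensitive to the
 * coefficient reorderings discussed above; those only matter for
 * bit-exactness against the C and AVX2 implementations. */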