highbd_reconintra_neon.c (7329B)
/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>
#include <string.h>  // memcpy — previously relied on transitive inclusion.

#include "aom_dsp/arm/sum_neon.h"
#include "config/av1_rtcd.h"

#define MAX_UPSAMPLE_SZ 16

// Smooth the high-bitdepth intra-prediction edge p[0..sz-1] in-place.
// strength selects the low-pass filter:
//   0: no filtering (early return)
//   1: 3-tap {4, 8, 4} / 16
//   2: 3-tap {5, 6, 5} / 16
//   else: 5-tap {2, 4, 4, 4, 2} / 16
// p[0] is never modified. The edge is mirrored into a padded scratch buffer
// so that vector loads near both ends stay in bounds, and the final partial
// vector is blended with the existing destination so no out-of-bounds pixels
// are overwritten.
void av1_highbd_filter_intra_edge_neon(uint16_t *p, int sz, int strength) {
  if (!strength) return;
  assert(sz >= 0 && sz <= 129);

  // 0..7 lane indices, used to build the tail-store mask.
  DECLARE_ALIGNED(16, static const uint16_t,
                  idx[8]) = { 0, 1, 2, 3, 4, 5, 6, 7 };
  const uint16x8_t index = vld1q_u16(idx);

  uint16_t edge[160];  // Max value of sz + enough padding for vector accesses.
  memcpy(edge + 1, p, sz * sizeof(*p));

  // Populate extra space appropriately: replicate the first pixel one slot
  // to the left and the last pixel two slots to the right so the 3- and
  // 5-tap filters see valid neighbours at both ends.
  edge[0] = edge[1];
  edge[sz + 1] = edge[sz];
  edge[sz + 2] = edge[sz];

  // Don't overwrite first pixel.
  uint16_t *dst = p + 1;
  sz--;

  if (strength == 1) {  // Filter: {4, 8, 4}.
    const uint16_t *src = edge + 1;

    while (sz >= 8) {
      uint16x8_t s0 = vld1q_u16(src);
      uint16x8_t s1 = vld1q_u16(src + 1);
      uint16x8_t s2 = vld1q_u16(src + 2);

      // Make use of the identity:
      // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
      uint16x8_t t0 = vaddq_u16(s0, s2);
      uint16x8_t t1 = vaddq_u16(s1, s1);
      uint16x8_t sum = vaddq_u16(t0, t1);
      uint16x8_t res = vrshrq_n_u16(sum, 2);

      vst1q_u16(dst, res);

      src += 8;
      dst += 8;
      sz -= 8;
    }

    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
      uint16x8_t s0 = vld1q_u16(src);
      uint16x8_t s1 = vld1q_u16(src + 1);
      uint16x8_t s2 = vld1q_u16(src + 2);

      // Make use of the identity:
      // (4*a + 8*b + 4*c) >> 4 == (a + (b << 1) + c) >> 2
      uint16x8_t t0 = vaddq_u16(s0, s2);
      uint16x8_t t1 = vaddq_u16(s1, s1);
      uint16x8_t sum = vaddq_u16(t0, t1);
      uint16x8_t res = vrshrq_n_u16(sum, 2);

      // Mask off out-of-bounds indices: lanes >= sz keep the original
      // destination pixels.
      uint16x8_t current_dst = vld1q_u16(dst);
      uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
      res = vbslq_u16(mask, res, current_dst);

      vst1q_u16(dst, res);
    }
  } else if (strength == 2) {  // Filter: {5, 6, 5}.
    const uint16_t *src = edge + 1;

    // No shift-based identity exists for {5, 6, 5}, so use full multiplies.
    const uint16x8x3_t filter = { { vdupq_n_u16(5), vdupq_n_u16(6),
                                    vdupq_n_u16(5) } };
    while (sz >= 8) {
      uint16x8_t s0 = vld1q_u16(src);
      uint16x8_t s1 = vld1q_u16(src + 1);
      uint16x8_t s2 = vld1q_u16(src + 2);

      uint16x8_t accum = vmulq_u16(s0, filter.val[0]);
      accum = vmlaq_u16(accum, s1, filter.val[1]);
      accum = vmlaq_u16(accum, s2, filter.val[2]);
      uint16x8_t res = vrshrq_n_u16(accum, 4);

      vst1q_u16(dst, res);

      src += 8;
      dst += 8;
      sz -= 8;
    }

    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
      uint16x8_t s0 = vld1q_u16(src);
      uint16x8_t s1 = vld1q_u16(src + 1);
      uint16x8_t s2 = vld1q_u16(src + 2);

      uint16x8_t accum = vmulq_u16(s0, filter.val[0]);
      accum = vmlaq_u16(accum, s1, filter.val[1]);
      accum = vmlaq_u16(accum, s2, filter.val[2]);
      uint16x8_t res = vrshrq_n_u16(accum, 4);

      // Mask off out-of-bounds indices: lanes >= sz keep the original
      // destination pixels.
      uint16x8_t current_dst = vld1q_u16(dst);
      uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
      res = vbslq_u16(mask, res, current_dst);

      vst1q_u16(dst, res);
    }
  } else {  // Filter {2, 4, 4, 4, 2}.
    // 5-tap filter: start one element earlier so s0 is the left-left
    // neighbour (edge[0] is the replicated first pixel).
    const uint16_t *src = edge;

    while (sz >= 8) {
      uint16x8_t s0 = vld1q_u16(src);
      uint16x8_t s1 = vld1q_u16(src + 1);
      uint16x8_t s2 = vld1q_u16(src + 2);
      uint16x8_t s3 = vld1q_u16(src + 3);
      uint16x8_t s4 = vld1q_u16(src + 4);

      // Make use of the identity:
      // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
      uint16x8_t t0 = vaddq_u16(s0, s4);
      uint16x8_t t1 = vaddq_u16(s1, s2);
      t1 = vaddq_u16(t1, s3);
      t1 = vaddq_u16(t1, t1);
      uint16x8_t sum = vaddq_u16(t0, t1);
      uint16x8_t res = vrshrq_n_u16(sum, 3);

      vst1q_u16(dst, res);

      src += 8;
      dst += 8;
      sz -= 8;
    }

    if (sz > 0) {  // Handle sz < 8 to avoid modifying out-of-bounds values.
      uint16x8_t s0 = vld1q_u16(src);
      uint16x8_t s1 = vld1q_u16(src + 1);
      uint16x8_t s2 = vld1q_u16(src + 2);
      uint16x8_t s3 = vld1q_u16(src + 3);
      uint16x8_t s4 = vld1q_u16(src + 4);

      // Make use of the identity:
      // (2*a + 4*b + 4*c + 4*d + 2*e) >> 4 == (a + ((b + c + d) << 1) + e) >> 3
      uint16x8_t t0 = vaddq_u16(s0, s4);
      uint16x8_t t1 = vaddq_u16(s1, s2);
      t1 = vaddq_u16(t1, s3);
      t1 = vaddq_u16(t1, t1);
      uint16x8_t sum = vaddq_u16(t0, t1);
      uint16x8_t res = vrshrq_n_u16(sum, 3);

      // Mask off out-of-bounds indices: lanes >= sz keep the original
      // destination pixels.
      uint16x8_t current_dst = vld1q_u16(dst);
      uint16x8_t mask = vcgtq_u16(vdupq_n_u16(sz), index);
      res = vbslq_u16(mask, res, current_dst);

      vst1q_u16(dst, res);
    }
  }
}

// Upsample the intra-prediction edge p[0..sz-1] by 2x in-place, writing the
// interpolated result starting at p[-2]. Half-sample positions are produced
// by the 4-tap filter {-1, 9, 9, -1} / 16; original samples are interleaved
// unchanged. Results are clamped to [0, (1 << bd) - 1]. Requires p[-1] to be
// a valid pixel and sz <= MAX_UPSAMPLE_SZ; the caller must provide writable
// space from p[-2] through the upsampled end.
void av1_highbd_upsample_intra_edge_neon(uint16_t *p, int sz, int bd) {
  if (!sz) return;

  assert(sz <= MAX_UPSAMPLE_SZ);

  uint16_t edge[MAX_UPSAMPLE_SZ + 3];
  const uint16_t *src = edge;

  // Copy p[-1..(sz-1)] and pad out both ends (first pixel replicated twice on
  // the left, last pixel once on the right).
  edge[0] = p[-1];
  edge[1] = p[-1];
  memcpy(edge + 2, p, sz * sizeof(*p));
  edge[sz + 2] = p[sz - 1];
  p[-2] = p[-1];

  uint16x8_t pixel_val_max = vdupq_n_u16((1 << bd) - 1);

  uint16_t *dst = p - 1;

  if (bd == 12) {
    // At 12 bits, 9 * (s1 + s2) can exceed 16 bits, so widen the
    // accumulation to 32 bits before the saturating subtract and narrowing
    // rounded shift.
    do {
      uint16x8_t s0 = vld1q_u16(src);
      uint16x8_t s1 = vld1q_u16(src + 1);
      uint16x8_t s2 = vld1q_u16(src + 2);
      uint16x8_t s3 = vld1q_u16(src + 3);

      uint16x8_t t0 = vaddq_u16(s1, s2);
      uint16x8_t t1 = vaddq_u16(s0, s3);
      uint32x4_t acc0 = vmull_n_u16(vget_low_u16(t0), 9);
      acc0 = vqsubq_u32(acc0, vmovl_u16(vget_low_u16(t1)));
      uint32x4_t acc1 = vmull_n_u16(vget_high_u16(t0), 9);
      acc1 = vqsubq_u32(acc1, vmovl_u16(vget_high_u16(t1)));

      uint16x8x2_t res;
      res.val[0] = vcombine_u16(vrshrn_n_u32(acc0, 4), vrshrn_n_u32(acc1, 4));
      // Clamp pixel values at bitdepth maximum.
      res.val[0] = vminq_u16(res.val[0], pixel_val_max);
      // Interleave filtered half-samples with the untouched originals.
      res.val[1] = s2;

      vst2q_u16(dst, res);

      src += 8;
      dst += 16;
      sz -= 8;
    } while (sz > 0);
  } else {  // Bit depth is 8 or 10: 9 * sum fits in 16 bits.
    do {
      uint16x8_t s0 = vld1q_u16(src);
      uint16x8_t s1 = vld1q_u16(src + 1);
      uint16x8_t s2 = vld1q_u16(src + 2);
      uint16x8_t s3 = vld1q_u16(src + 3);

      uint16x8_t t0 = vaddq_u16(s0, s3);
      uint16x8_t t1 = vaddq_u16(s1, s2);
      t1 = vmulq_n_u16(t1, 9);
      // Saturating subtract clamps the negative-lobe underflow at zero.
      t1 = vqsubq_u16(t1, t0);

      uint16x8x2_t res;
      res.val[0] = vrshrq_n_u16(t1, 4);
      // Clamp pixel values at bitdepth maximum.
      res.val[0] = vminq_u16(res.val[0], pixel_val_max);
      // Interleave filtered half-samples with the untouched originals.
      res.val[1] = s2;

      vst2q_u16(dst, res);

      src += 8;
      dst += 16;
      sz -= 8;
    } while (sz > 0);
  }
}