ipred.h (15837B)
1 /* 2 * Copyright © 2018, VideoLAN and dav1d authors 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions are met: 7 * 8 * 1. Redistributions of source code must retain the above copyright notice, this 9 * list of conditions and the following disclaimer. 10 * 11 * 2. Redistributions in binary form must reproduce the above copyright notice, 12 * this list of conditions and the following disclaimer in the documentation 13 * and/or other materials provided with the distribution. 14 * 15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 19 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 20 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 21 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 22 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 24 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 25 */ 26 27 #include "src/cpu.h" 28 #include "src/ipred.h" 29 30 decl_angular_ipred_fn(BF(dav1d_ipred_dc, neon)); 31 decl_angular_ipred_fn(BF(dav1d_ipred_dc_128, neon)); 32 decl_angular_ipred_fn(BF(dav1d_ipred_dc_top, neon)); 33 decl_angular_ipred_fn(BF(dav1d_ipred_dc_left, neon)); 34 decl_angular_ipred_fn(BF(dav1d_ipred_h, neon)); 35 decl_angular_ipred_fn(BF(dav1d_ipred_v, neon)); 36 decl_angular_ipred_fn(BF(dav1d_ipred_paeth, neon)); 37 decl_angular_ipred_fn(BF(dav1d_ipred_smooth, neon)); 38 decl_angular_ipred_fn(BF(dav1d_ipred_smooth_v, neon)); 39 decl_angular_ipred_fn(BF(dav1d_ipred_smooth_h, neon)); 40 decl_angular_ipred_fn(BF(dav1d_ipred_filter, neon)); 41 42 decl_cfl_pred_fn(BF(dav1d_ipred_cfl, neon)); 43 decl_cfl_pred_fn(BF(dav1d_ipred_cfl_128, neon)); 44 decl_cfl_pred_fn(BF(dav1d_ipred_cfl_top, neon)); 45 decl_cfl_pred_fn(BF(dav1d_ipred_cfl_left, neon)); 46 47 decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_420, neon)); 48 decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_422, neon)); 49 decl_cfl_ac_fn(BF(dav1d_ipred_cfl_ac_444, neon)); 50 51 decl_pal_pred_fn(BF(dav1d_pal_pred, neon)); 52 53 #if ARCH_AARCH64 54 void BF(dav1d_ipred_z1_upsample_edge, neon)(pixel *out, const int hsz, 55 const pixel *const in, 56 const int end HIGHBD_DECL_SUFFIX); 57 void BF(dav1d_ipred_z1_filter_edge, neon)(pixel *out, const int sz, 58 const pixel *const in, 59 const int end, const int strength); 60 void BF(dav1d_ipred_pixel_set, neon)(pixel *out, const pixel px, 61 const int n); 62 void BF(dav1d_ipred_z1_fill1, neon)(pixel *dst, ptrdiff_t stride, 63 const pixel *const top, const int width, 64 const int height, const int dx, 65 const int max_base_x); 66 void BF(dav1d_ipred_z1_fill2, neon)(pixel *dst, ptrdiff_t stride, 67 const pixel *const top, const int width, 68 const int height, const int dx, 69 const int max_base_x); 70 71 static void ipred_z1_neon(pixel *dst, const ptrdiff_t stride, 72 const pixel *const topleft_in, 73 const int width, const int height, int angle, 74 const int max_width, const int max_height 75 HIGHBD_DECL_SUFFIX) 76 { 77 const int is_sm = (angle >> 9) & 0x1; 78 const int enable_intra_edge_filter = angle >> 10; 79 angle &= 511; 80 int dx = dav1d_dr_intra_derivative[angle >> 1]; 81 pixel top_out[64 + 64 + (64+15)*2 + 16]; 82 int max_base_x; 83 const int upsample_above = enable_intra_edge_filter ? 84 get_upsample(width + height, 90 - angle, is_sm) : 0; 85 if (upsample_above) { 86 BF(dav1d_ipred_z1_upsample_edge, neon)(top_out, width + height, 87 topleft_in, 88 width + imin(width, height) 89 HIGHBD_TAIL_SUFFIX); 90 max_base_x = 2 * (width + height) - 2; 91 dx <<= 1; 92 } else { 93 const int filter_strength = enable_intra_edge_filter ? 94 get_filter_strength(width + height, 90 - angle, is_sm) : 0; 95 if (filter_strength) { 96 BF(dav1d_ipred_z1_filter_edge, neon)(top_out, width + height, 97 topleft_in, 98 width + imin(width, height), 99 filter_strength); 100 max_base_x = width + height - 1; 101 } else { 102 max_base_x = width + imin(width, height) - 1; 103 memcpy(top_out, &topleft_in[1], (max_base_x + 1) * sizeof(pixel)); 104 } 105 } 106 const int base_inc = 1 + upsample_above; 107 int pad_pixels = width + 15; // max(dx >> 6) == 15 108 BF(dav1d_ipred_pixel_set, neon)(&top_out[max_base_x + 1], 109 top_out[max_base_x], pad_pixels * base_inc); 110 if (upsample_above) 111 BF(dav1d_ipred_z1_fill2, neon)(dst, stride, top_out, width, height, 112 dx, max_base_x); 113 else 114 BF(dav1d_ipred_z1_fill1, neon)(dst, stride, top_out, width, height, 115 dx, max_base_x); 116 } 117 118 void BF(dav1d_ipred_reverse, neon)(pixel *dst, const pixel *const src, 119 const int n); 120 121 void BF(dav1d_ipred_z2_upsample_edge, neon)(pixel *out, const int sz, 122 const pixel *const in 123 HIGHBD_DECL_SUFFIX); 124 125 void BF(dav1d_ipred_z2_fill1, neon)(pixel *dst, ptrdiff_t stride, 126 const pixel *const top, 127 const pixel *const left, 128 const int width, const int height, 129 const int dx, const int dy); 130 void BF(dav1d_ipred_z2_fill2, neon)(pixel *dst, ptrdiff_t stride, 131 const pixel *const top, 132 const pixel *const left, 133 const int width, const int height, 134 const int dx, const int dy); 135 void BF(dav1d_ipred_z2_fill3, neon)(pixel *dst, ptrdiff_t stride, 136 const pixel *const top, 137 const pixel *const left, 138 const int width, const int height, 139 const int dx, const int dy); 140 141 static void ipred_z2_neon(pixel *dst, const ptrdiff_t stride, 142 const pixel *const topleft_in, 143 const int width, const int height, int angle, 144 const int max_width, const int max_height 145 HIGHBD_DECL_SUFFIX) 146 { 147 const int is_sm = (angle >> 9) & 0x1; 148 const int enable_intra_edge_filter = angle >> 10; 149 angle &= 511; 150 assert(angle > 90 && angle < 180); 151 int dy = dav1d_dr_intra_derivative[(angle - 90) >> 1]; 152 int dx = dav1d_dr_intra_derivative[(180 - angle) >> 1]; 153 const int upsample_left = enable_intra_edge_filter ? 154 get_upsample(width + height, 180 - angle, is_sm) : 0; 155 const int upsample_above = enable_intra_edge_filter ? 156 get_upsample(width + height, angle - 90, is_sm) : 0; 157 pixel buf[3*(64+1)]; 158 pixel *left = &buf[2*(64+1)]; 159 // The asm can underread below the start of top[] and left[]; to avoid 160 // surprising behaviour, make sure this is within the allocated stack space. 161 pixel *top = &buf[1*(64+1)]; 162 pixel *flipped = &buf[0*(64+1)]; 163 164 if (upsample_above) { 165 BF(dav1d_ipred_z2_upsample_edge, neon)(top, width, topleft_in 166 HIGHBD_TAIL_SUFFIX); 167 dx <<= 1; 168 } else { 169 const int filter_strength = enable_intra_edge_filter ? 170 get_filter_strength(width + height, angle - 90, is_sm) : 0; 171 172 if (filter_strength) { 173 BF(dav1d_ipred_z1_filter_edge, neon)(&top[1], imin(max_width, width), 174 topleft_in, width, 175 filter_strength); 176 if (max_width < width) 177 memcpy(&top[1 + max_width], &topleft_in[1 + max_width], 178 (width - max_width) * sizeof(pixel)); 179 } else { 180 pixel_copy(&top[1], &topleft_in[1], width); 181 } 182 } 183 if (upsample_left) { 184 flipped[0] = topleft_in[0]; 185 BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], 186 height); 187 BF(dav1d_ipred_z2_upsample_edge, neon)(left, height, flipped 188 HIGHBD_TAIL_SUFFIX); 189 dy <<= 1; 190 } else { 191 const int filter_strength = enable_intra_edge_filter ? 192 get_filter_strength(width + height, 180 - angle, is_sm) : 0; 193 194 if (filter_strength) { 195 flipped[0] = topleft_in[0]; 196 BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], 197 height); 198 BF(dav1d_ipred_z1_filter_edge, neon)(&left[1], imin(max_height, height), 199 flipped, height, 200 filter_strength); 201 if (max_height < height) 202 memcpy(&left[1 + max_height], &flipped[1 + max_height], 203 (height - max_height) * sizeof(pixel)); 204 } else { 205 BF(dav1d_ipred_reverse, neon)(&left[1], &topleft_in[0], 206 height); 207 } 208 } 209 top[0] = left[0] = *topleft_in; 210 211 assert(!(upsample_above && upsample_left)); 212 if (!upsample_above && !upsample_left) { 213 BF(dav1d_ipred_z2_fill1, neon)(dst, stride, top, left, width, height, 214 dx, dy); 215 } else if (upsample_above) { 216 BF(dav1d_ipred_z2_fill2, neon)(dst, stride, top, left, width, height, 217 dx, dy); 218 } else /*if (upsample_left)*/ { 219 BF(dav1d_ipred_z2_fill3, neon)(dst, stride, top, left, width, height, 220 dx, dy); 221 } 222 } 223 224 void BF(dav1d_ipred_z3_fill1, neon)(pixel *dst, ptrdiff_t stride, 225 const pixel *const left, const int width, 226 const int height, const int dy, 227 const int max_base_y); 228 void BF(dav1d_ipred_z3_fill2, neon)(pixel *dst, ptrdiff_t stride, 229 const pixel *const left, const int width, 230 const int height, const int dy, 231 const int max_base_y); 232 233 static void ipred_z3_neon(pixel *dst, const ptrdiff_t stride, 234 const pixel *const topleft_in, 235 const int width, const int height, int angle, 236 const int max_width, const int max_height 237 HIGHBD_DECL_SUFFIX) 238 { 239 const int is_sm = (angle >> 9) & 0x1; 240 const int enable_intra_edge_filter = angle >> 10; 241 angle &= 511; 242 assert(angle > 180); 243 int dy = dav1d_dr_intra_derivative[(270 - angle) >> 1]; 244 pixel flipped[64 + 64 + 16]; 245 pixel left_out[64 + 64 + (64+15)*2]; 246 int max_base_y; 247 const int upsample_left = enable_intra_edge_filter ? 248 get_upsample(width + height, angle - 180, is_sm) : 0; 249 if (upsample_left) { 250 flipped[0] = topleft_in[0]; 251 BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], 252 height + imax(width, height)); 253 BF(dav1d_ipred_z1_upsample_edge, neon)(left_out, width + height, 254 flipped, 255 height + imin(width, height) 256 HIGHBD_TAIL_SUFFIX); 257 max_base_y = 2 * (width + height) - 2; 258 dy <<= 1; 259 } else { 260 const int filter_strength = enable_intra_edge_filter ? 261 get_filter_strength(width + height, angle - 180, is_sm) : 0; 262 263 if (filter_strength) { 264 flipped[0] = topleft_in[0]; 265 BF(dav1d_ipred_reverse, neon)(&flipped[1], &topleft_in[0], 266 height + imax(width, height)); 267 BF(dav1d_ipred_z1_filter_edge, neon)(left_out, width + height, 268 flipped, 269 height + imin(width, height), 270 filter_strength); 271 max_base_y = width + height - 1; 272 } else { 273 BF(dav1d_ipred_reverse, neon)(left_out, &topleft_in[0], 274 height + imin(width, height)); 275 max_base_y = height + imin(width, height) - 1; 276 } 277 } 278 const int base_inc = 1 + upsample_left; 279 // The tbx based implementation needs left[] to have 64 bytes intitialized, 280 // the other implementation can read height + max(dy >> 6) past the end. 281 int pad_pixels = imax(64 - max_base_y - 1, height + 15); 282 283 BF(dav1d_ipred_pixel_set, neon)(&left_out[max_base_y + 1], 284 left_out[max_base_y], pad_pixels * base_inc); 285 if (upsample_left) 286 BF(dav1d_ipred_z3_fill2, neon)(dst, stride, left_out, width, height, 287 dy, max_base_y); 288 else 289 BF(dav1d_ipred_z3_fill1, neon)(dst, stride, left_out, width, height, 290 dy, max_base_y); 291 } 292 #endif 293 294 static ALWAYS_INLINE void intra_pred_dsp_init_arm(Dav1dIntraPredDSPContext *const c) { 295 const unsigned flags = dav1d_get_cpu_flags(); 296 297 if (!(flags & DAV1D_ARM_CPU_FLAG_NEON)) return; 298 299 c->intra_pred[DC_PRED] = BF(dav1d_ipred_dc, neon); 300 c->intra_pred[DC_128_PRED] = BF(dav1d_ipred_dc_128, neon); 301 c->intra_pred[TOP_DC_PRED] = BF(dav1d_ipred_dc_top, neon); 302 c->intra_pred[LEFT_DC_PRED] = BF(dav1d_ipred_dc_left, neon); 303 c->intra_pred[HOR_PRED] = BF(dav1d_ipred_h, neon); 304 c->intra_pred[VERT_PRED] = BF(dav1d_ipred_v, neon); 305 c->intra_pred[PAETH_PRED] = BF(dav1d_ipred_paeth, neon); 306 c->intra_pred[SMOOTH_PRED] = BF(dav1d_ipred_smooth, neon); 307 c->intra_pred[SMOOTH_V_PRED] = BF(dav1d_ipred_smooth_v, neon); 308 c->intra_pred[SMOOTH_H_PRED] = BF(dav1d_ipred_smooth_h, neon); 309 #if ARCH_AARCH64 310 c->intra_pred[Z1_PRED] = ipred_z1_neon; 311 c->intra_pred[Z2_PRED] = ipred_z2_neon; 312 c->intra_pred[Z3_PRED] = ipred_z3_neon; 313 #endif 314 c->intra_pred[FILTER_PRED] = BF(dav1d_ipred_filter, neon); 315 316 c->cfl_pred[DC_PRED] = BF(dav1d_ipred_cfl, neon); 317 c->cfl_pred[DC_128_PRED] = BF(dav1d_ipred_cfl_128, neon); 318 c->cfl_pred[TOP_DC_PRED] = BF(dav1d_ipred_cfl_top, neon); 319 c->cfl_pred[LEFT_DC_PRED] = BF(dav1d_ipred_cfl_left, neon); 320 321 c->cfl_ac[DAV1D_PIXEL_LAYOUT_I420 - 1] = BF(dav1d_ipred_cfl_ac_420, neon); 322 c->cfl_ac[DAV1D_PIXEL_LAYOUT_I422 - 1] = BF(dav1d_ipred_cfl_ac_422, neon); 323 c->cfl_ac[DAV1D_PIXEL_LAYOUT_I444 - 1] = BF(dav1d_ipred_cfl_ac_444, neon); 324 325 c->pal_pred = BF(dav1d_pal_pred, neon); 326 }