subtract_neon.c (6474B)
1 /* 2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved. 3 * 4 * This source code is subject to the terms of the BSD 2 Clause License and 5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License 6 * was not distributed with this source code in the LICENSE file, you can 7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open 8 * Media Patent License 1.0 was not distributed with this source code in the 9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 10 */ 11 12 #include <arm_neon.h> 13 14 #include "config/aom_config.h" 15 #include "config/aom_dsp_rtcd.h" 16 17 #include "aom/aom_integer.h" 18 #include "aom_ports/mem.h" 19 20 void aom_subtract_block_neon(int rows, int cols, int16_t *diff, 21 ptrdiff_t diff_stride, const uint8_t *src, 22 ptrdiff_t src_stride, const uint8_t *pred, 23 ptrdiff_t pred_stride) { 24 if (cols > 16) { 25 int r = rows; 26 do { 27 int c = 0; 28 do { 29 const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); 30 const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); 31 const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); 32 const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); 33 const uint16x8_t v_diff_lo_00 = 34 vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); 35 const uint16x8_t v_diff_hi_00 = 36 vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); 37 const uint16x8_t v_diff_lo_16 = 38 vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); 39 const uint16x8_t v_diff_hi_16 = 40 vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); 41 vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); 42 vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); 43 vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); 44 vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); 45 c += 32; 46 } while (c < cols); 47 diff += diff_stride; 48 pred += pred_stride; 49 src += src_stride; 50 } while (--r != 0); 51 } else if (cols > 8) { 52 int r = rows; 53 do { 54 const uint8x16_t v_src = vld1q_u8(&src[0]); 55 const uint8x16_t v_pred = vld1q_u8(&pred[0]); 56 const uint16x8_t v_diff_lo = 57 vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); 58 const uint16x8_t v_diff_hi = 59 vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); 60 vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); 61 vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); 62 diff += diff_stride; 63 pred += pred_stride; 64 src += src_stride; 65 } while (--r != 0); 66 } else if (cols > 4) { 67 int r = rows; 68 do { 69 const uint8x8_t v_src = vld1_u8(&src[0]); 70 const uint8x8_t v_pred = vld1_u8(&pred[0]); 71 const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); 72 vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); 73 diff += diff_stride; 74 pred += pred_stride; 75 src += src_stride; 76 } while (--r != 0); 77 } else { 78 int r = rows; 79 do { 80 int c = 0; 81 do { 82 diff[c] = src[c] - pred[c]; 83 } while (++c < cols); 84 diff += diff_stride; 85 pred += pred_stride; 86 src += src_stride; 87 } while (--r != 0); 88 } 89 } 90 91 #if CONFIG_AV1_HIGHBITDEPTH 92 void aom_highbd_subtract_block_neon(int rows, int cols, int16_t *diff, 93 ptrdiff_t diff_stride, const uint8_t *src8, 94 ptrdiff_t src_stride, const uint8_t *pred8, 95 ptrdiff_t pred_stride) { 96 uint16_t *src = CONVERT_TO_SHORTPTR(src8); 97 uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); 98 99 if (cols > 16) { 100 int r = rows; 101 do { 102 int c = 0; 103 do { 104 const uint16x8_t v_src_00 = vld1q_u16(&src[c + 0]); 105 const uint16x8_t v_pred_00 = vld1q_u16(&pred[c + 0]); 106 const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00); 107 const uint16x8_t v_src_08 = vld1q_u16(&src[c + 8]); 108 const uint16x8_t v_pred_08 = vld1q_u16(&pred[c + 8]); 109 const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08); 110 vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_00)); 111 vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_08)); 112 c += 16; 113 } while (c < cols); 114 diff += diff_stride; 115 pred += pred_stride; 116 src += src_stride; 117 } while (--r != 0); 118 } else if (cols > 8) { 119 int r = rows; 120 do { 121 const uint16x8_t v_src_00 = vld1q_u16(&src[0]); 122 const uint16x8_t v_pred_00 = vld1q_u16(&pred[0]); 123 const uint16x8_t v_diff_00 = vsubq_u16(v_src_00, v_pred_00); 124 const uint16x8_t v_src_08 = vld1q_u16(&src[8]); 125 const uint16x8_t v_pred_08 = vld1q_u16(&pred[8]); 126 const uint16x8_t v_diff_08 = vsubq_u16(v_src_08, v_pred_08); 127 vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_00)); 128 vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_08)); 129 diff += diff_stride; 130 pred += pred_stride; 131 src += src_stride; 132 } while (--r != 0); 133 } else if (cols > 4) { 134 int r = rows; 135 do { 136 const uint16x8_t v_src_r0 = vld1q_u16(&src[0]); 137 const uint16x8_t v_src_r1 = vld1q_u16(&src[src_stride]); 138 const uint16x8_t v_pred_r0 = vld1q_u16(&pred[0]); 139 const uint16x8_t v_pred_r1 = vld1q_u16(&pred[pred_stride]); 140 const uint16x8_t v_diff_r0 = vsubq_u16(v_src_r0, v_pred_r0); 141 const uint16x8_t v_diff_r1 = vsubq_u16(v_src_r1, v_pred_r1); 142 vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_r0)); 143 vst1q_s16(&diff[diff_stride], vreinterpretq_s16_u16(v_diff_r1)); 144 diff += diff_stride << 1; 145 pred += pred_stride << 1; 146 src += src_stride << 1; 147 r -= 2; 148 } while (r != 0); 149 } else { 150 int r = rows; 151 do { 152 const uint16x4_t v_src_r0 = vld1_u16(&src[0]); 153 const uint16x4_t v_src_r1 = vld1_u16(&src[src_stride]); 154 const uint16x4_t v_pred_r0 = vld1_u16(&pred[0]); 155 const uint16x4_t v_pred_r1 = vld1_u16(&pred[pred_stride]); 156 const uint16x4_t v_diff_r0 = vsub_u16(v_src_r0, v_pred_r0); 157 const uint16x4_t v_diff_r1 = vsub_u16(v_src_r1, v_pred_r1); 158 vst1_s16(&diff[0], vreinterpret_s16_u16(v_diff_r0)); 159 vst1_s16(&diff[diff_stride], vreinterpret_s16_u16(v_diff_r1)); 160 diff += diff_stride << 1; 161 pred += pred_stride << 1; 162 src += src_stride << 1; 163 r -= 2; 164 } while (r != 0); 165 } 166 } 167 #endif // CONFIG_AV1_HIGHBITDEPTH