pixelutils.asm (9873B)
;******************************************************************************
;* Pixel utilities SIMD
;*
;* Copyright (C) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (C) 2014 Clément Bœsch <u pkh me>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_8x8_mmxext(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;
; Sum of absolute differences over an 8x8 block of 8-bit pixels.
; In:  src1/src2 = block pointers, stride1/stride2 = row strides in bytes.
; Out: eax = total SAD.
; NOTE(review): mova (aligned load) is used on src1 — presumably both buffers
; are 8-byte aligned here; confirm against the callers in pixelutils.c.
;-------------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pixelutils_sad_8x8, 4,4,0, src1, stride1, src2, stride2
    pxor            m2, m2                  ; m2 = running SAD accumulator
%rep 4
    ; Two rows per iteration: psadbw sums the absolute differences of
    ; 8 byte pairs into a single 16-bit result.
    mova            m0, [src1q]
    mova            m1, [src1q + stride1q]
    psadbw          m0, [src2q]
    psadbw          m1, [src2q + stride2q]
    paddw           m2, m0
    paddw           m2, m1
    lea             src1q, [src1q + 2*stride1q]
    lea             src2q, [src2q + 2*stride2q]
%endrep
    movd            eax, m2                 ; return the accumulated SAD
    emms                                    ; clear MMX state before returning to FPU code
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                 const uint8_t *src2, ptrdiff_t stride2);
;
; SAD over a 16x16 block, no alignment assumptions (movu on both inputs).
; Out: eax = total SAD.
;-------------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_16x16, 4,4,5, src1, stride1, src2, stride2
    ; The first two rows seed the accumulator (m4); the remaining
    ; 14 rows are folded in by the %rep block below (7 x 2 rows).
    movu            m4, [src1q]
    movu            m2, [src2q]
    movu            m1, [src1q + stride1q]
    movu            m3, [src2q + stride2q]
    psadbw          m4, m2
    psadbw          m1, m3
    paddw           m4, m1
%rep 7
    lea             src1q, [src1q + 2*stride1q]
    lea             src2q, [src2q + 2*stride2q]
    movu            m0, [src1q]
    movu            m2, [src2q]
    movu            m1, [src1q + stride1q]
    movu            m3, [src2q + stride2q]
    psadbw          m0, m2
    psadbw          m1, m3
    paddw           m4, m0
    paddw           m4, m1
%endrep
    ; psadbw leaves one partial sum per 64-bit lane; fold high lane into low.
    movhlps         m0, m4
    paddw           m4, m0
    movd            eax, m4
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_16x16_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;
; %1 selects the load form for src2: 'a' (mova, aligned) or 'u' (movu).
; NOTE(review): src1 is always used as a psadbw memory operand, which with
; legacy SSE encoding requires 16-byte alignment — presumably src1 is
; guaranteed aligned for both variants; confirm against pixelutils.c.
;-------------------------------------------------------------------------------
%macro SAD_XMM_16x16 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_16x16, 4,4,3, src1, stride1, src2, stride2
    mov%1           m2, [src2q]
    psadbw          m2, [src1q]
    mov%1           m1, [src2q + stride2q]
    psadbw          m1, [src1q + stride1q]
    paddw           m2, m1
%rep 7
    lea             src1q, [src1q + 2*stride1q]
    lea             src2q, [src2q + 2*stride2q]
    mov%1           m0, [src2q]
    psadbw          m0, [src1q]
    mov%1           m1, [src2q + stride2q]
    psadbw          m1, [src1q + stride1q]
    paddw           m2, m0
    paddw           m2, m1
%endrep
    movhlps         m0, m2
    paddw           m2, m0
    movd            eax, m2
    RET
%endmacro

SAD_XMM_16x16 a
SAD_XMM_16x16 u


; Accumulate the SAD of 4 rows of a 32-byte-wide block into m0 (dword lanes),
; advancing r0 (src1) and r2 (src2) by their strides r1/r3. Unaligned loads
; on both inputs. Clobbers m1-m4.
%macro PROCESS_SAD_32x4_U 0
    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    movu            m1, [r2]
    movu            m2, [r2 + 16]
    movu            m3, [r0]
    movu            m4, [r0 + 16]
    psadbw          m1, m3
    psadbw          m2, m4
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]
%endmacro

; Same as PROCESS_SAD_32x4_U, but %1 selects mova/movu for the r2 (src2)
; loads and r0 (src1) is accessed via psadbw memory operands. Clobbers m1-m2.
%macro PROCESS_SAD_32x4 1
    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]

    mov%1           m1, [r2]
    mov%1           m2, [r2 + 16]
    psadbw          m1, [r0]
    psadbw          m2, [r0 + 16]
    paddd           m1, m2
    paddd           m0, m1
    lea             r2, [r2 + r3]
    lea             r0, [r0 + r1]
%endmacro

;-----------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;
; SAD over a 32x32 block, no alignment assumptions.
; 4 loop iterations x 2 macro calls x 4 rows = 32 rows.
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pixelutils_sad_32x32, 4,5,5, src1, stride1, src2, stride2
    pxor            m0, m0                  ; m0 = dword SAD accumulator
    mov             r4d, 4
.loop:
    PROCESS_SAD_32x4_U
    PROCESS_SAD_32x4_U
    dec             r4d
    jnz             .loop

    ; Fold the high 64-bit lane into the low one and return the sum.
    movhlps         m1, m0
    paddd           m0, m1
    movd            eax, m0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_sse2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
; Aligned/unaligned 32x32 SAD variants; %1 = 'a' or 'u' (mova/movu on src2).
%macro SAD_XMM_32x32 1
INIT_XMM sse2
cglobal pixelutils_sad_%1_32x32, 4,5,3, src1, stride1, src2, stride2
    pxor            m0, m0                  ; m0 = dword SAD accumulator
    mov             r4d, 4                  ; 4 iterations x 8 rows = 32 rows
.loop:
    PROCESS_SAD_32x4 %1
    PROCESS_SAD_32x4 %1
    dec             r4d
    jnz             .loop

    ; Fold the high 64-bit lane into the low one and return the sum.
    movhlps         m1, m0
    paddd           m0, m1
    movd            eax, m0
    RET
%endmacro

SAD_XMM_32x32 a
SAD_XMM_32x32 u

%if HAVE_AVX2_EXTERNAL
;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                  const uint8_t *src2, ptrdiff_t stride2);
;
; SAD over a 32x32 block using 32-byte ymm loads: one load covers a full row,
; 4 rows per iteration, 8 iterations. No alignment assumptions (movu).
;-------------------------------------------------------------------------------
INIT_YMM avx2
cglobal pixelutils_sad_32x32, 4,7,5, src1, stride1, src2, stride2
    pxor            m0, m0                  ; m0 = dword SAD accumulator
    mov             r4d, 32/4               ; rows / rows-per-iteration
    lea             r5, [stride1q * 3]      ; precompute 3*stride for row 3
    lea             r6, [stride2q * 3]

.loop:
    movu            m1, [src1q]                 ; row 0 of pix0
    movu            m2, [src2q]                 ; row 0 of pix1
    movu            m3, [src1q + stride1q]      ; row 1 of pix0
    movu            m4, [src2q + stride2q]      ; row 1 of pix1

    psadbw          m1, m2
    psadbw          m3, m4
    paddd           m0, m1
    paddd           m0, m3

    movu            m1, [src1q + 2 * stride1q]  ; row 2 of pix0
    movu            m2, [src2q + 2 * stride2q]  ; row 2 of pix1
    movu            m3, [src1q + r5]            ; row 3 of pix0
    movu            m4, [src2q + r6]            ; row 3 of pix1

    psadbw          m1, m2
    psadbw          m3, m4
    paddd           m0, m1
    paddd           m0, m3

    lea             src2q, [src2q + 4 * stride2q]
    lea             src1q, [src1q + 4 * stride1q]

    dec             r4d
    jnz             .loop

    ; Horizontal reduction: high 128-bit lane, then high qword, into xm0[0].
    vextracti128    xm1, m0, 1
    paddd           xm0, xm1
    pshufd          xm1, xm0, 2
    paddd           xm0, xm1
    movd            eax, xm0
    RET

;-------------------------------------------------------------------------------
; int ff_pixelutils_sad_[au]_32x32_avx2(const uint8_t *src1, ptrdiff_t stride1,
;                                       const uint8_t *src2, ptrdiff_t stride2);
;-------------------------------------------------------------------------------
; Aligned/unaligned AVX2 32x32 SAD variants; %1 = 'a' or 'u' (mova/movu on
; src2). src1 rows are consumed as psadbw memory operands (VEX encoding
; permits unaligned memory operands).
%macro SAD_AVX2_32x32 1
INIT_YMM avx2
cglobal pixelutils_sad_%1_32x32, 4,7,3, src1, stride1, src2, stride2
    pxor            m0, m0                  ; m0 = dword SAD accumulator
    mov             r4d, 32/4               ; rows / rows-per-iteration
    lea             r5, [stride1q * 3]      ; precompute 3*stride for row 3
    lea             r6, [stride2q * 3]

.loop:
    mov%1           m1, [src2q]                 ; row 0 of pix1
    psadbw          m1, [src1q]
    mov%1           m2, [src2q + stride2q]      ; row 1 of pix1
    psadbw          m2, [src1q + stride1q]

    paddd           m0, m1
    paddd           m0, m2

    mov%1           m1, [src2q + 2 * stride2q]  ; row 2 of pix1
    psadbw          m1, [src1q + 2 * stride1q]
    mov%1           m2, [src2q + r6]            ; row 3 of pix1
    psadbw          m2, [src1q + r5]

    paddd           m0, m1
    paddd           m0, m2

    lea             src2q, [src2q + 4 * stride2q]
    lea             src1q, [src1q + 4 * stride1q]

    dec             r4d
    jnz             .loop

    ; Horizontal reduction: high 128-bit lane, then high qword, into xm0[0].
    vextracti128    xm1, m0, 1
    paddd           xm0, xm1
    pshufd          xm1, xm0, 2
    paddd           xm0, xm1
    movd            eax, xm0
    RET
%endmacro

SAD_AVX2_32x32 a
SAD_AVX2_32x32 u
%endif