sad_sse2.asm (13178B)
;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; SAD_FN: shared prologue for every SAD kernel below. Emits the function
; entry via x86inc's cglobal, names the argument registers, sign-extends
; the int strides to pointer width, and for the "skip" variant doubles both
; strides so the loop visits every other row.
;
; Macro Arguments
; Arg 1: Width
; Arg 2: Height
; Arg 3: Number of general purpose registers: 5 for 32-bit build, 6 for 64-bit
;        NOTE(review): callers below pass 5 or 7, and the %if tests in this
;        macro compare against 5/7 -- the "5/6" wording above looks stale;
;        confirm against upstream libaom.
; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
%macro SAD_FN 4
%if %4 == 0 ; normal sad
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%elif %4 == 2 ; skip
%if %3 == 5
cglobal sad_skip_%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad_skip_%1x%2, 4, %3, 6, src, src_stride, ref, ref_stride, \
                                  src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7

%else
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, 6, src, src_stride, \
                                                  ref, ref_stride, \
                                                  second_pred, \
                                                  src_stride3, ref_stride3
; No named register is left over for the row counter in the avg/7 case:
; keep it in r7d on x86-64, or in the arg-0 stack slot on x86-32.
%if AOM_ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; sad/avg/skip
%if %4 == 2; skip rows so double the stride
lea src_strided, [src_strided*2]
lea ref_strided, [ref_strided*2]
%endif ; %4 skip
; Sign-extend the 32-bit int strides to full pointer width (no-op where the
; d and q registers already coincide).
movsxdifnidn src_strideq, src_strided
movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
; Precompute 3*stride so the 4-rows-per-iteration kernels can address row 3.
lea src_stride3q, [src_strideq*3]
lea ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

; unsigned int aom_sad128x128_sse2(uint8_t *src, int src_stride,
;                                  uint8_t *ref, int ref_stride);
;
; SAD128XN: one 128-byte row per iteration, in eight 16-byte chunks.
; psadbw leaves two 16-bit partial SADs (one per 64-bit lane) per register;
; these are accumulated into m0 and folded together after the loop.
%macro SAD128XN 1-2 0
  SAD_FN 128, %1, 5, %2
%if %2 == 2
  ; skip variant: half the iterations (SAD_FN already doubled the strides)
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0                      ; m0 = running SAD

.loop:
  ; first 64 bytes of the row
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  ; avg variant: average ref with the contiguous second_pred block first
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  ; second 64 bytes of the row
  movu                  m1, [refq+64]
  movu                  m2, [refq+80]
  movu                  m3, [refq+96]
  movu                  m4, [refq+112]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*4]
  pavgb                 m2, [second_predq+mmsize*5]
  pavgb                 m3, [second_predq+mmsize*6]
  pavgb                 m4, [second_predq+mmsize*7]
  lea         second_predq, [second_predq+mmsize*8] ; advance one 128-byte row
%endif
  psadbw                m1, [srcq+64]
  psadbw                m2, [srcq+80]
  psadbw                m3, [srcq+96]
  psadbw                m4, [srcq+112]

  add                 refq, ref_strideq
  add                 srcq, src_strideq

  paddd                 m1, m2
  paddd                 m3, m4
  paddd                 m0, m1
  paddd                 m0, m3

  sub              n_rowsd, 1
  jg .loop

  ; fold the high-qword partial sum into the low qword
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD128XN 128     ; sad128x128_sse2
SAD128XN 128, 1  ; sad128x128_avg_sse2
SAD128XN 128, 2  ; sad_skip_128x128_sse2
SAD128XN  64     ; sad128x64_sse2
SAD128XN  64, 1  ; sad128x64_avg_sse2
SAD128XN  64, 2  ; sad_skip_128x64_sse2


; unsigned int aom_sad64x64_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
;
; SAD64XN: one 64-byte row per iteration, four 16-byte chunks.
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
%if %2 == 2
  ; skip variant: half the iterations (strides already doubled by SAD_FN)
  mov              n_rowsd, %1/2
%else
  mov              n_rowsd, %1
%endif
  pxor                  m0, m0                      ; m0 = running SAD
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+32]
  movu                  m4, [refq+48]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4] ; advance one 64-byte row
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+32]
  psadbw                m4, [srcq+48]
  paddd                 m1, m2
  paddd                 m3, m4
  add                 refq, ref_strideq
  paddd                 m0, m1
  add                 srcq, src_strideq
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; fold the high-qword partial sum into the low qword
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 128     ; sad64x128_sse2
SAD64XN  64     ; sad64x64_sse2
SAD64XN  32     ; sad64x32_sse2
SAD64XN 128, 1  ; sad64x128_avg_sse2
SAD64XN  64, 1  ; sad64x64_avg_sse2
SAD64XN  32, 1  ; sad64x32_avg_sse2
SAD64XN 128, 2  ; sad_skip_64x128_sse2
SAD64XN  64, 2  ; sad_skip_64x64_sse2
SAD64XN  32, 2  ; sad_skip_64x32_sse2
%if CONFIG_REALTIME_ONLY==0
SAD64XN  16     ; sad64x16_sse2
SAD64XN  16, 1  ; sad64x16_avg_sse2
SAD64XN  16, 2  ; sad_skip_64x16_sse2
%endif

; unsigned int aom_sad32x32_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref, int ref_stride);
;
; SAD32XN: two 32-byte rows per iteration (hence the halved counter).
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
%if %2 == 2
  mov              n_rowsd, %1/4                    ; skip: 2 rows/iter, every other row
%else
  mov              n_rowsd, %1/2                    ; 2 rows per iteration
%endif
  pxor                  m0, m0                      ; m0 = running SAD
.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+16]
  movu                  m3, [refq+ref_strideq]
  movu                  m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4] ; advance two 32-byte rows
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+16]
  psadbw                m3, [srcq+src_strideq]
  psadbw                m4, [srcq+src_strideq+16]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*2]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*2]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; fold the high-qword partial sum into the low qword
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64     ; sad32x64_sse2
SAD32XN 32     ; sad32x32_sse2
SAD32XN 16     ; sad32x16_sse2
SAD32XN 64, 1  ; sad32x64_avg_sse2
SAD32XN 32, 1  ; sad32x32_avg_sse2
SAD32XN 16, 1  ; sad32x16_avg_sse2
SAD32XN 64, 2  ; sad_skip_32x64_sse2
SAD32XN 32, 2  ; sad_skip_32x32_sse2
SAD32XN 16, 2  ; sad_skip_32x16_sse2
%if CONFIG_REALTIME_ONLY==0
SAD32XN  8     ; sad32x8_sse2
SAD32XN  8, 1  ; sad32x8_avg_sse2
%endif

; unsigned int aom_sad16x{8,16}_sse2(uint8_t *src, int src_stride,
;                                    uint8_t *ref, int ref_stride);
;
; SAD16XN: four 16-byte rows per iteration, using the 7-register prologue
; so src_stride3q/ref_stride3q are available to address row 3.
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8                    ; skip: 4 rows/iter, every other row
%else
  mov              n_rowsd, %1/4                    ; 4 rows per iteration
%endif
  pxor                  m0, m0                      ; m0 = running SAD

.loop:
  movu                  m1, [refq]
  movu                  m2, [refq+ref_strideq]
  movu                  m3, [refq+ref_strideq*2]
  movu                  m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  pavgb                 m3, [second_predq+mmsize*2]
  pavgb                 m4, [second_predq+mmsize*3]
  lea         second_predq, [second_predq+mmsize*4] ; advance four 16-byte rows
%endif
  psadbw                m1, [srcq]
  psadbw                m2, [srcq+src_strideq]
  psadbw                m3, [srcq+src_strideq*2]
  psadbw                m4, [srcq+src_stride3q]
  paddd                 m1, m2
  paddd                 m3, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m3
  dec              n_rowsd
  jg .loop

  ; fold the high-qword partial sum into the low qword
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32     ; sad16x32_sse2
SAD16XN 16     ; sad16x16_sse2
SAD16XN  8     ; sad16x8_sse2
SAD16XN 32, 1  ; sad16x32_avg_sse2
SAD16XN 16, 1  ; sad16x16_avg_sse2
SAD16XN  8, 1  ; sad16x8_avg_sse2
SAD16XN 32, 2  ; sad_skip_16x32_sse2
SAD16XN 16, 2  ; sad_skip_16x16_sse2
%if CONFIG_REALTIME_ONLY==0
SAD16XN 64     ; sad16x64_sse2
SAD16XN  4     ; sad16x4_sse2
SAD16XN 64, 1  ; sad16x64_avg_sse2
SAD16XN 64, 2  ; sad_skip_16x64_sse2
%endif

; unsigned int aom_sad8x{8,16}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
;
; SAD8XN: four 8-byte rows per iteration. Two rows are packed into each xmm
; register (movh fills the low qword, movhps the high qword), so one psadbw
; covers two rows at once.
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8                    ; skip: 4 rows/iter, every other row
%else
  mov              n_rowsd, %1/4                    ; 4 rows per iteration
%endif
  pxor                  m0, m0                      ; m0 = running SAD

.loop:
  movh                  m1, [refq]
  movhps                m1, [refq+ref_strideq]
  movh                  m2, [refq+ref_strideq*2]
  movhps                m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  pavgb                 m2, [second_predq+mmsize*1]
  lea         second_predq, [second_predq+mmsize*2] ; advance four 8-byte rows
%endif
  movh                  m3, [srcq]
  movhps                m3, [srcq+src_strideq]
  movh                  m4, [srcq+src_strideq*2]
  movhps                m4, [srcq+src_stride3q]
  psadbw                m1, m3
  psadbw                m2, m4
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  paddd                 m0, m2
  dec              n_rowsd
  jg .loop

  ; fold the high-qword partial sum into the low qword
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16     ; sad8x16_sse2
SAD8XN  8     ; sad8x8_sse2
SAD8XN  4     ; sad8x4_sse2
SAD8XN 16, 1  ; sad8x16_avg_sse2
SAD8XN  8, 1  ; sad8x8_avg_sse2
SAD8XN 16, 2  ; sad_skip_8x16_sse2
SAD8XN  8, 2  ; sad_skip_8x8_sse2
%if CONFIG_REALTIME_ONLY==0
SAD8XN 32     ; sad8x32_sse2
SAD8XN 32, 1  ; sad8x32_avg_sse2
SAD8XN 32, 2  ; sad_skip_8x32_sse2
%endif

; unsigned int aom_sad4x{4, 8}_sse2(uint8_t *src, int src_stride,
;                                   uint8_t *ref, int ref_stride);
; SAD4XN: four 4-byte rows per iteration. The four dword rows are gathered
; into one xmm register (punpckldq pairs them, movlhps stacks the pairs),
; so a single psadbw covers all four rows.
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
%if %2 == 2
  mov              n_rowsd, %1/8                    ; skip: 4 rows/iter, every other row
%else
  mov              n_rowsd, %1/4                    ; 4 rows per iteration
%endif
  pxor                  m0, m0                      ; m0 = running SAD

.loop:
  ; pack four 4-byte ref rows into m1
  movd                  m1, [refq]
  movd                  m2, [refq+ref_strideq]
  movd                  m3, [refq+ref_strideq*2]
  movd                  m4, [refq+ref_stride3q]
  punpckldq             m1, m2
  punpckldq             m3, m4
  movlhps               m1, m3
%if %2 == 1
  pavgb                 m1, [second_predq+mmsize*0]
  lea         second_predq, [second_predq+mmsize*1] ; advance four 4-byte rows
%endif
  ; pack four 4-byte src rows into m2 (m5 is the extra scratch register)
  movd                  m2, [srcq]
  movd                  m5, [srcq+src_strideq]
  movd                  m4, [srcq+src_strideq*2]
  movd                  m3, [srcq+src_stride3q]
  punpckldq             m2, m5
  punpckldq             m4, m3
  movlhps               m2, m4
  psadbw                m1, m2
  lea                 refq, [refq+ref_strideq*4]
  paddd                 m0, m1
  lea                 srcq, [srcq+src_strideq*4]
  dec              n_rowsd
  jg .loop

  ; fold the high-qword partial sum into the low qword
  movhlps               m1, m0
  paddd                 m0, m1
%if %2 == 2 ; we skipped rows, so now we need to double the sad
  pslld                 m0, 1
%endif
  movd                 eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD4XN  8     ; sad4x8_sse2
SAD4XN  4     ; sad4x4_sse2
%if CONFIG_REALTIME_ONLY==0
SAD4XN 16     ; sad4x16_sse2
SAD4XN 16, 2  ; sad_skip_4x16_sse2
%endif