tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_sad_sse2.asm (16979B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "third_party/x86inc/x86inc.asm"
     15 
     16 SECTION .text
     17 
     18 ; Macro Arguments
     19 ; Arg 1: Width
     20 ; Arg 2: Height
     21 ; Arg 3: Number of general purpose registers: 5, or 7 to also get src_stride3/ref_stride3
     22 ; Arg 4: Type of function: if 0, normal sad; if 1, avg; if 2, skip rows
     23 ; Arg 5: Number of xmm registers (optional, default 7). 8xh needs 8, others only need 7
     24 %macro HIGH_SAD_FN 4-5 7 ; declares the function via cglobal, then does the shared stride/pointer setup
     25 %if %4 == 0 ; normal sad: 4 arguments
     26 %if %3 == 5
     27 cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
     28 %else ; %3 == 7
     29 cglobal highbd_sad%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
     30                            src_stride3, ref_stride3, n_rows
     31 %endif ; %3 == 5/7
     32 %elif %4 == 1 ; avg: 5 arguments, the fifth being second_pred
     33 %if %3 == 5
     34 cglobal highbd_sad%1x%2_avg, 5, 1 + %3, %5, src, src_stride, ref, ref_stride, \
     35                                    second_pred, n_rows
     36 %else ; %3 == 7
     37 cglobal highbd_sad%1x%2_avg, 5, AOM_ARCH_X86_64 + %3, %5, src, src_stride, \
     38                                              ref, ref_stride, \
     39                                              second_pred, \
     40                                              src_stride3, ref_stride3
     41 %if AOM_ARCH_X86_64 ; n_rows is not in the name list above, so n_rowsd is mapped by hand
     42 %define n_rowsd r7d
     43 %else ; x86-32: the row counter lives in the r0m memory operand instead of a register
     44 %define n_rowsd dword r0m
     45 %endif ; x86-32/64
     46 %endif ; %3 == 5/7
     47 %else  ; %4 == 2, skip rows
     48 %if %3 == 5
     49 cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, n_rows
     50 %else ; %3 == 7
     51 cglobal highbd_sad_skip_%1x%2, 4, %3, %5, src, src_stride, ref, ref_stride, \
     52                            src_stride3, ref_stride3, n_rows
     53 %endif ; %3 == 5/7
     54 %endif ; sad/avg/skip
     55 %if %4 == 2  ; double the stride if we are skipping rows
     56  lea          src_strided, [src_strided*2]
     57  lea          ref_strided, [ref_strided*2]
     58 %endif
     59  movsxdifnidn src_strideq, src_strided ; x86inc helper; presumably sign-extends the 32-bit stride to full register width
     60  movsxdifnidn ref_strideq, ref_strided
     61 %if %3 == 7
     62  lea         src_stride3q, [src_strideq*3] ; 3 * stride, used by callers to address the fourth row of a group
     63  lea         ref_stride3q, [ref_strideq*3]
     64 %endif ; %3 == 7
     65 ; convert src, ref & second_pred to short ptrs (from byte ptrs)
     66  shl                 srcq, 1
     67  shl                 refq, 1
     68 %if %4 == 1
     69  shl         second_predq, 1
     70 %endif
     71 %endmacro
     72 
     73 ; unsigned int aom_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
     74 ;                                    uint8_t *ref, int ref_stride);
     75 %macro HIGH_SAD64XN 1-2 0 ; %1 = height, %2 = function type as in HIGH_SAD_FN arg 4 (default 0)
     76  HIGH_SAD_FN 64, %1, 5, %2
     77 %if %2 == 2  ; skip rows, so divide number of rows by 2
     78  mov              n_rowsd, %1/2
     79 %else
     80  mov              n_rowsd, %1
     81 %endif
     82  pxor                  m0, m0 ; m0 = running SAD, accumulated in 32-bit lanes
     83  pxor                  m6, m6 ; m6 = zero, interleaved in to widen words to dwords
     84 
     85 .loop: ; one row per iteration: 128 bytes of 16-bit samples, handled as two 64-byte halves
     86  ; first half of each row
     87  movu                  m1, [refq]
     88  movu                  m2, [refq+16]
     89  movu                  m3, [refq+32]
     90  movu                  m4, [refq+48]
     91 %if %2 == 1 ; avg: compare src against the rounded average of ref and second_pred
     92  pavgw                 m1, [second_predq+mmsize*0]
     93  pavgw                 m2, [second_predq+mmsize*1]
     94  pavgw                 m3, [second_predq+mmsize*2]
     95  pavgw                 m4, [second_predq+mmsize*3]
     96  lea         second_predq, [second_predq+mmsize*4] ; second_pred is consumed contiguously, no stride
     97 %endif
     98  mova                  m5, [srcq] ; |src - ref| per word = sat(src - ref) | sat(ref - src)
     99  psubusw               m5, m1
    100  psubusw               m1, [srcq]
    101  por                   m1, m5
    102  mova                  m5, [srcq+16]
    103  psubusw               m5, m2
    104  psubusw               m2, [srcq+16]
    105  por                   m2, m5
    106  mova                  m5, [srcq+32]
    107  psubusw               m5, m3
    108  psubusw               m3, [srcq+32]
    109  por                   m3, m5
    110  mova                  m5, [srcq+48]
    111  psubusw               m5, m4
    112  psubusw               m4, [srcq+48]
    113  por                   m4, m5
    114  paddw                 m1, m2 ; pairwise word sums of the absolute differences
    115  paddw                 m3, m4
    116  movhlps               m2, m1 ; fold the upper four words onto the lower four
    117  movhlps               m4, m3
    118  paddw                 m1, m2
    119  paddw                 m3, m4
    120  punpcklwd             m1, m6 ; zero-extend the four low word sums to dwords
    121  punpcklwd             m3, m6
    122  paddd                 m0, m1
    123  paddd                 m0, m3
    124  ; second half of each row
    125  movu                  m1, [refq+64]
    126  movu                  m2, [refq+80]
    127  movu                  m3, [refq+96]
    128  movu                  m4, [refq+112]
    129 %if %2 == 1
    130  pavgw                 m1, [second_predq+mmsize*0]
    131  pavgw                 m2, [second_predq+mmsize*1]
    132  pavgw                 m3, [second_predq+mmsize*2]
    133  pavgw                 m4, [second_predq+mmsize*3]
    134  lea         second_predq, [second_predq+mmsize*4]
    135 %endif
    136  mova                  m5, [srcq+64]
    137  psubusw               m5, m1
    138  psubusw               m1, [srcq+64]
    139  por                   m1, m5
    140  mova                  m5, [srcq+80]
    141  psubusw               m5, m2
    142  psubusw               m2, [srcq+80]
    143  por                   m2, m5
    144  mova                  m5, [srcq+96]
    145  psubusw               m5, m3
    146  psubusw               m3, [srcq+96]
    147  por                   m3, m5
    148  mova                  m5, [srcq+112]
    149  psubusw               m5, m4
    150  psubusw               m4, [srcq+112]
    151  por                   m4, m5
    152  paddw                 m1, m2
    153  paddw                 m3, m4
    154  movhlps               m2, m1
    155  movhlps               m4, m3
    156  paddw                 m1, m2
    157  paddw                 m3, m4
    158  punpcklwd             m1, m6
    159  punpcklwd             m3, m6
    160  lea                 refq, [refq+ref_strideq*2] ; next row: stride is scaled by 2 to get bytes
    161  paddd                 m0, m1
    162  lea                 srcq, [srcq+src_strideq*2]
    163  paddd                 m0, m3
    164 
    165  dec              n_rowsd
    166  jg .loop ; repeat while the decremented row count is still positive
    167 
    168  movhlps               m1, m0 ; horizontal reduction: add dwords 2,3 onto dwords 0,1
    169  paddd                 m0, m1
    170  punpckldq             m0, m6 ; spread the two partial sums to dwords 0 and 2
    171  movhlps               m1, m0
    172  paddd                 m0, m1 ; dword 0 = total SAD
    173 %if %2 == 2  ; we skipped rows, so we need to double the sad
    174  pslld                 m0, 1
    175 %endif
    176  movd                 eax, m0 ; result is returned in eax
    177  RET
    178 %endmacro
    179 
    180 INIT_XMM sse2 ; x86inc: select XMM registers; the trailing comments below give the resulting names
    181 HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
    182 HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
    183 HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
    184 HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2
    185 HIGH_SAD64XN 64, 2 ; highbd_sad_skip_64x64_sse2
    186 HIGH_SAD64XN 32, 2 ; highbd_sad_skip_64x32_sse2
    187 %if CONFIG_REALTIME_ONLY==0 ; the 64x16 variants are only assembled when CONFIG_REALTIME_ONLY is 0
    188 HIGH_SAD64XN 16 ; highbd_sad64x16_sse2
    189 HIGH_SAD64XN 16, 1 ; highbd_sad64x16_avg_sse2
    190 HIGH_SAD64XN 16, 2 ; highbd_sad_skip_64x16_sse2
    191 %endif
    192 
    193 ; unsigned int aom_highbd_sad32x{8,16,32,64}_sse2(uint8_t *src, int src_stride,
    194 ;                                    uint8_t *ref, int ref_stride);
    195 %macro HIGH_SAD32XN 1-2 0 ; %1 = height, %2 = function type as in HIGH_SAD_FN arg 4 (default 0)
    196  HIGH_SAD_FN 32, %1, 5, %2
    197 %if %2 == 2  ; skip rows, so divide number of rows by 2
    198  mov              n_rowsd, %1/2
    199 %else
    200  mov              n_rowsd, %1
    201 %endif
    202  pxor                  m0, m0 ; m0 = running SAD, accumulated in 32-bit lanes
    203  pxor                  m6, m6 ; m6 = zero, interleaved in to widen words to dwords
    204 
    205 .loop: ; one row per iteration: 64 bytes of 16-bit samples in four vectors
    206  movu                  m1, [refq]
    207  movu                  m2, [refq+16]
    208  movu                  m3, [refq+32]
    209  movu                  m4, [refq+48]
    210 %if %2 == 1 ; avg: compare src against the rounded average of ref and second_pred
    211  pavgw                 m1, [second_predq+mmsize*0]
    212  pavgw                 m2, [second_predq+mmsize*1]
    213  pavgw                 m3, [second_predq+mmsize*2]
    214  pavgw                 m4, [second_predq+mmsize*3]
    215  lea         second_predq, [second_predq+mmsize*4] ; second_pred is consumed contiguously, no stride
    216 %endif
    217  mova                  m5, [srcq] ; |src - ref| per word = sat(src - ref) | sat(ref - src)
    218  psubusw               m5, m1
    219  psubusw               m1, [srcq]
    220  por                   m1, m5
    221  mova                  m5, [srcq+16]
    222  psubusw               m5, m2
    223  psubusw               m2, [srcq+16]
    224  por                   m2, m5
    225  mova                  m5, [srcq+32]
    226  psubusw               m5, m3
    227  psubusw               m3, [srcq+32]
    228  por                   m3, m5
    229  mova                  m5, [srcq+48]
    230  psubusw               m5, m4
    231  psubusw               m4, [srcq+48]
    232  por                   m4, m5
    233  paddw                 m1, m2 ; pairwise word sums of the absolute differences
    234  paddw                 m3, m4
    235  movhlps               m2, m1 ; fold the upper four words onto the lower four
    236  movhlps               m4, m3
    237  paddw                 m1, m2
    238  paddw                 m3, m4
    239  punpcklwd             m1, m6 ; zero-extend the four low word sums to dwords
    240  punpcklwd             m3, m6
    241  lea                 refq, [refq+ref_strideq*2] ; next row: stride is scaled by 2 to get bytes
    242  paddd                 m0, m1
    243  lea                 srcq, [srcq+src_strideq*2]
    244  paddd                 m0, m3
    245  dec              n_rowsd
    246  jg .loop ; repeat while the decremented row count is still positive
    247 
    248  movhlps               m1, m0 ; horizontal reduction: add dwords 2,3 onto dwords 0,1
    249  paddd                 m0, m1
    250  punpckldq             m0, m6 ; spread the two partial sums to dwords 0 and 2
    251  movhlps               m1, m0
    252  paddd                 m0, m1 ; dword 0 = total SAD
    253 %if %2 == 2  ; we skipped rows, so we need to double the sad
    254  pslld                 m0, 1
    255 %endif
    256  movd                 eax, m0 ; result is returned in eax
    257  RET
    258 %endmacro
    259 
    260 INIT_XMM sse2 ; x86inc: select XMM registers; the trailing comments below give the resulting names
    261 HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
    262 HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
    263 HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
    264 HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
    265 HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
    266 HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2
    267 HIGH_SAD32XN 64, 2 ; highbd_sad_skip_32x64_sse2
    268 HIGH_SAD32XN 32, 2 ; highbd_sad_skip_32x32_sse2
    269 HIGH_SAD32XN 16, 2 ; highbd_sad_skip_32x16_sse2
    270 %if CONFIG_REALTIME_ONLY==0 ; the 32x8 variants are only assembled when CONFIG_REALTIME_ONLY is 0
    271 HIGH_SAD32XN  8 ; highbd_sad32x8_sse2
    272 HIGH_SAD32XN  8, 1 ; highbd_sad32x8_avg_sse2
    273 %endif
    274 
    275 ; unsigned int aom_highbd_sad16x{4,8,16,32,64}_sse2(uint8_t *src, int src_stride,
    276 ;                                    uint8_t *ref, int ref_stride);
    277 %macro HIGH_SAD16XN 1-2 0 ; %1 = height, %2 = function type as in HIGH_SAD_FN arg 4 (default 0)
    278  HIGH_SAD_FN 16, %1, 5, %2
    279 %if %2 == 2  ; skip rows: half the rows, two rows per iteration, so height/4 iterations
    280  mov              n_rowsd, %1/4
    281 %else
    282  mov              n_rowsd, %1/2
    283 %endif
    284  pxor                  m0, m0 ; m0 = running SAD, accumulated in 32-bit lanes
    285  pxor                  m6, m6 ; m6 = zero, interleaved in to widen words to dwords
    286 
    287 .loop: ; two rows per iteration, 32 bytes (two vectors) per row
    288  movu                  m1, [refq]
    289  movu                  m2, [refq+16]
    290  movu                  m3, [refq+ref_strideq*2] ; second row of the pair
    291  movu                  m4, [refq+ref_strideq*2+16]
    292 %if %2 == 1 ; avg: compare src against the rounded average of ref and second_pred
    293  pavgw                 m1, [second_predq+mmsize*0]
    294  pavgw                 m2, [second_predq+16]
    295  pavgw                 m3, [second_predq+mmsize*2]
    296  pavgw                 m4, [second_predq+mmsize*2+16]
    297  lea         second_predq, [second_predq+mmsize*4] ; second_pred is consumed contiguously, no stride
    298 %endif
    299  mova                  m5, [srcq] ; |src - ref| per word = sat(src - ref) | sat(ref - src)
    300  psubusw               m5, m1
    301  psubusw               m1, [srcq]
    302  por                   m1, m5
    303  mova                  m5, [srcq+16]
    304  psubusw               m5, m2
    305  psubusw               m2, [srcq+16]
    306  por                   m2, m5
    307  mova                  m5, [srcq+src_strideq*2]
    308  psubusw               m5, m3
    309  psubusw               m3, [srcq+src_strideq*2]
    310  por                   m3, m5
    311  mova                  m5, [srcq+src_strideq*2+16]
    312  psubusw               m5, m4
    313  psubusw               m4, [srcq+src_strideq*2+16]
    314  por                   m4, m5
    315  paddw                 m1, m2 ; pairwise word sums of the absolute differences
    316  paddw                 m3, m4
    317  movhlps               m2, m1 ; fold the upper four words onto the lower four
    318  movhlps               m4, m3
    319  paddw                 m1, m2
    320  paddw                 m3, m4
    321  punpcklwd             m1, m6 ; zero-extend the four low word sums to dwords
    322  punpcklwd             m3, m6
    323  lea                 refq, [refq+ref_strideq*4] ; advance two rows
    324  paddd                 m0, m1
    325  lea                 srcq, [srcq+src_strideq*4]
    326  paddd                 m0, m3
    327  dec              n_rowsd
    328  jg .loop ; repeat while the decremented iteration count is still positive
    329 
    330  movhlps               m1, m0 ; horizontal reduction: add dwords 2,3 onto dwords 0,1
    331  paddd                 m0, m1
    332  punpckldq             m0, m6 ; spread the two partial sums to dwords 0 and 2
    333  movhlps               m1, m0
    334  paddd                 m0, m1 ; dword 0 = total SAD
    335 %if %2 == 2  ; we skipped rows, so we need to double the sad
    336  pslld                 m0, 1
    337 %endif
    338  movd                 eax, m0 ; result is returned in eax
    339  RET
    340 %endmacro
    341 
    342 INIT_XMM sse2 ; x86inc: select XMM registers; the trailing comments below give the resulting names
    343 HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
    344 HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
    345 HIGH_SAD16XN  8 ; highbd_sad16x8_sse2
    346 HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
    347 HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
    348 HIGH_SAD16XN  8, 1 ; highbd_sad16x8_avg_sse2
    349 HIGH_SAD16XN 32, 2 ; highbd_sad_skip_16x32_sse2
    350 HIGH_SAD16XN 16, 2 ; highbd_sad_skip_16x16_sse2
    351 %if CONFIG_REALTIME_ONLY==0 ; the 16x64 and 16x4 variants are only assembled when CONFIG_REALTIME_ONLY is 0
    352 HIGH_SAD16XN 64 ; highbd_sad16x64_sse2
    353 HIGH_SAD16XN  4 ; highbd_sad16x4_sse2
    354 HIGH_SAD16XN 64, 1 ; highbd_sad16x64_avg_sse2
    355 HIGH_SAD16XN 64, 2 ; highbd_sad_skip_16x64_sse2
    356 %endif
    357 
    358 ; unsigned int aom_highbd_sad8x{4,8,16,32}_sse2(uint8_t *src, int src_stride,
    359 ;                                    uint8_t *ref, int ref_stride);
    360 %macro HIGH_SAD8XN 1-2 0 ; %1 = height, %2 = function type as in HIGH_SAD_FN arg 4 (default 0)
    361  HIGH_SAD_FN 8, %1, 7, %2, 8
    362 %if %2 == 2  ; skip rows: half the rows, four rows per iteration, so height/8 iterations
    363  mov              n_rowsd, %1/8
    364 %else
    365  mov              n_rowsd, %1/4
    366 %endif
    367  pxor                  m0, m0 ; m0 = running SAD, accumulated in 32-bit lanes
    368  pxor                  m6, m6 ; m6 = zero, interleaved in to widen words to dwords
    369 
    370 .loop: ; four rows per iteration, one 16-byte vector per row
    371  movu                  m1, [refq]
    372  movu                  m2, [refq+ref_strideq*2]
    373  movu                  m3, [refq+ref_strideq*4]
    374  movu                  m4, [refq+ref_stride3q*2] ; fourth row, via the precomputed 3 * stride
    375 %if %2 == 1 ; avg: compare src against the rounded average of ref and second_pred
    376  pavgw                 m1, [second_predq+mmsize*0]
    377  pavgw                 m2, [second_predq+mmsize*1]
    378  pavgw                 m3, [second_predq+mmsize*2]
    379  pavgw                 m4, [second_predq+mmsize*3]
    380  lea         second_predq, [second_predq+mmsize*4] ; second_pred is consumed contiguously, no stride
    381 %endif
    382  mova                  m7, m1 ; keep a copy of ref: src is loaded once (movu) and subtracted both ways
    383  movu                  m5, [srcq]
    384  psubusw               m1, m5
    385  psubusw               m5, m7
    386  por                   m1, m5 ; m1 = |ref - src| per word
    387 
    388  mova                  m7, m2
    389  movu                  m5, [srcq+src_strideq*2]
    390  psubusw               m2, m5
    391  psubusw               m5, m7
    392  por                   m2, m5
    393 
    394  mova                  m7, m3
    395  movu                  m5, [srcq+src_strideq*4]
    396  psubusw               m3, m5
    397  psubusw               m5, m7
    398  por                   m3, m5
    399 
    400  mova                  m7, m4
    401  movu                  m5, [srcq+src_stride3q*2]
    402  psubusw               m4, m5
    403  psubusw               m5, m7
    404  por                   m4, m5
    405 
    406  paddw                 m1, m2 ; pairwise word sums of the absolute differences
    407  paddw                 m3, m4
    408  movhlps               m2, m1 ; fold the upper four words onto the lower four
    409  movhlps               m4, m3
    410  paddw                 m1, m2
    411  paddw                 m3, m4
    412  punpcklwd             m1, m6 ; zero-extend the four low word sums to dwords
    413  punpcklwd             m3, m6
    414  lea                 refq, [refq+ref_strideq*8] ; advance four rows
    415  paddd                 m0, m1
    416  lea                 srcq, [srcq+src_strideq*8]
    417  paddd                 m0, m3
    418  dec              n_rowsd
    419  jg .loop ; repeat while the decremented iteration count is still positive
    420 
    421  movhlps               m1, m0 ; horizontal reduction: add dwords 2,3 onto dwords 0,1
    422  paddd                 m0, m1
    423  punpckldq             m0, m6 ; spread the two partial sums to dwords 0 and 2
    424  movhlps               m1, m0
    425  paddd                 m0, m1 ; dword 0 = total SAD
    426 %if %2 == 2  ; we skipped rows, so we need to double the sad
    427  pslld                 m0, 1
    428 %endif
    429  movd                 eax, m0 ; result is returned in eax
    430  RET
    431 %endmacro
    432 
    433 INIT_XMM sse2 ; x86inc: select XMM registers; the trailing comments below give the resulting names
    434 HIGH_SAD8XN 16 ; highbd_sad8x16_sse2
    435 HIGH_SAD8XN  8 ; highbd_sad8x8_sse2
    436 HIGH_SAD8XN  4 ; highbd_sad8x4_sse2
    437 HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2
    438 HIGH_SAD8XN  8, 1 ; highbd_sad8x8_avg_sse2
    439 HIGH_SAD8XN 16, 2 ; highbd_sad_skip_8x16_sse2
    440 %if CONFIG_REALTIME_ONLY==0 ; the 8x32 variants are only assembled when CONFIG_REALTIME_ONLY is 0
    441 HIGH_SAD8XN 32 ; highbd_sad8x32_sse2
    442 HIGH_SAD8XN 32, 1 ; highbd_sad8x32_avg_sse2
    443 HIGH_SAD8XN 32, 2 ; highbd_sad_skip_8x32_sse2
    444 %endif
    445 
    446 ; unsigned int aom_highbd_sad4x{4,8,16}_sse2(uint8_t *src, int src_stride,
    447 ;                                    uint8_t *ref, int ref_stride);
    448 %macro HIGH_SAD4XN 1-2 0 ; %1 = height, %2 = function type as in HIGH_SAD_FN arg 4 (default 0)
    449  HIGH_SAD_FN 4, %1, 7, %2
    450 %if %2 == 2  ; skip rows: half the rows, four rows per iteration, so height/8 iterations
    451  mov              n_rowsd, %1/8
    452 %else
    453  mov              n_rowsd, %1/4
    454 %endif
    455  pxor                  m0, m0 ; m0 = running SAD, accumulated in 32-bit lanes
    456  pxor                  m6, m6 ; m6 = zero, interleaved in to widen words to dwords
    457 
    458 .loop: ; four rows per iteration, 8 bytes (movq) per row
    459  movq                  m1, [refq]
    460  movq                  m2, [refq+ref_strideq*2]
    461  movq                  m3, [refq+ref_strideq*4]
    462  movq                  m4, [refq+ref_stride3q*2]
    463  punpcklwd             m1, m3 ; m1 = ref rows 0 and 2, word-interleaved
    464  punpcklwd             m2, m4 ; m2 = ref rows 1 and 3, word-interleaved
    465 %if %2 == 1 ; avg: second_pred rows are interleaved the same way before averaging
    466  movq                  m3, [second_predq+8*0]
    467  movq                  m5, [second_predq+8*2]
    468  punpcklwd             m3, m5
    469  movq                  m4, [second_predq+8*1]
    470  movq                  m5, [second_predq+8*3]
    471  punpcklwd             m4, m5
    472  lea         second_predq, [second_predq+8*4] ; second_pred is consumed contiguously, no stride
    473  pavgw                 m1, m3
    474  pavgw                 m2, m4
    475 %endif
    476  movq                  m5, [srcq]
    477  movq                  m3, [srcq+src_strideq*4]
    478  punpcklwd             m5, m3 ; m5 = src rows 0 and 2, lanes matching m1
    479  movdqa                m3, m1 ; keep a copy of ref for the reverse subtraction
    480  psubusw               m1, m5
    481  psubusw               m5, m3
    482  por                   m1, m5 ; m1 = |ref - src| for rows 0 and 2
    483  movq                  m5, [srcq+src_strideq*2]
    484  movq                  m4, [srcq+src_stride3q*2]
    485  punpcklwd             m5, m4 ; m5 = src rows 1 and 3, lanes matching m2
    486  movdqa                m4, m2
    487  psubusw               m2, m5
    488  psubusw               m5, m4
    489  por                   m2, m5 ; m2 = |ref - src| for rows 1 and 3
    490  paddw                 m1, m2
    491  movdqa                m2, m1
    492  punpcklwd             m1, m6 ; widen the low four word sums to dwords
    493  punpckhwd             m2, m6 ; widen the high four word sums to dwords
    494  lea                 refq, [refq+ref_strideq*8] ; advance four rows
    495  paddd                 m0, m1
    496  lea                 srcq, [srcq+src_strideq*8]
    497  paddd                 m0, m2
    498  dec              n_rowsd
    499  jg .loop ; repeat while the decremented iteration count is still positive
    500 
    501  movhlps               m1, m0 ; horizontal reduction: add dwords 2,3 onto dwords 0,1
    502  paddd                 m0, m1
    503  punpckldq             m0, m6 ; spread the two partial sums to dwords 0 and 2
    504  movhlps               m1, m0
    505  paddd                 m0, m1 ; dword 0 = total SAD
    506 %if %2 == 2  ; we skipped rows, so we need to double the sad
    507  pslld                 m0, 1
    508 %endif
    509  movd                 eax, m0 ; result is returned in eax
    510  RET
    511 %endmacro
    512 
    513 INIT_XMM sse2 ; x86inc: select XMM registers; the trailing comments below give the resulting names
    514 HIGH_SAD4XN  8 ; highbd_sad4x8_sse2
    515 HIGH_SAD4XN  4 ; highbd_sad4x4_sse2
    516 %if CONFIG_REALTIME_ONLY==0 ; the 4x16 variants are only assembled when CONFIG_REALTIME_ONLY is 0
    517 HIGH_SAD4XN 16 ; highbd_sad4x16_sse2
    518 HIGH_SAD4XN 16, 2 ; highbd_sad_skip_4x16_sse2
    519 %endif