tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

highbd_sad4d_sse2.asm (10448B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "third_party/x86inc/x86inc.asm"
     15 
     16 SECTION .text
     17 
     18 ; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
        ;
        ; Adds the absolute differences of two 4-sample rows of 16-bit pixels
        ; between src and each of ref1..ref4 into m4..m7 (one accumulator per
        ; reference, dword lanes produced by pmaddwd).
        ;
        ; Arguments:
        ;   1: first          - 1: m4..m7 are overwritten with the sums of this
        ;                       call; otherwise the sums are added to m4..m7
        ;   2: off_first_src  - offset of the first row in src, in samples
        ;   3: off_first_ref  - offset of the first row in each ref, in samples
        ;   4: off_second_src - offset of the second row in src, in samples
        ;   5: off_second_ref - offset of the second row in each ref, in samples
        ;   6: advance_at_end - optional, default 0; 1: step srcq and
        ;                       ref1q..ref4q forward by two rows afterwards
        ;
        ; Every offset is multiplied by 2 in the address because a sample is two
        ; bytes wide. Reads m1 as the pmaddwd multiplier (HIGH_SADNXN4D in this
        ; file fills it with words of 1, which makes pmaddwd add adjacent word
        ; pairs). Uses m0, m2 and m3 as scratch.
        ;
        ; NOTE(review): the movu loads fetch a full register from each ref row
        ; although only the low half is kept (movhps replaces the upper half) -
        ; confirm that callers tolerate the over-read.
        ; NOTE(review): pmaddwd multiplies signed words, so this presumably
        ; relies on sample differences staying below 0x8000 - confirm against
        ; the supported bit depths.
     19 %macro HIGH_PROCESS_4x2x4 5-6 0
         ; first src row goes to the low half of m0; the second row is merged
         ; into the upper half by the movhps further down
     20  movh                  m0, [srcq +%2*2]
     21 %if %1 == 1
         ; first call: build both rows of all four references directly in the
         ; accumulator registers m4..m7
     22  movu                  m4, [ref1q+%3*2]
     23  movu                  m5, [ref2q+%3*2]
     24  movu                  m6, [ref3q+%3*2]
     25  movu                  m7, [ref4q+%3*2]
     26  movhps                m0, [srcq +%4*2]
     27  movhps                m4, [ref1q+%5*2]
     28  movhps                m5, [ref2q+%5*2]
     29  movhps                m6, [ref3q+%5*2]
     30  movhps                m7, [ref4q+%5*2]
         ; absolute difference of unsigned words: saturating subtract in both
         ; directions and OR the results (one of the two is always zero)
     31  mova                  m3, m0
     32  mova                  m2, m0
     33  psubusw               m3, m4
     34  psubusw               m2, m5
     35  psubusw               m4, m0
     36  psubusw               m5, m0
     37  por                   m4, m3
     38  por                   m5, m2
         ; widen to dwords by summing adjacent word pairs (m1 = multiplier)
     39  pmaddwd               m4, m1
     40  pmaddwd               m5, m1
         ; same sequence for ref3 (m6) and ref4 (m7)
     41  mova                  m3, m0
     42  mova                  m2, m0
     43  psubusw               m3, m6
     44  psubusw               m2, m7
     45  psubusw               m6, m0
     46  psubusw               m7, m0
     47  por                   m6, m3
     48  por                   m7, m2
     49  pmaddwd               m6, m1
     50  pmaddwd               m7, m1
     51 %else
         ; later calls: handle one reference at a time in m2 (m3 = copy of src)
         ; and add the dword sums to the running accumulator
         ; ref1 -> m4
     52  movu                  m2, [ref1q+%3*2]
     53  movhps                m0, [srcq +%4*2]
     54  movhps                m2, [ref1q+%5*2]
     55  mova                  m3, m0
     56  psubusw               m3, m2
     57  psubusw               m2, m0
     58  por                   m2, m3
     59  pmaddwd               m2, m1
     60  paddd                 m4, m2
     61 
         ; ref2 -> m5
     62  movu                  m2, [ref2q+%3*2]
     63  mova                  m3, m0
     64  movhps                m2, [ref2q+%5*2]
     65  psubusw               m3, m2
     66  psubusw               m2, m0
     67  por                   m2, m3
     68  pmaddwd               m2, m1
     69  paddd                 m5, m2
     70 
         ; ref3 -> m6
     71  movu                  m2, [ref3q+%3*2]
     72  mova                  m3, m0
     73  movhps                m2, [ref3q+%5*2]
     74  psubusw               m3, m2
     75  psubusw               m2, m0
     76  por                   m2, m3
     77  pmaddwd               m2, m1
     78  paddd                 m6, m2
     79 
         ; ref4 -> m7
     80  movu                  m2, [ref4q+%3*2]
     81  mova                  m3, m0
     82  movhps                m2, [ref4q+%5*2]
     83  psubusw               m3, m2
     84  psubusw               m2, m0
     85  por                   m2, m3
     86  pmaddwd               m2, m1
     87  paddd                 m7, m2
     88 %endif
     89 %if %6 == 1
         ; stride * 4 bytes = two rows of 2-byte samples
     90  lea                 srcq, [srcq +src_strideq*4]
     91  lea                ref1q, [ref1q+ref_strideq*4]
     92  lea                ref2q, [ref2q+ref_strideq*4]
     93  lea                ref3q, [ref3q+ref_strideq*4]
     94  lea                ref4q, [ref4q+ref_strideq*4]
     95 %endif
     96 %endmacro
     97 
     98 ; HIGH_PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
        ;
        ; Adds the absolute differences of two groups of eight 16-bit samples
        ; between src and each of ref1..ref4 into m4..m7 (one accumulator per
        ; reference, dword lanes produced by pmaddwd).
        ;
        ; Arguments:
        ;   1: first          - 1: the first group overwrites m4..m7; otherwise
        ;                       it is added. The second group is always added.
        ;   2: off_first_src  - offset of the first group in src, in samples
        ;   3: off_first_ref  - offset of the first group in each ref, in samples
        ;   4: off_second_src - offset of the second group in src, in samples
        ;   5: off_second_ref - offset of the second group in each ref, in samples
        ;   6: advance_at_end - optional, default 0; 1: step srcq and
        ;                       ref1q..ref4q forward by two rows
        ;
        ; Every offset is multiplied by 2 in the address because a sample is two
        ; bytes wide. Reads m1 as the pmaddwd multiplier (HIGH_SADNXN4D in this
        ; file fills it with words of 1). Uses m0, m2 and m3 as scratch.
        ;
        ; NOTE(review): src is read with mova while the refs use movu, so src
        ; rows are presumably required to be 16-byte aligned - confirm with
        ; callers.
     99 %macro HIGH_PROCESS_8x2x4 5-6 0
    100  ; 1st 8 px
    101  mova                  m0, [srcq +%2*2]
    102 %if %1 == 1
         ; first call: load the references straight into the accumulators and
         ; turn them into dword sums of absolute differences in place
    103  movu                  m4, [ref1q+%3*2]
    104  movu                  m5, [ref2q+%3*2]
    105  movu                  m6, [ref3q+%3*2]
    106  movu                  m7, [ref4q+%3*2]
         ; absolute difference of unsigned words: saturating subtract in both
         ; directions and OR the results (one of the two is always zero)
    107  mova                  m3, m0
    108  mova                  m2, m0
    109  psubusw               m3, m4
    110  psubusw               m2, m5
    111  psubusw               m4, m0
    112  psubusw               m5, m0
    113  por                   m4, m3
    114  por                   m5, m2
    115  pmaddwd               m4, m1
    116  pmaddwd               m5, m1
    117  mova                  m3, m0
    118  mova                  m2, m0
    119  psubusw               m3, m6
    120  psubusw               m2, m7
    121  psubusw               m6, m0
    122  psubusw               m7, m0
    123  por                   m6, m3
    124  por                   m7, m2
    125  pmaddwd               m6, m1
    126  pmaddwd               m7, m1
    127 %else
         ; later calls: one reference at a time through m2, with m3 reloaded
         ; from m0 before each reference, accumulating into m4..m7
    128  mova                  m3, m0
    129  movu                  m2, [ref1q+%3*2]
    130  psubusw               m3, m2
    131  psubusw               m2, m0
    132  por                   m2, m3
    133  mova                  m3, m0
    134  pmaddwd               m2, m1
    135  paddd                 m4, m2
    136  movu                  m2, [ref2q+%3*2]
    137  psubusw               m3, m2
    138  psubusw               m2, m0
    139  por                   m2, m3
    140  mova                  m3, m0
    141  pmaddwd               m2, m1
    142  paddd                 m5, m2
    143  movu                  m2, [ref3q+%3*2]
    144  psubusw               m3, m2
    145  psubusw               m2, m0
    146  por                   m2, m3
    147  mova                  m3, m0
    148  pmaddwd               m2, m1
    149  paddd                 m6, m2
    150  movu                  m2, [ref4q+%3*2]
    151  psubusw               m3, m2
    152  psubusw               m2, m0
    153  por                   m2, m3
    154  pmaddwd               m2, m1
    155  paddd                 m7, m2
    156 %endif
    157 
    158  ; 2nd 8 px
         ; always accumulated, regardless of the first argument
    159  mova                  m0, [srcq +(%4)*2]
    160  mova                  m3, m0
    161  movu                  m2, [ref1q+(%5)*2]
    162  psubusw               m3, m2
    163  psubusw               m2, m0
    164  por                   m2, m3
    165  mova                  m3, m0
    166  pmaddwd               m2, m1
    167  paddd                 m4, m2
    168  movu                  m2, [ref2q+(%5)*2]
    169  psubusw               m3, m2
    170  psubusw               m2, m0
    171  por                   m2, m3
    172  mova                  m3, m0
    173  pmaddwd               m2, m1
    174  paddd                 m5, m2
    175  movu                  m2, [ref3q+(%5)*2]
    176  psubusw               m3, m2
    177  psubusw               m2, m0
    178  por                   m2, m3
    179  mova                  m3, m0
    180  pmaddwd               m2, m1
    181  paddd                 m6, m2
    182  movu                  m2, [ref4q+(%5)*2]
    183  psubusw               m3, m2
    184  psubusw               m2, m0
    185 %if %6 == 1
         ; The pointer advance sits between the last loads and the final
         ; por/pmaddwd/paddd. It only changes general-purpose registers, so the
         ; ref4 result still pending in m2/m3 is unaffected.
         ; stride * 4 bytes = two rows of 2-byte samples
    186  lea                 srcq, [srcq +src_strideq*4]
    187  lea                ref1q, [ref1q+ref_strideq*4]
    188  lea                ref2q, [ref2q+ref_strideq*4]
    189  lea                ref3q, [ref3q+ref_strideq*4]
    190  lea                ref4q, [ref4q+ref_strideq*4]
    191 %endif
    192  por                   m2, m3
    193  pmaddwd               m2, m1
    194  paddd                 m7, m2
    195 %endmacro
    196 
    197 ; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
        ;
        ; Handles two groups of 16 samples by expanding HIGH_PROCESS_8x2x4 twice:
        ; once for the 16 samples starting at the first offsets (as two halves
        ; of 8), once for the 16 samples starting at the second offsets.
        ; Only the first expansion receives the first flag; the second always
        ; accumulates. Only the second expansion receives advance_at_end, so the
        ; pointers move after all 32 samples have been read.
    198 %macro HIGH_PROCESS_16x2x4 5-6 0
    199  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
    200  HIGH_PROCESS_8x2x4  0, %4, %5, (%4 + 8), (%5 + 8), %6
    201 %endmacro
    202 
    203 ; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
        ;
        ; Handles two groups of 32 samples by expanding HIGH_PROCESS_16x2x4
        ; twice: once for the 32 samples starting at the first offsets (as two
        ; halves of 16), once for the 32 samples starting at the second offsets.
        ; Only the first expansion receives the first flag; the second always
        ; accumulates. Only the second expansion receives advance_at_end.
    204 %macro HIGH_PROCESS_32x2x4 5-6 0
    205  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
    206  HIGH_PROCESS_16x2x4  0, %4, %5, (%4 + 16), (%5 + 16), %6
    207 %endmacro
    208 
    209 ; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
        ;
        ; Handles two groups of 64 samples by expanding HIGH_PROCESS_32x2x4
        ; twice: once for the 64 samples starting at the first offsets (as two
        ; halves of 32), once for the 64 samples starting at the second offsets.
        ; Only the first expansion receives the first flag; the second always
        ; accumulates. Only the second expansion receives advance_at_end.
    210 %macro HIGH_PROCESS_64x2x4 5-6 0
    211  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
    212  HIGH_PROCESS_32x2x4  0, %4, %5, (%4 + 32), (%5 + 32), %6
    213 %endmacro
    214 
    215 ; void aom_highbd_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
    216 ;                         uint8_t *ref[4], int ref_stride,
    217 ;                         uint32_t res[4]);
        ;
        ; Emits one function that sums the absolute differences between a
        ; Width x Height block of 16-bit samples at src and the co-located block
        ; in each of the four references, then stores the four 32-bit sums to
        ; res[0..3] in reference order.
        ;
    218 ; Macro Arguments:
    219 ;   1: Width
    220 ;   2: Height
    221 ;   3: If 0, then normal sad, if 2, then skip every other row
        ;      (optional, defaults to 0; in skip mode both strides are doubled
        ;      and the final sums are doubled to compensate)
        ;
        ; Register use inside the function:
        ;   m1      words of 1 (pmaddwd multiplier) until the final reduction
        ;   m4..m7  running dword sums for ref1..ref4
        ;   m0..m3  scratch
    222 %macro HIGH_SADNXN4D 2-3 0
    223 %if %3 == 0  ; normal sad
    224 %if AOM_ARCH_X86_64
    225 cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
    226                              res, ref2, ref3, ref4
    227 %else
    228 cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
    229                              ref2, ref3, ref4
    230 %endif  ; AOM_ARCH_X86_64
    231 %else  ; %3 == 2, downsample
    232 %if AOM_ARCH_X86_64
    233 cglobal highbd_sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
    234                              res, ref2, ref3, ref4
    235 %else
    236 cglobal highbd_sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
    237                              ref2, ref3, ref4
    238 %endif  ; AOM_ARCH_X86_64
    239 %endif  ; sad/skip
    240 
    241 ; set m1
        ; srcq is borrowed as a scratch register (saved and restored around the
        ; load) to broadcast 0x00010001 to every dword, i.e. every word of m1 = 1
    242  push                srcq
    243  mov                 srcd, 0x00010001
    244  movd                  m1, srcd
    245  pshufd                m1, m1, 0x0
    246  pop                 srcq
    247 
    248 %if %3 == 2  ; skip rows
        ; doubled strides make each row step land on every other row
    249  lea          src_strided, [2*src_strided]
    250  lea          ref_strided, [2*ref_strided]
    251 %endif  ; skip rows
    252  movsxdifnidn src_strideq, src_strided
    253  movsxdifnidn ref_strideq, ref_strided
        ; ref1q arrives holding the address of the ref[4] pointer array; it is
        ; overwritten last because it is the base of the other three loads
    254  mov                ref2q, [ref1q+gprsize*1]
    255  mov                ref3q, [ref1q+gprsize*2]
    256  mov                ref4q, [ref1q+gprsize*3]
    257  mov                ref1q, [ref1q+gprsize*0]
    258 
    259 ; convert byte pointers to short pointers
    260  shl                 srcq, 1
    261  shl                ref2q, 1
    262  shl                ref3q, 1
    263  shl                ref4q, 1
    264  shl                ref1q, 1
    265 
        ; Each HIGH_PROCESS_Wx2x4 expansion covers two rows. The first expansion
        ; initialises m4..m7 and the last one does not advance the pointers, so
        ; both sit outside the repeat block:
        ;   normal: 2 + 2*num_rep + 2 = Height rows
        ;   skip:   2 + 2*num_rep + 2 = Height/2 rows
    266  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
    267 %if %3 == 2  ;  Downsampling by two
    268 %define num_rep (%2-8)/4
    269 %else
    270 %define num_rep (%2-4)/2
    271 %endif
    272 %rep num_rep
    273  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
    274 %endrep
        ; drop the helper so it does not stay defined after this expansion
    275 %undef num_rep
    276  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
    277  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
    278  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
         ; Horizontal reduction. Step 1: fold the upper two dwords of every
         ; accumulator onto the lower two (m0..m3, and thus m1, are free now).
    279  movhlps               m0, m4
    280  movhlps               m1, m5
    281  movhlps               m2, m6
    282  movhlps               m3, m7
    283  paddd                 m4, m0
    284  paddd                 m5, m1
    285  paddd                 m6, m2
    286  paddd                 m7, m3
         ; Step 2: interleave ref1/ref2 and ref3/ref4 partial sums, then fold
         ; once more so the low two dwords hold the complete sums of each pair.
    287  punpckldq             m4, m5
    288  punpckldq             m6, m7
    289  movhlps               m0, m4
    290  movhlps               m1, m6
    291  paddd                 m4, m0
    292  paddd                 m6, m1
         ; Step 3: m4 = sums for ref1, ref2, ref3, ref4 (low to high dword)
    293  punpcklqdq            m4, m6
    294 %if %3 == 2  ; skip rows
         ; only every other row was summed, so double the result
    295  pslld                 m4, 1
    296 %endif
         ; res is argument 4. NOTE(review): on x86-32 cglobal is asked to load
         ; only four arguments, so r4 (named ref2 there, no longer needed) is
         ; presumably filled from the argument slot here - confirm against
         ; x86inc.
    297  movifnidn             r4, r4mp
    298  movu                [r4], m4
    299  RET
    300 %endmacro
    301 
    302 ; Function instantiations: one HIGH_SADNXN4D expansion per Width, Height.
        ; The first group emits the normal variants (third argument omitted).
        ; The second group passes 2 and emits the highbd_sad_skip_ variants;
        ; only heights of 16 and above are instantiated there.
        ; NOTE(review): INIT_XMM sse2 is an x86inc directive, presumably
        ; selecting XMM registers and the sse2 name suffix - confirm against
        ; x86inc.
    303 INIT_XMM sse2
    304 HIGH_SADNXN4D 64, 64
    305 HIGH_SADNXN4D 64, 32
    306 HIGH_SADNXN4D 32, 64
    307 HIGH_SADNXN4D 32, 32
    308 HIGH_SADNXN4D 32, 16
    309 HIGH_SADNXN4D 16, 32
    310 HIGH_SADNXN4D 16, 16
    311 HIGH_SADNXN4D 16,  8
    312 HIGH_SADNXN4D  8, 16
    313 HIGH_SADNXN4D  8,  8
    314 HIGH_SADNXN4D  8,  4
    315 HIGH_SADNXN4D  4,  8
    316 HIGH_SADNXN4D  4,  4
    317 HIGH_SADNXN4D  4, 16
    318 HIGH_SADNXN4D 16,  4
    319 HIGH_SADNXN4D  8, 32
    320 HIGH_SADNXN4D 32,  8
    321 HIGH_SADNXN4D 16, 64
    322 HIGH_SADNXN4D 64, 16
    323 
        ; skip-row variants
    324 HIGH_SADNXN4D 64, 64, 2
    325 HIGH_SADNXN4D 64, 32, 2
    326 HIGH_SADNXN4D 32, 64, 2
    327 HIGH_SADNXN4D 32, 32, 2
    328 HIGH_SADNXN4D 32, 16, 2
    329 HIGH_SADNXN4D 16, 32, 2
    330 HIGH_SADNXN4D 16, 16, 2
    331 HIGH_SADNXN4D  8, 16, 2
    332 HIGH_SADNXN4D  4, 16, 2
    333 HIGH_SADNXN4D  8, 32, 2
    334 HIGH_SADNXN4D 16, 64, 2
    335 HIGH_SADNXN4D 64, 16, 2