tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

sad4d_sse2.asm (12315B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "third_party/x86inc/x86inc.asm"
     15 
     16 SECTION .text
     17 
     18 ; 'spill_src_stride' strongly affects how the code works.
     19 ;
     20 ; When 'spill_src_stride' is false, the 'src_strideq' resides in
     21 ; register, [srcq + src_strideq + offset] is allowed, so we can simply
     22 ; use such form to access src memory and don't bother to update 'srcq'
     23 ; at each line. We only update 'srcq' each two-lines using a compact
     24 ; LEA instruction like [srcq+src_strideq*2].
     25 ;
     26 ; When 'spill_src_stride' is true, the 'src_strideq' resides in memory.
     27 ; we cannot use the above form to access memory; we have to update
     28 ; 'srcq' at each line break. As we process two parts (first,second)
     29 ; together in each macro function, the second part may also sit
     30 ; in the next line, which means we also need to possibly add
     31 ; one 'src_strideq' to 'srcq' before processing second part.
     32 
     33 %macro HANDLE_SECOND_OFFSET 0
        ; Define 'second_offset', the displacement of the second row relative
        ; to srcq. When src_strideq lives in memory it cannot appear inside an
        ; effective address, so srcq itself is advanced by one row here and
        ; second_offset becomes 0. Otherwise srcq is left untouched and the
        ; stride register is used directly as the offset.
     34  %if spill_src_stride
     35    %define second_offset 0
     36    add srcq, src_strideq
     37  %else
     38    %define second_offset (src_strideq)
     39  %endif
     40 %endmacro
     41 
     42 ; This is specifically designed to handle the case when src_strideq is
     43 ; a memory position; in that case we cannot accomplish
     44 ; complex address calculations using LEA, and fall back to
     45 ; using a simple ADD instruction at each line ending.
     46 %macro ADVANCE_END_OF_TWO_LINES 0
        ; Step srcq and all four ref pointers past the two rows just processed.
        ; When src_strideq is spilled, HANDLE_SECOND_OFFSET (invoked inside the
        ; PROCESS_* macros) has already advanced srcq by one row, so a single
        ; ADD completes the two-row step; otherwise a single LEA advances by
        ; two rows at once.
     47  %if spill_src_stride
     48    add srcq, src_strideq
     49  %else
     50    lea                 srcq, [srcq+src_strideq*2]
     51  %endif
     52 
     53 ; note: ref_stride is never spilled when processing two lines
     54  lea                ref1q, [ref1q+ref_strideq*2]
     55  lea                ref2q, [ref2q+ref_strideq*2]
     56  lea                ref3q, [ref3q+ref_strideq*2]
     57  lea                ref4q, [ref4q+ref_strideq*2]
     58 %endmacro
     59 
     60 ; PROCESS_4x2x4 first
     61 %macro PROCESS_4x2x4 1
        ; SAD of a 4-wide block over two rows against all four refs.
        ; %1 == 1: first invocation — initializes the accumulators.
        ; %1 == 0: subsequent invocations — adds into the accumulators.
        ; Accumulator layout: m6 = {ref1 SAD (low qword), ref2 SAD (high qword)},
        ;                     m7 = {ref3 SAD (low qword), ref4 SAD (high qword)}.
     62  movd                  m0, [srcq]
     63  HANDLE_SECOND_OFFSET
     64 %if %1 == 1
     65  movd                  m6, [ref1q]
     66  movd                  m4, [ref2q]
     67  movd                  m7, [ref3q]
     68  movd                  m5, [ref4q]
     69 
        ; Interleave row 0 and row 1 (4 bytes each) into the low qword of
        ; each register, then pair two refs per register with movlhps so one
        ; psadbw produces two independent SADs (one per qword lane).
     70  movd                  m1, [srcq + second_offset]
     71  movd                  m2, [ref1q+ref_strideq]
     72  punpckldq             m0, m1
     73  punpckldq             m6, m2
     74  movd                  m1, [ref2q+ref_strideq]
     75  movd                  m2, [ref3q+ref_strideq]
     76  movd                  m3, [ref4q+ref_strideq]
     77  punpckldq             m4, m1
     78  punpckldq             m7, m2
     79  punpckldq             m5, m3
        ; Duplicate the two src rows into both qword lanes of m0 so it can be
        ; compared against two refs at once.
     80  movlhps               m0, m0
     81  movlhps               m6, m4
     82  movlhps               m7, m5
     83  psadbw                m6, m0
     84  psadbw                m7, m0
     85 %else
        ; Same packing as above, but into temporaries m1 (ref1|ref2) and
        ; m3 (ref3|ref4), then accumulated into m6/m7 with paddd.
     86  movd                  m1, [ref1q]
     87  movd                  m5, [ref1q+ref_strideq]
     88  movd                  m2, [ref2q]
     89  movd                  m4, [ref2q+ref_strideq]
     90  punpckldq             m1, m5
     91  punpckldq             m2, m4
     92  movd                  m3, [ref3q]
     93  movd                  m5, [ref3q+ref_strideq]
     94  punpckldq             m3, m5
     95  movd                  m4, [ref4q]
     96  movd                  m5, [ref4q+ref_strideq]
     97  punpckldq             m4, m5
     98  movd                  m5, [srcq + second_offset]
     99  punpckldq             m0, m5
    100  movlhps               m0, m0
    101  movlhps               m1, m2
    102  movlhps               m3, m4
    103  psadbw                m1, m0
    104  psadbw                m3, m0
    105  paddd                 m6, m1
    106  paddd                 m7, m3
    107 %endif
    108 %endmacro
    109 
    110 ; PROCESS_8x2x4 first
    111 %macro PROCESS_8x2x4 1
        ; SAD of an 8-wide block over two rows against all four refs.
        ; %1 == 1: first invocation — initializes the accumulators.
        ; %1 == 0: subsequent invocations — adds into the accumulators.
        ; One accumulator per ref: m4=ref1, m5=ref2, m6=ref3, m7=ref4; each
        ; psadbw leaves two partial sums (one per qword lane) that the caller
        ; combines at the end.
    112  movh                  m0, [srcq]
    113  HANDLE_SECOND_OFFSET
    114 %if %1 == 1
        ; Row 0 in the low qword, row 1 (via movhps) in the high qword of
        ; each register.
    115  movh                  m4, [ref1q]
    116  movh                  m5, [ref2q]
    117  movh                  m6, [ref3q]
    118  movh                  m7, [ref4q]
    119  movhps                m0, [srcq + second_offset]
    120  movhps                m4, [ref1q+ref_strideq]
    121  movhps                m5, [ref2q+ref_strideq]
    122  movhps                m6, [ref3q+ref_strideq]
    123  movhps                m7, [ref4q+ref_strideq]
    124  psadbw                m4, m0
    125  psadbw                m5, m0
    126  psadbw                m6, m0
    127  psadbw                m7, m0
    128 %else
        ; Compute into temporaries m1/m2, then accumulate; refs are handled
        ; in two pairs to stay within eight XMM registers.
    129  movh                  m1, [ref1q]
    130  movh                  m2, [ref2q]
    131  movhps                m0, [srcq + second_offset]
    132  movhps                m1, [ref1q+ref_strideq]
    133  movhps                m2, [ref2q+ref_strideq]
    134  psadbw                m1, m0
    135  psadbw                m2, m0
    136  paddd                 m4, m1
    137  paddd                 m5, m2
    138 
    139  movh                  m1, [ref3q]
    140  movhps                m1, [ref3q+ref_strideq]
    141  movh                  m2, [ref4q]
    142  movhps                m2, [ref4q+ref_strideq]
    143  psadbw                m1, m0
    144  psadbw                m2, m0
    145  paddd                 m6, m1
    146  paddd                 m7, m2
    147 %endif
    148 %endmacro
    149 
    150 ; PROCESS_FIRST_MMSIZE
    151 %macro PROCESS_FIRST_MMSIZE 0
        ; SAD of the first mmsize-wide chunk of the first row; initializes the
        ; four accumulators (m4=ref1, m5=ref2, m6=ref3, m7=ref4). src is
        ; assumed aligned (mova); refs are loaded unaligned (movu).
    152  mova                  m0, [srcq]
    153  movu                  m4, [ref1q]
    154  movu                  m5, [ref2q]
    155  movu                  m6, [ref3q]
    156  movu                  m7, [ref4q]
    157  psadbw                m4, m0
    158  psadbw                m5, m0
    159  psadbw                m6, m0
    160  psadbw                m7, m0
    161 %endmacro
    162 
    163 ; PROCESS_16x1x4 offset
    164 %macro PROCESS_16x1x4 1
        ; SAD of one mmsize-wide chunk at horizontal offset %1 of the current
        ; row, accumulated into m4..m7 (one per ref). ref_offsetq is the
        ; shared running row offset applied to all four ref pointers, which
        ; avoids updating each pointer separately.
    165  mova                  m0, [srcq + %1]
    166  movu                  m1, [ref1q + ref_offsetq + %1]
    167  movu                  m2, [ref2q + ref_offsetq + %1]
    168  psadbw                m1, m0
    169  psadbw                m2, m0
    170  paddd                 m4, m1
    171  paddd                 m5, m2
    172 
    173  movu                  m1, [ref3q + ref_offsetq + %1]
    174  movu                  m2, [ref4q + ref_offsetq + %1]
    175  psadbw                m1, m0
    176  psadbw                m2, m0
    177  paddd                 m6, m1
    178  paddd                 m7, m2
    179 %endmacro
    180 
    181 ; void aom_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
    182 ;                         uint8_t *ref[4], int ref_stride,
    183 ;                         uint32_t res[4]);
    184 ; Macro Arguments:
    185 ;   1: Width
    186 ;   2: Height
    187 ;   3: If 0, then normal sad, else skip rows
    188 %macro SADNXN4D 2-3 0
        ; Emit one aom_sadNxNx4d (or aom_sad_skip_NxNx4d) function.
        ;   %1 = width, %2 = height, %3 = 1 to process every other row
        ;        ("skip" variant; strides are doubled and results ×2).
        ; Accumulators on exit of the main loop: m4..m7 (wide path) or m6/m7
        ; (4-wide path), as produced by the PROCESS_* macros above.
        ; On x86-32 (4 register args) some values are spilled back to their
        ; stack slots; the spill_* flags below select the matching code forms.
    189 
    190 %define spill_src_stride 0
    191 %define spill_ref_stride 0
    192 %define spill_cnt 0
    193 
    194 ; Whether a shared offset should be used instead of adding strides to
    195 ; each reference array. With this option, only one line will be processed
    196 ; per loop iteration.
    197 %define use_ref_offset (%1 >= mmsize)
    198 
    199 ; Remove loops in the 4x4 and 8x4 case
    200 %define use_loop (use_ref_offset || %2 > 4)
    201 
        ; Pick the cglobal signature: register count depends on the ABI
        ; (AOM_ARCH_X86_64 or not) and on which loop strategy is in use.
    202 %if %3 == 1  ; skip rows
    203 %if AOM_ARCH_X86_64
    204 %if use_ref_offset
    205 cglobal sad_skip_%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, \
    206                                     ref2, ref3, ref4, cnt, ref_offset
    207 %elif use_loop
    208 cglobal sad_skip_%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, \
    209                                    ref2, ref3, ref4, cnt
    210 %else
    211 cglobal sad_skip_%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, \
    212                                    ref2, ref3, ref4
    213 %endif
    214 %else
    215 %if use_ref_offset
    216 cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, \
    217                                    ref4
    218 %define spill_src_stride 1
    219 %define spill_ref_stride 1
    220 %elif use_loop
    221 cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, \
    222                                    ref3, ref4
    223 %define spill_src_stride 1
    224 %else
    225 cglobal sad_skip_%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, \
    226                                    ref3, ref4
    227 %endif
    228 %endif
    229 %else ; normal sad
    230 %if AOM_ARCH_X86_64
    231 %if use_ref_offset
    232 cglobal sad%1x%2x4d, 5, 10, 8, src, src_stride, ref1, ref_stride, res, ref2, \
    233                               ref3, ref4, cnt, ref_offset
    234 %elif use_loop
    235 cglobal sad%1x%2x4d, 5, 9, 8, src, src_stride, ref1, ref_stride, res, ref2, \
    236                              ref3, ref4, cnt
    237 %else
    238 cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, res, ref2, \
    239                              ref3, ref4
    240 %endif
    241 %else
    242 %if use_ref_offset
    243 cglobal sad%1x%2x4d, 4, 7, 8, src, ref_offset, ref1, cnt, ref2, ref3, ref4
    244  %define spill_src_stride 1
    245  %define spill_ref_stride 1
    246 %elif use_loop
    247 cglobal sad%1x%2x4d, 4, 7, 8, src, cnt, ref1, ref_stride, ref2, ref3, ref4
    248  %define spill_src_stride 1
    249 %else
    250 cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, ref2, ref3, \
    251                              ref4
    252 %endif
    253 %endif
    254 %endif
    255 
        ; Spilled strides are accessed through their original argument stack
        ; slots (r1mp/r3mp are x86inc names for the 2nd/4th argument slots).
    256 %if spill_src_stride
    257  %define src_strideq r1mp
    258  %define src_strided r1mp
    259 %endif
    260 %if spill_ref_stride
    261  %define ref_strideq r3mp
    262  %define ref_strided r3mp
    263 %endif
    264 
    265 %if spill_cnt
    266  SUB                  rsp, 4
    267  %define cntd word [rsp]
    268 %endif
    269 
        ; Skip variant: double both strides so each loop step lands on every
        ; other row. Undone before returning (see 'shr' below) because the
        ; spilled values live in the caller-visible argument slots.
    270 %if %3 == 1
    271  sal          src_strided, 1
    272  sal          ref_strided, 1
    273 %endif
    274  movsxdifnidn src_strideq, src_strided
    275  movsxdifnidn ref_strideq, ref_strided
    276 
        ; Unpack the ref[4] pointer array; ref1q initially holds the array
        ; base, so it must be overwritten last.
    277  mov                ref2q, [ref1q+gprsize*1]
    278  mov                ref3q, [ref1q+gprsize*2]
    279  mov                ref4q, [ref1q+gprsize*3]
    280  mov                ref1q, [ref1q+gprsize*0]
    281 
    282 ; Is the loop for this wxh in another function?
    283 ; If so, we jump into that function for the loop and returning
    284 %define external_loop (use_ref_offset && %1 > mmsize && %1 != %2)
    285 
    286 %if use_ref_offset
    287  PROCESS_FIRST_MMSIZE
    288 %if %1 > mmsize
    289  mov          ref_offsetq, 0
    290  mov                 cntd, %2 >> %3
    291 ; Jump part way into the loop for the square version of this width
    292 %if %3 == 1
    293  jmp mangle(private_prefix %+ _sad_skip_%1x%1x4d %+ SUFFIX).midloop
    294 %else
    295  jmp mangle(private_prefix %+ _sad%1x%1x4d %+ SUFFIX).midloop
    296 %endif
    297 %else
        ; Single-column case: row 0 is already done, start the loop at row 1
        ; with cnt = rows - 1.
    298  mov          ref_offsetq, ref_strideq
    299  add                 srcq, src_strideq
    300  mov                 cntd, (%2 >> %3) - 1
    301 %endif
    302 %if external_loop == 0
    303 .loop:
    304 ; Unrolled horizontal loop
    305 %assign h_offset 0
    306 %rep %1/mmsize
    307  PROCESS_16x1x4 h_offset
    308 %if h_offset == 0
    309 ; The first row of the first column is done outside the loop and jumps here
    310 .midloop:
    311 %endif
    312 %assign h_offset h_offset+mmsize
    313 %endrep
    314 
    315  add                 srcq, src_strideq
    316  add          ref_offsetq, ref_strideq
    317  sub                 cntd, 1
    318  jnz .loop
    319 %endif
    320 %else
        ; Narrow widths (< mmsize): process two rows per iteration with the
        ; width-specific PROCESS_4x2x4 / PROCESS_8x2x4 macros.
    321  PROCESS_%1x2x4 1
    322  ADVANCE_END_OF_TWO_LINES
    323 %if use_loop
    324  mov                 cntd, (%2/2 >> %3) - 1
    325 .loop:
    326 %endif
    327  PROCESS_%1x2x4 0
    328 %if use_loop
    329  ADVANCE_END_OF_TWO_LINES
    330  sub                 cntd, 1
    331  jnz .loop
    332 %endif
    333 %endif
    334 
    335 %if spill_cnt
    336 ; Undo stack allocation for cnt
    337  ADD                  rsp, 4
    338 %endif
    339 
        ; Epilogue: only emitted when this function owns its own loop; the
        ; external_loop variants return via the square-width function they
        ; jumped into.
    340 %if external_loop == 0
    341 %if %3 == 0
    342  %define resultq r4
    343  %define resultmp r4mp
    344 %endif
    345 
    346 ; Undo modifications on parameters on the stack
    347 %if %3 == 1
    348 %if spill_src_stride
    349  shr          src_strided, 1
    350 %endif
    351 %if spill_ref_stride
    352  shr          ref_strided, 1
    353 %endif
    354 %endif
    355 
    356 %if %1 > 4
        ; Reduce m4..m7 (two partial sums per register) to four dword SADs
        ; packed in m4 in ref order, then store all four results at once.
    357  pslldq                m5, 4
    358  pslldq                m7, 4
    359  por                   m4, m5
    360  por                   m6, m7
    361  mova                  m5, m4
    362  mova                  m7, m6
    363  punpcklqdq            m4, m6
    364  punpckhqdq            m5, m7
    365  paddd                 m4, m5
    366 %if %3 == 1
        ; Skip variant counted half the rows; double the sums to compensate.
    367  pslld                 m4, 1
    368 %endif
    369  movifnidn             resultq, resultmp
    370  movu                [resultq], m4
    371  RET
    372 %else
        ; 4-wide path: m6 = {ref1, ref2} SADs, m7 = {ref3, ref4} SADs; pack
        ; each pair of qword-lane sums into adjacent dwords and store.
    373  pshufd            m6, m6, 0x08
    374  pshufd            m7, m7, 0x08
    375 %if %3 == 1
    376  pslld                 m6, 1
    377  pslld                 m7, 1
    378 %endif
    379  movifnidn             resultq, resultmp
    380  movq              [resultq+0], m6
    381  movq              [resultq+8], m7
    382  RET
    383 %endif
    384 %endif ; external_loop == 0
    385 %endmacro
    386 
        ; Instantiate the SSE2 4d-SAD functions for every supported block
        ; size; the third argument (1) selects the row-skipping variants.
    387 INIT_XMM sse2
    388 SADNXN4D 128, 128
    389 SADNXN4D 128,  64
    390 SADNXN4D  64, 128
    391 SADNXN4D  64,  64
    392 SADNXN4D  64,  32
    393 SADNXN4D  32,  64
    394 SADNXN4D  32,  32
    395 SADNXN4D  32,  16
    396 SADNXN4D  16,  32
    397 SADNXN4D  16,  16
    398 SADNXN4D  16,   8
    399 SADNXN4D   8,  16
    400 SADNXN4D   8,   8
    401 SADNXN4D   8,   4
    402 SADNXN4D   4,   8
    403 SADNXN4D   4,   4
    404 %if CONFIG_REALTIME_ONLY==0
    405 SADNXN4D   4,  16
    406 SADNXN4D  16,   4
    407 SADNXN4D   8,  32
    408 SADNXN4D  32,   8
    409 SADNXN4D  16,  64
    410 SADNXN4D  64,  16
    411 %endif
        ; Row-skipping ("sad_skip") variants; note not every block size above
        ; has a skip counterpart.
    412 SADNXN4D 128, 128, 1
    413 SADNXN4D 128,  64, 1
    414 SADNXN4D  64, 128, 1
    415 SADNXN4D  64,  64, 1
    416 SADNXN4D  64,  32, 1
    417 SADNXN4D  32,  64, 1
    418 SADNXN4D  32,  32, 1
    419 SADNXN4D  32,  16, 1
    420 SADNXN4D  16,  32, 1
    421 SADNXN4D  16,  16, 1
    422 SADNXN4D   8,  16, 1
    423 %if CONFIG_REALTIME_ONLY==0
    424 SADNXN4D   4,  16, 1
    425 SADNXN4D   8,  32, 1
    426 SADNXN4D  16,  64, 1
    427 SADNXN4D  64,  16, 1
    428 %endif