tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

subtract_sse2.asm (4470B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "third_party/x86inc/x86inc.asm"
     15 
     16 SECTION .text
     17 
     18 ; void aom_subtract_block(int rows, int cols,
     19 ;                         int16_t *diff, ptrdiff_t diff_stride,
     20 ;                         const uint8_t *src, ptrdiff_t src_stride,
     21 ;                         const uint8_t *pred, ptrdiff_t pred_stride)
     22 
     23 INIT_XMM sse2
     24 cglobal subtract_block, 7, 7, 8, \
     25                        rows, cols, diff, diff_stride, src, src_stride, \
     26                        pred, pred_stride
     27 %define pred_str colsq
     28  pxor                  m7, m7         ; dedicated zero register
     29  cmp                colsd, 4
     30  je .case_4
     31  cmp                colsd, 8
     32  je .case_8
     33  cmp                colsd, 16
     34  je .case_16
     35  cmp                colsd, 32
     36  je .case_32
     37  cmp                colsd, 64
     38  je .case_64
     39 
     40 %macro loop16 6
     41  mova                  m0, [srcq+%1]
     42  mova                  m4, [srcq+%2]
     43  movu                  m1, [predq+%3]
     44  movu                  m5, [predq+%4]
     45  punpckhbw             m2, m0, m7
     46  punpckhbw             m3, m1, m7
     47  punpcklbw             m0, m7
     48  punpcklbw             m1, m7
     49  psubw                 m2, m3
     50  psubw                 m0, m1
     51  punpckhbw             m1, m4, m7
     52  punpckhbw             m3, m5, m7
     53  punpcklbw             m4, m7
     54  punpcklbw             m5, m7
     55  psubw                 m1, m3
     56  psubw                 m4, m5
     57  mova [diffq+mmsize*0+%5], m0
     58  mova [diffq+mmsize*1+%5], m2
     59  mova [diffq+mmsize*0+%6], m4
     60  mova [diffq+mmsize*1+%6], m1
     61 %endmacro
     62 
     63  mov             pred_str, pred_stridemp
     64 .loop_128:
     65  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
     66  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
     67  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
     68  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
     69  lea                diffq, [diffq+diff_strideq*2]
     70  add                predq, pred_str
     71  add                 srcq, src_strideq
     72  sub                rowsd, 1
     73  jnz .loop_128
     74  RET
     75 
     76 .case_64:
     77  mov             pred_str, pred_stridemp
     78 .loop_64:
     79  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
     80  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
     81  lea                diffq, [diffq+diff_strideq*2]
     82  add                predq, pred_str
     83  add                 srcq, src_strideq
     84  dec                rowsd
     85  jg .loop_64
     86  RET
     87 
     88 .case_32:
     89  mov             pred_str, pred_stridemp
     90 .loop_32:
     91  loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
     92  lea                diffq, [diffq+diff_strideq*2]
     93  add                predq, pred_str
     94  add                 srcq, src_strideq
     95  dec                rowsd
     96  jg .loop_32
     97  RET
     98 
     99 .case_16:
    100  mov             pred_str, pred_stridemp
    101 .loop_16:
    102  loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
    103  lea                diffq, [diffq+diff_strideq*4]
    104  lea                predq, [predq+pred_str*2]
    105  lea                 srcq, [srcq+src_strideq*2]
    106  sub                rowsd, 2
    107  jg .loop_16
    108  RET
    109 
    110 %macro loop_h 0
    111  movh                  m0, [srcq]
    112  movh                  m2, [srcq+src_strideq]
    113  movh                  m1, [predq]
    114  movh                  m3, [predq+pred_str]
    115  punpcklbw             m0, m7
    116  punpcklbw             m1, m7
    117  punpcklbw             m2, m7
    118  punpcklbw             m3, m7
    119  psubw                 m0, m1
    120  psubw                 m2, m3
    121  mova             [diffq], m0
    122  mova [diffq+diff_strideq*2], m2
    123 %endmacro
    124 
    125 .case_8:
    126  mov             pred_str, pred_stridemp
    127 .loop_8:
    128  loop_h
    129  lea                diffq, [diffq+diff_strideq*4]
    130  lea                 srcq, [srcq+src_strideq*2]
    131  lea                predq, [predq+pred_str*2]
    132  sub                rowsd, 2
    133  jg .loop_8
    134  RET
    135 
    136 INIT_MMX
    137 .case_4:
    138  mov             pred_str, pred_stridemp
    139 .loop_4:
    140  loop_h
    141  lea                diffq, [diffq+diff_strideq*4]
    142  lea                 srcq, [srcq+src_strideq*2]
    143  lea                predq, [predq+pred_str*2]
    144  sub                rowsd, 2
    145  jg .loop_4
    146  emms
    147  RET