tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

quantize_ssse3_x86_64.asm (13632B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "third_party/x86inc/x86inc.asm"   ; x86inc abstraction layer (cglobal, DEFINE_ARGS, mova, RET, ...)
     15 
     16 SECTION_RODATA                              ; read-only constants used by the code below
     17 pw_1: times 8 dw 1                          ; eight 16-bit words, each 1; QUANTIZE_FN subtracts it from zbin
     18 
     19 SECTION .text                               ; everything below is code
     20 
     21 %macro QUANTIZE_FN 2
        ; QUANTIZE_FN suffix, num_gprs
        ;
        ;   %1  function suffix; the macro emits quantize_%1 through cglobal.
        ;       When %1 is b_32x32 the extra %ifidn paths are assembled:
        ;       zbin and round are halved (rounding up), the quantizer shift
        ;       keeps one more bit (product >> 15 instead of >> 16), the
        ;       dequantized magnitude is halved, and groups of 16 coefficients
        ;       that are all below zbin are zero-filled without being quantized.
        ;   %2  number of general-purpose registers requested from cglobal.
        ;
        ; Arguments, in cglobal order: coeff, ncoeff, zbin, round, quant, shift,
        ; qcoeff, dqcoeff, dequant, eob, scan, iscan. None is auto-loaded (the
        ; cglobal arg count is 0); they are fetched with movifnidn / the *mp
        ; accessors. scan is never referenced.
        ;
        ; Work is done in groups of 16 coefficients (mmsize per iteration); the
        ; first group is always processed, so ncoeff is expected to be a
        ; non-zero multiple of 16. coeff, qcoeff and dqcoeff hold 32-bit
        ; elements, iscan holds 16-bit elements. Every vector access uses mova
        ; or a memory operand, so those buffers and the zbin/round/quant/
        ; shift/dequant tables must be 16-byte aligned.
        ;
        ; Register roles:
        ;   m0 = zbin - 1     m1 = round     m2 = quant     m3 = dequant
        ;   m4 = shift        m5 = zero (also scratch, re-zeroed after use)
        ;   m9/m10  = current 16 coefficients packed to 16 bits
        ;   m7/m12  = per-lane mask: abs(coeff) >= zbin
        ;   m8      = running per-lane maximum of (iscan + 1) over non-zero lanes
        ; The first group uses each table as loaded; afterwards punpckhqdq
        ; copies the high four words over the low four, so the remaining
        ; groups use only the values from the upper half of each table.
     22 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, zbin, round, quant, \
     23                                shift, qcoeff, dqcoeff, dequant, \
     24                                eob, scan, iscan
     25 
     26  ; actual quantize loop - setup pointers, rounders, etc.
     27  movifnidn                   coeffq, coeffmp
     28  movifnidn                  ncoeffq, ncoeffmp
     29  movifnidn                    zbinq, zbinmp
     30  movifnidn                   roundq, roundmp
     31  movifnidn                   quantq, quantmp
     32  movifnidn                 dequantq, dequantmp
     33  mova                            m0, [zbinq]              ; m0 = zbin
     34  mova                            m1, [roundq]             ; m1 = round
     35  mova                            m2, [quantq]             ; m2 = quant
     36 %ifidn %1, b_32x32
     37  pcmpeqw                         m5, m5                   ; m5 = all ones
     38  psrlw                           m5, 15                   ; m5 = 1 in every word
     39  paddw                           m0, m5
     40  paddw                           m1, m5
     41  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
     42  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
     43 %endif
     44  mova                            m3, [dequantq]           ; m3 = dequant
     45  mov                             r2, shiftmp              ; r2 (zbin) is free: m0 is loaded
     46  psubw                           m0, [GLOBAL(pw_1)]       ; m0 = zbin - 1, so '>' tests '>= zbin'
     47  mova                            m4, [r2]                 ; m4 = shift
     48  mov                             r3, qcoeffmp             ; r3..r5 are reused: their tables are loaded
     49  mov                             r4, dqcoeffmp
     50  mov                             r5, iscanmp
     51  pxor                            m5, m5                   ; m5 = dedicated zero
     52  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, eob
        ; point each array at its end and count ncoeff up from -ncoeff to 0,
        ; so one register serves as both index and loop counter
     53  lea                         coeffq, [  coeffq+ncoeffq*4]
     54  lea                        qcoeffq, [ qcoeffq+ncoeffq*4]
     55  lea                       dqcoeffq, [dqcoeffq+ncoeffq*4]
     56  lea                         iscanq, [  iscanq+ncoeffq*2]
     57  neg                        ncoeffq
     58 
     59  ; get DC and first 15 AC coeffs
     60  ; coeff stored as 32bit numbers & require 16bit numbers
     61  mova                            m9, [  coeffq+ncoeffq*4+ 0]
     62  packssdw                        m9, [  coeffq+ncoeffq*4+16]
     63  mova                           m10, [  coeffq+ncoeffq*4+32]
     64  packssdw                       m10, [  coeffq+ncoeffq*4+48]
     65  pabsw                           m6, m9                   ; m6 = abs(m9)
     66  pabsw                          m11, m10                  ; m11 = abs(m10)
     67  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
     68  punpckhqdq                      m0, m0                   ; high half of zbin-1 for all later lanes
     69  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
     70  paddsw                          m6, m1                   ; m6 += round
     71  punpckhqdq                      m1, m1                   ; high half of round for all later lanes
     72  paddsw                         m11, m1                   ; m11 += round
     73  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
     74  punpckhqdq                      m2, m2                   ; high half of quant for all later lanes
     75  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
     76  paddw                           m8, m6                   ; m8 += m6
     77  paddw                          m13, m11                  ; m13 += m11
     78  %ifidn %1, b_32x32
     79  pmullw                          m5, m8, m4               ; store the lower 16 bits of m8*qsh
     80  %endif
     81  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
     82  %ifidn %1, b_32x32
     83  psllw                           m8, 1                    ; 32x32: rebuild m8*qsh>>15 from the
     84  psrlw                           m5, 15                   ; high word shifted left by one plus
     85  por                             m8, m5                   ; the top bit of the low word
     86  %endif
     87  punpckhqdq                      m4, m4                   ; high half of shift for all later lanes
     88  %ifidn %1, b_32x32
     89  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
     90  %endif
     91  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
     92  %ifidn %1, b_32x32
     93  psllw                          m13, 1
     94  psrlw                           m5, 15
     95  por                            m13, m5                   ; m13 = m13*qsh>>15
     96  pxor                            m5, m5                   ; reset m5 to zero register
     97  %endif
     98  psignw                          m8, m9                   ; m8 = reinsert sign
     99  psignw                         m13, m10                  ; m13 = reinsert sign
    100  pand                            m8, m7                   ; zero the lanes below zbin
    101  pand                           m13, m12
    102 
    103  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
    104  mova                           m11, m8
    105  mova                            m6, m8
    106  pcmpgtw                         m5, m8                   ; m5 = (0 > m8) = sign-extension words
    107  punpcklwd                      m11, m5
    108  punpckhwd                       m6, m5
    109  mova        [qcoeffq+ncoeffq*4+ 0], m11
    110  mova        [qcoeffq+ncoeffq*4+16], m6
    111  pxor                            m5, m5
    112  mova                           m11, m13
    113  mova                            m6, m13
    114  pcmpgtw                         m5, m13
    115  punpcklwd                      m11, m5
    116  punpckhwd                       m6, m5
    117  mova        [qcoeffq+ncoeffq*4+32], m11
    118  mova        [qcoeffq+ncoeffq*4+48], m6
    119  pxor                            m5, m5             ; reset m5 to zero register
    120 
    121 %ifidn %1, b_32x32
    122  pabsw                           m8, m8                   ; 32x32: dequantize the magnitude so the
    123  pabsw                          m13, m13                  ; halving below is a logical shift
    124 %endif
    125  pmullw                          m8, m3                   ; dqc[i] = qc[i] * dequant
    126  punpckhqdq                      m3, m3                   ; high half of dequant for all later lanes
    127  pmullw                         m13, m3                   ; dqc[i] = qc[i] * dequant
    128 %ifidn %1, b_32x32
    129  psrlw                           m8, 1                    ; dqc[i] /= 2
    130  psrlw                          m13, 1
    131  psignw                          m8, m9                   ; reinsert sign
    132  psignw                         m13, m10
    133 %endif
    134  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
    135  mova                           m11, m8
    136  mova                            m6, m8
    137  pcmpgtw                         m5, m8
    138  punpcklwd                      m11, m5
    139  punpckhwd                       m6, m5
    140  mova       [dqcoeffq+ncoeffq*4+ 0], m11
    141  mova       [dqcoeffq+ncoeffq*4+16], m6
    142  pxor                            m5, m5
    143  mova                           m11, m13
    144  mova                            m6, m13
    145  pcmpgtw                         m5, m13
    146  punpcklwd                      m11, m5
    147  punpckhwd                       m6, m5
    148  mova       [dqcoeffq+ncoeffq*4+32], m11
    149  mova       [dqcoeffq+ncoeffq*4+48], m6
    150  pxor                            m5, m5             ; reset m5 to zero register
    151  pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0
    152  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
    153  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
    154  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
    155  psubw                           m6, m7                   ; mask is -1, so m6 = iscan[i] + 1 where c[i] >= zbin
    156  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where c[i] >= zbin
    157  pandn                           m8, m6                   ; m8 = iscan[i] + 1 where dqc[i] != 0, else 0
    158  pandn                          m13, m11                  ; m13 = iscan[i] + 1 where dqc[i] != 0, else 0
    159  pmaxsw                          m8, m13                  ; m8 = running per-lane eob maximum
    160  add                        ncoeffq, mmsize
    161  jz .accumulate_eob                                       ; only one group of 16: done
    162 
    163 .ac_only_loop:
    164  ; pack coeff from 32bit to 16bit array
    165  mova                            m9, [  coeffq+ncoeffq*4+ 0]
    166  packssdw                        m9, [  coeffq+ncoeffq*4+16]
    167  mova                           m10, [  coeffq+ncoeffq*4+32]
    168  packssdw                       m10, [  coeffq+ncoeffq*4+48]
    169 
    170  pabsw                           m6, m9                   ; m6 = abs(m9)
    171  pabsw                          m11, m10                  ; m11 = abs(m10)
    172  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
    173  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
    174 %ifidn %1, b_32x32
    175  pmovmskb                       r6d, m7                   ; 32x32: if no lane of this group reaches
    176  pmovmskb                       r2d, m12                  ; zbin, just write zeros (.skip_iter)
    177  or                              r6, r2
    178  jz .skip_iter
    179 %endif
    180  paddsw                          m6, m1                   ; m6 += round
    181  paddsw                         m11, m1                   ; m11 += round
    182  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
    183  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
    184  paddw                          m14, m6                   ; m14 += m6
    185  paddw                          m13, m11                  ; m13 += m11
    186  %ifidn %1, b_32x32
    187  pmullw                          m5, m14, m4              ; store the lower 16 bits of m14*qsh
    188  %endif
    189  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
    190  %ifidn %1, b_32x32
    191  psllw                          m14, 1
    192  psrlw                           m5, 15
    193  por                            m14, m5                   ; m14 = m14*qsh>>15
    194  pmullw                          m5, m13, m4              ; store the lower 16 bits of m13*qsh
    195  %endif
    196  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
    197  %ifidn %1, b_32x32
    198  psllw                          m13, 1
    199  psrlw                           m5, 15
    200  por                            m13, m5                   ; m13 = m13*qsh>>15
    201  pxor                            m5, m5                   ; reset m5 to zero register
    202  %endif
    203  psignw                         m14, m9                   ; m14 = reinsert sign
    204  psignw                         m13, m10                  ; m13 = reinsert sign
    205  pand                           m14, m7                   ; zero the lanes below zbin
    206  pand                           m13, m12
    207 
    208  ; store 16bit numbers as 32bit numbers in array pointed to by qcoeff
    209  ; (m11 and m6 are fully overwritten by the copies below, no clearing needed)
    210  mova                           m11, m14
    211  mova                            m6, m14
    212  pcmpgtw                         m5, m14                  ; m5 = (0 > m14) = sign-extension words
    213  punpcklwd                      m11, m5
    214  punpckhwd                       m6, m5
    215  mova        [qcoeffq+ncoeffq*4+ 0], m11
    216  mova        [qcoeffq+ncoeffq*4+16], m6
    217  pxor                            m5, m5
    218  mova                           m11, m13
    219  mova                            m6, m13
    220  pcmpgtw                         m5, m13
    221  punpcklwd                      m11, m5
    222  punpckhwd                       m6, m5
    223  mova        [qcoeffq+ncoeffq*4+32], m11
    224  mova        [qcoeffq+ncoeffq*4+48], m6
    225  pxor                            m5, m5             ; reset m5 to zero register
    226 
    227 %ifidn %1, b_32x32
    228  pabsw                          m14, m14
    229  pabsw                          m13, m13
    230 %endif
    231  pmullw                         m14, m3                   ; dqc[i] = qc[i] * dequant
    232  pmullw                         m13, m3                   ; dqc[i] = qc[i] * dequant
    233 %ifidn %1, b_32x32
    234  psrlw                          m14, 1                    ; dqc[i] /= 2
    235  psrlw                          m13, 1
    236  psignw                         m14, m9                   ; reinsert sign
    237  psignw                         m13, m10
    238 %endif
    239 
    240  ; store 16bit numbers as 32bit numbers in array pointed to by dqcoeff
    241  mova                           m11, m14
    242  mova                            m6, m14
    243  pcmpgtw                         m5, m14
    244  punpcklwd                      m11, m5
    245  punpckhwd                       m6, m5
    246  mova       [dqcoeffq+ncoeffq*4+ 0], m11
    247  mova       [dqcoeffq+ncoeffq*4+16], m6
    248  pxor                            m5, m5
    249  mova                           m11, m13
    250  mova                            m6, m13
    251  pcmpgtw                         m5, m13
    252  punpcklwd                      m11, m5
    253  punpckhwd                       m6, m5
    254  mova       [dqcoeffq+ncoeffq*4+32], m11
    255  mova       [dqcoeffq+ncoeffq*4+48], m6
    256  pxor                            m5, m5
    257 
    258  pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0
    259  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
    260  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
    261  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
    262  psubw                           m6, m7                   ; m6 = iscan[i] + 1 where c[i] >= zbin
    263  psubw                          m11, m12                  ; m11 = iscan[i] + 1 where c[i] >= zbin
    264  pandn                          m14, m6                   ; m14 = iscan[i] + 1 where dqc[i] != 0, else 0
    265  pandn                          m13, m11                  ; m13 = iscan[i] + 1 where dqc[i] != 0, else 0
    266  pmaxsw                          m8, m14                  ; fold both halves into the running maximum
    267  pmaxsw                          m8, m13
    268  add                        ncoeffq, mmsize
    269  jl .ac_only_loop
    270 
    271 %ifidn %1, b_32x32
    272  jmp .accumulate_eob
    273 .skip_iter:
        ; whole group is below zbin: outputs are zero and m8 (eob) is unchanged
    274  mova        [qcoeffq+ncoeffq*4+ 0], m5
    275  mova        [qcoeffq+ncoeffq*4+16], m5
    276  mova        [qcoeffq+ncoeffq*4+32], m5
    277  mova        [qcoeffq+ncoeffq*4+48], m5
    278  mova       [dqcoeffq+ncoeffq*4+ 0], m5
    279  mova       [dqcoeffq+ncoeffq*4+16], m5
    280  mova       [dqcoeffq+ncoeffq*4+32], m5
    281  mova       [dqcoeffq+ncoeffq*4+48], m5
    282  add                        ncoeffq, mmsize
    283  jl .ac_only_loop
    284 %endif
    285 
    286 .accumulate_eob:
    287  ; horizontally accumulate/max eobs and write into [eob] memory pointer
    288  mov                             r2, eobmp
    289  pshufd                          m7, m8, 0xe              ; words 4..7 -> words 0..3
    290  pmaxsw                          m8, m7
    291  pshuflw                         m7, m8, 0xe              ; words 2..3 -> words 0..1
    292  pmaxsw                          m8, m7
    293  pshuflw                         m7, m8, 0x1              ; word 1 -> word 0
    294  pmaxsw                          m8, m7                   ; word 0 = max over all eight lanes
    295  pextrw                          r6, m8, 0
        ; The eob is a single 16-bit word, so store exactly 16 bits. A
        ; full-register store would also write the zero-extended upper bytes
        ; past that word.
        ; NOTE(review): assumes the C prototype declares eob as a pointer to a
        ; 16-bit integer - confirm against the function declaration.
    296  mov                             [r2], r6w
    297  RET
    298 %endmacro
    299 
    300 INIT_XMM ssse3                              ; the functions below use XMM registers and SSSE3 instructions
    301 QUANTIZE_FN b, 9                            ; emits quantize_b (no b_32x32 paths), 9 general-purpose registers
    302 QUANTIZE_FN b_32x32, 9                      ; emits quantize_b_32x32 with the %ifidn b_32x32 paths assembled