tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

av1_quantize_ssse3_x86_64.asm (8607B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %define private_prefix av1
         ; private_prefix must be defined before x86inc.asm is included;
         ; presumably x86inc uses it to prefix the symbols emitted by cglobal
         ; (TODO confirm against x86inc.asm).
     15 
     16 %include "third_party/x86inc/x86inc.asm"
     17 
     18 SECTION_RODATA
     19 pw_1: times 8 dw 1                ; eight 16-bit words, each 1. NOTE(review): not referenced by any code visible in this file
     20 
     21 SECTION .text
     22 
     23 %macro QUANTIZE_FP 2
         ; QUANTIZE_FP name, num_gprs
         ;   arg 1: function-name suffix handed to cglobal (quantize_<name>);
         ;          the value fp_32x32 also enables the 32x32 variant below.
         ;   arg 2: number of general-purpose registers requested from cglobal.
         ;
         ; Per 16-bit coefficient c, processed 16 coefficients per iteration:
         ;   qcoeff  = sign(c) * (((|c| + round) * quant) >> 16)  (saturating add)
         ;   dqcoeff = qcoeff * dequant                           (low 16 bits)
         ;   eob     = max(iscan[i] + 1) over lanes whose dqcoeff is nonzero
         ; fp_32x32 variant: round = (round + 1) >> 1, quant <<= 1 and
         ;   dqcoeff = sign(c) * ((|qcoeff| * dequant) >> 1).
         ; After the first group of 16, a group in which no |c| exceeds the
         ; threshold in m0 (dequant >> 1, or dequant >> 2 for fp_32x32) is
         ; zero-filled and does not contribute to eob.
         ; If skip is nonzero, qcoeff and dqcoeff are zero-filled and eob = 0.
         ;
         ; Register roles after setup: coeffq/r3/r4/r5 = one-past-the-end of
         ; coeff/qcoeff/dqcoeff/iscan, ncoeffq = negative element index counting
         ; up to 0, m1 = round, m2 = quant, m3 = dequant, m5 = zero,
         ; m8 = running eob maximum.  r2 and r6 are scratch.
         ; The zbin, shift and scan arguments are never dereferenced here.
         ; NOTE(review): every vector access uses mova, x86inc's aligned move,
         ; so all buffers are presumably 16-byte aligned and ncoeff a multiple
         ; of 16 - confirm against callers.
     24 cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
     25                                shift, qcoeff, dqcoeff, dequant, \
     26                                eob, scan, iscan
     27  cmp                    dword skipm, 0
     28  jne .blank
     29 
     30  ; actual quantize loop - setup pointers, rounders, etc.
     31  movifnidn                   coeffq, coeffmp
     32  movifnidn                  ncoeffq, ncoeffmp
     33  mov                             r2, dequantmp            ; r2 = dequant pointer (skip was already tested)
     34  movifnidn                    zbinq, zbinmp
     35  movifnidn                   roundq, roundmp
     36  movifnidn                   quantq, quantmp
     37  mova                            m1, [roundq]             ; m1 = round
     38  mova                            m2, [quantq]             ; m2 = quant
     39 %ifidn %1, fp_32x32
     40  pcmpeqw                         m5, m5                   ; m5 = all ones
     41  psrlw                           m5, 15                   ; m5 = 1 in every word
     42  paddw                           m1, m5
     43  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
     44 %endif
     45  mova                            m3, [r2q]                ; m3 = dequant
     46  mov                             r3, qcoeffmp             ; r3 = qcoeff  (round/quant/dequant vectors are
     47  mov                             r4, dqcoeffmp            ; r4 = dqcoeff  already in m1/m2/m3, so these
     48  mov                             r5, iscanmp              ; r5 = iscan    registers can be reused)
     49 %ifidn %1, fp_32x32
     50  psllw                           m2, 1                    ; m2 = quant * 2
     51 %endif
     52  pxor                            m5, m5                   ; m5 = dedicated zero
     53 
     54  lea                         coeffq, [  coeffq+ncoeffq*2] ; point every array at its end ...
     55  lea                            r5q, [  r5q+ncoeffq*2]
     56  lea                            r3q, [ r3q+ncoeffq*2]
     57  lea                            r4q, [r4q+ncoeffq*2]
     58  neg                        ncoeffq                       ; ... and index with -ncoeff counting up to 0
     59 
     60  ; get DC and first 15 AC coeffs
     61  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
     62  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
     63  pabsw                           m6, m9                   ; m6 = abs(m9)
     64  pabsw                          m11, m10                  ; m11 = abs(m10)
     65  pcmpeqw                         m7, m7                   ; m7 = all ones (-1 per word)
     66 
     67  paddsw                          m6, m1                   ; m6 += round
     68  punpckhqdq                      m1, m1                   ; m1 = high 4 round words in both halves
     69  paddsw                         m11, m1                   ; m11 += round
     70  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
     71  punpckhqdq                      m2, m2                   ; m2 = high 4 quant words in both halves
     72  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
     73  psignw                          m8, m9                   ; m8 = reinsert sign
     74  psignw                         m13, m10                  ; m13 = reinsert sign
     75  mova            [r3q+ncoeffq*2+ 0], m8                   ; store qcoeff
     76  mova            [r3q+ncoeffq*2+16], m13
     77 %ifidn %1, fp_32x32
     78  pabsw                           m8, m8
     79  pabsw                          m13, m13
     80 %endif
     81  pmullw                          m8, m3                   ; m8 = qcoeff * dequant
     82  punpckhqdq                      m3, m3                   ; m3 = high 4 dequant words in both halves
     83  pmullw                         m13, m3                   ; m13 = qcoeff * dequant
     84 %ifidn %1, fp_32x32
     85  psrlw                           m8, 1
     86  psrlw                          m13, 1
     87  psignw                          m8, m9
     88  psignw                         m13, m10
     89  psrlw                           m0, m3, 2                ; m0 = dequant >> 2, skip threshold for later groups
     90 %else
     91  psrlw                           m0, m3, 1                ; m0 = dequant >> 1, skip threshold for later groups
     92 %endif
     93  mova            [r4q+ncoeffq*2+ 0], m8                   ; store dqcoeff
     94  mova            [r4q+ncoeffq*2+16], m13
     95  pcmpeqw                         m8, m5                   ; m8 = mask of dqcoeff == 0
     96  pcmpeqw                        m13, m5                   ; m13 = mask of dqcoeff == 0
     97  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
     98  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
     99  psubw                           m6, m7                   ; m6 = iscan[i] + 1
    100  psubw                          m11, m7                   ; m11 = iscan[i] + 1
    101  pandn                           m8, m6                   ; m8 = iscan[i] + 1 where nonzero, else 0
    102  pandn                          m13, m11                  ; m13 = iscan[i] + 1 where nonzero, else 0
    103  pmaxsw                          m8, m13                  ; m8 = running eob maximum
    104  add                        ncoeffq, mmsize               ; advance by one group of coefficients
    105  jz .accumulate_eob
    106 
    107 .ac_only_loop:
    108  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
    109  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
    110  pabsw                           m6, m9                   ; m6 = abs(m9)
    111  pabsw                          m11, m10                  ; m11 = abs(m10)
    112 
    113  pcmpgtw                         m7, m6,  m0              ; m7 = mask of abs(c) > threshold
    114  pcmpgtw                        m12, m11, m0              ; m12 = mask of abs(c) > threshold
    115  pmovmskb                       r6d, m7
    116  pmovmskb                       r2d, m12
    117 
    118  or                              r6, r2
    119  jz .skip_iter                                            ; no lane above threshold: zero-fill this group
    120 
    121  pcmpeqw                         m7, m7                   ; m7 = all ones again (-1 per word)
    122 
    123  paddsw                          m6, m1                   ; m6 += round
    124  paddsw                         m11, m1                   ; m11 += round
    125  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
    126  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
    127  psignw                         m14, m9                   ; m14 = reinsert sign
    128  psignw                         m13, m10                  ; m13 = reinsert sign
    129  mova            [r3q+ncoeffq*2+ 0], m14                  ; store qcoeff
    130  mova            [r3q+ncoeffq*2+16], m13
    131 %ifidn %1, fp_32x32
    132  pabsw                          m14, m14
    133  pabsw                          m13, m13
    134 %endif
    135  pmullw                         m14, m3                   ; m14 = qcoeff * dequant
    136  pmullw                         m13, m3                   ; m13 = qcoeff * dequant
    137 %ifidn %1, fp_32x32
    138  psrlw                          m14, 1
    139  psrlw                          m13, 1
    140  psignw                         m14, m9
    141  psignw                         m13, m10
    142 %endif
    143  mova            [r4q+ncoeffq*2+ 0], m14                  ; store dqcoeff
    144  mova            [r4q+ncoeffq*2+16], m13
    145  pcmpeqw                        m14, m5                   ; m14 = mask of dqcoeff == 0
    146  pcmpeqw                        m13, m5                   ; m13 = mask of dqcoeff == 0
    147  mova                            m6, [  r5q+ncoeffq*2+ 0] ; m6 = iscan[i]
    148  mova                           m11, [  r5q+ncoeffq*2+16] ; m11 = iscan[i]
    149  psubw                           m6, m7                   ; m6 = iscan[i] + 1
    150  psubw                          m11, m7                   ; m11 = iscan[i] + 1
    151  pandn                          m14, m6                   ; m14 = iscan[i] + 1 where nonzero, else 0
    152  pandn                          m13, m11                  ; m13 = iscan[i] + 1 where nonzero, else 0
    153  pmaxsw                          m8, m14                  ; fold both halves into the running maximum
    154  pmaxsw                          m8, m13
    155  add                        ncoeffq, mmsize
    156  jl .ac_only_loop
    157 
    158  jmp .accumulate_eob
    159 .skip_iter:
    160  mova            [r3q+ncoeffq*2+ 0], m5                   ; qcoeff  = 0 for this group
    161  mova            [r3q+ncoeffq*2+16], m5
    162  mova            [r4q+ncoeffq*2+ 0], m5                   ; dqcoeff = 0 for this group
    163  mova            [r4q+ncoeffq*2+16], m5
    164  add                        ncoeffq, mmsize
    165  jl .ac_only_loop
    166 
    167 .accumulate_eob:
    168  ; horizontally accumulate/max eobs and write into [eob] memory pointer
    169  mov                             r2, eobmp
    170  pshufd                          m7, m8, 0xe              ; bring the high 4 words down
    171  pmaxsw                          m8, m7
    172  pshuflw                         m7, m8, 0xe              ; bring words 2,3 down to 0,1
    173  pmaxsw                          m8, m7
    174  pshuflw                         m7, m8, 0x1              ; bring word 1 down to 0
    175  pmaxsw                          m8, m7
    176  pextrw                          r6, m8, 0                ; r6 = final eob (zero-extended word)
    177  mov                           [r2], r6w                  ; 16-bit store only, matching the word store in .blank
    178  RET
    179 
    180  ; skip-block, i.e. just write all zeroes
    181 .blank:
    182  mov                             r0, dqcoeffmp
    183  movifnidn                  ncoeffq, ncoeffmp
    184  mov                             r2, qcoeffmp
    185  mov                             r3, eobmp
    186 
    187  lea                            r0q, [r0q+ncoeffq*2]
    188  lea                            r2q, [r2q+ncoeffq*2]
    189  neg                        ncoeffq
    190  pxor                            m7, m7
    191 .blank_loop:
    192  mova            [r0q+ncoeffq*2+ 0], m7
    193  mova            [r0q+ncoeffq*2+16], m7
    194  mova            [r2q+ncoeffq*2+ 0], m7
    195  mova            [r2q+ncoeffq*2+16], m7
    196  add                        ncoeffq, mmsize
    197  jl .blank_loop
    198  mov                     word [r3q], 0
    199  RET
    200 %endmacro
    201 
    202 INIT_XMM ssse3                    ; x86inc setup for the functions below; presumably selects XMM registers and the ssse3 variant (confirm in x86inc.asm)
    203 QUANTIZE_FP fp, 7                 ; instantiate quantize_fp with 7 general-purpose registers
    204 QUANTIZE_FP fp_32x32, 7           ; instantiate quantize_fp_32x32; this name enables the fp_32x32 branches inside the macro