tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

fwd_txfm_ssse3_x86_64.asm (7772B)


      1 ;
      2 ; Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 ;
      4 ; This source code is subject to the terms of the BSD 2 Clause License and
      5 ; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 ; was not distributed with this source code in the LICENSE file, you can
      7 ; obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 ; Media Patent License 1.0 was not distributed with this source code in the
      9 ; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 ;
     11 
     12 ;
     13 
     14 %include "third_party/x86inc/x86inc.asm"
     15 
     16 SECTION_RODATA
     17  ; Shared constants for the 14-bit fixed-point arithmetic in fdct8x8.
     18 pw_11585x2: times 8 dw 23170   ; 8 words of 2 * 11585; pmulhrsw by this yields (x * 11585 + 2^13) >> 14
     19 pd_8192:    times 4 dd 8192    ; 4 dwords of 2^13: rounding bias added before each "psrad ..., 14"
     20 
     21 %macro TRANSFORM_COEFFS 2 ; c0, c1: one coefficient pair (integers)
        ; Defines two 16-byte word tables, each a 2-word pattern repeated four
        ; times, used as pmaddwd operands against interleaved (a, b) words:
        ;   pw_<c0>_<c1>   = { c0,  c1 } x 4   ->   a * c0 + b * c1
        ;   pw_<c1>_m<c0>  = { c1, -c0 } x 4   ->   a * c1 - b * c0
     22 pw_%1_%2:   times 4 dw  %1,  %2
     23 pw_%2_m%1:  times 4 dw  %2, -%1
     24 %endmacro
     25 
        ; Each value below is round(2^14 * cos or sin) of the angle noted.
     26 TRANSFORM_COEFFS 11585,  11585   ; cos(pi/4),      sin(pi/4)
     27 TRANSFORM_COEFFS 15137,   6270   ; cos(pi/8),      sin(pi/8)
     28 TRANSFORM_COEFFS 16069,   3196   ; cos(pi/16),     sin(pi/16)
     29 TRANSFORM_COEFFS  9102,  13623   ; sin(3*pi/16),   cos(3*pi/16)
     30 
     31 %macro STORE_OUTPUT 2 ; index of first coefficient, number of source register
     32  ; Widens the eight signed 16-bit lanes of the source register to 32 bits
     33  ; and stores them as 32 bytes starting at outputq + 4 * index.
     34  ; Intrinsics equivalent (x = source, dst = (__m128i *)(output + index)):
     35  ;   sign = _mm_cmplt_epi16(x, zero);
     36  ;   dst[0] = _mm_unpacklo_epi16(x, sign); dst[1] = _mm_unpackhi_epi16(x, sign);
        ; Overwrites m11, m12 and the source register.
     37  mova               m12, m%2                  ; copy kept for the high half
     38  pxor               m11, m11                  ; m11 = 0
     39  pcmpgtw            m11, m%2                  ; m11 = all-ones in lanes where source < 0
     40  punpcklwd          m%2, m11                  ; lanes 0-3 paired with their sign words
     41  punpckhwd          m12, m11                  ; lanes 4-7 paired with their sign words
     42  mova               [outputq + 4*%1 +  0], m%2
     43  mova               [outputq + 4*%1 + 16], m12
     44 %endmacro
     45 
     46 SECTION .text
     47 
     48 %if AOM_ARCH_X86_64
        ;-----------------------------------------------------------------------
        ; fdct8x8 -- 8x8 forward transform of 16-bit samples (SSSE3, x86-64 only).
        ;
        ; Arguments, as named in the cglobal line:
        ;   input   eight rows of eight 16-bit values; row k is loaded from
        ;           input + k * 2 * stride bytes with mova.
        ;   output  receives 64 sign-extended 32-bit coefficients, eight per
        ;           row, at byte offsets 0..255 (written by STORE_OUTPUT).
        ;   stride  distance between input rows; it is doubled to form a byte
        ;           offset, so presumably it is counted in 16-bit elements.
        ;
        ; Processing order: scale the input by 4, run a 1-D pass down the
        ; columns, transpose, run a second 1-D pass, transpose back, halve
        ; every result (rounding toward zero), store.
        ;
        ; Register use: r3 / r4 are scratch offsets and inputq is advanced.
        ; m0-m12 are all written. m8 holds pd_8192 and m12 holds pw_11585x2
        ; through both passes; the final scaling and store steps then reuse
        ; them as scratch.
        ;
        ; NOTE(review): strideq is used at full 64-bit width with no explicit
        ; sign extension. If the C prototype passes stride as a 32-bit int,
        ; confirm the upper half is guaranteed valid -- TODO confirm.
        ; NOTE(review): mova presumably assembles to an aligned move, which
        ; would require 16-byte aligned input rows and output -- confirm
        ; against x86inc and the callers.
        ;-----------------------------------------------------------------------
     49 INIT_XMM ssse3
     50 cglobal fdct8x8, 3, 5, 13, input, output, stride
     51 
     52  mova               m8, [GLOBAL(pd_8192)]      ; m8  = 2^13 rounding bias for the ">> 14" steps
     53  mova              m12, [GLOBAL(pw_11585x2)]   ; m12 = pmulhrsw multiplier, 2 * 11585
     54 
     55  lea                r3, [2 * strideq]          ; r3 = offset from one row to the next
     56  lea                r4, [4 * strideq]          ; r4 = offset of two rows
     57  mova               m0, [inputq]               ; row 0
     58  mova               m1, [inputq + r3]          ; row 1
     59  lea                inputq, [inputq + r4]      ; advance two rows
     60  mova               m2, [inputq]               ; row 2
     61  mova               m3, [inputq + r3]          ; row 3
     62  lea                inputq, [inputq + r4]      ; advance two rows
     63  mova               m4, [inputq]               ; row 4
     64  mova               m5, [inputq + r3]          ; row 5
     65  lea                inputq, [inputq + r4]      ; advance two rows
     66  mova               m6, [inputq]               ; row 6
     67  mova               m7, [inputq + r3]          ; row 7
     68 
     69  ; left shift by 2 to increase forward transformation precision
     70  psllw              m0, 2
     71  psllw              m1, 2
     72  psllw              m2, 2
     73  psllw              m3, 2
     74  psllw              m4, 2
     75  psllw              m5, 2
     76  psllw              m6, 2
     77  psllw              m7, 2
     78 
     79  ; column transform: m0..m7 hold rows in0..in7, so each 16-bit lane is one column
     80  ; stage 1: s_k = in_k + in_(7-k), d_k = in_k - in_(7-k)
     81  paddw m10, m0, m7          ; m10 = s0
     82  psubw m0, m7               ; m0  = d0
     83 
     84  paddw m9, m1, m6           ; m9  = s1
     85  psubw m1, m6               ; m1  = d1
     86 
     87  paddw m7, m2, m5           ; m7  = s2
     88  psubw m2, m5               ; m2  = d2
     89 
     90  paddw m6, m3, m4           ; m6  = s3
     91  psubw m3, m4               ; m3  = d3
     92 
     93  ; stage 2
     94  paddw m5, m9, m7           ; m5  = s1 + s2
     95  psubw m9, m7               ; m9  = s1 - s2
     96 
     97  paddw m4, m10, m6          ; m4  = s0 + s3
     98  psubw m10, m6              ; m10 = s0 - s3
     99 
    100  paddw m7, m1, m2           ; m7  = d1 + d2
    101  psubw m1, m2               ; m1  = d1 - d2
    102 
    103  ; stage 3
    104  paddw m6, m4, m5           ; m6 = (s0 + s3) + (s1 + s2)
    105  psubw m4, m5               ; m4 = (s0 + s3) - (s1 + s2)
    106 
    107  pmulhrsw m1, m12           ; m1 = round((d1 - d2) * 11585 / 2^14)
    108  pmulhrsw m7, m12           ; m7 = round((d1 + d2) * 11585 / 2^14)
    109 
    110  ; sin(pi / 8), cos(pi / 8)
    111  punpcklwd m2, m10, m9      ; interleave (s0 - s3, s1 - s2), lanes 0-3
    112  punpckhwd m10, m9          ; interleave (s0 - s3, s1 - s2), lanes 4-7
    113  pmaddwd m5, m2, [GLOBAL(pw_15137_6270)]    ; (s0 - s3) * 15137 + (s1 - s2) * 6270
    114  pmaddwd m2, [GLOBAL(pw_6270_m15137)]       ; (s0 - s3) * 6270 - (s1 - s2) * 15137
    115  pmaddwd m9, m10, [GLOBAL(pw_15137_6270)]
    116  pmaddwd m10, [GLOBAL(pw_6270_m15137)]
    117  paddd m5, m8               ; add 2^13 ...
    118  paddd m2, m8
    119  paddd m9, m8
    120  paddd m10, m8
    121  psrad m5, 14               ; ... then >> 14: rounds and removes the 2^14 coefficient scale
    122  psrad m2, 14
    123  psrad m9, 14
    124  psrad m10, 14
    125  packssdw m5, m9            ; rejoin low/high halves as saturated 16-bit lanes
    126  packssdw m2, m10
    127 
    128  pmulhrsw m6, m12           ; m6 = round(m6 * 11585 / 2^14)
    129  pmulhrsw m4, m12           ; m4 = round(m4 * 11585 / 2^14)
    130 
    131  paddw m9, m3, m1           ; m9  = x0 = d3 + m1
    132  psubw m3, m1               ; m3  = x1 = d3 - m1
    133 
    134  paddw m10, m0, m7          ; m10 = x3 = d0 + m7
    135  psubw m0, m7               ; m0  = x2 = d0 - m7
    136 
    137  ; stage 4
    138  ; sin(pi / 16), cos(pi / 16)
    139  punpcklwd m1, m10, m9      ; interleave (x3, x0), lanes 0-3
    140  punpckhwd m10, m9          ; interleave (x3, x0), lanes 4-7
    141  pmaddwd m7, m1, [GLOBAL(pw_16069_3196)]    ; x3 * 16069 + x0 * 3196
    142  pmaddwd m1, [GLOBAL(pw_3196_m16069)]       ; x3 * 3196 - x0 * 16069
    143  pmaddwd m9, m10, [GLOBAL(pw_16069_3196)]
    144  pmaddwd m10, [GLOBAL(pw_3196_m16069)]
    145  paddd m7, m8
    146  paddd m1, m8
    147  paddd m9, m8
    148  paddd m10, m8
    149  psrad m7, 14
    150  psrad m1, 14
    151  psrad m9, 14
    152  psrad m10, 14
    153  packssdw m7, m9
    154  packssdw m1, m10
    155 
    156  ; sin(3 * pi / 16), cos(3 * pi / 16)
    157  punpcklwd m11, m0, m3      ; interleave (x2, x1), lanes 0-3
    158  punpckhwd m0, m3           ; interleave (x2, x1), lanes 4-7
    159  pmaddwd m9, m11, [GLOBAL(pw_9102_13623)]   ; x2 * 9102 + x1 * 13623
    160  pmaddwd m11, [GLOBAL(pw_13623_m9102)]      ; x2 * 13623 - x1 * 9102
    161  pmaddwd m3, m0, [GLOBAL(pw_9102_13623)]
    162  pmaddwd m0, [GLOBAL(pw_13623_m9102)]
    163  paddd m9, m8
    164  paddd m11, m8
    165  paddd m3, m8
    166  paddd m0, m8
    167  psrad m9, 14
    168  psrad m11, 14
    169  psrad m3, 14
    170  psrad m0, 14
    171  packssdw m9, m3
    172  packssdw m11, m0
    173 
    174  ; transpose: the eight rows are taken in the order m6, m7, m5, m11, m4, m9, m2, m1
    175  ; stage 1: interleave 16-bit lanes of adjacent row pairs
    176  punpcklwd m0, m6, m7
    177  punpcklwd m3, m5, m11
    178  punpckhwd m6, m7
    179  punpckhwd m5, m11
    180  punpcklwd m7, m4, m9
    181  punpcklwd m10, m2, m1
    182  punpckhwd m4, m9
    183  punpckhwd m2, m1
    184 
    185  ; stage 2: interleave 32-bit pairs
    186  punpckldq m9, m0, m3
    187  punpckldq m1, m6, m5
    188  punpckhdq m0, m3
    189  punpckhdq m6, m5
    190  punpckldq m3, m7, m10
    191  punpckldq m5, m4, m2
    192  punpckhdq m7, m10
    193  punpckhdq m4, m2
    194 
    195  ; stage 3: interleave 64-bit halves; result rows r0..r7 = m10, m9, m2, m0, m3, m1, m7, m6
    196  punpcklqdq m10, m9, m3
    197  punpckhqdq m9, m3
    198  punpcklqdq m2, m0, m7
    199  punpckhqdq m0, m7
    200  punpcklqdq m3, m1, m5
    201  punpckhqdq m1, m5
    202  punpcklqdq m7, m6, m4
    203  punpckhqdq m6, m4
    204 
    205  ; row transform: second 1-D pass, applied to the transposed data
    206  ; stage 1: s_k = r_k + r_(7-k), d_k = r_k - r_(7-k)
    207  paddw m5, m10, m6          ; m5  = s0
    208  psubw m10, m6              ; m10 = d0
    209 
    210  paddw m4, m9, m7           ; m4  = s1
    211  psubw m9, m7               ; m9  = d1
    212 
    213  paddw m6, m2, m1           ; m6  = s2
    214  psubw m2, m1               ; m2  = d2
    215 
    216  paddw m7, m0, m3           ; m7  = s3
    217  psubw m0, m3               ; m0  = d3
    218 
    219  ; stage 2
    220  paddw m1, m5, m7           ; m1 = s0 + s3
    221  psubw m5, m7               ; m5 = s0 - s3
    222 
    223  paddw m3, m4, m6           ; m3 = s1 + s2
    224  psubw m4, m6               ; m4 = s1 - s2
    225 
    226  paddw m7, m9, m2           ; m7 = d1 + d2
    227  psubw m9, m2               ; m9 = d1 - d2
    228 
    229  ; stage 3: here the final sum/difference is formed inside pmaddwd at 32 bits
    230  punpcklwd m6, m1, m3       ; interleave (s0 + s3, s1 + s2), lanes 0-3
    231  punpckhwd m1, m3           ; interleave (s0 + s3, s1 + s2), lanes 4-7
    232  pmaddwd m2, m6, [GLOBAL(pw_11585_11585)]   ; ((s0 + s3) + (s1 + s2)) * 11585
    233  pmaddwd m6, [GLOBAL(pw_11585_m11585)]      ; ((s0 + s3) - (s1 + s2)) * 11585
    234  pmaddwd m3, m1, [GLOBAL(pw_11585_11585)]
    235  pmaddwd m1, [GLOBAL(pw_11585_m11585)]
    236  paddd m2, m8
    237  paddd m6, m8
    238  paddd m3, m8
    239  paddd m1, m8
    240  psrad m2, 14
    241  psrad m6, 14
    242  psrad m3, 14
    243  psrad m1, 14
    244  packssdw m2, m3
    245  packssdw m6, m1
    246 
    247  pmulhrsw m7, m12           ; m7 = round((d1 + d2) * 11585 / 2^14)
    248  pmulhrsw m9, m12           ; m9 = round((d1 - d2) * 11585 / 2^14)
    249 
    250  punpcklwd m3, m5, m4       ; interleave (s0 - s3, s1 - s2), lanes 0-3
    251  punpckhwd m5, m4           ; interleave (s0 - s3, s1 - s2), lanes 4-7
    252  pmaddwd m1, m3, [GLOBAL(pw_15137_6270)]    ; (s0 - s3) * 15137 + (s1 - s2) * 6270
    253  pmaddwd m3, [GLOBAL(pw_6270_m15137)]       ; (s0 - s3) * 6270 - (s1 - s2) * 15137
    254  pmaddwd m4, m5, [GLOBAL(pw_15137_6270)]
    255  pmaddwd m5, [GLOBAL(pw_6270_m15137)]
    256  paddd m1, m8
    257  paddd m3, m8
    258  paddd m4, m8
    259  paddd m5, m8
    260  psrad m1, 14
    261  psrad m3, 14
    262  psrad m4, 14
    263  psrad m5, 14
    264  packssdw m1, m4
    265  packssdw m3, m5
    266 
    267  paddw m4, m0, m9           ; m4  = x0 = d3 + m9
    268  psubw m0, m9               ; m0  = x1 = d3 - m9
    269 
    270  paddw m5, m10, m7          ; m5  = x3 = d0 + m7
    271  psubw m10, m7              ; m10 = x2 = d0 - m7
    272 
    273  ; stage 4
    274  punpcklwd m9, m5, m4       ; interleave (x3, x0), lanes 0-3
    275  punpckhwd m5, m4           ; interleave (x3, x0), lanes 4-7
    276  pmaddwd m7, m9, [GLOBAL(pw_16069_3196)]    ; x3 * 16069 + x0 * 3196
    277  pmaddwd m9, [GLOBAL(pw_3196_m16069)]       ; x3 * 3196 - x0 * 16069
    278  pmaddwd m4, m5, [GLOBAL(pw_16069_3196)]
    279  pmaddwd m5, [GLOBAL(pw_3196_m16069)]
    280  paddd m7, m8
    281  paddd m9, m8
    282  paddd m4, m8
    283  paddd m5, m8
    284  psrad m7, 14
    285  psrad m9, 14
    286  psrad m4, 14
    287  psrad m5, 14
    288  packssdw m7, m4
    289  packssdw m9, m5
    290 
    291  punpcklwd m4, m10, m0      ; interleave (x2, x1), lanes 0-3
    292  punpckhwd m10, m0          ; interleave (x2, x1), lanes 4-7
    293  pmaddwd m5, m4, [GLOBAL(pw_9102_13623)]    ; x2 * 9102 + x1 * 13623
    294  pmaddwd m4, [GLOBAL(pw_13623_m9102)]       ; x2 * 13623 - x1 * 9102
    295  pmaddwd m0, m10, [GLOBAL(pw_9102_13623)]
    296  pmaddwd m10, [GLOBAL(pw_13623_m9102)]
    297  paddd m5, m8
    298  paddd m4, m8
    299  paddd m0, m8
    300  paddd m10, m8
    301  psrad m5, 14
    302  psrad m4, 14
    303  psrad m0, 14
    304  psrad m10, 14
    305  packssdw m5, m0
    306  packssdw m4, m10
    307 
    308  ; transpose: the eight rows are taken in the order m2, m7, m1, m4, m6, m5, m3, m9
    309  ; stage 1: interleave 16-bit lanes of adjacent row pairs
    310  punpcklwd m0, m2, m7
    311  punpcklwd m10, m1, m4
    312  punpckhwd m2, m7
    313  punpckhwd m1, m4
    314  punpcklwd m7, m6, m5
    315  punpcklwd m4, m3, m9
    316  punpckhwd m6, m5
    317  punpckhwd m3, m9
    318 
    319  ; stage 2: interleave 32-bit pairs
    320  punpckldq m5, m0, m10
    321  punpckldq m9, m2, m1
    322  punpckhdq m0, m10
    323  punpckhdq m2, m1
    324  punpckldq m10, m7, m4
    325  punpckldq m1, m6, m3
    326  punpckhdq m7, m4
    327  punpckhdq m6, m3
    328 
    329  ; stage 3: interleave 64-bit halves; result rows 0..7 = m4, m5, m3, m0, m10, m9, m7, m2
    330  punpcklqdq m4, m5, m10
    331  punpckhqdq m5, m10
    332  punpcklqdq m3, m0, m7
    333  punpckhqdq m0, m7
    334  punpcklqdq m10, m9, m1
    335  punpckhqdq m9, m1
    336  punpcklqdq m7, m2, m6
    337  punpckhqdq m2, m6
    338 
        ; Halve every result, rounding toward zero: x = (x - (x >> 15)) >> 1.
        ; m8 is no longer needed as the rounding bias and serves as scratch here.
    339  psraw m1, m4, 15           ; -1 in negative lanes, 0 elsewhere
    340  psraw m6, m5, 15
    341  psraw m8, m3, 15
    342  psraw m11, m0, 15
    343 
    344  psubw m4, m1               ; negative lanes get +1 before the shift
    345  psubw m5, m6
    346  psubw m3, m8
    347  psubw m0, m11
    348 
    349  psraw m4, 1
    350  psraw m5, 1
    351  psraw m3, 1
    352  psraw m0, 1
    353 
    354  psraw m1, m10, 15          ; same halving for rows 4-7
    355  psraw m6, m9, 15
    356  psraw m8, m7, 15
    357  psraw m11, m2, 15
    358 
    359  psubw m10, m1
    360  psubw m9, m6
    361  psubw m7, m8
    362  psubw m2, m11
    363 
    364  psraw m10, 1
    365  psraw m9, 1
    366  psraw m7, 1
    367  psraw m2, 1
    368 
        ; Store rows 0..7; the index advances by 8 coefficients (32 bytes) per row.
        ; STORE_OUTPUT overwrites m11 and m12.
    369  STORE_OUTPUT  0,  4
    370  STORE_OUTPUT  8,  5
    371  STORE_OUTPUT 16,  3
    372  STORE_OUTPUT 24,  0
    373  STORE_OUTPUT 32, 10
    374  STORE_OUTPUT 40,  9
    375  STORE_OUTPUT 48,  7
    376  STORE_OUTPUT 56,  2
    377 
    378  RET
    379 %endif