tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

simple_idct10_template.asm (11371B)


      1 ;******************************************************************************
      2 ;* x86-SIMD-optimized IDCT for prores
      3 ;* this is identical to "simple" IDCT written by Michael Niedermayer
      4 ;* except for the clip range
      5 ;*
      6 ;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
      7 ;*
      8 ;* This file is part of FFmpeg.
      9 ;*
     10 ;* FFmpeg is free software; you can redistribute it and/or
     11 ;* modify it under the terms of the GNU Lesser General Public
     12 ;* License as published by the Free Software Foundation; either
     13 ;* version 2.1 of the License, or (at your option) any later version.
     14 ;*
     15 ;* FFmpeg is distributed in the hope that it will be useful,
     16 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
     17 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     18 ;* Lesser General Public License for more details.
     19 ;*
     20 ;* You should have received a copy of the GNU Lesser General Public
     21 ;* License along with FFmpeg; if not, write to the Free Software
     22 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     23 ;******************************************************************************
     24 
     25 ; add SECTION_RODATA and proper include before including this file!
     26 
     27 %if ARCH_X86_64
     28 
     29 %macro define_constants 1
           ; Select which suffixed set of coefficient tables the IDCT macros read.
           ; %1 = suffix; afterwards each of the eight names below expands to the
           ;      same name with that suffix appended.
           ; Only the names that involve w3 or w4 are remapped.  IDCT_1D also reads
           ; w1_plus_w5, w5_min_w1, w5_plus_w7 and w7_min_w5, which are left alone
           ; here -- presumably the including file defines those once, unsuffixed
           ; (TODO confirm against the including file).
           ;
           ; Remove any existing definition of each name first (e.g. one left by
           ; an earlier invocation with another suffix).
     30    %undef w4_plus_w2
     31    %undef w4_min_w2
     32    %undef w4_plus_w6
     33    %undef w4_min_w6
     34    %undef w1_plus_w3
     35    %undef w3_min_w1
     36    %undef w7_plus_w3
     37    %undef w3_min_w7
           ; Map each generic name onto its suffixed symbol.
     38    %define w4_plus_w2 w4_plus_w2%1
     39    %define w4_min_w2  w4_min_w2%1
     40    %define w4_plus_w6 w4_plus_w6%1
     41    %define w4_min_w6  w4_min_w6%1
     42    %define w1_plus_w3 w1_plus_w3%1
     43    %define w3_min_w1  w3_min_w1%1
     44    %define w7_plus_w3 w7_plus_w3%1
     45    %define w3_min_w7  w3_min_w7%1
     46 %endmacro
     47 
     48 ; interleave data while maintaining source: the result goes to a separate
        ; low/high destination pair, so both inputs survive as long as the
        ; destinations are different registers from the inputs
     49 ; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
        ; %1 is the size suffix appended to punpckl/punpckh (this file passes wd);
        ; %2-%5 are bare register numbers, the macro adds the m prefix itself.
     50 %macro SBUTTERFLY3 5
     51    punpckl%1   m%2, m%4, m%5      ; dstlo = low halves of src/interleave, interleaved
     52    punpckh%1   m%3, m%4, m%5      ; dsthi = high halves of src/interleave, interleaved
     53 %endmacro
     54 
     55 ; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
     56 ; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
     57 ;         %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
        ; Each register pair carries the [0-3] and [4-7] dword elements of one
        ; 8-element vector.  On exit %1 holds the packed sum and %3 the packed
        ; difference; %2 and %4 are left holding their shifted dwords, and %5/%6
        ; are only read.  The shift is arithmetic (psrad) and the pack saturates
        ; to signed words (packssdw).
     58 %macro SUMSUB_SHPK 7
     59    psubd       %3,  %1,  %5       ; { src1 - src2 }[0-3]  (a0 - b0 at the first call site)
     60    psubd       %4,  %2,  %6       ; { src1 - src2 }[4-7]
     61    paddd       %1,  %5            ; { src1 + src2 }[0-3]  (a0 + b0 at the first call site)
     62    paddd       %2,  %6            ; { src1 + src2 }[4-7]
     63    psrad       %1,  %7
     64    psrad       %2,  %7
     65    psrad       %3,  %7
     66    psrad       %4,  %7
     67    packssdw    %1,  %2            ; packed sum, 8 words (row[0] at the first call site)
     68    packssdw    %3,  %4            ; packed difference, 8 words (row[7] at the first call site)
     69 %endmacro
     70 
     71 ; %1 = initial bias: address of a word vector that is added to row[0] before
        ;      the multiplies.  If a string is passed instead (e.g. ""), no word
        ;      bias is added; the dword constant pd_round_<%2> is loaded and added
        ;      to a0-a3 after the multiplies.
     72 ; %2 = number of bits to shift at the end
     73 ; %3 = qmat (for prores); optional, rows 1/3/5/7 are multiplied by
        ;      [%3+16/48/80/112] right after they are loaded
        ;
        ; One 1-D IDCT pass over eight vectors, one per coefficient index.
        ; In:  m10/m8/m13/m12 = row[0]/row[2]/row[4]/row[6];
        ;      row[1]/row[3]/row[5]/row[7] are read from blockq+16/48/80/112.
        ; Out: m8, m0, m1, m2, m4, m11, m9, m10 = row[0]..row[7], packed to words.
        ; The 128 bytes at blockq are reused as scratch for a0-a3, so the block
        ; contents are overwritten.  Registers m0-m14 are all written (m15 too
        ; when %1 is a string).
     74 %macro IDCT_1D 2-3
     75    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
     76    ; a1 = a0;
     77    ; a2 = a0;
     78    ; a3 = a0;
     79    ; a0 += W2 * row[2];
     80    ; a1 += W6 * row[2];
     81    ; a2 -= W6 * row[2];
     82    ; a3 -= W2 * row[2];
     83 %ifstr %1
           ; string bias: keep the dword rounding constant for this shift in m15
     84    mova        m15, [pd_round_ %+ %2]
     85 %else
           ; word bias: fold it into row[0] before the multiplies
     86    paddw       m10, [%1]
     87 %endif
     88    SBUTTERFLY3 wd,  0,  1, 10,  8 ; { row[0], row[2] }[0-3]/[4-7]
     89    pmaddwd     m2,  m0, [w4_plus_w6]   ; a1[0-3]
     90    pmaddwd     m3,  m1, [w4_plus_w6]   ; a1[4-7]
     91    pmaddwd     m4,  m0, [w4_min_w6]    ; a2[0-3]
     92    pmaddwd     m5,  m1, [w4_min_w6]    ; a2[4-7]
     93    pmaddwd     m6,  m0, [w4_min_w2]    ; a3[0-3]
     94    pmaddwd     m7,  m1, [w4_min_w2]    ; a3[4-7]
     95    pmaddwd     m0, [w4_plus_w2]        ; a0[0-3]
     96    pmaddwd     m1, [w4_plus_w2]        ; a0[4-7]
     97 %ifstr %1
     98    ; Adding 1<<(%2-1) for >=15 bits values
     99    paddd       m2, m15
    100    paddd       m3, m15
    101    paddd       m4, m15
    102    paddd       m5, m15
    103    paddd       m6, m15
    104    paddd       m7, m15
    105    paddd       m0, m15
    106    paddd       m1, m15
    107 %endif
    108 
    109    ; a0: -1*row[0]-1*row[2]
    110    ; a1: -1*row[0]
    111    ; a2: -1*row[0]
    112    ; a3: -1*row[0]+1*row[2]
    113 
    114    ; a0 +=   W4*row[4] + W6*row[6]; i.e. -1*row[4]
    115    ; a1 -=   W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
    116    ; a2 -=   W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
    117    ; a3 +=   W4*row[4] - W6*row[6]; i.e. -1*row[4]
    118    SBUTTERFLY3 wd,  8,  9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
           ; m10/m11 are free from here on (row[0] has been consumed) and serve
           ; as temporaries for the row[4]/row[6] products
    119    pmaddwd     m10, m8, [w4_plus_w6]
    120    pmaddwd     m11, m9, [w4_plus_w6]
    121    paddd       m0,  m10            ; a0[0-3]
    122    paddd       m1,  m11            ; a0[4-7]
    123    pmaddwd     m10, m8, [w4_min_w6]
    124    pmaddwd     m11, m9, [w4_min_w6]
    125    paddd       m6,  m10           ; a3[0-3]
    126    paddd       m7,  m11           ; a3[4-7]
    127    pmaddwd     m10, m8, [w4_min_w2]
    128    pmaddwd     m11, m9, [w4_min_w2]
    129    pmaddwd     m8, [w4_plus_w2]
    130    pmaddwd     m9, [w4_plus_w2]
    131    psubd       m4,  m10           ; a2[0-3] intermediate
    132    psubd       m5,  m11           ; a2[4-7] intermediate
    133    psubd       m2,  m8            ; a1[0-3] intermediate
    134    psubd       m3,  m9            ; a1[4-7] intermediate
    135 
    136    ; load/store
           ; Park a0-a3 in the block buffer to free registers for the odd part.
           ; The [0-3] halves go to the even-row slots first; the [4-7] halves are
           ; written to the odd-row slots only after the odd rows have been loaded
           ; from those same offsets.
    137    mova   [blockq+  0], m0
    138    mova   [blockq+ 32], m2
    139    mova   [blockq+ 64], m4
    140    mova   [blockq+ 96], m6
    141    mova        m10,[blockq+ 16]       ; { row[1] }[0-7]
    142    mova        m8, [blockq+ 48]       ; { row[3] }[0-7]
    143    mova        m13,[blockq+ 80]       ; { row[5] }[0-7]
    144    mova        m14,[blockq+112]       ; { row[7] }[0-7]
    145    mova   [blockq+ 16], m1
    146    mova   [blockq+ 48], m3
    147    mova   [blockq+ 80], m5
    148    mova   [blockq+112], m7
    149 %if %0 == 3
           ; optional third argument: scale the odd rows by the matching qmat rows
    150    pmullw      m10,[%3+ 16]
    151    pmullw      m8, [%3+ 48]
    152    pmullw      m13,[%3+ 80]
    153    pmullw      m14,[%3+112]
    154 %endif
    155 
    156    ; b0 = MUL(W1, row[1]);
    157    ; MAC(b0, W3, row[3]);
    158    ; b1 = MUL(W3, row[1]);
    159    ; MAC(b1, -W7, row[3]);
    160    ; b2 = MUL(W5, row[1]);
    161    ; MAC(b2, -W1, row[3]);
    162    ; b3 = MUL(W7, row[1]);
    163    ; MAC(b3, -W5, row[3]);
    164    SBUTTERFLY3 wd,  0,  1, 10, 8  ; { row[1], row[3] }[0-3]/[4-7]
    165    pmaddwd     m2,  m0, [w3_min_w7]    ; b1[0-3]
    166    pmaddwd     m3,  m1, [w3_min_w7]    ; b1[4-7]
    167    pmaddwd     m4,  m0, [w5_min_w1]    ; b2[0-3]
    168    pmaddwd     m5,  m1, [w5_min_w1]    ; b2[4-7]
    169    pmaddwd     m6,  m0, [w7_min_w5]    ; b3[0-3]
    170    pmaddwd     m7,  m1, [w7_min_w5]    ; b3[4-7]
    171    pmaddwd     m0, [w1_plus_w3]        ; b0[0-3]
    172    pmaddwd     m1, [w1_plus_w3]        ; b0[4-7]
    173 
    174    ; b0: +1*row[1]+2*row[3]
    175    ; b1: +2*row[1]-1*row[3]
    176    ; b2: -1*row[1]-1*row[3]
    177    ; b3: +1*row[1]+1*row[3]
    178 
    179    ; MAC(b0,  W5, row[5]);
    180    ; MAC(b0,  W7, row[7]);
    181    ; MAC(b1, -W1, row[5]);
    182    ; MAC(b1, -W5, row[7]);
    183    ; MAC(b2,  W7, row[5]);
    184    ; MAC(b2,  W3, row[7]);
    185    ; MAC(b3,  W3, row[5]);
    186    ; MAC(b3, -W1, row[7]);
    187    SBUTTERFLY3 wd,  8,  9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
    188 
    189    ; b0: -1*row[5]+1*row[7]
    190    ; b1: -1*row[5]+1*row[7]
    191    ; b2: +1*row[5]+2*row[7]
    192    ; b3: +2*row[5]-1*row[7]
    193 
           ; m10-m13 are temporaries for the row[5]/row[7] products
    194    pmaddwd     m10, m8, [w1_plus_w5]
    195    pmaddwd     m11, m9, [w1_plus_w5]
    196    pmaddwd     m12, m8, [w5_plus_w7]
    197    pmaddwd     m13, m9, [w5_plus_w7]
    198    psubd       m2,  m10           ; b1[0-3]
    199    psubd       m3,  m11           ; b1[4-7]
    200    paddd       m0,  m12            ; b0[0-3]
    201    paddd       m1,  m13            ; b0[4-7]
    202    pmaddwd     m12, m8, [w7_plus_w3]
    203    pmaddwd     m13, m9, [w7_plus_w3]
    204    pmaddwd     m8, [w3_min_w1]
    205    pmaddwd     m9, [w3_min_w1]
    206    paddd       m4,  m12           ; b2[0-3]
    207    paddd       m5,  m13           ; b2[4-7]
    208    paddd       m6,  m8            ; b3[0-3]
    209    paddd       m7,  m9            ; b3[4-7]
    210 
    211    ; row[0] = (a0 + b0) >> 15;
    212    ; row[7] = (a0 - b0) >> 15;
    213    ; row[1] = (a1 + b1) >> 15;
    214    ; row[6] = (a1 - b1) >> 15;
    215    ; row[2] = (a2 + b2) >> 15;
    216    ; row[5] = (a2 - b2) >> 15;
    217    ; row[3] = (a3 + b3) >> 15;
    218    ; row[4] = (a3 - b3) >> 15;
           ; (the shift actually applied is the macro's second argument)
           ; Reload each parked a-term and combine it with its b-term.  Registers
           ; are recycled as soon as a b-term has been consumed, so the order of
           ; the four steps below matters.
    219    mova        m8, [blockq+ 0]        ; a0[0-3]
    220    mova        m9, [blockq+16]        ; a0[4-7]
    221    SUMSUB_SHPK m8,  m9,  m10, m11, m0,  m1,  %2 ; m8 = row[0], m10 = row[7]
    222    mova        m0, [blockq+32]        ; a1[0-3]
    223    mova        m1, [blockq+48]        ; a1[4-7]
    224    SUMSUB_SHPK m0,  m1,  m9,  m11, m2,  m3,  %2 ; m0 = row[1], m9 = row[6]
    225    mova        m1, [blockq+64]        ; a2[0-3]
    226    mova        m2, [blockq+80]        ; a2[4-7]
    227    SUMSUB_SHPK m1,  m2,  m11, m3,  m4,  m5,  %2 ; m1 = row[2], m11 = row[5]
    228    mova        m2, [blockq+96]        ; a3[0-3]
    229    mova        m3, [blockq+112]       ; a3[4-7]
    230    SUMSUB_SHPK m2,  m3,  m4,  m5,  m6,  m7,  %2 ; m2 = row[3], m4 = row[4]
    231 %endmacro
    232 
    233 ; void ff_prores_idct_put_10_<opt>(uint8_t *pixels, ptrdiff_t stride,
    234 ;                                  int16_t *block, const int16_t *qmat);
    235 
        ; Full 8x8 IDCT body: first 1-D pass, transpose, second 1-D pass, then an
        ; optional store.  Expects blockq to address the coefficient block; the
        ; "put" action additionally uses r0/r1 as destination/stride and
        ; overwrites r2 (NOTE(review): per the prototype above r2 is presumably
        ; the block pointer itself, which is no longer needed at that point --
        ; confirm in the including file).
        ; The parameters are forwarded to IDCT_1D as (bias, shift), so:
    236 ; %1 = row bias (IDCT_1D %1 for the first pass)
    237 ; %2 = row shift (IDCT_1D %2 for the first pass; 11 selects the DC-only shortcut when %8 is absent)
    238 ; %3 = column bias (IDCT_1D %1 for the second pass)
    239 ; %4 = column shift (IDCT_1D %2 for the second pass)
    240 ; %5 = final action (nothing, "store", "put"; "add" is not handled by this macro)
    241 ; %6 = min pixel value (literal 0, or the address of a word vector)
    242 ; %7 = max pixel value (address of a word vector)
    243 ; %8 = qmat (for prores)
    244 
    245 %macro IDCT_FN 4-8
    246    ; for (i = 0; i < 8; i++)
    247    ;     idctRowCondDC(block + i*8);
    248    mova        m10,[blockq+ 0]        ; { row[0] }[0-7]
    249    mova        m8, [blockq+32]        ; { row[2] }[0-7]
    250    mova        m13,[blockq+64]        ; { row[4] }[0-7]
    251    mova        m12,[blockq+96]        ; { row[6] }[0-7]
    252 
    253 %if %0 == 8
           ; qmat given: scale the even rows here; IDCT_1D scales the odd rows
           ; itself when it loads them
    254    pmullw      m10,[%8+ 0]
    255    pmullw      m8, [%8+32]
    256    pmullw      m13,[%8+64]
    257    pmullw      m12,[%8+96]
    258 
    259    IDCT_1D     %1, %2, %8
    260 %elif %2 == 11
    261    ; This copies the DC-only shortcut.  When there is only a DC coefficient the
    262    ; C shifts the value and splats it to all coeffs rather than multiplying and
    263    ; doing the full IDCT.  This causes a difference on 8-bit because the
    264    ; coefficient is 16383 rather than 16384 (which you can get with shifting).
           ; m1 = OR of every input vector except row[0]; a zero word lane means
           ; that lane has nothing but its row[0] value
    265    por      m1,  m8, m13
    266    por      m1,  m12
    267    por      m1, [blockq+ 16]       ; { row[1] }[0-7]
    268    por      m1, [blockq+ 48]       ; { row[3] }[0-7]
    269    por      m1, [blockq+ 80]       ; { row[5] }[0-7]
    270    por      m1, [blockq+112]       ; { row[7] }[0-7]
    271    pxor     m2,  m2
    272    pcmpeqw  m1,  m2                ; m1 = all-ones in DC-only lanes
    273    psllw    m2,  m10, 3            ; shortcut value: row[0] << 3
    274    pand     m2,  m1                ; ... kept only in DC-only lanes
    275    pcmpeqb  m3,  m3                ; m3 = all-ones
    276    pxor     m1,  m3                ; invert: m1 = all-ones in lanes needing the full IDCT
           ; IDCT_1D overwrites these registers, so both masks are parked in two
           ; 16-byte slots at rsp.  This macro does not adjust rsp itself; the
           ; including function has to provide that space.
    277    mova    [rsp],    m1
    278    mova    [rsp+16], m2
    279 
    280    IDCT_1D  %1,  %2
    281 
           ; for each of the eight outputs: keep the IDCT result in full lanes,
           ; substitute the shifted row[0] value in DC-only lanes
    282    mova     m5, [rsp]
    283    mova     m6, [rsp+16]
    284    pand     m8,  m5
    285    por      m8,  m6
    286    pand     m0,  m5
    287    por      m0,  m6
    288    pand     m1,  m5
    289    por      m1,  m6
    290    pand     m2,  m5
    291    por      m2,  m6
    292    pand     m4,  m5
    293    por      m4,  m6
    294    pand     m11, m5
    295    por      m11, m6
    296    pand     m9,  m5
    297    por      m9,  m6
    298    pand     m10, m5
    299    por      m10, m6
    300 %else
    301    IDCT_1D     %1, %2
    302 %endif
    303 
    304    ; transpose for second part of IDCT
           ; The register list is IDCT_1D's output order row[0]..row[7]; m3 is the
           ; one register not holding a result and is passed last (presumably as
           ; scratch -- TRANSPOSE8x8W comes from the include, confirm there).
    305    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
           ; odd rows go back to the block buffer, where IDCT_1D reads them
    306    mova   [blockq+ 16], m0
    307    mova   [blockq+ 48], m2
    308    mova   [blockq+ 80], m11
    309    mova   [blockq+112], m10
           ; Rename registers so that the even rows end up under the names
           ; IDCT_1D expects (m10/m8/m13/m12 = row[0]/[2]/[4]/[6]).  SWAP is
           ; provided by the include; presumably a pure assembly-time renaming --
           ; confirm there.
    310    SWAP         8,  10
    311    SWAP         1,   8
    312    SWAP         4,  13
    313    SWAP         9,  12
    314 
    315    ; for (i = 0; i < 8; i++)
    316    ;     idctSparseColAdd(dest + i, line_size, block + i);
    317    IDCT_1D     %3, %4
    318 
    319    ; clip/store
           ; With only four arguments, or with an action other than the two
           ; handled below, nothing is stored and the results stay in
           ; m8, m0, m1, m2, m4, m11, m9, m10.
    320 %if %0 >= 5
    321 %ifidn %5,"store"
    322    ; No clamping, means pure idct
    323    mova  [blockq+  0], m8
    324    mova  [blockq+ 16], m0
    325    mova  [blockq+ 32], m1
    326    mova  [blockq+ 48], m2
    327    mova  [blockq+ 64], m4
    328    mova  [blockq+ 80], m11
    329    mova  [blockq+ 96], m9
    330    mova  [blockq+112], m10
    331 %elifidn %5,"put"
           ; clamp every result to [min, max] as signed words, then write the
           ; eight vectors to r0 with a pitch of r1 bytes
    332 %ifidn %6, 0
    333    pxor        m3, m3                 ; minimum is the literal 0: no load needed
    334 %else
    335    mova        m3, [%6]
    336 %endif ; ifidn %6, 0
    337    mova        m5, [%7]
    338    pmaxsw      m8,  m3
    339    pmaxsw      m0,  m3
    340    pmaxsw      m1,  m3
    341    pmaxsw      m2,  m3
    342    pmaxsw      m4,  m3
    343    pmaxsw      m11, m3
    344    pmaxsw      m9,  m3
    345    pmaxsw      m10, m3
    346    pminsw      m8,  m5
    347    pminsw      m0,  m5
    348    pminsw      m1,  m5
    349    pminsw      m2,  m5
    350    pminsw      m4,  m5
    351    pminsw      m11, m5
    352    pminsw      m9,  m5
    353    pminsw      m10, m5
    354 
    355    lea         r2, [r1*3]             ; r2 = 3 * r1; the previous r2 value is lost
    356    mova  [r0     ], m8
    357    mova  [r0+r1  ], m0
    358    mova  [r0+r1*2], m1
    359    mova  [r0+r2  ], m2
    360    lea         r0, [r0+r1*4]          ; advance to the fifth output line
    361    mova  [r0     ], m4
    362    mova  [r0+r1  ], m11
    363    mova  [r0+r1*2], m9
    364    mova  [r0+r2  ], m10
    365 %endif ; %5 action
    366 %endif; if %0 >= 5
    367 %endmacro
    368 
    369 %endif