tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

simple_idct_armv6.S (13304B)


      1 /*
      2 * Simple IDCT
      3 *
      4 * Copyright (c) 2001 Michael Niedermayer <michaelni@gmx.at>
      5 * Copyright (c) 2007 Mans Rullgard <mans@mansr.com>
      6 *
      7 * This file is part of FFmpeg.
      8 *
      9 * FFmpeg is free software; you can redistribute it and/or
     10 * modify it under the terms of the GNU Lesser General Public
     11 * License as published by the Free Software Foundation; either
     12 * version 2.1 of the License, or (at your option) any later version.
     13 *
     14 * FFmpeg is distributed in the hope that it will be useful,
     15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     17 * Lesser General Public License for more details.
     18 *
     19 * You should have received a copy of the GNU Lesser General Public
     20 * License along with FFmpeg; if not, write to the Free Software
     21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     22 */
     23 
     24 #include "libavutil/arm/asm.S"
     25 
     26 #define W1  22725   /* cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
     27 #define W2  21407   /* cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
     28 #define W3  19266   /* cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
     29 #define W4  16383   /* cos(4*M_PI/16)*sqrt(2)*(1<<14) + 0.5 gives 16384; 16383 is used -- presumably intentional, TODO confirm */
     30 #define W5  12873   /* cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
     31 #define W6  8867    /* cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
     32 #define W7  4520    /* cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5 */
     33 #define ROW_SHIFT 11 /* right shift applied to the row-pass results (idct_row_armv6) */
     34 #define COL_SHIFT 20 /* right shift applied to the column-pass results (idct_col_*_armv6) */
     35 /* Weight pairs packed into one 32-bit word: WXY = WX in bits 0-15, WY in bits 16-31. */
     36 #define W13 (W1 | (W3 << 16))
     37 #define W26 (W2 | (W6 << 16)) /* not referenced by the code in this file view */
     38 #define W42 (W4 | (W2 << 16))
     39 #define W42n (-W4&0xffff | (-W2 << 16)) /* -W4 (masked to 16 bits) in the low half, -W2 in the high half */
     40 #define W46 (W4 | (W6 << 16))
     41 #define W57 (W5 | (W7 << 16))
     42 
     43 /*
     44  Compute partial IDCT of single row.
     45  shift = shift amount; used here only to form the rounding bias
         1 << (shift-1) that is added into A0..A3
     46  r0 = source address
     47  r2 = row[2,0] <= 2 cycles
     48  r3 = row[3,1]
     49  ip = w42      <= 2 cycles
     50 
         row[a,b] denotes a 32-bit word holding row[b] in bits 0-15 and
         row[a] in bits 16-31.  row[7,5] is loaded from [r0, #12] and
         row[6,4] from [r0, #4] inside the macro.

     51  Output in registers r4--r11:
           r4 = A0, r5 = A1, r6 = A2, r7 = A3 (rounding bias included)
           r8 = B0, r9 = -B1, r10 = B2, r11 = B3
         Clobbers r1, r2, r3, ip and lr.
     52 */
     53        .macro idct_row shift
     54        ldr    lr, =W46              /* lr  = W4 | (W6 << 16) */
     55        mov    r1, #(1<<(\shift-1))  /* r1  = rounding bias */
     56        smlad  r4, r2, ip, r1        /* r4  =  A0 = W4*row[0] + W2*row[2] + bias */
     57        smlsd  r7, r2, ip, r1        /* r7  =  A3 = W4*row[0] - W2*row[2] + bias */
     58        ldr    ip, =W13              /* ip  = W1 | (W3 << 16) */
     59        ldr    r10,=W57              /* r10 = W5 | (W7 << 16) */
     60        smlad  r5, r2, lr, r1        /* r5  =  A1 = W4*row[0] + W6*row[2] + bias */
     61        smlsd  r6, r2, lr, r1        /* r6  =  A2 = W4*row[0] - W6*row[2] + bias */
     62 
     63        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
     64        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
     65        ldr    lr, [r0, #12]         /* lr  =  row[7,5] */
     66        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
     67        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
     68        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
     69        smlad  r8, lr, r10,r8        /* B0  +=      W5*row[5] + W7*row[7] */
     70        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
     71 
     72        ldr    r3, =W42n             /* r3 =  -W4 | (-W2 << 16) */
     73        smlad  r10,lr, r2, r10       /* B2 +=  W7*row[5] + W3*row[7] */
     74        ldr    r2, [r0, #4]          /* r2 =   row[6,4] */
     75        smlsdx r11,lr, ip, r11       /* B3 +=  W3*row[5] - W1*row[7] */
     76        ldr    ip, =W46              /* ip =   W4 | (W6 << 16) */
     77        smlad  r9, lr, r1, r9        /* B1 -=  W1*row[5] + W5*row[7] */
     78 
     79        smlad  r5, r2, r3, r5        /* A1 += -W4*row[4] - W2*row[6] */
     80        smlsd  r6, r2, r3, r6        /* A2 += -W4*row[4] + W2*row[6] */
     81        smlad  r4, r2, ip, r4        /* A0 +=  W4*row[4] + W6*row[6] */
     82        smlsd  r7, r2, ip, r7        /* A3 +=  W4*row[4] - W6*row[6] */
     83        .endm
     84 
     85 /*
     86  Compute partial IDCT of half row.
         Same as idct_row with the row[4]..row[7] terms left out; the source
         buffer is not accessed.
     87  shift = shift amount; used here only to form the rounding bias
         1 << (shift-1) that is added into A0..A3
     88  r2 = row[2,0]
     89  r3 = row[3,1]
     90  ip = w42
     91 
     92  Output in registers r4--r11:
           r4 = A0, r5 = A1, r6 = A2, r7 = A3 (rounding bias included)
           r8 = B0, r9 = -B1, r10 = B2, r11 = B3
         Clobbers r1, r2, ip and lr; r3 is not written.
     93 */
     94        .macro idct_row4 shift
     95        ldr    lr, =W46              /* lr =  W4 | (W6 << 16) */
     96        ldr    r10,=W57              /* r10 = W5 | (W7 << 16) */
     97        mov    r1, #(1<<(\shift-1))  /* r1  = rounding bias */
     98        smlad  r4, r2, ip, r1        /* r4  =  A0 = W4*row[0] + W2*row[2] + bias */
     99        smlsd  r7, r2, ip, r1        /* r7  =  A3 = W4*row[0] - W2*row[2] + bias */
    100        ldr    ip, =W13              /* ip =  W1 | (W3 << 16) */
    101        smlad  r5, r2, lr, r1        /* r5  =  A1 = W4*row[0] + W6*row[2] + bias */
    102        smlsd  r6, r2, lr, r1        /* r6  =  A2 = W4*row[0] - W6*row[2] + bias */
    103        smusdx r11,r3, r10           /* r11 =  B3 = W7*row[1] - W5*row[3] */
    104        smuad  r8, r3, ip            /* r8  =  B0 = W1*row[1] + W3*row[3] */
    105        pkhtb  r2, ip, r10,asr #16   /* r2  =  W7 | (W3 << 16) */
    106        pkhbt  r1, ip, r10,lsl #16   /* r1  =  W1 | (W5 << 16) */
    107        smusdx r9, r2, r3            /* r9  = -B1 = W7*row[3] - W3*row[1] */
    108        smusdx r10,r3, r1            /* r10 =  B2 = W5*row[1] - W1*row[3] */
    109        .endm
    110 
    111 /*
    112  Compute final part of IDCT single row without shift.
    113  Input in registers r4--r11
         (r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3)
    114  Output in registers ip, r4--r6, lr, r8--r10
         ip, r4, r5, r6, r10, r9, r8, lr hold, in that order,
         A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2, A1-B1, A0-B0.
         r9 holds -B1 on input, hence the swapped sub/add for the A1 terms.
         Each input register is overwritten only after its last use, so the
         instruction order matters.  r7 and r11 are left unchanged.
    115 */
    116        .macro idct_finish
    117        add    ip, r4, r8            /* ip  = A0 + B0 */
    118        sub    lr, r4, r8            /* lr  = A0 - B0 */
    119        sub    r4, r5, r9            /* r4  = A1 + B1 */
    120        add    r8, r5, r9            /* r8  = A1 - B1 */
    121        add    r5, r6, r10           /* r5  = A2 + B2 */
    122        sub    r9, r6, r10           /* r9  = A2 - B2 */
    123        add    r6, r7, r11           /* r6  = A3 + B3 */
    124        sub    r10,r7, r11           /* r10 = A3 - B3 */
    125        .endm
    126 
    127 /*
    128  Compute final part of IDCT single row.
    129  shift = right-shift amount
    130  Input/output in registers r4--r11
         Input:  r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
         Output: r4..r7  = (A0+B0, A1+B1, A2+B2, A3+B3) >> shift
                 r8..r11 = (A0-B0, A1-B1, A2-B2, A3-B3) >> shift
         The shift is arithmetic (asr).  Clobbers r2 and r3.
    131 */
    132        .macro idct_finish_shift shift
    133        add    r3, r4, r8            /* r3 = A0 + B0 */
    134        sub    r2, r4, r8            /* r2 = A0 - B0 */
    135        mov    r4, r3, asr #\shift
    136        mov    r8, r2, asr #\shift
    137 
    138        sub    r3, r5, r9            /* r3 = A1 + B1 (r9 holds -B1) */
    139        add    r2, r5, r9            /* r2 = A1 - B1 */
    140        mov    r5, r3, asr #\shift
    141        mov    r9, r2, asr #\shift
    142 
    143        add    r3, r6, r10           /* r3 = A2 + B2 */
    144        sub    r2, r6, r10           /* r2 = A2 - B2 */
    145        mov    r6, r3, asr #\shift
    146        mov    r10,r2, asr #\shift
    147 
    148        add    r3, r7, r11           /* r3 = A3 + B3 */
    149        sub    r2, r7, r11           /* r2 = A3 - B3 */
    150        mov    r7, r3, asr #\shift
    151        mov    r11,r2, asr #\shift
    152        .endm
    153 
    154 /*
    155  Compute final part of IDCT single row, saturating results at 8 bits.
    156  shift = right-shift amount
    157  Input/output in registers r4--r11
         Input:  r4..r7 = A0..A3, r8 = B0, r9 = -B1, r10 = B2, r11 = B3
         Output: r4..r7  = usat8((A0+B0, A1+B1, A2+B2, A3+B3) >> shift)
                 r8..r11 = usat8((A0-B0, A1-B1, A2-B2, A3-B3) >> shift)
         where usat8 is the unsigned 8-bit saturation done by "usat #8"
         (result limited to 0..255).  Clobbers r3 and ip.
    158 */
    159        .macro idct_finish_shift_sat shift
    160        add    r3, r4, r8            /* r3 = A0 + B0 */
    161        sub    ip, r4, r8            /* ip = A0 - B0 */
    162        usat   r4, #8, r3, asr #\shift
    163        usat   r8, #8, ip, asr #\shift
    164 
    165        sub    r3, r5, r9            /* r3 = A1 + B1 (r9 holds -B1) */
    166        add    ip, r5, r9            /* ip = A1 - B1 */
    167        usat   r5, #8, r3, asr #\shift
    168        usat   r9, #8, ip, asr #\shift
    169 
    170        add    r3, r6, r10           /* r3 = A2 + B2 */
    171        sub    ip, r6, r10           /* ip = A2 - B2 */
    172        usat   r6, #8, r3, asr #\shift
    173        usat   r10,#8, ip, asr #\shift
    174 
    175        add    r3, r7, r11           /* r3 = A3 + B3 */
    176        sub    ip, r7, r11           /* ip = A3 - B3 */
    177        usat   r7, #8, r3, asr #\shift
    178        usat   r11,#8, ip, asr #\shift
    179        .endm
    180 
    181 /*
    182  Compute IDCT of single row, storing as column.
    183  r0 = source
    184  r1 = dest
         The source row is read as four words: row[2,0] at [r0], row[6,4] at
         [r0, #4], row[3,1] at [r0, #8] and row[7,5] at [r0, #12].
         The eight results are stored as halfwords 16 bytes apart; by
         increasing address they come from r4, r11, r5, r10, r6, r9, r7, r8.
         Shortcuts: if row[1]..row[7] are all zero every output is
         row[0] << 3; if only row[4]..row[7] are zero idct_row4 is used.
         r0 and r1 are unchanged on return; r2-r11 and ip may be overwritten.
    185 */
    186 function idct_row_armv6
    187        push   {lr}
    188 
    189        ldr    lr, [r0, #12]         /* lr = row[7,5] */
    190        ldr    ip, [r0, #4]          /* ip = row[6,4] */
    191        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
    192        ldr    r2, [r0]              /* r2 = row[2,0] */
    193        orrs   lr, lr, ip            /* lr = row[7,5] | row[6,4]; Z set if row[4..7] all zero */
    194        itt    eq                    /* the next two compares run only while Z is set */
    195        cmpeq  lr, r3                /* lr is 0 here: is row[3,1] zero too? */
    196        cmpeq  lr, r2, lsr #16       /* ... and row[2]? */
    197        beq    1f                    /* row[1..7] all zero: constant-output shortcut */
    198        push   {r1}                  /* idct_row / idct_row4 overwrite r1 */
    199        ldr    ip, =W42              /* ip = W4 | (W2 << 16) */
    200        cmp    lr, #0                /* row[4..7] all zero? */
    201        beq    2f                    /* yes: half-row variant suffices */
    202 
    203        idct_row   ROW_SHIFT
    204        b      3f
    205 
    206 2:      idct_row4  ROW_SHIFT
    207 
    208 3:      pop    {r1}
    209        idct_finish_shift ROW_SHIFT
    210 
    211        strh   r4, [r1]              /* (A0 + B0) >> ROW_SHIFT */
    212        strh   r5, [r1, #(16*2)]     /* (A1 + B1) >> ROW_SHIFT */
    213        strh   r6, [r1, #(16*4)]     /* (A2 + B2) >> ROW_SHIFT */
    214        strh   r7, [r1, #(16*6)]     /* (A3 + B3) >> ROW_SHIFT */
    215        strh   r11,[r1, #(16*1)]     /* (A3 - B3) >> ROW_SHIFT */
    216        strh   r10,[r1, #(16*3)]     /* (A2 - B2) >> ROW_SHIFT */
    217        strh   r9, [r1, #(16*5)]     /* (A1 - B1) >> ROW_SHIFT */
    218        strh   r8, [r1, #(16*7)]     /* (A0 - B0) >> ROW_SHIFT */
    219 
    220        pop    {pc}
    221 
         /* Shortcut path: the high half of r2 (row[2]) is zero here, so r2 is
            row[0] zero-extended and strh keeps the low 16 bits of row[0] << 3.
            Presumably this stands in for (W4*row[0]) >> ROW_SHIFT, since
            W4 / (1 << 11) is just under 8 -- TODO confirm. */
    222 1:      mov    r2, r2, lsl #3
    223        strh   r2, [r1]
    224        strh   r2, [r1, #(16*2)]
    225        strh   r2, [r1, #(16*4)]
    226        strh   r2, [r1, #(16*6)]
    227        strh   r2, [r1, #(16*1)]
    228        strh   r2, [r1, #(16*3)]
    229        strh   r2, [r1, #(16*5)]
    230        strh   r2, [r1, #(16*7)]
    231        pop    {pc}
    232 endfunc
    233 
    234 /*
    235  Compute IDCT of single column, read as row.
    236  r0 = source
    237  r1 = dest
         Always runs the full idct_row (no zero-coefficient shortcuts).
         The eight results are stored as halfwords 16 bytes apart in the order
         A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2, A1-B1, A0-B0, each shifted
         right by COL_SHIFT.
         r0 and r1 are unchanged on return; r2-r11 and ip are overwritten.
    238 */
    239 function idct_col_armv6
    240        push   {r1, lr}              /* idct_row overwrites r1 */
    241 
    242        ldr    r2, [r0]              /* r2 = row[2,0] */
    243        ldr    ip, =W42              /* ip = W4 | (W2 << 16) */
    244        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
    245        idct_row COL_SHIFT
    246        pop    {r1}
    247        idct_finish_shift COL_SHIFT
    248 
    249        strh   r4, [r1]              /* (A0 + B0) >> COL_SHIFT */
    250        strh   r5, [r1, #(16*1)]     /* (A1 + B1) >> COL_SHIFT */
    251        strh   r6, [r1, #(16*2)]     /* (A2 + B2) >> COL_SHIFT */
    252        strh   r7, [r1, #(16*3)]     /* (A3 + B3) >> COL_SHIFT */
    253        strh   r11,[r1, #(16*4)]     /* (A3 - B3) >> COL_SHIFT */
    254        strh   r10,[r1, #(16*5)]     /* (A2 - B2) >> COL_SHIFT */
    255        strh   r9, [r1, #(16*6)]     /* (A1 - B1) >> COL_SHIFT */
    256        strh   r8, [r1, #(16*7)]     /* (A0 - B0) >> COL_SHIFT */
    257 
    258        pop    {pc}
    259 endfunc
    260 
    261 /*
    262  Compute IDCT of single column, read as row, store saturated 8-bit.
    263  r0 = source
    264  r1 = dest
    265  r2 = line size
         The eight results, shifted right by COL_SHIFT and saturated to
         0..255, are written through strb_post in the order A0+B0, A1+B1,
         A2+B2, A3+B3, A3-B3, A2-B2, A1-B1, A0-B0.
         NOTE(review): strb_post is defined outside this file view; it
         presumably stores a byte at [r1] and then advances r1 by r2, which is
         consistent with the rewind of r1 by 8*r2 before returning -- confirm.
         r0 is never written and r2 is restored; r3-r11 and ip are overwritten.
    266 */
    267 function idct_col_put_armv6
    268        push   {r1, r2, lr}          /* idct_row overwrites r1 and r2 */
    269 
    270        ldr    r2, [r0]              /* r2 = row[2,0] */
    271        ldr    ip, =W42              /* ip = W4 | (W2 << 16) */
    272        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
    273        idct_row COL_SHIFT
    274        pop    {r1, r2}
    275        idct_finish_shift_sat COL_SHIFT
    276 
    277        strb_post r4, r1, r2
    278        strb_post r5, r1, r2
    279        strb_post r6, r1, r2
    280        strb_post r7, r1, r2
    281        strb_post r11,r1, r2
    282        strb_post r10,r1, r2
    283        strb_post r9, r1, r2
    284        strb_post r8, r1, r2
    285 
    286        sub    r1, r1, r2, lsl #3    /* r1 -= 8 * line size */
    287 
    288        pop    {pc}
    289 endfunc
    290 
    291 /*
    292  Compute IDCT of single column, read as row, add/store saturated 8-bit.
    293  r0 = source
    294  r1 = dest
    295  r2 = line size
         Each IDCT result is shifted right by COL_SHIFT, added to the byte
         already at the destination, saturated to 0..255 and written back.
         Loads of later destination bytes are interleaved with the stores of
         earlier ones.
         NOTE(review): strb_post is defined outside this file view; the
         per-line comments below assume it stores a byte at [r1] and then
         advances r1 by r2 (consistent with the final rewind by 8*r2) --
         confirm.  In those comments dK is the destination byte K lines below
         the initial r1 and oK is the K-th output of idct_finish.
         r0 is never written and r2 is restored; r3-r11 and ip are overwritten.
    296 */
    297 function idct_col_add_armv6
    298        push   {r1, r2, lr}          /* idct_row overwrites r1 and r2 */
    299 
    300        ldr    r2, [r0]              /* r2 = row[2,0] */
    301        ldr    ip, =W42              /* ip = W4 | (W2 << 16) */
    302        ldr    r3, [r0, #8]          /* r3 = row[3,1] */
    303        idct_row COL_SHIFT
    304        pop    {r1, r2}
    305        idct_finish                  /* o0..o7 = ip, r4, r5, r6, r10, r9, r8, lr */
    306 
    307        ldrb   r3, [r1]              /* r3 = d0 */
    308        ldrb   r7, [r1, r2]          /* r7 = d1 */
    310        add    ip, r3, ip, asr #COL_SHIFT
    311        usat   ip, #8, ip            /* ip = sat(d0 + o0) */
    312        add    r4, r7, r4, asr #COL_SHIFT
    313        strb_post ip, r1, r2         /* write line 0 */
    314        ldrb   ip, [r1, r2]          /* ip = d2 */
    315        usat   r4, #8, r4            /* r4 = sat(d1 + o1) */
    316        ldrb   r11,[r1, r2, lsl #2]  /* r11 = d5 */
    317        add    r5, ip, r5, asr #COL_SHIFT
    318        usat   r5, #8, r5            /* r5 = sat(d2 + o2) */
    319        strb_post r4, r1, r2         /* write line 1 */
    320        ldrb   r3, [r1, r2]          /* r3 = d3 */
    321        ldrb   ip, [r1, r2, lsl #2]  /* ip = d6 */
    322        strb_post r5, r1, r2         /* write line 2 */
    323        ldrb   r7, [r1, r2]          /* r7 = d4 */
    324        ldrb   r4, [r1, r2, lsl #2]  /* r4 = d7 */
    325        add    r6, r3, r6, asr #COL_SHIFT
    326        usat   r6, #8, r6            /* r6  = sat(d3 + o3) */
    327        add    r10,r7, r10,asr #COL_SHIFT
    328        usat   r10,#8, r10           /* r10 = sat(d4 + o4) */
    329        add    r9, r11,r9, asr #COL_SHIFT
    330        usat   r9, #8, r9            /* r9  = sat(d5 + o5) */
    331        add    r8, ip, r8, asr #COL_SHIFT
    332        usat   r8, #8, r8            /* r8  = sat(d6 + o6) */
    333        add    lr, r4, lr, asr #COL_SHIFT
    334        usat   lr, #8, lr            /* lr  = sat(d7 + o7) */
    335        strb_post r6, r1, r2         /* write lines 3..7 */
    336        strb_post r10,r1, r2
    337        strb_post r9, r1, r2
    338        strb_post r8, r1, r2
    339        strb_post lr, r1, r2
    340 
    341        sub    r1, r1, r2, lsl #3    /* r1 -= 8 * line size */
    342 
    343        pop    {pc}
    344 endfunc
    345 
    346 /*
    347  Compute 8 IDCT row transforms.
    348  func = IDCT row->col function
    349  width = width of columns in bytes
         r0 = source, r1 = destination for the first call.
         func is called eight times with r0 at byte offsets 0, 32, 64, 96, 16,
         48, 80, 112 from its initial value, while r1 advances by width before
         each subsequent call.  This relies on func returning with r0 and r1
         unchanged.
         On exit r0 is back at its initial value; r1 is left at its initial
         value + 7*width.  lr is overwritten by the bl instructions.
    350 */
    351        .macro idct_rows func width
    352        bl     \func
    353        add    r0, r0, #(16*2)
    354        add    r1, r1, #\width
    355        bl     \func
    356        add    r0, r0, #(16*2)
    357        add    r1, r1, #\width
    358        bl     \func
    359        add    r0, r0, #(16*2)
    360        add    r1, r1, #\width
    361        bl     \func
    362        sub    r0, r0, #(16*5)       /* from offset 96 back to offset 16 */
    363        add    r1, r1, #\width
    364        bl     \func
    365        add    r0, r0, #(16*2)
    366        add    r1, r1, #\width
    367        bl     \func
    368        add    r0, r0, #(16*2)
    369        add    r1, r1, #\width
    370        bl     \func
    371        add    r0, r0, #(16*2)
    372        add    r1, r1, #\width
    373        bl     \func
    374 
    375        sub    r0, r0, #(16*7)       /* restore r0 (offset 112 -> 0) */
    376        .endm
    377 
    378 /* void ff_simple_idct_armv6(int16_t *data); */
/*
 In-place transform of the block at r0 (data).
 A 128-byte scratch buffer on the stack receives the output of the row pass;
 the column pass then reads that buffer and writes back to data.
*/
    379 function ff_simple_idct_armv6, export=1
    380        push   {r4-r11, lr}
    381        sub    sp, sp, #128          /* scratch buffer for the row-pass output */
    382 
    383        mov    r1, sp                /* row pass: data -> scratch */
    384        idct_rows idct_row_armv6, 2
    385        mov    r1, r0                /* idct_rows restored r0, so r1 = data */
    386        mov    r0, sp                /* column pass: scratch -> data */
    387        idct_rows idct_col_armv6, 2
    388 
    389        add    sp, sp, #128
    390        pop    {r4-r11, pc}
    391 endfunc
    392 
    393 /* ff_simple_idct_add_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */
/*
 r0 = dest, r1 = line_size, r2 = data.
 Row pass writes into a 128-byte scratch buffer on the stack; the column pass
 (idct_col_add_armv6) then adds the results to the bytes at dest with 8-bit
 saturation, moving dest on by one byte per call.
*/
    394 function ff_simple_idct_add_armv6, export=1
    395        push   {r0, r1, r4-r11, lr}  /* dest and line_size are saved below r4 */
    396        sub    sp, sp, #128          /* scratch buffer for the row-pass output */
    397 
    398        mov    r0, r2                /* row pass: data -> scratch */
    399        mov    r1, sp
    400        idct_rows idct_row_armv6, 2
    401        mov    r0, sp                /* column pass reads the scratch buffer */
    402        ldr    r1, [sp, #128]        /* r1 = saved dest */
    403        ldr    r2, [sp, #(128+4)]    /* r2 = saved line_size */
    404        idct_rows idct_col_add_armv6, 1
    405 
    406        add    sp, sp, #(128+8)      /* drop scratch buffer and saved r0, r1 */
    407        pop    {r4-r11, pc}
    408 endfunc
    409 
    410 /* ff_simple_idct_put_armv6(uint8_t *dest, ptrdiff_t line_size, int16_t *data); */
/*
 r0 = dest, r1 = line_size, r2 = data.
 Row pass writes into a 128-byte scratch buffer on the stack; the column pass
 (idct_col_put_armv6) then stores the results at dest with 8-bit saturation,
 moving dest on by one byte per call.
*/
    411 function ff_simple_idct_put_armv6, export=1
    412        push   {r0, r1, r4-r11, lr}  /* dest and line_size are saved below r4 */
    413        sub    sp, sp, #128          /* scratch buffer for the row-pass output */
    414 
    415        mov    r0, r2                /* row pass: data -> scratch */
    416        mov    r1, sp
    417        idct_rows idct_row_armv6, 2
    418        mov    r0, sp                /* column pass reads the scratch buffer */
    419        ldr    r1, [sp, #128]        /* r1 = saved dest */
    420        ldr    r2, [sp, #(128+4)]    /* r2 = saved line_size */
    421        idct_rows idct_col_put_armv6, 1
    422 
    423        add    sp, sp, #(128+8)      /* drop scratch buffer and saved r0, r1 */
    424        pop    {r4-r11, pc}
    425 endfunc
    425 endfunc