tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

simple_idct_arm.S (21694B)


      1 /*
      2 * Copyright (C) 2002 Frederic 'dilb' Boulay
      3 *
      4 * Author: Frederic Boulay <dilb@handhelds.org>
      5 *
      6 * The function defined in this file is derived from the simple_idct function
      7 * from the libavcodec library part of the FFmpeg project.
      8 *
      9 * This file is part of FFmpeg.
     10 *
     11 * FFmpeg is free software; you can redistribute it and/or
     12 * modify it under the terms of the GNU Lesser General Public
     13 * License as published by the Free Software Foundation; either
     14 * version 2.1 of the License, or (at your option) any later version.
     15 *
     16 * FFmpeg is distributed in the hope that it will be useful,
     17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
     18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
     19 * Lesser General Public License for more details.
     20 *
     21 * You should have received a copy of the GNU Lesser General Public
     22 * License along with FFmpeg; if not, write to the Free Software
     23 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
     24 */
     25 
     26 #include "libavutil/arm/asm.S"
     27 
     28 /* useful constants for the algorithm: W1..W7 are the integer multipliers used by the row and column passes below (presumably the usual fixed-point IDCT cosine factors -- confirm against the C simple_idct) */
     29 #define W1  22725
     30 #define W2  21407
     31 #define W3  19266
     32 #define W4  16383
     33 #define W5  12873
     34 #define W6  8867
     35 #define W7  4520
     36 #define MASK_MSHW 0xFFFF0000 /* selects the most significant half-word of a 32-bit word */
     37 
     38 #define ROW_SHIFT 11 /* right shift applied to the row pass results */
     39 #define ROW_SHIFT2MSHW (16-11) /* 16-ROW_SHIFT: left shift that, combined with MASK_MSHW, leaves (value>>ROW_SHIFT) in the upper half-word */
     40 #define COL_SHIFT 20 /* right shift applied to the column pass results */
     41 #define ROW_SHIFTED_1 1024 /* 1<< (ROW_SHIFT-1), added to a0 in the row pass before the final shift */
     42 #define COL_SHIFTED_1 524288 /* 1<< (COL_SHIFT-1), added to a0 in the column pass before the final shift */
     43 
     44 
     45 function ff_simple_idct_arm, export=1
     46        @@ void ff_simple_idct_arm(int16_t *block): in-place 8x8 IDCT of block, done as one pass over the 8 rows followed by one pass over the 8 columns.
     47        @@ save on the stack the registers that must be preserved (take all of them),
     48        @@ R0-R3 are scratch regs, so no need to save them, but R0 contains the pointer to block
     49        @@ so it must not be overwritten unless it has been saved first (it is stored in sp[0] below).
     50        @@ R12 is another scratch register, so it does not need to be saved either (it is never used in this function)
     51        @@ save R4-R11 and LR
     52        stmfd sp!, {r4-r11, r14} @ R14 is also called LR
     53        @@ at this point, R0=block, other registers are free.
     54        add r14, r0, #112        @ R14=&block[8*7], better start from the last row, and decrease the value until row=0, i.e. R14=block.
     55        @@ reserve 2 temporary slots on the stack; only sp[0] (R0=block) is actually written, sp[4] stays free
     56        sub sp, sp, #8          @ allow 2 local variables
     57        str r0, [sp, #0]        @ save block in sp[0]
     58        @@ stack status
     59        @@ sp+4   free
     60        @@ sp+0   R0  (block)
     61 
     62 
     63        @@ at this point, R0=block, R14=&block[56], R12 (unused), R1-R11 free
     64 
     65 
     66 __row_loop:
     67        @@ read the row and check if it is null, almost null, or not, according to strongarm specs, it is not necessary to optimize ldr accesses (i.e. split 32 bits in two 16-bit words), at least it gives more usable registers :)
     68        ldr r1, [r14, #0]        @ R1=(int32)(R14)[0]=ROWr32[0] (relative row cast to a 32b pointer)
     69        ldr r2, [r14, #4]        @ R2=(int32)(R14)[1]=ROWr32[1]
     70        ldr r3, [r14, #8]        @ R3=ROWr32[2]
     71        ldr r4, [r14, #12]       @ R4=ROWr32[3]
     72        @@ check if the words are null, if all of them are null, then proceed with next row (branch __end_row_loop),
     73        @@ if ROWr16[0] is the only one not null, then proceed with this special case (branch __almost_empty_row)
     74        @@ else follow the complete algorithm.
     75        @@ at this point, R0=block, R14=&block[n], R12 (unused), R1=ROWr32[0], R2=ROWr32[1],
     76        @@                R3=ROWr32[2], R4=ROWr32[3], R5-R11 free
     77        orr r5, r4, r3           @ R5=R4 | R3
     78        orr r5, r5, r2           @ R5=R4 | R3 | R2
     79        orrs r6, r5, r1          @ Test R5 | R1 (the aim is to check if everything is null)
     80        beq __end_row_loop
     81        mov r7, r1, asr #16      @ R7=R1>>16=ROWr16[1] (evaluate it now, as it could be useful later)
     82        ldrsh r6, [r14, #0]      @ R6=ROWr16[0]
     83        orrs r5, r5, r7          @ R5=R4 | R3 | R2 | R7
     84        beq __almost_empty_row
     85 
     86 @@ __b_evaluation:
     87        @@ at this point, R0=block (temp),  R1(free), R2=ROWr32[1], R3=ROWr32[2], R4=ROWr32[3],
     88        @@     R5=(temp), R6=ROWr16[0], R7=ROWr16[1], R8-R11 free,
     89        @@     R12 (unused), R14=&block[n]
     90        @@ to save some registers/calls, proceed with b0-b3 first, followed by a0-a3
     91 
     92        @@ MUL16(b0, W1, row[1]);
     93        @@ MUL16(b1, W3, row[1]);
     94        @@ MUL16(b2, W5, row[1]);
     95        @@ MUL16(b3, W7, row[1]);
     96        @@ MAC16(b0, W3, row[3]);
     97        @@ MAC16(b1, -W7, row[3]);
     98        @@ MAC16(b2, -W1, row[3]);
     99        @@ MAC16(b3, -W5, row[3]);
    100        ldr r8, =W1              @ R8=W1
    101        mov r2, r2, asr #16      @ R2=ROWr16[3]
    102        mul r0, r8, r7           @ R0=W1*ROWr16[1]=b0 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
    103        ldr r9, =W3              @ R9=W3
    104        ldr r10, =W5             @ R10=W5
    105        mul r1, r9, r7           @ R1=W3*ROWr16[1]=b1 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
    106        ldr r11, =W7             @ R11=W7
    107        mul r5, r10, r7          @ R5=W5*ROWr16[1]=b2 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
    108        mul r7, r11, r7          @ R7=W7*ROWr16[1]=b3 (ROWr16[1] must be the second arg, to have the possibility to save 1 cycle)
    109        teq r2, #0               @ if null avoid muls
    110        itttt ne
    111        mlane r0, r9, r2, r0     @ R0+=W3*ROWr16[3]=b0 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
    112        rsbne r2, r2, #0         @ R2=-ROWr16[3]
    113        mlane r1, r11, r2, r1    @ R1-=W7*ROWr16[3]=b1 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
    114        mlane r5, r8, r2, r5     @ R5-=W1*ROWr16[3]=b2 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
    115        it    ne
    116        mlane r7, r10, r2, r7    @ R7-=W5*ROWr16[3]=b3 (ROWr16[3] must be the second arg, to have the possibility to save 1 cycle)
    117 
    118        @@ at this point, R0=b0,  R1=b1, R2 (free), R3=ROWr32[2], R4=ROWr32[3],
    119        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
    120        @@     R12 (unused), R14=&block[n]
    121        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
    122        @@ if (temp != 0) {}
    123        orrs r2, r3, r4          @ R2=ROWr32[2] | ROWr32[3] (kept in R2: it is tested again in the a evaluation)
    124        beq __end_b_evaluation
    125 
    126        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3=ROWr32[2], R4=ROWr32[3],
    127        @@     R5=b2, R6=ROWr16[0], R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
    128        @@     R12 (unused), R14=&block[n]
    129        @@ MAC16(b0, W5, row[5]);
    130        @@ MAC16(b2, W7, row[5]);
    131        @@ MAC16(b3, W3, row[5]);
    132        @@ MAC16(b1, -W1, row[5]);
    133        @@ MAC16(b0, W7, row[7]);
    134        @@ MAC16(b2, W3, row[7]);
    135        @@ MAC16(b3, -W1, row[7]);
    136        @@ MAC16(b1, -W5, row[7]);
    137        mov r3, r3, asr #16      @ R3=ROWr16[5]
    138        teq r3, #0               @ if null avoid muls
    139        it    ne
    140        mlane r0, r10, r3, r0    @ R0+=W5*ROWr16[5]=b0
    141        mov r4, r4, asr #16      @ R4=ROWr16[7] (mov without S keeps the flags, so the IT block below still tests ROWr16[5])
    142        itttt ne
    143        mlane r5, r11, r3, r5    @ R5+=W7*ROWr16[5]=b2
    144        mlane r7, r9, r3, r7     @ R7+=W3*ROWr16[5]=b3
    145        rsbne r3, r3, #0         @ R3=-ROWr16[5]
    146        mlane r1, r8, r3, r1     @ R1-=W1*ROWr16[5]=b1
    147        @@ R3 is free now
    148        teq r4, #0               @ if null avoid muls
    149        itttt ne
    150        mlane r0, r11, r4, r0    @ R0+=W7*ROWr16[7]=b0
    151        mlane r5, r9, r4, r5     @ R5+=W3*ROWr16[7]=b2
    152        rsbne r4, r4, #0         @ R4=-ROWr16[7]
    153        mlane r7, r8, r4, r7     @ R7-=W1*ROWr16[7]=b3
    154        it    ne
    155        mlane r1, r10, r4, r1    @ R1-=W5*ROWr16[7]=b1
    156        @@ R4 is free now
    157 __end_b_evaluation:
    158        @@ at this point, R0=b0,  R1=b1, R2=ROWr32[2] | ROWr32[3] (tmp), R3 (free), R4 (free),
    159        @@     R5=b2, R6=ROWr16[0], R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
    160        @@     R12 (unused), R14=&block[n]
    161 
    162 @@ __a_evaluation:
    163        @@ a0 = (W4 * row[0]) + (1 << (ROW_SHIFT - 1));
    164        @@ a1 = a0 + W6 * row[2];
    165        @@ a2 = a0 - W6 * row[2];
    166        @@ a3 = a0 - W2 * row[2];
    167        @@ a0 = a0 + W2 * row[2];
    168        ldr r9, =W4              @ R9=W4
    169        mul r6, r9, r6           @ R6=W4*ROWr16[0]
    170        ldr r10, =W6             @ R10=W6
    171        ldrsh r4, [r14, #4]      @ R4=ROWr16[2] (a3 not defined yet)
    172        add r6, r6, #ROW_SHIFTED_1 @ R6=W4*ROWr16[0] + 1<<(ROW_SHIFT-1) (a0)
    173 
    174        mul r11, r10, r4         @ R11=W6*ROWr16[2]
    175        ldr r8, =W2              @ R8=W2
    176        sub r3, r6, r11          @ R3=a0-W6*ROWr16[2] (a2)
    177        @@ temp = ((uint32_t*)row)[2] | ((uint32_t*)row)[3];
    178        @@ if (temp != 0) {}
    179        teq r2, #0               @ R2 still holds ROWr32[2] | ROWr32[3] from the b evaluation
    180        beq __end_bef_a_evaluation @ row[4..7] all null: finish a0-a3 out of line, skipping the row[4]/row[6] terms
    181 
    182        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
    183        mul r11, r8, r4          @ R11=W2*ROWr16[2]
    184        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
    185        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
    186 
    187 
    188        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
    189        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
    190        @@     R12 (unused), R14=&block[n]
    191 
    192 
    193        @@ a0 += W4*row[4]
    194        @@ a1 -= W4*row[4]
    195        @@ a2 -= W4*row[4]
    196        @@ a3 += W4*row[4]
    197        ldrsh r11, [r14, #8]     @ R11=ROWr16[4]
    198        teq r11, #0              @ if null avoid muls
    199        it    ne
    200        mulne r11, r9, r11       @ R11=W4*ROWr16[4]
    201        @@ R9 is free now
    202        ldrsh r9, [r14, #12]     @ R9=ROWr16[6] (the load keeps the flags, so the IT block below still tests ROWr16[4])
    203        itttt ne
    204        addne r6, r6, r11        @ R6+=W4*ROWr16[4] (a0)
    205        subne r2, r2, r11        @ R2-=W4*ROWr16[4] (a1)
    206        subne r3, r3, r11        @ R3-=W4*ROWr16[4] (a2)
    207        addne r4, r4, r11        @ R4+=W4*ROWr16[4] (a3)
    208        @@ W6 alone (R10) is no longer needed once W6*ROWr16[6] is in R11, so R10 is reused for W2*ROWr16[6]
    209        teq r9, #0               @ if null avoid muls
    210        itttt ne
    211        mulne r11, r10, r9       @ R11=W6*ROWr16[6]
    212        addne r6, r6, r11        @ R6+=W6*ROWr16[6] (a0)
    213        mulne r10, r8, r9        @ R10=W2*ROWr16[6]
    214        @@ a0 += W6*row[6];
    215        @@ a3 -= W6*row[6];
    216        @@ a1 -= W2*row[6];
    217        @@ a2 += W2*row[6];
    218        subne r4, r4, r11        @ R4-=W6*ROWr16[6] (a3)
    219        itt   ne
    220        subne r2, r2, r10        @ R2-=W2*ROWr16[6] (a1)
    221        addne r3, r3, r10        @ R3+=W2*ROWr16[6] (a2)
    222 
    223 __end_a_evaluation:
    224        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
    225        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
    226        @@     R12 (unused), R14=&block[n]
    227        @@ row[0] = (a0 + b0) >> ROW_SHIFT;
    228        @@ row[1] = (a1 + b1) >> ROW_SHIFT;
    229        @@ row[2] = (a2 + b2) >> ROW_SHIFT;
    230        @@ row[3] = (a3 + b3) >> ROW_SHIFT;
    231        @@ row[4] = (a3 - b3) >> ROW_SHIFT;
    232        @@ row[5] = (a2 - b2) >> ROW_SHIFT;
    233        @@ row[6] = (a1 - b1) >> ROW_SHIFT;
    234        @@ row[7] = (a0 - b0) >> ROW_SHIFT;
    235        add r8, r6, r0           @ R8=a0+b0
    236        add r9, r2, r1           @ R9=a1+b1
    237        @@ put two 16-bit half-words in a 32-bit word
    238        @@ ROWr32[0]=ROWr16[0] | (ROWr16[1]<<16) (only little-endian compliant then!!!)
    239        ldr r10, =MASK_MSHW      @ R10=0xFFFF0000
    240        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a1+b1)<<5)
    241        mvn r11, r10             @ R11= NOT R10= 0x0000FFFF
    242        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a0+b0)>>11)
    243        orr r8, r8, r9           @ R8=row[0] | (row[1]<<16)
    244        str r8, [r14, #0]        @ ROWr32[0]=R8
    245 
    246        add r8, r3, r5           @ R8=a2+b2
    247        add r9, r4, r7           @ R9=a3+b3
    248        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a3+b3)<<5)
    249        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a2+b2)>>11)
    250        orr r8, r8, r9           @ R8=row[2] | (row[3]<<16)
    251        str r8, [r14, #4]        @ ROWr32[1]=R8
    252 
    253        sub r8, r4, r7           @ R8=a3-b3
    254        sub r9, r3, r5           @ R9=a2-b2
    255        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a2-b2)<<5)
    256        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a3-b3)>>11)
    257        orr r8, r8, r9           @ R8=row[4] | (row[5]<<16)
    258        str r8, [r14, #8]        @ ROWr32[2]=R8
    259 
    260        sub r8, r2, r1           @ R8=a1-b1
    261        sub r9, r6, r0           @ R9=a0-b0
    262        and r9, r10, r9, lsl #ROW_SHIFT2MSHW @ R9=0xFFFF0000 & ((a0-b0)<<5)
    263        and r8, r11, r8, asr #ROW_SHIFT @ R8=0x0000FFFF & ((a1-b1)>>11)
    264        orr r8, r8, r9           @ R8=row[6] | (row[7]<<16)
    265        str r8, [r14, #12]       @ ROWr32[3]=R8
    266 
    267        bal __end_row_loop       @ skip the almost-empty-row special case below
    268 
    269 __almost_empty_row:
    270        @@ the row is null except for ROWr16[0]; handle this special case here
    271        @@ at this point, R0=block, R14=&block[n], R12 (unused), R1=ROWr32[0], R2=ROWr32[1],
    272        @@                R3=ROWr32[2], R4=ROWr32[3], R5=(temp), R6=ROWr16[0], R7=ROWr16[1],
    273        @@                R8 (about to hold 0xFFFF), R9-R11 free
    274        mov r8, #0x10000         @ R8=0x10000 (2 steps needed to get 0xFFFF!) it saves a ldr call (because of delay run).
    275        sub r8, r8, #1           @ R8=0xFFFF, R8 is now ready.
    276        and r5, r8, r6, lsl #3   @ R5=R8 & (R6<<3)= (ROWr16[0]<<3) & 0xFFFF
    277        orr r5, r5, r5, lsl #16  @ R5=R5 | (R5<<16), i.e. the same 16-bit value in both half-words
    278        str r5, [r14, #0]        @ R14[0]=ROWr32[0]=R5
    279        str r5, [r14, #4]        @ R14[4]=ROWr32[1]=R5
    280        str r5, [r14, #8]        @ R14[8]=ROWr32[2]=R5
    281        str r5, [r14, #12]       @ R14[12]=ROWr32[3]=R5
    282 
    283 __end_row_loop:
    284        @@ at this point, R0-R11 (free)
    285        @@     R12 (unused), R14=&block[n]
    286        ldr r0, [sp, #0]         @ R0=block
    287        teq r0, r14              @ compare current &block[8*n] to block, when block is reached, the loop is finished.
    288        sub r14, r14, #16        @ step back one row (8 half-words = 16 bytes); no S suffix, so the teq flags survive for the bne
    289        bne __row_loop           @ loop again unless the row just processed was the one at block
    290 
    291 
    292 
    293        @@ at this point, R0=block, R1-R11 (free)
    294        @@     R12 (unused), R14=&block[n]
    295        add r14, r0, #14        @ R14=&block[7], better start from the last col, and decrease the value until col=0, i.e. R14=block.
    296 __col_loop:
    297 
    298 @@ __b_evaluation2:
    299        @@ at this point, R0=block (temp),  R1-R11 (free)
    300        @@     R12 (unused), R14=&block[n]
    301        @@ proceed with b0-b3 first, followed by a0-a3
    302        @@ MUL16(b0, W1, col[8x1]);
    303        @@ MUL16(b1, W3, col[8x1]);
    304        @@ MUL16(b2, W5, col[8x1]);
    305        @@ MUL16(b3, W7, col[8x1]);
    306        @@ MAC16(b0, W3, col[8x3]);
    307        @@ MAC16(b1, -W7, col[8x3]);
    308        @@ MAC16(b2, -W1, col[8x3]);
    309        @@ MAC16(b3, -W5, col[8x3]);
    310        ldr r8, =W1              @ R8=W1
    311        ldrsh r7, [r14, #16]     @ R7=COLr16[8x1]
    312        mul r0, r8, r7           @ R0=W1*COLr16[8x1]=b0 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
    313        ldr r9, =W3              @ R9=W3
    314        ldr r10, =W5             @ R10=W5
    315        mul r1, r9, r7           @ R1=W3*COLr16[8x1]=b1 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
    316        ldr r11, =W7             @ R11=W7
    317        mul r5, r10, r7          @ R5=W5*COLr16[8x1]=b2 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
    318        ldrsh r2, [r14, #48]     @ R2=COLr16[8x3]
    319        mul r7, r11, r7          @ R7=W7*COLr16[8x1]=b3 (COLr16[8x1] must be the second arg, to have the possibility to save 1 cycle)
    320        teq r2, #0               @ if 0, then avoid muls
    321        itttt ne
    322        mlane r0, r9, r2, r0     @ R0+=W3*COLr16[8x3]=b0 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
    323        rsbne r2, r2, #0         @ R2=-COLr16[8x3]
    324        mlane r1, r11, r2, r1    @ R1-=W7*COLr16[8x3]=b1 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
    325        mlane r5, r8, r2, r5     @ R5-=W1*COLr16[8x3]=b2 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
    326        it    ne
    327        mlane r7, r10, r2, r7    @ R7-=W5*COLr16[8x3]=b3 (COLr16[8x3] must be the second arg, to have the possibility to save 1 cycle)
    328 
    329        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
    330        @@     R5=b2, R6 (free), R7=b3, R8=W1, R9=W3, R10=W5, R11=W7,
    331        @@     R12 (unused), R14=&block[n]
    332        @@ MAC16(b0, W5, col[8x5]);
    333        @@ MAC16(b2, W7, col[8x5]);
    334        @@ MAC16(b3, W3, col[8x5]);
    335        @@ MAC16(b1, -W1, col[8x5]);
    336        @@ MAC16(b0, W7, col[8x7]);
    337        @@ MAC16(b2, W3, col[8x7]);
    338        @@ MAC16(b3, -W1, col[8x7]);
    339        @@ MAC16(b1, -W5, col[8x7]);
    340        ldrsh r3, [r14, #80]     @ R3=COLr16[8x5]
    341        teq r3, #0               @ if 0 then avoid muls
    342        itttt ne
    343        mlane r0, r10, r3, r0    @ R0+=W5*COLr16[8x5]=b0
    344        mlane r5, r11, r3, r5    @ R5+=W7*COLr16[8x5]=b2
    345        mlane r7, r9, r3, r7     @ R7+=W3*COLr16[8x5]=b3
    346        rsbne r3, r3, #0         @ R3=-COLr16[8x5]
    347        ldrsh r4, [r14, #112]    @ R4=COLr16[8x7] (the load keeps the flags, so the IT below still tests COLr16[8x5])
    348        it    ne
    349        mlane r1, r8, r3, r1     @ R1-=W1*COLr16[8x5]=b1
    350        @@ R3 is free now
    351        teq r4, #0               @ if 0 then avoid muls
    352        itttt ne
    353        mlane r0, r11, r4, r0    @ R0+=W7*COLr16[8x7]=b0
    354        mlane r5, r9, r4, r5     @ R5+=W3*COLr16[8x7]=b2
    355        rsbne r4, r4, #0         @ R4=-COLr16[8x7]
    356        mlane r7, r8, r4, r7     @ R7-=W1*COLr16[8x7]=b3
    357        it    ne
    358        mlane r1, r10, r4, r1    @ R1-=W5*COLr16[8x7]=b1
    359        @@ R4 is free now
    360 @@ __end_b_evaluation2:
    361        @@ at this point, R0=b0,  R1=b1, R2 (free), R3 (free), R4 (free),
    362        @@     R5=b2, R6 (free), R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
    363        @@     R12 (unused), R14=&block[n]
    364 
    365 @@ __a_evaluation2:
    366        @@ a0 = (W4 * col[8x0]) + (1 << (COL_SHIFT - 1));
    367        @@ a1 = a0 + W6 * col[8x2];
    368        @@ a2 = a0 - W6 * col[8x2];
    369        @@ a3 = a0 - W2 * col[8x2];
    370        @@ a0 = a0 + W2 * col[8x2];
    371        ldrsh r6, [r14, #0]      @ R6=COLr16[8x0]
    372        ldr r9, =W4              @ R9=W4
    373        mul r6, r9, r6           @ R6=W4*COLr16[8x0]
    374        ldr r10, =W6             @ R10=W6
    375        ldrsh r4, [r14, #32]     @ R4=COLr16[8x2] (a3 not defined yet)
    376        add r6, r6, #COL_SHIFTED_1 @ R6=W4*COLr16[8x0] + 1<<(COL_SHIFT-1) (a0)
    377        mul r11, r10, r4         @ R11=W6*COLr16[8x2]
    378        ldr r8, =W2              @ R8=W2
    379        add r2, r6, r11          @ R2=a0+W6*COLr16[8x2] (a1)
    380        sub r3, r6, r11          @ R3=a0-W6*COLr16[8x2] (a2)
    381        mul r11, r8, r4          @ R11=W2*COLr16[8x2]
    382        sub r4, r6, r11          @ R4=a0-W2*COLr16[8x2] (a3)
    383        add r6, r6, r11          @ R6=a0+W2*COLr16[8x2] (a0)
    384 
    385        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
    386        @@     R5=b2, R6=a0, R7=b3, R8=W2, R9=W4, R10=W6, R11 (free),
    387        @@     R12 (unused), R14=&block[n]
    388        @@ a0 += W4*col[8x4]
    389        @@ a1 -= W4*col[8x4]
    390        @@ a2 -= W4*col[8x4]
    391        @@ a3 += W4*col[8x4]
    392        ldrsh r11, [r14, #64]    @ R11=COLr16[8x4]
    393        teq r11, #0              @ if null avoid muls
    394        itttt ne
    395        mulne r11, r9, r11       @ R11=W4*COLr16[8x4]
    396        @@ R9 is free now
    397        addne r6, r6, r11        @ R6+=W4*COLr16[8x4] (a0)
    398        subne r2, r2, r11        @ R2-=W4*COLr16[8x4] (a1)
    399        subne r3, r3, r11        @ R3-=W4*COLr16[8x4] (a2)
    400        ldrsh r9, [r14, #96]     @ R9=COLr16[8x6] (the load keeps the flags, so the IT below still tests COLr16[8x4])
    401        it    ne
    402        addne r4, r4, r11        @ R4+=W4*COLr16[8x4] (a3)
    403        @@ W6 alone (R10) is no longer needed once W6*COLr16[8x6] is in R11, so R10 is reused for W2*COLr16[8x6]
    404        teq r9, #0               @ if null avoid muls
    405        itttt ne
    406        mulne r11, r10, r9       @ R11=W6*COLr16[8x6]
    407        addne r6, r6, r11        @ R6+=W6*COLr16[8x6] (a0)
    408        mulne r10, r8, r9        @ R10=W2*COLr16[8x6]
    409        @@ a0 += W6*col[8x6];
    410        @@ a3 -= W6*col[8x6];
    411        @@ a1 -= W2*col[8x6];
    412        @@ a2 += W2*col[8x6];
    413        subne r4, r4, r11        @ R4-=W6*COLr16[8x6] (a3)
    414        itt   ne
    415        subne r2, r2, r10        @ R2-=W2*COLr16[8x6] (a1)
    416        addne r3, r3, r10        @ R3+=W2*COLr16[8x6] (a2)
    417 @@ __end_a_evaluation2:
    418        @@ at this point, R0=b0,  R1=b1, R2=a1, R3=a2, R4=a3,
    419        @@     R5=b2, R6=a0, R7=b3, R8 (free), R9 (free), R10 (free), R11 (free),
    420        @@     R12 (unused), R14=&block[n]
    421        @@ col[0 ] = ((a0 + b0) >> COL_SHIFT);
    422        @@ col[8 ] = ((a1 + b1) >> COL_SHIFT);
    423        @@ col[16] = ((a2 + b2) >> COL_SHIFT);
    424        @@ col[24] = ((a3 + b3) >> COL_SHIFT);
    425        @@ col[32] = ((a3 - b3) >> COL_SHIFT);
    426        @@ col[40] = ((a2 - b2) >> COL_SHIFT);
    427        @@ col[48] = ((a1 - b1) >> COL_SHIFT);
    428        @@ col[56] = ((a0 - b0) >> COL_SHIFT);
    429        @@@@@ no optimization here @@@@@
    430        add r8, r6, r0           @ R8=a0+b0
    431        add r9, r2, r1           @ R9=a1+b1
    432        mov r8, r8, asr #COL_SHIFT
    433        mov r9, r9, asr #COL_SHIFT
    434        strh r8, [r14, #0]       @ col[0 ]
    435        strh r9, [r14, #16]      @ col[8 ]
    436        add r8, r3, r5           @ R8=a2+b2
    437        add r9, r4, r7           @ R9=a3+b3
    438        mov r8, r8, asr #COL_SHIFT
    439        mov r9, r9, asr #COL_SHIFT
    440        strh r8, [r14, #32]      @ col[16]
    441        strh r9, [r14, #48]      @ col[24]
    442        sub r8, r4, r7           @ R8=a3-b3
    443        sub r9, r3, r5           @ R9=a2-b2
    444        mov r8, r8, asr #COL_SHIFT
    445        mov r9, r9, asr #COL_SHIFT
    446        strh r8, [r14, #64]      @ col[32]
    447        strh r9, [r14, #80]      @ col[40]
    448        sub r8, r2, r1           @ R8=a1-b1
    449        sub r9, r6, r0           @ R9=a0-b0
    450        mov r8, r8, asr #COL_SHIFT
    451        mov r9, r9, asr #COL_SHIFT
    452        strh r8, [r14, #96]      @ col[48]
    453        strh r9, [r14, #112]     @ col[56]
    454 
    455 @@ __end_col_loop:
    456        @@ at this point, R0-R11 (free)
    457        @@     R12 (unused), R14=&block[n]
    458        ldr r0, [sp, #0]         @ R0=block
    459        teq r0, r14              @ compare current &block[n] to block, when block is reached, the loop is finished.
    460        sub r14, r14, #2         @ step back one column (one half-word = 2 bytes); no S suffix, so the teq flags survive for the bne
    461        bne __col_loop           @ loop again unless the column just processed was the one at block
    462 
    463 
    464 
    465 
    466 @@ __end_simple_idct_arm:
    467        @@ restore registers to previous status!
    468        add sp, sp, #8 @@ release the 2 local variable slots
    469        ldmfd sp!, {r4-r11, r15} @@ restore R4-R11 and update PC with the saved LR content, which returns to the caller.
    470 
    471 
    472 
    473 @@ kind of sub-function, kept out of line so as not to overload the common case: reached from the row pass when ROWr32[2] | ROWr32[3] is null; on entry R6=a0 (before the row[2] terms), R11=W6*ROWr16[2], R8=W2, R4=ROWr16[2], and R3=a2 is already computed.
    474 __end_bef_a_evaluation:
    475        add r2, r6, r11          @ R2=a0+W6*ROWr16[2] (a1)
    476        mul r11, r8, r4          @ R11=W2*ROWr16[2]
    477        sub r4, r6, r11          @ R4=a0-W2*ROWr16[2] (a3)
    478        add r6, r6, r11          @ R6=a0+W2*ROWr16[2] (a0)
    479        bal __end_a_evaluation   @ rejoin the row pass at the output stage, skipping the row[4]/row[6] terms
    480 endfunc