tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

yuv_row_arm.s (12348B)


/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

   .arch   armv7-a
   .fpu    neon
/* Allow building on targets that do not support NEON, and force the object
 * file target to avoid bumping the final binary's target architecture. */
   .object_arch armv4t
   .text
   .align

   .balign 64
YCbCr42xToRGB565_DITHER03_CONSTS_NEON:
   .short -14240
   .short -14240+384
   .short   8672
   .short   8672+192
   .short -17696
   .short -17696+384
   .byte 102
   .byte  25
   .byte  52
   .byte 129
YCbCr42xToRGB565_DITHER12_CONSTS_NEON:
   .short -14240+128
   .short -14240+256
   .short   8672+64
   .short   8672+128
   .short -17696+128
   .short -17696+256
   .byte 102
   .byte  25
   .byte  52
   .byte 129
YCbCr42xToRGB565_DITHER21_CONSTS_NEON:
   .short -14240+256
   .short -14240+128
   .short   8672+128
   .short   8672+64
   .short -17696+256
   .short -17696+128
   .byte 102
   .byte  25
   .byte  52
   .byte 129
YCbCr42xToRGB565_DITHER30_CONSTS_NEON:
   .short -14240+384
   .short -14240
   .short   8672+192
   .short   8672
   .short -17696+384
   .short -17696
   .byte 102
   .byte  25
   .byte  52
   .byte 129
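@ The six .short entries in each table above are two alternating per-pixel
@  biases each for R, G, and B, with an ordered-dither offset folded in; the
@  trailing bytes (102, 25, 52, 129) match the Cb/Cr coefficients used below.
@  With the 74/102/25/52/129 multipliers used by the routine, the base biases
@  work out to
@    bias_R = -(74*16 + 102*128)          = -14240
@    bias_G = -(74*16) + 25*128 + 52*128  =   8672
@    bias_B = -(74*16 + 129*128)          = -17696
@  i.e. roughly the BT.601 studio-swing YCbCr->RGB matrix scaled down by 4.
@  The +128/+256/+384 offsets (+64/+128/+192 for the 6-bit G channel) appear
@  to step the dither in quarter-LSB increments of the 5:6:5 output, and the
@  four DITHERxy tables are the four phases selected by the routine's dither
@  argument.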

@ void ScaleYCbCr42xToRGB565_BilinearY_Row_NEON(
@  yuv2rgb565_row_scale_bilinear_ctx *ctx, int dither);
@
@ ctx = {
@   uint16_t *rgb_row;       /*r0*/
@   const uint8_t *y_row;    /*r1*/
@   const uint8_t *u_row;    /*r2*/
@   const uint8_t *v_row;    /*r3*/
@   int y_yweight;           /*r4*/
@   int y_pitch;             /*r5*/
@   int width;               /*r6*/
@   int source_x0_q16;       /*r7*/
@   int source_dx_q16;       /*r8*/
@   int source_uv_xoffs_q16; /*r9*/
@ };
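@
@ In scalar terms, each output pixel i is computed roughly as follows (a
@  reference sketch only, with all blends rounded at 8 bits):
@
@   x  = source_x0_q16 + i*source_dx_q16;
@   xw = (x >> 8) & 0xFF;                        /* bilinear x weight       */
@   a  = y_row[x>>16];            b = y_row[(x>>16)+1];
@   c  = y_row[y_pitch+(x>>16)];  d = y_row[y_pitch+(x>>16)+1];
@   t  = a + (((c-a)*y_yweight + 128) >> 8);
@   u  = b + (((d-b)*y_yweight + 128) >> 8);
@   Y  = t + (((u-t)*xw         + 128) >> 8);
@   cx = (x + source_uv_xoffs_q16) >> 17;        /* nearest, half-res chroma */
@   Cb = u_row[cx];  Cr = v_row[cx];
@   R  = 74*Y + 102*Cr           + bias_R;       /* biases include dither   */
@   G  = 74*Y -  25*Cb -  52*Cr  + bias_G;
@   B  = 74*Y + 129*Cb           + bias_B;
@   rgb_row[i] = top 5 bits of R, 6 of G, 5 of B, each clamped (see below).
@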
   .global ScaleYCbCr42xToRGB565_BilinearY_Row_NEON
   .type   ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, %function
   .balign 64
   .fnstart
ScaleYCbCr42xToRGB565_BilinearY_Row_NEON:
   STMFD       r13!,{r4-r9,r14}       @ 8 words.
   ADR         r14,YCbCr42xToRGB565_DITHER03_CONSTS_NEON
   VPUSH       {Q4-Q7}                @ 16 words.
   ADD         r14,r14,r1, LSL #4     @ Select the dither table to use
   LDMIA       r0, {r0-r9}
   @ Set up image index registers.
   ADD         r12,r8, r8
   VMOV.I32    D16,#0         @ Q8 = < 2| 2| 0| 0>*source_dx_q16
   VDUP.32     D17,r12
   ADD         r12,r12,r12
   VTRN.32     D16,D17        @ Q8 = < 2| 0| 2| 0>*source_dx_q16
   VDUP.32     D19,r12        @ Q9 = < 4| 4| ?| ?>*source_dx_q16
   ADD         r12,r12,r12
   VDUP.32     Q0, r7         @ Q0 = < 1| 1| 1| 1>*source_x0_q16
   VADD.I32    D17,D17,D19    @ Q8 = < 6| 4| 2| 0>*source_dx_q16
   CMP         r8, #0                 @ If source_dx_q16 is negative...
   VDUP.32     Q9, r12        @ Q9 = < 8| 8| 8| 8>*source_dx_q16
   ADDLT       r7, r7, r8, LSL #4     @ Make r7 point to the end of the block
   VADD.I32    Q0, Q0, Q8     @ Q0 = < 6| 4| 2| 0>*source_dx_q16+source_x0_q16
   SUBLT       r7, r7, r8             @ (i.e., the lowest address we'll use)
   VADD.I32    Q1, Q0, Q9     @ Q1 = <14|12|10| 8>*source_dx_q16+source_x0_q16
   VDUP.I32    Q9, r8         @ Q9 = < 1| 1| 1| 1>*source_dx_q16
   VADD.I32    Q2, Q0, Q9     @ Q2 = < 7| 5| 3| 1>*source_dx_q16+source_x0_q16
   VADD.I32    Q3, Q1, Q9     @ Q3 = <15|13|11| 9>*source_dx_q16+source_x0_q16
   VLD1.64     {D30,D31},[r14,:128]   @ Load some constants
   VMOV.I8     D28,#52
   VMOV.I8     D29,#129
   @ The basic idea here is to do aligned loads of a block of data and then
   @  index into it using VTBL to extract the data from the source X
   @  coordinate corresponding to each destination pixel.
   @ This is significantly less code and significantly fewer cycles than doing
   @  a series of single-lane loads, but it means that the X step between
   @  pixels must be limited to 2.0 or less, otherwise we couldn't guarantee
   @  that we could read 8 pixels from a single aligned 32-byte block of data.
   @ Q0...Q3 contain the 16.16 fixed-point X coordinates of each pixel,
   @  separated into even pixels and odd pixels to make extracting offsets and
   @  weights easier.
   @ We then pull out two bytes from the middle of each coordinate: the top
   @  byte corresponds to the integer part of the X coordinate, and the bottom
   @  byte corresponds to the weight to use for bilinear blending.
   @ These are separated out into different registers with VTRN.
   @ Then by subtracting the integer X coordinate of the first pixel in the
   @  data block we loaded, we produce an index register suitable for use by
   @  VTBL.
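   @ For example, for x = 0x00032C00 (3.171875 in 16.16 fixed point), VRSHRN #8
   @  keeps bits 23..8 and yields 0x032C: the high byte 0x03 is the integer X
   @  (turned into a VTBL index by subtracting the block's base X), and the low
   @  byte 0x2C is the bilinear weight (44/256).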
s42xbily_neon_loop:
   @ Load the Y' data.
   MOV         r12,r7, ASR #16
   VRSHRN.S32  D16,Q0, #8
   AND         r12,r12,#~15   @ Read 16-byte aligned blocks
   VDUP.I8     D20,r12
   ADD         r12,r1, r12    @ r12 = y_row+(source_x&~15)
   VRSHRN.S32  D17,Q1, #8
   PLD         [r12,#64]
   VLD1.64     {D8, D9, D10,D11},[r12,:128],r5        @ Load Y' top row
   ADD         r14,r7, r8, LSL #3
   VRSHRN.S32  D18,Q2, #8
   MOV         r14,r14,ASR #16
   VRSHRN.S32  D19,Q3, #8
   AND         r14,r14,#~15   @ Read 16-byte aligned blocks
   VLD1.64     {D12,D13,D14,D15},[r12,:128]           @ Load Y' bottom row
   PLD         [r12,#64]
   VDUP.I8     D21,r14
   ADD         r14,r1, r14    @ r14 = y_row+(source_x&~15)
   VMOV.I8     Q13,#1
   PLD         [r14,#64]
   VTRN.8      Q8, Q9         @ Q8  = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
                              @ Q9  = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
   VSUB.S8     Q9, Q9, Q10    @ Make offsets relative to the data we loaded.
   @ First 8 Y' pixels
   VTBL.8      D20,{D8, D9, D10,D11},D18      @ Index top row at source_x
   VTBL.8      D24,{D12,D13,D14,D15},D18      @ Index bottom row at source_x
   VADD.S8     Q13,Q9, Q13                    @ Add 1 to source_x
   VTBL.8      D22,{D8, D9, D10,D11},D26      @ Index top row at source_x+1
   VTBL.8      D26,{D12,D13,D14,D15},D26      @ Index bottom row at source_x+1
   @ Next 8 Y' pixels
   VLD1.64     {D8, D9, D10,D11},[r14,:128],r5        @ Load Y' top row
   VLD1.64     {D12,D13,D14,D15},[r14,:128]           @ Load Y' bottom row
   PLD         [r14,#64]
   VTBL.8      D21,{D8, D9, D10,D11},D19      @ Index top row at source_x
   VTBL.8      D25,{D12,D13,D14,D15},D19      @ Index bottom row at source_x
   VTBL.8      D23,{D8, D9, D10,D11},D27      @ Index top row at source_x+1
   VTBL.8      D27,{D12,D13,D14,D15},D27      @ Index bottom row at source_x+1
   @ Blend Y'.
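   @ (With a,b the top-row samples at x and x+1 and c,d the bottom-row samples,
   @  this computes t = a+((c-a)*yweight+128>>8), u = b+((d-b)*yweight+128>>8),
   @  and finally Y' = t+((u-t)*xweight+128>>8), 16 pixels at a time.)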
   VDUP.I16    Q9, r4         @ Load the y weights.
   VSUBL.U8    Q4, D24,D20    @ Q5:Q4 = c-a
   VSUBL.U8    Q5, D25,D21
   VSUBL.U8    Q6, D26,D22    @ Q7:Q6 = d-b
   VSUBL.U8    Q7, D27,D23
   VMUL.S16    Q4, Q4, Q9     @ Q5:Q4 = (c-a)*yweight
   VMUL.S16    Q5, Q5, Q9
   VMUL.S16    Q6, Q6, Q9     @ Q7:Q6 = (d-b)*yweight
   VMUL.S16    Q7, Q7, Q9
   VMOVL.U8    Q12,D16        @ Promote the x weights to 16 bits.
   VMOVL.U8    Q13,D17        @ Sadly, there's no VMULW.
   VRSHRN.S16  D8, Q4, #8     @ Q4 = (c-a)*yweight+128>>8
   VRSHRN.S16  D9, Q5, #8
   VRSHRN.S16  D12,Q6, #8     @ Q6 = (d-b)*yweight+128>>8
   VRSHRN.S16  D13,Q7, #8
   VADD.I8     Q10,Q10,Q4     @ Q10 = a+((c-a)*yweight+128>>8)
   VADD.I8     Q11,Q11,Q6     @ Q11 = b+((d-b)*yweight+128>>8)
   VSUBL.U8    Q4, D22,D20    @ Q5:Q4 = b-a
   VSUBL.U8    Q5, D23,D21
   VMUL.S16    Q4, Q4, Q12    @ Q5:Q4 = (b-a)*xweight
   VMUL.S16    Q5, Q5, Q13
   VRSHRN.S16  D8, Q4, #8     @ Q4 = (b-a)*xweight+128>>8
   ADD         r12,r7, r9
   VRSHRN.S16  D9, Q5, #8
   MOV         r12,r12,ASR #17
   VADD.I8     Q8, Q10,Q4     @ Q8 = a+((b-a)*xweight+128>>8)
   @ Start extracting the chroma x coordinates, and load Cb and Cr.
   AND         r12,r12,#~15   @ Read 16-byte aligned blocks
   VDUP.I32    Q9, r9         @ Q9 = source_uv_xoffs_q16 x 4
   ADD         r14,r2, r12
   VADD.I32    Q10,Q0, Q9
   VLD1.64     {D8, D9, D10,D11},[r14,:128]   @ Load Cb
   PLD         [r14,#64]
   VADD.I32    Q11,Q1, Q9
   ADD         r14,r3, r12
   VADD.I32    Q12,Q2, Q9
   VLD1.64     {D12,D13,D14,D15},[r14,:128]   @ Load Cr
   PLD         [r14,#64]
   VADD.I32    Q13,Q3, Q9
   VRSHRN.S32  D20,Q10,#9     @ Q10 = <xEwExCwCxAwAx8w8x6w6x4w4x2w2x0w0>
   VRSHRN.S32  D21,Q11,#9
   VDUP.I8     Q9, r12
   VRSHRN.S32  D22,Q12,#9     @ Q11 = <xFwFxDwDxBwBx9w9x7w7x5w5x3w3x1w1>
   VRSHRN.S32  D23,Q13,#9
   @ We don't actually need the x weights, but we get them for free.
   @ Free ALU slot
   VTRN.8      Q10,Q11        @ Q10 = <wFwEwDwCwBwAw9w8w7w6w5w4w3w2w1w0>
   @ Free ALU slot            @ Q11 = <xFxExDxCxBxAx9x8x7x6x5x4x3x2x1x0>
   VSUB.S8     Q11,Q11,Q9     @ Make offsets relative to the data we loaded.
   VTBL.8      D18,{D8, D9, D10,D11},D22      @ Index Cb at source_x
   VMOV.I8     D24,#74
   VTBL.8      D19,{D8, D9, D10,D11},D23
   VMOV.I8     D26,#102
   VTBL.8      D20,{D12,D13,D14,D15},D22      @ Index Cr at source_x
   VMOV.I8     D27,#25
   VTBL.8      D21,{D12,D13,D14,D15},D23
   @ We now have Y' in Q8, Cb in Q9, and Cr in Q10
   @ We use VDUP to expand constants, because it's a permute instruction, so
   @  it can dual issue on the A8.
   SUBS        r6, r6, #16    @ width -= 16
   VMULL.U8    Q4, D16,D24    @  Q5:Q4  = Y'*74
   VDUP.32     Q6, D30[1]     @  Q7:Q6  = bias_G
   VMULL.U8    Q5, D17,D24
   VDUP.32     Q7, D30[1]
   VMLSL.U8    Q6, D18,D27    @  Q7:Q6  = -25*Cb+bias_G
   VDUP.32     Q11,D30[0]     @ Q12:Q11 = bias_R
   VMLSL.U8    Q7, D19,D27
   VDUP.32     Q12,D30[0]
   VMLAL.U8    Q11,D20,D26    @ Q12:Q11 = 102*Cr+bias_R
   VDUP.32     Q8, D31[0]     @ Q13:Q8  = bias_B
   VMLAL.U8    Q12,D21,D26
   VDUP.32     Q13,D31[0]
   VMLAL.U8    Q8, D18,D29    @ Q13:Q8  = 129*Cb+bias_B
   VMLAL.U8    Q13,D19,D29
   VMLSL.U8    Q6, D20,D28    @  Q7:Q6  = -25*Cb-52*Cr+bias_G
   VMLSL.U8    Q7, D21,D28
   VADD.S16    Q11,Q4, Q11    @ Q12:Q11 = 74*Y'+102*Cr+bias_R
   VADD.S16    Q12,Q5, Q12
   VQADD.S16   Q8, Q4, Q8     @ Q13:Q8  = 74*Y'+129*Cb+bias_B
   VQADD.S16   Q13,Q5, Q13
   VADD.S16    Q6, Q4, Q6     @  Q7:Q6  = 74*Y'-25*Cb-52*Cr+bias_G
   VADD.S16    Q7, Q5, Q7
   @ Push each value to the top of its word and saturate it.
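   @ (Full scale here is roughly 0..0x3FFF; the unsigned saturating shift left
   @  by 2 clamps negatives to 0 and overflow to 0xFFFF, leaving the 5
   @  significant bits of R and B (6 of G) at the top of each halfword so the
   @  VSRI inserts below can pack RRRRRGGGGGGBBBBB.)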
   VQSHLU.S16 Q11,Q11,#2
   VQSHLU.S16 Q12,Q12,#2
   VQSHLU.S16 Q6, Q6, #2
   VQSHLU.S16 Q7, Q7, #2
   VQSHLU.S16 Q8, Q8, #2
   VQSHLU.S16 Q13,Q13,#2
   @ Merge G and B into R.
   VSRI.U16   Q11,Q6, #5
   VSRI.U16   Q12,Q7, #5
   VSRI.U16   Q11,Q8, #11
   MOV         r14,r8, LSL #4
   VSRI.U16   Q12,Q13,#11
   BLT s42xbily_neon_tail
   VDUP.I32    Q13,r14
   @ Store the result.
   VST1.16     {D22,D23,D24,D25},[r0]!
   BEQ s42xbily_neon_done
   @ Advance the x coordinates.
   VADD.I32    Q0, Q0, Q13
   VADD.I32    Q1, Q1, Q13
   ADD         r7, r14
   VADD.I32    Q2, Q2, Q13
   VADD.I32    Q3, Q3, Q13
   B s42xbily_neon_loop
s42xbily_neon_tail:
   @ We have between 1 and 15 pixels left to write.
   @ -r6 == the number of pixels we need to skip writing.
   @ Adjust r0 to point to the last one we need to write, because we're going
   @  to write them in reverse order.
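   @ For example, with 13 pixels left r6 == -3, so the computed branch below
   @  skips the three stores for pixels 15..13 and the remaining stores walk
   @  r0 backwards (r14 == -2) from pixel 12 down to pixel 0.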
   ADD         r0, r0, r6, LSL #1
   MOV         r14,#-2
   ADD         r0, r0, #30
   @ Skip past the ones we don't need to write.
   SUB         PC, PC, r6, LSL #2
   ORR         r0, r0, r0     @ Never executed: padding so the computed branch
                              @  above skips exactly -r6 of the stores below.
   VST1.16     {D25[3]},[r0,:16],r14
   VST1.16     {D25[2]},[r0,:16],r14
   VST1.16     {D25[1]},[r0,:16],r14
   VST1.16     {D25[0]},[r0,:16],r14
   VST1.16     {D24[3]},[r0,:16],r14
   VST1.16     {D24[2]},[r0,:16],r14
   VST1.16     {D24[1]},[r0,:16],r14
   VST1.16     {D24[0]},[r0,:16],r14
   VST1.16     {D23[3]},[r0,:16],r14
   VST1.16     {D23[2]},[r0,:16],r14
   VST1.16     {D23[1]},[r0,:16],r14
   VST1.16     {D23[0]},[r0,:16],r14
   VST1.16     {D22[3]},[r0,:16],r14
   VST1.16     {D22[2]},[r0,:16],r14
   VST1.16     {D22[1]},[r0,:16],r14
   VST1.16     {D22[0]},[r0,:16]
s42xbily_neon_done:
   VPOP        {Q4-Q7}                @ 16 words.
   LDMFD       r13!,{r4-r9,PC}        @ 8 words.
   .fnend
   .size ScaleYCbCr42xToRGB565_BilinearY_Row_NEON, .-ScaleYCbCr42xToRGB565_BilinearY_Row_NEON

#if defined(__ELF__)&&defined(__linux__)
   .section .note.GNU-stack,"",%progbits
#endif