tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jidctint-sse2.asm (34577B)


      1 ;
      2 ; jidctint.asm - accurate integer IDCT (64-bit SSE2)
      3 ;
      4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
      5 ; Copyright (C) 2009, 2016, 2020, 2024, D. R. Commander.
      6 ; Copyright (C) 2018, Matthias Räncker.
      7 ; Copyright (C) 2023, Aliaksiej Kandracienka.
      8 ;
      9 ; Based on the x86 SIMD extension for IJG JPEG library
     10 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
     11 ; For conditions of distribution and use, see copyright notice in jsimdext.inc
     12 ;
     13 ; This file should be assembled with NASM (Netwide Assembler) or Yasm.
     14 ;
     15 ; This file contains a slower but more accurate integer implementation of the
     16 ; inverse DCT (Discrete Cosine Transform). The following code is based
     17 ; directly on the IJG's original jidctint.c; see jidctint.c for
     18 ; more details.
     19 
     20 %include "jsimdext.inc"
     21 %include "jdct.inc"
     22 
     23 ; --------------------------------------------------------------------------
     24 
     25 %define CONST_BITS  13                 ; fractional bits of the F_* multipliers below
     26 %define PASS1_BITS  2                  ; extra bits kept in pass 1 results (see DESCALE_P1)
     27 ; DESCALE_P1 / DESCALE_P2 are the psrad shift counts applied to pass 1 / pass 2 results.
     28 %define DESCALE_P1  (CONST_BITS - PASS1_BITS)
     29 %define DESCALE_P2  (CONST_BITS + PASS1_BITS + 3)
     30 ; F_* = FIX(x): the real value named in each comment, scaled by 2^CONST_BITS and rounded.
     31 %if CONST_BITS == 13
     32 F_0_298 equ  2446  ; FIX(0.298631336)
     33 F_0_390 equ  3196  ; FIX(0.390180644)
     34 F_0_541 equ  4433  ; FIX(0.541196100)
     35 F_0_765 equ  6270  ; FIX(0.765366865)
     36 F_0_899 equ  7373  ; FIX(0.899976223)
     37 F_1_175 equ  9633  ; FIX(1.175875602)
     38 F_1_501 equ 12299  ; FIX(1.501321110)
     39 F_1_847 equ 15137  ; FIX(1.847759065)
     40 F_1_961 equ 16069  ; FIX(1.961570560)
     41 F_2_053 equ 16819  ; FIX(2.053119869)
     42 F_2_562 equ 20995  ; FIX(2.562915447)
     43 F_3_072 equ 25172  ; FIX(3.072711026)
     44 %else
     45 ; NASM cannot do compile-time arithmetic on floating-point constants, so each value below is given pre-scaled by 2^30 and DESCALE() rounds it down to CONST_BITS fractional bits.
     46 %define DESCALE(x, n)  (((x) + (1 << ((n) - 1))) >> (n))
     47 F_0_298 equ DESCALE( 320652955, 30 - CONST_BITS)  ; FIX(0.298631336)
     48 F_0_390 equ DESCALE( 418953276, 30 - CONST_BITS)  ; FIX(0.390180644)
     49 F_0_541 equ DESCALE( 581104887, 30 - CONST_BITS)  ; FIX(0.541196100)
     50 F_0_765 equ DESCALE( 821806413, 30 - CONST_BITS)  ; FIX(0.765366865)
     51 F_0_899 equ DESCALE( 966342111, 30 - CONST_BITS)  ; FIX(0.899976223)
     52 F_1_175 equ DESCALE(1262586813, 30 - CONST_BITS)  ; FIX(1.175875602)
     53 F_1_501 equ DESCALE(1612031267, 30 - CONST_BITS)  ; FIX(1.501321110)
     54 F_1_847 equ DESCALE(1984016188, 30 - CONST_BITS)  ; FIX(1.847759065)
     55 F_1_961 equ DESCALE(2106220350, 30 - CONST_BITS)  ; FIX(1.961570560)
     56 F_2_053 equ DESCALE(2204520673, 30 - CONST_BITS)  ; FIX(2.053119869)
     57 F_2_562 equ DESCALE(2751909506, 30 - CONST_BITS)  ; FIX(2.562915447)
     58 F_3_072 equ DESCALE(3299298341, 30 - CONST_BITS)  ; FIX(3.072711026)
     59 %endif
     60 
     61 ; --------------------------------------------------------------------------
     62    SECTION     SEG_CONST
     63 
     64    ALIGNZ      32
     65    GLOBAL_DATA(jconst_idct_islow_sse2)
     66 
     67 EXTN(jconst_idct_islow_sse2):
     68 ; PW_*: (c0, c1) word pairs repeated 4x, used as pmaddwd multipliers on word-interleaved inputs.  PD_*: rounding terms added before psrad.
     69 PW_F130_F054   times 4  dw  (F_0_541 + F_0_765),  F_0_541  ; even part: tmp3 from (z2, z3)
     70 PW_F054_MF130  times 4  dw  F_0_541, (F_0_541 - F_1_847)  ; even part: tmp2 from (z2, z3)
     71 PW_MF078_F117  times 4  dw  (F_1_175 - F_1_961),  F_1_175  ; odd part: z3 from (z3, z4)
     72 PW_F117_F078   times 4  dw  F_1_175, (F_1_175 - F_0_390)  ; odd part: z4 from (z3, z4)
     73 PW_MF060_MF089 times 4  dw  (F_0_298 - F_0_899), -F_0_899  ; odd part: tmp0 from (tmp0, tmp3)
     74 PW_MF089_F060  times 4  dw -F_0_899, (F_1_501 - F_0_899)  ; odd part: tmp3 from (tmp0, tmp3)
     75 PW_MF050_MF256 times 4  dw  (F_2_053 - F_2_562), -F_2_562  ; odd part: tmp1 from (tmp1, tmp2)
     76 PW_MF256_F050  times 4  dw -F_2_562, (F_3_072 - F_2_562)  ; odd part: tmp2 from (tmp1, tmp2)
     77 PD_DESCALE_P1  times 4  dd  1 << (DESCALE_P1 - 1)  ; half of 2^DESCALE_P1: added before the pass 1 psrad
     78 PD_DESCALE_P2  times 4  dd  1 << (DESCALE_P2 - 1)  ; half of 2^DESCALE_P2: added before the pass 2 psrad
     79 PB_CENTERJSAMP times 16 db  CENTERJSAMPLE  ; CENTERJSAMPLE replicated into all 16 bytes
     80 
     81    ALIGNZ      32
     82 
     83 ; --------------------------------------------------------------------------
     84    SECTION     SEG_TEXT
     85    BITS        64
     86 ;
     87 ; Perform dequantization and inverse DCT on one block of coefficients.
     88 ;
     89 ; GLOBAL(void)
     90 ; jsimd_idct_islow_sse2(void *dct_table, JCOEFPTR coef_block,
     91 ;                       JSAMPARRAY output_buf, JDIMENSION output_col)
     92 ;
     93 
     94 ; r10 = void *dct_table
     95 ; r11 = JCOEFPTR coef_block
     96 ; r12 = JSAMPARRAY output_buf
     97 ; r13d = JDIMENSION output_col
     98 
     99 %define wk(i)         r15 - (WK_NUM - (i)) * SIZEOF_XMMWORD
    100                                        ; xmmword wk[WK_NUM]
    101 %define WK_NUM        12
    102 
    103    align       32
    104    GLOBAL_FUNCTION(jsimd_idct_islow_sse2)
    105 
    106 EXTN(jsimd_idct_islow_sse2):
    107    ENDBR64
    108    push        rbp
    109    mov         rbp, rsp
    110    push        r15
    111    and         rsp, byte (-SIZEOF_XMMWORD)  ; align to 128 bits
    112    ; Allocate stack space for wk array.  r15 is used to access it.
    113    mov         r15, rsp
    114    sub         rsp, (SIZEOF_XMMWORD * WK_NUM)
    115    COLLECT_ARGS 4
    116 
    117    ; ---- Pass 1: process columns from input.
    118 
    119    mov         rdx, r10                ; quantptr
    120    mov         rsi, r11                ; inptr
    121 
    122 %ifndef NO_ZERO_COLUMN_TEST_ISLOW_SSE2
    123    mov         eax, dword [DWBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    124    or          eax, dword [DWBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    125    jnz         near .columnDCT
    126 
    127    movdqa      xmm0, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    128    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    129    por         xmm0, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    130    por         xmm1, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    131    por         xmm0, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    132    por         xmm1, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    133    por         xmm0, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    134    por         xmm1, xmm0
    135    packsswb    xmm1, xmm1
    136    packsswb    xmm1, xmm1
    137    movd        eax, xmm1
    138    test        rax, rax
    139    jnz         short .columnDCT
    140 
    141    ; -- AC terms all zero
    142 
    143    movdqa      xmm5, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    144    pmullw      xmm5, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    145 
    146    psllw       xmm5, PASS1_BITS
    147 
    148    movdqa      xmm4, xmm5              ; xmm5=in0=(00 01 02 03 04 05 06 07)
    149    punpcklwd   xmm5, xmm5              ; xmm5=(00 00 01 01 02 02 03 03)
    150    punpckhwd   xmm4, xmm4              ; xmm4=(04 04 05 05 06 06 07 07)
    151 
    152    pshufd      xmm7, xmm5, 0x00        ; xmm7=col0=(00 00 00 00 00 00 00 00)
    153    pshufd      xmm6, xmm5, 0x55        ; xmm6=col1=(01 01 01 01 01 01 01 01)
    154    pshufd      xmm1, xmm5, 0xAA        ; xmm1=col2=(02 02 02 02 02 02 02 02)
    155    pshufd      xmm5, xmm5, 0xFF        ; xmm5=col3=(03 03 03 03 03 03 03 03)
    156    pshufd      xmm0, xmm4, 0x00        ; xmm0=col4=(04 04 04 04 04 04 04 04)
    157    pshufd      xmm3, xmm4, 0x55        ; xmm3=col5=(05 05 05 05 05 05 05 05)
    158    pshufd      xmm2, xmm4, 0xAA        ; xmm2=col6=(06 06 06 06 06 06 06 06)
    159    pshufd      xmm4, xmm4, 0xFF        ; xmm4=col7=(07 07 07 07 07 07 07 07)
    160 
    161    movdqa      XMMWORD [wk(8)], xmm6   ; wk(8)=col1
    162    movdqa      XMMWORD [wk(9)], xmm5   ; wk(9)=col3
    163    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
    164    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
    165    jmp         near .column_end
    166 %endif
    167 .columnDCT:
    168 
    169    ; -- Even part
    170 
    171    movdqa      xmm0, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_JCOEF)]
    172    movdqa      xmm1, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_JCOEF)]
    173    pmullw      xmm0, XMMWORD [XMMBLOCK(0,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    174    pmullw      xmm1, XMMWORD [XMMBLOCK(2,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    175    movdqa      xmm2, XMMWORD [XMMBLOCK(4,0,rsi,SIZEOF_JCOEF)]
    176    movdqa      xmm3, XMMWORD [XMMBLOCK(6,0,rsi,SIZEOF_JCOEF)]
    177    pmullw      xmm2, XMMWORD [XMMBLOCK(4,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    178    pmullw      xmm3, XMMWORD [XMMBLOCK(6,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    179 
    180    ; (Original)
    181    ; z1 = (z2 + z3) * 0.541196100;
    182    ; tmp2 = z1 + z3 * -1.847759065;
    183    ; tmp3 = z1 + z2 * 0.765366865;
    184    ;
    185    ; (This implementation)
    186    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
    187    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
    188 
    189    movdqa      xmm4, xmm1              ; xmm1=in2=z2
    190    movdqa      xmm5, xmm1
    191    punpcklwd   xmm4, xmm3              ; xmm3=in6=z3
    192    punpckhwd   xmm5, xmm3
    193    movdqa      xmm1, xmm4
    194    movdqa      xmm3, xmm5
    195    pmaddwd     xmm4, [rel PW_F130_F054]   ; xmm4=tmp3L
    196    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
    197    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
    198    pmaddwd     xmm3, [rel PW_F054_MF130]  ; xmm3=tmp2H
    199 
    200    movdqa      xmm6, xmm0
    201    paddw       xmm0, xmm2              ; xmm0=in0+in4
    202    psubw       xmm6, xmm2              ; xmm6=in0-in4
    203 
    204    pxor        xmm7, xmm7
    205    pxor        xmm2, xmm2
    206    punpcklwd   xmm7, xmm0              ; xmm7=tmp0L
    207    punpckhwd   xmm2, xmm0              ; xmm2=tmp0H
    208    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
    209    psrad       xmm2, (16-CONST_BITS)   ; psrad xmm2,16 & pslld xmm2,CONST_BITS
    210 
    211    movdqa      xmm0, xmm7
    212    paddd       xmm7, xmm4              ; xmm7=tmp10L
    213    psubd       xmm0, xmm4              ; xmm0=tmp13L
    214    movdqa      xmm4, xmm2
    215    paddd       xmm2, xmm5              ; xmm2=tmp10H
    216    psubd       xmm4, xmm5              ; xmm4=tmp13H
    217 
    218    movdqa      XMMWORD [wk(0)], xmm7   ; wk(0)=tmp10L
    219    movdqa      XMMWORD [wk(1)], xmm2   ; wk(1)=tmp10H
    220    movdqa      XMMWORD [wk(2)], xmm0   ; wk(2)=tmp13L
    221    movdqa      XMMWORD [wk(3)], xmm4   ; wk(3)=tmp13H
    222 
    223    pxor        xmm5, xmm5
    224    pxor        xmm7, xmm7
    225    punpcklwd   xmm5, xmm6              ; xmm5=tmp1L
    226    punpckhwd   xmm7, xmm6              ; xmm7=tmp1H
    227    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
    228    psrad       xmm7, (16-CONST_BITS)   ; psrad xmm7,16 & pslld xmm7,CONST_BITS
    229 
    230    movdqa      xmm2, xmm5
    231    paddd       xmm5, xmm1              ; xmm5=tmp11L
    232    psubd       xmm2, xmm1              ; xmm2=tmp12L
    233    movdqa      xmm0, xmm7
    234    paddd       xmm7, xmm3              ; xmm7=tmp11H
    235    psubd       xmm0, xmm3              ; xmm0=tmp12H
    236 
    237    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
    238    movdqa      XMMWORD [wk(5)], xmm7   ; wk(5)=tmp11H
    239    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=tmp12L
    240    movdqa      XMMWORD [wk(7)], xmm0   ; wk(7)=tmp12H
    241 
    242    ; -- Odd part
    243 
    244    movdqa      xmm4, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_JCOEF)]
    245    movdqa      xmm6, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_JCOEF)]
    246    pmullw      xmm4, XMMWORD [XMMBLOCK(1,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    247    pmullw      xmm6, XMMWORD [XMMBLOCK(3,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    248    movdqa      xmm1, XMMWORD [XMMBLOCK(5,0,rsi,SIZEOF_JCOEF)]
    249    movdqa      xmm3, XMMWORD [XMMBLOCK(7,0,rsi,SIZEOF_JCOEF)]
    250    pmullw      xmm1, XMMWORD [XMMBLOCK(5,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    251    pmullw      xmm3, XMMWORD [XMMBLOCK(7,0,rdx,SIZEOF_ISLOW_MULT_TYPE)]
    252 
    253    movdqa      xmm5, xmm6
    254    movdqa      xmm7, xmm4
    255    paddw       xmm5, xmm3              ; xmm5=z3
    256    paddw       xmm7, xmm1              ; xmm7=z4
    257 
    258    ; (Original)
    259    ; z5 = (z3 + z4) * 1.175875602;
    260    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    261    ; z3 += z5;  z4 += z5;
    262    ;
    263    ; (This implementation)
    264    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    265    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
    266 
    267    movdqa      xmm2, xmm5
    268    movdqa      xmm0, xmm5
    269    punpcklwd   xmm2, xmm7
    270    punpckhwd   xmm0, xmm7
    271    movdqa      xmm5, xmm2
    272    movdqa      xmm7, xmm0
    273    pmaddwd     xmm2, [rel PW_MF078_F117]  ; xmm2=z3L
    274    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3H
    275    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
    276    pmaddwd     xmm7, [rel PW_F117_F078]   ; xmm7=z4H
    277 
    278    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=z3L
    279    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=z3H
    280 
    281    ; (Original)
    282    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
    283    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
    284    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
    285    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    286    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
    287    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
    288    ;
    289    ; (This implementation)
    290    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
    291    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
    292    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
    293    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
    294    ; tmp0 += z3;  tmp1 += z4;
    295    ; tmp2 += z3;  tmp3 += z4;
    296 
    297    movdqa      xmm2, xmm3
    298    movdqa      xmm0, xmm3
    299    punpcklwd   xmm2, xmm4
    300    punpckhwd   xmm0, xmm4
    301    movdqa      xmm3, xmm2
    302    movdqa      xmm4, xmm0
    303    pmaddwd     xmm2, [rel PW_MF060_MF089]  ; xmm2=tmp0L
    304    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0H
    305    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3L
    306    pmaddwd     xmm4, [rel PW_MF089_F060]   ; xmm4=tmp3H
    307 
    308    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp0L
    309    paddd       xmm0, XMMWORD [wk(11)]  ; xmm0=tmp0H
    310    paddd       xmm3, xmm5              ; xmm3=tmp3L
    311    paddd       xmm4, xmm7              ; xmm4=tmp3H
    312 
    313    movdqa      XMMWORD [wk(8)], xmm2   ; wk(8)=tmp0L
    314    movdqa      XMMWORD [wk(9)], xmm0   ; wk(9)=tmp0H
    315 
    316    movdqa      xmm2, xmm1
    317    movdqa      xmm0, xmm1
    318    punpcklwd   xmm2, xmm6
    319    punpckhwd   xmm0, xmm6
    320    movdqa      xmm1, xmm2
    321    movdqa      xmm6, xmm0
    322    pmaddwd     xmm2, [rel PW_MF050_MF256]  ; xmm2=tmp1L
    323    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1H
    324    pmaddwd     xmm1, [rel PW_MF256_F050]   ; xmm1=tmp2L
    325    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H
    326 
    327    paddd       xmm2, xmm5              ; xmm2=tmp1L
    328    paddd       xmm0, xmm7              ; xmm0=tmp1H
    329    paddd       xmm1, XMMWORD [wk(10)]  ; xmm1=tmp2L
    330    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
    331 
    332    movdqa      XMMWORD [wk(10)], xmm2  ; wk(10)=tmp1L
    333    movdqa      XMMWORD [wk(11)], xmm0  ; wk(11)=tmp1H
    334 
    335    ; -- Final output stage
    336 
    337    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
    338    movdqa      xmm7, XMMWORD [wk(1)]   ; xmm7=tmp10H
    339 
    340    movdqa      xmm2, xmm5
    341    movdqa      xmm0, xmm7
    342    paddd       xmm5, xmm3              ; xmm5=data0L
    343    paddd       xmm7, xmm4              ; xmm7=data0H
    344    psubd       xmm2, xmm3              ; xmm2=data7L
    345    psubd       xmm0, xmm4              ; xmm0=data7H
    346 
    347    movdqa      xmm3, [rel PD_DESCALE_P1]  ; xmm3=[rel PD_DESCALE_P1]
    348 
    349    paddd       xmm5, xmm3
    350    paddd       xmm7, xmm3
    351    psrad       xmm5, DESCALE_P1
    352    psrad       xmm7, DESCALE_P1
    353    paddd       xmm2, xmm3
    354    paddd       xmm0, xmm3
    355    psrad       xmm2, DESCALE_P1
    356    psrad       xmm0, DESCALE_P1
    357 
    358    packssdw    xmm5, xmm7              ; xmm5=data0=(00 01 02 03 04 05 06 07)
    359    packssdw    xmm2, xmm0              ; xmm2=data7=(70 71 72 73 74 75 76 77)
    360 
    361    movdqa      xmm4, XMMWORD [wk(4)]   ; xmm4=tmp11L
    362    movdqa      xmm3, XMMWORD [wk(5)]   ; xmm3=tmp11H
    363 
    364    movdqa      xmm7, xmm4
    365    movdqa      xmm0, xmm3
    366    paddd       xmm4, xmm1              ; xmm4=data1L
    367    paddd       xmm3, xmm6              ; xmm3=data1H
    368    psubd       xmm7, xmm1              ; xmm7=data6L
    369    psubd       xmm0, xmm6              ; xmm0=data6H
    370 
    371    movdqa      xmm1, [rel PD_DESCALE_P1]  ; xmm1=[rel PD_DESCALE_P1]
    372 
    373    paddd       xmm4, xmm1
    374    paddd       xmm3, xmm1
    375    psrad       xmm4, DESCALE_P1
    376    psrad       xmm3, DESCALE_P1
    377    paddd       xmm7, xmm1
    378    paddd       xmm0, xmm1
    379    psrad       xmm7, DESCALE_P1
    380    psrad       xmm0, DESCALE_P1
    381 
    382    packssdw    xmm4, xmm3              ; xmm4=data1=(10 11 12 13 14 15 16 17)
    383    packssdw    xmm7, xmm0              ; xmm7=data6=(60 61 62 63 64 65 66 67)
    384 
    385    movdqa      xmm6, xmm5              ; transpose coefficients(phase 1)
    386    punpcklwd   xmm5, xmm4              ; xmm5=(00 10 01 11 02 12 03 13)
    387    punpckhwd   xmm6, xmm4              ; xmm6=(04 14 05 15 06 16 07 17)
    388    movdqa      xmm1, xmm7              ; transpose coefficients(phase 1)
    389    punpcklwd   xmm7, xmm2              ; xmm7=(60 70 61 71 62 72 63 73)
    390    punpckhwd   xmm1, xmm2              ; xmm1=(64 74 65 75 66 76 67 77)
    391 
    392    movdqa      xmm3, XMMWORD [wk(6)]   ; xmm3=tmp12L
    393    movdqa      xmm0, XMMWORD [wk(7)]   ; xmm0=tmp12H
    394    movdqa      xmm4, XMMWORD [wk(10)]  ; xmm4=tmp1L
    395    movdqa      xmm2, XMMWORD [wk(11)]  ; xmm2=tmp1H
    396 
    397    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 01 11 02 12 03 13)
    398    movdqa      XMMWORD [wk(1)], xmm6   ; wk(1)=(04 14 05 15 06 16 07 17)
    399    movdqa      XMMWORD [wk(4)], xmm7   ; wk(4)=(60 70 61 71 62 72 63 73)
    400    movdqa      XMMWORD [wk(5)], xmm1   ; wk(5)=(64 74 65 75 66 76 67 77)
    401 
    402    movdqa      xmm5, xmm3
    403    movdqa      xmm6, xmm0
    404    paddd       xmm3, xmm4              ; xmm3=data2L
    405    paddd       xmm0, xmm2              ; xmm0=data2H
    406    psubd       xmm5, xmm4              ; xmm5=data5L
    407    psubd       xmm6, xmm2              ; xmm6=data5H
    408 
    409    movdqa      xmm7, [rel PD_DESCALE_P1]  ; xmm7=[rel PD_DESCALE_P1]
    410 
    411    paddd       xmm3, xmm7
    412    paddd       xmm0, xmm7
    413    psrad       xmm3, DESCALE_P1
    414    psrad       xmm0, DESCALE_P1
    415    paddd       xmm5, xmm7
    416    paddd       xmm6, xmm7
    417    psrad       xmm5, DESCALE_P1
    418    psrad       xmm6, DESCALE_P1
    419 
    420    packssdw    xmm3, xmm0              ; xmm3=data2=(20 21 22 23 24 25 26 27)
    421    packssdw    xmm5, xmm6              ; xmm5=data5=(50 51 52 53 54 55 56 57)
    422 
    423    movdqa      xmm1, XMMWORD [wk(2)]   ; xmm1=tmp13L
    424    movdqa      xmm4, XMMWORD [wk(3)]   ; xmm4=tmp13H
    425    movdqa      xmm2, XMMWORD [wk(8)]   ; xmm2=tmp0L
    426    movdqa      xmm7, XMMWORD [wk(9)]   ; xmm7=tmp0H
    427 
    428    movdqa      xmm0, xmm1
    429    movdqa      xmm6, xmm4
    430    paddd       xmm1, xmm2              ; xmm1=data3L
    431    paddd       xmm4, xmm7              ; xmm4=data3H
    432    psubd       xmm0, xmm2              ; xmm0=data4L
    433    psubd       xmm6, xmm7              ; xmm6=data4H
    434 
    435    movdqa      xmm2, [rel PD_DESCALE_P1]  ; xmm2=[rel PD_DESCALE_P1]
    436 
    437    paddd       xmm1, xmm2
    438    paddd       xmm4, xmm2
    439    psrad       xmm1, DESCALE_P1
    440    psrad       xmm4, DESCALE_P1
    441    paddd       xmm0, xmm2
    442    paddd       xmm6, xmm2
    443    psrad       xmm0, DESCALE_P1
    444    psrad       xmm6, DESCALE_P1
    445 
    446    packssdw    xmm1, xmm4              ; xmm1=data3=(30 31 32 33 34 35 36 37)
    447    packssdw    xmm0, xmm6              ; xmm0=data4=(40 41 42 43 44 45 46 47)
    448 
    449    movdqa      xmm7, XMMWORD [wk(0)]   ; xmm7=(00 10 01 11 02 12 03 13)
    450    movdqa      xmm2, XMMWORD [wk(1)]   ; xmm2=(04 14 05 15 06 16 07 17)
    451 
    452    movdqa      xmm4, xmm3              ; transpose coefficients(phase 1)
    453    punpcklwd   xmm3, xmm1              ; xmm3=(20 30 21 31 22 32 23 33)
    454    punpckhwd   xmm4, xmm1              ; xmm4=(24 34 25 35 26 36 27 37)
    455    movdqa      xmm6, xmm0              ; transpose coefficients(phase 1)
    456    punpcklwd   xmm0, xmm5              ; xmm0=(40 50 41 51 42 52 43 53)
    457    punpckhwd   xmm6, xmm5              ; xmm6=(44 54 45 55 46 56 47 57)
    458 
    459    movdqa      xmm1, xmm7              ; transpose coefficients(phase 2)
    460    punpckldq   xmm7, xmm3              ; xmm7=(00 10 20 30 01 11 21 31)
    461    punpckhdq   xmm1, xmm3              ; xmm1=(02 12 22 32 03 13 23 33)
    462    movdqa      xmm5, xmm2              ; transpose coefficients(phase 2)
    463    punpckldq   xmm2, xmm4              ; xmm2=(04 14 24 34 05 15 25 35)
    464    punpckhdq   xmm5, xmm4              ; xmm5=(06 16 26 36 07 17 27 37)
    465 
    466    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=(60 70 61 71 62 72 63 73)
    467    movdqa      xmm4, XMMWORD [wk(5)]   ; xmm4=(64 74 65 75 66 76 67 77)
    468 
    469    movdqa      XMMWORD [wk(6)], xmm2   ; wk(6)=(04 14 24 34 05 15 25 35)
    470    movdqa      XMMWORD [wk(7)], xmm5   ; wk(7)=(06 16 26 36 07 17 27 37)
    471 
    472    movdqa      xmm2, xmm0              ; transpose coefficients(phase 2)
    473    punpckldq   xmm0, xmm3              ; xmm0=(40 50 60 70 41 51 61 71)
    474    punpckhdq   xmm2, xmm3              ; xmm2=(42 52 62 72 43 53 63 73)
    475    movdqa      xmm5, xmm6              ; transpose coefficients(phase 2)
    476    punpckldq   xmm6, xmm4              ; xmm6=(44 54 64 74 45 55 65 75)
    477    punpckhdq   xmm5, xmm4              ; xmm5=(46 56 66 76 47 57 67 77)
    478 
    479    movdqa      xmm3, xmm7              ; transpose coefficients(phase 3)
    480    punpcklqdq  xmm7, xmm0              ; xmm7=col0=(00 10 20 30 40 50 60 70)
    481    punpckhqdq  xmm3, xmm0              ; xmm3=col1=(01 11 21 31 41 51 61 71)
    482    movdqa      xmm4, xmm1              ; transpose coefficients(phase 3)
    483    punpcklqdq  xmm1, xmm2              ; xmm1=col2=(02 12 22 32 42 52 62 72)
    484    punpckhqdq  xmm4, xmm2              ; xmm4=col3=(03 13 23 33 43 53 63 73)
    485 
    486    movdqa      xmm0, XMMWORD [wk(6)]   ; xmm0=(04 14 24 34 05 15 25 35)
    487    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=(06 16 26 36 07 17 27 37)
    488 
    489    movdqa      XMMWORD [wk(8)], xmm3   ; wk(8)=col1
    490    movdqa      XMMWORD [wk(9)], xmm4   ; wk(9)=col3
    491 
    492    movdqa      xmm3, xmm0              ; transpose coefficients(phase 3)
    493    punpcklqdq  xmm0, xmm6              ; xmm0=col4=(04 14 24 34 44 54 64 74)
    494    punpckhqdq  xmm3, xmm6              ; xmm3=col5=(05 15 25 35 45 55 65 75)
    495    movdqa      xmm4, xmm2              ; transpose coefficients(phase 3)
    496    punpcklqdq  xmm2, xmm5              ; xmm2=col6=(06 16 26 36 46 56 66 76)
    497    punpckhqdq  xmm4, xmm5              ; xmm4=col7=(07 17 27 37 47 57 67 77)
    498 
    499    movdqa      XMMWORD [wk(10)], xmm3  ; wk(10)=col5
    500    movdqa      XMMWORD [wk(11)], xmm4  ; wk(11)=col7
    501 .column_end:
    502 
    503    ; -- Prefetch the next coefficient block
    504 
    505    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 0*32]
    506    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 1*32]
    507    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 2*32]
    508    prefetchnta [rsi + DCTSIZE2*SIZEOF_JCOEF + 3*32]
    509 
    510    ; ---- Pass 2: process rows from work array, store into output array.
    511 
    512    mov         rdi, r12                ; (JSAMPROW *)
    513    mov         eax, r13d
    514 
    515    ; -- Even part
    516 
    517    ; xmm7=col0, xmm1=col2, xmm0=col4, xmm2=col6
    518 
    519    ; (Original)
    520    ; z1 = (z2 + z3) * 0.541196100;
    521    ; tmp2 = z1 + z3 * -1.847759065;
    522    ; tmp3 = z1 + z2 * 0.765366865;
    523    ;
    524    ; (This implementation)
    525    ; tmp2 = z2 * 0.541196100 + z3 * (0.541196100 - 1.847759065);
    526    ; tmp3 = z2 * (0.541196100 + 0.765366865) + z3 * 0.541196100;
    527 
    528    movdqa      xmm6, xmm1              ; xmm1=in2=z2
    529    movdqa      xmm5, xmm1
    530    punpcklwd   xmm6, xmm2              ; xmm2=in6=z3
    531    punpckhwd   xmm5, xmm2
    532    movdqa      xmm1, xmm6
    533    movdqa      xmm2, xmm5
    534    pmaddwd     xmm6, [rel PW_F130_F054]   ; xmm6=tmp3L
    535    pmaddwd     xmm5, [rel PW_F130_F054]   ; xmm5=tmp3H
    536    pmaddwd     xmm1, [rel PW_F054_MF130]  ; xmm1=tmp2L
    537    pmaddwd     xmm2, [rel PW_F054_MF130]  ; xmm2=tmp2H
    538 
    539    movdqa      xmm3, xmm7
    540    paddw       xmm7, xmm0              ; xmm7=in0+in4
    541    psubw       xmm3, xmm0              ; xmm3=in0-in4
    542 
    543    pxor        xmm4, xmm4
    544    pxor        xmm0, xmm0
    545    punpcklwd   xmm4, xmm7              ; xmm4=tmp0L
    546    punpckhwd   xmm0, xmm7              ; xmm0=tmp0H
    547    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
    548    psrad       xmm0, (16-CONST_BITS)   ; psrad xmm0,16 & pslld xmm0,CONST_BITS
    549 
    550    movdqa      xmm7, xmm4
    551    paddd       xmm4, xmm6              ; xmm4=tmp10L
    552    psubd       xmm7, xmm6              ; xmm7=tmp13L
    553    movdqa      xmm6, xmm0
    554    paddd       xmm0, xmm5              ; xmm0=tmp10H
    555    psubd       xmm6, xmm5              ; xmm6=tmp13H
    556 
    557    movdqa      XMMWORD [wk(0)], xmm4   ; wk(0)=tmp10L
    558    movdqa      XMMWORD [wk(1)], xmm0   ; wk(1)=tmp10H
    559    movdqa      XMMWORD [wk(2)], xmm7   ; wk(2)=tmp13L
    560    movdqa      XMMWORD [wk(3)], xmm6   ; wk(3)=tmp13H
    561 
    562    pxor        xmm5, xmm5
    563    pxor        xmm4, xmm4
    564    punpcklwd   xmm5, xmm3              ; xmm5=tmp1L
    565    punpckhwd   xmm4, xmm3              ; xmm4=tmp1H
    566    psrad       xmm5, (16-CONST_BITS)   ; psrad xmm5,16 & pslld xmm5,CONST_BITS
    567    psrad       xmm4, (16-CONST_BITS)   ; psrad xmm4,16 & pslld xmm4,CONST_BITS
    568 
    569    movdqa      xmm0, xmm5
    570    paddd       xmm5, xmm1              ; xmm5=tmp11L
    571    psubd       xmm0, xmm1              ; xmm0=tmp12L
    572    movdqa      xmm7, xmm4
    573    paddd       xmm4, xmm2              ; xmm4=tmp11H
    574    psubd       xmm7, xmm2              ; xmm7=tmp12H
    575 
    576    movdqa      XMMWORD [wk(4)], xmm5   ; wk(4)=tmp11L
    577    movdqa      XMMWORD [wk(5)], xmm4   ; wk(5)=tmp11H
    578    movdqa      XMMWORD [wk(6)], xmm0   ; wk(6)=tmp12L
    579    movdqa      XMMWORD [wk(7)], xmm7   ; wk(7)=tmp12H
    580 
    581    ; -- Odd part
    582 
    583    movdqa      xmm6, XMMWORD [wk(9)]   ; xmm6=col3
    584    movdqa      xmm3, XMMWORD [wk(8)]   ; xmm3=col1
    585    movdqa      xmm1, XMMWORD [wk(11)]  ; xmm1=col7
    586    movdqa      xmm2, XMMWORD [wk(10)]  ; xmm2=col5
    587 
    588    movdqa      xmm5, xmm6
    589    movdqa      xmm4, xmm3
    590    paddw       xmm5, xmm1              ; xmm5=z3
    591    paddw       xmm4, xmm2              ; xmm4=z4
    592 
    593    ; (Original)
    594    ; z5 = (z3 + z4) * 1.175875602;
    595    ; z3 = z3 * -1.961570560;  z4 = z4 * -0.390180644;
    596    ; z3 += z5;  z4 += z5;
    597    ;
    598    ; (This implementation)
    599    ; z3 = z3 * (1.175875602 - 1.961570560) + z4 * 1.175875602;
    600    ; z4 = z3 * 1.175875602 + z4 * (1.175875602 - 0.390180644);
    601 
    602    movdqa      xmm0, xmm5
    603    movdqa      xmm7, xmm5
    604    punpcklwd   xmm0, xmm4
    605    punpckhwd   xmm7, xmm4
    606    movdqa      xmm5, xmm0
    607    movdqa      xmm4, xmm7
    608    pmaddwd     xmm0, [rel PW_MF078_F117]  ; xmm0=z3L
    609    pmaddwd     xmm7, [rel PW_MF078_F117]  ; xmm7=z3H
    610    pmaddwd     xmm5, [rel PW_F117_F078]   ; xmm5=z4L
    611    pmaddwd     xmm4, [rel PW_F117_F078]   ; xmm4=z4H
    612 
    613    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=z3L
    614    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=z3H
    615 
    616    ; (Original)
    617    ; z1 = tmp0 + tmp3;  z2 = tmp1 + tmp2;
    618    ; tmp0 = tmp0 * 0.298631336;  tmp1 = tmp1 * 2.053119869;
    619    ; tmp2 = tmp2 * 3.072711026;  tmp3 = tmp3 * 1.501321110;
    620    ; z1 = z1 * -0.899976223;  z2 = z2 * -2.562915447;
    621    ; tmp0 += z1 + z3;  tmp1 += z2 + z4;
    622    ; tmp2 += z2 + z3;  tmp3 += z1 + z4;
    623    ;
    624    ; (This implementation)
    625    ; tmp0 = tmp0 * (0.298631336 - 0.899976223) + tmp3 * -0.899976223;
    626    ; tmp1 = tmp1 * (2.053119869 - 2.562915447) + tmp2 * -2.562915447;
    627    ; tmp2 = tmp1 * -2.562915447 + tmp2 * (3.072711026 - 2.562915447);
    628    ; tmp3 = tmp0 * -0.899976223 + tmp3 * (1.501321110 - 0.899976223);
    629    ; tmp0 += z3;  tmp1 += z4;
    630    ; tmp2 += z3;  tmp3 += z4;
    631 
    632    movdqa      xmm0, xmm1
    633    movdqa      xmm7, xmm1
    634    punpcklwd   xmm0, xmm3
    635    punpckhwd   xmm7, xmm3
    636    movdqa      xmm1, xmm0
    637    movdqa      xmm3, xmm7
    638    pmaddwd     xmm0, [rel PW_MF060_MF089]  ; xmm0=tmp0L
    639    pmaddwd     xmm7, [rel PW_MF060_MF089]  ; xmm7=tmp0H
    640    pmaddwd     xmm1, [rel PW_MF089_F060]   ; xmm1=tmp3L
    641    pmaddwd     xmm3, [rel PW_MF089_F060]   ; xmm3=tmp3H
    642 
    643    paddd       xmm0, XMMWORD [wk(10)]  ; xmm0=tmp0L
    644    paddd       xmm7, XMMWORD [wk(11)]  ; xmm7=tmp0H
    645    paddd       xmm1, xmm5              ; xmm1=tmp3L
    646    paddd       xmm3, xmm4              ; xmm3=tmp3H
    647 
    648    movdqa      XMMWORD [wk(8)], xmm0   ; wk(8)=tmp0L
    649    movdqa      XMMWORD [wk(9)], xmm7   ; wk(9)=tmp0H
    650 
    651    movdqa      xmm0, xmm2
    652    movdqa      xmm7, xmm2
    653    punpcklwd   xmm0, xmm6
    654    punpckhwd   xmm7, xmm6
    655    movdqa      xmm2, xmm0
    656    movdqa      xmm6, xmm7
    657    pmaddwd     xmm0, [rel PW_MF050_MF256]  ; xmm0=tmp1L
    658    pmaddwd     xmm7, [rel PW_MF050_MF256]  ; xmm7=tmp1H
    659    pmaddwd     xmm2, [rel PW_MF256_F050]   ; xmm2=tmp2L
    660    pmaddwd     xmm6, [rel PW_MF256_F050]   ; xmm6=tmp2H
    661 
    662    paddd       xmm0, xmm5              ; xmm0=tmp1L
    663    paddd       xmm7, xmm4              ; xmm7=tmp1H
    664    paddd       xmm2, XMMWORD [wk(10)]  ; xmm2=tmp2L
    665    paddd       xmm6, XMMWORD [wk(11)]  ; xmm6=tmp2H
    666 
    667    movdqa      XMMWORD [wk(10)], xmm0  ; wk(10)=tmp1L
    668    movdqa      XMMWORD [wk(11)], xmm7  ; wk(11)=tmp1H
    669 
    670    ; -- Final output stage
    671 
    672    movdqa      xmm5, XMMWORD [wk(0)]   ; xmm5=tmp10L
    673    movdqa      xmm4, XMMWORD [wk(1)]   ; xmm4=tmp10H
    674 
    675    movdqa      xmm0, xmm5
    676    movdqa      xmm7, xmm4
    677    paddd       xmm5, xmm1              ; xmm5=data0L
    678    paddd       xmm4, xmm3              ; xmm4=data0H
    679    psubd       xmm0, xmm1              ; xmm0=data7L
    680    psubd       xmm7, xmm3              ; xmm7=data7H
    681 
    682    movdqa      xmm1, [rel PD_DESCALE_P2]  ; xmm1=[rel PD_DESCALE_P2]
    683 
    684    paddd       xmm5, xmm1
    685    paddd       xmm4, xmm1
    686    psrad       xmm5, DESCALE_P2
    687    psrad       xmm4, DESCALE_P2
    688    paddd       xmm0, xmm1
    689    paddd       xmm7, xmm1
    690    psrad       xmm0, DESCALE_P2
    691    psrad       xmm7, DESCALE_P2
    692 
    693    packssdw    xmm5, xmm4              ; xmm5=data0=(00 10 20 30 40 50 60 70)
    694    packssdw    xmm0, xmm7              ; xmm0=data7=(07 17 27 37 47 57 67 77)
    695 
    696    movdqa      xmm3, XMMWORD [wk(4)]   ; xmm3=tmp11L
    697    movdqa      xmm1, XMMWORD [wk(5)]   ; xmm1=tmp11H
    698 
    699    movdqa      xmm4, xmm3
    700    movdqa      xmm7, xmm1
    701    paddd       xmm3, xmm2              ; xmm3=data1L
    702    paddd       xmm1, xmm6              ; xmm1=data1H
    703    psubd       xmm4, xmm2              ; xmm4=data6L
    704    psubd       xmm7, xmm6              ; xmm7=data6H
    705 
    706    movdqa      xmm2, [rel PD_DESCALE_P2]  ; xmm2=[rel PD_DESCALE_P2]
    707 
    708    paddd       xmm3, xmm2
    709    paddd       xmm1, xmm2
    710    psrad       xmm3, DESCALE_P2
    711    psrad       xmm1, DESCALE_P2
    712    paddd       xmm4, xmm2
    713    paddd       xmm7, xmm2
    714    psrad       xmm4, DESCALE_P2
    715    psrad       xmm7, DESCALE_P2
    716 
    717    packssdw    xmm3, xmm1              ; xmm3=data1=(01 11 21 31 41 51 61 71)
    718    packssdw    xmm4, xmm7              ; xmm4=data6=(06 16 26 36 46 56 66 76)
    719 
    720    packsswb    xmm5, xmm4              ; xmm5=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    721    packsswb    xmm3, xmm0              ; xmm3=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
    722 
    723    movdqa      xmm6, XMMWORD [wk(6)]   ; xmm6=tmp12L
    724    movdqa      xmm2, XMMWORD [wk(7)]   ; xmm2=tmp12H
    725    movdqa      xmm1, XMMWORD [wk(10)]  ; xmm1=tmp1L
    726    movdqa      xmm7, XMMWORD [wk(11)]  ; xmm7=tmp1H
    727 
    728    movdqa      XMMWORD [wk(0)], xmm5   ; wk(0)=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    729    movdqa      XMMWORD [wk(1)], xmm3   ; wk(1)=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
    730 
    731    movdqa      xmm4, xmm6
    732    movdqa      xmm0, xmm2
    733    paddd       xmm6, xmm1              ; xmm6=data2L
    734    paddd       xmm2, xmm7              ; xmm2=data2H
    735    psubd       xmm4, xmm1              ; xmm4=data5L
    736    psubd       xmm0, xmm7              ; xmm0=data5H
    737 
    738    movdqa      xmm5, [rel PD_DESCALE_P2]  ; xmm5=[rel PD_DESCALE_P2]
    739 
    740    paddd       xmm6, xmm5
    741    paddd       xmm2, xmm5
    742    psrad       xmm6, DESCALE_P2
    743    psrad       xmm2, DESCALE_P2
    744    paddd       xmm4, xmm5
    745    paddd       xmm0, xmm5
    746    psrad       xmm4, DESCALE_P2
    747    psrad       xmm0, DESCALE_P2
    748 
    749    packssdw    xmm6, xmm2              ; xmm6=data2=(02 12 22 32 42 52 62 72)
    750    packssdw    xmm4, xmm0              ; xmm4=data5=(05 15 25 35 45 55 65 75)
    751 
    752    movdqa      xmm3, XMMWORD [wk(2)]   ; xmm3=tmp13L
    753    movdqa      xmm1, XMMWORD [wk(3)]   ; xmm1=tmp13H
    754    movdqa      xmm7, XMMWORD [wk(8)]   ; xmm7=tmp0L
    755    movdqa      xmm5, XMMWORD [wk(9)]   ; xmm5=tmp0H
    756 
    757    movdqa      xmm2, xmm3
    758    movdqa      xmm0, xmm1
    759    paddd       xmm3, xmm7              ; xmm3=data3L
    760    paddd       xmm1, xmm5              ; xmm1=data3H
    761    psubd       xmm2, xmm7              ; xmm2=data4L
    762    psubd       xmm0, xmm5              ; xmm0=data4H
    763 
    764    movdqa      xmm7, [rel PD_DESCALE_P2]  ; xmm7=[rel PD_DESCALE_P2]
    765 
    766    paddd       xmm3, xmm7
    767    paddd       xmm1, xmm7
    768    psrad       xmm3, DESCALE_P2
    769    psrad       xmm1, DESCALE_P2
    770    paddd       xmm2, xmm7
    771    paddd       xmm0, xmm7
    772    psrad       xmm2, DESCALE_P2
    773    psrad       xmm0, DESCALE_P2
    774 
    775    movdqa      xmm5, [rel PB_CENTERJSAMP]  ; xmm5=[rel PB_CENTERJSAMP]
    776 
    777    packssdw    xmm3, xmm1             ; xmm3=data3=(03 13 23 33 43 53 63 73)
    778    packssdw    xmm2, xmm0             ; xmm2=data4=(04 14 24 34 44 54 64 74)
    779 
    780    movdqa      xmm7, XMMWORD [wk(0)]  ; xmm7=(00 10 20 30 40 50 60 70 06 16 26 36 46 56 66 76)
    781    movdqa      xmm1, XMMWORD [wk(1)]  ; xmm1=(01 11 21 31 41 51 61 71 07 17 27 37 47 57 67 77)
    782 
    783    packsswb    xmm6, xmm2             ; xmm6=(02 12 22 32 42 52 62 72 04 14 24 34 44 54 64 74)
    784    packsswb    xmm3, xmm4             ; xmm3=(03 13 23 33 43 53 63 73 05 15 25 35 45 55 65 75)
    785 
    786    paddb       xmm7, xmm5
    787    paddb       xmm1, xmm5
    788    paddb       xmm6, xmm5
    789    paddb       xmm3, xmm5
    790 
    791    movdqa      xmm0, xmm7        ; transpose coefficients(phase 1)
    792    punpcklbw   xmm7, xmm1        ; xmm7=(00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71)
    793    punpckhbw   xmm0, xmm1        ; xmm0=(06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77)
    794    movdqa      xmm2, xmm6        ; transpose coefficients(phase 1)
    795    punpcklbw   xmm6, xmm3        ; xmm6=(02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73)
    796    punpckhbw   xmm2, xmm3        ; xmm2=(04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75)
    797 
    798    movdqa      xmm4, xmm7        ; transpose coefficients(phase 2)
    799    punpcklwd   xmm7, xmm6        ; xmm7=(00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33)
    800    punpckhwd   xmm4, xmm6        ; xmm4=(40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73)
    801    movdqa      xmm5, xmm2        ; transpose coefficients(phase 2)
    802    punpcklwd   xmm2, xmm0        ; xmm2=(04 05 06 07 14 15 16 17 24 25 26 27 34 35 36 37)
    803    punpckhwd   xmm5, xmm0        ; xmm5=(44 45 46 47 54 55 56 57 64 65 66 67 74 75 76 77)
    804 
    805    movdqa      xmm1, xmm7        ; transpose coefficients(phase 3)
    806    punpckldq   xmm7, xmm2        ; xmm7=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
    807    punpckhdq   xmm1, xmm2        ; xmm1=(20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37)
    808    movdqa      xmm3, xmm4        ; transpose coefficients(phase 3)
    809    punpckldq   xmm4, xmm5        ; xmm4=(40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57)
    810    punpckhdq   xmm3, xmm5        ; xmm3=(60 61 62 63 64 65 66 67 70 71 72 73 74 75 76 77)
    811 
    812    pshufd      xmm6, xmm7, 0x4E  ; xmm6=(10 11 12 13 14 15 16 17 00 01 02 03 04 05 06 07)
    813    pshufd      xmm0, xmm1, 0x4E  ; xmm0=(30 31 32 33 34 35 36 37 20 21 22 23 24 25 26 27)
    814    pshufd      xmm2, xmm4, 0x4E  ; xmm2=(50 51 52 53 54 55 56 57 40 41 42 43 44 45 46 47)
    815    pshufd      xmm5, xmm3, 0x4E  ; xmm5=(70 71 72 73 74 75 76 77 60 61 62 63 64 65 66 67)
    816 
    817    mov         rdxp, JSAMPROW [rdi+0*SIZEOF_JSAMPROW]
    818    mov         rsip, JSAMPROW [rdi+2*SIZEOF_JSAMPROW]
    819    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm7
    820    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm1
    821    mov         rdxp, JSAMPROW [rdi+4*SIZEOF_JSAMPROW]
    822    mov         rsip, JSAMPROW [rdi+6*SIZEOF_JSAMPROW]
    823    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm4
    824    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm3
    825 
    826    mov         rdxp, JSAMPROW [rdi+1*SIZEOF_JSAMPROW]
    827    mov         rsip, JSAMPROW [rdi+3*SIZEOF_JSAMPROW]
    828    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm6
    829    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm0
    830    mov         rdxp, JSAMPROW [rdi+5*SIZEOF_JSAMPROW]
    831    mov         rsip, JSAMPROW [rdi+7*SIZEOF_JSAMPROW]
    832    movq        XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE], xmm2
    833    movq        XMM_MMWORD [rsi+rax*SIZEOF_JSAMPLE], xmm5
    834 
    835    UNCOLLECT_ARGS 4
    836    lea         rsp, [rbp-8]
    837    pop         r15
    838    pop         rbp
    839    ret
    840 
    841 ; For some reason, the OS X linker does not honor the request to align the
    842 ; segment unless we do this.
    843    align       32