jquanti-sse2.asm
;
; jquanti.asm - sample data conversion and quantization (64-bit SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2009, 2016, 2024, D. R. Commander.
; Copyright (C) 2018, Matthias Räncker.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        64
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;

; r10 = JSAMPARRAY sample_data
; r11d = JDIMENSION start_col
; r12 = DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 3
    push        rbx

    pxor        xmm6, xmm6              ; xmm6=(all 0's)
    pcmpeqw     xmm7, xmm7
    psllw       xmm7, 7                 ; xmm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

    mov         rsi, r10
    mov         eax, r11d
    mov         rdi, r12
    mov         rcx, DCTSIZE/4
.convloop:
    mov         rbxp, JSAMPROW [rsi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdxp, JSAMPROW [rsi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm0, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm0=(01234567)
    movq        xmm1, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm1=(89ABCDEF)

    mov         rbxp, JSAMPROW [rsi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
    mov         rdxp, JSAMPROW [rsi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)

    movq        xmm2, XMM_MMWORD [rbx+rax*SIZEOF_JSAMPLE]  ; xmm2=(GHIJKLMN)
    movq        xmm3, XMM_MMWORD [rdx+rax*SIZEOF_JSAMPLE]  ; xmm3=(OPQRSTUV)

    punpcklbw   xmm0, xmm6              ; xmm0=(01234567)
    punpcklbw   xmm1, xmm6              ; xmm1=(89ABCDEF)
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    punpcklbw   xmm2, xmm6              ; xmm2=(GHIJKLMN)
    punpcklbw   xmm3, xmm6              ; xmm3=(OPQRSTUV)
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 4*SIZEOF_JSAMPROW
    add         rdi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         rcx
    jnz         short .convloop

    pop         rbx
    UNCOLLECT_ARGS 3
    pop         rbp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "Optimizing subroutines in assembly language:
;   An optimization guide for x86 platforms" (https://agner.org/optimize).
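;
; In rough C terms, each coefficient is quantized as follows (an
; illustrative sketch, not the library's actual C code; x, y, t, and sign
; are illustrative names, while reciprocal, correction, and scale refer to
; the three sections of the divisors table defined below, whose values are
; precomputed by the C setup code):
;
;   sign = x >> 15;                    /* 0 if x >= 0, -1 (all 1's) if x < 0 */
;   t = (x ^ sign) - sign;             /* t = abs(x) */
;   t = (t + correction) * reciprocal >> 16;  /* pmulhuw keeps the high word */
;   t = (t * scale) >> 16;
;   y = (t ^ sign) - sign;             /* reapply the original sign */
;
; (x ^ sign) - sign negates x exactly when sign is all 1's, so the same
; three-instruction sequence both takes and restores the absolute value
; without branching.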
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;

%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

; r10 = JCOEFPTR coef_block
; r11 = DCTELEM *divisors
; r12 = DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    ENDBR64
    push        rbp
    mov         rbp, rsp
    COLLECT_ARGS 3

    mov         rsi, r12
    mov         rdx, r11
    mov         rdi, r10
    mov         rax, DCTSIZE2/32
.quantloop:
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,rsi,SIZEOF_DCTELEM)]
    movdqa      xmm0, xmm4
    movdqa      xmm1, xmm5
    movdqa      xmm2, xmm6
    movdqa      xmm3, xmm7
    psraw       xmm4, (WORD_BIT-1)
    psraw       xmm5, (WORD_BIT-1)
    psraw       xmm6, (WORD_BIT-1)
    psraw       xmm7, (WORD_BIT-1)
    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4              ; if (xmm0 < 0) xmm0 = -xmm0;
    psubw       xmm1, xmm5              ; if (xmm1 < 0) xmm1 = -xmm1;
    psubw       xmm2, xmm6              ; if (xmm2 < 0) xmm2 = -xmm2;
    psubw       xmm3, xmm7              ; if (xmm3 < 0) xmm3 = -xmm3;

    paddw       xmm0, XMMWORD [CORRECTION(0,0,rdx)]  ; correction + roundfactor
    paddw       xmm1, XMMWORD [CORRECTION(1,0,rdx)]
    paddw       xmm2, XMMWORD [CORRECTION(2,0,rdx)]
    paddw       xmm3, XMMWORD [CORRECTION(3,0,rdx)]
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,rdx)]  ; reciprocal
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,rdx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,rdx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,rdx)]
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,rdx)]  ; scale
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,rdx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,rdx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,rdx)]

    pxor        xmm0, xmm4
    pxor        xmm1, xmm5
    pxor        xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm0, xmm4
    psubw       xmm1, xmm5
    psubw       xmm2, xmm6
    psubw       xmm3, xmm7
    movdqa      XMMWORD [XMMBLOCK(0,0,rdi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,rdi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,rdi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,rdi,SIZEOF_DCTELEM)], xmm3

    add         rsi, byte 32*SIZEOF_DCTELEM
    add         rdx, byte 32*SIZEOF_DCTELEM
    add         rdi, byte 32*SIZEOF_JCOEF
    dec         rax
    jnz         near .quantloop

    UNCOLLECT_ARGS 3
    pop         rbp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32
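
; --------------------------------------------------------------------------
;
; For reference, jsimd_convsamp_sse2 above amounts to the following level
; shift in rough C terms (an illustrative sketch, not the library's actual
; C code; the loop variables are illustrative).  Adding 0xFF80 (-128) to
; each zero-extended word is equivalent to subtracting CENTERJSAMPLE:
;
;   for (row = 0; row < DCTSIZE; row++)
;     for (col = 0; col < DCTSIZE; col++)
;       *workspace++ = (DCTELEM)sample_data[row][start_col + col] -
;                      CENTERJSAMPLE;   /* 0..255 -> -128..127 */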