jquanti-sse2.asm (7248B)
;
; jquanti.asm - sample data conversion and quantization (SSE2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2024, D. R. Commander.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
    SECTION     SEG_TEXT
    BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; GLOBAL(void)
; jsimd_convsamp_sse2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; All three arguments are read from the stack through ebp (see the %defines
; below).  One pass of the loop consumes four entries of sample_data: for
; each row pointer, 8 sample bytes are fetched starting at element start_col,
; zero-extended to 16-bit words, biased by 0xFF80 (i.e. -128 modulo 2^16) and
; written to XMMBLOCK rows 0..3 of the current workspace position.  The loop
; makes DCTSIZE/4 passes.
;
; Register roles inside the loop:
;   esi      cursor into the array of row pointers
;   eax      start_col (never modified after it is loaded)
;   edi      cursor into workspace
;   ecx      passes remaining
;   ebx/edx  scratch row pointers
;   xmm6     all zeros (high bytes for the byte->word widening)
;   xmm7     0xFF80 in every word (the bias)
;
; ebx, esi, edi and ebp are saved and restored.  eax, ecx, edx, xmm0-xmm3,
; xmm6 and xmm7 are overwritten.  The stores use movdqa, so the destination
; addresses must be 16-byte aligned.
;
; NOTE(review): DCTSIZE, XMMBLOCK and the SIZEOF_* constants come from the
; included .inc files, which are not visible in this file.
;

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_convsamp_sse2)

EXTN(jsimd_convsamp_sse2):
    push        ebp
    mov         ebp, esp
    push        ebx
    push        esi
    push        edi
    ; ecx and edx are used below as well, but need not be preserved.

    ; Pick up the arguments and the pass count.
    mov         esi, JSAMPARRAY [sample_data]  ; esi -> row pointers (JSAMPROW *)
    mov         eax, JDIMENSION [start_col]    ; eax = first column to read
    mov         edi, POINTER [workspace]       ; edi -> output (DCTELEM *)
    mov         ecx, DCTSIZE/4                 ; four rows are handled per pass

    ; Build the two loop-invariant vector constants.
    pxor        xmm6, xmm6              ; xmm6 = 0 in every byte
    pcmpeqw     xmm7, xmm7              ; xmm7 = 0xFFFF in every word
    psllw       xmm7, 7                 ; xmm7 = 0xFF80 in every word

    ALIGNX      16, 7
.next_four_rows:
    ; Fetch 8 samples from each of the next four rows.  ebx and edx take
    ; turns holding the row pointer (JSAMPLE *).
    mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]
    movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; row 0 samples
    mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]
    movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; row 1 samples
    mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]
    movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]  ; row 2 samples
    mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]
    movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]  ; row 3 samples

    ; Widen each byte to a word by interleaving with zero bytes.
    punpcklbw   xmm0, xmm6
    punpcklbw   xmm1, xmm6
    punpcklbw   xmm2, xmm6
    punpcklbw   xmm3, xmm6

    ; Apply the bias: adding 0xFF80 to a word subtracts 128 from it.
    paddw       xmm0, xmm7
    paddw       xmm1, xmm7
    paddw       xmm2, xmm7
    paddw       xmm3, xmm7

    ; Write the four converted rows out (aligned stores).
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Step both cursors past the four rows just handled.
    add         esi, byte 4*SIZEOF_JSAMPROW
    add         edi, byte 4*DCTSIZE*SIZEOF_DCTELEM
    dec         ecx
    jnz         short .next_four_rows

    ; Restore the saved registers in reverse order of the pushes.
    pop         edi
    pop         esi
    pop         ebx
    pop         ebp
    ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
;   "Optimizing subroutines in assembly language:
;   An optimization guide for x86 platforms" (https://agner.org/optimize).
;
; GLOBAL(void)
; jsimd_quantize_sse2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; All three arguments are read from the stack through ebp (see the %defines
; below).  For every 16-bit element w of workspace, the loop computes
;
;   sign = w >> (WORD_BIT-1)            (arithmetic shift)
;   a    = (w ^ sign) - sign            (negates w when it is negative)
;   a    = a + correction               (paddw, wraps modulo 2^16)
;   a    = high word of (a * reciprocal)   (unsigned multiply)
;   a    = high word of (a * scale)        (unsigned multiply)
;   out  = (a ^ sign) - sign            (puts the original sign back)
;
; and stores out to the matching element of coef_block.  Each pass handles
; four XMM registers' worth of words (32 elements), and the loop makes
; DCTSIZE2/32 passes.
;
; The reciprocal, correction and scale values are read from three regions of
; the divisors table, selected by the RECIPROCAL, CORRECTION and SCALE macros
; below (XMMBLOCK row offsets DCTSIZE*0, DCTSIZE*1 and DCTSIZE*2).
;
; Register roles inside the loop:
;   esi        cursor into workspace (input)
;   edx        cursor into divisors
;   edi        cursor into coef_block (output)
;   eax        passes remaining
;   xmm0-xmm3  values being quantized
;   xmm4-xmm7  sign masks of the corresponding input words
;
; esi, edi and ebp are saved and restored; ebx and ecx are unused.  eax, edx
; and xmm0-xmm7 are overwritten.  Every vector memory access is an aligned
; one (movdqa, or a memory operand of a non-VEX SSE2 instruction), so the
; addressed locations must be 16-byte aligned.
;
; NOTE(review): DCTSIZE, DCTSIZE2, WORD_BIT, XMMBLOCK and the SIZEOF_*
; constants come from the included .inc files, which are not visible in this
; file.
;

%define RECIPROCAL(m, n, b) \
  XMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  XMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  XMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

    align       32
    GLOBAL_FUNCTION(jsimd_quantize_sse2)

EXTN(jsimd_quantize_sse2):
    push        ebp
    mov         ebp, esp
    push        esi
    push        edi
    ; ebx and ecx are unused; edx is used but need not be preserved.

    mov         esi, POINTER [workspace]    ; esi -> input coefficients
    mov         edx, POINTER [divisors]     ; edx -> divisor tables
    mov         edi, JCOEFPTR [coef_block]  ; edi -> quantized output
    mov         eax, DCTSIZE2/32            ; 32 elements are handled per pass
    ALIGNX      16, 7
.next_32_coefs:
    ; Load four rows of input; xmm4-xmm7 will become the sign masks.
    movdqa      xmm4, XMMWORD [XMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm5, XMMWORD [XMMBLOCK(1,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm6, XMMWORD [XMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
    movdqa      xmm7, XMMWORD [XMMBLOCK(3,0,esi,SIZEOF_DCTELEM)]

    ; Row 0: keep a working copy, reduce the original to its sign mask,
    ; then xor/subtract with the mask: if (xmm0 < 0) xmm0 = -xmm0;
    movdqa      xmm0, xmm4
    psraw       xmm4, (WORD_BIT-1)
    pxor        xmm0, xmm4
    psubw       xmm0, xmm4

    ; Row 1: if (xmm1 < 0) xmm1 = -xmm1;
    movdqa      xmm1, xmm5
    psraw       xmm5, (WORD_BIT-1)
    pxor        xmm1, xmm5
    psubw       xmm1, xmm5

    ; Row 2: if (xmm2 < 0) xmm2 = -xmm2;
    movdqa      xmm2, xmm6
    psraw       xmm6, (WORD_BIT-1)
    pxor        xmm2, xmm6
    psubw       xmm2, xmm6

    ; Row 3: if (xmm3 < 0) xmm3 = -xmm3;
    movdqa      xmm3, xmm7
    psraw       xmm7, (WORD_BIT-1)
    pxor        xmm3, xmm7
    psubw       xmm3, xmm7

    ; Add the per-element correction (correction + roundfactor).
    paddw       xmm0, XMMWORD [CORRECTION(0,0,edx)]
    paddw       xmm1, XMMWORD [CORRECTION(1,0,edx)]
    paddw       xmm2, XMMWORD [CORRECTION(2,0,edx)]
    paddw       xmm3, XMMWORD [CORRECTION(3,0,edx)]

    ; Multiply by the reciprocal, keeping the high word of each product.
    pmulhuw     xmm0, XMMWORD [RECIPROCAL(0,0,edx)]
    pmulhuw     xmm1, XMMWORD [RECIPROCAL(1,0,edx)]
    pmulhuw     xmm2, XMMWORD [RECIPROCAL(2,0,edx)]
    pmulhuw     xmm3, XMMWORD [RECIPROCAL(3,0,edx)]

    ; Multiply by the scale, again keeping the high word of each product.
    pmulhuw     xmm0, XMMWORD [SCALE(0,0,edx)]
    pmulhuw     xmm1, XMMWORD [SCALE(1,0,edx)]
    pmulhuw     xmm2, XMMWORD [SCALE(2,0,edx)]
    pmulhuw     xmm3, XMMWORD [SCALE(3,0,edx)]

    ; Put the original signs back using the masks saved in xmm4-xmm7.
    pxor        xmm0, xmm4
    psubw       xmm0, xmm4
    pxor        xmm1, xmm5
    psubw       xmm1, xmm5
    pxor        xmm2, xmm6
    psubw       xmm2, xmm6
    pxor        xmm3, xmm7
    psubw       xmm3, xmm7

    ; Store the four quantized rows (aligned stores).
    movdqa      XMMWORD [XMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], xmm0
    movdqa      XMMWORD [XMMBLOCK(1,0,edi,SIZEOF_DCTELEM)], xmm1
    movdqa      XMMWORD [XMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], xmm2
    movdqa      XMMWORD [XMMBLOCK(3,0,edi,SIZEOF_DCTELEM)], xmm3

    ; Step all three cursors past the 32 elements just handled.
    add         esi, byte 32*SIZEOF_DCTELEM
    add         edx, byte 32*SIZEOF_DCTELEM
    add         edi, byte 32*SIZEOF_JCOEF
    dec         eax
    jnz         near .next_32_coefs

    ; Restore the saved registers in reverse order of the pushes.
    pop         edi
    pop         esi
    pop         ebp
    ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
    align       32