jquanti-avx2.asm (6697B)
;
; jquanti.asm - sample data conversion and quantization (AVX2)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
; Copyright (C) 2016, 2018, 2024, D. R. Commander.
; Copyright (C) 2016, Matthieu Darbois.
;
; Based on the x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler) or Yasm.

%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
        SECTION     SEG_TEXT
        BITS        32
;
; Load data into workspace, applying unsigned->signed conversion
;
; Copies one 8x8 block of 8-bit samples into the DCT workspace, widening
; each sample to a 16-bit DCTELEM and level-shifting it from the unsigned
; range [0, 255] to the signed range [-128, 127] (subtracting CENTERJSAMPLE).
;
; GLOBAL(void)
; jsimd_convsamp_avx2(JSAMPARRAY sample_data, JDIMENSION start_col,
;                     DCTELEM *workspace);
;
; ABI: i386 cdecl.  ebx/esi/edi/ebp are preserved; eax/ecx/edx are
; caller-saved and need not be saved here.
;
; Register usage:
;   esi = sample_data (JSAMPROW *)   eax = start_col (element offset per row)
;   edi = workspace (DCTELEM *)      ebx, edx = current pair of row pointers

%define sample_data  ebp + 8            ; JSAMPARRAY sample_data
%define start_col    ebp + 12           ; JDIMENSION start_col
%define workspace    ebp + 16           ; DCTELEM *workspace

        align       32
        GLOBAL_FUNCTION(jsimd_convsamp_avx2)

EXTN(jsimd_convsamp_avx2):
        push        ebp
        mov         ebp, esp
        push        ebx
;       push        ecx                 ; need not be preserved
;       push        edx                 ; need not be preserved
        push        esi
        push        edi

        mov         esi, JSAMPARRAY [sample_data]  ; (JSAMPROW *)
        mov         eax, JDIMENSION [start_col]
        mov         edi, POINTER [workspace]       ; (DCTELEM *)

        ; Load the eight 8-sample rows, two at a time, into the low
        ; quadwords of xmm0-xmm7 (8 bytes = one row per register).
        mov         ebx, JSAMPROW [esi+0*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
        mov         edx, JSAMPROW [esi+1*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
        movq        xmm0, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq        xmm1, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

        mov         ebx, JSAMPROW [esi+2*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
        mov         edx, JSAMPROW [esi+3*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
        movq        xmm2, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq        xmm3, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

        mov         ebx, JSAMPROW [esi+4*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
        mov         edx, JSAMPROW [esi+5*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
        movq        xmm4, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq        xmm5, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

        mov         ebx, JSAMPROW [esi+6*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
        mov         edx, JSAMPROW [esi+7*SIZEOF_JSAMPROW]  ; (JSAMPLE *)
        movq        xmm6, XMM_MMWORD [ebx+eax*SIZEOF_JSAMPLE]
        movq        xmm7, XMM_MMWORD [edx+eax*SIZEOF_JSAMPLE]

        ; Pair the rows: each ymm register now holds two consecutive rows
        ; (even row in the low 128-bit lane, odd row in the high lane).
        vinserti128 ymm0, ymm0, xmm1, 1
        vinserti128 ymm2, ymm2, xmm3, 1
        vinserti128 ymm4, ymm4, xmm5, 1
        vinserti128 ymm6, ymm6, xmm7, 1

        ; Interleave with zero to widen each unsigned byte to a 16-bit word.
        vpxor       ymm1, ymm1, ymm1    ; ymm1=(all 0's)
        vpunpcklbw  ymm0, ymm0, ymm1
        vpunpcklbw  ymm2, ymm2, ymm1
        vpunpcklbw  ymm4, ymm4, ymm1
        vpunpcklbw  ymm6, ymm6, ymm1

        ; Build -CENTERJSAMPLE (-128) in every word without loading a memory
        ; constant: all-ones words shifted left by 7 give 0xFF80 == -128.
        vpcmpeqw    ymm7, ymm7, ymm7
        vpsllw      ymm7, ymm7, 7       ; ymm7={0xFF80 0xFF80 0xFF80 0xFF80 ..}

        ; Level shift: sample - 128, mapping [0, 255] -> [-128, 127].
        vpaddw      ymm0, ymm0, ymm7
        vpaddw      ymm2, ymm2, ymm7
        vpaddw      ymm4, ymm4, ymm7
        vpaddw      ymm6, ymm6, ymm7

        ; Store the four row pairs (8 DCTELEM rows) to the workspace.
        vmovdqu     YMMWORD [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
        vmovdqu     YMMWORD [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm2
        vmovdqu     YMMWORD [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm4
        vmovdqu     YMMWORD [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm6

        vzeroupper                      ; avoid AVX->SSE transition penalties in caller
        pop         edi
        pop         esi
;       pop         edx                 ; need not be preserved
;       pop         ecx                 ; need not be preserved
        pop         ebx
        pop         ebp
        ret

; --------------------------------------------------------------------------
;
; Quantize/descale the coefficients, and store into coef_block
;
; This implementation is based on an algorithm described in
; "Optimizing subroutines in assembly language:
; An optimization guide for x86 platforms" (https://agner.org/optimize).
;
; GLOBAL(void)
; jsimd_quantize_avx2(JCOEFPTR coef_block, DCTELEM *divisors,
;                     DCTELEM *workspace);
;
; For each coefficient x with quantization step q, this computes
; sign(x) * ((((|x| + correction) * reciprocal) >> 16) * scale) >> 16,
; i.e. round(|x| / q) with the sign restored afterward — division replaced
; by two high-half unsigned multiplies (Agner Fog's reciprocal method).
;
; The divisors table is laid out as three consecutive 64-element DCTELEM
; arrays: reciprocals, corrections, then scales.

%define RECIPROCAL(m, n, b) \
  YMMBLOCK(DCTSIZE * 0 + (m), (n), (b), SIZEOF_DCTELEM)
%define CORRECTION(m, n, b) \
  YMMBLOCK(DCTSIZE * 1 + (m), (n), (b), SIZEOF_DCTELEM)
%define SCALE(m, n, b) \
  YMMBLOCK(DCTSIZE * 2 + (m), (n), (b), SIZEOF_DCTELEM)

%define coef_block  ebp + 8             ; JCOEFPTR coef_block
%define divisors    ebp + 12            ; DCTELEM *divisors
%define workspace   ebp + 16            ; DCTELEM *workspace

        align       32
        GLOBAL_FUNCTION(jsimd_quantize_avx2)

EXTN(jsimd_quantize_avx2):
        push        ebp
        mov         ebp, esp
;       push        ebx                 ; unused
;       push        ecx                 ; unused
;       push        edx                 ; need not be preserved
        push        esi
        push        edi

        mov         esi, POINTER [workspace]
        mov         edx, POINTER [divisors]
        mov         edi, JCOEFPTR [coef_block]

        ; Load all 64 coefficients (ymm4-ymm7 keep the signed originals so
        ; the sign can be reapplied at the end); ymm0-ymm3 hold |x|.
        vmovdqu     ymm4, [YMMBLOCK(0,0,esi,SIZEOF_DCTELEM)]
        vmovdqu     ymm5, [YMMBLOCK(2,0,esi,SIZEOF_DCTELEM)]
        vmovdqu     ymm6, [YMMBLOCK(4,0,esi,SIZEOF_DCTELEM)]
        vmovdqu     ymm7, [YMMBLOCK(6,0,esi,SIZEOF_DCTELEM)]
        vpabsw      ymm0, ymm4
        vpabsw      ymm1, ymm5
        vpabsw      ymm2, ymm6
        vpabsw      ymm3, ymm7

        ; |x| + correction implements rounding before the reciprocal multiply.
        vpaddw      ymm0, YMMWORD [CORRECTION(0,0,edx)]  ; correction + roundfactor
        vpaddw      ymm1, YMMWORD [CORRECTION(2,0,edx)]
        vpaddw      ymm2, YMMWORD [CORRECTION(4,0,edx)]
        vpaddw      ymm3, YMMWORD [CORRECTION(6,0,edx)]
        ; vpmulhuw keeps the high 16 bits of each unsigned 16x16 product,
        ; i.e. (a * b) >> 16 per word.
        vpmulhuw    ymm0, YMMWORD [RECIPROCAL(0,0,edx)]  ; reciprocal
        vpmulhuw    ymm1, YMMWORD [RECIPROCAL(2,0,edx)]
        vpmulhuw    ymm2, YMMWORD [RECIPROCAL(4,0,edx)]
        vpmulhuw    ymm3, YMMWORD [RECIPROCAL(6,0,edx)]
        vpmulhuw    ymm0, YMMWORD [SCALE(0,0,edx)]       ; scale
        vpmulhuw    ymm1, YMMWORD [SCALE(2,0,edx)]
        vpmulhuw    ymm2, YMMWORD [SCALE(4,0,edx)]
        vpmulhuw    ymm3, YMMWORD [SCALE(6,0,edx)]

        ; Reapply the original sign: vpsignw negates, passes through, or
        ; zeroes each word of the result according to the sign of ymm4-ymm7.
        vpsignw     ymm0, ymm0, ymm4
        vpsignw     ymm1, ymm1, ymm5
        vpsignw     ymm2, ymm2, ymm6
        vpsignw     ymm3, ymm3, ymm7

        vmovdqu     [YMMBLOCK(0,0,edi,SIZEOF_DCTELEM)], ymm0
        vmovdqu     [YMMBLOCK(2,0,edi,SIZEOF_DCTELEM)], ymm1
        vmovdqu     [YMMBLOCK(4,0,edi,SIZEOF_DCTELEM)], ymm2
        vmovdqu     [YMMBLOCK(6,0,edi,SIZEOF_DCTELEM)], ymm3

        vzeroupper                      ; avoid AVX->SSE transition penalties in caller
        pop         edi
        pop         esi
;       pop         edx                 ; need not be preserved
;       pop         ecx                 ; unused
;       pop         ebx                 ; unused
        pop         ebp
        ret

; For some reason, the OS X linker does not honor the request to align the
; segment unless we do this.
        align       32