// transform_avx.rs (10821B)
1 use crate::transform::{qcms_transform, Format, BGRA, CLAMPMAXVAL, FLOATSCALE, RGB, RGBA}; 2 #[cfg(target_arch = "x86")] 3 pub use std::arch::x86::{ 4 __m128, __m128i, __m256, __m256i, _mm256_add_ps, _mm256_broadcast_ps, _mm256_castps128_ps256, 5 _mm256_castps256_ps128, _mm256_cvtps_epi32, _mm256_insertf128_ps, _mm256_max_ps, _mm256_min_ps, 6 _mm256_mul_ps, _mm256_set1_ps, _mm256_setzero_ps, _mm256_store_si256, _mm_add_ps, 7 _mm_broadcast_ss, _mm_cvtps_epi32, _mm_max_ps, _mm_min_ps, _mm_mul_ps, _mm_store_si128, 8 }; 9 #[cfg(target_arch = "x86_64")] 10 pub use std::arch::x86_64::{ 11 __m128, __m128i, __m256, __m256i, _mm256_add_ps, _mm256_broadcast_ps, _mm256_castps128_ps256, 12 _mm256_castps256_ps128, _mm256_cvtps_epi32, _mm256_insertf128_ps, _mm256_max_ps, _mm256_min_ps, 13 _mm256_mul_ps, _mm256_set1_ps, _mm256_setzero_ps, _mm256_store_si256, _mm_add_ps, 14 _mm_broadcast_ss, _mm_cvtps_epi32, _mm_max_ps, _mm_min_ps, _mm_mul_ps, _mm_store_si128, 15 }; 16 17 #[repr(align(32))] 18 struct Output([u32; 8]); 19 20 #[target_feature(enable = "avx")] 21 unsafe extern "C" fn qcms_transform_data_template_lut_avx<F: Format>( 22 transform: &qcms_transform, 23 mut src: *const u8, 24 mut dest: *mut u8, 25 mut length: usize, 26 ) { 27 let mat: *const [f32; 4] = transform.matrix.as_ptr(); 28 let mut input: Output = std::mem::zeroed(); 29 /* share input and output locations to save having to keep the 30 * locations in separate registers */ 31 let output: *const u32 = &mut input as *mut Output as *mut u32; 32 /* deref *transform now to avoid it in loop */ 33 let igtbl_r: *const f32 = transform.input_gamma_table_r.as_ref().unwrap().as_ptr(); 34 let igtbl_g: *const f32 = transform.input_gamma_table_g.as_ref().unwrap().as_ptr(); 35 let igtbl_b: *const f32 = transform.input_gamma_table_b.as_ref().unwrap().as_ptr(); 36 /* deref *transform now to avoid it in loop */ 37 let otdata_r: *const u8 = transform 38 .precache_output 39 .as_deref() 40 .unwrap() 41 .lut_r 42 .as_ptr(); 43 let 
otdata_g: *const u8 = (*transform) 44 .precache_output 45 .as_deref() 46 .unwrap() 47 .lut_g 48 .as_ptr(); 49 let otdata_b: *const u8 = (*transform) 50 .precache_output 51 .as_deref() 52 .unwrap() 53 .lut_b 54 .as_ptr(); 55 /* input matrix values never change */ 56 let mat0: __m256 = _mm256_broadcast_ps(&*((*mat.offset(0isize)).as_ptr() as *const __m128)); 57 let mat1: __m256 = _mm256_broadcast_ps(&*((*mat.offset(1isize)).as_ptr() as *const __m128)); 58 let mat2: __m256 = _mm256_broadcast_ps(&*((*mat.offset(2isize)).as_ptr() as *const __m128)); 59 /* these values don't change, either */ 60 let max: __m256 = _mm256_set1_ps(CLAMPMAXVAL); 61 let min: __m256 = _mm256_setzero_ps(); 62 let scale: __m256 = _mm256_set1_ps(FLOATSCALE); 63 let components: u32 = if F::kAIndex == 0xff { 3 } else { 4 } as u32; 64 /* working variables */ 65 let mut vec_r: __m256 = _mm256_setzero_ps(); 66 let mut vec_g: __m256 = _mm256_setzero_ps(); 67 let mut vec_b: __m256 = _mm256_setzero_ps(); 68 let mut result: __m256; 69 let mut vec_r0: __m128; 70 let mut vec_g0: __m128; 71 let mut vec_b0: __m128; 72 let mut vec_r1: __m128; 73 let mut vec_g1: __m128; 74 let mut vec_b1: __m128; 75 let mut alpha1: u8 = 0; 76 let mut alpha2: u8 = 0; 77 /* CYA */ 78 if length == 0 { 79 return; 80 } 81 /* If there are at least 2 pixels, then we can load their components into 82 a single 256-bit register for processing. 
*/ 83 if length > 1 { 84 vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize)); 85 vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize)); 86 vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize)); 87 vec_r1 = 88 _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex + components as usize) as isize)); 89 vec_g1 = 90 _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex + components as usize) as isize)); 91 vec_b1 = 92 _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex + components as usize) as isize)); 93 vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1); 94 vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1); 95 vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1); 96 if F::kAIndex != 0xff { 97 alpha1 = *src.add(F::kAIndex); 98 alpha2 = *src.add(F::kAIndex + components as usize) 99 } 100 } 101 /* If there are at least 4 pixels, then we can iterate and preload the 102 next 2 while we store the result of the current 2. */ 103 while length > 3 { 104 /* Ensure we are pointing at the next 2 pixels for the next load. 
*/ 105 src = src.offset((2 * components) as isize); 106 /* gamma * matrix */ 107 vec_r = _mm256_mul_ps(vec_r, mat0); 108 vec_g = _mm256_mul_ps(vec_g, mat1); 109 vec_b = _mm256_mul_ps(vec_b, mat2); 110 /* store alpha for these pixels; load alpha for next two */ 111 if F::kAIndex != 0xff { 112 *dest.add(F::kAIndex) = alpha1; 113 *dest.add(F::kAIndex + components as usize) = alpha2; 114 alpha1 = *src.add(F::kAIndex); 115 alpha2 = *src.add(F::kAIndex + components as usize) 116 } 117 /* crunch, crunch, crunch */ 118 vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b)); 119 vec_r = _mm256_max_ps(vec_r, min); 120 vec_r = _mm256_min_ps(max, vec_r); 121 result = _mm256_mul_ps(vec_r, scale); 122 /* store calc'd output tables indices */ 123 _mm256_store_si256(output as *mut __m256i, _mm256_cvtps_epi32(result)); 124 /* load gamma values for next loop while store completes */ 125 vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize)); 126 vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize)); 127 vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize)); 128 vec_r1 = 129 _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex + components as usize) as isize)); 130 vec_g1 = 131 _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex + components as usize) as isize)); 132 vec_b1 = 133 _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex + components as usize) as isize)); 134 vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1); 135 vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1); 136 vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1); 137 /* use calc'd indices to output RGB values */ 138 *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize); 139 *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize); 140 *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize); 141 *dest.add(F::kRIndex + components as 
usize) = 142 *otdata_r.offset(*output.offset(4isize) as isize); 143 *dest.add(F::kGIndex + components as usize) = 144 *otdata_g.offset(*output.offset(5isize) as isize); 145 *dest.add(F::kBIndex + components as usize) = 146 *otdata_b.offset(*output.offset(6isize) as isize); 147 dest = dest.offset((2 * components) as isize); 148 length -= 2 149 } 150 /* There are 0-3 pixels remaining. If there are 2-3 remaining, then we know 151 we have already populated the necessary registers to start the transform. */ 152 if length > 1 { 153 vec_r = _mm256_mul_ps(vec_r, mat0); 154 vec_g = _mm256_mul_ps(vec_g, mat1); 155 vec_b = _mm256_mul_ps(vec_b, mat2); 156 if F::kAIndex != 0xff { 157 *dest.add(F::kAIndex) = alpha1; 158 *dest.add(F::kAIndex + components as usize) = alpha2 159 } 160 vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b)); 161 vec_r = _mm256_max_ps(vec_r, min); 162 vec_r = _mm256_min_ps(max, vec_r); 163 result = _mm256_mul_ps(vec_r, scale); 164 _mm256_store_si256(output as *mut __m256i, _mm256_cvtps_epi32(result)); 165 *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize); 166 *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize); 167 *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize); 168 *dest.add(F::kRIndex + components as usize) = 169 *otdata_r.offset(*output.offset(4isize) as isize); 170 *dest.add(F::kGIndex + components as usize) = 171 *otdata_g.offset(*output.offset(5isize) as isize); 172 *dest.add(F::kBIndex + components as usize) = 173 *otdata_b.offset(*output.offset(6isize) as isize); 174 src = src.offset((2 * components) as isize); 175 dest = dest.offset((2 * components) as isize); 176 length -= 2 177 } 178 /* There may be 0-1 pixels remaining. 
*/ 179 if length == 1 { 180 vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize)); 181 vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize)); 182 vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize)); 183 vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(mat0)); 184 vec_g0 = _mm_mul_ps(vec_g0, _mm256_castps256_ps128(mat1)); 185 vec_b0 = _mm_mul_ps(vec_b0, _mm256_castps256_ps128(mat2)); 186 if F::kAIndex != 0xff { 187 *dest.add(F::kAIndex) = *src.add(F::kAIndex) 188 } 189 vec_r0 = _mm_add_ps(vec_r0, _mm_add_ps(vec_g0, vec_b0)); 190 vec_r0 = _mm_max_ps(vec_r0, _mm256_castps256_ps128(min)); 191 vec_r0 = _mm_min_ps(_mm256_castps256_ps128(max), vec_r0); 192 vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(scale)); 193 _mm_store_si128(output as *mut __m128i, _mm_cvtps_epi32(vec_r0)); 194 *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize); 195 *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize); 196 *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize) 197 }; 198 } 199 #[no_mangle] 200 #[target_feature(enable = "avx")] 201 pub unsafe fn qcms_transform_data_rgb_out_lut_avx( 202 transform: &qcms_transform, 203 src: *const u8, 204 dest: *mut u8, 205 length: usize, 206 ) { 207 qcms_transform_data_template_lut_avx::<RGB>(transform, src, dest, length); 208 } 209 #[no_mangle] 210 #[target_feature(enable = "avx")] 211 pub unsafe fn qcms_transform_data_rgba_out_lut_avx( 212 transform: &qcms_transform, 213 src: *const u8, 214 dest: *mut u8, 215 length: usize, 216 ) { 217 qcms_transform_data_template_lut_avx::<RGBA>(transform, src, dest, length); 218 } 219 #[no_mangle] 220 #[target_feature(enable = "avx")] 221 pub unsafe fn qcms_transform_data_bgra_out_lut_avx( 222 transform: &qcms_transform, 223 src: *const u8, 224 dest: *mut u8, 225 length: usize, 226 ) { 227 qcms_transform_data_template_lut_avx::<BGRA>(transform, src, dest, length); 228 }