transform_sse2.rs (6263B)
//! SSE2 pixel-transform path: for each pixel, look up per-channel input gamma
//! (f32 tables), multiply by the 3x4-row color matrix, clamp to [0, CLAMPMAXVAL],
//! scale by FLOATSCALE, and use the resulting integer indices into the precached
//! output gamma LUTs to produce the final 8-bit RGB(A) values.

use crate::transform::{qcms_transform, Format, BGRA, CLAMPMAXVAL, FLOATSCALE, RGB, RGBA};
#[cfg(target_arch = "x86")]
pub use std::arch::x86::{
    __m128, __m128i, _mm_add_ps, _mm_cvtps_epi32, _mm_load_ps, _mm_load_ss, _mm_max_ps, _mm_min_ps,
    _mm_mul_ps, _mm_set1_ps, _mm_setzero_ps, _mm_shuffle_ps, _mm_store_si128,
};
#[cfg(target_arch = "x86_64")]
pub use std::arch::x86_64::{
    __m128, __m128i, _mm_add_ps, _mm_cvtps_epi32, _mm_load_ps, _mm_load_ss, _mm_max_ps, _mm_min_ps,
    _mm_mul_ps, _mm_set1_ps, _mm_setzero_ps, _mm_shuffle_ps, _mm_store_si128,
};

/// 16-byte-aligned scratch buffer: `_mm_store_si128` requires a 16-byte-aligned
/// destination, so the four converted channel indices are stored here.
#[repr(align(16))]
struct Output([u32; 4]);

/// Core SSE2 transform loop, monomorphized over the pixel layout `F`
/// (`F::kRIndex`/`kGIndex`/`kBIndex`/`kAIndex` give byte offsets within a pixel;
/// `kAIndex == 0xff` means "no alpha channel", i.e. 3 bytes per pixel).
///
/// # Safety
/// - `src` must be readable and `dest` writable for at least
///   `length * components` bytes (3 or 4 bytes per pixel).
/// - Panics (via `unwrap`) if any of the input gamma tables or
///   `precache_output` is `None` on `transform`.
/// - `_mm_load_ps` on the matrix rows assumes each `[f32; 4]` row of
///   `transform.matrix` is 16-byte aligned — TODO confirm the alignment of
///   `qcms_transform.matrix` at its declaration site.
unsafe extern "C" fn qcms_transform_data_template_lut_sse2<F: Format>(
    transform: &qcms_transform,
    mut src: *const u8,
    mut dest: *mut u8,
    mut length: usize,
) {
    let mat: *const [f32; 4] = (*transform).matrix.as_ptr();
    // Zero-initialized scratch; [u32; 4] is validly all-zeroes.
    let mut input: Output = std::mem::zeroed();
    /* share input and output locations to save having to keep the
     * locations in separate registers */
    // NOTE(review): declared *const but later cast to *mut __m128i and written
    // through; works in practice but is suspect under strict pointer
    // provenance — consider declaring it *mut u32 directly.
    let output: *const u32 = &mut input as *mut Output as *mut u32;
    /* deref *transform now to avoid it in loop */
    // Input gamma tables: one f32 entry per possible 8-bit channel value.
    let igtbl_r: *const f32 = (*transform).input_gamma_table_r.as_ref().unwrap().as_ptr();
    let igtbl_g: *const f32 = (*transform).input_gamma_table_g.as_ref().unwrap().as_ptr();
    let igtbl_b: *const f32 = (*transform).input_gamma_table_b.as_ref().unwrap().as_ptr();
    /* deref *transform now to avoid it in loop */
    // Precached output gamma LUTs: map the scaled integer result back to u8.
    let otdata_r: *const u8 = (*transform)
        .precache_output
        .as_deref()
        .unwrap()
        .lut_r
        .as_ptr();
    let otdata_g: *const u8 = (*transform)
        .precache_output
        .as_deref()
        .unwrap()
        .lut_g
        .as_ptr();
    let otdata_b: *const u8 = (*transform)
        .precache_output
        .as_deref()
        .unwrap()
        .lut_b
        .as_ptr();
    /* input matrix values never change */
    let mat0: __m128 = _mm_load_ps((*mat.offset(0isize)).as_ptr());
    let mat1: __m128 = _mm_load_ps((*mat.offset(1isize)).as_ptr());
    let mat2: __m128 = _mm_load_ps((*mat.offset(2isize)).as_ptr());
    /* these values don't change, either */
    let max: __m128 = _mm_set1_ps(CLAMPMAXVAL);
    let min: __m128 = _mm_setzero_ps();
    let scale: __m128 = _mm_set1_ps(FLOATSCALE);
    // 3 bytes per pixel when there is no alpha channel, otherwise 4.
    let components: u32 = if F::kAIndex == 0xff { 3 } else { 4 } as u32;
    /* working variables */
    let mut vec_r: __m128;
    let mut vec_g: __m128;
    let mut vec_b: __m128;
    let mut result: __m128;
    let mut alpha: u8 = 0;
    /* CYA */
    if length == 0 {
        return;
    }
    /* one pixel is handled outside of the loop */
    // The loop body is software-pipelined: each iteration stores pixel i while
    // loading gamma values for pixel i+1, so the first load happens here and
    // the last store happens after the loop.
    length -= 1;
    /* setup for transforming 1st pixel */
    vec_r = _mm_load_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
    vec_g = _mm_load_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
    vec_b = _mm_load_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
    if F::kAIndex != 0xff {
        alpha = *src.add(F::kAIndex)
    }
    src = src.offset(components as isize);
    let mut i: u32 = 0;
    while (i as usize) < length {
        /* position values from gamma tables */
        // Broadcast each scalar gamma value across all four lanes.
        vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
        vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
        vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
        /* gamma * matrix */
        vec_r = _mm_mul_ps(vec_r, mat0);
        vec_g = _mm_mul_ps(vec_g, mat1);
        vec_b = _mm_mul_ps(vec_b, mat2);
        /* store alpha for this pixel; load alpha for next */
        // Alpha is passed through untransformed.
        if F::kAIndex != 0xff {
            *dest.add(F::kAIndex) = alpha;
            alpha = *src.add(F::kAIndex)
        }
        /* crunch, crunch, crunch */
        // Sum the three matrix-row products, clamp to [0, max], then scale to
        // the output LUT's index range.
        vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
        vec_r = _mm_max_ps(vec_r, min);
        vec_r = _mm_min_ps(max, vec_r);
        result = _mm_mul_ps(vec_r, scale);
        /* store calc'd output tables indices */
        _mm_store_si128(output as *mut __m128i, _mm_cvtps_epi32(result));
        /* load gamma values for next loop while store completes */
        vec_r = _mm_load_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
        vec_g = _mm_load_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
        vec_b = _mm_load_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
        src = src.offset(components as isize);
        /* use calc'd indices to output RGB values */
        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
        dest = dest.offset(components as isize);
        i += 1
    }
    /* handle final (maybe only) pixel */
    // Drain the pipeline: same computation as the loop body, minus the
    // next-pixel loads.
    vec_r = _mm_shuffle_ps(vec_r, vec_r, 0);
    vec_g = _mm_shuffle_ps(vec_g, vec_g, 0);
    vec_b = _mm_shuffle_ps(vec_b, vec_b, 0);
    vec_r = _mm_mul_ps(vec_r, mat0);
    vec_g = _mm_mul_ps(vec_g, mat1);
    vec_b = _mm_mul_ps(vec_b, mat2);
    if F::kAIndex != 0xff {
        *dest.add(F::kAIndex) = alpha
    }
    vec_r = _mm_add_ps(vec_r, _mm_add_ps(vec_g, vec_b));
    vec_r = _mm_max_ps(vec_r, min);
    vec_r = _mm_min_ps(max, vec_r);
    result = _mm_mul_ps(vec_r, scale);
    _mm_store_si128(output as *mut __m128i, _mm_cvtps_epi32(result));
    *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
    *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
    *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
}

/// SSE2 transform for 3-byte RGB pixels. Safety requirements are those of
/// [`qcms_transform_data_template_lut_sse2`].
#[no_mangle]
pub unsafe fn qcms_transform_data_rgb_out_lut_sse2(
    transform: &qcms_transform,
    src: *const u8,
    dest: *mut u8,
    length: usize,
) {
    qcms_transform_data_template_lut_sse2::<RGB>(transform, src, dest, length);
}

/// SSE2 transform for 4-byte RGBA pixels (alpha passed through). Safety
/// requirements are those of [`qcms_transform_data_template_lut_sse2`].
#[no_mangle]
pub unsafe fn qcms_transform_data_rgba_out_lut_sse2(
    transform: &qcms_transform,
    src: *const u8,
    dest: *mut u8,
    length: usize,
) {
    qcms_transform_data_template_lut_sse2::<RGBA>(transform, src, dest, length);
}

/// SSE2 transform for 4-byte BGRA pixels (alpha passed through). Safety
/// requirements are those of [`qcms_transform_data_template_lut_sse2`].
#[no_mangle]
pub unsafe fn qcms_transform_data_bgra_out_lut_sse2(
    transform: &qcms_transform,
    src: *const u8,
    dest: *mut u8,
    length: usize,
) {
    qcms_transform_data_template_lut_sse2::<BGRA>(transform, src, dest, length);
}