transform_neon.rs (6095B)
1 use crate::transform::{qcms_transform, Format, BGRA, CLAMPMAXVAL, FLOATSCALE, RGB, RGBA}; 2 #[cfg(target_arch = "aarch64")] 3 use core::arch::aarch64::{ 4 float32x4_t, int32x4_t, vaddq_f32, vcvtq_s32_f32, vgetq_lane_s32, vld1q_dup_f32, vld1q_f32, 5 vmaxq_f32, vminq_f32, vmulq_f32, 6 }; 7 #[cfg(target_arch = "arm")] 8 use core::arch::arm::{ 9 float32x4_t, int32x4_t, vaddq_f32, vcvtq_s32_f32, vgetq_lane_s32, vld1q_dup_f32, vld1q_f32, 10 vmaxq_f32, vminq_f32, vmulq_f32, 11 }; 12 use std::mem::zeroed; 13 14 #[target_feature(enable = "neon")] 15 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 16 unsafe fn qcms_transform_data_template_lut_neon<F: Format>( 17 transform: &qcms_transform, 18 mut src: *const u8, 19 mut dest: *mut u8, 20 mut length: usize, 21 ) { 22 let mat: *const [f32; 4] = (*transform).matrix.as_ptr(); 23 /* deref *transform now to avoid it in loop */ 24 let igtbl_r: *const f32 = (*transform).input_gamma_table_r.as_ref().unwrap().as_ptr(); 25 let igtbl_g: *const f32 = (*transform).input_gamma_table_g.as_ref().unwrap().as_ptr(); 26 let igtbl_b: *const f32 = (*transform).input_gamma_table_b.as_ref().unwrap().as_ptr(); 27 /* deref *transform now to avoid it in loop */ 28 let otdata_r: *const u8 = transform 29 .precache_output 30 .as_deref() 31 .unwrap() 32 .lut_r 33 .as_ptr(); 34 let otdata_g: *const u8 = (*transform) 35 .precache_output 36 .as_deref() 37 .unwrap() 38 .lut_g 39 .as_ptr(); 40 let otdata_b: *const u8 = (*transform) 41 .precache_output 42 .as_deref() 43 .unwrap() 44 .lut_b 45 .as_ptr(); 46 /* input matrix values never change */ 47 let mat0: float32x4_t = vld1q_f32((*mat.offset(0isize)).as_ptr()); 48 let mat1: float32x4_t = vld1q_f32((*mat.offset(1isize)).as_ptr()); 49 let mat2: float32x4_t = vld1q_f32((*mat.offset(2isize)).as_ptr()); 50 /* these values don't change, either */ 51 let max: float32x4_t = vld1q_dup_f32(&CLAMPMAXVAL); 52 let min: float32x4_t = zeroed(); 53 let scale: float32x4_t = vld1q_dup_f32(&FLOATSCALE); 54 let components: u32 = if F::kAIndex == 0xff { 3 } else { 4 } as u32; 55 /* working variables */ 56 let mut vec_r: float32x4_t; 57 let mut vec_g: float32x4_t; 58 let mut vec_b: float32x4_t; 59 let mut result: int32x4_t; 60 let mut alpha: u8 = 0; 61 /* CYA */ 62 if length == 0 { 63 return; 64 } 65 /* one pixel is handled outside of the loop */ 66 length = length.wrapping_sub(1); 67 /* setup for transforming 1st pixel */ 68 vec_r = vld1q_dup_f32(&*igtbl_r.offset(*src.offset(F::kRIndex as isize) as isize)); 69 vec_g = vld1q_dup_f32(&*igtbl_g.offset(*src.offset(F::kGIndex as isize) as isize)); 70 vec_b = vld1q_dup_f32(&*igtbl_b.offset(*src.offset(F::kBIndex as isize) as isize)); 71 if F::kAIndex != 0xff { 72 alpha = *src.offset(F::kAIndex as isize) 73 } 74 src = src.offset(components as isize); 75 let mut i: u32 = 0; 76 while (i as usize) < length { 77 /* gamma * matrix */ 78 vec_r = vmulq_f32(vec_r, mat0); 79 vec_g = vmulq_f32(vec_g, mat1); 80 vec_b = vmulq_f32(vec_b, mat2); 81 /* store alpha for this pixel; load alpha for next */ 82 if F::kAIndex != 0xff { 83 *dest.offset(F::kAIndex as isize) = alpha; 84 alpha = *src.offset(F::kAIndex as isize) 85 } 86 /* crunch, crunch, crunch */ 87 vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b)); 88 vec_r = vmaxq_f32(min, vec_r); 89 vec_r = vminq_f32(max, vec_r); 90 result = vcvtq_s32_f32(vmulq_f32(vec_r, scale)); 91 92 /* use calc'd indices to output RGB values */ 93 *dest.offset(F::kRIndex as isize) = *otdata_r.offset(vgetq_lane_s32(result, 0) as isize); 94 *dest.offset(F::kGIndex as isize) = *otdata_g.offset(vgetq_lane_s32(result, 1) as isize); 95 *dest.offset(F::kBIndex as isize) = *otdata_b.offset(vgetq_lane_s32(result, 2) as isize); 96 97 /* load gamma values for next loop while store completes */ 98 vec_r = vld1q_dup_f32(&*igtbl_r.offset(*src.offset(F::kRIndex as isize) as isize)); 99 vec_g = vld1q_dup_f32(&*igtbl_g.offset(*src.offset(F::kGIndex as isize) as isize)); 100 vec_b = vld1q_dup_f32(&*igtbl_b.offset(*src.offset(F::kBIndex as isize) as isize)); 101 102 dest = dest.offset(components as isize); 103 src = src.offset(components as isize); 104 i = i.wrapping_add(1) 105 } 106 /* handle final (maybe only) pixel */ 107 vec_r = vmulq_f32(vec_r, mat0); 108 vec_g = vmulq_f32(vec_g, mat1); 109 vec_b = vmulq_f32(vec_b, mat2); 110 if F::kAIndex != 0xff { 111 *dest.offset(F::kAIndex as isize) = alpha 112 } 113 vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b)); 114 vec_r = vmaxq_f32(min, vec_r); 115 vec_r = vminq_f32(max, vec_r); 116 result = vcvtq_s32_f32(vmulq_f32(vec_r, scale)); 117 118 *dest.offset(F::kRIndex as isize) = *otdata_r.offset(vgetq_lane_s32(result, 0) as isize); 119 *dest.offset(F::kGIndex as isize) = *otdata_g.offset(vgetq_lane_s32(result, 1) as isize); 120 *dest.offset(F::kBIndex as isize) = *otdata_b.offset(vgetq_lane_s32(result, 2) as isize); 121 } 122 #[no_mangle] 123 #[target_feature(enable = "neon")] 124 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 125 pub unsafe fn qcms_transform_data_rgb_out_lut_neon( 126 transform: &qcms_transform, 127 src: *const u8, 128 dest: *mut u8, 129 length: usize, 130 ) { 131 qcms_transform_data_template_lut_neon::<RGB>(transform, src, dest, length); 132 } 133 #[no_mangle] 134 #[target_feature(enable = "neon")] 135 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 136 pub unsafe fn qcms_transform_data_rgba_out_lut_neon( 137 transform: &qcms_transform, 138 src: *const u8, 139 dest: *mut u8, 140 length: usize, 141 ) { 142 qcms_transform_data_template_lut_neon::<RGBA>(transform, src, dest, length); 143 } 144 145 #[no_mangle] 146 #[target_feature(enable = "neon")] 147 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 148 pub unsafe fn qcms_transform_data_bgra_out_lut_neon( 149 transform: &qcms_transform, 150 src: *const u8, 151 dest: *mut u8, 152 length: usize, 153 ) { 154 qcms_transform_data_template_lut_neon::<BGRA>(transform, src, dest, length); 155 }