[ tor-browser ].git.dasho

transform_avx.rs (10821B)
      1 use crate::transform::{qcms_transform, Format, BGRA, CLAMPMAXVAL, FLOATSCALE, RGB, RGBA};
      2 #[cfg(target_arch = "x86")]
      3 pub use std::arch::x86::{
      4    __m128, __m128i, __m256, __m256i, _mm256_add_ps, _mm256_broadcast_ps, _mm256_castps128_ps256,
      5    _mm256_castps256_ps128, _mm256_cvtps_epi32, _mm256_insertf128_ps, _mm256_max_ps, _mm256_min_ps,
      6    _mm256_mul_ps, _mm256_set1_ps, _mm256_setzero_ps, _mm256_store_si256, _mm_add_ps,
      7    _mm_broadcast_ss, _mm_cvtps_epi32, _mm_max_ps, _mm_min_ps, _mm_mul_ps, _mm_store_si128,
      8 };
      9 #[cfg(target_arch = "x86_64")]
     10 pub use std::arch::x86_64::{
     11    __m128, __m128i, __m256, __m256i, _mm256_add_ps, _mm256_broadcast_ps, _mm256_castps128_ps256,
     12    _mm256_castps256_ps128, _mm256_cvtps_epi32, _mm256_insertf128_ps, _mm256_max_ps, _mm256_min_ps,
     13    _mm256_mul_ps, _mm256_set1_ps, _mm256_setzero_ps, _mm256_store_si256, _mm_add_ps,
     14    _mm_broadcast_ss, _mm_cvtps_epi32, _mm_max_ps, _mm_min_ps, _mm_mul_ps, _mm_store_si128,
     15 };
     16 
     17 #[repr(align(32))]
     18 struct Output([u32; 8]);
     19 
     20 #[target_feature(enable = "avx")]
     21 unsafe extern "C" fn qcms_transform_data_template_lut_avx<F: Format>(
     22    transform: &qcms_transform,
     23    mut src: *const u8,
     24    mut dest: *mut u8,
     25    mut length: usize,
     26 ) {
     27    let mat: *const [f32; 4] = transform.matrix.as_ptr();
     28    let mut input: Output = std::mem::zeroed();
     29    /* share input and output locations to save having to keep the
     30     * locations in separate registers */
     31    let output: *const u32 = &mut input as *mut Output as *mut u32;
     32    /* deref *transform now to avoid it in loop */
     33    let igtbl_r: *const f32 = transform.input_gamma_table_r.as_ref().unwrap().as_ptr();
     34    let igtbl_g: *const f32 = transform.input_gamma_table_g.as_ref().unwrap().as_ptr();
     35    let igtbl_b: *const f32 = transform.input_gamma_table_b.as_ref().unwrap().as_ptr();
     36    /* deref *transform now to avoid it in loop */
     37    let otdata_r: *const u8 = transform
     38        .precache_output
     39        .as_deref()
     40        .unwrap()
     41        .lut_r
     42        .as_ptr();
     43    let otdata_g: *const u8 = (*transform)
     44        .precache_output
     45        .as_deref()
     46        .unwrap()
     47        .lut_g
     48        .as_ptr();
     49    let otdata_b: *const u8 = (*transform)
     50        .precache_output
     51        .as_deref()
     52        .unwrap()
     53        .lut_b
     54        .as_ptr();
     55    /* input matrix values never change */
     56    let mat0: __m256 = _mm256_broadcast_ps(&*((*mat.offset(0isize)).as_ptr() as *const __m128));
     57    let mat1: __m256 = _mm256_broadcast_ps(&*((*mat.offset(1isize)).as_ptr() as *const __m128));
     58    let mat2: __m256 = _mm256_broadcast_ps(&*((*mat.offset(2isize)).as_ptr() as *const __m128));
     59    /* these values don't change, either */
     60    let max: __m256 = _mm256_set1_ps(CLAMPMAXVAL);
     61    let min: __m256 = _mm256_setzero_ps();
     62    let scale: __m256 = _mm256_set1_ps(FLOATSCALE);
     63    let components: u32 = if F::kAIndex == 0xff { 3 } else { 4 } as u32;
     64    /* working variables */
     65    let mut vec_r: __m256 = _mm256_setzero_ps();
     66    let mut vec_g: __m256 = _mm256_setzero_ps();
     67    let mut vec_b: __m256 = _mm256_setzero_ps();
     68    let mut result: __m256;
     69    let mut vec_r0: __m128;
     70    let mut vec_g0: __m128;
     71    let mut vec_b0: __m128;
     72    let mut vec_r1: __m128;
     73    let mut vec_g1: __m128;
     74    let mut vec_b1: __m128;
     75    let mut alpha1: u8 = 0;
     76    let mut alpha2: u8 = 0;
     77    /* CYA */
     78    if length == 0 {
     79        return;
     80    }
     81    /* If there are at least 2 pixels, then we can load their components into
     82    a single 256-bit register for processing. */
     83    if length > 1 {
     84        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
     85        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
     86        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
     87        vec_r1 =
     88            _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex + components as usize) as isize));
     89        vec_g1 =
     90            _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex + components as usize) as isize));
     91        vec_b1 =
     92            _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex + components as usize) as isize));
     93        vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1);
     94        vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1);
     95        vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1);
     96        if F::kAIndex != 0xff {
     97            alpha1 = *src.add(F::kAIndex);
     98            alpha2 = *src.add(F::kAIndex + components as usize)
     99        }
    100    }
    101    /* If there are at least 4 pixels, then we can iterate and preload the
    102    next 2 while we store the result of the current 2. */
    103    while length > 3 {
    104        /* Ensure we are pointing at the next 2 pixels for the next load. */
    105        src = src.offset((2 * components) as isize);
    106        /* gamma * matrix */
    107        vec_r = _mm256_mul_ps(vec_r, mat0);
    108        vec_g = _mm256_mul_ps(vec_g, mat1);
    109        vec_b = _mm256_mul_ps(vec_b, mat2);
    110        /* store alpha for these pixels; load alpha for next two */
    111        if F::kAIndex != 0xff {
    112            *dest.add(F::kAIndex) = alpha1;
    113            *dest.add(F::kAIndex + components as usize) = alpha2;
    114            alpha1 = *src.add(F::kAIndex);
    115            alpha2 = *src.add(F::kAIndex + components as usize)
    116        }
    117        /* crunch, crunch, crunch */
    118        vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b));
    119        vec_r = _mm256_max_ps(vec_r, min);
    120        vec_r = _mm256_min_ps(max, vec_r);
    121        result = _mm256_mul_ps(vec_r, scale);
    122        /* store calc'd output tables indices */
    123        _mm256_store_si256(output as *mut __m256i, _mm256_cvtps_epi32(result));
    124        /* load gamma values for next loop while store completes */
    125        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
    126        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
    127        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
    128        vec_r1 =
    129            _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex + components as usize) as isize));
    130        vec_g1 =
    131            _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex + components as usize) as isize));
    132        vec_b1 =
    133            _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex + components as usize) as isize));
    134        vec_r = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_r0), vec_r1, 1);
    135        vec_g = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_g0), vec_g1, 1);
    136        vec_b = _mm256_insertf128_ps(_mm256_castps128_ps256(vec_b0), vec_b1, 1);
    137        /* use calc'd indices to output RGB values */
    138        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
    139        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
    140        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
    141        *dest.add(F::kRIndex + components as usize) =
    142            *otdata_r.offset(*output.offset(4isize) as isize);
    143        *dest.add(F::kGIndex + components as usize) =
    144            *otdata_g.offset(*output.offset(5isize) as isize);
    145        *dest.add(F::kBIndex + components as usize) =
    146            *otdata_b.offset(*output.offset(6isize) as isize);
    147        dest = dest.offset((2 * components) as isize);
    148        length -= 2
    149    }
    150    /* There are 0-3 pixels remaining. If there are 2-3 remaining, then we know
    151    we have already populated the necessary registers to start the transform. */
    152    if length > 1 {
    153        vec_r = _mm256_mul_ps(vec_r, mat0);
    154        vec_g = _mm256_mul_ps(vec_g, mat1);
    155        vec_b = _mm256_mul_ps(vec_b, mat2);
    156        if F::kAIndex != 0xff {
    157            *dest.add(F::kAIndex) = alpha1;
    158            *dest.add(F::kAIndex + components as usize) = alpha2
    159        }
    160        vec_r = _mm256_add_ps(vec_r, _mm256_add_ps(vec_g, vec_b));
    161        vec_r = _mm256_max_ps(vec_r, min);
    162        vec_r = _mm256_min_ps(max, vec_r);
    163        result = _mm256_mul_ps(vec_r, scale);
    164        _mm256_store_si256(output as *mut __m256i, _mm256_cvtps_epi32(result));
    165        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
    166        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
    167        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize);
    168        *dest.add(F::kRIndex + components as usize) =
    169            *otdata_r.offset(*output.offset(4isize) as isize);
    170        *dest.add(F::kGIndex + components as usize) =
    171            *otdata_g.offset(*output.offset(5isize) as isize);
    172        *dest.add(F::kBIndex + components as usize) =
    173            *otdata_b.offset(*output.offset(6isize) as isize);
    174        src = src.offset((2 * components) as isize);
    175        dest = dest.offset((2 * components) as isize);
    176        length -= 2
    177    }
    178    /* There may be 0-1 pixels remaining. */
    179    if length == 1 {
    180        vec_r0 = _mm_broadcast_ss(&*igtbl_r.offset(*src.add(F::kRIndex) as isize));
    181        vec_g0 = _mm_broadcast_ss(&*igtbl_g.offset(*src.add(F::kGIndex) as isize));
    182        vec_b0 = _mm_broadcast_ss(&*igtbl_b.offset(*src.add(F::kBIndex) as isize));
    183        vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(mat0));
    184        vec_g0 = _mm_mul_ps(vec_g0, _mm256_castps256_ps128(mat1));
    185        vec_b0 = _mm_mul_ps(vec_b0, _mm256_castps256_ps128(mat2));
    186        if F::kAIndex != 0xff {
    187            *dest.add(F::kAIndex) = *src.add(F::kAIndex)
    188        }
    189        vec_r0 = _mm_add_ps(vec_r0, _mm_add_ps(vec_g0, vec_b0));
    190        vec_r0 = _mm_max_ps(vec_r0, _mm256_castps256_ps128(min));
    191        vec_r0 = _mm_min_ps(_mm256_castps256_ps128(max), vec_r0);
    192        vec_r0 = _mm_mul_ps(vec_r0, _mm256_castps256_ps128(scale));
    193        _mm_store_si128(output as *mut __m128i, _mm_cvtps_epi32(vec_r0));
    194        *dest.add(F::kRIndex) = *otdata_r.offset(*output.offset(0isize) as isize);
    195        *dest.add(F::kGIndex) = *otdata_g.offset(*output.offset(1isize) as isize);
    196        *dest.add(F::kBIndex) = *otdata_b.offset(*output.offset(2isize) as isize)
    197    };
    198 }
    199 #[no_mangle]
    200 #[target_feature(enable = "avx")]
    201 pub unsafe fn qcms_transform_data_rgb_out_lut_avx(
    202    transform: &qcms_transform,
    203    src: *const u8,
    204    dest: *mut u8,
    205    length: usize,
    206 ) {
    207    qcms_transform_data_template_lut_avx::<RGB>(transform, src, dest, length);
    208 }
    209 #[no_mangle]
    210 #[target_feature(enable = "avx")]
    211 pub unsafe fn qcms_transform_data_rgba_out_lut_avx(
    212    transform: &qcms_transform,
    213    src: *const u8,
    214    dest: *mut u8,
    215    length: usize,
    216 ) {
    217    qcms_transform_data_template_lut_avx::<RGBA>(transform, src, dest, length);
    218 }
    219 #[no_mangle]
    220 #[target_feature(enable = "avx")]
    221 pub unsafe fn qcms_transform_data_bgra_out_lut_avx(
    222    transform: &qcms_transform,
    223    src: *const u8,
    224    dest: *mut u8,
    225    length: usize,
    226 ) {
    227    qcms_transform_data_template_lut_avx::<BGRA>(transform, src, dest, length);
    228 }
	tor-browser The Tor Browser
	git clone https://git.dasho.dev/tor-browser.git
	Log \| Files \| Refs \| README \| LICENSE