looprestoration_tmpl.c (13140B)
1 /* 2 * Copyright © 2019, VideoLAN and dav1d authors 3 * Copyright © 2019, Michail Alvanos 4 * All rights reserved. 5 * 6 * Redistribution and use in source and binary forms, with or without 7 * modification, are permitted provided that the following conditions are met: 8 * 9 * 1. Redistributions of source code must retain the above copyright notice, this 10 * list of conditions and the following disclaimer. 11 * 12 * 2. Redistributions in binary form must reproduce the above copyright notice, 13 * this list of conditions and the following disclaimer in the documentation 14 * and/or other materials provided with the distribution. 15 * 16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR 20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND 23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/

#include "src/ppc/dav1d_types.h"
#include "src/ppc/looprestoration.h"

#if BITDEPTH == 8

// Stride (in pixels / in int32 elements) of the padded intermediate buffers:
// wide enough for the widest restoration unit plus the 3-pixel border on
// each side, with slack for 16-wide vector over-reads/stores.
#define REST_UNIT_STRIDE (400)

// Clamp every 32-bit lane of v into [minv, maxv].
static inline i32x4 iclip_vec(i32x4 v, const i32x4 minv, const i32x4 maxv) {
    v = vec_max(minv, v);
    v = vec_min(maxv, v);
    return v;
}

// Widen the 16 source bytes in v to two 8-lane u16 halves and
// multiply-accumulate the broadcast filter tap f into the running sums
// (ssum1 = "high" half lanes, ssum2 = "low" half lanes).
#define APPLY_FILTER_H(v, f, ssum1, ssum2) do { \
    i16x8 ktmp_u16_high = (i16x8) u8h_to_u16(v); \
    i16x8 ktmp_u16_low = (i16x8) u8l_to_u16(v); \
    ssum1 = vec_madd(ktmp_u16_high, f, ssum1); \
    ssum2 = vec_madd(ktmp_u16_low, f, ssum2); \
} while (0)

// Horizontal 7-tap Wiener pass.
//
// Reads the padded 8-bit source in tmp_ptr (REST_UNIT_STRIDE-pitched,
// h + 6 rows: the stripe plus 3 rows of padding above and below) and writes
// clipped 32-bit intermediate values to hor_ptr (same pitch).
//
// Fixed-point details (visible in the constants below):
//  - a bias of 1 << 14 is added before rounding,
//  - the center pixel additionally contributes (pixel << 7), matching a
//    center-tap boost of 128,
//  - the sum is rounded with +4 then >> 3 (round_bits = 3),
//  - the result is clipped to [0, (1 << 13) - 1].
static void wiener_filter_h_vsx(int32_t *hor_ptr,
                                uint8_t *tmp_ptr,
                                const int16_t filterh[8],
                                const int w, const int h)
{
    const i32x4 zerov = vec_splats(0);
    const i32x4 seven_vec = vec_splats(7);
    const i32x4 bitdepth_added_vec = vec_splats(1 << 14);
    const i32x4 round_bits_vec = vec_splats(3);
    const i32x4 rounding_off_vec = vec_splats(1<<2);
    const i32x4 clip_limit_v = vec_splats((1 << 13) - 1);

    // Broadcast each of the 7 horizontal taps to all 8 i16 lanes.
    i16x8 filterhvall = vec_vsx_ld(0, filterh);
    i16x8 filterhv0 = vec_splat( filterhvall, 0);
    i16x8 filterhv1 = vec_splat( filterhvall, 1);
    i16x8 filterhv2 = vec_splat( filterhvall, 2);
    i16x8 filterhv3 = vec_splat( filterhvall, 3);
    i16x8 filterhv4 = vec_splat( filterhvall, 4);
    i16x8 filterhv5 = vec_splat( filterhvall, 5);
    i16x8 filterhv6 = vec_splat( filterhvall, 6);

    for (int j = 0; j < h + 6; j++) {
        for (int i = 0; i < w; i+=16) {
            i32x4 sum1 = bitdepth_added_vec;
            i32x4 sum2 = bitdepth_added_vec;
            i32x4 sum3 = bitdepth_added_vec;
            i32x4 sum4 = bitdepth_added_vec;

            // Load 32 consecutive source bytes; the padding guarantees the
            // +16 load stays inside the buffer.
            u8x16 tmp_v0 = vec_ld(0, &tmp_ptr[i]);
            u8x16 tmp_v7 = vec_ld(0, &tmp_ptr[i+16]);

            // Build the six shifted views of the 32-byte window so each
            // tmp_vN covers the source advanced by N pixels.
            // NOTE(review): the vec_sld operand order/shift counts here are
            // endian-dependent (little-endian lane layout assumed) — confirm
            // against the platform this template is built for.
            u8x16 tmp_v1 = vec_sld( tmp_v7, tmp_v0, 15);
            u8x16 tmp_v2 = vec_sld( tmp_v7, tmp_v0, 14);
            u8x16 tmp_v3 = vec_sld( tmp_v7, tmp_v0, 13);
            u8x16 tmp_v4 = vec_sld( tmp_v7, tmp_v0, 12);
            u8x16 tmp_v5 = vec_sld( tmp_v7, tmp_v0, 11);
            u8x16 tmp_v6 = vec_sld( tmp_v7, tmp_v0, 10);

            // Center pixels (tap index 3), widened to 32 bits for the
            // (pixel << 7) center boost below.
            u16x8 tmp_u16_high = u8h_to_u16(tmp_v3);
            u16x8 tmp_u16_low = u8l_to_u16(tmp_v3);

            i32x4 tmp_expanded1 = i16h_to_i32(tmp_u16_high);
            i32x4 tmp_expanded2 = i16l_to_i32(tmp_u16_high);
            i32x4 tmp_expanded3 = i16h_to_i32(tmp_u16_low);
            i32x4 tmp_expanded4 = i16l_to_i32(tmp_u16_low);

            // 16-bit tap accumulators; widened to 32 bits after all 7 MACs.
            i16x8 ssum1 = (i16x8) zerov;
            i16x8 ssum2 = (i16x8) zerov;

            APPLY_FILTER_H(tmp_v0, filterhv0, ssum1, ssum2);
            APPLY_FILTER_H(tmp_v1, filterhv1, ssum1, ssum2);
            APPLY_FILTER_H(tmp_v2, filterhv2, ssum1, ssum2);
            APPLY_FILTER_H(tmp_v3, filterhv3, ssum1, ssum2);
            APPLY_FILTER_H(tmp_v4, filterhv4, ssum1, ssum2);
            APPLY_FILTER_H(tmp_v5, filterhv5, ssum1, ssum2);
            APPLY_FILTER_H(tmp_v6, filterhv6, ssum1, ssum2);

            // Widen the tap sums and add the center boost (pixel << 7).
            sum1 += i16h_to_i32(ssum1) + (tmp_expanded1 << seven_vec);
            sum2 += i16l_to_i32(ssum1) + (tmp_expanded2 << seven_vec);
            sum3 += i16h_to_i32(ssum2) + (tmp_expanded3 << seven_vec);
            sum4 += i16l_to_i32(ssum2) + (tmp_expanded4 << seven_vec);

            // Round (+4, >> 3) and clip to the 13-bit intermediate range.
            sum1 = (sum1 + rounding_off_vec) >> round_bits_vec;
            sum2 = (sum2 + rounding_off_vec) >> round_bits_vec;
            sum3 = (sum3 + rounding_off_vec) >> round_bits_vec;
            sum4 = (sum4 + rounding_off_vec) >> round_bits_vec;

            sum1 = iclip_vec(sum1, zerov, clip_limit_v);
            sum2 = iclip_vec(sum2, zerov, clip_limit_v);
            sum3 = iclip_vec(sum3, zerov, clip_limit_v);
            sum4 = iclip_vec(sum4, zerov, clip_limit_v);

            // Store 16 int32 results (4 vectors, byte offsets 0/16/32/48).
            vec_st(sum1, 0, &hor_ptr[i]);
            vec_st(sum2, 16, &hor_ptr[i]);
            vec_st(sum3, 32, &hor_ptr[i]);
            vec_st(sum4, 48, &hor_ptr[i]);
        }
        tmp_ptr += REST_UNIT_STRIDE;
        hor_ptr += REST_UNIT_STRIDE;
    }
}

// Clamp every 16-bit lane of v into the 8-bit pixel range [0, 255].
static inline i16x8 iclip_u8_vec(i16x8 v) {
    const i16x8 zerov = vec_splats((int16_t)0);
    const i16x8 maxv = vec_splats((int16_t)255);
    v = vec_max(zerov, v);
    v = vec_min(maxv, v);
    return v;
}

// Load 16 int32 intermediates from row (j + index) of `hor` and
// multiply-accumulate vertical tap f into sum1..sum4.
// Uses `hor`, `j`, `i` and sum1..sum4 from the enclosing scope.
#define APPLY_FILTER_V(index, f) do { \
    i32x4 v1 = vec_ld( 0, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
    i32x4 v2 = vec_ld(16, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
    i32x4 v3 = vec_ld(32, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
    i32x4 v4 = vec_ld(48, &hor[(j + index) * REST_UNIT_STRIDE + i]); \
    sum1 = sum1 + v1 * f; \
    sum2 = sum2 + v2 * f; \
    sum3 = sum3 + v3 * f; \
    sum4 = sum4 + v4 * f; \
} while (0)

// Apply all 7 vertical taps for the 16 pixels at (j, i), round (>> 11),
// then pack and clip down to 16 output bytes in sum_pixel.
// Uses round_vec, round_bits_vec, filterv0..filterv6, j and i from the
// enclosing scope.
#define LOAD_AND_APPLY_FILTER_V(sum_pixel, hor) do { \
    i32x4 sum1 = round_vec; \
    i32x4 sum2 = round_vec; \
    i32x4 sum3 = round_vec; \
    i32x4 sum4 = round_vec; \
    APPLY_FILTER_V(0, filterv0); \
    APPLY_FILTER_V(1, filterv1); \
    APPLY_FILTER_V(2, filterv2); \
    APPLY_FILTER_V(3, filterv3); \
    APPLY_FILTER_V(4, filterv4); \
    APPLY_FILTER_V(5, filterv5); \
    APPLY_FILTER_V(6, filterv6); \
    sum1 = sum1 >> round_bits_vec; \
    sum2 = sum2 >> round_bits_vec; \
    sum3 = sum3 >> round_bits_vec; \
    sum4 = sum4 >> round_bits_vec; \
    i16x8 sum_short_packed_1 = (i16x8) vec_pack(sum1, sum2); \
    i16x8 sum_short_packed_2 = (i16x8) vec_pack(sum3, sum4); \
    sum_short_packed_1 = iclip_u8_vec(sum_short_packed_1); \
    sum_short_packed_2 = iclip_u8_vec(sum_short_packed_2); \
    sum_pixel = (u8x16) vec_pack(sum_short_packed_1, sum_short_packed_2); \
} while (0)

// Vertical 7-tap Wiener pass: filters the int32 intermediates in `hor`
// down columns and writes 8-bit pixels back into the destination picture p.
//
// round_vec = (1 << 10) - (1 << 18): the 1 << 10 half is the rounding
// offset for the >> 11 shift; the -(1 << 18) term presumably cancels the
// bias introduced by the horizontal pass — verify against the C reference.
static inline void wiener_filter_v_vsx(uint8_t *p,
                                       const ptrdiff_t stride,
                                       const int32_t *hor,
                                       const int16_t filterv[8],
                                       const int w, const int h)
{
    const i32x4 round_bits_vec = vec_splats(11);
    const i32x4 round_vec = vec_splats((1 << 10) - (1 << 18));

    // Broadcast each vertical tap to all 4 i32 lanes.
    i32x4 filterv0 = vec_splats((int32_t) filterv[0]);
    i32x4 filterv1 = vec_splats((int32_t) filterv[1]);
    i32x4 filterv2 = vec_splats((int32_t) filterv[2]);
    i32x4 filterv3 = vec_splats((int32_t) filterv[3]);
    i32x4 filterv4 = vec_splats((int32_t) filterv[4]);
    i32x4 filterv5 = vec_splats((int32_t) filterv[5]);
    i32x4 filterv6 = vec_splats((int32_t) filterv[6]);

    for (int j = 0; j < h; j++) {
        // Full 16-pixel chunks, stored straight to the picture.
        for (int i = 0; i <(w-w%16); i += 16) {
            u8x16 sum_pixel;
            LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
            vec_vsx_st(sum_pixel, 0, &p[j * PXSTRIDE(stride) + i]);
        }
        // remaining loop
        // Tail (w % 16 pixels): compute a full vector into an aligned
        // scratch buffer, then copy only the valid bytes scalar-wise so we
        // never write past the end of the row.
        if (w & 0xf){
            int i=w-w%16;
            ALIGN_STK_16(uint8_t, tmp_out, 16,);
            u8x16 sum_pixel;

            LOAD_AND_APPLY_FILTER_V(sum_pixel, hor);
            vec_vsx_st(sum_pixel, 0, tmp_out);

            for (int k=0; i<w; i++, k++) {
                p[j * PXSTRIDE(stride) + i] = tmp_out[k];
            }
        }
    }
}

// Build the padded (unit_w + 6) x (stripe_h + 6) source copy in `dst`
// (REST_UNIT_STRIDE-pitched) that the filter passes read from:
//  - 3 rows above/below come from the loop-filtered rows in `lpf` when
//    LR_HAVE_TOP/LR_HAVE_BOTTOM, otherwise from replicating the first/last
//    row of p;
//  - 3 columns left come from `left` (saved pre-filter pixels) when
//    LR_HAVE_LEFT, otherwise from replicating the first column;
//  - 3 columns right replicate the last column when !LR_HAVE_RIGHT.
static inline void padding(uint8_t *dst, const uint8_t *p,
                           const ptrdiff_t stride, const uint8_t (*left)[4],
                           const uint8_t *lpf, int unit_w, const int stripe_h,
                           const enum LrEdgeFlags edges)
{
    const int have_left = !!(edges & LR_HAVE_LEFT);
    const int have_right = !!(edges & LR_HAVE_RIGHT);

    // Copy more pixels if we don't have to pad them
    unit_w += 3 * have_left + 3 * have_right;
    uint8_t *dst_l = dst + 3 * !have_left;
    p -= 3 * have_left;
    lpf -= 3 * have_left;

    if (edges & LR_HAVE_TOP) {
        // Copy previous loop filtered rows
        const uint8_t *const above_1 = lpf;
        const uint8_t *const above_2 = above_1 + PXSTRIDE(stride);
        pixel_copy(dst_l, above_1, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, above_1, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, above_2, unit_w);
    } else {
        // Pad with first row
        pixel_copy(dst_l, p, unit_w);
        pixel_copy(dst_l + REST_UNIT_STRIDE, p, unit_w);
        pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, p, unit_w);
        if (have_left) {
            pixel_copy(dst_l, &left[0][1], 3);
            pixel_copy(dst_l + REST_UNIT_STRIDE, &left[0][1], 3);
            pixel_copy(dst_l + 2 * REST_UNIT_STRIDE, &left[0][1], 3);
        }
    }

    uint8_t *dst_tl = dst_l + 3 * REST_UNIT_STRIDE;
    if (edges & LR_HAVE_BOTTOM) {
        // Copy next loop filtered rows
        const uint8_t *const below_1 = lpf + 6 * PXSTRIDE(stride);
        const uint8_t *const below_2 = below_1 + PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, below_1, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, below_2, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, below_2, unit_w);
    } else {
        // Pad with last row
        const uint8_t *const src = p + (stripe_h - 1) * PXSTRIDE(stride);
        pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, src, unit_w);
        pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, src, unit_w);
        if (have_left) {
            pixel_copy(dst_tl + stripe_h * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 1) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
            pixel_copy(dst_tl + (stripe_h + 2) * REST_UNIT_STRIDE, &left[stripe_h - 1][1], 3);
        }
    }

    // Inner UNIT_WxSTRIPE_H
    for (int j = 0; j < stripe_h; j++) {
        pixel_copy(dst_tl + 3 * have_left, p + 3 * have_left, unit_w - 3 * have_left);
        dst_tl += REST_UNIT_STRIDE;
        p += PXSTRIDE(stride);
    }

    if (!have_right) {
        uint8_t *pad = dst_l + unit_w;
        uint8_t *row_last = &dst_l[unit_w - 1];
        // Pad 3x(STRIPE_H+6) with last column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(pad, *row_last, 3);
            pad += REST_UNIT_STRIDE;
            row_last += REST_UNIT_STRIDE;
        }
    }

    if (!have_left) {
        // Pad 3x(STRIPE_H+6) with first column
        for (int j = 0; j < stripe_h + 6; j++) {
            pixel_set(dst, *dst_l, 3);
            dst += REST_UNIT_STRIDE;
            dst_l += REST_UNIT_STRIDE;
        }
    } else {
        dst += 3 * REST_UNIT_STRIDE;
        for (int j = 0; j < stripe_h; j++) {
            pixel_copy(dst, &left[j][1], 3);
            dst += REST_UNIT_STRIDE;
        }
    }
}

// FIXME Could split into luma and chroma specific functions,
// (since first and last tops are always 0 for chroma)
// FIXME Could implement a version that requires less temporary memory
// (should be possible to implement with only 6 rows of temp storage)

// Public entry point (8-bit path): pad the stripe, run the horizontal pass
// into a 32-bit scratch buffer, then the vertical pass back into p.
// filter[0] holds the horizontal taps, filter[1] the vertical taps.
void dav1d_wiener_filter_vsx(uint8_t *p, const ptrdiff_t stride,
                             const uint8_t (*const left)[4],
                             const uint8_t *lpf,
                             const int w, const int h,
                             const LooprestorationParams *const params,
                             const enum LrEdgeFlags edges)
{
    const int16_t (*const filter)[8] = params->filter;

    // Wiener filtering is applied to a maximum stripe height of 64 + 3 pixels
    // of padding above and below
    ALIGN_STK_16(uint8_t, tmp, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE,);
    padding(tmp, p, stride, left, lpf, w, h, edges);
    ALIGN_STK_16(int32_t, hor, 70 /*(64 + 3 + 3)*/ * REST_UNIT_STRIDE + 64,);

    wiener_filter_h_vsx(hor, tmp, filter[0], w, h);
    wiener_filter_v_vsx(p, stride, hor, filter[1], w, h);
}
#endif