tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

cdef_tmpl.c (19600B)


      1 /*
      2 * Copyright © 2019, Luca Barbato
      3 * All rights reserved.
      4 *
      5 * Redistribution and use in source and binary forms, with or without
      6 * modification, are permitted provided that the following conditions are met:
      7 *
      8 * 1. Redistributions of source code must retain the above copyright notice, this
      9 *    list of conditions and the following disclaimer.
     10 *
     11 * 2. Redistributions in binary form must reproduce the above copyright notice,
     12 *    this list of conditions and the following disclaimer in the documentation
     13 *    and/or other materials provided with the distribution.
     14 *
     15 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     16 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     18 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     19 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     20 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     21 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     22 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     23 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     24 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     25 */
     26 
     27 #include "src/ppc/dav1d_types.h"
     28 #include "src/ppc/cdef.h"
     29 
     30 #if BITDEPTH == 8
     31 static inline i16x8 vconstrain(const i16x8 diff, const int16_t threshold,
     32                               const uint16_t shift)
     33 {
     34    const i16x8 zero = vec_splat_s16(0);
     35    if (!threshold) return zero;
     36    const i16x8 abs_diff = vec_abs(diff);
     37    const b16x8 mask = vec_cmplt(diff, zero);
     38    const i16x8 thr = vec_splats(threshold);
     39    const i16x8 sub = vec_sub(thr, vec_sra(abs_diff, vec_splats(shift)));
     40    const i16x8 max = vec_max(zero, sub);
     41    const i16x8 min = vec_min(abs_diff, max);
     42    const i16x8 neg = vec_sub(zero, min);
     43    return vec_sel(min, neg, mask);
     44 }
     45 
     46 static inline void copy4xN(uint16_t *tmp,
     47                           const uint8_t *src, const ptrdiff_t src_stride,
     48                           const uint8_t (*left)[2], const uint8_t *const top,
     49                           const uint8_t *const bottom, const int w, const int h,
     50                           const enum CdefEdgeFlags edges)
     51 {
     52    const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
     53 
     54    u16x8 l0;
     55    u16x8 l1;
     56 
     57    int y_start = -2, y_end = h + 2;
     58 
     59    // Copy top and bottom first
     60    if (!(edges & CDEF_HAVE_TOP)) {
     61        l0 = fill;
     62        l1 = fill;
     63        y_start = 0;
     64    } else {
     65        l0 = u8h_to_u16(vec_vsx_ld(0, top + 0 * src_stride - 2));
     66        l1 = u8h_to_u16(vec_vsx_ld(0, top + 1 * src_stride - 2));
     67    }
     68 
     69    vec_st(l0, 0, tmp - 2 * 8);
     70    vec_st(l1, 0, tmp - 1 * 8);
     71 
     72    if (!(edges & CDEF_HAVE_BOTTOM)) {
     73        l0 = fill;
     74        l1 = fill;
     75        y_end -= 2;
     76    } else {
     77        l0 = u8h_to_u16(vec_vsx_ld(0, bottom + 0 * src_stride - 2));
     78        l1 = u8h_to_u16(vec_vsx_ld(0, bottom + 1 * src_stride - 2));
     79    }
     80 
     81    vec_st(l0, 0, tmp + (h + 0) * 8);
     82    vec_st(l1, 0, tmp + (h + 1) * 8);
     83 
     84    int y_with_left_edge = 0;
     85    if (!(edges & CDEF_HAVE_LEFT)) {
     86        u16x8 l = u8h_to_u16(vec_vsx_ld(0, src));
     87        vec_vsx_st(l, 0, tmp + 2);
     88 
     89        y_with_left_edge = 1;
     90    }
     91 
     92    for (int y = y_with_left_edge; y < h; y++) {
     93        u16x8 l = u8h_to_u16(vec_vsx_ld(0, src - 2 + y * src_stride));
     94        vec_st(l, 0, tmp + y * 8);
     95    }
     96 
     97    if (!(edges & CDEF_HAVE_LEFT)) {
     98        for (int y = y_start; y < y_end; y++) {
     99            tmp[y * 8] = INT16_MAX;
    100            tmp[1 + y * 8] = INT16_MAX;
    101        }
    102    } else {
    103        for (int y = 0; y < h; y++) {
    104            tmp[y * 8] = left[y][0];
    105            tmp[1 + y * 8] = left[y][1];
    106        }
    107    }
    108    if (!(edges & CDEF_HAVE_RIGHT)) {
    109        for (int y = y_start; y < y_end; y++) {
    110            tmp[- 2 + (y + 1) * 8] = INT16_MAX;
    111            tmp[- 1 + (y + 1) * 8] = INT16_MAX;
    112        }
    113    }
    114 }
    115 
    116 static inline void copy8xN(uint16_t *tmp,
    117                           const uint8_t *src, const ptrdiff_t src_stride,
    118                           const uint8_t (*left)[2], const uint8_t *const top,
    119                           const uint8_t *const bottom, const int w, const int h,
    120                           const enum CdefEdgeFlags edges)
    121 {
    122    const u16x8 fill = vec_splats((uint16_t)INT16_MAX);
    123 
    124    u16x8 l0h, l0l;
    125    u16x8 l1h, l1l;
    126 
    127    int y_start = -2, y_end = h + 2;
    128 
    129    // Copy top and bottom first
    130    if (!(edges & CDEF_HAVE_TOP)) {
    131        l0h = fill;
    132        l0l = fill;
    133        l1h = fill;
    134        l1l = fill;
    135        y_start = 0;
    136    } else {
    137        u8x16 l0 = vec_vsx_ld(0, top + 0 * src_stride - 2);
    138        u8x16 l1 = vec_vsx_ld(0, top + 1 * src_stride - 2);
    139        l0h = u8h_to_u16(l0);
    140        l0l = u8l_to_u16(l0);
    141        l1h = u8h_to_u16(l1);
    142        l1l = u8l_to_u16(l1);
    143    }
    144 
    145    vec_st(l0h, 0, tmp - 4 * 8);
    146    vec_st(l0l, 0, tmp - 3 * 8);
    147    vec_st(l1h, 0, tmp - 2 * 8);
    148    vec_st(l1l, 0, tmp - 1 * 8);
    149 
    150    if (!(edges & CDEF_HAVE_BOTTOM)) {
    151        l0h = fill;
    152        l0l = fill;
    153        l1h = fill;
    154        l1l = fill;
    155        y_end -= 2;
    156    } else {
    157        u8x16 l0 = vec_vsx_ld(0, bottom + 0 * src_stride - 2);
    158        u8x16 l1 = vec_vsx_ld(0, bottom + 1 * src_stride - 2);
    159        l0h = u8h_to_u16(l0);
    160        l0l = u8l_to_u16(l0);
    161        l1h = u8h_to_u16(l1);
    162        l1l = u8l_to_u16(l1);
    163    }
    164 
    165    vec_st(l0h, 0, tmp + (h + 0) * 16);
    166    vec_st(l0l, 0, tmp + (h + 0) * 16 + 8);
    167    vec_st(l1h, 0, tmp + (h + 1) * 16);
    168    vec_st(l1l, 0, tmp + (h + 1) * 16 + 8);
    169 
    170    int y_with_left_edge = 0;
    171    if (!(edges & CDEF_HAVE_LEFT)) {
    172        u8x16 l = vec_vsx_ld(0, src);
    173        u16x8 lh = u8h_to_u16(l);
    174        u16x8 ll = u8l_to_u16(l);
    175        vec_vsx_st(lh, 0, tmp + 2);
    176        vec_vsx_st(ll, 0, tmp + 8 + 2);
    177 
    178        y_with_left_edge = 1;
    179    }
    180 
    181    for (int y = y_with_left_edge; y < h; y++) {
    182        u8x16 l = vec_vsx_ld(0, src - 2 + y * src_stride);
    183        u16x8 lh = u8h_to_u16(l);
    184        u16x8 ll = u8l_to_u16(l);
    185        vec_st(lh, 0, tmp + y * 16);
    186        vec_st(ll, 0, tmp + 8 + y * 16);
    187    }
    188 
    189    if (!(edges & CDEF_HAVE_LEFT)) {
    190        for (int y = y_start; y < y_end; y++) {
    191            tmp[y * 16] = INT16_MAX;
    192            tmp[1 + y * 16] = INT16_MAX;
    193        }
    194    } else {
    195        for (int y = 0; y < h; y++) {
    196            tmp[y * 16] = left[y][0];
    197            tmp[1 + y * 16] = left[y][1];
    198        }
    199    }
    200    if (!(edges & CDEF_HAVE_RIGHT)) {
    201        for (int y = y_start; y < y_end; y++) {
    202            tmp[- 6 + (y + 1) * 16] = INT16_MAX;
    203            tmp[- 5 + (y + 1) * 16] = INT16_MAX;
    204        }
    205    }
    206 }
    207 
    208 static inline i16x8 max_mask(i16x8 a, i16x8 b) {
    209    const i16x8 I16X8_INT16_MAX = vec_splats((int16_t)INT16_MAX);
    210 
    211    const b16x8 mask = vec_cmpeq(a, I16X8_INT16_MAX);
    212 
    213    const i16x8 val = vec_sel(a, b, mask);
    214 
    215    return vec_max(val, b);
    216 }
    217 
/*
 * Declares two locals in the enclosing scope:
 *   px  - 8 samples loaded (unaligned) from addr,
 *   sum - accumulator initialised to zero.
 */
#define LOAD_PIX(addr) \
    const i16x8 px = (i16x8)vec_vsx_ld(0, addr); \
    i16x8 sum = vec_splat_s16(0);
    221 
    222 #define LOAD_PIX4(addr) \
    223    const i16x8 a = (i16x8)vec_vsx_ld(0, addr); \
    224    const i16x8 b = (i16x8)vec_vsx_ld(0, addr + 8); \
    225    const i16x8 px = vec_xxpermdi(a, b, 0); \
    226    i16x8 sum = vec_splat_s16(0);
    227 
    228 #define LOAD_DIR(p, addr, o0, o1) \
    229    const i16x8 p ## 0 = (i16x8)vec_vsx_ld(0, addr + o0); \
    230    const i16x8 p ## 1 = (i16x8)vec_vsx_ld(0, addr - o0); \
    231    const i16x8 p ## 2 = (i16x8)vec_vsx_ld(0, addr + o1); \
    232    const i16x8 p ## 3 = (i16x8)vec_vsx_ld(0, addr - o1);
    233 
    234 #define LOAD_DIR4(p, addr, o0, o1) \
    235    LOAD_DIR(p ## a, addr, o0, o1) \
    236    LOAD_DIR(p ## b, addr + 8, o0, o1) \
    237    const i16x8 p ## 0 = vec_xxpermdi(p ## a ## 0, p ## b ## 0, 0); \
    238    const i16x8 p ## 1 = vec_xxpermdi(p ## a ## 1, p ## b ## 1, 0); \
    239    const i16x8 p ## 2 = vec_xxpermdi(p ## a ## 2, p ## b ## 2, 0); \
    240    const i16x8 p ## 3 = vec_xxpermdi(p ## a ## 3, p ## b ## 3, 0);
    241 
/*
 * For the four taps <p>0..3 (as declared by LOAD_DIR / LOAD_DIR4) declares
 *   <p>_d0..3 - tap minus the centre sample px,
 *   <p>_c0..3 - vconstrain(<p>_dN, strength, shift).
 * Expects `px` to be in scope.
 */
#define CONSTRAIN(p, strength, shift) \
    const i16x8 p ## _d0 = vec_sub(p ## 0, px); \
    const i16x8 p ## _d1 = vec_sub(p ## 1, px); \
    const i16x8 p ## _d2 = vec_sub(p ## 2, px); \
    const i16x8 p ## _d3 = vec_sub(p ## 3, px); \
\
    i16x8 p ## _c0 = vconstrain(p ## _d0, strength, shift); \
    i16x8 p ## _c1 = vconstrain(p ## _d1, strength, shift); \
    i16x8 p ## _c2 = vconstrain(p ## _d2, strength, shift); \
    i16x8 p ## _c3 = vconstrain(p ## _d3, strength, shift);
    252 
    253 #define SETUP_MINMAX \
    254    i16x8 max = px; \
    255    i16x8 min = px; \
    256 
/*
 * Folds the four taps <p>0..3 into the running `max` / `min` declared by
 * SETUP_MINMAX. The maximum goes through max_mask(), which ignores lanes
 * equal to INT16_MAX; the minimum is a plain vec_min.
 */
#define MIN_MAX(p) \
    max = max_mask(p ## 0, max); \
    min = vec_min(p ## 0, min); \
    max = max_mask(p ## 1, max); \
    min = vec_min(p ## 1, min); \
    max = max_mask(p ## 2, max); \
    min = vec_min(p ## 2, min); \
    max = max_mask(p ## 3, max); \
    min = vec_min(p ## 3, min);
    266 
/*
 * Declares the primary tap weights. Reads `pri_strength` and
 * `bitdepth_min_8` from the enclosing scope:
 *   tap_odd = (pri_strength >> bitdepth_min_8) & 1
 *   tap0    = splat(4 - tap_odd)
 *   tap1    = splat(2 + tap_odd)
 */
#define MAKE_TAPS \
    const int16_t tap_odd = (pri_strength >> bitdepth_min_8) & 1; \
    const i16x8 tap0 = vec_splats((int16_t)(4 - tap_odd)); \
    const i16x8 tap1 = vec_splats((int16_t)(2 + tap_odd));
    271 
/*
 * Accumulates the primary taps into `sum` with multiply-add:
 *   sum += tap0 * (<p>_c0 + <p>_c1) + tap1 * (<p>_c2 + <p>_c3)
 * performed as four separate vec_madd steps. Needs tap0/tap1 from MAKE_TAPS
 * and <p>_c0..3 from CONSTRAIN.
 */
#define PRI_0_UPDATE_SUM(p) \
    sum = vec_madd(tap0, p ## _c0, sum); \
    sum = vec_madd(tap0, p ## _c1, sum); \
    sum = vec_madd(tap1, p ## _c2, sum); \
    sum = vec_madd(tap1, p ## _c3, sum);
    277 
/*
 * Adds the four constrained values <p>_c0..3 to `sum` with weight 1.
 * Declares the intermediate pair sums <p>sum0 and <p>sum1.
 */
#define UPDATE_SUM(p) \
    const i16x8 p ## sum0 = vec_add(p ## _c0, p ## _c1); \
    const i16x8 p ## sum1 = vec_add(p ## _c2, p ## _c3); \
    sum = vec_add(sum, p ## sum0); \
    sum = vec_add(sum, p ## sum1);
    283 
/*
 * Adds the four constrained values <p>_c0..3 to `sum` with weight 2, using
 * one vec_madd per value.
 */
#define SEC_0_UPDATE_SUM(p) \
    sum = vec_madd(vec_splat_s16(2), p ## _c0, sum); \
    sum = vec_madd(vec_splat_s16(2), p ## _c1, sum); \
    sum = vec_madd(vec_splat_s16(2), p ## _c2, sum); \
    sum = vec_madd(vec_splat_s16(2), p ## _c3, sum);
    289 
    290 #define BIAS \
    291    i16x8 bias = vec_and((i16x8)vec_cmplt(sum, vec_splat_s16(0)), vec_splat_s16(1)); \
    292    bias = vec_sub(vec_splat_s16(8), bias); \
    293 
    294 #define STORE4 \
    295    dst[0] = vdst[0]; \
    296    dst[1] = vdst[1]; \
    297    dst[2] = vdst[2]; \
    298    dst[3] = vdst[3]; \
    299 \
    300    tmp += 8; \
    301    dst += PXSTRIDE(dst_stride); \
    302    dst[0] = vdst[4]; \
    303    dst[1] = vdst[5]; \
    304    dst[2] = vdst[6]; \
    305    dst[3] = vdst[7]; \
    306 \
    307    tmp += 8; \
    308    dst += PXSTRIDE(dst_stride);
    309 
/*
 * Final step for 4-wide blocks with clamping: computes
 *   px + ((sum + bias) >> 4)
 * (arithmetic shift), clamps it to the [min, max] range gathered by MIN_MAX,
 * and hands the result to STORE4 as `vdst`.
 */
#define STORE4_CLAMPED \
    BIAS \
    i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
    i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
    STORE4

/*
 * Same as STORE4_CLAMPED but without the [min, max] clamp; `min` and `max`
 * do not need to exist in the enclosing scope.
 */
#define STORE4_UNCLAMPED \
    BIAS \
    i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
    STORE4
    320 
    321 #define STORE8 \
    322    dst[0] = vdst[0]; \
    323    dst[1] = vdst[1]; \
    324    dst[2] = vdst[2]; \
    325    dst[3] = vdst[3]; \
    326    dst[4] = vdst[4]; \
    327    dst[5] = vdst[5]; \
    328    dst[6] = vdst[6]; \
    329    dst[7] = vdst[7]; \
    330 \
    331    tmp += 16; \
    332    dst += PXSTRIDE(dst_stride);
    333 
/*
 * Final step for 8-wide blocks with clamping: computes
 *   px + ((sum + bias) >> 4)
 * (arithmetic shift), clamps it to the [min, max] range gathered by MIN_MAX,
 * and hands the result to STORE8 as `vdst`.
 */
#define STORE8_CLAMPED \
    BIAS \
    i16x8 unclamped = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
    i16x8 vdst = vec_max(vec_min(unclamped, max), min); \
    STORE8

/*
 * Same as STORE8_CLAMPED but without the [min, max] clamp; `min` and `max`
 * do not need to exist in the enclosing scope.
 */
#define STORE8_UNCLAMPED \
    BIAS \
    i16x8 vdst = vec_add(px, vec_sra(vec_add(sum, bias), vec_splat_u16(4))); \
    STORE8
    344 
    345 #define DIRECTIONS(w, tmp_stride) \
    346    static const int8_t cdef_directions##w[8 /* dir */][2 /* pass */] = { \
    347        { -1 * tmp_stride + 1, -2 * tmp_stride + 2 }, \
    348        {  0 * tmp_stride + 1, -1 * tmp_stride + 2 }, \
    349        {  0 * tmp_stride + 1,  0 * tmp_stride + 2 }, \
    350        {  0 * tmp_stride + 1,  1 * tmp_stride + 2 }, \
    351        {  1 * tmp_stride + 1,  2 * tmp_stride + 2 }, \
    352        {  1 * tmp_stride + 0,  2 * tmp_stride + 1 }, \
    353        {  1 * tmp_stride + 0,  2 * tmp_stride + 0 }, \
    354        {  1 * tmp_stride + 0,  2 * tmp_stride - 1 } \
    355    };
    356 
    357 DIRECTIONS(4, 8)
    358 DIRECTIONS(8, 16)
    359 
    360 static inline void
    361 filter_4xN(pixel *dst, const ptrdiff_t dst_stride,
    362           const pixel (*left)[2], const pixel *const top,
    363           const pixel *const bottom, const int w, const int h,
    364           const int pri_strength, const int sec_strength, const int dir,
    365           const int pri_shift, const int sec_shift,
    366           const enum CdefEdgeFlags edges, uint16_t *tmp)
    367 {
    368    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    369    const int off1 = cdef_directions4[dir][0];
    370    const int off1_1 = cdef_directions4[dir][1];
    371 
    372    const int off2 = cdef_directions4[(dir + 2) & 7][0];
    373    const int off3 = cdef_directions4[(dir + 6) & 7][0];
    374 
    375    const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
    376    const int off3_1 = cdef_directions4[(dir + 6) & 7][1];
    377 
    378    MAKE_TAPS
    379 
    380    for (int y = 0; y < h / 2; y++) {
    381        LOAD_PIX4(tmp)
    382 
    383        SETUP_MINMAX
    384 
    385        // Primary pass
    386        LOAD_DIR4(p, tmp, off1, off1_1)
    387 
    388        CONSTRAIN(p, pri_strength, pri_shift)
    389 
    390        MIN_MAX(p)
    391 
    392        PRI_0_UPDATE_SUM(p)
    393 
    394        // Secondary pass 1
    395        LOAD_DIR4(s, tmp, off2, off3)
    396 
    397        CONSTRAIN(s, sec_strength, sec_shift)
    398 
    399        MIN_MAX(s)
    400 
    401        SEC_0_UPDATE_SUM(s)
    402 
    403        // Secondary pass 2
    404        LOAD_DIR4(s2, tmp, off2_1, off3_1)
    405 
    406        CONSTRAIN(s2, sec_strength, sec_shift)
    407 
    408        MIN_MAX(s2)
    409 
    410        UPDATE_SUM(s2)
    411 
    412        // Store
    413        STORE4_CLAMPED
    414    }
    415 }
    416 
    417 static inline void
    418 filter_4xN_pri(pixel *dst, const ptrdiff_t dst_stride,
    419           const pixel (*left)[2], const pixel *const top,
    420           const pixel *const bottom, const int w, const int h,
    421           const int pri_strength, const int dir,
    422           const int pri_shift, const enum CdefEdgeFlags edges,
    423           uint16_t *tmp)
    424 {
    425    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    426    const int off1 = cdef_directions4[dir][0];
    427    const int off1_1 = cdef_directions4[dir][1];
    428 
    429    MAKE_TAPS
    430 
    431    for (int y = 0; y < h / 2; y++) {
    432        LOAD_PIX4(tmp)
    433 
    434        // Primary pass
    435        LOAD_DIR4(p, tmp, off1, off1_1)
    436 
    437        CONSTRAIN(p, pri_strength, pri_shift)
    438 
    439        PRI_0_UPDATE_SUM(p)
    440 
    441        STORE4_UNCLAMPED
    442    }
    443 }
    444 
    445 static inline void
    446 filter_4xN_sec(pixel *dst, const ptrdiff_t dst_stride,
    447           const pixel (*left)[2], const pixel *const top,
    448           const pixel *const bottom, const int w, const int h,
    449           const int sec_strength, const int dir,
    450           const int sec_shift, const enum CdefEdgeFlags edges,
    451           uint16_t *tmp)
    452 {
    453    const int off2 = cdef_directions4[(dir + 2) & 7][0];
    454    const int off3 = cdef_directions4[(dir + 6) & 7][0];
    455 
    456    const int off2_1 = cdef_directions4[(dir + 2) & 7][1];
    457    const int off3_1 = cdef_directions4[(dir + 6) & 7][1];
    458 
    459    for (int y = 0; y < h / 2; y++) {
    460        LOAD_PIX4(tmp)
    461        // Secondary pass 1
    462        LOAD_DIR4(s, tmp, off2, off3)
    463 
    464        CONSTRAIN(s, sec_strength, sec_shift)
    465 
    466        SEC_0_UPDATE_SUM(s)
    467 
    468        // Secondary pass 2
    469        LOAD_DIR4(s2, tmp, off2_1, off3_1)
    470 
    471        CONSTRAIN(s2, sec_strength, sec_shift)
    472 
    473        UPDATE_SUM(s2)
    474 
    475        STORE4_UNCLAMPED
    476    }
    477 }
    478 
    479 static inline void
    480 filter_8xN(pixel *dst, const ptrdiff_t dst_stride,
    481           const pixel (*left)[2], const pixel *const top,
    482           const pixel *const bottom, const int w, const int h,
    483           const int pri_strength, const int sec_strength, const int dir,
    484           const int pri_shift, const int sec_shift, const enum CdefEdgeFlags edges,
    485           uint16_t *tmp)
    486 {
    487    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    488 
    489    const int off1 = cdef_directions8[dir][0];
    490    const int off1_1 = cdef_directions8[dir][1];
    491 
    492    const int off2 = cdef_directions8[(dir + 2) & 7][0];
    493    const int off3 = cdef_directions8[(dir + 6) & 7][0];
    494 
    495    const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
    496    const int off3_1 = cdef_directions8[(dir + 6) & 7][1];
    497 
    498    MAKE_TAPS
    499 
    500    for (int y = 0; y < h; y++) {
    501        LOAD_PIX(tmp)
    502 
    503        SETUP_MINMAX
    504 
    505        // Primary pass
    506        LOAD_DIR(p, tmp, off1, off1_1)
    507 
    508        CONSTRAIN(p, pri_strength, pri_shift)
    509 
    510        MIN_MAX(p)
    511 
    512        PRI_0_UPDATE_SUM(p)
    513 
    514        // Secondary pass 1
    515        LOAD_DIR(s, tmp, off2, off3)
    516 
    517        CONSTRAIN(s, sec_strength, sec_shift)
    518 
    519        MIN_MAX(s)
    520 
    521        SEC_0_UPDATE_SUM(s)
    522 
    523        // Secondary pass 2
    524        LOAD_DIR(s2, tmp, off2_1, off3_1)
    525 
    526        CONSTRAIN(s2, sec_strength, sec_shift)
    527 
    528        MIN_MAX(s2)
    529 
    530        UPDATE_SUM(s2)
    531 
    532        // Store
    533        STORE8_CLAMPED
    534    }
    535 
    536 }
    537 
    538 static inline void
    539 filter_8xN_pri(pixel *dst, const ptrdiff_t dst_stride,
    540           const pixel (*left)[2], const pixel *const top,
    541           const pixel *const bottom, const int w, const int h,
    542           const int pri_strength, const int dir,
    543           const int pri_shift, const enum CdefEdgeFlags edges,
    544           uint16_t *tmp)
    545 {
    546    const int bitdepth_min_8 = bitdepth_from_max(bitdepth_max) - 8;
    547    const int off1 = cdef_directions8[dir][0];
    548    const int off1_1 = cdef_directions8[dir][1];
    549 
    550    MAKE_TAPS
    551 
    552    for (int y = 0; y < h; y++) {
    553        LOAD_PIX(tmp)
    554 
    555        // Primary pass
    556        LOAD_DIR(p, tmp, off1, off1_1)
    557 
    558        CONSTRAIN(p, pri_strength, pri_shift)
    559 
    560        PRI_0_UPDATE_SUM(p)
    561 
    562        STORE8_UNCLAMPED
    563    }
    564 }
    565 
    566 static inline void
    567 filter_8xN_sec(pixel *dst, const ptrdiff_t dst_stride,
    568           const pixel (*left)[2], const pixel *const top,
    569           const pixel *const bottom, const int w, const int h,
    570           const int sec_strength, const int dir,
    571           const int sec_shift, const enum CdefEdgeFlags edges,
    572           uint16_t *tmp)
    573 {
    574    const int off2 = cdef_directions8[(dir + 2) & 7][0];
    575    const int off3 = cdef_directions8[(dir + 6) & 7][0];
    576 
    577    const int off2_1 = cdef_directions8[(dir + 2) & 7][1];
    578    const int off3_1 = cdef_directions8[(dir + 6) & 7][1];
    579 
    580    for (int y = 0; y < h; y++) {
    581        LOAD_PIX(tmp)
    582 
    583        // Secondary pass 1
    584        LOAD_DIR(s, tmp, off2, off3)
    585 
    586        CONSTRAIN(s, sec_strength, sec_shift)
    587 
    588        SEC_0_UPDATE_SUM(s)
    589 
    590        // Secondary pass 2
    591        LOAD_DIR(s2, tmp, off2_1, off3_1)
    592 
    593        CONSTRAIN(s2, sec_strength, sec_shift)
    594 
    595        UPDATE_SUM(s2)
    596 
    597        STORE8_UNCLAMPED
    598    }
    599 }
    600 
    601 #define cdef_fn(w, h, tmp_stride) \
    602 void dav1d_cdef_filter_##w##x##h##_vsx(pixel *const dst, \
    603                                       const ptrdiff_t dst_stride, \
    604                                       const pixel (*left)[2], \
    605                                       const pixel *const top, \
    606                                       const pixel *const bottom, \
    607                                       const int pri_strength, \
    608                                       const int sec_strength, \
    609                                       const int dir, \
    610                                       const int damping, \
    611                                       const enum CdefEdgeFlags edges) \
    612 { \
    613    ALIGN_STK_16(uint16_t, tmp_buf, 12 * tmp_stride + 8,); \
    614    uint16_t *tmp = tmp_buf + 2 * tmp_stride + 2; \
    615    copy##w##xN(tmp - 2, dst, dst_stride, left, top, bottom, w, h, edges); \
    616    if (pri_strength) { \
    617        const int pri_shift = imax(0, damping - ulog2(pri_strength)); \
    618        if (sec_strength) { \
    619            const int sec_shift = damping - ulog2(sec_strength); \
    620            filter_##w##xN(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
    621                           sec_strength, dir, pri_shift, sec_shift, edges, tmp); \
    622        } else { \
    623            filter_##w##xN_pri(dst, dst_stride, left, top, bottom, w, h, pri_strength, \
    624                               dir, pri_shift, edges, tmp); \
    625        } \
    626    } else { \
    627        const int sec_shift = damping - ulog2(sec_strength); \
    628        filter_##w##xN_sec(dst, dst_stride, left, top, bottom, w, h, sec_strength, \
    629                           dir, sec_shift, edges, tmp); \
    630    } \
    631 }
    632 
    633 cdef_fn(4, 4, 8);
    634 cdef_fn(4, 8, 8);
    635 cdef_fn(8, 8, 16);
    636 #endif