tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git

mc_tmpl.c (18210B)


/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Luca Barbato
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "common/attributes.h"
#include "src/ppc/mc.h"
#include "src/tables.h"
#include "src/ppc/dav1d_types.h"

#if BITDEPTH == 8

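/* blend_px() is the scalar operation that all of the vector code below
 * implements: a 6-bit weighted average of a destination pixel a and a
 * temporary pixel b, with mask weight m in 0..64, rounded to nearest:
 * (a * (64 - m) + b * m + 32) >> 6.  It is also used directly in the
 * scalar fallback paths. */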
#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)

typedef void (*blend_line)(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride);

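/* The BLEND_LINES{2,3,4} helpers all work the same way: dst bytes (a) and tmp
 * bytes (b) are interleaved with vec_mergeh/vec_mergel so that a sits in the
 * even byte lanes and b in the odd ones, and the mask is interleaved likewise
 * as (64 - m, m).  vec_mule then yields the per-pixel a * (64 - m) products
 * and vec_mulo the b * m products as u16x8; adding the two, adding the
 * rounding bias 32 and shifting right by 6 computes blend_px() for eight
 * pixels per vector. */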
#define BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3) \
{ \
    u16x8 anm0 = vec_mule(ab0, nm_m0); \
    u16x8 anm1 = vec_mule(ab1, nm_m1); \
    u16x8 anm2 = vec_mule(ab2, nm_m2); \
    u16x8 anm3 = vec_mule(ab3, nm_m3); \
\
    u16x8 bm0 = vec_mulo(ab0, nm_m0); \
    u16x8 bm1 = vec_mulo(ab1, nm_m1); \
    u16x8 bm2 = vec_mulo(ab2, nm_m2); \
    u16x8 bm3 = vec_mulo(ab3, nm_m3); \
\
    d0_u16 = vec_add(anm0, bm0); \
    d1_u16 = vec_add(anm1, bm1); \
    d2_u16 = vec_add(anm2, bm2); \
    d3_u16 = vec_add(anm3, bm3); \
\
    d0_u16 = vec_add(d0_u16, vec_splats((uint16_t)32)); \
    d1_u16 = vec_add(d1_u16, vec_splats((uint16_t)32)); \
    d2_u16 = vec_add(d2_u16, vec_splats((uint16_t)32)); \
    d3_u16 = vec_add(d3_u16, vec_splats((uint16_t)32)); \
\
    d0_u16 = vec_sr(d0_u16, vec_splat_u16(6)); \
    d1_u16 = vec_sr(d1_u16, vec_splat_u16(6)); \
    d2_u16 = vec_sr(d2_u16, vec_splat_u16(6)); \
    d3_u16 = vec_sr(d3_u16, vec_splat_u16(6)); \
}

#define BLEND_LINES3(d0_u16, d1_u16, d2_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2) \
{ \
    u16x8 anm0 = vec_mule(ab0, nm_m0); \
    u16x8 anm1 = vec_mule(ab1, nm_m1); \
    u16x8 anm2 = vec_mule(ab2, nm_m2); \
\
    u16x8 bm0 = vec_mulo(ab0, nm_m0); \
    u16x8 bm1 = vec_mulo(ab1, nm_m1); \
    u16x8 bm2 = vec_mulo(ab2, nm_m2); \
\
    d0_u16 = vec_add(anm0, bm0); \
    d1_u16 = vec_add(anm1, bm1); \
    d2_u16 = vec_add(anm2, bm2); \
\
    d0_u16 = vec_add(d0_u16, vec_splats((uint16_t)32)); \
    d1_u16 = vec_add(d1_u16, vec_splats((uint16_t)32)); \
    d2_u16 = vec_add(d2_u16, vec_splats((uint16_t)32)); \
\
    d0_u16 = vec_sr(d0_u16, vec_splat_u16(6)); \
    d1_u16 = vec_sr(d1_u16, vec_splat_u16(6)); \
    d2_u16 = vec_sr(d2_u16, vec_splat_u16(6)); \
}

#define BLEND_LINES2(d0_u16, d1_u16, ab0, ab1, nm_m0, nm_m1) \
{ \
    u16x8 anm0 = vec_mule(ab0, nm_m0); \
    u16x8 anm1 = vec_mule(ab1, nm_m1); \
\
    u16x8 bm0 = vec_mulo(ab0, nm_m0); \
    u16x8 bm1 = vec_mulo(ab1, nm_m1); \
\
    d0_u16 = vec_add(anm0, bm0); \
    d1_u16 = vec_add(anm1, bm1); \
\
    d0_u16 = vec_add(d0_u16, vec_splats((uint16_t)32)); \
    d1_u16 = vec_add(d1_u16, vec_splats((uint16_t)32)); \
\
    d0_u16 = vec_sr(d0_u16, vec_splat_u16(6)); \
    d1_u16 = vec_sr(d1_u16, vec_splat_u16(6)); \
}

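/* Width-specialized blend kernels.  blend4 and blend8 each blend four rows per
 * call; tmp and mask are packed with a row stride equal to the block width,
 * and the partial-vector stores go through vec_xst_len so only w bytes per
 * row are written back. */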
static void blend4(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 a2 = vec_xl(0, dst + 2 * stride);
    u8x16 a3 = vec_xl(0, dst + 3 * stride);
    u8x16 m0 = vec_xl(0, mask);
    u8x16 m1 = vec_xl(0, mask + 4);
    u8x16 m2 = vec_xl(0, mask + 2 * 4);
    u8x16 m3 = vec_xl(0, mask + 3 * 4);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + 4);
    u8x16 b2 = vec_xl(0, tmp + 2 * 4);
    u8x16 b3 = vec_xl(0, tmp + 3 * 4);

    u8x16 nm0 = vec_sub(v64u8, m0);
    u8x16 nm1 = vec_sub(v64u8, m1);
    u8x16 nm2 = vec_sub(v64u8, m2);
    u8x16 nm3 = vec_sub(v64u8, m3);

    u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
    u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
    u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd
    u8x16 ab3 = vec_mergeh(a3, b3); // a even, b odd
    u8x16 nm_m0 = vec_mergeh(nm0, m0);
    u8x16 nm_m1 = vec_mergeh(nm1, m1);
    u8x16 nm_m2 = vec_mergeh(nm2, m2);
    u8x16 nm_m3 = vec_mergeh(nm3, m3);

    u16x8 d0_u16, d1_u16, d2_u16, d3_u16;

    BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3);

    u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
    u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);
    u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16);
    u8x16 d3 = (u8x16)vec_pack(d3_u16, d3_u16);

    vec_xst_len(d0, dst, 4);
    vec_xst_len(d1, dst + stride, 4);
    vec_xst_len(d2, dst + 2 * stride, 4);
    vec_xst_len(d3, dst + 3 * stride, 4);
}

static void blend8(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 a2 = vec_xl(0, dst + 2 * stride);
    u8x16 a3 = vec_xl(0, dst + 3 * stride);
    u8x16 m0 = vec_xl(0, mask);
    u8x16 m1 = vec_xl(0, mask + 8);
    u8x16 m2 = vec_xl(0, mask + 2 * 8);
    u8x16 m3 = vec_xl(0, mask + 3 * 8);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + 8);
    u8x16 b2 = vec_xl(0, tmp + 2 * 8);
    u8x16 b3 = vec_xl(0, tmp + 3 * 8);

    u8x16 nm0 = vec_sub(v64u8, m0);
    u8x16 nm1 = vec_sub(v64u8, m1);
    u8x16 nm2 = vec_sub(v64u8, m2);
    u8x16 nm3 = vec_sub(v64u8, m3);

    u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
    u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
    u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd
    u8x16 ab3 = vec_mergeh(a3, b3); // a even, b odd
    u8x16 nm_m0 = vec_mergeh(nm0, m0);
    u8x16 nm_m1 = vec_mergeh(nm1, m1);
    u8x16 nm_m2 = vec_mergeh(nm2, m2);
    u8x16 nm_m3 = vec_mergeh(nm3, m3);

    u16x8 d0_u16, d1_u16, d2_u16, d3_u16;

    BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3);

    u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
    u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);
    u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16);
    u8x16 d3 = (u8x16)vec_pack(d3_u16, d3_u16);

    vec_xst_len(d0, dst, 8);
    vec_xst_len(d1, dst + stride, 8);
    vec_xst_len(d2, dst + 2 * stride, 8);
    vec_xst_len(d3, dst + 3 * stride, 8);
}

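/* A full 16-pixel row needs both halves of the interleave, so blend16_lines
 * blends the vec_mergeh and vec_mergel halves separately and re-packs them
 * with vec_pack.  The tmp/mask row stride is passed in so the same routine
 * serves 16-wide blocks (mstride 16) and each 16-pixel column of 32-wide
 * blocks (mstride 32). */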
static inline void blend16_lines(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 a2 = vec_xl(0, dst + 2 * stride);
    u8x16 a3 = vec_xl(0, dst + 3 * stride);
    u8x16 m0 = vec_xl(0, mask);
    u8x16 m1 = vec_xl(0, mask + mstride);
    u8x16 m2 = vec_xl(0, mask + 2 * mstride);
    u8x16 m3 = vec_xl(0, mask + 3 * mstride);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + mstride);
    u8x16 b2 = vec_xl(0, tmp + 2 * mstride);
    u8x16 b3 = vec_xl(0, tmp + 3 * mstride);

    u8x16 nm0 = vec_sub(v64u8, m0);
    u8x16 nm1 = vec_sub(v64u8, m1);
    u8x16 nm2 = vec_sub(v64u8, m2);
    u8x16 nm3 = vec_sub(v64u8, m3);

    u8x16 ab0 = vec_mergeh(a0, b0);
    u8x16 ab1 = vec_mergeh(a1, b1);
    u8x16 ab2 = vec_mergeh(a2, b2);
    u8x16 ab3 = vec_mergeh(a3, b3);

    u8x16 nm_m0 = vec_mergeh(nm0, m0);
    u8x16 nm_m1 = vec_mergeh(nm1, m1);
    u8x16 nm_m2 = vec_mergeh(nm2, m2);
    u8x16 nm_m3 = vec_mergeh(nm3, m3);

    u16x8 d0h_u16, d1h_u16, d2h_u16, d3h_u16;
    u16x8 d0l_u16, d1l_u16, d2l_u16, d3l_u16;

    BLEND_LINES4(d0h_u16, d1h_u16, d2h_u16, d3h_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3)

    ab0 = vec_mergel(a0, b0);
    ab1 = vec_mergel(a1, b1);
    ab2 = vec_mergel(a2, b2);
    ab3 = vec_mergel(a3, b3);

    nm_m0 = vec_mergel(nm0, m0);
    nm_m1 = vec_mergel(nm1, m1);
    nm_m2 = vec_mergel(nm2, m2);
    nm_m3 = vec_mergel(nm3, m3);

    BLEND_LINES4(d0l_u16, d1l_u16, d2l_u16, d3l_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3)

    u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16);
    u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16);
    u8x16 d2 = (u8x16)vec_pack(d2h_u16, d2l_u16);
    u8x16 d3 = (u8x16)vec_pack(d3h_u16, d3l_u16);

    vec_xst(d0, 0, dst);
    vec_xst(d1, 0, dst + stride);
    vec_xst(d2, 0, dst + 2 * stride);
    vec_xst(d3, 0, dst + 3 * stride);
}

static void blend16(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend16_lines(dst, tmp, mask, stride, 16);
}

static void blend32(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    for (int i = 0; i < 2; i++, dst += 16, tmp += 16, mask += 16) {
        blend16_lines(dst, tmp, mask, stride, 32);
    }
}

static blend_line blend_funcs[4] = {
    blend4, blend8, blend16, blend32
};

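/* Entry point for dav1d's blend(): w is expected to be a power of two between
 * 4 and 32, so ctz(w) - 2 indexes blend_funcs directly, and the loop assumes
 * h is a multiple of 4 since every kernel blends four rows per call. */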
void dav1d_blend_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                           const int w, int h, const uint8_t *mask)
{
    assert(w <= 32);
    blend_line blend = blend_funcs[ctz(w) - 2];

    for (int y = 0; y < h; y += 4) {
        blend(dst, tmp, mask, PXSTRIDE(dst_stride));
        dst += 4 * PXSTRIDE(dst_stride);
        tmp += 4 * w;
        mask += 4 * w;
    }
}

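/* blend_v helpers: the vertical OBMC mask is constant down each column, so a
 * single mask vector serves both of the two rows blended per call.  blend_v_h
 * covers up to 8 output pixels per row (high half only), blend_v_hl also
 * processes the vec_mergel half for up to 16; l is the number of bytes
 * actually stored per row. */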
static inline void blend_v_h(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 m0 = vec_xl(0, mask);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + mstride);

    u8x16 nm0 = vec_sub(v64u8, m0);

    u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
    u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
    u8x16 nm_m0 = vec_mergeh(nm0, m0);

    u16x8 d0_u16, d1_u16;

    BLEND_LINES2(d0_u16, d1_u16, ab0, ab1, nm_m0, nm_m0);

    u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
    u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);

    vec_xst_len(d0, dst, l);
    vec_xst_len(d1, dst + stride, l);
}

static inline void blend_v_hl(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 m0 = vec_xl(0, mask);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + mstride);

    u8x16 nm0 = vec_sub(v64u8, m0);

    u8x16 ab0 = vec_mergeh(a0, b0);
    u8x16 ab1 = vec_mergeh(a1, b1);

    u8x16 nm_m0 = vec_mergeh(nm0, m0);

    u16x8 d0h_u16, d1h_u16;
    u16x8 d0l_u16, d1l_u16;

    BLEND_LINES2(d0h_u16, d1h_u16, ab0, ab1, nm_m0, nm_m0)

    ab0 = vec_mergel(a0, b0);
    ab1 = vec_mergel(a1, b1);

    nm_m0 = vec_mergel(nm0, m0);

    BLEND_LINES2(d0l_u16, d1l_u16, ab0, ab1, nm_m0, nm_m0)

    u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16);
    u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16);

    vec_xst_len(d0, dst, l);
    vec_xst_len(d1, dst + stride, l);
}

static void blend_v3(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend_v_h(dst, tmp, mask, stride, 4, 3);
}

static void blend_v6(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend_v_h(dst, tmp, mask, stride, 8, 6);
}

static void blend_v12(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend_v_hl(dst, tmp, mask, stride, 16, 12);
}

static void blend_v24(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend_v_hl(dst, tmp, mask, stride, 32, 16);
    blend_v_h(dst + 16, tmp + 16, mask + 16, stride, 32, 8);
}

static void blend_v1(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    dst[0] = blend_px(dst[0], tmp[0], mask[0]);
    dst[stride] = blend_px(dst[stride], tmp[2], mask[0]); // tmp rows are w (== 2) apart
}

static blend_line blend_v_funcs[5] = {
    blend_v1, blend_v3, blend_v6, blend_v12, blend_v24
};

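/* Entry point for dav1d's blend_v(): the column mask comes from
 * dav1d_obmc_masks[w] and is the same for every row, and only the leftmost
 * (w * 3) >> 2 columns are blended (1, 3, 6, 12 or 24 pixels for w = 2..32).
 * Two rows are processed per iteration. */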
void dav1d_blend_v_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                             const int w, int h)
{
    const uint8_t *const mask = &dav1d_obmc_masks[w];

    assert(w <= 32);
    blend_line blend = blend_v_funcs[ctz(w) - 1];

    for (int y = 0; y < h; y += 2) {
        blend(dst, tmp, mask, PXSTRIDE(dst_stride));

        dst += 2 * PXSTRIDE(dst_stride);
        tmp += 2 * w;
    }
}

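/* blend_h helpers: the horizontal OBMC mask is constant across each row but
 * changes from row to row, so one mask byte per row is splatted across a
 * vector with vec_splat(m, 0/1/2).  Three rows are blended per call;
 * blend_h_h stores up to 8 pixels per row, blend_h_hl a full 16. */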
static inline void blend_h_h(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 a2 = vec_xl(0, dst + 2 * stride);
    u8x16 m = vec_xl(0, mask);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + mstride);
    u8x16 b2 = vec_xl(0, tmp + 2 * mstride);
    u8x16 m0 = vec_splat(m, 0);
    u8x16 m1 = vec_splat(m, 1);
    u8x16 m2 = vec_splat(m, 2);

    u8x16 nm0 = vec_sub(v64u8, m0);
    u8x16 nm1 = vec_sub(v64u8, m1);
    u8x16 nm2 = vec_sub(v64u8, m2);

    u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
    u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
    u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd
    u8x16 nm_m0 = vec_mergeh(nm0, m0);
    u8x16 nm_m1 = vec_mergeh(nm1, m1);
    u8x16 nm_m2 = vec_mergeh(nm2, m2);

    u16x8 d0_u16, d1_u16, d2_u16;

    BLEND_LINES3(d0_u16, d1_u16, d2_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2);

    u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
    u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);
    u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16);

    vec_xst_len(d0, dst, l);
    vec_xst_len(d1, dst + stride, l);
    vec_xst_len(d2, dst + 2 * stride, l);
}

static inline void blend_h_hl(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 a2 = vec_xl(0, dst + 2 * stride);
    u8x16 m = vec_xl(0, mask);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + mstride);
    u8x16 b2 = vec_xl(0, tmp + 2 * mstride);
    u8x16 m0 = vec_splat(m, 0);
    u8x16 m1 = vec_splat(m, 1);
    u8x16 m2 = vec_splat(m, 2);

    u8x16 nm0 = vec_sub(v64u8, m0);
    u8x16 nm1 = vec_sub(v64u8, m1);
    u8x16 nm2 = vec_sub(v64u8, m2);

    u8x16 ab0 = vec_mergeh(a0, b0);
    u8x16 ab1 = vec_mergeh(a1, b1);
    u8x16 ab2 = vec_mergeh(a2, b2);

    u8x16 nm_m0 = vec_mergeh(nm0, m0);
    u8x16 nm_m1 = vec_mergeh(nm1, m1);
    u8x16 nm_m2 = vec_mergeh(nm2, m2);

    u16x8 d0h_u16, d1h_u16, d2h_u16;
    u16x8 d0l_u16, d1l_u16, d2l_u16;

    BLEND_LINES3(d0h_u16, d1h_u16, d2h_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2)

    ab0 = vec_mergel(a0, b0);
    ab1 = vec_mergel(a1, b1);
    ab2 = vec_mergel(a2, b2);

    nm_m0 = vec_mergel(nm0, m0);
    nm_m1 = vec_mergel(nm1, m1);
    nm_m2 = vec_mergel(nm2, m2);

    BLEND_LINES3(d0l_u16, d1l_u16, d2l_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2)

    u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16);
    u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16);
    u8x16 d2 = (u8x16)vec_pack(d2h_u16, d2l_u16);

    vec_xst(d0, 0, dst);
    vec_xst(d1, 0, dst + stride);
    vec_xst(d2, 0, dst + 2 * stride);
}

static void blend_h2(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    for (int y = 0; y < 3; y++) {
        const int m = *mask++;
        for (int x = 0; x < 2; x++) {
            dst[x] = blend_px(dst[x], tmp[x], m);
        }
        dst += stride;
        tmp += 2;
    }
}

static void blend_h4(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    blend_h_h(dst, tmp, mask, stride, 4, 4);
}

static void blend_h8(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    blend_h_h(dst, tmp, mask, stride, 8, 8);
}

static void blend_h16(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    blend_h_hl(dst, tmp, mask, stride, 16);
}

static void blend_h32(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    blend_h_hl(dst, tmp, mask, stride, 32);
    blend_h_hl(dst + 16, tmp + 16, mask, stride, 32);
}

static void blend_h64(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    blend_h_hl(dst, tmp, mask, stride, 64);
    blend_h_hl(dst + 16, tmp + 16, mask, stride, 64);
    blend_h_hl(dst + 32, tmp + 32, mask, stride, 64);
    blend_h_hl(dst + 48, tmp + 48, mask, stride, 64);
}

static void blend_h128(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    for (int i = 0; i < 2; i++, dst += 64, tmp += 64) {
        blend_h_hl(dst, tmp, mask, stride, 128);
        blend_h_hl(dst + 16, tmp + 16, mask, stride, 128);
        blend_h_hl(dst + 32, tmp + 32, mask, stride, 128);
        blend_h_hl(dst + 48, tmp + 48, mask, stride, 128);
    }
}

static blend_line blend_h_funcs[7] = {
    blend_h2, blend_h4, blend_h8, blend_h16, blend_h32, blend_h64, blend_h128
};

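/* Entry point for dav1d's blend_h(): the row mask comes from
 * dav1d_obmc_masks[h] and only the top (h * 3) >> 2 rows are blended.  For
 * power-of-two heights up to 32 that scaled count is 1, 3, 6, 12 or 24, i.e.
 * either 1 or a multiple of 3, so a single-row special case plus a 3-row main
 * loop covers every case. */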
void dav1d_blend_h_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                             const int w, int h)
{
    const uint8_t *mask = &dav1d_obmc_masks[h];
    h = (h * 3) >> 2;

    assert(w <= 128);
    blend_line blend = blend_h_funcs[ctz(w) - 1];

    if (h == 1) {
        const int m = *mask++;
        for (int x = 0; x < w; x++) {
            dst[x] = blend_px(dst[x], tmp[x], m);
        }
    } else {
        for (int y = 0; y < h; y += 3) {
            blend(dst, tmp, mask, PXSTRIDE(dst_stride));
            dst += 3 * PXSTRIDE(dst_stride);
            tmp += 3 * w;
            mask += 3;
        }
    }
}

#endif // BITDEPTH