/*
 * Copyright © 2024, VideoLAN and dav1d authors
 * Copyright © 2024, Luca Barbato
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include "common/attributes.h"
#include "src/ppc/mc.h"
#include "src/tables.h"
#include "src/ppc/dav1d_types.h"

#if BITDEPTH == 8

#define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)

typedef void (*blend_line)(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride);
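
/*
 * The BLEND_LINESn macros below evaluate blend_px() for eight pixels of n
 * rows at once.  The callers pre-interleave the operands (see the
 * "a even, b odd" comments):
 *
 *     ab   = vec_merge{h,l}(a, b)  = { a0, b0, a1, b1, ... }
 *     nm_m = vec_merge{h,l}(nm, m) = { 64-m0, m0, 64-m1, m1, ... }
 *
 * so that vec_mule(ab, nm_m) produces the widened products a_i * (64 - m_i)
 * and vec_mulo(ab, nm_m) the products b_i * m_i.  One add, the rounding
 * offset of 32 and a right shift by 6 then finish the blend in 16-bit
 * precision; the intermediates cannot overflow, since
 * a * (64 - m) + b * m + 32 <= 255 * 64 + 32 < 65536.
 */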
#define BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3) \
{ \
    u16x8 anm0 = vec_mule(ab0, nm_m0); \
    u16x8 anm1 = vec_mule(ab1, nm_m1); \
    u16x8 anm2 = vec_mule(ab2, nm_m2); \
    u16x8 anm3 = vec_mule(ab3, nm_m3); \
\
    u16x8 bm0 = vec_mulo(ab0, nm_m0); \
    u16x8 bm1 = vec_mulo(ab1, nm_m1); \
    u16x8 bm2 = vec_mulo(ab2, nm_m2); \
    u16x8 bm3 = vec_mulo(ab3, nm_m3); \
\
    d0_u16 = vec_add(anm0, bm0); \
    d1_u16 = vec_add(anm1, bm1); \
    d2_u16 = vec_add(anm2, bm2); \
    d3_u16 = vec_add(anm3, bm3); \
\
    d0_u16 = vec_add(d0_u16, vec_splats((uint16_t)32)); \
    d1_u16 = vec_add(d1_u16, vec_splats((uint16_t)32)); \
    d2_u16 = vec_add(d2_u16, vec_splats((uint16_t)32)); \
    d3_u16 = vec_add(d3_u16, vec_splats((uint16_t)32)); \
\
    d0_u16 = vec_sr(d0_u16, vec_splat_u16(6)); \
    d1_u16 = vec_sr(d1_u16, vec_splat_u16(6)); \
    d2_u16 = vec_sr(d2_u16, vec_splat_u16(6)); \
    d3_u16 = vec_sr(d3_u16, vec_splat_u16(6)); \
}

#define BLEND_LINES3(d0_u16, d1_u16, d2_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2) \
{ \
    u16x8 anm0 = vec_mule(ab0, nm_m0); \
    u16x8 anm1 = vec_mule(ab1, nm_m1); \
    u16x8 anm2 = vec_mule(ab2, nm_m2); \
\
    u16x8 bm0 = vec_mulo(ab0, nm_m0); \
    u16x8 bm1 = vec_mulo(ab1, nm_m1); \
    u16x8 bm2 = vec_mulo(ab2, nm_m2); \
\
    d0_u16 = vec_add(anm0, bm0); \
    d1_u16 = vec_add(anm1, bm1); \
    d2_u16 = vec_add(anm2, bm2); \
\
    d0_u16 = vec_add(d0_u16, vec_splats((uint16_t)32)); \
    d1_u16 = vec_add(d1_u16, vec_splats((uint16_t)32)); \
    d2_u16 = vec_add(d2_u16, vec_splats((uint16_t)32)); \
\
    d0_u16 = vec_sr(d0_u16, vec_splat_u16(6)); \
    d1_u16 = vec_sr(d1_u16, vec_splat_u16(6)); \
    d2_u16 = vec_sr(d2_u16, vec_splat_u16(6)); \
}

#define BLEND_LINES2(d0_u16, d1_u16, ab0, ab1, nm_m0, nm_m1) \
{ \
    u16x8 anm0 = vec_mule(ab0, nm_m0); \
    u16x8 anm1 = vec_mule(ab1, nm_m1); \
\
    u16x8 bm0 = vec_mulo(ab0, nm_m0); \
    u16x8 bm1 = vec_mulo(ab1, nm_m1); \
\
    d0_u16 = vec_add(anm0, bm0); \
    d1_u16 = vec_add(anm1, bm1); \
\
    d0_u16 = vec_add(d0_u16, vec_splats((uint16_t)32)); \
    d1_u16 = vec_add(d1_u16, vec_splats((uint16_t)32)); \
\
    d0_u16 = vec_sr(d0_u16, vec_splat_u16(6)); \
    d1_u16 = vec_sr(d1_u16, vec_splat_u16(6)); \
}

static void blend4(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 a2 = vec_xl(0, dst + 2 * stride);
    u8x16 a3 = vec_xl(0, dst + 3 * stride);
    u8x16 m0 = vec_xl(0, mask);
    u8x16 m1 = vec_xl(0, mask + 4);
    u8x16 m2 = vec_xl(0, mask + 2 * 4);
    u8x16 m3 = vec_xl(0, mask + 3 * 4);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + 4);
    u8x16 b2 = vec_xl(0, tmp + 2 * 4);
    u8x16 b3 = vec_xl(0, tmp + 3 * 4);

    u8x16 nm0 = vec_sub(v64u8, m0);
    u8x16 nm1 = vec_sub(v64u8, m1);
    u8x16 nm2 = vec_sub(v64u8, m2);
    u8x16 nm3 = vec_sub(v64u8, m3);

    u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
    u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
    u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd
    u8x16 ab3 = vec_mergeh(a3, b3); // a even, b odd
    u8x16 nm_m0 = vec_mergeh(nm0, m0);
    u8x16 nm_m1 = vec_mergeh(nm1, m1);
    u8x16 nm_m2 = vec_mergeh(nm2, m2);
    u8x16 nm_m3 = vec_mergeh(nm3, m3);

    u16x8 d0_u16, d1_u16, d2_u16, d3_u16;

    BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3);

    u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
    u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);
    u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16);
    u8x16 d3 = (u8x16)vec_pack(d3_u16, d3_u16);

    vec_xst_len(d0, dst, 4);
    vec_xst_len(d1, dst + stride, 4);
    vec_xst_len(d2, dst + 2 * stride, 4);
    vec_xst_len(d3, dst + 3 * stride, 4);
}

static void blend8(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 a2 = vec_xl(0, dst + 2 * stride);
    u8x16 a3 = vec_xl(0, dst + 3 * stride);
    u8x16 m0 = vec_xl(0, mask);
    u8x16 m1 = vec_xl(0, mask + 8);
    u8x16 m2 = vec_xl(0, mask + 2 * 8);
    u8x16 m3 = vec_xl(0, mask + 3 * 8);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + 8);
    u8x16 b2 = vec_xl(0, tmp + 2 * 8);
    u8x16 b3 = vec_xl(0, tmp + 3 * 8);

    u8x16 nm0 = vec_sub(v64u8, m0);
    u8x16 nm1 = vec_sub(v64u8, m1);
    u8x16 nm2 = vec_sub(v64u8, m2);
    u8x16 nm3 = vec_sub(v64u8, m3);

    u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
    u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
    u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd
    u8x16 ab3 = vec_mergeh(a3, b3); // a even, b odd
    u8x16 nm_m0 = vec_mergeh(nm0, m0);
    u8x16 nm_m1 = vec_mergeh(nm1, m1);
    u8x16 nm_m2 = vec_mergeh(nm2, m2);
    u8x16 nm_m3 = vec_mergeh(nm3, m3);

    u16x8 d0_u16, d1_u16, d2_u16, d3_u16;

    BLEND_LINES4(d0_u16, d1_u16, d2_u16, d3_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3);

    u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
    u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);
    u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16);
    u8x16 d3 = (u8x16)vec_pack(d3_u16, d3_u16);

    vec_xst_len(d0, dst, 8);
    vec_xst_len(d1, dst + stride, 8);
    vec_xst_len(d2, dst + 2 * stride, 8);
    vec_xst_len(d3, dst + 3 * stride, 8);
}
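
/*
 * blend4/blend8 above only need the vec_mergeh half of each row and store
 * the first 4 or 8 result bytes with vec_xst_len.  blend16_lines runs the
 * blend twice per row, once for each merge half, and stores a full vector.
 * mstride is the row stride of tmp and the mask (the block width w), which
 * is independent of the dst stride.
 */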
static inline void blend16_lines(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 a2 = vec_xl(0, dst + 2 * stride);
    u8x16 a3 = vec_xl(0, dst + 3 * stride);
    u8x16 m0 = vec_xl(0, mask);
    u8x16 m1 = vec_xl(0, mask + mstride);
    u8x16 m2 = vec_xl(0, mask + 2 * mstride);
    u8x16 m3 = vec_xl(0, mask + 3 * mstride);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + mstride);
    u8x16 b2 = vec_xl(0, tmp + 2 * mstride);
    u8x16 b3 = vec_xl(0, tmp + 3 * mstride);

    u8x16 nm0 = vec_sub(v64u8, m0);
    u8x16 nm1 = vec_sub(v64u8, m1);
    u8x16 nm2 = vec_sub(v64u8, m2);
    u8x16 nm3 = vec_sub(v64u8, m3);

    u8x16 ab0 = vec_mergeh(a0, b0);
    u8x16 ab1 = vec_mergeh(a1, b1);
    u8x16 ab2 = vec_mergeh(a2, b2);
    u8x16 ab3 = vec_mergeh(a3, b3);

    u8x16 nm_m0 = vec_mergeh(nm0, m0);
    u8x16 nm_m1 = vec_mergeh(nm1, m1);
    u8x16 nm_m2 = vec_mergeh(nm2, m2);
    u8x16 nm_m3 = vec_mergeh(nm3, m3);

    u16x8 d0h_u16, d1h_u16, d2h_u16, d3h_u16;
    u16x8 d0l_u16, d1l_u16, d2l_u16, d3l_u16;

    BLEND_LINES4(d0h_u16, d1h_u16, d2h_u16, d3h_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3);

    ab0 = vec_mergel(a0, b0);
    ab1 = vec_mergel(a1, b1);
    ab2 = vec_mergel(a2, b2);
    ab3 = vec_mergel(a3, b3);

    nm_m0 = vec_mergel(nm0, m0);
    nm_m1 = vec_mergel(nm1, m1);
    nm_m2 = vec_mergel(nm2, m2);
    nm_m3 = vec_mergel(nm3, m3);

    BLEND_LINES4(d0l_u16, d1l_u16, d2l_u16, d3l_u16, ab0, ab1, ab2, ab3, nm_m0, nm_m1, nm_m2, nm_m3);

    u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16);
    u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16);
    u8x16 d2 = (u8x16)vec_pack(d2h_u16, d2l_u16);
    u8x16 d3 = (u8x16)vec_pack(d3h_u16, d3l_u16);

    vec_xst(d0, 0, dst);
    vec_xst(d1, 0, dst + stride);
    vec_xst(d2, 0, dst + 2 * stride);
    vec_xst(d3, 0, dst + 3 * stride);
}

static void blend16(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend16_lines(dst, tmp, mask, stride, 16);
}

static void blend32(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    for (int i = 0; i < 2; i++, dst += 16, tmp += 16, mask += 16) {
        blend16_lines(dst, tmp, mask, stride, 32);
    }
}

static blend_line blend_funcs[4] = {
    blend4, blend8, blend16, blend32
};

void dav1d_blend_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                           const int w, int h, const uint8_t *mask)
{
    assert(w <= 32);
    blend_line blend = blend_funcs[ctz(w) - 2];

    for (int y = 0; y < h; y += 4) {
        blend(dst, tmp, mask, PXSTRIDE(dst_stride));
        dst += 4 * PXSTRIDE(dst_stride);
        tmp += 4 * w;
        mask += 4 * w;
    }
}
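
/*
 * blend_v: OBMC blending against a block's left edge.
 * dav1d_blend_v_8bpc_pwr9 points mask at dav1d_obmc_masks[w]; the weights
 * vary along a row but are identical for every row, so one mask vector is
 * loaded and its merged form nm_m0 is reused for both rows of a pair.
 * Only the leftmost w*3/4 pixels are blended, hence blend widths of
 * 1, 3, 6, 12 and 24 pixels for block widths 2, 4, 8, 16 and 32 (the
 * 24-wide case is split into a 16- and an 8-pixel store).
 */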
static inline void blend_v_h(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 m0 = vec_xl(0, mask);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + mstride);

    u8x16 nm0 = vec_sub(v64u8, m0);

    u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
    u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
    u8x16 nm_m0 = vec_mergeh(nm0, m0);

    u16x8 d0_u16, d1_u16;

    BLEND_LINES2(d0_u16, d1_u16, ab0, ab1, nm_m0, nm_m0);

    u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
    u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);

    vec_xst_len(d0, dst, l);
    vec_xst_len(d1, dst + stride, l);
}

static inline void blend_v_hl(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 m0 = vec_xl(0, mask);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + mstride);

    u8x16 nm0 = vec_sub(v64u8, m0);

    u8x16 ab0 = vec_mergeh(a0, b0);
    u8x16 ab1 = vec_mergeh(a1, b1);

    u8x16 nm_m0 = vec_mergeh(nm0, m0);

    u16x8 d0h_u16, d1h_u16;
    u16x8 d0l_u16, d1l_u16;

    BLEND_LINES2(d0h_u16, d1h_u16, ab0, ab1, nm_m0, nm_m0);

    ab0 = vec_mergel(a0, b0);
    ab1 = vec_mergel(a1, b1);

    nm_m0 = vec_mergel(nm0, m0);

    BLEND_LINES2(d0l_u16, d1l_u16, ab0, ab1, nm_m0, nm_m0);

    u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16);
    u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16);

    vec_xst_len(d0, dst, l);
    vec_xst_len(d1, dst + stride, l);
}

static void blend_v3(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend_v_h(dst, tmp, mask, stride, 4, 3);
}

static void blend_v6(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend_v_h(dst, tmp, mask, stride, 8, 6);
}

static void blend_v12(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend_v_hl(dst, tmp, mask, stride, 16, 12);
}

static void blend_v24(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    blend_v_hl(dst, tmp, mask, stride, 32, 16);
    blend_v_h(dst + 16, tmp + 16, mask + 16, stride, 32, 8);
}

static void blend_v1(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride)
{
    dst[0] = blend_px(dst[0], tmp[0], mask[0]);
    dst[stride] = blend_px(dst[stride], tmp[2], mask[0]); // tmp rows are w == 2 apart
}

static blend_line blend_v_funcs[5] = {
    blend_v1, blend_v3, blend_v6, blend_v12, blend_v24
};

void dav1d_blend_v_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                             const int w, int h)
{
    const uint8_t *const mask = &dav1d_obmc_masks[w];

    assert(w <= 32);
    blend_line blend = blend_v_funcs[ctz(w) - 1];

    for (int y = 0; y < h; y += 2) {
        blend(dst, tmp, mask, PXSTRIDE(dst_stride));

        dst += 2 * PXSTRIDE(dst_stride);
        tmp += 2 * w;
    }
}
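
/*
 * blend_h: OBMC blending against a block's top edge.  Here the weight is
 * constant along a row and changes per row, so each row's mask byte is
 * broadcast across a vector with vec_splat.  dav1d_blend_h_8bpc_pwr9 trims
 * h to 3/4 of the block height and the helpers consume three rows (and
 * three mask lanes) per call.
 */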
static inline void blend_h_h(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride, int l)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 a2 = vec_xl(0, dst + 2 * stride);
    u8x16 m = vec_xl(0, mask);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + mstride);
    u8x16 b2 = vec_xl(0, tmp + 2 * mstride);
    u8x16 m0 = vec_splat(m, 0);
    u8x16 m1 = vec_splat(m, 1);
    u8x16 m2 = vec_splat(m, 2);

    u8x16 nm0 = vec_sub(v64u8, m0);
    u8x16 nm1 = vec_sub(v64u8, m1);
    u8x16 nm2 = vec_sub(v64u8, m2);

    u8x16 ab0 = vec_mergeh(a0, b0); // a even, b odd
    u8x16 ab1 = vec_mergeh(a1, b1); // a even, b odd
    u8x16 ab2 = vec_mergeh(a2, b2); // a even, b odd
    u8x16 nm_m0 = vec_mergeh(nm0, m0);
    u8x16 nm_m1 = vec_mergeh(nm1, m1);
    u8x16 nm_m2 = vec_mergeh(nm2, m2);

    u16x8 d0_u16, d1_u16, d2_u16;

    BLEND_LINES3(d0_u16, d1_u16, d2_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2);

    u8x16 d0 = (u8x16)vec_pack(d0_u16, d0_u16);
    u8x16 d1 = (u8x16)vec_pack(d1_u16, d1_u16);
    u8x16 d2 = (u8x16)vec_pack(d2_u16, d2_u16);

    vec_xst_len(d0, dst, l);
    vec_xst_len(d1, dst + stride, l);
    vec_xst_len(d2, dst + 2 * stride, l);
}

static inline void blend_h_hl(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride, int mstride)
{
    u8x16 v64u8 = vec_splats((uint8_t)64);
    u8x16 a0 = vec_xl(0, dst);
    u8x16 a1 = vec_xl(0, dst + stride);
    u8x16 a2 = vec_xl(0, dst + 2 * stride);
    u8x16 m = vec_xl(0, mask);
    u8x16 b0 = vec_xl(0, tmp);
    u8x16 b1 = vec_xl(0, tmp + mstride);
    u8x16 b2 = vec_xl(0, tmp + 2 * mstride);
    u8x16 m0 = vec_splat(m, 0);
    u8x16 m1 = vec_splat(m, 1);
    u8x16 m2 = vec_splat(m, 2);

    u8x16 nm0 = vec_sub(v64u8, m0);
    u8x16 nm1 = vec_sub(v64u8, m1);
    u8x16 nm2 = vec_sub(v64u8, m2);

    u8x16 ab0 = vec_mergeh(a0, b0);
    u8x16 ab1 = vec_mergeh(a1, b1);
    u8x16 ab2 = vec_mergeh(a2, b2);

    u8x16 nm_m0 = vec_mergeh(nm0, m0);
    u8x16 nm_m1 = vec_mergeh(nm1, m1);
    u8x16 nm_m2 = vec_mergeh(nm2, m2);

    u16x8 d0h_u16, d1h_u16, d2h_u16;
    u16x8 d0l_u16, d1l_u16, d2l_u16;

    BLEND_LINES3(d0h_u16, d1h_u16, d2h_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2);

    ab0 = vec_mergel(a0, b0);
    ab1 = vec_mergel(a1, b1);
    ab2 = vec_mergel(a2, b2);

    nm_m0 = vec_mergel(nm0, m0);
    nm_m1 = vec_mergel(nm1, m1);
    nm_m2 = vec_mergel(nm2, m2);

    BLEND_LINES3(d0l_u16, d1l_u16, d2l_u16, ab0, ab1, ab2, nm_m0, nm_m1, nm_m2);

    u8x16 d0 = (u8x16)vec_pack(d0h_u16, d0l_u16);
    u8x16 d1 = (u8x16)vec_pack(d1h_u16, d1l_u16);
    u8x16 d2 = (u8x16)vec_pack(d2h_u16, d2l_u16);

    vec_xst(d0, 0, dst);
    vec_xst(d1, 0, dst + stride);
    vec_xst(d2, 0, dst + 2 * stride);
}

static void blend_h2(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    for (int y = 0; y < 3; y++) {
        const int m = *mask++;
        for (int x = 0; x < 2; x++) {
            dst[x] = blend_px(dst[x], tmp[x], m);
        }
        dst += stride;
        tmp += 2;
    }
}

static void blend_h4(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    blend_h_h(dst, tmp, mask, stride, 4, 4);
}

static void blend_h8(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    blend_h_h(dst, tmp, mask, stride, 8, 8);
}

static void blend_h16(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    blend_h_hl(dst, tmp, mask, stride, 16);
}

static void blend_h32(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    blend_h_hl(dst, tmp, mask, stride, 32);
    blend_h_hl(dst + 16, tmp + 16, mask, stride, 32);
}

static void blend_h64(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    blend_h_hl(dst, tmp, mask, stride, 64);
    blend_h_hl(dst + 16, tmp + 16, mask, stride, 64);
    blend_h_hl(dst + 32, tmp + 32, mask, stride, 64);
    blend_h_hl(dst + 48, tmp + 48, mask, stride, 64);
}

static void blend_h128(uint8_t *dst, const uint8_t *tmp, const uint8_t *mask, int stride) {
    for (int i = 0; i < 2; i++, dst += 64, tmp += 64) {
        blend_h_hl(dst, tmp, mask, stride, 128);
        blend_h_hl(dst + 16, tmp + 16, mask, stride, 128);
        blend_h_hl(dst + 32, tmp + 32, mask, stride, 128);
        blend_h_hl(dst + 48, tmp + 48, mask, stride, 128);
    }
}
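
/*
 * Block widths are powers of two, so ctz(w) indexes the dispatch tables
 * directly: blend_h_funcs[ctz(w) - 1] maps w = 2, 4, ..., 128 onto
 * blend_h2 ... blend_h128.  blend_funcs and blend_v_funcs above are indexed
 * the same way, with the subtracted constant matching ctz of the smallest
 * supported width.
 */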
static blend_line blend_h_funcs[7] = {
    blend_h2, blend_h4, blend_h8, blend_h16, blend_h32, blend_h64, blend_h128
};

void dav1d_blend_h_8bpc_pwr9(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
                             const int w, int h)
{
    const uint8_t *mask = &dav1d_obmc_masks[h];
    h = (h * 3) >> 2;

    assert(w <= 128);
    blend_line blend = blend_h_funcs[ctz(w) - 1];

    if (h == 1) {
        const int m = *mask++;
        for (int x = 0; x < w; x++) {
            dst[x] = blend_px(dst[x], tmp[x], m);
        }
    } else {
        for (int y = 0; y < h; y += 3) {
            blend(dst, tmp, mask, PXSTRIDE(dst_stride));
            dst += 3 * PXSTRIDE(dst_stride);
            tmp += 3 * w;
            mask += 3;
        }
    }
}

#endif // BITDEPTH