tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

mc_tmpl.c (35339B)


      1 /*
      2 * Copyright © 2018, VideoLAN and dav1d authors
      3 * Copyright © 2018, Two Orioles, LLC
      4 * All rights reserved.
      5 *
      6 * Redistribution and use in source and binary forms, with or without
      7 * modification, are permitted provided that the following conditions are met:
      8 *
      9 * 1. Redistributions of source code must retain the above copyright notice, this
     10 *    list of conditions and the following disclaimer.
     11 *
     12 * 2. Redistributions in binary form must reproduce the above copyright notice,
     13 *    this list of conditions and the following disclaimer in the documentation
     14 *    and/or other materials provided with the distribution.
     15 *
     16 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
     17 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
     18 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
     19 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
     20 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
     21 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
     22 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
     23 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
     24 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
     25 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     26 */
     27 
     28 #include "config.h"
     29 
     30 #include <stdlib.h>
     31 #include <string.h>
     32 
     33 #include "common/attributes.h"
     34 #include "common/intops.h"
     35 
     36 #include "src/mc.h"
     37 #include "src/tables.h"
     38 
     39 #if BITDEPTH == 8
     40 #define get_intermediate_bits(bitdepth_max) 4
     41 // Output in interval [-5132, 9212], fits in int16_t as is
     42 #define PREP_BIAS 0
     43 #else
     44 // 4 for 10 bits/component, 2 for 12 bits/component
     45 #define get_intermediate_bits(bitdepth_max) (14 - bitdepth_from_max(bitdepth_max))
     46 // Output in interval [-20588, 36956] (10-bit), [-20602, 36983] (12-bit)
     47 // Subtract a bias to ensure the output fits in int16_t
     48 #define PREP_BIAS 8192
     49 #endif
     50 
     51 static NOINLINE void
     52 put_c(pixel *dst, const ptrdiff_t dst_stride,
     53      const pixel *src, const ptrdiff_t src_stride, const int w, int h)
     54 {
     55    do {
     56        pixel_copy(dst, src, w);
     57 
     58        dst += dst_stride;
     59        src += src_stride;
     60    } while (--h);
     61 }
     62 
     63 static NOINLINE void
     64 prep_c(int16_t *tmp, const pixel *src, const ptrdiff_t src_stride,
     65       const int w, int h HIGHBD_DECL_SUFFIX)
     66 {
     67    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
     68    do {
     69        for (int x = 0; x < w; x++)
     70            tmp[x] = (src[x] << intermediate_bits) - PREP_BIAS;
     71 
     72        tmp += w;
     73        src += src_stride;
     74    } while (--h);
     75 }
     76 
     77 #define FILTER_8TAP(src, x, F, stride) \
     78    (F[0] * src[x + -3 * stride] + \
     79     F[1] * src[x + -2 * stride] + \
     80     F[2] * src[x + -1 * stride] + \
     81     F[3] * src[x + +0 * stride] + \
     82     F[4] * src[x + +1 * stride] + \
     83     F[5] * src[x + +2 * stride] + \
     84     F[6] * src[x + +3 * stride] + \
     85     F[7] * src[x + +4 * stride])
     86 
     87 #define FILTER_8TAP2(src, x, F) \
     88    (F[0] * src[0][x] + \
     89     F[1] * src[1][x] + \
     90     F[2] * src[2][x] + \
     91     F[3] * src[3][x] + \
     92     F[4] * src[4][x] + \
     93     F[5] * src[5][x] + \
     94     F[6] * src[6][x] + \
     95     F[7] * src[7][x])
     96 
     97 #define DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh) \
     98    ((FILTER_8TAP(src, x, F, stride) + ((1 << (sh)) >> 1)) >> (sh))
     99 
    100 #define DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh) \
    101    ((FILTER_8TAP(src, x, F, stride) + (rnd)) >> (sh))
    102 
    103 #define DAV1D_FILTER_8TAP_RND3(src, x, F, sh) \
    104    ((FILTER_8TAP2(src, x, F) + ((1 << (sh)) >> 1)) >> (sh))
    105 
    106 #define DAV1D_FILTER_8TAP_CLIP(src, x, F, stride, sh) \
    107    iclip_pixel(DAV1D_FILTER_8TAP_RND(src, x, F, stride, sh))
    108 
    109 #define DAV1D_FILTER_8TAP_CLIP2(src, x, F, stride, rnd, sh) \
    110    iclip_pixel(DAV1D_FILTER_8TAP_RND2(src, x, F, stride, rnd, sh))
    111 
    112 #define DAV1D_FILTER_8TAP_CLIP3(src, x, F, sh) \
    113    iclip_pixel(DAV1D_FILTER_8TAP_RND3(src, x, F, sh))
    114 
    115 #define GET_H_FILTER(mx) \
    116    const int8_t *const fh = !(mx) ? NULL : w > 4 ? \
    117        dav1d_mc_subpel_filters[filter_type & 3][(mx) - 1] : \
    118        dav1d_mc_subpel_filters[3 + (filter_type & 1)][(mx) - 1]
    119 
    120 #define GET_V_FILTER(my) \
    121    const int8_t *const fv = !(my) ? NULL : h > 4 ? \
    122        dav1d_mc_subpel_filters[filter_type >> 2][(my) - 1] : \
    123        dav1d_mc_subpel_filters[3 + ((filter_type >> 2) & 1)][(my) - 1]
    124 
    125 #define GET_FILTERS() \
    126    GET_H_FILTER(mx); \
    127    GET_V_FILTER(my)
    128 
    129 static NOINLINE void
    130 put_8tap_c(pixel *dst, ptrdiff_t dst_stride,
    131           const pixel *src, ptrdiff_t src_stride,
    132           const int w, int h, const int mx, const int my,
    133           const int filter_type HIGHBD_DECL_SUFFIX)
    134 {
    135    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    136    const int intermediate_rnd = 32 + ((1 << (6 - intermediate_bits)) >> 1);
    137 
    138    GET_FILTERS();
    139    dst_stride = PXSTRIDE(dst_stride);
    140    src_stride = PXSTRIDE(src_stride);
    141 
    142    if (fh) {
    143        if (fv) {
    144            int tmp_h = h + 7;
    145            int16_t mid[128 * 135], *mid_ptr = mid;
    146 
    147            src -= src_stride * 3;
    148            do {
    149                for (int x = 0; x < w; x++)
    150                    mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
    151                                                       6 - intermediate_bits);
    152 
    153                mid_ptr += 128;
    154                src += src_stride;
    155            } while (--tmp_h);
    156 
    157            mid_ptr = mid + 128 * 3;
    158            do {
    159                for (int x = 0; x < w; x++)
    160                    dst[x] = DAV1D_FILTER_8TAP_CLIP(mid_ptr, x, fv, 128,
    161                                                    6 + intermediate_bits);
    162 
    163                mid_ptr += 128;
    164                dst += dst_stride;
    165            } while (--h);
    166        } else {
    167            do {
    168                for (int x = 0; x < w; x++) {
    169                    dst[x] = DAV1D_FILTER_8TAP_CLIP2(src, x, fh, 1,
    170                                                     intermediate_rnd, 6);
    171                }
    172 
    173                dst += dst_stride;
    174                src += src_stride;
    175            } while (--h);
    176        }
    177    } else if (fv) {
    178        do {
    179            for (int x = 0; x < w; x++)
    180                dst[x] = DAV1D_FILTER_8TAP_CLIP(src, x, fv, src_stride, 6);
    181 
    182            dst += dst_stride;
    183            src += src_stride;
    184        } while (--h);
    185    } else
    186        put_c(dst, dst_stride, src, src_stride, w, h);
    187 }
    188 
    189 static NOINLINE void
    190 put_8tap_scaled_c(pixel *dst, const ptrdiff_t dst_stride,
    191                  const pixel *src, ptrdiff_t src_stride,
    192                  const int w, int h, const int mx, int my,
    193                  const int dx, const int dy, const int filter_type
    194                  HIGHBD_DECL_SUFFIX)
    195 {
    196    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    197    const int intermediate_rnd = (1 << intermediate_bits) >> 1;
    198    int16_t mid[128 * 8];
    199    int16_t *mid_ptrs[8];
    200    int in_y = -8;
    201    src_stride = PXSTRIDE(src_stride);
    202 
    203    for (int i = 0; i < 8; i++)
    204        mid_ptrs[i] = &mid[128 * i];
    205 
    206    src -= src_stride * 3;
    207 
    208    for (int y = 0; y < h; y++) {
    209        int x;
    210        int src_y = my >> 10;
    211        GET_V_FILTER((my & 0x3ff) >> 6);
    212 
    213        while (in_y < src_y) {
    214            int imx = mx, ioff = 0;
    215            int16_t *mid_ptr = mid_ptrs[0];
    216 
    217            for (int i = 0; i < 7; i++)
    218                mid_ptrs[i] = mid_ptrs[i + 1];
    219            mid_ptrs[7] = mid_ptr;
    220 
    221            for (x = 0; x < w; x++) {
    222                GET_H_FILTER(imx >> 6);
    223                mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
    224                                                        6 - intermediate_bits) :
    225                                  src[ioff] << intermediate_bits;
    226                imx += dx;
    227                ioff += imx >> 10;
    228                imx &= 0x3ff;
    229            }
    230 
    231            src += src_stride;
    232            in_y++;
    233        }
    234 
    235        for (x = 0; x < w; x++)
    236            dst[x] = fv ? DAV1D_FILTER_8TAP_CLIP3(mid_ptrs, x, fv,
    237                                                  6 + intermediate_bits) :
    238                          iclip_pixel((mid_ptrs[3][x] + intermediate_rnd) >>
    239                                              intermediate_bits);
    240 
    241        my += dy;
    242        dst += PXSTRIDE(dst_stride);
    243    }
    244 }
    245 
    246 static NOINLINE void
    247 prep_8tap_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
    248            const int w, int h, const int mx, const int my,
    249            const int filter_type HIGHBD_DECL_SUFFIX)
    250 {
    251    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    252    GET_FILTERS();
    253    src_stride = PXSTRIDE(src_stride);
    254 
    255    if (fh) {
    256        if (fv) {
    257            int tmp_h = h + 7;
    258            int16_t mid[128 * 135], *mid_ptr = mid;
    259 
    260            src -= src_stride * 3;
    261            do {
    262                for (int x = 0; x < w; x++)
    263                    mid_ptr[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
    264                                                       6 - intermediate_bits);
    265 
    266                mid_ptr += 128;
    267                src += src_stride;
    268            } while (--tmp_h);
    269 
    270            mid_ptr = mid + 128 * 3;
    271            do {
    272                for (int x = 0; x < w; x++) {
    273                    int t = DAV1D_FILTER_8TAP_RND(mid_ptr, x, fv, 128, 6) -
    274                                  PREP_BIAS;
    275                    assert(t >= INT16_MIN && t <= INT16_MAX);
    276                    tmp[x] = t;
    277                }
    278 
    279                mid_ptr += 128;
    280                tmp += w;
    281            } while (--h);
    282        } else {
    283            do {
    284                for (int x = 0; x < w; x++)
    285                    tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fh, 1,
    286                                                   6 - intermediate_bits) -
    287                             PREP_BIAS;
    288 
    289                tmp += w;
    290                src += src_stride;
    291            } while (--h);
    292        }
    293    } else if (fv) {
    294        do {
    295            for (int x = 0; x < w; x++)
    296                tmp[x] = DAV1D_FILTER_8TAP_RND(src, x, fv, src_stride,
    297                                               6 - intermediate_bits) -
    298                         PREP_BIAS;
    299 
    300            tmp += w;
    301            src += src_stride;
    302        } while (--h);
    303    } else
    304        prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
    305 }
    306 
    307 static NOINLINE void
    308 prep_8tap_scaled_c(int16_t *tmp, const pixel *src, ptrdiff_t src_stride,
    309                   const int w, int h, const int mx, int my,
    310                   const int dx, const int dy, const int filter_type
    311                   HIGHBD_DECL_SUFFIX)
    312 {
    313    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    314    int16_t mid[128 * 8];
    315    int16_t *mid_ptrs[8];
    316    int in_y = -8;
    317    src_stride = PXSTRIDE(src_stride);
    318 
    319    for (int i = 0; i < 8; i++)
    320        mid_ptrs[i] = &mid[128 * i];
    321 
    322    src -= src_stride * 3;
    323 
    324    for (int y = 0; y < h; y++) {
    325        int x;
    326        int src_y = my >> 10;
    327        GET_V_FILTER((my & 0x3ff) >> 6);
    328 
    329        while (in_y < src_y) {
    330            int imx = mx, ioff = 0;
    331            int16_t *mid_ptr = mid_ptrs[0];
    332 
    333            for (int i = 0; i < 7; i++)
    334                mid_ptrs[i] = mid_ptrs[i + 1];
    335            mid_ptrs[7] = mid_ptr;
    336 
    337            for (x = 0; x < w; x++) {
    338                GET_H_FILTER(imx >> 6);
    339                mid_ptr[x] = fh ? DAV1D_FILTER_8TAP_RND(src, ioff, fh, 1,
    340                                                        6 - intermediate_bits) :
    341                                  src[ioff] << intermediate_bits;
    342                imx += dx;
    343                ioff += imx >> 10;
    344                imx &= 0x3ff;
    345            }
    346 
    347            src += src_stride;
    348            in_y++;
    349        }
    350 
    351        for (x = 0; x < w; x++)
    352            tmp[x] = (fv ? DAV1D_FILTER_8TAP_RND3(mid_ptrs, x, fv, 6)
    353                         : mid_ptrs[3][x]) - PREP_BIAS;
    354 
    355        my += dy;
    356        tmp += w;
    357    }
    358 }
    359 
    360 #define filter_fns(type, type_h, type_v) \
    361 static void put_8tap_##type##_c(pixel *const dst, \
    362                                const ptrdiff_t dst_stride, \
    363                                const pixel *const src, \
    364                                const ptrdiff_t src_stride, \
    365                                const int w, const int h, \
    366                                const int mx, const int my \
    367                                HIGHBD_DECL_SUFFIX) \
    368 { \
    369    put_8tap_c(dst, dst_stride, src, src_stride, w, h, mx, my, \
    370               type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
    371 } \
    372 static void put_8tap_##type##_scaled_c(pixel *const dst, \
    373                                       const ptrdiff_t dst_stride, \
    374                                       const pixel *const src, \
    375                                       const ptrdiff_t src_stride, \
    376                                       const int w, const int h, \
    377                                       const int mx, const int my, \
    378                                       const int dx, const int dy \
    379                                       HIGHBD_DECL_SUFFIX) \
    380 { \
    381    put_8tap_scaled_c(dst, dst_stride, src, src_stride, w, h, mx, my, dx, dy, \
    382                      type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
    383 } \
    384 static void prep_8tap_##type##_c(int16_t *const tmp, \
    385                                 const pixel *const src, \
    386                                 const ptrdiff_t src_stride, \
    387                                 const int w, const int h, \
    388                                 const int mx, const int my \
    389                                 HIGHBD_DECL_SUFFIX) \
    390 { \
    391    prep_8tap_c(tmp, src, src_stride, w, h, mx, my, \
    392                type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
    393 } \
    394 static void prep_8tap_##type##_scaled_c(int16_t *const tmp, \
    395                                        const pixel *const src, \
    396                                        const ptrdiff_t src_stride, \
    397                                        const int w, const int h, \
    398                                        const int mx, const int my, \
    399                                        const int dx, const int dy \
    400                                        HIGHBD_DECL_SUFFIX) \
    401 { \
    402    prep_8tap_scaled_c(tmp, src, src_stride, w, h, mx, my, dx, dy, \
    403                       type_h | (type_v << 2) HIGHBD_TAIL_SUFFIX); \
    404 }
    405 
    406 filter_fns(regular,        DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_REGULAR)
    407 filter_fns(regular_sharp,  DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SHARP)
    408 filter_fns(regular_smooth, DAV1D_FILTER_8TAP_REGULAR, DAV1D_FILTER_8TAP_SMOOTH)
    409 filter_fns(smooth,         DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_SMOOTH)
    410 filter_fns(smooth_regular, DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_REGULAR)
    411 filter_fns(smooth_sharp,   DAV1D_FILTER_8TAP_SMOOTH,  DAV1D_FILTER_8TAP_SHARP)
    412 filter_fns(sharp,          DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_SHARP)
    413 filter_fns(sharp_regular,  DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_REGULAR)
    414 filter_fns(sharp_smooth,   DAV1D_FILTER_8TAP_SHARP,   DAV1D_FILTER_8TAP_SMOOTH)
    415 
    416 #define FILTER_BILIN(src, x, mxy, stride) \
    417    (16 * src[x] + ((mxy) * (src[x + stride] - src[x])))
    418 
    419 #define FILTER_BILIN_RND(src, x, mxy, stride, sh) \
    420    ((FILTER_BILIN(src, x, mxy, stride) + ((1 << (sh)) >> 1)) >> (sh))
    421 
    422 #define FILTER_BILIN_CLIP(src, x, mxy, stride, sh) \
    423    iclip_pixel(FILTER_BILIN_RND(src, x, mxy, stride, sh))
    424 
    425 #define FILTER_BILIN2(src1, src2, x, mxy) \
    426    (16 * src1[x] + ((mxy) * (src2[x] - src1[x])))
    427 
    428 #define FILTER_BILIN_RND2(src1, src2, x, mxy, sh) \
    429    ((FILTER_BILIN2(src1, src2, x, mxy) + ((1 << (sh)) >> 1)) >> (sh))
    430 
    431 #define FILTER_BILIN_CLIP2(src1, src2, x, mxy, sh) \
    432    iclip_pixel(FILTER_BILIN_RND2(src1, src2, x, mxy, sh))
    433 
    434 static void put_bilin_c(pixel *dst, ptrdiff_t dst_stride,
    435                        const pixel *src, ptrdiff_t src_stride,
    436                        const int w, int h, const int mx, const int my
    437                        HIGHBD_DECL_SUFFIX)
    438 {
    439    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    440    const int intermediate_rnd = (1 << intermediate_bits) >> 1;
    441    dst_stride = PXSTRIDE(dst_stride);
    442    src_stride = PXSTRIDE(src_stride);
    443 
    444    if (mx) {
    445        if (my) {
    446            int16_t mid[128 * 129], *mid_ptr = mid;
    447            int tmp_h = h + 1;
    448 
    449            do {
    450                for (int x = 0; x < w; x++)
    451                    mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
    452                                                  4 - intermediate_bits);
    453 
    454                mid_ptr += 128;
    455                src += src_stride;
    456            } while (--tmp_h);
    457 
    458            mid_ptr = mid;
    459            do {
    460                for (int x = 0; x < w; x++)
    461                    dst[x] = FILTER_BILIN_CLIP(mid_ptr, x, my, 128,
    462                                               4 + intermediate_bits);
    463 
    464                mid_ptr += 128;
    465                dst += dst_stride;
    466            } while (--h);
    467        } else {
    468            do {
    469                for (int x = 0; x < w; x++) {
    470                    const int px = FILTER_BILIN_RND(src, x, mx, 1,
    471                                                    4 - intermediate_bits);
    472                    dst[x] = iclip_pixel((px + intermediate_rnd) >> intermediate_bits);
    473                }
    474 
    475                dst += dst_stride;
    476                src += src_stride;
    477            } while (--h);
    478        }
    479    } else if (my) {
    480        do {
    481            for (int x = 0; x < w; x++)
    482                dst[x] = FILTER_BILIN_CLIP(src, x, my, src_stride, 4);
    483 
    484            dst += dst_stride;
    485            src += src_stride;
    486        } while (--h);
    487    } else
    488        put_c(dst, dst_stride, src, src_stride, w, h);
    489 }
    490 
    491 static void put_bilin_scaled_c(pixel *dst, ptrdiff_t dst_stride,
    492                               const pixel *src, ptrdiff_t src_stride,
    493                               const int w, int h, const int mx, int my,
    494                               const int dx, const int dy
    495                               HIGHBD_DECL_SUFFIX)
    496 {
    497    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    498    int16_t mid[128 * 2];
    499    int in_y = -2;
    500 
    501    do {
    502        int x;
    503        int y = my >> 10;
    504        int16_t *mid1 = &mid[(y & 1) * 128];
    505        int16_t *mid2 = &mid[((y + 1) & 1) * 128];
    506        int dmy = my & 0x3ff;
    507 
    508        while (in_y < y) {
    509            int imx = mx, ioff = 0;
    510            int16_t *mid_ptr = &mid[(in_y & 1) * 128];
    511 
    512            for (x = 0; x < w; x++) {
    513                mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
    514                                              4 - intermediate_bits);
    515                imx += dx;
    516                ioff += imx >> 10;
    517                imx &= 0x3ff;
    518            }
    519 
    520            src += PXSTRIDE(src_stride);
    521            in_y++;
    522        }
    523 
    524        for (x = 0; x < w; x++)
    525            dst[x] = FILTER_BILIN_CLIP2(mid1, mid2, x, dmy >> 6,
    526                                       4 + intermediate_bits);
    527 
    528        my += dy;
    529        dst += PXSTRIDE(dst_stride);
    530    } while (--h);
    531 }
    532 
    533 static void prep_bilin_c(int16_t *tmp,
    534                         const pixel *src, ptrdiff_t src_stride,
    535                         const int w, int h, const int mx, const int my
    536                         HIGHBD_DECL_SUFFIX)
    537 {
    538    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    539    src_stride = PXSTRIDE(src_stride);
    540 
    541    if (mx) {
    542        if (my) {
    543            int16_t mid[128 * 129], *mid_ptr = mid;
    544            int tmp_h = h + 1;
    545 
    546            do {
    547                for (int x = 0; x < w; x++)
    548                    mid_ptr[x] = FILTER_BILIN_RND(src, x, mx, 1,
    549                                                  4 - intermediate_bits);
    550 
    551                mid_ptr += 128;
    552                src += src_stride;
    553            } while (--tmp_h);
    554 
    555            mid_ptr = mid;
    556            do {
    557                for (int x = 0; x < w; x++)
    558                    tmp[x] = FILTER_BILIN_RND(mid_ptr, x, my, 128, 4) -
    559                             PREP_BIAS;
    560 
    561                mid_ptr += 128;
    562                tmp += w;
    563            } while (--h);
    564        } else {
    565            do {
    566                for (int x = 0; x < w; x++)
    567                    tmp[x] = FILTER_BILIN_RND(src, x, mx, 1,
    568                                              4 - intermediate_bits) -
    569                             PREP_BIAS;
    570 
    571                tmp += w;
    572                src += src_stride;
    573            } while (--h);
    574        }
    575    } else if (my) {
    576        do {
    577            for (int x = 0; x < w; x++)
    578                tmp[x] = FILTER_BILIN_RND(src, x, my, src_stride,
    579                                          4 - intermediate_bits) - PREP_BIAS;
    580 
    581            tmp += w;
    582            src += src_stride;
    583        } while (--h);
    584    } else
    585        prep_c(tmp, src, src_stride, w, h HIGHBD_TAIL_SUFFIX);
    586 }
    587 
    588 static void prep_bilin_scaled_c(int16_t *tmp,
    589                                const pixel *src, ptrdiff_t src_stride,
    590                                const int w, int h, const int mx, int my,
    591                                const int dx, const int dy HIGHBD_DECL_SUFFIX)
    592 {
    593    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    594    int16_t mid[128 * 2];
    595    int in_y = -2;
    596 
    597    do {
    598        int x;
    599        int y = my >> 10;
    600        int16_t *mid1 = &mid[(y & 1) * 128];
    601        int16_t *mid2 = &mid[((y + 1) & 1) * 128];
    602        int dmy = my & 0x3ff;
    603 
    604        while (in_y < y) {
    605            int imx = mx, ioff = 0;
    606            int16_t *mid_ptr = &mid[(in_y & 1) * 128];
    607 
    608            for (x = 0; x < w; x++) {
    609                mid_ptr[x] = FILTER_BILIN_RND(src, ioff, imx >> 6, 1,
    610                                              4 - intermediate_bits);
    611                imx += dx;
    612                ioff += imx >> 10;
    613                imx &= 0x3ff;
    614            }
    615 
    616            src += PXSTRIDE(src_stride);
    617            in_y++;
    618        }
    619 
    620        for (x = 0; x < w; x++)
    621            tmp[x] = FILTER_BILIN_RND2(mid1, mid2, x, dmy >> 6, 4) - PREP_BIAS;
    622 
    623        my += dy;
    624        tmp += w;
    625    } while (--h);
    626 }
    627 
    628 static void avg_c(pixel *dst, const ptrdiff_t dst_stride,
    629                  const int16_t *tmp1, const int16_t *tmp2, const int w, int h
    630                  HIGHBD_DECL_SUFFIX)
    631 {
    632    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    633    const int sh = intermediate_bits + 1;
    634    const int rnd = (1 << intermediate_bits) + PREP_BIAS * 2;
    635    do {
    636        for (int x = 0; x < w; x++)
    637            dst[x] = iclip_pixel((tmp1[x] + tmp2[x] + rnd) >> sh);
    638 
    639        tmp1 += w;
    640        tmp2 += w;
    641        dst += PXSTRIDE(dst_stride);
    642    } while (--h);
    643 }
    644 
    645 static void w_avg_c(pixel *dst, const ptrdiff_t dst_stride,
    646                    const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
    647                    const int weight HIGHBD_DECL_SUFFIX)
    648 {
    649    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    650    const int sh = intermediate_bits + 4;
    651    const int rnd = (8 << intermediate_bits) + PREP_BIAS * 16;
    652    do {
    653        for (int x = 0; x < w; x++)
    654            dst[x] = iclip_pixel((tmp1[x] * weight +
    655                                  tmp2[x] * (16 - weight) + rnd) >> sh);
    656 
    657        tmp1 += w;
    658        tmp2 += w;
    659        dst += PXSTRIDE(dst_stride);
    660    } while (--h);
    661 }
    662 
    663 static void mask_c(pixel *dst, const ptrdiff_t dst_stride,
    664                   const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
    665                   const uint8_t *mask HIGHBD_DECL_SUFFIX)
    666 {
    667    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    668    const int sh = intermediate_bits + 6;
    669    const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
    670    do {
    671        for (int x = 0; x < w; x++)
    672            dst[x] = iclip_pixel((tmp1[x] * mask[x] +
    673                                  tmp2[x] * (64 - mask[x]) + rnd) >> sh);
    674 
    675        tmp1 += w;
    676        tmp2 += w;
    677        mask += w;
    678        dst += PXSTRIDE(dst_stride);
    679    } while (--h);
    680 }
    681 
    682 #define blend_px(a, b, m) (((a * (64 - m) + b * m) + 32) >> 6)
    683 static void blend_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
    684                    const int w, int h, const uint8_t *mask)
    685 {
    686    do {
    687        for (int x = 0; x < w; x++) {
    688            dst[x] = blend_px(dst[x], tmp[x], mask[x]);
    689        }
    690        dst += PXSTRIDE(dst_stride);
    691        tmp += w;
    692        mask += w;
    693    } while (--h);
    694 }
    695 
    696 static void blend_v_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
    697                      const int w, int h)
    698 {
    699    const uint8_t *const mask = &dav1d_obmc_masks[w];
    700    do {
    701        for (int x = 0; x < (w * 3) >> 2; x++) {
    702            dst[x] = blend_px(dst[x], tmp[x], mask[x]);
    703        }
    704        dst += PXSTRIDE(dst_stride);
    705        tmp += w;
    706    } while (--h);
    707 }
    708 
    709 static void blend_h_c(pixel *dst, const ptrdiff_t dst_stride, const pixel *tmp,
    710                      const int w, int h)
    711 {
    712    const uint8_t *mask = &dav1d_obmc_masks[h];
    713    h = (h * 3) >> 2;
    714    do {
    715        const int m = *mask++;
    716        for (int x = 0; x < w; x++) {
    717            dst[x] = blend_px(dst[x], tmp[x], m);
    718        }
    719        dst += PXSTRIDE(dst_stride);
    720        tmp += w;
    721    } while (--h);
    722 }
    723 
    724 static void w_mask_c(pixel *dst, const ptrdiff_t dst_stride,
    725                     const int16_t *tmp1, const int16_t *tmp2, const int w, int h,
    726                     uint8_t *mask, const int sign,
    727                     const int ss_hor, const int ss_ver HIGHBD_DECL_SUFFIX)
    728 {
    729    // store mask at 2x2 resolution, i.e. store 2x1 sum for even rows,
    730    // and then load this intermediate to calculate final value for odd rows
    731    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    732    const int bitdepth = bitdepth_from_max(bitdepth_max);
    733    const int sh = intermediate_bits + 6;
    734    const int rnd = (32 << intermediate_bits) + PREP_BIAS * 64;
    735    const int mask_sh = bitdepth + intermediate_bits - 4;
    736    const int mask_rnd = 1 << (mask_sh - 5);
    737    do {
    738        for (int x = 0; x < w; x++) {
    739            const int tmpdiff = tmp1[x] - tmp2[x];
    740            const int m = imin(38 + ((abs(tmpdiff) + mask_rnd) >> mask_sh), 64);
    741            dst[x] = iclip_pixel((tmpdiff * m + tmp2[x] * 64 + rnd) >> sh);
    742 
    743            if (ss_hor) {
    744                x++;
    745 
    746                const int tmpdiff = tmp1[x] - tmp2[x];
    747                const int n = imin(38 + ((abs(tmpdiff) + mask_rnd) >> mask_sh), 64);
    748                dst[x] = iclip_pixel((tmpdiff * n + tmp2[x] * 64 + rnd) >> sh);
    749 
    750                if (h & ss_ver) {
    751                    mask[x >> 1] = (m + n + mask[x >> 1] + 2 - sign) >> 2;
    752                } else if (ss_ver) {
    753                    mask[x >> 1] = m + n;
    754                } else {
    755                    mask[x >> 1] = (m + n + 1 - sign) >> 1;
    756                }
    757            } else {
    758                mask[x] = m;
    759            }
    760        }
    761 
    762        tmp1 += w;
    763        tmp2 += w;
    764        dst += PXSTRIDE(dst_stride);
    765        if (!ss_ver || (h & 1)) mask += w >> ss_hor;
    766    } while (--h);
    767 }
    768 
    769 #define w_mask_fns(ssn, ss_hor, ss_ver) \
    770 static void w_mask_##ssn##_c(pixel *const dst, const ptrdiff_t dst_stride, \
    771                             const int16_t *const tmp1, const int16_t *const tmp2, \
    772                             const int w, const int h, uint8_t *mask, \
    773                             const int sign HIGHBD_DECL_SUFFIX) \
    774 { \
    775    w_mask_c(dst, dst_stride, tmp1, tmp2, w, h, mask, sign, ss_hor, ss_ver \
    776             HIGHBD_TAIL_SUFFIX); \
    777 }
    778 
    779 w_mask_fns(444, 0, 0);
    780 w_mask_fns(422, 1, 0);
    781 w_mask_fns(420, 1, 1);
    782 
    783 #undef w_mask_fns
    784 
    785 #define FILTER_WARP_RND(src, x, F, stride, sh) \
    786    ((F[0] * src[x - 3 * stride] + \
    787      F[1] * src[x - 2 * stride] + \
    788      F[2] * src[x - 1 * stride] + \
    789      F[3] * src[x + 0 * stride] + \
    790      F[4] * src[x + 1 * stride] + \
    791      F[5] * src[x + 2 * stride] + \
    792      F[6] * src[x + 3 * stride] + \
    793      F[7] * src[x + 4 * stride] + \
    794      ((1 << (sh)) >> 1)) >> (sh))
    795 
    796 #define FILTER_WARP_CLIP(src, x, F, stride, sh) \
    797    iclip_pixel(FILTER_WARP_RND(src, x, F, stride, sh))
    798 
    799 static void warp_affine_8x8_c(pixel *dst, const ptrdiff_t dst_stride,
    800                              const pixel *src, const ptrdiff_t src_stride,
    801                              const int16_t *const abcd, int mx, int my
    802                              HIGHBD_DECL_SUFFIX)
    803 {
    804    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    805    int16_t mid[15 * 8], *mid_ptr = mid;
    806 
    807    src -= 3 * PXSTRIDE(src_stride);
    808    for (int y = 0; y < 15; y++, mx += abcd[1]) {
    809        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
    810            const int8_t *const filter =
    811                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
    812 
    813            mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
    814                                         7 - intermediate_bits);
    815        }
    816        src += PXSTRIDE(src_stride);
    817        mid_ptr += 8;
    818    }
    819 
    820    mid_ptr = &mid[3 * 8];
    821    for (int y = 0; y < 8; y++, my += abcd[3]) {
    822        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
    823            const int8_t *const filter =
    824                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
    825 
    826            dst[x] = FILTER_WARP_CLIP(mid_ptr, x, filter, 8,
    827                                      7 + intermediate_bits);
    828        }
    829        mid_ptr += 8;
    830        dst += PXSTRIDE(dst_stride);
    831    }
    832 }
    833 
    834 static void warp_affine_8x8t_c(int16_t *tmp, const ptrdiff_t tmp_stride,
    835                               const pixel *src, const ptrdiff_t src_stride,
    836                               const int16_t *const abcd, int mx, int my
    837                               HIGHBD_DECL_SUFFIX)
    838 {
    839    const int intermediate_bits = get_intermediate_bits(bitdepth_max);
    840    int16_t mid[15 * 8], *mid_ptr = mid;
    841 
    842    src -= 3 * PXSTRIDE(src_stride);
    843    for (int y = 0; y < 15; y++, mx += abcd[1]) {
    844        for (int x = 0, tmx = mx; x < 8; x++, tmx += abcd[0]) {
    845            const int8_t *const filter =
    846                dav1d_mc_warp_filter[64 + ((tmx + 512) >> 10)];
    847 
    848            mid_ptr[x] = FILTER_WARP_RND(src, x, filter, 1,
    849                                         7 - intermediate_bits);
    850        }
    851        src += PXSTRIDE(src_stride);
    852        mid_ptr += 8;
    853    }
    854 
    855    mid_ptr = &mid[3 * 8];
    856    for (int y = 0; y < 8; y++, my += abcd[3]) {
    857        for (int x = 0, tmy = my; x < 8; x++, tmy += abcd[2]) {
    858            const int8_t *const filter =
    859                dav1d_mc_warp_filter[64 + ((tmy + 512) >> 10)];
    860 
    861            tmp[x] = FILTER_WARP_RND(mid_ptr, x, filter, 8, 7) - PREP_BIAS;
    862        }
    863        mid_ptr += 8;
    864        tmp += tmp_stride;
    865    }
    866 }
    867 
    868 static void emu_edge_c(const intptr_t bw, const intptr_t bh,
    869                       const intptr_t iw, const intptr_t ih,
    870                       const intptr_t x, const intptr_t y,
    871                       pixel *dst, const ptrdiff_t dst_stride,
    872                       const pixel *ref, const ptrdiff_t ref_stride)
    873 {
    874    // find offset in reference of visible block to copy
    875    ref += iclip((int) y, 0, (int) ih - 1) * PXSTRIDE(ref_stride) +
    876           iclip((int) x, 0, (int) iw - 1);
    877 
    878    // number of pixels to extend (left, right, top, bottom)
    879    const int left_ext = iclip((int) -x, 0, (int) bw - 1);
    880    const int right_ext = iclip((int) (x + bw - iw), 0, (int) bw - 1);
    881    assert(left_ext + right_ext < bw);
    882    const int top_ext = iclip((int) -y, 0, (int) bh - 1);
    883    const int bottom_ext = iclip((int) (y + bh - ih), 0, (int) bh - 1);
    884    assert(top_ext + bottom_ext < bh);
    885 
    886    // copy visible portion first
    887    pixel *blk = dst + top_ext * PXSTRIDE(dst_stride);
    888    const int center_w = (int) (bw - left_ext - right_ext);
    889    const int center_h = (int) (bh - top_ext - bottom_ext);
    890    for (int y = 0; y < center_h; y++) {
    891        pixel_copy(blk + left_ext, ref, center_w);
    892        // extend left edge for this line
    893        if (left_ext)
    894            pixel_set(blk, blk[left_ext], left_ext);
    895        // extend right edge for this line
    896        if (right_ext)
    897            pixel_set(blk + left_ext + center_w, blk[left_ext + center_w - 1],
    898                      right_ext);
    899        ref += PXSTRIDE(ref_stride);
    900        blk += PXSTRIDE(dst_stride);
    901    }
    902 
    903    // copy top
    904    blk = dst + top_ext * PXSTRIDE(dst_stride);
    905    for (int y = 0; y < top_ext; y++) {
    906        pixel_copy(dst, blk, bw);
    907        dst += PXSTRIDE(dst_stride);
    908    }
    909 
    910    // copy bottom
    911    dst += center_h * PXSTRIDE(dst_stride);
    912    for (int y = 0; y < bottom_ext; y++) {
    913        pixel_copy(dst, &dst[-PXSTRIDE(dst_stride)], bw);
    914        dst += PXSTRIDE(dst_stride);
    915    }
    916 }
    917 
    918 static void resize_c(pixel *dst, const ptrdiff_t dst_stride,
    919                     const pixel *src, const ptrdiff_t src_stride,
    920                     const int dst_w, int h, const int src_w,
    921                     const int dx, const int mx0 HIGHBD_DECL_SUFFIX)
    922 {
    923    do {
    924        int mx = mx0, src_x = -1;
    925        for (int x = 0; x < dst_w; x++) {
    926            const int8_t *const F = dav1d_resize_filter[mx >> 8];
    927            dst[x] = iclip_pixel((-(F[0] * src[iclip(src_x - 3, 0, src_w - 1)] +
    928                                    F[1] * src[iclip(src_x - 2, 0, src_w - 1)] +
    929                                    F[2] * src[iclip(src_x - 1, 0, src_w - 1)] +
    930                                    F[3] * src[iclip(src_x + 0, 0, src_w - 1)] +
    931                                    F[4] * src[iclip(src_x + 1, 0, src_w - 1)] +
    932                                    F[5] * src[iclip(src_x + 2, 0, src_w - 1)] +
    933                                    F[6] * src[iclip(src_x + 3, 0, src_w - 1)] +
    934                                    F[7] * src[iclip(src_x + 4, 0, src_w - 1)]) +
    935                                  64) >> 7);
    936            mx += dx;
    937            src_x += mx >> 14;
    938            mx &= 0x3fff;
    939        }
    940 
    941        dst += PXSTRIDE(dst_stride);
    942        src += PXSTRIDE(src_stride);
    943    } while (--h);
    944 }
    945 
    946 #if HAVE_ASM
    947 #if ARCH_AARCH64 || ARCH_ARM
    948 #include "src/arm/mc.h"
    949 #elif ARCH_LOONGARCH64
    950 #include "src/loongarch/mc.h"
    951 #elif ARCH_PPC64LE
    952 #include "src/ppc/mc.h"
    953 #elif ARCH_RISCV
    954 #include "src/riscv/mc.h"
    955 #elif ARCH_X86
    956 #include "src/x86/mc.h"
    957 #endif
    958 #endif
    959 
    960 COLD void bitfn(dav1d_mc_dsp_init)(Dav1dMCDSPContext *const c) {
    961 #define init_mc_fns(type, name) do { \
    962    c->mc        [type] = put_##name##_c; \
    963    c->mc_scaled [type] = put_##name##_scaled_c; \
    964    c->mct       [type] = prep_##name##_c; \
    965    c->mct_scaled[type] = prep_##name##_scaled_c; \
    966 } while (0)
    967 
    968    init_mc_fns(FILTER_2D_8TAP_REGULAR,        8tap_regular);
    969    init_mc_fns(FILTER_2D_8TAP_REGULAR_SMOOTH, 8tap_regular_smooth);
    970    init_mc_fns(FILTER_2D_8TAP_REGULAR_SHARP,  8tap_regular_sharp);
    971    init_mc_fns(FILTER_2D_8TAP_SHARP_REGULAR,  8tap_sharp_regular);
    972    init_mc_fns(FILTER_2D_8TAP_SHARP_SMOOTH,   8tap_sharp_smooth);
    973    init_mc_fns(FILTER_2D_8TAP_SHARP,          8tap_sharp);
    974    init_mc_fns(FILTER_2D_8TAP_SMOOTH_REGULAR, 8tap_smooth_regular);
    975    init_mc_fns(FILTER_2D_8TAP_SMOOTH,         8tap_smooth);
    976    init_mc_fns(FILTER_2D_8TAP_SMOOTH_SHARP,   8tap_smooth_sharp);
    977    init_mc_fns(FILTER_2D_BILINEAR,            bilin);
    978 
    979    c->avg      = avg_c;
    980    c->w_avg    = w_avg_c;
    981    c->mask     = mask_c;
    982    c->blend    = blend_c;
    983    c->blend_v  = blend_v_c;
    984    c->blend_h  = blend_h_c;
    985    c->w_mask[0] = w_mask_444_c;
    986    c->w_mask[1] = w_mask_422_c;
    987    c->w_mask[2] = w_mask_420_c;
    988    c->warp8x8  = warp_affine_8x8_c;
    989    c->warp8x8t = warp_affine_8x8t_c;
    990    c->emu_edge = emu_edge_c;
    991    c->resize   = resize_c;
    992 
    993 #if HAVE_ASM
    994 #if ARCH_AARCH64 || ARCH_ARM
    995    mc_dsp_init_arm(c);
    996 #elif ARCH_LOONGARCH64
    997    mc_dsp_init_loongarch(c);
    998 #elif ARCH_PPC64LE
    999    mc_dsp_init_ppc(c);
   1000 #elif ARCH_RISCV
   1001    mc_dsp_init_riscv(c);
   1002 #elif ARCH_X86
   1003    mc_dsp_init_x86(c);
   1004 #endif
   1005 #endif
   1006 }