tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

variance_ssse3.c (10795B)


      1 /*
      2 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
      3 *
      4 * This source code is subject to the terms of the BSD 2 Clause License and
      5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
      6 * was not distributed with this source code in the LICENSE file, you can
      7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
      8 * Media Patent License 1.0 was not distributed with this source code in the
      9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
     10 */
     11 
     12 #include <stddef.h>
     13 #include <stdint.h>
     14 
     15 #include "config/aom_config.h"
     16 #include "config/aom_dsp_rtcd.h"
     17 
     18 #include "aom_dsp/aom_dsp_common.h"
     19 
     20 // The 2 unused parameters are place holders for PIC enabled build.
     21 // These definitions are for functions defined in subpel_variance.asm
     22 #define DECL(w, opt)                                                           \
     23  int aom_sub_pixel_variance##w##xh_##opt(                                     \
     24      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset,    \
     25      const uint8_t *dst, ptrdiff_t dst_stride, int height, unsigned int *sse, \
     26      void *unused0, void *unused)
     27 #define DECLS(opt) \
     28  DECL(4, opt);    \
     29  DECL(8, opt);    \
     30  DECL(16, opt)
     31 
     32 DECLS(ssse3);
     33 #undef DECLS
     34 #undef DECL
     35 
     36 #define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                      \
     37  unsigned int aom_sub_pixel_variance##w##x##h##_##opt(                       \
     38      const uint8_t *src, int src_stride, int x_offset, int y_offset,         \
     39      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {            \
     40    /*Avoid overflow in helper by capping height.*/                           \
     41    const int hf = AOMMIN(h, 64);                                             \
     42    unsigned int sse = 0;                                                     \
     43    int se = 0;                                                               \
     44    for (int i = 0; i < (w / wf); ++i) {                                      \
     45      const uint8_t *src_ptr = src;                                           \
     46      const uint8_t *dst_ptr = dst;                                           \
     47      for (int j = 0; j < (h / hf); ++j) {                                    \
     48        unsigned int sse2;                                                    \
     49        const int se2 = aom_sub_pixel_variance##wf##xh_##opt(                 \
     50            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, hf, \
     51            &sse2, NULL, NULL);                                               \
     52        dst_ptr += hf * dst_stride;                                           \
     53        src_ptr += hf * src_stride;                                           \
     54        se += se2;                                                            \
     55        sse += sse2;                                                          \
     56      }                                                                       \
     57      src += wf;                                                              \
     58      dst += wf;                                                              \
     59    }                                                                         \
     60    *sse_ptr = sse;                                                           \
     61    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));  \
     62  }
     63 
     64 #if !CONFIG_REALTIME_ONLY
     65 #define FNS(opt)                                    \
     66  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
     67  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
     68  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
     69  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
     70  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
     71  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
     72  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
     73  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
     74  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
     75  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
     76  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
     77  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
     78  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
     79  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
     80  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
     81  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))      \
     82  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
     83  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
     84  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
     85  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
     86  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
     87  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
     88 #else
     89 #define FNS(opt)                                    \
     90  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
     91  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
     92  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
     93  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
     94  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
     95  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
     96  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
     97  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
     98  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
     99  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
    100  FN(16, 8, 16, 4, 3, opt, (int32_t), (int32_t))    \
    101  FN(8, 16, 8, 3, 4, opt, (int32_t), (int32_t))     \
    102  FN(8, 8, 8, 3, 3, opt, (int32_t), (int32_t))      \
    103  FN(8, 4, 8, 3, 2, opt, (int32_t), (int32_t))      \
    104  FN(4, 8, 4, 2, 3, opt, (int32_t), (int32_t))      \
    105  FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t))
    106 #endif
    107 
    108 FNS(ssse3)
    109 
    110 #undef FNS
    111 #undef FN
    112 
    113 // The 2 unused parameters are place holders for PIC enabled build.
    114 #define DECL(w, opt)                                                        \
    115  int aom_sub_pixel_avg_variance##w##xh_##opt(                              \
    116      const uint8_t *src, ptrdiff_t src_stride, int x_offset, int y_offset, \
    117      const uint8_t *dst, ptrdiff_t dst_stride, const uint8_t *sec,         \
    118      ptrdiff_t sec_stride, int height, unsigned int *sse, void *unused0,   \
    119      void *unused)
    120 #define DECLS(opt) \
    121  DECL(4, opt);    \
    122  DECL(8, opt);    \
    123  DECL(16, opt)
    124 
    125 DECLS(ssse3);
    126 #undef DECL
    127 #undef DECLS
    128 
    129 #define FN(w, h, wf, wlog2, hlog2, opt, cast_prod, cast)                     \
    130  unsigned int aom_sub_pixel_avg_variance##w##x##h##_##opt(                  \
    131      const uint8_t *src, int src_stride, int x_offset, int y_offset,        \
    132      const uint8_t *dst, int dst_stride, unsigned int *sse_ptr,             \
    133      const uint8_t *sec) {                                                  \
    134    /*Avoid overflow in helper by capping height.*/                          \
    135    const int hf = AOMMIN(h, 64);                                            \
    136    unsigned int sse = 0;                                                    \
    137    int se = 0;                                                              \
    138    for (int i = 0; i < (w / wf); ++i) {                                     \
    139      const uint8_t *src_ptr = src;                                          \
    140      const uint8_t *dst_ptr = dst;                                          \
    141      const uint8_t *sec_ptr = sec;                                          \
    142      for (int j = 0; j < (h / hf); ++j) {                                   \
    143        unsigned int sse2;                                                   \
    144        const int se2 = aom_sub_pixel_avg_variance##wf##xh_##opt(            \
    145            src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride,    \
    146            sec_ptr, w, hf, &sse2, NULL, NULL);                              \
    147        dst_ptr += hf * dst_stride;                                          \
    148        src_ptr += hf * src_stride;                                          \
    149        sec_ptr += hf * w;                                                   \
    150        se += se2;                                                           \
    151        sse += sse2;                                                         \
    152      }                                                                      \
    153      src += wf;                                                             \
    154      dst += wf;                                                             \
    155      sec += wf;                                                             \
    156    }                                                                        \
    157    *sse_ptr = sse;                                                          \
    158    return sse - (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2)); \
    159  }
    160 
    161 #if !CONFIG_REALTIME_ONLY
    162 #define FNS(opt)                                    \
    163  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
    164  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
    165  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
    166  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
    167  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
    168  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
    169  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
    170  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
    171  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
    172  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
    173  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
    174  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
    175  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
    176  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
    177  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
    178  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))     \
    179  FN(4, 16, 4, 2, 4, opt, (int32_t), (int32_t))     \
    180  FN(16, 4, 16, 4, 2, opt, (int32_t), (int32_t))    \
    181  FN(8, 32, 8, 3, 5, opt, (uint32_t), (int64_t))    \
    182  FN(32, 8, 16, 5, 3, opt, (uint32_t), (int64_t))   \
    183  FN(16, 64, 16, 4, 6, opt, (int64_t), (int64_t))   \
    184  FN(64, 16, 16, 6, 4, opt, (int64_t), (int64_t))
    185 #else
    186 #define FNS(opt)                                    \
    187  FN(128, 128, 16, 7, 7, opt, (int64_t), (int64_t)) \
    188  FN(128, 64, 16, 7, 6, opt, (int64_t), (int64_t))  \
    189  FN(64, 128, 16, 6, 7, opt, (int64_t), (int64_t))  \
    190  FN(64, 64, 16, 6, 6, opt, (int64_t), (int64_t))   \
    191  FN(64, 32, 16, 6, 5, opt, (int64_t), (int64_t))   \
    192  FN(32, 64, 16, 5, 6, opt, (int64_t), (int64_t))   \
    193  FN(32, 32, 16, 5, 5, opt, (int64_t), (int64_t))   \
    194  FN(32, 16, 16, 5, 4, opt, (int64_t), (int64_t))   \
    195  FN(16, 32, 16, 4, 5, opt, (int64_t), (int64_t))   \
    196  FN(16, 16, 16, 4, 4, opt, (uint32_t), (int64_t))  \
    197  FN(16, 8, 16, 4, 3, opt, (uint32_t), (int32_t))   \
    198  FN(8, 16, 8, 3, 4, opt, (uint32_t), (int32_t))    \
    199  FN(8, 8, 8, 3, 3, opt, (uint32_t), (int32_t))     \
    200  FN(8, 4, 8, 3, 2, opt, (uint32_t), (int32_t))     \
    201  FN(4, 8, 4, 2, 3, opt, (uint32_t), (int32_t))     \
    202  FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t))
    203 #endif
    204 
    205 FNS(ssse3)
    206 
    207 #undef FNS
    208 #undef FN