tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

lossless_avx2.c (18843B)


      1 // Copyright 2025 Google Inc. All Rights Reserved.
      2 //
      3 // Use of this source code is governed by a BSD-style license
      4 // that can be found in the COPYING file in the root of the source
      5 // tree. An additional intellectual property rights grant can be found
      6 // in the file PATENTS. All contributing project authors may
      7 // be found in the AUTHORS file in the root of the source tree.
      8 // -----------------------------------------------------------------------------
      9 //
     10 // AVX2 variant of methods for lossless decoder
     11 //
     12 // Author: Vincent Rabaud (vrabaud@google.com)
     13 
     14 #include "src/dsp/dsp.h"
     15 
     16 #if defined(WEBP_USE_AVX2)
     17 
     18 #include <stddef.h>
     19 #include <immintrin.h>
     20 
     21 #include "src/dsp/cpu.h"
     22 #include "src/dsp/lossless.h"
     23 #include "src/webp/format_constants.h"
     24 #include "src/webp/types.h"
     25 
     26 //------------------------------------------------------------------------------
     27 // Predictor Transform
     28 
     29 static WEBP_INLINE void Average2_m256i(const __m256i* const a0,
     30                                       const __m256i* const a1,
     31                                       __m256i* const avg) {
     32  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
     33  const __m256i ones = _mm256_set1_epi8(1);
     34  const __m256i avg1 = _mm256_avg_epu8(*a0, *a1);
     35  const __m256i one = _mm256_and_si256(_mm256_xor_si256(*a0, *a1), ones);
     36  *avg = _mm256_sub_epi8(avg1, one);
     37 }
     38 
     39 // Batch versions of those functions.
     40 
     41 // Predictor0: ARGB_BLACK.
     42 static void PredictorAdd0_AVX2(const uint32_t* in, const uint32_t* upper,
     43                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
     44  int i;
     45  const __m256i black = _mm256_set1_epi32((int)ARGB_BLACK);
     46  for (i = 0; i + 8 <= num_pixels; i += 8) {
     47    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
     48    const __m256i res = _mm256_add_epi8(src, black);
     49    _mm256_storeu_si256((__m256i*)&out[i], res);
     50  }
     51  if (i != num_pixels) {
     52    VP8LPredictorsAdd_SSE[0](in + i, NULL, num_pixels - i, out + i);
     53  }
     54  (void)upper;
     55 }
     56 
     57 // Predictor1: left.
        // Predictor1: the prediction is the pixel to the left (L).
        // out[i + k] depends on out[i + k - 1], so a per-lane prefix sum of the
        // residuals (mod 256 per byte channel) is built with shifts+adds, then
        // the running "left" pixel ('prev') is added to all 8 results at once.
        // 'upper' is only used by the SSE fallback for leftover pixels.
     58 static void PredictorAdd1_AVX2(const uint32_t* in, const uint32_t* upper,
     59                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
     60  int i;
     61  __m256i prev = _mm256_set1_epi32((int)out[-1]);
     62  for (i = 0; i + 8 <= num_pixels; i += 8) {
     63    // h | g | f | e | d | c | b | a
     64    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
     65    // g | f | e | 0 | c | b | a | 0
     66    const __m256i shift0 = _mm256_slli_si256(src, 4);
     67    // g + h | f + g | e + f | e | c + d | b + c | a + b | a
     68    const __m256i sum0 = _mm256_add_epi8(src, shift0);
     69    // e + f | e | 0 | 0 | a + b | a | 0 | 0
     70    const __m256i shift1 = _mm256_slli_si256(sum0, 8);
     71    // e + f + g + h | e + f + g | e + f | e | a + b + c + d | a + b + c | a + b
     72    // | a
     73    const __m256i sum1 = _mm256_add_epi8(sum0, shift1);
     74    // Add a + b + c + d to the upper lane.
        // (_mm256_slli_si256 only shifts within each 128-bit lane, so the
        // carry from the low lane must be propagated to the high lane here.)
     75    const int32_t sum_abcd = _mm256_extract_epi32(sum1, 3);
     76    const __m256i sum2 = _mm256_add_epi8(
     77        sum1,
     78        _mm256_set_epi32(sum_abcd, sum_abcd, sum_abcd, sum_abcd, 0, 0, 0, 0));
     79 
     80    const __m256i res = _mm256_add_epi8(sum2, prev);
     81    _mm256_storeu_si256((__m256i*)&out[i], res);
     82    // replicate last res output in prev.
     83    prev = _mm256_permutevar8x32_epi32(
     84        res, _mm256_set_epi32(7, 7, 7, 7, 7, 7, 7, 7));
     85  }
        // Finish any remaining (< 8) pixels with the SSE implementation.
     86  if (i != num_pixels) {
     87    VP8LPredictorsAdd_SSE[1](in + i, upper + i, num_pixels - i, out + i);
     88  }
     89 }
     90 
     91 // Macro that adds 32-bit integers from IN using mod 256 arithmetic
     92 // per 8 bit channel.
        // Generates PredictorAdd{2,3,4}_AVX2, where the prediction is a single
        // neighboring pixel from the row above (selected by the IN expression).
        // Eight pixels are processed per iteration; leftover (< 8) pixels fall
        // back to the matching SSE predictor.
     93 #define GENERATE_PREDICTOR_1(X, IN)                                         \
     94  static void PredictorAdd##X##_AVX2(const uint32_t* in,                    \
     95                                     const uint32_t* upper, int num_pixels, \
     96                                     uint32_t* WEBP_RESTRICT out) {         \
     97    int i;                                                                  \
     98    for (i = 0; i + 8 <= num_pixels; i += 8) {                              \
     99      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);       \
    100      const __m256i other = _mm256_loadu_si256((const __m256i*)&(IN));      \
    101      const __m256i res = _mm256_add_epi8(src, other);                      \
    102      _mm256_storeu_si256((__m256i*)&out[i], res);                          \
    103    }                                                                       \
    104    if (i != num_pixels) {                                                  \
    105      VP8LPredictorsAdd_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
    106    }                                                                       \
    107  }
    108 
    109 // Predictor2: Top.
    110 GENERATE_PREDICTOR_1(2, upper[i])
    111 // Predictor3: Top-right.
    112 GENERATE_PREDICTOR_1(3, upper[i + 1])
    113 // Predictor4: Top-left.
    114 GENERATE_PREDICTOR_1(4, upper[i - 1])
    115 #undef GENERATE_PREDICTOR_1
    116 
    117 // Due to averages with integers, values cannot be accumulated in parallel for
    118 // predictors 5 to 7.
    119 
        // Generates PredictorAdd{8,9}_AVX2, where the prediction is the
        // per-channel truncating average of the top pixel T and a second
        // neighbor from the row above (selected by IN). Eight pixels per
        // iteration; leftovers go to the matching SSE predictor.
    120 #define GENERATE_PREDICTOR_2(X, IN)                                         \
    121  static void PredictorAdd##X##_AVX2(const uint32_t* in,                    \
    122                                     const uint32_t* upper, int num_pixels, \
    123                                     uint32_t* WEBP_RESTRICT out) {         \
    124    int i;                                                                  \
    125    for (i = 0; i + 8 <= num_pixels; i += 8) {                              \
    126      const __m256i Tother = _mm256_loadu_si256((const __m256i*)&(IN));     \
    127      const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);      \
    128      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);       \
    129      __m256i avg, res;                                                     \
    130      Average2_m256i(&T, &Tother, &avg);                                    \
    131      res = _mm256_add_epi8(avg, src);                                      \
    132      _mm256_storeu_si256((__m256i*)&out[i], res);                          \
    133    }                                                                       \
    134    if (i != num_pixels) {                                                  \
    135      VP8LPredictorsAdd_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
    136    }                                                                       \
    137  }
    138 // Predictor8: average TL T.
    139 GENERATE_PREDICTOR_2(8, upper[i - 1])
    140 // Predictor9: average T TR.
    141 GENERATE_PREDICTOR_2(9, upper[i + 1])
    142 #undef GENERATE_PREDICTOR_2
    143 
    144 // Predictor10: average of (average of (L,TL), average of (T, TR)).
        // Emits one pixel of Predictor10: avg(avg(L, TL), avg(T, TR)) + src.
        // Updates L (the running left pixel, kept in element 0) and writes the
        // reconstructed pixel to out[i + OUT].
    145 #define DO_PRED10(OUT)                                  \
    146  do {                                                  \
    147    __m256i avgLTL, avg;                                \
    148    Average2_m256i(&L, &TL, &avgLTL);                   \
    149    Average2_m256i(&avgTTR, &avgLTL, &avg);             \
    150    L = _mm256_add_epi8(avg, src);                      \
    151    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(L); \
    152  } while (0)
    153 
        // Advances avgTTR / TL / src by one pixel (32 bits) within each lane so
        // the next pixel's inputs sit in element 0.
    154 #define DO_PRED10_SHIFT                                         \
    155  do {                                                          \
    156    /* Rotate the pre-computed values for the next iteration.*/ \
    157    avgTTR = _mm256_srli_si256(avgTTR, 4);                      \
    158    TL = _mm256_srli_si256(TL, 4);                              \
    159    src = _mm256_srli_si256(src, 4);                            \
    160  } while (0)
    161 
        // Predictor10: pixels depend serially on the reconstructed left pixel,
        // so avg(T, TR) is pre-computed for 8 pixels and the pixels themselves
        // are emitted one at a time from element 0 of the vectors.
    162 static void PredictorAdd10_AVX2(const uint32_t* in, const uint32_t* upper,
    163                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
    164  int i, j;
        // Seed L with the previously reconstructed pixel out[-1].
    165  __m256i L = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
    166  for (i = 0; i + 8 <= num_pixels; i += 8) {
    167    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    168    __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    169    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    170    const __m256i TR = _mm256_loadu_si256((const __m256i*)&upper[i + 1]);
    171    __m256i avgTTR;
    172    Average2_m256i(&T, &TR, &avgTTR);
    173    {
    174      const __m256i avgTTR_bak = avgTTR;
    175      const __m256i TL_bak = TL;
    176      const __m256i src_bak = src;
        // First 4 pixels come from the low 128-bit lane.
    177      for (j = 0; j < 4; ++j) {
    178        DO_PRED10(j);
    179        DO_PRED10_SHIFT;
    180      }
        // _mm256_srli_si256 cannot cross lanes: swap the 128-bit lanes of the
        // saved inputs to bring pixels 4..7 down into the low lane.
    181      avgTTR = _mm256_permute2x128_si256(avgTTR_bak, avgTTR_bak, 1);
    182      TL = _mm256_permute2x128_si256(TL_bak, TL_bak, 1);
    183      src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
    184      for (; j < 8; ++j) {
    185        DO_PRED10(j);
    186        DO_PRED10_SHIFT;
    187      }
    188    }
    189  }
        // Finish any remaining (< 8) pixels with the SSE implementation.
    190  if (i != num_pixels) {
    191    VP8LPredictorsAdd_SSE[10](in + i, upper + i, num_pixels - i, out + i);
    192  }
    193 }
    194 #undef DO_PRED10
    195 #undef DO_PRED10_SHIFT
    196 
    197 // Predictor11: select.
        // Predictor11: select. Emits one pixel: picks L if pb (= sum |L - TL|)
        // exceeds pa (= sum |T - TL|, pre-computed), else T, then adds src.
        // Updates L and writes the reconstructed pixel to out[i + OUT].
    198 #define DO_PRED11(OUT)                                                      \
    199  do {                                                                      \
    200    const __m256i L_lo = _mm256_unpacklo_epi32(L, T);                       \
    201    const __m256i TL_lo = _mm256_unpacklo_epi32(TL, T);                     \
    202    const __m256i pb = _mm256_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL|*/   \
    203    const __m256i mask = _mm256_cmpgt_epi32(pb, pa);                        \
    204    const __m256i A = _mm256_and_si256(mask, L);                            \
    205    const __m256i B = _mm256_andnot_si256(mask, T);                         \
    206    const __m256i pred = _mm256_or_si256(A, B); /* pred = (pa > b)? L : T*/ \
    207    L = _mm256_add_epi8(src, pred);                                         \
    208    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(L);                     \
    209  } while (0)
    210 
        // Advances T / TL / src / pa by one pixel (32 bits) within each lane.
    211 #define DO_PRED11_SHIFT                                       \
    212  do {                                                        \
    213    /* Shift the pre-computed value for the next iteration.*/ \
    214    T = _mm256_srli_si256(T, 4);                              \
    215    TL = _mm256_srli_si256(TL, 4);                            \
    216    src = _mm256_srli_si256(src, 4);                          \
    217    pa = _mm256_srli_si256(pa, 4);                            \
    218  } while (0)
    219 
        // Predictor11: serial in the reconstructed left pixel, so pa is
        // pre-computed for 8 pixels and each pixel is emitted from element 0.
    220 static void PredictorAdd11_AVX2(const uint32_t* in, const uint32_t* upper,
    221                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
    222  int i, j;
    223  __m256i pa;
        // Seed L with the previously reconstructed pixel out[-1].
    224  __m256i L = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
    225  for (i = 0; i + 8 <= num_pixels; i += 8) {
    226    __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    227    __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    228    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    229    {
    230      // We can unpack with any value on the upper 32 bits, provided it's the
    231      // same on both operands (so that their sum of abs diff is zero). Here we
    232      // use T.
    233      const __m256i T_lo = _mm256_unpacklo_epi32(T, T);
    234      const __m256i TL_lo = _mm256_unpacklo_epi32(TL, T);
    235      const __m256i T_hi = _mm256_unpackhi_epi32(T, T);
    236      const __m256i TL_hi = _mm256_unpackhi_epi32(TL, T);
    237      const __m256i s_lo = _mm256_sad_epu8(T_lo, TL_lo);
    238      const __m256i s_hi = _mm256_sad_epu8(T_hi, TL_hi);
    239      pa = _mm256_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    240    }
    241    {
    242      const __m256i T_bak = T;
    243      const __m256i TL_bak = TL;
    244      const __m256i src_bak = src;
    245      const __m256i pa_bak = pa;
        // First 4 pixels come from the low 128-bit lane.
    246      for (j = 0; j < 4; ++j) {
    247        DO_PRED11(j);
    248        DO_PRED11_SHIFT;
    249      }
        // _mm256_srli_si256 cannot cross lanes: swap the 128-bit lanes of the
        // saved inputs to bring pixels 4..7 down into the low lane.
    250      T = _mm256_permute2x128_si256(T_bak, T_bak, 1);
    251      TL = _mm256_permute2x128_si256(TL_bak, TL_bak, 1);
    252      src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
    253      pa = _mm256_permute2x128_si256(pa_bak, pa_bak, 1);
    254      for (; j < 8; ++j) {
    255        DO_PRED11(j);
    256        DO_PRED11_SHIFT;
    257      }
    258    }
    259  }
        // Finish any remaining (< 8) pixels with the SSE implementation.
    260  if (i != num_pixels) {
    261    VP8LPredictorsAdd_SSE[11](in + i, upper + i, num_pixels - i, out + i);
    262  }
    263 }
    264 #undef DO_PRED11
    265 #undef DO_PRED11_SHIFT
    266 
    267 // Predictor12: ClampedAddSubtractFull.
        // Predictor12: ClampedAddSubtractFull. Emits one pixel:
        // clamp(L + (T - TL)) + src, computed in 16 bits per channel; the
        // saturating pack provides the clamp to [0, 255]. Updates L (kept
        // widened to 16 bits) and writes the result to out[i + OUT].
    268 #define DO_PRED12(DIFF, OUT)                              \
    269  do {                                                    \
    270    const __m256i all = _mm256_add_epi16(L, (DIFF));      \
    271    const __m256i alls = _mm256_packus_epi16(all, all);   \
    272    const __m256i res = _mm256_add_epi8(src, alls);       \
    273    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(res); \
    274    L = _mm256_unpacklo_epi8(res, zero);                  \
    275  } while (0)
    276 
        // Advances src by one pixel; DIFF (16-bit, two pixels per 128 bits) is
        // shifted by a full pixel (8 bytes) only when LANE == 0.
    277 #define DO_PRED12_SHIFT(DIFF, LANE)                           \
    278  do {                                                        \
    279    /* Shift the pre-computed value for the next iteration.*/ \
    280    if ((LANE) == 0) (DIFF) = _mm256_srli_si256(DIFF, 8);     \
    281    src = _mm256_srli_si256(src, 4);                          \
    282  } while (0)
    283 
        // Predictor12: serial in the reconstructed left pixel. T - TL is
        // pre-computed in 16-bit precision for 8 pixels (lo/hi halves), then
        // pixels are emitted one at a time from element 0.
    284 static void PredictorAdd12_AVX2(const uint32_t* in, const uint32_t* upper,
    285                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
    286  int i;
    287  const __m256i zero = _mm256_setzero_si256();
        // Seed L with out[-1], widened to 16 bits per channel.
    288  const __m256i L8 = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
    289  __m256i L = _mm256_unpacklo_epi8(L8, zero);
    290  for (i = 0; i + 8 <= num_pixels; i += 8) {
    291    // Load 8 pixels at a time.
    292    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    293    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    294    const __m256i T_lo = _mm256_unpacklo_epi8(T, zero);
    295    const __m256i T_hi = _mm256_unpackhi_epi8(T, zero);
    296    const __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    297    const __m256i TL_lo = _mm256_unpacklo_epi8(TL, zero);
    298    const __m256i TL_hi = _mm256_unpackhi_epi8(TL, zero);
    299    __m256i diff_lo = _mm256_sub_epi16(T_lo, TL_lo);
    300    __m256i diff_hi = _mm256_sub_epi16(T_hi, TL_hi);
    301    const __m256i diff_lo_bak = diff_lo;
    302    const __m256i diff_hi_bak = diff_hi;
    303    const __m256i src_bak = src;
        // Pixels 0..3 come from the low 128-bit lane (0,1 in diff_lo; 2,3 in
        // diff_hi, because unpacklo/hi interleave per lane).
    304    DO_PRED12(diff_lo, 0);
    305    DO_PRED12_SHIFT(diff_lo, 0);
    306    DO_PRED12(diff_lo, 1);
    307    DO_PRED12_SHIFT(diff_lo, 0);
    308    DO_PRED12(diff_hi, 2);
    309    DO_PRED12_SHIFT(diff_hi, 0);
    310    DO_PRED12(diff_hi, 3);
    311    DO_PRED12_SHIFT(diff_hi, 0);
    312 
    313    // Process the upper lane.
        // (In-lane shifts cannot cross the 128-bit boundary, so swap lanes of
        // the saved values to bring pixels 4..7 into the low lane.)
    314    diff_lo = _mm256_permute2x128_si256(diff_lo_bak, diff_lo_bak, 1);
    315    diff_hi = _mm256_permute2x128_si256(diff_hi_bak, diff_hi_bak, 1);
    316    src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
    317 
    318    DO_PRED12(diff_lo, 4);
    319    DO_PRED12_SHIFT(diff_lo, 0);
    320    DO_PRED12(diff_lo, 5);
    321    DO_PRED12_SHIFT(diff_lo, 1);
    322    DO_PRED12(diff_hi, 6);
    323    DO_PRED12_SHIFT(diff_hi, 0);
    324    DO_PRED12(diff_hi, 7);
    325  }
        // Finish any remaining (< 8) pixels with the SSE implementation.
    326  if (i != num_pixels) {
    327    VP8LPredictorsAdd_SSE[12](in + i, upper + i, num_pixels - i, out + i);
    328  }
    329 }
    330 #undef DO_PRED12
    331 #undef DO_PRED12_SHIFT
    332 
    333 // Due to averages with integers, values cannot be accumulated in parallel for
    334 // predictors 13.
    335 
    336 //------------------------------------------------------------------------------
    337 // Subtract-Green Transform
    338 
    339 static void AddGreenToBlueAndRed_AVX2(const uint32_t* const src, int num_pixels,
    340                                      uint32_t* dst) {
    341  int i;
    342  const __m256i kCstShuffle = _mm256_set_epi8(
    343      -1, 29, -1, 29, -1, 25, -1, 25, -1, 21, -1, 21, -1, 17, -1, 17, -1, 13,
    344      -1, 13, -1, 9, -1, 9, -1, 5, -1, 5, -1, 1, -1, 1);
    345  for (i = 0; i + 8 <= num_pixels; i += 8) {
    346    const __m256i in = _mm256_loadu_si256((const __m256i*)&src[i]);  // argb
    347    const __m256i in_0g0g = _mm256_shuffle_epi8(in, kCstShuffle);    // 0g0g
    348    const __m256i out = _mm256_add_epi8(in, in_0g0g);
    349    _mm256_storeu_si256((__m256i*)&dst[i], out);
    350  }
    351  // fallthrough and finish off with SSE.
    352  if (i != num_pixels) {
    353    VP8LAddGreenToBlueAndRed_SSE(src + i, num_pixels - i, dst + i);
    354  }
    355 }
    356 
    357 //------------------------------------------------------------------------------
    358 // Color Transform
    359 
        // Inverse color transform: undoes the green-to-red, green-to-blue and
        // red-to-blue cross-channel predictions using the multipliers in 'm',
        // 8 pixels per iteration. Alpha and green channels pass through
        // unchanged. Leftover (< 8) pixels use the SSE implementation.
    360 static void TransformColorInverse_AVX2(const VP8LMultipliers* const m,
    361                                       const uint32_t* const src,
    362                                       int num_pixels, uint32_t* dst) {
    363 // sign-extended multiplying constants, pre-shifted by 5.
        // (The <<8 then >>5 on int16_t sign-extends the 8-bit multiplier and
        // scales it so that _mm256_mulhi_epi16 yields (delta * g) >> 5.)
    364 #define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
    365  const __m256i mults_rb =
    366      _mm256_set1_epi32((int)((uint32_t)CST(green_to_red) << 16 |
    367                              (CST(green_to_blue) & 0xffff)));
    368  const __m256i mults_b2 = _mm256_set1_epi32(CST(red_to_blue));
    369 #undef CST
    370  const __m256i mask_ag = _mm256_set1_epi32((int)0xff00ff00);
        // perm1: replicate each pixel's green byte into both 16-bit slots
        // (g0g0 layout) for the mulhi; -1 lanes become zero.
    371  const __m256i perm1 = _mm256_setr_epi8(
    372      -1, 1, -1, 1, -1, 5, -1, 5, -1, 9, -1, 9, -1, 13, -1, 13, -1, 17, -1, 17,
    373      -1, 21, -1, 21, -1, 25, -1, 25, -1, 29, -1, 29);
        // perm2: move the (already green-corrected) red byte into position for
        // the red-to-blue multiply.
    374  const __m256i perm2 = _mm256_setr_epi8(
    375      -1, 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, 18, -1,
    376      -1, -1, 22, -1, -1, -1, 26, -1, -1, -1, 30, -1, -1);
    377  int i;
    378  for (i = 0; i + 8 <= num_pixels; i += 8) {
    379    const __m256i A = _mm256_loadu_si256((const __m256i*)(src + i));
    380    const __m256i B = _mm256_shuffle_epi8(A, perm1);  // argb -> g0g0
    381    const __m256i C = _mm256_mulhi_epi16(B, mults_rb);
    382    const __m256i D = _mm256_add_epi8(A, C);
    383    const __m256i E = _mm256_shuffle_epi8(D, perm2);
    384    const __m256i F = _mm256_mulhi_epi16(E, mults_b2);
    385    const __m256i G = _mm256_add_epi8(D, F);
        // Keep the original alpha and green bytes; take red/blue from G.
    386    const __m256i out = _mm256_blendv_epi8(G, A, mask_ag);
    387    _mm256_storeu_si256((__m256i*)&dst[i], out);
    388  }
    389  // Fall-back to SSE-version for left-overs.
    390  if (i != num_pixels) {
    391    VP8LTransformColorInverse_SSE(m, src + i, num_pixels - i, dst + i);
    392  }
    393 }
    394 
    395 //------------------------------------------------------------------------------
    396 // Color-space conversion functions
    397 
    398 static void ConvertBGRAToRGBA_AVX2(const uint32_t* WEBP_RESTRICT src,
    399                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
    400  const __m256i* in = (const __m256i*)src;
    401  __m256i* out = (__m256i*)dst;
    402  while (num_pixels >= 8) {
    403    const __m256i A = _mm256_loadu_si256(in++);
    404    const __m256i B = _mm256_shuffle_epi8(
    405        A,
    406        _mm256_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2,
    407                        15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2));
    408    _mm256_storeu_si256(out++, B);
    409    num_pixels -= 8;
    410  }
    411  // left-overs
    412  if (num_pixels > 0) {
    413    VP8LConvertBGRAToRGBA_SSE((const uint32_t*)in, num_pixels, (uint8_t*)out);
    414  }
    415 }
    416 
    417 //------------------------------------------------------------------------------
    418 // Entry point
    419 
    420 extern void VP8LDspInitAVX2(void);
    421 
    422 WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitAVX2(void) {
    423  VP8LPredictorsAdd[0] = PredictorAdd0_AVX2;
    424  VP8LPredictorsAdd[1] = PredictorAdd1_AVX2;
    425  VP8LPredictorsAdd[2] = PredictorAdd2_AVX2;
    426  VP8LPredictorsAdd[3] = PredictorAdd3_AVX2;
    427  VP8LPredictorsAdd[4] = PredictorAdd4_AVX2;
    428  VP8LPredictorsAdd[8] = PredictorAdd8_AVX2;
    429  VP8LPredictorsAdd[9] = PredictorAdd9_AVX2;
    430  VP8LPredictorsAdd[10] = PredictorAdd10_AVX2;
    431  VP8LPredictorsAdd[11] = PredictorAdd11_AVX2;
    432  VP8LPredictorsAdd[12] = PredictorAdd12_AVX2;
    433 
    434  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_AVX2;
    435  VP8LTransformColorInverse = TransformColorInverse_AVX2;
    436  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_AVX2;
    437 }
    438 
    439 #else  // !WEBP_USE_AVX2
    440 
    441 WEBP_DSP_INIT_STUB(VP8LDspInitAVX2)
    442 
    443 #endif  // WEBP_USE_AVX2