tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jcphuff-neon.c (26878B)


      1 /*
      2 * jcphuff-neon.c - prepare data for progressive Huffman encoding (Arm Neon)
      3 *
      4 * Copyright (C) 2020-2021, Arm Limited.  All Rights Reserved.
      5 * Copyright (C) 2022, Matthieu Darbois.  All Rights Reserved.
      6 * Copyright (C) 2022, D. R. Commander.  All Rights Reserved.
      7 *
      8 * This software is provided 'as-is', without any express or implied
      9 * warranty.  In no event will the authors be held liable for any damages
     10 * arising from the use of this software.
     11 *
     12 * Permission is granted to anyone to use this software for any purpose,
     13 * including commercial applications, and to alter it and redistribute it
     14 * freely, subject to the following restrictions:
     15 *
     16 * 1. The origin of this software must not be misrepresented; you must not
     17 *    claim that you wrote the original software. If you use this software
     18 *    in a product, an acknowledgment in the product documentation would be
     19 *    appreciated but is not required.
     20 * 2. Altered source versions must be plainly marked as such, and must not be
     21 *    misrepresented as being the original software.
     22 * 3. This notice may not be removed or altered from any source distribution.
     23 */
     24 
     25 #define JPEG_INTERNALS
     26 #include "../../jinclude.h"
     27 #include "../../jpeglib.h"
     28 #include "../../jsimd.h"
     29 #include "../../jdct.h"
     30 #include "../../jsimddct.h"
     31 #include "../jsimd.h"
     32 #include "neon-compat.h"
     33 
     34 #include <arm_neon.h>
     35 
     36 
     37 /* Data preparation for encode_mcu_AC_first().
     38 *
     39 * The equivalent scalar C function (encode_mcu_AC_first_prepare()) can be
     40 * found in jcphuff.c.
     41 */
     42 
void jsimd_encode_mcu_AC_first_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   UJCOEF *values, size_t *zerobits)
{
  /* Output layout: the first DCTSIZE2 entries of values[] receive the
   * point-transformed absolute values (|coef| >> Al); the next DCTSIZE2
   * entries receive the corresponding "diff" values (the abs value XORed
   * with the coefficient's sign extension.)  zerobits receives a bitmap in
   * which bit i is set iff values[i] != 0.
   */
  UJCOEF *values_ptr = values;
  UJCOEF *diff_values_ptr = values + DCTSIZE2;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

  /* Main loop: process 16 coefficients (two output rows) per iteration.
   * The coefficients are gathered lane by lane in scan order via the
   * jpeg_natural_order_start[] index table.
   */
  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Isolate sign of coefficients.  An arithmetic shift right by 15 yields
     * 0xFFFF for negative coefficients and 0x0000 for non-negative ones.
     */
    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
    /* Compute absolute value of coefficients and apply point transform Al.
     * (vshlq_u16() with a negative per-lane shift count performs a right
     * shift, so this is |coef| >> Al.)
     */
    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values: abs value XOR sign mask (i.e. the one's
     * complement of the abs value for negative coefficients.)
     */
    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_u16(values_ptr, abs_coefs1);
    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
    vst1q_u16(diff_values_ptr, diff1);
    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    /* 9-15 coefficients remain: the first 8 are loaded unconditionally; the
     * descending switch below (deliberate fall-through) loads only the
     * remaining 1-7 lanes of the second vector, leaving unloaded lanes zero.
     */
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    uint16x8_t sign_coefs1 = vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15));
    uint16x8_t sign_coefs2 = vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15));
    /* Compute absolute value of coefficients and apply point transform Al. */
    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));

    /* Compute diff values. */
    uint16x8_t diff1 = veorq_u16(abs_coefs1, sign_coefs1);
    uint16x8_t diff2 = veorq_u16(abs_coefs2, sign_coefs2);

    /* Store transformed coefficients and diff values. */
    vst1q_u16(values_ptr, abs_coefs1);
    vst1q_u16(values_ptr + DCTSIZE, abs_coefs2);
    vst1q_u16(diff_values_ptr, diff1);
    vst1q_u16(diff_values_ptr + DCTSIZE, diff2);
    values_ptr += 16;
    diff_values_ptr += 16;
    rows_to_zero -= 2;

  } else if (remaining_coefs > 0) {
    /* 1-8 coefficients remain: load only the remaining lanes of a single
     * vector (descending switch, deliberate fall-through); unloaded lanes
     * stay zero.
     */
    int16x8_t coefs = vdupq_n_s16(0);

    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    default:
      break;
    }

    /* Isolate sign of coefficients. */
    uint16x8_t sign_coefs = vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15));
    /* Compute absolute value of coefficients and apply point transform Al. */
    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));

    /* Compute diff values. */
    uint16x8_t diff = veorq_u16(abs_coefs, sign_coefs);

    /* Store transformed coefficients and diff values. */
    vst1q_u16(values_ptr, abs_coefs);
    vst1q_u16(diff_values_ptr, diff);
    values_ptr += 8;
    diff_values_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in the values and diff_values blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_u16(values_ptr, vdupq_n_u16(0));
    vst1q_u16(diff_values_ptr, vdupq_n_u16(0));
    values_ptr += 8;
    diff_values_ptr += 8;
  }

  /* Construct zerobits bitmap.  A set bit means that the corresponding
   * coefficient != 0.
   *
   * Each vceqq/vmovn pair below yields one byte of 0xFF per *zero*
   * coefficient; ANDing with bitmap_mask keeps a single distinct bit per
   * lane, and three rounds of pairwise addition (vpadd) fold the 64 bytes
   * into an 8-byte bitmap of zero coefficients, which is then inverted on
   * store.
   */
  uint16x8_t row0 = vld1q_u16(values + 0 * DCTSIZE);
  uint16x8_t row1 = vld1q_u16(values + 1 * DCTSIZE);
  uint16x8_t row2 = vld1q_u16(values + 2 * DCTSIZE);
  uint16x8_t row3 = vld1q_u16(values + 3 * DCTSIZE);
  uint16x8_t row4 = vld1q_u16(values + 4 * DCTSIZE);
  uint16x8_t row5 = vld1q_u16(values + 5 * DCTSIZE);
  uint16x8_t row6 = vld1q_u16(values + 6 * DCTSIZE);
  uint16x8_t row7 = vld1q_u16(values + 7 * DCTSIZE);

  uint8x8_t row0_eq0 = vmovn_u16(vceqq_u16(row0, vdupq_n_u16(0)));
  uint8x8_t row1_eq0 = vmovn_u16(vceqq_u16(row1, vdupq_n_u16(0)));
  uint8x8_t row2_eq0 = vmovn_u16(vceqq_u16(row2, vdupq_n_u16(0)));
  uint8x8_t row3_eq0 = vmovn_u16(vceqq_u16(row3, vdupq_n_u16(0)));
  uint8x8_t row4_eq0 = vmovn_u16(vceqq_u16(row4, vdupq_n_u16(0)));
  uint8x8_t row5_eq0 = vmovn_u16(vceqq_u16(row5, vdupq_n_u16(0)));
  uint8x8_t row6_eq0 = vmovn_u16(vceqq_u16(row6, vdupq_n_u16(0)));
  uint8x8_t row7_eq0 = vmovn_u16(vceqq_u16(row7, vdupq_n_u16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  row0_eq0 = vand_u8(row0_eq0, bitmap_mask);
  row1_eq0 = vand_u8(row1_eq0, bitmap_mask);
  row2_eq0 = vand_u8(row2_eq0, bitmap_mask);
  row3_eq0 = vand_u8(row3_eq0, bitmap_mask);
  row4_eq0 = vand_u8(row4_eq0, bitmap_mask);
  row5_eq0 = vand_u8(row5_eq0, bitmap_mask);
  row6_eq0 = vand_u8(row6_eq0, bitmap_mask);
  row7_eq0 = vand_u8(row7_eq0, bitmap_mask);

  uint8x8_t bitmap_rows_01 = vpadd_u8(row0_eq0, row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(row2_eq0, row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(row4_eq0, row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(row6_eq0, row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap.  (Inverted: set bit <=> nonzero coefficient.) */
  *zerobits = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers.  (size_t is 32-bit here, so
   * the bitmap spans two zerobits entries.)
   */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  zerobits[0] = ~bitmap0;
  zerobits[1] = ~bitmap1;
#endif
}
    268 
    269 
    270 /* Data preparation for encode_mcu_AC_refine().
    271 *
    272 * The equivalent scalar C function (encode_mcu_AC_refine_prepare()) can be
    273 * found in jcphuff.c.
    274 */
    275 
int jsimd_encode_mcu_AC_refine_prepare_neon
  (const JCOEF *block, const int *jpeg_natural_order_start, int Sl, int Al,
   UJCOEF *absvalues, size_t *bits)
{
  /* Output layout of bits[]: on 64-bit builds, bits[0] = zerobits bitmap
   * (bit i set iff absvalues[i] != 0) and bits[1] = signbits bitmap; on
   * 32-bit builds each bitmap occupies two 32-bit words (bits[0..1] and
   * bits[2..3].)  The return value is the EOB position: the index of the
   * last coefficient whose point-transformed absolute value == 1, or 0 if
   * there is no such coefficient.
   */

  /* Temporary storage buffers for data used to compute the signbits bitmap and
   * the end-of-block (EOB) position
   */
  uint8_t coef_sign_bits[64];
  uint8_t coef_eq1_bits[64];

  UJCOEF *absvalues_ptr = absvalues;
  uint8_t *coef_sign_bits_ptr = coef_sign_bits;
  uint8_t *eq1_bits_ptr = coef_eq1_bits;

  /* Rows of coefficients to zero (since they haven't been processed) */
  int i, rows_to_zero = 8;

  /* Main loop: process 16 coefficients (two output rows) per iteration,
   * gathered lane by lane in scan order via jpeg_natural_order_start[].
   */
  for (i = 0; i < Sl / 16; i++) {
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vld1q_dup_s16(block + jpeg_natural_order_start[8]);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
    coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[15], coefs2, 7);

    /* Compute and store data for signbits bitmap.  Arithmetic shift right by
     * 15 yields 0xFFFF per negative coefficient, narrowed to one byte per
     * coefficient (0xFF = negative, 0x00 = non-negative.)
     */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al.
     * (vshlq_u16() with a negative per-lane shift count performs a right
     * shift, so this is |coef| >> Al.)
     */
    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_u16(absvalues_ptr, abs_coefs1);
    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;
  }

  /* Same operation but for remaining partial vector */
  int remaining_coefs = Sl % 16;
  if (remaining_coefs > 8) {
    /* 9-15 coefficients remain: the first 8 are loaded unconditionally; the
     * descending switch below (deliberate fall-through) loads only the
     * remaining 1-7 lanes of the second vector, leaving unloaded lanes zero.
     */
    int16x8_t coefs1 = vld1q_dup_s16(block + jpeg_natural_order_start[0]);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs1, 1);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs1, 2);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs1, 3);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs1, 4);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs1, 5);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs1, 6);
    coefs1 = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs1, 7);
    int16x8_t coefs2 = vdupq_n_s16(0);
    switch (remaining_coefs) {
    case 15:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[14], coefs2, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 14:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[13], coefs2, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 13:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[12], coefs2, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 12:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[11], coefs2, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 11:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[10], coefs2, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 10:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[9], coefs2, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 9:
      coefs2 = vld1q_lane_s16(block + jpeg_natural_order_start[8], coefs2, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs1 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs1, 15)));
    uint8x8_t sign_coefs2 =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs2, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs1);
    vst1_u8(coef_sign_bits_ptr + DCTSIZE, sign_coefs2);

    /* Compute absolute value of coefficients and apply point transform Al. */
    uint16x8_t abs_coefs1 = vreinterpretq_u16_s16(vabsq_s16(coefs1));
    uint16x8_t abs_coefs2 = vreinterpretq_u16_s16(vabsq_s16(coefs2));
    abs_coefs1 = vshlq_u16(abs_coefs1, vdupq_n_s16(-Al));
    abs_coefs2 = vshlq_u16(abs_coefs2, vdupq_n_s16(-Al));
    vst1q_u16(absvalues_ptr, abs_coefs1);
    vst1q_u16(absvalues_ptr + DCTSIZE, abs_coefs2);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq11 = vmovn_u16(vceqq_u16(abs_coefs1, vdupq_n_u16(1)));
    uint8x8_t coefs_eq12 = vmovn_u16(vceqq_u16(abs_coefs2, vdupq_n_u16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq11);
    vst1_u8(eq1_bits_ptr + DCTSIZE, coefs_eq12);

    absvalues_ptr += 16;
    coef_sign_bits_ptr += 16;
    eq1_bits_ptr += 16;
    jpeg_natural_order_start += 16;
    rows_to_zero -= 2;

  } else if (remaining_coefs > 0) {
    /* 1-8 coefficients remain: load only the remaining lanes of a single
     * vector (descending switch, deliberate fall-through); unloaded lanes
     * stay zero.
     */
    int16x8_t coefs = vdupq_n_s16(0);

    switch (remaining_coefs) {
    case 8:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[7], coefs, 7);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 7:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[6], coefs, 6);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 6:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[5], coefs, 5);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 5:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[4], coefs, 4);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 4:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[3], coefs, 3);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 3:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[2], coefs, 2);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 2:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[1], coefs, 1);
      FALLTHROUGH               /*FALLTHROUGH*/
    case 1:
      coefs = vld1q_lane_s16(block + jpeg_natural_order_start[0], coefs, 0);
      FALLTHROUGH               /*FALLTHROUGH*/
    default:
      break;
    }

    /* Compute and store data for signbits bitmap. */
    uint8x8_t sign_coefs =
      vmovn_u16(vreinterpretq_u16_s16(vshrq_n_s16(coefs, 15)));
    vst1_u8(coef_sign_bits_ptr, sign_coefs);

    /* Compute absolute value of coefficients and apply point transform Al. */
    uint16x8_t abs_coefs = vreinterpretq_u16_s16(vabsq_s16(coefs));
    abs_coefs = vshlq_u16(abs_coefs, vdupq_n_s16(-Al));
    vst1q_u16(absvalues_ptr, abs_coefs);

    /* Test whether transformed coefficient values == 1 (used to find EOB
     * position.)
     */
    uint8x8_t coefs_eq1 = vmovn_u16(vceqq_u16(abs_coefs, vdupq_n_u16(1)));
    vst1_u8(eq1_bits_ptr, coefs_eq1);

    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
    rows_to_zero--;
  }

  /* Zero remaining memory in blocks. */
  for (i = 0; i < rows_to_zero; i++) {
    vst1q_u16(absvalues_ptr, vdupq_n_u16(0));
    vst1_u8(coef_sign_bits_ptr, vdup_n_u8(0));
    vst1_u8(eq1_bits_ptr, vdup_n_u8(0));
    absvalues_ptr += 8;
    coef_sign_bits_ptr += 8;
    eq1_bits_ptr += 8;
  }

  /* Construct zerobits bitmap.  Each vceqq/vmovn pair yields one byte of
   * 0xFF per *zero* coefficient; ANDing with bitmap_mask keeps one distinct
   * bit per lane, and three rounds of pairwise addition (vpadd) fold the 64
   * bytes into an 8-byte bitmap, which is inverted on store so that a set
   * bit means coefficient != 0.
   */
  uint16x8_t abs_row0 = vld1q_u16(absvalues + 0 * DCTSIZE);
  uint16x8_t abs_row1 = vld1q_u16(absvalues + 1 * DCTSIZE);
  uint16x8_t abs_row2 = vld1q_u16(absvalues + 2 * DCTSIZE);
  uint16x8_t abs_row3 = vld1q_u16(absvalues + 3 * DCTSIZE);
  uint16x8_t abs_row4 = vld1q_u16(absvalues + 4 * DCTSIZE);
  uint16x8_t abs_row5 = vld1q_u16(absvalues + 5 * DCTSIZE);
  uint16x8_t abs_row6 = vld1q_u16(absvalues + 6 * DCTSIZE);
  uint16x8_t abs_row7 = vld1q_u16(absvalues + 7 * DCTSIZE);

  uint8x8_t abs_row0_eq0 = vmovn_u16(vceqq_u16(abs_row0, vdupq_n_u16(0)));
  uint8x8_t abs_row1_eq0 = vmovn_u16(vceqq_u16(abs_row1, vdupq_n_u16(0)));
  uint8x8_t abs_row2_eq0 = vmovn_u16(vceqq_u16(abs_row2, vdupq_n_u16(0)));
  uint8x8_t abs_row3_eq0 = vmovn_u16(vceqq_u16(abs_row3, vdupq_n_u16(0)));
  uint8x8_t abs_row4_eq0 = vmovn_u16(vceqq_u16(abs_row4, vdupq_n_u16(0)));
  uint8x8_t abs_row5_eq0 = vmovn_u16(vceqq_u16(abs_row5, vdupq_n_u16(0)));
  uint8x8_t abs_row6_eq0 = vmovn_u16(vceqq_u16(abs_row6, vdupq_n_u16(0)));
  uint8x8_t abs_row7_eq0 = vmovn_u16(vceqq_u16(abs_row7, vdupq_n_u16(0)));

  /* { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80 } */
  const uint8x8_t bitmap_mask =
    vreinterpret_u8_u64(vmov_n_u64(0x8040201008040201));

  abs_row0_eq0 = vand_u8(abs_row0_eq0, bitmap_mask);
  abs_row1_eq0 = vand_u8(abs_row1_eq0, bitmap_mask);
  abs_row2_eq0 = vand_u8(abs_row2_eq0, bitmap_mask);
  abs_row3_eq0 = vand_u8(abs_row3_eq0, bitmap_mask);
  abs_row4_eq0 = vand_u8(abs_row4_eq0, bitmap_mask);
  abs_row5_eq0 = vand_u8(abs_row5_eq0, bitmap_mask);
  abs_row6_eq0 = vand_u8(abs_row6_eq0, bitmap_mask);
  abs_row7_eq0 = vand_u8(abs_row7_eq0, bitmap_mask);

  uint8x8_t bitmap_rows_01 = vpadd_u8(abs_row0_eq0, abs_row1_eq0);
  uint8x8_t bitmap_rows_23 = vpadd_u8(abs_row2_eq0, abs_row3_eq0);
  uint8x8_t bitmap_rows_45 = vpadd_u8(abs_row4_eq0, abs_row5_eq0);
  uint8x8_t bitmap_rows_67 = vpadd_u8(abs_row6_eq0, abs_row7_eq0);
  uint8x8_t bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  uint8x8_t bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  uint8x8_t bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers.  (size_t is 32-bit here, so
   * each bitmap spans two bits[] entries.)
   */
  uint32_t bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  uint32_t bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store zerobits bitmap. */
  bits[0] = ~bitmap0;
  bits[1] = ~bitmap1;
#endif

  /* Construct signbits bitmap from the per-coefficient sign bytes saved
   * above, using the same mask-and-fold technique.
   */
  uint8x8_t signbits_row0 = vld1_u8(coef_sign_bits + 0 * DCTSIZE);
  uint8x8_t signbits_row1 = vld1_u8(coef_sign_bits + 1 * DCTSIZE);
  uint8x8_t signbits_row2 = vld1_u8(coef_sign_bits + 2 * DCTSIZE);
  uint8x8_t signbits_row3 = vld1_u8(coef_sign_bits + 3 * DCTSIZE);
  uint8x8_t signbits_row4 = vld1_u8(coef_sign_bits + 4 * DCTSIZE);
  uint8x8_t signbits_row5 = vld1_u8(coef_sign_bits + 5 * DCTSIZE);
  uint8x8_t signbits_row6 = vld1_u8(coef_sign_bits + 6 * DCTSIZE);
  uint8x8_t signbits_row7 = vld1_u8(coef_sign_bits + 7 * DCTSIZE);

  signbits_row0 = vand_u8(signbits_row0, bitmap_mask);
  signbits_row1 = vand_u8(signbits_row1, bitmap_mask);
  signbits_row2 = vand_u8(signbits_row2, bitmap_mask);
  signbits_row3 = vand_u8(signbits_row3, bitmap_mask);
  signbits_row4 = vand_u8(signbits_row4, bitmap_mask);
  signbits_row5 = vand_u8(signbits_row5, bitmap_mask);
  signbits_row6 = vand_u8(signbits_row6, bitmap_mask);
  signbits_row7 = vand_u8(signbits_row7, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(signbits_row0, signbits_row1);
  bitmap_rows_23 = vpadd_u8(signbits_row2, signbits_row3);
  bitmap_rows_45 = vpadd_u8(signbits_row4, signbits_row5);
  bitmap_rows_67 = vpadd_u8(signbits_row6, signbits_row7);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
  /* Store signbits bitmap.  (Inverted, so a set bit means non-negative.) */
  bits[1] = ~bitmap;
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);
  /* Store signbits bitmap. */
  bits[2] = ~bitmap0;
  bits[3] = ~bitmap1;
#endif

  /* Construct bitmap to find EOB position (the index of the last coefficient
   * equal to 1.)  This bitmap is not inverted: a set bit marks a coefficient
   * whose transformed abs value == 1, so counting leading zeros locates the
   * highest such index.
   */
  uint8x8_t row0_eq1 = vld1_u8(coef_eq1_bits + 0 * DCTSIZE);
  uint8x8_t row1_eq1 = vld1_u8(coef_eq1_bits + 1 * DCTSIZE);
  uint8x8_t row2_eq1 = vld1_u8(coef_eq1_bits + 2 * DCTSIZE);
  uint8x8_t row3_eq1 = vld1_u8(coef_eq1_bits + 3 * DCTSIZE);
  uint8x8_t row4_eq1 = vld1_u8(coef_eq1_bits + 4 * DCTSIZE);
  uint8x8_t row5_eq1 = vld1_u8(coef_eq1_bits + 5 * DCTSIZE);
  uint8x8_t row6_eq1 = vld1_u8(coef_eq1_bits + 6 * DCTSIZE);
  uint8x8_t row7_eq1 = vld1_u8(coef_eq1_bits + 7 * DCTSIZE);

  row0_eq1 = vand_u8(row0_eq1, bitmap_mask);
  row1_eq1 = vand_u8(row1_eq1, bitmap_mask);
  row2_eq1 = vand_u8(row2_eq1, bitmap_mask);
  row3_eq1 = vand_u8(row3_eq1, bitmap_mask);
  row4_eq1 = vand_u8(row4_eq1, bitmap_mask);
  row5_eq1 = vand_u8(row5_eq1, bitmap_mask);
  row6_eq1 = vand_u8(row6_eq1, bitmap_mask);
  row7_eq1 = vand_u8(row7_eq1, bitmap_mask);

  bitmap_rows_01 = vpadd_u8(row0_eq1, row1_eq1);
  bitmap_rows_23 = vpadd_u8(row2_eq1, row3_eq1);
  bitmap_rows_45 = vpadd_u8(row4_eq1, row5_eq1);
  bitmap_rows_67 = vpadd_u8(row6_eq1, row7_eq1);
  bitmap_rows_0123 = vpadd_u8(bitmap_rows_01, bitmap_rows_23);
  bitmap_rows_4567 = vpadd_u8(bitmap_rows_45, bitmap_rows_67);
  bitmap_all = vpadd_u8(bitmap_rows_0123, bitmap_rows_4567);

#if defined(__aarch64__) || defined(_M_ARM64)
  /* Move bitmap to a 64-bit scalar register. */
  bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);

  /* Return EOB position. */
  if (bitmap == 0) {
    /* EOB position is defined to be 0 if all coefficients != 1. */
    return 0;
  } else {
    return 63 - BUILTIN_CLZLL(bitmap);
  }
#else
  /* Move bitmap to two 32-bit scalar registers. */
  bitmap0 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 0);
  bitmap1 = vget_lane_u32(vreinterpret_u32_u8(bitmap_all), 1);

  /* Return EOB position.  bitmap1 holds coefficients 32-63, so it takes
   * precedence over bitmap0 (coefficients 0-31.)
   */
  if (bitmap0 == 0 && bitmap1 == 0) {
    return 0;
  } else if (bitmap1 != 0) {
    return 63 - BUILTIN_CLZ(bitmap1);
  } else {
    return 31 - BUILTIN_CLZ(bitmap0);
  }
#endif
}