tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

jdsample-mmi.c (12607B)


      1 /*
      2 * Loongson MMI optimizations for libjpeg-turbo
      3 *
      4 * Copyright (C) 2015, 2018-2019, D. R. Commander.  All Rights Reserved.
      5 * Copyright (C) 2016-2018, Loongson Technology Corporation Limited, BeiJing.
      6 *                          All Rights Reserved.
      7 * Authors:  ZhuChen     <zhuchen@loongson.cn>
      8 *           CaiWanwei   <caiwanwei@loongson.cn>
      9 *           SunZhangzhi <sunzhangzhi-cq@loongson.cn>
     10 *           ZhangLixia  <zhanglixia-hf@loongson.cn>
     11 *
     12 * Based on the x86 SIMD extension for IJG JPEG library
     13 * Copyright (C) 1999-2006, MIYASAKA Masaru.
     14 *
     15 * This software is provided 'as-is', without any express or implied
     16 * warranty.  In no event will the authors be held liable for any damages
     17 * arising from the use of this software.
     18 *
     19 * Permission is granted to anyone to use this software for any purpose,
     20 * including commercial applications, and to alter it and redistribute it
     21 * freely, subject to the following restrictions:
     22 *
     23 * 1. The origin of this software must not be misrepresented; you must not
     24 *    claim that you wrote the original software. If you use this software
     25 *    in a product, an acknowledgment in the product documentation would be
     26 *    appreciated but is not required.
     27 * 2. Altered source versions must be plainly marked as such, and must not be
     28 *    misrepresented as being the original software.
     29 * 3. This notice may not be removed or altered from any source distribution.
     30 */
     31 
     32 /* CHROMA UPSAMPLING */
     33 
     34 #include "jsimd_mmi.h"
     35 
     36 
     37 enum const_index {
     38  index_PW_ONE,
     39  index_PW_TWO,
     40  index_PW_THREE,
     41  index_PW_SEVEN,
     42  index_PW_EIGHT,
     43 };
     44 
     45 static uint64_t const_value[] = {
     46  _uint64_set_pi16(1, 1, 1, 1),
     47  _uint64_set_pi16(2, 2, 2, 2),
     48  _uint64_set_pi16(3, 3, 3, 3),
     49  _uint64_set_pi16(7, 7, 7, 7),
     50  _uint64_set_pi16(8, 8, 8, 8),
     51 };
     52 
     53 #define PW_ONE    get_const_value(index_PW_ONE)
     54 #define PW_TWO    get_const_value(index_PW_TWO)
     55 #define PW_THREE  get_const_value(index_PW_THREE)
     56 #define PW_SEVEN  get_const_value(index_PW_SEVEN)
     57 #define PW_EIGHT  get_const_value(index_PW_EIGHT)
     58 
     59 
     60 #define PROCESS_ROW(row, wkoffset, bias1, bias2, shift) { \
     61  __m64 samp123X, samp3XXX, samp1234, sampX012, samp_1012; \
     62  __m64 sampXXX4, sampX456, samp3456, samp567X, samp7XXX, samp5678; \
     63  __m64 outle, outhe, outlo, outho, outl, outh; \
     64  \
     65  samp123X = _mm_srli_si64(samp0123, 2 * BYTE_BIT);  /* ( 1 2 3 -) */ \
     66  sampXXX4 = _mm_slli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 4) */ \
     67  samp3XXX = _mm_srli_si64(samp0123, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 3 - - -) */ \
     68  sampX456 = _mm_slli_si64(samp4567, 2 * BYTE_BIT);  /* ( - 4 5 6) */ \
     69  \
     70  samp1234 = _mm_or_si64(samp123X, sampXXX4);  /* ( 1 2 3 4) */ \
     71  samp3456 = _mm_or_si64(samp3XXX, sampX456);  /* ( 3 4 5 6) */ \
     72  \
     73  sampX012 = _mm_slli_si64(samp0123, 2 * BYTE_BIT);  /* ( - 0 1 2) */ \
     74  samp567X = _mm_srli_si64(samp4567, 2 * BYTE_BIT);  /* ( 5 6 7 -) */ \
     75  samp7XXX = _mm_srli_si64(samp4567, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( 7 - - -) */ \
     76  \
     77  samp_1012 = _mm_or_si64(sampX012, wk[row]);            /* (-1 0 1 2) */ \
     78  samp5678 = _mm_or_si64(samp567X, wk[row + wkoffset]);  /* ( 5 6 7 8) */ \
     79  \
     80  wk[row] = samp7XXX; \
     81  \
     82  samp0123 = _mm_mullo_pi16(samp0123, PW_THREE); \
     83  samp4567 = _mm_mullo_pi16(samp4567, PW_THREE); \
     84  samp_1012 = _mm_add_pi16(samp_1012, bias1); \
     85  samp3456 = _mm_add_pi16(samp3456, bias1); \
     86  samp1234 = _mm_add_pi16(samp1234, bias2); \
     87  samp5678 = _mm_add_pi16(samp5678, bias2); \
     88  \
     89  outle = _mm_add_pi16(samp_1012, samp0123); \
     90  outhe = _mm_add_pi16(samp3456, samp4567); \
     91  outle = _mm_srli_pi16(outle, shift);        /* ( 0  2  4  6) */ \
     92  outhe = _mm_srli_pi16(outhe, shift);        /* ( 8 10 12 14) */ \
     93  outlo = _mm_add_pi16(samp1234, samp0123); \
     94  outho = _mm_add_pi16(samp5678, samp4567); \
     95  outlo = _mm_srli_pi16(outlo, shift);        /* ( 1  3  5  7) */ \
     96  outho = _mm_srli_pi16(outho, shift);        /* ( 9 11 13 15) */ \
     97  \
     98  outlo = _mm_slli_pi16(outlo, BYTE_BIT); \
     99  outho = _mm_slli_pi16(outho, BYTE_BIT); \
    100  outl = _mm_or_si64(outle, outlo);           /* ( 0  1  2  3  4  5  6  7) */ \
    101  outh = _mm_or_si64(outhe, outho);           /* ( 8  9 10 11 12 13 14 15) */ \
    102  \
    103  _mm_store_si64((__m64 *)outptr##row, outl); \
    104  _mm_store_si64((__m64 *)outptr##row + 1, outh); \
    105 }
    106 
    107 void jsimd_h2v2_fancy_upsample_mmi(int max_v_samp_factor,
    108                                   JDIMENSION downsampled_width,
    109                                   JSAMPARRAY input_data,
    110                                   JSAMPARRAY *output_data_ptr)
    111 {
    112  JSAMPARRAY output_data = *output_data_ptr;
    113  JSAMPROW inptr_1, inptr0, inptr1, outptr0, outptr1;
    114  int inrow, outrow, incol, tmp, tmp1;
    115  __m64 this_1l, this_1h, this_1, thiscolsum_1l, thiscolsum_1h;
    116  __m64 this0l, this0h, this0;
    117  __m64 this1l, this1h, this1, thiscolsum1l, thiscolsum1h;
    118  __m64 next_1l, next_1h, next_1, nextcolsum_1l, nextcolsum_1h;
    119  __m64 next0l, next0h, next0;
    120  __m64 next1l, next1h, next1, nextcolsum1l, nextcolsum1h;
    121  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[4], zero = 0.0;
    122 
    123  mask0 = _mm_cmpeq_pi8(mask0, mask0);
    124  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
    125  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
    126 
    127  for (inrow = 0, outrow = 0; outrow < max_v_samp_factor; inrow++) {
    128 
    129    inptr_1 = input_data[inrow - 1];
    130    inptr0 = input_data[inrow];
    131    inptr1 = input_data[inrow + 1];
    132    outptr0 = output_data[outrow++];
    133    outptr1 = output_data[outrow++];
    134 
    135    if (downsampled_width & 7) {
    136      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
    137      tmp1 = downsampled_width * sizeof(JSAMPLE);
    138      asm(PTR_ADDU  "$8, %3, %6\r\n"
    139          "lb       $9, ($8)\r\n"
    140          PTR_ADDU  "$8, %3, %7\r\n"
    141          "sb       $9, ($8)\r\n"
    142          PTR_ADDU  "$8, %4, %6\r\n"
    143          "lb       $9, ($8)\r\n"
    144          PTR_ADDU  "$8, %4, %7\r\n"
    145          "sb       $9, ($8)\r\n"
    146          PTR_ADDU  "$8, %5, %6\r\n"
    147          "lb       $9, ($8)\r\n"
    148          PTR_ADDU  "$8, %5, %7\r\n"
    149          "sb       $9, ($8)\r\n"
    150          : "=m" (*inptr_1), "=m" (*inptr0), "=m" (*inptr1)
    151          : "r" (inptr_1), "r" (inptr0), "r" (inptr1), "r" (tmp), "r" (tmp1)
    152          : "$8", "$9"
    153         );
    154    }
    155 
    156    /* process the first column block */
    157    this0 = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
    158    this_1 = _mm_load_si64((__m64 *)inptr_1);  /* row[-1][0] */
    159    this1 = _mm_load_si64((__m64 *)inptr1);    /* row[ 1][0] */
    160 
    161    this0l = _mm_unpacklo_pi8(this0, zero);    /* row[ 0][0]( 0 1 2 3) */
    162    this0h = _mm_unpackhi_pi8(this0, zero);    /* row[ 0][0]( 4 5 6 7) */
    163    this_1l = _mm_unpacklo_pi8(this_1, zero);  /* row[-1][0]( 0 1 2 3) */
    164    this_1h = _mm_unpackhi_pi8(this_1, zero);  /* row[-1][0]( 4 5 6 7) */
    165    this1l = _mm_unpacklo_pi8(this1, zero);    /* row[+1][0]( 0 1 2 3) */
    166    this1h = _mm_unpackhi_pi8(this1, zero);    /* row[+1][0]( 4 5 6 7) */
    167 
    168    this0l = _mm_mullo_pi16(this0l, PW_THREE);
    169    this0h = _mm_mullo_pi16(this0h, PW_THREE);
    170 
    171    thiscolsum_1l = _mm_add_pi16(this_1l, this0l);  /* ( 0 1 2 3) */
    172    thiscolsum_1h = _mm_add_pi16(this_1h, this0h);  /* ( 4 5 6 7) */
    173    thiscolsum1l = _mm_add_pi16(this0l, this1l);    /* ( 0 1 2 3) */
    174    thiscolsum1h = _mm_add_pi16(this0h, this1h);    /* ( 4 5 6 7) */
    175 
    176    /* temporarily save the intermediate data */
    177    _mm_store_si64((__m64 *)outptr0, thiscolsum_1l);
    178    _mm_store_si64((__m64 *)outptr0 + 1, thiscolsum_1h);
    179    _mm_store_si64((__m64 *)outptr1, thiscolsum1l);
    180    _mm_store_si64((__m64 *)outptr1 + 1, thiscolsum1h);
    181 
    182    wk[0] = _mm_and_si64(thiscolsum_1l, mask0);  /* ( 0 - - -) */
    183    wk[1] = _mm_and_si64(thiscolsum1l, mask0);   /* ( 0 - - -) */
    184 
    185    for (incol = downsampled_width; incol > 0;
    186         incol -= 8, inptr_1 += 8, inptr0 += 8, inptr1 += 8,
    187         outptr0 += 16, outptr1 += 16) {
    188 
    189      if (incol > 8) {
    190        /* process the next column block */
    191        next0 = _mm_load_si64((__m64 *)inptr0 + 1);    /* row[ 0][1] */
    192        next_1 = _mm_load_si64((__m64 *)inptr_1 + 1);  /* row[-1][1] */
    193        next1 = _mm_load_si64((__m64 *)inptr1 + 1);    /* row[+1][1] */
    194 
    195        next0l = _mm_unpacklo_pi8(next0, zero);    /* row[ 0][1]( 0 1 2 3) */
    196        next0h = _mm_unpackhi_pi8(next0, zero);    /* row[ 0][1]( 4 5 6 7) */
    197        next_1l = _mm_unpacklo_pi8(next_1, zero);  /* row[-1][1]( 0 1 2 3) */
    198        next_1h = _mm_unpackhi_pi8(next_1, zero);  /* row[-1][1]( 4 5 6 7) */
    199        next1l = _mm_unpacklo_pi8(next1, zero);    /* row[+1][1]( 0 1 2 3) */
    200        next1h = _mm_unpackhi_pi8(next1, zero);    /* row[+1][1]( 4 5 6 7) */
    201 
    202        next0l = _mm_mullo_pi16(next0l, PW_THREE);
    203        next0h = _mm_mullo_pi16(next0h, PW_THREE);
    204 
    205        nextcolsum_1l = _mm_add_pi16(next_1l, next0l);  /* ( 0 1 2 3) */
    206        nextcolsum_1h = _mm_add_pi16(next_1h, next0h);  /* ( 4 5 6 7) */
    207        nextcolsum1l = _mm_add_pi16(next0l, next1l);    /* ( 0 1 2 3) */
    208        nextcolsum1h = _mm_add_pi16(next0h, next1h);    /* ( 4 5 6 7) */
    209 
    210        /* temporarily save the intermediate data */
    211        _mm_store_si64((__m64 *)outptr0 + 2, nextcolsum_1l);
    212        _mm_store_si64((__m64 *)outptr0 + 3, nextcolsum_1h);
    213        _mm_store_si64((__m64 *)outptr1 + 2, nextcolsum1l);
    214        _mm_store_si64((__m64 *)outptr1 + 3, nextcolsum1h);
    215 
    216        wk[2] = _mm_slli_si64(nextcolsum_1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
    217        wk[3] = _mm_slli_si64(nextcolsum1l, (SIZEOF_MMWORD - 2) * BYTE_BIT);   /* ( - - - 0) */
    218      } else {
    219        __m64 tmp;
    220 
    221        /* process the last column block */
    222        tmp = _mm_load_si64((__m64 *)outptr0 + 1);
    223        wk[2] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
    224        tmp = _mm_load_si64((__m64 *)outptr1 + 1);
    225        wk[3] = _mm_and_si64(masklast, tmp);        /* ( - - - 7) */
    226      }
    227 
    228      /* process the upper row */
    229      samp0123 = _mm_load_si64((__m64 *)outptr0);      /* ( 0 1 2 3) */ \
    230      samp4567 = _mm_load_si64((__m64 *)outptr0 + 1);  /* ( 4 5 6 7) */ \
    231      PROCESS_ROW(0, 2, PW_EIGHT, PW_SEVEN, 4)
    232 
    233      /* process the lower row */
    234      samp0123 = _mm_load_si64((__m64 *)outptr1);      /* ( 0 1 2 3) */ \
    235      samp4567 = _mm_load_si64((__m64 *)outptr1 + 1);  /* ( 4 5 6 7) */ \
    236      PROCESS_ROW(1, 2, PW_EIGHT, PW_SEVEN, 4)
    237    }
    238  }
    239 }
    240 
    241 
    242 void jsimd_h2v1_fancy_upsample_mmi(int max_v_samp_factor,
    243                                   JDIMENSION downsampled_width,
    244                                   JSAMPARRAY input_data,
    245                                   JSAMPARRAY *output_data_ptr)
    246 {
    247  JSAMPARRAY output_data = *output_data_ptr;
    248  JSAMPROW inptr0, outptr0;
    249  int inrow, incol, tmp, tmp1;
    250  __m64 thisl, this, nextl, next;
    251  __m64 mask0 = 0.0, masklast, samp0123, samp4567, wk[2], zero = 0.0;
    252 
    253  mask0 = _mm_cmpeq_pi8(mask0, mask0);
    254  masklast = _mm_slli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
    255  mask0 = _mm_srli_si64(mask0, (SIZEOF_MMWORD - 2) * BYTE_BIT);
    256 
    257  for (inrow = 0; inrow < max_v_samp_factor; inrow++) {
    258 
    259    inptr0 = input_data[inrow];
    260    outptr0 = output_data[inrow];
    261 
    262    if (downsampled_width & 7) {
    263      tmp = (downsampled_width - 1) * sizeof(JSAMPLE);
    264      tmp1 = downsampled_width * sizeof(JSAMPLE);
    265      asm(PTR_ADDU  "$8, %1, %2\r\n"
    266          "lb       $9, ($8)\r\n"
    267          PTR_ADDU  "$8, %1, %3\r\n"
    268          "sb       $9, ($8)\r\n"
    269          : "=m" (*inptr0)
    270          : "r" (inptr0), "r" (tmp), "r" (tmp1)
    271          : "$8", "$9"
    272         );
    273    }
    274 
    275    /* process the first column block */
    276    this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
    277    thisl = _mm_unpacklo_pi8(this, zero);     /* row[ 0][0]( 0 1 2 3) */
    278    wk[0] = _mm_and_si64(thisl, mask0);       /* ( 0 - - -) */
    279 
    280    for (incol = downsampled_width; incol > 0;
    281         incol -= 8, inptr0 += 8, outptr0 += 16) {
    282 
    283      if (incol > 8) {
    284        /* process the next column block */
    285        next = _mm_load_si64((__m64 *)inptr0 + 1);  /* row[ 0][1] */
    286        nextl = _mm_unpacklo_pi8(next, zero);       /* row[ 0][1]( 0 1 2 3) */
    287        wk[1] = _mm_slli_si64(nextl, (SIZEOF_MMWORD - 2) * BYTE_BIT);  /* ( - - - 0) */
    288      } else {
    289        __m64 thish;
    290 
    291        /* process the last column block */
    292        this = _mm_load_si64((__m64 *)inptr0);  /* row[ 0][0] */
    293        thish = _mm_unpackhi_pi8(this, zero);   /* row[ 0][1]( 4 5 6 7) */
    294        wk[1] = _mm_and_si64(masklast, thish);  /* ( - - - 7) */
    295      }
    296 
    297      /* process the row */
    298      this = _mm_load_si64((__m64 *)inptr0);    /* row[ 0][0] */
    299      samp0123 = _mm_unpacklo_pi8(this, zero);  /* ( 0 1 2 3) */
    300      samp4567 = _mm_unpackhi_pi8(this, zero);  /* ( 4 5 6 7) */
    301      PROCESS_ROW(0, 1, PW_ONE, PW_TWO, 2)
    302    }
    303  }
    304 }