neovim

Neovim text editor
git clone https://git.dasho.dev/neovim.git
Log | Files | Refs | README

encoding.c (7690B)


      1 #include "nvim/vterm/encoding.h"
      2 #include "nvim/vterm/vterm_internal_defs.h"
      3 
      4 #include "vterm/encoding.c.generated.h"
      5 
      6 #define UNICODE_INVALID 0xFFFD
      7 
      8 #if defined(DEBUG) && DEBUG > 1
      9 # define DEBUG_PRINT_UTF8
     10 #endif
     11 
// Decoder state for decode_utf8(). It lives in the caller-supplied `data`
// pointer, so a multi-byte sequence that is cut off at the end of one input
// chunk can be completed by a later call.
struct UTF8DecoderData {
  // number of bytes remaining in this codepoint
  int bytes_remaining;

  // number of bytes total in this codepoint once it's finished
  // (for detecting overlongs)
  int bytes_total;

  // value of the codepoint accumulated so far; only meaningful while a
  // sequence is in progress (it is overwritten by every lead byte)
  int this_cp;
};
     22 
     23 static void init_utf8(VTermEncoding *enc, void *data_)
     24 {
     25  struct UTF8DecoderData *data = data_;
     26 
     27  data->bytes_remaining = 0;
     28  data->bytes_total = 0;
     29 }
     30 
     31 static void decode_utf8(VTermEncoding *enc, void *data_, uint32_t cp[], int *cpi, int cplen,
     32                        const char bytes[], size_t *pos, size_t bytelen)
     33 {
     34  struct UTF8DecoderData *data = data_;
     35 
     36 #ifdef DEBUG_PRINT_UTF8
     37  printf("BEGIN UTF-8\n");
     38 #endif
     39 
     40  for (; *pos < bytelen && *cpi < cplen; (*pos)++) {
     41    uint8_t c = (uint8_t)bytes[*pos];
     42 
     43 #ifdef DEBUG_PRINT_UTF8
     44    printf(" pos=%zd c=%02x rem=%d\n", *pos, c, data->bytes_remaining);
     45 #endif
     46 
     47    if (c < 0x20) {  // C0
     48      return;
     49    } else if (c >= 0x20 && c < 0x7f) {
     50      if (data->bytes_remaining) {
     51        cp[(*cpi)++] = UNICODE_INVALID;
     52      }
     53 
     54      cp[(*cpi)++] = c;
     55 #ifdef DEBUG_PRINT_UTF8
     56      printf(" UTF-8 char: U+%04x\n", c);
     57 #endif
     58      data->bytes_remaining = 0;
     59    } else if (c == 0x7f) {  // DEL
     60      return;
     61    } else if (c >= 0x80 && c < 0xc0) {
     62      if (!data->bytes_remaining) {
     63        cp[(*cpi)++] = UNICODE_INVALID;
     64        continue;
     65      }
     66 
     67      data->this_cp <<= 6;
     68      data->this_cp |= c & 0x3f;
     69      data->bytes_remaining--;
     70 
     71      if (!data->bytes_remaining) {
     72 #ifdef DEBUG_PRINT_UTF8
     73        printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total);
     74 #endif
     75        // Check for overlong sequences
     76        switch (data->bytes_total) {
     77        case 2:
     78          if (data->this_cp < 0x0080) {
     79            data->this_cp = UNICODE_INVALID;
     80          }
     81          break;
     82        case 3:
     83          if (data->this_cp < 0x0800) {
     84            data->this_cp = UNICODE_INVALID;
     85          }
     86          break;
     87        case 4:
     88          if (data->this_cp < 0x10000) {
     89            data->this_cp = UNICODE_INVALID;
     90          }
     91          break;
     92        case 5:
     93          if (data->this_cp < 0x200000) {
     94            data->this_cp = UNICODE_INVALID;
     95          }
     96          break;
     97        case 6:
     98          if (data->this_cp < 0x4000000) {
     99            data->this_cp = UNICODE_INVALID;
    100          }
    101          break;
    102        }
    103        // Now look for plain invalid ones
    104        if ((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF)
    105            || data->this_cp == 0xFFFE
    106            || data->this_cp == 0xFFFF) {
    107          data->this_cp = UNICODE_INVALID;
    108        }
    109 #ifdef DEBUG_PRINT_UTF8
    110        printf(" char: U+%04x\n", data->this_cp);
    111 #endif
    112        cp[(*cpi)++] = (uint32_t)data->this_cp;
    113      }
    114    } else if (c >= 0xc0 && c < 0xe0) {
    115      if (data->bytes_remaining) {
    116        cp[(*cpi)++] = UNICODE_INVALID;
    117      }
    118 
    119      data->this_cp = c & 0x1f;
    120      data->bytes_total = 2;
    121      data->bytes_remaining = 1;
    122    } else if (c >= 0xe0 && c < 0xf0) {
    123      if (data->bytes_remaining) {
    124        cp[(*cpi)++] = UNICODE_INVALID;
    125      }
    126 
    127      data->this_cp = c & 0x0f;
    128      data->bytes_total = 3;
    129      data->bytes_remaining = 2;
    130    } else if (c >= 0xf0 && c < 0xf8) {
    131      if (data->bytes_remaining) {
    132        cp[(*cpi)++] = UNICODE_INVALID;
    133      }
    134 
    135      data->this_cp = c & 0x07;
    136      data->bytes_total = 4;
    137      data->bytes_remaining = 3;
    138    } else if (c >= 0xf8 && c < 0xfc) {
    139      if (data->bytes_remaining) {
    140        cp[(*cpi)++] = UNICODE_INVALID;
    141      }
    142 
    143      data->this_cp = c & 0x03;
    144      data->bytes_total = 5;
    145      data->bytes_remaining = 4;
    146    } else if (c >= 0xfc && c < 0xfe) {
    147      if (data->bytes_remaining) {
    148        cp[(*cpi)++] = UNICODE_INVALID;
    149      }
    150 
    151      data->this_cp = c & 0x01;
    152      data->bytes_total = 6;
    153      data->bytes_remaining = 5;
    154    } else {
    155      cp[(*cpi)++] = UNICODE_INVALID;
    156    }
    157  }
    158 }
    159 
    160 static VTermEncoding encoding_utf8 = {
    161  .init = &init_utf8,
    162  .decode = &decode_utf8,
    163 };
    164 
    165 static void decode_usascii(VTermEncoding *enc, void *data, uint32_t cp[], int *cpi, int cplen,
    166                           const char bytes[], size_t *pos, size_t bytelen)
    167 {
    168  int is_gr = bytes[*pos] & 0x80;
    169 
    170  for (; *pos < bytelen && *cpi < cplen; (*pos)++) {
    171    uint8_t c = (uint8_t)(bytes[*pos] ^ is_gr);
    172 
    173    if (c < 0x20 || c == 0x7f || c >= 0x80) {
    174      return;
    175    }
    176 
    177    cp[(*cpi)++] = c;
    178  }
    179 }
    180 
    181 static VTermEncoding encoding_usascii = {
    182  .decode = &decode_usascii,
    183 };
    184 
// An encoding that maps 7-bit byte values through a fixed table.
//
// `enc` must stay the first member: decode_table() receives a VTermEncoding *
// and casts it back to this struct to reach `chars`.
struct StaticTableEncoding {
  const VTermEncoding enc;
  // Codepoint for each byte value; a zero entry means "use the byte itself".
  const uint32_t chars[128];
};
    189 
    190 static void decode_table(VTermEncoding *enc, void *data, uint32_t cp[], int *cpi, int cplen,
    191                         const char bytes[], size_t *pos, size_t bytelen)
    192 {
    193  struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc;
    194  int is_gr = bytes[*pos] & 0x80;
    195 
    196  for (; *pos < bytelen && *cpi < cplen; (*pos)++) {
    197    uint8_t c = (uint8_t)(bytes[*pos] ^ is_gr);
    198 
    199    if (c < 0x20 || c == 0x7f || c >= 0x80) {
    200      return;
    201    }
    202 
    203    if (table->chars[c]) {
    204      cp[(*cpi)++] = table->chars[c];
    205    } else {
    206      cp[(*cpi)++] = c;
    207    }
    208  }
    209 }
    210 
    211 // https://en.wikipedia.org/wiki/DEC_Special_Graphics
    212 static const struct StaticTableEncoding encoding_DECdrawing = {
    213  { .decode = &decode_table },
    214  {
    215    [0x60] = 0x25C6,  // BLACK DIAMOND
    216    [0x61] = 0x2592,  // MEDIUM SHADE (checkerboard)
    217    [0x62] = 0x2409,  // SYMBOL FOR HORIZONTAL TAB
    218    [0x63] = 0x240C,  // SYMBOL FOR FORM FEED
    219    [0x64] = 0x240D,  // SYMBOL FOR CARRIAGE RETURN
    220    [0x65] = 0x240A,  // SYMBOL FOR LINE FEED
    221    [0x66] = 0x00B0,  // DEGREE SIGN
    222    [0x67] = 0x00B1,  // PLUS-MINUS SIGN (plus or minus)
    223    [0x68] = 0x2424,  // SYMBOL FOR NEW LINE
    224    [0x69] = 0x240B,  // SYMBOL FOR VERTICAL TAB
    225    [0x6a] = 0x2518,  // BOX DRAWINGS LIGHT UP AND LEFT (bottom-right corner)
    226    [0x6b] = 0x2510,  // BOX DRAWINGS LIGHT DOWN AND LEFT (top-right corner)
    227    [0x6c] = 0x250C,  // BOX DRAWINGS LIGHT DOWN AND RIGHT (top-left corner)
    228    [0x6d] = 0x2514,  // BOX DRAWINGS LIGHT UP AND RIGHT (bottom-left corner)
    229    [0x6e] = 0x253C,  // BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL (crossing lines)
    230    [0x6f] = 0x23BA,  // HORIZONTAL SCAN LINE-1
    231    [0x70] = 0x23BB,  // HORIZONTAL SCAN LINE-3
    232    [0x71] = 0x2500,  // BOX DRAWINGS LIGHT HORIZONTAL
    233    [0x72] = 0x23BC,  // HORIZONTAL SCAN LINE-7
    234    [0x73] = 0x23BD,  // HORIZONTAL SCAN LINE-9
    235    [0x74] = 0x251C,  // BOX DRAWINGS LIGHT VERTICAL AND RIGHT
    236    [0x75] = 0x2524,  // BOX DRAWINGS LIGHT VERTICAL AND LEFT
    237    [0x76] = 0x2534,  // BOX DRAWINGS LIGHT UP AND HORIZONTAL
    238    [0x77] = 0x252C,  // BOX DRAWINGS LIGHT DOWN AND HORIZONTAL
    239    [0x78] = 0x2502,  // BOX DRAWINGS LIGHT VERTICAL
    240    [0x79] = 0x2A7D,  // LESS-THAN OR SLANTED EQUAL-TO
    241    [0x7a] = 0x2A7E,  // GREATER-THAN OR SLANTED EQUAL-TO
    242    [0x7b] = 0x03C0,  // GREEK SMALL LETTER PI
    243    [0x7c] = 0x2260,  // NOT EQUAL TO
    244    [0x7d] = 0x00A3,  // POUND SIGN
    245    [0x7e] = 0x00B7,  // MIDDLE DOT
    246  }
    247 };
    248 
    249 static struct {
    250  VTermEncodingType type;
    251  char designation;
    252  VTermEncoding *enc;
    253 }
    254 encodings[] = {
    255  { ENC_UTF8,      'u', &encoding_utf8 },
    256  { ENC_SINGLE_94, '0', (VTermEncoding *)&encoding_DECdrawing },
    257  { ENC_SINGLE_94, 'B', &encoding_usascii },
    258  { 0 },
    259 };
    260 
    261 VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation)
    262 {
    263  for (int i = 0; encodings[i].designation; i++) {
    264    if (encodings[i].type == type && encodings[i].designation == designation) {
    265      return encodings[i].enc;
    266    }
    267  }
    268  return NULL;
    269 }