encoding.c (7690B)
1 #include "nvim/vterm/encoding.h" 2 #include "nvim/vterm/vterm_internal_defs.h" 3 4 #include "vterm/encoding.c.generated.h" 5 6 #define UNICODE_INVALID 0xFFFD 7 8 #if defined(DEBUG) && DEBUG > 1 9 # define DEBUG_PRINT_UTF8 10 #endif 11 12 struct UTF8DecoderData { 13 // number of bytes remaining in this codepoint 14 int bytes_remaining; 15 16 // number of bytes total in this codepoint once it's finished 17 // (for detecting overlongs) 18 int bytes_total; 19 20 int this_cp; 21 }; 22 23 static void init_utf8(VTermEncoding *enc, void *data_) 24 { 25 struct UTF8DecoderData *data = data_; 26 27 data->bytes_remaining = 0; 28 data->bytes_total = 0; 29 } 30 31 static void decode_utf8(VTermEncoding *enc, void *data_, uint32_t cp[], int *cpi, int cplen, 32 const char bytes[], size_t *pos, size_t bytelen) 33 { 34 struct UTF8DecoderData *data = data_; 35 36 #ifdef DEBUG_PRINT_UTF8 37 printf("BEGIN UTF-8\n"); 38 #endif 39 40 for (; *pos < bytelen && *cpi < cplen; (*pos)++) { 41 uint8_t c = (uint8_t)bytes[*pos]; 42 43 #ifdef DEBUG_PRINT_UTF8 44 printf(" pos=%zd c=%02x rem=%d\n", *pos, c, data->bytes_remaining); 45 #endif 46 47 if (c < 0x20) { // C0 48 return; 49 } else if (c >= 0x20 && c < 0x7f) { 50 if (data->bytes_remaining) { 51 cp[(*cpi)++] = UNICODE_INVALID; 52 } 53 54 cp[(*cpi)++] = c; 55 #ifdef DEBUG_PRINT_UTF8 56 printf(" UTF-8 char: U+%04x\n", c); 57 #endif 58 data->bytes_remaining = 0; 59 } else if (c == 0x7f) { // DEL 60 return; 61 } else if (c >= 0x80 && c < 0xc0) { 62 if (!data->bytes_remaining) { 63 cp[(*cpi)++] = UNICODE_INVALID; 64 continue; 65 } 66 67 data->this_cp <<= 6; 68 data->this_cp |= c & 0x3f; 69 data->bytes_remaining--; 70 71 if (!data->bytes_remaining) { 72 #ifdef DEBUG_PRINT_UTF8 73 printf(" UTF-8 raw char U+%04x bytelen=%d ", data->this_cp, data->bytes_total); 74 #endif 75 // Check for overlong sequences 76 switch (data->bytes_total) { 77 case 2: 78 if (data->this_cp < 0x0080) { 79 data->this_cp = UNICODE_INVALID; 80 } 81 break; 82 case 3: 83 if (data->this_cp < 0x0800) { 84 data->this_cp = UNICODE_INVALID; 85 } 86 break; 87 case 4: 88 if (data->this_cp < 0x10000) { 89 data->this_cp = UNICODE_INVALID; 90 } 91 break; 92 case 5: 93 if (data->this_cp < 0x200000) { 94 data->this_cp = UNICODE_INVALID; 95 } 96 break; 97 case 6: 98 if (data->this_cp < 0x4000000) { 99 data->this_cp = UNICODE_INVALID; 100 } 101 break; 102 } 103 // Now look for plain invalid ones 104 if ((data->this_cp >= 0xD800 && data->this_cp <= 0xDFFF) 105 || data->this_cp == 0xFFFE 106 || data->this_cp == 0xFFFF) { 107 data->this_cp = UNICODE_INVALID; 108 } 109 #ifdef DEBUG_PRINT_UTF8 110 printf(" char: U+%04x\n", data->this_cp); 111 #endif 112 cp[(*cpi)++] = (uint32_t)data->this_cp; 113 } 114 } else if (c >= 0xc0 && c < 0xe0) { 115 if (data->bytes_remaining) { 116 cp[(*cpi)++] = UNICODE_INVALID; 117 } 118 119 data->this_cp = c & 0x1f; 120 data->bytes_total = 2; 121 data->bytes_remaining = 1; 122 } else if (c >= 0xe0 && c < 0xf0) { 123 if (data->bytes_remaining) { 124 cp[(*cpi)++] = UNICODE_INVALID; 125 } 126 127 data->this_cp = c & 0x0f; 128 data->bytes_total = 3; 129 data->bytes_remaining = 2; 130 } else if (c >= 0xf0 && c < 0xf8) { 131 if (data->bytes_remaining) { 132 cp[(*cpi)++] = UNICODE_INVALID; 133 } 134 135 data->this_cp = c & 0x07; 136 data->bytes_total = 4; 137 data->bytes_remaining = 3; 138 } else if (c >= 0xf8 && c < 0xfc) { 139 if (data->bytes_remaining) { 140 cp[(*cpi)++] = UNICODE_INVALID; 141 } 142 143 data->this_cp = c & 0x03; 144 data->bytes_total = 5; 145 data->bytes_remaining = 4; 146 } else if (c >= 0xfc && c < 0xfe) { 147 if (data->bytes_remaining) { 148 cp[(*cpi)++] = UNICODE_INVALID; 149 } 150 151 data->this_cp = c & 0x01; 152 data->bytes_total = 6; 153 data->bytes_remaining = 5; 154 } else { 155 cp[(*cpi)++] = UNICODE_INVALID; 156 } 157 } 158 } 159 160 static VTermEncoding encoding_utf8 = { 161 .init = &init_utf8, 162 .decode = &decode_utf8, 163 }; 164 165 static void decode_usascii(VTermEncoding *enc, void *data, uint32_t cp[], int *cpi, int cplen, 166 const char bytes[], size_t *pos, size_t bytelen) 167 { 168 int is_gr = bytes[*pos] & 0x80; 169 170 for (; *pos < bytelen && *cpi < cplen; (*pos)++) { 171 uint8_t c = (uint8_t)(bytes[*pos] ^ is_gr); 172 173 if (c < 0x20 || c == 0x7f || c >= 0x80) { 174 return; 175 } 176 177 cp[(*cpi)++] = c; 178 } 179 } 180 181 static VTermEncoding encoding_usascii = { 182 .decode = &decode_usascii, 183 }; 184 185 struct StaticTableEncoding { 186 const VTermEncoding enc; 187 const uint32_t chars[128]; 188 }; 189 190 static void decode_table(VTermEncoding *enc, void *data, uint32_t cp[], int *cpi, int cplen, 191 const char bytes[], size_t *pos, size_t bytelen) 192 { 193 struct StaticTableEncoding *table = (struct StaticTableEncoding *)enc; 194 int is_gr = bytes[*pos] & 0x80; 195 196 for (; *pos < bytelen && *cpi < cplen; (*pos)++) { 197 uint8_t c = (uint8_t)(bytes[*pos] ^ is_gr); 198 199 if (c < 0x20 || c == 0x7f || c >= 0x80) { 200 return; 201 } 202 203 if (table->chars[c]) { 204 cp[(*cpi)++] = table->chars[c]; 205 } else { 206 cp[(*cpi)++] = c; 207 } 208 } 209 } 210 211 // https://en.wikipedia.org/wiki/DEC_Special_Graphics 212 static const struct StaticTableEncoding encoding_DECdrawing = { 213 { .decode = &decode_table }, 214 { 215 [0x60] = 0x25C6, // BLACK DIAMOND 216 [0x61] = 0x2592, // MEDIUM SHADE (checkerboard) 217 [0x62] = 0x2409, // SYMBOL FOR HORIZONTAL TAB 218 [0x63] = 0x240C, // SYMBOL FOR FORM FEED 219 [0x64] = 0x240D, // SYMBOL FOR CARRIAGE RETURN 220 [0x65] = 0x240A, // SYMBOL FOR LINE FEED 221 [0x66] = 0x00B0, // DEGREE SIGN 222 [0x67] = 0x00B1, // PLUS-MINUS SIGN (plus or minus) 223 [0x68] = 0x2424, // SYMBOL FOR NEW LINE 224 [0x69] = 0x240B, // SYMBOL FOR VERTICAL TAB 225 [0x6a] = 0x2518, // BOX DRAWINGS LIGHT UP AND LEFT (bottom-right corner) 226 [0x6b] = 0x2510, // BOX DRAWINGS LIGHT DOWN AND LEFT (top-right corner) 227 [0x6c] = 0x250C, // BOX DRAWINGS LIGHT DOWN AND RIGHT (top-left corner) 228 [0x6d] = 0x2514, // BOX DRAWINGS LIGHT UP AND RIGHT (bottom-left corner) 229 [0x6e] = 0x253C, // BOX DRAWINGS LIGHT VERTICAL AND HORIZONTAL (crossing lines) 230 [0x6f] = 0x23BA, // HORIZONTAL SCAN LINE-1 231 [0x70] = 0x23BB, // HORIZONTAL SCAN LINE-3 232 [0x71] = 0x2500, // BOX DRAWINGS LIGHT HORIZONTAL 233 [0x72] = 0x23BC, // HORIZONTAL SCAN LINE-7 234 [0x73] = 0x23BD, // HORIZONTAL SCAN LINE-9 235 [0x74] = 0x251C, // BOX DRAWINGS LIGHT VERTICAL AND RIGHT 236 [0x75] = 0x2524, // BOX DRAWINGS LIGHT VERTICAL AND LEFT 237 [0x76] = 0x2534, // BOX DRAWINGS LIGHT UP AND HORIZONTAL 238 [0x77] = 0x252C, // BOX DRAWINGS LIGHT DOWN AND HORIZONTAL 239 [0x78] = 0x2502, // BOX DRAWINGS LIGHT VERTICAL 240 [0x79] = 0x2A7D, // LESS-THAN OR SLANTED EQUAL-TO 241 [0x7a] = 0x2A7E, // GREATER-THAN OR SLANTED EQUAL-TO 242 [0x7b] = 0x03C0, // GREEK SMALL LETTER PI 243 [0x7c] = 0x2260, // NOT EQUAL TO 244 [0x7d] = 0x00A3, // POUND SIGN 245 [0x7e] = 0x00B7, // MIDDLE DOT 246 } 247 }; 248 249 static struct { 250 VTermEncodingType type; 251 char designation; 252 VTermEncoding *enc; 253 } 254 encodings[] = { 255 { ENC_UTF8, 'u', &encoding_utf8 }, 256 { ENC_SINGLE_94, '0', (VTermEncoding *)&encoding_DECdrawing }, 257 { ENC_SINGLE_94, 'B', &encoding_usascii }, 258 { 0 }, 259 }; 260 261 VTermEncoding *vterm_lookup_encoding(VTermEncodingType type, char designation) 262 { 263 for (int i = 0; encodings[i].designation; i++) { 264 if (encodings[i].type == type && encodings[i].designation == designation) { 265 return encodings[i].enc; 266 } 267 } 268 return NULL; 269 }