escapesrc.cpp (10476B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include <stdio.h> 5 #include <string> 6 #include <stdlib.h> 7 #include <errno.h> 8 #include <string.h> 9 #include <iostream> 10 #include <fstream> 11 12 // We only use U8_* macros, which are entirely inline. 13 #include "unicode/utf8.h" 14 15 // This contains a codepage and ISO 14882:1998 illegality table. 16 // Use "make gen-table" to rebuild it. 17 #include "cptbl.h" 18 19 /** 20 * What is this? 21 * 22 * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code 23 * in utf-8 into something consumable by certain compilers (Solaris, xlC) 24 * which aren't quite standards compliant. 25 * 26 * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN' 27 * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc. 28 * (some compilers do not support the u8 prefix correctly.) 29 * - if the system is EBCDIC-based, that is used to correct the input characters. 30 * 31 * Usage: 32 * escapesrc infile.cpp outfile.cpp 33 * Normally this is invoked by the build stage, with a rule such as: 34 * 35 * _%.cpp: $(srcdir)/%.cpp 36 * @$(BINDIR)/escapesrc$(EXEEXT) $< $@ 37 * %.o: _%.cpp 38 * $(COMPILE.cc) ... $@ $< 39 * 40 * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp 41 * from being itself escaped. 42 */ 43 44 45 static const char 46 kSPACE = 0x20, 47 kTAB = 0x09, 48 kLF = 0x0A, 49 kCR = 0x0D; 50 51 // For convenience 52 # define cp1047_to_8859(c) cp1047_8859_1[c] 53 54 // Our app's name 55 std::string prog; 56 57 /** 58 * Give the usual 1-line documentation and exit 59 */ 60 void usage() { 61 fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str()); 62 } 63 64 /** 65 * Delete the output file (if any) 66 * We want to delete even if we didn't generate, because it might be stale. 67 */ 68 int cleanup(const std::string &outfile) { 69 const char *outstr = outfile.c_str(); 70 if(outstr && *outstr) { 71 int rc = std::remove(outstr); 72 if(rc == 0) { 73 fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr); 74 return 0; 75 } else { 76 if( errno == ENOENT ) { 77 return 0; // File did not exist - no error. 78 } else { 79 perror("std::remove"); 80 return 1; 81 } 82 } 83 } 84 return 0; 85 } 86 87 /** 88 * Skip across any known whitespace. 89 * @param p startpoint 90 * @param e limit 91 * @return first non-whitespace char 92 */ 93 inline const char *skipws(const char *p, const char *e) { 94 for(;p<e;p++) { 95 switch(*p) { 96 case kSPACE: 97 case kTAB: 98 case kLF: 99 case kCR: 100 break; 101 default: 102 return p; // non ws 103 } 104 } 105 return p; 106 } 107 108 /** 109 * Append a byte, hex encoded 110 * @param outstr sstring to append to 111 * @param byte the byte to append 112 */ 113 void appendByte(std::string &outstr, 114 uint8_t byte) { 115 char tmp2[5]; 116 snprintf(tmp2, sizeof(tmp2), "\\x%02X", 0xFF & static_cast<int>(byte)); 117 outstr += tmp2; 118 } 119 120 /** 121 * Append the bytes from 'linestr' into outstr, with escaping 122 * @param outstr the output buffer 123 * @param linestr the input buffer 124 * @param pos in/out: the current char under consideration 125 * @param chars the number of chars to consider 126 * @return true on failure 127 */ 128 bool appendUtf8(std::string &outstr, 129 const std::string &linestr, 130 size_t &pos, 131 size_t chars) { 132 char tmp[9]; 133 for(size_t i=0;i<chars;i++) { 134 tmp[i] = linestr[++pos]; 135 } 136 tmp[chars] = 0; 137 unsigned int c; 138 sscanf(tmp, "%X", &c); 139 UChar32 ch = c & 0x1FFFFF; 140 141 // now to append \\x%% etc 142 uint8_t bytesNeeded = U8_LENGTH(ch); 143 if(bytesNeeded == 0) { 144 fprintf(stderr, "Illegal code point U+%X\n", ch); 145 return true; 146 } 147 uint8_t bytes[4]; 148 uint8_t *s = bytes; 149 size_t i = 0; 150 U8_APPEND_UNSAFE(s, i, ch); 151 for(size_t t = 0; t<i; t++) { 152 appendByte(outstr, s[t]); 153 } 154 return false; 155 } 156 157 /** 158 * Fixup u8"x" 159 * @param linestr string to mutate. Already escaped into \u format. 160 * @param origpos beginning, points to 'u8"' 161 * @param pos end, points to " 162 * @return false for no-problem, true for failure! 163 */ 164 bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) { 165 size_t pos = origpos + 3; 166 std::string outstr; 167 outstr += '\"'; // local encoding 168 for(;pos<endpos;pos++) { 169 char c = linestr[pos]; 170 if(c == '\\') { 171 char c2 = linestr[++pos]; 172 switch(c2) { 173 case '\'': 174 case '"': 175 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) 176 c2 = cp1047_to_8859(c2); 177 #endif 178 appendByte(outstr, c2); 179 break; 180 case 'u': 181 appendUtf8(outstr, linestr, pos, 4); 182 break; 183 case 'U': 184 appendUtf8(outstr, linestr, pos, 8); 185 break; 186 } 187 } else { 188 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) 189 c = cp1047_to_8859(c); 190 #endif 191 appendByte(outstr, c); 192 } 193 } 194 outstr += ('\"'); 195 196 linestr.replace(origpos, (endpos-origpos+1), outstr); 197 198 return false; // OK 199 } 200 201 /** 202 * fix the u"x"/u'x'/u8"x" string at the position 203 * u8'x' is not supported, sorry. 204 * @param linestr the input string 205 * @param pos the position 206 * @return false = no err, true = had err 207 */ 208 bool fixAt(std::string &linestr, size_t pos) { 209 size_t origpos = pos; 210 211 if(linestr[pos] != 'u') { 212 fprintf(stderr, "Not a 'u'?"); 213 return true; 214 } 215 216 pos++; // past 'u' 217 218 bool utf8 = false; 219 220 if(linestr[pos] == '8') { // u8" 221 utf8 = true; 222 pos++; 223 } 224 225 char quote = linestr[pos]; 226 227 if(quote != '\'' && quote != '\"') { 228 fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote); 229 return true; 230 } 231 232 if(quote == '\'' && utf8) { 233 fprintf(stderr, "Cannot do u8'...'\n"); 234 return true; 235 } 236 237 pos ++; 238 239 //printf("u%c…%c\n", quote, quote); 240 241 for(; pos < linestr.size(); pos++) { 242 if(linestr[pos] == quote) { 243 if(utf8) { 244 return fixu8(linestr, origpos, pos); // fix u8"..." 245 } else { 246 return false; // end of quote 247 } 248 } 249 if(linestr[pos] == '\\') { 250 pos++; 251 if(linestr[pos] == quote) continue; // quoted quote 252 if(linestr[pos] == 'u') continue; // for now ... unicode escape 253 if(linestr[pos] == '\\') continue; 254 // some other escape… ignore 255 } else { 256 size_t old_pos = pos; 257 int32_t i = pos; 258 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) 259 // mogrify 1-4 bytes from 1047 'back' to utf-8 260 char old_byte = linestr[pos]; 261 linestr[pos] = cp1047_to_8859(linestr[pos]); 262 // how many more? 263 int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]); 264 for(size_t pos2 = pos+1; trail>0; pos2++,trail--) { 265 linestr[pos2] = cp1047_to_8859(linestr[pos2]); 266 if(linestr[pos2] == 0x0A) { 267 linestr[pos2] = 0x85; // NL is ambiguous here 268 } 269 } 270 #endif 271 272 // Proceed to decode utf-8 273 const uint8_t* s = reinterpret_cast<const uint8_t*>(linestr.c_str()); 274 int32_t length = linestr.size(); 275 UChar32 c; 276 if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) { 277 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY) 278 linestr[pos] = old_byte; // put it back 279 #endif 280 continue; // single code point not previously legal for \u escaping 281 } 282 283 // otherwise, convert it to \u / \U 284 { 285 U8_NEXT(s, i, length, c); 286 } 287 if(c<0) { 288 fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", static_cast<int>(old_pos)); 289 fprintf(stderr, "Line: >>%s<<\n", linestr.c_str()); 290 return true; 291 } 292 293 size_t seqLen = (i-pos); 294 295 //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout); 296 297 char newSeq[20]; 298 if( c <= 0xFFFF) { 299 snprintf(newSeq, sizeof(newSeq), "\\u%04X", c); 300 } else { 301 snprintf(newSeq, sizeof(newSeq), "\\U%08X", c); 302 } 303 linestr.replace(pos, seqLen, newSeq); 304 pos += strlen(newSeq) - 1; 305 } 306 } 307 308 return false; 309 } 310 311 /** 312 * Fixup an entire line 313 * false = no err 314 * true = had err 315 * @param no the line number (not used) 316 * @param linestr the string to fix 317 * @return true if any err, else false 318 */ 319 bool fixLine(int /*no*/, std::string &linestr) { 320 const char *line = linestr.c_str(); 321 size_t len = linestr.size(); 322 323 // no u' in the line? 324 if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) { 325 return false; // Nothing to do. No u' or u" detected 326 } 327 328 // start from the end and find all u" cases 329 size_t pos = len = linestr.size(); 330 if(len>INT32_MAX/2) { 331 return true; 332 } 333 while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) { 334 //printf("found doublequote at %d\n", pos); 335 if(fixAt(linestr, pos)) return true; 336 if(pos == 0) break; 337 pos--; 338 } 339 340 // reset and find all u' cases 341 pos = len = linestr.size(); 342 while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) { 343 //printf("found singlequote at %d\n", pos); 344 if(fixAt(linestr, pos)) return true; 345 if(pos == 0) break; 346 pos--; 347 } 348 349 // reset and find all u8" cases 350 pos = len = linestr.size(); 351 while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) { 352 if(fixAt(linestr, pos)) return true; 353 if(pos == 0) break; 354 pos--; 355 } 356 357 //fprintf(stderr, "%d - fixed\n", no); 358 return false; 359 } 360 361 /** 362 * Convert a whole file 363 * @param infile 364 * @param outfile 365 * @return 1 on err, 0 otherwise 366 */ 367 int convert(const std::string &infile, const std::string &outfile) { 368 fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str()); 369 370 std::ifstream inf; 371 372 inf.open(infile.c_str(), std::ios::in); 373 374 if(!inf.is_open()) { 375 fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str()); 376 cleanup(outfile); 377 return 1; 378 } 379 380 std::ofstream outf; 381 382 outf.open(outfile.c_str(), std::ios::out); 383 384 if(!outf.is_open()) { 385 fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str()); 386 return 1; 387 } 388 389 // TODO: any platform variations of #line? 390 outf << "#line 1 \"" << infile << "\"" << '\n'; 391 392 int no = 0; 393 std::string linestr; 394 while( getline( inf, linestr)) { 395 no++; 396 if(fixLine(no, linestr)) { 397 goto fail; 398 } 399 outf << linestr << '\n'; 400 } 401 402 if(inf.eof()) { 403 return 0; 404 } 405 fail: 406 outf.close(); 407 fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str()); 408 cleanup(outfile); 409 return 1; 410 } 411 412 /** 413 * Main function 414 */ 415 int main(int argc, const char *argv[]) { 416 prog = argv[0]; 417 418 if(argc != 3) { 419 usage(); 420 return 1; 421 } 422 423 std::string infile = argv[1]; 424 std::string outfile = argv[2]; 425 426 return convert(infile, outfile); 427 }