tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

escapesrc.cpp (10476B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #include <stdio.h>
      5 #include <string>
      6 #include <stdlib.h>
      7 #include <errno.h>
      8 #include <string.h>
      9 #include <iostream>
     10 #include <fstream>
     11 
     12 // We only use U8_* macros, which are entirely inline.
     13 #include "unicode/utf8.h"
     14 
     15 // This contains a codepage and ISO 14882:1998 illegality table.
     16 // Use "make gen-table" to rebuild it.
     17 #include "cptbl.h"
     18 
     19 /**
     20 * What is this?
     21 *
     22 * "This" is a preprocessor that makes an attempt to convert fully valid C++11 source code
     23 * in utf-8 into something consumable by certain compilers (Solaris, xlC)
     24 * which aren't quite standards compliant.
     25 *
     26 * - u"<unicode>" or u'<unicode>' gets converted to u"\uNNNN" or u'\uNNNN'
     27 * - u8"<unicode>" gets converted to "\xAA\xBB\xCC\xDD" etc.
     28 *   (some compilers do not support the u8 prefix correctly.)
     29 * - if the system is EBCDIC-based, that is used to correct the input characters.
     30 *
     31 * Usage:
     32 *   escapesrc infile.cpp outfile.cpp
     33 * Normally this is invoked by the build stage, with a rule such as:
     34 *
     35 * _%.cpp: $(srcdir)/%.cpp
     36 *       @$(BINDIR)/escapesrc$(EXEEXT) $< $@
     37 * %.o: _%.cpp
     38 *       $(COMPILE.cc) ... $@ $<
     39 *
     40 * In the Makefiles, SKIP_ESCAPING=YES is used to prevent escapesrc.cpp 
     41 * from being itself escaped.
     42 */
     43 
     44 
     45 static const char
     46  kSPACE   = 0x20,
     47  kTAB     = 0x09,
     48  kLF      = 0x0A,
     49  kCR      = 0x0D;
     50 
     51 // For convenience
     52 # define cp1047_to_8859(c) cp1047_8859_1[c]
     53 
     54 // Our app's name
     55 std::string prog;
     56 
     57 /**
     58 * Give the usual 1-line documentation and exit
     59 */
     60 void usage() {
     61  fprintf(stderr, "%s: usage: %s infile.cpp outfile.cpp\n", prog.c_str(), prog.c_str());
     62 }
     63 
     64 /**
     65 * Delete the output file (if any)
     66 * We want to delete even if we didn't generate, because it might be stale.
     67 */
     68 int cleanup(const std::string &outfile) {
     69  const char *outstr = outfile.c_str();
     70  if(outstr && *outstr) {
     71    int rc = std::remove(outstr);
     72    if(rc == 0) {
     73      fprintf(stderr, "%s: deleted %s\n", prog.c_str(), outstr);
     74      return 0;
     75    } else {
     76      if( errno == ENOENT ) {
     77        return 0; // File did not exist - no error.
     78      } else {
     79        perror("std::remove");
     80        return 1;
     81      }
     82    }
     83  }
     84  return 0;
     85 }
     86 
     87 /**
     88 * Skip across any known whitespace.
     89 * @param p startpoint
     90 * @param e limit
     91 * @return first non-whitespace char
     92 */
     93 inline const char *skipws(const char *p, const char *e) {
     94  for(;p<e;p++) {
     95    switch(*p) {
     96    case kSPACE:
     97    case kTAB:
     98    case kLF:
     99    case kCR:
    100      break;
    101    default:
    102      return p; // non ws
    103    }
    104  }
    105  return p;
    106 }
    107 
    108 /**
    109 * Append a byte, hex encoded
    110 * @param outstr sstring to append to
    111 * @param byte the byte to append
    112 */
    113 void appendByte(std::string &outstr,
    114                uint8_t byte) {
    115    char tmp2[5];
    116    snprintf(tmp2, sizeof(tmp2), "\\x%02X", 0xFF & static_cast<int>(byte));
    117    outstr += tmp2;
    118 }
    119 
    120 /**
    121 * Append the bytes from 'linestr' into outstr, with escaping
    122 * @param outstr the output buffer
    123 * @param linestr the input buffer
    124 * @param pos in/out: the current char under consideration
    125 * @param chars the number of chars to consider
    126 * @return true on failure
    127 */
    128 bool appendUtf8(std::string &outstr,
    129                const std::string &linestr,
    130                size_t &pos,
    131                size_t chars) {
    132  char tmp[9];
    133  for(size_t i=0;i<chars;i++) {
    134    tmp[i] = linestr[++pos];
    135  }
    136  tmp[chars] = 0;
    137  unsigned int c;
    138  sscanf(tmp, "%X", &c);
    139  UChar32 ch = c & 0x1FFFFF; 
    140 
    141  // now to append \\x%% etc
    142  uint8_t bytesNeeded = U8_LENGTH(ch);
    143  if(bytesNeeded == 0) {
    144    fprintf(stderr, "Illegal code point U+%X\n", ch);
    145    return true;
    146  }
    147  uint8_t bytes[4];
    148  uint8_t *s = bytes;
    149  size_t i = 0;
    150  U8_APPEND_UNSAFE(s, i, ch);
    151  for(size_t t = 0; t<i; t++) {
    152    appendByte(outstr, s[t]);
    153  }
    154  return false;
    155 }
    156 
    157 /**
    158 * Fixup u8"x"
    159 * @param linestr string to mutate. Already escaped into \u format.
    160 * @param origpos beginning, points to 'u8"'
    161 * @param pos end, points to "
    162 * @return false for no-problem, true for failure!
    163 */
    164 bool fixu8(std::string &linestr, size_t origpos, size_t &endpos) {
    165  size_t pos = origpos + 3;
    166  std::string outstr;
    167  outstr += '\"'; // local encoding
    168  for(;pos<endpos;pos++) {
    169    char c = linestr[pos];
    170    if(c == '\\') {
    171      char c2 = linestr[++pos];
    172      switch(c2) {
    173      case '\'':
    174      case '"':
    175 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    176        c2 = cp1047_to_8859(c2);
    177 #endif
    178        appendByte(outstr, c2);
    179        break;
    180      case 'u':
    181        appendUtf8(outstr, linestr, pos, 4);
    182        break;
    183      case 'U':
    184        appendUtf8(outstr, linestr, pos, 8);
    185        break;
    186      }
    187    } else {
    188 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    189      c = cp1047_to_8859(c);
    190 #endif
    191      appendByte(outstr, c);
    192    }
    193  }
    194  outstr += ('\"');
    195 
    196  linestr.replace(origpos, (endpos-origpos+1), outstr);
    197  
    198  return false; // OK
    199 }
    200 
    201 /**
    202 * fix the u"x"/u'x'/u8"x" string at the position
    203 * u8'x' is not supported, sorry.
    204 * @param linestr the input string
    205 * @param pos the position
    206 * @return false = no err, true = had err
    207 */
    208 bool fixAt(std::string &linestr, size_t pos) {
    209  size_t origpos = pos;
    210  
    211  if(linestr[pos] != 'u') {
    212    fprintf(stderr, "Not a 'u'?");
    213    return true;
    214  }
    215 
    216  pos++; // past 'u'
    217 
    218  bool utf8 = false;
    219  
    220  if(linestr[pos] == '8') { // u8"
    221    utf8 = true;
    222    pos++;
    223  }
    224  
    225  char quote = linestr[pos];
    226 
    227  if(quote != '\'' && quote != '\"') {
    228    fprintf(stderr, "Quote is '%c' - not sure what to do.\n", quote);
    229    return true;
    230  }
    231 
    232  if(quote == '\'' && utf8) {
    233    fprintf(stderr, "Cannot do u8'...'\n");
    234    return true;
    235  }
    236 
    237  pos ++;
    238 
    239  //printf("u%c…%c\n", quote, quote);
    240 
    241  for(; pos < linestr.size(); pos++) {
    242    if(linestr[pos] == quote) {
    243      if(utf8) {
    244        return fixu8(linestr, origpos, pos); // fix u8"..."
    245      } else {
    246        return false; // end of quote
    247      }
    248    }
    249    if(linestr[pos] == '\\') {
    250      pos++;
    251      if(linestr[pos] == quote) continue; // quoted quote
    252      if(linestr[pos] == 'u') continue; // for now ... unicode escape
    253      if(linestr[pos] == '\\') continue;
    254      // some other escape… ignore
    255    } else {
    256      size_t old_pos = pos;
    257      int32_t i = pos;
    258 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    259      // mogrify 1-4 bytes from 1047 'back' to utf-8
    260      char old_byte = linestr[pos];
    261      linestr[pos] = cp1047_to_8859(linestr[pos]);
    262      // how many more?
    263      int32_t trail = U8_COUNT_TRAIL_BYTES(linestr[pos]);
    264      for(size_t pos2 = pos+1; trail>0; pos2++,trail--) {
    265        linestr[pos2] = cp1047_to_8859(linestr[pos2]);
    266        if(linestr[pos2] == 0x0A) {
    267          linestr[pos2] = 0x85; // NL is ambiguous here
    268        }
    269      }
    270 #endif
    271      
    272      // Proceed to decode utf-8
    273      const uint8_t* s = reinterpret_cast<const uint8_t*>(linestr.c_str());
    274      int32_t length = linestr.size();
    275      UChar32 c;
    276      if(U8_IS_SINGLE((uint8_t)s[i]) && oldIllegal[s[i]]) {
    277 #if (U_CHARSET_FAMILY == U_EBCDIC_FAMILY)
    278        linestr[pos] = old_byte; // put it back
    279 #endif
    280        continue; // single code point not previously legal for \u escaping
    281      }
    282 
    283      // otherwise, convert it to \u / \U
    284      {
    285        U8_NEXT(s, i, length, c);
    286      }
    287      if(c<0) {
    288        fprintf(stderr, "Illegal utf-8 sequence at Column: %d\n", static_cast<int>(old_pos));
    289        fprintf(stderr, "Line: >>%s<<\n", linestr.c_str());
    290        return true;
    291      }
    292 
    293      size_t seqLen = (i-pos);
    294 
    295      //printf("U+%04X pos %d [len %d]\n", c, pos, seqLen);fflush(stdout);
    296 
    297      char newSeq[20];
    298      if( c <= 0xFFFF) {
    299        snprintf(newSeq, sizeof(newSeq), "\\u%04X", c);
    300      } else {
    301        snprintf(newSeq, sizeof(newSeq), "\\U%08X", c);
    302      }
    303      linestr.replace(pos, seqLen, newSeq);
    304      pos += strlen(newSeq) - 1;
    305    }
    306  }
    307 
    308  return false;
    309 }
    310 
    311 /**
    312 * Fixup an entire line
    313 * false = no err
    314 * true = had err
    315 * @param no the line number (not used)
    316 * @param linestr the string to fix
    317 * @return true if any err, else false
    318 */
    319 bool fixLine(int /*no*/, std::string &linestr) {
    320  const char *line = linestr.c_str();
    321  size_t len = linestr.size();
    322 
    323  // no u' in the line?
    324  if(!strstr(line, "u'") && !strstr(line, "u\"") && !strstr(line, "u8\"")) {
    325    return false; // Nothing to do. No u' or u" detected
    326  }
    327 
    328  // start from the end and find all u" cases
    329  size_t pos = len = linestr.size();
    330  if(len>INT32_MAX/2) {
    331    return true;
    332  }
    333  while((pos>0) && (pos = linestr.rfind("u\"", pos)) != std::string::npos) {
    334    //printf("found doublequote at %d\n", pos);
    335    if(fixAt(linestr, pos)) return true;
    336    if(pos == 0) break;
    337    pos--;
    338  }
    339 
    340  // reset and find all u' cases
    341  pos = len = linestr.size();
    342  while((pos>0) && (pos = linestr.rfind("u'", pos)) != std::string::npos) {
    343    //printf("found singlequote at %d\n", pos);
    344    if(fixAt(linestr, pos)) return true;
    345    if(pos == 0) break;
    346    pos--;
    347  }
    348 
    349  // reset and find all u8" cases
    350  pos = len = linestr.size();
    351  while((pos>0) && (pos = linestr.rfind("u8\"", pos)) != std::string::npos) {
    352    if(fixAt(linestr, pos)) return true;
    353    if(pos == 0) break;
    354    pos--;
    355  }
    356 
    357  //fprintf(stderr, "%d - fixed\n", no);
    358  return false;
    359 }
    360 
    361 /**
    362 * Convert a whole file
    363 * @param infile
    364 * @param outfile
    365 * @return 1 on err, 0 otherwise
    366 */
    367 int convert(const std::string &infile, const std::string &outfile) {
    368  fprintf(stderr, "escapesrc: %s -> %s\n", infile.c_str(), outfile.c_str());
    369 
    370  std::ifstream inf;
    371  
    372  inf.open(infile.c_str(), std::ios::in);
    373 
    374  if(!inf.is_open()) {
    375    fprintf(stderr, "%s: could not open input file %s\n", prog.c_str(), infile.c_str());
    376    cleanup(outfile);
    377    return 1;
    378  }
    379 
    380  std::ofstream outf;
    381 
    382  outf.open(outfile.c_str(), std::ios::out);
    383 
    384  if(!outf.is_open()) {
    385    fprintf(stderr, "%s: could not open output file %s\n", prog.c_str(), outfile.c_str());
    386    return 1;
    387  }
    388 
    389  // TODO: any platform variations of #line?
    390  outf << "#line 1 \"" << infile << "\"" << '\n';
    391 
    392  int no = 0;
    393  std::string linestr;
    394  while( getline( inf, linestr)) {
    395    no++;
    396    if(fixLine(no, linestr)) {
    397      goto fail;
    398    }
    399    outf << linestr << '\n';
    400  }
    401 
    402  if(inf.eof()) {
    403    return 0;
    404  }
    405 fail:
    406  outf.close();
    407  fprintf(stderr, "%s:%d: Fixup failed by %s\n", infile.c_str(), no, prog.c_str());
    408  cleanup(outfile);
    409  return 1;
    410 }
    411 
    412 /**
    413 * Main function
    414 */
    415 int main(int argc, const char *argv[]) {
    416  prog = argv[0];
    417 
    418  if(argc != 3) {
    419    usage();
    420    return 1;
    421  }
    422 
    423  std::string infile = argv[1];
    424  std::string outfile = argv[2];
    425 
    426  return convert(infile, outfile);
    427 }