tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

tblgen.cpp (3188B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 
      4 #include "unicode/utypes.h"
      5 #include "unicode/ucnv.h"
      6 #include "unicode/uniset.h"
      7 #include <stdio.h>
      8 
      9 using icu::LocalUConverterPointer;
     10 using icu::UnicodeSet;
     11 
     12 static const char *kConverter = "ibm-1047";
     13 
     14 int main(int argc, const char *argv[]) {
     15  printf("// %s\n", U_COPYRIGHT_STRING);
     16  printf("// generated by tblgen. You weren't going to edit it by hand, were you?\n");
     17  printf("\n");
     18 
     19  UErrorCode status = U_ZERO_ERROR;
     20  LocalUConverterPointer cnv(ucnv_open(kConverter, &status));
     21 
     22  if(U_FAILURE(status)) {
     23    fprintf(stderr, "Failed to open %s: %s\n", kConverter, u_errorName(status));
     24    return 1;
     25  }
     26 
     27  printf("static const char cp1047_8859_1[256] = { \n");
     28  for(int i=0x00; i<0x100; i++) {
     29    char cp1047[1];
     30    cp1047[0] = i;
     31    char16_t u[1];
     32    char16_t *target = u;
     33    const char *source = cp1047;
     34    ucnv_toUnicode(cnv.getAlias(), &target, u+1, &source, cp1047+1, nullptr, true, &status);
     35    if(U_FAILURE(status)) {
     36      fprintf(stderr, "Conversion failure at #%X: %s\n", i, u_errorName(status));
     37      return 2;
     38    }
     39    printf(" (char)0x%02X, /* %02X */\n", u[0], i);
     40  }
     41  printf("};\n\n");
     42 
     43  // 
     44  //  UnicodeSet oldIllegal("[:print:]", status); // [a-zA-Z0-9_}{#)(><%:;.?*+-/^&|~!=,\\u005b\\u005d\\u005c]", status);
     45  UnicodeSet oldIllegal("[0-9 a-z A-Z "
     46                        "_ \\{ \\} \\[ \\] # \\( \\) < > % \\: ; . "
     47                        "? * + \\- / \\^ \\& | ~ ! = , \\ \" ' ]", status);
     48  
     49  /*
     50 
     51 http://www.lirmm.fr/~ducour/Doc-objets/ISO+IEC+14882-1998.pdf ( note: 1998 )   page 10, section 2.2 says:
     52 
     53 1 The basic source character set consists of 96 characters: the space character, the control characters repre- 15)
     54 senting horizontal tab, vertical tab, form feed, and new-line, plus the following 91 graphical characters:
     55 a b c d e f g h i j k l m n opqrstuvwxyz
     56 A B C D E F G H I J K L M N OPQRSTUVWXYZ
     57 0 12 3 4 5 6 7 8 9
     58 _ { } [ ] # ( ) < > % : ; . ?*+-/^&|~!=,\"
     59 2 The universal-character-name construct provides a way to name other characters. hex-quad:
     60 hexadecimal-digit hexadecimal-digit hexadecimal-digit hexadecimal-digit
     61 universal-character-name: \u hex-quad
     62 \U hex-quad hex-quad
     63 The character designated by the universal-character-name \UNNNNNNNN is that character whose character short name in ISO/IEC 10646 is NNNNNNNN; the character designated by the universal-character-name \uNNNN is that character whose character short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value for a universal character name is less than 0x20 or in the range 0x7F-0x9F (inclusive), or if the uni- versal character name designates a character in the basic source character set, then the program is ill- formed.
     64 
     65 
     66 So basically:  printable ASCII plus  0x00-0x1F,  0x7F-0x9F, was all illegal.
     67 
     68 Some discussion at http://unicode.org/mail-arch/unicode-ml/y2003-m10/0471.html
     69 
     70   */
     71  
     72 
     73 
     74  printf("static const bool oldIllegal[256] = { \n");
     75  for(char16_t i=0x00; i<0x100;i++) {
     76    printf(" %s, /* U+%04X */\n",
     77           (oldIllegal.contains(i))?" true":"false",
     78           i);
     79  }
     80  printf("};\n\n");
     81  
     82  return 0;
     83 }