tblgen.cpp (3188B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include "unicode/utypes.h" 5 #include "unicode/ucnv.h" 6 #include "unicode/uniset.h" 7 #include <stdio.h> 8 9 using icu::LocalUConverterPointer; 10 using icu::UnicodeSet; 11 12 static const char *kConverter = "ibm-1047"; 13 14 int main(int argc, const char *argv[]) { 15 printf("// %s\n", U_COPYRIGHT_STRING); 16 printf("// generated by tblgen. You weren't going to edit it by hand, were you?\n"); 17 printf("\n"); 18 19 UErrorCode status = U_ZERO_ERROR; 20 LocalUConverterPointer cnv(ucnv_open(kConverter, &status)); 21 22 if(U_FAILURE(status)) { 23 fprintf(stderr, "Failed to open %s: %s\n", kConverter, u_errorName(status)); 24 return 1; 25 } 26 27 printf("static const char cp1047_8859_1[256] = { \n"); 28 for(int i=0x00; i<0x100; i++) { 29 char cp1047[1]; 30 cp1047[0] = i; 31 char16_t u[1]; 32 char16_t *target = u; 33 const char *source = cp1047; 34 ucnv_toUnicode(cnv.getAlias(), &target, u+1, &source, cp1047+1, nullptr, true, &status); 35 if(U_FAILURE(status)) { 36 fprintf(stderr, "Conversion failure at #%X: %s\n", i, u_errorName(status)); 37 return 2; 38 } 39 printf(" (char)0x%02X, /* %02X */\n", u[0], i); 40 } 41 printf("};\n\n"); 42 43 // 44 // UnicodeSet oldIllegal("[:print:]", status); // [a-zA-Z0-9_}{#)(><%:;.?*+-/^&|~!=,\\u005b\\u005d\\u005c]", status); 45 UnicodeSet oldIllegal("[0-9 a-z A-Z " 46 "_ \\{ \\} \\[ \\] # \\( \\) < > % \\: ; . " 47 "? * + \\- / \\^ \\& | ~ ! = , \\ \" ' ]", status); 48 49 /* 50 51 http://www.lirmm.fr/~ducour/Doc-objets/ISO+IEC+14882-1998.pdf ( note: 1998 ) page 10, section 2.2 says: 52 53 1 The basic source character set consists of 96 characters: the space character, the control characters repre- 15) 54 senting horizontal tab, vertical tab, form feed, and new-line, plus the following 91 graphical characters: 55 a b c d e f g h i j k l m n opqrstuvwxyz 56 A B C D E F G H I J K L M N OPQRSTUVWXYZ 57 0 12 3 4 5 6 7 8 9 58 _ { } [ ] # ( ) < > % : ; . ?*+-/^&|~!=,\" 59 2 The universal-character-name construct provides a way to name other characters. hex-quad: 60 hexadecimal-digit hexadecimal-digit hexadecimal-digit hexadecimal-digit 61 universal-character-name: \u hex-quad 62 \U hex-quad hex-quad 63 The character designated by the universal-character-name \UNNNNNNNN is that character whose character short name in ISO/IEC 10646 is NNNNNNNN; the character designated by the universal-character-name \uNNNN is that character whose character short name in ISO/IEC 10646 is 0000NNNN. If the hexadecimal value for a universal character name is less than 0x20 or in the range 0x7F-0x9F (inclusive), or if the uni- versal character name designates a character in the basic source character set, then the program is ill- formed. 64 65 66 So basically: printable ASCII plus 0x00-0x1F, 0x7F-0x9F, was all illegal. 67 68 Some discussion at http://unicode.org/mail-arch/unicode-ml/y2003-m10/0471.html 69 70 */ 71 72 73 74 printf("static const bool oldIllegal[256] = { \n"); 75 for(char16_t i=0x00; i<0x100;i++) { 76 printf(" %s, /* U+%04X */\n", 77 (oldIllegal.contains(i))?" true":"false", 78 i); 79 } 80 printf("};\n\n"); 81 82 return 0; 83 }