genbrk.cpp (11267B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2002-2016, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * 9 * File genbrk.c 10 */ 11 12 //-------------------------------------------------------------------- 13 // 14 // Tool for generating RuleBasedBreakIterator data files (.brk files). 15 // .brk files contain the precompiled rules for standard types 16 // of iterators - word, line, sentence, etc. 17 // 18 // Usage: genbrk [options] -r rule-file.txt -o output-file.brk 19 // 20 // options: -v verbose 21 // -? or -h help 22 // 23 // The input rule file is a plain text file containing break rules 24 // in the input format accepted by RuleBasedBreakIterators. The 25 // file can be encoded as UTF-8 or UTF-16 (either endian). Files 26 // encoded as UTF-16 must include a BOM. 27 // 28 //-------------------------------------------------------------------- 29 30 #include "unicode/utypes.h" 31 #include "unicode/ucnv.h" 32 #include "unicode/unistr.h" 33 #include "unicode/rbbi.h" 34 #include "unicode/uclean.h" 35 #include "unicode/udata.h" 36 #include "unicode/putil.h" 37 38 #include "uoptions.h" 39 #include "unewdata.h" 40 #include "ucmndata.h" 41 #include "rbbidata.h" 42 #include "cmemory.h" 43 44 #include <stdio.h> 45 #include <stdlib.h> 46 #include <string.h> 47 48 U_NAMESPACE_USE 49 50 static char *progName; 51 static UOption options[]={ 52 UOPTION_HELP_H, /* 0 */ 53 UOPTION_HELP_QUESTION_MARK, /* 1 */ 54 UOPTION_VERBOSE, /* 2 */ 55 { "rules", nullptr, nullptr, nullptr, 'r', UOPT_REQUIRES_ARG, 0 }, /* 3 */ 56 { "out", nullptr, nullptr, nullptr, 'o', UOPT_REQUIRES_ARG, 0 }, /* 4 */ 57 UOPTION_ICUDATADIR, /* 5 */ 58 UOPTION_DESTDIR, /* 6 */ 59 UOPTION_COPYRIGHT, /* 7 */ 60 UOPTION_QUIET, /* 8 */ 61 }; 62 63 void usageAndDie(int retCode) { 64 printf("Usage: %s [-v] [-options] -r rule-file -o output-file\n", progName); 65 printf("\tRead in break iteration rules text and write out the binary data.\n" 66 "\tIf the rule file does not have a Unicode signature byte sequence, it is assumed\n" 67 "\tto be UTF-8.\n" 68 "options:\n" 69 "\t-h or -? or --help this usage text\n" 70 "\t-V or --version show a version message\n" 71 "\t-c or --copyright include a copyright notice\n" 72 "\t-v or --verbose turn on verbose output\n" 73 "\t-q or --quiet do not display warnings and progress\n" 74 "\t-i or --icudatadir directory for locating any needed intermediate data files,\n" 75 "\t followed by path, defaults to %s\n" 76 "\t-d or --destdir destination directory, followed by the path\n", 77 u_getDataDirectory()); 78 exit (retCode); 79 } 80 81 82 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 83 84 /* dummy UDataInfo cf. udata.h */ 85 static UDataInfo dummyDataInfo = { 86 sizeof(UDataInfo), 87 0, 88 89 U_IS_BIG_ENDIAN, 90 U_CHARSET_FAMILY, 91 U_SIZEOF_UCHAR, 92 0, 93 94 { 0, 0, 0, 0 }, /* dummy dataFormat */ 95 { 0, 0, 0, 0 }, /* dummy formatVersion */ 96 { 0, 0, 0, 0 } /* dummy dataVersion */ 97 }; 98 99 #else 100 101 // 102 // Set up the ICU data header, defined in ucmndata.h 103 // 104 DataHeader dh ={ 105 {sizeof(DataHeader), // Struct MappedData 106 0xda, 107 0x27}, 108 109 { // struct UDataInfo 110 sizeof(UDataInfo), // size 111 0, // reserved 112 U_IS_BIG_ENDIAN, 113 U_CHARSET_FAMILY, 114 U_SIZEOF_UCHAR, 115 0, // reserved 116 117 { 0x42, 0x72, 0x6b, 0x20 }, // dataFormat="Brk " 118 { 0xff, 0, 0, 0 }, // formatVersion. Filled in later with values 119 // from the RBBI rule builder. The values declared 120 // here should never appear in any real RBBI data. 121 { 4, 1, 0, 0 } // dataVersion (Unicode version) 122 }}; 123 124 #endif 125 126 //---------------------------------------------------------------------------- 127 // 128 // main for genbrk 129 // 130 //---------------------------------------------------------------------------- 131 int main(int argc, char **argv) { 132 UErrorCode status = U_ZERO_ERROR; 133 const char *ruleFileName; 134 const char *outFileName; 135 const char *outDir = nullptr; 136 const char *copyright = nullptr; 137 138 // 139 // Pick up and check the command line arguments, 140 // using the standard ICU tool utils option handling. 141 // 142 U_MAIN_INIT_ARGS(argc, argv); 143 progName = argv[0]; 144 argc=u_parseArgs(argc, argv, UPRV_LENGTHOF(options), options); 145 if(argc<0) { 146 // Unrecognized option 147 fprintf(stderr, "error in command line argument \"%s\"\n", argv[-argc]); 148 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 149 } 150 151 if(options[0].doesOccur || options[1].doesOccur) { 152 // -? or -h for help. 153 usageAndDie(0); 154 } 155 156 if (!(options[3].doesOccur && options[4].doesOccur)) { 157 fprintf(stderr, "rule file and output file must both be specified.\n"); 158 usageAndDie(U_ILLEGAL_ARGUMENT_ERROR); 159 } 160 ruleFileName = options[3].value; 161 outFileName = options[4].value; 162 163 if (options[5].doesOccur) { 164 u_setDataDirectory(options[5].value); 165 } 166 167 status = U_ZERO_ERROR; 168 169 /* Combine the directory with the file name */ 170 if(options[6].doesOccur) { 171 outDir = options[6].value; 172 } 173 if (options[7].doesOccur) { 174 copyright = U_COPYRIGHT_STRING; 175 } 176 177 #if UCONFIG_NO_BREAK_ITERATION || UCONFIG_NO_FILE_IO 178 179 UNewDataMemory *pData; 180 char msg[1024]; 181 182 /* write message with just the name */ 183 snprintf(msg, sizeof(msg), "genbrk writes dummy %s because of UCONFIG_NO_BREAK_ITERATION and/or UCONFIG_NO_FILE_IO, see uconfig.h", outFileName); 184 fprintf(stderr, "%s\n", msg); 185 186 /* write the dummy data file */ 187 pData = udata_create(outDir, nullptr, outFileName, &dummyDataInfo, nullptr, &status); 188 udata_writeBlock(pData, msg, strlen(msg)); 189 udata_finish(pData, &status); 190 return (int)status; 191 192 #else 193 /* Initialize ICU */ 194 u_init(&status); 195 if (U_FAILURE(status)) { 196 fprintf(stderr, "%s: can not initialize ICU. status = %s\n", 197 argv[0], u_errorName(status)); 198 exit(1); 199 } 200 status = U_ZERO_ERROR; 201 202 // 203 // Read in the rule source file 204 // 205 long result; 206 long ruleFileSize; 207 FILE *file; 208 char *ruleBufferC; 209 210 file = fopen(ruleFileName, "rb"); 211 if (file == nullptr) { 212 fprintf(stderr, "Could not open file \"%s\"\n", ruleFileName); 213 exit(-1); 214 } 215 fseek(file, 0, SEEK_END); 216 ruleFileSize = ftell(file); 217 fseek(file, 0, SEEK_SET); 218 ruleBufferC = new char[ruleFileSize+10]; 219 220 result = static_cast<long>(fread(ruleBufferC, 1, ruleFileSize, file)); 221 if (result != ruleFileSize) { 222 fprintf(stderr, "Error reading file \"%s\"\n", ruleFileName); 223 exit (-1); 224 } 225 ruleBufferC[ruleFileSize]=0; 226 fclose(file); 227 228 // 229 // Look for a Unicode Signature (BOM) on the rule file 230 // 231 int32_t signatureLength; 232 const char * ruleSourceC = ruleBufferC; 233 const char* encoding = ucnv_detectUnicodeSignature( 234 ruleSourceC, ruleFileSize, &signatureLength, &status); 235 if (U_FAILURE(status)) { 236 exit(status); 237 } 238 if (encoding == nullptr) { 239 // In the absence of a BOM, assume the rule file is in UTF-8. 240 encoding = "UTF-8"; 241 } else { 242 ruleSourceC += signatureLength; 243 ruleFileSize -= signatureLength; 244 } 245 246 // 247 // Open a converter to take the rule file to UTF-16 248 // 249 UConverter* conv; 250 conv = ucnv_open(encoding, &status); 251 if (U_FAILURE(status)) { 252 fprintf(stderr, "ucnv_open: ICU Error \"%s\"\n", u_errorName(status)); 253 exit(status); 254 } 255 256 // 257 // Convert the rules to char16_t. 258 // Preflight first to determine required buffer size. 259 // 260 uint32_t destCap = ucnv_toUChars(conv, 261 nullptr, // dest, 262 0, // destCapacity, 263 ruleSourceC, 264 ruleFileSize, 265 &status); 266 if (status != U_BUFFER_OVERFLOW_ERROR) { 267 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 268 exit(status); 269 } 270 271 status = U_ZERO_ERROR; 272 char16_t *ruleSourceU = new char16_t[destCap+1]; 273 ucnv_toUChars(conv, 274 ruleSourceU, // dest, 275 destCap+1, 276 ruleSourceC, 277 ruleFileSize, 278 &status); 279 if (U_FAILURE(status)) { 280 fprintf(stderr, "ucnv_toUChars: ICU Error \"%s\"\n", u_errorName(status)); 281 exit(status); 282 } 283 ucnv_close(conv); 284 285 286 // 287 // Put the source rules into a UnicodeString 288 // 289 UnicodeString ruleSourceS(false, ruleSourceU, destCap); 290 291 // 292 // Create the break iterator from the rules 293 // This will compile the rules. 294 // 295 UParseError parseError; 296 parseError.line = 0; 297 parseError.offset = 0; 298 RuleBasedBreakIterator *bi = new RuleBasedBreakIterator(ruleSourceS, parseError, status); 299 if (U_FAILURE(status)) { 300 fprintf(stderr, "createRuleBasedBreakIterator: ICU Error \"%s\" at line %d, column %d\n", 301 u_errorName(status), static_cast<int>(parseError.line), static_cast<int>(parseError.offset)); 302 exit(status); 303 } 304 305 306 // 307 // Get the compiled rule data from the break iterator. 308 // 309 uint32_t outDataSize; 310 const uint8_t *outData; 311 outData = bi->getBinaryRules(outDataSize); 312 313 // Copy the data format version numbers from the RBBI data header into the UDataMemory header. 314 uprv_memcpy(dh.info.formatVersion, ((RBBIDataHeader *)outData)->fFormatVersion, sizeof(dh.info.formatVersion)); 315 316 // 317 // Create the output file 318 // 319 size_t bytesWritten; 320 UNewDataMemory *pData; 321 pData = udata_create(outDir, nullptr, outFileName, &(dh.info), copyright, &status); 322 if(U_FAILURE(status)) { 323 fprintf(stderr, "genbrk: Could not open output file \"%s\", \"%s\"\n", 324 outFileName, u_errorName(status)); 325 exit(status); 326 } 327 328 329 // Write the data itself. 330 udata_writeBlock(pData, outData, outDataSize); 331 // finish up 332 bytesWritten = udata_finish(pData, &status); 333 if(U_FAILURE(status)) { 334 fprintf(stderr, "genbrk: error %d writing the output file\n", status); 335 exit(status); 336 } 337 338 if (bytesWritten != outDataSize) { 339 fprintf(stderr, "Error writing to output file \"%s\"\n", outFileName); 340 exit(-1); 341 } 342 343 delete bi; 344 delete[] ruleSourceU; 345 delete[] ruleBufferC; 346 u_cleanup(); 347 348 349 if(!options[8].doesOccur) { 350 printf("genbrk: tool completed successfully.\n"); 351 } 352 return 0; 353 354 #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 355 }