ucmndata.cpp (13682B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ****************************************************************************** 5 * 6 * Copyright (C) 1999-2011, International Business Machines 7 * Corporation and others. All Rights Reserved. 8 * 9 ******************************************************************************/ 10 11 12 /*------------------------------------------------------------------------------ 13 * 14 * UCommonData An abstract interface for dealing with ICU Common Data Files. 15 * ICU Common Data Files are a grouping of a number of individual 16 * data items (resources, converters, tables, anything) into a 17 * single file or dll. The combined format includes a table of 18 * contents for locating the individual items by name. 19 * 20 * Two formats for the table of contents are supported, which is 21 * why there is an abstract interface involved. 22 * 23 */ 24 25 #include "unicode/utypes.h" 26 #include "unicode/udata.h" 27 #include "cstring.h" 28 #include "ucmndata.h" 29 #include "udatamem.h" 30 31 #if defined(UDATA_DEBUG) || defined(UDATA_DEBUG_DUMP) 32 # include <stdio.h> 33 #endif 34 35 U_CFUNC uint16_t 36 udata_getHeaderSize(const DataHeader *udh) { 37 if(udh==nullptr) { 38 return 0; 39 } else if(udh->info.isBigEndian==U_IS_BIG_ENDIAN) { 40 /* same endianness */ 41 return udh->dataHeader.headerSize; 42 } else { 43 /* opposite endianness */ 44 uint16_t x=udh->dataHeader.headerSize; 45 return (uint16_t)((x<<8)|(x>>8)); 46 } 47 } 48 49 U_CFUNC uint16_t 50 udata_getInfoSize(const UDataInfo *info) { 51 if(info==nullptr) { 52 return 0; 53 } else if(info->isBigEndian==U_IS_BIG_ENDIAN) { 54 /* same endianness */ 55 return info->size; 56 } else { 57 /* opposite endianness */ 58 uint16_t x=info->size; 59 return (uint16_t)((x<<8)|(x>>8)); 60 } 61 } 62 63 /*-----------------------------------------------------------------------------* 64 * * 65 * Pointer TOCs. TODO: This form of table-of-contents should be removed * 66 * because DLLs must be relocated on loading to correct the * 67 * pointer values and this operation makes shared memory * 68 * mapping of the data much less likely to work. * 69 * * 70 *-----------------------------------------------------------------------------*/ 71 typedef struct { 72 const char *entryName; 73 const DataHeader *pHeader; 74 } PointerTOCEntry; 75 76 77 typedef struct { 78 uint32_t count; 79 uint32_t reserved; 80 /** 81 * Variable-length array declared with length 1 to disable bounds checkers. 82 * The actual array length is in the count field. 83 */ 84 PointerTOCEntry entry[1]; 85 } PointerTOC; 86 87 88 /* definition of OffsetTOC struct types moved to ucmndata.h */ 89 90 /*-----------------------------------------------------------------------------* 91 * * 92 * entry point lookup implementations * 93 * * 94 *-----------------------------------------------------------------------------*/ 95 96 #ifndef MIN 97 #define MIN(a,b) (((a)<(b)) ? (a) : (b)) 98 #endif 99 100 /** 101 * Compare strings where we know the shared prefix length, 102 * and advance the prefix length as we find that the strings share even more characters. 103 */ 104 static int32_t 105 strcmpAfterPrefix(const char *s1, const char *s2, int32_t *pPrefixLength) { 106 int32_t pl=*pPrefixLength; 107 int32_t cmp=0; 108 s1+=pl; 109 s2+=pl; 110 for(;;) { 111 int32_t c1 = static_cast<uint8_t>(*s1++); 112 int32_t c2 = static_cast<uint8_t>(*s2++); 113 cmp=c1-c2; 114 if(cmp!=0 || c1==0) { /* different or done */ 115 break; 116 } 117 ++pl; /* increment shared same-prefix length */ 118 } 119 *pPrefixLength=pl; 120 return cmp; 121 } 122 123 static int32_t 124 offsetTOCPrefixBinarySearch(const char *s, const char *names, 125 const UDataOffsetTOCEntry *toc, int32_t count) { 126 int32_t start=0; 127 int32_t limit=count; 128 /* 129 * Remember the shared prefix between s, start and limit, 130 * and don't compare that shared prefix again. 131 * The shared prefix should get longer as we narrow the [start, limit[ range. 132 */ 133 int32_t startPrefixLength=0; 134 int32_t limitPrefixLength=0; 135 if(count==0) { 136 return -1; 137 } 138 /* 139 * Prime the prefix lengths so that we don't keep prefixLength at 0 until 140 * both the start and limit indexes have moved. 141 * At the same time, we find if s is one of the start and (limit-1) names, 142 * and if not, exclude them from the actual binary search. 143 */ 144 if(0==strcmpAfterPrefix(s, names+toc[0].nameOffset, &startPrefixLength)) { 145 return 0; 146 } 147 ++start; 148 --limit; 149 if(0==strcmpAfterPrefix(s, names+toc[limit].nameOffset, &limitPrefixLength)) { 150 return limit; 151 } 152 while(start<limit) { 153 int32_t i=(start+limit)/2; 154 int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength); 155 int32_t cmp=strcmpAfterPrefix(s, names+toc[i].nameOffset, &prefixLength); 156 if(cmp<0) { 157 limit=i; 158 limitPrefixLength=prefixLength; 159 } else if(cmp==0) { 160 return i; 161 } else { 162 start=i+1; 163 startPrefixLength=prefixLength; 164 } 165 } 166 return -1; 167 } 168 169 static int32_t 170 pointerTOCPrefixBinarySearch(const char *s, const PointerTOCEntry *toc, int32_t count) { 171 int32_t start=0; 172 int32_t limit=count; 173 /* 174 * Remember the shared prefix between s, start and limit, 175 * and don't compare that shared prefix again. 176 * The shared prefix should get longer as we narrow the [start, limit[ range. 177 */ 178 int32_t startPrefixLength=0; 179 int32_t limitPrefixLength=0; 180 if(count==0) { 181 return -1; 182 } 183 /* 184 * Prime the prefix lengths so that we don't keep prefixLength at 0 until 185 * both the start and limit indexes have moved. 186 * At the same time, we find if s is one of the start and (limit-1) names, 187 * and if not, exclude them from the actual binary search. 188 */ 189 if(0==strcmpAfterPrefix(s, toc[0].entryName, &startPrefixLength)) { 190 return 0; 191 } 192 ++start; 193 --limit; 194 if(0==strcmpAfterPrefix(s, toc[limit].entryName, &limitPrefixLength)) { 195 return limit; 196 } 197 while(start<limit) { 198 int32_t i=(start+limit)/2; 199 int32_t prefixLength=MIN(startPrefixLength, limitPrefixLength); 200 int32_t cmp=strcmpAfterPrefix(s, toc[i].entryName, &prefixLength); 201 if(cmp<0) { 202 limit=i; 203 limitPrefixLength=prefixLength; 204 } else if(cmp==0) { 205 return i; 206 } else { 207 start=i+1; 208 startPrefixLength=prefixLength; 209 } 210 } 211 return -1; 212 } 213 214 U_CDECL_BEGIN 215 static uint32_t U_CALLCONV 216 offsetTOCEntryCount(const UDataMemory *pData) { 217 int32_t retVal=0; 218 const UDataOffsetTOC *toc = (UDataOffsetTOC *)pData->toc; 219 if (toc != nullptr) { 220 retVal = toc->count; 221 } 222 return retVal; 223 } 224 225 static const DataHeader * U_CALLCONV 226 offsetTOCLookupFn(const UDataMemory *pData, 227 const char *tocEntryName, 228 int32_t *pLength, 229 UErrorCode *pErrorCode) { 230 (void)pErrorCode; 231 const UDataOffsetTOC *toc = (UDataOffsetTOC *)pData->toc; 232 if(toc!=nullptr) { 233 const char *base=(const char *)toc; 234 int32_t number, count=(int32_t)toc->count; 235 236 /* perform a binary search for the data in the common data's table of contents */ 237 #if defined (UDATA_DEBUG_DUMP) 238 /* list the contents of the TOC each time .. not recommended */ 239 for(number=0; number<count; ++number) { 240 fprintf(stderr, "\tx%d: %s\n", number, &base[toc->entry[number].nameOffset]); 241 } 242 #endif 243 number=offsetTOCPrefixBinarySearch(tocEntryName, base, toc->entry, count); 244 if(number>=0) { 245 /* found it */ 246 const UDataOffsetTOCEntry *entry=toc->entry+number; 247 #ifdef UDATA_DEBUG 248 fprintf(stderr, "%s: Found.\n", tocEntryName); 249 #endif 250 if((number+1) < count) { 251 *pLength = (int32_t)(entry[1].dataOffset - entry->dataOffset); 252 } else { 253 *pLength = -1; 254 } 255 return (const DataHeader *)(base+entry->dataOffset); 256 } else { 257 #ifdef UDATA_DEBUG 258 fprintf(stderr, "%s: Not found.\n", tocEntryName); 259 #endif 260 return nullptr; 261 } 262 } else { 263 #ifdef UDATA_DEBUG 264 fprintf(stderr, "returning header\n"); 265 #endif 266 267 return pData->pHeader; 268 } 269 } 270 271 272 static uint32_t U_CALLCONV pointerTOCEntryCount(const UDataMemory *pData) { 273 const PointerTOC *toc = (PointerTOC *)pData->toc; 274 return toc != nullptr ? toc->count : 0; 275 } 276 277 static const DataHeader * U_CALLCONV pointerTOCLookupFn(const UDataMemory *pData, 278 const char *name, 279 int32_t *pLength, 280 UErrorCode *pErrorCode) { 281 (void)pErrorCode; 282 if(pData->toc!=nullptr) { 283 const PointerTOC *toc = (PointerTOC *)pData->toc; 284 int32_t number, count=(int32_t)toc->count; 285 286 #if defined (UDATA_DEBUG_DUMP) 287 /* list the contents of the TOC each time .. not recommended */ 288 for(number=0; number<count; ++number) { 289 fprintf(stderr, "\tx%d: %s\n", number, toc->entry[number].entryName); 290 } 291 #endif 292 number=pointerTOCPrefixBinarySearch(name, toc->entry, count); 293 if(number>=0) { 294 /* found it */ 295 #ifdef UDATA_DEBUG 296 fprintf(stderr, "%s: Found.\n", toc->entry[number].entryName); 297 #endif 298 *pLength=-1; 299 return UDataMemory_normalizeDataPointer(toc->entry[number].pHeader); 300 } else { 301 #ifdef UDATA_DEBUG 302 fprintf(stderr, "%s: Not found.\n", name); 303 #endif 304 return nullptr; 305 } 306 } else { 307 return pData->pHeader; 308 } 309 } 310 U_CDECL_END 311 312 313 static const commonDataFuncs CmnDFuncs = {offsetTOCLookupFn, offsetTOCEntryCount}; 314 static const commonDataFuncs ToCPFuncs = {pointerTOCLookupFn, pointerTOCEntryCount}; 315 316 317 318 /*----------------------------------------------------------------------* 319 * * 320 * checkCommonData Validate the format of a common data file. * 321 * Fill in the virtual function ptr based on TOC type * 322 * If the data is invalid, close the UDataMemory * 323 * and set the appropriate error code. * 324 * * 325 *----------------------------------------------------------------------*/ 326 U_CFUNC void udata_checkCommonData(UDataMemory *udm, UErrorCode *err) { 327 if (U_FAILURE(*err)) { 328 return; 329 } 330 331 if(udm==nullptr || udm->pHeader==nullptr) { 332 *err=U_INVALID_FORMAT_ERROR; 333 } else if(!(udm->pHeader->dataHeader.magic1==0xda && 334 udm->pHeader->dataHeader.magic2==0x27 && 335 udm->pHeader->info.isBigEndian==U_IS_BIG_ENDIAN && 336 udm->pHeader->info.charsetFamily==U_CHARSET_FAMILY) 337 ) { 338 /* header not valid */ 339 *err=U_INVALID_FORMAT_ERROR; 340 } 341 else if (udm->pHeader->info.dataFormat[0]==0x43 && 342 udm->pHeader->info.dataFormat[1]==0x6d && 343 udm->pHeader->info.dataFormat[2]==0x6e && 344 udm->pHeader->info.dataFormat[3]==0x44 && 345 udm->pHeader->info.formatVersion[0]==1 346 ) { 347 /* dataFormat="CmnD" */ 348 udm->vFuncs = &CmnDFuncs; 349 udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader); 350 } 351 else if(udm->pHeader->info.dataFormat[0]==0x54 && 352 udm->pHeader->info.dataFormat[1]==0x6f && 353 udm->pHeader->info.dataFormat[2]==0x43 && 354 udm->pHeader->info.dataFormat[3]==0x50 && 355 udm->pHeader->info.formatVersion[0]==1 356 ) { 357 /* dataFormat="ToCP" */ 358 udm->vFuncs = &ToCPFuncs; 359 udm->toc=(const char *)udm->pHeader+udata_getHeaderSize(udm->pHeader); 360 } 361 else { 362 /* dataFormat not recognized */ 363 *err=U_INVALID_FORMAT_ERROR; 364 } 365 366 if (U_FAILURE(*err)) { 367 /* If the data is no good and we memory-mapped it ourselves, 368 * close the memory mapping so it doesn't leak. Note that this has 369 * no effect on non-memory mapped data, other than clearing fields in udm. 370 */ 371 udata_close(udm); 372 } 373 } 374 375 /* 376 * TODO: Add a udata_swapPackageHeader() function that swaps an ICU .dat package 377 * header but not its sub-items. 378 * This function will be needed for automatic runtime swapping. 379 * Sub-items should not be swapped to limit the swapping to the parts of the 380 * package that are actually used. 381 * 382 * Since lengths of items are implicit in the order and offsets of their 383 * ToC entries, and since offsets are relative to the start of the ToC, 384 * a swapped version may need to generate a different data structure 385 * with pointers to the original data items and with their lengths 386 * (-1 for the last one if it is not known), and maybe even pointers to the 387 * swapped versions of the items. 388 * These pointers to swapped versions would establish a cache; 389 * instead, each open data item could simply own the storage for its swapped 390 * data. This fits better with the current design. 391 * 392 * markus 2003sep18 Jitterbug 2235 393 */