collationdatawriter.cpp (14153B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ******************************************************************************* 5 * Copyright (C) 2013-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * collationdatawriter.cpp 9 * 10 * created on: 2013aug06 11 * created by: Markus W. Scherer 12 */ 13 14 #include "unicode/utypes.h" 15 16 #if !UCONFIG_NO_COLLATION 17 18 #include "unicode/tblcoll.h" 19 #include "unicode/udata.h" 20 #include "unicode/uniset.h" 21 #include "cmemory.h" 22 #include "collationdata.h" 23 #include "collationdatabuilder.h" 24 #include "collationdatareader.h" 25 #include "collationdatawriter.h" 26 #include "collationfastlatin.h" 27 #include "collationsettings.h" 28 #include "collationtailoring.h" 29 #include "uassert.h" 30 #include "ucmndata.h" 31 32 U_NAMESPACE_BEGIN 33 34 uint8_t * 35 RuleBasedCollator::cloneRuleData(int32_t &length, UErrorCode &errorCode) const { 36 if(U_FAILURE(errorCode)) { return nullptr; } 37 LocalMemory<uint8_t> buffer(static_cast<uint8_t*>(uprv_malloc(20000))); 38 if(buffer.isNull()) { 39 errorCode = U_MEMORY_ALLOCATION_ERROR; 40 return nullptr; 41 } 42 UErrorCode bufferStatus = U_ZERO_ERROR; 43 length = cloneBinary(buffer.getAlias(), 20000, bufferStatus); 44 if(bufferStatus == U_BUFFER_OVERFLOW_ERROR) { 45 if(buffer.allocateInsteadAndCopy(length, 0) == nullptr) { 46 errorCode = U_MEMORY_ALLOCATION_ERROR; 47 return nullptr; 48 } 49 bufferStatus = U_ZERO_ERROR; 50 length = cloneBinary(buffer.getAlias(), length, bufferStatus); 51 } 52 if(U_FAILURE(bufferStatus)) { 53 errorCode = bufferStatus; 54 return nullptr; 55 } 56 return buffer.orphan(); 57 } 58 59 int32_t 60 RuleBasedCollator::cloneBinary(uint8_t *dest, int32_t capacity, UErrorCode &errorCode) const { 61 int32_t indexes[CollationDataReader::IX_TOTAL_SIZE + 1]; 62 return CollationDataWriter::writeTailoring( 63 *tailoring, *settings, indexes, dest, capacity, 64 errorCode); 65 } 66 67 static const UDataInfo dataInfo = { 68 sizeof(UDataInfo), 69 0, 70 71 U_IS_BIG_ENDIAN, 72 U_CHARSET_FAMILY, 73 U_SIZEOF_UCHAR, 74 0, 75 76 { 0x55, 0x43, 0x6f, 0x6c }, // dataFormat="UCol" 77 { 5, 0, 0, 0 }, // formatVersion 78 { 6, 3, 0, 0 } // dataVersion 79 }; 80 81 int32_t 82 CollationDataWriter::writeBase(const CollationData &data, const CollationSettings &settings, 83 const void *rootElements, int32_t rootElementsLength, 84 int32_t indexes[], uint8_t *dest, int32_t capacity, 85 UErrorCode &errorCode) { 86 return write(true, nullptr, 87 data, settings, 88 rootElements, rootElementsLength, 89 indexes, dest, capacity, errorCode); 90 } 91 92 int32_t 93 CollationDataWriter::writeTailoring(const CollationTailoring &t, const CollationSettings &settings, 94 int32_t indexes[], uint8_t *dest, int32_t capacity, 95 UErrorCode &errorCode) { 96 return write(false, t.version, 97 *t.data, settings, 98 nullptr, 0, 99 indexes, dest, capacity, errorCode); 100 } 101 102 int32_t 103 CollationDataWriter::write(UBool isBase, const UVersionInfo dataVersion, 104 const CollationData &data, const CollationSettings &settings, 105 const void *rootElements, int32_t rootElementsLength, 106 int32_t indexes[], uint8_t *dest, int32_t capacity, 107 UErrorCode &errorCode) { 108 if(U_FAILURE(errorCode)) { return 0; } 109 if(capacity < 0 || (capacity > 0 && dest == nullptr)) { 110 errorCode = U_ILLEGAL_ARGUMENT_ERROR; 111 return 0; 112 } 113 114 // Figure out which data items to write before settling on 115 // the indexes length and writing offsets. 116 // For any data item, we need to write the start and limit offsets, 117 // so the indexes length must be at least index-of-start-offset + 2. 118 int32_t indexesLength; 119 UBool hasMappings; 120 UnicodeSet unsafeBackwardSet; 121 const CollationData *baseData = data.base; 122 123 int32_t fastLatinVersion; 124 if(data.fastLatinTable != nullptr) { 125 fastLatinVersion = static_cast<int32_t>(CollationFastLatin::VERSION) << 16; 126 } else { 127 fastLatinVersion = 0; 128 } 129 int32_t fastLatinTableLength = 0; 130 131 if(isBase) { 132 // For the root collator, we write an even number of indexes 133 // so that we start with an 8-aligned offset. 134 indexesLength = CollationDataReader::IX_TOTAL_SIZE + 1; 135 U_ASSERT(settings.reorderCodesLength == 0); 136 hasMappings = true; 137 unsafeBackwardSet = *data.unsafeBackwardSet; 138 fastLatinTableLength = data.fastLatinTableLength; 139 } else if(baseData == nullptr) { 140 hasMappings = false; 141 if(settings.reorderCodesLength == 0) { 142 // only options 143 indexesLength = CollationDataReader::IX_OPTIONS + 1; // no limit offset here 144 } else { 145 // only options, reorder codes, and the reorder table 146 indexesLength = CollationDataReader::IX_REORDER_TABLE_OFFSET + 2; 147 } 148 } else { 149 hasMappings = true; 150 // Tailored mappings, and what else? 151 // Check in ascending order of optional tailoring data items. 152 indexesLength = CollationDataReader::IX_CE32S_OFFSET + 2; 153 if(data.contextsLength != 0) { 154 indexesLength = CollationDataReader::IX_CONTEXTS_OFFSET + 2; 155 } 156 unsafeBackwardSet.addAll(*data.unsafeBackwardSet).removeAll(*baseData->unsafeBackwardSet); 157 if(!unsafeBackwardSet.isEmpty()) { 158 indexesLength = CollationDataReader::IX_UNSAFE_BWD_OFFSET + 2; 159 } 160 if(data.fastLatinTable != baseData->fastLatinTable) { 161 fastLatinTableLength = data.fastLatinTableLength; 162 indexesLength = CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET + 2; 163 } 164 } 165 166 UVector32 codesAndRanges(errorCode); 167 const int32_t *reorderCodes = settings.reorderCodes; 168 int32_t reorderCodesLength = settings.reorderCodesLength; 169 if(settings.hasReordering() && 170 CollationSettings::reorderTableHasSplitBytes(settings.reorderTable)) { 171 // Rebuild the full list of reorder ranges. 172 // The list in the settings is truncated for efficiency. 173 data.makeReorderRanges(reorderCodes, reorderCodesLength, codesAndRanges, errorCode); 174 // Write the codes, then the ranges. 175 for(int32_t i = 0; i < reorderCodesLength; ++i) { 176 codesAndRanges.insertElementAt(reorderCodes[i], i, errorCode); 177 } 178 if(U_FAILURE(errorCode)) { return 0; } 179 reorderCodes = codesAndRanges.getBuffer(); 180 reorderCodesLength = codesAndRanges.size(); 181 } 182 183 int32_t headerSize; 184 if(isBase) { 185 headerSize = 0; // udata_create() writes the header 186 } else { 187 DataHeader header; 188 header.dataHeader.magic1 = 0xda; 189 header.dataHeader.magic2 = 0x27; 190 uprv_memcpy(&header.info, &dataInfo, sizeof(UDataInfo)); 191 uprv_memcpy(header.info.dataVersion, dataVersion, sizeof(UVersionInfo)); 192 headerSize = static_cast<int32_t>(sizeof(header)); 193 U_ASSERT((headerSize & 3) == 0); // multiple of 4 bytes 194 if(hasMappings && data.cesLength != 0) { 195 // Sum of the sizes of the data items which are 196 // not automatically multiples of 8 bytes and which are placed before the CEs. 197 int32_t sum = headerSize + (indexesLength + reorderCodesLength) * 4; 198 if((sum & 7) != 0) { 199 // We need to add padding somewhere so that the 64-bit CEs are 8-aligned. 200 // We add to the header size here. 201 // Alternatively, we could increment the indexesLength 202 // or add a few bytes to the reorderTable. 203 headerSize += 4; 204 } 205 } 206 header.dataHeader.headerSize = static_cast<uint16_t>(headerSize); 207 if(headerSize <= capacity) { 208 uprv_memcpy(dest, &header, sizeof(header)); 209 // Write 00 bytes so that the padding is not mistaken for a copyright string. 210 uprv_memset(dest + sizeof(header), 0, headerSize - (int32_t)sizeof(header)); 211 dest += headerSize; 212 capacity -= headerSize; 213 } else { 214 dest = nullptr; 215 capacity = 0; 216 } 217 } 218 219 indexes[CollationDataReader::IX_INDEXES_LENGTH] = indexesLength; 220 U_ASSERT((settings.options & ~0xffff) == 0); 221 indexes[CollationDataReader::IX_OPTIONS] = 222 data.numericPrimary | fastLatinVersion | settings.options; 223 indexes[CollationDataReader::IX_RESERVED2] = 0; 224 indexes[CollationDataReader::IX_RESERVED3] = 0; 225 226 // Byte offsets of data items all start from the start of the indexes. 227 // We add the headerSize at the very end. 228 int32_t totalSize = indexesLength * 4; 229 230 if(hasMappings && (isBase || data.jamoCE32s != baseData->jamoCE32s)) { 231 indexes[CollationDataReader::IX_JAMO_CE32S_START] = static_cast<int32_t>(data.jamoCE32s - data.ce32s); 232 } else { 233 indexes[CollationDataReader::IX_JAMO_CE32S_START] = -1; 234 } 235 236 indexes[CollationDataReader::IX_REORDER_CODES_OFFSET] = totalSize; 237 totalSize += reorderCodesLength * 4; 238 239 indexes[CollationDataReader::IX_REORDER_TABLE_OFFSET] = totalSize; 240 if(settings.reorderTable != nullptr) { 241 totalSize += 256; 242 } 243 244 indexes[CollationDataReader::IX_TRIE_OFFSET] = totalSize; 245 if(hasMappings) { 246 UErrorCode errorCode2 = U_ZERO_ERROR; 247 int32_t length; 248 if(totalSize < capacity) { 249 length = utrie2_serialize(data.trie, dest + totalSize, 250 capacity - totalSize, &errorCode2); 251 } else { 252 length = utrie2_serialize(data.trie, nullptr, 0, &errorCode2); 253 } 254 if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { 255 errorCode = errorCode2; 256 return 0; 257 } 258 // The trie size should be a multiple of 8 bytes due to the way 259 // compactIndex2(UNewTrie2 *trie) currently works. 260 U_ASSERT((length & 7) == 0); 261 totalSize += length; 262 } 263 264 indexes[CollationDataReader::IX_RESERVED8_OFFSET] = totalSize; 265 indexes[CollationDataReader::IX_CES_OFFSET] = totalSize; 266 if(hasMappings && data.cesLength != 0) { 267 U_ASSERT(((headerSize + totalSize) & 7) == 0); 268 totalSize += data.cesLength * 8; 269 } 270 271 indexes[CollationDataReader::IX_RESERVED10_OFFSET] = totalSize; 272 indexes[CollationDataReader::IX_CE32S_OFFSET] = totalSize; 273 if(hasMappings) { 274 totalSize += data.ce32sLength * 4; 275 } 276 277 indexes[CollationDataReader::IX_ROOT_ELEMENTS_OFFSET] = totalSize; 278 totalSize += rootElementsLength * 4; 279 280 indexes[CollationDataReader::IX_CONTEXTS_OFFSET] = totalSize; 281 if(hasMappings) { 282 totalSize += data.contextsLength * 2; 283 } 284 285 indexes[CollationDataReader::IX_UNSAFE_BWD_OFFSET] = totalSize; 286 if(hasMappings && !unsafeBackwardSet.isEmpty()) { 287 UErrorCode errorCode2 = U_ZERO_ERROR; 288 int32_t length; 289 if(totalSize < capacity) { 290 uint16_t *p = reinterpret_cast<uint16_t *>(dest + totalSize); 291 length = unsafeBackwardSet.serialize( 292 p, (capacity - totalSize) / 2, errorCode2); 293 } else { 294 length = unsafeBackwardSet.serialize(nullptr, 0, errorCode2); 295 } 296 if(U_FAILURE(errorCode2) && errorCode2 != U_BUFFER_OVERFLOW_ERROR) { 297 errorCode = errorCode2; 298 return 0; 299 } 300 totalSize += length * 2; 301 } 302 303 indexes[CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET] = totalSize; 304 totalSize += fastLatinTableLength * 2; 305 306 UnicodeString scripts; 307 indexes[CollationDataReader::IX_SCRIPTS_OFFSET] = totalSize; 308 if(isBase) { 309 scripts.append(static_cast<char16_t>(data.numScripts)); 310 scripts.append(reinterpret_cast<const char16_t *>(data.scriptsIndex), data.numScripts + 16); 311 scripts.append(reinterpret_cast<const char16_t *>(data.scriptStarts), data.scriptStartsLength); 312 totalSize += scripts.length() * 2; 313 } 314 315 indexes[CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET] = totalSize; 316 if(isBase) { 317 totalSize += 256; 318 } 319 320 indexes[CollationDataReader::IX_RESERVED18_OFFSET] = totalSize; 321 indexes[CollationDataReader::IX_TOTAL_SIZE] = totalSize; 322 323 if(totalSize > capacity) { 324 errorCode = U_BUFFER_OVERFLOW_ERROR; 325 return headerSize + totalSize; 326 } 327 328 uprv_memcpy(dest, indexes, indexesLength * 4); 329 copyData(indexes, CollationDataReader::IX_REORDER_CODES_OFFSET, reorderCodes, dest); 330 copyData(indexes, CollationDataReader::IX_REORDER_TABLE_OFFSET, settings.reorderTable, dest); 331 // The trie has already been serialized into the dest buffer. 332 copyData(indexes, CollationDataReader::IX_CES_OFFSET, data.ces, dest); 333 copyData(indexes, CollationDataReader::IX_CE32S_OFFSET, data.ce32s, dest); 334 copyData(indexes, CollationDataReader::IX_ROOT_ELEMENTS_OFFSET, rootElements, dest); 335 copyData(indexes, CollationDataReader::IX_CONTEXTS_OFFSET, data.contexts, dest); 336 // The unsafeBackwardSet has already been serialized into the dest buffer. 337 copyData(indexes, CollationDataReader::IX_FAST_LATIN_TABLE_OFFSET, data.fastLatinTable, dest); 338 copyData(indexes, CollationDataReader::IX_SCRIPTS_OFFSET, scripts.getBuffer(), dest); 339 copyData(indexes, CollationDataReader::IX_COMPRESSIBLE_BYTES_OFFSET, data.compressibleBytes, dest); 340 341 return headerSize + totalSize; 342 } 343 344 void 345 CollationDataWriter::copyData(const int32_t indexes[], int32_t startIndex, 346 const void *src, uint8_t *dest) { 347 int32_t start = indexes[startIndex]; 348 int32_t limit = indexes[startIndex + 1]; 349 if(start < limit) { 350 uprv_memcpy(dest + start, src, limit - start); 351 } 352 } 353 354 U_NAMESPACE_END 355 356 #endif // !UCONFIG_NO_COLLATION