extradata.cpp (10462B)
1 // © 2017 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 // extradata.cpp 5 // created: 2017jun04 Markus W. Scherer 6 // (pulled out of n2builder.cpp) 7 8 #include "unicode/utypes.h" 9 10 #if !UCONFIG_NO_NORMALIZATION 11 12 #include <stdio.h> 13 #include <stdlib.h> 14 #include "unicode/errorcode.h" 15 #include "unicode/unistr.h" 16 #include "unicode/utf16.h" 17 #include "extradata.h" 18 #include "normalizer2impl.h" 19 #include "norms.h" 20 #include "toolutil.h" 21 #include "utrie2.h" 22 #include "uvectr32.h" 23 24 U_NAMESPACE_BEGIN 25 26 ExtraData::ExtraData(Norms &n, UBool fast) : 27 Norms::Enumerator(n), 28 yesYesCompositions(1000, static_cast<UChar32>(0xffff), 2), // 0=inert, 1=Jamo L, 2=start of compositions 29 yesNoMappingsAndCompositions(1000, static_cast<UChar32>(0), 1), // 0=Hangul LV, 1=start of normal data 30 yesNoMappingsOnly(1000, static_cast<UChar32>(0), 1), // 0=Hangul LVT, 1=start of normal data 31 optimizeFast(fast) { 32 // Hangul LV algorithmically decomposes to two Jamo. 33 // Some code may harmlessly read this firstUnit. 34 yesNoMappingsAndCompositions.setCharAt(0, 2); 35 // Hangul LVT algorithmically decomposes to three Jamo. 36 // Some code may harmlessly read this firstUnit. 37 yesNoMappingsOnly.setCharAt(0, 3); 38 } 39 40 int32_t ExtraData::writeMapping(UChar32 c, const Norm &norm, UnicodeString &dataString) { 41 UnicodeString &m=*norm.mapping; 42 int32_t length=m.length(); 43 // Write the mapping & raw mapping extraData. 44 int32_t firstUnit=length|(norm.trailCC<<8); 45 int32_t preMappingLength=0; 46 if(norm.rawMapping!=nullptr) { 47 UnicodeString &rm=*norm.rawMapping; 48 int32_t rmLength=rm.length(); 49 if(rmLength>Normalizer2Impl::MAPPING_LENGTH_MASK) { 50 fprintf(stderr, 51 "gennorm2 error: " 52 "raw mapping for U+%04lX longer than maximum of %d\n", 53 static_cast<long>(c), Normalizer2Impl::MAPPING_LENGTH_MASK); 54 exit(U_INVALID_FORMAT_ERROR); 55 } 56 char16_t rm0=rm.charAt(0); 57 if( rmLength==length-1 && 58 // 99: overlong substring lengths get pinned to remainder lengths anyway 59 0==rm.compare(1, 99, m, 2, 99) && 60 rm0>Normalizer2Impl::MAPPING_LENGTH_MASK 61 ) { 62 // Compression: 63 // rawMapping=rm0+mapping.substring(2) -> store only rm0 64 // 65 // The raw mapping is the same as the final mapping after replacing 66 // the final mapping's first two code units with the raw mapping's first one. 67 // In this case, we store only that first unit, rm0. 68 // This helps with a few hundred mappings. 69 dataString.append(rm0); 70 preMappingLength=1; 71 } else { 72 // Store the raw mapping with its length. 73 dataString.append(rm); 74 dataString.append(static_cast<char16_t>(rmLength)); 75 preMappingLength=rmLength+1; 76 } 77 firstUnit|=Normalizer2Impl::MAPPING_HAS_RAW_MAPPING; 78 } 79 int32_t cccLccc=norm.cc|(norm.leadCC<<8); 80 if(cccLccc!=0) { 81 dataString.append(static_cast<char16_t>(cccLccc)); 82 ++preMappingLength; 83 firstUnit|=Normalizer2Impl::MAPPING_HAS_CCC_LCCC_WORD; 84 } 85 dataString.append(static_cast<char16_t>(firstUnit)); 86 dataString.append(m); 87 return preMappingLength; 88 } 89 90 int32_t ExtraData::writeNoNoMapping(UChar32 c, const Norm &norm, 91 UnicodeString &dataString, 92 Hashtable &previousMappings) { 93 UnicodeString newMapping; 94 int32_t offset=writeMapping(c, norm, newMapping); 95 UBool found=false; 96 int32_t previousOffset=previousMappings.getiAndFound(newMapping, found); 97 if(found) { 98 // Duplicate, point to the identical mapping that has already been stored. 99 offset=previousOffset; 100 } else { 101 // Append this new mapping and 102 // enter it into the hashtable, avoiding value 0 which is "not found". 103 offset=dataString.length()+offset; 104 dataString.append(newMapping); 105 IcuToolErrorCode errorCode("gennorm2/writeExtraData()/Hashtable.putiAllowZero()"); 106 previousMappings.putiAllowZero(newMapping, offset, errorCode); 107 } 108 return offset; 109 } 110 111 UBool ExtraData::setNoNoDelta(UChar32 c, Norm &norm) const { 112 // Try a compact, algorithmic encoding to a single compYesAndZeroCC code point. 113 // Do not map from ASCII to non-ASCII. 114 if(norm.mappingCP>=0 && 115 !(c<=0x7f && norm.mappingCP>0x7f) && 116 norms.getNormRef(norm.mappingCP).type<Norm::NO_NO_COMP_YES) { 117 int32_t delta=norm.mappingCP-c; 118 if(-Normalizer2Impl::MAX_DELTA<=delta && delta<=Normalizer2Impl::MAX_DELTA) { 119 norm.type=Norm::NO_NO_DELTA; 120 norm.offset=delta; 121 return true; 122 } 123 } 124 return false; 125 } 126 127 void ExtraData::writeCompositions(UChar32 c, const Norm &norm, UnicodeString &dataString) { 128 if(norm.cc!=0) { 129 fprintf(stderr, 130 "gennorm2 error: " 131 "U+%04lX combines-forward and has ccc!=0, not possible in Unicode normalization\n", 132 static_cast<long>(c)); 133 exit(U_INVALID_FORMAT_ERROR); 134 } 135 int32_t length; 136 const CompositionPair *pairs=norm.getCompositionPairs(length); 137 for(int32_t i=0; i<length; ++i) { 138 const CompositionPair &pair=pairs[i]; 139 // 22 bits for the composite character and whether it combines forward. 140 UChar32 compositeAndFwd=pair.composite<<1; 141 if(norms.getNormRef(pair.composite).combinesFwd()) { 142 compositeAndFwd|=1; // The composite character also combines-forward. 143 } 144 // Encode most pairs in two units and some in three. 145 int32_t firstUnit, secondUnit, thirdUnit; 146 if(pair.trail<Normalizer2Impl::COMP_1_TRAIL_LIMIT) { 147 if(compositeAndFwd<=0xffff) { 148 firstUnit=pair.trail<<1; 149 secondUnit=compositeAndFwd; 150 thirdUnit=-1; 151 } else { 152 firstUnit=(pair.trail<<1)|Normalizer2Impl::COMP_1_TRIPLE; 153 secondUnit=compositeAndFwd>>16; 154 thirdUnit=compositeAndFwd; 155 } 156 } else { 157 firstUnit=(Normalizer2Impl::COMP_1_TRAIL_LIMIT+ 158 (pair.trail>>Normalizer2Impl::COMP_1_TRAIL_SHIFT))| 159 Normalizer2Impl::COMP_1_TRIPLE; 160 secondUnit=(pair.trail<<Normalizer2Impl::COMP_2_TRAIL_SHIFT)| 161 (compositeAndFwd>>16); 162 thirdUnit=compositeAndFwd; 163 } 164 // Set the high bit of the first unit if this is the last composition pair. 165 if(i==(length-1)) { 166 firstUnit|=Normalizer2Impl::COMP_1_LAST_TUPLE; 167 } 168 dataString.append(static_cast<char16_t>(firstUnit)).append(static_cast<char16_t>(secondUnit)); 169 if(thirdUnit>=0) { 170 dataString.append(static_cast<char16_t>(thirdUnit)); 171 } 172 } 173 } 174 175 void ExtraData::rangeHandler(UChar32 start, UChar32 end, Norm &norm) { 176 if(start!=end) { 177 fprintf(stderr, 178 "gennorm2 error: unexpected shared data for " 179 "multiple code points U+%04lX..U+%04lX\n", 180 static_cast<long>(start), static_cast<long>(end)); 181 exit(U_INTERNAL_PROGRAM_ERROR); 182 } 183 if(norm.error!=nullptr) { 184 fprintf(stderr, "gennorm2 error: U+%04lX %s\n", static_cast<long>(start), norm.error); 185 exit(U_INVALID_FORMAT_ERROR); 186 } 187 writeExtraData(start, norm); 188 } 189 190 // Ticket #13342 - Disable optimizations on MSVC for this function as a workaround. 191 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) 192 #pragma optimize( "", off ) 193 #endif 194 195 void ExtraData::writeExtraData(UChar32 c, Norm &norm) { 196 switch(norm.type) { 197 case Norm::INERT: 198 break; // no extra data 199 case Norm::YES_YES_COMBINES_FWD: 200 norm.offset=yesYesCompositions.length(); 201 writeCompositions(c, norm, yesYesCompositions); 202 break; 203 case Norm::YES_NO_COMBINES_FWD: 204 norm.offset=yesNoMappingsAndCompositions.length()+ 205 writeMapping(c, norm, yesNoMappingsAndCompositions); 206 writeCompositions(c, norm, yesNoMappingsAndCompositions); 207 break; 208 case Norm::YES_NO_MAPPING_ONLY: 209 norm.offset=yesNoMappingsOnly.length()+ 210 writeMapping(c, norm, yesNoMappingsOnly); 211 break; 212 case Norm::NO_NO_COMP_YES: 213 if(!optimizeFast && setNoNoDelta(c, norm)) { 214 break; 215 } 216 norm.offset=writeNoNoMapping(c, norm, noNoMappingsCompYes, previousNoNoMappingsCompYes); 217 break; 218 case Norm::NO_NO_COMP_BOUNDARY_BEFORE: 219 if(!optimizeFast && setNoNoDelta(c, norm)) { 220 break; 221 } 222 norm.offset=writeNoNoMapping( 223 c, norm, noNoMappingsCompBoundaryBefore, previousNoNoMappingsCompBoundaryBefore); 224 break; 225 case Norm::NO_NO_COMP_NO_MAYBE_CC: 226 norm.offset=writeNoNoMapping( 227 c, norm, noNoMappingsCompNoMaybeCC, previousNoNoMappingsCompNoMaybeCC); 228 break; 229 case Norm::NO_NO_EMPTY: 230 // There can be multiple extra data entries for mappings to the empty string 231 // if they have different raw mappings. 232 norm.offset=writeNoNoMapping(c, norm, noNoMappingsEmpty, previousNoNoMappingsEmpty); 233 break; 234 case Norm::MAYBE_NO_MAPPING_ONLY: 235 norm.offset=maybeNoMappingsOnly.length()+ 236 writeMapping(c, norm, maybeNoMappingsOnly); 237 break; 238 case Norm::MAYBE_NO_COMBINES_FWD: 239 norm.offset=maybeNoMappingsAndCompositions.length()+ 240 writeMapping(c, norm, maybeNoMappingsAndCompositions); 241 writeCompositions(c, norm, maybeNoMappingsAndCompositions); 242 break; 243 case Norm::MAYBE_YES_COMBINES_FWD: 244 norm.offset=maybeYesCompositions.length(); 245 writeCompositions(c, norm, maybeYesCompositions); 246 break; 247 case Norm::MAYBE_YES_SIMPLE: 248 break; // no extra data 249 case Norm::YES_YES_WITH_CC: 250 break; // no extra data 251 default: // Should not occur. 252 exit(U_INTERNAL_PROGRAM_ERROR); 253 } 254 } 255 256 // Ticket #13342 - Turn optimization back on. 257 #if (defined(_MSC_VER) && (_MSC_VER >= 1900) && defined(_MSC_FULL_VER) && (_MSC_FULL_VER >= 190024210)) 258 #pragma optimize( "", on ) 259 #endif 260 261 U_NAMESPACE_END 262 263 #endif // #if !UCONFIG_NO_NORMALIZATION