gfxScriptItemizer.cpp (8477B)
1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 /* 7 * This file is based on usc_impl.c from ICU 4.2.0.1, slightly adapted 8 * for use within Mozilla Gecko, separate from a standard ICU build. 9 * 10 * The original ICU license of the code follows: 11 * 12 * ICU License - ICU 1.8.1 and later 13 * 14 * COPYRIGHT AND PERMISSION NOTICE 15 * 16 * Copyright (c) 1995-2009 International Business Machines Corporation and 17 * others 18 * 19 * All rights reserved. 20 * 21 * Permission is hereby granted, free of charge, to any person obtaining a 22 * copy of this software and associated documentation files (the "Software"), 23 * to deal in the Software without restriction, including without limitation 24 * the rights to use, copy, modify, merge, publish, distribute, and/or sell 25 * copies of the Software, and to permit persons to whom the Software is 26 * furnished to do so, provided that the above copyright notice(s) and this 27 * permission notice appear in all copies of the Software and that both the 28 * above copyright notice(s) and this permission notice appear in supporting 29 * documentation. 30 * 31 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 32 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 33 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 34 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE 35 * BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, 36 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, 37 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, 38 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 39 * SOFTWARE. 40 * 41 * Except as contained in this notice, the name of a copyright holder shall 42 * not be used in advertising or otherwise to promote the sale, use or other 43 * dealings in this Software without prior written authorization of the 44 * copyright holder. 45 * 46 * All trademarks and registered trademarks mentioned herein are the property 47 * of their respective owners. 48 */ 49 50 #include "gfxScriptItemizer.h" 51 #include "mozilla/intl/UnicodeProperties.h" 52 #include "nsCharTraits.h" 53 #include "nsUnicodeProperties.h" 54 #include "harfbuzz/hb.h" 55 56 using namespace mozilla::intl; 57 using namespace mozilla::unicode; 58 59 #define MOD(sp) ((sp) % PAREN_STACK_DEPTH) 60 #define LIMIT_INC(sp) \ 61 (((sp) < PAREN_STACK_DEPTH) ? (sp) + 1 : PAREN_STACK_DEPTH) 62 #define INC(sp, count) (MOD((sp) + (count))) 63 #define INC1(sp) (INC(sp, 1)) 64 #define DEC(sp, count) (MOD((sp) + PAREN_STACK_DEPTH - (count))) 65 #define DEC1(sp) (DEC(sp, 1)) 66 #define STACK_IS_EMPTY() (pushCount <= 0) 67 #define STACK_IS_NOT_EMPTY() (!STACK_IS_EMPTY()) 68 #define TOP() (parenStack[parenSP]) 69 #define SYNC_FIXUP() (fixupCount = 0) 70 71 void gfxScriptItemizer::push(uint32_t endPairChar, Script newScriptCode) { 72 pushCount = LIMIT_INC(pushCount); 73 fixupCount = LIMIT_INC(fixupCount); 74 75 parenSP = INC1(parenSP); 76 parenStack[parenSP].endPairChar = endPairChar; 77 parenStack[parenSP].scriptCode = newScriptCode; 78 } 79 80 void gfxScriptItemizer::pop() { 81 if (STACK_IS_EMPTY()) { 82 return; 83 } 84 85 if (fixupCount > 0) { 86 fixupCount -= 1; 87 } 88 89 pushCount -= 1; 90 parenSP = DEC1(parenSP); 91 92 /* If the stack is now empty, reset the stack 93 pointers to their initial values. 94 */ 95 if (STACK_IS_EMPTY()) { 96 parenSP = -1; 97 } 98 } 99 100 void gfxScriptItemizer::fixup(Script newScriptCode) { 101 int32_t fixupSP = DEC(parenSP, fixupCount); 102 103 while (fixupCount-- > 0) { 104 fixupSP = INC1(fixupSP); 105 parenStack[fixupSP].scriptCode = newScriptCode; 106 } 107 } 108 109 static inline bool CanMergeWithContext(Script aScript) { 110 return aScript <= Script::INHERITED || aScript == Script::UNKNOWN; 111 } 112 113 // We regard the current char as having the same script as the in-progress run 114 // if either script is Common/Inherited/Unknown, or if the run script appears 115 // in the character's ScriptExtensions, or if the char is a cluster extender. 116 static inline bool SameScript(Script runScript, Script currCharScript, 117 uint32_t aCurrCh) { 118 return CanMergeWithContext(runScript) || 119 CanMergeWithContext(currCharScript) || currCharScript == runScript || 120 IsClusterExtender(aCurrCh) || 121 UnicodeProperties::HasScript(aCurrCh, runScript); 122 } 123 124 gfxScriptItemizer::Run gfxScriptItemizer::Next() { 125 MOZ_ASSERT(textLength == 0 || (textIs8bit && textPtr._1b) || 126 (!textIs8bit && textPtr._2b)); 127 128 /* if we've fallen off the end of the text, we're done */ 129 if (scriptLimit >= textLength) { 130 return Run{}; 131 } 132 133 SYNC_FIXUP(); 134 scriptCode = Script::COMMON; 135 Script fallbackScript = Script::UNKNOWN; 136 137 for (scriptStart = scriptLimit; scriptLimit < textLength; scriptLimit += 1) { 138 uint32_t ch; 139 Script sc; 140 uint32_t startOfChar = scriptLimit; 141 142 ch = textIs8bit ? textPtr._1b[scriptLimit] : textPtr._2b[scriptLimit]; 143 144 /* decode UTF-16 (may be surrogate pair) */ 145 if (NS_IS_HIGH_SURROGATE(ch) && scriptLimit < textLength - 1) { 146 uint32_t low = textPtr._2b[scriptLimit + 1]; 147 if (NS_IS_LOW_SURROGATE(low)) { 148 ch = SURROGATE_TO_UCS4(ch, low); 149 scriptLimit += 1; 150 } 151 } 152 153 // Initialize gc to UNASSIGNED; we'll only set it to the true GC 154 // if the character has script=COMMON, otherwise we don't care. 155 uint8_t gc = HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED; 156 157 sc = UnicodeProperties::GetScriptCode(ch); 158 if (sc == Script::COMMON) { 159 /* 160 * Paired character handling: 161 * 162 * if it's an open character, push it onto the stack. 163 * if it's a close character, find the matching open on the 164 * stack, and use that script code. Any non-matching open 165 * characters above it on the stack will be popped. 166 * 167 * We only do this if the script is COMMON; for chars with 168 * specific script assignments, we just use them as-is. 169 */ 170 gc = GetGeneralCategory(ch); 171 if (gc == HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION) { 172 uint32_t endPairChar = UnicodeProperties::CharMirror(ch); 173 if (endPairChar != ch) { 174 push(endPairChar, scriptCode); 175 } 176 } else if (gc == HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION && 177 UnicodeProperties::IsMirrored(ch)) { 178 while (STACK_IS_NOT_EMPTY() && TOP().endPairChar != ch) { 179 pop(); 180 } 181 182 if (STACK_IS_NOT_EMPTY()) { 183 sc = TOP().scriptCode; 184 } 185 } 186 } 187 188 // Both Hiragana and Katakana are shaped as OpenType 'kana'. Merge them 189 // here to avoid script-run breaks and allow kerning to apply between the 190 // two alphabets. 191 if (sc == Script::HIRAGANA) { 192 sc = Script::KATAKANA; 193 } 194 195 if (SameScript(scriptCode, sc, ch)) { 196 if (scriptCode == Script::COMMON) { 197 // If we have not yet resolved a specific scriptCode for the run, 198 // check whether this character provides it. 199 if (!CanMergeWithContext(sc)) { 200 // Use this character's script. 201 scriptCode = sc; 202 fixup(scriptCode); 203 } else if (fallbackScript == Script::UNKNOWN) { 204 // See if the character has a ScriptExtensions property we can 205 // store for use in the event the run remains unresolved. 206 UnicodeProperties::ScriptExtensionVector extensions; 207 auto extResult = UnicodeProperties::GetExtensions(ch, extensions); 208 if (extResult.isOk()) { 209 Script ext = Script(extensions[0]); 210 if (!CanMergeWithContext(ext)) { 211 fallbackScript = ext; 212 } 213 } 214 } 215 } 216 217 /* 218 * if this character is a close paired character, 219 * pop the matching open character from the stack 220 */ 221 if (gc == HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION && 222 UnicodeProperties::IsMirrored(ch)) { 223 pop(); 224 } 225 } else { 226 /* 227 * reset scriptLimit in case it was advanced during reading a 228 * multiple-code-unit character 229 */ 230 scriptLimit = startOfChar; 231 232 break; 233 } 234 } 235 236 return Run{scriptStart, scriptLimit - scriptStart, 237 (scriptCode == Script::COMMON && fallbackScript != Script::UNKNOWN) 238 ? fallbackScript 239 : scriptCode}; 240 }