WordBreaker.cpp (6250B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 #include "mozilla/intl/WordBreaker.h" 7 8 #include "icu4x/WordSegmenter.hpp" 9 #include "mozilla/CheckedInt.h" 10 #include "mozilla/intl/UnicodeProperties.h" 11 #include "mozilla/StaticPrefs_intl.h" 12 #include "mozilla/StaticPrefs_layout.h" 13 #include "nsComplexBreaker.h" 14 #include "nsTArray.h" 15 #include "nsUnicharUtils.h" 16 #include "nsUnicodeProperties.h" 17 18 using mozilla::intl::Script; 19 using mozilla::intl::UnicodeProperties; 20 using mozilla::intl::WordBreaker; 21 using mozilla::intl::WordRange; 22 using mozilla::unicode::GetGenCategory; 23 24 #define ASCII_IS_ALPHA(c) \ 25 ((('a' <= (c)) && ((c) <= 'z')) || (('A' <= (c)) && ((c) <= 'Z'))) 26 #define ASCII_IS_DIGIT(c) (('0' <= (c)) && ((c) <= '9')) 27 #define ASCII_IS_SPACE(c) \ 28 ((' ' == (c)) || ('\t' == (c)) || ('\r' == (c)) || ('\n' == (c))) 29 #define IS_ALPHABETICAL_SCRIPT(c) ((c) < 0x2E80) 30 31 // we change the beginning of IS_HAN from 0x4e00 to 0x3400 to relfect 32 // Unicode 3.0 33 #define IS_HAN(c) \ 34 ((0x3400 <= (c)) && ((c) <= 0x9fff)) || ((0xf900 <= (c)) && ((c) <= 0xfaff)) 35 #define IS_KATAKANA(c) ((0x30A0 <= (c)) && ((c) <= 0x30FF)) 36 #define IS_HIRAGANA(c) ((0x3040 <= (c)) && ((c) <= 0x309F)) 37 #define IS_HALFWIDTHKATAKANA(c) ((0xFF60 <= (c)) && ((c) <= 0xFF9F)) 38 39 /* static */ 40 WordBreaker::WordBreakClass WordBreaker::GetClass(char16_t c) { 41 // begin of the hack 42 43 if (IS_ALPHABETICAL_SCRIPT(c)) { 44 if (IS_ASCII(c)) { 45 if (ASCII_IS_SPACE(c)) { 46 return kWbClassSpace; 47 } 48 if (ASCII_IS_ALPHA(c) || ASCII_IS_DIGIT(c) || 49 (c == '_' && !StaticPrefs::layout_word_select_stop_at_underscore())) { 50 return kWbClassAlphaLetter; 51 } 52 return kWbClassPunct; 53 } 54 if (c == 0x00A0 /*NBSP*/) { 55 return kWbClassSpace; 56 } 57 if (GetGenCategory(c) == nsUGenCategory::kPunctuation) { 58 return kWbClassPunct; 59 } 60 if (UnicodeProperties::IsScriptioContinua(c)) { 61 return kWbClassScriptioContinua; 62 } 63 return kWbClassAlphaLetter; 64 } 65 if (IS_HAN(c)) { 66 return kWbClassHanLetter; 67 } 68 if (IS_KATAKANA(c)) { 69 return kWbClassKatakanaLetter; 70 } 71 if (IS_HIRAGANA(c)) { 72 return kWbClassHiraganaLetter; 73 } 74 if (IS_HALFWIDTHKATAKANA(c)) { 75 return kWbClassHWKatakanaLetter; 76 } 77 if (GetGenCategory(c) == nsUGenCategory::kPunctuation) { 78 return kWbClassPunct; 79 } 80 if (UnicodeProperties::IsScriptioContinua(c)) { 81 return kWbClassScriptioContinua; 82 } 83 return kWbClassAlphaLetter; 84 } 85 86 WordRange WordBreaker::FindWord(const nsAString& aText, uint32_t aPos, 87 const FindWordOptions aOptions) { 88 const CheckedInt<uint32_t> len = aText.Length(); 89 MOZ_RELEASE_ASSERT(len.isValid()); 90 91 if (aPos >= len.value()) { 92 return {len.value(), len.value()}; 93 } 94 95 WordRange range{0, len.value()}; 96 97 if (StaticPrefs::intl_icu4x_segmenter_enabled()) { 98 auto segmenter = icu4x::WordSegmenter::create_auto(); 99 auto iterator = segmenter->segment16( 100 std::u16string_view(aText.BeginReading(), aText.Length())); 101 102 uint32_t previousPos = 0; 103 while (true) { 104 const int32_t nextPos = iterator->next(); 105 if (nextPos < 0) { 106 range.mBegin = previousPos; 107 range.mEnd = len.value(); 108 break; 109 } 110 if ((uint32_t)nextPos > aPos) { 111 range.mBegin = previousPos; 112 range.mEnd = (uint32_t)nextPos; 113 break; 114 } 115 116 previousPos = nextPos; 117 } 118 119 if (aOptions != FindWordOptions::StopAtPunctuation) { 120 return range; 121 } 122 123 for (uint32_t i = range.mBegin; i < range.mEnd; i++) { 124 if (mozilla::IsPunctuationForWordSelect(aText[i])) { 125 if (i > aPos) { 126 range.mEnd = i; 127 break; 128 } 129 if (i == aPos) { 130 range.mBegin = i; 131 range.mEnd = i + 1; 132 break; 133 } 134 if (i < aPos) { 135 range.mBegin = i + 1; 136 } 137 } 138 } 139 140 return range; 141 } 142 143 WordBreakClass c = GetClass(aText[aPos]); 144 145 // Scan forward 146 for (uint32_t i = aPos + 1; i < len.value(); i++) { 147 if (c != GetClass(aText[i])) { 148 range.mEnd = i; 149 break; 150 } 151 } 152 153 // Scan backward 154 for (uint32_t i = aPos; i > 0; i--) { 155 if (c != GetClass(aText[i - 1])) { 156 range.mBegin = i; 157 break; 158 } 159 } 160 161 if (kWbClassScriptioContinua == c) { 162 // we pass the whole text segment to the complex word breaker to find a 163 // shorter answer 164 AutoTArray<uint8_t, 256> breakBefore; 165 breakBefore.SetLength(range.mEnd - range.mBegin); 166 ComplexBreaker::GetBreaks(aText.BeginReading() + range.mBegin, 167 range.mEnd - range.mBegin, 168 breakBefore.Elements()); 169 170 // Scan forward 171 for (uint32_t i = aPos + 1; i < range.mEnd; i++) { 172 if (breakBefore[i - range.mBegin]) { 173 range.mEnd = i; 174 break; 175 } 176 } 177 178 // Scan backward 179 for (uint32_t i = aPos; i > range.mBegin; i--) { 180 if (breakBefore[i - range.mBegin]) { 181 range.mBegin = i; 182 break; 183 } 184 } 185 } 186 return range; 187 } 188 189 int32_t WordBreaker::Next(const char16_t* aText, uint32_t aLen, uint32_t aPos) { 190 MOZ_ASSERT(aText); 191 192 if (aPos >= aLen) { 193 return NS_WORDBREAKER_NEED_MORE_TEXT; 194 } 195 196 const WordBreakClass posClass = GetClass(aText[aPos]); 197 uint32_t nextBreakPos; 198 for (nextBreakPos = aPos + 1; nextBreakPos < aLen; ++nextBreakPos) { 199 if (posClass != GetClass(aText[nextBreakPos])) { 200 break; 201 } 202 } 203 204 if (kWbClassScriptioContinua == posClass) { 205 // We pass the whole text segment to the complex word breaker to find a 206 // shorter answer. 207 const char16_t* segStart = aText + aPos; 208 const uint32_t segLen = nextBreakPos - aPos + 1; 209 AutoTArray<uint8_t, 256> breakBefore; 210 breakBefore.SetLength(segLen); 211 ComplexBreaker::GetBreaks(segStart, segLen, breakBefore.Elements()); 212 213 for (uint32_t i = aPos + 1; i < nextBreakPos; ++i) { 214 if (breakBefore[i - aPos]) { 215 nextBreakPos = i; 216 break; 217 } 218 } 219 } 220 221 MOZ_ASSERT(nextBreakPos != aPos); 222 return nextBreakPos; 223 }