nsLineBreaker.h (12423B)
1 /* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* vim: set ts=8 sts=2 et sw=2 tw=80: */ 3 /* This Source Code Form is subject to the terms of the Mozilla Public 4 * License, v. 2.0. If a copy of the MPL was not distributed with this 5 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 6 7 #ifndef NSLINEBREAKER_H_ 8 #define NSLINEBREAKER_H_ 9 10 #include "mozilla/intl/LineBreaker.h" 11 #include "mozilla/intl/Segmenter.h" 12 #include "nsString.h" 13 #include "nsTArray.h" 14 15 class nsAtom; 16 class nsHyphenator; 17 18 /** 19 * A receiver of line break data. 20 */ 21 class nsILineBreakSink { 22 public: 23 /** 24 * Sets the break data for a substring of the associated text chunk. 25 * One or more of these calls will be performed; the union of all substrings 26 * will cover the entire text chunk. Substrings may overlap (i.e., we may 27 * set the break-before state of a character more than once). 28 * @param aBreakBefore the break-before states for the characters in the 29 * substring. These are enum values from gfxTextRun::CompressedGlyph: 30 * FLAG_BREAK_TYPE_NONE - no linebreak is allowed here 31 * FLAG_BREAK_TYPE_NORMAL - a normal (whitespace) linebreak 32 * FLAG_BREAK_TYPE_HYPHEN - a hyphenation point 33 */ 34 virtual void SetBreaks(uint32_t aStart, uint32_t aLength, 35 uint8_t* aBreakBefore) = 0; 36 37 /** 38 * Indicates which characters should be capitalized. Only called if 39 * BREAK_NEED_CAPITALIZATION was requested. 40 */ 41 virtual void SetCapitalization(uint32_t aStart, uint32_t aLength, 42 bool* aCapitalize) = 0; 43 }; 44 45 /** 46 * A line-breaking state machine. You feed text into it via AppendText calls 47 * and it computes the possible line breaks. Because break decisions can 48 * require a lot of context, the breaks for a piece of text are sometimes not 49 * known until later text has been seen (or all text ends). So breaks are 50 * returned via a call to SetBreaks on the nsILineBreakSink object passed 51 * with each text chunk, which might happen during the corresponding AppendText 52 * call, or might happen during a later AppendText call or even a Reset() 53 * call. 54 * 55 * The linebreak results MUST NOT depend on how the text is broken up 56 * into AppendText calls. 57 * 58 * The current strategy is that we break the overall text into 59 * whitespace-delimited "words". Then those words are passed to the LineBreaker 60 * for deeper analysis if they might contain breakable characters. 61 * 62 * This class also handles detection of which characters should be capitalized 63 * for text-transform:capitalize. This is a good place to handle that because 64 * we have all the context we need. 65 */ 66 class nsLineBreaker { 67 public: 68 nsLineBreaker(); 69 ~nsLineBreaker(); 70 71 static inline bool IsSpace(char16_t u) { 72 return mozilla::intl::NS_IsSpace(u); 73 } 74 75 // Helper also used by nsCaseTransformTextRunFactory::TransformString. 76 // aChar is the current character to be examined; 77 // aCapitalizeNext is a state variable: initialize it to true at start-of- 78 // text, then pass it back to this function as each successive character is 79 // considered. 80 static bool ShouldCapitalize(uint32_t aChar, bool& aCapitalizeNext); 81 82 // Break opportunities exist at the end of each run of breakable whitespace 83 // (see IsSpace above). Break opportunities can also exist between pairs of 84 // non-whitespace characters, as determined by mozilla::intl::LineBreaker. 85 // We pass a whitespace- 86 // delimited word to LineBreaker if it contains at least one character 87 // that has breakable line breaking classes. 88 // We provide flags to control on a per-chunk basis where breaks are allowed. 89 // At any character boundary, exactly one text chunk governs whether a 90 // break is allowed at that boundary. 91 // 92 // We operate on text after whitespace processing has been applied, so 93 // other characters (e.g. tabs and newlines) may have been converted to 94 // spaces. 95 96 /** 97 * Flags passed with each chunk of text. 98 */ 99 enum { 100 /* 101 * Do not introduce a break opportunity at the start of this chunk of text. 102 */ 103 BREAK_SUPPRESS_INITIAL = 0x01, 104 /** 105 * Do not introduce a break opportunity in the interior of this chunk of 106 * text. Also, whitespace in this chunk is treated as non-breakable. 107 */ 108 BREAK_SUPPRESS_INSIDE = 0x02, 109 /** 110 * The sink currently is already set up to have no breaks in it; 111 * if no breaks are possible, nsLineBreaker does not need to call 112 * SetBreaks on it. This is useful when handling large quantities of 113 * preformatted text; the textruns will never have any breaks set on them, 114 * and there is no need to ever actually scan the text for breaks, except 115 * at the end of textruns in case context is needed for following breakable 116 * text. 117 */ 118 BREAK_SKIP_SETTING_NO_BREAKS = 0x04, 119 /** 120 * We need to be notified of characters that should be capitalized 121 * (as in text-transform:capitalize) in this chunk of text. 122 */ 123 BREAK_NEED_CAPITALIZATION = 0x08, 124 /** 125 * Auto-hyphenation is enabled, so we need to get a hyphenator 126 * (if available) and use it to find breakpoints. 127 */ 128 BREAK_USE_AUTO_HYPHENATION = 0x10 129 }; 130 131 /** 132 * Append "invisible whitespace". This acts like whitespace, but there is 133 * no actual text associated with it. Only the BREAK_SUPPRESS_INSIDE flag 134 * is relevant here. 135 */ 136 nsresult AppendInvisibleWhitespace(uint32_t aFlags); 137 138 /** 139 * Feed Unicode text into the linebreaker for analysis. aLength must be 140 * nonzero. 141 * @param aSink can be null if the breaks are not actually needed (we may 142 * still be setting up state for later breaks) 143 */ 144 nsresult AppendText(nsAtom* aHyphenationLanguage, const char16_t* aText, 145 uint32_t aLength, uint32_t aFlags, 146 nsILineBreakSink* aSink); 147 /** 148 * Feed 8-bit text into the linebreaker for analysis. aLength must be nonzero. 149 * @param aSink can be null if the breaks are not actually needed (we may 150 * still be setting up state for later breaks) 151 */ 152 nsresult AppendText(nsAtom* aHyphenationLanguage, const uint8_t* aText, 153 uint32_t aLength, uint32_t aFlags, 154 nsILineBreakSink* aSink); 155 /** 156 * Reset all state. This means the current run has ended; any outstanding 157 * calls through nsILineBreakSink are made, and all outstanding references to 158 * nsILineBreakSink objects are dropped. 159 * After this call, this linebreaker can be reused. 160 * This must be called at least once between any call to AppendText() and 161 * destroying the object. 162 * @param aTrailingBreak this is set to true when there is a break opportunity 163 * at the end of the text. This will normally only be declared true when there 164 * is breakable whitespace at the end. 165 */ 166 nsresult Reset(bool* aTrailingBreak); 167 168 /* 169 * Set word-break mode for line breaker. This is set by word-break property. 170 */ 171 void SetWordBreak(mozilla::intl::WordBreakRule aMode) { 172 // If current word is non-empty and mode is changing, flush the breaker. 173 if (aMode != mWordBreak && !mCurrentWord.IsEmpty()) { 174 nsresult rv = FlushCurrentWord(); 175 if (NS_FAILED(rv)) { 176 NS_WARNING("FlushCurrentWord failed, line-breaks may be wrong"); 177 } 178 // If previous mode was break-all, we should allow a break here. 179 // XXX (jfkthame) css-text spec seems unclear on this, raised question in 180 // https://github.com/w3c/csswg-drafts/issues/3897 181 if (mWordBreak == mozilla::intl::WordBreakRule::BreakAll) { 182 mBreakHere = true; 183 } 184 } 185 mWordBreak = aMode; 186 } 187 188 /* 189 * Set line-break rule strictness mode for line breaker. This is set by the 190 * line-break property. 191 */ 192 void SetStrictness(mozilla::intl::LineBreakRule aMode) { 193 if (aMode != mLineBreak && !mCurrentWord.IsEmpty()) { 194 nsresult rv = FlushCurrentWord(); 195 if (NS_FAILED(rv)) { 196 NS_WARNING("FlushCurrentWord failed, line-breaks may be wrong"); 197 } 198 // If previous mode was anywhere, we should allow a break here. 199 if (mLineBreak == mozilla::intl::LineBreakRule::Anywhere) { 200 mBreakHere = true; 201 } 202 } 203 mLineBreak = aMode; 204 } 205 206 /** 207 * Return whether the line-breaker has a buffered "current word" that may 208 * be extended with additional word-forming characters. 209 */ 210 bool InWord() const { return !mCurrentWord.IsEmpty(); } 211 212 /** 213 * Set the word-continuation state, which will suppress capitalization of 214 * the next letter that might otherwise apply. 215 */ 216 void SetWordContinuation(bool aContinuation) { 217 mWordContinuation = aContinuation; 218 } 219 220 /** 221 * Set the hyphenate-limit-chars values. Values are clamped to be <= 255. 222 */ 223 void SetHyphenateLimitChars(uint32_t aWordLength, uint32_t aStartLength, 224 uint32_t aEndLength) { 225 mHyphenateLimitWord = std::min(255u, aWordLength); 226 mHyphenateLimitStart = std::min(255u, aStartLength); 227 mHyphenateLimitEnd = std::min(255u, aEndLength); 228 } 229 230 private: 231 // This is a list of text sources that make up the "current word" (i.e., 232 // run of text which does not contain any whitespace). All the mLengths 233 // are are nonzero, these cannot overlap. 234 struct TextItem { 235 TextItem(nsILineBreakSink* aSink, uint32_t aSinkOffset, uint32_t aLength, 236 uint32_t aFlags) 237 : mSink(aSink), 238 mSinkOffset(aSinkOffset), 239 mLength(aLength), 240 mFlags(aFlags) {} 241 242 nsILineBreakSink* mSink; 243 uint32_t mSinkOffset; 244 uint32_t mLength; 245 uint32_t mFlags; 246 }; 247 248 // State for the nonwhitespace "word" that started in previous text and hasn't 249 // finished yet. 250 251 // When the current word ends, this computes the linebreak opportunities 252 // *inside* the word (excluding either end) and sets them through the 253 // appropriate sink(s). Then we clear the current word state. 254 nsresult FlushCurrentWord(); 255 256 void UpdateCurrentWordLanguage(nsAtom* aHyphenationLanguage); 257 258 void FindHyphenationPoints(nsHyphenator* aHyphenator, 259 const char16_t* aTextStart, 260 const char16_t* aTextLimit, uint8_t* aBreakState); 261 262 inline constexpr bool IsSegmentSpace(char16_t u) const { 263 if (mLegacyBehavior) { 264 return nsLineBreaker::IsSpace(u); 265 } 266 267 return u == 0x0020 || // SPACE u 268 u == 0x0009 || // CHARACTER TABULATION 269 u == 0x000D; // CARRIAGE RETURN 270 } 271 272 AutoTArray<char16_t, 100> mCurrentWord; 273 // All the items that contribute to mCurrentWord 274 AutoTArray<TextItem, 2> mTextItems; 275 nsAtom* mCurrentWordLanguage = nullptr; 276 277 // Constraints from CSS `hyphenate-limit-chars` property, to block the use of 278 // auto-hyphenation if the word is too short, or at positions too near the 279 // beginning/end of the word. 280 // (Note that per CSS Text spec, these counts ignore combining marks, etc., 281 // so they are not purely codepoint or character counts.) 282 // (Zero values would have no effect; but text-frame code will update the 283 // values from CSS before calling the line-breaker.) 284 uint8_t mHyphenateLimitWord = 0; // Min word length to auto-hyphenate 285 uint8_t mHyphenateLimitStart = 0; // Min number of chars before the break 286 uint8_t mHyphenateLimitEnd = 0; // Min number of chars after the break 287 288 bool mCurrentWordContainsMixedLang = false; 289 bool mCurrentWordMightBeBreakable = false; 290 bool mScriptIsChineseOrJapanese = false; 291 292 // True if the previous character was breakable whitespace 293 bool mAfterBreakableSpace = false; 294 // True if a break must be allowed at the current position because 295 // a run of breakable whitespace ends here 296 bool mBreakHere = false; 297 // Break rules for letters from the "word-break" property. 298 mozilla::intl::WordBreakRule mWordBreak = 299 mozilla::intl::WordBreakRule::Normal; 300 // Line breaking strictness from the "line-break" property. 301 mozilla::intl::LineBreakRule mLineBreak = mozilla::intl::LineBreakRule::Auto; 302 // Should the text be treated as continuing a word-in-progress (for purposes 303 // of initial capitalization)? Normally this is set to false whenever we 304 // start using a linebreaker, but it may be set to true if the line-breaker 305 // has been explicitly flushed mid-word. 306 bool mWordContinuation = false; 307 // True if using old line segmenter. 308 const bool mLegacyBehavior; 309 }; 310 311 #endif /*NSLINEBREAKER_H_*/