mozEnglishWordUtils.cpp (3285B)
1 /* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ 2 /* This Source Code Form is subject to the terms of the Mozilla Public 3 * License, v. 2.0. If a copy of the MPL was not distributed with this 4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 5 6 #include "mozEnglishWordUtils.h" 7 #include "nsComponentManagerUtils.h" 8 #include "nsReadableUtils.h" 9 #include "nsUnicharUtils.h" 10 #include "nsUnicodeProperties.h" 11 #include "nsCRT.h" 12 13 NS_IMPL_CYCLE_COLLECTION(mozEnglishWordUtils, mURLDetector) 14 15 mozEnglishWordUtils::mozEnglishWordUtils() { 16 mURLDetector = do_CreateInstance(MOZ_TXTTOHTMLCONV_CONTRACTID); 17 } 18 19 mozEnglishWordUtils::~mozEnglishWordUtils() {} 20 21 // This needs vast improvement 22 23 // static 24 bool mozEnglishWordUtils::ucIsAlpha(char16_t aChar) { 25 // XXX we have to fix callers to handle the full Unicode range 26 return nsUGenCategory::kLetter == mozilla::unicode::GetGenCategory(aChar); 27 } 28 29 bool mozEnglishWordUtils::FindNextWord(const nsAString& aWord, uint32_t offset, 30 int32_t* begin, int32_t* end) { 31 if (offset >= aWord.Length()) { 32 *begin = -1; 33 *end = -1; 34 return false; 35 } 36 37 const char16_t* word = aWord.BeginReading(); 38 uint32_t length = aWord.Length(); 39 const char16_t* p = word + offset; 40 const char16_t* endbuf = word + length; 41 const char16_t* startWord = p; 42 43 // XXX These loops should be modified to handle non-BMP characters. 44 // if previous character is a word character, need to advance out of the 45 // word 46 if (offset > 0 && ucIsAlpha(*(p - 1))) { 47 while (p < endbuf && ucIsAlpha(*p)) { 48 p++; 49 } 50 } 51 while ((p < endbuf) && (!ucIsAlpha(*p))) { 52 p++; 53 } 54 startWord = p; 55 while ((p < endbuf) && ((ucIsAlpha(*p)) || (*p == '\''))) { 56 p++; 57 } 58 59 // we could be trying to break down a url, we don't want to break a url into 60 // parts, instead we want to find out if it really is a url and if so, skip 61 // it, advancing startWord to a point after the url. 62 63 // before we spend more time looking to see if the word is a url, look for a 64 // url identifer and make sure that identifer isn't the last character in 65 // the word fragment. 66 if ((p < endbuf - 1) && (*p == ':' || *p == '@' || *p == '.')) { 67 // ok, we have a possible url...do more research to find out if we really 68 // have one and determine the length of the url so we can skip over it. 69 70 if (mURLDetector) { 71 int32_t startPos = -1; 72 int32_t endPos = -1; 73 74 mURLDetector->FindURLInPlaintext(startWord, endbuf - startWord, 75 p - startWord, &startPos, &endPos); 76 77 // ok, if we got a url, adjust the array bounds, skip the current url 78 // text and find the next word again 79 if (startPos != -1 && endPos != -1) { 80 startWord = p + endPos + 1; // skip over the url 81 82 // now recursively call FindNextWord to search for the next word now 83 // that we have skipped the url 84 return FindNextWord(aWord, startWord - word, begin, end); 85 } 86 } 87 } 88 89 while ((p > startWord) && (*(p - 1) == '\'')) { // trim trailing apostrophes 90 p--; 91 } 92 93 if (startWord == endbuf) { 94 *begin = -1; 95 *end = -1; 96 return false; 97 } 98 *begin = startWord - word; 99 *end = p - word; 100 return true; 101 }