brktrans.cpp (6191B)
1 // © 2016 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 /* 4 ********************************************************************** 5 * Copyright (C) 2008-2015, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ********************************************************************** 8 * Date Name Description 9 * 05/11/2008 Andy Heninger Port from Java 10 ********************************************************************** 11 */ 12 13 #include <utility> 14 15 #include "unicode/utypes.h" 16 17 #if !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION 18 19 #include "unicode/brkiter.h" 20 #include "unicode/localpointer.h" 21 #include "unicode/uchar.h" 22 #include "unicode/unifilt.h" 23 #include "unicode/uniset.h" 24 25 #include "brktrans.h" 26 #include "cmemory.h" 27 #include "mutex.h" 28 #include "uprops.h" 29 #include "uinvchar.h" 30 #include "util.h" 31 #include "uvectr32.h" 32 33 U_NAMESPACE_BEGIN 34 35 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator) 36 37 static const char16_t SPACE = 32; // ' ' 38 39 40 /** 41 * Constructs a transliterator with the default delimiters '{' and 42 * '}'. 43 */ 44 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) : 45 Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter), 46 cachedBI(nullptr), cachedBoundaries(nullptr), fInsertion(SPACE) { 47 } 48 49 50 /** 51 * Destructor. 52 */ 53 BreakTransliterator::~BreakTransliterator() { 54 } 55 56 /** 57 * Copy constructor. 58 */ 59 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) : 60 Transliterator(o), cachedBI(nullptr), cachedBoundaries(nullptr), fInsertion(o.fInsertion) { 61 } 62 63 64 /** 65 * Transliterator API. 66 */ 67 BreakTransliterator* BreakTransliterator::clone() const { 68 return new BreakTransliterator(*this); 69 } 70 71 /** 72 * Implements {@link Transliterator#handleTransliterate}. 73 */ 74 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets, 75 UBool isIncremental ) const { 76 77 UErrorCode status = U_ZERO_ERROR; 78 LocalPointer<BreakIterator> bi; 79 LocalPointer<UVector32> boundaries; 80 81 { 82 Mutex m; 83 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); 84 boundaries = std::move(nonConstThis->cachedBoundaries); 85 bi = std::move(nonConstThis->cachedBI); 86 } 87 if (bi.isNull()) { 88 bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status)); 89 } 90 if (boundaries.isNull()) { 91 boundaries.adoptInstead(new UVector32(status)); 92 } 93 94 if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) { 95 return; 96 } 97 98 boundaries->removeAllElements(); 99 UnicodeString sText = replaceableAsString(text); 100 bi->setText(sText); 101 bi->preceding(offsets.start); 102 103 // To make things much easier, we will stack the boundaries, and then insert at the end. 104 // generally, we won't need too many, since we will be filtered. 105 106 int32_t boundary; 107 for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) { 108 if (boundary == 0) continue; 109 // HACK: Check to see that preceding item was a letter 110 111 UChar32 cp = sText.char32At(boundary-1); 112 int type = u_charType(cp); 113 //System.out.println(Integer.toString(cp,16) + " (before): " + type); 114 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 115 116 cp = sText.char32At(boundary); 117 type = u_charType(cp); 118 //System.out.println(Integer.toString(cp,16) + " (after): " + type); 119 if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue; 120 121 boundaries->addElement(boundary, status); 122 // printf("Boundary at %d\n", boundary); 123 } 124 125 int delta = 0; 126 int lastBoundary = 0; 127 128 if (boundaries->size() != 0) { // if we found something, adjust 129 delta = boundaries->size() * fInsertion.length(); 130 lastBoundary = boundaries->lastElementi(); 131 132 // we do this from the end backwards, so that we don't have to keep updating. 133 134 while (boundaries->size() > 0) { 135 boundary = boundaries->popi(); 136 text.handleReplaceBetween(boundary, boundary, fInsertion); 137 } 138 } 139 140 // Now fix up the return values 141 offsets.contextLimit += delta; 142 offsets.limit += delta; 143 offsets.start = isIncremental ? lastBoundary + delta : offsets.limit; 144 145 // Return break iterator & boundaries vector to the cache. 146 { 147 Mutex m; 148 BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this); 149 if (nonConstThis->cachedBI.isNull()) { 150 nonConstThis->cachedBI = std::move(bi); 151 } 152 if (nonConstThis->cachedBoundaries.isNull()) { 153 nonConstThis->cachedBoundaries = std::move(boundaries); 154 } 155 } 156 157 // TODO: do something with U_FAILURE(status); 158 // (need to look at transliterators overall, not just here.) 159 } 160 161 // 162 // getInsertion() 163 // 164 const UnicodeString &BreakTransliterator::getInsertion() const { 165 return fInsertion; 166 } 167 168 // 169 // setInsertion() 170 // 171 void BreakTransliterator::setInsertion(const UnicodeString &insertion) { 172 this->fInsertion = insertion; 173 } 174 175 // 176 // replaceableAsString Hack to let break iterators work 177 // on the replaceable text from transliterators. 178 // In practice, the only real Replaceable type that we 179 // will be seeing is UnicodeString, so this function 180 // will normally be efficient. 181 // 182 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) { 183 UnicodeString s; 184 UnicodeString *rs = dynamic_cast<UnicodeString *>(&r); 185 if (rs != nullptr) { 186 s = *rs; 187 } else { 188 r.extractBetween(0, r.length(), s); 189 } 190 return s; 191 } 192 193 U_NAMESPACE_END 194 195 #endif /* #if !UCONFIG_NO_TRANSLITERATION */