string_segment.cpp (3831B)
1 // © 2018 and later: Unicode, Inc. and others. 2 // License & terms of use: http://www.unicode.org/copyright.html 3 4 #include "unicode/utypes.h" 5 6 #if !UCONFIG_NO_FORMATTING 7 8 // Allow implicit conversion from char16_t* to UnicodeString for this file: 9 // Helpful in toString methods and elsewhere. 10 #define UNISTR_FROM_STRING_EXPLICIT 11 12 #include "numparse_types.h" 13 #include "string_segment.h" 14 #include "putilimp.h" 15 #include "unicode/utf16.h" 16 #include "unicode/uniset.h" 17 18 U_NAMESPACE_BEGIN 19 20 21 StringSegment::StringSegment(const UnicodeString& str, bool ignoreCase) 22 : fStr(str), fStart(0), fEnd(str.length()), 23 fFoldCase(ignoreCase) {} 24 25 int32_t StringSegment::getOffset() const { 26 return fStart; 27 } 28 29 void StringSegment::setOffset(int32_t start) { 30 fStart = start; 31 } 32 33 void StringSegment::adjustOffset(int32_t delta) { 34 fStart += delta; 35 } 36 37 void StringSegment::adjustOffsetByCodePoint() { 38 fStart += U16_LENGTH(getCodePoint()); 39 } 40 41 void StringSegment::setLength(int32_t length) { 42 fEnd = fStart + length; 43 } 44 45 void StringSegment::resetLength() { 46 fEnd = fStr.length(); 47 } 48 49 int32_t StringSegment::length() const { 50 return fEnd - fStart; 51 } 52 53 char16_t StringSegment::charAt(int32_t index) const { 54 return fStr.charAt(index + fStart); 55 } 56 57 UChar32 StringSegment::codePointAt(int32_t index) const { 58 return fStr.char32At(index + fStart); 59 } 60 61 UnicodeString StringSegment::toUnicodeString() const { 62 return UnicodeString(fStr.getBuffer() + fStart, fEnd - fStart); 63 } 64 65 UnicodeString StringSegment::toTempUnicodeString() const { 66 // Use the readonly-aliasing constructor for efficiency. 67 return UnicodeString(false, fStr.getBuffer() + fStart, fEnd - fStart); 68 } 69 70 UChar32 StringSegment::getCodePoint() const { 71 char16_t lead = fStr.charAt(fStart); 72 if (U16_IS_LEAD(lead) && fStart + 1 < fEnd) { 73 return fStr.char32At(fStart); 74 } else if (U16_IS_SURROGATE(lead)) { 75 return -1; 76 } else { 77 return lead; 78 } 79 } 80 81 bool StringSegment::startsWith(UChar32 otherCp) const { 82 return codePointsEqual(getCodePoint(), otherCp, fFoldCase); 83 } 84 85 bool StringSegment::startsWith(const UnicodeSet& uniset) const { 86 // TODO: Move UnicodeSet case-folding logic here. 87 // TODO: Handle string matches here instead of separately. 88 UChar32 cp = getCodePoint(); 89 if (cp == -1) { 90 return false; 91 } 92 return uniset.contains(cp); 93 } 94 95 bool StringSegment::startsWith(const UnicodeString& other) const { 96 if (other.isBogus() || other.length() == 0 || length() == 0) { 97 return false; 98 } 99 int cp1 = getCodePoint(); 100 int cp2 = other.char32At(0); 101 return codePointsEqual(cp1, cp2, fFoldCase); 102 } 103 104 int32_t StringSegment::getCommonPrefixLength(const UnicodeString& other) { 105 return getPrefixLengthInternal(other, fFoldCase); 106 } 107 108 int32_t StringSegment::getCaseSensitivePrefixLength(const UnicodeString& other) { 109 return getPrefixLengthInternal(other, false); 110 } 111 112 int32_t StringSegment::getPrefixLengthInternal(const UnicodeString& other, bool foldCase) { 113 U_ASSERT(other.length() > 0); 114 int32_t offset = 0; 115 for (; offset < uprv_min(length(), other.length());) { 116 // TODO: case-fold code points, not chars 117 char16_t c1 = charAt(offset); 118 char16_t c2 = other.charAt(offset); 119 if (!codePointsEqual(c1, c2, foldCase)) { 120 break; 121 } 122 offset++; 123 } 124 return offset; 125 } 126 127 bool StringSegment::codePointsEqual(UChar32 cp1, UChar32 cp2, bool foldCase) { 128 if (cp1 == cp2) { 129 return true; 130 } 131 if (!foldCase) { 132 return false; 133 } 134 cp1 = u_foldCase(cp1, true); 135 cp2 = u_foldCase(cp2, true); 136 return cp1 == cp2; 137 } 138 139 bool StringSegment::operator==(const UnicodeString& other) const { 140 return toTempUnicodeString() == other; 141 } 142 143 144 U_NAMESPACE_END 145 #endif /* #if !UCONFIG_NO_FORMATTING */