tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

brktrans.cpp (6191B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 **********************************************************************
      5 *   Copyright (C) 2008-2015, International Business Machines
      6 *   Corporation and others.  All Rights Reserved.
      7 **********************************************************************
      8 *   Date        Name        Description
      9 *   05/11/2008  Andy Heninger  Port from Java
     10 **********************************************************************
     11 */
     12 
     13 #include <utility>
     14 
     15 #include "unicode/utypes.h"
     16 
     17 #if  !UCONFIG_NO_TRANSLITERATION && !UCONFIG_NO_BREAK_ITERATION
     18 
     19 #include "unicode/brkiter.h"
     20 #include "unicode/localpointer.h"
     21 #include "unicode/uchar.h"
     22 #include "unicode/unifilt.h"
     23 #include "unicode/uniset.h"
     24 
     25 #include "brktrans.h"
     26 #include "cmemory.h"
     27 #include "mutex.h"
     28 #include "uprops.h"
     29 #include "uinvchar.h"
     30 #include "util.h"
     31 #include "uvectr32.h"
     32 
     33 U_NAMESPACE_BEGIN
     34 
     35 UOBJECT_DEFINE_RTTI_IMPLEMENTATION(BreakTransliterator)
     36 
     37 static const char16_t SPACE       = 32;  // ' '
     38 
     39 
     40 /**
     41 * Constructs a transliterator with the default delimiters '{' and
     42 * '}'.
     43 */
     44 BreakTransliterator::BreakTransliterator(UnicodeFilter* adoptedFilter) :
     45        Transliterator(UNICODE_STRING("Any-BreakInternal", 17), adoptedFilter),
     46        cachedBI(nullptr), cachedBoundaries(nullptr), fInsertion(SPACE) {
     47    }
     48 
     49 
     50 /**
     51 * Destructor.
     52 */
     53 BreakTransliterator::~BreakTransliterator() {
     54 }
     55 
     56 /**
     57 * Copy constructor.
     58 */
     59 BreakTransliterator::BreakTransliterator(const BreakTransliterator& o) :
     60        Transliterator(o), cachedBI(nullptr), cachedBoundaries(nullptr), fInsertion(o.fInsertion) {
     61 }
     62 
     63 
     64 /**
     65 * Transliterator API.
     66 */
     67 BreakTransliterator* BreakTransliterator::clone() const {
     68    return new BreakTransliterator(*this);
     69 }
     70 
     71 /**
     72 * Implements {@link Transliterator#handleTransliterate}.
     73 */
     74 void BreakTransliterator::handleTransliterate(Replaceable& text, UTransPosition& offsets,
     75                                                    UBool isIncremental ) const {
     76 
     77        UErrorCode status = U_ZERO_ERROR;
     78        LocalPointer<BreakIterator> bi;
     79        LocalPointer<UVector32> boundaries;
     80 
     81        {
     82            Mutex m;
     83            BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
     84            boundaries = std::move(nonConstThis->cachedBoundaries);
     85            bi = std::move(nonConstThis->cachedBI);
     86        }
     87        if (bi.isNull()) {
     88            bi.adoptInstead(BreakIterator::createWordInstance(Locale::getEnglish(), status));
     89        }
     90        if (boundaries.isNull()) {
     91            boundaries.adoptInstead(new UVector32(status));
     92        }
     93 
     94        if (bi.isNull() || boundaries.isNull() || U_FAILURE(status)) {
     95            return;
     96        }
     97 
     98        boundaries->removeAllElements();
     99        UnicodeString sText = replaceableAsString(text);
    100        bi->setText(sText);
    101        bi->preceding(offsets.start);
    102 
    103        // To make things much easier, we will stack the boundaries, and then insert at the end.
    104        // generally, we won't need too many, since we will be filtered.
    105 
    106        int32_t boundary;
    107        for(boundary = bi->next(); boundary != UBRK_DONE && boundary < offsets.limit; boundary = bi->next()) {
    108            if (boundary == 0) continue;
    109            // HACK: Check to see that preceding item was a letter
    110 
    111            UChar32 cp = sText.char32At(boundary-1);
    112            int type = u_charType(cp);
    113            //System.out.println(Integer.toString(cp,16) + " (before): " + type);
    114            if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
    115 
    116            cp = sText.char32At(boundary);
    117            type = u_charType(cp);
    118            //System.out.println(Integer.toString(cp,16) + " (after): " + type);
    119            if ((U_MASK(type) & (U_GC_L_MASK | U_GC_M_MASK)) == 0) continue;
    120 
    121            boundaries->addElement(boundary, status);
    122            // printf("Boundary at %d\n", boundary);
    123        }
    124 
    125        int delta = 0;
    126        int lastBoundary = 0;
    127 
    128        if (boundaries->size() != 0) { // if we found something, adjust
    129            delta = boundaries->size() * fInsertion.length();
    130            lastBoundary = boundaries->lastElementi();
    131 
    132            // we do this from the end backwards, so that we don't have to keep updating.
    133 
    134            while (boundaries->size() > 0) {
    135                boundary = boundaries->popi();
    136                text.handleReplaceBetween(boundary, boundary, fInsertion);
    137            }
    138        }
    139 
    140        // Now fix up the return values
    141        offsets.contextLimit += delta;
    142        offsets.limit += delta;
    143        offsets.start = isIncremental ? lastBoundary + delta : offsets.limit;
    144 
    145        // Return break iterator & boundaries vector to the cache.
    146        {
    147            Mutex m;
    148            BreakTransliterator *nonConstThis = const_cast<BreakTransliterator *>(this);
    149            if (nonConstThis->cachedBI.isNull()) {
    150                nonConstThis->cachedBI = std::move(bi);
    151            }
    152            if (nonConstThis->cachedBoundaries.isNull()) {
    153                nonConstThis->cachedBoundaries = std::move(boundaries);
    154            }
    155        }
    156 
    157        // TODO:  do something with U_FAILURE(status);
    158        //        (need to look at transliterators overall, not just here.)
    159 }
    160 
    161 //
    162 //  getInsertion()
    163 //
    164 const UnicodeString &BreakTransliterator::getInsertion() const {
    165    return fInsertion;
    166 }
    167 
    168 //
    169 //  setInsertion()
    170 //
    171 void BreakTransliterator::setInsertion(const UnicodeString &insertion) {
    172    this->fInsertion = insertion;
    173 }
    174 
    175 //
    176 //   replaceableAsString   Hack to let break iterators work
    177 //                         on the replaceable text from transliterators.
    178 //                         In practice, the only real Replaceable type that we
    179 //                         will be seeing is UnicodeString, so this function
    180 //                         will normally be efficient.
    181 //
    182 UnicodeString BreakTransliterator::replaceableAsString(Replaceable &r) {
    183    UnicodeString s;
    184    UnicodeString *rs = dynamic_cast<UnicodeString *>(&r);
    185    if (rs != nullptr) {
    186        s = *rs;
    187    } else {
    188        r.extractBetween(0, r.length(), s);
    189    }
    190    return s;
    191 }
    192 
    193 U_NAMESPACE_END
    194 
    195 #endif /* #if !UCONFIG_NO_TRANSLITERATION */