tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

ustring.cpp (45213B)


      1 // © 2016 and later: Unicode, Inc. and others.
      2 // License & terms of use: http://www.unicode.org/copyright.html
      3 /*
      4 ******************************************************************************
      5 *
      6 *   Copyright (C) 1998-2016, International Business Machines
      7 *   Corporation and others.  All Rights Reserved.
      8 *
      9 ******************************************************************************
     10 *
     11 * File ustring.cpp
     12 *
     13 * Modification History:
     14 *
     15 *   Date        Name        Description
     16 *   12/07/98    bertrand    Creation.
     17 ******************************************************************************
     18 */
     19 
     20 #include "unicode/utypes.h"
     21 #include "unicode/putil.h"
     22 #include "unicode/uchar.h"
     23 #include "unicode/ustring.h"
     24 #include "unicode/utf16.h"
     25 #include "cstring.h"
     26 #include "cwchar.h"
     27 #include "cmemory.h"
     28 #include "ustr_imp.h"
     29 
     30 /* ANSI string.h - style functions ------------------------------------------ */
     31 
     32 /* U+ffff is the highest BMP code point, the highest one that fits into a 16-bit char16_t */
     33 #define U_BMP_MAX 0xffff
     34 
     35 /* Forward binary string search functions ----------------------------------- */
     36 
     37 /*
     38 * Test if a substring match inside a string is at code point boundaries.
     39 * All pointers refer to the same buffer.
     40 * The limit pointer may be nullptr, all others must be real pointers.
     41 */
     42 static inline UBool
     43 isMatchAtCPBoundary(const char16_t *start, const char16_t *match, const char16_t *matchLimit, const char16_t *limit) {
     44    if(U16_IS_TRAIL(*match) && start!=match && U16_IS_LEAD(*(match-1))) {
     45        /* the leading edge of the match is in the middle of a surrogate pair */
     46        return false;
     47    }
     48    if(U16_IS_LEAD(*(matchLimit-1)) && matchLimit!=limit && U16_IS_TRAIL(*matchLimit)) {
     49        /* the trailing edge of the match is in the middle of a surrogate pair */
     50        return false;
     51    }
     52    return true;
     53 }
     54 
     55 U_CAPI char16_t * U_EXPORT2
     56 u_strFindFirst(const char16_t *s, int32_t length,
     57               const char16_t *sub, int32_t subLength) {
     58    const char16_t *start, *p, *q, *subLimit;
     59    char16_t c, cs, cq;
     60 
     61    if(sub==nullptr || subLength<-1) {
     62        return (char16_t *)s;
     63    }
     64    if(s==nullptr || length<-1) {
     65        return nullptr;
     66    }
     67 
     68    start=s;
     69 
     70    if(length<0 && subLength<0) {
     71        /* both strings are NUL-terminated */
     72        if((cs=*sub++)==0) {
     73            return (char16_t *)s;
     74        }
     75        if(*sub==0 && !U16_IS_SURROGATE(cs)) {
     76            /* the substring consists of a single, non-surrogate BMP code point */
     77            return u_strchr(s, cs);
     78        }
     79 
     80        while((c=*s++)!=0) {
     81            if(c==cs) {
     82                /* found first substring char16_t, compare rest */
     83                p=s;
     84                q=sub;
     85                for(;;) {
     86                    if((cq=*q)==0) {
     87                        if(isMatchAtCPBoundary(start, s-1, p, nullptr)) {
     88                            return (char16_t *)(s-1); /* well-formed match */
     89                        } else {
     90                            break; /* no match because surrogate pair is split */
     91                        }
     92                    }
     93                    if((c=*p)==0) {
     94                        return nullptr; /* no match, and none possible after s */
     95                    }
     96                    if(c!=cq) {
     97                        break; /* no match */
     98                    }
     99                    ++p;
    100                    ++q;
    101                }
    102            }
    103        }
    104 
    105        /* not found */
    106        return nullptr;
    107    }
    108 
    109    if(subLength<0) {
    110        subLength=u_strlen(sub);
    111    }
    112    if(subLength==0) {
    113        return (char16_t *)s;
    114    }
    115 
    116    /* get sub[0] to search for it fast */
    117    cs=*sub++;
    118    --subLength;
    119    subLimit=sub+subLength;
    120 
    121    if(subLength==0 && !U16_IS_SURROGATE(cs)) {
    122        /* the substring consists of a single, non-surrogate BMP code point */
    123        return length<0 ? u_strchr(s, cs) : u_memchr(s, cs, length);
    124    }
    125 
    126    if(length<0) {
    127        /* s is NUL-terminated */
    128        while((c=*s++)!=0) {
    129            if(c==cs) {
    130                /* found first substring char16_t, compare rest */
    131                p=s;
    132                q=sub;
    133                for(;;) {
    134                    if(q==subLimit) {
    135                        if(isMatchAtCPBoundary(start, s-1, p, nullptr)) {
    136                            return (char16_t *)(s-1); /* well-formed match */
    137                        } else {
    138                            break; /* no match because surrogate pair is split */
    139                        }
    140                    }
    141                    if((c=*p)==0) {
    142                        return nullptr; /* no match, and none possible after s */
    143                    }
    144                    if(c!=*q) {
    145                        break; /* no match */
    146                    }
    147                    ++p;
    148                    ++q;
    149                }
    150            }
    151        }
    152    } else {
    153        const char16_t *limit, *preLimit;
    154 
    155        /* subLength was decremented above */
    156        if(length<=subLength) {
    157            return nullptr; /* s is shorter than sub */
    158        }
    159 
    160        limit=s+length;
    161 
    162        /* the substring must start before preLimit */
    163        preLimit=limit-subLength;
    164 
    165        while(s!=preLimit) {
    166            c=*s++;
    167            if(c==cs) {
    168                /* found first substring char16_t, compare rest */
    169                p=s;
    170                q=sub;
    171                for(;;) {
    172                    if(q==subLimit) {
    173                        if(isMatchAtCPBoundary(start, s-1, p, limit)) {
    174                            return (char16_t *)(s-1); /* well-formed match */
    175                        } else {
    176                            break; /* no match because surrogate pair is split */
    177                        }
    178                    }
    179                    if(*p!=*q) {
    180                        break; /* no match */
    181                    }
    182                    ++p;
    183                    ++q;
    184                }
    185            }
    186        }
    187    }
    188 
    189    /* not found */
    190    return nullptr;
    191 }
    192 
    193 U_CAPI char16_t * U_EXPORT2
    194 u_strstr(const char16_t *s, const char16_t *substring) {
    195    return u_strFindFirst(s, -1, substring, -1);
    196 }
    197 
    198 U_CAPI char16_t * U_EXPORT2
    199 u_strchr(const char16_t *s, char16_t c) {
    200    if(U16_IS_SURROGATE(c)) {
    201        /* make sure to not find half of a surrogate pair */
    202        return u_strFindFirst(s, -1, &c, 1);
    203    } else {
    204        char16_t cs;
    205 
    206        /* trivial search for a BMP code point */
    207        for(;;) {
    208            if((cs=*s)==c) {
    209                return (char16_t *)s;
    210            }
    211            if(cs==0) {
    212                return nullptr;
    213            }
    214            ++s;
    215        }
    216    }
    217 }
    218 
    219 U_CAPI char16_t * U_EXPORT2
    220 u_strchr32(const char16_t *s, UChar32 c) {
    221    if((uint32_t)c<=U_BMP_MAX) {
    222        /* find BMP code point */
    223        return u_strchr(s, (char16_t)c);
    224    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
    225        /* find supplementary code point as surrogate pair */
    226        char16_t cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
    227 
    228        while((cs=*s++)!=0) {
    229            if(cs==lead && *s==trail) {
    230                return (char16_t *)(s-1);
    231            }
    232        }
    233        return nullptr;
    234    } else {
    235        /* not a Unicode code point, not findable */
    236        return nullptr;
    237    }
    238 }
    239 
    240 U_CAPI char16_t * U_EXPORT2
    241 u_memchr(const char16_t *s, char16_t c, int32_t count) {
    242    if(count<=0) {
    243        return nullptr; /* no string */
    244    } else if(U16_IS_SURROGATE(c)) {
    245        /* make sure to not find half of a surrogate pair */
    246        return u_strFindFirst(s, count, &c, 1);
    247    } else {
    248        /* trivial search for a BMP code point */
    249        const char16_t *limit=s+count;
    250        do {
    251            if(*s==c) {
    252                return (char16_t *)s;
    253            }
    254        } while(++s!=limit);
    255        return nullptr;
    256    }
    257 }
    258 
    259 U_CAPI char16_t * U_EXPORT2
    260 u_memchr32(const char16_t *s, UChar32 c, int32_t count) {
    261    if((uint32_t)c<=U_BMP_MAX) {
    262        /* find BMP code point */
    263        return u_memchr(s, (char16_t)c, count);
    264    } else if(count<2) {
    265        /* too short for a surrogate pair */
    266        return nullptr;
    267    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
    268        /* find supplementary code point as surrogate pair */
    269        const char16_t *limit=s+count-1; /* -1 so that we do not need a separate check for the trail unit */
    270        char16_t lead=U16_LEAD(c), trail=U16_TRAIL(c);
    271 
    272        do {
    273            if(*s==lead && *(s+1)==trail) {
    274                return (char16_t *)s;
    275            }
    276        } while(++s!=limit);
    277        return nullptr;
    278    } else {
    279        /* not a Unicode code point, not findable */
    280        return nullptr;
    281    }
    282 }
    283 
    284 /* Backward binary string search functions ---------------------------------- */
    285 
    286 U_CAPI char16_t * U_EXPORT2
    287 u_strFindLast(const char16_t *s, int32_t length,
    288              const char16_t *sub, int32_t subLength) {
    289    const char16_t *start, *limit, *p, *q, *subLimit;
    290    char16_t c, cs;
    291 
    292    if(sub==nullptr || subLength<-1) {
    293        return (char16_t *)s;
    294    }
    295    if(s==nullptr || length<-1) {
    296        return nullptr;
    297    }
    298 
    299    /*
    300     * This implementation is more lazy than the one for u_strFindFirst():
    301     * There is no special search code for NUL-terminated strings.
    302     * It does not seem to be worth it for searching substrings to
    303     * search forward and find all matches like in u_strrchr() and similar.
    304     * Therefore, we simply get both string lengths and search backward.
    305     *
    306     * markus 2002oct23
    307     */
    308 
    309    if(subLength<0) {
    310        subLength=u_strlen(sub);
    311    }
    312    if(subLength==0) {
    313        return (char16_t *)s;
    314    }
    315 
    316    /* get sub[subLength-1] to search for it fast */
    317    subLimit=sub+subLength;
    318    cs=*(--subLimit);
    319    --subLength;
    320 
    321    if(subLength==0 && !U16_IS_SURROGATE(cs)) {
    322        /* the substring consists of a single, non-surrogate BMP code point */
    323        return length<0 ? u_strrchr(s, cs) : u_memrchr(s, cs, length);
    324    }
    325 
    326    if(length<0) {
    327        length=u_strlen(s);
    328    }
    329 
    330    /* subLength was decremented above */
    331    if(length<=subLength) {
    332        return nullptr; /* s is shorter than sub */
    333    }
    334 
    335    start=s;
    336    limit=s+length;
    337 
    338    /* the substring must start no later than s+subLength */
    339    s+=subLength;
    340 
    341    while(s!=limit) {
    342        c=*(--limit);
    343        if(c==cs) {
    344            /* found last substring char16_t, compare rest */
    345            p=limit;
    346            q=subLimit;
    347            for(;;) {
    348                if(q==sub) {
    349                    if(isMatchAtCPBoundary(start, p, limit+1, start+length)) {
    350                        return (char16_t *)p; /* well-formed match */
    351                    } else {
    352                        break; /* no match because surrogate pair is split */
    353                    }
    354                }
    355                if(*(--p)!=*(--q)) {
    356                    break; /* no match */
    357                }
    358            }
    359        }
    360    }
    361 
    362    /* not found */
    363    return nullptr;
    364 }
    365 
    366 U_CAPI char16_t * U_EXPORT2
    367 u_strrstr(const char16_t *s, const char16_t *substring) {
    368    return u_strFindLast(s, -1, substring, -1);
    369 }
    370 
    371 U_CAPI char16_t * U_EXPORT2
    372 u_strrchr(const char16_t *s, char16_t c) {
    373    if(U16_IS_SURROGATE(c)) {
    374        /* make sure to not find half of a surrogate pair */
    375        return u_strFindLast(s, -1, &c, 1);
    376    } else {
    377        const char16_t *result=nullptr;
    378        char16_t cs;
    379 
    380        /* trivial search for a BMP code point */
    381        for(;;) {
    382            if((cs=*s)==c) {
    383                result=s;
    384            }
    385            if(cs==0) {
    386                return (char16_t *)result;
    387            }
    388            ++s;
    389        }
    390    }
    391 }
    392 
    393 U_CAPI char16_t * U_EXPORT2
    394 u_strrchr32(const char16_t *s, UChar32 c) {
    395    if((uint32_t)c<=U_BMP_MAX) {
    396        /* find BMP code point */
    397        return u_strrchr(s, (char16_t)c);
    398    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
    399        /* find supplementary code point as surrogate pair */
    400        const char16_t *result=nullptr;
    401        char16_t cs, lead=U16_LEAD(c), trail=U16_TRAIL(c);
    402 
    403        while((cs=*s++)!=0) {
    404            if(cs==lead && *s==trail) {
    405                result=s-1;
    406            }
    407        }
    408        return (char16_t *)result;
    409    } else {
    410        /* not a Unicode code point, not findable */
    411        return nullptr;
    412    }
    413 }
    414 
    415 U_CAPI char16_t * U_EXPORT2
    416 u_memrchr(const char16_t *s, char16_t c, int32_t count) {
    417    if(count<=0) {
    418        return nullptr; /* no string */
    419    } else if(U16_IS_SURROGATE(c)) {
    420        /* make sure to not find half of a surrogate pair */
    421        return u_strFindLast(s, count, &c, 1);
    422    } else {
    423        /* trivial search for a BMP code point */
    424        const char16_t *limit=s+count;
    425        do {
    426            if(*(--limit)==c) {
    427                return (char16_t *)limit;
    428            }
    429        } while(s!=limit);
    430        return nullptr;
    431    }
    432 }
    433 
    434 U_CAPI char16_t * U_EXPORT2
    435 u_memrchr32(const char16_t *s, UChar32 c, int32_t count) {
    436    if((uint32_t)c<=U_BMP_MAX) {
    437        /* find BMP code point */
    438        return u_memrchr(s, (char16_t)c, count);
    439    } else if(count<2) {
    440        /* too short for a surrogate pair */
    441        return nullptr;
    442    } else if((uint32_t)c<=UCHAR_MAX_VALUE) {
    443        /* find supplementary code point as surrogate pair */
    444        const char16_t *limit=s+count-1;
    445        char16_t lead=U16_LEAD(c), trail=U16_TRAIL(c);
    446 
    447        do {
    448            if(*limit==trail && *(limit-1)==lead) {
    449                return (char16_t *)(limit-1);
    450            }
    451        } while(s!=--limit);
    452        return nullptr;
    453    } else {
    454        /* not a Unicode code point, not findable */
    455        return nullptr;
    456    }
    457 }
    458 
    459 /* Tokenization functions --------------------------------------------------- */
    460 
    461 /*
    462 * Match each code point in a string against each code point in the matchSet.
    463 * Return the index of the first string code point that
    464 * is (polarity==true) or is not (false) contained in the matchSet.
    465 * Return -(string length)-1 if there is no such code point.
    466 */
    467 static int32_t
    468 _matchFromSet(const char16_t *string, const char16_t *matchSet, UBool polarity) {
    469    int32_t matchLen, matchBMPLen, strItr, matchItr;
    470    UChar32 stringCh, matchCh;
    471    char16_t c, c2;
    472 
    473    /* first part of matchSet contains only BMP code points */
    474    matchBMPLen = 0;
    475    while((c = matchSet[matchBMPLen]) != 0 && U16_IS_SINGLE(c)) {
    476        ++matchBMPLen;
    477    }
    478 
    479    /* second part of matchSet contains BMP and supplementary code points */
    480    matchLen = matchBMPLen;
    481    while(matchSet[matchLen] != 0) {
    482        ++matchLen;
    483    }
    484 
    485    for(strItr = 0; (c = string[strItr]) != 0;) {
    486        ++strItr;
    487        if(U16_IS_SINGLE(c)) {
    488            if(polarity) {
    489                for(matchItr = 0; matchItr < matchLen; ++matchItr) {
    490                    if(c == matchSet[matchItr]) {
    491                        return strItr - 1; /* one matches */
    492                    }
    493                }
    494            } else {
    495                for(matchItr = 0; matchItr < matchLen; ++matchItr) {
    496                    if(c == matchSet[matchItr]) {
    497                        goto endloop;
    498                    }
    499                }
    500                return strItr - 1; /* none matches */
    501            }
    502        } else {
    503            /*
    504             * No need to check for string length before U16_IS_TRAIL
    505             * because c2 could at worst be the terminating NUL.
    506             */
    507            if(U16_IS_SURROGATE_LEAD(c) && U16_IS_TRAIL(c2 = string[strItr])) {
    508                ++strItr;
    509                stringCh = U16_GET_SUPPLEMENTARY(c, c2);
    510            } else {
    511                stringCh = c; /* unpaired trail surrogate */
    512            }
    513 
    514            if(polarity) {
    515                for(matchItr = matchBMPLen; matchItr < matchLen;) {
    516                    U16_NEXT(matchSet, matchItr, matchLen, matchCh);
    517                    if(stringCh == matchCh) {
    518                        return strItr - U16_LENGTH(stringCh); /* one matches */
    519                    }
    520                }
    521            } else {
    522                for(matchItr = matchBMPLen; matchItr < matchLen;) {
    523                    U16_NEXT(matchSet, matchItr, matchLen, matchCh);
    524                    if(stringCh == matchCh) {
    525                        goto endloop;
    526                    }
    527                }
    528                return strItr - U16_LENGTH(stringCh); /* none matches */
    529            }
    530        }
    531 endloop:
    532        /* wish C had continue with labels like Java... */;
    533    }
    534 
    535    /* Didn't find it. */
    536    return -strItr-1;
    537 }
    538 
    539 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
    540 U_CAPI char16_t * U_EXPORT2
    541 u_strpbrk(const char16_t *string, const char16_t *matchSet)
    542 {
    543    int32_t idx = _matchFromSet(string, matchSet, true);
    544    if(idx >= 0) {
    545        return (char16_t *)string + idx;
    546    } else {
    547        return nullptr;
    548    }
    549 }
    550 
    551 /* Search for a codepoint in a string that matches one of the matchSet codepoints. */
    552 U_CAPI int32_t U_EXPORT2
    553 u_strcspn(const char16_t *string, const char16_t *matchSet)
    554 {
    555    int32_t idx = _matchFromSet(string, matchSet, true);
    556    if(idx >= 0) {
    557        return idx;
    558    } else {
    559        return -idx - 1; /* == u_strlen(string) */
    560    }
    561 }
    562 
    563 /* Search for a codepoint in a string that does not match one of the matchSet codepoints. */
    564 U_CAPI int32_t U_EXPORT2
    565 u_strspn(const char16_t *string, const char16_t *matchSet)
    566 {
    567    int32_t idx = _matchFromSet(string, matchSet, false);
    568    if(idx >= 0) {
    569        return idx;
    570    } else {
    571        return -idx - 1; /* == u_strlen(string) */
    572    }
    573 }
    574 
    575 /* ----- Text manipulation functions --- */
    576 
    577 U_CAPI char16_t* U_EXPORT2
    578 u_strtok_r(char16_t *src,
    579     const char16_t *delim,
    580           char16_t   **saveState)
    581 {
    582    char16_t *tokSource;
    583    char16_t *nextToken;
    584    uint32_t nonDelimIdx;
    585 
    586    /* If saveState is nullptr, the user messed up. */
    587    if (src != nullptr) {
    588        tokSource = src;
    589        *saveState = src; /* Set to "src" in case there are no delimiters */
    590    }
    591    else if (*saveState) {
    592        tokSource = *saveState;
    593    }
    594    else {
    595        /* src == nullptr && *saveState == nullptr */
    596        /* This shouldn't happen. We already finished tokenizing. */
    597        return nullptr;
    598    }
    599 
    600    /* Skip initial delimiters */
    601    nonDelimIdx = u_strspn(tokSource, delim);
    602    tokSource = &tokSource[nonDelimIdx];
    603 
    604    if (*tokSource) {
    605        nextToken = u_strpbrk(tokSource, delim);
    606        if (nextToken != nullptr) {
    607            /* Create a token */
    608            *(nextToken++) = 0;
    609            *saveState = nextToken;
    610            return tokSource;
    611        }
    612        else if (*saveState) {
    613            /* Return the last token */
    614            *saveState = nullptr;
    615            return tokSource;
    616        }
    617    }
    618    else {
    619        /* No tokens were found. Only delimiters were left. */
    620        *saveState = nullptr;
    621    }
    622    return nullptr;
    623 }
    624 
    625 /* Miscellaneous functions -------------------------------------------------- */
    626 
    627 U_CAPI char16_t* U_EXPORT2
    628 u_strcat(char16_t  *dst,
    629    const char16_t  *src)
    630 {
    631    char16_t *anchor = dst;            /* save a pointer to start of dst */
    632 
    633    while(*dst != 0) {              /* To end of first string          */
    634        ++dst;
    635    }
    636    while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
    637    }
    638 
    639    return anchor;
    640 }
    641 
    642 U_CAPI char16_t*  U_EXPORT2
    643 u_strncat(char16_t  *dst,
    644     const char16_t  *src,
    645     int32_t     n ) 
    646 {
    647    if(n > 0) {
    648        char16_t *anchor = dst;            /* save a pointer to start of dst */
    649 
    650        while(*dst != 0) {              /* To end of first string          */
    651            ++dst;
    652        }
    653        while((*dst = *src) != 0) {     /* copy string 2 over              */
    654            ++dst;
    655            if(--n == 0) {
    656                *dst = 0;
    657                break;
    658            }
    659            ++src;
    660        }
    661 
    662        return anchor;
    663    } else {
    664        return dst;
    665    }
    666 }
    667 
    668 /* ----- Text property functions --- */
    669 
    670 U_CAPI int32_t   U_EXPORT2
    671 u_strcmp(const char16_t *s1,
    672    const char16_t *s2)
    673 {
    674    char16_t  c1, c2;
    675 
    676    for(;;) {
    677        c1=*s1++;
    678        c2=*s2++;
    679        if (c1 != c2 || c1 == 0) {
    680            break;
    681        }
    682    }
    683    return (int32_t)c1 - (int32_t)c2;
    684 }
    685 
    686 U_CFUNC int32_t U_EXPORT2
    687 uprv_strCompare(const char16_t *s1, int32_t length1,
    688                const char16_t *s2, int32_t length2,
    689                UBool strncmpStyle, UBool codePointOrder) {
    690    const char16_t *start1, *start2, *limit1, *limit2;
    691    char16_t c1, c2;
    692 
    693    /* setup for fix-up */
    694    start1=s1;
    695    start2=s2;
    696 
    697    /* compare identical prefixes - they do not need to be fixed up */
    698    if(length1<0 && length2<0) {
    699        /* strcmp style, both NUL-terminated */
    700        if(s1==s2) {
    701            return 0;
    702        }
    703 
    704        for(;;) {
    705            c1=*s1;
    706            c2=*s2;
    707            if(c1!=c2) {
    708                break;
    709            }
    710            if(c1==0) {
    711                return 0;
    712            }
    713            ++s1;
    714            ++s2;
    715        }
    716 
    717        /* setup for fix-up */
    718        limit1=limit2=nullptr;
    719    } else if(strncmpStyle) {
    720        /* special handling for strncmp, assume length1==length2>=0 but also check for NUL */
    721        if(s1==s2) {
    722            return 0;
    723        }
    724 
    725        limit1=start1+length1;
    726 
    727        for(;;) {
    728            /* both lengths are same, check only one limit */
    729            if(s1==limit1) {
    730                return 0;
    731            }
    732 
    733            c1=*s1;
    734            c2=*s2;
    735            if(c1!=c2) {
    736                break;
    737            }
    738            if(c1==0) {
    739                return 0;
    740            }
    741            ++s1;
    742            ++s2;
    743        }
    744 
    745        /* setup for fix-up */
    746        limit2=start2+length1; /* use length1 here, too, to enforce assumption */
    747    } else {
    748        /* memcmp/UnicodeString style, both length-specified */
    749        int32_t lengthResult;
    750 
    751        if(length1<0) {
    752            length1=u_strlen(s1);
    753        }
    754        if(length2<0) {
    755            length2=u_strlen(s2);
    756        }
    757 
    758        /* limit1=start1+min(length1, length2) */
    759        if(length1<length2) {
    760            lengthResult=-1;
    761            limit1=start1+length1;
    762        } else if(length1==length2) {
    763            lengthResult=0;
    764            limit1=start1+length1;
    765        } else /* length1>length2 */ {
    766            lengthResult=1;
    767            limit1=start1+length2;
    768        }
    769 
    770        if(s1==s2) {
    771            return lengthResult;
    772        }
    773 
    774        for(;;) {
    775            /* check pseudo-limit */
    776            if(s1==limit1) {
    777                return lengthResult;
    778            }
    779 
    780            c1=*s1;
    781            c2=*s2;
    782            if(c1!=c2) {
    783                break;
    784            }
    785            ++s1;
    786            ++s2;
    787        }
    788 
    789        /* setup for fix-up */
    790        limit1=start1+length1;
    791        limit2=start2+length2;
    792    }
    793 
    794    /* if both values are in or above the surrogate range, fix them up */
    795    if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
    796        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
    797        if(
    798            (c1<=0xdbff && (s1+1)!=limit1 && U16_IS_TRAIL(*(s1+1))) ||
    799            (U16_IS_TRAIL(c1) && start1!=s1 && U16_IS_LEAD(*(s1-1)))
    800        ) {
    801            /* part of a surrogate pair, leave >=d800 */
    802        } else {
    803            /* BMP code point - may be surrogate code point - make <d800 */
    804            c1-=0x2800;
    805        }
    806 
    807        if(
    808            (c2<=0xdbff && (s2+1)!=limit2 && U16_IS_TRAIL(*(s2+1))) ||
    809            (U16_IS_TRAIL(c2) && start2!=s2 && U16_IS_LEAD(*(s2-1)))
    810        ) {
    811            /* part of a surrogate pair, leave >=d800 */
    812        } else {
    813            /* BMP code point - may be surrogate code point - make <d800 */
    814            c2-=0x2800;
    815        }
    816    }
    817 
    818    /* now c1 and c2 are in the requested (code unit or code point) order */
    819    return (int32_t)c1-(int32_t)c2;
    820 }
    821 
    822 /*
    823 * Compare two strings as presented by UCharIterators.
    824 * Use code unit or code point order.
    825 * When the function returns, it is undefined where the iterators
    826 * have stopped.
    827 */
    828 U_CAPI int32_t U_EXPORT2
    829 u_strCompareIter(UCharIterator *iter1, UCharIterator *iter2, UBool codePointOrder) {
    830    UChar32 c1, c2;
    831 
    832    /* argument checking */
    833    if(iter1==nullptr || iter2==nullptr) {
    834        return 0; /* bad arguments */
    835    }
    836    if(iter1==iter2) {
    837        return 0; /* identical iterators */
    838    }
    839 
    840    /* reset iterators to start? */
    841    iter1->move(iter1, 0, UITER_START);
    842    iter2->move(iter2, 0, UITER_START);
    843 
    844    /* compare identical prefixes - they do not need to be fixed up */
    845    for(;;) {
    846        c1=iter1->next(iter1);
    847        c2=iter2->next(iter2);
    848        if(c1!=c2) {
    849            break;
    850        }
    851        if(c1==-1) {
    852            return 0;
    853        }
    854    }
    855 
    856    /* if both values are in or above the surrogate range, fix them up */
    857    if(c1>=0xd800 && c2>=0xd800 && codePointOrder) {
    858        /* subtract 0x2800 from BMP code points to make them smaller than supplementary ones */
    859        if(
    860            (c1<=0xdbff && U16_IS_TRAIL(iter1->current(iter1))) ||
    861            (U16_IS_TRAIL(c1) && (iter1->previous(iter1), U16_IS_LEAD(iter1->previous(iter1))))
    862        ) {
    863            /* part of a surrogate pair, leave >=d800 */
    864        } else {
    865            /* BMP code point - may be surrogate code point - make <d800 */
    866            c1-=0x2800;
    867        }
    868 
    869        if(
    870            (c2<=0xdbff && U16_IS_TRAIL(iter2->current(iter2))) ||
    871            (U16_IS_TRAIL(c2) && (iter2->previous(iter2), U16_IS_LEAD(iter2->previous(iter2))))
    872        ) {
    873            /* part of a surrogate pair, leave >=d800 */
    874        } else {
    875            /* BMP code point - may be surrogate code point - make <d800 */
    876            c2-=0x2800;
    877        }
    878    }
    879 
    880    /* now c1 and c2 are in the requested (code unit or code point) order */
    881    return (int32_t)c1-(int32_t)c2;
    882 }
    883 
    884 #if 0
    885 /*
    886 * u_strCompareIter() does not leave the iterators _on_ the different units.
    887 * This is possible but would cost a few extra indirect function calls to back
    888 * up if the last unit (c1 or c2 respectively) was >=0.
    889 *
    890 * Consistently leaving them _behind_ the different units is not an option
    891 * because the current "unit" is the end of the string if that is reached,
    892 * and in such a case the iterator does not move.
    893 * For example, when comparing "ab" with "abc", both iterators rest _on_ the end
    894 * of their strings. Calling previous() on each does not move them to where
    895 * the comparison fails.
    896 *
    897 * So the simplest semantics is to not define where the iterators end up.
    898 *
    899 * The following fragment is part of what would need to be done for backing up.
    900 */
    901 void fragment {
    902        /* iff a surrogate is part of a surrogate pair, leave >=d800 */
    903        if(c1<=0xdbff) {
    904            if(!U16_IS_TRAIL(iter1->current(iter1))) {
    905                /* lead surrogate code point - make <d800 */
    906                c1-=0x2800;
    907            }
    908        } else if(c1<=0xdfff) {
    909            int32_t idx=iter1->getIndex(iter1, UITER_CURRENT);
    910            iter1->previous(iter1); /* ==c1 */
    911            if(!U16_IS_LEAD(iter1->previous(iter1))) {
    912                /* trail surrogate code point - make <d800 */
    913                c1-=0x2800;
    914            }
    915            /* go back to behind where the difference is */
    916            iter1->move(iter1, idx, UITER_ZERO);
    917        } else /* 0xe000<=c1<=0xffff */ {
    918            /* BMP code point - make <d800 */
    919            c1-=0x2800;
    920        }
    921 }
    922 #endif
    923 
    924 U_CAPI int32_t U_EXPORT2
    925 u_strCompare(const char16_t *s1, int32_t length1,
    926             const char16_t *s2, int32_t length2,
    927             UBool codePointOrder) {
    928    /* argument checking */
    929    if(s1==nullptr || length1<-1 || s2==nullptr || length2<-1) {
    930        return 0;
    931    }
    932    return uprv_strCompare(s1, length1, s2, length2, false, codePointOrder);
    933 }
    934 
    935 /* String compare in code point order - u_strcmp() compares in code unit order. */
    936 U_CAPI int32_t U_EXPORT2
    937 u_strcmpCodePointOrder(const char16_t *s1, const char16_t *s2) {
    938    return uprv_strCompare(s1, -1, s2, -1, false, true);
    939 }
    940 
    941 U_CAPI int32_t   U_EXPORT2
    942 u_strncmp(const char16_t  *s1,
    943     const char16_t  *s2,
    944     int32_t     n) 
    945 {
    946    if(n > 0) {
    947        int32_t rc;
    948        for(;;) {
    949            rc = (int32_t)*s1 - (int32_t)*s2;
    950            if(rc != 0 || *s1 == 0 || --n == 0) {
    951                return rc;
    952            }
    953            ++s1;
    954            ++s2;
    955        }
    956    } else {
    957        return 0;
    958    }
    959 }
    960 
    961 U_CAPI int32_t U_EXPORT2
    962 u_strncmpCodePointOrder(const char16_t *s1, const char16_t *s2, int32_t n) {
    963    return uprv_strCompare(s1, n, s2, n, true, true);
    964 }
    965 
    966 U_CAPI char16_t* U_EXPORT2
    967 u_strcpy(char16_t  *dst,
    968    const char16_t  *src)
    969 {
    970    char16_t *anchor = dst;            /* save a pointer to start of dst */
    971 
    972    while((*(dst++) = *(src++)) != 0) {     /* copy string 2 over              */
    973    }
    974 
    975    return anchor;
    976 }
    977 
    978 U_CAPI char16_t*  U_EXPORT2
    979 u_strncpy(char16_t  *dst,
    980     const char16_t  *src,
    981     int32_t     n) 
    982 {
    983    char16_t *anchor = dst;            /* save a pointer to start of dst */
    984 
    985    /* copy string 2 over */
    986    while(n > 0 && (*(dst++) = *(src++)) != 0) {
    987        --n;
    988    }
    989 
    990    return anchor;
    991 }
    992 
    993 U_CAPI int32_t   U_EXPORT2
    994 u_strlen(const char16_t *s)
    995 {
    996 #if U_SIZEOF_WCHAR_T == U_SIZEOF_UCHAR
    997    return (int32_t)uprv_wcslen((const wchar_t *)s);
    998 #else
    999    const char16_t *t = s;
   1000    while(*t != 0) {
   1001      ++t;
   1002    }
   1003    return t - s;
   1004 #endif
   1005 }
   1006 
   1007 U_CAPI int32_t U_EXPORT2
   1008 u_countChar32(const char16_t *s, int32_t length) {
   1009    int32_t count;
   1010 
   1011    if(s==nullptr || length<-1) {
   1012        return 0;
   1013    }
   1014 
   1015    count=0;
   1016    if(length>=0) {
   1017        while(length>0) {
   1018            ++count;
   1019            if(U16_IS_LEAD(*s) && length>=2 && U16_IS_TRAIL(*(s+1))) {
   1020                s+=2;
   1021                length-=2;
   1022            } else {
   1023                ++s;
   1024                --length;
   1025            }
   1026        }
   1027    } else /* length==-1 */ {
   1028        char16_t c;
   1029 
   1030        for(;;) {
   1031            if((c=*s++)==0) {
   1032                break;
   1033            }
   1034            ++count;
   1035 
   1036            /*
   1037             * sufficient to look ahead one because of UTF-16;
   1038             * safe to look ahead one because at worst that would be the terminating NUL
   1039             */
   1040            if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
   1041                ++s;
   1042            }
   1043        }
   1044    }
   1045    return count;
   1046 }
   1047 
   1048 U_CAPI UBool U_EXPORT2
   1049 u_strHasMoreChar32Than(const char16_t *s, int32_t length, int32_t number) {
   1050 
   1051    if(number<0) {
   1052        return true;
   1053    }
   1054    if(s==nullptr || length<-1) {
   1055        return false;
   1056    }
   1057 
   1058    if(length==-1) {
   1059        /* s is NUL-terminated */
   1060        char16_t c;
   1061 
   1062        /* count code points until they exceed */
   1063        for(;;) {
   1064            if((c=*s++)==0) {
   1065                return false;
   1066            }
   1067            if(number==0) {
   1068                return true;
   1069            }
   1070            if(U16_IS_LEAD(c) && U16_IS_TRAIL(*s)) {
   1071                ++s;
   1072            }
   1073            --number;
   1074        }
   1075    } else {
   1076        /* length>=0 known */
   1077        const char16_t *limit;
   1078        int32_t maxSupplementary;
   1079 
   1080        /* s contains at least (length+1)/2 code points: <=2 UChars per cp */
   1081        if(((length+1)/2)>number) {
   1082            return true;
   1083        }
   1084 
   1085        /* check if s does not even contain enough UChars */
   1086        maxSupplementary=length-number;
   1087        if(maxSupplementary<=0) {
   1088            return false;
   1089        }
   1090        /* there are maxSupplementary=length-number more UChars than asked-for code points */
   1091 
   1092        /*
   1093         * count code points until they exceed and also check that there are
   1094         * no more than maxSupplementary supplementary code points (char16_t pairs)
   1095         */
   1096        limit=s+length;
   1097        for(;;) {
   1098            if(s==limit) {
   1099                return false;
   1100            }
   1101            if(number==0) {
   1102                return true;
   1103            }
   1104            if(U16_IS_LEAD(*s++) && s!=limit && U16_IS_TRAIL(*s)) {
   1105                ++s;
   1106                if(--maxSupplementary<=0) {
   1107                    /* too many pairs - too few code points */
   1108                    return false;
   1109                }
   1110            }
   1111            --number;
   1112        }
   1113    }
   1114 }
   1115 
   1116 U_CAPI char16_t * U_EXPORT2
   1117 u_memcpy(char16_t *dest, const char16_t *src, int32_t count) {
   1118    if(count > 0) {
   1119        uprv_memcpy(dest, src, (size_t)count*U_SIZEOF_UCHAR);
   1120    }
   1121    return dest;
   1122 }
   1123 
   1124 U_CAPI char16_t * U_EXPORT2
   1125 u_memmove(char16_t *dest, const char16_t *src, int32_t count) {
   1126    if(count > 0) {
   1127        uprv_memmove(dest, src, (size_t)count*U_SIZEOF_UCHAR);
   1128    }
   1129    return dest;
   1130 }
   1131 
   1132 U_CAPI char16_t * U_EXPORT2
   1133 u_memset(char16_t *dest, char16_t c, int32_t count) {
   1134    if(count > 0) {
   1135        char16_t *ptr = dest;
   1136        char16_t *limit = dest + count;
   1137 
   1138        while (ptr < limit) {
   1139            *(ptr++) = c;
   1140        }
   1141    }
   1142    return dest;
   1143 }
   1144 
   1145 U_CAPI int32_t U_EXPORT2
   1146 u_memcmp(const char16_t *buf1, const char16_t *buf2, int32_t count) {
   1147    if(count > 0) {
   1148        const char16_t *limit = buf1 + count;
   1149        int32_t result;
   1150 
   1151        while (buf1 < limit) {
   1152            result = (int32_t)(uint16_t)*buf1 - (int32_t)(uint16_t)*buf2;
   1153            if (result != 0) {
   1154                return result;
   1155            }
   1156            buf1++;
   1157            buf2++;
   1158        }
   1159    }
   1160    return 0;
   1161 }
   1162 
   1163 U_CAPI int32_t U_EXPORT2
   1164 u_memcmpCodePointOrder(const char16_t *s1, const char16_t *s2, int32_t count) {
   1165    return uprv_strCompare(s1, count, s2, count, false, true);
   1166 }
   1167 
   1168 /* u_unescape & support fns ------------------------------------------------- */
   1169 
   1170 /* This map must be in ASCENDING ORDER OF THE ESCAPE CODE */
   1171 static const char16_t UNESCAPE_MAP[] = {
   1172    /*"   0x22, 0x22 */
   1173    /*'   0x27, 0x27 */
   1174    /*?   0x3F, 0x3F */
   1175    /*\   0x5C, 0x5C */
   1176    /*a*/ 0x61, 0x07,
   1177    /*b*/ 0x62, 0x08,
   1178    /*e*/ 0x65, 0x1b,
   1179    /*f*/ 0x66, 0x0c,
   1180    /*n*/ 0x6E, 0x0a,
   1181    /*r*/ 0x72, 0x0d,
   1182    /*t*/ 0x74, 0x09,
   1183    /*v*/ 0x76, 0x0b
   1184 };
   1185 enum { UNESCAPE_MAP_LENGTH = UPRV_LENGTHOF(UNESCAPE_MAP) };
   1186 
   1187 /* Convert one octal digit to a numeric value 0..7, or -1 on failure */
   1188 static int32_t _digit8(char16_t c) {
   1189    if (c >= u'0' && c <= u'7') {
   1190        return c - u'0';
   1191    }
   1192    return -1;
   1193 }
   1194 
   1195 /* Convert one hex digit to a numeric value 0..F, or -1 on failure */
   1196 static int32_t _digit16(char16_t c) {
   1197    if (c >= u'0' && c <= u'9') {
   1198        return c - u'0';
   1199    }
   1200    if (c >= u'A' && c <= u'F') {
   1201        return c - (u'A' - 10);
   1202    }
   1203    if (c >= u'a' && c <= u'f') {
   1204        return c - (u'a' - 10);
   1205    }
   1206    return -1;
   1207 }
   1208 
   1209 /* Parse a single escape sequence.  Although this method deals in
   1210 * UChars, it does not use C++ or UnicodeString.  This allows it to
   1211 * be used from C contexts. */
   1212 U_CAPI UChar32 U_EXPORT2
   1213 u_unescapeAt(UNESCAPE_CHAR_AT charAt,
   1214             int32_t *offset,
   1215             int32_t length,
   1216             void *context) {
   1217 
   1218    int32_t start = *offset;
   1219    UChar32 c;
   1220    UChar32 result = 0;
   1221    int8_t n = 0;
   1222    int8_t minDig = 0;
   1223    int8_t maxDig = 0;
   1224    int8_t bitsPerDigit = 4; 
   1225    int32_t dig;
   1226    UBool braces = false;
   1227 
   1228    /* Check that offset is in range */
   1229    if (*offset < 0 || *offset >= length) {
   1230        goto err;
   1231    }
   1232 
   1233    /* Fetch first char16_t after '\\' */
   1234    c = charAt((*offset)++, context);
   1235 
   1236    /* Convert hexadecimal and octal escapes */
   1237    switch (c) {
   1238    case u'u':
   1239        minDig = maxDig = 4;
   1240        break;
   1241    case u'U':
   1242        minDig = maxDig = 8;
   1243        break;
   1244    case u'x':
   1245        minDig = 1;
   1246        if (*offset < length && charAt(*offset, context) == u'{') {
   1247            ++(*offset);
   1248            braces = true;
   1249            maxDig = 8;
   1250        } else {
   1251            maxDig = 2;
   1252        }
   1253        break;
   1254    default:
   1255        dig = _digit8(c);
   1256        if (dig >= 0) {
   1257            minDig = 1;
   1258            maxDig = 3;
   1259            n = 1; /* Already have first octal digit */
   1260            bitsPerDigit = 3;
   1261            result = dig;
   1262        }
   1263        break;
   1264    }
   1265    if (minDig != 0) {
   1266        while (*offset < length && n < maxDig) {
   1267            c = charAt(*offset, context);
   1268            dig = (bitsPerDigit == 3) ? _digit8(c) : _digit16(c);
   1269            if (dig < 0) {
   1270                break;
   1271            }
   1272            result = (result << bitsPerDigit) | dig;
   1273            ++(*offset);
   1274            ++n;
   1275        }
   1276        if (n < minDig) {
   1277            goto err;
   1278        }
   1279        if (braces) {
   1280            if (c != u'}') {
   1281                goto err;
   1282            }
   1283            ++(*offset);
   1284        }
   1285        if (result < 0 || result >= 0x110000) {
   1286            goto err;
   1287        }
   1288        /* If an escape sequence specifies a lead surrogate, see if
   1289         * there is a trail surrogate after it, either as an escape or
   1290         * as a literal.  If so, join them up into a supplementary.
   1291         */
   1292        if (*offset < length && U16_IS_LEAD(result)) {
   1293            int32_t ahead = *offset + 1;
   1294            c = charAt(*offset, context);
   1295            if (c == u'\\' && ahead < length) {
   1296                // Calling ourselves recursively may cause a stack overflow if
   1297                // we have repeated escaped lead surrogates.
   1298                // Limit the length to 11 ("x{0000DFFF}") after ahead.
   1299                int32_t tailLimit = ahead + 11;
   1300                if (tailLimit > length) {
   1301                    tailLimit = length;
   1302                }
   1303                c = u_unescapeAt(charAt, &ahead, tailLimit, context);
   1304            }
   1305            if (U16_IS_TRAIL(c)) {
   1306                *offset = ahead;
   1307                result = U16_GET_SUPPLEMENTARY(result, c);
   1308            }
   1309        }
   1310        return result;
   1311    }
   1312 
   1313    /* Convert C-style escapes in table */
   1314    for (int32_t i=0; i<UNESCAPE_MAP_LENGTH; i+=2) {
   1315        if (c == UNESCAPE_MAP[i]) {
   1316            return UNESCAPE_MAP[i+1];
   1317        } else if (c < UNESCAPE_MAP[i]) {
   1318            break;
   1319        }
   1320    }
   1321 
   1322    /* Map \cX to control-X: X & 0x1F */
   1323    if (c == u'c' && *offset < length) {
   1324        c = charAt((*offset)++, context);
   1325        if (U16_IS_LEAD(c) && *offset < length) {
   1326            char16_t c2 = charAt(*offset, context);
   1327            if (U16_IS_TRAIL(c2)) {
   1328                ++(*offset);
   1329                c = U16_GET_SUPPLEMENTARY(c, c2);
   1330            }
   1331        }
   1332        return 0x1F & c;
   1333    }
   1334 
   1335    /* If no special forms are recognized, then consider
   1336     * the backslash to generically escape the next character.
   1337     * Deal with surrogate pairs. */
   1338    if (U16_IS_LEAD(c) && *offset < length) {
   1339        char16_t c2 = charAt(*offset, context);
   1340        if (U16_IS_TRAIL(c2)) {
   1341            ++(*offset);
   1342            return U16_GET_SUPPLEMENTARY(c, c2);
   1343        }
   1344    }
   1345    return c;
   1346 
   1347 err:
   1348    /* Invalid escape sequence */
   1349    *offset = start; /* Reset to initial value */
   1350    return (UChar32)0xFFFFFFFF;
   1351 }
   1352 
   1353 /* u_unescapeAt() callback to return a char16_t from a char* */
   1354 static char16_t U_CALLCONV
   1355 _charPtr_charAt(int32_t offset, void *context) {
   1356    char16_t c16;
   1357    /* It would be more efficient to access the invariant tables
   1358     * directly but there is no API for that. */
   1359    u_charsToUChars(static_cast<char*>(context) + offset, &c16, 1);
   1360    return c16;
   1361 }
   1362 
   1363 /* Append an escape-free segment of the text; used by u_unescape() */
   1364 static void _appendUChars(char16_t *dest, int32_t destCapacity,
   1365                          const char *src, int32_t srcLen) {
   1366    if (destCapacity < 0) {
   1367        destCapacity = 0;
   1368    }
   1369    if (srcLen > destCapacity) {
   1370        srcLen = destCapacity;
   1371    }
   1372    u_charsToUChars(src, dest, srcLen);
   1373 }
   1374 
   1375 /* Do an invariant conversion of char* -> char16_t*, with escape parsing */
   1376 U_CAPI int32_t U_EXPORT2
   1377 u_unescape(const char *src, char16_t *dest, int32_t destCapacity) {
   1378    const char *segment = src;
   1379    int32_t i = 0;
   1380    char c;
   1381 
   1382    while ((c=*src) != 0) {
   1383        /* '\\' intentionally written as compiler-specific
   1384         * character constant to correspond to compiler-specific
   1385         * char* constants. */
   1386        if (c == '\\') {
   1387            int32_t lenParsed = 0;
   1388            UChar32 c32;
   1389            if (src != segment) {
   1390                if (dest != nullptr) {
   1391                    _appendUChars(dest + i, destCapacity - i,
   1392                                  segment, (int32_t)(src - segment));
   1393                }
   1394                i += (int32_t)(src - segment);
   1395            }
   1396            ++src; /* advance past '\\' */
   1397            c32 = u_unescapeAt(_charPtr_charAt, &lenParsed, (int32_t)uprv_strlen(src), const_cast<char*>(src));
   1398            if (lenParsed == 0) {
   1399                goto err;
   1400            }
   1401            src += lenParsed; /* advance past escape seq. */
   1402            if (dest != nullptr && U16_LENGTH(c32) <= (destCapacity - i)) {
   1403                U16_APPEND_UNSAFE(dest, i, c32);
   1404            } else {
   1405                i += U16_LENGTH(c32);
   1406            }
   1407            segment = src;
   1408        } else {
   1409            ++src;
   1410        }
   1411    }
   1412    if (src != segment) {
   1413        if (dest != nullptr) {
   1414            _appendUChars(dest + i, destCapacity - i,
   1415                          segment, (int32_t)(src - segment));
   1416        }
   1417        i += (int32_t)(src - segment);
   1418    }
   1419    if (dest != nullptr && i < destCapacity) {
   1420        dest[i] = 0;
   1421    }
   1422    return i;
   1423 
   1424 err:
   1425    if (dest != nullptr && destCapacity > 0) {
   1426        *dest = 0;
   1427    }
   1428    return 0;
   1429 }
   1430 
   1431 /* NUL-termination of strings ----------------------------------------------- */
   1432 
   1433 /**
   1434 * NUL-terminate a string no matter what its type.
   1435 * Set warning and error codes accordingly.
   1436 */
   1437 #define __TERMINATE_STRING(dest, destCapacity, length, pErrorCode) UPRV_BLOCK_MACRO_BEGIN { \
   1438    if(pErrorCode!=nullptr && U_SUCCESS(*pErrorCode)) {                    \
   1439        /* not a public function, so no complete argument checking */   \
   1440                                                                        \
   1441        if(length<0) {                                                  \
   1442            /* assume that the caller handles this */                   \
   1443        } else if(length<destCapacity) {                                \
   1444            /* NUL-terminate the string, the NUL fits */                \
   1445            dest[length]=0;                                             \
   1446            /* unset the not-terminated warning but leave all others */ \
   1447            if(*pErrorCode==U_STRING_NOT_TERMINATED_WARNING) {          \
   1448                *pErrorCode=U_ZERO_ERROR;                               \
   1449            }                                                           \
   1450        } else if(length==destCapacity) {                               \
   1451            /* unable to NUL-terminate, but the string itself fit - set a warning code */ \
   1452            *pErrorCode=U_STRING_NOT_TERMINATED_WARNING;                \
   1453        } else /* length>destCapacity */ {                              \
   1454            /* even the string itself did not fit - set an error code */ \
   1455            *pErrorCode=U_BUFFER_OVERFLOW_ERROR;                        \
   1456        }                                                               \
   1457    } \
   1458 } UPRV_BLOCK_MACRO_END
   1459 
   1460 U_CAPI char16_t U_EXPORT2
   1461 u_asciiToUpper(char16_t c) {
   1462    if (u'a' <= c && c <= u'z') {
   1463        c = c + u'A' - u'a';
   1464    }
   1465    return c;
   1466 }
   1467 
   1468 U_CAPI int32_t U_EXPORT2
   1469 u_terminateUChars(char16_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
   1470    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
   1471    return length;
   1472 }
   1473 
   1474 U_CAPI int32_t U_EXPORT2
   1475 u_terminateChars(char *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
   1476    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
   1477    return length;
   1478 }
   1479 
   1480 U_CAPI int32_t U_EXPORT2
   1481 u_terminateUChar32s(UChar32 *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
   1482    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
   1483    return length;
   1484 }
   1485 
   1486 U_CAPI int32_t U_EXPORT2
   1487 u_terminateWChars(wchar_t *dest, int32_t destCapacity, int32_t length, UErrorCode *pErrorCode) {
   1488    __TERMINATE_STRING(dest, destCapacity, length, pErrorCode);
   1489    return length;
   1490 }
   1491 
   1492 // Compute the hash code for a string -------------------------------------- ***
   1493 
   1494 // Moved here from uhash.c so that UnicodeString::hashCode() does not depend
   1495 // on UHashtable code.
   1496 
   1497 /*
   1498  Compute the hash by iterating sparsely over about 32 (up to 63)
   1499  characters spaced evenly through the string.  For each character,
   1500  multiply the previous hash value by a prime number and add the new
   1501  character in, like a linear congruential random number generator,
   1502  producing a pseudorandom deterministic value well distributed over
   1503  the output range. [LIU]
   1504 */
   1505 
   1506 #define STRING_HASH(TYPE, STR, STRLEN, DEREF) UPRV_BLOCK_MACRO_BEGIN { \
   1507    uint32_t hash = 0;                        \
   1508    const TYPE *p = (const TYPE*) STR;        \
   1509    if (p != nullptr) {                          \
   1510        int32_t len = (int32_t)(STRLEN);      \
   1511        int32_t inc = ((len - 32) / 32) + 1;  \
   1512        const TYPE *limit = p + len;          \
   1513        while (p<limit) {                     \
   1514            hash = (hash * 37) + DEREF;       \
   1515            p += inc;                         \
   1516        }                                     \
   1517    }                                         \
   1518    return static_cast<int32_t>(hash);        \
   1519 } UPRV_BLOCK_MACRO_END
   1520 
   1521 /* Used by UnicodeString to compute its hashcode - Not public API. */
   1522 U_CAPI int32_t U_EXPORT2
   1523 ustr_hashUCharsN(const char16_t *str, int32_t length) {
   1524    STRING_HASH(char16_t, str, length, *p);
   1525 }
   1526 
   1527 U_CAPI int32_t U_EXPORT2
   1528 ustr_hashCharsN(const char *str, int32_t length) {
   1529    STRING_HASH(uint8_t, str, length, *p);
   1530 }
   1531 
   1532 U_CAPI int32_t U_EXPORT2
   1533 ustr_hashICharsN(const char *str, int32_t length) {
   1534    STRING_HASH(char, str, length, (uint8_t)uprv_tolower(*p));
   1535 }