tor-browser

The Tor Browser
git clone https://git.dasho.dev/tor-browser.git
Log | Files | Refs | README | LICENSE

gfxScriptItemizer.cpp (8477B)


      1 /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
      2 /* This Source Code Form is subject to the terms of the Mozilla Public
      3 * License, v. 2.0. If a copy of the MPL was not distributed with this
      4 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
      5 
      6 /*
      7 * This file is based on usc_impl.c from ICU 4.2.0.1, slightly adapted
      8 * for use within Mozilla Gecko, separate from a standard ICU build.
      9 *
     10 * The original ICU license of the code follows:
     11 *
     12 * ICU License - ICU 1.8.1 and later
     13 *
     14 * COPYRIGHT AND PERMISSION NOTICE
     15 *
     16 * Copyright (c) 1995-2009 International Business Machines Corporation and
     17 * others
     18 *
     19 * All rights reserved.
     20 *
     21 * Permission is hereby granted, free of charge, to any person obtaining a
     22 * copy of this software and associated documentation files (the "Software"),
     23 * to deal in the Software without restriction, including without limitation
     24 * the rights to use, copy, modify, merge, publish, distribute, and/or sell
     25 * copies of the Software, and to permit persons to whom the Software is
     26 * furnished to do so, provided that the above copyright notice(s) and this
     27 * permission notice appear in all copies of the Software and that both the
     28 * above copyright notice(s) and this permission notice appear in supporting
     29 * documentation.
     30 *
     31 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
     32 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
     33 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
     34 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE
     35 * BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES,
     36 * OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
     37 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
     38 * ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
     39 * SOFTWARE.
     40 *
     41 * Except as contained in this notice, the name of a copyright holder shall
     42 * not be used in advertising or otherwise to promote the sale, use or other
     43 * dealings in this Software without prior written authorization of the
     44 * copyright holder.
     45 *
     46 * All trademarks and registered trademarks mentioned herein are the property
     47 * of their respective owners.
     48 */
     49 
     50 #include "gfxScriptItemizer.h"
     51 #include "mozilla/intl/UnicodeProperties.h"
     52 #include "nsCharTraits.h"
     53 #include "nsUnicodeProperties.h"
     54 #include "harfbuzz/hb.h"
     55 
     56 using namespace mozilla::intl;
     57 using namespace mozilla::unicode;
     58 
     59 #define MOD(sp) ((sp) % PAREN_STACK_DEPTH)
     60 #define LIMIT_INC(sp) \
     61  (((sp) < PAREN_STACK_DEPTH) ? (sp) + 1 : PAREN_STACK_DEPTH)
     62 #define INC(sp, count) (MOD((sp) + (count)))
     63 #define INC1(sp) (INC(sp, 1))
     64 #define DEC(sp, count) (MOD((sp) + PAREN_STACK_DEPTH - (count)))
     65 #define DEC1(sp) (DEC(sp, 1))
     66 #define STACK_IS_EMPTY() (pushCount <= 0)
     67 #define STACK_IS_NOT_EMPTY() (!STACK_IS_EMPTY())
     68 #define TOP() (parenStack[parenSP])
     69 #define SYNC_FIXUP() (fixupCount = 0)
     70 
     71 void gfxScriptItemizer::push(uint32_t endPairChar, Script newScriptCode) {
     72  pushCount = LIMIT_INC(pushCount);
     73  fixupCount = LIMIT_INC(fixupCount);
     74 
     75  parenSP = INC1(parenSP);
     76  parenStack[parenSP].endPairChar = endPairChar;
     77  parenStack[parenSP].scriptCode = newScriptCode;
     78 }
     79 
     80 void gfxScriptItemizer::pop() {
     81  if (STACK_IS_EMPTY()) {
     82    return;
     83  }
     84 
     85  if (fixupCount > 0) {
     86    fixupCount -= 1;
     87  }
     88 
     89  pushCount -= 1;
     90  parenSP = DEC1(parenSP);
     91 
     92  /* If the stack is now empty, reset the stack
     93     pointers to their initial values.
     94   */
     95  if (STACK_IS_EMPTY()) {
     96    parenSP = -1;
     97  }
     98 }
     99 
    100 void gfxScriptItemizer::fixup(Script newScriptCode) {
    101  int32_t fixupSP = DEC(parenSP, fixupCount);
    102 
    103  while (fixupCount-- > 0) {
    104    fixupSP = INC1(fixupSP);
    105    parenStack[fixupSP].scriptCode = newScriptCode;
    106  }
    107 }
    108 
    109 static inline bool CanMergeWithContext(Script aScript) {
    110  return aScript <= Script::INHERITED || aScript == Script::UNKNOWN;
    111 }
    112 
    113 // We regard the current char as having the same script as the in-progress run
    114 // if either script is Common/Inherited/Unknown, or if the run script appears
    115 // in the character's ScriptExtensions, or if the char is a cluster extender.
    116 static inline bool SameScript(Script runScript, Script currCharScript,
    117                              uint32_t aCurrCh) {
    118  return CanMergeWithContext(runScript) ||
    119         CanMergeWithContext(currCharScript) || currCharScript == runScript ||
    120         IsClusterExtender(aCurrCh) ||
    121         UnicodeProperties::HasScript(aCurrCh, runScript);
    122 }
    123 
    124 gfxScriptItemizer::Run gfxScriptItemizer::Next() {
    125  MOZ_ASSERT(textLength == 0 || (textIs8bit && textPtr._1b) ||
    126             (!textIs8bit && textPtr._2b));
    127 
    128  /* if we've fallen off the end of the text, we're done */
    129  if (scriptLimit >= textLength) {
    130    return Run{};
    131  }
    132 
    133  SYNC_FIXUP();
    134  scriptCode = Script::COMMON;
    135  Script fallbackScript = Script::UNKNOWN;
    136 
    137  for (scriptStart = scriptLimit; scriptLimit < textLength; scriptLimit += 1) {
    138    uint32_t ch;
    139    Script sc;
    140    uint32_t startOfChar = scriptLimit;
    141 
    142    ch = textIs8bit ? textPtr._1b[scriptLimit] : textPtr._2b[scriptLimit];
    143 
    144    /* decode UTF-16 (may be surrogate pair) */
    145    if (NS_IS_HIGH_SURROGATE(ch) && scriptLimit < textLength - 1) {
    146      uint32_t low = textPtr._2b[scriptLimit + 1];
    147      if (NS_IS_LOW_SURROGATE(low)) {
    148        ch = SURROGATE_TO_UCS4(ch, low);
    149        scriptLimit += 1;
    150      }
    151    }
    152 
    153    // Initialize gc to UNASSIGNED; we'll only set it to the true GC
    154    // if the character has script=COMMON, otherwise we don't care.
    155    uint8_t gc = HB_UNICODE_GENERAL_CATEGORY_UNASSIGNED;
    156 
    157    sc = UnicodeProperties::GetScriptCode(ch);
    158    if (sc == Script::COMMON) {
    159      /*
    160       * Paired character handling:
    161       *
    162       * if it's an open character, push it onto the stack.
    163       * if it's a close character, find the matching open on the
    164       * stack, and use that script code. Any non-matching open
    165       * characters above it on the stack will be popped.
    166       *
    167       * We only do this if the script is COMMON; for chars with
    168       * specific script assignments, we just use them as-is.
    169       */
    170      gc = GetGeneralCategory(ch);
    171      if (gc == HB_UNICODE_GENERAL_CATEGORY_OPEN_PUNCTUATION) {
    172        uint32_t endPairChar = UnicodeProperties::CharMirror(ch);
    173        if (endPairChar != ch) {
    174          push(endPairChar, scriptCode);
    175        }
    176      } else if (gc == HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION &&
    177                 UnicodeProperties::IsMirrored(ch)) {
    178        while (STACK_IS_NOT_EMPTY() && TOP().endPairChar != ch) {
    179          pop();
    180        }
    181 
    182        if (STACK_IS_NOT_EMPTY()) {
    183          sc = TOP().scriptCode;
    184        }
    185      }
    186    }
    187 
    188    // Both Hiragana and Katakana are shaped as OpenType 'kana'. Merge them
    189    // here to avoid script-run breaks and allow kerning to apply between the
    190    // two alphabets.
    191    if (sc == Script::HIRAGANA) {
    192      sc = Script::KATAKANA;
    193    }
    194 
    195    if (SameScript(scriptCode, sc, ch)) {
    196      if (scriptCode == Script::COMMON) {
    197        // If we have not yet resolved a specific scriptCode for the run,
    198        // check whether this character provides it.
    199        if (!CanMergeWithContext(sc)) {
    200          // Use this character's script.
    201          scriptCode = sc;
    202          fixup(scriptCode);
    203        } else if (fallbackScript == Script::UNKNOWN) {
    204          // See if the character has a ScriptExtensions property we can
    205          // store for use in the event the run remains unresolved.
    206          UnicodeProperties::ScriptExtensionVector extensions;
    207          auto extResult = UnicodeProperties::GetExtensions(ch, extensions);
    208          if (extResult.isOk()) {
    209            Script ext = Script(extensions[0]);
    210            if (!CanMergeWithContext(ext)) {
    211              fallbackScript = ext;
    212            }
    213          }
    214        }
    215      }
    216 
    217      /*
    218       * if this character is a close paired character,
    219       * pop the matching open character from the stack
    220       */
    221      if (gc == HB_UNICODE_GENERAL_CATEGORY_CLOSE_PUNCTUATION &&
    222          UnicodeProperties::IsMirrored(ch)) {
    223        pop();
    224      }
    225    } else {
    226      /*
    227       * reset scriptLimit in case it was advanced during reading a
    228       * multiple-code-unit character
    229       */
    230      scriptLimit = startOfChar;
    231 
    232      break;
    233    }
    234  }
    235 
    236  return Run{scriptStart, scriptLimit - scriptStart,
    237             (scriptCode == Script::COMMON && fallbackScript != Script::UNKNOWN)
    238                 ? fallbackScript
    239                 : scriptCode};
    240 }